Skip to content

L2G Prediction

gentropy.dataset.l2g_prediction.L2GPrediction dataclass

Bases: Dataset

Dataset that contains the Locus to Gene predictions.

It is the result of applying the L2G model on a feature matrix, which contains all the study/locus pairs and their functional annotations. The score column informs the confidence of the prediction that a gene is causal to an association.

Source code in src/gentropy/dataset/l2g_prediction.py
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
@dataclass
class L2GPrediction(Dataset):
    """Dataset that contains the Locus to Gene predictions.

    It is the result of applying the L2G model on a feature matrix, which contains all
    the study/locus pairs and their functional annotations. The score column informs the
    confidence of the prediction that a gene is causal to an association.
    """

    @classmethod
    def get_schema(cls: type[L2GPrediction]) -> StructType:
        """Provides the schema for the L2GPrediction dataset.

        Returns:
            StructType: Schema for the L2GPrediction dataset
        """
        return parse_spark_schema("l2g_predictions.json")

    @classmethod
    def from_credible_set(
        cls: type[L2GPrediction],
        session: Session,
        credible_set: StudyLocus,
        feature_matrix: L2GFeatureMatrix,
        features_list: list[str],
        model_path: str | None,
        hf_token: str | None = None,
        download_from_hub: bool = True,
    ) -> L2GPrediction:
        """Extract L2G predictions for a set of credible sets derived from GWAS.

        Args:
            session (Session): Session object that contains the Spark session
            credible_set (StudyLocus): Dataset containing credible sets from GWAS only
            feature_matrix (L2GFeatureMatrix): Dataset containing all credible sets and their annotations
            features_list (list[str]): List of features to use for the model
            model_path (str | None): Path to the model file. It can be either in the filesystem or the name on the Hugging Face Hub (in the form of username/repo_name).
            hf_token (str | None): Hugging Face token to download the model from the Hub. Only required if the model is private.
            download_from_hub (bool): Whether to download the model from the Hugging Face Hub. Defaults to True.

        Returns:
            L2GPrediction: L2G scores for a set of credible sets.

        Raises:
            ValueError: When `download_from_hub` is False and no `model_path` is provided,
                so there is no way to load a model.
        """
        # Load the model
        if download_from_hub:
            # Model ID defaults to "opentargets/locus_to_gene" and it assumes the name of the classifier is "classifier.skops".
            model_id = model_path or "opentargets/locus_to_gene"
            l2g_model = LocusToGeneModel.load_from_hub(model_id, hf_token)
        elif model_path:
            l2g_model = LocusToGeneModel.load_from_disk(model_path)
        else:
            # Without this guard, `l2g_model` would be unbound below and raise an
            # opaque UnboundLocalError instead of a clear configuration error.
            raise ValueError(
                "model_path is required when download_from_hub is False."
            )

        # Prepare data: restrict the feature matrix to GWAS credible sets and
        # protein-coding genes before scoring.
        fm = (
            L2GFeatureMatrix(
                _df=(
                    credible_set.df.filter(f.col("studyType") == "gwas")
                    .select("studyLocusId")
                    .join(feature_matrix._df, "studyLocusId")
                    .filter(f.col("isProteinCoding") == 1)
                )
            )
            .fill_na()
            .select_features(features_list)
        )

        return l2g_model.predict(fm, session)

    def to_disease_target_evidence(
        self: L2GPrediction,
        study_locus: StudyLocus,
        study_index: StudyIndex,
        l2g_threshold: float = 0.05,
    ) -> DataFrame:
        """Convert locus to gene predictions to disease target evidence.

        Args:
            study_locus (StudyLocus): Study locus dataset
            study_index (StudyIndex): Study index dataset
            l2g_threshold (float): Threshold to consider a gene as a target. Defaults to 0.05.

        Returns:
            DataFrame: Disease target evidence
        """
        datasource_id = "gwas_credible_sets"
        datatype_id = "genetic_association"

        return (
            self.df.filter(f.col("score") >= l2g_threshold)
            .join(
                study_locus.df.select("studyLocusId", "studyId"),
                on="studyLocusId",
                how="inner",
            )
            .join(
                study_index.df.select("studyId", "diseaseIds"),
                on="studyId",
                how="inner",
            )
            .select(
                f.lit(datatype_id).alias("datatypeId"),
                f.lit(datasource_id).alias("datasourceId"),
                f.col("geneId").alias("targetFromSourceId"),
                # One evidence row per disease mapped to the study:
                f.explode(f.col("diseaseIds")).alias("diseaseFromSourceMappedId"),
                f.col("score").alias("resourceScore"),
                "studyLocusId",
            )
        )

    def add_locus_to_gene_features(
        self: L2GPrediction, feature_matrix: L2GFeatureMatrix, features_list: list[str]
    ) -> L2GPrediction:
        """Add features used to extract the L2G predictions.

        Args:
            feature_matrix (L2GFeatureMatrix): Feature matrix dataset
            features_list (list[str]): List of features used in the model

        Returns:
            L2GPrediction: L2G predictions with additional features
        """
        # Testing if `locusToGeneFeatures` column already exists:
        if "locusToGeneFeatures" in self.df.columns:
            self.df = self.df.drop("locusToGeneFeatures")

        # Aggregating all features into a single map column:
        aggregated_features = (
            feature_matrix._df.withColumn(
                "locusToGeneFeatures",
                f.create_map(
                    # Interleave (name, value) pairs as create_map arguments:
                    *sum(
                        ((f.lit(feature), f.col(feature)) for feature in features_list),
                        (),
                    )
                ),
            )
            .withColumn(
                "locusToGeneFeatures",
                # Drop zero-valued features to keep the map sparse:
                f.expr("map_filter(locusToGeneFeatures, (k, v) -> v != 0)"),
            )
            .drop(*features_list)
        )
        return L2GPrediction(
            _df=self.df.join(
                aggregated_features, on=["studyLocusId", "geneId"], how="left"
            ),
            _schema=self.get_schema(),
        )

add_locus_to_gene_features(feature_matrix: L2GFeatureMatrix, features_list: list[str]) -> L2GPrediction

Add features used to extract the L2G predictions.

Parameters:

Name Type Description Default
feature_matrix L2GFeatureMatrix

Feature matrix dataset

required
features_list list[str]

List of features used in the model

required

Returns:

Name Type Description
L2GPrediction L2GPrediction

L2G predictions with additional features

Source code in src/gentropy/dataset/l2g_prediction.py
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
def add_locus_to_gene_features(
    self: L2GPrediction, feature_matrix: L2GFeatureMatrix, features_list: list[str]
) -> L2GPrediction:
    """Annotate each L2G prediction with the feature values behind it.

    Args:
        feature_matrix (L2GFeatureMatrix): Feature matrix dataset
        features_list (list[str]): List of features used in the model

    Returns:
        L2GPrediction: L2G predictions with additional features
    """
    # Remove any pre-existing `locusToGeneFeatures` column so it can be rebuilt:
    if "locusToGeneFeatures" in self.df.columns:
        self.df = self.df.drop("locusToGeneFeatures")

    # create_map expects alternating key/value arguments; build them explicitly:
    map_entries = []
    for feature_name in features_list:
        map_entries.append(f.lit(feature_name))
        map_entries.append(f.col(feature_name))

    # Collapse the individual feature columns into one map, keeping only
    # non-zero feature values:
    features_df = (
        feature_matrix._df.withColumn(
            "locusToGeneFeatures", f.create_map(*map_entries)
        )
        .withColumn(
            "locusToGeneFeatures",
            f.expr("map_filter(locusToGeneFeatures, (k, v) -> v != 0)"),
        )
        .drop(*features_list)
    )
    joined_df = self.df.join(
        features_df, on=["studyLocusId", "geneId"], how="left"
    )
    return L2GPrediction(_df=joined_df, _schema=self.get_schema())

from_credible_set(session: Session, credible_set: StudyLocus, feature_matrix: L2GFeatureMatrix, features_list: list[str], model_path: str | None, hf_token: str | None = None, download_from_hub: bool = True) -> L2GPrediction classmethod

Extract L2G predictions for a set of credible sets derived from GWAS.

Parameters:

Name Type Description Default
session Session

Session object that contains the Spark session

required
credible_set StudyLocus

Dataset containing credible sets from GWAS only

required
feature_matrix L2GFeatureMatrix

Dataset containing all credible sets and their annotations

required
features_list list[str]

List of features to use for the model

required
model_path str | None

Path to the model file. It can be either in the filesystem or the name on the Hugging Face Hub (in the form of username/repo_name).

required
hf_token str | None

Hugging Face token to download the model from the Hub. Only required if the model is private.

None
download_from_hub bool

Whether to download the model from the Hugging Face Hub. Defaults to True.

True

Returns:

Name Type Description
L2GPrediction L2GPrediction

L2G scores for a set of credible sets.

Source code in src/gentropy/dataset/l2g_prediction.py
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
@classmethod
def from_credible_set(
    cls: type[L2GPrediction],
    session: Session,
    credible_set: StudyLocus,
    feature_matrix: L2GFeatureMatrix,
    features_list: list[str],
    model_path: str | None,
    hf_token: str | None = None,
    download_from_hub: bool = True,
) -> L2GPrediction:
    """Extract L2G predictions for a set of credible sets derived from GWAS.

    Args:
        session (Session): Session object that contains the Spark session
        credible_set (StudyLocus): Dataset containing credible sets from GWAS only
        feature_matrix (L2GFeatureMatrix): Dataset containing all credible sets and their annotations
        features_list (list[str]): List of features to use for the model
        model_path (str | None): Path to the model file. It can be either in the filesystem or the name on the Hugging Face Hub (in the form of username/repo_name).
        hf_token (str | None): Hugging Face token to download the model from the Hub. Only required if the model is private.
        download_from_hub (bool): Whether to download the model from the Hugging Face Hub. Defaults to True.

    Returns:
        L2GPrediction: L2G scores for a set of credible sets.

    Raises:
        ValueError: When `download_from_hub` is False and no `model_path` is provided,
            so there is no way to load a model.
    """
    # Load the model
    if download_from_hub:
        # Model ID defaults to "opentargets/locus_to_gene" and it assumes the name of the classifier is "classifier.skops".
        model_id = model_path or "opentargets/locus_to_gene"
        l2g_model = LocusToGeneModel.load_from_hub(model_id, hf_token)
    elif model_path:
        l2g_model = LocusToGeneModel.load_from_disk(model_path)
    else:
        # Without this guard, `l2g_model` would be unbound below and raise an
        # opaque UnboundLocalError instead of a clear configuration error.
        raise ValueError(
            "model_path is required when download_from_hub is False."
        )

    # Prepare data: restrict the feature matrix to GWAS credible sets and
    # protein-coding genes before scoring.
    fm = (
        L2GFeatureMatrix(
            _df=(
                credible_set.df.filter(f.col("studyType") == "gwas")
                .select("studyLocusId")
                .join(feature_matrix._df, "studyLocusId")
                .filter(f.col("isProteinCoding") == 1)
            )
        )
        .fill_na()
        .select_features(features_list)
    )

    return l2g_model.predict(fm, session)

get_schema() -> StructType classmethod

Provides the schema for the L2GPrediction dataset.

Returns:

Name Type Description
StructType StructType

Schema for the L2GPrediction dataset

Source code in src/gentropy/dataset/l2g_prediction.py
32
33
34
35
36
37
38
39
@classmethod
def get_schema(cls: type[L2GPrediction]) -> StructType:
    """Return the Spark schema describing the L2GPrediction dataset.

    Returns:
        StructType: Schema for the L2GPrediction dataset
    """
    schema = parse_spark_schema("l2g_predictions.json")
    return schema

to_disease_target_evidence(study_locus: StudyLocus, study_index: StudyIndex, l2g_threshold: float = 0.05) -> DataFrame

Convert locus to gene predictions to disease target evidence.

Parameters:

Name Type Description Default
study_locus StudyLocus

Study locus dataset

required
study_index StudyIndex

Study index dataset

required
l2g_threshold float

Threshold to consider a gene as a target. Defaults to 0.05.

0.05

Returns:

Name Type Description
DataFrame DataFrame

Disease target evidence

Source code in src/gentropy/dataset/l2g_prediction.py
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
def to_disease_target_evidence(
    self: L2GPrediction,
    study_locus: StudyLocus,
    study_index: StudyIndex,
    l2g_threshold: float = 0.05,
) -> DataFrame:
    """Shape locus-to-gene predictions into disease/target evidence rows.

    Args:
        study_locus (StudyLocus): Study locus dataset
        study_index (StudyIndex): Study index dataset
        l2g_threshold (float): Threshold to consider a gene as a target. Defaults to 0.05.

    Returns:
        DataFrame: Disease target evidence
    """
    datasource_id = "gwas_credible_sets"
    datatype_id = "genetic_association"

    # Keep only predictions scoring at or above the threshold:
    confident_predictions = self.df.filter(f.col("score") >= l2g_threshold)

    # Resolve each credible set to its study:
    with_study = confident_predictions.join(
        study_locus.df.select("studyLocusId", "studyId"),
        on="studyLocusId",
        how="inner",
    )

    # Resolve each study to its mapped diseases:
    with_diseases = with_study.join(
        study_index.df.select("studyId", "diseaseIds"),
        on="studyId",
        how="inner",
    )

    # One evidence row per (gene, disease) pair:
    return with_diseases.select(
        f.lit(datatype_id).alias("datatypeId"),
        f.lit(datasource_id).alias("datasourceId"),
        f.col("geneId").alias("targetFromSourceId"),
        f.explode(f.col("diseaseIds")).alias("diseaseFromSourceMappedId"),
        f.col("score").alias("resourceScore"),
        "studyLocusId",
    )

Schema

root
 |-- studyLocusId: string (nullable = false)
 |-- geneId: string (nullable = false)
 |-- score: double (nullable = false)
 |-- locusToGeneFeatures: map (nullable = true)
 |    |-- key: string
 |    |-- value: float (valueContainsNull = true)