Skip to content

L2G Gold Standard

gentropy.dataset.l2g_gold_standard.L2GGoldStandard dataclass

Bases: Dataset

L2G gold standard dataset.

Source code in src/gentropy/dataset/l2g_gold_standard.py
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
@dataclass
class L2GGoldStandard(Dataset):
    """L2G gold standard dataset."""

    # Interactions are only considered when their score is strictly above this value
    INTERACTION_THRESHOLD = 0.7
    # Values of the `goldStandardSet` column
    GS_POSITIVE_LABEL = "positive"
    GS_NEGATIVE_LABEL = "negative"

    @classmethod
    def from_otg_curation(
        cls: type[L2GGoldStandard],
        gold_standard_curation: DataFrame,
        study_locus_overlap: StudyLocusOverlap,
        variant_index: VariantIndex,
        interactions: DataFrame,
    ) -> L2GGoldStandard:
        """Initialise L2GGoldStandard from source dataset.

        The curation is parsed into a gold standard, loci that are not independent are
        filtered out and likely false negatives are removed, in that order.

        Args:
            gold_standard_curation (DataFrame): Gold standard curation dataframe, as consumed by `OpenTargetsL2GGoldStandard.as_l2g_gold_standard`
            study_locus_overlap (StudyLocusOverlap): Study locus overlap dataset to remove duplicated loci
            variant_index (VariantIndex): Dataset to bring distance between a variant and a gene's footprint
            interactions (DataFrame): Gene-gene interactions dataset to remove negative cases where the gene interacts with a positive gene

        Returns:
            L2GGoldStandard: L2G Gold Standard dataset
        """
        # Imported locally, presumably to avoid a circular import between modules
        from gentropy.datasource.open_targets.l2g_gold_standard import (
            OpenTargetsL2GGoldStandard,
        )

        interactions_df = cls.process_gene_interactions(interactions)

        return (
            OpenTargetsL2GGoldStandard.as_l2g_gold_standard(
                gold_standard_curation, variant_index
            )
            .filter_unique_associations(study_locus_overlap)
            .remove_false_negatives(interactions_df)
        )

    @classmethod
    def get_schema(cls: type[L2GGoldStandard]) -> StructType:
        """Provides the schema for the L2GGoldStandard dataset.

        Returns:
            StructType: Spark schema for the L2GGoldStandard dataset
        """
        return parse_spark_schema("l2g_gold_standard.json")

    @classmethod
    def process_gene_interactions(
        cls: type[L2GGoldStandard], interactions: DataFrame
    ) -> DataFrame:
        """Extract top scoring gene-gene interaction from the interactions dataset of the Platform.

        Args:
            interactions (DataFrame): Gene-gene interactions dataset from the Open Targets Platform

        Returns:
            DataFrame: Top scoring gene-gene interaction per pair of genes

        Examples:
            >>> interactions = spark.createDataFrame([("gene1", "gene2", 0.8), ("gene1", "gene2", 0.5), ("gene2", "gene3", 0.7)], ["targetA", "targetB", "scoring"])
            >>> L2GGoldStandard.process_gene_interactions(interactions).show()
            +-------+-------+-----+
            |geneIdA|geneIdB|score|
            +-------+-------+-----+
            |  gene1|  gene2|  0.8|
            |  gene2|  gene3|  0.7|
            +-------+-------+-----+
            <BLANKLINE>
        """
        return get_record_with_maximum_value(
            interactions,
            ["targetA", "targetB"],
            "scoring",
        ).selectExpr(
            "targetA as geneIdA",
            "targetB as geneIdB",
            "scoring as score",
        )

    def build_feature_matrix(
        self: L2GGoldStandard,
        full_feature_matrix: L2GFeatureMatrix,
        credible_set: StudyLocus,
    ) -> L2GFeatureMatrix:
        """Return a feature matrix for study loci in the gold standard.

        Args:
            full_feature_matrix (L2GFeatureMatrix): Feature matrix for all study loci to join on
            credible_set (StudyLocus): Full credible sets to annotate the feature matrix with variant and study IDs and perform the join

        Returns:
            L2GFeatureMatrix: Feature matrix for study loci in the gold standard
        """
        from gentropy.dataset.l2g_feature_matrix import L2GFeatureMatrix

        return L2GFeatureMatrix(
            # annotate every feature row with the study and variant of its locus
            _df=full_feature_matrix._df.join(
                credible_set.df.select("studyLocusId", "variantId", "studyId"),
                "studyLocusId",
                "left",
            )
            # keep only the (study, variant, gene) triplets present in the gold standard
            .join(
                f.broadcast(self.df.drop("studyLocusId", "sources")),
                on=["studyId", "variantId", "geneId"],
                how="inner",
            )
            .filter(f.col("isProteinCoding") == 1)
            .drop("studyId", "variantId")
            .distinct(),
            with_gold_standard=True,
        ).fill_na()

    def filter_unique_associations(
        self: L2GGoldStandard,
        study_locus_overlap: StudyLocusOverlap,
    ) -> L2GGoldStandard:
        """Refines the gold standard to filter out loci that are not independent.

        Rules:
        - If two loci point to the same gene, one positive and one negative, and have overlapping variants, we keep the positive one.
        - If two loci point to the same gene, both positive or negative, and have overlapping variants, we drop one.
        - If two loci point to different genes, and have overlapping variants, we keep both.

        Args:
            study_locus_overlap (StudyLocusOverlap): A dataset detailing variants that overlap between StudyLocus.

        Returns:
            L2GGoldStandard: L2GGoldStandard updated to exclude false negatives and redundant positives.
        """
        squared_overlaps = study_locus_overlap._convert_to_square_matrix()
        # A (study locus, variant) pair can overlap with several other loci. Keeping one
        # row per pair prevents the left join below from duplicating gold standard rows.
        overlapping_variants = squared_overlaps.df.select(
            "leftStudyLocusId", "tagVariantId"
        ).distinct()
        unique_associations = (
            self.df.alias("left")
            # identify all the study loci that point to the same gene
            .withColumn(
                "sl_same_gene",
                f.collect_set("studyLocusId").over(Window.partitionBy("geneId")),
            )
            # identify all the study loci that have an overlapping variant
            .join(
                overlapping_variants.alias("right"),
                (f.col("left.studyLocusId") == f.col("right.leftStudyLocusId"))
                & (f.col("left.variantId") == f.col("right.tagVariantId")),
                "left",
            )
            .withColumn("overlaps", f.col("right.tagVariantId").isNotNull())
            # drop redundant rows: where the variantid overlaps and the gene is "explained" by more than one study locus
            # NOTE(review): this drops every such row; it does not pick a survivor
            # as the rules in the docstring suggest - confirm the intended behaviour
            .filter(~((f.size("sl_same_gene") > 1) & f.col("overlaps")))
            .select(*self.df.columns)
        )
        return L2GGoldStandard(_df=unique_associations, _schema=self.get_schema())

    def remove_false_negatives(
        self: L2GGoldStandard,
        interactions_df: DataFrame,
    ) -> L2GGoldStandard:
        """Refines the gold standard to remove negative gold standard instances where the gene interacts with a positive gene.

        A gene is considered to interact with a positive gene when it has an interaction,
        in either direction and with a score strictly above `INTERACTION_THRESHOLD`, with a
        different gene that is labelled as positive anywhere in the gold standard.
        Positive rows, and negative rows without such a partner, are always kept.

        Args:
            interactions_df (DataFrame): Top scoring gene-gene interaction per pair of genes

        Returns:
            L2GGoldStandard: A refined set of locus-to-gene associations with increased reliability, having excluded loci that were likely false negatives due to gene-gene interaction confounding.
        """
        squared_interactions = (
            # make the interactions symmetric so that every gene appears as geneIdA
            interactions_df.unionByName(
                interactions_df.selectExpr(
                    "geneIdB as geneIdA", "geneIdA as geneIdB", "score"
                )
            )
            .filter(f.col("score") > self.INTERACTION_THRESHOLD)
            # self-interactions are discarded here, before any join, so that they can
            # never cause a gold standard row to be lost
            .filter(f.col("geneIdA") != f.col("geneIdB"))
        )
        positive_genes = (
            self.df.filter(f.col("goldStandardSet") == self.GS_POSITIVE_LABEL)
            .select(f.col("geneId").alias("geneIdB"))
            .distinct()
        )
        # one row per gene that has at least one positive interaction partner
        genes_with_positive_partner = (
            squared_interactions.join(positive_genes, on="geneIdB", how="left_semi")
            .select(
                f.col("geneIdA").alias("geneId"),
                f.lit(True).alias("interactsWithPositive"),
            )
            .distinct()
        )
        df = (
            # the right side has unique gene ids, so this join cannot multiply rows
            self.df.join(genes_with_positive_partner, on="geneId", how="left")
            # the flag is null for genes without a positive partner; testing it with
            # isNotNull keeps the condition free of three-valued logic surprises
            .filter(
                ~(
                    (f.col("goldStandardSet") == self.GS_NEGATIVE_LABEL)
                    & f.col("interactsWithPositive").isNotNull()
                )
            )
            .select(*self.df.columns)
            .distinct()
        )
        return L2GGoldStandard(_df=df, _schema=self.get_schema())

build_feature_matrix(full_feature_matrix: L2GFeatureMatrix, credible_set: StudyLocus) -> L2GFeatureMatrix

Return a feature matrix for study loci in the gold standard.

Parameters:

Name Type Description Default
full_feature_matrix L2GFeatureMatrix

Feature matrix for all study loci to join on

required
credible_set StudyLocus

Full credible sets to annotate the feature matrix with variant and study IDs and perform the join

required

Returns:

Name Type Description
L2GFeatureMatrix L2GFeatureMatrix

Feature matrix for study loci in the gold standard

Source code in src/gentropy/dataset/l2g_gold_standard.py
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
def build_feature_matrix(
    self: L2GGoldStandard,
    full_feature_matrix: L2GFeatureMatrix,
    credible_set: StudyLocus,
) -> L2GFeatureMatrix:
    """Build the feature matrix restricted to the gold standard associations.

    Args:
        full_feature_matrix (L2GFeatureMatrix): Feature matrix for all study loci to join on
        credible_set (StudyLocus): Full credible sets, used to annotate the feature matrix with variant and study IDs before the join

    Returns:
        L2GFeatureMatrix: Feature matrix for study loci in the gold standard
    """
    from gentropy.dataset.l2g_feature_matrix import L2GFeatureMatrix

    # study and variant identifiers of every locus
    locus_annotation = credible_set.df.select("studyLocusId", "variantId", "studyId")
    annotated_features = full_feature_matrix._df.join(
        locus_annotation, "studyLocusId", "left"
    )

    # gold standard rows, keyed by study, variant and gene
    gold_standard_labels = f.broadcast(self.df.drop("studyLocusId", "sources"))
    labelled_features = (
        annotated_features.join(
            gold_standard_labels,
            on=["studyId", "variantId", "geneId"],
            how="inner",
        )
        .filter(f.col("isProteinCoding") == 1)
        .drop("studyId", "variantId")
        .distinct()
    )

    gold_standard_matrix = L2GFeatureMatrix(
        _df=labelled_features,
        with_gold_standard=True,
    )
    return gold_standard_matrix.fill_na()

filter_unique_associations(study_locus_overlap: StudyLocusOverlap) -> L2GGoldStandard

Refines the gold standard to filter out loci that are not independent.

Rules:

- If two loci point to the same gene, one positive and one negative, and have overlapping variants, we keep the positive one.
- If two loci point to the same gene, both positive or negative, and have overlapping variants, we drop one.
- If two loci point to different genes, and have overlapping variants, we keep both.

Parameters:

Name Type Description Default
study_locus_overlap StudyLocusOverlap

A dataset detailing variants that overlap between StudyLocus.

required

Returns:

Name Type Description
L2GGoldStandard L2GGoldStandard

L2GGoldStandard updated to exclude false negatives and redundant positives.

Source code in src/gentropy/dataset/l2g_gold_standard.py
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
def filter_unique_associations(
    self: L2GGoldStandard,
    study_locus_overlap: StudyLocusOverlap,
) -> L2GGoldStandard:
    """Refines the gold standard to filter out loci that are not independent.

    Rules:
    - If two loci point to the same gene, one positive and one negative, and have overlapping variants, we keep the positive one.
    - If two loci point to the same gene, both positive or negative, and have overlapping variants, we drop one.
    - If two loci point to different genes, and have overlapping variants, we keep both.

    Args:
        study_locus_overlap (StudyLocusOverlap): A dataset detailing variants that overlap between StudyLocus.

    Returns:
        L2GGoldStandard: L2GGoldStandard updated to exclude false negatives and redundant positives.
    """
    squared_overlaps = study_locus_overlap._convert_to_square_matrix()
    # A (study locus, variant) pair can overlap with several other loci. Keeping one
    # row per pair prevents the left join below from duplicating gold standard rows.
    overlapping_variants = squared_overlaps.df.select(
        "leftStudyLocusId", "tagVariantId"
    ).distinct()
    unique_associations = (
        self.df.alias("left")
        # identify all the study loci that point to the same gene
        .withColumn(
            "sl_same_gene",
            f.collect_set("studyLocusId").over(Window.partitionBy("geneId")),
        )
        # identify all the study loci that have an overlapping variant
        .join(
            overlapping_variants.alias("right"),
            (f.col("left.studyLocusId") == f.col("right.leftStudyLocusId"))
            & (f.col("left.variantId") == f.col("right.tagVariantId")),
            "left",
        )
        .withColumn("overlaps", f.col("right.tagVariantId").isNotNull())
        # drop redundant rows: where the variantid overlaps and the gene is "explained" by more than one study locus
        # NOTE(review): this drops every such row; it does not pick a survivor
        # as the rules in the docstring suggest - confirm the intended behaviour
        .filter(~((f.size("sl_same_gene") > 1) & f.col("overlaps")))
        .select(*self.df.columns)
    )
    return L2GGoldStandard(_df=unique_associations, _schema=self.get_schema())

from_otg_curation(gold_standard_curation: DataFrame, study_locus_overlap: StudyLocusOverlap, variant_index: VariantIndex, interactions: DataFrame) -> L2GGoldStandard classmethod

Initialise L2GGoldStandard from source dataset.

Parameters:

Name Type Description Default
gold_standard_curation DataFrame

Gold standard curation dataframe, as consumed by OpenTargetsL2GGoldStandard.as_l2g_gold_standard

required
study_locus_overlap StudyLocusOverlap

Study locus overlap dataset to remove duplicated loci

required
variant_index VariantIndex

Dataset to bring distance between a variant and a gene's footprint

required
interactions DataFrame

Gene-gene interactions dataset to remove negative cases where the gene interacts with a positive gene

required

Returns:

Name Type Description
L2GGoldStandard L2GGoldStandard

L2G Gold Standard dataset

Source code in src/gentropy/dataset/l2g_gold_standard.py
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
@classmethod
def from_otg_curation(
    cls: type[L2GGoldStandard],
    gold_standard_curation: DataFrame,
    study_locus_overlap: StudyLocusOverlap,
    variant_index: VariantIndex,
    interactions: DataFrame,
) -> L2GGoldStandard:
    """Create a L2GGoldStandard from the curation and its supporting datasets.

    The curation is parsed into a gold standard, loci that are not independent are
    filtered out and likely false negatives are removed, in that order.

    Args:
        gold_standard_curation (DataFrame): Gold standard curation dataframe, as consumed by `OpenTargetsL2GGoldStandard.as_l2g_gold_standard`
        study_locus_overlap (StudyLocusOverlap): Study locus overlap dataset to remove duplicated loci
        variant_index (VariantIndex): Dataset to bring distance between a variant and a gene's footprint
        interactions (DataFrame): Gene-gene interactions dataset to remove negative cases where the gene interacts with a positive gene

    Returns:
        L2GGoldStandard: L2G Gold Standard dataset
    """
    from gentropy.datasource.open_targets.l2g_gold_standard import (
        OpenTargetsL2GGoldStandard,
    )

    top_interactions = cls.process_gene_interactions(interactions)

    curated_gold_standard = OpenTargetsL2GGoldStandard.as_l2g_gold_standard(
        gold_standard_curation, variant_index
    )
    independent_gold_standard = curated_gold_standard.filter_unique_associations(
        study_locus_overlap
    )
    return independent_gold_standard.remove_false_negatives(top_interactions)

get_schema() -> StructType classmethod

Provides the schema for the L2GGoldStandard dataset.

Returns:

Name Type Description
StructType StructType

Spark schema for the L2GGoldStandard dataset

Source code in src/gentropy/dataset/l2g_gold_standard.py
66
67
68
69
70
71
72
73
@classmethod
def get_schema(cls: type[L2GGoldStandard]) -> StructType:
    """Return the Spark schema that L2GGoldStandard datasets follow.

    Returns:
        StructType: Spark schema for the L2GGoldStandard dataset
    """
    schema_file = "l2g_gold_standard.json"
    return parse_spark_schema(schema_file)

process_gene_interactions(interactions: DataFrame) -> DataFrame classmethod

Extract top scoring gene-gene interaction from the interactions dataset of the Platform.

Parameters:

Name Type Description Default
interactions DataFrame

Gene-gene interactions dataset from the Open Targets Platform

required

Returns:

Name Type Description
DataFrame DataFrame

Top scoring gene-gene interaction per pair of genes

Examples:

>>> interactions = spark.createDataFrame([("gene1", "gene2", 0.8), ("gene1", "gene2", 0.5), ("gene2", "gene3", 0.7)], ["targetA", "targetB", "scoring"])
>>> L2GGoldStandard.process_gene_interactions(interactions).show()
+-------+-------+-----+
|geneIdA|geneIdB|score|
+-------+-------+-----+
|  gene1|  gene2|  0.8|
|  gene2|  gene3|  0.7|
+-------+-------+-----+
Source code in src/gentropy/dataset/l2g_gold_standard.py
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
@classmethod
def process_gene_interactions(
    cls: Type[L2GGoldStandard], interactions: DataFrame
) -> DataFrame:
    """Keep the highest scoring record of every gene pair in the Platform interactions dataset.

    Args:
        interactions (DataFrame): Gene-gene interactions dataset from the Open Targets Platform

    Returns:
        DataFrame: Top scoring gene-gene interaction per pair of genes

    Examples:
        >>> interactions = spark.createDataFrame([("gene1", "gene2", 0.8), ("gene1", "gene2", 0.5), ("gene2", "gene3", 0.7)], ["targetA", "targetB", "scoring"])
        >>> L2GGoldStandard.process_gene_interactions(interactions).show()
        +-------+-------+-----+
        |geneIdA|geneIdB|score|
        +-------+-------+-----+
        |  gene1|  gene2|  0.8|
        |  gene2|  gene3|  0.7|
        +-------+-------+-----+
        <BLANKLINE>
    """
    pair_columns = ["targetA", "targetB"]
    best_per_pair = get_record_with_maximum_value(interactions, pair_columns, "scoring")

    # expose the columns under the names used by the gold standard logic
    renamed_columns = (
        "targetA as geneIdA",
        "targetB as geneIdB",
        "scoring as score",
    )
    return best_per_pair.selectExpr(*renamed_columns)

remove_false_negatives(interactions_df: DataFrame) -> L2GGoldStandard

Refines the gold standard to remove negative gold standard instances where the gene interacts with a positive gene.

Parameters:

Name Type Description Default
interactions_df DataFrame

Top scoring gene-gene interaction per pair of genes

required

Returns:

Name Type Description
L2GGoldStandard L2GGoldStandard

A refined set of locus-to-gene associations with increased reliability, having excluded loci that were likely false negatives due to gene-gene interaction confounding.

Source code in src/gentropy/dataset/l2g_gold_standard.py
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
def remove_false_negatives(
    self: L2GGoldStandard,
    interactions_df: DataFrame,
) -> L2GGoldStandard:
    """Refines the gold standard to remove negative gold standard instances where the gene interacts with a positive gene.

    A gene is considered to interact with a positive gene when it has an interaction,
    in either direction and with a score strictly above `INTERACTION_THRESHOLD`, with a
    different gene that is labelled as positive anywhere in the gold standard.
    Positive rows, and negative rows without such a partner, are always kept.

    Args:
        interactions_df (DataFrame): Top scoring gene-gene interaction per pair of genes

    Returns:
        L2GGoldStandard: A refined set of locus-to-gene associations with increased reliability, having excluded loci that were likely false negatives due to gene-gene interaction confounding.
    """
    squared_interactions = (
        # make the interactions symmetric so that every gene appears as geneIdA
        interactions_df.unionByName(
            interactions_df.selectExpr(
                "geneIdB as geneIdA", "geneIdA as geneIdB", "score"
            )
        )
        .filter(f.col("score") > self.INTERACTION_THRESHOLD)
        # self-interactions are discarded here, before any join, so that they can
        # never cause a gold standard row to be lost
        .filter(f.col("geneIdA") != f.col("geneIdB"))
    )
    positive_genes = (
        self.df.filter(f.col("goldStandardSet") == self.GS_POSITIVE_LABEL)
        .select(f.col("geneId").alias("geneIdB"))
        .distinct()
    )
    # one row per gene that has at least one positive interaction partner
    genes_with_positive_partner = (
        squared_interactions.join(positive_genes, on="geneIdB", how="left_semi")
        .select(
            f.col("geneIdA").alias("geneId"),
            f.lit(True).alias("interactsWithPositive"),
        )
        .distinct()
    )
    df = (
        # the right side has unique gene ids, so this join cannot multiply rows
        self.df.join(genes_with_positive_partner, on="geneId", how="left")
        # the flag is null for genes without a positive partner; testing it with
        # isNotNull keeps the condition free of three-valued logic surprises
        .filter(
            ~(
                (f.col("goldStandardSet") == self.GS_NEGATIVE_LABEL)
                & f.col("interactsWithPositive").isNotNull()
            )
        )
        .select(*self.df.columns)
        .distinct()
    )
    return L2GGoldStandard(_df=df, _schema=self.get_schema())

Schema

root
 |-- studyLocusId: string (nullable = false)
 |-- variantId: string (nullable = false)
 |-- studyId: string (nullable = false)
 |-- geneId: string (nullable = false)
 |-- traitFromSourceMappedId: string (nullable = true)
 |-- goldStandardSet: string (nullable = false)
 |-- sources: array (nullable = true)
 |    |-- element: string (containsNull = true)