Skip to content

L2G Gold Standard

gentropy.datasource.open_targets.l2g_gold_standard.OpenTargetsL2GGoldStandard

Parser for OTGenetics locus to gene gold standards curation.

The curation is processed to generate a dataset with two labels:
  • Gold Standard Positive (GSP): When the lead variant is part of a curated list of GWAS loci with known gene-trait associations.
  • Gold Standard Negative (GSN): When the lead variant is not part of a curated list of GWAS loci with known gene-trait associations but is in the vicinity of a gene's TSS.
Source code in src/gentropy/datasource/open_targets/l2g_gold_standard.py
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
class OpenTargetsL2GGoldStandard:
    """Parser for OTGenetics locus to gene gold standards curation.

    The curation is processed to generate a dataset with 2 labels:
        - Gold Standard Positive (GSP): When the lead variant is part of a curated list of GWAS loci with known gene-trait associations.
        - Gold Standard Negative (GSN): When the lead variant is not part of a curated list of GWAS loci with known gene-trait associations but is in the vicinity of a gene's TSS.
    """

    # Inclusive upper bound on the V2G `distance` value for a gene to be paired
    # with a curated lead variant (the 500kb window mentioned in the docstrings).
    LOCUS_TO_GENE_WINDOW = 500_000

    @classmethod
    def parse_positive_curation(
        cls: type[OpenTargetsL2GGoldStandard], gold_standard_curation: DataFrame
    ) -> DataFrame:
        """Parse positive set from gold standard curation.

        Keeps the curated associations whose `gold_standard_info.highest_confidence` is
        "High" or "Medium", derives the study, gene and variant identifiers, and collapses
        repeated (studyLocusId, studyId, variantId, geneId) combinations into a single row.

        Args:
            gold_standard_curation (DataFrame): Gold standard curation dataframe

        Returns:
            DataFrame: Positive set with columns `studyLocusId`, `studyId`, `variantId`, `geneId` and `sources` (the distinct `metadata.set_label` values of the collapsed rows)
        """
        return (
            gold_standard_curation.filter(
                f.col("gold_standard_info.highest_confidence").isin(["High", "Medium"])
            )
            .select(
                f.col("association_info.otg_id").alias("studyId"),
                f.col("gold_standard_info.gene_id").alias("geneId"),
                # variantId is built as chromosome_position_reference_alternative (GRCh38 coordinates)
                f.concat_ws(
                    "_",
                    f.col("sentinel_variant.locus_GRCh38.chromosome"),
                    f.col("sentinel_variant.locus_GRCh38.position"),
                    f.col("sentinel_variant.alleles.reference"),
                    f.col("sentinel_variant.alleles.alternative"),
                ).alias("variantId"),
                f.col("metadata.set_label").alias("source"),
            )
            .withColumn(
                "studyLocusId",
                StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")),
            )
            # the same association can be reported by several curation sets: keep one row and list them
            .groupBy("studyLocusId", "studyId", "variantId", "geneId")
            .agg(f.collect_set("source").alias("sources"))
        )

    @classmethod
    def expand_gold_standard_with_negatives(
        cls: type[OpenTargetsL2GGoldStandard], positive_set: DataFrame, v2g: V2G
    ) -> DataFrame:
        """Create full set of positive and negative evidence of locus to gene associations.

        Every row of the positive set is left-joined by `variantId` to the V2G genes whose
        `distance` is at most `LOCUS_TO_GENE_WINDOW` (500kb). A joined gene that differs from
        the curated gene of that row is labelled as negative evidence; a joined gene equal to
        the curated gene, or a row without any joined gene, is labelled as positive evidence.

        Args:
            positive_set (DataFrame): Positive set from curation
            v2g (V2G): Variant to gene dataset to bring distance between a variant and a gene's TSS

        Returns:
            DataFrame: Full set of positive and negative evidence of locus to gene associations. Carries the remaining columns of `positive_set` plus `goldStandardSet` and the relabelled `geneId`
        """
        # NOTE(review): when a variant has genes within the window but the curated gene is
        # not one of them, all joined rows are labelled negative and the curated gene itself
        # is not emitted. Confirm this is intended.
        return (
            positive_set.withColumnRenamed("geneId", "curated_geneId")
            .join(
                v2g.df.selectExpr(
                    "variantId", "geneId as non_curated_geneId", "distance"
                ).filter(f.col("distance") <= cls.LOCUS_TO_GENE_WINDOW),
                on="variantId",
                how="left",
            )
            .withColumn(
                "goldStandardSet",
                f.when(
                    (f.col("curated_geneId") == f.col("non_curated_geneId"))
                    # to keep the positives that are outside the v2g dataset
                    | (f.col("non_curated_geneId").isNull()),
                    f.lit(L2GGoldStandard.GS_POSITIVE_LABEL),
                ).otherwise(L2GGoldStandard.GS_NEGATIVE_LABEL),
            )
            .withColumn(
                "geneId",
                # positives report the curated gene, negatives the neighbouring gene from V2G
                f.when(
                    f.col("goldStandardSet") == L2GGoldStandard.GS_POSITIVE_LABEL,
                    f.col("curated_geneId"),
                ).otherwise(f.col("non_curated_geneId")),
            )
            .drop("distance", "curated_geneId", "non_curated_geneId")
        )

    @classmethod
    def as_l2g_gold_standard(
        cls: type[OpenTargetsL2GGoldStandard],
        gold_standard_curation: DataFrame,
        v2g: V2G,
    ) -> L2GGoldStandard:
        """Initialise L2GGoldStandard from source dataset.

        Chains `parse_positive_curation` and `expand_gold_standard_with_negatives`.

        Args:
            gold_standard_curation (DataFrame): Gold standard curation dataframe, extracted from https://github.com/opentargets/genetics-gold-standards
            v2g (V2G): Variant to gene dataset to bring distance between a variant and a gene's TSS

        Returns:
            L2GGoldStandard: L2G Gold Standard dataset. False negatives have not yet been removed.
        """
        return L2GGoldStandard(
            _df=cls.parse_positive_curation(gold_standard_curation).transform(
                cls.expand_gold_standard_with_negatives, v2g
            ),
            _schema=L2GGoldStandard.get_schema(),
        )

as_l2g_gold_standard(gold_standard_curation: DataFrame, v2g: V2G) -> L2GGoldStandard classmethod

Initialise L2GGoldStandard from source dataset.

Parameters:

Name Type Description Default
gold_standard_curation DataFrame

Gold standard curation dataframe, extracted from https://github.com/opentargets/genetics-gold-standards

required
v2g V2G

Variant to gene dataset to bring distance between a variant and a gene's TSS

required

Returns:

Name Type Description
L2GGoldStandard L2GGoldStandard

L2G Gold Standard dataset. False negatives have not yet been removed.

Source code in src/gentropy/datasource/open_targets/l2g_gold_standard.py
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
@classmethod
def as_l2g_gold_standard(
    cls: type[OpenTargetsL2GGoldStandard],
    gold_standard_curation: DataFrame,
    v2g: V2G,
) -> L2GGoldStandard:
    """Build the L2G gold standard dataset from the raw curation.

    The curation is first reduced to its positive set, which is then expanded with
    the negative evidence derived from the variant to gene dataset.

    Args:
        gold_standard_curation (DataFrame): Gold standard curation dataframe, extracted from https://github.com/opentargets/genetics-gold-standards
        v2g (V2G): Variant to gene dataset to bring distance between a variant and a gene's TSS

    Returns:
        L2GGoldStandard: L2G Gold Standard dataset. False negatives have not yet been removed.
    """
    curated_positives = cls.parse_positive_curation(gold_standard_curation)
    labelled_evidence = cls.expand_gold_standard_with_negatives(curated_positives, v2g)
    return L2GGoldStandard(
        _df=labelled_evidence,
        _schema=L2GGoldStandard.get_schema(),
    )

expand_gold_standard_with_negatives(positive_set: DataFrame, v2g: V2G) -> DataFrame classmethod

Create full set of positive and negative evidence of locus to gene associations.

Negative evidence consists of all genes within a window of 500kb of the lead variant that are not in the positive set.

Parameters:

Name Type Description Default
positive_set DataFrame

Positive set from curation

required
v2g V2G

Variant to gene dataset to bring distance between a variant and a gene's TSS

required

Returns:

Name Type Description
DataFrame DataFrame

Full set of positive and negative evidence of locus to gene associations

Source code in src/gentropy/datasource/open_targets/l2g_gold_standard.py
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
@classmethod
def expand_gold_standard_with_negatives(
    cls: type[OpenTargetsL2GGoldStandard], positive_set: DataFrame, v2g: V2G
) -> DataFrame:
    """Create full set of positive and negative evidence of locus to gene associations.

    Every row of the positive set is left-joined by `variantId` to the V2G genes whose
    `distance` is at most `cls.LOCUS_TO_GENE_WINDOW`. A joined gene that differs from the
    curated gene of that row is labelled as negative evidence; a joined gene equal to the
    curated gene, or a row without any joined gene, is labelled as positive evidence.

    Args:
        positive_set (DataFrame): Positive set from curation
        v2g (V2G): Variant to gene dataset to bring distance between a variant and a gene's TSS

    Returns:
        DataFrame: Full set of positive and negative evidence of locus to gene associations. Carries the remaining columns of `positive_set` plus `goldStandardSet` and the relabelled `geneId`
    """
    # NOTE(review): when a variant has genes within the window but the curated gene is
    # not one of them, all joined rows are labelled negative and the curated gene itself
    # is not emitted. Confirm this is intended.
    return (
        positive_set.withColumnRenamed("geneId", "curated_geneId")
        .join(
            v2g.df.selectExpr(
                "variantId", "geneId as non_curated_geneId", "distance"
            ).filter(f.col("distance") <= cls.LOCUS_TO_GENE_WINDOW),
            on="variantId",
            how="left",
        )
        .withColumn(
            "goldStandardSet",
            f.when(
                (f.col("curated_geneId") == f.col("non_curated_geneId"))
                # to keep the positives that are outside the v2g dataset
                | (f.col("non_curated_geneId").isNull()),
                f.lit(L2GGoldStandard.GS_POSITIVE_LABEL),
            ).otherwise(L2GGoldStandard.GS_NEGATIVE_LABEL),
        )
        .withColumn(
            "geneId",
            # positives report the curated gene, negatives the neighbouring gene from V2G
            f.when(
                f.col("goldStandardSet") == L2GGoldStandard.GS_POSITIVE_LABEL,
                f.col("curated_geneId"),
            ).otherwise(f.col("non_curated_geneId")),
        )
        .drop("distance", "curated_geneId", "non_curated_geneId")
    )

parse_positive_curation(gold_standard_curation: DataFrame) -> DataFrame classmethod

Parse positive set from gold standard curation.

Parameters:

Name Type Description Default
gold_standard_curation DataFrame

Gold standard curation dataframe

required

Returns:

Name Type Description
DataFrame DataFrame

Positive set

Source code in src/gentropy/datasource/open_targets/l2g_gold_standard.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
@classmethod
def parse_positive_curation(
    cls: type[OpenTargetsL2GGoldStandard], gold_standard_curation: DataFrame
) -> DataFrame:
    """Parse positive set from gold standard curation.

    Keeps the curated associations whose `gold_standard_info.highest_confidence` is
    "High" or "Medium", derives the study, gene and variant identifiers, and collapses
    repeated (studyLocusId, studyId, variantId, geneId) combinations into a single row.

    Args:
        gold_standard_curation (DataFrame): Gold standard curation dataframe

    Returns:
        DataFrame: Positive set with columns `studyLocusId`, `studyId`, `variantId`, `geneId` and `sources` (the distinct `metadata.set_label` values of the collapsed rows)
    """
    return (
        gold_standard_curation.filter(
            f.col("gold_standard_info.highest_confidence").isin(["High", "Medium"])
        )
        .select(
            f.col("association_info.otg_id").alias("studyId"),
            f.col("gold_standard_info.gene_id").alias("geneId"),
            # variantId is built as chromosome_position_reference_alternative (GRCh38 coordinates)
            f.concat_ws(
                "_",
                f.col("sentinel_variant.locus_GRCh38.chromosome"),
                f.col("sentinel_variant.locus_GRCh38.position"),
                f.col("sentinel_variant.alleles.reference"),
                f.col("sentinel_variant.alleles.alternative"),
            ).alias("variantId"),
            f.col("metadata.set_label").alias("source"),
        )
        .withColumn(
            "studyLocusId",
            StudyLocus.assign_study_locus_id(f.col("studyId"), f.col("variantId")),
        )
        # the same association can be reported by several curation sets: keep one row and list them
        .groupBy("studyLocusId", "studyId", "variantId", "geneId")
        .agg(f.collect_set("source").alias("sources"))
    )