Skip to content

L2G Gold Standard

gentropy.datasource.open_targets.l2g_gold_standard.OpenTargetsL2GGoldStandard

Parser for OTGenetics locus to gene gold standards curation.

The curation is processed to generate a dataset with 2 labels:
  • Gold Standard Positive (GSP): When the lead variant is part of a curated list of GWAS loci with known gene-trait associations.
  • Gold Standard Negative (GSN): When the lead variant is not part of a curated list of GWAS loci with known gene-trait associations but is in the vicinity of a gene's TSS.
Source code in src/gentropy/datasource/open_targets/l2g_gold_standard.py
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
class OpenTargetsL2GGoldStandard:
    """Build the L2G gold standard from the OTGenetics locus-to-gene curation.

    Every row of the resulting dataset carries one of 2 labels:
        - Gold Standard Positive (GSP): the (lead variant, gene) pair comes from the curated list of GWAS loci with known gene-trait associations.
        - Gold Standard Negative (GSN): the gene is not the curated one for that lead variant, but it sits within the TSS distance window of the variant.
    """

    # Upper bound applied to `distanceFromTss` when collecting candidate genes around a lead variant (500kb).
    LOCUS_TO_GENE_WINDOW = 500_000

    @classmethod
    def parse_positive_curation(
        cls: Type[OpenTargetsL2GGoldStandard], gold_standard_curation: DataFrame
    ) -> DataFrame:
        """Extract the positive (studyLocus, gene) pairs from the curation.

        Only curation entries whose highest confidence is "High" or "Medium" are kept.
        The variant identifier is assembled as chromosome_position_reference_alternative
        from the GRCh38 sentinel variant fields.

        Args:
            gold_standard_curation (DataFrame): Gold standard curation dataframe

        Returns:
            DataFrame: One row per studyLocusId/studyId/variantId/geneId with the distinct curation set labels collected in `sources`
        """
        confident_entries = gold_standard_curation.filter(
            f.col("gold_standard_info.highest_confidence").isin(["High", "Medium"])
        )
        variant_id = f.concat_ws(
            "_",
            f.col("sentinel_variant.locus_GRCh38.chromosome"),
            f.col("sentinel_variant.locus_GRCh38.position"),
            f.col("sentinel_variant.alleles.reference"),
            f.col("sentinel_variant.alleles.alternative"),
        ).alias("variantId")
        curated_pairs = confident_entries.select(
            f.col("association_info.otg_id").alias("studyId"),
            f.col("gold_standard_info.gene_id").alias("geneId"),
            variant_id,
            f.col("metadata.set_label").alias("source"),
        )
        # The locus identifier is derived from the study and the lead variant.
        with_locus_id = curated_pairs.withColumn(
            "studyLocusId",
            StudyLocus.assign_study_locus_id(["studyId", "variantId"]),
        )
        # The same pair can be reported by several curation sets: keep one row per pair.
        grouped = with_locus_id.groupBy("studyLocusId", "studyId", "variantId", "geneId")
        return grouped.agg(f.collect_set("source").alias("sources"))

    @classmethod
    def expand_gold_standard_with_negatives(
        cls: Type[OpenTargetsL2GGoldStandard],
        positive_set: DataFrame,
        variant_index: VariantIndex,
    ) -> DataFrame:
        """Add negative evidence to the curated positive set.

        Each positive row is left-joined to the genes reported by the variant index for
        the same variant with `distanceFromTss` not exceeding `LOCUS_TO_GENE_WINDOW` (500kb).
        Joined genes that differ from the curated gene become negatives.

        Args:
            positive_set (DataFrame): Positive set from curation
            variant_index (VariantIndex): Variant index to get distance to gene

        Returns:
            DataFrame: Positive and negative locus to gene pairs, labelled in `goldStandardSet`
        """
        curated = positive_set.withColumnRenamed("geneId", "curated_geneId")
        genes_in_window = (
            variant_index.get_distance_to_gene()
            .selectExpr(
                "variantId",
                "targetId as non_curated_geneId",
                "distanceFromTss",
            )
            .filter(f.col("distanceFromTss") <= cls.LOCUS_TO_GENE_WINDOW)
        )
        candidates = curated.join(genes_in_window, on="variantId", how="left")

        same_gene = f.col("curated_geneId") == f.col("non_curated_geneId")
        # A null joined gene means the variant found no match in the variant index:
        # the curated positive is kept as is.
        unmatched_variant = f.col("non_curated_geneId").isNull()
        labelled = candidates.withColumn(
            "goldStandardSet",
            f.when(
                same_gene | unmatched_variant,
                f.lit(L2GGoldStandard.GS_POSITIVE_LABEL),
            ).otherwise(L2GGoldStandard.GS_NEGATIVE_LABEL),
        )

        # Positives report the curated gene, negatives the neighbouring gene.
        gene_id = f.when(
            f.col("goldStandardSet") == L2GGoldStandard.GS_POSITIVE_LABEL,
            f.col("curated_geneId"),
        ).otherwise(f.col("non_curated_geneId"))
        return labelled.withColumn("geneId", gene_id).drop(
            "distanceFromTss", "curated_geneId", "non_curated_geneId"
        )

    @classmethod
    def as_l2g_gold_standard(
        cls: type[OpenTargetsL2GGoldStandard],
        gold_standard_curation: DataFrame,
        variant_index: VariantIndex,
    ) -> L2GGoldStandard:
        """Build an L2GGoldStandard dataset from the raw curation.

        The curation is first reduced to its positive set, which is then expanded with
        negatives taken from the variant index.

        Args:
            gold_standard_curation (DataFrame): Gold standard curation dataframe, extracted from https://github.com/opentargets/genetics-gold-standards
            variant_index (VariantIndex): Variant index providing the variant to gene TSS distances used to pick negatives

        Returns:
            L2GGoldStandard: L2G Gold Standard dataset. False negatives have not yet been removed.
        """
        positives = cls.parse_positive_curation(gold_standard_curation)
        positives_and_negatives = positives.transform(
            cls.expand_gold_standard_with_negatives, variant_index
        )
        return L2GGoldStandard(
            _df=positives_and_negatives,
            _schema=L2GGoldStandard.get_schema(),
        )

as_l2g_gold_standard(gold_standard_curation: DataFrame, variant_index: VariantIndex) -> L2GGoldStandard classmethod

Initialise L2GGoldStandard from source dataset.

Parameters:

Name Type Description Default
gold_standard_curation DataFrame

Gold standard curation dataframe, extracted from https://github.com/opentargets/genetics-gold-standards

required
variant_index VariantIndex

Variant index used to obtain the distance between a variant and a gene's TSS

required

Returns:

Name Type Description
L2GGoldStandard L2GGoldStandard

L2G Gold Standard dataset. False negatives have not yet been removed.

Source code in src/gentropy/datasource/open_targets/l2g_gold_standard.py
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
@classmethod
def as_l2g_gold_standard(
    cls: type[OpenTargetsL2GGoldStandard],
    gold_standard_curation: DataFrame,
    variant_index: VariantIndex,
) -> L2GGoldStandard:
    """Build an L2GGoldStandard dataset from the raw curation.

    The curation is first reduced to its positive set, which is then expanded with
    negatives taken from the variant index.

    Args:
        gold_standard_curation (DataFrame): Gold standard curation dataframe, extracted from https://github.com/opentargets/genetics-gold-standards
        variant_index (VariantIndex): Variant index providing the variant to gene TSS distances used to pick negatives

    Returns:
        L2GGoldStandard: L2G Gold Standard dataset. False negatives have not yet been removed.
    """
    positives = cls.parse_positive_curation(gold_standard_curation)
    positives_and_negatives = positives.transform(
        cls.expand_gold_standard_with_negatives, variant_index
    )
    return L2GGoldStandard(
        _df=positives_and_negatives,
        _schema=L2GGoldStandard.get_schema(),
    )

expand_gold_standard_with_negatives(positive_set: DataFrame, variant_index: VariantIndex) -> DataFrame classmethod

Create full set of positive and negative evidence of locus to gene associations.

Negative evidence consists of all genes within a window of 500kb of the lead variant that are not in the positive set.

Parameters:

Name Type Description Default
positive_set DataFrame

Positive set from curation

required
variant_index VariantIndex

Variant index to get distance to gene

required

Returns:

Name Type Description
DataFrame DataFrame

Full set of positive and negative evidence of locus to gene associations

Source code in src/gentropy/datasource/open_targets/l2g_gold_standard.py
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
@classmethod
def expand_gold_standard_with_negatives(
    cls: Type[OpenTargetsL2GGoldStandard],
    positive_set: DataFrame,
    variant_index: VariantIndex,
) -> DataFrame:
    """Add negative evidence to the curated positive set.

    Each positive row is left-joined to the genes reported by the variant index for
    the same variant with `distanceFromTss` not exceeding `LOCUS_TO_GENE_WINDOW` (500kb).
    Joined genes that differ from the curated gene become negatives.

    Args:
        positive_set (DataFrame): Positive set from curation
        variant_index (VariantIndex): Variant index to get distance to gene

    Returns:
        DataFrame: Positive and negative locus to gene pairs, labelled in `goldStandardSet`
    """
    curated = positive_set.withColumnRenamed("geneId", "curated_geneId")
    genes_in_window = (
        variant_index.get_distance_to_gene()
        .selectExpr(
            "variantId",
            "targetId as non_curated_geneId",
            "distanceFromTss",
        )
        .filter(f.col("distanceFromTss") <= cls.LOCUS_TO_GENE_WINDOW)
    )
    candidates = curated.join(genes_in_window, on="variantId", how="left")

    same_gene = f.col("curated_geneId") == f.col("non_curated_geneId")
    # A null joined gene means the variant found no match in the variant index:
    # the curated positive is kept as is.
    unmatched_variant = f.col("non_curated_geneId").isNull()
    labelled = candidates.withColumn(
        "goldStandardSet",
        f.when(
            same_gene | unmatched_variant,
            f.lit(L2GGoldStandard.GS_POSITIVE_LABEL),
        ).otherwise(L2GGoldStandard.GS_NEGATIVE_LABEL),
    )

    # Positives report the curated gene, negatives the neighbouring gene.
    gene_id = f.when(
        f.col("goldStandardSet") == L2GGoldStandard.GS_POSITIVE_LABEL,
        f.col("curated_geneId"),
    ).otherwise(f.col("non_curated_geneId"))
    return labelled.withColumn("geneId", gene_id).drop(
        "distanceFromTss", "curated_geneId", "non_curated_geneId"
    )

parse_positive_curation(gold_standard_curation: DataFrame) -> DataFrame classmethod

Parse positive set from gold standard curation.

Parameters:

Name Type Description Default
gold_standard_curation DataFrame

Gold standard curation dataframe

required

Returns:

Name Type Description
DataFrame DataFrame

Positive set

Source code in src/gentropy/datasource/open_targets/l2g_gold_standard.py
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
@classmethod
def parse_positive_curation(
    cls: Type[OpenTargetsL2GGoldStandard], gold_standard_curation: DataFrame
) -> DataFrame:
    """Extract the positive (studyLocus, gene) pairs from the curation.

    Only curation entries whose highest confidence is "High" or "Medium" are kept.
    The variant identifier is assembled as chromosome_position_reference_alternative
    from the GRCh38 sentinel variant fields.

    Args:
        gold_standard_curation (DataFrame): Gold standard curation dataframe

    Returns:
        DataFrame: One row per studyLocusId/studyId/variantId/geneId with the distinct curation set labels collected in `sources`
    """
    confident_entries = gold_standard_curation.filter(
        f.col("gold_standard_info.highest_confidence").isin(["High", "Medium"])
    )
    variant_id = f.concat_ws(
        "_",
        f.col("sentinel_variant.locus_GRCh38.chromosome"),
        f.col("sentinel_variant.locus_GRCh38.position"),
        f.col("sentinel_variant.alleles.reference"),
        f.col("sentinel_variant.alleles.alternative"),
    ).alias("variantId")
    curated_pairs = confident_entries.select(
        f.col("association_info.otg_id").alias("studyId"),
        f.col("gold_standard_info.gene_id").alias("geneId"),
        variant_id,
        f.col("metadata.set_label").alias("source"),
    )
    # The locus identifier is derived from the study and the lead variant.
    with_locus_id = curated_pairs.withColumn(
        "studyLocusId",
        StudyLocus.assign_study_locus_id(["studyId", "variantId"]),
    )
    # The same pair can be reported by several curation sets: keep one row per pair.
    grouped = with_locus_id.groupBy("studyLocusId", "studyId", "variantId", "geneId")
    return grouped.agg(f.collect_set("source").alias("sources"))