Skip to content

LDAnnotator

gentropy.method.ld.LDAnnotator

Class to annotate study loci with linkage disequilibrium (LD) information from gnomAD.

Source code in src/gentropy/method/ld.py
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
class LDAnnotator:
    """Annotate study loci with linkage disequilibrium (LD) information from GnomAD.

    The static helpers each build a Spark column expression; `ld_annotate` wires
    them together into the full annotation pipeline.
    """

    @staticmethod
    def _calculate_weighted_r_overall(ld_set: Column) -> Column:
        """Collapse the per-population r values of every tag variant into one weighted r2.

        Args:
            ld_set (Column): LD set whose elements carry `tagVariantId` and `rValues`;
                every `rValues` entry is read for `r` and `relativeSampleSize`

        Returns:
            Column: LD set whose elements hold `tagVariantId` and `r2Overall`
        """

        def _accumulate(running_total: Column, r_value: Column) -> Column:
            """Add the sample-size weighted r2 of one population to the running total."""
            weighted_r2 = f.pow(r_value["r"], 2) * r_value["relativeSampleSize"]
            # A null r or relativeSampleSize makes the product null; coalescing to 0.0
            # keeps that entry from nulling out the whole sum.
            return running_total + f.coalesce(weighted_r2, f.lit(0.0))

        def _summarise(tag: Column) -> Column:
            """Rebuild one LD set element with the aggregated r2 in place of rValues."""
            r2_overall = f.aggregate(tag["rValues"], f.lit(0.0), _accumulate)
            return f.struct(
                tag["tagVariantId"].alias("tagVariantId"),
                r2_overall.alias("r2Overall"),
            )

        return f.transform(ld_set, _summarise)

    @staticmethod
    def _add_population_size(ld_set: Column, study_populations: Column) -> Column:
        """Attach the study's relative sample size to every rValues entry of the LD set.

        Args:
            ld_set (Column): LD set whose elements carry `tagVariantId` and `rValues`
            study_populations (Column): Study populations, read for the `ldPopulation`
                and `relativeSampleSize` fields

        Returns:
            Column: LD set in which each `rValues` entry holds `population`, `r` and
                `relativeSampleSize`
        """
        # Lookup table: population label -> relative sample size in the study
        size_by_population = f.map_from_arrays(
            study_populations["ldPopulation"],
            study_populations["relativeSampleSize"],
        )

        def _with_sample_size(r_value: Column) -> Column:
            """Rebuild one rValues entry, adding the looked-up relative sample size."""
            population = r_value["population"]
            return f.struct(
                population.alias("population"),
                r_value["r"].alias("r"),
                size_by_population[population].alias("relativeSampleSize"),
            )

        def _annotate_tag(tag: Column) -> Column:
            """Rebuild one LD set element with its rValues entries annotated."""
            annotated_r_values = f.transform(tag["rValues"], _with_sample_size)
            return f.struct(
                tag["tagVariantId"].alias("tagVariantId"),
                annotated_r_values.alias("rValues"),
            )

        return f.transform(ld_set, _annotate_tag)

    @staticmethod
    def _qc_unresolved_ld(ld_set: Column, quality_controls: Column) -> Column:
        """Mark associations whose LD set is null with the UNRESOLVED_LD quality flag.

        Args:
            ld_set (Column): LD set
            quality_controls (Column): Quality controls

        Returns:
            Column: Quality controls as returned by `StudyLocus.update_quality_flag`
                for the UNRESOLVED_LD check
        """
        ld_is_missing = ld_set.isNull()
        return StudyLocus.update_quality_flag(
            quality_controls,
            ld_is_missing,
            StudyLocusQualityCheck.UNRESOLVED_LD,
        )

    @staticmethod
    def _rescue_lead_variant(ld_set: Column, variant_id: Column) -> Column:
        """Fall back to a lead-only LD set when no LD information is available.

        When the LD set is null or empty and the lead variant is known, the result is
        a single-element LD set containing the lead variant with an `r2Overall` of 1.
        In every other case the LD set is passed through untouched.

        Args:
            ld_set (Column): LD set
            variant_id (Column): Variant ID

        Returns:
            Column: The original LD set, or a one-element LD set holding the lead variant
        """
        ld_unavailable = ld_set.isNull() | (f.size(ld_set) == 0)
        lead_only_ld_set = f.array(
            f.struct(
                variant_id.alias("tagVariantId"),
                f.lit(1).alias("r2Overall"),
            )
        )
        return f.when(
            (ld_unavailable & variant_id.isNotNull()), lead_only_ld_set
        ).otherwise(ld_set)

    @classmethod
    def ld_annotate(
        cls: type[LDAnnotator],
        associations: StudyLocus,
        studies: StudyIndex,
        ld_index: LDIndex,
    ) -> StudyLocus:
        """Annotate a set of studyLocus with linkage disequilibrium (LD) information.

        Steps, in order:
            1. Discard any pre-existing ldSet column from the associations
            2. Left join the population structure of each study from the study index
            3. Left join the LD index on variant and chromosome
            4. Add the study's relative sample size to each rValues entry in the ldSet
            5. Aggregate the r values into an overall r2 weighted by ancestry proportions
            6. Flag associations left without an ldSet as UNRESOLVED_LD
            7. Rescue the lead variant as the only ldSet member when no LD information
               is available but the lead variant is

        Args:
            associations (StudyLocus): Dataset to be LD annotated
            studies (StudyIndex): Dataset with study information
            ld_index (LDIndex): Dataset with LD information for every variant present in LD matrix

        Returns:
            StudyLocus: Associations with the ldSet column populated, after
                `_qc_no_population` has been applied to the result.
        """
        has_population_structure = f.col("ldPopulationStructure").isNotNull()

        # Drop ldSet column if already available
        kept_columns = [
            name for name in associations.df.columns if name != "ldSet"
        ]

        joined = (
            associations.df.select(*kept_columns)
            # Population structure of each study, from the study index
            .join(
                studies.df.select("studyId", "ldPopulationStructure"),
                on="studyId",
                how="left",
            )
            # LD information, from the LD index
            .join(
                ld_index.df,
                on=["variantId", "chromosome"],
                how="left",
            )
        )

        # Both steps have no `otherwise` branch, so rows without population structure
        # end up with a null ldSet.
        weighted = (
            joined.withColumn(
                "ldSet",
                f.when(
                    has_population_structure,
                    cls._add_population_size(
                        f.col("ldSet"), f.col("ldPopulationStructure")
                    ),
                ),
            )
            .withColumn(
                "ldSet",
                f.when(
                    has_population_structure,
                    cls._calculate_weighted_r_overall(f.col("ldSet")),
                ),
            )
            .drop("ldPopulationStructure")
        )

        # The QC flag must be computed before the rescue step fills in null ldSets
        flagged = weighted.withColumn(
            "qualityControls",
            cls._qc_unresolved_ld(f.col("ldSet"), f.col("qualityControls")),
        )

        rescued = flagged.withColumn(
            "ldSet",
            cls._rescue_lead_variant(f.col("ldSet"), f.col("variantId")),
        )

        return StudyLocus(
            _df=rescued,
            _schema=StudyLocus.get_schema(),
        )._qc_no_population()

ld_annotate(associations: StudyLocus, studies: StudyIndex, ld_index: LDIndex) -> StudyLocus classmethod

Annotate linkage disequilibrium (LD) information to a set of studyLocus.

This function
  1. Annotates study locus with population structure information from the study index
  2. Joins the LD index to the StudyLocus
  3. Adds the population size of the study to each rValues entry in the ldSet
  4. Calculates the overall r² (`r2Overall`) of each tag variant, weighting every population's r² by its ancestry proportion in the given study.
  5. Flags associations with variants that are not found in the LD reference
  6. Rescues lead variant when no LD information is available but lead variant is available

Parameters:

Name Type Description Default
associations StudyLocus

Dataset to be LD annotated

required
studies StudyIndex

Dataset with study information

required
ld_index LDIndex

Dataset with LD information for every variant present in LD matrix

required

Returns:

Name Type Description
StudyLocus StudyLocus

The input associations with an added `ldSet` column holding the LD information.

Source code in src/gentropy/method/ld.py
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
@classmethod
def ld_annotate(
    cls: type[LDAnnotator],
    associations: StudyLocus,
    studies: StudyIndex,
    ld_index: LDIndex,
) -> StudyLocus:
    """Annotate a set of studyLocus with linkage disequilibrium (LD) information.

    Steps, in order:
        1. Discard any pre-existing ldSet column from the associations
        2. Left join the population structure of each study from the study index
        3. Left join the LD index on variant and chromosome
        4. Add the study's relative sample size to each rValues entry in the ldSet
        5. Aggregate the r values into an overall r2 weighted by ancestry proportions
        6. Flag associations left without an ldSet as UNRESOLVED_LD
        7. Rescue the lead variant as the only ldSet member when no LD information
           is available but the lead variant is

    Args:
        associations (StudyLocus): Dataset to be LD annotated
        studies (StudyIndex): Dataset with study information
        ld_index (LDIndex): Dataset with LD information for every variant present in LD matrix

    Returns:
        StudyLocus: Associations with the ldSet column populated, after
            `_qc_no_population` has been applied to the result.
    """
    has_population_structure = f.col("ldPopulationStructure").isNotNull()

    # Drop ldSet column if already available
    kept_columns = [name for name in associations.df.columns if name != "ldSet"]

    joined = (
        associations.df.select(*kept_columns)
        # Population structure of each study, from the study index
        .join(
            studies.df.select("studyId", "ldPopulationStructure"),
            on="studyId",
            how="left",
        )
        # LD information, from the LD index
        .join(
            ld_index.df,
            on=["variantId", "chromosome"],
            how="left",
        )
    )

    # Both steps have no `otherwise` branch, so rows without population structure
    # end up with a null ldSet.
    weighted = (
        joined.withColumn(
            "ldSet",
            f.when(
                has_population_structure,
                cls._add_population_size(
                    f.col("ldSet"), f.col("ldPopulationStructure")
                ),
            ),
        )
        .withColumn(
            "ldSet",
            f.when(
                has_population_structure,
                cls._calculate_weighted_r_overall(f.col("ldSet")),
            ),
        )
        .drop("ldPopulationStructure")
    )

    # The QC flag must be computed before the rescue step fills in null ldSets
    flagged = weighted.withColumn(
        "qualityControls",
        cls._qc_unresolved_ld(f.col("ldSet"), f.col("qualityControls")),
    )

    rescued = flagged.withColumn(
        "ldSet",
        cls._rescue_lead_variant(f.col("ldSet"), f.col("variantId")),
    )

    return StudyLocus(
        _df=rescued,
        _schema=StudyLocus.get_schema(),
    )._qc_no_population()