Skip to content

LDAnnotator

gentropy.method.ld.LDAnnotator

Class to annotate study loci with linkage disequilibrium (LD) information from GnomAD.

Source code in src/gentropy/method/ld.py
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
class LDAnnotator:
    """Annotate study loci with linkage disequilibrium (LD) information from GnomAD."""

    @staticmethod
    def _get_major_population(ordered_populations: Column) -> Column:
        """Pick the major population out of an ordered ldPopulationStructure array.

        The first element of the array is taken as the reference: every population sharing
        its relativeSampleSize is considered tied for the major population. When more than one
        population is tied and nfe is among them, nfe is returned. In every other case
        (no tie, or a tie that does not involve nfe) the first population of the array is returned.

        Args:
            ordered_populations (Column): ldPopulationStructure array ordered by relativeSampleSize

        Returns:
            Column: major population
        """
        top_population = ordered_populations["ldPopulation"][0]
        top_sample_size = ordered_populations["relativeSampleSize"][0]

        def _shares_top_size(population: Column) -> Column:
            return population["relativeSampleSize"] == top_sample_size

        def _is_nfe(population: Column) -> Column:
            return population["ldPopulation"] == "nfe"

        tied_populations = f.filter(ordered_populations, _shares_top_size)
        # nfe (Non-Finnish European) entries among the tied populations
        nfe_among_tied = f.filter(tied_populations, _is_nfe)

        is_tie = f.size(tied_populations) > 1
        nfe_is_tied = f.size(nfe_among_tied) == 1
        return f.when(is_tie & nfe_is_tied, f.lit("nfe")).otherwise(top_population)

    @staticmethod
    def _calculate_r2_major(ld_set: Column, major_population: Column) -> Column:
        """Derive r2Overall for every tag variant from the R value of the major population.

        Done in two passes over the LD set: the first keeps only the rValues entries whose
        population equals the major population, the second squares the first remaining R value.
        Tags without an R value for the major population get an r2Overall of 0.0.

        Args:
            ld_set (Column): LD set
            major_population (Column): Major population of the study

        Returns:
            Column: LD set with added 'r2Overall' field
        """

        def _keep_major_population_r(tag: Column) -> Column:
            major_r_values = f.filter(
                tag["rValues"],
                lambda r_value: r_value["population"] == major_population,
            )
            return f.struct(
                tag["tagVariantId"].alias("tagVariantId"),
                major_r_values.alias("rValues"),
            )

        def _square_first_r(tag: Column) -> Column:
            r_squared = f.pow(tag["rValues"]["r"][0], 2)
            return f.struct(
                tag["tagVariantId"].alias("tagVariantId"),
                # No R value left for the major population -> fall back to 0.0
                f.coalesce(r_squared, f.lit(0.0)).alias("r2Overall"),
            )

        major_population_only = f.transform(ld_set, _keep_major_population_r)
        return f.transform(major_population_only, _square_first_r)

    @staticmethod
    def _qc_unresolved_ld(ld_set: Column, quality_controls: Column) -> Column:
        """Add the UNRESOLVED_LD quality flag to associations whose LD set is null.

        Args:
            ld_set (Column): LD set
            quality_controls (Column): Quality controls

        Returns:
            Column: Quality controls with added 'UNRESOLVED_LD' field
        """
        ld_is_unresolved = ld_set.isNull()
        return StudyLocus.update_quality_flag(
            quality_controls,
            ld_is_unresolved,
            StudyLocusQualityCheck.UNRESOLVED_LD,
        )

    @staticmethod
    def _rescue_lead_variant(ld_set: Column, variant_id: Column) -> Column:
        """Fall back to a lead-only LD set when no LD information is available.

        If the LD set is null or empty and the variant ID is not null, the LD set is replaced
        by a single-element array holding the lead variant itself. Otherwise the LD set is
        returned as is.

        Args:
            ld_set (Column): LD set
            variant_id (Column): Variant ID

        Returns:
            Column: LD set with added 'tagVariantId' field
        """
        lead_only_ld_set = f.array(
            f.struct(
                variant_id.alias("tagVariantId"),
                f.lit(1).alias("r2Overall"),
            )
        )
        ld_is_missing = ld_set.isNull() | (f.size(ld_set) == 0)
        needs_rescue = ld_is_missing & variant_id.isNotNull()
        return f.when(needs_rescue, lead_only_ld_set).otherwise(ld_set)

    @classmethod
    def ld_annotate(
        cls: type[LDAnnotator],
        associations: StudyLocus,
        studies: StudyIndex,
        ld_index: LDIndex,
        r2_threshold: float = 0.5,
    ) -> StudyLocus:
        """Annotate a set of studyLocus with linkage disequilibrium (LD) information.

        Steps performed:
            1. Any existing ldSet column is dropped and the population structure of each study (ordered by relativeSampleSize) is joined in from the study index
            2. The LD index is joined to the StudyLocus on variantId and chromosome
            3. The major population is derived from the population structure
            4. R2 is calculated from the R of the major ancestry
            5. The LD set is filtered by the R2 threshold; sets left empty become null
            6. Associations without a resolved LD set are flagged
            7. The lead variant is rescued when no LD information is available but the lead variant is
            8. The lead variant is forced to r2Overall = 1.0 whenever it is part of the LD set

        !!! note
            Because the LD index has a pre-set threshold of R2 = 0.5, this is the minimum threshold for the LD information to be included in the ldSet.

        Args:
            associations (StudyLocus): Dataset to be LD annotated
            studies (StudyIndex): Dataset with study information
            ld_index (LDIndex): Dataset with LD information for every variant present in LD matrix
            r2_threshold (float): R2 threshold to filter the LD set on. Default is 0.5.

        Returns:
            StudyLocus: including additional column with LD information.
        """
        # Population structure of every study, ordered by relativeSampleSize
        population_structure_df = studies.df.select(
            "studyId",
            order_array_of_structs_by_field(
                "ldPopulationStructure", "relativeSampleSize"
            ).alias("ldPopulationStructure"),
        )
        has_population_structure = f.col("ldPopulationStructure").isNotNull()

        # Start from the associations without any pre-existing ldSet column
        columns_to_keep = [
            column for column in associations.df.columns if column != "ldSet"
        ]
        joined_df = (
            associations.df.select(*columns_to_keep)
            .join(population_structure_df, on="studyId", how="left")
            # Bring LD information from LD Index
            .join(ld_index.df, on=["variantId", "chromosome"], how="left")
        )

        # Major population and R2 are only computed when a population structure is available;
        # otherwise both columns are left null
        major_population_r2_df = (
            joined_df.withColumn(
                "majorPopulation",
                f.when(
                    has_population_structure,
                    cls._get_major_population(f.col("ldPopulationStructure")),
                ),
            )
            .withColumn(
                "ldSet",
                f.when(
                    has_population_structure,
                    cls._calculate_r2_major(f.col("ldSet"), f.col("majorPopulation")),
                ),
            )
            .drop("ldPopulationStructure", "majorPopulation")
        )

        # Apply the R2 threshold; an LD set with nothing left is turned into null
        thresholded_df = major_population_r2_df.withColumn(
            "ldSet",
            StudyLocus.filter_ld_set(f.col("ldSet"), r2_threshold),
        ).withColumn("ldSet", f.when(f.size("ldSet") > 0, f.col("ldSet")))

        # QC has to run before the rescue step, while unresolved LD sets are still null
        flagged_df = thresholded_df.withColumn(
            "qualityControls",
            cls._qc_unresolved_ld(f.col("ldSet"), f.col("qualityControls")),
        )

        return StudyLocus(
            _df=(
                flagged_df
                # Use the lead variant as the only tag when no LD information is available
                .withColumn(
                    "ldSet",
                    cls._rescue_lead_variant(f.col("ldSet"), f.col("variantId")),
                )
                # Ensure that the lead variant always has r2 == 1
                .withColumn(
                    "ldSet",
                    f.expr(
                        """
                        transform(ldSet, x ->
                            IF(x.tagVariantId == variantId,
                                named_struct('tagVariantId', x.tagVariantId, 'r2Overall', 1.0),
                                x
                            )
                        )
                        """
                    ),
                )
            ),
            _schema=StudyLocus.get_schema(),
        )._qc_no_population()

ld_annotate(associations: StudyLocus, studies: StudyIndex, ld_index: LDIndex, r2_threshold: float = 0.5) -> StudyLocus classmethod

Annotate linkage disequilibrium (LD) information to a set of studyLocus.

This function:
  1. Annotates study locus with population structure information ordered by relativeSampleSize from the study index
  2. Joins the LD index to the StudyLocus
  3. Gets the major population from the population structure
  4. Calculates R2 by using the R of the major ancestry
  5. Flags associations with variants that are not found in the LD reference
  6. Rescues lead variant when no LD information is available but lead variant is available

Note

Because the LD index has a pre-set threshold of R2 = 0.5, this is the minimum threshold for the LD information to be included in the ldSet.

Parameters:

Name Type Description Default
associations StudyLocus

Dataset to be LD annotated

required
studies StudyIndex

Dataset with study information

required
ld_index LDIndex

Dataset with LD information for every variant present in LD matrix

required
r2_threshold float

R2 threshold to filter the LD set on. Default is 0.5.

0.5

Returns:

Name Type Description
StudyLocus StudyLocus

StudyLocus dataset including an additional column with LD information.

Source code in src/gentropy/method/ld.py
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
@classmethod
def ld_annotate(
    cls: type[LDAnnotator],
    associations: StudyLocus,
    studies: StudyIndex,
    ld_index: LDIndex,
    r2_threshold: float = 0.5,
) -> StudyLocus:
    """Annotate a set of studyLocus with linkage disequilibrium (LD) information.

    Steps performed:
        1. Any existing ldSet column is dropped and the population structure of each study (ordered by relativeSampleSize) is joined in from the study index
        2. The LD index is joined to the StudyLocus on variantId and chromosome
        3. The major population is derived from the population structure
        4. R2 is calculated from the R of the major ancestry
        5. The LD set is filtered by the R2 threshold; sets left empty become null
        6. Associations without a resolved LD set are flagged
        7. The lead variant is rescued when no LD information is available but the lead variant is
        8. The lead variant is forced to r2Overall = 1.0 whenever it is part of the LD set

    !!! note
        Because the LD index has a pre-set threshold of R2 = 0.5, this is the minimum threshold for the LD information to be included in the ldSet.

    Args:
        associations (StudyLocus): Dataset to be LD annotated
        studies (StudyIndex): Dataset with study information
        ld_index (LDIndex): Dataset with LD information for every variant present in LD matrix
        r2_threshold (float): R2 threshold to filter the LD set on. Default is 0.5.

    Returns:
        StudyLocus: including additional column with LD information.
    """
    # Population structure of every study, ordered by relativeSampleSize
    population_structure_df = studies.df.select(
        "studyId",
        order_array_of_structs_by_field(
            "ldPopulationStructure", "relativeSampleSize"
        ).alias("ldPopulationStructure"),
    )
    has_population_structure = f.col("ldPopulationStructure").isNotNull()

    # Start from the associations without any pre-existing ldSet column
    columns_to_keep = [
        column for column in associations.df.columns if column != "ldSet"
    ]
    joined_df = (
        associations.df.select(*columns_to_keep)
        .join(population_structure_df, on="studyId", how="left")
        # Bring LD information from LD Index
        .join(ld_index.df, on=["variantId", "chromosome"], how="left")
    )

    # Major population and R2 are only computed when a population structure is available;
    # otherwise both columns are left null
    major_population_r2_df = (
        joined_df.withColumn(
            "majorPopulation",
            f.when(
                has_population_structure,
                cls._get_major_population(f.col("ldPopulationStructure")),
            ),
        )
        .withColumn(
            "ldSet",
            f.when(
                has_population_structure,
                cls._calculate_r2_major(f.col("ldSet"), f.col("majorPopulation")),
            ),
        )
        .drop("ldPopulationStructure", "majorPopulation")
    )

    # Apply the R2 threshold; an LD set with nothing left is turned into null
    thresholded_df = major_population_r2_df.withColumn(
        "ldSet",
        StudyLocus.filter_ld_set(f.col("ldSet"), r2_threshold),
    ).withColumn("ldSet", f.when(f.size("ldSet") > 0, f.col("ldSet")))

    # QC has to run before the rescue step, while unresolved LD sets are still null
    flagged_df = thresholded_df.withColumn(
        "qualityControls",
        cls._qc_unresolved_ld(f.col("ldSet"), f.col("qualityControls")),
    )

    return StudyLocus(
        _df=(
            flagged_df
            # Use the lead variant as the only tag when no LD information is available
            .withColumn(
                "ldSet",
                cls._rescue_lead_variant(f.col("ldSet"), f.col("variantId")),
            )
            # Ensure that the lead variant always has r2 == 1
            .withColumn(
                "ldSet",
                f.expr(
                    """
                    transform(ldSet, x ->
                        IF(x.tagVariantId == variantId,
                            named_struct('tagVariantId', x.tagVariantId, 'r2Overall', 1.0),
                            x
                        )
                    )
                    """
                ),
            )
        ),
        _schema=StudyLocus.get_schema(),
    )._qc_no_population()