Skip to content

LD annotator

Annotates variants and associations with linkage disequilibrium (LD) information from gnomAD.

Source code in src/otg/method/ld.py
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
class LDAnnotator:
    """Annotate variants and associations with linkage disequilibrium (LD) information from gnomAD.

    All methods are stateless: they either build Spark column expressions over an
    `ldSet` array column or join a DataFrame against the DataFrame held by an LD index.
    """

    @staticmethod
    def _calculate_weighted_r_overall(ld_set: Column) -> Column:
        """Collapse the per-population r values of every tag variant into one weighted r2.

        For each element of the LD set, the squared `r` of every `rValues` entry is multiplied by
        that entry's `relativeSampleSize` and the products are summed into `r2Overall`.

        Args:
            ld_set (Column): Array of structs exposing `tagVariantId` and `rValues`; every `rValues` entry is read for `r` and `relativeSampleSize`

        Returns:
            Column: Array of structs with the fields `tagVariantId` and `r2Overall`
        """

        def _accumulate(running_total: Column, r_value: Column) -> Column:
            """Add the weighted r2 of one population to the running total."""
            weighted_r2 = f.pow(r_value["r"], 2) * r_value["relativeSampleSize"]
            # A null r or relativeSampleSize makes the product null; count it as a zero contribution
            return running_total + f.coalesce(weighted_r2, f.lit(0.0))

        def _summarise_tag(tag: Column) -> Column:
            """Replace the rValues of one tag variant with their weighted sum."""
            r2_overall = f.aggregate(tag["rValues"], f.lit(0.0), _accumulate)
            return f.struct(
                tag["tagVariantId"].alias("tagVariantId"),
                r2_overall.alias("r2Overall"),
            )

        return f.transform(ld_set, _summarise_tag)

    @staticmethod
    def _add_population_size(ld_set: Column, study_populations: Column) -> Column:
        """Attach the study's relative sample size to every rValues entry of the LD set.

        Args:
            ld_set (Column): Array of structs exposing `tagVariantId` and `rValues`; every `rValues` entry is read for `population` and `r`
            study_populations (Column): Study populations, read for `population` and `relativeSampleSize`

        Returns:
            Column: LD set whose `rValues` entries carry `population`, `r` and `relativeSampleSize`
        """
        # Lookup from population to its relativeSampleSize in the study
        size_by_population = f.map_from_arrays(
            study_populations["population"],
            study_populations["relativeSampleSize"],
        )

        def _with_relative_size(r_value: Column) -> Column:
            """Rebuild one rValues entry with the matching relativeSampleSize appended."""
            population = r_value["population"]
            return f.struct(
                population.alias("population"),
                r_value["r"].alias("r"),
                size_by_population[population].alias("relativeSampleSize"),
            )

        def _resize_tag(tag: Column) -> Column:
            """Rebuild one LD set element with sized rValues."""
            sized_r_values = f.transform(tag["rValues"], _with_relative_size)
            return f.struct(
                tag["tagVariantId"].alias("tagVariantId"),
                sized_r_values.alias("rValues"),
            )

        return f.transform(ld_set, _resize_tag)

    @classmethod
    def annotate_variants_with_ld(
        cls: type[LDAnnotator], variants_df: DataFrame, ld_index: LDIndex
    ) -> DataFrame:
        """Left-join a set of variants with the LD index.

        Args:
            variants_df (DataFrame): Input DataFrame; it is joined on its `variantId` and `chromosome` columns
            ld_index (LDIndex): LD index whose `df` attribute provides the LD information

        Returns:
            DataFrame: Every input row, extended with the LD index columns where a match exists
        """
        join_keys = ["variantId", "chromosome"]
        return variants_df.join(ld_index.df, on=join_keys, how="left")

    @classmethod
    def annotate_associations_with_ld(
        cls: type[LDAnnotator],
        associations_df: DataFrame,
        ld_index: LDIndex,
    ) -> DataFrame:
        """Annotate a set of associations with a population-weighted LD set.

        The associations are left-joined with the LD index on `variantId` and `chromosome`. Each `rValues`
        entry of the resulting `ldSet` then receives the relative sample size of its population in the study,
        and the entries are finally summed into a single `r2Overall` per tag variant.

        Args:
            associations_df (DataFrame): Study locus DataFrame with a `populationsStructure` column containing population structure information
            ld_index (LDIndex): Dataset whose `df` attribute provides the LD information

        Returns:
            DataFrame: Input rows with the LD index columns joined in, `ldSet` reduced to `tagVariantId`/`r2Overall` entries and `populationsStructure` dropped
        """
        # Bring LD information
        with_ld = associations_df.join(
            ld_index.df, on=["variantId", "chromosome"], how="left"
        )
        # Add population size to each rValues entry in the ldSet
        with_population_sizes = with_ld.withColumn(
            "ldSet",
            cls._add_population_size(f.col("ldSet"), f.col("populationsStructure")),
        )
        # Aggregate the weighted r2 per tag variant using ancestry proportions
        with_r2_overall = with_population_sizes.withColumn(
            "ldSet",
            cls._calculate_weighted_r_overall(f.col("ldSet")),
        )
        return with_r2_overall.drop("populationsStructure")

annotate_associations_with_ld(associations_df, ld_index) classmethod

Annotate linkage disequilibrium (LD) information to a set of associations.

We first join the associations dataframe with the LD index. Then, we add the relative sample size of each study population to the matching rValues entry in the ldSet, so that the r between lead and tag in each population can be weighted for that study. Finally, we aggregate the weighted r2 values into a single r2Overall per tag variant using these ancestry proportions.

Parameters:

Name Type Description Default
associations_df DataFrame

Study locus DataFrame with a populationsStructure column containing population structure information

required
ld_index LDIndex

Dataset with LD information for every variant present in gnomAD LD matrix

required

Returns:

Name Type Description
DataFrame DataFrame

Following the schema of the input DataFrame joined with the LD index, but with LD annotations (ldSet column) and without the populationsStructure column, which is dropped

Source code in src/otg/method/ld.py
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
@classmethod
def annotate_associations_with_ld(
    cls: type[LDAnnotator],
    associations_df: DataFrame,
    ld_index: LDIndex,
) -> DataFrame:
    """Annotate a set of associations with a population-weighted LD set.

    The associations are left-joined with the LD index on `variantId` and `chromosome`. Each `rValues`
    entry of the resulting `ldSet` then receives the relative sample size of its population in the study,
    and the LD set is finally passed through the weighted R aggregation using ancestry proportions.

    Args:
        associations_df (DataFrame): Study locus DataFrame with a `populationsStructure` column containing population structure information
        ld_index (LDIndex): Dataset whose `df` attribute provides the LD information

    Returns:
        DataFrame: Input rows with the LD index columns joined in, an aggregated `ldSet` column and `populationsStructure` dropped
    """
    # Bring LD information
    with_ld = associations_df.join(
        ld_index.df, on=["variantId", "chromosome"], how="left"
    )
    # Add population size to each rValues entry in the ldSet
    with_population_sizes = with_ld.withColumn(
        "ldSet",
        cls._add_population_size(f.col("ldSet"), f.col("populationsStructure")),
    )
    # Aggregate weighted R information using ancestry proportions
    with_r2_overall = with_population_sizes.withColumn(
        "ldSet",
        cls._calculate_weighted_r_overall(f.col("ldSet")),
    )
    return with_r2_overall.drop("populationsStructure")

annotate_variants_with_ld(variants_df, ld_index) classmethod

Annotate linkage disequilibrium (LD) information to a set of variants.

Parameters:

Name Type Description Default
variants_df DataFrame

Input DataFrame with a variantId column containing variant IDs (hg38)

required
ld_index LDIndex

LD index

required

Returns:

Name Type Description
DataFrame DataFrame

DataFrame with LD annotations

Source code in src/otg/method/ld.py
67
68
69
70
71
72
73
74
75
76
77
78
79
80
@classmethod
def annotate_variants_with_ld(
    cls: type[LDAnnotator], variants_df: DataFrame, ld_index: LDIndex
) -> DataFrame:
    """Left-join a set of variants with the LD index.

    Args:
        variants_df (DataFrame): Input DataFrame; it is joined on its `variantId` and `chromosome` columns
        ld_index (LDIndex): LD index whose `df` attribute provides the LD information

    Returns:
        DataFrame: Every input row, extended with the LD index columns where a match exists
    """
    join_keys = ["variantId", "chromosome"]
    return variants_df.join(ld_index.df, on=join_keys, how="left")