Summary statistics

gentropy.datasource.gwas_catalog.summary_statistics.GWASCatalogSummaryStatistics dataclass

Bases: SummaryStatistics

GWAS Catalog Summary Statistics reader.
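
A minimal usage sketch (the Spark session setup, the input path, and the .df access on the Dataset base class are illustrative assumptions, not taken from this page):

from pyspark.sql import SparkSession

from gentropy.datasource.gwas_catalog.summary_statistics import (
    GWASCatalogSummaryStatistics,
)

spark = SparkSession.builder.appName("gwas_catalog_sumstats").getOrCreate()

# Hypothetical input path; the reader parses the study identifier
# (e.g. GCST012345) from the filename.
sumstats = GWASCatalogSummaryStatistics.from_gwas_harmonized_summary_stats(
    spark,
    "path/to/GCST012345.h.tsv.gz",
)
sumstats.df.show(5)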

Source code in src/gentropy/datasource/gwas_catalog/summary_statistics.py
@dataclass
class GWASCatalogSummaryStatistics(SummaryStatistics):
    """GWAS Catalog Summary Statistics reader."""

    @classmethod
    def from_gwas_harmonized_summary_stats(
        cls: type[GWASCatalogSummaryStatistics],
        spark: SparkSession,
        sumstats_file: str,
    ) -> GWASCatalogSummaryStatistics:
        """Create summary statistics object from summary statistics flatfile, harmonized by the GWAS Catalog.

        Things got slightly complicated given the GWAS Catalog harmonization pipelines changed recently so we had to accomodate to
        both formats.

        Args:
            spark (SparkSession): Spark session.
            sumstats_file (str): path to a GWAS Catalog summary statistics file; the study identifier is parsed from the filename.

        Returns:
            GWASCatalogSummaryStatistics: Summary statistics object.
        """
        sumstats_df = spark.read.csv(sumstats_file, sep="\t", header=True).withColumn(
            # Parsing GWAS Catalog study identifier from filename:
            "studyId",
            f.lit(filename_to_study_identifier(sumstats_file)),
        )

        # Parsing variant id fields:
        chromosome = (
            f.col("hm_chrom")
            if "hm_chrom" in sumstats_df.columns
            else f.col("chromosome")
        ).cast(t.StringType())
        position = (
            f.col("hm_pos")
            if "hm_pos" in sumstats_df.columns
            else f.col("base_pair_location")
        ).cast(t.IntegerType())
        ref_allele = (
            f.col("hm_other_allele")
            if "hm_other_allele" in sumstats_df.columns
            else f.col("other_allele")
        )
        alt_allele = (
            f.col("hm_effect_allele")
            if "hm_effect_allele" in sumstats_df.columns
            else f.col("effect_allele")
        )

        # Parsing p-value (get a tuple with mantissa and exponent):
        p_value_expression = (
            parse_pvalue(f.col("p_value"))
            if "p_value" in sumstats_df.columns
            else neglog_pvalue_to_mantissa_and_exponent(f.col("neg_log_10_p_value"))
        )

        # The effect allele frequency is an optional column, so we have to test whether it is there:
        allele_frequency = (
            f.col("effect_allele_frequency")
            if "effect_allele_frequency" in sumstats_df.columns
            else f.lit(None)
        ).cast(t.FloatType())

        # Do we have sample size? This expression captures 99.7% of sample size columns.
        sample_size = (f.col("n") if "n" in sumstats_df.columns else f.lit(None)).cast(
            t.IntegerType()
        )

        # Beta may appear under a different column name depending on the format ("hm_beta" in the new format, "beta" in the old), or may be missing entirely:
        beta_expression = (
            f.col("hm_beta")
            if "hm_beta" in sumstats_df.columns
            else f.col("beta")
            if "beta" in sumstats_df.columns
            # If no column, create one:
            else f.lit(None)
        ).cast(t.DoubleType())

        # We might have an odds ratio or a hazard ratio, which are handled the same way:
        odds_ratio_expression = (
            f.col("hm_odds_ratio")
            if "hm_odds_ratio" in sumstats_df.columns
            else f.col("odds_ratio")
            if "odds_ratio" in sumstats_df.columns
            else f.col("hazard_ratio")
            if "hazard_ratio" in sumstats_df.columns
            # If no column, create one:
            else f.lit(None)
        ).cast(t.DoubleType())

        # Does the file have a standard error column?
        standard_error = (
            f.col("standard_error")
            if "standard_error" in sumstats_df.columns
            else f.lit(None)
        ).cast(t.DoubleType())

        # Processing columns of interest:
        processed_sumstats_df = (
            sumstats_df
            # Rows without a proper position are dropped in the filter below.
            .select(
                "studyId",
                # Adding variant identifier:
                f.concat_ws(
                    "_",
                    chromosome,
                    position,
                    ref_allele,
                    alt_allele,
                ).alias("variantId"),
                chromosome.alias("chromosome"),
                position.alias("position"),
                # Parsing p-value mantissa and exponent:
                *p_value_expression,
                # Converting/calculating effect and confidence interval:
                *convert_odds_ratio_to_beta(
                    beta_expression,
                    odds_ratio_expression,
                    standard_error,
                ),
                allele_frequency.alias("effectAlleleFrequencyFromSource"),
                sample_size.alias("sampleSize"),
            )
            .filter(
                # Dropping associations where no harmonized position is available:
                f.col("position").isNotNull()
                &
                # We are not interested in associations with zero effect:
                (f.col("beta") != 0)
            )
            .orderBy(f.col("chromosome"), f.col("position"))
            # Median study size is 200 MB, max is 2.6 GB:
            .repartition(20)
        )

        # Initializing summary statistics object:
        return cls(
            _df=processed_sumstats_df,
            _schema=cls.get_schema(),
        )
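
For intuition on the effect-size handling above: when a study reports an odds ratio (or hazard ratio) instead of a beta, the effect is mapped onto the log scale. Below is a plain-Python sketch of the usual transformation, illustrating the statistics rather than the exact convert_odds_ratio_to_beta implementation; the standard-error rescaling assumes the reported error is on the odds-ratio scale, which varies by study:

import math

def odds_ratio_to_beta(
    odds_ratio: float, standard_error: float
) -> tuple[float, float]:
    # Effect on the log-odds scale:
    beta = math.log(odds_ratio)
    # Delta method: SE(log OR) ~= SE(OR) / OR, assuming the standard
    # error was reported on the odds-ratio scale (an assumption).
    beta_se = standard_error / odds_ratio
    return beta, beta_se

# Example: OR = 1.25 with SE = 0.05 maps to beta ~= 0.223, SE ~= 0.04.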

from_gwas_harmonized_summary_stats(spark: SparkSession, sumstats_file: str) -> GWASCatalogSummaryStatistics classmethod

Create a summary statistics object from a summary statistics flat file harmonized by the GWAS Catalog.

The GWAS Catalog harmonization pipeline changed recently, so this reader accommodates both the old and the new column formats.

Parameters:

Name           Type          Description                                             Default
spark          SparkSession  Spark session.                                          required
sumstats_file  str           Path to a GWAS Catalog summary statistics file; the     required
                             study identifier is parsed from the filename.

Returns:

Name                          Type                          Description
GWASCatalogSummaryStatistics  GWASCatalogSummaryStatistics  Summary statistics object.
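
The p-value is stored as a mantissa/exponent pair rather than a raw float, which keeps very small p-values representable. A plain-Python illustration of the decomposition that the parse_pvalue and neglog_pvalue_to_mantissa_and_exponent helpers are built around (an arithmetic sketch, not the Spark implementation):

import math

def decompose_pvalue(p_value: float) -> tuple[float, int]:
    # p = mantissa * 10**exponent, with 1 <= mantissa < 10.
    exponent = math.floor(math.log10(p_value))
    mantissa = p_value / 10**exponent
    return mantissa, exponent

def decompose_neglog_pvalue(neglog_p: float) -> tuple[float, int]:
    # Same decomposition starting from -log10(p), so p itself never
    # has to be materialised (it could underflow a float).
    exponent = -math.ceil(neglog_p)
    mantissa = 10 ** (math.ceil(neglog_p) - neglog_p)
    return mantissa, exponent

# decompose_pvalue(5e-8)       -> (5.0, -8)
# decompose_neglog_pvalue(7.3) -> (~5.01, -8)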
