Skip to content

Summary Statistics

gentropy.dataset.summary_statistics.SummaryStatistics dataclass

Bases: Dataset

Summary Statistics dataset.

A summary statistics dataset contains all single point statistics resulting from a GWAS.

Source code in src/gentropy/dataset/summary_statistics.py
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
@dataclass
class SummaryStatistics(Dataset):
    """Dataset of GWAS summary statistics.

    Holds all single point association statistics resulting from a GWAS and
    offers p-value filtering, region exclusion, sanity filtering and clumping
    on top of them.
    """

    @classmethod
    def get_schema(cls: type[SummaryStatistics]) -> StructType:
        """Return the Spark schema of the SummaryStatistics dataset.

        Returns:
            StructType: Schema parsed from `summary_statistics.json`.
        """
        schema = parse_spark_schema("summary_statistics.json")
        return schema

    def pvalue_filter(self: SummaryStatistics, pvalue: float) -> SummaryStatistics:
        """Keep the associations whose p-value does not exceed the given threshold.

        The threshold is split into mantissa and exponent with `split_pvalue` and
        compared against the `pValueMantissa` / `pValueExponent` columns.

        Args:
            pvalue (float): Upper limit (inclusive) of the p-value to keep.

        Returns:
            SummaryStatistics: Summary statistics restricted to associations at least as significant as the threshold.
        """
        threshold_mantissa, threshold_exponent = split_pvalue(pvalue)

        exponent_col = f.col("pValueExponent")
        mantissa_col = f.col("pValueMantissa")

        # A row passes when its exponent is strictly below the threshold exponent ...
        below_threshold_exponent = exponent_col < threshold_exponent
        # ... or when the exponents match and the mantissa is not above the threshold mantissa.
        tied_exponent_within_mantissa = (exponent_col == threshold_exponent) & (
            mantissa_col <= threshold_mantissa
        )

        significant_df = self._df.filter(
            below_threshold_exponent | tied_exponent_within_mantissa
        )
        return SummaryStatistics(_df=significant_df, _schema=self._schema)

    def window_based_clumping(
        self: SummaryStatistics,
        distance: int = WindowBasedClumpingStepConfig().distance,
        gwas_significance: float = WindowBasedClumpingStepConfig().gwas_significance,
    ) -> StudyLocus:
        """Generate study-locus from summary statistics using window-based clumping.

        The summary statistics are first filtered by `gwas_significance`, then clumped,
        and finally the clumped loci are filtered by the `WINDOW_CLUMPED` flag.
        Default values come from the `WindowBasedClumpingStepConfig` object.

        For more info, see [`WindowBasedClumping`][gentropy.method.window_based_clumping.WindowBasedClumping]

        Args:
            distance (int, optional): Distance in base pairs to be used for clumping. Defaults to `WindowBasedClumpingStepConfig().distance`.
            gwas_significance (float, optional): GWAS significance threshold. Defaults to `WindowBasedClumpingStepConfig().gwas_significance`.

        Returns:
            StudyLocus: Clumped study-locus optionally containing variants based on window.
        """
        from gentropy.method.window_based_clumping import WindowBasedClumping

        # Before clumping, only the associations passing the p-value threshold are kept.
        significant_associations = self.pvalue_filter(gwas_significance)
        clumped_loci = WindowBasedClumping.clump(
            significant_associations,
            distance=distance,
        )
        # After clumping, the loci are filtered by the flag.
        return clumped_loci.valid_rows(["WINDOW_CLUMPED"])

    def locus_breaker_clumping(
        self: SummaryStatistics,
        baseline_pvalue_cutoff: float = LocusBreakerClumpingConfig.lbc_baseline_pvalue,
        distance_cutoff: int = LocusBreakerClumpingConfig.lbc_distance_cutoff,
        pvalue_cutoff: float = LocusBreakerClumpingConfig.lbc_pvalue_threshold,
        flanking_distance: int = LocusBreakerClumpingConfig.lbc_flanking_distance,
    ) -> StudyLocus:
        """Generate study-locus from summary statistics using locus-breaker clumping method with locus boundaries.

        All arguments are forwarded unchanged, in order, to `LocusBreakerClumping.locus_breaker`.
        Default values come from the `LocusBreakerClumpingConfig` object.

        For more info, see [`locus_breaker`][gentropy.method.locus_breaker_clumping.LocusBreakerClumping]

        Args:
            baseline_pvalue_cutoff (float, optional): Baseline significance we consider for the locus.
            distance_cutoff (int, optional): Distance in base pairs to be used for clumping.
            pvalue_cutoff (float, optional): GWAS significance threshold.
            flanking_distance (int, optional): Flank distance in base pairs to be used for clumping.

        Returns:
            StudyLocus: Clumped study-locus optionally containing variants based on window.
        """
        from gentropy.method.locus_breaker_clumping import LocusBreakerClumping

        clumping_parameters = (
            baseline_pvalue_cutoff,
            distance_cutoff,
            pvalue_cutoff,
            flanking_distance,
        )
        return LocusBreakerClumping.locus_breaker(self, *clumping_parameters)

    def exclude_region(
        self: SummaryStatistics, region: GenomicRegion
    ) -> SummaryStatistics:
        """Drop every association located inside the given genomic region.

        A row is removed when its chromosome equals `region.chromosome` and its
        position lies between `region.start` and `region.end`, both ends inclusive.

        Args:
            region (GenomicRegion): Genomic region to be excluded.

        Returns:
            SummaryStatistics: Summary statistics without the rows falling in the region.
        """
        position_col = f.col("position")

        on_region_chromosome = f.col("chromosome") == region.chromosome
        within_region_span = (position_col >= region.start) & (
            position_col <= region.end
        )

        outside_region_df = self.df.filter(
            ~(on_region_chromosome & within_region_span)
        )
        return SummaryStatistics(
            _df=outside_region_df,
            _schema=SummaryStatistics.get_schema(),
        )

    def sanity_filter(self: SummaryStatistics) -> SummaryStatistics:
        """Apply sanity filters to the summary statistics.

        Rows are kept only if they satisfy all of the following:
            - The p-value should be less than 1.
            - The pValueMantissa should be greater than 0.
            - The beta should not be equal 0.
            - The p-value, beta and se should not be NaN.
            - The se should be positive.
            - The beta and se should not be infinite.

        Returns:
            SummaryStatistics: The filtered summary statistics.
        """
        beta_col = f.col("beta")
        standard_error_col = f.col("standardError")
        mantissa_col = f.col("pValueMantissa")
        exponent_col = f.col("pValueExponent")

        # Step 1: rows lacking any of the statistics are dropped.
        complete_df = self._df.dropna(
            subset=["beta", "standardError", "pValueMantissa", "pValueExponent"]
        )

        # Step 2: non-zero effect size and strictly positive standard error.
        effect_checked_df = complete_df.filter(
            (beta_col != 0) & (standard_error_col > 0)
        )

        # Step 3: the p-value, rebuilt as mantissa * 10^exponent, must be below 1
        # and the mantissa itself must be positive.
        reconstructed_pvalue = mantissa_col * (10**exponent_col)
        pvalue_checked_df = effect_checked_df.filter(
            (reconstructed_pvalue < 1) & (mantissa_col > 0)
        )

        # Step 4: infinite beta / standard error values are removed by the dataset helper.
        return SummaryStatistics(
            _df=pvalue_checked_df,
            _schema=SummaryStatistics.get_schema(),
        ).drop_infinity_values("beta", "standardError")

exclude_region(region: GenomicRegion) -> SummaryStatistics

Exclude a region from the summary stats dataset.

Parameters:

Name Type Description Default
region GenomicRegion

Genomic region to be excluded.

required

Returns:

Name Type Description
SummaryStatistics SummaryStatistics

filtered summary statistics.

Source code in src/gentropy/dataset/summary_statistics.py
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
def exclude_region(
    self: SummaryStatistics, region: GenomicRegion
) -> SummaryStatistics:
    """Drop every association located inside the given genomic region.

    A row is removed when its chromosome equals `region.chromosome` and its
    position lies between `region.start` and `region.end`, both ends inclusive.

    Args:
        region (GenomicRegion): Genomic region to be excluded.

    Returns:
        SummaryStatistics: Summary statistics without the rows falling in the region.
    """
    position_col = f.col("position")

    on_region_chromosome = f.col("chromosome") == region.chromosome
    within_region_span = (position_col >= region.start) & (
        position_col <= region.end
    )

    outside_region_df = self.df.filter(~(on_region_chromosome & within_region_span))
    return SummaryStatistics(
        _df=outside_region_df,
        _schema=SummaryStatistics.get_schema(),
    )

get_schema() -> StructType classmethod

Provides the schema for the SummaryStatistics dataset.

Returns:

Name Type Description
StructType StructType

Schema for the SummaryStatistics dataset

Source code in src/gentropy/dataset/summary_statistics.py
29
30
31
32
33
34
35
36
@classmethod
def get_schema(cls: type[SummaryStatistics]) -> StructType:
    """Return the Spark schema of the SummaryStatistics dataset.

    Returns:
        StructType: Schema parsed from `summary_statistics.json`.
    """
    schema = parse_spark_schema("summary_statistics.json")
    return schema

locus_breaker_clumping(baseline_pvalue_cutoff: float = LocusBreakerClumpingConfig.lbc_baseline_pvalue, distance_cutoff: int = LocusBreakerClumpingConfig.lbc_distance_cutoff, pvalue_cutoff: float = LocusBreakerClumpingConfig.lbc_pvalue_threshold, flanking_distance: int = LocusBreakerClumpingConfig.lbc_flanking_distance) -> StudyLocus

Generate study-locus from summary statistics using locus-breaker clumping method with locus boundaries.

For more info, see locus_breaker

Parameters:

Name Type Description Default
baseline_pvalue_cutoff float

Baseline significance we consider for the locus.

lbc_baseline_pvalue
distance_cutoff int

Distance in base pairs to be used for clumping.

lbc_distance_cutoff
pvalue_cutoff float

GWAS significance threshold.

lbc_pvalue_threshold
flanking_distance int

Flank distance in base pairs to be used for clumping.

lbc_flanking_distance

Returns:

Name Type Description
StudyLocus StudyLocus

Clumped study-locus optionally containing variants based on window. Check LocusBreakerClumpingConfig object for default values.

Source code in src/gentropy/dataset/summary_statistics.py
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
def locus_breaker_clumping(
    self: SummaryStatistics,
    baseline_pvalue_cutoff: float = LocusBreakerClumpingConfig.lbc_baseline_pvalue,
    distance_cutoff: int = LocusBreakerClumpingConfig.lbc_distance_cutoff,
    pvalue_cutoff: float = LocusBreakerClumpingConfig.lbc_pvalue_threshold,
    flanking_distance: int = LocusBreakerClumpingConfig.lbc_flanking_distance,
) -> StudyLocus:
    """Generate study-locus from summary statistics using locus-breaker clumping method with locus boundaries.

    All arguments are forwarded unchanged, in order, to `LocusBreakerClumping.locus_breaker`.
    Default values come from the `LocusBreakerClumpingConfig` object.

    For more info, see [`locus_breaker`][gentropy.method.locus_breaker_clumping.LocusBreakerClumping]

    Args:
        baseline_pvalue_cutoff (float, optional): Baseline significance we consider for the locus.
        distance_cutoff (int, optional): Distance in base pairs to be used for clumping.
        pvalue_cutoff (float, optional): GWAS significance threshold.
        flanking_distance (int, optional): Flank distance in base pairs to be used for clumping.

    Returns:
        StudyLocus: Clumped study-locus optionally containing variants based on window.
    """
    from gentropy.method.locus_breaker_clumping import LocusBreakerClumping

    clumping_parameters = (
        baseline_pvalue_cutoff,
        distance_cutoff,
        pvalue_cutoff,
        flanking_distance,
    )
    return LocusBreakerClumping.locus_breaker(self, *clumping_parameters)

pvalue_filter(pvalue: float) -> SummaryStatistics

Filter summary statistics based on the provided p-value threshold.

Parameters:

Name Type Description Default
pvalue float

upper limit of the p-value to be filtered upon.

required

Returns:

Name Type Description
SummaryStatistics SummaryStatistics

summary statistics object containing single point associations with p-values at least as significant as the provided threshold.

Source code in src/gentropy/dataset/summary_statistics.py
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
def pvalue_filter(self: SummaryStatistics, pvalue: float) -> SummaryStatistics:
    """Keep the associations whose p-value does not exceed the given threshold.

    The threshold is split into mantissa and exponent with `split_pvalue` and
    compared against the `pValueMantissa` / `pValueExponent` columns.

    Args:
        pvalue (float): Upper limit (inclusive) of the p-value to keep.

    Returns:
        SummaryStatistics: Summary statistics restricted to associations at least as significant as the threshold.
    """
    threshold_mantissa, threshold_exponent = split_pvalue(pvalue)

    exponent_col = f.col("pValueExponent")
    mantissa_col = f.col("pValueMantissa")

    # A row passes when its exponent is strictly below the threshold exponent ...
    below_threshold_exponent = exponent_col < threshold_exponent
    # ... or when the exponents match and the mantissa is not above the threshold mantissa.
    tied_exponent_within_mantissa = (exponent_col == threshold_exponent) & (
        mantissa_col <= threshold_mantissa
    )

    significant_df = self._df.filter(
        below_threshold_exponent | tied_exponent_within_mantissa
    )
    return SummaryStatistics(_df=significant_df, _schema=self._schema)

sanity_filter() -> SummaryStatistics

Apply basic sanity filters to the summary statistics.

Rows are kept only when they satisfy all of the following conditions:
  • The p-value should be less than 1.
  • The pValueMantissa should be greater than 0.
  • The beta should not be equal 0.
  • The p-value, beta and se should not be NaN.
  • The se should be positive.
  • The beta and se should not be infinite.

Returns:

Name Type Description
SummaryStatistics SummaryStatistics

The filtered summary statistics.

Source code in src/gentropy/dataset/summary_statistics.py
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
def sanity_filter(self: SummaryStatistics) -> SummaryStatistics:
    """Apply sanity filters to the summary statistics.

    Rows are kept only if they satisfy all of the following:
        - The p-value should be less than 1.
        - The pValueMantissa should be greater than 0.
        - The beta should not be equal 0.
        - The p-value, beta and se should not be NaN.
        - The se should be positive.
        - The beta and se should not be infinite.

    Returns:
        SummaryStatistics: The filtered summary statistics.
    """
    beta_col = f.col("beta")
    standard_error_col = f.col("standardError")
    mantissa_col = f.col("pValueMantissa")
    exponent_col = f.col("pValueExponent")

    # Step 1: rows lacking any of the statistics are dropped.
    complete_df = self._df.dropna(
        subset=["beta", "standardError", "pValueMantissa", "pValueExponent"]
    )

    # Step 2: non-zero effect size and strictly positive standard error.
    effect_checked_df = complete_df.filter((beta_col != 0) & (standard_error_col > 0))

    # Step 3: the p-value, rebuilt as mantissa * 10^exponent, must be below 1
    # and the mantissa itself must be positive.
    reconstructed_pvalue = mantissa_col * (10**exponent_col)
    pvalue_checked_df = effect_checked_df.filter(
        (reconstructed_pvalue < 1) & (mantissa_col > 0)
    )

    # Step 4: infinite beta / standard error values are removed by the dataset helper.
    return SummaryStatistics(
        _df=pvalue_checked_df,
        _schema=SummaryStatistics.get_schema(),
    ).drop_infinity_values("beta", "standardError")

window_based_clumping(distance: int = WindowBasedClumpingStepConfig().distance, gwas_significance: float = WindowBasedClumpingStepConfig().gwas_significance) -> StudyLocus

Generate study-locus from summary statistics using window-based clumping.

For more info, see WindowBasedClumping

Parameters:

Name Type Description Default
distance int

Distance in base pairs to be used for clumping. Defaults to 500_000.

distance
gwas_significance float

GWAS significance threshold. Defaults to 5e-8.

gwas_significance

Returns:

Name Type Description
StudyLocus StudyLocus

Clumped study-locus optionally containing variants based on window. Check WindowBasedClumpingStepConfig object for default values.

Source code in src/gentropy/dataset/summary_statistics.py
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
def window_based_clumping(
    self: SummaryStatistics,
    distance: int = WindowBasedClumpingStepConfig().distance,
    gwas_significance: float = WindowBasedClumpingStepConfig().gwas_significance,
) -> StudyLocus:
    """Generate study-locus from summary statistics using window-based clumping.

    The summary statistics are first filtered by `gwas_significance`, then clumped,
    and finally the clumped loci are filtered by the `WINDOW_CLUMPED` flag.
    Default values come from the `WindowBasedClumpingStepConfig` object.

    For more info, see [`WindowBasedClumping`][gentropy.method.window_based_clumping.WindowBasedClumping]

    Args:
        distance (int, optional): Distance in base pairs to be used for clumping. Defaults to `WindowBasedClumpingStepConfig().distance`.
        gwas_significance (float, optional): GWAS significance threshold. Defaults to `WindowBasedClumpingStepConfig().gwas_significance`.

    Returns:
        StudyLocus: Clumped study-locus optionally containing variants based on window.
    """
    from gentropy.method.window_based_clumping import WindowBasedClumping

    # Before clumping, only the associations passing the p-value threshold are kept.
    significant_associations = self.pvalue_filter(gwas_significance)
    clumped_loci = WindowBasedClumping.clump(
        significant_associations,
        distance=distance,
    )
    # After clumping, the loci are filtered by the flag.
    return clumped_loci.valid_rows(["WINDOW_CLUMPED"])

Schema

root
 |-- studyId: string (nullable = false)
 |-- variantId: string (nullable = false)
 |-- chromosome: string (nullable = false)
 |-- position: integer (nullable = false)
 |-- beta: double (nullable = false)
 |-- sampleSize: integer (nullable = true)
 |-- pValueMantissa: float (nullable = false)
 |-- pValueExponent: integer (nullable = false)
 |-- effectAlleleFrequencyFromSource: float (nullable = true)
 |-- standardError: double (nullable = true)