Skip to content

Summary Statistics

gentropy.dataset.summary_statistics.SummaryStatistics dataclass

Bases: Dataset

Summary Statistics dataset.

A summary statistics dataset contains all single point statistics resulting from a GWAS.

Source code in src/gentropy/dataset/summary_statistics.py
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
@dataclass
class SummaryStatistics(Dataset):
    """Summary Statistics dataset.

    Holds every single point association statistic produced by a GWAS.
    """

    @classmethod
    def get_schema(cls: type[SummaryStatistics]) -> StructType:
        """Provide the Spark schema of the SummaryStatistics dataset.

        The schema is parsed from the `summary_statistics.json` definition.

        Returns:
            StructType: Schema for the SummaryStatistics dataset
        """
        schema = parse_spark_schema("summary_statistics.json")
        return schema

    def pvalue_filter(self: SummaryStatistics, pvalue: float) -> SummaryStatistics:
        """Keep only the associations whose p-value does not exceed the given threshold.

        The threshold is split into mantissa and exponent, and rows are compared on
        the exponent first, falling back to the mantissa when the exponents are equal.

        Args:
            pvalue (float): upper limit of the p-value to be filtered upon.

        Returns:
            SummaryStatistics: summary statistics object containing single point associations with p-values at least as significant as the provided threshold.
        """
        threshold_mantissa, threshold_exponent = split_pvalue(pvalue)

        exponent_col = f.col("pValueExponent")
        mantissa_col = f.col("pValueMantissa")

        # A strictly smaller exponent passes regardless of the mantissa.
        smaller_exponent = exponent_col < threshold_exponent
        # With identical exponents the mantissa decides.
        same_exponent_smaller_mantissa = (exponent_col == threshold_exponent) & (
            mantissa_col <= threshold_mantissa
        )

        significant = self._df.filter(
            smaller_exponent | same_exponent_smaller_mantissa
        )
        return SummaryStatistics(_df=significant, _schema=self._schema)

    def window_based_clumping(
        self: SummaryStatistics,
        distance: int = 500_000,
        gwas_significance: float = 5e-8,
    ) -> StudyLocus:
        """Derive a study-locus dataset from these summary statistics by window-based clumping.

        The work is delegated to `WindowBasedClumping.clump`.

        For more info, see [`WindowBasedClumping`][gentropy.method.window_based_clumping.WindowBasedClumping]

        Args:
            distance (int): Distance in base pairs to be used for clumping. Defaults to 500_000.
            gwas_significance (float, optional): GWAS significance threshold. Defaults to 5e-8.

        Returns:
            StudyLocus: Clumped study-locus optionally containing variants based on window.
        """
        # Imported at call time rather than at module level
        # (presumably to avoid a circular import - TODO confirm).
        from gentropy.method.window_based_clumping import WindowBasedClumping

        study_locus = WindowBasedClumping.clump(
            self,
            distance=distance,
            gwas_significance=gwas_significance,
        )
        return study_locus

    def exclude_region(self: SummaryStatistics, region: str) -> SummaryStatistics:
        """Drop every association that falls inside the given genomic region.

        A row is removed when its chromosome matches the region and its position lies
        between the region start and end, both bounds included.

        Args:
            region (str): region given in "chr##:#####-####" format

        Returns:
            SummaryStatistics: filtered summary statistics.
        """
        chromosome, start_position, end_position = parse_region(region)

        on_chromosome = f.col("chromosome") == chromosome
        within_bounds = (f.col("position") >= start_position) & (
            f.col("position") <= end_position
        )

        # Keep everything that is NOT inside the region.
        outside_region = self.df.filter(~(on_chromosome & within_bounds))
        return SummaryStatistics(
            _df=outside_region,
            _schema=SummaryStatistics.get_schema(),
        )

    def sanity_filter(self: SummaryStatistics) -> SummaryStatistics:
        """Filter the summary statistics with basic sanity checks.

        Rows are kept only when all of the following hold:
            - The p-value is not equal to 1.
            - The beta and se are not equal to 0.
            - The p-value (mantissa and exponent), beta and se are not missing (null or NaN).

        Returns:
            SummaryStatistics: The filtered summary statistics.
        """
        required_columns = [
            "beta",
            "standardError",
            "pValueMantissa",
            "pValueExponent",
        ]
        non_zero_effect = (f.col("beta") != 0) & (f.col("standardError") != 0)
        # Rebuild the p-value as mantissa * 10^exponent to compare it against 1.
        pvalue_not_one = (
            f.col("pValueMantissa") * 10 ** f.col("pValueExponent") != 1
        )

        cleaned = (
            self._df.dropna(subset=required_columns)
            .filter(non_zero_effect)
            .filter(pvalue_not_one)
        )

        return SummaryStatistics(
            _df=cleaned,
            _schema=SummaryStatistics.get_schema(),
        )

exclude_region(region: str) -> SummaryStatistics

Exclude a region from the summary stats dataset.

Parameters:

Name Type Description Default
region str

region given in "chr##:#####-####" format

required

Returns:

Name Type Description
SummaryStatistics SummaryStatistics

filtered summary statistics.

Source code in src/gentropy/dataset/summary_statistics.py
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
def exclude_region(self: SummaryStatistics, region: str) -> SummaryStatistics:
    """Drop every association that falls inside the given genomic region.

    A row is removed when its chromosome matches the region and its position lies
    between the region start and end, both bounds included.

    Args:
        region (str): region given in "chr##:#####-####" format

    Returns:
        SummaryStatistics: filtered summary statistics.
    """
    chromosome, start_position, end_position = parse_region(region)

    on_chromosome = f.col("chromosome") == chromosome
    within_bounds = (f.col("position") >= start_position) & (
        f.col("position") <= end_position
    )

    # Keep everything that is NOT inside the region.
    outside_region = self.df.filter(~(on_chromosome & within_bounds))
    return SummaryStatistics(
        _df=outside_region,
        _schema=SummaryStatistics.get_schema(),
    )

get_schema() -> StructType classmethod

Provides the schema for the SummaryStatistics dataset.

Returns:

Name Type Description
StructType StructType

Schema for the SummaryStatistics dataset

Source code in src/gentropy/dataset/summary_statistics.py
26
27
28
29
30
31
32
33
@classmethod
def get_schema(cls: type[SummaryStatistics]) -> StructType:
    """Provide the Spark schema of the SummaryStatistics dataset.

    The schema is parsed from the `summary_statistics.json` definition.

    Returns:
        StructType: Schema for the SummaryStatistics dataset
    """
    schema = parse_spark_schema("summary_statistics.json")
    return schema

pvalue_filter(pvalue: float) -> SummaryStatistics

Filter summary statistics based on the provided p-value threshold.

Parameters:

Name Type Description Default
pvalue float

upper limit of the p-value to be filtered upon.

required

Returns:

Name Type Description
SummaryStatistics SummaryStatistics

summary statistics object containing single point associations with p-values at least as significant as the provided threshold.

Source code in src/gentropy/dataset/summary_statistics.py
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
def pvalue_filter(self: SummaryStatistics, pvalue: float) -> SummaryStatistics:
    """Keep only the associations whose p-value does not exceed the given threshold.

    The threshold is split into mantissa and exponent, and rows are compared on
    the exponent first, falling back to the mantissa when the exponents are equal.

    Args:
        pvalue (float): upper limit of the p-value to be filtered upon.

    Returns:
        SummaryStatistics: summary statistics object containing single point associations with p-values at least as significant as the provided threshold.
    """
    threshold_mantissa, threshold_exponent = split_pvalue(pvalue)

    exponent_col = f.col("pValueExponent")
    mantissa_col = f.col("pValueMantissa")

    # A strictly smaller exponent passes regardless of the mantissa.
    smaller_exponent = exponent_col < threshold_exponent
    # With identical exponents the mantissa decides.
    same_exponent_smaller_mantissa = (exponent_col == threshold_exponent) & (
        mantissa_col <= threshold_mantissa
    )

    significant = self._df.filter(smaller_exponent | same_exponent_smaller_mantissa)
    return SummaryStatistics(_df=significant, _schema=self._schema)

sanity_filter() -> SummaryStatistics

The function filters the summary statistics by sanity filters.

The function filters the summary statistics by the following filters:
  • The p-value should not be equal to 1.
  • The beta and se should not be equal to 0.
  • The p-value, beta and se should not be NaN.

Returns:

Name Type Description
SummaryStatistics SummaryStatistics

The filtered summary statistics.

Source code in src/gentropy/dataset/summary_statistics.py
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
def sanity_filter(self: SummaryStatistics) -> SummaryStatistics:
    """Filter the summary statistics with basic sanity checks.

    Rows are kept only when all of the following hold:
        - The p-value is not equal to 1.
        - The beta and se are not equal to 0.
        - The p-value (mantissa and exponent), beta and se are not missing (null or NaN).

    Returns:
        SummaryStatistics: The filtered summary statistics.
    """
    required_columns = [
        "beta",
        "standardError",
        "pValueMantissa",
        "pValueExponent",
    ]
    non_zero_effect = (f.col("beta") != 0) & (f.col("standardError") != 0)
    # Rebuild the p-value as mantissa * 10^exponent to compare it against 1.
    pvalue_not_one = f.col("pValueMantissa") * 10 ** f.col("pValueExponent") != 1

    cleaned = (
        self._df.dropna(subset=required_columns)
        .filter(non_zero_effect)
        .filter(pvalue_not_one)
    )

    return SummaryStatistics(
        _df=cleaned,
        _schema=SummaryStatistics.get_schema(),
    )

window_based_clumping(distance: int = 500000, gwas_significance: float = 5e-08) -> StudyLocus

Generate study-locus from summary statistics using window-based clumping.

For more info, see WindowBasedClumping

Parameters:

Name Type Description Default
distance int

Distance in base pairs to be used for clumping. Defaults to 500_000.

500000
gwas_significance float

GWAS significance threshold. Defaults to 5e-8.

5e-08

Returns:

Name Type Description
StudyLocus StudyLocus

Clumped study-locus optionally containing variants based on window.

Source code in src/gentropy/dataset/summary_statistics.py
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
def window_based_clumping(
    self: SummaryStatistics,
    distance: int = 500_000,
    gwas_significance: float = 5e-8,
) -> StudyLocus:
    """Derive a study-locus dataset from these summary statistics by window-based clumping.

    The work is delegated to `WindowBasedClumping.clump`.

    For more info, see [`WindowBasedClumping`][gentropy.method.window_based_clumping.WindowBasedClumping]

    Args:
        distance (int): Distance in base pairs to be used for clumping. Defaults to 500_000.
        gwas_significance (float, optional): GWAS significance threshold. Defaults to 5e-8.

    Returns:
        StudyLocus: Clumped study-locus optionally containing variants based on window.
    """
    # Imported at call time rather than at module level
    # (presumably to avoid a circular import - TODO confirm).
    from gentropy.method.window_based_clumping import WindowBasedClumping

    study_locus = WindowBasedClumping.clump(
        self,
        distance=distance,
        gwas_significance=gwas_significance,
    )
    return study_locus

Schema

root
 |-- studyId: string (nullable = false)
 |-- variantId: string (nullable = false)
 |-- chromosome: string (nullable = false)
 |-- position: integer (nullable = false)
 |-- beta: double (nullable = false)
 |-- sampleSize: integer (nullable = true)
 |-- pValueMantissa: float (nullable = false)
 |-- pValueExponent: integer (nullable = false)
 |-- effectAlleleFrequencyFromSource: float (nullable = true)
 |-- standardError: double (nullable = true)