Summary Statistics

`gentropy.dataset.summary_statistics.SummaryStatistics` `dataclass` ¶

Bases: Dataset

Summary Statistics dataset.

A summary statistics dataset contains all single point statistics resulting from a GWAS.

Source code in src/gentropy/dataset/summary_statistics.py

@dataclass
class SummaryStatistics(Dataset):
    """Summary Statistics dataset.

    A summary statistics dataset contains all single point statistics resulting from a GWAS.
    """

    @classmethod
    def get_schema(cls: type[SummaryStatistics]) -> StructType:
        """Return the Spark schema describing a SummaryStatistics dataset.

        Returns:
            StructType: Schema parsed from `summary_statistics.json`.
        """
        schema_file = "summary_statistics.json"
        return parse_spark_schema(schema_file)

    def pvalue_filter(self: SummaryStatistics, pvalue: float) -> SummaryStatistics:
        """Keep only the associations whose p-value does not exceed the threshold.

        The threshold is split into a mantissa and an exponent, which are compared
        against the `pValueMantissa` and `pValueExponent` columns.

        Args:
            pvalue (float): upper limit of the p-value to be filtered upon.

        Returns:
            SummaryStatistics: summary statistics object containing single point associations with p-values at least as significant as the provided threshold.
        """
        threshold_mantissa, threshold_exponent = split_pvalue(pvalue)

        exponent_col = f.col("pValueExponent")
        mantissa_col = f.col("pValueMantissa")

        # Rows with a smaller exponent pass regardless of their mantissa.
        smaller_exponent = exponent_col < threshold_exponent
        # Rows with the same exponent pass only if the mantissa is not larger.
        same_exponent_within_mantissa = (exponent_col == threshold_exponent) & (
            mantissa_col <= threshold_mantissa
        )

        return SummaryStatistics(
            _df=self._df.filter(smaller_exponent | same_exponent_within_mantissa),
            _schema=self._schema,
        )

    def window_based_clumping(
        self: SummaryStatistics,
        distance: int = 500_000,
        gwas_significance: float = 5e-8,
    ) -> StudyLocus:
        """Derive a study-locus dataset from these summary statistics by window-based clumping.

        For more info, see [`WindowBasedClumping`][gentropy.method.window_based_clumping.WindowBasedClumping]

        Args:
            distance (int): Distance in base pairs to be used for clumping. Defaults to 500_000.
            gwas_significance (float, optional): GWAS significance threshold. Defaults to 5e-8.

        Returns:
            StudyLocus: Clumped study-locus optionally containing variants based on window.
        """
        # Imported at call time rather than at module level
        # (presumably to avoid a circular import - confirm).
        from gentropy.method.window_based_clumping import WindowBasedClumping

        clumping_params = {
            "distance": distance,
            "gwas_significance": gwas_significance,
        }
        return WindowBasedClumping.clump(self, **clumping_params)

    def exclude_region(self: SummaryStatistics, region: str) -> SummaryStatistics:
        """Drop every association that falls inside the given genomic region.

        A row is removed when its chromosome matches the region and its position
        lies between the region start and end, both boundaries included.

        Args:
            region (str): region given in "chr##:#####-####" format

        Returns:
            SummaryStatistics: filtered summary statistics.
        """
        chromosome, start_position, end_position = parse_region(region)

        on_chromosome = f.col("chromosome") == chromosome
        within_bounds = (f.col("position") >= start_position) & (
            f.col("position") <= end_position
        )
        outside_region = ~(on_chromosome & within_bounds)

        return SummaryStatistics(
            _df=self.df.filter(outside_region),
            _schema=SummaryStatistics.get_schema(),
        )

    def sanity_filter(self: SummaryStatistics) -> SummaryStatistics:
        """Remove associations that fail basic sanity checks.

        The following filters are applied, in this order:
            - Rows with a missing beta, standard error, p-value mantissa or p-value exponent are dropped.
            - The beta and the standard error should not be equal to 0.
            - The p-value (mantissa * 10 ** exponent) should not be equal to 1.

        Returns:
            SummaryStatistics: The filtered summary statistics.
        """
        required_columns = ["beta", "standardError", "pValueMantissa", "pValueExponent"]

        nonzero_effect = (f.col("beta") != 0) & (f.col("standardError") != 0)
        # Rebuild the p-value from its two components before comparing it to 1.
        pvalue = f.col("pValueMantissa") * 10 ** f.col("pValueExponent")

        filtered_df = (
            self._df.dropna(subset=required_columns)
            .filter(nonzero_effect)
            .filter(pvalue != 1)
        )

        return SummaryStatistics(
            _df=filtered_df,
            _schema=SummaryStatistics.get_schema(),
        )

`exclude_region(region: str) -> SummaryStatistics` ¶

Exclude a region from the summary stats dataset.

Parameters:

Name	Type	Description	Default
`region`	`str`	region given in "chr##:#####-####" format	required

Returns:

Name	Type	Description
`SummaryStatistics`	`SummaryStatistics`	filtered summary statistics.

Source code in src/gentropy/dataset/summary_statistics.py

def exclude_region(self: SummaryStatistics, region: str) -> SummaryStatistics:
    """Drop every association that falls inside the given genomic region.

    A row is removed when its chromosome matches the region and its position
    lies between the region start and end, both boundaries included.

    Args:
        region (str): region given in "chr##:#####-####" format

    Returns:
        SummaryStatistics: filtered summary statistics.
    """
    chromosome, start_position, end_position = parse_region(region)

    on_chromosome = f.col("chromosome") == chromosome
    within_bounds = (f.col("position") >= start_position) & (
        f.col("position") <= end_position
    )
    outside_region = ~(on_chromosome & within_bounds)

    return SummaryStatistics(
        _df=self.df.filter(outside_region),
        _schema=SummaryStatistics.get_schema(),
    )

`get_schema() -> StructType` `classmethod` ¶

Provides the schema for the SummaryStatistics dataset.

Returns:

Name	Type	Description
`StructType`	`StructType`	Schema for the SummaryStatistics dataset

Source code in src/gentropy/dataset/summary_statistics.py

@classmethod
def get_schema(cls: type[SummaryStatistics]) -> StructType:
    """Return the Spark schema describing a SummaryStatistics dataset.

    Returns:
        StructType: Schema parsed from `summary_statistics.json`.
    """
    schema_file = "summary_statistics.json"
    return parse_spark_schema(schema_file)

`pvalue_filter(pvalue: float) -> SummaryStatistics` ¶

Filter summary statistics based on the provided p-value threshold.

Parameters:

Name	Type	Description	Default
`pvalue`	`float`	upper limit of the p-value to be filtered upon.	required

Returns:

Name	Type	Description
`SummaryStatistics`	`SummaryStatistics`	summary statistics object containing single point associations with p-values at least as significant as the provided threshold.

Source code in src/gentropy/dataset/summary_statistics.py

def pvalue_filter(self: SummaryStatistics, pvalue: float) -> SummaryStatistics:
    """Keep only the associations whose p-value does not exceed the threshold.

    The threshold is split into a mantissa and an exponent, which are compared
    against the `pValueMantissa` and `pValueExponent` columns.

    Args:
        pvalue (float): upper limit of the p-value to be filtered upon.

    Returns:
        SummaryStatistics: summary statistics object containing single point associations with p-values at least as significant as the provided threshold.
    """
    threshold_mantissa, threshold_exponent = split_pvalue(pvalue)

    exponent_col = f.col("pValueExponent")
    mantissa_col = f.col("pValueMantissa")

    # Rows with a smaller exponent pass regardless of their mantissa.
    smaller_exponent = exponent_col < threshold_exponent
    # Rows with the same exponent pass only if the mantissa is not larger.
    same_exponent_within_mantissa = (exponent_col == threshold_exponent) & (
        mantissa_col <= threshold_mantissa
    )

    return SummaryStatistics(
        _df=self._df.filter(smaller_exponent | same_exponent_within_mantissa),
        _schema=self._schema,
    )

`sanity_filter() -> SummaryStatistics` ¶

The function filters the summary statistics by sanity filters.

The function filters the summary statistics by the following filters

The p-value should not be equal to 1.
The beta and se should not be equal to 0.
The p-value, beta and se should not be NaN.

Returns:

Name	Type	Description
`SummaryStatistics`	`SummaryStatistics`	The filtered summary statistics.

Source code in src/gentropy/dataset/summary_statistics.py

def sanity_filter(self: SummaryStatistics) -> SummaryStatistics:
    """Remove associations that fail basic sanity checks.

    The following filters are applied, in this order:
        - Rows with a missing beta, standard error, p-value mantissa or p-value exponent are dropped.
        - The beta and the standard error should not be equal to 0.
        - The p-value (mantissa * 10 ** exponent) should not be equal to 1.

    Returns:
        SummaryStatistics: The filtered summary statistics.
    """
    required_columns = ["beta", "standardError", "pValueMantissa", "pValueExponent"]

    nonzero_effect = (f.col("beta") != 0) & (f.col("standardError") != 0)
    # Rebuild the p-value from its two components before comparing it to 1.
    pvalue = f.col("pValueMantissa") * 10 ** f.col("pValueExponent")

    filtered_df = (
        self._df.dropna(subset=required_columns)
        .filter(nonzero_effect)
        .filter(pvalue != 1)
    )

    return SummaryStatistics(
        _df=filtered_df,
        _schema=SummaryStatistics.get_schema(),
    )

`window_based_clumping(distance: int = 500000, gwas_significance: float = 5e-08) -> StudyLocus` ¶

Generate study-locus from summary statistics using window-based clumping.

For more info, see WindowBasedClumping

Parameters:

Name	Type	Description	Default
`distance`	`int`	Distance in base pairs to be used for clumping. Defaults to 500_000.	`500000`
`gwas_significance`	`float`	GWAS significance threshold. Defaults to 5e-8.	`5e-08`

Returns:

Name	Type	Description
`StudyLocus`	`StudyLocus`	Clumped study-locus optionally containing variants based on window.

Source code in src/gentropy/dataset/summary_statistics.py

def window_based_clumping(
    self: SummaryStatistics,
    distance: int = 500_000,
    gwas_significance: float = 5e-8,
) -> StudyLocus:
    """Derive a study-locus dataset from these summary statistics by window-based clumping.

    For more info, see [`WindowBasedClumping`][gentropy.method.window_based_clumping.WindowBasedClumping]

    Args:
        distance (int): Distance in base pairs to be used for clumping. Defaults to 500_000.
        gwas_significance (float, optional): GWAS significance threshold. Defaults to 5e-8.

    Returns:
        StudyLocus: Clumped study-locus optionally containing variants based on window.
    """
    # Imported at call time rather than at module level
    # (presumably to avoid a circular import - confirm).
    from gentropy.method.window_based_clumping import WindowBasedClumping

    clumping_params = {
        "distance": distance,
        "gwas_significance": gwas_significance,
    }
    return WindowBasedClumping.clump(self, **clumping_params)

Schema¶

root
 |-- studyId: string (nullable = false)
 |-- variantId: string (nullable = false)
 |-- chromosome: string (nullable = false)
 |-- position: integer (nullable = false)
 |-- beta: double (nullable = false)
 |-- sampleSize: integer (nullable = true)
 |-- pValueMantissa: float (nullable = false)
 |-- pValueExponent: integer (nullable = false)
 |-- effectAlleleFrequencyFromSource: float (nullable = true)
 |-- standardError: double (nullable = true)

2023-03-30
2024-01-18
Contributors

Summary Statistics

gentropy.dataset.summary_statistics.SummaryStatistics dataclass ¶

exclude_region(region: str) -> SummaryStatistics ¶

get_schema() -> StructType classmethod ¶

pvalue_filter(pvalue: float) -> SummaryStatistics ¶

sanity_filter() -> SummaryStatistics ¶

window_based_clumping(distance: int = 500000, gwas_significance: float = 5e-08) -> StudyLocus ¶

Schema¶

`gentropy.dataset.summary_statistics.SummaryStatistics` `dataclass` ¶

`exclude_region(region: str) -> SummaryStatistics` ¶

`get_schema() -> StructType` `classmethod` ¶

`pvalue_filter(pvalue: float) -> SummaryStatistics` ¶

`sanity_filter() -> SummaryStatistics` ¶

`window_based_clumping(distance: int = 500000, gwas_significance: float = 5e-08) -> StudyLocus` ¶