Skip to content

Summary Statistics

gentropy.datasource.ukb_ppp_eur.summary_stats.UkbPppEurSummaryStats dataclass

Summary statistics dataset for UKB PPP (EUR).

Source code in src/gentropy/datasource/ukb_ppp_eur/summary_stats.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
@dataclass
class UkbPppEurSummaryStats:
    """UKB PPP (EUR) summary statistics datasource.

    Provides one alternate constructor, `from_source`, which runs
    `harmonise_summary_stats` for a chromosome and wraps the result in a
    `SummaryStatistics` dataset.
    """

    @classmethod
    def from_source(
        cls: type[UkbPppEurSummaryStats],
        spark: SparkSession,
        raw_summary_stats_path: str,
        tmp_variant_annotation_path: str,
        chromosome: str,
        study_index_path: str,
    ) -> SummaryStatistics:
        """Harmonise the UKB PPP (EUR) summary statistics for one chromosome.

        Args:
            spark (SparkSession): Spark session object.
            raw_summary_stats_path (str): Path to the raw summary statistics input.
            tmp_variant_annotation_path (str): Path to the variant annotation dataset.
            chromosome (str): Chromosome to process.
            study_index_path (str): Path to the study index. Part of the signature, but not referenced in the body of this method.

        Returns:
            SummaryStatistics: Harmonised summary statistics for the requested chromosome.
        """
        # Column names handed to harmonise_summary_stats, keyed by the keyword
        # argument each one is passed as.
        source_columns = {
            "colname_position": "GENPOS",
            "colname_allele0": "ALLELE0",
            "colname_allele1": "ALLELE1",
            "colname_a1freq": "A1FREQ",
            "colname_info": "INFO",
            "colname_beta": "BETA",
            "colname_se": "SE",
            "colname_mlog10p": "LOG10P",
            "colname_n": "N",
        }
        harmonised = harmonise_summary_stats(
            spark,
            raw_summary_stats_path,
            tmp_variant_annotation_path,
            chromosome,
            **source_columns,
        )

        # Wrap the harmonised dataframe together with the dataset schema.
        return SummaryStatistics(_df=harmonised, _schema=SummaryStatistics.get_schema())

from_source(spark: SparkSession, raw_summary_stats_path: str, tmp_variant_annotation_path: str, chromosome: str, study_index_path: str) -> SummaryStatistics classmethod

Ingest and harmonise all summary stats for UKB PPP (EUR) data.

Parameters:

Name Type Description Default
spark SparkSession

Spark session object.

required
raw_summary_stats_path str

Input raw summary stats path.

required
tmp_variant_annotation_path str

Input variant annotation dataset path.

required
chromosome str

Which chromosome to process.

required
study_index_path str

The path to the study index, intended for populating the sample size column in some cases. Note: the implementation shown on this page does not reference this argument.

required

Returns:

Name Type Description
SummaryStatistics SummaryStatistics

Processed summary statistics dataset for a given chromosome.

Source code in src/gentropy/datasource/ukb_ppp_eur/summary_stats.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
@classmethod
def from_source(
    cls: type[UkbPppEurSummaryStats],
    spark: SparkSession,
    raw_summary_stats_path: str,
    tmp_variant_annotation_path: str,
    chromosome: str,
    study_index_path: str,
) -> SummaryStatistics:
    """Harmonise the UKB PPP (EUR) summary statistics for one chromosome.

    Args:
        spark (SparkSession): Spark session object.
        raw_summary_stats_path (str): Path to the raw summary statistics input.
        tmp_variant_annotation_path (str): Path to the variant annotation dataset.
        chromosome (str): Chromosome to process.
        study_index_path (str): Path to the study index. Part of the signature, but not referenced in the body of this method.

    Returns:
        SummaryStatistics: Harmonised summary statistics for the requested chromosome.
    """
    # Column names handed to harmonise_summary_stats, keyed by the keyword
    # argument each one is passed as.
    source_columns = {
        "colname_position": "GENPOS",
        "colname_allele0": "ALLELE0",
        "colname_allele1": "ALLELE1",
        "colname_a1freq": "A1FREQ",
        "colname_info": "INFO",
        "colname_beta": "BETA",
        "colname_se": "SE",
        "colname_mlog10p": "LOG10P",
        "colname_n": "N",
    }
    harmonised = harmonise_summary_stats(
        spark,
        raw_summary_stats_path,
        tmp_variant_annotation_path,
        chromosome,
        **source_columns,
    )

    # Wrap the harmonised dataframe together with the dataset schema.
    return SummaryStatistics(_df=harmonised, _schema=SummaryStatistics.get_schema())