Skip to content

Summary Stats

gentropy.datasource.eqtl_catalogue.summary_stats.EqtlCatalogueSummaryStats dataclass

Summary statistics dataset for eQTL Catalogue.

Source code in src/gentropy/datasource/eqtl_catalogue/summary_stats.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
@dataclass
class EqtlCatalogueSummaryStats:
    """Summary statistics dataset for eQTL Catalogue.

    Groups the helpers that turn raw eQTL Catalogue summary statistics into a
    `SummaryStatistics` dataset.
    """

    @staticmethod
    def _full_study_id_regexp() -> Column:
        """Build the column expression for a full study ID.

        The ID is assembled from two pieces parsed out of the input file URI
        (project ID and QTL group) plus the row's `gene_id` column.

        Returns:
            Column: expression yielding `<project>_<qtl_group>_<gene_id>`.
        """
        # Example of a URI which is used for parsing:
        # "gs://genetics_etl_python_playground/input/preprocess/eqtl_catalogue/imported/GTEx_V8/ge/Adipose_Subcutaneous.tsv.gz".
        source_uri = f.input_file_name()
        separator = f.lit("_")

        # Project ID is the path component right after "imported/".  Example: "GTEx_V8".
        project_id = f.regexp_extract(source_uri, r"imported/([^/]+)/.*", 1)
        # QTL group is the file name without the ".tsv.gz" suffix.  Example: "Adipose_Subcutaneous".
        qtl_group = f.regexp_extract(source_uri, r"([^/]+)\.tsv\.gz", 1)
        # Gene ID comes straight from the data.  Example: "ENSG00000225630".
        gene_id = f.col("gene_id")

        # Resulting study ID example: "GTEx_V8_Adipose_Subcutaneous_ENSG00000225630".
        return f.concat(project_id, separator, qtl_group, separator, gene_id)

    @classmethod
    def from_source(
        cls: type[EqtlCatalogueSummaryStats],
        summary_stats_df: DataFrame,
    ) -> SummaryStatistics:
        """Convert raw eQTL Catalogue summary statistics into a `SummaryStatistics` dataset.

        Args:
            summary_stats_df (DataFrame): an ingested but unprocessed summary statistics dataframe from eQTL Catalogue.

        Returns:
            SummaryStatistics: a processed summary statistics dataframe for eQTL Catalogue.
        """
        # The same cast expression is used for the output column and for the row filter.
        position_as_int = f.col("position").cast(t.IntegerType())

        # Variant ID is built from the raw (uncast) columns, joined with underscores.
        variant_id = f.concat_ws(
            "_",
            f.col("chromosome"),
            f.col("position"),
            f.col("ref"),
            f.col("alt"),
        ).alias("variantId")

        output_columns = [
            # Study ID derived from the input file URI and the gene ID.
            cls._full_study_id_regexp().alias("studyId"),
            # Variant information.
            variant_id,
            f.col("chromosome"),
            position_as_int,
            # P-value, parsed into mantissa and exponent.
            *parse_pvalue(f.col("pvalue")),
            # Beta, standard error, and allele frequency information.
            f.col("beta").cast("double"),
            f.col("se").cast("double").alias("standardError"),
            f.col("maf").cast("float").alias("effectAlleleFrequencyFromSource"),
        ]

        # Keep only rows with a proper position and a non-zero beta value.
        has_valid_position = position_as_int.isNotNull()
        has_nonzero_beta = f.col("beta") != 0
        processed_df = summary_stats_df.select(*output_columns).filter(
            has_valid_position & has_nonzero_beta
        )

        # Wrap the processed dataframe in a summary statistics object.
        return SummaryStatistics(
            _df=processed_df,
            _schema=SummaryStatistics.get_schema(),
        )

from_source(summary_stats_df: DataFrame) -> SummaryStatistics classmethod

Ingests all summary stats for all eQTL Catalogue studies.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `summary_stats_df` | `DataFrame` | An ingested but unprocessed summary statistics dataframe from eQTL Catalogue. | required |

Returns:

| Name | Type | Description |
| --- | --- | --- |
| `SummaryStatistics` | `SummaryStatistics` | A processed summary statistics dataframe for eQTL Catalogue. |

Source code in src/gentropy/datasource/eqtl_catalogue/summary_stats.py
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
@classmethod
def from_source(
    cls: type[EqtlCatalogueSummaryStats],
    summary_stats_df: DataFrame,
) -> SummaryStatistics:
    """Convert raw eQTL Catalogue summary statistics into a `SummaryStatistics` dataset.

    Args:
        summary_stats_df (DataFrame): an ingested but unprocessed summary statistics dataframe from eQTL Catalogue.

    Returns:
        SummaryStatistics: a processed summary statistics dataframe for eQTL Catalogue.
    """
    # The same cast expression is used for the output column and for the row filter.
    position_as_int = f.col("position").cast(t.IntegerType())

    # Variant ID is built from the raw (uncast) columns, joined with underscores.
    variant_id = f.concat_ws(
        "_",
        f.col("chromosome"),
        f.col("position"),
        f.col("ref"),
        f.col("alt"),
    ).alias("variantId")

    output_columns = [
        # Study ID built by the class's own study-ID expression helper.
        cls._full_study_id_regexp().alias("studyId"),
        # Variant information.
        variant_id,
        f.col("chromosome"),
        position_as_int,
        # P-value, parsed into mantissa and exponent.
        *parse_pvalue(f.col("pvalue")),
        # Beta, standard error, and allele frequency information.
        f.col("beta").cast("double"),
        f.col("se").cast("double").alias("standardError"),
        f.col("maf").cast("float").alias("effectAlleleFrequencyFromSource"),
    ]

    # Keep only rows with a proper position and a non-zero beta value.
    has_valid_position = position_as_int.isNotNull()
    has_nonzero_beta = f.col("beta") != 0
    processed_df = summary_stats_df.select(*output_columns).filter(
        has_valid_position & has_nonzero_beta
    )

    # Wrap the processed dataframe in a summary statistics object.
    return SummaryStatistics(
        _df=processed_df,
        _schema=SummaryStatistics.get_schema(),
    )