Skip to content

Summary Statistics Quality Control

gentropy.dataset.summary_statistics_qc.SummaryStatisticsQC dataclass

Bases: Dataset

Summary Statistics Quality Controls dataset.

Examples:

>>> data = [("S1", 0.45, 6.78, 8.47, 0.55, 2, 1), ("S2", 0.26, -2.15, 4.38, 0.04, 2, 0)]
>>> df = spark.createDataFrame(data, schema=SummaryStatisticsQC.get_schema())
>>> qc = SummaryStatisticsQC(_df=df)
>>> isinstance(qc, SummaryStatisticsQC)
True
>>> qc.df.show()
+-------+---------+------------+----------+---------+----------+--------------+
|studyId|mean_beta|mean_diff_pz|se_diff_pz|gc_lambda|n_variants|n_variants_sig|
+-------+---------+------------+----------+---------+----------+--------------+
|     S1|     0.45|        6.78|      8.47|     0.55|         2|             1|
|     S2|     0.26|       -2.15|      4.38|     0.04|         2|             0|
+-------+---------+------------+----------+---------+----------+--------------+
Source code in src/gentropy/dataset/summary_statistics_qc.py
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
@dataclass
class SummaryStatisticsQC(Dataset):
    """Dataset of per-study quality control metrics for summary statistics.

    Every row describes one study (keyed by `studyId`) together with its QC metrics.

    Examples:
        >>> data = [("S1", 0.45, 6.78, 8.47, 0.55, 2, 1), ("S2", 0.26, -2.15, 4.38, 0.04, 2, 0)]
        >>> df = spark.createDataFrame(data, schema=SummaryStatisticsQC.get_schema())
        >>> qc = SummaryStatisticsQC(_df=df)
        >>> isinstance(qc, SummaryStatisticsQC)
        True
        >>> qc.df.show()
        +-------+---------+------------+----------+---------+----------+--------------+
        |studyId|mean_beta|mean_diff_pz|se_diff_pz|gc_lambda|n_variants|n_variants_sig|
        +-------+---------+------------+----------+---------+----------+--------------+
        |     S1|     0.45|        6.78|      8.47|     0.55|         2|             1|
        |     S2|     0.26|       -2.15|      4.38|     0.04|         2|             0|
        +-------+---------+------------+----------+---------+----------+--------------+
        <BLANKLINE>
    """

    @classmethod
    def get_schema(cls: type[SummaryStatisticsQC]) -> StructType:
        """Return the Spark schema of the SummaryStatisticsQC dataset.

        Returns:
            StructType: Schema parsed from the `summary_statistics_qc.json` definition.
        """
        schema_file = "summary_statistics_qc.json"
        return parse_spark_schema(schema_file)

    @classmethod
    def from_summary_statistics(
        cls: type[SummaryStatisticsQC],
        gwas: SummaryStatistics,
        pval_threshold: float = 1e-8,
    ) -> SummaryStatisticsQC:
        """Build the QC dataset by running every QC check on the summary statistics.

        Each check is applied to `gwas.df` and the per-check results are combined
        with an outer join on `studyId`.

        Args:
            gwas (SummaryStatistics): The instance of the SummaryStatistics class.
            pval_threshold (float): The p-value threshold passed to the variant-counting check. Default is 1e-8.

        Returns:
            SummaryStatisticsQC: Dataset with quality control metrics for the summary statistics.

        Examples:
            >>> from pyspark.sql import functions as f
            >>> s = 'studyId STRING, variantId STRING, chromosome STRING, position INT, beta DOUBLE, standardError DOUBLE, pValueMantissa FLOAT, pValueExponent INTEGER'
            >>> v1 = [("S1", "1_10000_A_T", "1", 10000, 1.0, 0.2, 9.9, -20), ("S1", "X_10001_C_T", "X", 10001, -0.1, 0.2, 1.0, -1)]
            >>> v2 = [("S2", "1_10001_C_T", "1", 10001, 0.028, 0.2, 1.0, -1), ("S2", "1_10002_G_C", "1", 10002, 0.5, 0.1, 1.0, -1)]
            >>> df = spark.createDataFrame(v1 + v2, s)
            >>> df.show()
            +-------+-----------+----------+--------+-----+-------------+--------------+--------------+
            |studyId|  variantId|chromosome|position| beta|standardError|pValueMantissa|pValueExponent|
            +-------+-----------+----------+--------+-----+-------------+--------------+--------------+
            |     S1|1_10000_A_T|         1|   10000|  1.0|          0.2|           9.9|           -20|
            |     S1|X_10001_C_T|         X|   10001| -0.1|          0.2|           1.0|            -1|
            |     S2|1_10001_C_T|         1|   10001|0.028|          0.2|           1.0|            -1|
            |     S2|1_10002_G_C|         1|   10002|  0.5|          0.1|           1.0|            -1|
            +-------+-----------+----------+--------+-----+-------------+--------------+--------------+
            <BLANKLINE>

            **This method outputs one row per study with the metrics: mean beta, mean diff pz, se diff pz, gc lambda, n variants and n variants sig.**

            >>> stats = SummaryStatistics(df)
            >>> qc = SummaryStatisticsQC.from_summary_statistics(stats)
            >>> isinstance(qc, SummaryStatisticsQC)
            True
            >>> mean_beta = f.round("mean_beta", 2).alias("mean_beta")
            >>> mean_diff_pz = f.round("mean_diff_pz", 2).alias("mean_diff_pz")
            >>> se_diff_pz = f.round("se_diff_pz", 2).alias("se_diff_pz")
            >>> gc_lambda = f.round("gc_lambda", 2).alias("gc_lambda")
            >>> qc.df.select('studyId', mean_beta, mean_diff_pz, se_diff_pz, gc_lambda, 'n_variants', 'n_variants_sig').show()
            +-------+---------+------------+----------+---------+----------+--------------+
            |studyId|mean_beta|mean_diff_pz|se_diff_pz|gc_lambda|n_variants|n_variants_sig|
            +-------+---------+------------+----------+---------+----------+--------------+
            |     S1|     0.45|        6.78|      8.47|     0.55|         2|             1|
            |     S2|     0.26|       -2.15|      4.38|     0.04|         2|             0|
            +-------+---------+------------+----------+---------+----------+--------------+
            <BLANKLINE>
        """
        qc_tests = [
            QCTest(["mean_beta"], mean_beta_check),
            QCTest(["mean_diff_pz", "se_diff_pz"], p_z_test),
            QCTest(["gc_lambda"], gc_lambda_check),
            # The variant count is the only check that needs the threshold, so it
            # is bound here to give it the same one-argument shape as the others.
            QCTest(
                ["n_variants", "n_variants_sig"],
                lambda df: number_of_variants(df, pval_threshold),
            ),
        ]

        # Run every check first, then fold the results left to right.
        per_test_frames = [qc_test.call_test(gwas.df) for qc_test in qc_tests]
        combined, *remaining = per_test_frames
        for frame in remaining:
            # Outer join keeps a study even if one of the checks produced no row for it.
            combined = combined.join(frame, on="studyId", how="outer")

        return cls(_df=combined)

from_summary_statistics(gwas: SummaryStatistics, pval_threshold: float = 1e-08) -> SummaryStatisticsQC classmethod

Calculates the quality control metrics for the summary statistics and returns them as one row per study.

Parameters:

Name Type Description Default
gwas SummaryStatistics

The instance of the SummaryStatistics class.

required
pval_threshold float

The p-value threshold passed to the variant-counting check that produces n_variants and n_variants_sig. Default is 1e-8.

1e-08

Returns:

Name Type Description
SummaryStatisticsQC SummaryStatisticsQC

Dataset with quality control metrics for the summary statistics.

Examples:

>>> from pyspark.sql import functions as f
>>> s = 'studyId STRING, variantId STRING, chromosome STRING, position INT, beta DOUBLE, standardError DOUBLE, pValueMantissa FLOAT, pValueExponent INTEGER'
>>> v1 = [("S1", "1_10000_A_T", "1", 10000, 1.0, 0.2, 9.9, -20), ("S1", "X_10001_C_T", "X", 10001, -0.1, 0.2, 1.0, -1)]
>>> v2 = [("S2", "1_10001_C_T", "1", 10001, 0.028, 0.2, 1.0, -1), ("S2", "1_10002_G_C", "1", 10002, 0.5, 0.1, 1.0, -1)]
>>> df = spark.createDataFrame(v1 + v2, s)
>>> df.show()
+-------+-----------+----------+--------+-----+-------------+--------------+--------------+
|studyId|  variantId|chromosome|position| beta|standardError|pValueMantissa|pValueExponent|
+-------+-----------+----------+--------+-----+-------------+--------------+--------------+
|     S1|1_10000_A_T|         1|   10000|  1.0|          0.2|           9.9|           -20|
|     S1|X_10001_C_T|         X|   10001| -0.1|          0.2|           1.0|            -1|
|     S2|1_10001_C_T|         1|   10001|0.028|          0.2|           1.0|            -1|
|     S2|1_10002_G_C|         1|   10002|  0.5|          0.1|           1.0|            -1|
+-------+-----------+----------+--------+-----+-------------+--------------+--------------+

**This method outputs one row per study with the metrics: mean beta, mean diff pz, se diff pz, gc lambda, n variants and n variants sig.**

>>> stats = SummaryStatistics(df)
>>> qc = SummaryStatisticsQC.from_summary_statistics(stats)
>>> isinstance(qc, SummaryStatisticsQC)
True
>>> mean_beta = f.round("mean_beta", 2).alias("mean_beta")
>>> mean_diff_pz = f.round("mean_diff_pz", 2).alias("mean_diff_pz")
>>> se_diff_pz = f.round("se_diff_pz", 2).alias("se_diff_pz")
>>> gc_lambda = f.round("gc_lambda", 2).alias("gc_lambda")
>>> qc.df.select('studyId', mean_beta, mean_diff_pz, se_diff_pz, gc_lambda, 'n_variants', 'n_variants_sig').show()
+-------+---------+------------+----------+---------+----------+--------------+
|studyId|mean_beta|mean_diff_pz|se_diff_pz|gc_lambda|n_variants|n_variants_sig|
+-------+---------+------------+----------+---------+----------+--------------+
|     S1|     0.45|        6.78|      8.47|     0.55|         2|             1|
|     S2|     0.26|       -2.15|      4.38|     0.04|         2|             0|
+-------+---------+------------+----------+---------+----------+--------------+
Source code in src/gentropy/dataset/summary_statistics_qc.py
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
@classmethod
def from_summary_statistics(
    cls: type[SummaryStatisticsQC],
    gwas: SummaryStatistics,
    pval_threshold: float = 1e-8,
) -> SummaryStatisticsQC:
    """Build the QC dataset by running every QC check on the summary statistics.

    Each check is applied to `gwas.df` and the per-check results are combined
    with an outer join on `studyId`.

    Args:
        gwas (SummaryStatistics): The instance of the SummaryStatistics class.
        pval_threshold (float): The p-value threshold passed to the variant-counting check. Default is 1e-8.

    Returns:
        SummaryStatisticsQC: Dataset with quality control metrics for the summary statistics.

    Examples:
        >>> from pyspark.sql import functions as f
        >>> s = 'studyId STRING, variantId STRING, chromosome STRING, position INT, beta DOUBLE, standardError DOUBLE, pValueMantissa FLOAT, pValueExponent INTEGER'
        >>> v1 = [("S1", "1_10000_A_T", "1", 10000, 1.0, 0.2, 9.9, -20), ("S1", "X_10001_C_T", "X", 10001, -0.1, 0.2, 1.0, -1)]
        >>> v2 = [("S2", "1_10001_C_T", "1", 10001, 0.028, 0.2, 1.0, -1), ("S2", "1_10002_G_C", "1", 10002, 0.5, 0.1, 1.0, -1)]
        >>> df = spark.createDataFrame(v1 + v2, s)
        >>> df.show()
        +-------+-----------+----------+--------+-----+-------------+--------------+--------------+
        |studyId|  variantId|chromosome|position| beta|standardError|pValueMantissa|pValueExponent|
        +-------+-----------+----------+--------+-----+-------------+--------------+--------------+
        |     S1|1_10000_A_T|         1|   10000|  1.0|          0.2|           9.9|           -20|
        |     S1|X_10001_C_T|         X|   10001| -0.1|          0.2|           1.0|            -1|
        |     S2|1_10001_C_T|         1|   10001|0.028|          0.2|           1.0|            -1|
        |     S2|1_10002_G_C|         1|   10002|  0.5|          0.1|           1.0|            -1|
        +-------+-----------+----------+--------+-----+-------------+--------------+--------------+
        <BLANKLINE>

        **This method outputs one row per study with the metrics: mean beta, mean diff pz, se diff pz, gc lambda, n variants and n variants sig.**

        >>> stats = SummaryStatistics(df)
        >>> qc = SummaryStatisticsQC.from_summary_statistics(stats)
        >>> isinstance(qc, SummaryStatisticsQC)
        True
        >>> mean_beta = f.round("mean_beta", 2).alias("mean_beta")
        >>> mean_diff_pz = f.round("mean_diff_pz", 2).alias("mean_diff_pz")
        >>> se_diff_pz = f.round("se_diff_pz", 2).alias("se_diff_pz")
        >>> gc_lambda = f.round("gc_lambda", 2).alias("gc_lambda")
        >>> qc.df.select('studyId', mean_beta, mean_diff_pz, se_diff_pz, gc_lambda, 'n_variants', 'n_variants_sig').show()
        +-------+---------+------------+----------+---------+----------+--------------+
        |studyId|mean_beta|mean_diff_pz|se_diff_pz|gc_lambda|n_variants|n_variants_sig|
        +-------+---------+------------+----------+---------+----------+--------------+
        |     S1|     0.45|        6.78|      8.47|     0.55|         2|             1|
        |     S2|     0.26|       -2.15|      4.38|     0.04|         2|             0|
        +-------+---------+------------+----------+---------+----------+--------------+
        <BLANKLINE>
    """
    qc_tests = [
        QCTest(["mean_beta"], mean_beta_check),
        QCTest(["mean_diff_pz", "se_diff_pz"], p_z_test),
        QCTest(["gc_lambda"], gc_lambda_check),
        # The variant count is the only check that needs the threshold, so it
        # is bound here to give it the same one-argument shape as the others.
        QCTest(
            ["n_variants", "n_variants_sig"],
            lambda df: number_of_variants(df, pval_threshold),
        ),
    ]

    # Run every check first, then fold the results left to right.
    per_test_frames = [qc_test.call_test(gwas.df) for qc_test in qc_tests]
    combined, *remaining = per_test_frames
    for frame in remaining:
        # Outer join keeps a study even if one of the checks produced no row for it.
        combined = combined.join(frame, on="studyId", how="outer")

    return cls(_df=combined)

get_schema() -> StructType classmethod

Provide the schema for the SummaryStatisticsQC dataset.

Returns:

Name Type Description
StructType StructType

The schema for the SummaryStatisticsQC dataset.

Source code in src/gentropy/dataset/summary_statistics_qc.py
64
65
66
67
68
69
70
71
@classmethod
def get_schema(cls: type[SummaryStatisticsQC]) -> StructType:
    """Return the Spark schema of the SummaryStatisticsQC dataset.

    Returns:
        StructType: Schema parsed from the `summary_statistics_qc.json` definition.
    """
    schema_file = "summary_statistics_qc.json"
    return parse_spark_schema(schema_file)

Schema

root
 |-- studyId: string (nullable = true)
 |-- mean_beta: double (nullable = true)
 |-- mean_diff_pz: double (nullable = true)
 |-- se_diff_pz: double (nullable = true)
 |-- gc_lambda: double (nullable = true)
 |-- n_variants: long (nullable = true)
 |-- n_variants_sig: long (nullable = true)