Skip to content

ukb_ppp_eur_sumstat_preprocess

gentropy.ukb_ppp_eur_sumstat_preprocess.UkbPppEurStep

UKB PPP (EUR) data ingestion and harmonisation.

Source code in src/gentropy/ukb_ppp_eur_sumstat_preprocess.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
class UkbPppEurStep:
    """Ingestion and harmonisation step for UKB PPP (EUR) data."""

    def __init__(
        self,
        session: Session,
        raw_study_index_path_from_tsv: str,
        raw_summary_stats_path: str,
        variant_annotation_path: str,
        tmp_variant_annotation_path: str,
        study_index_output_path: str,
        summary_stats_output_path: str,
    ) -> None:
        """Ingest and harmonise UKB PPP (EUR) data in three consecutive stages.

        The stages run in this order: the variant annotation is prepared via
        `prepare_va`, the study index is built and written as parquet (any
        existing output is overwritten), and the summary stats are handed to
        `process_summary_stats_per_chromosome`.

        Args:
            session (Session): Session object; provides the logger and the Spark session.
            raw_study_index_path_from_tsv (str): Path of the raw study index input.
            raw_summary_stats_path (str): Path of the raw summary stats input.
            variant_annotation_path (str): Path of the input variant annotation dataset.
            tmp_variant_annotation_path (str): Temporary output path for the variant annotation dataset.
            study_index_output_path (str): Path the study index is written to.
            summary_stats_output_path (str): Path for the summary stats output.
        """
        logger = session.logger

        # Stage 1: variant annotation, passed on to the summary stats stage below.
        logger.info("Pre-compute the direct and flipped variant annotation dataset.")
        prepare_va(session, variant_annotation_path, tmp_variant_annotation_path)

        # Stage 2: build the study index and persist it, replacing previous output.
        logger.info("Process study index.")
        study_index = UkbPppEurStudyIndex.from_source(
            spark=session.spark,
            raw_study_index_path_from_tsv=raw_study_index_path_from_tsv,
            raw_summary_stats_path=raw_summary_stats_path,
        )
        study_index_writer = study_index.df.write.mode("overwrite")
        study_index_writer.parquet(study_index_output_path)

        # Stage 3: summary stats; receives the outputs of both previous stages.
        logger.info("Process and harmonise summary stats.")
        process_summary_stats_per_chromosome(
            session,
            UkbPppEurSummaryStats,
            raw_summary_stats_path,
            tmp_variant_annotation_path,
            summary_stats_output_path,
            study_index_output_path,
        )

__init__(session: Session, raw_study_index_path_from_tsv: str, raw_summary_stats_path: str, variant_annotation_path: str, tmp_variant_annotation_path: str, study_index_output_path: str, summary_stats_output_path: str) -> None

Run UKB PPP (EUR) data ingestion and harmonisation step.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `session` | `Session` | Session object. | required |
| `raw_study_index_path_from_tsv` | `str` | Input raw study index path. | required |
| `raw_summary_stats_path` | `str` | Input raw summary stats path. | required |
| `variant_annotation_path` | `str` | Input variant annotation dataset path. | required |
| `tmp_variant_annotation_path` | `str` | Temporary output path for variant annotation dataset. | required |
| `study_index_output_path` | `str` | Study index output path. | required |
| `summary_stats_output_path` | `str` | Summary stats output path. | required |
Source code in src/gentropy/ukb_ppp_eur_sumstat_preprocess.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
def __init__(
    self,
    session: Session,
    raw_study_index_path_from_tsv: str,
    raw_summary_stats_path: str,
    variant_annotation_path: str,
    tmp_variant_annotation_path: str,
    study_index_output_path: str,
    summary_stats_output_path: str,
) -> None:
    """Ingest and harmonise UKB PPP (EUR) data in three consecutive stages.

    The stages run in this order: the variant annotation is prepared via
    `prepare_va`, the study index is built and written as parquet (any
    existing output is overwritten), and the summary stats are handed to
    `process_summary_stats_per_chromosome`.

    Args:
        session (Session): Session object; provides the logger and the Spark session.
        raw_study_index_path_from_tsv (str): Path of the raw study index input.
        raw_summary_stats_path (str): Path of the raw summary stats input.
        variant_annotation_path (str): Path of the input variant annotation dataset.
        tmp_variant_annotation_path (str): Temporary output path for the variant annotation dataset.
        study_index_output_path (str): Path the study index is written to.
        summary_stats_output_path (str): Path for the summary stats output.
    """
    logger = session.logger

    # Stage 1: variant annotation, passed on to the summary stats stage below.
    logger.info("Pre-compute the direct and flipped variant annotation dataset.")
    prepare_va(session, variant_annotation_path, tmp_variant_annotation_path)

    # Stage 2: build the study index and persist it, replacing previous output.
    logger.info("Process study index.")
    study_index = UkbPppEurStudyIndex.from_source(
        spark=session.spark,
        raw_study_index_path_from_tsv=raw_study_index_path_from_tsv,
        raw_summary_stats_path=raw_summary_stats_path,
    )
    study_index_writer = study_index.df.write.mode("overwrite")
    study_index_writer.parquet(study_index_output_path)

    # Stage 3: summary stats; receives the outputs of both previous stages.
    logger.info("Process and harmonise summary stats.")
    process_summary_stats_per_chromosome(
        session,
        UkbPppEurSummaryStats,
        raw_summary_stats_path,
        tmp_variant_annotation_path,
        summary_stats_output_path,
        study_index_output_path,
    )