Skip to content

GWAS Catalog sumstat preprocess

Bases: GWASCatalogSumstatsPreprocessConfig

Step to preprocess GWAS Catalog harmonised summary stats.

Source code in src/otg/gwas_catalog_sumstat_preprocess.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
@dataclass
class GWASCatalogSumstatsPreprocessStep(GWASCatalogSumstatsPreprocessConfig):
    """Convert one harmonised GWAS Catalog summary statistics file to parquet.

    The input path, output path and study identifier are inherited from
    ``GWASCatalogSumstatsPreprocessConfig``; this class adds the session used
    for Spark access and logging.
    """

    # `Session()` is evaluated once, when the class body is executed, so the
    # resulting object is the default shared by every instance that does not
    # receive its own session.
    session: Session = Session()

    def run(self: GWASCatalogSumstatsPreprocessStep) -> None:
        """Read the raw summary statistics, convert them and write parquet.

        The input is read as a tab-separated file with a header row, passed
        through ``SummaryStatistics.from_gwas_harmonized_summary_stats`` along
        with the study identifier, and the resulting dataframe is written to
        ``out_sumstats_path`` using the session's write mode.
        """
        session = self.session
        log = session.logger

        # Record the parameters this run was started with.
        log.info(self.raw_sumstats_path)
        log.info(self.out_sumstats_path)
        log.info(self.study_id)

        # Load the raw file: tab-separated, first row holds the column names.
        associations = session.spark.read.csv(
            self.raw_sumstats_path, header=True, sep="\t"
        )
        # NOTE(review): count() is only needed for this log line and presumably
        # costs a full pass over the input before the write - confirm that the
        # extra pass is acceptable.
        association_count = associations.count()
        log.info(f"Number of single point associations: {association_count}")

        # Convert to the SummaryStatistics dataset and persist it as parquet.
        summary_stats = SummaryStatistics.from_gwas_harmonized_summary_stats(
            associations, self.study_id
        )
        writer = summary_stats.df.write.mode(session.write_mode)
        writer.parquet(self.out_sumstats_path)
        log.info("Processing dataset successfully completed.")

run()

Run Step.

Source code in src/otg/gwas_catalog_sumstat_preprocess.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
def run(self: GWASCatalogSumstatsPreprocessStep) -> None:
    """Preprocess a single GWAS Catalog summary statistics file.

    Reads ``raw_sumstats_path`` as a tab-separated file with a header row,
    builds a ``SummaryStatistics`` dataset for ``study_id`` from it and writes
    the dataset's dataframe as parquet to ``out_sumstats_path``, using the
    write mode configured on the session.
    """
    session = self.session
    log = session.logger

    # Log the step parameters before doing any work.
    log.info(self.raw_sumstats_path)
    log.info(self.out_sumstats_path)
    log.info(self.study_id)

    # Input: tab-separated text whose first row carries the column names.
    associations = session.spark.read.csv(
        self.raw_sumstats_path, header=True, sep="\t"
    )
    # The row count is computed solely for the log message below.
    association_count = associations.count()
    log.info(f"Number of single point associations: {association_count}")

    # Transform and write out the result.
    summary_stats = SummaryStatistics.from_gwas_harmonized_summary_stats(
        associations, self.study_id
    )
    writer = summary_stats.df.write.mode(session.write_mode)
    writer.parquet(self.out_sumstats_path)
    log.info("Processing dataset successfully completed.")

GWAS Catalog Sumstats Preprocessing step requirements.

Attributes:

| Name | Type | Description |
| --- | --- | --- |
| raw_sumstats_path | str | Input raw GWAS Catalog summary statistics path. |
| out_sumstats_path | str | Output GWAS Catalog summary statistics path. |
| study_id | str | GWAS Catalog study identifier. |

Source code in src/otg/config.py
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
@dataclass
class GWASCatalogSumstatsPreprocessConfig:
    """Parameters required by the GWAS Catalog summary statistics preprocessing step.

    All three user-facing fields default to the ``MISSING`` sentinel
    (presumably meaning a value has to be supplied through configuration -
    confirm against the config framework in use).

    Attributes:
        raw_sumstats_path (str): Input raw GWAS Catalog summary statistics path.
        out_sumstats_path (str): Output GWAS Catalog summary statistics path.
        study_id (str): GWAS Catalog study identifier.
    """

    # Dotted path naming the step class that goes with this configuration.
    _target_: str = (
        "otg.gwas_catalog_sumstat_preprocess"
        ".GWASCatalogSumstatsPreprocessStep"
    )

    # Location of the raw summary statistics to read.
    raw_sumstats_path: str = MISSING
    # Location the processed summary statistics are written to.
    out_sumstats_path: str = MISSING
    # Identifier of the study the summary statistics belong to.
    study_id: str = MISSING