Bases: GWASCatalogSumstatsPreprocessConfig
Step to preprocess GWAS Catalog harmonised summary stats.
Source code in src/otg/gwas_catalog_sumstat_preprocess.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
from dataclasses import field  # needed for the default_factory on `session` below


@dataclass
class GWASCatalogSumstatsPreprocessStep(GWASCatalogSumstatsPreprocessConfig):
    """Step to preprocess GWAS Catalog harmonised summary stats.

    The step reads a tab-separated summary statistics file, converts it with
    ``SummaryStatistics.from_gwas_harmonized_summary_stats`` and writes the
    result as parquet.

    The attributes ``raw_sumstats_path``, ``out_sumstats_path`` and
    ``study_id`` used by ``run`` are not declared here; presumably they are
    inherited from ``GWASCatalogSumstatsPreprocessConfig`` (confirm against
    the config class).
    """

    # Build the session lazily, once per instance. A plain ``Session()``
    # default would be evaluated a single time when the class body executes
    # (i.e. at import) and then shared by every instance.
    session: Session = field(default_factory=Session)

    def run(self: GWASCatalogSumstatsPreprocessStep) -> None:
        """Run Step.

        Logs the configured paths and study identifier, reads the raw
        summary statistics, logs the number of rows, and writes the
        processed dataset to ``out_sumstats_path`` as parquet.
        """
        # Extract
        self.session.logger.info(self.raw_sumstats_path)
        self.session.logger.info(self.out_sumstats_path)
        self.session.logger.info(self.study_id)

        # Reading dataset: tab-separated file whose first row is the header.
        raw_dataset = self.session.spark.read.csv(
            self.raw_sumstats_path, header=True, sep="\t"
        )
        # NOTE: count() is a Spark action, so this evaluates the input once
        # purely for the log message.
        self.session.logger.info(
            f"Number of single point associations: {raw_dataset.count()}"
        )

        # Processing dataset: convert and persist using the session's write mode.
        SummaryStatistics.from_gwas_harmonized_summary_stats(
            raw_dataset, self.study_id
        ).df.write.mode(self.session.write_mode).parquet(self.out_sumstats_path)
        self.session.logger.info("Processing dataset successfully completed.")
|
run()
Run Step.
Source code in src/otg/gwas_catalog_sumstat_preprocess.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
def run(self: GWASCatalogSumstatsPreprocessStep) -> None:
    """Run Step.

    Logs the input path, output path and study identifier, loads the raw
    tab-separated summary statistics through the session's Spark reader,
    logs how many rows were read, then converts the data with
    ``SummaryStatistics.from_gwas_harmonized_summary_stats`` and writes it
    as parquet to ``out_sumstats_path``.
    """
    session = self.session

    # Report the configuration this run is working with.
    session.logger.info(self.raw_sumstats_path)
    session.logger.info(self.out_sumstats_path)
    session.logger.info(self.study_id)

    # Load the raw file: tab-delimited, first row holds the column names.
    reader = session.spark.read
    harmonised_input = reader.csv(self.raw_sumstats_path, header=True, sep="\t")
    association_count = harmonised_input.count()
    session.logger.info(
        f"Number of single point associations: {association_count}"
    )

    # Convert to the summary statistics dataset and persist it.
    summary_stats = SummaryStatistics.from_gwas_harmonized_summary_stats(
        harmonised_input, self.study_id
    )
    writer = summary_stats.df.write.mode(session.write_mode)
    writer.parquet(self.out_sumstats_path)
    session.logger.info("Processing dataset successfully completed.")
|