GWAS Catalog study index generation.
This step generates a study index from the GWAS Catalog studies and ancestry files. It can also add additional curation information and summary statistics QC information when available.
Warning:
This step does not generate a study index for GWAS Catalog top hits.
This step provides several optional arguments to add additional information to the study index:
- gwas_catalog_study_curation_file: csv file or URL containing the curation table. If provided it annotates the study index with the additional curation information performed by the Open Targets team.
- sumstats_qc_path: Path to the summary statistics QC table. If provided it annotates the study index with the summary statistics QC information in the
sumstatQCValues columns (e.g. n_variants, n_variants_sig etc.).
Source code in src/gentropy/gwas_catalog_study_index.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
class GWASCatalogStudyIndexGenerationStep:
    """GWAS Catalog study index generation.

    This step generates a study index from the GWAS Catalog studies and ancestry files. It can also add additional curation information and summary statistics QC information when available.

    !!! warning
        This step does not generate study index for gwas catalog top hits.

    This step provides several optional arguments to add additional information to the study index:

    - gwas_catalog_study_curation_file: CSV/TSV file or URL containing the curation table. If provided it annotates the study index with the additional curation information performed by the Open Targets team.
    - sumstats_qc_path: Path to the summary statistics QC table. If provided it annotates the study index with the summary statistics QC information in the `sumstatQCValues` columns (e.g. `n_variants`, `n_variants_sig` etc.).
    """

    def __init__(
        self,
        session: Session,
        catalog_study_files: list[str],
        catalog_ancestry_files: list[str],
        study_index_path: str,
        gwas_catalog_study_curation_file: str | None = None,
        sumstats_qc_path: str | None = None,
    ) -> None:
        """Run step.

        Args:
            session (Session): Session object.
            catalog_study_files (list[str]): List of raw GWAS catalog studies file.
            catalog_ancestry_files (list[str]): List of raw ancestry annotations files from GWAS Catalog.
            study_index_path (str): Output GWAS catalog studies path.
            gwas_catalog_study_curation_file (str | None): CSV/TSV file or URL containing the curation table. Optional.
            sumstats_qc_path (str | None): Path to the summary statistics QC table. Optional.

        Raises:
            ValueError: If the curation file is provided but is neither a CSV/TSV file nor a URL.
        """
        # Core study index generation: both inputs are read as tab-separated
        # files with a header row.
        study_index = StudyIndexGWASCatalogParser.from_source(
            session.spark.read.csv(list(catalog_study_files), sep="\t", header=True),
            session.spark.read.csv(list(catalog_ancestry_files), sep="\t", header=True),
        )

        # Annotate with curation if provided:
        if gwas_catalog_study_curation_file:
            curation_path = gwas_catalog_study_curation_file
            is_url = curation_path.startswith("http")
            # The previous check tested the `.tsv` suffix twice, so `.csv`
            # files were rejected despite the error message advertising them.
            # Every input that was accepted before keeps its routing (`.tsv`
            # -> from_csv, other http paths -> from_url); the only new case is
            # a non-URL `.csv` path.
            # NOTE(review): `.tsv` URLs still go through `from_csv` as before;
            # confirm whether they should use `from_url` instead.
            if curation_path.endswith(".tsv") or (
                not is_url and curation_path.endswith(".csv")
            ):
                gwas_catalog_study_curation = StudyIndexGWASCatalogOTCuration.from_csv(
                    session, curation_path
                )
            elif is_url:
                gwas_catalog_study_curation = StudyIndexGWASCatalogOTCuration.from_url(
                    session, curation_path
                )
            else:
                raise ValueError(
                    "Only CSV/TSV files or URLs are accepted as curation file."
                )
            study_index = study_index.annotate_from_study_curation(
                gwas_catalog_study_curation
            )

        # Annotate with sumstats QC if provided:
        if sumstats_qc_path:
            sumstats_qc = SummaryStatisticsQC.from_parquet(
                session=session,
                path=sumstats_qc_path,
                recursiveFileLookup=True,
            )
            study_index = study_index.annotate_sumstats_qc(sumstats_qc)

        # Write the study index once, whether or not QC annotation was applied.
        study_index.df.coalesce(session.output_partitions).write.mode(
            session.write_mode
        ).parquet(study_index_path)
|
__init__(session: Session, catalog_study_files: list[str], catalog_ancestry_files: list[str], study_index_path: str, gwas_catalog_study_curation_file: str | None = None, sumstats_qc_path: str | None = None) -> None
Run step.
Parameters:
| Name |
Type |
Description |
Default |
session
|
Session
|
|
required
|
catalog_study_files
|
list[str]
|
List of raw GWAS catalog studies file.
|
required
|
catalog_ancestry_files
|
list[str]
|
List of raw ancestry annotations files from GWAS Catalog.
|
required
|
study_index_path
|
str
|
Output GWAS catalog studies path.
|
required
|
gwas_catalog_study_curation_file
|
str | None
|
csv file or URL containing the curation table. Optional.
|
None
|
sumstats_qc_path
|
str | None
|
Path to the summary statistics QC table. Optional.
|
None
|
Raises:
| Type |
Description |
ValueError
|
If the curation file is provided but not a CSV file or URL.
|
Source code in src/gentropy/gwas_catalog_study_index.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
def __init__(
    self,
    session: Session,
    catalog_study_files: list[str],
    catalog_ancestry_files: list[str],
    study_index_path: str,
    gwas_catalog_study_curation_file: str | None = None,
    sumstats_qc_path: str | None = None,
) -> None:
    """Run step.

    Args:
        session (Session): Session object.
        catalog_study_files (list[str]): List of raw GWAS catalog studies file.
        catalog_ancestry_files (list[str]): List of raw ancestry annotations files from GWAS Catalog.
        study_index_path (str): Output GWAS catalog studies path.
        gwas_catalog_study_curation_file (str | None): CSV/TSV file or URL containing the curation table. Optional.
        sumstats_qc_path (str | None): Path to the summary statistics QC table. Optional.

    Raises:
        ValueError: If the curation file is provided but is neither a CSV/TSV file nor a URL.
    """
    # Core study index generation: both inputs are read as tab-separated
    # files with a header row.
    study_index = StudyIndexGWASCatalogParser.from_source(
        session.spark.read.csv(list(catalog_study_files), sep="\t", header=True),
        session.spark.read.csv(list(catalog_ancestry_files), sep="\t", header=True),
    )

    # Annotate with curation if provided:
    if gwas_catalog_study_curation_file:
        curation_path = gwas_catalog_study_curation_file
        is_url = curation_path.startswith("http")
        # The `.tsv` suffix used to be tested twice, which left `.csv` files
        # rejected even though the error message lists them as accepted.
        # Previously accepted inputs keep their routing (`.tsv` -> from_csv,
        # other http paths -> from_url); only non-URL `.csv` paths are new.
        # NOTE(review): `.tsv` URLs still go through `from_csv` as before;
        # confirm whether they should use `from_url` instead.
        if curation_path.endswith(".tsv") or (
            not is_url and curation_path.endswith(".csv")
        ):
            gwas_catalog_study_curation = StudyIndexGWASCatalogOTCuration.from_csv(
                session, curation_path
            )
        elif is_url:
            gwas_catalog_study_curation = StudyIndexGWASCatalogOTCuration.from_url(
                session, curation_path
            )
        else:
            raise ValueError(
                "Only CSV/TSV files or URLs are accepted as curation file."
            )
        study_index = study_index.annotate_from_study_curation(
            gwas_catalog_study_curation
        )

    # Annotate with sumstats QC if provided:
    if sumstats_qc_path:
        sumstats_qc = SummaryStatisticsQC.from_parquet(
            session=session,
            path=sumstats_qc_path,
            recursiveFileLookup=True,
        )
        study_index = study_index.annotate_sumstats_qc(sumstats_qc)

    # Single write path shared by the QC-annotated and plain study index.
    study_index.df.coalesce(session.output_partitions).write.mode(
        session.write_mode
    ).parquet(study_index_path)
|