eQTL Catalogue ingestion step.
From SuSIE fine mapping results (available at their FTP), we extract credible sets and study index datasets from gene expression QTL studies.
Source code in src/gentropy/eqtl_catalogue.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
class EqtlCatalogueStep:
    """Ingestion step for the eQTL Catalogue.

    Credible sets and study index datasets for gene expression QTL studies are built from the SuSIE fine mapping results published on [their FTP](https://ftp.ebi.ac.uk/pub/databases/spot/eQTL/susie/).
    """

    def __init__(
        self,
        session: Session,
        mqtl_quantification_methods_blacklist: list[str],
        eqtl_catalogue_paths_imported: str,
        eqtl_catalogue_study_index_out: str,
        eqtl_catalogue_credible_sets_out: str,
        eqtl_lead_pvalue_threshold: float = EqtlCatalogueConfig().eqtl_lead_pvalue_threshold,
    ) -> None:
        """Execute the eQTL Catalogue ingestion.

        Args:
            session (Session): Session object.
            mqtl_quantification_methods_blacklist (list[str]): Molecular trait quantification methods to leave out of the ingestion. Available options in https://github.com/eQTL-Catalogue/eQTL-Catalogue-resources/blob/master/data_tables/dataset_metadata.tsv
            eqtl_catalogue_paths_imported (str): Path to the input eQTL Catalogue fine mapping results.
            eqtl_catalogue_study_index_out (str): Path where the eQTL Catalogue study index is written.
            eqtl_catalogue_credible_sets_out (str): Path where the eQTL Catalogue credible sets are written.
            eqtl_lead_pvalue_threshold (float, optional): Lead p-value threshold. Defaults to EqtlCatalogueConfig().eqtl_lead_pvalue_threshold.
        """
        # --- Extract ---
        # The blacklist is copied into a plain list before being handed over.
        blacklist = list(mqtl_quantification_methods_blacklist)
        metadata = EqtlCatalogueStudyIndex.read_studies_from_source(session, blacklist)

        # Only the raw files of the studies selected for ingestion are loaded,
        # which makes the process much lighter.
        selected_studies = EqtlCatalogueStudyIndex.get_studies_of_interest(metadata)

        credible_set_files = [
            f"{eqtl_catalogue_paths_imported}/{study}.credible_sets.tsv"
            for study in selected_studies
        ]
        raw_credible_sets = EqtlCatalogueFinemapping.read_credible_set_from_source(
            session, credible_set_path=credible_set_files
        )

        lbf_files = [
            f"{eqtl_catalogue_paths_imported}/{study}.lbf_variable.txt"
            for study in selected_studies
        ]
        raw_lbf = EqtlCatalogueFinemapping.read_lbf_from_source(
            session, lbf_path=lbf_files
        )

        # --- Transform ---
        susie_results = EqtlCatalogueFinemapping.parse_susie_results(
            raw_credible_sets, raw_lbf, metadata
        )

        # --- Load: study index ---
        study_index = EqtlCatalogueStudyIndex.from_susie_results(susie_results)
        study_index.df.write.mode(session.write_mode).parquet(
            eqtl_catalogue_study_index_out
        )

        # --- Load: credible sets ---
        # Sub-significant loci are flagged against the lead p-value cutoff before writing.
        credible_sets = EqtlCatalogueFinemapping.from_susie_results(
            susie_results
        ).validate_lead_pvalue(pvalue_cutoff=eqtl_lead_pvalue_threshold)
        credible_sets.df.write.mode(session.write_mode).parquet(
            eqtl_catalogue_credible_sets_out
        )
|
__init__(session: Session, mqtl_quantification_methods_blacklist: list[str], eqtl_catalogue_paths_imported: str, eqtl_catalogue_study_index_out: str, eqtl_catalogue_credible_sets_out: str, eqtl_lead_pvalue_threshold: float = EqtlCatalogueConfig().eqtl_lead_pvalue_threshold) -> None
Run eQTL Catalogue ingestion step.
Parameters:
Name |
Type |
Description |
Default |
session
|
Session
|
Session object.
|
required
|
mqtl_quantification_methods_blacklist
|
list[str]
|
Molecular trait quantification methods that we don't want to ingest. Available options in https://github.com/eQTL-Catalogue/eQTL-Catalogue-resources/blob/master/data_tables/dataset_metadata.tsv
|
required
|
eqtl_catalogue_paths_imported
|
str
|
Input eQTL Catalogue fine mapping results path.
|
required
|
eqtl_catalogue_study_index_out
|
str
|
Output eQTL Catalogue study index path.
|
required
|
eqtl_catalogue_credible_sets_out
|
str
|
Output eQTL Catalogue credible sets path.
|
required
|
eqtl_lead_pvalue_threshold
|
float
|
Lead p-value threshold. Defaults to EqtlCatalogueConfig().eqtl_lead_pvalue_threshold.
|
EqtlCatalogueConfig().eqtl_lead_pvalue_threshold
|
Source code in src/gentropy/eqtl_catalogue.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
def __init__(
    self,
    session: Session,
    mqtl_quantification_methods_blacklist: list[str],
    eqtl_catalogue_paths_imported: str,
    eqtl_catalogue_study_index_out: str,
    eqtl_catalogue_credible_sets_out: str,
    eqtl_lead_pvalue_threshold: float = EqtlCatalogueConfig().eqtl_lead_pvalue_threshold,
) -> None:
    """Execute the eQTL Catalogue ingestion.

    Args:
        session (Session): Session object.
        mqtl_quantification_methods_blacklist (list[str]): Molecular trait quantification methods to leave out of the ingestion. Available options in https://github.com/eQTL-Catalogue/eQTL-Catalogue-resources/blob/master/data_tables/dataset_metadata.tsv
        eqtl_catalogue_paths_imported (str): Path to the input eQTL Catalogue fine mapping results.
        eqtl_catalogue_study_index_out (str): Path where the eQTL Catalogue study index is written.
        eqtl_catalogue_credible_sets_out (str): Path where the eQTL Catalogue credible sets are written.
        eqtl_lead_pvalue_threshold (float, optional): Lead p-value threshold. Defaults to EqtlCatalogueConfig().eqtl_lead_pvalue_threshold.
    """
    # --- Extract ---
    # The blacklist is copied into a plain list before being handed over.
    blacklist = list(mqtl_quantification_methods_blacklist)
    metadata = EqtlCatalogueStudyIndex.read_studies_from_source(session, blacklist)

    # Only the raw files of the studies selected for ingestion are loaded,
    # which makes the process much lighter.
    selected_studies = EqtlCatalogueStudyIndex.get_studies_of_interest(metadata)

    credible_set_files = [
        f"{eqtl_catalogue_paths_imported}/{study}.credible_sets.tsv"
        for study in selected_studies
    ]
    raw_credible_sets = EqtlCatalogueFinemapping.read_credible_set_from_source(
        session, credible_set_path=credible_set_files
    )

    lbf_files = [
        f"{eqtl_catalogue_paths_imported}/{study}.lbf_variable.txt"
        for study in selected_studies
    ]
    raw_lbf = EqtlCatalogueFinemapping.read_lbf_from_source(
        session, lbf_path=lbf_files
    )

    # --- Transform ---
    susie_results = EqtlCatalogueFinemapping.parse_susie_results(
        raw_credible_sets, raw_lbf, metadata
    )

    # --- Load: study index ---
    study_index = EqtlCatalogueStudyIndex.from_susie_results(susie_results)
    study_index.df.write.mode(session.write_mode).parquet(
        eqtl_catalogue_study_index_out
    )

    # --- Load: credible sets ---
    # Sub-significant loci are flagged against the lead p-value cutoff before writing.
    credible_sets = EqtlCatalogueFinemapping.from_susie_results(
        susie_results
    ).validate_lead_pvalue(pvalue_cutoff=eqtl_lead_pvalue_threshold)
    credible_sets.df.write.mode(session.write_mode).parquet(
        eqtl_catalogue_credible_sets_out
    )
|