Study locus validation step.
This step reads a study locus dataset, flags the records that fail quality checks
or study validation, and writes the valid and invalid records to separate outputs.
Source code in src/gentropy/study_locus_validation.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
class StudyLocusValidationStep:
    """Study locus validation step.

    This step reads a study locus dataset, runs a chain of quality checks and
    validations on it (using the study index as reference), and writes the
    valid and the invalid records to two separate parquet outputs.
    """

    def __init__(
        self,
        session: Session,
        study_index_path: str,
        study_locus_path: list[str],
        valid_study_locus_path: str,
        invalid_study_locus_path: str,
        invalid_qc_reasons: list[str] | None = None,
    ) -> None:
        """Initialize step.

        Args:
            session (Session): Session object.
            study_index_path (str): Path to study index file.
            study_locus_path (list[str]): Paths to the study locus datasets.
            valid_study_locus_path (str): Path to write the valid records.
            invalid_study_locus_path (str): Path to write the invalid records.
            invalid_qc_reasons (list[str] | None): List of invalid quality check reason names from `StudyLocusQualityCheck` (e.g. ['SUBSIGNIFICANT_FLAG']). Defaults to None, which is treated as an empty list.
        """
        # Resolve the default here instead of in the signature, so that no
        # list object is shared between calls.
        qc_reasons = [] if invalid_qc_reasons is None else invalid_qc_reasons

        # Reading datasets:
        study_index = StudyIndex.from_parquet(session, study_index_path)

        # Running validation then writing output:
        study_locus_with_qc = (
            StudyLocus.from_parquet(session, list(study_locus_path))
            # Add flag for MHC region
            .qc_MHC_region()
            # Flagging credible sets with unsupported chromosomes
            .validate_chromosome_label()
            # Flagging studies not in study index
            .validate_study(study_index)
            # Add study type to study locus
            .annotate_study_type(study_index)
            # Flagging top hits from studies with PICS summary statistics
            .qc_redundant_top_hits_from_PICS()
            # Flagging credible sets in regions explained by SuSiE
            .qc_explained_by_SuSiE()
            # Annotates credible intervals and filter to only keep 95% credible sets
            .filter_credible_set(credible_interval=CredibleInterval.IS95)
            # Flagging credible sets whose summed PIPs fall outside the
            # thresholds below (lower 0.95, upper 1.0001)
            .qc_abnormal_pips(
                sum_pips_lower_threshold=0.95, sum_pips_upper_threshold=1.0001
            )
            # Annotate credible set confidence:
            .assign_confidence()
        ).persist()  # the same dataset feeds both outputs below

        # Valid study locus, range-partitioned and sorted by chromosome and
        # position to simplify the finding of overlaps
        study_locus_with_qc.valid_rows(qc_reasons).df.repartitionByRange(
            session.output_partitions, "chromosome", "position"
        ).sortWithinPartitions("chromosome", "position").write.mode(
            session.write_mode
        ).parquet(valid_study_locus_path)

        # Invalid study locus
        study_locus_with_qc.valid_rows(qc_reasons, invalid=True).df.coalesce(
            session.output_partitions
        ).write.mode(session.write_mode).parquet(invalid_study_locus_path)
|
__init__(session: Session, study_index_path: str, study_locus_path: list[str], valid_study_locus_path: str, invalid_study_locus_path: str, invalid_qc_reasons: list[str] = []) -> None
Initialize step.
Parameters:
Name |
Type |
Description |
Default |
session
|
Session
|
|
required
|
study_index_path
|
str
|
Path to study index file.
|
required
|
study_locus_path
|
list[str]
|
Paths to the study locus datasets.
|
required
|
valid_study_locus_path
|
str
|
Path to write the valid records.
|
required
|
invalid_study_locus_path
|
str
|
Path to write the invalid records.
|
required
|
invalid_qc_reasons
|
list[str]
|
List of invalid quality check reason names from StudyLocusQualityCheck (e.g. ['SUBSIGNIFICANT_FLAG']).
|
[]
|
Source code in src/gentropy/study_locus_validation.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
def __init__(
    self,
    session: Session,
    study_index_path: str,
    study_locus_path: list[str],
    valid_study_locus_path: str,
    invalid_study_locus_path: str,
    invalid_qc_reasons: list[str] | None = None,
) -> None:
    """Initialize step.

    Reads the study index and the study locus datasets, applies the quality
    checks and validations, then writes the valid and the invalid study locus
    records to their respective paths.

    Args:
        session (Session): Session object.
        study_index_path (str): Path to study index file.
        study_locus_path (list[str]): Paths to the study locus datasets.
        valid_study_locus_path (str): Path to write the valid records.
        invalid_study_locus_path (str): Path to write the invalid records.
        invalid_qc_reasons (list[str] | None): List of invalid quality check reason names from `StudyLocusQualityCheck` (e.g. ['SUBSIGNIFICANT_FLAG']). Defaults to None, which is treated as an empty list.
    """
    # A None sentinel replaces the former `[]` default: a list literal in the
    # signature is created once and shared by every call.
    qc_reasons = [] if invalid_qc_reasons is None else invalid_qc_reasons

    # Reading datasets:
    study_index = StudyIndex.from_parquet(session, study_index_path)

    # Running validation then writing output:
    study_locus_with_qc = (
        StudyLocus.from_parquet(session, list(study_locus_path))
        # Add flag for MHC region
        .qc_MHC_region()
        # Flagging credible sets with unsupported chromosomes
        .validate_chromosome_label()
        # Flagging studies not in study index
        .validate_study(study_index)
        # Add study type to study locus
        .annotate_study_type(study_index)
        # Flagging top hits from studies with PICS summary statistics
        .qc_redundant_top_hits_from_PICS()
        # Flagging credible sets in regions explained by SuSiE
        .qc_explained_by_SuSiE()
        # Annotates credible intervals and filter to only keep 95% credible sets
        .filter_credible_set(credible_interval=CredibleInterval.IS95)
        # Flagging credible sets whose summed PIPs fall outside the
        # thresholds below (lower 0.95, upper 1.0001)
        .qc_abnormal_pips(
            sum_pips_lower_threshold=0.95, sum_pips_upper_threshold=1.0001
        )
        # Annotate credible set confidence:
        .assign_confidence()
    ).persist()  # the same dataset feeds both outputs below

    # Valid study locus, range-partitioned and sorted by chromosome and
    # position to simplify the finding of overlaps
    study_locus_with_qc.valid_rows(qc_reasons).df.repartitionByRange(
        session.output_partitions, "chromosome", "position"
    ).sortWithinPartitions("chromosome", "position").write.mode(
        session.write_mode
    ).parquet(valid_study_locus_path)

    # Invalid study locus
    study_locus_with_qc.valid_rows(qc_reasons, invalid=True).df.coalesce(
        session.output_partitions
    ).write.mode(session.write_mode).parquet(invalid_study_locus_path)
|