Skip to content

credible_set_qc

gentropy.credible_set_qc.CredibleSetQCStep

Credible set quality control step for fine mapped StudyLoci.

Source code in src/gentropy/credible_set_qc.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
class CredibleSetQCStep:
    """Credible set quality control step for fine mapped StudyLoci."""

    def __init__(
        self,
        session: Session,
        credible_sets_path: str,
        output_path: str,
        p_value_threshold: float,
        purity_min_r2: float,
        clump: bool,
        ld_index_path: str | None,
        study_index_path: str | None,
        ld_min_r2: float | None,
        n_partitions: int | None,
    ) -> None:
        """Run credible set quality control step.

        Check defaults used by steps in hydra configuration `gentropy.config.CredibleSetQCStepConfig`

        Due to the large number of partitions at the input credible_sets_path after finemapping, the
        best strategy is to reduce the number of partitions before running the quality control
        and saving the dataset.

        The `clump` mode will perform additional LD based clumping on the input credible sets.
        Enabling `clump` mode requires providing `ld_index_path`, `study_index_path` and `ld_min_r2`.

        Args:
            session (Session): Session object.
            credible_sets_path (str): Path to credible sets file.
            output_path (str): Path to write the output file.
            p_value_threshold (float): P-value threshold for credible set quality control.
            purity_min_r2 (float): Minimum R2 for purity estimation.
            clump (bool): Whether to clump the credible sets by LD.
            ld_index_path (str | None): Path to LD index file.
            study_index_path (str | None): Path to study index file.
            ld_min_r2 (float | None): Minimum R2 for LD estimation.
            n_partitions (int | None): Number of partitions to coalesce the dataset after reading. Defaults to 200

        Raises:
            ValueError: When `clump` is enabled but `ld_index_path`, `study_index_path` or `ld_min_r2` is not provided.
        """
        if clump:
            # Fail fast, before any dataset is loaded, when the documented
            # requirements of the clump mode are not met.
            missing = [
                name
                for name, provided in (
                    ("ld_index_path", bool(ld_index_path)),
                    ("study_index_path", bool(study_index_path)),
                    ("ld_min_r2", ld_min_r2 is not None),
                )
                if not provided
            ]
            if missing:
                raise ValueError(
                    f"Running in clump mode requires: {', '.join(missing)}."
                )

        # A missing (None) or zero partition count falls back to 200.
        n_partitions = n_partitions or 200

        # Optional inputs are only loaded when their path is provided.
        ld_index = (
            LDIndex.from_parquet(session, ld_index_path) if ld_index_path else None
        )
        study_index = (
            StudyIndex.from_parquet(session, study_index_path)
            if study_index_path
            else None
        )

        # Input files are looked up recursively under `credible_sets_path`.
        cred_sets = StudyLocus.from_parquet(
            session, credible_sets_path, recursiveFileLookup=True
        ).coalesce(n_partitions)

        cred_sets_clean = SUSIE_inf.credible_set_qc(
            cred_sets,
            p_value_threshold,
            purity_min_r2,
            clump,
            ld_index,
            study_index,
            ld_min_r2,
        )
        # ensure the saved object is still a valid StudyLocus
        StudyLocus(
            _df=cred_sets_clean.df, _schema=StudyLocus.get_schema()
        ).df.write.mode(session.write_mode).parquet(output_path)

__init__(session: Session, credible_sets_path: str, output_path: str, p_value_threshold: float, purity_min_r2: float, clump: bool, ld_index_path: str | None, study_index_path: str | None, ld_min_r2: float | None, n_partitions: int | None) -> None

Run credible set quality control step.

Check defaults used by steps in hydra configuration gentropy.config.CredibleSetQCStepConfig

Due to the large number of partitions at the input credible_sets_path after finemapping, the best strategy is to repartition and save the dataset after deduplication.

The clump mode will perform additional LD based clumping on the input credible sets. Enabling clump mode requires providing ld_index_path, study_index_path and ld_min_r2.

Parameters:

Name Type Description Default
session Session

Session object.

required
credible_sets_path str

Path to credible sets file.

required
output_path str

Path to write the output file.

required
p_value_threshold float

P-value threshold for credible set quality control.

required
purity_min_r2 float

Minimum R2 for purity estimation.

required
clump bool

Whether to clump the credible sets by LD.

required
ld_index_path str | None

Path to LD index file.

required
study_index_path str | None

Path to study index file.

required
ld_min_r2 float | None

Minimum R2 for LD estimation.

required
n_partitions int | None

Number of partitions to coalesce the dataset after reading. Defaults to 200

required
Source code in src/gentropy/credible_set_qc.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
def __init__(
    self,
    session: Session,
    credible_sets_path: str,
    output_path: str,
    p_value_threshold: float,
    purity_min_r2: float,
    clump: bool,
    ld_index_path: str | None,
    study_index_path: str | None,
    ld_min_r2: float | None,
    n_partitions: int | None,
) -> None:
    """Run credible set quality control step.

    Check defaults used by steps in hydra configuration `gentropy.config.CredibleSetQCStepConfig`

    Due to the large number of partitions at the input credible_sets_path after finemapping, the
    best strategy is to reduce the number of partitions before running the quality control
    and saving the dataset.

    The `clump` mode will perform additional LD based clumping on the input credible sets.
    Enabling `clump` mode requires providing `ld_index_path`, `study_index_path` and `ld_min_r2`.

    Args:
        session (Session): Session object.
        credible_sets_path (str): Path to credible sets file.
        output_path (str): Path to write the output file.
        p_value_threshold (float): P-value threshold for credible set quality control.
        purity_min_r2 (float): Minimum R2 for purity estimation.
        clump (bool): Whether to clump the credible sets by LD.
        ld_index_path (str | None): Path to LD index file.
        study_index_path (str | None): Path to study index file.
        ld_min_r2 (float | None): Minimum R2 for LD estimation.
        n_partitions (int | None): Number of partitions to coalesce the dataset after reading. Defaults to 200

    Raises:
        ValueError: When `clump` is enabled but `ld_index_path`, `study_index_path` or `ld_min_r2` is not provided.
    """
    if clump:
        # Fail fast, before any dataset is loaded, when the documented
        # requirements of the clump mode are not met.
        missing = [
            name
            for name, provided in (
                ("ld_index_path", bool(ld_index_path)),
                ("study_index_path", bool(study_index_path)),
                ("ld_min_r2", ld_min_r2 is not None),
            )
            if not provided
        ]
        if missing:
            raise ValueError(
                f"Running in clump mode requires: {', '.join(missing)}."
            )

    # A missing (None) or zero partition count falls back to 200.
    n_partitions = n_partitions or 200

    # Optional inputs are only loaded when their path is provided.
    ld_index = (
        LDIndex.from_parquet(session, ld_index_path) if ld_index_path else None
    )
    study_index = (
        StudyIndex.from_parquet(session, study_index_path)
        if study_index_path
        else None
    )

    # Input files are looked up recursively under `credible_sets_path`.
    cred_sets = StudyLocus.from_parquet(
        session, credible_sets_path, recursiveFileLookup=True
    ).coalesce(n_partitions)

    cred_sets_clean = SUSIE_inf.credible_set_qc(
        cred_sets,
        p_value_threshold,
        purity_min_r2,
        clump,
        ld_index,
        study_index,
        ld_min_r2,
    )
    # ensure the saved object is still a valid StudyLocus
    StudyLocus(
        _df=cred_sets_clean.df, _schema=StudyLocus.get_schema()
    ).df.write.mode(session.write_mode).parquet(output_path)

gentropy.config.CredibleSetQCStepConfig dataclass

Bases: StepConfig

Credible set quality control step configuration.

Source code in src/gentropy/config.py
608
609
610
611
612
613
614
615
616
617
618
619
620
621
@dataclass
class CredibleSetQCStepConfig(StepConfig):
    """Credible set quality control step configuration.

    Apart from `_target_`, the fields carry the same names as the parameters of
    `gentropy.credible_set_qc.CredibleSetQCStep.__init__` (except `session`),
    and the values set here are the defaults used for that step.
    """

    # Input and output locations have no usable default (MISSING).
    # NOTE(review): presumably the omegaconf MISSING marker, i.e. the value
    # must be supplied by the user - confirm against the module imports.
    credible_sets_path: str = MISSING
    output_path: str = MISSING
    # Thresholds for the quality control.
    p_value_threshold: float = 1e-5
    purity_min_r2: float = 0.01
    # LD based clumping is disabled by default; the LD index and study index
    # paths are therefore unset by default.
    clump: bool = False
    ld_index_path: str | None = None
    study_index_path: str | None = None
    ld_min_r2: float | None = 0.8
    # Number of partitions used when reading the input credible sets.
    n_partitions: int | None = 200
    # Dotted path of the step class this configuration belongs to.
    # NOTE(review): presumably consumed by hydra instantiation - confirm.
    _target_: str = "gentropy.credible_set_qc.CredibleSetQCStep"