Skip to content

locus_breaker_clumping

gentropy.locus_breaker_clumping.LocusBreakerClumpingStep

Step to perform locus-breaker clumping on a study.

Source code in src/gentropy/locus_breaker_clumping.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
class LocusBreakerClumpingStep:
    """Pipeline step that clumps summary statistics with the locus-breaker method."""

    def __init__(
        self,
        session: Session,
        summary_statistics_input_path: str,
        clumped_study_locus_output_path: str,
        lbc_baseline_pvalue: float,
        lbc_distance_cutoff: int,
        lbc_pvalue_threshold: float,
        lbc_flanking_distance: int,
        large_loci_size: int,
        wbc_clump_distance: int,
        wbc_pvalue_threshold: float,
        collect_locus: bool = False,
        remove_mhc: bool = True,
    ) -> None:
        """Clump summary statistics and write the resulting study loci as parquet.

        Locus-breaker clumping is run on the full set of summary statistics.
        StudyLocus larger than the large_loci_size, by distance, will be further
        clumped with window-based clumping. The combined result is written to
        `clumped_study_locus_output_path`, partitioned by `studyLocusId`.

        Args:
            session (Session): Session object; its `write_mode` is used for the output.
            summary_statistics_input_path (str): Path to the input summary statistics (parquet).
            clumped_study_locus_output_path (str): Path of the resulting, clumped study-locus dataset.
            lbc_baseline_pvalue (float): Baseline p-value for locus-breaker clumping.
            lbc_distance_cutoff (int): Distance cutoff for locus-breaker clumping.
            lbc_pvalue_threshold (float): P-value threshold for locus-breaker clumping.
            lbc_flanking_distance (int): Flanking distance for locus-breaker clumping.
            large_loci_size (int): Threshold distance to define large loci for window-based clumping.
            wbc_clump_distance (int): Clump distance for window-based clumping.
            wbc_pvalue_threshold (float): P-value threshold for window-based clumping.
            collect_locus (bool, optional): If true, the result is passed through
                annotate_locus_statistics_boundaries() with the input summary statistics. Defaults to False.
            remove_mhc (bool, optional): If true will use exclude_region() to remove the MHC region.
                Defaults to True.
        """
        summary_stats = SummaryStatistics.from_parquet(
            session, summary_statistics_input_path
        )

        # Both clumping flavours are derived from the same summary statistics.
        locus_breaker_loci = summary_stats.locus_breaker_clumping(
            lbc_baseline_pvalue,
            lbc_distance_cutoff,
            lbc_pvalue_threshold,
            lbc_flanking_distance,
        )
        window_based_loci = summary_stats.window_based_clumping(
            wbc_clump_distance, wbc_pvalue_threshold
        )

        study_loci = LocusBreakerClumping.process_locus_breaker_output(
            locus_breaker_loci, window_based_loci, large_loci_size
        )

        if remove_mhc:
            mhc_region = GenomicRegion.from_known_genomic_region(
                KnownGenomicRegions.MHC
            )
            study_loci = study_loci.exclude_region(mhc_region, exclude_overlap=True)

        if collect_locus:
            study_loci = study_loci.annotate_locus_statistics_boundaries(summary_stats)

        # Output is partitioned by study-locus identifier.
        writer = study_loci.df.write.partitionBy("studyLocusId").mode(
            session.write_mode
        )
        writer.parquet(clumped_study_locus_output_path)

__init__(session: Session, summary_statistics_input_path: str, clumped_study_locus_output_path: str, lbc_baseline_pvalue: float, lbc_distance_cutoff: int, lbc_pvalue_threshold: float, lbc_flanking_distance: int, large_loci_size: int, wbc_clump_distance: int, wbc_pvalue_threshold: float, collect_locus: bool = False, remove_mhc: bool = True) -> None

Run locus-breaker clumping step.

This step will perform locus-breaker clumping on the full set of summary statistics. StudyLocus larger than the large_loci_size, by distance, will be further clumped with window-based clumping.

Parameters:

Name Type Description Default
session Session

Session object.

required
summary_statistics_input_path str

Path to the input summary statistics.

required
clumped_study_locus_output_path str

Path of the resulting, clumped study-locus dataset.

required
lbc_baseline_pvalue float

Baseline p-value for locus breaker clumping.

required
lbc_distance_cutoff int

Distance cutoff for locus breaker clumping.

required
lbc_pvalue_threshold float

P-value threshold for locus breaker clumping.

required
lbc_flanking_distance int

Flanking distance for locus breaker clumping.

required
large_loci_size int

Threshold distance to define large loci for window-based clumping.

required
wbc_clump_distance int

Clump distance for window-based clumping.

required
wbc_pvalue_threshold float

P-value threshold for window-based clumping.

required
collect_locus bool

Whether to collect locus. Defaults to False.

False
remove_mhc bool

If true will use exclude_region() to remove the MHC region.

True
Source code in src/gentropy/locus_breaker_clumping.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
def __init__(
    self,
    session: Session,
    summary_statistics_input_path: str,
    clumped_study_locus_output_path: str,
    lbc_baseline_pvalue: float,
    lbc_distance_cutoff: int,
    lbc_pvalue_threshold: float,
    lbc_flanking_distance: int,
    large_loci_size: int,
    wbc_clump_distance: int,
    wbc_pvalue_threshold: float,
    collect_locus: bool = False,
    remove_mhc: bool = True,
) -> None:
    """Clump summary statistics and persist the resulting study loci.

    Locus-breaker clumping is performed on the full set of summary statistics.
    StudyLocus larger than the large_loci_size, by distance, will be further
    clumped with window-based clumping. The final dataset is written as parquet
    partitioned by `studyLocusId`.

    Args:
        session (Session): Session object; supplies the write mode for the output.
        summary_statistics_input_path (str): Path to the input summary statistics (parquet).
        clumped_study_locus_output_path (str): Path of the resulting, clumped study-locus dataset.
        lbc_baseline_pvalue (float): Baseline p-value for locus-breaker clumping.
        lbc_distance_cutoff (int): Distance cutoff for locus-breaker clumping.
        lbc_pvalue_threshold (float): P-value threshold for locus-breaker clumping.
        lbc_flanking_distance (int): Flanking distance for locus-breaker clumping.
        large_loci_size (int): Threshold distance to define large loci for window-based clumping.
        wbc_clump_distance (int): Clump distance for window-based clumping.
        wbc_pvalue_threshold (float): P-value threshold for window-based clumping.
        collect_locus (bool, optional): If true, the result is passed through
            annotate_locus_statistics_boundaries() with the input summary statistics. Defaults to False.
        remove_mhc (bool, optional): If true will use exclude_region() to remove the MHC region.
            Defaults to True.
    """
    stats = SummaryStatistics.from_parquet(session, summary_statistics_input_path)

    # Locus-breaker result first, then the window-based result, both from `stats`.
    lbc_loci = stats.locus_breaker_clumping(
        lbc_baseline_pvalue,
        lbc_distance_cutoff,
        lbc_pvalue_threshold,
        lbc_flanking_distance,
    )
    wbc_loci = stats.window_based_clumping(wbc_clump_distance, wbc_pvalue_threshold)

    loci = LocusBreakerClumping.process_locus_breaker_output(
        lbc_loci, wbc_loci, large_loci_size
    )

    if remove_mhc:
        mhc = GenomicRegion.from_known_genomic_region(KnownGenomicRegions.MHC)
        loci = loci.exclude_region(mhc, exclude_overlap=True)

    if collect_locus:
        loci = loci.annotate_locus_statistics_boundaries(stats)

    # Persist using the session's configured write mode.
    partitioned_writer = loci.df.write.partitionBy("studyLocusId")
    partitioned_writer.mode(session.write_mode).parquet(
        clumped_study_locus_output_path
    )