Skip to content

LD index

Bases: LDIndexStepConfig

LD index step.

This step is resource intensive

Suggested params: high memory machine, 5TB of boot disk, no SSDs.

Source code in src/otg/ld_index.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
@dataclass
class LDIndexStep(LDIndexStepConfig):
    """Build the LD index from gnomAD and dump it as partitioned parquet.

    !!! warning "This step is resource intensive"
        Suggested params: high memory machine, 5TB of boot disk, no SSDs.

    """

    session: Session = Session()

    def run(self: LDIndexStep) -> None:
        """Initialise Hail, build the LD index and write it to `ld_index_out`."""
        # Hail is started on the session's Spark context; its log goes to /dev/null.
        spark_context = self.session.spark.sparkContext
        hl.init(sc=spark_context, log="/dev/null")

        index = LDIndex.from_gnomad(
            self.ld_populations,
            self.ld_matrix_template,
            self.ld_index_raw_template,
            self.grch37_to_grch38_chain_path,
            self.min_r2,
        )

        self.session.logger.info(f"Writing LD index to: {self.ld_index_out}")
        # Output is partitioned by the `chromosome` column; the write mode
        # comes from the session.
        partitioned_writer = index.df.write.partitionBy("chromosome")
        partitioned_writer.mode(self.session.write_mode).parquet(
            f"{self.ld_index_out}"
        )

run()

Run LD index dump step.

Source code in src/otg/ld_index.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
def run(self: LDIndexStep) -> None:
    """Initialise Hail, build the LD index and write it to `ld_index_out`."""
    # Hail is started on the session's Spark context; its log goes to /dev/null.
    spark_context = self.session.spark.sparkContext
    hl.init(sc=spark_context, log="/dev/null")

    index = LDIndex.from_gnomad(
        self.ld_populations,
        self.ld_matrix_template,
        self.ld_index_raw_template,
        self.grch37_to_grch38_chain_path,
        self.min_r2,
    )

    self.session.logger.info(f"Writing LD index to: {self.ld_index_out}")
    # Output is partitioned by the `chromosome` column; the write mode
    # comes from the session.
    partitioned_writer = index.df.write.partitionBy("chromosome")
    partitioned_writer.mode(self.session.write_mode).parquet(
        f"{self.ld_index_out}"
    )

LD index step requirements.

Attributes:

Name Type Description
ld_matrix_template str

Template path for LD matrix from gnomAD.

ld_index_raw_template str

Template path for the variant indices correspondence in the LD Matrix from gnomAD.

min_r2 float

Minimum r2 threshold applied when considering variants within a window.

grch37_to_grch38_chain_path str

Path to GRCh37 to GRCh38 chain file.

ld_populations List[str]

List of population-specific LD matrices to process.

ld_index_out str

Output LD index path.

Source code in src/otg/config.py
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
@dataclass
class LDIndexStepConfig:
    """LD index step requirements.

    Configuration values read by the LD index step. Every field except
    `ld_index_out` has a default; `ld_index_out` defaults to `MISSING`.

    Attributes:
        ld_matrix_template (str): Template path for LD matrix from gnomAD.
        ld_index_raw_template (str): Template path for the variant indices correspondence in the LD Matrix from gnomAD.
        min_r2 (float): Minimum r2 threshold applied when considering variants within a window.
        grch37_to_grch38_chain_path (str): Path to GRCh37 to GRCh38 chain file.
        ld_populations (List[str]): List of population-specific LD matrices to process.
        ld_index_out (str): Output LD index path.
    """

    # Dotted path of the step class this config belongs to.
    # NOTE(review): presumably consumed by Hydra's `instantiate` — confirm.
    _target_: str = "otg.ld_index.LDIndexStep"
    # Both templates contain a `{POP}` placeholder.
    # NOTE(review): presumably filled with each entry of `ld_populations` — confirm.
    ld_matrix_template: str = "gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.adj.ld.bm"
    ld_index_raw_template: str = "gs://gcp-public-data--gnomad/release/2.1.1/ld/gnomad.genomes.r2.1.1.{POP}.common.ld.variant_indices.ht"
    min_r2: float = 0.5
    grch37_to_grch38_chain_path: str = (
        "gs://hail-common/references/grch37_to_grch38.over.chain.gz"
    )
    # A default_factory gives each instance its own list.
    ld_populations: List[str] = field(
        default_factory=lambda: [
            "afr",  # African-American
            "amr",  # American Admixed/Latino
            "asj",  # Ashkenazi Jewish
            "eas",  # East Asian
            "fin",  # Finnish
            "nfe",  # Non-Finnish European
            "nwe",  # Northwestern European
            "seu",  # Southeastern European
        ]
    )
    # No usable default.
    # NOTE(review): `MISSING` is presumably omegaconf's mandatory-value marker — confirm.
    ld_index_out: str = MISSING