Skip to content

GnomAD Linkage data ingestion

gentropy.gnomad_ingestion.LDIndexStep

LD index step.

This step is resource intensive

Suggested params: high memory machine, 5TB of boot disk, no SSDs.

Source code in src/gentropy/gnomad_ingestion.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
class LDIndexStep:
    """Step that builds the gnomAD LD index and writes it out as parquet.

    !!! warning "This step is resource intensive"

        Suggested params: high memory machine, 5TB of boot disk, no SSDs.

    """

    def __init__(
        self,
        session: Session,
        ld_index_out: str,
        min_r2: float = LDIndexConfig().min_r2,
        ld_matrix_template: str = LDIndexConfig().ld_matrix_template,
        ld_index_raw_template: str = LDIndexConfig().ld_index_raw_template,
        ld_populations: list[LD_Population | str] = LDIndexConfig().ld_populations,
        liftover_ht_path: str = LDIndexConfig().liftover_ht_path,
        grch37_to_grch38_chain_path: str = LDIndexConfig().grch37_to_grch38_chain_path,
        use_version_from_input: bool = LDIndexConfig().use_version_from_input,
    ) -> None:
        """Run step.

        Builds the LD index from the gnomAD LD matrix inputs and writes it as
        parquet partitioned by chromosome, then logs the output path.

        Args:
            session (Session): Session object; provides the write mode and the logger.
            ld_index_out (str): Output LD index path. (required)
            min_r2 (float): Minimum r2 to use when considering variants within a window.
            ld_matrix_template (str): Input path to the gnomAD ld file with placeholder for population
            ld_index_raw_template (str): Input path to the raw gnomAD LD indices file with placeholder for population string
            ld_populations (list[LD_Population | str]): Population names derived from the ld file paths
            liftover_ht_path (str): Path to the liftover ht file
            grch37_to_grch38_chain_path (str): Path to the chain file used to lift over the coordinates.
            use_version_from_input (bool): Append version derived from input ld_matrix_template to the output ld_index_out. Defaults to False.

        In case use_version_from_input is set to True,
        data source version inferred from ld_matrix_template is appended as the last path segment to the output path.
        Default values are provided in LDIndexConfig.
        """
        # Resolve the final destination first: either the path as given, or the
        # path amended with the data source version taken from the input template.
        output_path = (
            VersionEngine("gnomad").amend_version(ld_matrix_template, ld_index_out)
            if use_version_from_input
            else ld_index_out
        )

        ld_matrix = GnomADLDMatrix(
            ld_matrix_template=ld_matrix_template,
            ld_index_raw_template=ld_index_raw_template,
            ld_populations=ld_populations,
            liftover_ht_path=liftover_ht_path,
            grch37_to_grch38_chain_path=grch37_to_grch38_chain_path,
        )
        ld_index = ld_matrix.as_ld_index(min_r2)

        # One output directory per chromosome; the write mode comes from the session.
        writer = ld_index.df.write.partitionBy("chromosome")
        writer.mode(session.write_mode).parquet(output_path)

        session.logger.info(output_path)

__init__(session: Session, ld_index_out: str, min_r2: float = LDIndexConfig().min_r2, ld_matrix_template: str = LDIndexConfig().ld_matrix_template, ld_index_raw_template: str = LDIndexConfig().ld_index_raw_template, ld_populations: list[LD_Population | str] = LDIndexConfig().ld_populations, liftover_ht_path: str = LDIndexConfig().liftover_ht_path, grch37_to_grch38_chain_path: str = LDIndexConfig().grch37_to_grch38_chain_path, use_version_from_input: bool = LDIndexConfig().use_version_from_input) -> None

Run step.

Parameters:

Name Type Description Default
session Session

Session object.

required
ld_index_out str

Output LD index path. (required)

required
min_r2 float

Minimum r2 to use when considering variants within a window.

min_r2
ld_matrix_template str

Input path to the gnomAD ld file with placeholder for population

ld_matrix_template
ld_index_raw_template str

Input path to the raw gnomAD LD indices file with placeholder for population string

ld_index_raw_template
ld_populations list[LD_Population | str]

Population names derived from the ld file paths

ld_populations
liftover_ht_path str

Path to the liftover ht file

liftover_ht_path
grch37_to_grch38_chain_path str

Path to the chain file used to lift over the coordinates.

grch37_to_grch38_chain_path
use_version_from_input bool

Append version derived from input ld_matrix_template to the output ld_index_out. Defaults to False.

use_version_from_input

If use_version_from_input is set to True, the data source version inferred from ld_matrix_template is appended as the last path segment of the output path. Default values are provided in LDIndexConfig.

Source code in src/gentropy/gnomad_ingestion.py
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
def __init__(
    self,
    session: Session,
    ld_index_out: str,
    min_r2: float = LDIndexConfig().min_r2,
    ld_matrix_template: str = LDIndexConfig().ld_matrix_template,
    ld_index_raw_template: str = LDIndexConfig().ld_index_raw_template,
    ld_populations: list[LD_Population | str] = LDIndexConfig().ld_populations,
    liftover_ht_path: str = LDIndexConfig().liftover_ht_path,
    grch37_to_grch38_chain_path: str = LDIndexConfig().grch37_to_grch38_chain_path,
    use_version_from_input: bool = LDIndexConfig().use_version_from_input,
) -> None:
    """Run step.

    Builds the LD index from the gnomAD LD matrix inputs and writes it as
    parquet partitioned by chromosome, then logs the output path.

    Args:
        session (Session): Session object; provides the write mode and the logger.
        ld_index_out (str): Output LD index path. (required)
        min_r2 (float): Minimum r2 to use when considering variants within a window.
        ld_matrix_template (str): Input path to the gnomAD ld file with placeholder for population
        ld_index_raw_template (str): Input path to the raw gnomAD LD indices file with placeholder for population string
        ld_populations (list[LD_Population | str]): Population names derived from the ld file paths
        liftover_ht_path (str): Path to the liftover ht file
        grch37_to_grch38_chain_path (str): Path to the chain file used to lift over the coordinates.
        use_version_from_input (bool): Append version derived from input ld_matrix_template to the output ld_index_out. Defaults to False.

    In case use_version_from_input is set to True,
    data source version inferred from ld_matrix_template is appended as the last path segment to the output path.
    Default values are provided in LDIndexConfig.
    """
    # Resolve the final destination first: either the path as given, or the
    # path amended with the data source version taken from the input template.
    output_path = (
        VersionEngine("gnomad").amend_version(ld_matrix_template, ld_index_out)
        if use_version_from_input
        else ld_index_out
    )

    ld_matrix = GnomADLDMatrix(
        ld_matrix_template=ld_matrix_template,
        ld_index_raw_template=ld_index_raw_template,
        ld_populations=ld_populations,
        liftover_ht_path=liftover_ht_path,
        grch37_to_grch38_chain_path=grch37_to_grch38_chain_path,
    )
    ld_index = ld_matrix.as_ld_index(min_r2)

    # One output directory per chromosome; the write mode comes from the session.
    writer = ld_index.df.write.partitionBy("chromosome")
    writer.mode(session.write_mode).parquet(output_path)

    session.logger.info(output_path)