Skip to content

GnomAD variant data ingestion

gentropy.gnomad_ingestion.GnomadVariantIndexStep

A step to generate a variant index dataset from gnomAD data.

The variant annotation step produces a dataset of the type VariantIndex derived from gnomAD's gnomad.genomes.vX.X.X.sites.ht Hail table. This dataset is used to validate variants and as a source of annotation.

Source code in src/gentropy/gnomad_ingestion.py
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
class GnomadVariantIndexStep:
    """Build a variant index dataset from gnomAD data.

    The step takes gnomAD's `gnomad.genomes.vX.X.X.sites.ht` Hail table as input and writes a dataset of the type `VariantIndex`.
    The resulting dataset is used to validate variants and as a source of annotation.
    """

    def __init__(
        self,
        session: Session,
        variant_annotation_path: str = GnomadVariantConfig().variant_annotation_path,
        gnomad_genomes_path: str = GnomadVariantConfig().gnomad_genomes_path,
        gnomad_variant_populations: list[
            VariantPopulation | str
        ] = GnomadVariantConfig().gnomad_variant_populations,
        use_version_from_input: bool = GnomadVariantConfig().use_version_from_input,
    ) -> None:
        """Run the variant annotation step and write the result as parquet.

        Args:
            session (Session): Session object; its `write_mode` is used when writing the output.
            variant_annotation_path (str): Path to the resulting dataset.
            gnomad_genomes_path (str): Path to the gnomAD genomes Hail table, e.g. `gs://gcp-public-data--gnomad/release/4.0/ht/genomes/gnomad.genomes.v4.0.sites.ht/`.
            gnomad_variant_populations (list[VariantPopulation | str]): List of populations to include.
            use_version_from_input (bool): Whether to amend the output path with the version derived from gnomad_genomes_path.

        When use_version_from_input is truthy, the output path is replaced by the result of
        `VersionEngine("gnomad").amend_version(gnomad_genomes_path, variant_annotation_path)`.
        Every default value is read from GnomadVariantConfig.
        """
        # Resolve the final output location, optionally tagged with the data source version.
        output_path = variant_annotation_path
        if use_version_from_input:
            version_engine = VersionEngine("gnomad")
            output_path = version_engine.amend_version(
                gnomad_genomes_path, variant_annotation_path
            )

        # Load the variant information from the gnomAD source.
        gnomad_variants = GnomADVariants(
            gnomad_genomes_path=gnomad_genomes_path,
            gnomad_variant_populations=gnomad_variant_populations,
        )

        # Turn the parsed data into a variant index.
        variant_index = gnomad_variants.as_variant_index()

        # Persist the dataset using the write mode configured on the session.
        writer = variant_index.df.write.mode(session.write_mode)
        writer.parquet(output_path)

__init__(session: Session, variant_annotation_path: str = GnomadVariantConfig().variant_annotation_path, gnomad_genomes_path: str = GnomadVariantConfig().gnomad_genomes_path, gnomad_variant_populations: list[VariantPopulation | str] = GnomadVariantConfig().gnomad_variant_populations, use_version_from_input: bool = GnomadVariantConfig().use_version_from_input) -> None

Run Variant Annotation step.

Parameters:

Name Type Description Default
session Session

Session object.

required
variant_annotation_path str

Path to resulting dataset.

variant_annotation_path
gnomad_genomes_path str

Path to gnomAD genomes hail table, e.g. gs://gcp-public-data--gnomad/release/4.0/ht/genomes/gnomad.genomes.v4.0.sites.ht/.

gnomad_genomes_path
gnomad_variant_populations list[VariantPopulation | str]

List of populations to include.

gnomad_variant_populations
use_version_from_input bool

Append version derived from input gnomad_genomes_path to the output variant_annotation_path. Defaults to False.

use_version_from_input

In case use_version_from_input is set to True, data source version inferred from gnomad_genomes_path is appended as the last path segment to the output path. All defaults are stored in the GnomadVariantConfig.

Source code in src/gentropy/gnomad_ingestion.py
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
def __init__(
    self,
    session: Session,
    variant_annotation_path: str = GnomadVariantConfig().variant_annotation_path,
    gnomad_genomes_path: str = GnomadVariantConfig().gnomad_genomes_path,
    gnomad_variant_populations: list[
        VariantPopulation | str
    ] = GnomadVariantConfig().gnomad_variant_populations,
    use_version_from_input: bool = GnomadVariantConfig().use_version_from_input,
) -> None:
    """Generate the variant index from gnomAD and write it to parquet.

    Args:
        session (Session): Session object; supplies the `write_mode` for the output.
        variant_annotation_path (str): Path to the resulting dataset.
        gnomad_genomes_path (str): Path to the gnomAD genomes Hail table, e.g. `gs://gcp-public-data--gnomad/release/4.0/ht/genomes/gnomad.genomes.v4.0.sites.ht/`.
        gnomad_variant_populations (list[VariantPopulation | str]): List of populations to include.
        use_version_from_input (bool): Whether to amend the output path with the version derived from gnomad_genomes_path.

    If use_version_from_input is truthy, the destination becomes whatever
    `VersionEngine("gnomad").amend_version` returns for the input and output paths.
    Defaults for all optional arguments come from GnomadVariantConfig.
    """
    # Pick the destination: version-amended when requested, otherwise as given.
    destination = (
        VersionEngine("gnomad").amend_version(
            gnomad_genomes_path, variant_annotation_path
        )
        if use_version_from_input
        else variant_annotation_path
    )

    # Parse the source variants and convert them to a variant index.
    source_variants = GnomADVariants(
        gnomad_genomes_path=gnomad_genomes_path,
        gnomad_variant_populations=gnomad_variant_populations,
    )
    index_df = source_variants.as_variant_index().df

    # Write the dataset with the session's configured write mode.
    index_df.write.mode(session.write_mode).parquet(destination)