Skip to content

GnomAD variant data ingestion

gentropy.gnomad_ingestion.GnomadVariantIndexStep

A step to generate a variant index dataset from gnomAD data.

The variant annotation step produces a dataset of the type VariantIndex derived from gnomAD's gnomad.genomes.vX.X.X.sites.ht Hail table. This dataset is used to validate variants and as a source of annotation.

Source code in src/gentropy/gnomad_ingestion.py
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
class GnomadVariantIndexStep:
    """Build a variant index dataset from gnomAD data.

    The step reads gnomAD's `gnomad.genomes.vX.X.X.sites.ht` Hail table and produces a
    dataset of the type `VariantIndex`. This dataset is used to validate variants and
    as a source of annotation.
    """

    def __init__(
        self,
        session: Session,
        variant_annotation_path: str = GnomadVariantConfig().variant_annotation_path,
        gnomad_genomes_path: str = GnomadVariantConfig().gnomad_genomes_path,
        gnomad_variant_populations: list[
            VariantPopulation | str
        ] = GnomadVariantConfig().gnomad_variant_populations,
    ) -> None:
        """Run the variant annotation step and write the result as parquet.

        The output path is logged, the gnomAD variants are converted to a variant
        index, and the resulting dataframe is range-partitioned and sorted by
        chromosome and position before being written.

        Args:
            session (Session): Session object; provides the logger and the write mode.
            variant_annotation_path (str): Path to resulting dataset.
            gnomad_genomes_path (str): Path to gnomAD genomes hail table, e.g. `gs://gcp-public-data--gnomad/release/4.0/ht/genomes/gnomad.genomes.v4.0.sites.ht/`.
            gnomad_variant_populations (list[VariantPopulation | str]): List of populations to include.

        All defaults are stored in the GnomadVariantConfig.
        """
        # Report where the output is going before any work starts.
        for message in ("Gnomad variant annotation path:", variant_annotation_path):
            session.logger.info(message)

        # Parse variant info from the source table.
        gnomad_variants = GnomADVariants(
            gnomad_genomes_path=gnomad_genomes_path,
            gnomad_variant_populations=gnomad_variant_populations,
        )
        variant_index = gnomad_variants.as_variant_index()

        # Partition by genomic range, then order rows inside each partition
        # on the same pair of columns.
        ordering_columns = ("chromosome", "position")
        partitioned_df = variant_index.df.repartitionByRange(*ordering_columns)
        ordered_df = partitioned_df.sortWithinPartitions(*ordering_columns)

        # Write the dataset using the write mode configured on the session.
        writer = ordered_df.write.mode(session.write_mode)
        writer.parquet(variant_annotation_path)

__init__(session: Session, variant_annotation_path: str = GnomadVariantConfig().variant_annotation_path, gnomad_genomes_path: str = GnomadVariantConfig().gnomad_genomes_path, gnomad_variant_populations: list[VariantPopulation | str] = GnomadVariantConfig().gnomad_variant_populations) -> None

Run Variant Annotation step.

Parameters:

Name Type Description Default
session Session

Session object.

required
variant_annotation_path str

Path to resulting dataset.

variant_annotation_path
gnomad_genomes_path str

Path to gnomAD genomes hail table, e.g. gs://gcp-public-data--gnomad/release/4.0/ht/genomes/gnomad.genomes.v4.0.sites.ht/.

gnomad_genomes_path
gnomad_variant_populations list[VariantPopulation | str]

List of populations to include.

gnomad_variant_populations

All defaults are stored in the GnomadVariantConfig.

Source code in src/gentropy/gnomad_ingestion.py
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
def __init__(
    self,
    session: Session,
    variant_annotation_path: str = GnomadVariantConfig().variant_annotation_path,
    gnomad_genomes_path: str = GnomadVariantConfig().gnomad_genomes_path,
    gnomad_variant_populations: list[
        VariantPopulation | str
    ] = GnomadVariantConfig().gnomad_variant_populations,
) -> None:
    """Run the variant annotation step and write the result as parquet.

    The output path is logged, the gnomAD variants are converted to a variant
    index, and the resulting dataframe is range-partitioned and sorted by
    chromosome and position before being written.

    Args:
        session (Session): Session object; provides the logger and the write mode.
        variant_annotation_path (str): Path to resulting dataset.
        gnomad_genomes_path (str): Path to gnomAD genomes hail table, e.g. `gs://gcp-public-data--gnomad/release/4.0/ht/genomes/gnomad.genomes.v4.0.sites.ht/`.
        gnomad_variant_populations (list[VariantPopulation | str]): List of populations to include.

    All defaults are stored in the GnomadVariantConfig.
    """
    # Report where the output is going before any work starts.
    for message in ("Gnomad variant annotation path:", variant_annotation_path):
        session.logger.info(message)

    # Parse variant info from the source table.
    gnomad_variants = GnomADVariants(
        gnomad_genomes_path=gnomad_genomes_path,
        gnomad_variant_populations=gnomad_variant_populations,
    )
    variant_index = gnomad_variants.as_variant_index()

    # Partition by genomic range, then order rows inside each partition
    # on the same pair of columns.
    ordering_columns = ("chromosome", "position")
    partitioned_df = variant_index.df.repartitionByRange(*ordering_columns)
    ordered_df = partitioned_df.sortWithinPartitions(*ordering_columns)

    # Write the dataset using the write mode configured on the session.
    writer = ordered_df.write.mode(session.write_mode)
    writer.parquet(variant_annotation_path)