Skip to content

GnomAD variant data ingestion

gentropy.gnomad_ingestion.GnomadVariantIndexStep

A step to generate variant index dataset from gnomad data.

The variant annotation step produces a dataset of the type VariantIndex derived from gnomAD's gnomad.genomes.vX.X.X.sites.ht Hail table. This dataset is used to validate variants and as a source of annotation.

Source code in src/gentropy/gnomad_ingestion.py
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
class GnomadVariantIndexStep:
    """Build a variant index dataset from gnomAD data.

    The step produces a dataset of the type `VariantIndex` derived from gnomAD's
    `gnomad.genomes.vX.X.X.sites.ht` Hail table. The resulting dataset is used
    to validate variants and as a source of annotation.
    """

    def __init__(
        self,
        session: Session,
        variant_annotation_path: str = GnomadVariantConfig().variant_annotation_path,
        gnomad_genomes_path: str = GnomadVariantConfig().gnomad_genomes_path,
        gnomad_joint_path: str = GnomadVariantConfig().gnomad_joint_path,
        gnomad_variant_populations: list[
            VariantPopulation | str
        ] = GnomadVariantConfig().gnomad_variant_populations,
    ) -> None:
        """Run the variant annotation step and write the result as parquet.

        Args:
            session (Session): Session object; provides the logger and the write mode.
            variant_annotation_path (str): Output path for the variant annotation dataset.
            gnomad_genomes_path (str): Path to the gnomAD genomes hail table.
            gnomad_joint_path (str): Path to the gnomAD joint hail table.
            gnomad_variant_populations (list[VariantPopulation | str]): List of populations to include in the annotation.

        All defaults are stored in the GnomadVariantConfig.
        """
        # Report where the output is going before any heavy work starts.
        log = session.logger
        log.info("Gnomad variant annotation path:")
        log.info(variant_annotation_path)

        # rsID source: the gnomAD genomes table, exposed as a variant index.
        rsid_source = GnomADVariantRsIds(gnomad_genomes_path=gnomad_genomes_path)
        rsid_index = rsid_source.as_variant_index()

        # Allele-frequency source: the gnomAD joint table, limited to the
        # requested populations, exposed as a variant index.
        frequency_source = GnomADVariantFrequencies(
            gnomad_joint_path=gnomad_joint_path,
            gnomad_variant_populations=gnomad_variant_populations,
        )
        frequency_index = frequency_source.as_variant_index()

        # Annotate the frequency index with the rsID index.
        annotated_index = frequency_index.add_annotation(rsid_index)

        # Range-partition by genomic coordinate and sort inside each partition
        # before writing.
        partition_keys = ("chromosome", "position")
        ordered_df = annotated_index.df.repartitionByRange(
            *partition_keys
        ).sortWithinPartitions(*partition_keys)

        writer = ordered_df.write.mode(session.write_mode)
        writer.parquet(variant_annotation_path)

__init__(session: Session, variant_annotation_path: str = GnomadVariantConfig().variant_annotation_path, gnomad_genomes_path: str = GnomadVariantConfig().gnomad_genomes_path, gnomad_joint_path: str = GnomadVariantConfig().gnomad_joint_path, gnomad_variant_populations: list[VariantPopulation | str] = GnomadVariantConfig().gnomad_variant_populations) -> None

Run Variant Annotation step.

Parameters:

Name Type Description Default
session Session

Session object.

required
variant_annotation_path str

Output path for the variant annotation dataset.

GnomadVariantConfig().variant_annotation_path
gnomad_genomes_path str

Path to the gnomAD genomes hail table.

GnomadVariantConfig().gnomad_genomes_path
gnomad_joint_path str

Path to the gnomAD joint hail table.

GnomadVariantConfig().gnomad_joint_path
gnomad_variant_populations list[VariantPopulation | str]

List of populations to include in the annotation.

GnomadVariantConfig().gnomad_variant_populations

All defaults are stored in the GnomadVariantConfig.

Source code in src/gentropy/gnomad_ingestion.py
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
def __init__(
    self,
    session: Session,
    variant_annotation_path: str = GnomadVariantConfig().variant_annotation_path,
    gnomad_genomes_path: str = GnomadVariantConfig().gnomad_genomes_path,
    gnomad_joint_path: str = GnomadVariantConfig().gnomad_joint_path,
    gnomad_variant_populations: list[
        VariantPopulation | str
    ] = GnomadVariantConfig().gnomad_variant_populations,
) -> None:
    """Run the variant annotation step and write the result as parquet.

    Args:
        session (Session): Session object; provides the logger and the write mode.
        variant_annotation_path (str): Output path for the variant annotation dataset.
        gnomad_genomes_path (str): Path to the gnomAD genomes hail table.
        gnomad_joint_path (str): Path to the gnomAD joint hail table.
        gnomad_variant_populations (list[VariantPopulation | str]): List of populations to include in the annotation.

    All defaults are stored in the GnomadVariantConfig.
    """
    # Log the destination up front so it is visible before the job runs.
    step_logger = session.logger
    step_logger.info("Gnomad variant annotation path:")
    step_logger.info(variant_annotation_path)

    # Variant index built from the gnomAD genomes table (rsID source).
    rsids_vi = GnomADVariantRsIds(
        gnomad_genomes_path=gnomad_genomes_path
    ).as_variant_index()

    # Variant index built from the gnomAD joint table for the requested
    # populations (allele-frequency source).
    frequencies_vi = GnomADVariantFrequencies(
        gnomad_joint_path=gnomad_joint_path,
        gnomad_variant_populations=gnomad_variant_populations,
    ).as_variant_index()

    # Annotate frequencies with rsIDs, then lay the data out by coordinate.
    combined_vi = frequencies_vi.add_annotation(rsids_vi)
    coordinate_columns = ["chromosome", "position"]
    laid_out_df = combined_vi.df.repartitionByRange(*coordinate_columns)
    laid_out_df = laid_out_df.sortWithinPartitions(*coordinate_columns)

    # Persist using the write mode configured on the session.
    laid_out_df.write.mode(session.write_mode).parquet(variant_annotation_path)