Skip to content

Variant annotation

Bases: VariantAnnotationStepConfig

Variant annotation step.

The variant annotation step produces a dataset of the type VariantAnnotation derived from gnomAD's gnomad.genomes.vX.X.X.sites.ht Hail table. This dataset is used to validate variants and as a source of annotation.

Source code in src/otg/variant_annotation.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
@dataclass
class VariantAnnotationStep(VariantAnnotationStepConfig):
    """Variant annotation step.

    The variant annotation step produces a dataset of the type `VariantAnnotation` derived from gnomAD's `gnomad.genomes.vX.X.X.sites.ht` Hail table. This dataset is used to validate variants and as a source of annotation.
    """

    # NOTE(review): this default is evaluated once, when the class body is executed,
    # so every instance created without an explicit `session` shares the same object.
    session: Session = Session()

    def run(self: VariantAnnotationStep) -> None:
        """Run variant annotation step.

        Initialises Hail on the session's Spark context, builds the annotation with `VariantAnnotation.from_gnomad` and writes the resulting dataframe as parquet to `variant_annotation_path`.
        """
        # Initialise Hail on the session's Spark context; the Hail log goes to /dev/null.
        hl.init(sc=self.session.spark.sparkContext, log="/dev/null")

        variant_annotation = VariantAnnotation.from_gnomad(
            self.gnomad_genomes,
            self.chain_38_to_37,
            self.populations,
        )
        # Write the data partitioned by chromosome, with rows sorted by
        # chromosome and position inside each partition:
        (
            variant_annotation.df.repartition(400, "chromosome")
            .sortWithinPartitions("chromosome", "position")
            .write.partitionBy("chromosome")
            .mode(self.session.write_mode)
            .parquet(self.variant_annotation_path)
        )

run()

Run variant annotation step.

Source code in src/otg/variant_annotation.py
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
def run(self: VariantAnnotationStep) -> None:
    """Run variant annotation step.

    Initialises Hail on the session's Spark context, builds the annotation with `VariantAnnotation.from_gnomad` and writes the resulting dataframe as parquet to `variant_annotation_path`.
    """
    # Initialise Hail on the session's Spark context; the Hail log goes to /dev/null.
    hl.init(sc=self.session.spark.sparkContext, log="/dev/null")

    variant_annotation = VariantAnnotation.from_gnomad(
        self.gnomad_genomes,
        self.chain_38_to_37,
        self.populations,
    )
    # Write the data partitioned by chromosome, with rows sorted by
    # chromosome and position inside each partition:
    (
        variant_annotation.df.repartition(400, "chromosome")
        .sortWithinPartitions("chromosome", "position")
        .write.partitionBy("chromosome")
        .mode(self.session.write_mode)
        .parquet(self.variant_annotation_path)
    )

Variant annotation step requirements.

Attributes:

Name Type Description
gnomad_genomes str

Path to gnomAD genomes hail table.

chain_38_to_37 str

Path to GRCh38 to GRCh37 chain file.

variant_annotation_path str

Output variant annotation path.

populations List[str]

List of populations to include.

Source code in src/otg/config.py
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
@dataclass
class VariantAnnotationStepConfig:
    """Configuration required by the variant annotation step.

    Attributes:
        gnomad_genomes (str): Location of the gnomAD genomes Hail table.
        chain_38_to_37 (str): Location of the GRCh38 to GRCh37 chain file.
        variant_annotation_path (str): Destination path for the variant annotation output.
        populations (List[str]): Population codes to include. Defaults to the ten codes listed in the field definition.
    """

    # Dotted path of the step class this configuration belongs to.
    _target_: str = "otg.variant_annotation.VariantAnnotationStep"

    # The three paths have no usable default: each is set to MISSING.
    gnomad_genomes: str = MISSING
    chain_38_to_37: str = MISSING
    variant_annotation_path: str = MISSING

    # A factory is used so that every instance receives its own list.
    populations: List[str] = field(
        default_factory=lambda: [
            "afr",  # African-American
            "amr",  # American Admixed/Latino
            "ami",  # Amish ancestry
            "asj",  # Ashkenazi Jewish
            "eas",  # East Asian
            "fin",  # Finnish
            "nfe",  # Non-Finnish European
            "mid",  # Middle Eastern
            "sas",  # South Asian
            "oth",  # Other
        ]
    )