Skip to content

V2G

Bases: V2GStepConfig

Variant-to-gene (V2G) step.

This step aims to generate a dataset that contains multiple pieces of evidence supporting the functional association of specific variants with genes. Some of the evidence types include:

  1. Chromatin interaction experiments, e.g. Promoter Capture Hi-C (PCHi-C).
  2. In silico functional predictions, e.g. Variant Effect Predictor (VEP) from Ensembl.
  3. Distance between the variant and each gene's canonical transcription start site (TSS).
Source code in src/otg/v2g.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
@dataclass
class V2GStep(V2GStepConfig):
    """Variant-to-gene (V2G) step.

    Produces one dataset gathering several independent lines of evidence that tie specific variants to genes. The evidence types include:

    1. Chromatin interaction experiments, e.g. Promoter Capture Hi-C (PCHi-C).
    2. In silico functional predictions, e.g. Variant Effect Predictor (VEP) from Ensembl.
    3. Distance between the variant and each gene's canonical transcription start site (TSS).

    """

    session: Session = Session()

    def run(self: V2GStep) -> None:
        """Run V2G dataset generation."""
        session = self.session

        # Gene universe: only genes whose biotype is in the approved list.
        genes = GeneIndex.from_parquet(session, self.gene_index_path)
        genes = genes.filter_by_biotypes(self.approved_biotypes)

        variant_index = VariantIndex.from_parquet(
            session, self.variant_index_path
        ).persist()
        variant_annotation = VariantAnnotation.from_parquet(
            session, self.variant_annotation_path
        )
        consequences = session.spark.read.csv(
            self.vep_consequences_path, sep="\t", header=True
        )

        # Variant universe: annotation rows restricted to the variant index.
        annotated_variants = variant_annotation.filter_by_variant_df(
            variant_index.df, ["id", "chromosome"]
        ).persist()

        # Liftover helper handed to every interval parser below.
        liftover = LiftOverSpark(
            self.liftover_chain_file_path, self.liftover_max_length_difference
        )

        # Distance-based and variant-effect evidence.
        evidence = [
            annotated_variants.get_distance_to_tss(genes, self.max_distance),
            annotated_variants.get_most_severe_vep_v2g(consequences, genes),
            annotated_variants.get_polyphen_v2g(genes),
            annotated_variants.get_sift_v2g(genes),
            annotated_variants.get_plof_v2g(genes),
        ]

        # Interval-based evidence, parsed in a fixed order.
        interval_sources = (
            (Intervals.parse_andersson, self.anderson_path),
            (Intervals.parse_javierre, self.javierre_path),
            (Intervals.parse_jung, self.jung_path),
            (Intervals.parse_thurman, self.thurnman_path),
        )
        for parse_intervals, intervals_path in interval_sources:
            intervals = parse_intervals(session, intervals_path, genes, liftover)
            evidence.append(intervals.v2g(variant_index))

        # Union every evidence frame left to right; columns absent from one
        # side are tolerated via allowMissingColumns.
        frames = [dataset.df for dataset in evidence]
        merged = frames[0]
        for frame in frames[1:]:
            merged = merged.unionByName(frame, allowMissingColumns=True)
        v2g = V2G(_df=merged.repartition("chromosome"))

        # Persist the result as parquet, partitioned by chromosome.
        writer = v2g.df.write.partitionBy("chromosome").mode(session.write_mode)
        writer.parquet(self.v2g_path)

run()

Run V2G dataset generation.

Source code in src/otg/v2g.py
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
def run(self: V2GStep) -> None:
    """Run V2G dataset generation."""
    session = self.session

    # Gene universe: only genes whose biotype is in the approved list.
    genes = GeneIndex.from_parquet(session, self.gene_index_path)
    genes = genes.filter_by_biotypes(self.approved_biotypes)

    variant_index = VariantIndex.from_parquet(
        session, self.variant_index_path
    ).persist()
    variant_annotation = VariantAnnotation.from_parquet(
        session, self.variant_annotation_path
    )
    consequences = session.spark.read.csv(
        self.vep_consequences_path, sep="\t", header=True
    )

    # Variant universe: annotation rows restricted to the variant index.
    annotated_variants = variant_annotation.filter_by_variant_df(
        variant_index.df, ["id", "chromosome"]
    ).persist()

    # Liftover helper handed to every interval parser below.
    liftover = LiftOverSpark(
        self.liftover_chain_file_path, self.liftover_max_length_difference
    )

    # Distance-based and variant-effect evidence.
    evidence = [
        annotated_variants.get_distance_to_tss(genes, self.max_distance),
        annotated_variants.get_most_severe_vep_v2g(consequences, genes),
        annotated_variants.get_polyphen_v2g(genes),
        annotated_variants.get_sift_v2g(genes),
        annotated_variants.get_plof_v2g(genes),
    ]

    # Interval-based evidence, parsed in a fixed order.
    interval_sources = (
        (Intervals.parse_andersson, self.anderson_path),
        (Intervals.parse_javierre, self.javierre_path),
        (Intervals.parse_jung, self.jung_path),
        (Intervals.parse_thurman, self.thurnman_path),
    )
    for parse_intervals, intervals_path in interval_sources:
        intervals = parse_intervals(session, intervals_path, genes, liftover)
        evidence.append(intervals.v2g(variant_index))

    # Union every evidence frame left to right; columns absent from one
    # side are tolerated via allowMissingColumns.
    frames = [dataset.df for dataset in evidence]
    merged = frames[0]
    for frame in frames[1:]:
        merged = merged.unionByName(frame, allowMissingColumns=True)
    v2g = V2G(_df=merged.repartition("chromosome"))

    # Persist the result as parquet, partitioned by chromosome.
    writer = v2g.df.write.partitionBy("chromosome").mode(session.write_mode)
    writer.parquet(self.v2g_path)

Variant to gene (V2G) step requirements.

Attributes:

Name Type Description
variant_index_path str

Input variant index path.

variant_annotation_path str

Input variant annotation path.

gene_index_path str

Input gene index path.

vep_consequences_path str

Input VEP consequences path.

liftover_chain_file_path str

Path to GRCh37 to GRCh38 chain file.

approved_biotypes list[str]

List of approved biotypes.

anderson_path str

Andersson intervals path.

javierre_path str

Javierre intervals path.

jung_path str

Jung intervals path.

thurnman_path str

Thurman intervals path.

liftover_max_length_difference int

Maximum length difference for liftover.

max_distance int

Maximum distance to consider.

v2g_path str

Output V2G path.

Source code in src/otg/config.py
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
@dataclass
class V2GStepConfig:
    """Variant to gene (V2G) step requirements.

    Attributes:
        variant_index_path (str): Input variant index path.
        variant_annotation_path (str): Input variant annotation path.
        gene_index_path (str): Input gene index path.
        vep_consequences_path (str): Input VEP consequences path.
        liftover_chain_file_path (str): Path to GRCh37 to GRCh38 chain file.
        anderson_path (str): Andersson intervals path.
        javierre_path (str): Javierre intervals path.
        jung_path (str): Jung intervals path.
        thurnman_path (str): Thurman intervals path.
        liftover_max_length_difference (int): Maximum length difference for liftover. Defaults to 100.
        max_distance (int): Maximum distance to consider. Defaults to 500_000.
        v2g_path (str): Output V2G path.
        approved_biotypes (List[str]): List of approved biotypes. Defaults to the biotypes listed in the field definition.
    """

    # Dotted path of the step class this configuration is paired with.
    # NOTE(review): presumably consumed by Hydra-style instantiation — confirm.
    _target_: str = "otg.v2g.V2GStep"
    # Fields set to MISSING carry no usable default here.
    # NOTE(review): presumably the mandatory-value marker from the config
    # library imported at the top of the file — confirm against the imports.
    variant_index_path: str = MISSING
    variant_annotation_path: str = MISSING
    gene_index_path: str = MISSING
    vep_consequences_path: str = MISSING
    liftover_chain_file_path: str = MISSING
    anderson_path: str = MISSING
    javierre_path: str = MISSING
    jung_path: str = MISSING
    thurnman_path: str = MISSING
    liftover_max_length_difference: int = 100
    max_distance: int = 500_000
    v2g_path: str = MISSING
    # default_factory gives every instance its own list instead of sharing
    # one mutable default across instances.
    approved_biotypes: List[str] = field(
        default_factory=lambda: [
            "protein_coding",
            "3prime_overlapping_ncRNA",
            "antisense",
            "bidirectional_promoter_lncRNA",
            "IG_C_gene",
            "IG_D_gene",
            "IG_J_gene",
            "IG_V_gene",
            "lincRNA",
            "macro_lncRNA",
            "non_coding",
            "sense_intronic",
            "sense_overlapping",
        ]
    )