variant_to_gene

gentropy.v2g.V2GStep

Variant-to-gene (V2G) step.

This step aims to generate a dataset that contains multiple pieces of evidence supporting the functional association of specific variants with genes. Some of the evidence types include:

  1. Chromatin interaction experiments, e.g. Promoter Capture Hi-C (PCHi-C).
  2. In silico functional predictions, e.g. Variant Effect Predictor (VEP) from Ensembl.
  3. Distance between the variant and each gene's canonical transcription start site (TSS).
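These evidence sources are combined by the step into a single V2G dataset. As a concrete illustration, here is a minimal sketch of invoking the step from Python. The import path for Session, every file path, and the interval-source mapping are placeholders and assumptions for illustration only; in practice these values come from the pipeline configuration.

from gentropy.common.session import Session  # assumed import path for Session
from gentropy.v2g import V2GStep

# All paths and the interval-source key below are hypothetical placeholders.
session = Session()
V2GStep(
    session=session,
    variant_index_path="gs://bucket/variant_index",
    variant_annotation_path="gs://bucket/variant_annotation",
    gene_index_path="gs://bucket/gene_index",
    vep_consequences_path="gs://bucket/vep_consequences.tsv",
    liftover_chain_file_path="gs://bucket/grch37_to_grch38.over.chain",
    approved_biotypes=["protein_coding"],
    interval_sources={"pchic": "gs://bucket/intervals/pchic"},
    v2g_path="gs://bucket/v2g",
)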

Attributes:

session (Session): Session object.
variant_index_path (str): Input variant index path.
variant_annotation_path (str): Input variant annotation path.
gene_index_path (str): Input gene index path.
vep_consequences_path (str): Input VEP consequences path.
liftover_chain_file_path (str): Path to GRCh37 to GRCh38 chain file.
liftover_max_length_difference (int): Maximum length difference for liftover.
max_distance (int): Maximum distance to consider.
approved_biotypes (list[str]): List of approved biotypes.
intervals (dict): Dictionary of interval sources.
v2g_path (str): Output V2G path.

Source code in src/gentropy/v2g.py
class V2GStep:
    """Variant-to-gene (V2G) step.

    This step aims to generate a dataset that contains multiple pieces of evidence supporting the functional association of specific variants with genes. Some of the evidence types include:

    1. Chromatin interaction experiments, e.g. Promoter Capture Hi-C (PCHi-C).
    2. In silico functional predictions, e.g. Variant Effect Predictor (VEP) from Ensembl.
    3. Distance between the variant and each gene's canonical transcription start site (TSS).

    Attributes:
        session (Session): Session object.
        variant_index_path (str): Input variant index path.
        variant_annotation_path (str): Input variant annotation path.
        gene_index_path (str): Input gene index path.
        vep_consequences_path (str): Input VEP consequences path.
        liftover_chain_file_path (str): Path to GRCh37 to GRCh38 chain file.
        liftover_max_length_difference (int): Maximum length difference for liftover.
        max_distance (int): Maximum distance to consider.
        approved_biotypes (list[str]): List of approved biotypes.
        intervals (dict): Dictionary of interval sources.
        v2g_path (str): Output V2G path.
    """

    def __init__(
        self,
        session: Session,
        variant_index_path: str,
        variant_annotation_path: str,
        gene_index_path: str,
        vep_consequences_path: str,
        liftover_chain_file_path: str,
        approved_biotypes: list[str],
        interval_sources: dict[str, str],
        v2g_path: str,
        max_distance: int = 500_000,
        liftover_max_length_difference: int = 100,
    ) -> None:
        """Run Variant-to-gene (V2G) step.

        Args:
            session (Session): Session object.
            variant_index_path (str): Input variant index path.
            variant_annotation_path (str): Input variant annotation path.
            gene_index_path (str): Input gene index path.
            vep_consequences_path (str): Input VEP consequences path.
            liftover_chain_file_path (str): Path to GRCh37 to GRCh38 chain file.
            approved_biotypes (list[str]): List of approved biotypes.
            interval_sources (dict[str, str]): Dictionary of interval sources.
            v2g_path (str): Output V2G path.
            max_distance (int): Maximum distance to consider.
            liftover_max_length_difference (int): Maximum length difference for liftover.
        """
        # Read
        gene_index = GeneIndex.from_parquet(session, gene_index_path)
        vi = VariantIndex.from_parquet(session, variant_index_path).persist()
        va = VariantAnnotation.from_parquet(session, variant_annotation_path)
        vep_consequences = session.spark.read.csv(
            vep_consequences_path, sep="\t", header=True
        ).select(
            f.element_at(f.split("Accession", r"/"), -1).alias(
                "variantFunctionalConsequenceId"
            ),
            f.col("Term").alias("label"),
            f.col("v2g_score").cast("double").alias("score"),
        )

        # Transform
        lift = LiftOverSpark(
            # lift over variants to hg38
            liftover_chain_file_path,
            liftover_max_length_difference,
        )
        gene_index_filtered = gene_index.filter_by_biotypes(
            # Filter gene index by approved biotypes to define V2G gene universe
            list(approved_biotypes)
        )
        va_slimmed = va.filter_by_variant_df(
            # Variant annotation reduced to the variant index to define V2G variant universe
            vi.df
        ).persist()
        intervals = Intervals(
            _df=reduce(
                lambda x, y: x.unionByName(y, allowMissingColumns=True),
                # create interval instances by parsing each source
                [
                    Intervals.from_source(
                        session.spark, source_name, source_path, gene_index, lift
                    ).df
                    for source_name, source_path in interval_sources.items()
                ],
            ),
            _schema=Intervals.get_schema(),
        )
        v2g_datasets = [
            va_slimmed.get_distance_to_tss(gene_index_filtered, max_distance),
            va_slimmed.get_most_severe_vep_v2g(vep_consequences, gene_index_filtered),
            va_slimmed.get_plof_v2g(gene_index_filtered),
            intervals.v2g(vi),
        ]
        v2g = V2G(
            _df=reduce(
                lambda x, y: x.unionByName(y, allowMissingColumns=True),
                [dataset.df for dataset in v2g_datasets],
            ).repartition("chromosome"),
            _schema=V2G.get_schema(),
        )

        # Load
        (
            v2g.df.write.partitionBy("chromosome")
            .mode(session.write_mode)
            .parquet(v2g_path)
        )
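One implementation detail worth noting above: the evidence DataFrames are combined with unionByName(..., allowMissingColumns=True), which aligns columns by name and null-fills any column that is absent from one of the inputs. A self-contained toy sketch of that behaviour (made-up columns, not the real V2G schema):

from functools import reduce

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").getOrCreate()

# Toy stand-ins for evidence DataFrames with partially overlapping columns.
distance_df = spark.createDataFrame(
    [("v1", "g1", 0.9)], ["variantId", "geneId", "score"]
)
vep_df = spark.createDataFrame(
    [("v1", "g2", "missense_variant")], ["variantId", "geneId", "label"]
)

# Align by column name; columns missing from one side become nulls.
combined = reduce(
    lambda x, y: x.unionByName(y, allowMissingColumns=True),
    [distance_df, vep_df],
)
combined.show()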

__init__(session: Session, variant_index_path: str, variant_annotation_path: str, gene_index_path: str, vep_consequences_path: str, liftover_chain_file_path: str, approved_biotypes: list[str], interval_sources: dict[str, str], v2g_path: str, max_distance: int = 500000, liftover_max_length_difference: int = 100) -> None

Run Variant-to-gene (V2G) step.

Parameters:

session (Session): Session object. Required.
variant_index_path (str): Input variant index path. Required.
variant_annotation_path (str): Input variant annotation path. Required.
gene_index_path (str): Input gene index path. Required.
vep_consequences_path (str): Input VEP consequences path. Required.
liftover_chain_file_path (str): Path to GRCh37 to GRCh38 chain file. Required.
approved_biotypes (list[str]): List of approved biotypes. Required.
interval_sources (dict[str, str]): Dictionary of interval sources. Required.
v2g_path (str): Output V2G path. Required.
max_distance (int): Maximum distance to consider. Default: 500000.
liftover_max_length_difference (int): Maximum length difference for liftover. Default: 100.
Source code in src/gentropy/v2g.py
def __init__(
    self,
    session: Session,
    variant_index_path: str,
    variant_annotation_path: str,
    gene_index_path: str,
    vep_consequences_path: str,
    liftover_chain_file_path: str,
    approved_biotypes: list[str],
    interval_sources: dict[str, str],
    v2g_path: str,
    max_distance: int = 500_000,
    liftover_max_length_difference: int = 100,
) -> None:
    """Run Variant-to-gene (V2G) step.

    Args:
        session (Session): Session object.
        variant_index_path (str): Input variant index path.
        variant_annotation_path (str): Input variant annotation path.
        gene_index_path (str): Input gene index path.
        vep_consequences_path (str): Input VEP consequences path.
        liftover_chain_file_path (str): Path to GRCh37 to GRCh38 chain file.
        approved_biotypes (list[str]): List of approved biotypes.
        interval_sources (dict[str, str]): Dictionary of interval sources.
        v2g_path (str): Output V2G path.
        max_distance (int): Maximum distance to consider.
        liftover_max_length_difference (int): Maximum length difference for liftover.
    """
    # Read
    gene_index = GeneIndex.from_parquet(session, gene_index_path)
    vi = VariantIndex.from_parquet(session, variant_index_path).persist()
    va = VariantAnnotation.from_parquet(session, variant_annotation_path)
    vep_consequences = session.spark.read.csv(
        vep_consequences_path, sep="\t", header=True
    ).select(
        f.element_at(f.split("Accession", r"/"), -1).alias(
            "variantFunctionalConsequenceId"
        ),
        f.col("Term").alias("label"),
        f.col("v2g_score").cast("double").alias("score"),
    )

    # Transform
    lift = LiftOverSpark(
        # lift over variants to hg38
        liftover_chain_file_path,
        liftover_max_length_difference,
    )
    gene_index_filtered = gene_index.filter_by_biotypes(
        # Filter gene index by approved biotypes to define V2G gene universe
        list(approved_biotypes)
    )
    va_slimmed = va.filter_by_variant_df(
        # Variant annotation reduced to the variant index to define V2G variant universe
        vi.df
    ).persist()
    intervals = Intervals(
        _df=reduce(
            lambda x, y: x.unionByName(y, allowMissingColumns=True),
            # create interval instances by parsing each source
            [
                Intervals.from_source(
                    session.spark, source_name, source_path, gene_index, lift
                ).df
                for source_name, source_path in interval_sources.items()
            ],
        ),
        _schema=Intervals.get_schema(),
    )
    v2g_datasets = [
        va_slimmed.get_distance_to_tss(gene_index_filtered, max_distance),
        va_slimmed.get_most_severe_vep_v2g(vep_consequences, gene_index_filtered),
        va_slimmed.get_plof_v2g(gene_index_filtered),
        intervals.v2g(vi),
    ]
    v2g = V2G(
        _df=reduce(
            lambda x, y: x.unionByName(y, allowMissingColumns=True),
            [dataset.df for dataset in v2g_datasets],
        ).repartition("chromosome"),
        _schema=V2G.get_schema(),
    )

    # Load
    (
        v2g.df.write.partitionBy("chromosome")
        .mode(session.write_mode)
        .parquet(v2g_path)
    )
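Since the result is written as Parquet partitioned by chromosome, downstream readers can prune partitions rather than scan the whole dataset. A minimal read-back sketch, reusing the hypothetical output path from the example at the top of the page:

# Hypothetical path; filtering on the `chromosome` partition column
# lets Spark skip all other partitions.
v2g_chr22 = session.spark.read.parquet("gs://bucket/v2g").where("chromosome = '22'")
v2g_chr22.show(5, truncate=False)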