Skip to content

variant_to_gene

gentropy.variant_to_gene.V2GStep

Variant-to-gene (V2G) step.

This step aims to generate a dataset that contains multiple pieces of evidence supporting the functional association of specific variants with genes. Some of the evidence types include:

  1. Chromatin interaction experiments, e.g. Promoter Capture Hi-C (PCHi-C).
  2. In silico functional predictions, e.g. Variant Effect Predictor (VEP) from Ensembl.
  3. Distance between the variant and each gene's canonical transcription start site (TSS).

Attributes:

Name Type Description
session Session

Session object.

variant_index_path str

Input variant index path.

gene_index_path str

Input gene index path.

vep_consequences_path str

Input VEP consequences path.

liftover_chain_file_path str

Path to GRCh37 to GRCh38 chain file.

liftover_max_length_difference int

Maximum length difference for liftover.

max_distance int

Maximum distance to consider.

approved_biotypes list[str]

List of approved biotypes.

interval_sources dict[str, str]

Dictionary of interval sources.

v2g_path str

Output V2G path.

Source code in src/gentropy/variant_to_gene.py
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
class V2GStep:
    """Variant-to-gene (V2G) step.

    This step aims to generate a dataset that contains multiple pieces of evidence supporting the functional association of specific variants with genes. Some of the evidence types include:

    1. Chromatin interaction experiments, e.g. Promoter Capture Hi-C (PCHi-C).
    2. In silico functional predictions, e.g. Variant Effect Predictor (VEP) from Ensembl.
    3. Distance between the variant and each gene's canonical transcription start site (TSS).

    The whole step runs when the class is instantiated: the inputs are read, the
    evidence datasets are built and unioned, and the result is written to
    ``v2g_path``. Nothing is stored on the instance; see ``__init__`` for the
    accepted arguments.
    """

    def __init__(
        self,
        session: Session,
        variant_index_path: str,
        gene_index_path: str,
        vep_consequences_path: str,
        liftover_chain_file_path: str,
        approved_biotypes: list[str],
        interval_sources: dict[str, str],
        v2g_path: str,
        max_distance: int = 500_000,
        liftover_max_length_difference: int = 100,
    ) -> None:
        """Run Variant-to-gene (V2G) step.

        Args:
            session (Session): Session object.
            variant_index_path (str): Input variant index path.
            gene_index_path (str): Input gene index path.
            vep_consequences_path (str): Input VEP consequences path (tab-separated file with a header and a `score` column).
            liftover_chain_file_path (str): Path to GRCh37 to GRCh38 chain file.
            approved_biotypes (list[str]): List of approved biotypes.
            interval_sources (dict[str, str]): Dictionary of interval sources, mapping source name to source path. May be empty, in which case no interval-based evidence is added.
            v2g_path (str): Output V2G path.
            max_distance (int): Maximum distance to consider.
            liftover_max_length_difference (int): Maximum length difference for liftover.
        """

        def union_by_name(left, right):
            """Union two dataframes by column name, tolerating missing columns."""
            return left.unionByName(right, allowMissingColumns=True)

        # Read
        gene_index = GeneIndex.from_parquet(session, gene_index_path)
        # The variant index feeds every evidence dataset below, hence persisted.
        vi = VariantIndex.from_parquet(session, variant_index_path).persist()
        # Reading VEP consequence to score table and cast the score to the right type:
        vep_consequences = session.spark.read.csv(
            vep_consequences_path, sep="\t", header=True
        ).withColumn("score", f.col("score").cast("double"))

        # Transform
        # Filter gene index by approved biotypes to define V2G gene universe
        gene_index_filtered = gene_index.filter_by_biotypes(list(approved_biotypes))

        v2g_datasets = [
            vi.get_distance_to_tss(gene_index_filtered, max_distance),
            vi.get_most_severe_transcript_consequence(
                vep_consequences, gene_index_filtered
            ),
            vi.get_plof_v2g(gene_index_filtered),
        ]

        # Interval evidence is only built when at least one source is configured:
        # reduce() without an initial value raises TypeError on an empty sequence.
        if interval_sources:
            # lift over variants to hg38
            lift = LiftOverSpark(
                liftover_chain_file_path,
                liftover_max_length_difference,
            )
            # create interval instances by parsing each source
            # NOTE(review): sources are parsed against the unfiltered gene index,
            # unlike the other evidence types - confirm this is intended.
            interval_dfs = [
                Intervals.from_source(
                    session.spark, source_name, source_path, gene_index, lift
                ).df
                for source_name, source_path in interval_sources.items()
            ]
            intervals = Intervals(
                _df=reduce(union_by_name, interval_dfs),
                _schema=Intervals.get_schema(),
            )
            v2g_datasets.append(intervals.v2g(vi))

        v2g = V2G(
            _df=reduce(
                union_by_name,
                [dataset.df for dataset in v2g_datasets],
            ).repartition("chromosome"),
            _schema=V2G.get_schema(),
        )

        # Load
        (
            v2g.df.write.partitionBy("chromosome")
            .mode(session.write_mode)
            .parquet(v2g_path)
        )

__init__(session: Session, variant_index_path: str, gene_index_path: str, vep_consequences_path: str, liftover_chain_file_path: str, approved_biotypes: list[str], interval_sources: dict[str, str], v2g_path: str, max_distance: int = 500000, liftover_max_length_difference: int = 100) -> None

Run Variant-to-gene (V2G) step.

Parameters:

Name Type Description Default
session Session

Session object.

required
variant_index_path str

Input variant index path.

required
gene_index_path str

Input gene index path.

required
vep_consequences_path str

Input VEP consequences path.

required
liftover_chain_file_path str

Path to GRCh37 to GRCh38 chain file.

required
approved_biotypes list[str]

List of approved biotypes.

required
interval_sources dict[str, str]

Dictionary of interval sources.

required
v2g_path str

Output V2G path.

required
max_distance int

Maximum distance to consider.

500000
liftover_max_length_difference int

Maximum length difference for liftover.

100
Source code in src/gentropy/variant_to_gene.py
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
def __init__(
    self,
    session: Session,
    variant_index_path: str,
    gene_index_path: str,
    vep_consequences_path: str,
    liftover_chain_file_path: str,
    approved_biotypes: list[str],
    interval_sources: dict[str, str],
    v2g_path: str,
    max_distance: int = 500_000,
    liftover_max_length_difference: int = 100,
) -> None:
    """Run Variant-to-gene (V2G) step.

    Args:
        session (Session): Session object.
        variant_index_path (str): Input variant index path.
        gene_index_path (str): Input gene index path.
        vep_consequences_path (str): Input VEP consequences path (tab-separated file with a header and a `score` column).
        liftover_chain_file_path (str): Path to GRCh37 to GRCh38 chain file.
        approved_biotypes (list[str]): List of approved biotypes.
        interval_sources (dict[str, str]): Dictionary of interval sources, mapping source name to source path. May be empty, in which case no interval-based evidence is added.
        v2g_path (str): Output V2G path.
        max_distance (int): Maximum distance to consider.
        liftover_max_length_difference (int): Maximum length difference for liftover.
    """

    def union_by_name(left, right):
        """Union two dataframes by column name, tolerating missing columns."""
        return left.unionByName(right, allowMissingColumns=True)

    # Read
    gene_index = GeneIndex.from_parquet(session, gene_index_path)
    # The variant index feeds every evidence dataset below, hence persisted.
    vi = VariantIndex.from_parquet(session, variant_index_path).persist()
    # Reading VEP consequence to score table and cast the score to the right type:
    vep_consequences = session.spark.read.csv(
        vep_consequences_path, sep="\t", header=True
    ).withColumn("score", f.col("score").cast("double"))

    # Transform
    # Filter gene index by approved biotypes to define V2G gene universe
    gene_index_filtered = gene_index.filter_by_biotypes(list(approved_biotypes))

    v2g_datasets = [
        vi.get_distance_to_tss(gene_index_filtered, max_distance),
        vi.get_most_severe_transcript_consequence(
            vep_consequences, gene_index_filtered
        ),
        vi.get_plof_v2g(gene_index_filtered),
    ]

    # Interval evidence is only built when at least one source is configured:
    # reduce() without an initial value raises TypeError on an empty sequence.
    if interval_sources:
        # lift over variants to hg38
        lift = LiftOverSpark(
            liftover_chain_file_path,
            liftover_max_length_difference,
        )
        # create interval instances by parsing each source
        # NOTE(review): sources are parsed against the unfiltered gene index,
        # unlike the other evidence types - confirm this is intended.
        interval_dfs = [
            Intervals.from_source(
                session.spark, source_name, source_path, gene_index, lift
            ).df
            for source_name, source_path in interval_sources.items()
        ]
        intervals = Intervals(
            _df=reduce(union_by_name, interval_dfs),
            _schema=Intervals.get_schema(),
        )
        v2g_datasets.append(intervals.v2g(vi))

    v2g = V2G(
        _df=reduce(
            union_by_name,
            [dataset.df for dataset in v2g_datasets],
        ).repartition("chromosome"),
        _schema=V2G.get_schema(),
    )

    # Load
    (
        v2g.df.write.partitionBy("chromosome")
        .mode(session.write_mode)
        .parquet(v2g_path)
    )