Skip to content

Variant-to-gene

gentropy.dataset.v2g.V2G dataclass

Bases: Dataset

Variant-to-gene (V2G) evidence dataset.

Variant-to-gene (V2G) evidence is any piece of evidence that supports the association of a variant with a likely causal gene. The evidence can be context-specific and refer to specific biofeatures (e.g. cell types).

Source code in src/gentropy/dataset/v2g.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
@dataclass
class V2G(Dataset):
    """Variant-to-gene (V2G) evidence dataset.

    A variant-to-gene (V2G) evidence is understood as any piece of evidence that supports the association of a variant with a likely causal gene. The evidence can sometimes be context-specific and refer to specific `biofeatures` (e.g. cell types).
    """

    @classmethod
    def get_schema(cls: type[V2G]) -> StructType:
        """Provides the schema for the V2G dataset.

        Returns:
            StructType: Schema for the V2G dataset
        """
        return parse_spark_schema("v2g.json")

    def filter_by_genes(self: V2G, genes: GeneIndex) -> V2G:
        """Filter V2G dataset by genes.

        The underlying dataframe is replaced in place by its inner join with
        the `geneId` column of the gene index, and the same instance is
        returned.

        Args:
            genes (GeneIndex): Gene index dataset to filter by

        Returns:
            V2G: V2G dataset filtered by genes
        """
        self.df = self._df.join(genes.df.select("geneId"), on="geneId", how="inner")
        return self

    def extract_distance_tss_minimum(self: V2G) -> None:
        """Extract minimum distance to TSS.

        Keeps only the rows with a non-null `distance` and adds a
        `distanceTssMinimum` column holding the minimum `distTss` within each
        `studyLocusId` partition. The underlying dataframe is replaced in place.
        """
        # `DataFrame.filter` requires a boolean condition; `distance` is a
        # numeric column, so test for presence explicitly instead of passing
        # the raw column.
        # NOTE(review): `distTss` and `studyLocusId` are referenced by the window
        # expression but are not produced anywhere in this class - confirm they
        # exist on the underlying dataframe.
        self.df = self._df.filter(f.col("distance").isNotNull()).withColumn(
            "distanceTssMinimum",
            f.expr("min(distTss) OVER (PARTITION BY studyLocusId)"),
        )

extract_distance_tss_minimum() -> None

Extract minimum distance to TSS.

Source code in src/gentropy/dataset/v2g.py
46
47
48
49
50
51
def extract_distance_tss_minimum(self: V2G) -> None:
    """Extract minimum distance to TSS.

    Keeps only the rows with a non-null `distance` and adds a
    `distanceTssMinimum` column holding the minimum `distTss` within each
    `studyLocusId` partition. The underlying dataframe is replaced in place.
    """
    # `DataFrame.filter` requires a boolean condition; `distance` is a numeric
    # column, so test for presence explicitly instead of passing the raw column.
    # NOTE(review): `distTss` and `studyLocusId` are referenced by the window
    # expression but are not produced by this method - confirm they exist on
    # the underlying dataframe.
    self.df = self._df.filter(f.col("distance").isNotNull()).withColumn(
        "distanceTssMinimum",
        f.expr("min(distTss) OVER (PARTITION BY studyLocusId)"),
    )

filter_by_genes(genes: GeneIndex) -> V2G

Filter V2G dataset by genes.

Parameters:

Name Type Description Default
genes GeneIndex

Gene index dataset to filter by

required

Returns:

Name Type Description
V2G V2G

V2G dataset filtered by genes

Source code in src/gentropy/dataset/v2g.py
34
35
36
37
38
39
40
41
42
43
44
def filter_by_genes(self: V2G, genes: GeneIndex) -> V2G:
    """Restrict the V2G evidence to genes present in a gene index.

    The underlying dataframe is replaced in place by its inner join with the
    `geneId` column of the gene index.

    Args:
        genes (GeneIndex): Gene index whose `geneId` values are kept

    Returns:
        V2G: The same dataset instance, now limited to the matching genes
    """
    # Only the join key is needed from the gene index.
    gene_ids = genes.df.select("geneId")
    self.df = self._df.join(gene_ids, on="geneId", how="inner")
    return self

get_schema() -> StructType classmethod

Provides the schema for the V2G dataset.

Returns:

Name Type Description
StructType StructType

Schema for the V2G dataset

Source code in src/gentropy/dataset/v2g.py
25
26
27
28
29
30
31
32
@classmethod
def get_schema(cls: type[V2G]) -> StructType:
    """Return the Spark schema describing the V2G dataset.

    The schema is obtained by passing the `v2g.json` schema file name to
    `parse_spark_schema`.

    Returns:
        StructType: Schema for the V2G dataset
    """
    schema_file = "v2g.json"
    return parse_spark_schema(schema_file)

Schema

root
 |-- geneId: string (nullable = false)
 |-- variantId: string (nullable = false)
 |-- distance: long (nullable = true)
 |-- chromosome: string (nullable = false)
 |-- datatypeId: string (nullable = false)
 |-- datasourceId: string (nullable = false)
 |-- score: double (nullable = true)
 |-- resourceScore: double (nullable = true)
 |-- pmid: string (nullable = true)
 |-- biofeature: string (nullable = true)
 |-- variantFunctionalConsequenceId: string (nullable = true)
 |-- isHighQualityPlof: boolean (nullable = true)