Skip to content

Gene Index

gentropy.dataset.gene_index.GeneIndex dataclass

Bases: Dataset

Gene index dataset.

Gene-based annotation.

Source code in src/gentropy/dataset/gene_index.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
@dataclass
class GeneIndex(Dataset):
    """Gene index dataset.

    Gene-based annotation.
    """

    @classmethod
    def get_schema(cls: type[GeneIndex]) -> StructType:
        """Provides the schema for the GeneIndex dataset.

        Returns:
            StructType: Schema for the GeneIndex dataset
        """
        return parse_spark_schema("gene_index.json")

    def filter_by_biotypes(self: GeneIndex, biotypes: list[str]) -> GeneIndex:
        """Filter by approved biotypes.

        The filter is applied in place: `df` is reassigned to the filtered
        dataframe and the same instance is returned.

        Args:
            biotypes (list[str]): List of Ensembl biotypes to keep.

        Returns:
            GeneIndex: Gene index dataset filtered by biotypes.
        """
        self.df = self._df.filter(f.col("biotype").isin(biotypes))
        return self

    def locations_lut(self: GeneIndex) -> DataFrame:
        """Gene location information.

        Returns:
            DataFrame: Gene LUT including genomic location information.
        """
        return self.df.select(
            "geneId",
            "chromosome",
            "start",
            "end",
            "strand",
            "tss",
        )

    def symbols_lut(self: GeneIndex) -> DataFrame:
        """Gene symbol lookup table.

        Pre-processes the gene index to create a lookup table of gene symbols,
        including obsolete gene symbols. Each gene is repeated once per distinct
        symbol (approved or obsolete).

        Returns:
            DataFrame: Gene LUT for symbol mapping with a `geneSymbol` column
                followed by all the original gene index columns (including `geneId`).
        """
        # `obsoleteSymbols` is nullable. `array_union` returns null if either
        # input is null and `explode` emits no rows for a null array, so without
        # this fallback genes lacking obsolete symbols would be dropped from the
        # lookup table altogether, approved symbol included.
        obsolete_symbols = f.coalesce(
            f.col("obsoleteSymbols"), f.array().cast("array<string>")
        )
        return self.df.select(
            f.explode(
                f.array_union(f.array("approvedSymbol"), obsolete_symbols)
            ).alias("geneSymbol"),
            "*",
        )

filter_by_biotypes(biotypes: list[str]) -> GeneIndex

Filter by approved biotypes.

Parameters:

Name Type Description Default
biotypes list[str]

List of Ensembl biotypes to keep.

required

Returns:

Name Type Description
GeneIndex GeneIndex

Gene index dataset filtered by biotypes.

Source code in src/gentropy/dataset/gene_index.py
33
34
35
36
37
38
39
40
41
42
43
def filter_by_biotypes(self: GeneIndex, biotypes: list[str]) -> GeneIndex:
    """Keep only the genes whose biotype is one of the requested biotypes.

    The dataset is updated in place: `df` is reassigned to the filtered
    dataframe and the same instance is handed back.

    Args:
        biotypes (list[str]): Ensembl biotypes that should be retained.

    Returns:
        GeneIndex: This gene index, restricted to the requested biotypes.
    """
    has_requested_biotype = f.col("biotype").isin(biotypes)
    self.df = self._df.filter(has_requested_biotype)
    return self

get_schema() -> StructType classmethod

Provides the schema for the GeneIndex dataset.

Returns:

Name Type Description
StructType StructType

Schema for the GeneIndex dataset

Source code in src/gentropy/dataset/gene_index.py
24
25
26
27
28
29
30
31
@classmethod
def get_schema(cls: type[GeneIndex]) -> StructType:
    """Return the Spark schema describing the GeneIndex dataset.

    The schema is obtained by passing the `gene_index.json` schema file name
    to `parse_spark_schema`.

    Returns:
        StructType: Schema for the GeneIndex dataset
    """
    schema_file = "gene_index.json"
    return parse_spark_schema(schema_file)

locations_lut() -> DataFrame

Gene location information.

Returns:

Name Type Description
DataFrame DataFrame

Gene LUT including genomic location information.

Source code in src/gentropy/dataset/gene_index.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
def locations_lut(self: GeneIndex) -> DataFrame:
    """Build a lookup table of gene genomic locations.

    Returns:
        DataFrame: Gene LUT restricted to the gene identifier and its
            location columns (`chromosome`, `start`, `end`, `strand`, `tss`).
    """
    location_columns = ["geneId", "chromosome", "start", "end", "strand", "tss"]
    return self.df.select(*location_columns)

symbols_lut() -> DataFrame

Gene symbol lookup table.

Pre-processes the gene/target dataset to create a lookup table of gene symbols, including obsolete gene symbols.

Returns:

Name Type Description
DataFrame DataFrame

Gene LUT for symbol mapping: one row per gene symbol, with a geneSymbol column followed by all the original gene index columns (including geneId).

Source code in src/gentropy/dataset/gene_index.py
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
def symbols_lut(self: GeneIndex) -> DataFrame:
    """Gene symbol lookup table.

    Pre-processes the gene index to create a lookup table of gene symbols,
    including obsolete gene symbols. Each gene is repeated once per distinct
    symbol (approved or obsolete).

    Returns:
        DataFrame: Gene LUT for symbol mapping with a `geneSymbol` column
            followed by all the original gene index columns (including `geneId`).
    """
    # `obsoleteSymbols` is nullable. `array_union` returns null if either input
    # is null and `explode` emits no rows for a null array, so without this
    # fallback genes lacking obsolete symbols would be dropped from the lookup
    # table altogether, approved symbol included.
    obsolete_symbols = f.coalesce(
        f.col("obsoleteSymbols"), f.array().cast("array<string>")
    )
    return self.df.select(
        f.explode(
            f.array_union(f.array("approvedSymbol"), obsolete_symbols)
        ).alias("geneSymbol"),
        "*",
    )

Schema

root
 |-- geneId: string (nullable = false)
 |-- chromosome: string (nullable = false)
 |-- approvedSymbol: string (nullable = true)
 |-- biotype: string (nullable = true)
 |-- approvedName: string (nullable = true)
 |-- obsoleteSymbols: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tss: long (nullable = true)
 |-- start: long (nullable = true)
 |-- end: long (nullable = true)
 |-- strand: integer (nullable = true)