Skip to content

Gene index

Bases: Dataset

Gene index dataset.

Gene-based annotation.

Source code in src/otg/dataset/gene_index.py
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
@dataclass
class GeneIndex(Dataset):
    """Gene index dataset.

    Gene-based annotation.
    """

    @staticmethod
    def _get_gene_tss(strand_col: Column, start_col: Column, end_col: Column) -> Column:
        """Returns the TSS of a gene based on its orientation.

        The TSS is the start position for genes on the forward strand (1) and the
        end position for genes on the reverse strand (-1). Any other strand value
        matches neither branch, so the resulting value is null.

        Args:
            strand_col (Column): Column containing 1 if the coding strand of the gene is forward, and -1 if it is reverse.
            start_col (Column): Column containing the start position of the gene.
            end_col (Column): Column containing the end position of the gene.

        Returns:
            Column: Column containing the TSS of the gene.

        Examples:
            >>> df = spark.createDataFrame([{"strand": 1, "start": 100, "end": 200}, {"strand": -1, "start": 100, "end": 200}])
            >>> df.withColumn("tss", GeneIndex._get_gene_tss(f.col("strand"), f.col("start"), f.col("end"))).show()
            +---+-----+------+---+
            |end|start|strand|tss|
            +---+-----+------+---+
            |200|  100|     1|100|
            |200|  100|    -1|200|
            +---+-----+------+---+
            <BLANKLINE>

        """
        # No `otherwise` branch: unmatched strand values evaluate to null.
        return f.when(strand_col == 1, start_col).when(strand_col == -1, end_col)

    @classmethod
    def get_schema(cls: type[GeneIndex]) -> StructType:
        """Provides the schema for the GeneIndex dataset.

        Returns:
            StructType: Schema parsed from `targets.json`.
        """
        return parse_spark_schema("targets.json")

    @classmethod
    def from_source(cls: type[GeneIndex], target_index: DataFrame) -> GeneIndex:
        """Initialise GeneIndex from source dataset.

        Missing gene identifiers and chromosomes are replaced with the literal
        "unknown"; the TSS is derived from the gene's strand, start and end.

        Args:
            target_index (DataFrame): Target index dataframe

        Returns:
            GeneIndex: Gene index dataset
        """
        return cls(
            _df=target_index.select(
                f.coalesce(f.col("id"), f.lit("unknown")).alias("geneId"),
                f.coalesce(f.col("genomicLocation.chromosome"), f.lit("unknown")).alias(
                    "chromosome"
                ),
                GeneIndex._get_gene_tss(
                    f.col("genomicLocation.strand"),
                    f.col("genomicLocation.start"),
                    f.col("genomicLocation.end"),
                ).alias("tss"),
                "biotype",
                "approvedSymbol",
                "obsoleteSymbols",
            ),
            _schema=cls.get_schema(),
        )

    def filter_by_biotypes(self: GeneIndex, biotypes: list) -> GeneIndex:
        """Filter by approved biotypes.

        The underlying dataframe is replaced on this instance and the same
        instance is returned.

        Args:
            biotypes (list): List of Ensembl biotypes to keep.

        Returns:
            GeneIndex: Gene index dataset filtered by biotypes.
        """
        self.df = self._df.filter(f.col("biotype").isin(biotypes))
        return self

    def locations_lut(self: GeneIndex) -> DataFrame:
        """Gene location information.

        Returns:
            DataFrame: Gene LUT including genomic location information.
        """
        return self.df.select(
            "geneId",
            "chromosome",
            "tss",
        )

    def symbols_lut(self: GeneIndex) -> DataFrame:
        """Gene symbol lookup table.

        Pre-processes gene/target dataset to create lookup table of gene symbols, including
        obsoleted gene symbols. Each gene yields one row per distinct symbol.

        Returns:
            DataFrame: Gene LUT for symbol mapping containing `geneId` and `geneSymbol` columns.
        """
        # `array_union` evaluates to NULL when either input is NULL and `explode`
        # emits no rows for a NULL array, so a gene without obsolete symbols would
        # silently lose its approved symbol. Fall back to an empty string array.
        obsolete_labels = f.coalesce(
            f.col("obsoleteSymbols.label"), f.array().cast("array<string>")
        )
        return self.df.select(
            "geneId",
            f.explode(
                f.array_union(f.array("approvedSymbol"), obsolete_labels)
            ).alias("geneSymbol"),
        )

filter_by_biotypes(biotypes)

Filter by approved biotypes.

Parameters:

Name Type Description Default
biotypes list

List of Ensembl biotypes to keep.

required

Returns:

Name Type Description
GeneIndex GeneIndex

Gene index dataset filtered by biotypes.

Source code in src/otg/dataset/gene_index.py
83
84
85
86
87
88
89
90
91
92
93
def filter_by_biotypes(self: GeneIndex, biotypes: list) -> GeneIndex:
    """Keep only the genes whose biotype is among the requested ones.

    The filtered dataframe is assigned back onto this instance, which is then
    returned so calls can be chained.

    Args:
        biotypes (list): List of Ensembl biotypes to keep.

    Returns:
        GeneIndex: Gene index dataset filtered by biotypes.
    """
    has_requested_biotype = f.col("biotype").isin(biotypes)
    self.df = self._df.filter(has_requested_biotype)
    return self

from_source(target_index) classmethod

Initialise GeneIndex from source dataset.

Parameters:

Name Type Description Default
target_index DataFrame

Target index dataframe

required

Returns:

Name Type Description
GeneIndex GeneIndex

Gene index dataset

Source code in src/otg/dataset/gene_index.py
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
@classmethod
def from_source(cls: type[GeneIndex], target_index: DataFrame) -> GeneIndex:
    """Build a GeneIndex from the target index dataframe.

    Gene identifier and chromosome fall back to the literal "unknown" when
    missing, and the TSS is computed from the gene's strand, start and end.

    Args:
        target_index (DataFrame): Target index dataframe

    Returns:
        GeneIndex: Gene index dataset
    """
    gene_id = f.coalesce(f.col("id"), f.lit("unknown")).alias("geneId")
    chromosome = f.coalesce(
        f.col("genomicLocation.chromosome"), f.lit("unknown")
    ).alias("chromosome")
    tss = GeneIndex._get_gene_tss(
        f.col("genomicLocation.strand"),
        f.col("genomicLocation.start"),
        f.col("genomicLocation.end"),
    ).alias("tss")

    genes = target_index.select(
        gene_id,
        chromosome,
        tss,
        "biotype",
        "approvedSymbol",
        "obsoleteSymbols",
    )
    return cls(_df=genes, _schema=cls.get_schema())

get_schema() classmethod

Provides the schema for the GeneIndex dataset.

Source code in src/otg/dataset/gene_index.py
50
51
52
53
@classmethod
def get_schema(cls: type[GeneIndex]) -> StructType:
    """Return the GeneIndex schema parsed from `targets.json`."""
    gene_index_schema = parse_spark_schema("targets.json")
    return gene_index_schema

locations_lut()

Gene location information.

Returns:

Name Type Description
DataFrame DataFrame

Gene LUT including genomic location information.

Source code in src/otg/dataset/gene_index.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
def locations_lut(self: GeneIndex) -> DataFrame:
    """Build the gene location lookup table.

    Returns:
        DataFrame: Gene LUT with the `geneId`, `chromosome` and `tss` columns.
    """
    location_columns = ("geneId", "chromosome", "tss")
    return self.df.select(*location_columns)

symbols_lut()

Gene symbol lookup table.

Pre-processes gene/target dataset to create lookup table of gene symbols, including obsoleted gene symbols.

Returns:

Name Type Description
DataFrame DataFrame

Gene LUT for symbol mapping containing geneId and geneSymbol columns.

Source code in src/otg/dataset/gene_index.py
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
def symbols_lut(self: GeneIndex) -> DataFrame:
    """Gene symbol lookup table.

    Pre-processes gene/target dataset to create lookup table of gene symbols, including
    obsoleted gene symbols. Each gene yields one row per distinct symbol.

    Returns:
        DataFrame: Gene LUT for symbol mapping containing `geneId` and `geneSymbol` columns.
    """
    # `array_union` evaluates to NULL when either input is NULL and `explode`
    # emits no rows for a NULL array, so a gene without obsolete symbols would
    # silently lose its approved symbol. Fall back to an empty string array.
    obsolete_labels = f.coalesce(
        f.col("obsoleteSymbols.label"), f.array().cast("array<string>")
    )
    return self.df.select(
        "geneId",
        f.explode(
            f.array_union(f.array("approvedSymbol"), obsolete_labels)
        ).alias("geneSymbol"),
    )

Schema

root
 |-- geneId: string (nullable = false)
 |-- chromosome: string (nullable = false)
 |-- approvedSymbol: string (nullable = true)
 |-- biotype: string (nullable = true)
 |-- approvedName: string (nullable = true)
 |-- obsoleteSymbols: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- label: string (nullable = true)
 |    |    |-- source: string (nullable = true)
 |-- tss: long (nullable = true)