Skip to content

Target Index

gentropy.dataset.target_index.TargetIndex dataclass

Bases: Dataset

Target index dataset.

Gene-based annotation.

Source code in src/gentropy/dataset/target_index.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
@dataclass
class TargetIndex(Dataset):
    """Target index dataset.

    Gene-based annotation.
    """

    @classmethod
    def get_schema(cls: type[TargetIndex]) -> StructType:
        """Provides the schema for the TargetIndex dataset.

        Returns:
            StructType: Schema for the TargetIndex dataset
        """
        return parse_spark_schema("target_index.json")

    def filter_by_biotypes(self: TargetIndex, biotypes: list[str]) -> TargetIndex:
        """Filter by approved biotypes.

        The filter is applied in place: the underlying dataframe of this instance is
        replaced and the same instance is returned.

        Args:
            biotypes (list[str]): List of Ensembl biotypes to keep.

        Returns:
            TargetIndex: Target index dataset filtered by biotypes.
        """
        self.df = self._df.filter(f.col("biotype").isin(biotypes))
        return self

    def locations_lut(self: TargetIndex) -> DataFrame:
        """Gene location information.

        Returns:
            DataFrame: Gene LUT including genomic location information.
        """
        return self.df.select(
            f.col("id").alias("geneId"),
            f.col("genomicLocation.chromosome").alias("chromosome"),
            f.col("genomicLocation.start").alias("start"),
            f.col("genomicLocation.end").alias("end"),
            f.col("genomicLocation.strand").alias("strand"),
            "tss",
        )

    def symbols_lut(self: TargetIndex) -> DataFrame:
        """Gene symbol lookup table.

        Pre-processes the gene/target dataset to create a lookup table of gene symbols,
        including obsoleted gene symbols. Genes without any obsolete symbols are kept
        and contribute a single row for their approved symbol.

        Returns:
            DataFrame: Gene LUT for symbol mapping containing `geneSymbol`, `geneId`,
                `chromosome` and `tss` columns.
        """
        approved_symbols = f.array("approvedSymbol")
        # `array_union` yields NULL when either input is NULL and `explode` emits no
        # rows for a NULL array, so a gene with a NULL `obsoleteSymbols` would vanish
        # from the lookup table. Falling back to the approved-symbol array keeps it
        # (the union de-duplicates, so exactly one row is produced).
        obsolete_symbols = f.coalesce(
            f.col("obsoleteSymbols.label"), approved_symbols
        )
        return self.df.select(
            f.explode(f.array_union(approved_symbols, obsolete_symbols)).alias(
                "geneSymbol"
            ),
            f.col("id").alias("geneId"),
            f.col("genomicLocation.chromosome").alias("chromosome"),
            "tss",
        )

filter_by_biotypes(biotypes: list[str]) -> TargetIndex

Filter by approved biotypes.

Parameters:

Name Type Description Default
biotypes list[str]

List of Ensembl biotypes to keep.

required

Returns:

Name Type Description
TargetIndex TargetIndex

Target index dataset filtered by biotypes.

Source code in src/gentropy/dataset/target_index.py
33
34
35
36
37
38
39
40
41
42
43
def filter_by_biotypes(self: TargetIndex, biotypes: list[str]) -> TargetIndex:
    """Keep only the targets whose biotype is in the given list.

    The instance's dataframe is replaced with the filtered one and the same
    instance is returned.

    Args:
        biotypes (list[str]): List of Ensembl biotypes to keep.

    Returns:
        TargetIndex: Target index dataset filtered by biotypes.
    """
    has_kept_biotype = f.col("biotype").isin(biotypes)
    self.df = self._df.filter(has_kept_biotype)
    return self

get_schema() -> StructType classmethod

Provides the schema for the TargetIndex dataset.

Returns:

Name Type Description
StructType StructType

Schema for the TargetIndex dataset

Source code in src/gentropy/dataset/target_index.py
24
25
26
27
28
29
30
31
@classmethod
def get_schema(cls: type[TargetIndex]) -> StructType:
    """Return the Spark schema of the TargetIndex dataset.

    The schema is obtained by passing `target_index.json` to `parse_spark_schema`.

    Returns:
        StructType: Schema for the TargetIndex dataset
    """
    schema = parse_spark_schema("target_index.json")
    return schema

locations_lut() -> DataFrame

Gene location information.

Returns:

Name Type Description
DataFrame DataFrame

Gene LUT including genomic location information.

Source code in src/gentropy/dataset/target_index.py
45
46
47
48
49
50
51
52
53
54
55
56
57
58
def locations_lut(self: TargetIndex) -> DataFrame:
    """Build a lookup table of gene genomic locations.

    Returns:
        DataFrame: Gene LUT with `geneId`, `chromosome`, `start`, `end`, `strand`
            and `tss` columns.
    """
    gene_id = f.col("id").alias("geneId")
    # Flatten the nested `genomicLocation` struct into top-level columns.
    chromosome = f.col("genomicLocation.chromosome").alias("chromosome")
    start = f.col("genomicLocation.start").alias("start")
    end = f.col("genomicLocation.end").alias("end")
    strand = f.col("genomicLocation.strand").alias("strand")
    return self.df.select(gene_id, chromosome, start, end, strand, "tss")

symbols_lut() -> DataFrame

Gene symbol lookup table.

Pre-processes the gene/target dataset to create a lookup table of gene symbols, including obsoleted gene symbols.

Returns:

Name Type Description
DataFrame DataFrame

Gene LUT for symbol mapping containing geneSymbol, geneId, chromosome and tss columns.

Source code in src/gentropy/dataset/target_index.py
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
def symbols_lut(self: TargetIndex) -> DataFrame:
    """Gene symbol lookup table.

    Pre-processes the gene/target dataset to create a lookup table of gene symbols,
    including obsoleted gene symbols. Genes without any obsolete symbols are kept
    and contribute a single row for their approved symbol.

    Returns:
        DataFrame: Gene LUT for symbol mapping containing `geneSymbol`, `geneId`,
            `chromosome` and `tss` columns.
    """
    approved_symbols = f.array("approvedSymbol")
    # `array_union` yields NULL when either input is NULL and `explode` emits no
    # rows for a NULL array, so a gene with a NULL `obsoleteSymbols` would vanish
    # from the lookup table. Falling back to the approved-symbol array keeps it
    # (the union de-duplicates, so exactly one row is produced).
    obsolete_symbols = f.coalesce(f.col("obsoleteSymbols.label"), approved_symbols)
    return self.df.select(
        f.explode(f.array_union(approved_symbols, obsolete_symbols)).alias(
            "geneSymbol"
        ),
        f.col("id").alias("geneId"),
        f.col("genomicLocation.chromosome").alias("chromosome"),
        "tss",
    )

Schema

root
 |-- id: string (nullable = false)
 |-- approvedSymbol: string (nullable = true)
 |-- biotype: string (nullable = true)
 |-- transcriptIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- canonicalTranscript: struct (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- chromosome: string (nullable = true)
 |    |-- start: long (nullable = true)
 |    |-- end: long (nullable = true)
 |    |-- strand: string (nullable = true)
 |-- canonicalExons: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- genomicLocation: struct (nullable = true)
 |    |-- chromosome: string (nullable = true)
 |    |-- start: long (nullable = true)
 |    |-- end: long (nullable = true)
 |    |-- strand: integer (nullable = true)
 |-- alternativeGenes: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- approvedName: string (nullable = true)
 |-- go: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- source: string (nullable = true)
 |    |    |-- evidence: string (nullable = true)
 |    |    |-- aspect: string (nullable = true)
 |    |    |-- geneProduct: string (nullable = true)
 |    |    |-- ecoId: string (nullable = true)
 |-- hallmarks: struct (nullable = true)
 |    |-- attributes: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- pmid: long (nullable = true)
 |    |    |    |-- description: string (nullable = true)
 |    |    |    |-- attribute_name: string (nullable = true)
 |    |-- cancerHallmarks: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- pmid: long (nullable = true)
 |    |    |    |-- description: string (nullable = true)
 |    |    |    |-- impact: string (nullable = true)
 |    |    |    |-- label: string (nullable = true)
 |-- synonyms: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- label: string (nullable = true)
 |    |    |-- source: string (nullable = true)
 |-- symbolSynonyms: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- label: string (nullable = true)
 |    |    |-- source: string (nullable = true)
 |-- nameSynonyms: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- label: string (nullable = true)
 |    |    |-- source: string (nullable = true)
 |-- functionDescriptions: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- subcellularLocations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- location: string (nullable = true)
 |    |    |-- source: string (nullable = true)
 |    |    |-- termSL: string (nullable = true)
 |    |    |-- labelSL: string (nullable = true)
 |-- targetClass: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- label: string (nullable = true)
 |    |    |-- level: string (nullable = true)
 |-- obsoleteSymbols: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- label: string (nullable = true)
 |    |    |-- source: string (nullable = true)
 |-- obsoleteNames: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- label: string (nullable = true)
 |    |    |-- source: string (nullable = true)
 |-- constraint: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- constraintType: string (nullable = true)
 |    |    |-- score: float (nullable = true)
 |    |    |-- exp: float (nullable = true)
 |    |    |-- obs: integer (nullable = true)
 |    |    |-- oe: float (nullable = true)
 |    |    |-- oeLower: float (nullable = true)
 |    |    |-- oeUpper: float (nullable = true)
 |    |    |-- upperRank: integer (nullable = true)
 |    |    |-- upperBin: integer (nullable = true)
 |    |    |-- upperBin6: integer (nullable = true)
 |-- tep: struct (nullable = true)
 |    |-- targetFromSourceId: string (nullable = true)
 |    |-- description: string (nullable = true)
 |    |-- therapeuticArea: string (nullable = true)
 |    |-- url: string (nullable = true)
 |-- proteinIds: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- source: string (nullable = true)
 |-- dbXrefs: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- source: string (nullable = true)
 |-- chemicalProbes: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- control: string (nullable = true)
 |    |    |-- drugId: string (nullable = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- isHighQuality: boolean (nullable = true)
 |    |    |-- mechanismOfAction: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- origin: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- probeMinerScore: long (nullable = true)
 |    |    |-- probesDrugsScore: long (nullable = true)
 |    |    |-- scoreInCells: long (nullable = true)
 |    |    |-- scoreInOrganisms: long (nullable = true)
 |    |    |-- targetFromSourceId: string (nullable = true)
 |    |    |-- urls: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- niceName: string (nullable = true)
 |    |    |    |    |-- url: string (nullable = true)
 |-- homologues: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- speciesId: string (nullable = true)
 |    |    |-- speciesName: string (nullable = true)
 |    |    |-- homologyType: string (nullable = true)
 |    |    |-- targetGeneId: string (nullable = true)
 |    |    |-- isHighConfidence: string (nullable = true)
 |    |    |-- targetGeneSymbol: string (nullable = true)
 |    |    |-- queryPercentageIdentity: double (nullable = true)
 |    |    |-- targetPercentageIdentity: double (nullable = true)
 |    |    |-- priority: integer (nullable = true)
 |-- tractability: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- modality: string (nullable = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- value: boolean (nullable = true)
 |-- safetyLiabilities: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- event: string (nullable = true)
 |    |    |-- eventId: string (nullable = true)
 |    |    |-- effects: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- direction: string (nullable = true)
 |    |    |    |    |-- dosing: string (nullable = true)
 |    |    |-- biosamples: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- cellFormat: string (nullable = true)
 |    |    |    |    |-- cellLabel: string (nullable = true)
 |    |    |    |    |-- tissueId: string (nullable = true)
 |    |    |    |    |-- tissueLabel: string (nullable = true)
 |    |    |-- datasource: string (nullable = true)
 |    |    |-- literature: string (nullable = true)
 |    |    |-- url: string (nullable = true)
 |    |    |-- studies: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- description: string (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |    |-- type: string (nullable = true)
 |-- pathways: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- pathwayId: string (nullable = true)
 |    |    |-- pathway: string (nullable = true)
 |    |    |-- topLevelTerm: string (nullable = true)
 |-- tss: long (nullable = true)