Skip to content

Target

gentropy.datasource.open_targets.target.OpenTargetsTarget

Parser for OTPlatform target dataset.

Genomic data from Open Targets provides gene identification and genomic coordinates that are integrated into the gene index of our ETL pipeline.

The EMBL-EBI Ensembl database is used as a source for human targets in the Platform, with the Ensembl gene ID as the primary identifier. The criteria for target inclusion are: (1) genes from all biotypes encoded in canonical chromosomes; (2) genes in alternative assemblies encoding a reviewed protein product.

Source code in src/gentropy/datasource/open_targets/target.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
class OpenTargetsTarget:
    """Parser for the OTPlatform target dataset.

    Open Targets supplies gene identification and genomic coordinates, which are folded into the gene index of our ETL pipeline.

    Human targets in the Platform are sourced from the EMBL-EBI Ensembl database, and the Ensembl gene ID acts as the primary identifier. The criteria for target inclusion are:
    - Genes from all biotypes encoded in canonical chromosomes
    - Genes in alternative assemblies encoding a reviewed protein product.
    """

    @staticmethod
    def _get_gene_tss(strand_col: Column, start_col: Column, end_col: Column) -> Column:
        """Pick the TSS of a gene according to the strand it is encoded on.

        Args:
            strand_col (Column): Column holding 1 when the coding strand of the gene is forward, and -1 when it is reverse.
            start_col (Column): Column holding the start position of the gene.
            end_col (Column): Column holding the end position of the gene.

        Returns:
            Column: The start position for forward-strand genes and the end position for reverse-strand genes.

        Examples:
            >>> df = spark.createDataFrame([{"strand": 1, "start": 100, "end": 200}, {"strand": -1, "start": 100, "end": 200}])
            >>> df.withColumn("tss", OpenTargetsTarget._get_gene_tss(f.col("strand"), f.col("start"), f.col("end"))).show()
            +---+-----+------+---+
            |end|start|strand|tss|
            +---+-----+------+---+
            |200|  100|     1|100|
            |200|  100|    -1|200|
            +---+-----+------+---+
            <BLANKLINE>

        """
        on_forward_strand = strand_col == 1
        on_reverse_strand = strand_col == -1
        # There is deliberately no `otherwise` branch: a strand value other
        # than 1 or -1 matches neither condition and yields a null TSS.
        forward_tss = f.when(on_forward_strand, start_col)
        return forward_tss.when(on_reverse_strand, end_col)

    @classmethod
    def as_gene_index(
        cls: type[OpenTargetsTarget], target_index: DataFrame
    ) -> GeneIndex:
        """Build a GeneIndex from the Open Targets target index.

        Args:
            target_index (DataFrame): Target index dataframe

        Returns:
            GeneIndex: Gene index dataset
        """
        # The genomic location fields are used more than once, so name them up front.
        chromosome = f.col("genomicLocation.chromosome")
        strand = f.col("genomicLocation.strand")
        start = f.col("genomicLocation.start")
        end = f.col("genomicLocation.end")
        unknown = f.lit("unknown")

        gene_columns = [
            # A missing gene id or chromosome is replaced by the literal "unknown".
            f.coalesce(f.col("id"), unknown).alias("geneId"),
            "approvedSymbol",
            "approvedName",
            "biotype",
            # Keep only the labels of the obsolete symbol records.
            f.col("obsoleteSymbols.label").alias("obsoleteSymbols"),
            f.coalesce(chromosome, unknown).alias("chromosome"),
            OpenTargetsTarget._get_gene_tss(strand, start, end).alias("tss"),
            start.alias("start"),
            end.alias("end"),
            strand.alias("strand"),
        ]
        return GeneIndex(
            _df=target_index.select(*gene_columns),
            _schema=GeneIndex.get_schema(),
        )

as_gene_index(target_index: DataFrame) -> GeneIndex classmethod

Initialise GeneIndex from source dataset.

Parameters:

Name Type Description Default
target_index DataFrame

Target index dataframe

required

Returns:

Name Type Description
GeneIndex GeneIndex

Gene index dataset

Source code in src/gentropy/datasource/open_targets/target.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
@classmethod
def as_gene_index(
    cls: type[OpenTargetsTarget], target_index: DataFrame
) -> GeneIndex:
    """Build a GeneIndex from the Open Targets target index.

    Args:
        target_index (DataFrame): Target index dataframe

    Returns:
        GeneIndex: Gene index dataset
    """
    # The genomic location fields are used more than once, so name them up front.
    chromosome = f.col("genomicLocation.chromosome")
    strand = f.col("genomicLocation.strand")
    start = f.col("genomicLocation.start")
    end = f.col("genomicLocation.end")
    unknown = f.lit("unknown")

    gene_columns = [
        # A missing gene id or chromosome is replaced by the literal "unknown".
        f.coalesce(f.col("id"), unknown).alias("geneId"),
        "approvedSymbol",
        "approvedName",
        "biotype",
        # Keep only the labels of the obsolete symbol records.
        f.col("obsoleteSymbols.label").alias("obsoleteSymbols"),
        f.coalesce(chromosome, unknown).alias("chromosome"),
        OpenTargetsTarget._get_gene_tss(strand, start, end).alias("tss"),
        start.alias("start"),
        end.alias("end"),
        strand.alias("strand"),
    ]
    return GeneIndex(
        _df=target_index.select(*gene_columns),
        _schema=GeneIndex.get_schema(),
    )