
Thurman et al.

gentropy.datasource.intervals.thurman.IntervalsThurman

Interval dataset from Thurman et al. 2012.

Source code in src/gentropy/datasource/intervals/thurman.py
class IntervalsThurman:
    """Interval dataset from Thurman et al. 2012."""

    @staticmethod
    def read(spark: SparkSession, path: str) -> DataFrame:
        """Read thurman dataset.

        Args:
            spark (SparkSession): Spark session
            path (str): Path to dataset

        Returns:
            DataFrame: DataFrame with raw Thurman data
        """
        thurman_schema = t.StructType(
            [
                t.StructField("gene_chr", t.StringType(), False),
                t.StructField("gene_start", t.IntegerType(), False),
                t.StructField("gene_end", t.IntegerType(), False),
                t.StructField("gene_name", t.StringType(), False),
                t.StructField("chrom", t.StringType(), False),
                t.StructField("start", t.IntegerType(), False),
                t.StructField("end", t.IntegerType(), False),
                t.StructField("score", t.FloatType(), False),
            ]
        )
        return spark.read.csv(path, sep="\t", header=False, schema=thurman_schema)

    @classmethod
    def parse(
        cls: type[IntervalsThurman],
        thurman_raw: DataFrame,
        gene_index: GeneIndex,
        lift: LiftOverSpark,
    ) -> Intervals:
        """Parse the Thurman et al. 2012 dataset.

        Args:
            thurman_raw (DataFrame): raw Thurman et al. 2012 dataset
            gene_index (GeneIndex): gene index
            lift (LiftOverSpark): LiftOverSpark instance

        Returns:
            Intervals: Interval dataset containing Thurman et al. 2012 data
        """
        dataset_name = "thurman2012"
        experiment_type = "dhscor"
        pmid = "22955617"

        return Intervals(
            _df=(
                thurman_raw.select(
                    f.regexp_replace(f.col("chrom"), "chr", "").alias("chrom"),
                    "start",
                    "end",
                    "gene_name",
                    "score",
                )
                # Lift over to the GRCh38 build:
                .transform(
                    lambda df: lift.convert_intervals(df, "chrom", "start", "end")
                )
                .alias("intervals")
                # Map gene names to gene IDs:
                .join(
                    gene_index.symbols_lut().alias("genes"),
                    on=[
                        f.col("intervals.gene_name") == f.col("genes.geneSymbol"),
                        f.col("intervals.chrom") == f.col("genes.chromosome"),
                    ],
                    how="inner",
                )
                # Select relevant columns and add constant columns:
                .select(
                    f.col("chrom").alias("chromosome"),
                    f.col("mapped_start").alias("start"),
                    f.col("mapped_end").alias("end"),
                    "geneId",
                    f.col("score").cast(t.DoubleType()).alias("resourceScore"),
                    f.lit(dataset_name).alias("datasourceId"),
                    f.lit(experiment_type).alias("datatypeId"),
                    f.lit(pmid).alias("pmid"),
                )
                .distinct()
            ),
            _schema=Intervals.get_schema(),
        )
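
A typical call chain is read followed by parse. The sketch below is illustrative and not part of the source: the wrapper name is made up, and the gene index and LiftOverSpark instance are assumed to be built elsewhere (for example from an Open Targets gene index release and a GRCh37 to GRCh38 chain file).

from pyspark.sql import SparkSession

from gentropy.datasource.intervals.thurman import IntervalsThurman


def build_thurman_intervals(spark: SparkSession, path: str, gene_index, lift):
    """Read the raw Thurman et al. 2012 file and map it to an Intervals dataset.

    gene_index and lift are a GeneIndex and a LiftOverSpark instance supplied
    by the caller; their construction is environment-specific and not shown.
    """
    thurman_raw = IntervalsThurman.read(spark, path)
    return IntervalsThurman.parse(thurman_raw, gene_index, lift)


# Example call; the path is a placeholder:
# spark = SparkSession.builder.getOrCreate()
# intervals = build_thurman_intervals(
#     spark, "/data/thurman2012_dhs_promoter_correlations.tsv", gene_index, lift
# )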

parse(thurman_raw: DataFrame, gene_index: GeneIndex, lift: LiftOverSpark) -> Intervals classmethod

Parse the Thurman et al. 2012 dataset.

Parameters:

Name         Type           Description                        Default
thurman_raw  DataFrame      raw Thurman et al. 2012 dataset    required
gene_index   GeneIndex      gene index                         required
lift         LiftOverSpark  LiftOverSpark instance             required

Returns:

Name       Type       Description
Intervals  Intervals  Interval dataset containing Thurman et al. 2012 data

Source code in src/gentropy/datasource/intervals/thurman.py
@classmethod
def parse(
    cls: type[IntervalsThurman],
    thurman_raw: DataFrame,
    gene_index: GeneIndex,
    lift: LiftOverSpark,
) -> Intervals:
    """Parse the Thurman et al. 2012 dataset.

    Args:
        thurman_raw (DataFrame): raw Thurman et al. 2012 dataset
        gene_index (GeneIndex): gene index
        lift (LiftOverSpark): LiftOverSpark instance

    Returns:
        Intervals: Interval dataset containing Thurman et al. 2012 data
    """
    dataset_name = "thurman2012"
    experiment_type = "dhscor"
    pmid = "22955617"

    return Intervals(
        _df=(
            thurman_raw.select(
                f.regexp_replace(f.col("chrom"), "chr", "").alias("chrom"),
                "start",
                "end",
                "gene_name",
                "score",
            )
            # Lift over to the GRCh38 build:
            .transform(
                lambda df: lift.convert_intervals(df, "chrom", "start", "end")
            )
            .alias("intervals")
            # Map gene names to gene IDs:
            .join(
                gene_index.symbols_lut().alias("genes"),
                on=[
                    f.col("intervals.gene_name") == f.col("genes.geneSymbol"),
                    f.col("intervals.chrom") == f.col("genes.chromosome"),
                ],
                how="inner",
            )
            # Select relevant columns and add constant columns:
            .select(
                f.col("chrom").alias("chromosome"),
                f.col("mapped_start").alias("start"),
                f.col("mapped_end").alias("end"),
                "geneId",
                f.col("score").cast(t.DoubleType()).alias("resourceScore"),
                f.lit(dataset_name).alias("datasourceId"),
                f.lit(experiment_type).alias("datatypeId"),
                f.lit(pmid).alias("pmid"),
            )
            .distinct()
        ),
        _schema=Intervals.get_schema(),
    )
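
The gene-symbol join is where rows can be lost: only intervals whose (gene_name, chrom) pair matches an entry in the gene index survive the inner join. The toy sketch below is not from the source; it imitates that join on hand-built frames, using the symbols_lut() column names (geneSymbol, chromosome, geneId) implied by the code above and skipping the liftover step.

import pyspark.sql.functions as f
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# An already lifted-over interval with its target gene symbol:
intervals = spark.createDataFrame(
    [("13", 32315000, 32316000, "BRCA2", 0.9)],
    ["chrom", "mapped_start", "mapped_end", "gene_name", "score"],
).alias("intervals")

# Toy stand-in for gene_index.symbols_lut():
genes = spark.createDataFrame(
    [("BRCA2", "13", "ENSG00000139618"), ("TP53", "17", "ENSG00000141510")],
    ["geneSymbol", "chromosome", "geneId"],
).alias("genes")

# Inner join on (symbol, chromosome); the TP53 entry never matches, and the
# interval picks up exactly one geneId:
mapped = intervals.join(
    genes,
    on=[
        f.col("intervals.gene_name") == f.col("genes.geneSymbol"),
        f.col("intervals.chrom") == f.col("genes.chromosome"),
    ],
    how="inner",
)
mapped.select("chrom", "mapped_start", "mapped_end", "geneId", "score").show()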

read(spark: SparkSession, path: str) -> DataFrame staticmethod

Read the Thurman et al. 2012 dataset.

Parameters:

Name   Type          Description      Default
spark  SparkSession  Spark session    required
path   str           Path to dataset  required

Returns:

Name       Type       Description
DataFrame  DataFrame  DataFrame with raw Thurman data

Source code in src/gentropy/datasource/intervals/thurman.py
@staticmethod
def read(spark: SparkSession, path: str) -> DataFrame:
    """Read thurman dataset.

    Args:
        spark (SparkSession): Spark session
        path (str): Path to dataset

    Returns:
        DataFrame: DataFrame with raw Thurman data
    """
    thurman_schema = t.StructType(
        [
            t.StructField("gene_chr", t.StringType(), False),
            t.StructField("gene_start", t.IntegerType(), False),
            t.StructField("gene_end", t.IntegerType(), False),
            t.StructField("gene_name", t.StringType(), False),
            t.StructField("chrom", t.StringType(), False),
            t.StructField("start", t.IntegerType(), False),
            t.StructField("end", t.IntegerType(), False),
            t.StructField("score", t.FloatType(), False),
        ]
    )
    return spark.read.csv(path, sep="\t", header=False, schema=thurman_schema)
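
A minimal usage sketch for read (not part of the source): the expected input is a headerless, tab-separated file whose eight columns match the schema above, i.e. the gene coordinates and symbol followed by the DHS interval and its correlation score. The path is a placeholder.

from pyspark.sql import SparkSession

from gentropy.datasource.intervals.thurman import IntervalsThurman

spark = SparkSession.builder.getOrCreate()

# Placeholder path to the tab-separated Thurman et al. 2012 correlation file:
raw = IntervalsThurman.read(spark, "/data/thurman2012_dhs_promoter_correlations.tsv")
raw.printSchema()
raw.show(5, truncate=False)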