Jung et al.

gentropy.datasource.intervals.jung.IntervalsJung

Interval dataset from Jung et al. 2019.

Source code in src/gentropy/datasource/intervals/jung.py
class IntervalsJung:
    """Interval dataset from Jung et al. 2019."""

    @staticmethod
    def read(spark: SparkSession, path: str) -> DataFrame:
        """Read jung dataset.

        Args:
            spark (SparkSession): Spark session
            path (str): Path to dataset

        Returns:
            DataFrame: DataFrame with raw jung data
        """
        return spark.read.csv(path, sep=",", header=True)

    @classmethod
    def parse(
        cls: type[IntervalsJung],
        jung_raw: DataFrame,
        gene_index: GeneIndex,
        lift: LiftOverSpark,
    ) -> Intervals:
        """Parse the Jung et al. 2019 dataset.

        Args:
            jung_raw (DataFrame): raw Jung et al. 2019 dataset
            gene_index (GeneIndex): gene index
            lift (LiftOverSpark): LiftOverSpark instance

        Returns:
            Intervals: Interval dataset containing Jung et al. 2019 data
        """
        dataset_name = "jung2019"
        experiment_type = "pchic"
        pmid = "31501517"

        # Lifting over the coordinates:
        return Intervals(
            _df=(
                jung_raw.withColumn(
                    "interval", f.split(f.col("Interacting_fragment"), r"\.")
                )
                .select(
                    # Parsing intervals:
                    f.regexp_replace(f.col("interval")[0], "chr", "").alias("chrom"),
                    f.col("interval")[1].cast(t.IntegerType()).alias("start"),
                    f.col("interval")[2].cast(t.IntegerType()).alias("end"),
                    # Extract other columns:
                    f.col("Promoter").alias("gene_name"),
                    f.col("Tissue_type").alias("tissue"),
                )
                # Lifting over to GRCh38 interval 1:
                .transform(
                    lambda df: lift.convert_intervals(df, "chrom", "start", "end")
                )
                .select(
                    "chrom",
                    f.col("mapped_start").alias("start"),
                    f.col("mapped_end").alias("end"),
                    f.explode(f.split(f.col("gene_name"), ";")).alias("gene_name"),
                    "tissue",
                )
                .alias("intervals")
                # Joining with genes:
                .join(
                    gene_index.symbols_lut().alias("genes"),
                    on=[f.col("intervals.gene_name") == f.col("genes.geneSymbol")],
                    how="inner",
                )
                # Finalize dataset:
                .select(
                    "chromosome",
                    f.col("intervals.start").alias("start"),
                    f.col("intervals.end").alias("end"),
                    "geneId",
                    f.col("tissue").alias("biofeature"),
                    f.lit(1.0).alias("score"),
                    f.lit(dataset_name).alias("datasourceId"),
                    f.lit(experiment_type).alias("datatypeId"),
                    f.lit(pmid).alias("pmid"),
                )
                .drop_duplicates()
            ),
            _schema=Intervals.get_schema(),
        )
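
The two methods on this class are typically chained: read loads the raw comma-separated file and parse harmonises it into an Intervals dataset. The sketch below shows that flow end to end. The file paths are placeholders, and the commented-out GeneIndex and LiftOverSpark lines are assumptions about APIs that are not documented on this page, so check their actual constructors before use.

from pyspark.sql import SparkSession

from gentropy.datasource.intervals.jung import IntervalsJung

spark = SparkSession.builder.getOrCreate()

# 1. Read the raw CSV exactly as IntervalsJung.read does (placeholder path).
jung_raw = IntervalsJung.read(spark, "path/to/jung_2019_pchic.csv")

# 2. Build the two dependencies. The import paths and constructor arguments here
#    are assumptions, shown commented out because they are not part of this page:
# from gentropy.dataset.gene_index import GeneIndex
# from gentropy.common.Liftover import LiftOverSpark
# gene_index = GeneIndex.from_parquet(session, "path/to/gene_index")
# lift = LiftOverSpark("path/to/hg19ToHg38.over.chain.gz", max_difference=100)

# 3. Parse into the harmonised Intervals dataset.
# intervals = IntervalsJung.parse(jung_raw, gene_index, lift)
# intervals.df.show(5, truncate=False)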

parse(jung_raw: DataFrame, gene_index: GeneIndex, lift: LiftOverSpark) -> Intervals (classmethod)

Parse the Jung et al. 2019 dataset.

Parameters:

    Name         Type           Description                     Default
    jung_raw     DataFrame      raw Jung et al. 2019 dataset    required
    gene_index   GeneIndex      gene index                      required
    lift         LiftOverSpark  LiftOverSpark instance          required

Returns:

    Name       Type       Description
    Intervals  Intervals  Interval dataset containing Jung et al. 2019 data

Source code in src/gentropy/datasource/intervals/jung.py
@classmethod
def parse(
    cls: type[IntervalsJung],
    jung_raw: DataFrame,
    gene_index: GeneIndex,
    lift: LiftOverSpark,
) -> Intervals:
    """Parse the Jung et al. 2019 dataset.

    Args:
        jung_raw (DataFrame): raw Jung et al. 2019 dataset
        gene_index (GeneIndex): gene index
        lift (LiftOverSpark): LiftOverSpark instance

    Returns:
        Intervals: Interval dataset containing Jung et al. 2019 data
    """
    dataset_name = "jung2019"
    experiment_type = "pchic"
    pmid = "31501517"

    # Lifting over the coordinates:
    return Intervals(
        _df=(
            jung_raw.withColumn(
                "interval", f.split(f.col("Interacting_fragment"), r"\.")
            )
            .select(
                # Parsing intervals:
                f.regexp_replace(f.col("interval")[0], "chr", "").alias("chrom"),
                f.col("interval")[1].cast(t.IntegerType()).alias("start"),
                f.col("interval")[2].cast(t.IntegerType()).alias("end"),
                # Extract other columns:
                f.col("Promoter").alias("gene_name"),
                f.col("Tissue_type").alias("tissue"),
            )
            # Lifting over to GRCh38 interval 1:
            .transform(
                lambda df: lift.convert_intervals(df, "chrom", "start", "end")
            )
            .select(
                "chrom",
                f.col("mapped_start").alias("start"),
                f.col("mapped_end").alias("end"),
                f.explode(f.split(f.col("gene_name"), ";")).alias("gene_name"),
                "tissue",
            )
            .alias("intervals")
            # Joining with genes:
            .join(
                gene_index.symbols_lut().alias("genes"),
                on=[f.col("intervals.gene_name") == f.col("genes.geneSymbol")],
                how="inner",
            )
            # Finalize dataset:
            .select(
                "chromosome",
                f.col("intervals.start").alias("start"),
                f.col("intervals.end").alias("end"),
                "geneId",
                f.col("tissue").alias("biofeature"),
                f.lit(1.0).alias("score"),
                f.lit(dataset_name).alias("datasourceId"),
                f.lit(experiment_type).alias("datatypeId"),
                f.lit(pmid).alias("pmid"),
            )
            .drop_duplicates()
        ),
        _schema=Intervals.get_schema(),
    )
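
The heart of parse is string manipulation on two raw columns: Interacting_fragment is a dot-separated chr<N>.<start>.<end> string that is split into genomic coordinates, and Promoter can hold several semicolon-separated gene symbols, each of which is exploded into its own row. The standalone snippet below reproduces just those two steps; the sample row is illustrative, not taken from the actual dataset.

from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql import types as t

spark = SparkSession.builder.getOrCreate()

# Illustrative row mimicking the raw Jung et al. 2019 columns used by parse():
df = spark.createDataFrame(
    [("chr1.1000000.1005000", "GENE_A;GENE_B", "Liver")],
    ["Interacting_fragment", "Promoter", "Tissue_type"],
)

parsed = df.withColumn(
    "interval", f.split(f.col("Interacting_fragment"), r"\.")
).select(
    # Strip the "chr" prefix and cast the coordinates to integers:
    f.regexp_replace(f.col("interval")[0], "chr", "").alias("chrom"),
    f.col("interval")[1].cast(t.IntegerType()).alias("start"),
    f.col("interval")[2].cast(t.IntegerType()).alias("end"),
    # One output row per promoter gene symbol:
    f.explode(f.split(f.col("Promoter"), ";")).alias("gene_name"),
    f.col("Tissue_type").alias("tissue"),
)

parsed.show(truncate=False)
# chrom  start    end      gene_name  tissue
# 1      1000000  1005000  GENE_A     Liver
# 1      1000000  1005000  GENE_B     Liver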

read(spark: SparkSession, path: str) -> DataFrame (staticmethod)

Read jung dataset.

Parameters:

    Name   Type          Description      Default
    spark  SparkSession  Spark session    required
    path   str           Path to dataset  required

Returns:

    Name       Type       Description
    DataFrame  DataFrame  DataFrame with raw jung data

Source code in src/gentropy/datasource/intervals/jung.py
@staticmethod
def read(spark: SparkSession, path: str) -> DataFrame:
    """Read jung dataset.

    Args:
        spark (SparkSession): Spark session
        path (str): Path to dataset

    Returns:
        DataFrame: DataFrame with raw jung data
    """
    return spark.read.csv(path, sep=",", header=True)
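
read is a thin wrapper around Spark's CSV reader. Because no schema is supplied, every column (including the coordinates embedded in Interacting_fragment) arrives as a string; the casting to integers happens later in parse. A minimal call, with a placeholder path:

from pyspark.sql import SparkSession

from gentropy.datasource.intervals.jung import IntervalsJung

spark = SparkSession.builder.getOrCreate()

# Placeholder path for wherever the Jung et al. 2019 CSV lives:
jung_raw = IntervalsJung.read(spark, "path/to/jung_2019_pchic.csv")
jung_raw.printSchema()  # all columns are strings, since no schema or inferSchema is passed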