Skip to content

Intervals

gentropy.dataset.intervals.Intervals dataclass

Bases: Dataset

Intervals dataset links genes to genomic regions based on genome interaction studies.

Source code in src/gentropy/dataset/intervals.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
@dataclass
class Intervals(Dataset):
    """Dataset of genomic intervals linked to genes, derived from genome interaction studies."""

    @classmethod
    def get_schema(cls: type[Intervals]) -> StructType:
        """Return the Spark schema that describes the Intervals dataset.

        Returns:
            StructType: Schema parsed from the `intervals.json` definition
        """
        schema = parse_spark_schema("intervals.json")
        return schema

    @classmethod
    def from_source(
        cls: type[Intervals],
        spark: SparkSession,
        source_name: str,
        source_path: str,
        target_index: TargetIndex,
        biosample_index: BiosampleIndex,
        biosample_mapping: DataFrame,
    ) -> Intervals:
        """Build an Intervals dataset by reading and parsing one supported source.

        The source-specific reader and parser are selected by `source_name`
        (`"e2g"` or `"epiraction"`). The biosample arguments are only forwarded
        to the `"e2g"` parser.

        Args:
            spark (SparkSession): Spark session
            source_name (str): Name of the interval source
            source_path (str): Path to the interval source file
            target_index (TargetIndex): Target index
            biosample_index (BiosampleIndex): Biosample index
            biosample_mapping (DataFrame): Biosample mapping DataFrame

        Returns:
            Intervals: Intervals dataset

        Raises:
            ValueError: If the source name is not recognised
        """
        # Imported at call time rather than module level
        # (presumably to avoid a circular import - TODO confirm).
        from gentropy.datasource.intervals.e2g import IntervalsE2G
        from gentropy.datasource.intervals.epiraction import IntervalsEpiraction

        if source_name == "e2g":
            e2g_raw = IntervalsE2G.read(spark, source_path)
            intervals = IntervalsE2G.parse(
                raw_e2g_df=e2g_raw,
                biosample_mapping=biosample_mapping,
                target_index=target_index,
                biosample_index=biosample_index,
            )
        elif source_name == "epiraction":
            epiraction_raw = IntervalsEpiraction.read(spark, source_path)
            intervals = IntervalsEpiraction.parse(
                raw_epiraction_df=epiraction_raw,
                target_index=target_index,
            )
        else:
            raise ValueError(f"Unknown interval source: {source_name!r}")

        return intervals

from_source(spark: SparkSession, source_name: str, source_path: str, target_index: TargetIndex, biosample_index: BiosampleIndex, biosample_mapping: DataFrame) -> Intervals classmethod

Collect interval data for a particular source.

Parameters:

Name Type Description Default
spark SparkSession

Spark session

required
source_name str

Name of the interval source

required
source_path str

Path to the interval source file

required
target_index TargetIndex

Target index

required
biosample_index BiosampleIndex

Biosample index

required
biosample_mapping DataFrame

Biosample mapping DataFrame

required

Returns:

Name Type Description
Intervals Intervals

Intervals dataset

Raises:

Type Description
ValueError

If the source name is not recognised

Source code in src/gentropy/dataset/intervals.py
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
@classmethod
def from_source(
    cls: type[Intervals],
    spark: SparkSession,
    source_name: str,
    source_path: str,
    target_index: TargetIndex,
    biosample_index: BiosampleIndex,
    biosample_mapping: DataFrame,
) -> Intervals:
    """Build an Intervals dataset by reading and parsing one supported source.

    The source-specific reader and parser are selected by `source_name`
    (`"e2g"` or `"epiraction"`). The biosample arguments are only forwarded
    to the `"e2g"` parser.

    Args:
        spark (SparkSession): Spark session
        source_name (str): Name of the interval source
        source_path (str): Path to the interval source file
        target_index (TargetIndex): Target index
        biosample_index (BiosampleIndex): Biosample index
        biosample_mapping (DataFrame): Biosample mapping DataFrame

    Returns:
        Intervals: Intervals dataset

    Raises:
        ValueError: If the source name is not recognised
    """
    # Imported at call time rather than module level
    # (presumably to avoid a circular import - TODO confirm).
    from gentropy.datasource.intervals.e2g import IntervalsE2G
    from gentropy.datasource.intervals.epiraction import IntervalsEpiraction

    if source_name == "e2g":
        e2g_raw = IntervalsE2G.read(spark, source_path)
        intervals = IntervalsE2G.parse(
            raw_e2g_df=e2g_raw,
            biosample_mapping=biosample_mapping,
            target_index=target_index,
            biosample_index=biosample_index,
        )
    elif source_name == "epiraction":
        epiraction_raw = IntervalsEpiraction.read(spark, source_path)
        intervals = IntervalsEpiraction.parse(
            raw_epiraction_df=epiraction_raw,
            target_index=target_index,
        )
    else:
        raise ValueError(f"Unknown interval source: {source_name!r}")

    return intervals

get_schema() -> StructType classmethod

Provides the schema for the Intervals dataset.

Returns:

Name Type Description
StructType StructType

Schema for the Intervals dataset

Source code in src/gentropy/dataset/intervals.py
22
23
24
25
26
27
28
29
@classmethod
def get_schema(cls: type[Intervals]) -> StructType:
    """Return the Spark schema that describes the Intervals dataset.

    Returns:
        StructType: Schema parsed from the `intervals.json` definition
    """
    schema = parse_spark_schema("intervals.json")
    return schema

Schema

root
 |-- chromosome: string (nullable = false)
 |-- start: string (nullable = false)
 |-- end: string (nullable = false)
 |-- geneId: string (nullable = true)
 |-- score: double (nullable = true)
 |-- distanceToTss: integer (nullable = true)
 |-- resourceScore: array (nullable = true)
 |    |-- element: struct (containsNull = false)
 |    |    |-- name: string (nullable = false)
 |    |    |-- value: float (nullable = false)
 |-- datasourceId: string (nullable = false)
 |-- intervalType: string (nullable = false)
 |-- pmid: string (nullable = true)
 |-- biofeature: string (nullable = true)
 |-- biosampleName: string (nullable = true)
 |-- biosampleId: string (nullable = true)
 |-- studyId: string (nullable = true)
 |-- intervalId: string (nullable = true)