Skip to content

Intervals

gentropy.dataset.intervals.Intervals dataclass

Bases: Dataset

Intervals dataset links genes to genomic regions based on genome interaction studies.

Source code in src/gentropy/dataset/intervals.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
@dataclass
class Intervals(Dataset):
    """Gene-to-genomic-region links derived from genome interaction studies."""

    @classmethod
    def get_schema(cls: type[Intervals]) -> StructType:
        """Return the Spark schema describing the Intervals dataset.

        The schema is parsed from the `intervals.json` schema definition.

        Returns:
            StructType: Schema for the Intervals dataset
        """
        schema = parse_spark_schema("intervals.json")
        return schema

    @classmethod
    def from_source(
        cls: type[Intervals],
        spark: SparkSession,
        source_name: str,
        source_path: str,
        gene_index: GeneIndex,
        lift: LiftOverSpark,
    ) -> Intervals:
        """Read and parse the interval data of one named source.

        The source name selects a datasource class; that class reads the raw
        data from `source_path` and parses it into an Intervals dataset.

        Args:
            spark (SparkSession): Spark session
            source_name (str): Name of the interval source; one of `andersson`, `javierre`, `jung` or `thurman`
            source_path (str): Path to the interval source file
            gene_index (GeneIndex): Gene index
            lift (LiftOverSpark): LiftOverSpark instance to convert coordinates from hg37 to hg38

        Returns:
            Intervals: Intervals dataset

        Raises:
            ValueError: If the source name is not recognised
        """
        # NOTE(review): the datasource parsers are imported at call time rather
        # than at module level - presumably to avoid a circular import; confirm.
        from gentropy.datasource.intervals.andersson import IntervalsAndersson
        from gentropy.datasource.intervals.javierre import IntervalsJavierre
        from gentropy.datasource.intervals.jung import IntervalsJung
        from gentropy.datasource.intervals.thurman import IntervalsThurman

        # Dispatch table: source name -> datasource class exposing read() / parse().
        parsers = {
            "andersson": IntervalsAndersson,
            "javierre": IntervalsJavierre,
            "jung": IntervalsJung,
            "thurman": IntervalsThurman,
        }

        parser = parsers.get(source_name)
        if parser is None:
            raise ValueError(f"Unknown interval source: {source_name}")

        raw_intervals = parser.read(spark, source_path)  # type: ignore
        return parser.parse(raw_intervals, gene_index, lift)  # type: ignore

from_source(spark: SparkSession, source_name: str, source_path: str, gene_index: GeneIndex, lift: LiftOverSpark) -> Intervals classmethod

Collect interval data for a particular source.

Parameters:

Name Type Description Default
spark SparkSession

Spark session

required
source_name str

Name of the interval source

required
source_path str

Path to the interval source file

required
gene_index GeneIndex

Gene index

required
lift LiftOverSpark

LiftOverSpark instance to convert coordinates from hg37 to hg38

required

Returns:

Name Type Description
Intervals Intervals

Intervals dataset

Raises:

Type Description
ValueError

If the source name is not recognised

Source code in src/gentropy/dataset/intervals.py
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
@classmethod
def from_source(
    cls: type[Intervals],
    spark: SparkSession,
    source_name: str,
    source_path: str,
    gene_index: GeneIndex,
    lift: LiftOverSpark,
) -> Intervals:
    """Read and parse the interval data of one named source.

    The source name selects a datasource class; that class reads the raw
    data from `source_path` and parses it into an Intervals dataset.

    Args:
        spark (SparkSession): Spark session
        source_name (str): Name of the interval source; one of `andersson`, `javierre`, `jung` or `thurman`
        source_path (str): Path to the interval source file
        gene_index (GeneIndex): Gene index
        lift (LiftOverSpark): LiftOverSpark instance to convert coordinates from hg37 to hg38

    Returns:
        Intervals: Intervals dataset

    Raises:
        ValueError: If the source name is not recognised
    """
    # NOTE(review): the datasource parsers are imported at call time rather
    # than at module level - presumably to avoid a circular import; confirm.
    from gentropy.datasource.intervals.andersson import IntervalsAndersson
    from gentropy.datasource.intervals.javierre import IntervalsJavierre
    from gentropy.datasource.intervals.jung import IntervalsJung
    from gentropy.datasource.intervals.thurman import IntervalsThurman

    # Dispatch table: source name -> datasource class exposing read() / parse().
    parsers = {
        "andersson": IntervalsAndersson,
        "javierre": IntervalsJavierre,
        "jung": IntervalsJung,
        "thurman": IntervalsThurman,
    }

    parser = parsers.get(source_name)
    if parser is None:
        raise ValueError(f"Unknown interval source: {source_name}")

    raw_intervals = parser.read(spark, source_path)  # type: ignore
    return parser.parse(raw_intervals, gene_index, lift)  # type: ignore

get_schema() -> StructType classmethod

Provides the schema for the Intervals dataset.

Returns:

Name Type Description
StructType StructType

Schema for the Intervals dataset

Source code in src/gentropy/dataset/intervals.py
23
24
25
26
27
28
29
30
@classmethod
def get_schema(cls: type[Intervals]) -> StructType:
    """Return the Spark schema describing the Intervals dataset.

    The schema is parsed from the `intervals.json` schema definition.

    Returns:
        StructType: Schema for the Intervals dataset
    """
    schema = parse_spark_schema("intervals.json")
    return schema

Schema

root
 |-- chromosome: string (nullable = false)
 |-- start: string (nullable = false)
 |-- end: string (nullable = false)
 |-- geneId: string (nullable = false)
 |-- resourceScore: double (nullable = true)
 |-- score: double (nullable = true)
 |-- datasourceId: string (nullable = false)
 |-- datatypeId: string (nullable = false)
 |-- pmid: string (nullable = true)
 |-- biofeature: string (nullable = true)