Skip to content

Intervals

gentropy.dataset.intervals.Intervals dataclass

Bases: Dataset

Intervals dataset links genes to genomic regions based on genome interaction studies.

Source code in src/gentropy/dataset/intervals.py
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
@dataclass
class Intervals(Dataset):
    """Dataset of gene-to-genomic-region links derived from genome interaction studies."""

    @classmethod
    def get_schema(cls: type[Intervals]) -> StructType:
        """Return the Spark schema describing the Intervals dataset.

        Returns:
            StructType: Schema parsed from the `intervals.json` definition
        """
        schema = parse_spark_schema("intervals.json")
        return schema

    @classmethod
    def from_source(
        cls: type[Intervals],
        spark: SparkSession,
        source_name: str,
        source_path: str,
        gene_index: GeneIndex,
        lift: LiftOverSpark,
    ) -> Intervals:
        """Read and parse the interval data of one supported source.

        Args:
            spark (SparkSession): Spark session
            source_name (str): Key of the interval source; one of `andersson`, `javierre`, `jung` or `thurman`
            source_path (str): Path to the interval source file
            gene_index (GeneIndex): Gene index, handed to the source-specific parser
            lift (LiftOverSpark): LiftOverSpark instance to convert coordinates from GRCh37 (hg19) to GRCh38 (hg38)

        Returns:
            Intervals: Intervals dataset

        Raises:
            ValueError: If the source name is not recognised
        """
        # Function-scope imports: the parsers are only loaded when this method runs
        # (presumably to avoid a circular import with the datasource modules).
        from gentropy.datasource.intervals.andersson import IntervalsAndersson
        from gentropy.datasource.intervals.javierre import IntervalsJavierre
        from gentropy.datasource.intervals.jung import IntervalsJung
        from gentropy.datasource.intervals.thurman import IntervalsThurman

        # Dispatch table: source key -> class exposing `read` and `parse`.
        parsers = {
            "andersson": IntervalsAndersson,
            "javierre": IntervalsJavierre,
            "jung": IntervalsJung,
            "thurman": IntervalsThurman,
        }

        parser = parsers.get(source_name)
        if parser is None:
            raise ValueError(f"Unknown interval source: {source_name}")

        raw = parser.read(spark, source_path)  # type: ignore
        return parser.parse(raw, gene_index, lift)  # type: ignore

    def v2g(self: Intervals, variant_index: VariantIndex) -> V2G:
        """Build V2G evidence by overlapping the intervals with a variant index.

        A variant is linked to an interval when both are on the same chromosome
        and the variant position lies between the interval start and end.

        Args:
            variant_index (VariantIndex): Variant index dataset

        Returns:
            V2G: Variant-to-gene evidence dataset
        """
        intervals = self.df.alias("interval")
        # Chromosome is renamed on the variant side so that the interval's own
        # `chromosome` column is the only one left after the join.
        variants = variant_index.df.selectExpr(
            "chromosome as vi_chromosome", "variantId", "position"
        ).alias("vi")

        same_chromosome = f.col("vi.vi_chromosome") == f.col("interval.chromosome")
        # `Column.between` is inclusive on both bounds.
        inside_interval = f.col("vi.position").between(
            f.col("interval.start"), f.col("interval.end")
        )

        overlaps = intervals.join(
            variants,
            on=[same_chromosome, inside_interval],
            how="inner",
        )
        # Coordinates and join helpers are dropped before wrapping the result as V2G.
        return V2G(
            _df=overlaps.drop("start", "end", "vi_chromosome", "position"),
            _schema=V2G.get_schema(),
        )

from_source(spark: SparkSession, source_name: str, source_path: str, gene_index: GeneIndex, lift: LiftOverSpark) -> Intervals classmethod

Collect interval data for a particular source.

Parameters:

Name Type Description Default
spark SparkSession

Spark session

required
source_name str

Name of the interval source

required
source_path str

Path to the interval source file

required
gene_index GeneIndex

Gene index

required
lift LiftOverSpark

LiftOverSpark instance to convert coordinates from GRCh37 (hg19) to GRCh38 (hg38)

required

Returns:

Name Type Description
Intervals Intervals

Intervals dataset

Raises:

Type Description
ValueError

If the source name is not recognised

Source code in src/gentropy/dataset/intervals.py
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
@classmethod
def from_source(
    cls: type[Intervals],
    spark: SparkSession,
    source_name: str,
    source_path: str,
    gene_index: GeneIndex,
    lift: LiftOverSpark,
) -> Intervals:
    """Read and parse the interval data of one supported source.

    Args:
        spark (SparkSession): Spark session
        source_name (str): Key of the interval source; one of `andersson`, `javierre`, `jung` or `thurman`
        source_path (str): Path to the interval source file
        gene_index (GeneIndex): Gene index, handed to the source-specific parser
        lift (LiftOverSpark): LiftOverSpark instance to convert coordinates from GRCh37 (hg19) to GRCh38 (hg38)

    Returns:
        Intervals: Intervals dataset

    Raises:
        ValueError: If the source name is not recognised
    """
    # Function-scope imports: the parsers are only loaded when this method runs
    # (presumably to avoid a circular import with the datasource modules).
    from gentropy.datasource.intervals.andersson import IntervalsAndersson
    from gentropy.datasource.intervals.javierre import IntervalsJavierre
    from gentropy.datasource.intervals.jung import IntervalsJung
    from gentropy.datasource.intervals.thurman import IntervalsThurman

    # Dispatch table: source key -> class exposing `read` and `parse`.
    parsers = {
        "andersson": IntervalsAndersson,
        "javierre": IntervalsJavierre,
        "jung": IntervalsJung,
        "thurman": IntervalsThurman,
    }

    parser = parsers.get(source_name)
    if parser is None:
        raise ValueError(f"Unknown interval source: {source_name}")

    raw = parser.read(spark, source_path)  # type: ignore
    return parser.parse(raw, gene_index, lift)  # type: ignore

get_schema() -> StructType classmethod

Provides the schema for the Intervals dataset.

Returns:

Name Type Description
StructType StructType

Schema for the Intervals dataset

Source code in src/gentropy/dataset/intervals.py
26
27
28
29
30
31
32
33
@classmethod
def get_schema(cls: type[Intervals]) -> StructType:
    """Return the Spark schema describing the Intervals dataset.

    Returns:
        StructType: Schema parsed from the `intervals.json` definition
    """
    schema = parse_spark_schema("intervals.json")
    return schema

v2g(variant_index: VariantIndex) -> V2G

Convert intervals into V2G by intersecting with a variant index.

Parameters:

Name Type Description Default
variant_index VariantIndex

Variant index dataset

required

Returns:

Name Type Description
V2G V2G

Variant-to-gene evidence dataset

Source code in src/gentropy/dataset/intervals.py
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
def v2g(self: Intervals, variant_index: VariantIndex) -> V2G:
    """Build V2G evidence by overlapping the intervals with a variant index.

    A variant is linked to an interval when both are on the same chromosome
    and the variant position lies between the interval start and end.

    Args:
        variant_index (VariantIndex): Variant index dataset

    Returns:
        V2G: Variant-to-gene evidence dataset
    """
    intervals = self.df.alias("interval")
    # Chromosome is renamed on the variant side so that the interval's own
    # `chromosome` column is the only one left after the join.
    variants = variant_index.df.selectExpr(
        "chromosome as vi_chromosome", "variantId", "position"
    ).alias("vi")

    same_chromosome = f.col("vi.vi_chromosome") == f.col("interval.chromosome")
    # `Column.between` is inclusive on both bounds.
    inside_interval = f.col("vi.position").between(
        f.col("interval.start"), f.col("interval.end")
    )

    overlaps = intervals.join(
        variants,
        on=[same_chromosome, inside_interval],
        how="inner",
    )
    # Coordinates and join helpers are dropped before wrapping the result as V2G.
    return V2G(
        _df=overlaps.drop("start", "end", "vi_chromosome", "position"),
        _schema=V2G.get_schema(),
    )

Schema

root
 |-- chromosome: string (nullable = false)
 |-- start: string (nullable = false)
 |-- end: string (nullable = false)
 |-- geneId: string (nullable = false)
 |-- resourceScore: double (nullable = true)
 |-- score: double (nullable = true)
 |-- datasourceId: string (nullable = false)
 |-- datatypeId: string (nullable = false)
 |-- pmid: string (nullable = true)
 |-- biofeature: string (nullable = true)