Intervals

`gentropy.dataset.intervals.Intervals` `dataclass` ¶

Bases: Dataset

Intervals dataset links genes to genomic regions based on genome interaction studies.

Source code in src/gentropy/dataset/intervals.py

@dataclass
class Intervals(Dataset):
    """Dataset associating genes with genomic regions, derived from genome interaction studies."""

    @classmethod
    def get_schema(cls: type[Intervals]) -> StructType:
        """Return the Spark schema that describes the Intervals dataset.

        Returns:
            StructType: Schema parsed from the `intervals.json` definition
        """
        schema = parse_spark_schema("intervals.json")
        return schema

    @classmethod
    def from_source(
        cls: type[Intervals],
        spark: SparkSession,
        source_name: str,
        source_path: str,
        target_index: TargetIndex,
        lift: LiftOverSpark,
    ) -> Intervals:
        """Build an Intervals dataset from one of the supported interval sources.

        The source name selects a datasource class; that class first reads the
        raw input and then parses it into an Intervals dataset.

        Args:
            spark (SparkSession): Spark session handed to the datasource reader
            source_name (str): Key identifying the interval source; one of
                `andersson`, `javierre`, `jung` or `thurman`
            source_path (str): Location of the raw interval source file
            target_index (TargetIndex): Target index passed on to the datasource parser
            lift (LiftOverSpark): LiftOverSpark instance used to convert coordinates from hg37 to hg38

        Returns:
            Intervals: Dataset produced by the selected datasource parser

        Raises:
            ValueError: If `source_name` does not match any supported source
        """
        # Datasource classes are imported at call time rather than module load
        # (presumably to avoid a circular import with this module - confirm).
        from gentropy.datasource.intervals.andersson import IntervalsAndersson
        from gentropy.datasource.intervals.javierre import IntervalsJavierre
        from gentropy.datasource.intervals.jung import IntervalsJung
        from gentropy.datasource.intervals.thurman import IntervalsThurman

        parsers_by_source = {
            "andersson": IntervalsAndersson,
            "javierre": IntervalsJavierre,
            "jung": IntervalsJung,
            "thurman": IntervalsThurman,
        }

        parser = parsers_by_source.get(source_name)
        if parser is None:
            raise ValueError(f"Unknown interval source: {source_name}")

        # Two-step pipeline: load the raw data, then convert it into Intervals.
        raw_data = parser.read(spark, source_path)  # type: ignore
        return parser.parse(raw_data, target_index, lift)  # type: ignore

`from_source(spark: SparkSession, source_name: str, source_path: str, target_index: TargetIndex, lift: LiftOverSpark) -> Intervals` `classmethod` ¶

Collect interval data for a particular source.

Parameters:

Name	Type	Description	Default
`spark`	`SparkSession`	Spark session	required
`source_name`	`str`	Name of the interval source	required
`source_path`	`str`	Path to the interval source file	required
`target_index`	`TargetIndex`	Target index	required
`lift`	`LiftOverSpark`	LiftOverSpark instance to convert coordinates from hg37 to hg38	required

Returns:

Name	Type	Description
`Intervals`	`Intervals`	Intervals dataset

Raises:

Type	Description
`ValueError`	If the source name is not recognised

Source code in src/gentropy/dataset/intervals.py

@classmethod
def from_source(
    cls: type[Intervals],
    spark: SparkSession,
    source_name: str,
    source_path: str,
    target_index: TargetIndex,
    lift: LiftOverSpark,
) -> Intervals:
    """Build an Intervals dataset from one of the supported interval sources.

    The source name selects a datasource class; that class first reads the
    raw input and then parses it into an Intervals dataset.

    Args:
        spark (SparkSession): Spark session handed to the datasource reader
        source_name (str): Key identifying the interval source; one of
            `andersson`, `javierre`, `jung` or `thurman`
        source_path (str): Location of the raw interval source file
        target_index (TargetIndex): Target index passed on to the datasource parser
        lift (LiftOverSpark): LiftOverSpark instance used to convert coordinates from hg37 to hg38

    Returns:
        Intervals: Dataset produced by the selected datasource parser

    Raises:
        ValueError: If `source_name` does not match any supported source
    """
    # Datasource classes are imported at call time rather than module load
    # (presumably to avoid a circular import with this module - confirm).
    from gentropy.datasource.intervals.andersson import IntervalsAndersson
    from gentropy.datasource.intervals.javierre import IntervalsJavierre
    from gentropy.datasource.intervals.jung import IntervalsJung
    from gentropy.datasource.intervals.thurman import IntervalsThurman

    parsers_by_source = {
        "andersson": IntervalsAndersson,
        "javierre": IntervalsJavierre,
        "jung": IntervalsJung,
        "thurman": IntervalsThurman,
    }

    parser = parsers_by_source.get(source_name)
    if parser is None:
        raise ValueError(f"Unknown interval source: {source_name}")

    # Two-step pipeline: load the raw data, then convert it into Intervals.
    raw_data = parser.read(spark, source_path)  # type: ignore
    return parser.parse(raw_data, target_index, lift)  # type: ignore

`get_schema() -> StructType` `classmethod` ¶

Provides the schema for the Intervals dataset.

Returns:

Name	Type	Description
`StructType`	`StructType`	Schema for the Intervals dataset

Source code in src/gentropy/dataset/intervals.py

@classmethod
def get_schema(cls: type[Intervals]) -> StructType:
    """Return the Spark schema that describes the Intervals dataset.

    Returns:
        StructType: Schema parsed from the `intervals.json` definition
    """
    schema = parse_spark_schema("intervals.json")
    return schema

Schema¶

root
 |-- chromosome: string (nullable = false)
 |-- start: string (nullable = false)
 |-- end: string (nullable = false)
 |-- geneId: string (nullable = false)
 |-- resourceScore: double (nullable = true)
 |-- score: double (nullable = true)
 |-- datasourceId: string (nullable = false)
 |-- datatypeId: string (nullable = false)
 |-- pmid: string (nullable = true)
 |-- biofeature: string (nullable = true)

2023-01-15
2024-01-17
Contributors

Intervals

gentropy.dataset.intervals.Intervals dataclass ¶

from_source(spark: SparkSession, source_name: str, source_path: str, target_index: TargetIndex, lift: LiftOverSpark) -> Intervals classmethod ¶

get_schema() -> StructType classmethod ¶

Schema¶

`gentropy.dataset.intervals.Intervals` `dataclass` ¶

`from_source(spark: SparkSession, source_name: str, source_path: str, target_index: TargetIndex, lift: LiftOverSpark) -> Intervals` `classmethod` ¶

`get_schema() -> StructType` `classmethod` ¶