Skip to content

Variant index

gentropy.dataset.variant_index.VariantIndex dataclass

Bases: Dataset

Variant index dataset.

The variant index dataset is the result of intersecting the variant annotation dataset with the variants that have V2D information available.

Source code in src/gentropy/dataset/variant_index.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
@dataclass
class VariantIndex(Dataset):
    """Index of annotated variants.

    The dataset is produced by intersecting the variant annotation dataset with the variants that have V2D information available.
    """

    @classmethod
    def get_schema(cls: type[VariantIndex]) -> StructType:
        """Return the Spark schema describing the VariantIndex dataset.

        Returns:
            StructType: Schema for the VariantIndex dataset
        """
        schema = parse_spark_schema("variant_index.json")
        return schema

    @classmethod
    def from_variant_annotation(
        cls: type[VariantIndex],
        variant_annotation: VariantAnnotation,
        study_locus: StudyLocus,
    ) -> VariantIndex:
        """Build a VariantIndex out of an existing variant annotation dataset.

        Args:
            variant_annotation (VariantAnnotation): Variant annotation dataset
            study_locus (StudyLocus): Study locus dataset whose variants are intersected with the variant annotation dataset

        Returns:
            VariantIndex: Variant index dataset
        """
        # Keep only the annotated variants that occur in the study loci.
        locus_variants = study_locus.unique_variants_in_locus()
        annotated = variant_annotation.filter_by_variant_df(locus_variants)
        annotated_df = annotated.df

        # Columns copied over from the annotation without modification.
        passthrough = (
            "variantId",
            "chromosome",
            "position",
            "referenceAllele",
            "alternateAllele",
            "chromosomeB37",
            "positionB37",
            "alleleType",
            "alleleFrequencies",
            "inSilicoPredictors",
        )
        # Lift the nested VEP field to a top-level column.
        most_severe = f.col("vep.mostSevereConsequence").alias("mostSevereConsequence")
        # rsIds is an array that can be empty; empty arrays are converted to null.
        rs_ids = nullify_empty_array(f.col("rsIds")).alias("rsIds")

        selected = annotated_df.select(*passthrough, most_severe, rs_ids)
        # 400 partitions keyed on chromosome, each ordered by genomic coordinate.
        partitioned = selected.repartition(400, "chromosome")
        ordered = partitioned.sortWithinPartitions("chromosome", "position")

        return cls(_df=ordered, _schema=cls.get_schema())

from_variant_annotation(variant_annotation: VariantAnnotation, study_locus: StudyLocus) -> VariantIndex classmethod

Initialise VariantIndex from a pre-existing variant annotation dataset.

Parameters:

Name Type Description Default
variant_annotation VariantAnnotation

Variant annotation dataset

required
study_locus StudyLocus

Study locus dataset with the variants to intersect with the variant annotation dataset

required

Returns:

Name Type Description
VariantIndex VariantIndex

Variant index dataset

Source code in src/gentropy/dataset/variant_index.py
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
@classmethod
def from_variant_annotation(
    cls: type[VariantIndex],
    variant_annotation: VariantAnnotation,
    study_locus: StudyLocus,
) -> VariantIndex:
    """Build a VariantIndex out of an existing variant annotation dataset.

    Args:
        variant_annotation (VariantAnnotation): Variant annotation dataset
        study_locus (StudyLocus): Study locus dataset whose variants are intersected with the variant annotation dataset

    Returns:
        VariantIndex: Variant index dataset
    """
    # Keep only the annotated variants that occur in the study loci.
    locus_variants = study_locus.unique_variants_in_locus()
    annotated = variant_annotation.filter_by_variant_df(locus_variants)
    annotated_df = annotated.df

    # Columns copied over from the annotation without modification.
    passthrough = (
        "variantId",
        "chromosome",
        "position",
        "referenceAllele",
        "alternateAllele",
        "chromosomeB37",
        "positionB37",
        "alleleType",
        "alleleFrequencies",
        "inSilicoPredictors",
    )
    # Lift the nested VEP field to a top-level column.
    most_severe = f.col("vep.mostSevereConsequence").alias("mostSevereConsequence")
    # rsIds is an array that can be empty; empty arrays are converted to null.
    rs_ids = nullify_empty_array(f.col("rsIds")).alias("rsIds")

    selected = annotated_df.select(*passthrough, most_severe, rs_ids)
    # 400 partitions keyed on chromosome, each ordered by genomic coordinate.
    partitioned = selected.repartition(400, "chromosome")
    ordered = partitioned.sortWithinPartitions("chromosome", "position")

    return cls(_df=ordered, _schema=cls.get_schema())

get_schema() -> StructType classmethod

Provides the schema for the VariantIndex dataset.

Returns:

Name Type Description
StructType StructType

Schema for the VariantIndex dataset

Source code in src/gentropy/dataset/variant_index.py
27
28
29
30
31
32
33
34
@classmethod
def get_schema(cls: type[VariantIndex]) -> StructType:
    """Return the Spark schema describing the VariantIndex dataset.

    Returns:
        StructType: Schema for the VariantIndex dataset
    """
    schema = parse_spark_schema("variant_index.json")
    return schema

Schema

root
 |-- variantId: string (nullable = false)
 |-- chromosome: string (nullable = false)
 |-- position: integer (nullable = false)
 |-- referenceAllele: string (nullable = false)
 |-- alternateAllele: string (nullable = false)
 |-- chromosomeB37: string (nullable = true)
 |-- positionB37: integer (nullable = true)
 |-- alleleType: string (nullable = false)
 |-- alleleFrequencies: array (nullable = false)
 |    |-- element: struct (containsNull = true)
 |    |    |-- populationName: string (nullable = true)
 |    |    |-- alleleFrequency: double (nullable = true)
 |-- inSilicoPredictors: struct (nullable = false)
 |    |-- cadd: struct (nullable = true)
 |    |    |-- raw: float (nullable = true)
 |    |    |-- phred: float (nullable = true)
 |    |-- revelMax: double (nullable = true)
 |    |-- spliceaiDsMax: float (nullable = true)
 |    |-- pangolinLargestDs: double (nullable = true)
 |    |-- phylop: double (nullable = true)
 |    |-- siftMax: double (nullable = true)
 |    |-- polyphenMax: double (nullable = true)
 |-- mostSevereConsequence: string (nullable = true)
 |-- rsIds: array (nullable = true)
 |    |-- element: string (containsNull = true)