Variant index

gentropy.dataset.variant_index.VariantIndex dataclass

Bases: Dataset

Dataset for representing variants and the methods applied to them.

Source code in src/gentropy/dataset/variant_index.py
@dataclass
class VariantIndex(Dataset):
    """Dataset for representing variants and methods applied on them."""

    def __post_init__(self: VariantIndex) -> None:
        """Forcing the presence of empty arrays even if the schema allows missing values.

        To bring in annotations from other sources, we use the `array_union()` function. However, it
        assumes both columns contain arrays (not just that the schema allows arrays!). If one of the
        arrays is null, the union is nullified. This needs to be avoided.
        """
        # Calling dataset's post init to validate schema:
        super().__post_init__()

        # Composing a list of expressions to replace nulls with empty arrays where the schema defines an array column:
        array_columns = {
            column.name: f.when(f.col(column.name).isNull(), f.array()).otherwise(
                f.col(column.name)
            )
            for column in self.df.schema
            if "ArrayType" in column.dataType.__str__()
        }

        # Not returning a new dataset; updating the dataframe in place:
        self.df = self.df.withColumns(array_columns)

    @classmethod
    def get_schema(cls: type[VariantIndex]) -> StructType:
        """Provides the schema for the variant index dataset.

        Returns:
            StructType: Schema for the VariantIndex dataset
        """
        return parse_spark_schema("variant_index.json")

    @classmethod
    def assign_variant_id(
        cls: type[VariantIndex],
    ) -> Column:
        """Creates a column with the variant ID that will be used to index the variant index.

        This is to ensure that the variant ID is unique and not too long.

        Returns:
            Column: Column with the variant ID, containing the hash if the variant ID is 100 characters or longer
        """
        return (
            f.when(
                f.length(f.col("variantId")) >= 100,
                f.concat(
                    f.lit("otvar_"),
                    f.xxhash64(f.col("variantId")).cast("string"),
                ),
            )
            .otherwise(f.col("variantId"))
            .alias("variantId")
        )

    @staticmethod
    def hash_long_variant_ids(
        variant_id: Column, chromosome: Column, position: Column, threshold: int = 100
    ) -> Column:
        """Hash long variant identifiers.

        Args:
            variant_id (Column): Column containing variant identifiers.
            chromosome (Column): Chromosome column.
            position (Column): Position column.
            threshold (int): Above this limit, a hash will be generated.

        Returns:
            Column: Hashed variant identifiers for long variants.

        Examples:
            >>> (
            ...    spark.createDataFrame([('v_short', 'x', 23),('v_looooooong', '23', 23), ('no_chrom', None, None), (None, None, None)], ['variantId', 'chromosome', 'position'])
            ...    .select('variantId', VariantIndex.hash_long_variant_ids(f.col('variantId'), f.col('chromosome'), f.col('position'), 10).alias('hashedVariantId'))
            ...    .show(truncate=False)
            ... )
            +------------+--------------------------------------------+
            |variantId   |hashedVariantId                             |
            +------------+--------------------------------------------+
            |v_short     |v_short                                     |
            |v_looooooong|OTVAR_23_23_3749d019d645894770c364992ae70a05|
            |no_chrom    |OTVAR_41acfcd7d4fd523b33600b504914ef25      |
            |null        |null                                        |
            +------------+--------------------------------------------+
            <BLANKLINE>
        """
        return (
            # If either the position or the chromosome is missing, we hash the identifier:
            f.when(
                chromosome.isNull() | position.isNull(),
                f.concat(
                    f.lit("OTVAR_"),
                    f.md5(variant_id).cast("string"),
                ),
            )
            # If chromosome and position are given, but alleles are too long, create hash:
            .when(
                f.length(variant_id) > threshold,
                f.concat_ws(
                    "_",
                    f.lit("OTVAR"),
                    chromosome,
                    position,
                    f.md5(variant_id).cast("string"),
                ),
            )
            # Missing and regular variant identifiers are left unchanged:
            .otherwise(variant_id)
        )

    def add_annotation(
        self: VariantIndex, annotation_source: VariantIndex
    ) -> VariantIndex:
        """Import annotation from an other variant index dataset.

        At this point the annotation can be extended with extra cross-references,
        in-silico predictions and allele frequencies.

        Args:
            annotation_source (VariantIndex): Annotation to add to the dataset

        Returns:
            VariantIndex: VariantIndex dataset with the annotation added
        """
        # Prefix for renaming columns:
        prefix = "annotation_"

        # Generate select expressions to merge and import columns from the annotation:
        select_expressions = []

        # Collect columns by iterating over the variant index schema:
        for field in VariantIndex.get_schema():
            column = field.name

            # If an annotation column can be found in both datasets:
            if (column in self.df.columns) and (column in annotation_source.df.columns):
                # Arrays are merged:
                if "ArrayType" in field.dataType.__str__():
                    select_expressions.append(
                        safe_array_union(
                            f.col(column), f.col(f"{prefix}{column}")
                        ).alias(column)
                    )
                # Non-array columns are coalesced:
                else:
                    select_expressions.append(
                        f.coalesce(f.col(column), f.col(f"{prefix}{column}")).alias(
                            column
                        )
                    )
            # If the column is only found in the annotation dataset rename it:
            elif column in annotation_source.df.columns:
                select_expressions.append(f.col(f"{prefix}{column}").alias(column))
            # If the column is only found in the main dataset:
            elif column in self.df.columns:
                select_expressions.append(f.col(column))
            # VariantIndex columns not found in either dataset are ignored.

        # Join the annotation to the dataset:
        return VariantIndex(
            _df=(
                f.broadcast(self.df)
                .join(
                    rename_all_columns(annotation_source.df, prefix),
                    on=[f.col("variantId") == f.col(f"{prefix}variantId")],
                    how="left",
                )
                .select(*select_expressions)
            ),
            _schema=self.schema,
        )

    def max_maf(self: VariantIndex) -> Column:
        """Maximum minor allele frequency accross all populations assuming all variants biallelic.

        Returns:
            Column: Maximum minor allele frequency across all populations.

        Raises:
            ValueError: Allele frequencies are not present in the dataset.
        """
        if "alleleFrequencies" not in self.df.columns:
            raise ValueError("Allele frequencies are not present in the dataset.")

        return f.array_max(
            f.transform(
                self.df.alleleFrequencies,
                lambda af: f.when(
                    af.alleleFrequency > 0.5, 1 - af.alleleFrequency
                ).otherwise(af.alleleFrequency),
            )
        )

    def filter_by_variant(self: VariantIndex, df: DataFrame) -> VariantIndex:
        """Filter variant annotation dataset by a variant dataframe.

        Args:
            df (DataFrame): A dataframe of variants

        Returns:
            VariantIndex: A filtered variant annotation dataset
        """
        join_columns = ["variantId", "chromosome"]

        assert all(
            col in df.columns for col in join_columns
        ), "The variant dataframe must contain the columns 'variantId' and 'chromosome'."

        return VariantIndex(
            _df=self._df.join(
                f.broadcast(df.select(*join_columns).distinct()),
                on=join_columns,
                how="inner",
            ),
            _schema=self.schema,
        )

    def get_transcript_consequence_df(
        self: VariantIndex, gene_index: GeneIndex | None = None
    ) -> DataFrame:
        """Dataframe of exploded transcript consequences.

        Optionally the transcript consequences can be reduced to the universe of a gene index.

        Args:
            gene_index (GeneIndex | None): A gene index. Defaults to None.

        Returns:
            DataFrame: A dataframe exploded by transcript consequences, with the columns variantId, chromosome, position, transcriptConsequence and geneId
        """
        # exploding the array removes records without VEP annotation
        transcript_consequences = self.df.withColumn(
            "transcriptConsequence", f.explode("transcriptConsequences")
        ).select(
            "variantId",
            "chromosome",
            "position",
            "transcriptConsequence",
            f.col("transcriptConsequence.targetId").alias("geneId"),
        )
        if gene_index:
            transcript_consequences = transcript_consequences.join(
                f.broadcast(gene_index.df),
                on=["chromosome", "geneId"],
            )
        return transcript_consequences

    def get_distance_to_tss(
        self: VariantIndex,
        gene_index: GeneIndex,
        max_distance: int = 500_000,
    ) -> V2G:
        """Extracts variant to gene assignments for variants falling within a window of a gene's TSS.

        Args:
            gene_index (GeneIndex): A gene index to filter by.
            max_distance (int): The maximum distance from the TSS to consider. Defaults to 500_000.

        Returns:
            V2G: variant to gene assignments with their distance to the TSS
        """
        return V2G(
            _df=(
                self.df.alias("variant")
                .join(
                    f.broadcast(gene_index.locations_lut()).alias("gene"),
                    on=[
                        f.col("variant.chromosome") == f.col("gene.chromosome"),
                        f.abs(f.col("variant.position") - f.col("gene.tss"))
                        <= max_distance,
                    ],
                    how="inner",
                )
                .withColumn(
                    "distance", f.abs(f.col("variant.position") - f.col("gene.tss"))
                )
                .withColumn(
                    "inverse_distance",
                    max_distance - f.col("distance"),
                )
                .transform(lambda df: normalise_column(df, "inverse_distance", "score"))
                .select(
                    "variantId",
                    f.col("variant.chromosome").alias("chromosome"),
                    "distance",
                    "geneId",
                    "score",
                    f.lit("distance").alias("datatypeId"),
                    f.lit("canonical_tss").alias("datasourceId"),
                )
            ),
            _schema=V2G.get_schema(),
        )

    def get_plof_v2g(self: VariantIndex, gene_index: GeneIndex) -> V2G:
        """Creates a dataset with variant to gene assignments with a flag indicating if the variant is predicted to be a loss-of-function variant by the LOFTEE algorithm.

        Optionally the transcript consequences can be reduced to the universe of a gene index.

        Args:
            gene_index (GeneIndex): A gene index to filter by.

        Returns:
            V2G: variant to gene assignments from the LOFTEE algorithm
        """
        return V2G(
            _df=(
                self.get_transcript_consequence_df(gene_index)
                .filter(f.col("transcriptConsequence.lofteePrediction").isNotNull())
                .withColumn(
                    "isHighQualityPlof",
                    f.when(
                        f.col("transcriptConsequence.lofteePrediction") == "HC", True
                    ).when(
                        f.col("transcriptConsequence.lofteePrediction") == "LC", False
                    ),
                )
                .withColumn(
                    "score",
                    f.when(f.col("isHighQualityPlof"), 1.0).when(
                        ~f.col("isHighQualityPlof"), 0
                    ),
                )
                .select(
                    "variantId",
                    "chromosome",
                    "geneId",
                    "isHighQualityPlof",
                    f.col("score"),
                    f.lit("vep").alias("datatypeId"),
                    f.lit("loftee").alias("datasourceId"),
                )
            ),
            _schema=V2G.get_schema(),
        )

    def get_most_severe_transcript_consequence(
        self: VariantIndex,
        vep_consequences: DataFrame,
        gene_index: GeneIndex,
    ) -> V2G:
        """Creates a dataset with variant to gene assignments based on VEP's predicted consequence of the transcript.

        Optionally the transcript consequences can be reduced to the universe of a gene index.

        Args:
            vep_consequences (DataFrame): A dataframe of VEP consequences
            gene_index (GeneIndex): A gene index to filter by.

        Returns:
            V2G: High and medium severity variant to gene assignments
        """
        return V2G(
            _df=self.get_transcript_consequence_df(gene_index)
            .select(
                "variantId",
                "chromosome",
                f.col("transcriptConsequence.targetId").alias("geneId"),
                f.explode(
                    "transcriptConsequence.variantFunctionalConsequenceIds"
                ).alias("variantFunctionalConsequenceId"),
                f.lit("vep").alias("datatypeId"),
                f.lit("variantConsequence").alias("datasourceId"),
            )
            .join(
                f.broadcast(vep_consequences),
                on="variantFunctionalConsequenceId",
                how="inner",
            )
            .drop("label")
            .filter(f.col("score") != 0)
            # A variant can have multiple predicted consequences on a transcript, the most severe one is selected
            .transform(
                lambda df: get_record_with_maximum_value(
                    df, ["variantId", "geneId"], "score"
                )
            ),
            _schema=V2G.get_schema(),
        )
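
The null-to-empty-array step in `__post_init__` matters because Spark's `array_union` returns null when either input is null. A minimal sketch, not part of the module, with made-up data and column names:

# Illustrative sketch only; `left` and `right` are hypothetical columns.
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(["a"], ["b"]), (["a"], None)], ["left", "right"])

# The union is nullified when either side is null:
df.select(f.array_union("left", "right").alias("union")).show()

# Replacing nulls with empty arrays first (as __post_init__ does) keeps the union intact:
df.select(
    f.array_union("left", f.coalesce("right", f.array())).alias("union")
).show()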

add_annotation(annotation_source: VariantIndex) -> VariantIndex

Import annotation from another variant index dataset.

At this point the annotation can be extended with extra cross-references, in-silico predictions and allele frequencies.

Parameters:

    annotation_source (VariantIndex): Annotation to add to the dataset. Required.

Returns:

    VariantIndex: VariantIndex dataset with the annotation added.
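
A minimal usage sketch, assuming `variant_index` and `vep_annotation` are VariantIndex instances built elsewhere:

# Sketch only; both inputs are assumed to be prebuilt VariantIndex instances.
annotated = variant_index.add_annotation(vep_annotation)

# Array columns (e.g. dbXrefs) are merged, while scalar columns keep the
# existing value and fall back to the annotation via coalesce:
annotated.df.select("variantId", "mostSevereConsequenceId", "dbXrefs").show(5)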

assign_variant_id() -> Column classmethod

Creates a column with the variant ID that will be used to index the variant index.

This is to ensure that the variant ID is unique and not too long.

Returns:

    Column: Column with the variant ID, containing the hash if the variant ID is 100 characters or longer.
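
A minimal usage sketch, assuming `df` is a DataFrame that already has a `variantId` column:

from gentropy.dataset.variant_index import VariantIndex

# Identifiers of 100 or more characters are replaced by an otvar_-prefixed
# xxhash64 digest; shorter identifiers pass through unchanged.
df = df.withColumn("variantId", VariantIndex.assign_variant_id())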

filter_by_variant(df: DataFrame) -> VariantIndex

Filter variant annotation dataset by a variant dataframe.

Parameters:

    df (DataFrame): A dataframe of variants. Required.

Returns:

    VariantIndex: A filtered variant annotation dataset.
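
A minimal usage sketch; `variant_index` and `spark` are assumed to exist, and the variant identifier is made up:

# Keep only the variants referenced by a small lookup dataframe:
variants_of_interest = spark.createDataFrame(
    [("1_154453788_C_T", "1")], ["variantId", "chromosome"]
)
filtered = variant_index.filter_by_variant(variants_of_interest)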

get_distance_to_tss(gene_index: GeneIndex, max_distance: int = 500000) -> V2G

Extracts variant to gene assignments for variants falling within a window of a gene's TSS.

Parameters:

    gene_index (GeneIndex): A gene index to filter by. Required.
    max_distance (int): The maximum distance from the TSS to consider. Defaults to 500_000.

Returns:

    V2G: Variant to gene assignments with their distance to the TSS.
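
A minimal usage sketch, assuming prebuilt `variant_index` and `gene_index` datasets:

# Narrow the window to 250 kb; the score is the normalised inverse distance.
v2g_distance = variant_index.get_distance_to_tss(gene_index, max_distance=250_000)
v2g_distance.df.select("variantId", "geneId", "distance", "score").show(5)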

get_most_severe_transcript_consequence(vep_consequences: DataFrame, gene_index: GeneIndex) -> V2G

Creates a dataset with variant to gene assignments based on VEP's predicted consequence of the transcript.

Optionally the transcript consequences can be reduced to the universe of a gene index.

Parameters:

    vep_consequences (DataFrame): A dataframe of VEP consequences. Required.
    gene_index (GeneIndex): A gene index to filter by. Required.

Returns:

    V2G: High and medium severity variant to gene assignments.
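
A minimal usage sketch; `vep_consequences` is assumed to be a lookup dataframe carrying `variantFunctionalConsequenceId` and `score` columns, as the join in the source implies:

# One row per (variantId, geneId) pair, keeping the most severe consequence:
v2g_consequence = variant_index.get_most_severe_transcript_consequence(
    vep_consequences, gene_index
)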

get_plof_v2g(gene_index: GeneIndex) -> V2G

Creates a dataset of variant to gene assignments, with a flag indicating whether the variant is predicted to be loss-of-function by the LOFTEE algorithm.

Optionally the transcript consequences can be reduced to the universe of a gene index.

Parameters:

    gene_index (GeneIndex): A gene index to filter by. Required.

Returns:

    V2G: Variant to gene assignments from the LOFTEE algorithm.
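
A minimal usage sketch, assuming prebuilt `variant_index` and `gene_index` datasets:

# HC ("high confidence") LOFTEE calls score 1.0; LC calls score 0.0.
plof_v2g = variant_index.get_plof_v2g(gene_index)
plof_v2g.df.filter("isHighQualityPlof").show(5)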

get_schema() -> StructType classmethod

Provides the schema for the variant index dataset.

Returns:

    StructType: Schema for the VariantIndex dataset.
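
A quick sketch for inspecting the expected schema without instantiating the dataset:

from gentropy.dataset.variant_index import VariantIndex

# The schema is parsed from the variant_index.json definition shipped with the package.
print(VariantIndex.get_schema().simpleString())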

get_transcript_consequence_df(gene_index: GeneIndex | None = None) -> DataFrame

Dataframe of exploded transcript consequences.

Optionally the transcript consequences can be reduced to the universe of a gene index.

Parameters:

    gene_index (GeneIndex | None): A gene index. Defaults to None.

Returns:

    DataFrame: A dataframe exploded by transcript consequences, with the columns variantId, chromosome, position, transcriptConsequence and geneId.
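
A minimal usage sketch, assuming a prebuilt `variant_index` (the gene index argument is optional):

# Count the exploded transcript consequences per gene:
transcript_df = variant_index.get_transcript_consequence_df()
transcript_df.groupBy("geneId").count().show(5)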

hash_long_variant_ids(variant_id: Column, chromosome: Column, position: Column, threshold: int = 100) -> Column staticmethod

Hash long variant identifiers.

Parameters:

    variant_id (Column): Column containing variant identifiers. Required.
    chromosome (Column): Chromosome column. Required.
    position (Column): Position column. Required.
    threshold (int): Above this limit, a hash will be generated. Defaults to 100.

Returns:

    Column: Hashed variant identifiers for long variants.

Examples:

>>> (
...    spark.createDataFrame([('v_short', 'x', 23),('v_looooooong', '23', 23), ('no_chrom', None, None), (None, None, None)], ['variantId', 'chromosome', 'position'])
...    .select('variantId', VariantIndex.hash_long_variant_ids(f.col('variantId'), f.col('chromosome'), f.col('position'), 10).alias('hashedVariantId'))
...    .show(truncate=False)
... )
+------------+--------------------------------------------+
|variantId   |hashedVariantId                             |
+------------+--------------------------------------------+
|v_short     |v_short                                     |
|v_looooooong|OTVAR_23_23_3749d019d645894770c364992ae70a05|
|no_chrom    |OTVAR_41acfcd7d4fd523b33600b504914ef25      |
|null        |null                                        |
+------------+--------------------------------------------+

max_maf() -> Column

Maximum minor allele frequency across all populations, assuming all variants are biallelic.

Returns:

    Column: Maximum minor allele frequency across all populations.

Raises:

    ValueError: Allele frequencies are not present in the dataset.
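
A minimal usage sketch; the returned Column is bound to the index's own dataframe:

# Annotate each variant with its maximum minor allele frequency:
variant_index.df.select(
    "variantId", variant_index.max_maf().alias("maxMaf")
).show(5)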


Schema

root
 |-- variantId: string (nullable = false)
 |-- chromosome: string (nullable = false)
 |-- position: integer (nullable = false)
 |-- referenceAllele: string (nullable = false)
 |-- alternateAllele: string (nullable = false)
 |-- inSilicoPredictors: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- method: string (nullable = true)
 |    |    |-- assessment: string (nullable = true)
 |    |    |-- score: float (nullable = true)
 |    |    |-- assessmentFlag: string (nullable = true)
 |    |    |-- targetId: string (nullable = true)
 |-- mostSevereConsequenceId: string (nullable = true)
 |-- transcriptConsequences: array (nullable = true)
 |    |-- element: struct (containsNull = false)
 |    |    |-- variantFunctionalConsequenceIds: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- aminoAcidChange: string (nullable = true)
 |    |    |-- uniprotAccessions: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)
 |    |    |-- isEnsemblCanonical: boolean (nullable = false)
 |    |    |-- codons: string (nullable = true)
 |    |    |-- distance: long (nullable = true)
 |    |    |-- targetId: string (nullable = true)
 |    |    |-- impact: string (nullable = true)
 |    |    |-- lofteePrediction: string (nullable = true)
 |    |    |-- siftPrediction: float (nullable = true)
 |    |    |-- polyphenPrediction: float (nullable = true)
 |    |    |-- transcriptId: string (nullable = true)
 |-- rsIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- alleleFrequencies: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- populationName: string (nullable = true)
 |    |    |-- alleleFrequency: double (nullable = true)
 |-- dbXrefs: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: string (nullable = true)
 |    |    |-- source: string (nullable = true)