Skip to content

From VEP

List of features

gentropy.dataset.l2g_features.vep.VepMeanFeature dataclass

Bases: L2GFeature

Average functional consequence score among all variants in a credible set for a studyLocus/gene.

The mean severity score is weighted by the posterior probability of each variant.

Source code in src/gentropy/dataset/l2g_features/vep.py
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
class VepMeanFeature(L2GFeature):
    """Mean VEP consequence score of the credible set variants for each studyLocus/gene pair.

    Each variant's severity score is multiplied by its posterior probability before the scores are averaged.
    """

    feature_dependency_type = VariantIndex
    feature_name = "vepMean"

    @classmethod
    def compute(
        cls: type[VepMeanFeature],
        study_loci_to_annotate: StudyLocus | L2GGoldStandard,
        feature_dependency: dict[str, Any],
    ) -> VepMeanFeature:
        """Builds the `vepMean` feature dataset.

        Args:
            study_loci_to_annotate (StudyLocus | L2GGoldStandard): The study loci (or gold standard) to annotate
            feature_dependency (dict[str, Any]): Keyword arguments forwarded to `common_vep_feature_logic` (it expects a `variant_index` entry)

        Returns:
            VepMeanFeature: Feature dataset
        """
        # One row per studyLocusId/geneId with a single column named after the feature
        wide_df = common_vep_feature_logic(
            study_loci_to_annotate=study_loci_to_annotate,
            feature_name=cls.feature_name,
            **feature_dependency,
        )
        # Reshape into featureName/featureValue rows
        long_df = convert_from_wide_to_long(
            wide_df,
            id_vars=("studyLocusId", "geneId"),
            var_name="featureName",
            value_name="featureValue",
        )
        return cls(_df=long_df, _schema=cls.get_schema())

compute(study_loci_to_annotate: StudyLocus | L2GGoldStandard, feature_dependency: dict[str, Any]) -> VepMeanFeature classmethod

Computes the feature.

Parameters:

Name Type Description Default
study_loci_to_annotate StudyLocus | L2GGoldStandard

The dataset containing study loci that will be used for annotation

required
feature_dependency dict[str, Any]

Dataset that contains the functional consequence information

required

Returns:

Name Type Description
VepMeanFeature VepMeanFeature

Feature dataset

Source code in src/gentropy/dataset/l2g_features/vep.py
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
@classmethod
def compute(
    cls: type[VepMeanFeature],
    study_loci_to_annotate: StudyLocus | L2GGoldStandard,
    feature_dependency: dict[str, Any],
) -> VepMeanFeature:
    """Builds the feature dataset for the class' `feature_name`.

    Args:
        study_loci_to_annotate (StudyLocus | L2GGoldStandard): The study loci (or gold standard) to annotate
        feature_dependency (dict[str, Any]): Keyword arguments forwarded to `common_vep_feature_logic` (it expects a `variant_index` entry)

    Returns:
        VepMeanFeature: Feature dataset
    """
    # One row per studyLocusId/geneId with a single column named after the feature
    wide_df = common_vep_feature_logic(
        study_loci_to_annotate=study_loci_to_annotate,
        feature_name=cls.feature_name,
        **feature_dependency,
    )
    # Reshape into featureName/featureValue rows
    long_df = convert_from_wide_to_long(
        wide_df,
        id_vars=("studyLocusId", "geneId"),
        var_name="featureName",
        value_name="featureValue",
    )
    return cls(_df=long_df, _schema=cls.get_schema())

gentropy.dataset.l2g_features.vep.VepMeanNeighbourhoodFeature dataclass

Bases: L2GFeature

Mean functional consequence score among all variants in a credible set for a studyLocus/gene, relative to the maximum of that score across all protein coding genes in the vicinity.

The mean severity score is weighted by the posterior probability of each variant.

Source code in src/gentropy/dataset/l2g_features/vep.py
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
class VepMeanNeighbourhoodFeature(L2GFeature):
    """Mean functional consequence score among all variants in a credible set for a studyLocus/gene, relative to the maximum of that score across all protein coding genes in the vicinity.

    The mean severity score is weighted by the posterior probability of each variant.
    """

    feature_dependency_type = [VariantIndex, GeneIndex]
    feature_name = "vepMeanNeighbourhood"

    @classmethod
    def compute(
        cls: type[VepMeanNeighbourhoodFeature],
        study_loci_to_annotate: StudyLocus | L2GGoldStandard,
        feature_dependency: dict[str, Any],
    ) -> VepMeanNeighbourhoodFeature:
        """Computes the feature.

        Args:
            study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
            feature_dependency (dict[str, Any]): Keyword arguments forwarded to `common_neighbourhood_vep_feature_logic` (it expects `variant_index` and `gene_index` entries)

        Returns:
            VepMeanNeighbourhoodFeature: Feature dataset
        """
        return cls(
            _df=convert_from_wide_to_long(
                common_neighbourhood_vep_feature_logic(
                    # Passed by keyword, consistent with the non-neighbourhood VEP features
                    study_loci_to_annotate=study_loci_to_annotate,
                    feature_name=cls.feature_name,
                    **feature_dependency,
                ),
                id_vars=("studyLocusId", "geneId"),
                var_name="featureName",
                value_name="featureValue",
            ),
            _schema=cls.get_schema(),
        )

compute(study_loci_to_annotate: StudyLocus | L2GGoldStandard, feature_dependency: dict[str, Any]) -> VepMeanNeighbourhoodFeature classmethod

Computes the feature.

Parameters:

Name Type Description Default
study_loci_to_annotate StudyLocus | L2GGoldStandard

The dataset containing study loci that will be used for annotation

required
feature_dependency dict[str, Any]

Dataset that contains the functional consequence information

required

Returns:

Name Type Description
VepMeanNeighbourhoodFeature VepMeanNeighbourhoodFeature

Feature dataset

Source code in src/gentropy/dataset/l2g_features/vep.py
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
@classmethod
def compute(
    cls: type[VepMeanNeighbourhoodFeature],
    study_loci_to_annotate: StudyLocus | L2GGoldStandard,
    feature_dependency: dict[str, Any],
) -> VepMeanNeighbourhoodFeature:
    """Builds the feature dataset for the class' `feature_name`.

    Args:
        study_loci_to_annotate (StudyLocus | L2GGoldStandard): The study loci (or gold standard) to annotate
        feature_dependency (dict[str, Any]): Keyword arguments forwarded to `common_neighbourhood_vep_feature_logic` (it expects `variant_index` and `gene_index` entries)

    Returns:
        VepMeanNeighbourhoodFeature: Feature dataset
    """
    # One row per studyLocusId/geneId with a single column named after the feature
    wide_df = common_neighbourhood_vep_feature_logic(
        study_loci_to_annotate,
        feature_name=cls.feature_name,
        **feature_dependency,
    )
    # Reshape into featureName/featureValue rows
    long_df = convert_from_wide_to_long(
        wide_df,
        id_vars=("studyLocusId", "geneId"),
        var_name="featureName",
        value_name="featureValue",
    )
    return cls(_df=long_df, _schema=cls.get_schema())

gentropy.dataset.l2g_features.vep.VepMaximumFeature dataclass

Bases: L2GFeature

Maximum functional consequence score among all variants in a credible set for a studyLocus/gene.

Source code in src/gentropy/dataset/l2g_features/vep.py
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
class VepMaximumFeature(L2GFeature):
    """Highest VEP consequence score found among the credible set variants for each studyLocus/gene pair."""

    feature_dependency_type = VariantIndex
    feature_name = "vepMaximum"

    @classmethod
    def compute(
        cls: type[VepMaximumFeature],
        study_loci_to_annotate: StudyLocus | L2GGoldStandard,
        feature_dependency: dict[str, Any],
    ) -> VepMaximumFeature:
        """Builds the `vepMaximum` feature dataset.

        Args:
            study_loci_to_annotate (StudyLocus | L2GGoldStandard): The study loci (or gold standard) to annotate
            feature_dependency (dict[str, Any]): Keyword arguments forwarded to `common_vep_feature_logic` (it expects a `variant_index` entry)

        Returns:
            VepMaximumFeature: Feature dataset
        """
        # One row per studyLocusId/geneId with a single column named after the feature
        wide_df = common_vep_feature_logic(
            study_loci_to_annotate=study_loci_to_annotate,
            feature_name=cls.feature_name,
            **feature_dependency,
        )
        # Reshape into featureName/featureValue rows
        long_df = convert_from_wide_to_long(
            wide_df,
            id_vars=("studyLocusId", "geneId"),
            var_name="featureName",
            value_name="featureValue",
        )
        return cls(_df=long_df, _schema=cls.get_schema())

compute(study_loci_to_annotate: StudyLocus | L2GGoldStandard, feature_dependency: dict[str, Any]) -> VepMaximumFeature classmethod

Computes the feature.

Parameters:

Name Type Description Default
study_loci_to_annotate StudyLocus | L2GGoldStandard

The dataset containing study loci that will be used for annotation

required
feature_dependency dict[str, Any]

Dataset that contains the functional consequence information

required

Returns:

Name Type Description
VepMaximumFeature VepMaximumFeature

Feature dataset

Source code in src/gentropy/dataset/l2g_features/vep.py
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
@classmethod
def compute(
    cls: type[VepMaximumFeature],
    study_loci_to_annotate: StudyLocus | L2GGoldStandard,
    feature_dependency: dict[str, Any],
) -> VepMaximumFeature:
    """Builds the feature dataset for the class' `feature_name`.

    Args:
        study_loci_to_annotate (StudyLocus | L2GGoldStandard): The study loci (or gold standard) to annotate
        feature_dependency (dict[str, Any]): Keyword arguments forwarded to `common_vep_feature_logic` (it expects a `variant_index` entry)

    Returns:
        VepMaximumFeature: Feature dataset
    """
    # One row per studyLocusId/geneId with a single column named after the feature
    wide_df = common_vep_feature_logic(
        study_loci_to_annotate=study_loci_to_annotate,
        feature_name=cls.feature_name,
        **feature_dependency,
    )
    # Reshape into featureName/featureValue rows
    long_df = convert_from_wide_to_long(
        wide_df,
        id_vars=("studyLocusId", "geneId"),
        var_name="featureName",
        value_name="featureValue",
    )
    return cls(_df=long_df, _schema=cls.get_schema())

gentropy.dataset.l2g_features.vep.VepMaximumNeighbourhoodFeature dataclass

Bases: L2GFeature

Maximum functional consequence score among all variants in a credible set for a studyLocus/gene, relative to the maximum of that score across all protein coding genes in the vicinity.

Source code in src/gentropy/dataset/l2g_features/vep.py
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
class VepMaximumNeighbourhoodFeature(L2GFeature):
    """Maximum functional consequence score among all variants in a credible set for a studyLocus/gene, relative to the maximum of that score across all protein coding genes in the vicinity."""

    feature_dependency_type = [VariantIndex, GeneIndex]
    feature_name = "vepMaximumNeighbourhood"

    @classmethod
    def compute(
        cls: type[VepMaximumNeighbourhoodFeature],
        study_loci_to_annotate: StudyLocus | L2GGoldStandard,
        feature_dependency: dict[str, Any],
    ) -> VepMaximumNeighbourhoodFeature:
        """Computes the feature.

        Args:
            study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
            feature_dependency (dict[str, Any]): Keyword arguments forwarded to `common_neighbourhood_vep_feature_logic` (it expects `variant_index` and `gene_index` entries)

        Returns:
            VepMaximumNeighbourhoodFeature: Feature dataset
        """
        return cls(
            _df=convert_from_wide_to_long(
                common_neighbourhood_vep_feature_logic(
                    # Passed by keyword, consistent with the non-neighbourhood VEP features
                    study_loci_to_annotate=study_loci_to_annotate,
                    feature_name=cls.feature_name,
                    **feature_dependency,
                ),
                id_vars=("studyLocusId", "geneId"),
                var_name="featureName",
                value_name="featureValue",
            ),
            _schema=cls.get_schema(),
        )

compute(study_loci_to_annotate: StudyLocus | L2GGoldStandard, feature_dependency: dict[str, Any]) -> VepMaximumNeighbourhoodFeature classmethod

Computes the feature.

Parameters:

Name Type Description Default
study_loci_to_annotate StudyLocus | L2GGoldStandard

The dataset containing study loci that will be used for annotation

required
feature_dependency dict[str, Any]

Dataset that contains the functional consequence information

required

Returns:

Name Type Description
VepMaximumNeighbourhoodFeature VepMaximumNeighbourhoodFeature

Feature dataset

Source code in src/gentropy/dataset/l2g_features/vep.py
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
@classmethod
def compute(
    cls: type[VepMaximumNeighbourhoodFeature],
    study_loci_to_annotate: StudyLocus | L2GGoldStandard,
    feature_dependency: dict[str, Any],
) -> VepMaximumNeighbourhoodFeature:
    """Builds the feature dataset for the class' `feature_name`.

    Args:
        study_loci_to_annotate (StudyLocus | L2GGoldStandard): The study loci (or gold standard) to annotate
        feature_dependency (dict[str, Any]): Keyword arguments forwarded to `common_neighbourhood_vep_feature_logic` (it expects `variant_index` and `gene_index` entries)

    Returns:
        VepMaximumNeighbourhoodFeature: Feature dataset
    """
    # One row per studyLocusId/geneId with a single column named after the feature
    wide_df = common_neighbourhood_vep_feature_logic(
        study_loci_to_annotate,
        feature_name=cls.feature_name,
        **feature_dependency,
    )
    # Reshape into featureName/featureValue rows
    long_df = convert_from_wide_to_long(
        wide_df,
        id_vars=("studyLocusId", "geneId"),
        var_name="featureName",
        value_name="featureValue",
    )
    return cls(_df=long_df, _schema=cls.get_schema())

Common logic

gentropy.dataset.l2g_features.vep.common_vep_feature_logic(study_loci_to_annotate: L2GGoldStandard | StudyLocus, *, variant_index: VariantIndex, feature_name: str) -> DataFrame

Extracts variant severity score computed from VEP.

Parameters:

Name Type Description Default
study_loci_to_annotate L2GGoldStandard | StudyLocus

The dataset containing study loci that will be used for annotation

required
variant_index VariantIndex

The dataset containing functional consequence information

required
feature_name str

The name of the feature

required

Returns:

Name Type Description
DataFrame DataFrame

Feature dataset

Source code in src/gentropy/dataset/l2g_features/vep.py
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
def common_vep_feature_logic(
    study_loci_to_annotate: L2GGoldStandard | StudyLocus,
    *,
    variant_index: VariantIndex,
    feature_name: str,
) -> DataFrame:
    """Extracts variant severity score computed from VEP.

    The aggregation is selected from the feature name: names containing "Maximum" take the highest severity score
    per studyLocus/gene; names containing "Mean" take the mean of the severity scores after multiplying each one by
    the posterior probability of its variant ("Maximum" takes precedence if both are present).

    Args:
        study_loci_to_annotate (L2GGoldStandard | StudyLocus): The dataset containing study loci that will be used for annotation
        variant_index (VariantIndex): The dataset containing functional consequence information
        feature_name (str): The name of the feature

    Returns:
        DataFrame: Feature dataset

    Raises:
        ValueError: If `feature_name` contains neither "Maximum" nor "Mean"
        TypeError: If `study_loci_to_annotate` is neither a StudyLocus nor an L2GGoldStandard
    """
    # Fail fast with a clear message instead of an UnboundLocalError on `agg_expr` further down
    if "Maximum" not in feature_name and "Mean" not in feature_name:
        raise ValueError(
            f"Unsupported VEP feature name '{feature_name}': it must contain 'Maximum' or 'Mean'."
        )

    # Variant/Target/Severity dataframe
    consequences_dataset = variant_index.df.withColumn(
        "transcriptConsequence", f.explode("transcriptConsequences")
    ).select(
        "variantId",
        f.col("transcriptConsequence.targetId").alias("geneId"),
        f.col("transcriptConsequence.consequenceScore").alias("severityScore"),
    )
    if isinstance(study_loci_to_annotate, StudyLocus):
        # One row per variant in the credible set, annotated with its posterior probability
        variants_df = (
            study_loci_to_annotate.df.withColumn(
                "variantInLocus", f.explode_outer("locus")
            )
            .select(
                "studyLocusId",
                f.col("variantInLocus.variantId").alias("variantId"),
                f.col("variantInLocus.posteriorProbability").alias(
                    "posteriorProbability"
                ),
            )
            .join(consequences_dataset, "variantId")
        )
    elif isinstance(study_loci_to_annotate, L2GGoldStandard):
        # Gold standards carry a single variant per row; it is given a posterior probability of 1.0
        variants_df = study_loci_to_annotate.df.select(
            "studyLocusId", "variantId", f.lit(1.0).alias("posteriorProbability")
        ).join(consequences_dataset, "variantId")
    else:
        # Previously this fell through and failed later with an UnboundLocalError on `variants_df`
        raise TypeError(
            "study_loci_to_annotate must be a StudyLocus or an L2GGoldStandard, "
            f"got {type(study_loci_to_annotate).__name__}."
        )

    if "Maximum" in feature_name:
        agg_expr = f.max("severityScore")
    else:
        # NOTE: this is the arithmetic mean of the PP-scaled scores; it is not normalised by the sum of the
        # posterior probabilities.
        variants_df = variants_df.withColumn(
            "weightedScore", f.col("severityScore") * f.col("posteriorProbability")
        )
        agg_expr = f.mean("weightedScore")
    return variants_df.groupBy("studyLocusId", "geneId").agg(
        agg_expr.alias(feature_name)
    )

gentropy.dataset.l2g_features.vep.common_neighbourhood_vep_feature_logic(study_loci_to_annotate: StudyLocus | L2GGoldStandard, *, variant_index: VariantIndex, gene_index: GeneIndex, feature_name: str) -> DataFrame

Computes the VEP severity feature for each protein coding gene, expressed relative to the maximum value of that feature among the protein coding genes near the locus.

Parameters:

Name Type Description Default
study_loci_to_annotate StudyLocus | L2GGoldStandard

The dataset containing study loci that will be used for annotation

required
variant_index VariantIndex

The dataset containing functional consequence information

required
gene_index GeneIndex

The dataset containing the gene biotype

required
feature_name str

The name of the feature

required

Returns:

Name Type Description
DataFrame DataFrame

Feature dataset

Source code in src/gentropy/dataset/l2g_features/vep.py
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
def common_neighbourhood_vep_feature_logic(
    study_loci_to_annotate: StudyLocus | L2GGoldStandard,
    *,
    variant_index: VariantIndex,
    gene_index: GeneIndex,
    feature_name: str,
) -> DataFrame:
    """Scales the VEP severity feature of each protein coding gene by the highest value found around the locus.

    The local (non-neighbourhood) feature is first computed with `common_vep_feature_logic`. Only protein coding
    genes are kept, and each gene's value is divided by the maximum value among the kept genes of the same
    studyLocus. When that maximum is null or zero the feature is set to 0.0.

    Args:
        study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci that will be used for annotation
        variant_index (VariantIndex): The dataset containing functional consequence information
        gene_index (GeneIndex): The dataset containing the gene biotype
        feature_name (str): The name of the feature

    Returns:
        DataFrame: Feature dataset

    Raises:
        ValueError: If `feature_name` does not contain "Neighbourhood"
    """
    # Without the suffix the local and neighbourhood columns would share a name and the final `drop`
    # would remove the feature column itself.
    if "Neighbourhood" not in feature_name:
        raise ValueError(
            f"Neighbourhood feature names must contain 'Neighbourhood', got '{feature_name}'."
        )
    local_feature_name = feature_name.replace("Neighbourhood", "")
    local_metric = common_vep_feature_logic(
        study_loci_to_annotate,
        feature_name=local_feature_name,
        variant_index=variant_index,
    )
    return (
        local_metric
        # Restrict to protein coding genes before computing the regional reference value
        # (non protein coding genes in the vicinity are excluded see #3552)
        .join(
            gene_index.df.filter(f.col("biotype") == "protein_coding").select("geneId"),
            "geneId",
            "inner",
        )
        # Highest local score among the remaining genes of each studyLocus
        .withColumn(
            "regional_max",
            f.max(local_feature_name).over(Window.partitionBy("studyLocusId")),
        )
        .withColumn(
            feature_name,
            # The guard ensures the denominator is neither null nor zero, so no coalesce is needed
            f.when(
                (f.col("regional_max").isNotNull()) & (f.col("regional_max") != 0.0),
                f.col(local_feature_name) / f.col("regional_max"),
            ).otherwise(f.lit(0.0)),
        )
        .drop("regional_max", local_feature_name)
    )