Skip to content

Epigenetic regulatory region features

List of features

gentropy.dataset.l2g_features.intervals.E2gMeanFeature dataclass

Bases: L2GFeature

e2gMean feature from E2G intervals.

Source code in src/gentropy/dataset/l2g_features/intervals.py
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
class E2gMeanFeature(L2GFeature):
    """L2G feature exposing the `e2gMean` column of the wide E2G interval table."""

    feature_dependency_type = Intervals
    feature_name = "e2gMean"

    @classmethod
    def compute(
        cls: type[E2gMeanFeature],
        study_loci_to_annotate: StudyLocus | L2GGoldStandard,
        feature_dependency: dict[str, Any],
    ) -> E2gMeanFeature:
        """Build the e2gMean feature in long format.

        Args:
            study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci
                that will be used for annotation
            feature_dependency (dict[str, Any]): Dependency mapping forwarded to `get_or_make_e2g_wide`,
                which obtains the wide e2g table from it

        Returns:
            E2gMeanFeature: Feature wrapping the long-format frame built from the wide e2g table.
        """
        key_columns = ("studyLocusId", "geneId")
        feature_column = cls.feature_name

        # The wide table is requested under this feature's own name, so the
        # column selected below is the base (non-neighbourhood) score.
        e2g_wide = get_or_make_e2g_wide(
            study_loci_to_annotate,
            feature_dependency=feature_dependency,
            base_name=feature_column,
            use_binned=True,
        )
        feature_only = e2g_wide.select(*key_columns, feature_column)
        long_df = convert_from_wide_to_long(
            feature_only,
            id_vars=key_columns,
            var_name="featureName",
            value_name="featureValue",
            value_vars=(feature_column,),
        )
        return cls(_df=long_df, _schema=cls.get_schema())

compute(study_loci_to_annotate: StudyLocus | L2GGoldStandard, feature_dependency: dict[str, Any]) -> E2gMeanFeature classmethod

Compute e2gMean feature.

Parameters:

Name Type Description Default
study_loci_to_annotate StudyLocus | L2GGoldStandard

The dataset containing study loci that will be used for annotation

required
feature_dependency dict[str, Any]

Dataset that contains the e2g information, expecting intervals

required

Returns:

Name Type Description
E2gMeanFeature E2gMeanFeature

Computed e2gMean feature.

Source code in src/gentropy/dataset/l2g_features/intervals.py
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
@classmethod
def compute(
    cls: type[E2gMeanFeature],
    study_loci_to_annotate: StudyLocus | L2GGoldStandard,
    feature_dependency: dict[str, Any],
) -> E2gMeanFeature:
    """Build the e2gMean feature in long format.

    Args:
        study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci
            that will be used for annotation
        feature_dependency (dict[str, Any]): Dependency mapping forwarded to `get_or_make_e2g_wide`,
            which obtains the wide e2g table from it

    Returns:
        E2gMeanFeature: Feature wrapping the long-format frame built from the wide e2g table.
    """
    key_columns = ("studyLocusId", "geneId")
    feature_column = cls.feature_name

    # The wide table is requested under this feature's own name, so the
    # column selected below is the base (non-neighbourhood) score.
    e2g_wide = get_or_make_e2g_wide(
        study_loci_to_annotate,
        feature_dependency=feature_dependency,
        base_name=feature_column,
        use_binned=True,
    )
    feature_only = e2g_wide.select(*key_columns, feature_column)
    long_df = convert_from_wide_to_long(
        feature_only,
        id_vars=key_columns,
        var_name="featureName",
        value_name="featureValue",
        value_vars=(feature_column,),
    )
    return cls(_df=long_df, _schema=cls.get_schema())

gentropy.dataset.l2g_features.intervals.E2gMeanNeighbourhoodFeature dataclass

Bases: L2GFeature

e2gMeanNeighbourhood feature from E2G intervals.

Source code in src/gentropy/dataset/l2g_features/intervals.py
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
class E2gMeanNeighbourhoodFeature(L2GFeature):
    """L2G feature exposing the `e2gMeanNeighbourhood` column of the wide E2G interval table."""

    feature_dependency_type = Intervals
    feature_name = "e2gMeanNeighbourhood"

    @classmethod
    def compute(
        cls: type[E2gMeanNeighbourhoodFeature],
        study_loci_to_annotate: StudyLocus | L2GGoldStandard,
        feature_dependency: dict[str, Any],
    ) -> E2gMeanNeighbourhoodFeature:
        """Build the e2gMeanNeighbourhood feature in long format.

        Args:
            study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci
                that will be used for annotation
            feature_dependency (dict[str, Any]): Dependency mapping forwarded to `get_or_make_e2g_wide`,
                which obtains the wide e2g table from it

        Returns:
            E2gMeanNeighbourhoodFeature: Feature wrapping the long-format frame built from the wide e2g table.
        """
        key_columns = ("studyLocusId", "geneId")
        feature_column = cls.feature_name

        # The wide table is deliberately requested under the base name
        # "e2gMean" rather than this feature's name: the column selected below
        # is expected to be the "<base_name>Neighbourhood" companion column of
        # that table, and the call matches the arguments E2gMeanFeature passes.
        e2g_wide = get_or_make_e2g_wide(
            study_loci_to_annotate,
            feature_dependency=feature_dependency,
            base_name="e2gMean",
            use_binned=True,
        )
        feature_only = e2g_wide.select(*key_columns, feature_column)
        long_df = convert_from_wide_to_long(
            feature_only,
            id_vars=key_columns,
            var_name="featureName",
            value_name="featureValue",
            value_vars=(feature_column,),
        )
        return cls(_df=long_df, _schema=cls.get_schema())

compute(study_loci_to_annotate: StudyLocus | L2GGoldStandard, feature_dependency: dict[str, Any]) -> E2gMeanNeighbourhoodFeature classmethod

Compute e2gMeanNeighbourhood feature.

Parameters:

Name Type Description Default
study_loci_to_annotate StudyLocus | L2GGoldStandard

The dataset containing study loci that will be used for annotation

required
feature_dependency dict[str, Any]

Dataset that contains the e2g information, expecting intervals

required

Returns:

Name Type Description
E2gMeanNeighbourhoodFeature E2gMeanNeighbourhoodFeature

Computed e2gMeanNeighbourhood feature.

Source code in src/gentropy/dataset/l2g_features/intervals.py
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
@classmethod
def compute(
    cls: type[E2gMeanNeighbourhoodFeature],
    study_loci_to_annotate: StudyLocus | L2GGoldStandard,
    feature_dependency: dict[str, Any],
) -> E2gMeanNeighbourhoodFeature:
    """Build the e2gMeanNeighbourhood feature in long format.

    Args:
        study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci
            that will be used for annotation
        feature_dependency (dict[str, Any]): Dependency mapping forwarded to `get_or_make_e2g_wide`,
            which obtains the wide e2g table from it

    Returns:
        E2gMeanNeighbourhoodFeature: Feature wrapping the long-format frame built from the wide e2g table.
    """
    key_columns = ("studyLocusId", "geneId")
    feature_column = cls.feature_name

    # The wide table is deliberately requested under the base name "e2gMean"
    # rather than this feature's name: the column selected below is expected
    # to be the "<base_name>Neighbourhood" companion column of that table.
    e2g_wide = get_or_make_e2g_wide(
        study_loci_to_annotate,
        feature_dependency=feature_dependency,
        base_name="e2gMean",
        use_binned=True,
    )
    feature_only = e2g_wide.select(*key_columns, feature_column)
    long_df = convert_from_wide_to_long(
        feature_only,
        id_vars=key_columns,
        var_name="featureName",
        value_name="featureValue",
        value_vars=(feature_column,),
    )
    return cls(_df=long_df, _schema=cls.get_schema())

Common logic

gentropy.dataset.l2g_features.intervals.e2g_interval_feature_wide_logic(study_loci_to_annotate: StudyLocus | L2GGoldStandard, *, intervals: Intervals, base_name: str = 'e2gMean', use_binned: bool = True, pp_min: float = 0.001, bin_size: int = 50000, max_bins_per_interval: int = 200, repartitions_variants: int | None = None, repartitions_intervals: int | None = None) -> DataFrame

Wrapper that defaults to the binned implementation.

Set use_binned=False to fall back to a plain overlap if ever needed.

Parameters:

Name Type Description Default
study_loci_to_annotate StudyLocus | L2GGoldStandard

The dataset containing study loci that will be used for annotation

required
intervals Intervals

The dataset containing interval information

required
base_name str

The base name of the feature

'e2gMean'
use_binned bool

Whether to use the binned overlap logic

True
pp_min float

Minimum posterior probability to consider a variant

0.001
bin_size int

Size of bins for the binned overlap

50000
max_bins_per_interval int

Maximum number of bins to explode per interval

200
repartitions_variants int | None

Number of repartitions for variant side

None
repartitions_intervals int | None

Number of repartitions for interval side

None

Returns:

Name Type Description
DataFrame DataFrame

a WIDE DF with studyLocusId, geneId, e2gMean, e2gMeanNeighbourhood, neighbourhood is ratio-centred:

DataFrame

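e2gMeanNeighbourhood = e2gMean / max(e2gMean within locus) in the plain-overlap fallback shown in the source below (0.0 when that maximum is 0). The docstring states mean(e2gMean within locus), which does not match that code path.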

Source code in src/gentropy/dataset/l2g_features/intervals.py
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
def e2g_interval_feature_wide_logic(
    study_loci_to_annotate: StudyLocus | L2GGoldStandard,
    *,
    intervals: Intervals,
    base_name: str = "e2gMean",
    use_binned: bool = True,
    pp_min: float = 0.001,
    bin_size: int = 50_000,
    max_bins_per_interval: int = 200,
    repartitions_variants: int | None = None,
    repartitions_intervals: int | None = None,
) -> DataFrame:
    """Compute the wide e2g interval feature table, defaulting to the binned implementation.

    With `use_binned=True` every argument is forwarded unchanged to
    `e2g_interval_feature_wide_logic_binned` and its result is returned as is.
    With `use_binned=False` a plain range-overlap join between locus variants and
    intervals is used instead; that path reads only `pp_min` and `base_name` and
    ignores `bin_size`, `max_bins_per_interval` and both `repartitions_*` arguments.

    Args:
        study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci
            that will be used for annotation
        intervals (Intervals): The dataset containing interval information
        base_name (str): The base name of the feature
        use_binned (bool): Whether to use the binned overlap logic
        pp_min (float): Minimum posterior probability to consider a variant
        bin_size (int): Size of bins for the binned overlap
        max_bins_per_interval (int): Maximum number of bins to explode per interval
        repartitions_variants (int | None): Number of repartitions for variant side
        repartitions_intervals (int | None): Number of repartitions for interval side

    Returns:
        DataFrame: a WIDE DF with studyLocusId, geneId, `{base_name}` and `{base_name}Neighbourhood`.
        In the plain-overlap fallback the neighbourhood column is
        `{base_name} / max({base_name} within studyLocusId)`, and 0.0 when that maximum is not non-zero.
        NOTE(review): this docstring previously described the ratio as relative to the per-locus
        mean; the fallback below uses the maximum - confirm what the binned implementation does.
    """
    if use_binned:
        return e2g_interval_feature_wide_logic_binned(
            study_loci_to_annotate,
            intervals=intervals,
            base_name=base_name,
            pp_min=pp_min,
            bin_size=bin_size,
            max_bins_per_interval=max_bins_per_interval,
            repartitions_variants=repartitions_variants,
            repartitions_intervals=repartitions_intervals,
        )

    # Fallback: original plain overlap logic (kept for completeness)
    sl = study_loci_to_annotate.df.alias("sl")
    iv = intervals.df.alias("iv")
    # One row per variant of each locus: chromosome and position are derived
    # from the variantId string, the posterior probability is read from the
    # exploded struct.
    study_loci_exploded = (
        sl.withColumn("variantInLocus", f.explode_outer("locus"))
        .withColumn(
            "chromosome",
            extract_chromosome(f.col("variantInLocus").getField("variantId")),
        )
        .withColumn(
            "position",
            extract_position(f.col("variantInLocus").getField("variantId")).cast("int"),
        )
        .withColumn(
            "posteriorProbability",
            f.col("variantInLocus.posteriorProbability").cast("double"),
        )
        # Strict comparison: variants with PP exactly equal to pp_min are dropped.
        .filter(f.col("posteriorProbability") > f.lit(pp_min))
        .select(
            f.col("studyLocusId").alias("studyLocusId"),
            f.col("chromosome").alias("sl_chromosome"),
            f.col("position").alias("position"),
            f.col("posteriorProbability").alias("pp"),
        )
        .alias("slx")
    )
    # Chromosome is renamed on both sides (sl_chromosome / iv_chromosome) so the
    # join condition below can tell the two columns apart.
    intervals_filtered = iv.select(
        f.col("chromosome").alias("iv_chromosome"),
        f.col("start").cast("int").alias("start"),
        f.col("end").cast("int").alias("end"),
        f.col("geneId").alias("geneId"),
        f.col("score").alias("score"),
    ).alias("ivf")

    # Range join: same chromosome and start <= position <= end (both ends inclusive).
    joined = study_loci_exploded.join(
        intervals_filtered,
        (f.col("slx.sl_chromosome") == f.col("ivf.iv_chromosome"))
        & (f.col("position") >= f.col("start"))
        & (f.col("position") <= f.col("end")),
        "inner",
    ).select(
        f.col("studyLocusId"),
        f.col("slx.sl_chromosome").alias("chromosome"),
        f.col("position"),
        f.col("pp"),
        f.col("geneId"),
        f.col("score"),
    )

    # Collapse multiple overlapping intervals of the same gene at one variant
    # to their highest score; pp is taken as the first non-null value in the group.
    per_variant_gene = joined.groupBy(
        "studyLocusId", "chromosome", "position", "geneId"
    ).agg(
        f.max("score").alias("maxScore"),
        f.first("pp", ignorenulls=True).alias("pp"),
    )

    # Base feature: sum over variants of (max interval score * posterior probability)
    # for each (studyLocusId, geneId) pair.
    # NOTE(review): this persisted frame is not unpersisted anywhere in this
    # function - confirm that the caller releases it or that this is intended.
    base_df = (
        per_variant_gene.withColumn(
            "weightedIntervalScore", f.col("maxScore") * f.col("pp")
        )
        .groupBy("studyLocusId", "geneId")
        .agg(f.sum("weightedIntervalScore").alias(base_name))
    ).persist()

    # Neighbourhood feature: each gene's value divided by the largest value
    # among the genes of the same studyLocusId; 0.0 is used when the condition
    # on the regional maximum does not hold, which avoids dividing by zero.
    w = Window.partitionBy("studyLocusId")
    with_max = base_df.withColumn("regional_max", f.max(base_name).over(w))
    neigh_ratio = f.when(
        f.col("regional_max") != 0, f.col(base_name) / f.col("regional_max")
    ).otherwise(f.lit(0.0))

    wide = with_max.select(
        "studyLocusId",
        "geneId",
        f.col(base_name).alias(base_name),
        neigh_ratio.alias(f"{base_name}Neighbourhood"),
    )
    return wide

gentropy.dataset.l2g_features.intervals.get_or_make_e2g_wide(study_loci_to_annotate: StudyLocus | L2GGoldStandard, *, feature_dependency: dict[str, Any], base_name: str = 'e2gMean', use_binned: bool = True, pp_min: float = 0.001, bin_size: int = 50000, max_bins_per_interval: int = 200, repartitions_variants: int | None = None, repartitions_intervals: int | None = None) -> DataFrame

Compute or retrieve the e2g wide feature DataFrame with optional binned join settings.

This function implements a caching registry inside the feature_dependency dictionary supplied by the caller. It stores a reference to the wide e2g DataFrame under a specific cache_key, so that subsequent feature-factory calls to an E2G feature's compute() can reuse the cached DataFrame instead of rebuilding the plan.

Note

The caching mechanism acts on the feature_dependency dictionary and modifies it in place as a side effect.

The cache key incorporates parameters that affect output.

Parameters:

Name Type Description Default
study_loci_to_annotate StudyLocus | L2GGoldStandard

The dataset containing study loci that will be used for annotation

required
feature_dependency dict[str, Any]

Dataset that contains the e2g information

required
base_name str

The base name of the feature

'e2gMean'
use_binned bool

Whether to use the binned overlap logic

True
pp_min float

Minimum posterior probability to consider a variant

0.001
bin_size int

Size of bins for the binned overlap

50000
max_bins_per_interval int

Maximum number of bins to explode per interval

200
repartitions_variants int | None

Number of repartitions for variant side

None
repartitions_intervals int | None

Number of repartitions for interval side

None

Returns:

Name Type Description
DataFrame DataFrame

Features dataset

Source code in src/gentropy/dataset/l2g_features/intervals.py
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
def get_or_make_e2g_wide(
    study_loci_to_annotate: StudyLocus | L2GGoldStandard,
    *,
    feature_dependency: dict[str, Any],
    base_name: str = "e2gMean",
    use_binned: bool = True,
    pp_min: float = 0.001,
    bin_size: int = 50_000,
    max_bins_per_interval: int = 200,
    repartitions_variants: int | None = None,
    repartitions_intervals: int | None = None,
) -> DataFrame:
    """Return the wide e2g feature DataFrame, computing and caching it on first use.

    The `feature_dependency` dictionary supplied by the caller doubles as a cache registry:
    the wide DataFrame is stored in it under a derived cache key, so later calls made with the
    same settings reuse the stored DataFrame instead of building it again.

    Note:
        The `feature_dependency` dictionary is modified in place as a side effect.

    The cache key is built from `base_name`, `use_binned`, `pp_min`, `bin_size` and
    `max_bins_per_interval` only. `study_loci_to_annotate` and the two `repartitions_*`
    arguments are not part of the key, so an existing entry is returned regardless of them.

    Args:
        study_loci_to_annotate (StudyLocus | L2GGoldStandard): The dataset containing study loci
            that will be used for annotation
        feature_dependency (dict[str, Any]): Dependency mapping; its "intervals" entry is passed on
            as the intervals dataset when the table has to be computed
        base_name (str): The base name of the feature
        use_binned (bool): Whether to use the binned overlap logic
        pp_min (float): Minimum posterior probability to consider a variant
        bin_size (int): Size of bins for the binned overlap
        max_bins_per_interval (int): Maximum number of bins to explode per interval
        repartitions_variants (int | None): Number of repartitions for variant side
        repartitions_intervals (int | None): Number of repartitions for interval side

    Returns:
        DataFrame: Features dataset
    """
    cache_key = "::".join(
        (
            "_e2g_wide",
            f"{base_name}",
            f"binned={use_binned}",
            f"ppmin={pp_min}",
            f"bin={bin_size}",
            f"cap={max_bins_per_interval}",
        )
    )

    # Cache hit: hand back the DataFrame registered by an earlier call.
    if cache_key in feature_dependency:
        return feature_dependency[cache_key]

    # Cache miss: compute the wide table, mark it for persistence and register it.
    e2g_wide = e2g_interval_feature_wide_logic(
        study_loci_to_annotate,
        intervals=feature_dependency["intervals"],
        base_name=base_name,
        use_binned=use_binned,
        pp_min=pp_min,
        bin_size=bin_size,
        max_bins_per_interval=max_bins_per_interval,
        repartitions_variants=repartitions_variants,
        repartitions_intervals=repartitions_intervals,
    ).persist()
    feature_dependency[cache_key] = e2g_wide
    return e2g_wide