Skip to content

L2G Prediction

gentropy.dataset.l2g_prediction.L2GPrediction dataclass

Bases: Dataset

Dataset that contains the Locus to Gene predictions.

It is the result of applying the L2G model to a feature matrix, which contains all the study/locus pairs and their functional annotations. The score column gives the confidence of the prediction that a gene is causal for an association.

Source code in src/gentropy/dataset/l2g_prediction.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
@dataclass
class L2GPrediction(Dataset):
    """Dataset that contains the Locus to Gene predictions.

    It is the result of applying the L2G model to a feature matrix, which contains all
    the study/locus pairs and their functional annotations. The score column gives the
    confidence of the prediction that a gene is causal for an association.
    """

    @classmethod
    def get_schema(cls: type[L2GPrediction]) -> StructType:
        """Provides the schema for the L2GPrediction dataset.

        Returns:
            StructType: Schema for the L2GPrediction dataset
        """
        return parse_spark_schema("l2g_predictions.json")

    @classmethod
    def from_credible_set(
        cls: type[L2GPrediction],
        model_path: str,
        features_list: list[str],
        credible_set: StudyLocus,
        study_index: StudyIndex,
        v2g: V2G,
        coloc: Colocalisation,
    ) -> L2GPrediction:
        """Extract L2G predictions for a set of credible sets derived from GWAS.

        The feature matrix is generated for all credible sets, restricted to the
        GWAS-derived ones, and scored with the fitted model loaded from `model_path`.

        Args:
            model_path (str): Path to the fitted model
            features_list (list[str]): List of features to use for the model
            credible_set (StudyLocus): Credible set dataset
            study_index (StudyIndex): Study index dataset
            v2g (V2G): Variant to gene dataset
            coloc (Colocalisation): Colocalisation dataset

        Returns:
            L2GPrediction: L2G dataset with `studyLocusId`, `geneId` and `score` columns
        """
        fm = L2GFeatureMatrix.generate_features(
            features_list=features_list,
            credible_set=credible_set,
            study_index=study_index,
            variant_gene=v2g,
            colocalisation=coloc,
        ).fill_na()

        # Restrict the features to study/locus pairs coming from GWAS studies by
        # joining on `studyLocusId` with the GWAS-filtered credible sets.
        gwas_fm = L2GFeatureMatrix(
            _df=fm.df.join(
                credible_set.filter_by_study_type("gwas", study_index).df,
                on="studyLocusId",
            ),
            # A feature matrix must be described by its own schema, not by the
            # prediction schema returned by `cls.get_schema()`.
            # NOTE(review): assumes L2GFeatureMatrix exposes `get_schema()` like
            # the other datasets built from `_df`/`_schema` - confirm.
            _schema=L2GFeatureMatrix.get_schema(),
        )

        # Load and apply fitted model
        model = LocusToGeneModel.load_from_disk(
            model_path,
            features_list=features_list,
        )
        predictions = model.predict(gwas_fm).select(
            "studyLocusId",
            "geneId",
            # the probability of the positive class is the second element inside
            # the probability array - this is selected as the L2G probability
            vector_to_array(f.col("probability"))[1].alias("score"),
        )

        # Build through `cls` so that subclasses get an instance of their own type.
        return cls(_df=predictions, _schema=cls.get_schema())

from_credible_set(model_path: str, features_list: list[str], credible_set: StudyLocus, study_index: StudyIndex, v2g: V2G, coloc: Colocalisation) -> L2GPrediction classmethod

Extract L2G predictions for a set of credible sets derived from GWAS.

Parameters:

Name Type Description Default
model_path str

Path to the fitted model

required
features_list list[str]

List of features to use for the model

required
credible_set StudyLocus

Credible set dataset

required
study_index StudyIndex

Study index dataset

required
v2g V2G

Variant to gene dataset

required
coloc Colocalisation

Colocalisation dataset

required

Returns:

Name Type Description
L2GPrediction L2GPrediction

L2G dataset

Source code in src/gentropy/dataset/l2g_prediction.py
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
@classmethod
def from_credible_set(
    cls: type[L2GPrediction],
    model_path: str,
    features_list: list[str],
    credible_set: StudyLocus,
    study_index: StudyIndex,
    v2g: V2G,
    coloc: Colocalisation,
) -> L2GPrediction:
    """Extract L2G predictions for a set of credible sets derived from GWAS.

    The feature matrix is generated for all credible sets, restricted to the
    GWAS-derived ones, and scored with the fitted model loaded from `model_path`.

    Args:
        model_path (str): Path to the fitted model
        features_list (list[str]): List of features to use for the model
        credible_set (StudyLocus): Credible set dataset
        study_index (StudyIndex): Study index dataset
        v2g (V2G): Variant to gene dataset
        coloc (Colocalisation): Colocalisation dataset

    Returns:
        L2GPrediction: L2G dataset with `studyLocusId`, `geneId` and `score` columns
    """
    fm = L2GFeatureMatrix.generate_features(
        features_list=features_list,
        credible_set=credible_set,
        study_index=study_index,
        variant_gene=v2g,
        colocalisation=coloc,
    ).fill_na()

    # Restrict the features to study/locus pairs coming from GWAS studies by
    # joining on `studyLocusId` with the GWAS-filtered credible sets.
    gwas_fm = L2GFeatureMatrix(
        _df=fm.df.join(
            credible_set.filter_by_study_type("gwas", study_index).df,
            on="studyLocusId",
        ),
        # A feature matrix must be described by its own schema, not by the
        # prediction schema returned by `cls.get_schema()`.
        # NOTE(review): assumes L2GFeatureMatrix exposes `get_schema()` like
        # the other datasets built from `_df`/`_schema` - confirm.
        _schema=L2GFeatureMatrix.get_schema(),
    )

    # Load and apply fitted model
    model = LocusToGeneModel.load_from_disk(
        model_path,
        features_list=features_list,
    )
    predictions = model.predict(gwas_fm).select(
        "studyLocusId",
        "geneId",
        # the probability of the positive class is the second element inside
        # the probability array - this is selected as the L2G probability
        vector_to_array(f.col("probability"))[1].alias("score"),
    )

    # Build through `cls` so that subclasses get an instance of their own type.
    return cls(_df=predictions, _schema=cls.get_schema())

get_schema() -> StructType classmethod

Provides the schema for the L2GPrediction dataset.

Returns:

Name Type Description
StructType StructType

Schema for the L2GPrediction dataset

Source code in src/gentropy/dataset/l2g_prediction.py
33
34
35
36
37
38
39
40
@classmethod
def get_schema(cls: type[L2GPrediction]) -> StructType:
    """Return the Spark schema that L2GPrediction dataframes follow.

    Returns:
        StructType: Schema parsed from the `l2g_predictions.json` definition
    """
    schema_file = "l2g_predictions.json"
    return parse_spark_schema(schema_file)

Schema