Skip to content

L2G Prediction

gentropy.dataset.l2g_prediction.L2GPrediction dataclass

Bases: Dataset

Dataset that contains the Locus to Gene predictions.

It is the result of applying the L2G model to a feature matrix, which contains all the study/locus pairs and their functional annotations. The score column indicates how confident the model is that a gene is causal for an association.

Source code in src/gentropy/dataset/l2g_prediction.py
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
@dataclass
class L2GPrediction(Dataset):
    """Dataset that contains the Locus to Gene predictions.

    It is the result of applying the L2G model to a feature matrix, which contains all
    the study/locus pairs and their functional annotations. The score column indicates
    how confident the model is that a gene is causal for an association.
    """

    @classmethod
    def get_schema(cls: type[L2GPrediction]) -> StructType:
        """Provides the schema for the L2GPrediction dataset.

        Returns:
            StructType: Schema for the L2GPrediction dataset
        """
        return parse_spark_schema("l2g_predictions.json")

    @classmethod
    def from_credible_set(
        cls: Type[L2GPrediction],
        features_list: list[str],
        credible_set: StudyLocus,
        study_index: StudyIndex,
        v2g: V2G,
        coloc: Colocalisation,
        session: Session,
        model_path: str | None,
        hf_token: str | None = None,
        download_from_hub: bool = True,
    ) -> tuple[L2GPrediction, L2GFeatureMatrix]:
        """Extract L2G predictions for a set of credible sets derived from GWAS.

        The model is loaded first (from the Hugging Face Hub or from disk), then the
        feature matrix is generated, restricted to GWAS credible sets and passed to
        the model for prediction.

        Args:
            features_list (list[str]): List of features to use for the model
            credible_set (StudyLocus): Credible set dataset
            study_index (StudyIndex): Study index dataset
            v2g (V2G): Variant to gene dataset
            coloc (Colocalisation): Colocalisation dataset
            session (Session): Session object that contains the Spark session
            model_path (str | None): Path to the model file. It can be either in the filesystem or the name on the Hugging Face Hub (in the form of username/repo_name). When `download_from_hub` is True and no value is given, "opentargets/locus_to_gene" is used.
            hf_token (str | None): Hugging Face token to download the model from the Hub. Only required if the model is private.
            download_from_hub (bool): Whether to download the model from the Hugging Face Hub. Defaults to True.

        Returns:
            tuple[L2GPrediction, L2GFeatureMatrix]: L2G dataset and feature matrix limited to GWAS study only.

        Raises:
            ValueError: If `download_from_hub` is False and `model_path` is not provided, since there is no model to load.
        """
        # Load the model
        if download_from_hub:
            # Model ID defaults to "opentargets/locus_to_gene" and it assumes the name of the classifier is "classifier.skops".
            model_id = model_path or "opentargets/locus_to_gene"
            l2g_model = LocusToGeneModel.load_from_hub(model_id, hf_token)
        elif model_path:
            l2g_model = LocusToGeneModel.load_from_disk(model_path)
        else:
            # Without this branch `l2g_model` would stay unbound and the call would
            # only fail at prediction time, after the feature matrix has been built.
            raise ValueError(
                "model_path must be provided when download_from_hub is False."
            )

        # Prepare data
        fm = L2GFeatureMatrix.generate_features(
            features_list=features_list,
            credible_set=credible_set,
            study_index=study_index,
            variant_gene=v2g,
            colocalisation=coloc,
        ).fill_na()

        # Keep only the rows whose studyLocusId belongs to a GWAS credible set
        gwas_fm = (
            L2GFeatureMatrix(
                _df=(
                    fm.df.join(
                        credible_set.filter_by_study_type(
                            "gwas", study_index
                        ).df.select("studyLocusId"),
                        on="studyLocusId",
                    )
                ),
                _schema=L2GFeatureMatrix.get_schema(),
                mode="predict",
            )
            .select_features(features_list)
            .persist()
        )
        return (
            l2g_model.predict(gwas_fm, session),
            gwas_fm,
        )

from_credible_set(features_list: list[str], credible_set: StudyLocus, study_index: StudyIndex, v2g: V2G, coloc: Colocalisation, session: Session, model_path: str | None, hf_token: str | None = None, download_from_hub: bool = True) -> tuple[L2GPrediction, L2GFeatureMatrix] classmethod

Extract L2G predictions for a set of credible sets derived from GWAS.

Parameters:

Name Type Description Default
features_list list[str]

List of features to use for the model

required
credible_set StudyLocus

Credible set dataset

required
study_index StudyIndex

Study index dataset

required
v2g V2G

Variant to gene dataset

required
coloc Colocalisation

Colocalisation dataset

required
session Session

Session object that contains the Spark session

required
model_path str | None

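Path to the model file. It can be either a path in the filesystem or the name of a repository on the Hugging Face Hub (in the form of username/repo_name). If it is None and download_from_hub is True, the Hub repository opentargets/locus_to_gene is used.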

required
hf_token str | None

Hugging Face token to download the model from the Hub. Only required if the model is private.

None
download_from_hub bool

Whether to download the model from the Hugging Face Hub. Defaults to True.

True

Returns:

Type Description
tuple[L2GPrediction, L2GFeatureMatrix]

tuple[L2GPrediction, L2GFeatureMatrix]: L2G predictions and the feature matrix they were computed from, both limited to credible sets from GWAS studies.

Source code in src/gentropy/dataset/l2g_prediction.py
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
@classmethod
def from_credible_set(
    cls: Type[L2GPrediction],
    features_list: list[str],
    credible_set: StudyLocus,
    study_index: StudyIndex,
    v2g: V2G,
    coloc: Colocalisation,
    session: Session,
    model_path: str | None,
    hf_token: str | None = None,
    download_from_hub: bool = True,
) -> tuple[L2GPrediction, L2GFeatureMatrix]:
    """Extract L2G predictions for a set of credible sets derived from GWAS.

    The model is loaded first (from the Hugging Face Hub or from disk), then the
    feature matrix is generated, restricted to GWAS credible sets and passed to
    the model for prediction.

    Args:
        features_list (list[str]): List of features to use for the model
        credible_set (StudyLocus): Credible set dataset
        study_index (StudyIndex): Study index dataset
        v2g (V2G): Variant to gene dataset
        coloc (Colocalisation): Colocalisation dataset
        session (Session): Session object that contains the Spark session
        model_path (str | None): Path to the model file. It can be either in the filesystem or the name on the Hugging Face Hub (in the form of username/repo_name). When `download_from_hub` is True and no value is given, "opentargets/locus_to_gene" is used.
        hf_token (str | None): Hugging Face token to download the model from the Hub. Only required if the model is private.
        download_from_hub (bool): Whether to download the model from the Hugging Face Hub. Defaults to True.

    Returns:
        tuple[L2GPrediction, L2GFeatureMatrix]: L2G dataset and feature matrix limited to GWAS study only.

    Raises:
        ValueError: If `download_from_hub` is False and `model_path` is not provided, since there is no model to load.
    """
    # Load the model
    if download_from_hub:
        # Model ID defaults to "opentargets/locus_to_gene" and it assumes the name of the classifier is "classifier.skops".
        model_id = model_path or "opentargets/locus_to_gene"
        l2g_model = LocusToGeneModel.load_from_hub(model_id, hf_token)
    elif model_path:
        l2g_model = LocusToGeneModel.load_from_disk(model_path)
    else:
        # Without this branch `l2g_model` would stay unbound and the call would
        # only fail at prediction time, after the feature matrix has been built.
        raise ValueError(
            "model_path must be provided when download_from_hub is False."
        )

    # Prepare data
    fm = L2GFeatureMatrix.generate_features(
        features_list=features_list,
        credible_set=credible_set,
        study_index=study_index,
        variant_gene=v2g,
        colocalisation=coloc,
    ).fill_na()

    # Keep only the rows whose studyLocusId belongs to a GWAS credible set
    gwas_fm = (
        L2GFeatureMatrix(
            _df=(
                fm.df.join(
                    credible_set.filter_by_study_type(
                        "gwas", study_index
                    ).df.select("studyLocusId"),
                    on="studyLocusId",
                )
            ),
            _schema=L2GFeatureMatrix.get_schema(),
            mode="predict",
        )
        .select_features(features_list)
        .persist()
    )
    return (
        l2g_model.predict(gwas_fm, session),
        gwas_fm,
    )

get_schema() -> StructType classmethod

Provides the schema for the L2GPrediction dataset.

Returns:

Name Type Description
StructType StructType

Schema for the L2GPrediction dataset

Source code in src/gentropy/dataset/l2g_prediction.py
31
32
33
34
35
36
37
38
@classmethod
def get_schema(cls: type[L2GPrediction]) -> StructType:
    """Return the Spark schema that describes the L2GPrediction dataset.

    Returns:
        StructType: Schema obtained by parsing the `l2g_predictions.json` definition
    """
    schema_file = "l2g_predictions.json"
    return parse_spark_schema(schema_file)

Schema

root
 |-- studyLocusId: long (nullable = false)
 |-- geneId: string (nullable = false)
 |-- score: double (nullable = false)