Skip to content

L2G Feature Matrix

gentropy.dataset.l2g_feature_matrix.L2GFeatureMatrix dataclass

Bases: Dataset

Dataset with features for Locus to Gene prediction.

Attributes:

Name Type Description
features_list list[str] | None

List of features to use. If None, all possible features are used.

fixed_cols list[str]

Columns that should be kept fixed in the feature matrix, although not considered as features.

mode str

Mode of the feature matrix. Defaults to "train". Can be either "train" or "predict".

Source code in src/gentropy/dataset/l2g_feature_matrix.py
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
@dataclass
class L2GFeatureMatrix(Dataset):
    """Dataset with features for Locus to Gene prediction.

    Attributes:
        features_list (list[str] | None): List of features to use. If None, all columns that are not in `fixed_cols` are used.
        fixed_cols (list[str]): Columns that should be kept fixed in the feature matrix, although not considered as features.
        mode (str): Mode of the feature matrix. Defaults to "train". Can be either "train" or "predict".
    """

    features_list: list[str] | None = None
    fixed_cols: list[str] = field(default_factory=lambda: ["studyLocusId", "geneId"])
    mode: str = "train"

    def __post_init__(self: L2GFeatureMatrix) -> None:
        """Validate the mode, resolve the features list and validate the schema.

        If no features list is provided, all columns except the fixed ones are used.

        Raises:
            ValueError: If the mode is neither 'train' nor 'predict'.
        """
        if self.mode not in ["train", "predict"]:
            raise ValueError("Mode should be either 'train' or 'predict'")
        if self.mode == "train":
            # In train mode the gold standard column is kept fixed, so it is never treated as a feature.
            # A new list is built so that a list passed in by the caller is not mutated.
            self.fixed_cols = self.fixed_cols + ["goldStandardSet"]
        self.features_list = self.features_list or [
            col for col in self._df.columns if col not in self.fixed_cols
        ]
        self.validate_schema()

    @classmethod
    def generate_features(
        cls: Type[L2GFeatureMatrix],
        features_list: list[str],
        credible_set: StudyLocus,
        study_index: StudyIndex,
        variant_gene: V2G,
        colocalisation: Colocalisation,
    ) -> L2GFeatureMatrix:
        """Generate features from the gentropy datasets.

        The dataframes produced by the colocalisation, TSS distance and VEP feature
        factories are unioned by column name and then pivoted with
        `convert_from_long_to_wide` (one row per studyLocusId/geneId pair, one column
        per `featureName`, filled from `featureValue`).

        Args:
            features_list (list[str]): List of features to generate
            credible_set (StudyLocus): Credible set dataset
            study_index (StudyIndex): Study index dataset
            variant_gene (V2G): Variant to gene dataset
            colocalisation (Colocalisation): Colocalisation dataset

        Returns:
            L2GFeatureMatrix: L2G feature matrix dataset
        """
        # The list always holds exactly three dataframes, so no emptiness check is needed
        # before reducing it.
        features_dfs = [
            ColocalisationFactory._get_max_coloc_per_credible_set(
                colocalisation,
                credible_set,
                study_index,
            ).df,
            StudyLocusFactory._get_tss_distance_features(credible_set, variant_gene).df,
            StudyLocusFactory._get_vep_features(credible_set, variant_gene).df,
        ]
        fm = reduce(lambda x, y: x.unionByName(y), features_dfs)

        return cls(
            _df=convert_from_long_to_wide(
                fm, ["studyLocusId", "geneId"], "featureName", "featureValue"
            ),
            _schema=cls.get_schema(),
            features_list=features_list,
        )

    @classmethod
    def get_schema(cls: type[L2GFeatureMatrix]) -> StructType:
        """Provides the schema for the L2gFeatureMatrix dataset.

        Returns:
            StructType: Schema for the L2gFeatureMatrix dataset
        """
        return parse_spark_schema("l2g_feature_matrix.json")

    def calculate_feature_missingness_rate(
        self: L2GFeatureMatrix,
    ) -> dict[str, float]:
        """Calculate the proportion of missing values in each feature.

        A value counts as missing when it is null or equal to 0. When the dataframe
        has no rows, every feature is reported with a rate of 0.0 instead of failing
        on a division by zero.

        Returns:
            dict[str, float]: Dictionary of feature names and their missingness rate.

        Raises:
            ValueError: If no features are found.
        """
        # Validate before counting so that no dataframe count is run when there is nothing to report.
        if not self.features_list:
            raise ValueError("No features found")

        total_count = self._df.count()
        if total_count == 0:
            return dict.fromkeys(self.features_list, 0.0)

        return {
            feature: (
                self._df.filter(
                    (self._df[feature].isNull()) | (self._df[feature] == 0)
                ).count()
                / total_count
            )
            for feature in self.features_list
        }

    def fill_na(
        self: L2GFeatureMatrix, value: float = 0.0, subset: list[str] | None = None
    ) -> L2GFeatureMatrix:
        """Fill missing values with a given value, updating this dataset in place.

        Args:
            value (float): Value to replace missing values with. Defaults to 0.0.
            subset (list[str] | None): Subset of columns to consider. Defaults to None.

        Returns:
            L2GFeatureMatrix: L2G feature matrix dataset
        """
        self.df = self._df.fillna(value, subset=subset)
        return self

    def select_features(
        self: L2GFeatureMatrix,
        features_list: list[str] | None,
    ) -> L2GFeatureMatrix:
        """Select a subset of features from the feature matrix, updating this dataset in place.

        The fixed columns are always kept in front of the selected features. When
        `features_list` is None or empty, the features list of the instance is used.

        Args:
            features_list (list[str] | None): List of features to select

        Returns:
            L2GFeatureMatrix: L2G feature matrix dataset

        Raises:
            ValueError: If no features have been selected.
        """
        if features_list := features_list or self.features_list:
            self.df = self._df.select(self.fixed_cols + features_list)
            return self
        raise ValueError("features_list cannot be None")

calculate_feature_missingness_rate() -> dict[str, float]

Calculate the proportion of missing (null or zero) values in each feature.

Returns:

Type Description
dict[str, float]

dict[str, float]: Dictionary of feature names and their missingness rate.

Raises:

Type Description
ValueError

If no features are found.

Source code in src/gentropy/dataset/l2g_feature_matrix.py
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
def calculate_feature_missingness_rate(
    self: L2GFeatureMatrix,
) -> dict[str, float]:
    """Calculate the proportion of missing values in each feature.

    A value counts as missing when it is null or equal to 0. When the dataframe
    has no rows, every feature is reported with a rate of 0.0 instead of failing
    on a division by zero.

    Returns:
        dict[str, float]: Dictionary of feature names and their missingness rate.

    Raises:
        ValueError: If no features are found.
    """
    # Validate before counting so that no dataframe count is run when there is nothing to report.
    if not self.features_list:
        raise ValueError("No features found")

    total_count = self._df.count()
    if total_count == 0:
        return dict.fromkeys(self.features_list, 0.0)

    return {
        feature: (
            self._df.filter(
                (self._df[feature].isNull()) | (self._df[feature] == 0)
            ).count()
            / total_count
        )
        for feature in self.features_list
    }

fill_na(value: float = 0.0, subset: list[str] | None = None) -> L2GFeatureMatrix

Fill missing values with a given value, in all columns or only in those listed in subset.

Parameters:

Name Type Description Default
value float

Value to replace missing values with. Defaults to 0.0.

0.0
subset list[str] | None

Subset of columns to consider. Defaults to None.

None

Returns:

Name Type Description
L2GFeatureMatrix L2GFeatureMatrix

L2G feature matrix dataset

Source code in src/gentropy/dataset/l2g_feature_matrix.py
136
137
138
139
140
141
142
143
144
145
146
147
148
149
def fill_na(
    self: L2GFeatureMatrix, value: float = 0.0, subset: list[str] | None = None
) -> L2GFeatureMatrix:
    """Replace missing values in the feature matrix, updating this dataset in place.

    Args:
        value (float): Replacement for missing values. Defaults to 0.0.
        subset (list[str] | None): Columns to restrict the replacement to. Defaults to None.

    Returns:
        L2GFeatureMatrix: This same instance, so that calls can be chained
    """
    filled_df = self._df.fillna(value, subset=subset)
    self.df = filled_df
    return self

generate_features(features_list: list[str], credible_set: StudyLocus, study_index: StudyIndex, variant_gene: V2G, colocalisation: Colocalisation) -> L2GFeatureMatrix classmethod

Generate features from the gentropy datasets.

Parameters:

Name Type Description Default
features_list list[str]

List of features to generate

required
credible_set StudyLocus

Credible set dataset

required
study_index StudyIndex

Study index dataset

required
variant_gene V2G

Variant to gene dataset

required
colocalisation Colocalisation

Colocalisation dataset

required

Returns:

Name Type Description
L2GFeatureMatrix L2GFeatureMatrix

L2G feature matrix dataset

Raises:

Type Description
ValueError

If the feature matrix is empty

Source code in src/gentropy/dataset/l2g_feature_matrix.py
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
@classmethod
def generate_features(
    cls: Type[L2GFeatureMatrix],
    features_list: list[str],
    credible_set: StudyLocus,
    study_index: StudyIndex,
    variant_gene: V2G,
    colocalisation: Colocalisation,
) -> L2GFeatureMatrix:
    """Generate features from the gentropy datasets.

    The dataframes produced by the colocalisation, TSS distance and VEP feature
    factories are unioned by column name and then pivoted with
    `convert_from_long_to_wide` (one row per studyLocusId/geneId pair, one column
    per `featureName`, filled from `featureValue`).

    Args:
        features_list (list[str]): List of features to generate
        credible_set (StudyLocus): Credible set dataset
        study_index (StudyIndex): Study index dataset
        variant_gene (V2G): Variant to gene dataset
        colocalisation (Colocalisation): Colocalisation dataset

    Returns:
        L2GFeatureMatrix: L2G feature matrix dataset
    """
    # The list always holds exactly three dataframes, so no emptiness check is needed
    # before reducing it.
    features_dfs = [
        ColocalisationFactory._get_max_coloc_per_credible_set(
            colocalisation,
            credible_set,
            study_index,
        ).df,
        StudyLocusFactory._get_tss_distance_features(credible_set, variant_gene).df,
        StudyLocusFactory._get_vep_features(credible_set, variant_gene).df,
    ]
    fm = reduce(lambda x, y: x.unionByName(y), features_dfs)

    return cls(
        _df=convert_from_long_to_wide(
            fm, ["studyLocusId", "geneId"], "featureName", "featureValue"
        ),
        _schema=cls.get_schema(),
        features_list=features_list,
    )

get_schema() -> StructType classmethod

Provides the schema for the L2gFeatureMatrix dataset.

Returns:

Name Type Description
StructType StructType

Schema for the L2gFeatureMatrix dataset

Source code in src/gentropy/dataset/l2g_feature_matrix.py
102
103
104
105
106
107
108
109
@classmethod
def get_schema(cls: type[L2GFeatureMatrix]) -> StructType:
    """Return the Spark schema of the L2gFeatureMatrix dataset.

    Returns:
        StructType: Schema parsed from the `l2g_feature_matrix.json` schema file
    """
    schema_file = "l2g_feature_matrix.json"
    return parse_spark_schema(schema_file)

select_features(features_list: list[str] | None) -> L2GFeatureMatrix

Select a subset of features from the feature matrix.

Parameters:

Name Type Description Default
features_list list[str] | None

List of features to select

required

Returns:

Name Type Description
L2GFeatureMatrix L2GFeatureMatrix

L2G feature matrix dataset

Raises:

Type Description
ValueError

If no features have been selected.

Source code in src/gentropy/dataset/l2g_feature_matrix.py
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
def select_features(
    self: L2GFeatureMatrix,
    features_list: list[str] | None,
) -> L2GFeatureMatrix:
    """Restrict the feature matrix to the fixed columns plus the chosen features.

    When `features_list` is None or empty, the features list stored on the
    instance is used instead. The dataset is updated in place.

    Args:
        features_list (list[str] | None): List of features to select

    Returns:
        L2GFeatureMatrix: This same instance, so that calls can be chained

    Raises:
        ValueError: If no features have been selected.
    """
    chosen_features = features_list or self.features_list
    if not chosen_features:
        raise ValueError("features_list cannot be None")

    # Fixed columns always come first, followed by the selected features.
    columns_to_keep = self.fixed_cols + chosen_features
    self.df = self._df.select(columns_to_keep)
    return self

Schema

root
 |-- studyLocusId: long (nullable = false)
 |-- geneId: string (nullable = false)
 |-- goldStandardSet: string (nullable = true)
 |-- distanceTssMean: float (nullable = true)
 |-- distanceTssMinimum: float (nullable = true)
 |-- vepMaximumNeighborhood: float (nullable = true)
 |-- vepMaximum: float (nullable = true)
 |-- vepMeanNeighborhood: float (nullable = true)
 |-- vepMean: float (nullable = true)
 |-- eqtlColocClppMaximum: float (nullable = true)
 |-- eqtlColocClppMaximumNeighborhood: float (nullable = true)
 |-- eqtlColocLlrMaximum: float (nullable = true)
 |-- eqtlColocLlrMaximumNeighborhood: float (nullable = true)
 |-- pqtlColocClppMaximum: float (nullable = true)
 |-- pqtlColocClppMaximumNeighborhood: float (nullable = true)
 |-- pqtlColocLlrMaximum: float (nullable = true)
 |-- pqtlColocLlrMaximumNeighborhood: float (nullable = true)
 |-- sqtlColocClppMaximum: float (nullable = true)
 |-- sqtlColocClppMaximumNeighborhood: float (nullable = true)
 |-- sqtlColocLlrMaximum: float (nullable = true)
 |-- sqtlColocLlrMaximumNeighborhood: float (nullable = true)
 |-- tuqtlColocClppMaximum: float (nullable = true)
 |-- tuqtlColocClppMaximumNeighborhood: float (nullable = true)
 |-- tuqtlColocLlrMaximum: float (nullable = true)
 |-- tuqtlColocLlrMaximumNeighborhood: float (nullable = true)