Skip to content

L2G Feature Matrix

gentropy.dataset.l2g_feature_matrix.L2GFeatureMatrix

Dataset with features for Locus to Gene prediction.

Source code in src/gentropy/dataset/l2g_feature_matrix.py
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
class L2GFeatureMatrix:
    """Dataset with features for Locus to Gene prediction."""

    def __init__(
        self,
        _df: DataFrame,
        features_list: list[str] | None = None,
        with_gold_standard: bool = False,
        label_col: str = "goldStandardSet",
    ) -> None:
        """Post-initialisation to set the features list. If not provided, all columns except the fixed ones are used.

        Args:
            _df (DataFrame): Feature matrix dataset
            features_list (list[str] | None): List of features to use. If None, all possible features are used.
            with_gold_standard (bool): Whether to include the gold standard set in the feature matrix.
            label_col (str): The target column when the feature matrix represents the gold standard

        """
        self.with_gold_standard = with_gold_standard
        self.fixed_cols = ["studyLocusId", "geneId"]
        if self.with_gold_standard:
            self.label_col = label_col
            self.fixed_cols.append(label_col)
        if "traitFromSourceMappedId" in _df.columns:
            self.fixed_cols.append("traitFromSourceMappedId")

        self.features_list = features_list or [
            col for col in _df.columns if col not in self.fixed_cols
        ]
        self._df = _df.selectExpr(
            self.fixed_cols
            + [
                f"CAST({feature} AS FLOAT) AS {feature}"
                for feature in self.features_list
            ]
        )

    @classmethod
    def from_features_list(
        cls: type[L2GFeatureMatrix],
        study_loci_to_annotate: StudyLocus | L2GGoldStandard,
        features_list: list[str],
        features_input_loader: L2GFeatureInputLoader,
    ) -> L2GFeatureMatrix:
        """Generate features from the gentropy datasets by calling the feature factory that will instantiate the corresponding features.

        Args:
            study_loci_to_annotate (StudyLocus | L2GGoldStandard): Study locus pairs to annotate
            features_list (list[str]): List of feature names to be computed.
            features_input_loader (L2GFeatureInputLoader): Object that contais features input.

        Returns:
            L2GFeatureMatrix: L2G feature matrix dataset
        """
        features_long_df = reduce(
            lambda x, y: x.unionByName(y, allowMissingColumns=True),
            [
                # Compute all features and merge them into a single dataframe
                feature.df
                for feature in FeatureFactory(
                    study_loci_to_annotate, features_list
                ).generate_features(features_input_loader)
            ],
        )
        if isinstance(study_loci_to_annotate, L2GGoldStandard):
            return cls(
                _df=convert_from_long_to_wide(
                    # Add gold standard set to the feature matrix
                    features_long_df.join(
                        study_loci_to_annotate.df.select(
                            "studyLocusId", "geneId", "goldStandardSet"
                        ),
                        ["studyLocusId", "geneId"],
                    ),
                    ["studyLocusId", "geneId", "goldStandardSet"],
                    "featureName",
                    "featureValue",
                ),
                with_gold_standard=True,
            )
        return cls(
            _df=convert_from_long_to_wide(
                features_long_df,
                ["studyLocusId", "geneId"],
                "featureName",
                "featureValue",
            ),
            with_gold_standard=False,
        )

    def calculate_feature_missingness_rate(
        self: L2GFeatureMatrix,
    ) -> dict[str, float]:
        """Calculate the proportion of missing values in each feature.

        Returns:
            dict[str, float]: Dictionary of feature names and their missingness rate.

        Raises:
            ValueError: If no features are found.
        """
        total_count = self._df.count()
        if not self.features_list:
            raise ValueError("No features found")

        return {
            feature: (
                self._df.filter(
                    (self._df[feature].isNull()) | (self._df[feature] == 0)
                ).count()
                / total_count
            )
            for feature in self.features_list
        }

    def fill_na(
        self: L2GFeatureMatrix, na_value: float = 0.0, subset: list[str] | None = None
    ) -> L2GFeatureMatrix:
        """Fill missing values in a column with a given value.

        For features that correspond to gene attributes, missing values are imputed using the mean of the column.

        Args:
            na_value (float): Value to replace missing values with. Defaults to 0.0.
            subset (list[str] | None): Subset of columns to consider. Defaults to None.

        Returns:
            L2GFeatureMatrix: L2G feature matrix dataset
        """
        cols_to_impute = [
            "proteinGeneCount500kb",
            "geneCount500kb",
        ]
        for col in cols_to_impute:
            if col not in self._df.columns:
                continue
            else:
                self._df = self._df.withColumn(
                    col,
                    f.when(
                        f.col(col).isNull(),
                        f.mean(f.col(col)).over(Window.partitionBy("studyLocusId")),
                    ).otherwise(f.col(col)),
                )
        self._df = self._df.fillna(na_value, subset=subset)
        return self

    def select_features(
        self: L2GFeatureMatrix,
        features_list: list[str] | None,
    ) -> L2GFeatureMatrix:
        """Returns a new object with a subset of features from the original feature matrix.

        Args:
            features_list (list[str] | None): List of features to select

        Returns:
            L2GFeatureMatrix: L2G feature matrix dataset

        Raises:
            ValueError: If no features have been selected.
        """
        if features_list := features_list or self.features_list:
            # cast to float every feature in the features_list
            return L2GFeatureMatrix(
                _df=self._df.selectExpr(
                    self.fixed_cols
                    + [
                        f"CAST({feature} AS FLOAT) AS {feature}"
                        for feature in features_list
                    ]
                ),
                features_list=features_list,
                with_gold_standard=self.with_gold_standard,
            )
        raise ValueError("features_list cannot be None")

    def persist(self: Self) -> Self:
        """Persist the feature matrix in memory.

        Returns:
            Self: Persisted Dataset
        """
        self._df = self._df.persist()
        return self

    def append_null_features(self, features_list: list[str]) -> L2GFeatureMatrix:
        """Add features from the list that are not already in the dataframe as null columns filled with 0.0.

        Args:
            features_list (list[str]): List of features to check and add if missing

        Returns:
            L2GFeatureMatrix: Updated feature matrix with additional features
        """
        null_features = [
            feature for feature in features_list if feature not in self._df.columns
        ]
        if null_features:
            for feature in null_features:
                self._df = self._df.withColumn(feature, f.lit(0.0))
            self.features_list.extend(null_features)

        return self

    def generate_train_test_split(
        self,
        test_size: float,
        verbose: bool,
        label_encoder: dict[str, int],
        label_col: str,
    ) -> tuple[pd_dataframe, pd_dataframe]:
        """Generate train and test splits for the feature matrix.

        Args:
            test_size (float): Proportion of the test set
            verbose (bool): Whether to print verbose output
            label_encoder (dict[str, int]): Label encoder for the gold standard set
            label_col (str): Column name for the gold standard set

        Returns:
            tuple[pd_dataframe, pd_dataframe]: Train and test splits
        """
        from gentropy.method.l2g.trainer import LocusToGeneTrainer

        data_df = self._df.toPandas()

        # Encode labels in `goldStandardSet` to a numeric value
        data_df[label_col] = data_df[label_col].map(label_encoder)

        # Generate train, held out sets
        return LocusToGeneTrainer.hierarchical_split(
            data_df, test_size=test_size, verbose=verbose
        )

__init__(_df: DataFrame, features_list: list[str] | None = None, with_gold_standard: bool = False, label_col: str = 'goldStandardSet') -> None

Post-initialisation to set the features list. If not provided, all columns except the fixed ones are used.

Parameters:

Name Type Description Default
_df DataFrame

Feature matrix dataset

required
features_list list[str] | None

List of features to use. If None, all possible features are used.

None
with_gold_standard bool

Whether to include the gold standard set in the feature matrix.

False
label_col str

The target column when the feature matrix represents the gold standard

'goldStandardSet'
Source code in src/gentropy/dataset/l2g_feature_matrix.py
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
def __init__(
    self,
    _df: DataFrame,
    features_list: list[str] | None = None,
    with_gold_standard: bool = False,
    label_col: str = "goldStandardSet",
) -> None:
    """Post-initialisation to set the features list. If not provided, all columns except the fixed ones are used.

    Args:
        _df (DataFrame): Feature matrix dataset
        features_list (list[str] | None): List of features to use. If None, all possible features are used.
        with_gold_standard (bool): Whether to include the gold standard set in the feature matrix.
        label_col (str): The target column when the feature matrix represents the gold standard

    """
    self.with_gold_standard = with_gold_standard
    self.fixed_cols = ["studyLocusId", "geneId"]
    if self.with_gold_standard:
        self.label_col = label_col
        self.fixed_cols.append(label_col)
    if "traitFromSourceMappedId" in _df.columns:
        self.fixed_cols.append("traitFromSourceMappedId")

    self.features_list = features_list or [
        col for col in _df.columns if col not in self.fixed_cols
    ]
    self._df = _df.selectExpr(
        self.fixed_cols
        + [
            f"CAST({feature} AS FLOAT) AS {feature}"
            for feature in self.features_list
        ]
    )

append_null_features(features_list: list[str]) -> L2GFeatureMatrix

Add features from the list that are not already in the dataframe as null columns filled with 0.0.

Parameters:

Name Type Description Default
features_list list[str]

List of features to check and add if missing

required

Returns:

Name Type Description
L2GFeatureMatrix L2GFeatureMatrix

Updated feature matrix with additional features

Source code in src/gentropy/dataset/l2g_feature_matrix.py
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
def append_null_features(self, features_list: list[str]) -> L2GFeatureMatrix:
    """Add features from the list that are not already in the dataframe as null columns filled with 0.0.

    Args:
        features_list (list[str]): List of features to check and add if missing

    Returns:
        L2GFeatureMatrix: Updated feature matrix with additional features
    """
    null_features = [
        feature for feature in features_list if feature not in self._df.columns
    ]
    if null_features:
        for feature in null_features:
            self._df = self._df.withColumn(feature, f.lit(0.0))
        self.features_list.extend(null_features)

    return self

calculate_feature_missingness_rate() -> dict[str, float]

Calculate the proportion of missing values in each feature.

Returns:

Type Description
dict[str, float]

dict[str, float]: Dictionary of feature names and their missingness rate.

Raises:

Type Description
ValueError

If no features are found.

Source code in src/gentropy/dataset/l2g_feature_matrix.py
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
def calculate_feature_missingness_rate(
    self: L2GFeatureMatrix,
) -> dict[str, float]:
    """Calculate the proportion of missing values in each feature.

    Returns:
        dict[str, float]: Dictionary of feature names and their missingness rate.

    Raises:
        ValueError: If no features are found.
    """
    total_count = self._df.count()
    if not self.features_list:
        raise ValueError("No features found")

    return {
        feature: (
            self._df.filter(
                (self._df[feature].isNull()) | (self._df[feature] == 0)
            ).count()
            / total_count
        )
        for feature in self.features_list
    }

fill_na(na_value: float = 0.0, subset: list[str] | None = None) -> L2GFeatureMatrix

Fill missing values in a column with a given value.

For features that correspond to gene attributes, missing values are imputed using the mean of the column.

Parameters:

Name Type Description Default
na_value float

Value to replace missing values with. Defaults to 0.0.

0.0
subset list[str] | None

Subset of columns to consider. Defaults to None.

None

Returns:

Name Type Description
L2GFeatureMatrix L2GFeatureMatrix

L2G feature matrix dataset

Source code in src/gentropy/dataset/l2g_feature_matrix.py
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
def fill_na(
    self: L2GFeatureMatrix, na_value: float = 0.0, subset: list[str] | None = None
) -> L2GFeatureMatrix:
    """Fill missing values in a column with a given value.

    For features that correspond to gene attributes, missing values are imputed using the mean of the column.

    Args:
        na_value (float): Value to replace missing values with. Defaults to 0.0.
        subset (list[str] | None): Subset of columns to consider. Defaults to None.

    Returns:
        L2GFeatureMatrix: L2G feature matrix dataset
    """
    cols_to_impute = [
        "proteinGeneCount500kb",
        "geneCount500kb",
    ]
    for col in cols_to_impute:
        if col not in self._df.columns:
            continue
        else:
            self._df = self._df.withColumn(
                col,
                f.when(
                    f.col(col).isNull(),
                    f.mean(f.col(col)).over(Window.partitionBy("studyLocusId")),
                ).otherwise(f.col(col)),
            )
    self._df = self._df.fillna(na_value, subset=subset)
    return self

from_features_list(study_loci_to_annotate: StudyLocus | L2GGoldStandard, features_list: list[str], features_input_loader: L2GFeatureInputLoader) -> L2GFeatureMatrix classmethod

Generate features from the gentropy datasets by calling the feature factory that will instantiate the corresponding features.

Parameters:

Name Type Description Default
study_loci_to_annotate StudyLocus | L2GGoldStandard

Study locus pairs to annotate

required
features_list list[str]

List of feature names to be computed.

required
features_input_loader L2GFeatureInputLoader

Object that contais features input.

required

Returns:

Name Type Description
L2GFeatureMatrix L2GFeatureMatrix

L2G feature matrix dataset

Source code in src/gentropy/dataset/l2g_feature_matrix.py
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
@classmethod
def from_features_list(
    cls: type[L2GFeatureMatrix],
    study_loci_to_annotate: StudyLocus | L2GGoldStandard,
    features_list: list[str],
    features_input_loader: L2GFeatureInputLoader,
) -> L2GFeatureMatrix:
    """Generate features from the gentropy datasets by calling the feature factory that will instantiate the corresponding features.

    Args:
        study_loci_to_annotate (StudyLocus | L2GGoldStandard): Study locus pairs to annotate
        features_list (list[str]): List of feature names to be computed.
        features_input_loader (L2GFeatureInputLoader): Object that contais features input.

    Returns:
        L2GFeatureMatrix: L2G feature matrix dataset
    """
    features_long_df = reduce(
        lambda x, y: x.unionByName(y, allowMissingColumns=True),
        [
            # Compute all features and merge them into a single dataframe
            feature.df
            for feature in FeatureFactory(
                study_loci_to_annotate, features_list
            ).generate_features(features_input_loader)
        ],
    )
    if isinstance(study_loci_to_annotate, L2GGoldStandard):
        return cls(
            _df=convert_from_long_to_wide(
                # Add gold standard set to the feature matrix
                features_long_df.join(
                    study_loci_to_annotate.df.select(
                        "studyLocusId", "geneId", "goldStandardSet"
                    ),
                    ["studyLocusId", "geneId"],
                ),
                ["studyLocusId", "geneId", "goldStandardSet"],
                "featureName",
                "featureValue",
            ),
            with_gold_standard=True,
        )
    return cls(
        _df=convert_from_long_to_wide(
            features_long_df,
            ["studyLocusId", "geneId"],
            "featureName",
            "featureValue",
        ),
        with_gold_standard=False,
    )

generate_train_test_split(test_size: float, verbose: bool, label_encoder: dict[str, int], label_col: str) -> tuple[pd_dataframe, pd_dataframe]

Generate train and test splits for the feature matrix.

Parameters:

Name Type Description Default
test_size float

Proportion of the test set

required
verbose bool

Whether to print verbose output

required
label_encoder dict[str, int]

Label encoder for the gold standard set

required
label_col str

Column name for the gold standard set

required

Returns:

Type Description
tuple[DataFrame, DataFrame]

tuple[pd_dataframe, pd_dataframe]: Train and test splits

Source code in src/gentropy/dataset/l2g_feature_matrix.py
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
def generate_train_test_split(
    self,
    test_size: float,
    verbose: bool,
    label_encoder: dict[str, int],
    label_col: str,
) -> tuple[pd_dataframe, pd_dataframe]:
    """Generate train and test splits for the feature matrix.

    Args:
        test_size (float): Proportion of the test set
        verbose (bool): Whether to print verbose output
        label_encoder (dict[str, int]): Label encoder for the gold standard set
        label_col (str): Column name for the gold standard set

    Returns:
        tuple[pd_dataframe, pd_dataframe]: Train and test splits
    """
    from gentropy.method.l2g.trainer import LocusToGeneTrainer

    data_df = self._df.toPandas()

    # Encode labels in `goldStandardSet` to a numeric value
    data_df[label_col] = data_df[label_col].map(label_encoder)

    # Generate train, held out sets
    return LocusToGeneTrainer.hierarchical_split(
        data_df, test_size=test_size, verbose=verbose
    )

persist() -> Self

Persist the feature matrix in memory.

Returns:

Name Type Description
Self Self

Persisted Dataset

Source code in src/gentropy/dataset/l2g_feature_matrix.py
201
202
203
204
205
206
207
208
def persist(self: Self) -> Self:
    """Persist the feature matrix in memory.

    Returns:
        Self: Persisted Dataset
    """
    self._df = self._df.persist()
    return self

select_features(features_list: list[str] | None) -> L2GFeatureMatrix

Returns a new object with a subset of features from the original feature matrix.

Parameters:

Name Type Description Default
features_list list[str] | None

List of features to select

required

Returns:

Name Type Description
L2GFeatureMatrix L2GFeatureMatrix

L2G feature matrix dataset

Raises:

Type Description
ValueError

If no features have been selected.

Source code in src/gentropy/dataset/l2g_feature_matrix.py
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
def select_features(
    self: L2GFeatureMatrix,
    features_list: list[str] | None,
) -> L2GFeatureMatrix:
    """Returns a new object with a subset of features from the original feature matrix.

    Args:
        features_list (list[str] | None): List of features to select

    Returns:
        L2GFeatureMatrix: L2G feature matrix dataset

    Raises:
        ValueError: If no features have been selected.
    """
    if features_list := features_list or self.features_list:
        # cast to float every feature in the features_list
        return L2GFeatureMatrix(
            _df=self._df.selectExpr(
                self.fixed_cols
                + [
                    f"CAST({feature} AS FLOAT) AS {feature}"
                    for feature in features_list
                ]
            ),
            features_list=features_list,
            with_gold_standard=self.with_gold_standard,
        )
    raise ValueError("features_list cannot be None")

Schema