Skip to content

L2G Feature Matrix

gentropy.dataset.l2g_feature_matrix.L2GFeatureMatrix dataclass

Bases: Dataset

Dataset with features for Locus to Gene prediction.

Attributes:

Name Type Description
features_list list[str] | None

List of features to use. If None, all possible features are used.

Source code in src/gentropy/dataset/l2g_feature_matrix.py
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
@dataclass
class L2GFeatureMatrix(Dataset):
    """Dataset with features for Locus to Gene prediction.

    Attributes:
        features_list (list[str] | None): List of features to use. If None, all possible features are used.
    """

    features_list: list[str] | None = None

    # Identifier/label columns that are never treated as features. Deliberately
    # left un-annotated so the dataclass machinery does not turn it into a field.
    _FIXED_COLS = ("studyLocusId", "geneId", "goldStandardSet")

    def __post_init__(self: L2GFeatureMatrix) -> None:
        """Post-initialisation to set the features list. If not provided, all columns except the fixed ones are used."""
        self.features_list = self.features_list or [
            col for col in self._df.columns if col not in self._FIXED_COLS
        ]

    @classmethod
    def generate_features(
        cls: Type[L2GFeatureMatrix],
        features_list: list[str],
        credible_set: StudyLocus,
        study_index: StudyIndex,
        variant_gene: V2G,
        colocalisation: Colocalisation,
    ) -> L2GFeatureMatrix:
        """Generate features from the gentropy datasets.

        The colocalisation, TSS distance and VEP feature frames are always
        requested, stacked with `unionByName` and pivoted to one column per
        `featureName`. No check is made that the resulting frame has rows.

        Args:
            features_list (list[str]): List of features to generate
            credible_set (StudyLocus): Credible set dataset
            study_index (StudyIndex): Study index dataset
            variant_gene (V2G): Variant to gene dataset
            colocalisation (Colocalisation): Colocalisation dataset

        Returns:
            L2GFeatureMatrix: L2G feature matrix dataset
        """
        # The list is a fixed three-element literal, so it can never be empty;
        # the former `if ... else: raise ValueError` guard was unreachable.
        features_dfs = [
            ColocalisationFactory._get_max_coloc_per_credible_set(
                colocalisation,
                credible_set,
                study_index,
            ).df,
            StudyLocusFactory._get_tss_distance_features(credible_set, variant_gene).df,
            StudyLocusFactory._get_vep_features(credible_set, variant_gene).df,
        ]
        fm = reduce(
            lambda x, y: x.unionByName(y),
            features_dfs,
        )

        return cls(
            _df=convert_from_long_to_wide(
                fm, ["studyLocusId", "geneId"], "featureName", "featureValue"
            ),
            _schema=cls.get_schema(),
            features_list=features_list,
        )

    @classmethod
    def get_schema(cls: type[L2GFeatureMatrix]) -> StructType:
        """Provides the schema for the L2GFeatureMatrix dataset.

        Returns:
            StructType: Schema for the L2GFeatureMatrix dataset
        """
        return parse_spark_schema("l2g_feature_matrix.json")

    def calculate_feature_missingness_rate(
        self: L2GFeatureMatrix,
    ) -> dict[str, float]:
        """Calculate the proportion of missing values in each feature.

        A value counts as missing when it is null or equal to 0.

        Returns:
            dict[str, float]: Dictionary of feature names and their missingness rate.

        Raises:
            ValueError: If no features are found or the feature matrix has no rows.
        """
        # Validate the cheap precondition before triggering a Spark count.
        if not self.features_list:
            raise ValueError("No features found")

        total_count = self._df.count()
        if total_count == 0:
            # A rate is undefined without rows; fail clearly instead of
            # surfacing a ZeroDivisionError from the division below.
            raise ValueError(
                "Cannot calculate missingness rate of an empty feature matrix"
            )

        return {
            feature: (
                self._df.filter(
                    (self._df[feature].isNull()) | (self._df[feature] == 0)
                ).count()
                / total_count
            )
            for feature in self.features_list
        }

    def fill_na(
        self: L2GFeatureMatrix, value: float = 0.0, subset: list[str] | None = None
    ) -> L2GFeatureMatrix:
        """Fill missing values in the feature matrix with a given value.

        The dataset is modified in place and returned to allow chaining.

        Args:
            value (float): Value to replace missing values with. Defaults to 0.0.
            subset (list[str] | None): Subset of columns to consider. Defaults to None.

        Returns:
            L2GFeatureMatrix: L2G feature matrix dataset
        """
        self.df = self._df.fillna(value, subset=subset)
        return self

    def select_features(
        self: L2GFeatureMatrix, features_list: list[str] | None
    ) -> L2GFeatureMatrix:
        """Select a subset of features from the feature matrix.

        The dataset is modified in place: the fixed columns plus the requested
        features are kept, and `features_list` is updated to match so that it
        never refers to columns that were dropped.

        Args:
            features_list (list[str] | None): List of features to select. If None or empty, the current `features_list` is used.

        Returns:
            L2GFeatureMatrix: L2G feature matrix dataset
        """
        # Copy so that the stored list is independent of the caller's list.
        selected = list(features_list or self.features_list or [])
        self.df = self._df.select([*self._FIXED_COLS, *selected])
        self.features_list = selected
        return self

    def train_test_split(
        self: L2GFeatureMatrix, fraction: float
    ) -> tuple[L2GFeatureMatrix, L2GFeatureMatrix]:
        """Split the dataset into training and test sets.

        The split uses a fixed seed (42). Both halves inherit a copy of this
        dataset's `features_list` instead of re-inferring it from the columns.

        Args:
            fraction (float): Fraction of the dataset to use for training

        Returns:
            tuple[L2GFeatureMatrix, L2GFeatureMatrix]: Training and test datasets
        """
        train, test = self._df.randomSplit([fraction, 1 - fraction], seed=42)
        return (
            L2GFeatureMatrix(
                _df=train,
                _schema=L2GFeatureMatrix.get_schema(),
                features_list=list(self.features_list) if self.features_list else None,
            ),
            L2GFeatureMatrix(
                _df=test,
                _schema=L2GFeatureMatrix.get_schema(),
                features_list=list(self.features_list) if self.features_list else None,
            ),
        )

calculate_feature_missingness_rate() -> dict[str, float]

Calculate the proportion of missing values in each feature.

Returns:

Type Description
dict[str, float]

Dictionary of feature names and their missingness rate.

Raises:

Type Description
ValueError

If no features are found.

Source code in src/gentropy/dataset/l2g_feature_matrix.py
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
def calculate_feature_missingness_rate(
    self: L2GFeatureMatrix,
) -> dict[str, float]:
    """Calculate the proportion of missing values in each feature.

    A value counts as missing when it is null or equal to 0.

    Returns:
        dict[str, float]: Dictionary of feature names and their missingness rate.

    Raises:
        ValueError: If no features are found or the feature matrix has no rows.
    """
    # Validate the cheap precondition before triggering a Spark count.
    if not self.features_list:
        raise ValueError("No features found")

    total_count = self._df.count()
    if total_count == 0:
        # A rate is undefined without rows; fail clearly instead of
        # surfacing a ZeroDivisionError from the division below.
        raise ValueError(
            "Cannot calculate missingness rate of an empty feature matrix"
        )

    return {
        feature: (
            self._df.filter(
                (self._df[feature].isNull()) | (self._df[feature] == 0)
            ).count()
            / total_count
        )
        for feature in self.features_list
    }

fill_na(value: float = 0.0, subset: list[str] | None = None) -> L2GFeatureMatrix

Fill missing values in a column with a given value.

Parameters:

Name Type Description Default
value float

Value to replace missing values with. Defaults to 0.0.

0.0
subset list[str] | None

Subset of columns to consider. Defaults to None.

None

Returns:

Name Type Description
L2GFeatureMatrix L2GFeatureMatrix

L2G feature matrix dataset

Source code in src/gentropy/dataset/l2g_feature_matrix.py
123
124
125
126
127
128
129
130
131
132
133
134
135
136
def fill_na(
    self: L2GFeatureMatrix, value: float = 0.0, subset: list[str] | None = None
) -> L2GFeatureMatrix:
    """Replace nulls in the feature matrix with a constant.

    The dataset is updated in place and handed back so calls can be chained.

    Args:
        value (float): Replacement for missing values. Defaults to 0.0.
        subset (list[str] | None): Columns to restrict the fill to. Defaults to None.

    Returns:
        L2GFeatureMatrix: This same dataset, after filling
    """
    filled = self._df.fillna(value, subset=subset)
    self.df = filled
    return self

generate_features(features_list: list[str], credible_set: StudyLocus, study_index: StudyIndex, variant_gene: V2G, colocalisation: Colocalisation) -> L2GFeatureMatrix classmethod

Generate features from the gentropy datasets.

Parameters:

Name Type Description Default
features_list list[str]

List of features to generate

required
credible_set StudyLocus

Credible set dataset

required
study_index StudyIndex

Study index dataset

required
variant_gene V2G

Variant to gene dataset

required
colocalisation Colocalisation

Colocalisation dataset

required

Returns:

Name Type Description
L2GFeatureMatrix L2GFeatureMatrix

L2G feature matrix dataset

Raises:

Type Description
ValueError

If the feature matrix is empty

Source code in src/gentropy/dataset/l2g_feature_matrix.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
@classmethod
def generate_features(
    cls: Type[L2GFeatureMatrix],
    features_list: list[str],
    credible_set: StudyLocus,
    study_index: StudyIndex,
    variant_gene: V2G,
    colocalisation: Colocalisation,
) -> L2GFeatureMatrix:
    """Generate features from the gentropy datasets.

    The colocalisation, TSS distance and VEP feature frames are always
    requested, stacked with `unionByName` and pivoted to one column per
    `featureName`. No check is made that the resulting frame has rows.

    Args:
        features_list (list[str]): List of features to generate
        credible_set (StudyLocus): Credible set dataset
        study_index (StudyIndex): Study index dataset
        variant_gene (V2G): Variant to gene dataset
        colocalisation (Colocalisation): Colocalisation dataset

    Returns:
        L2GFeatureMatrix: L2G feature matrix dataset
    """
    # The list is a fixed three-element literal, so it can never be empty;
    # the former `if ... else: raise ValueError` guard was unreachable.
    features_dfs = [
        ColocalisationFactory._get_max_coloc_per_credible_set(
            colocalisation,
            credible_set,
            study_index,
        ).df,
        StudyLocusFactory._get_tss_distance_features(credible_set, variant_gene).df,
        StudyLocusFactory._get_vep_features(credible_set, variant_gene).df,
    ]
    fm = reduce(
        lambda x, y: x.unionByName(y),
        features_dfs,
    )

    return cls(
        _df=convert_from_long_to_wide(
            fm, ["studyLocusId", "geneId"], "featureName", "featureValue"
        ),
        _schema=cls.get_schema(),
        features_list=features_list,
    )

get_schema() -> StructType classmethod

Provides the schema for the L2GFeatureMatrix dataset.

Returns:

Name Type Description
StructType StructType

Schema for the L2GFeatureMatrix dataset

Source code in src/gentropy/dataset/l2g_feature_matrix.py
89
90
91
92
93
94
95
96
@classmethod
def get_schema(cls: type[L2GFeatureMatrix]) -> StructType:
    """Return the Spark schema expected for an L2GFeatureMatrix dataset.

    Returns:
        StructType: Schema parsed from the `l2g_feature_matrix.json` definition
    """
    schema_file = "l2g_feature_matrix.json"
    return parse_spark_schema(schema_file)

select_features(features_list: list[str] | None) -> L2GFeatureMatrix

Select a subset of features from the feature matrix.

Parameters:

Name Type Description Default
features_list list[str] | None

List of features to select

required

Returns:

Name Type Description
L2GFeatureMatrix L2GFeatureMatrix

L2G feature matrix dataset

Source code in src/gentropy/dataset/l2g_feature_matrix.py
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
def select_features(
    self: L2GFeatureMatrix, features_list: list[str] | None
) -> L2GFeatureMatrix:
    """Select a subset of features from the feature matrix.

    The dataset is modified in place: the fixed columns plus the requested
    features are kept, and `features_list` is updated to match so that it
    never refers to columns that were dropped.

    Args:
        features_list (list[str] | None): List of features to select. If None or empty, the current `features_list` is used.

    Returns:
        L2GFeatureMatrix: L2G feature matrix dataset
    """
    # Copy so that the stored list is independent of the caller's list; the
    # trailing `or []` keeps the concatenation valid when both are None.
    selected = list(features_list or self.features_list or [])
    fixed_cols = ["studyLocusId", "geneId", "goldStandardSet"]
    self.df = self._df.select(fixed_cols + selected)
    # Keep the bookkeeping in sync with the columns that actually remain.
    self.features_list = selected
    return self

train_test_split(fraction: float) -> tuple[L2GFeatureMatrix, L2GFeatureMatrix]

Split the dataset into training and test sets.

Parameters:

Name Type Description Default
fraction float

Fraction of the dataset to use for training

required

Returns:

Type Description
tuple[L2GFeatureMatrix, L2GFeatureMatrix]

Training and test datasets

Source code in src/gentropy/dataset/l2g_feature_matrix.py
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
def train_test_split(
    self: L2GFeatureMatrix, fraction: float
) -> tuple[L2GFeatureMatrix, L2GFeatureMatrix]:
    """Split the dataset into training and test sets.

    The split uses a fixed seed (42). Both halves inherit a copy of this
    dataset's `features_list` instead of re-inferring it from the columns.

    Args:
        fraction (float): Fraction of the dataset to use for training

    Returns:
        tuple[L2GFeatureMatrix, L2GFeatureMatrix]: Training and test datasets
    """
    train, test = self._df.randomSplit([fraction, 1 - fraction], seed=42)
    return (
        L2GFeatureMatrix(
            _df=train,
            _schema=L2GFeatureMatrix.get_schema(),
            # Independent copy per split so later edits do not leak across them.
            features_list=list(self.features_list) if self.features_list else None,
        ),
        L2GFeatureMatrix(
            _df=test,
            _schema=L2GFeatureMatrix.get_schema(),
            features_list=list(self.features_list) if self.features_list else None,
        ),
    )

Schema

root
 |-- studyLocusId: long (nullable = false)
 |-- geneId: string (nullable = false)
 |-- goldStandardSet: string (nullable = true)
 |-- distanceTssMean: float (nullable = true)
 |-- distanceTssMinimum: float (nullable = true)
 |-- vepMaximumNeighborhood: float (nullable = true)
 |-- vepMaximum: float (nullable = true)
 |-- vepMeanNeighborhood: float (nullable = true)
 |-- vepMean: float (nullable = true)
 |-- eqtlColocClppMaximum: float (nullable = true)
 |-- eqtlColocClppMaximumNeighborhood: float (nullable = true)
 |-- eqtlColocLlrMaximum: float (nullable = true)
 |-- eqtlColocLlrMaximumNeighborhood: float (nullable = true)
 |-- pqtlColocClppMaximum: float (nullable = true)
 |-- pqtlColocClppMaximumNeighborhood: float (nullable = true)
 |-- pqtlColocLlrMaximum: float (nullable = true)
 |-- pqtlColocLlrMaximumNeighborhood: float (nullable = true)
 |-- sqtlColocClppMaximum: float (nullable = true)
 |-- sqtlColocClppMaximumNeighborhood: float (nullable = true)
 |-- sqtlColocLlrMaximum: float (nullable = true)
 |-- sqtlColocLlrMaximumNeighborhood: float (nullable = true)
 |-- tuqtlColocClppMaximum: float (nullable = true)
 |-- tuqtlColocClppMaximumNeighborhood: float (nullable = true)
 |-- tuqtlColocLlrMaximum: float (nullable = true)
 |-- tuqtlColocLlrMaximumNeighborhood: float (nullable = true)