Skip to content

Colocalisation

gentropy.dataset.colocalisation.Colocalisation dataclass

Bases: Dataset

Colocalisation results for pairs of overlapping study-locus.

Source code in src/gentropy/dataset/colocalisation.py
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
@dataclass
class Colocalisation(Dataset):
    """Colocalisation results for pairs of overlapping study loci."""

    @classmethod
    def get_schema(cls: type[Colocalisation]) -> StructType:
        """Provides the schema for the Colocalisation dataset.

        Returns:
            StructType: Schema for the Colocalisation dataset
        """
        return parse_spark_schema("colocalisation.json")

    def extract_maximum_coloc_probability_per_region_and_gene(
        self: Colocalisation,
        study_locus: StudyLocus,
        study_index: StudyIndex,
        *,
        filter_by_colocalisation_method: str,
        filter_by_qtls: str | list[str] | None = None,
    ) -> DataFrame:
        """Get maximum colocalisation probability for a (studyLocus, gene) window.

        Args:
            study_locus (StudyLocus): Dataset containing study loci to filter the colocalisation dataset on and the geneId linked to the region
            study_index (StudyIndex): Study index to use to get study metadata
            filter_by_colocalisation_method (str): Colocalisation method to keep (required). Must be exactly "ECaviar" or "Coloc"
            filter_by_qtls (str | list[str] | None): QTL type, or list of QTL types, to keep. Values are lower-cased before validation and matching against `rightStudyType`. No QTL filter is applied when None or empty

        Returns:
            DataFrame: table with the maximum colocalisation scores for the provided study loci

        Raises:
            ValueError: if any value in filter_by_qtls is not a valid QTL type, or if filter_by_colocalisation_method is not a supported colocalisation method
        """
        # Function-scope import kept from the original
        # NOTE(review): presumably avoids a circular import - confirm
        from gentropy.colocalisation import ColocalisationStep

        # Valid QTL types are the values of the eQTL Catalogue mapping plus the
        # same values prefixed with "sc"; the base set is built only once.
        base_qtl_types = set(
            EqtlCatalogueStudyIndex.method_to_qtl_type_mapping.values()
        )
        valid_qtls = base_qtl_types | {f"sc{qtl}" for qtl in base_qtl_types}

        if filter_by_qtls:
            # Normalise to a list of lower-cased strings, whether a single string
            # or a list was provided
            filter_by_qtls = (
                list(map(str.lower, [filter_by_qtls]))
                if isinstance(filter_by_qtls, str)
                else list(map(str.lower, filter_by_qtls))
            )
            if any(qtl not in valid_qtls for qtl in filter_by_qtls):
                raise ValueError(f"There are no studies with QTL type {filter_by_qtls}")

        # The method name is validated case-sensitively here, although the row
        # filter below compares it case-insensitively.
        if filter_by_colocalisation_method not in [
            "ECaviar",
            "Coloc",
        ]:  # TODO: Write helper class to retrieve coloc method names
            raise ValueError(
                f"Colocalisation method {filter_by_colocalisation_method} is not supported."
            )

        # Name of the column holding the score of the selected method
        method_colocalisation_metric = ColocalisationStep._get_colocalisation_class(
            filter_by_colocalisation_method
        ).METHOD_METRIC

        coloc_filtering_expr = [
            f.col("rightGeneId").isNotNull(),
            f.lower("colocalisationMethod") == filter_by_colocalisation_method.lower(),
        ]
        if filter_by_qtls:
            coloc_filtering_expr.append(f.lower("rightStudyType").isin(filter_by_qtls))

        filtered_colocalisation = (
            # Bring rightStudyType and rightGeneId in from the study index
            self.append_study_metadata(
                study_locus,
                study_index,
                metadata_cols=["geneId", "studyType"],
                colocalisation_side="right",
            )
            # Keep only rows whose right-hand gene is NOT null (i.e. drop right-hand
            # study loci without a gene - presumably GWAS ones), and apply the
            # method and optional QTL type filters, all combined with AND
            .filter(reduce(lambda a, b: a & b, coloc_filtering_expr))
            # Inner join: keep only rows whose left study locus is in `study_locus`
            .join(
                study_locus.df.selectExpr("studyLocusId as leftStudyLocusId"),
                "leftStudyLocusId",
            )
        )

        # Rename to the generic (studyLocusId, geneId) keys and keep, per key pair,
        # the record with the highest value of the method metric
        return get_record_with_maximum_value(
            filtered_colocalisation.withColumnRenamed(
                "leftStudyLocusId", "studyLocusId"
            ).withColumnRenamed("rightGeneId", "geneId"),
            ["studyLocusId", "geneId"],
            method_colocalisation_metric,
        )

    def append_study_metadata(
        self: Colocalisation,
        study_locus: StudyLocus,
        study_index: StudyIndex,
        *,
        metadata_cols: list[str],
        colocalisation_side: str = "right",
    ) -> DataFrame:
        """Appends metadata from the study to the requested side of the colocalisation dataset.

        Appended columns are named `<side><ColumnName>` (first letter upper-cased),
        e.g. `rightGeneId`. `studyId` is always appended in addition to the
        requested columns.

        Args:
            study_locus (StudyLocus): Dataset containing study loci that links the colocalisation dataset and the study index via the studyId
            study_index (StudyIndex): Dataset containing study index that contains the metadata
            metadata_cols (list[str]): List of study columns to append
            colocalisation_side (str): Which side of the colocalisation dataset to append metadata to. Must be either 'right' or 'left'

        Returns:
            DataFrame: Colocalisation dataset with appended metadata of the study from the requested side

        Raises:
            ValueError: if colocalisation_side is not 'right' or 'left'
        """
        metadata_cols = ["studyId", *metadata_cols]
        if colocalisation_side not in ["right", "left"]:
            raise ValueError(
                f"colocalisation_side must be either 'right' or 'left', got {colocalisation_side}"
            )

        # Map every study locus to the metadata of its study
        study_loci_w_metadata = (
            study_locus.df.select("studyLocusId", "studyId")
            .join(
                f.broadcast(study_index.df.select("studyId", *metadata_cols)),
                "studyId",
            )
            .distinct()
        )
        coloc_df = (
            # drop the existing `rightStudyType` when it is about to be re-appended,
            # so the output does not end up with two columns of that name
            self.df.drop("rightStudyType")
            if "studyType" in metadata_cols and colocalisation_side == "right"
            else self.df
        )
        return (
            # Append that to the respective side of the colocalisation dataset;
            # the "right" join type keeps every row of the colocalisation dataset
            study_loci_w_metadata.selectExpr(
                f"studyLocusId as {colocalisation_side}StudyLocusId",
                *[
                    f"{col} as {colocalisation_side}{col[0].upper() + col[1:]}"
                    for col in metadata_cols
                ],
            ).join(coloc_df, f"{colocalisation_side}StudyLocusId", "right")
        )

append_study_metadata(study_locus: StudyLocus, study_index: StudyIndex, *, metadata_cols: list[str], colocalisation_side: str = 'right') -> DataFrame

Appends metadata from the study to the requested side of the colocalisation dataset.

Parameters:

Name Type Description Default
study_locus StudyLocus

Dataset containing study loci that links the colocalisation dataset and the study index via the studyId

required
study_index StudyIndex

Dataset containing study index that contains the metadata

required
metadata_cols list[str]

List of study columns to append

required
colocalisation_side str

Which side of the colocalisation dataset to append metadata to. Must be either 'right' or 'left'

'right'

Returns:

Name Type Description
DataFrame DataFrame

Colocalisation dataset with appended metadata of the study from the requested side

Raises:

Type Description
ValueError

if colocalisation_side is not 'right' or 'left'

Source code in src/gentropy/dataset/colocalisation.py
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
def append_study_metadata(
    self: Colocalisation,
    study_locus: StudyLocus,
    study_index: StudyIndex,
    *,
    metadata_cols: list[str],
    colocalisation_side: str = "right",
) -> DataFrame:
    """Attach study-level metadata to one side of the colocalisation dataset.

    Each appended column is named `<side><ColumnName>` with its first letter
    upper-cased (e.g. `rightGeneId`). `studyId` is always appended on top of
    the requested columns.

    Args:
        study_locus (StudyLocus): Study loci linking the colocalisation dataset to the study index through their studyId
        study_index (StudyIndex): Study index holding the metadata columns
        metadata_cols (list[str]): Study columns to append
        colocalisation_side (str): Side of the colocalisation dataset to annotate, either 'right' or 'left'

    Returns:
        DataFrame: Colocalisation dataset extended with the study metadata of the requested side

    Raises:
        ValueError: if colocalisation_side is not 'right' or 'left'
    """
    selected_cols = ["studyId", *metadata_cols]
    side = colocalisation_side
    if side not in ("right", "left"):
        raise ValueError(
            f"colocalisation_side must be either 'right' or 'left', got {colocalisation_side}"
        )

    # One row per (study locus, study metadata) combination
    locus_to_study = study_locus.df.select("studyLocusId", "studyId")
    study_metadata = f.broadcast(study_index.df.select("studyId", *selected_cols))
    loci_with_metadata = locus_to_study.join(study_metadata, "studyId").distinct()

    # The existing `rightStudyType` is removed when it is about to be re-appended
    if side == "right" and "studyType" in selected_cols:
        coloc_df = self.df.drop("rightStudyType")
    else:
        coloc_df = self.df

    # Prefix every column with the side, e.g. `geneId` -> `rightGeneId`
    renamed_exprs = [f"studyLocusId as {side}StudyLocusId"]
    renamed_exprs.extend(
        f"{name} as {side}{name[0].upper() + name[1:]}" for name in selected_cols
    )

    # The "right" join type keeps every row of the colocalisation dataset
    side_metadata = loci_with_metadata.selectExpr(*renamed_exprs)
    return side_metadata.join(coloc_df, f"{side}StudyLocusId", "right")

extract_maximum_coloc_probability_per_region_and_gene(study_locus: StudyLocus, study_index: StudyIndex, *, filter_by_colocalisation_method: str, filter_by_qtls: str | list[str] | None = None) -> DataFrame

Get maximum colocalisation probability for a (studyLocus, gene) window.

Parameters:

Name Type Description Default
study_locus StudyLocus

Dataset containing study loci to filter the colocalisation dataset on and the geneId linked to the region

required
study_index StudyIndex

Study index to use to get study metadata

required
filter_by_colocalisation_method str

Colocalisation method to filter the colocalisation dataset on. Required; must be either "ECaviar" or "Coloc"

required
filter_by_qtls str | list[str] | None

optional filter to apply on the colocalisation dataset

None

Returns:

Name Type Description
DataFrame DataFrame

table with the maximum colocalisation scores for the provided study loci

Raises:

Type Description
ValueError

if any value in filter_by_qtls is not a valid QTL type, or if filter_by_colocalisation_method is not a supported colocalisation method

Source code in src/gentropy/dataset/colocalisation.py
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
def extract_maximum_coloc_probability_per_region_and_gene(
    self: Colocalisation,
    study_locus: StudyLocus,
    study_index: StudyIndex,
    *,
    filter_by_colocalisation_method: str,
    filter_by_qtls: str | list[str] | None = None,
) -> DataFrame:
    """Get maximum colocalisation probability for a (studyLocus, gene) window.

    Args:
        study_locus (StudyLocus): Dataset containing study loci to filter the colocalisation dataset on and the geneId linked to the region
        study_index (StudyIndex): Study index to use to get study metadata
        filter_by_colocalisation_method (str): Colocalisation method to keep (required). Must be exactly "ECaviar" or "Coloc"
        filter_by_qtls (str | list[str] | None): QTL type, or list of QTL types, to keep. Values are lower-cased before validation and matching against `rightStudyType`. No QTL filter is applied when None or empty

    Returns:
        DataFrame: table with the maximum colocalisation scores for the provided study loci

    Raises:
        ValueError: if any value in filter_by_qtls is not a valid QTL type, or if filter_by_colocalisation_method is not a supported colocalisation method
    """
    # Function-scope import kept from the original
    # NOTE(review): presumably avoids a circular import - confirm
    from gentropy.colocalisation import ColocalisationStep

    # Valid QTL types are the values of the eQTL Catalogue mapping plus the
    # same values prefixed with "sc"; the base set is built only once.
    base_qtl_types = set(EqtlCatalogueStudyIndex.method_to_qtl_type_mapping.values())
    valid_qtls = base_qtl_types | {f"sc{qtl}" for qtl in base_qtl_types}

    if filter_by_qtls:
        # Normalise to a list of lower-cased strings, whether a single string
        # or a list was provided
        filter_by_qtls = (
            list(map(str.lower, [filter_by_qtls]))
            if isinstance(filter_by_qtls, str)
            else list(map(str.lower, filter_by_qtls))
        )
        if any(qtl not in valid_qtls for qtl in filter_by_qtls):
            raise ValueError(f"There are no studies with QTL type {filter_by_qtls}")

    # The method name is validated case-sensitively here, although the row
    # filter below compares it case-insensitively.
    if filter_by_colocalisation_method not in [
        "ECaviar",
        "Coloc",
    ]:  # TODO: Write helper class to retrieve coloc method names
        raise ValueError(
            f"Colocalisation method {filter_by_colocalisation_method} is not supported."
        )

    # Name of the column holding the score of the selected method
    method_colocalisation_metric = ColocalisationStep._get_colocalisation_class(
        filter_by_colocalisation_method
    ).METHOD_METRIC

    coloc_filtering_expr = [
        f.col("rightGeneId").isNotNull(),
        f.lower("colocalisationMethod") == filter_by_colocalisation_method.lower(),
    ]
    if filter_by_qtls:
        coloc_filtering_expr.append(f.lower("rightStudyType").isin(filter_by_qtls))

    filtered_colocalisation = (
        # Bring rightStudyType and rightGeneId in from the study index
        self.append_study_metadata(
            study_locus,
            study_index,
            metadata_cols=["geneId", "studyType"],
            colocalisation_side="right",
        )
        # Keep only rows whose right-hand gene is NOT null (i.e. drop right-hand
        # study loci without a gene - presumably GWAS ones), and apply the
        # method and optional QTL type filters, all combined with AND
        .filter(reduce(lambda a, b: a & b, coloc_filtering_expr))
        # Inner join: keep only rows whose left study locus is in `study_locus`
        .join(
            study_locus.df.selectExpr("studyLocusId as leftStudyLocusId"),
            "leftStudyLocusId",
        )
    )

    # Rename to the generic (studyLocusId, geneId) keys and keep, per key pair,
    # the record with the highest value of the method metric
    return get_record_with_maximum_value(
        filtered_colocalisation.withColumnRenamed(
            "leftStudyLocusId", "studyLocusId"
        ).withColumnRenamed("rightGeneId", "geneId"),
        ["studyLocusId", "geneId"],
        method_colocalisation_metric,
    )

get_schema() -> StructType classmethod

Provides the schema for the Colocalisation dataset.

Returns:

Name Type Description
StructType StructType

Schema for the Colocalisation dataset

Source code in src/gentropy/dataset/colocalisation.py
30
31
32
33
34
35
36
37
@classmethod
def get_schema(cls: type[Colocalisation]) -> StructType:
    """Return the Spark schema describing the Colocalisation dataset.

    Returns:
        StructType: Schema parsed from the `colocalisation.json` definition
    """
    schema_file = "colocalisation.json"
    return parse_spark_schema(schema_file)

Schema

root
 |-- leftStudyLocusId: string (nullable = false)
 |-- rightStudyLocusId: string (nullable = false)
 |-- rightStudyType: string (nullable = false)
 |-- chromosome: string (nullable = false)
 |-- colocalisationMethod: string (nullable = false)
 |-- numberColocalisingVariants: long (nullable = false)
 |-- h0: double (nullable = true)
 |-- h1: double (nullable = true)
 |-- h2: double (nullable = true)
 |-- h3: double (nullable = true)
 |-- h4: double (nullable = true)
 |-- clpp: double (nullable = true)
 |-- betaRatioSignAverage: double (nullable = true)