Skip to content

Study Index

gentropy.datasource.eqtl_catalogue.study_index.EqtlCatalogueStudyIndex

Study index dataset from eQTL Catalogue.

We extract study-level metadata from eQTL Catalogue's fine mapping results. All available studies are listed at https://www.ebi.ac.uk/eqtl/Studies/.

One study from the eQTL Catalogue clusters together all the molecular QTLs (mQTLs) that were found:

- in the same publication (e.g. Alasoo_2018)
- in the same cell type or tissue (e.g. monocytes)
- and for the same measured molecular trait (e.g. ENSG00000141510)
Source code in src/gentropy/datasource/eqtl_catalogue/study_index.py
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
class EqtlCatalogueStudyIndex:
    """Study index dataset from eQTL Catalogue.

    Study level metadata is extracted from the fine mapping results of the eQTL Catalogue. The list of available studies is published [here](https://www.ebi.ac.uk/eqtl/Studies/).

    A single eQTL Catalogue study groups all the molecular QTLs (mQTLs) that share:

        - the publication they were reported in (e.g. Alasoo_2018)
        - the cell type or tissue they were found in (e.g. monocytes)
        - the measured molecular trait (e.g. ENSG00000141510)

    """

    # Schema applied to the raw metadata table when it is loaded into Spark;
    # every field is declared nullable.
    raw_studies_metadata_schema: StructType = StructType(
        [
            StructField("study_id", StringType(), True),
            StructField("dataset_id", StringType(), True),
            StructField("study_label", StringType(), True),
            StructField("sample_group", StringType(), True),
            StructField("tissue_id", StringType(), True),
            StructField("tissue_label", StringType(), True),
            StructField("condition_label", StringType(), True),
            StructField("sample_size", IntegerType(), True),
            StructField("quant_method", StringType(), True),
            StructField("pmid", StringType(), True),
        ]
    )
    # The URL points at a specific commit of the eQTL-Catalogue-resources repository.
    raw_studies_metadata_path = "https://raw.githubusercontent.com/eQTL-Catalogue/eQTL-Catalogue-resources/092e01a9601feb404f1c88f86311b43b907a88f6/data_tables/dataset_metadata_upcoming.tsv"

    @classmethod
    def _identify_study_type(
        cls: type[EqtlCatalogueStudyIndex],
        quantification_method_col: Column,
        biosample_col: Column,
    ) -> Column:
        """Derive the study type from the trait quantification method and the biosample of the measurement.

        The quantification method determines which kind of molecular QTL the study reports.
        The biosample is used to tell bulk and single cell studies apart: when its identifier starts with "CL", the QTL type is prefixed with "sc".
        Quantification methods missing from the internal mapping resolve to a null study type.

        Args:
            quantification_method_col (Column): column with the label of the method to quantify the trait. Available methods are [here](https://www.ebi.ac.uk/eqtl/Methods/)
            biosample_col (Column): column with the label of the biosample where the trait was measured.

        Returns:
            Column: The study type.

        Examples:
            >>> df = spark.createDataFrame([("ge", "CL_1"), ("leafcutter", "UBERON_2"), ("tx", "EFO_3")], ["quant_method", "tissue_id"])
            >>> df.withColumn("study_type", EqtlCatalogueStudyIndex._identify_study_type(f.col("quant_method"), f.col("tissue_id"))).show()
            +------------+---------+----------+
            |quant_method|tissue_id|study_type|
            +------------+---------+----------+
            |          ge|     CL_1|    sceqtl|
            |  leafcutter| UBERON_2|      sqtl|
            |          tx|    EFO_3|      eqtl|
            +------------+---------+----------+
            <BLANKLINE>
        """
        qtl_type_by_method = {
            "ge": "eqtl",
            "exon": "eqtl",
            "tx": "eqtl",
            "microarray": "eqtl",
            "leafcutter": "sqtl",
            "aptamer": "pqtl",
            "txrev": "tuqtl",
        }
        # create_map takes a flat key1, value1, key2, value2, ... sequence of columns,
        # so the (method, type) pairs are flattened before being turned into literals.
        map_entries = [
            f.lit(token) for token in chain.from_iterable(qtl_type_by_method.items())
        ]
        qtl_type = f.create_map(*map_entries)[quantification_method_col]
        has_cl_prefix = biosample_col.startswith("CL")
        prefixed_qtl_type = f.concat(f.lit("sc"), qtl_type)
        return f.when(has_cl_prefix, prefixed_qtl_type).otherwise(qtl_type)

    @classmethod
    def get_studies_of_interest(
        cls: type[EqtlCatalogueStudyIndex],
        studies_metadata: DataFrame,
    ) -> list[str]:
        """List the distinct dataset identifiers present in the studies metadata.

        Args:
            studies_metadata (DataFrame): raw studies metadata filtered with studies of interest.

        Returns:
            list[str]: QTD IDs defining the studies of interest for ingestion.
        """
        distinct_dataset_ids = studies_metadata.select("dataset_id").distinct()
        # The distinct IDs are brought to the driver through pandas and returned as a plain list.
        return distinct_dataset_ids.toPandas()["dataset_id"].tolist()

    @classmethod
    def from_susie_results(
        cls: type[EqtlCatalogueStudyIndex],
        processed_finemapping_df: DataFrame,
    ) -> StudyIndex:
        """Build the study index from fine mapping results that already carry study metadata.

        Only the columns that belong to the StudyIndex schema and are present in the input are kept; the resulting rows are deduplicated.

        Args:
            processed_finemapping_df (DataFrame): processed fine mapping results with study metadata.

        Returns:
            StudyIndex: eQTL Catalogue study index dataset derived from the selected SuSIE results.
        """
        study_index_schema = StudyIndex.get_schema()
        available_cols = set(processed_finemapping_df.columns)
        # Iterating over the schema keeps the selected columns in schema order.
        study_index_cols = [
            field.name
            for field in study_index_schema.fields
            if field.name in available_cols
        ]
        study_level_rows = processed_finemapping_df.select(study_index_cols).distinct()
        return StudyIndex(_df=study_level_rows, _schema=study_index_schema)

    @classmethod
    def read_studies_from_source(
        cls: type[EqtlCatalogueStudyIndex],
        session: Session,
        mqtl_quantification_methods_blacklist: list[str],
    ) -> DataFrame:
        """Load the raw studies metadata table and drop the unwanted quantification methods.

        The table is read with pandas from `raw_studies_metadata_path`, converted to a Spark DataFrame using `raw_studies_metadata_schema`, and filtered on `quant_method`.

        Args:
            session (Session): Spark session.
            mqtl_quantification_methods_blacklist (list[str]): Molecular trait quantification methods that we don't want to ingest. Available options in https://github.com/eQTL-Catalogue/eQTL-Catalogue-resources/blob/master/data_tables/dataset_metadata.tsv

        Returns:
            DataFrame: raw studies metadata.
        """
        # Aliases `iteritems` to `items` on every pandas DataFrame (process-wide).
        # NOTE(review): presumably needed because the pandas -> Spark conversion
        # still calls `iteritems` on some version combinations - confirm.
        pd.DataFrame.iteritems = pd.DataFrame.items
        raw_metadata = pd.read_csv(cls.raw_studies_metadata_path, sep="\t")
        studies = session.spark.createDataFrame(
            raw_metadata,
            schema=cls.raw_studies_metadata_schema,
        )
        is_blacklisted = f.col("quant_method").isin(
            mqtl_quantification_methods_blacklist
        )
        return studies.filter(~is_blacklisted)

from_susie_results(processed_finemapping_df: DataFrame) -> StudyIndex classmethod

Ingest study level metadata from eQTL Catalogue.

Parameters:

Name Type Description Default
processed_finemapping_df DataFrame

processed fine mapping results with study metadata.

required

Returns:

Name Type Description
StudyIndex StudyIndex

eQTL Catalogue study index dataset derived from the selected SuSIE results.

Source code in src/gentropy/datasource/eqtl_catalogue/study_index.py
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
@classmethod
def from_susie_results(
    cls: type[EqtlCatalogueStudyIndex],
    processed_finemapping_df: DataFrame,
) -> StudyIndex:
    """Build the study index from fine mapping results that already carry study metadata.

    Only the columns that belong to the StudyIndex schema and are present in the input are kept; the resulting rows are deduplicated.

    Args:
        processed_finemapping_df (DataFrame): processed fine mapping results with study metadata.

    Returns:
        StudyIndex: eQTL Catalogue study index dataset derived from the selected SuSIE results.
    """
    study_index_schema = StudyIndex.get_schema()
    available_cols = set(processed_finemapping_df.columns)
    # Iterating over the schema keeps the selected columns in schema order.
    study_index_cols = [
        field.name
        for field in study_index_schema.fields
        if field.name in available_cols
    ]
    study_level_rows = processed_finemapping_df.select(study_index_cols).distinct()
    return StudyIndex(_df=study_level_rows, _schema=study_index_schema)

get_studies_of_interest(studies_metadata: DataFrame) -> list[str] classmethod

Filter studies of interest from the raw studies metadata.

Parameters:

Name Type Description Default
studies_metadata DataFrame

Raw studies metadata, already filtered to the studies of interest.

required

Returns:

Type Description
list[str]

QTD IDs defining the studies of interest for ingestion.

Source code in src/gentropy/datasource/eqtl_catalogue/study_index.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
@classmethod
def get_studies_of_interest(
    cls: type[EqtlCatalogueStudyIndex],
    studies_metadata: DataFrame,
) -> list[str]:
    """List the distinct dataset identifiers present in the studies metadata.

    Args:
        studies_metadata (DataFrame): raw studies metadata filtered with studies of interest.

    Returns:
        list[str]: QTD IDs defining the studies of interest for ingestion.
    """
    distinct_dataset_ids = studies_metadata.select("dataset_id").distinct()
    # The distinct IDs are brought to the driver through pandas and returned as a plain list.
    return distinct_dataset_ids.toPandas()["dataset_id"].tolist()

read_studies_from_source(session: Session, mqtl_quantification_methods_blacklist: list[str]) -> DataFrame classmethod

Read raw studies metadata from eQTL Catalogue.

Parameters:

Name Type Description Default
session Session

Spark session.

required
mqtl_quantification_methods_blacklist list[str]

Molecular trait quantification methods that we don't want to ingest. Available options in https://github.com/eQTL-Catalogue/eQTL-Catalogue-resources/blob/master/data_tables/dataset_metadata.tsv

required

Returns:

Name Type Description
DataFrame DataFrame

raw studies metadata.

Source code in src/gentropy/datasource/eqtl_catalogue/study_index.py
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
@classmethod
def read_studies_from_source(
    cls: type[EqtlCatalogueStudyIndex],
    session: Session,
    mqtl_quantification_methods_blacklist: list[str],
) -> DataFrame:
    """Load the raw studies metadata table and drop the unwanted quantification methods.

    The table is read with pandas from `raw_studies_metadata_path`, converted to a Spark DataFrame using `raw_studies_metadata_schema`, and filtered on `quant_method`.

    Args:
        session (Session): Spark session.
        mqtl_quantification_methods_blacklist (list[str]): Molecular trait quantification methods that we don't want to ingest. Available options in https://github.com/eQTL-Catalogue/eQTL-Catalogue-resources/blob/master/data_tables/dataset_metadata.tsv

    Returns:
        DataFrame: raw studies metadata.
    """
    # Aliases `iteritems` to `items` on every pandas DataFrame (process-wide).
    # NOTE(review): presumably needed because the pandas -> Spark conversion
    # still calls `iteritems` on some version combinations - confirm.
    pd.DataFrame.iteritems = pd.DataFrame.items
    raw_metadata = pd.read_csv(cls.raw_studies_metadata_path, sep="\t")
    studies = session.spark.createDataFrame(
        raw_metadata,
        schema=cls.raw_studies_metadata_schema,
    )
    is_blacklisted = f.col("quant_method").isin(
        mqtl_quantification_methods_blacklist
    )
    return studies.filter(~is_blacklisted)