Skip to content

Study Index

gentropy.dataset.study_index.StudyIndex dataclass

Bases: Dataset

Study index dataset.

A study index dataset captures all the metadata for all studies including GWAS and Molecular QTL.

Source code in src/gentropy/dataset/study_index.py
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
@dataclass
class StudyIndex(Dataset):
    """Study index dataset.

    A study index dataset captures all the metadata for all studies including GWAS and Molecular QTL.
    """

    @staticmethod
    def _aggregate_samples_by_ancestry(merged: Column, ancestry: Column) -> Column:
        """Add the sample size of one ancestry struct to the matching entry of an accumulator.

        Used as the merge function of `f.aggregate` in `aggregate_and_map_ancestries`.

        Args:
            merged (Column): Accumulator, a list of structs with `ancestry` and `sampleSize` fields.
            ancestry (Column): A single struct with `ancestry` and `sampleSize` fields.

        Returns:
            Column: The accumulator in which the entry with the same `ancestry` label has its
                `sampleSize` increased by the incoming value; other entries are returned unchanged.
        """
        # Iterating over the list of ancestries and adding the sample size if label matches:
        return f.transform(
            merged,
            lambda a: f.when(
                a.ancestry == ancestry.ancestry,
                f.struct(
                    a.ancestry.alias("ancestry"),
                    (a.sampleSize + ancestry.sampleSize).alias("sampleSize"),
                ),
            ).otherwise(a),
        )

    @staticmethod
    def _map_ancestries_to_ld_population(gwas_ancestry_label: Column) -> Column:
        """Normalise ancestry column from GWAS studies into reference LD panel based on a pre-defined map.

        This function assumes all possible ancestry categories have a corresponding
        LD panel in the LD index. It is very important to have the ancestry labels
        moved to the LD panel map.

        Args:
            gwas_ancestry_label (Column): A struct column with ancestry label like Finnish,
                European, African etc. and the corresponding sample size.

        Returns:
            Column: Struct column with the mapped LD population label and the sample size.
        """
        # Loading ancestry label to LD population label:
        json_dict = json.loads(
            pkg_resources.read_text(
                data, "gwas_population_2_LD_panel_map.json", encoding="utf-8"
            )
        )
        # Flatten the (key, value) pairs into the alternating key/value literal list
        # expected by `f.create_map`:
        map_expr = f.create_map(*[f.lit(x) for x in chain(*json_dict.items())])

        return f.struct(
            map_expr[gwas_ancestry_label.ancestry].alias("ancestry"),
            gwas_ancestry_label.sampleSize.alias("sampleSize"),
        )

    @classmethod
    def get_schema(cls: type[StudyIndex]) -> StructType:
        """Provide the schema for the StudyIndex dataset.

        Returns:
            StructType: The schema of the StudyIndex dataset.
        """
        return parse_spark_schema("study_index.json")

    @classmethod
    def aggregate_and_map_ancestries(
        cls: type[StudyIndex], discovery_samples: Column
    ) -> Column:
        """Map ancestries to populations in the LD reference and calculate relative sample size.

        Args:
            discovery_samples (Column): A list of structs, each with an `ancestry` and a `sampleSize` field.

        Returns:
            Column: A list of structs with the mapped LD population (`ldPopulation`) and its
                relative sample size (`relativeSampleSize`).
        """
        # Map ancestry categories to population labels of the LD index:
        mapped_ancestries = f.transform(
            discovery_samples, cls._map_ancestries_to_ld_population
        )

        # Aggregate sample sizes belonging to the same LD population. The accumulator is
        # seeded with one zero-count struct per distinct mapped population label:
        aggregated_counts = f.aggregate(
            mapped_ancestries,
            f.array_distinct(
                f.transform(
                    mapped_ancestries,
                    lambda x: f.struct(
                        x.ancestry.alias("ancestry"), f.lit(0.0).alias("sampleSize")
                    ),
                )
            ),
            cls._aggregate_samples_by_ancestry,
        )
        # Getting total sample count:
        total_sample_count = f.aggregate(
            aggregated_counts, f.lit(0.0), lambda total, pop: total + pop.sampleSize
        ).alias("sampleSize")

        # Calculating relative sample size for each LD population:
        return f.transform(
            aggregated_counts,
            lambda ld_population: f.struct(
                ld_population.ancestry.alias("ldPopulation"),
                (ld_population.sampleSize / total_sample_count).alias(
                    "relativeSampleSize"
                ),
            ),
        )

    def study_type_lut(self: StudyIndex) -> DataFrame:
        """Return a lookup table of study type.

        Returns:
            DataFrame: A dataframe containing `studyId` and `studyType` columns.
        """
        return self.df.select("studyId", "studyType")

    def is_qtl(self: StudyIndex) -> Column:
        """Return a boolean column with true values for QTL studies.

        Returns:
            Column: True if the study type ends with `qtl`.
        """
        return self.df.studyType.endswith("qtl")

    def is_gwas(self: StudyIndex) -> Column:
        """Return a boolean column with true values for GWAS studies.

        Returns:
            Column: True if the study type equals `gwas`.
        """
        return self.df.studyType == "gwas"

    def has_mapped_trait(self: StudyIndex) -> Column:
        """Return a boolean column indicating if a study has mapped disease.

        Returns:
            Column: True if the study has at least one entry in `traitFromSourceMappedIds`.
        """
        return f.size(self.df.traitFromSourceMappedIds) > 0

    def is_quality_flagged(self: StudyIndex) -> Column:
        """Return a boolean column indicating if a study is flagged due to quality issues.

        A study counts as flagged only when its `qualityControls` array holds at least
        one element; a missing column, a NULL array or an empty array all yield False.

        Returns:
            Column: True if the study is flagged.
        """
        # Testing for the presence of the qualityControls column:
        if "qualityControls" not in self.df.columns:
            return f.lit(False)
        # `f.size` of a NULL array is -1 with Spark's legacy `sizeOfNull` behaviour and
        # NULL in ANSI mode, so `!= 0` would flag (or null out) studies without any
        # quality control. `> 0` plus `coalesce` gives False in both configurations.
        return f.coalesce(f.size(self.df.qualityControls) > 0, f.lit(False))

    def has_summarystats(self: StudyIndex) -> Column:
        """Return a boolean column indicating if a study has harmonized summary statistics.

        Returns:
            Column: The `hasSumstats` column of the study index.
        """
        return self.df.hasSumstats

aggregate_and_map_ancestries(discovery_samples: Column) -> Column classmethod

Map ancestries to populations in the LD reference and calculate relative sample size.

Parameters:

Name Type Description Default
discovery_samples Column

A list of structs, each with an ancestry field and a sampleSize field.

required

Returns:

Name Type Description
Column Column

A list of structs with the mapped LD population and its relative sample size.

Source code in src/gentropy/dataset/study_index.py
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
@classmethod
def aggregate_and_map_ancestries(
    cls: type[StudyIndex], discovery_samples: Column
) -> Column:
    """Convert discovery sample ancestries into LD populations with relative sample sizes.

    Args:
        discovery_samples (Column): A list of structs, each with an `ancestry` and a `sampleSize` field.

    Returns:
        Column: A list of structs with `ldPopulation` and `relativeSampleSize` fields.
    """

    def _zero_count(population: Column) -> Column:
        # Seed entry of the accumulator: the population label with a 0.0 count.
        return f.struct(
            population.ancestry.alias("ancestry"), f.lit(0.0).alias("sampleSize")
        )

    def _add_to_total(running_total: Column, population: Column) -> Column:
        # Fold step used to sum the sample sizes across populations.
        return running_total + population.sampleSize

    # Step 1: translate every ancestry label into its LD index population label.
    ld_mapped = f.transform(discovery_samples, cls._map_ancestries_to_ld_population)

    # Step 2: one zero-initialised entry per distinct LD population ...
    seed = f.array_distinct(f.transform(ld_mapped, _zero_count))
    # ... into which the sample sizes of the mapped ancestries are accumulated.
    per_population = f.aggregate(ld_mapped, seed, cls._aggregate_samples_by_ancestry)

    # Step 3: overall number of samples across all LD populations.
    overall = f.aggregate(per_population, f.lit(0.0), _add_to_total).alias(
        "sampleSize"
    )

    def _as_fraction(population: Column) -> Column:
        # Express the population's sample size relative to the overall count.
        return f.struct(
            population.ancestry.alias("ldPopulation"),
            (population.sampleSize / overall).alias("relativeSampleSize"),
        )

    # Step 4: replace absolute counts with relative sample sizes.
    return f.transform(per_population, _as_fraction)

get_schema() -> StructType classmethod

Provide the schema for the StudyIndex dataset.

Returns:

Name Type Description
StructType StructType

The schema of the StudyIndex dataset.

Source code in src/gentropy/dataset/study_index.py
80
81
82
83
84
85
86
87
@classmethod
def get_schema(cls: type[StudyIndex]) -> StructType:
    """Return the Spark schema of the StudyIndex dataset.

    Returns:
        StructType: Schema parsed from the `study_index.json` definition.
    """
    schema_file = "study_index.json"
    return parse_spark_schema(schema_file)

has_mapped_trait() -> Column

Return a boolean column indicating if a study has mapped disease.

Returns:

Name Type Description
Column Column

True if the study has mapped disease.

Source code in src/gentropy/dataset/study_index.py
159
160
161
162
163
164
165
def has_mapped_trait(self: StudyIndex) -> Column:
    """Build a boolean column telling whether a study has a mapped disease.

    Returns:
        Column: True if `traitFromSourceMappedIds` holds at least one element.
    """
    mapped_trait_count = f.size(self.df.traitFromSourceMappedIds)
    return mapped_trait_count > 0

has_summarystats() -> Column

Return a boolean column indicating if a study has harmonized summary statistics.

Returns:

Name Type Description
Column Column

True if the study has harmonized summary statistics.

Source code in src/gentropy/dataset/study_index.py
179
180
181
182
183
184
185
def has_summarystats(self: StudyIndex) -> Column:
    """Build a boolean column telling whether a study has harmonized summary statistics.

    Returns:
        Column: The `hasSumstats` column of the underlying dataframe.
    """
    sumstats_flag = self.df.hasSumstats
    return sumstats_flag

is_gwas() -> Column

Return a boolean column with true values for GWAS studies.

Returns:

Name Type Description
Column Column

True if the study is a GWAS study.

Source code in src/gentropy/dataset/study_index.py
151
152
153
154
155
156
157
def is_gwas(self: StudyIndex) -> Column:
    """Build a boolean column marking GWAS studies.

    Returns:
        Column: True where `studyType` equals `gwas`.
    """
    study_type = self.df.studyType
    return study_type == "gwas"

is_qtl() -> Column

Return a boolean column with true values for QTL studies.

Returns:

Name Type Description
Column Column

True if the study is a QTL study.

Source code in src/gentropy/dataset/study_index.py
143
144
145
146
147
148
149
def is_qtl(self: StudyIndex) -> Column:
    """Build a boolean column marking QTL studies.

    Returns:
        Column: True where `studyType` ends with `qtl`.
    """
    study_type = self.df.studyType
    return study_type.endswith("qtl")

is_quality_flagged() -> Column

Return a boolean column indicating if a study is flagged due to quality issues.

Returns:

Name Type Description
Column Column

True if the study is flagged.

Source code in src/gentropy/dataset/study_index.py
167
168
169
170
171
172
173
174
175
176
177
def is_quality_flagged(self: StudyIndex) -> Column:
    """Return a boolean column indicating if a study is flagged due to quality issues.

    A study counts as flagged only when its `qualityControls` array holds at least
    one element; a missing column, a NULL array or an empty array all yield False.

    Returns:
        Column: True if the study is flagged.
    """
    # Testing for the presence of the qualityControls column:
    if "qualityControls" not in self.df.columns:
        return f.lit(False)
    # `f.size` of a NULL array is -1 with Spark's legacy `sizeOfNull` behaviour and
    # NULL in ANSI mode, so `!= 0` would flag (or null out) studies without any
    # quality control. `> 0` plus `coalesce` gives False in both configurations.
    return f.coalesce(f.size(self.df.qualityControls) > 0, f.lit(False))

study_type_lut() -> DataFrame

Return a lookup table of study type.

Returns:

Name Type Description
DataFrame DataFrame

A dataframe containing studyId and studyType columns.

Source code in src/gentropy/dataset/study_index.py
135
136
137
138
139
140
141
def study_type_lut(self: StudyIndex) -> DataFrame:
    """Build a study type lookup table.

    Returns:
        DataFrame: The `studyId` and `studyType` columns selected from the study index.
    """
    lookup_columns = ("studyId", "studyType")
    return self.df.select(*lookup_columns)

Schema

root
 |-- studyId: string (nullable = false)
 |-- projectId: string (nullable = false)
 |-- studyType: string (nullable = false)
 |-- traitFromSource: string (nullable = true)
 |-- traitFromSourceMappedIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- geneId: string (nullable = true)
 |-- tissueFromSourceId: string (nullable = true)
 |-- pubmedId: string (nullable = true)
 |-- publicationTitle: string (nullable = true)
 |-- publicationFirstAuthor: string (nullable = true)
 |-- publicationDate: string (nullable = true)
 |-- publicationJournal: string (nullable = true)
 |-- backgroundTraitFromSourceMappedIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- initialSampleSize: string (nullable = true)
 |-- nCases: integer (nullable = true)
 |-- nControls: integer (nullable = true)
 |-- nSamples: integer (nullable = true)
 |-- cohorts: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ldPopulationStructure: array (nullable = true)
 |    |-- element: struct (containsNull = false)
 |    |    |-- ldPopulation: string (nullable = true)
 |    |    |-- relativeSampleSize: double (nullable = true)
 |-- discoverySamples: array (nullable = true)
 |    |-- element: struct (containsNull = false)
 |    |    |-- sampleSize: integer (nullable = true)
 |    |    |-- ancestry: string (nullable = true)
 |-- replicationSamples: array (nullable = true)
 |    |-- element: struct (containsNull = false)
 |    |    |-- sampleSize: integer (nullable = true)
 |    |    |-- ancestry: string (nullable = true)
 |-- qualityControls: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- analysisFlags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- summarystatsLocation: string (nullable = true)
 |-- hasSumstats: boolean (nullable = true)