Study Index

`gentropy.dataset.study_index.StudyIndex` `dataclass` ¶

Bases: Dataset

Study index dataset.

A study index dataset captures all the metadata for all studies including GWAS and Molecular QTL.

Source code in src/gentropy/dataset/study_index.py

@dataclass
class StudyIndex(Dataset):
    """Study index dataset.

    A study index dataset captures all the metadata for all studies including GWAS and Molecular QTL.
    """

    VALID_TYPES = [
        "gwas",
        "eqtl",
        "pqtl",
        "sqtl",
        "tuqtl",
        "sceqtl",
        "scpqtl",
        "scsqtl",
        "sctuqtl",
    ]

    @staticmethod
    def _aggregate_samples_by_ancestry(merged: Column, ancestry: Column) -> Column:
        """Aggregate sample counts by ancestry in a list of struct colmns.

        Args:
            merged (Column): A column representing merged data (list of structs).
            ancestry (Column): The `ancestry` parameter is a column that represents the ancestry of each
                sample. (a struct)

        Returns:
            Column: the modified "merged" column after aggregating the samples by ancestry.
        """
        # Iterating over the list of ancestries and adding the sample size if label matches:
        return f.transform(
            merged,
            lambda a: f.when(
                a.ancestry == ancestry.ancestry,
                f.struct(
                    a.ancestry.alias("ancestry"),
                    (a.sampleSize + ancestry.sampleSize).alias("sampleSize"),
                ),
            ).otherwise(a),
        )

    @staticmethod
    def _map_ancestries_to_ld_population(gwas_ancestry_label: Column) -> Column:
        """Normalise ancestry column from GWAS studies into reference LD panel based on a pre-defined map.

        This function assumes all possible ancestry categories have a corresponding
        LD panel in the LD index. It is very important to have the ancestry labels
        moved to the LD panel map.

        Args:
            gwas_ancestry_label (Column): A struct column with ancestry label like Finnish,
                European, African etc. and the corresponding sample size.

        Returns:
            Column: Struct column with the mapped LD population label and the sample size.
        """
        # Loading ancestry label to LD population label:
        json_dict = json.loads(
            pkg_resources.read_text(
                data, "gwas_population_2_LD_panel_map.json", encoding="utf-8"
            )
        )
        map_expr = f.create_map(*[f.lit(x) for x in chain(*json_dict.items())])

        return f.struct(
            map_expr[gwas_ancestry_label.ancestry].alias("ancestry"),
            gwas_ancestry_label.sampleSize.alias("sampleSize"),
        )

    @classmethod
    def get_schema(cls: type[StudyIndex]) -> StructType:
        """Provide the schema for the StudyIndex dataset.

        Returns:
            StructType: The schema of the StudyIndex dataset.
        """
        return parse_spark_schema("study_index.json")

    @classmethod
    def get_QC_column_name(cls: type[StudyIndex]) -> str:
        """Return the name of the quality control column.

        Returns:
            str: The name of the quality control column.
        """
        return "qualityControls"

    @classmethod
    def get_QC_mappings(cls: type[StudyIndex]) -> dict[str, str]:
        """Quality control flag to QC column category mappings.

        Returns:
            dict[str, str]: Mapping between flag name and QC column category value.
        """
        return {member.name: member.value for member in StudyQualityCheck}

    @classmethod
    def aggregate_and_map_ancestries(
        cls: type[StudyIndex], discovery_samples: Column
    ) -> Column:
        """Map ancestries to populations in the LD reference and calculate relative sample size.

        Args:
            discovery_samples (Column): A list of struct column. Has an `ancestry` column and a `sampleSize` columns

        Returns:
            Column: A list of struct with mapped LD population and their relative sample size.
        """
        # Map ancestry categories to population labels of the LD index:
        mapped_ancestries = f.transform(
            discovery_samples, cls._map_ancestries_to_ld_population
        )

        # Aggregate sample sizes belonging to the same LD population:
        aggregated_counts = f.aggregate(
            mapped_ancestries,
            f.array_distinct(
                f.transform(
                    mapped_ancestries,
                    lambda x: f.struct(
                        x.ancestry.alias("ancestry"), f.lit(0.0).alias("sampleSize")
                    ),
                )
            ),
            cls._aggregate_samples_by_ancestry,
        )
        # Getting total sample count:
        total_sample_count = f.aggregate(
            aggregated_counts, f.lit(0.0), lambda total, pop: total + pop.sampleSize
        ).alias("sampleSize")
        # Calculating relative sample size for each LD population:
        return f.transform(
            aggregated_counts,
            lambda ld_population: f.struct(
                ld_population.ancestry.alias("ldPopulation"),
                (ld_population.sampleSize / total_sample_count).alias(
                    "relativeSampleSize"
                ),
            ),
        )

    def study_type_lut(self: StudyIndex) -> DataFrame:
        """Return a lookup table of study type.

        Returns:
            DataFrame: A dataframe containing `studyId` and `studyType` columns.
        """
        return self.df.select("studyId", "studyType")

    def is_qtl(self: StudyIndex) -> Column:
        """Return a boolean column with true values for QTL studies.

        Returns:
            Column: True if the study is a QTL study.
        """
        return self.df.studyType.endswith("qtl")

    def is_gwas(self: StudyIndex) -> Column:
        """Return a boolean column with true values for GWAS studies.

        Returns:
            Column: True if the study is a GWAS study.
        """
        return self.df.studyType == "gwas"

    def has_mapped_trait(self: StudyIndex) -> Column:
        """Return a boolean column indicating if a study has mapped disease.

        Returns:
            Column: True if the study has mapped disease.
        """
        return f.size(self.df.traitFromSourceMappedIds) > 0

    def is_quality_flagged(self: StudyIndex) -> Column:
        """Return a boolean column indicating if a study is flagged due to quality issues.

        Returns:
            Column: True if the study is flagged.
        """
        # Testing for the presence of the qualityControls column:
        if "qualityControls" not in self.df.columns:
            return f.lit(False)
        else:
            return f.size(self.df["qualityControls"]) != 0

    def has_summarystats(self: StudyIndex) -> Column:
        """Return a boolean column indicating if a study has harmonized summary statistics.

        Returns:
            Column: True if the study has harmonized summary statistics.
        """
        return self.df.hasSumstats

    def validate_unique_study_id(self: StudyIndex) -> StudyIndex:
        """Validating the uniqueness of study identifiers and flagging duplicated studies.

        Returns:
            StudyIndex: with flagged duplicated studies.
        """
        return StudyIndex(
            _df=self.df.withColumn(
                "qualityControls",
                self.update_quality_flag(
                    f.col("qualityControls"),
                    self.flag_duplicates(f.col("studyId")),
                    StudyQualityCheck.DUPLICATED_STUDY,
                ),
            ),
            _schema=StudyIndex.get_schema(),
        )

    def _normalise_disease(
        self: StudyIndex,
        source_disease_column_name: str,
        disease_column_name: str,
        disease_map: DataFrame,
    ) -> DataFrame:
        """Normalising diseases in the study index.

        Given a reference disease map (containing all potential EFO ids with the corresponding reference disease ids),
        this function maps all EFO ids in the study index to the reference disease ids.

        Args:
            source_disease_column_name (str): The column name of the disease column to validate.
            disease_column_name (str): The resulting disease column name that contains the validated ids.
            disease_map (DataFrame): Reference dataframe with diseases

        Returns:
            DataFrame: where the newly added diseaseIds column will contain the validated EFO identifiers.
        """
        return (
            self.df
            # Only validating studies with diseases:
            .filter(f.size(f.col(source_disease_column_name)) > 0)
            # Explode disease column:
            .select(
                "studyId",
                "studyType",
                f.explode_outer(source_disease_column_name).alias("efo"),
            )
            # Join disease map:
            .join(disease_map, on="efo", how="left")
            .groupBy("studyId")
            .agg(
                f.collect_set(f.col("diseaseId")).alias(disease_column_name),
            )
        )

    def validate_disease(self: StudyIndex, disease_map: DataFrame) -> StudyIndex:
        """Validate diseases in the study index dataset.

        Args:
            disease_map (DataFrame): a dataframe with two columns (efo, diseaseId).

        Returns:
            StudyIndex: where gwas studies are flagged where no valid disease id could be found.
        """
        # Because the disease ids are not mandatory fields of the schema, we skip vaildation if these columns are not present:
        if ("traitFromSourceMappedIds" not in self.df.columns) or (
            "backgroundTraitFromSourceMappedIds" not in self.df.columns
        ):
            return self

        # Disease Column names:
        foreground_disease_column = "diseaseIds"
        background_disease_column = "backgroundDiseaseIds"

        # If diseaseId in schema, we need to drop it:
        drop_columns = [
            column
            for column in self.df.columns
            if column in [foreground_disease_column, background_disease_column]
        ]

        if len(drop_columns) > 0:
            self.df = self.df.drop(*drop_columns)

        # Normalise disease:
        normalised_disease = self._normalise_disease(
            "traitFromSourceMappedIds", foreground_disease_column, disease_map
        )
        normalised_background_disease = self._normalise_disease(
            "backgroundTraitFromSourceMappedIds", background_disease_column, disease_map
        )

        return StudyIndex(
            _df=(
                self.df.join(normalised_disease, on="studyId", how="left")
                .join(normalised_background_disease, on="studyId", how="left")
                # Updating disease columns:
                .withColumn(
                    foreground_disease_column,
                    f.when(
                        f.col(foreground_disease_column).isNull(), f.array()
                    ).otherwise(f.col(foreground_disease_column)),
                )
                .withColumn(
                    background_disease_column,
                    f.when(
                        f.col(background_disease_column).isNull(), f.array()
                    ).otherwise(f.col(background_disease_column)),
                )
                # Flagging gwas studies where no valid disease is avilable:
                .withColumn(
                    "qualityControls",
                    StudyIndex.update_quality_flag(
                        f.col("qualityControls"),
                        # Flagging all gwas studies with no normalised disease:
                        (f.size(f.col(foreground_disease_column)) == 0)
                        & (f.col("studyType") == "gwas"),
                        StudyQualityCheck.UNRESOLVED_DISEASE,
                    ),
                )
                # Added to avoid Spark optimisation (see: https://github.com/opentargets/issues/issues/3906#issuecomment-2949299965)
                .persist()
            ),
            _schema=StudyIndex.get_schema(),
        )

    def validate_study_type(self: StudyIndex) -> StudyIndex:
        """Validating study type and flag unsupported types.

        Returns:
            StudyIndex: with flagged studies with unsupported type.
        """
        validated_df = (
            self.df
            # Flagging unsupported study types:
            .withColumn(
                "qualityControls",
                StudyIndex.update_quality_flag(
                    f.col("qualityControls"),
                    f.when(
                        (f.col("studyType") == "gwas")
                        | f.col("studyType").endswith("qtl"),
                        False,
                    ).otherwise(True),
                    StudyQualityCheck.UNKNOWN_STUDY_TYPE,
                ),
            )
        )
        return StudyIndex(_df=validated_df, _schema=StudyIndex.get_schema())

    def validate_target(self: StudyIndex, target_index: TargetIndex) -> StudyIndex:
        """Validating gene identifiers in the study index against the provided target index.

        Args:
            target_index (TargetIndex): target index containing the reference gene identifiers (Ensembl gene identifiers).

        Returns:
            StudyIndex: with flagged studies if geneId could not be validated.
        """
        gene_set = target_index.df.select(
            f.col("id").alias("geneId"), f.lit(True).alias("isIdFound")
        )

        # As the geneId is not a mandatory field of study index, we return if the column is not there:
        if "geneId" not in self.df.columns:
            return self

        validated_df = (
            self.df.join(gene_set, on="geneId", how="left")
            .withColumn(
                "isIdFound",
                f.when(
                    (f.col("studyType") != "gwas") & f.col("isIdFound").isNull(),
                    f.lit(False),
                ).otherwise(f.lit(True)),
            )
            .withColumn(
                "qualityControls",
                StudyIndex.update_quality_flag(
                    f.col("qualityControls"),
                    ~f.col("isIdFound"),
                    StudyQualityCheck.UNRESOLVED_TARGET,
                ),
            )
            .drop("isIdFound")
        )

        return StudyIndex(_df=validated_df, _schema=StudyIndex.get_schema())

    def validate_biosample(
        self: StudyIndex, biosample_index: BiosampleIndex
    ) -> StudyIndex:
        """Validating biosample identifiers in the study index against the provided biosample index.

        Args:
            biosample_index (BiosampleIndex): Biosample index containing a reference of biosample identifiers e.g. cell types, tissues, cell lines, etc.

        Returns:
            StudyIndex: where non-gwas studies are flagged if biosampleIndex could not be validated.
        """
        biosample_set = biosample_index.df.select(
            "biosampleId", f.lit(True).alias("isIdFound")
        )

        # If biosampleId in df, we need to drop it:
        if "biosampleId" in self.df.columns:
            self.df = self.df.drop("biosampleId")

        # As the biosampleFromSourceId is not a mandatory field of study index, we return if the column is not there:
        if "biosampleFromSourceId" not in self.df.columns:
            return self

        validated_df = (
            self.df.join(
                biosample_set,
                self.df.biosampleFromSourceId == biosample_set.biosampleId,
                how="left",
            )
            .withColumn(
                "isIdFound",
                f.when(
                    (f.col("studyType") != "gwas") & (f.col("isIdFound").isNull()),
                    f.lit(False),
                ).otherwise(f.lit(True)),
            )
            .withColumn(
                "qualityControls",
                StudyIndex.update_quality_flag(
                    f.col("qualityControls"),
                    ~f.col("isIdFound"),
                    StudyQualityCheck.UNKNOWN_BIOSAMPLE,
                ),
            )
            .drop("isIdFound")
        )

        return StudyIndex(_df=validated_df, _schema=StudyIndex.get_schema())

    def annotate_sumstats_qc(
        self: StudyIndex,
        sumstats_qc: SummaryStatisticsQC,
        threshold_mean_beta: float = 0.05,
        threshold_mean_diff_pz: float = 0.05,
        threshold_se_diff_pz: float = 0.05,
        threshold_min_gc_lambda: float = 0.7,
        threshold_max_gc_lambda: float = 2.5,
        threshold_min_n_variants: int = 2_000_000,
    ) -> StudyIndex:
        """Annotate summary stats QC information.

        Args:
            sumstats_qc (SummaryStatisticsQC): Dataset containing summary statistics-based quality controls.
            threshold_mean_beta (float): Threshold for mean beta check. Defaults to 0.05.
            threshold_mean_diff_pz (float): Threshold for mean diff PZ check. Defaults to 0.05.
            threshold_se_diff_pz (float): Threshold for SE diff PZ check. Defaults to 0.05.
            threshold_min_gc_lambda (float): Minimum threshold for GC lambda check. Defaults to 0.7.
            threshold_max_gc_lambda (float): Maximum threshold for GC lambda check. Defaults to 2.5.
            threshold_min_n_variants (int): Minimum number of variants for SuSiE check. Defaults to 2_000_000.

        Returns:
            StudyIndex: Updated study index with QC information
        """
        # convert all columns in sumstats_qc dataframe in array of structs grouped by studyId
        qc_check_cols = [c for c in sumstats_qc.df.columns if c != "studyId"]

        studies = self.df

        # Prepare the QC flags format for the study index:
        melted_qc = convert_from_wide_to_long(
            sumstats_qc.df,
            id_vars=["studyId"],
            value_vars=qc_check_cols,
            var_name="QCCheckName",
            value_name="QCCheckValue",
        )
        qc_struct = f.struct(f.col("QCCheckName"), f.col("QCCheckValue"))
        qc_df = (
            melted_qc.groupBy("studyId")
            .agg(f.collect_list(qc_struct).alias("sumstatQCValues"))
            .select("studyId", "sumstatQCValues")
            .withColumn("hasSumstats", f.lit(True))
        )
        extract_qc_value: Callable[[str], Column] = lambda x: filter_array_struct(
            "sumstatQCValues", "QCCheckName", x, "QCCheckValue"
        )

        df = (
            studies.drop("sumstatQCValues", "hasSumstats")
            .join(qc_df, how="left", on="studyId")
            .withColumn("hasSumstats", f.coalesce(f.col("hasSumstats"), f.lit(False)))
            .withColumn(
                "qualityControls",
                StudyIndex.update_quality_flag(
                    f.col("qualityControls"),
                    ~f.col("hasSumstats"),
                    StudyQualityCheck.SUMSTATS_NOT_AVAILABLE,
                ),
            )
            .withColumn(
                "qualityControls",
                StudyIndex.update_quality_flag(
                    f.col("qualityControls"),
                    ~(f.abs(extract_qc_value("mean_beta")) <= threshold_mean_beta),
                    StudyQualityCheck.FAILED_MEAN_BETA_CHECK,
                ),
            )
            .withColumn(
                "qualityControls",
                StudyIndex.update_quality_flag(
                    f.col("qualityControls"),
                    ~(
                        (
                            f.abs(extract_qc_value("mean_diff_pz"))
                            <= threshold_mean_diff_pz
                        )
                        & (extract_qc_value("se_diff_pz") <= threshold_se_diff_pz)
                    ),
                    StudyQualityCheck.FAILED_PZ_CHECK,
                ),
            )
            .withColumn(
                "qualityControls",
                StudyIndex.update_quality_flag(
                    f.col("qualityControls"),
                    ~(
                        (extract_qc_value("gc_lambda") <= threshold_max_gc_lambda)
                        & (extract_qc_value("gc_lambda") >= threshold_min_gc_lambda)
                    ),
                    StudyQualityCheck.FAILED_GC_LAMBDA_CHECK,
                ),
            )
            .withColumn(
                "qualityControls",
                StudyIndex.update_quality_flag(
                    f.col("qualityControls"),
                    extract_qc_value("n_variants") < threshold_min_n_variants,
                    StudyQualityCheck.SMALL_NUMBER_OF_SNPS,
                ),
            )
        )

        # Annotate study index with QC information:
        return StudyIndex(
            _df=df,
            _schema=StudyIndex.get_schema(),
        )

    def validate_analysis_flags(self: StudyIndex) -> StudyIndex:
        """Validating analysis flags in the study index dataset.

        This method checks the analysis flags and reports the the outcome of predicates to the quality control column.

        Returns:
            StudyIndex: with qualityControls column flagged based on analysisFlags.
        """
        predicate = f.array_contains(
            "analysisFlags", StudyAnalysisFlag.CASE_CASE_STUDY.value
        )
        df = self.df.withColumn(
            "qualityControls",
            StudyIndex.update_quality_flag(
                f.col("qualityControls"),
                predicate,
                StudyQualityCheck.CASE_CASE_STUDY_DESIGN,
            ),
        )
        return StudyIndex(_df=df, _schema=StudyIndex.get_schema())

    def deconvolute_studies(self: StudyIndex) -> StudyIndex:
        """Deconvolute the study index dataset.

        When ingesting the study index dataset, the same studyId might be ingested from more than one source.
        In such cases, the data needs to be merged and the quality control flags need to be combined.

        Returns:
            StudyIndex: Deconvoluted study index dataset.
        """
        # Windowing by study ID assume random order, but this is OK, because we are not selecting rows by a specific order.
        study_id_window = Window.partitionBy("studyId").orderBy(f.rand())

        # For certain aggregation, the full window is needed to be considered:
        full_study_id_window = study_id_window.orderBy("studyId").rangeBetween(
            Window.unboundedPreceding, Window.unboundedFollowing
        )

        # Temporary columns to drop at the end:
        columns_to_drop = ["keepTopHit", "mostGranular", "rank"]

        return StudyIndex(
            _df=(
                self.df
                # Initialising quality controls column, if not present:
                .withColumn(
                    "qualityControls",
                    f.when(
                        f.col("qualityControls").isNull(),
                        f.array().cast(ArrayType(StringType())),
                    ).otherwise(f.col("qualityControls")),
                )
                # Keeping top hit studies unless the same study is available from a summmary statistics source:
                # This value will be set for all rows for the same `studyId`:
                .withColumn(
                    "keepTopHit",
                    f.when(
                        f.array_contains(
                            f.collect_set(f.col("hasSumstats")).over(
                                full_study_id_window
                            ),
                            True,
                        ),
                        f.lit(False),
                    ).otherwise(True),
                )
                # For studies without summary statistics, we remove the "Not curated by Open Targets" flag:
                .withColumn(
                    "qualityControls",
                    f.when(
                        ~f.col("hasSumstats"),
                        f.array_remove(
                            f.col("qualityControls"),
                            StudyQualityCheck.NO_OT_CURATION.value,
                        ),
                    ).otherwise(f.col("qualityControls")),
                )
                # If top hits are not kept, we remove the "sumstats not available" flag from all QC lists:
                .withColumn(
                    "qualityControls",
                    f.when(
                        ~f.col("keepTopHit"),
                        f.array_remove(
                            f.col("qualityControls"),
                            StudyQualityCheck.SUMSTATS_NOT_AVAILABLE.value,
                        ),
                    ).otherwise(f.col("qualityControls")),
                )
                # Then propagate quality checks for all sources of the same study:
                .withColumn(
                    "qualityControls",
                    f.array_distinct(
                        f.flatten(
                            f.collect_set("qualityControls").over(full_study_id_window)
                        )
                    ),
                )
                # Propagating sumstatQCValues -> map, cannot be flatten:
                .withColumn(
                    "sumstatQCValues",
                    f.first("sumstatQCValues", ignorenulls=True).over(
                        full_study_id_window
                    ),
                )
                # Propagating analysisFlags:
                .withColumn(
                    "analysisFlags",
                    f.flatten(
                        f.collect_list("analysisFlags").over(full_study_id_window)
                    ),
                )
                # Propagating hasSumstatsFlag - if no flag, leave null:
                .withColumn(
                    "hasSumstats",
                    f.when(
                        # There's a true:
                        f.array_contains(
                            f.collect_set("hasSumstats").over(full_study_id_window),
                            True,
                        ),
                        f.lit(True),
                    ).when(
                        # There's a false:
                        f.array_contains(
                            f.collect_set("hasSumstats").over(full_study_id_window),
                            False,
                        ),
                        f.lit(False),
                    ),
                )
                # Propagating disease: when different sets of diseases available for the same study,
                # we pick the shortest list, becasuse we assume, that is the most accurate disease assignment:
                .withColumn(
                    "mostGranular",
                    f.size(f.col("traitFromSourceMappedIds"))
                    == f.min(f.size(f.col("traitFromSourceMappedIds"))).over(
                        full_study_id_window
                    ),
                )
                # Remove less granular disease mappings:
                .withColumn(
                    "traitFromSourceMappedIds",
                    f.when(f.col("mostGranular"), f.col("traitFromSourceMappedIds")),
                )
                # Propagate mapped disease:
                .withColumn(
                    "traitFromSourceMappedIds",
                    f.last(f.col("traitFromSourceMappedIds"), True).over(
                        full_study_id_window
                    ),
                )
                # Repeating these steps for the `traitFromSource` column:
                .withColumn(
                    "traitFromSource",
                    f.when(f.col("mostGranular"), f.col("traitFromSource")),
                )
                # Propagate disease:
                .withColumn(
                    "traitFromSource",
                    f.last(f.col("traitFromSource"), True).over(full_study_id_window),
                )
                # Distinct study types are joined together into a string. So, if there's ambiguite, the study will be flagged when the study type is validated:
                .withColumn(
                    "studyType",
                    f.concat_ws(
                        ",", f.collect_set("studyType").over(full_study_id_window)
                    ),
                )
                # At this point, all studies in one window is expected to be identical. Let's just pick one:
                .withColumn("rank", f.row_number().over(study_id_window))
                .filter(f.col("rank") == 1)
                .drop(*columns_to_drop)
                # Added to avoid Spark optimisation (see: https://github.com/opentargets/issues/issues/3906#issuecomment-2949299965)
                .persist()
            ),
            _schema=StudyIndex.get_schema(),
        )

`aggregate_and_map_ancestries(discovery_samples: Column) -> Column` `classmethod` ¶

Map ancestries to populations in the LD reference and calculate relative sample size.

Parameters:

Name	Type	Description	Default
`discovery_samples`	`Column`	A list of struct column. Has an `ancestry` column and a `sampleSize` columns	required

Returns:

Name	Type	Description
`Column`	`Column`	A list of struct with mapped LD population and their relative sample size.

Source code in src/gentropy/dataset/study_index.py

@classmethod
def aggregate_and_map_ancestries(
    cls: type[StudyIndex], discovery_samples: Column
) -> Column:
    """Map ancestries to populations in the LD reference and calculate relative sample size.

    Args:
        discovery_samples (Column): A list of struct column. Has an `ancestry` column and a `sampleSize` columns

    Returns:
        Column: A list of struct with mapped LD population and their relative sample size.
    """
    # Map ancestry categories to population labels of the LD index:
    mapped_ancestries = f.transform(
        discovery_samples, cls._map_ancestries_to_ld_population
    )

    # Aggregate sample sizes belonging to the same LD population:
    aggregated_counts = f.aggregate(
        mapped_ancestries,
        f.array_distinct(
            f.transform(
                mapped_ancestries,
                lambda x: f.struct(
                    x.ancestry.alias("ancestry"), f.lit(0.0).alias("sampleSize")
                ),
            )
        ),
        cls._aggregate_samples_by_ancestry,
    )
    # Getting total sample count:
    total_sample_count = f.aggregate(
        aggregated_counts, f.lit(0.0), lambda total, pop: total + pop.sampleSize
    ).alias("sampleSize")
    # Calculating relative sample size for each LD population:
    return f.transform(
        aggregated_counts,
        lambda ld_population: f.struct(
            ld_population.ancestry.alias("ldPopulation"),
            (ld_population.sampleSize / total_sample_count).alias(
                "relativeSampleSize"
            ),
        ),
    )

`annotate_sumstats_qc(sumstats_qc: SummaryStatisticsQC, threshold_mean_beta: float = 0.05, threshold_mean_diff_pz: float = 0.05, threshold_se_diff_pz: float = 0.05, threshold_min_gc_lambda: float = 0.7, threshold_max_gc_lambda: float = 2.5, threshold_min_n_variants: int = 2000000) -> StudyIndex` ¶

Annotate summary stats QC information.

Parameters:

Name	Type	Description	Default
`sumstats_qc`	`SummaryStatisticsQC`	Dataset containing summary statistics-based quality controls.	required
`threshold_mean_beta`	`float`	Threshold for mean beta check. Defaults to 0.05.	`0.05`
`threshold_mean_diff_pz`	`float`	Threshold for mean diff PZ check. Defaults to 0.05.	`0.05`
`threshold_se_diff_pz`	`float`	Threshold for SE diff PZ check. Defaults to 0.05.	`0.05`
`threshold_min_gc_lambda`	`float`	Minimum threshold for GC lambda check. Defaults to 0.7.	`0.7`
`threshold_max_gc_lambda`	`float`	Maximum threshold for GC lambda check. Defaults to 2.5.	`2.5`
`threshold_min_n_variants`	`int`	Minimum number of variants for SuSiE check. Defaults to 2_000_000.	`2000000`

Returns:

Name	Type	Description
`StudyIndex`	`StudyIndex`	Updated study index with QC information

Source code in src/gentropy/dataset/study_index.py

def annotate_sumstats_qc(
    self: StudyIndex,
    sumstats_qc: SummaryStatisticsQC,
    threshold_mean_beta: float = 0.05,
    threshold_mean_diff_pz: float = 0.05,
    threshold_se_diff_pz: float = 0.05,
    threshold_min_gc_lambda: float = 0.7,
    threshold_max_gc_lambda: float = 2.5,
    threshold_min_n_variants: int = 2_000_000,
) -> StudyIndex:
    """Annotate summary stats QC information.

    Args:
        sumstats_qc (SummaryStatisticsQC): Dataset containing summary statistics-based quality controls.
        threshold_mean_beta (float): Threshold for mean beta check. Defaults to 0.05.
        threshold_mean_diff_pz (float): Threshold for mean diff PZ check. Defaults to 0.05.
        threshold_se_diff_pz (float): Threshold for SE diff PZ check. Defaults to 0.05.
        threshold_min_gc_lambda (float): Minimum threshold for GC lambda check. Defaults to 0.7.
        threshold_max_gc_lambda (float): Maximum threshold for GC lambda check. Defaults to 2.5.
        threshold_min_n_variants (int): Minimum number of variants for SuSiE check. Defaults to 2_000_000.

    Returns:
        StudyIndex: Updated study index with QC information
    """
    # convert all columns in sumstats_qc dataframe in array of structs grouped by studyId
    qc_check_cols = [c for c in sumstats_qc.df.columns if c != "studyId"]

    studies = self.df

    # Prepare the QC flags format for the study index:
    melted_qc = convert_from_wide_to_long(
        sumstats_qc.df,
        id_vars=["studyId"],
        value_vars=qc_check_cols,
        var_name="QCCheckName",
        value_name="QCCheckValue",
    )
    qc_struct = f.struct(f.col("QCCheckName"), f.col("QCCheckValue"))
    qc_df = (
        melted_qc.groupBy("studyId")
        .agg(f.collect_list(qc_struct).alias("sumstatQCValues"))
        .select("studyId", "sumstatQCValues")
        .withColumn("hasSumstats", f.lit(True))
    )
    extract_qc_value: Callable[[str], Column] = lambda x: filter_array_struct(
        "sumstatQCValues", "QCCheckName", x, "QCCheckValue"
    )

    df = (
        studies.drop("sumstatQCValues", "hasSumstats")
        .join(qc_df, how="left", on="studyId")
        .withColumn("hasSumstats", f.coalesce(f.col("hasSumstats"), f.lit(False)))
        .withColumn(
            "qualityControls",
            StudyIndex.update_quality_flag(
                f.col("qualityControls"),
                ~f.col("hasSumstats"),
                StudyQualityCheck.SUMSTATS_NOT_AVAILABLE,
            ),
        )
        .withColumn(
            "qualityControls",
            StudyIndex.update_quality_flag(
                f.col("qualityControls"),
                ~(f.abs(extract_qc_value("mean_beta")) <= threshold_mean_beta),
                StudyQualityCheck.FAILED_MEAN_BETA_CHECK,
            ),
        )
        .withColumn(
            "qualityControls",
            StudyIndex.update_quality_flag(
                f.col("qualityControls"),
                ~(
                    (
                        f.abs(extract_qc_value("mean_diff_pz"))
                        <= threshold_mean_diff_pz
                    )
                    & (extract_qc_value("se_diff_pz") <= threshold_se_diff_pz)
                ),
                StudyQualityCheck.FAILED_PZ_CHECK,
            ),
        )
        .withColumn(
            "qualityControls",
            StudyIndex.update_quality_flag(
                f.col("qualityControls"),
                ~(
                    (extract_qc_value("gc_lambda") <= threshold_max_gc_lambda)
                    & (extract_qc_value("gc_lambda") >= threshold_min_gc_lambda)
                ),
                StudyQualityCheck.FAILED_GC_LAMBDA_CHECK,
            ),
        )
        .withColumn(
            "qualityControls",
            StudyIndex.update_quality_flag(
                f.col("qualityControls"),
                extract_qc_value("n_variants") < threshold_min_n_variants,
                StudyQualityCheck.SMALL_NUMBER_OF_SNPS,
            ),
        )
    )

    # Annotate study index with QC information:
    return StudyIndex(
        _df=df,
        _schema=StudyIndex.get_schema(),
    )

`deconvolute_studies() -> StudyIndex` ¶

Deconvolute the study index dataset.

When ingesting the study index dataset, the same studyId might be ingested from more than one source. In such cases, the data needs to be merged and the quality control flags need to be combined.

Returns:

Name	Type	Description
`StudyIndex`	`StudyIndex`	Deconvoluted study index dataset.

Source code in src/gentropy/dataset/study_index.py

def deconvolute_studies(self: StudyIndex) -> StudyIndex:
    """Deconvolute the study index dataset.

    When ingesting the study index dataset, the same studyId might be ingested from more than one source.
    In such cases, the data needs to be merged and the quality control flags need to be combined.

    Returns:
        StudyIndex: Deconvoluted study index dataset.
    """
    # Windowing by study ID assume random order, but this is OK, because we are not selecting rows by a specific order.
    study_id_window = Window.partitionBy("studyId").orderBy(f.rand())

    # For certain aggregation, the full window is needed to be considered:
    full_study_id_window = study_id_window.orderBy("studyId").rangeBetween(
        Window.unboundedPreceding, Window.unboundedFollowing
    )

    # Temporary columns to drop at the end:
    columns_to_drop = ["keepTopHit", "mostGranular", "rank"]

    return StudyIndex(
        _df=(
            self.df
            # Initialising quality controls column, if not present:
            .withColumn(
                "qualityControls",
                f.when(
                    f.col("qualityControls").isNull(),
                    f.array().cast(ArrayType(StringType())),
                ).otherwise(f.col("qualityControls")),
            )
            # Keeping top hit studies unless the same study is available from a summmary statistics source:
            # This value will be set for all rows for the same `studyId`:
            .withColumn(
                "keepTopHit",
                f.when(
                    f.array_contains(
                        f.collect_set(f.col("hasSumstats")).over(
                            full_study_id_window
                        ),
                        True,
                    ),
                    f.lit(False),
                ).otherwise(True),
            )
            # For studies without summary statistics, we remove the "Not curated by Open Targets" flag:
            .withColumn(
                "qualityControls",
                f.when(
                    ~f.col("hasSumstats"),
                    f.array_remove(
                        f.col("qualityControls"),
                        StudyQualityCheck.NO_OT_CURATION.value,
                    ),
                ).otherwise(f.col("qualityControls")),
            )
            # If top hits are not kept, we remove the "sumstats not available" flag from all QC lists:
            .withColumn(
                "qualityControls",
                f.when(
                    ~f.col("keepTopHit"),
                    f.array_remove(
                        f.col("qualityControls"),
                        StudyQualityCheck.SUMSTATS_NOT_AVAILABLE.value,
                    ),
                ).otherwise(f.col("qualityControls")),
            )
            # Then propagate quality checks for all sources of the same study:
            .withColumn(
                "qualityControls",
                f.array_distinct(
                    f.flatten(
                        f.collect_set("qualityControls").over(full_study_id_window)
                    )
                ),
            )
            # Propagating sumstatQCValues -> map, cannot be flatten:
            .withColumn(
                "sumstatQCValues",
                f.first("sumstatQCValues", ignorenulls=True).over(
                    full_study_id_window
                ),
            )
            # Propagating analysisFlags:
            .withColumn(
                "analysisFlags",
                f.flatten(
                    f.collect_list("analysisFlags").over(full_study_id_window)
                ),
            )
            # Propagating hasSumstatsFlag - if no flag, leave null:
            .withColumn(
                "hasSumstats",
                f.when(
                    # There's a true:
                    f.array_contains(
                        f.collect_set("hasSumstats").over(full_study_id_window),
                        True,
                    ),
                    f.lit(True),
                ).when(
                    # There's a false:
                    f.array_contains(
                        f.collect_set("hasSumstats").over(full_study_id_window),
                        False,
                    ),
                    f.lit(False),
                ),
            )
            # Propagating disease: when different sets of diseases available for the same study,
            # we pick the shortest list, becasuse we assume, that is the most accurate disease assignment:
            .withColumn(
                "mostGranular",
                f.size(f.col("traitFromSourceMappedIds"))
                == f.min(f.size(f.col("traitFromSourceMappedIds"))).over(
                    full_study_id_window
                ),
            )
            # Remove less granular disease mappings:
            .withColumn(
                "traitFromSourceMappedIds",
                f.when(f.col("mostGranular"), f.col("traitFromSourceMappedIds")),
            )
            # Propagate mapped disease:
            .withColumn(
                "traitFromSourceMappedIds",
                f.last(f.col("traitFromSourceMappedIds"), True).over(
                    full_study_id_window
                ),
            )
            # Repeating these steps for the `traitFromSource` column:
            .withColumn(
                "traitFromSource",
                f.when(f.col("mostGranular"), f.col("traitFromSource")),
            )
            # Propagate disease:
            .withColumn(
                "traitFromSource",
                f.last(f.col("traitFromSource"), True).over(full_study_id_window),
            )
            # Distinct study types are joined together into a string. So, if there's ambiguite, the study will be flagged when the study type is validated:
            .withColumn(
                "studyType",
                f.concat_ws(
                    ",", f.collect_set("studyType").over(full_study_id_window)
                ),
            )
            # At this point, all studies in one window is expected to be identical. Let's just pick one:
            .withColumn("rank", f.row_number().over(study_id_window))
            .filter(f.col("rank") == 1)
            .drop(*columns_to_drop)
            # Added to avoid Spark optimisation (see: https://github.com/opentargets/issues/issues/3906#issuecomment-2949299965)
            .persist()
        ),
        _schema=StudyIndex.get_schema(),
    )

`get_QC_column_name() -> str` `classmethod` ¶

Return the name of the quality control column.

Returns:

Name	Type	Description
`str`	`str`	The name of the quality control column.

Source code in src/gentropy/dataset/study_index.py

@classmethod
def get_QC_column_name(cls: type[StudyIndex]) -> str:
    """Return the name of the quality control column.

    Returns:
        str: The name of the quality control column.
    """
    return "qualityControls"

`get_QC_mappings() -> dict[str, str]` `classmethod` ¶

Quality control flag to QC column category mappings.

Returns:

Type	Description
`dict[str, str]`	dict[str, str]: Mapping between flag name and QC column category value.

Source code in src/gentropy/dataset/study_index.py

@classmethod
def get_QC_mappings(cls: type[StudyIndex]) -> dict[str, str]:
    """Quality control flag to QC column category mappings.

    Returns:
        dict[str, str]: Mapping between flag name and QC column category value.
    """
    return {member.name: member.value for member in StudyQualityCheck}

`get_schema() -> StructType` `classmethod` ¶

Provide the schema for the StudyIndex dataset.

Returns:

Name	Type	Description
`StructType`	`StructType`	The schema of the StudyIndex dataset.

Source code in src/gentropy/dataset/study_index.py

@classmethod
def get_schema(cls: type[StudyIndex]) -> StructType:
    """Provide the schema for the StudyIndex dataset.

    Returns:
        StructType: The schema of the StudyIndex dataset.
    """
    return parse_spark_schema("study_index.json")

`has_mapped_trait() -> Column` ¶

Return a boolean column indicating if a study has mapped disease.

Returns:

Name	Type	Description
`Column`	`Column`	True if the study has mapped disease.

Source code in src/gentropy/dataset/study_index.py

def has_mapped_trait(self: StudyIndex) -> Column:
    """Return a boolean column indicating if a study has mapped disease.

    Returns:
        Column: True if the study has mapped disease.
    """
    return f.size(self.df.traitFromSourceMappedIds) > 0

`has_summarystats() -> Column` ¶

Return a boolean column indicating if a study has harmonized summary statistics.

Returns:

Name	Type	Description
`Column`	`Column`	True if the study has harmonized summary statistics.

Source code in src/gentropy/dataset/study_index.py

def has_summarystats(self: StudyIndex) -> Column:
    """Return a boolean column indicating if a study has harmonized summary statistics.

    Returns:
        Column: True if the study has harmonized summary statistics.
    """
    return self.df.hasSumstats

`is_gwas() -> Column` ¶

Return a boolean column with true values for GWAS studies.

Returns:

Name	Type	Description
`Column`	`Column`	True if the study is a GWAS study.

Source code in src/gentropy/dataset/study_index.py

def is_gwas(self: StudyIndex) -> Column:
    """Return a boolean column with true values for GWAS studies.

    Returns:
        Column: True if the study is a GWAS study.
    """
    return self.df.studyType == "gwas"

`is_qtl() -> Column` ¶

Return a boolean column with true values for QTL studies.

Returns:

Name	Type	Description
`Column`	`Column`	True if the study is a QTL study.

Source code in src/gentropy/dataset/study_index.py

def is_qtl(self: StudyIndex) -> Column:
    """Return a boolean column with true values for QTL studies.

    Returns:
        Column: True if the study is a QTL study.
    """
    return self.df.studyType.endswith("qtl")

`is_quality_flagged() -> Column` ¶

Return a boolean column indicating if a study is flagged due to quality issues.

Returns:

Name	Type	Description
`Column`	`Column`	True if the study is flagged.

Source code in src/gentropy/dataset/study_index.py

def is_quality_flagged(self: StudyIndex) -> Column:
    """Return a boolean column indicating if a study is flagged due to quality issues.

    Returns:
        Column: True if the study is flagged.
    """
    # Testing for the presence of the qualityControls column:
    if "qualityControls" not in self.df.columns:
        return f.lit(False)
    else:
        return f.size(self.df["qualityControls"]) != 0

`study_type_lut() -> DataFrame` ¶

Return a lookup table of study type.

Returns:

Name	Type	Description
`DataFrame`	`DataFrame`	A dataframe containing `studyId` and `studyType` columns.

Source code in src/gentropy/dataset/study_index.py

def study_type_lut(self: StudyIndex) -> DataFrame:
    """Return a lookup table of study type.

    Returns:
        DataFrame: A dataframe containing `studyId` and `studyType` columns.
    """
    return self.df.select("studyId", "studyType")

`validate_analysis_flags() -> StudyIndex` ¶

Validating analysis flags in the study index dataset.

This method checks the analysis flags and reports the the outcome of predicates to the quality control column.

Returns:

Name	Type	Description
`StudyIndex`	`StudyIndex`	with qualityControls column flagged based on analysisFlags.

Source code in src/gentropy/dataset/study_index.py

def validate_analysis_flags(self: StudyIndex) -> StudyIndex:
    """Validating analysis flags in the study index dataset.

    This method checks the analysis flags and reports the the outcome of predicates to the quality control column.

    Returns:
        StudyIndex: with qualityControls column flagged based on analysisFlags.
    """
    predicate = f.array_contains(
        "analysisFlags", StudyAnalysisFlag.CASE_CASE_STUDY.value
    )
    df = self.df.withColumn(
        "qualityControls",
        StudyIndex.update_quality_flag(
            f.col("qualityControls"),
            predicate,
            StudyQualityCheck.CASE_CASE_STUDY_DESIGN,
        ),
    )
    return StudyIndex(_df=df, _schema=StudyIndex.get_schema())

`validate_biosample(biosample_index: BiosampleIndex) -> StudyIndex` ¶

Validating biosample identifiers in the study index against the provided biosample index.

Parameters:

Name	Type	Description	Default
`biosample_index`	`BiosampleIndex`	Biosample index containing a reference of biosample identifiers e.g. cell types, tissues, cell lines, etc.	required

Returns:

Name	Type	Description
`StudyIndex`	`StudyIndex`	where non-gwas studies are flagged if biosampleIndex could not be validated.

Source code in src/gentropy/dataset/study_index.py

def validate_biosample(
    self: StudyIndex, biosample_index: BiosampleIndex
) -> StudyIndex:
    """Validating biosample identifiers in the study index against the provided biosample index.

    Args:
        biosample_index (BiosampleIndex): Biosample index containing a reference of biosample identifiers e.g. cell types, tissues, cell lines, etc.

    Returns:
        StudyIndex: where non-gwas studies are flagged if biosampleIndex could not be validated.
    """
    biosample_set = biosample_index.df.select(
        "biosampleId", f.lit(True).alias("isIdFound")
    )

    # If biosampleId in df, we need to drop it:
    if "biosampleId" in self.df.columns:
        self.df = self.df.drop("biosampleId")

    # As the biosampleFromSourceId is not a mandatory field of study index, we return if the column is not there:
    if "biosampleFromSourceId" not in self.df.columns:
        return self

    validated_df = (
        self.df.join(
            biosample_set,
            self.df.biosampleFromSourceId == biosample_set.biosampleId,
            how="left",
        )
        .withColumn(
            "isIdFound",
            f.when(
                (f.col("studyType") != "gwas") & (f.col("isIdFound").isNull()),
                f.lit(False),
            ).otherwise(f.lit(True)),
        )
        .withColumn(
            "qualityControls",
            StudyIndex.update_quality_flag(
                f.col("qualityControls"),
                ~f.col("isIdFound"),
                StudyQualityCheck.UNKNOWN_BIOSAMPLE,
            ),
        )
        .drop("isIdFound")
    )

    return StudyIndex(_df=validated_df, _schema=StudyIndex.get_schema())

`validate_disease(disease_map: DataFrame) -> StudyIndex` ¶

Validate diseases in the study index dataset.

Parameters:

Name	Type	Description	Default
`disease_map`	`DataFrame`	a dataframe with two columns (efo, diseaseId).	required

Returns:

Name	Type	Description
`StudyIndex`	`StudyIndex`	where gwas studies are flagged where no valid disease id could be found.

Source code in src/gentropy/dataset/study_index.py

def validate_disease(self: StudyIndex, disease_map: DataFrame) -> StudyIndex:
    """Validate diseases in the study index dataset.

    Args:
        disease_map (DataFrame): a dataframe with two columns (efo, diseaseId).

    Returns:
        StudyIndex: where gwas studies are flagged where no valid disease id could be found.
    """
    # Because the disease ids are not mandatory fields of the schema, we skip vaildation if these columns are not present:
    if ("traitFromSourceMappedIds" not in self.df.columns) or (
        "backgroundTraitFromSourceMappedIds" not in self.df.columns
    ):
        return self

    # Disease Column names:
    foreground_disease_column = "diseaseIds"
    background_disease_column = "backgroundDiseaseIds"

    # If diseaseId in schema, we need to drop it:
    drop_columns = [
        column
        for column in self.df.columns
        if column in [foreground_disease_column, background_disease_column]
    ]

    if len(drop_columns) > 0:
        self.df = self.df.drop(*drop_columns)

    # Normalise disease:
    normalised_disease = self._normalise_disease(
        "traitFromSourceMappedIds", foreground_disease_column, disease_map
    )
    normalised_background_disease = self._normalise_disease(
        "backgroundTraitFromSourceMappedIds", background_disease_column, disease_map
    )

    return StudyIndex(
        _df=(
            self.df.join(normalised_disease, on="studyId", how="left")
            .join(normalised_background_disease, on="studyId", how="left")
            # Updating disease columns:
            .withColumn(
                foreground_disease_column,
                f.when(
                    f.col(foreground_disease_column).isNull(), f.array()
                ).otherwise(f.col(foreground_disease_column)),
            )
            .withColumn(
                background_disease_column,
                f.when(
                    f.col(background_disease_column).isNull(), f.array()
                ).otherwise(f.col(background_disease_column)),
            )
            # Flagging gwas studies where no valid disease is avilable:
            .withColumn(
                "qualityControls",
                StudyIndex.update_quality_flag(
                    f.col("qualityControls"),
                    # Flagging all gwas studies with no normalised disease:
                    (f.size(f.col(foreground_disease_column)) == 0)
                    & (f.col("studyType") == "gwas"),
                    StudyQualityCheck.UNRESOLVED_DISEASE,
                ),
            )
            # Added to avoid Spark optimisation (see: https://github.com/opentargets/issues/issues/3906#issuecomment-2949299965)
            .persist()
        ),
        _schema=StudyIndex.get_schema(),
    )

`validate_study_type() -> StudyIndex` ¶

Validating study type and flag unsupported types.

Returns:

Name	Type	Description
`StudyIndex`	`StudyIndex`	with flagged studies with unsupported type.

Source code in src/gentropy/dataset/study_index.py

def validate_study_type(self: StudyIndex) -> StudyIndex:
    """Validating study type and flag unsupported types.

    Returns:
        StudyIndex: with flagged studies with unsupported type.
    """
    validated_df = (
        self.df
        # Flagging unsupported study types:
        .withColumn(
            "qualityControls",
            StudyIndex.update_quality_flag(
                f.col("qualityControls"),
                f.when(
                    (f.col("studyType") == "gwas")
                    | f.col("studyType").endswith("qtl"),
                    False,
                ).otherwise(True),
                StudyQualityCheck.UNKNOWN_STUDY_TYPE,
            ),
        )
    )
    return StudyIndex(_df=validated_df, _schema=StudyIndex.get_schema())

`validate_target(target_index: TargetIndex) -> StudyIndex` ¶

Validating gene identifiers in the study index against the provided target index.

Parameters:

Name	Type	Description	Default
`target_index`	`TargetIndex`	target index containing the reference gene identifiers (Ensembl gene identifiers).	required

Returns:

Name	Type	Description
`StudyIndex`	`StudyIndex`	with flagged studies if geneId could not be validated.

Source code in src/gentropy/dataset/study_index.py

def validate_target(self: StudyIndex, target_index: TargetIndex) -> StudyIndex:
    """Validating gene identifiers in the study index against the provided target index.

    Args:
        target_index (TargetIndex): target index containing the reference gene identifiers (Ensembl gene identifiers).

    Returns:
        StudyIndex: with flagged studies if geneId could not be validated.
    """
    gene_set = target_index.df.select(
        f.col("id").alias("geneId"), f.lit(True).alias("isIdFound")
    )

    # As the geneId is not a mandatory field of study index, we return if the column is not there:
    if "geneId" not in self.df.columns:
        return self

    validated_df = (
        self.df.join(gene_set, on="geneId", how="left")
        .withColumn(
            "isIdFound",
            f.when(
                (f.col("studyType") != "gwas") & f.col("isIdFound").isNull(),
                f.lit(False),
            ).otherwise(f.lit(True)),
        )
        .withColumn(
            "qualityControls",
            StudyIndex.update_quality_flag(
                f.col("qualityControls"),
                ~f.col("isIdFound"),
                StudyQualityCheck.UNRESOLVED_TARGET,
            ),
        )
        .drop("isIdFound")
    )

    return StudyIndex(_df=validated_df, _schema=StudyIndex.get_schema())

`validate_unique_study_id() -> StudyIndex` ¶

Validating the uniqueness of study identifiers and flagging duplicated studies.

Returns:

Name	Type	Description
`StudyIndex`	`StudyIndex`	with flagged duplicated studies.

Source code in src/gentropy/dataset/study_index.py

def validate_unique_study_id(self: StudyIndex) -> StudyIndex:
    """Validating the uniqueness of study identifiers and flagging duplicated studies.

    Returns:
        StudyIndex: with flagged duplicated studies.
    """
    return StudyIndex(
        _df=self.df.withColumn(
            "qualityControls",
            self.update_quality_flag(
                f.col("qualityControls"),
                self.flag_duplicates(f.col("studyId")),
                StudyQualityCheck.DUPLICATED_STUDY,
            ),
        ),
        _schema=StudyIndex.get_schema(),
    )

Schema¶

root
 |-- studyId: string (nullable = false)
 |-- projectId: string (nullable = false)
 |-- studyType: string (nullable = false)
 |-- traitFromSource: string (nullable = true)
 |-- traitFromSourceMappedIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- diseaseIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- geneId: string (nullable = true)
 |-- biosampleFromSourceId: string (nullable = true)
 |-- biosampleId: string (nullable = true)
 |-- pubmedId: string (nullable = true)
 |-- publicationTitle: string (nullable = true)
 |-- publicationFirstAuthor: string (nullable = true)
 |-- publicationDate: string (nullable = true)
 |-- publicationJournal: string (nullable = true)
 |-- backgroundTraitFromSourceMappedIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- backgroundDiseaseIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- initialSampleSize: string (nullable = true)
 |-- nCases: integer (nullable = true)
 |-- nControls: integer (nullable = true)
 |-- nSamples: integer (nullable = true)
 |-- cohorts: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ldPopulationStructure: array (nullable = true)
 |    |-- element: struct (containsNull = false)
 |    |    |-- ldPopulation: string (nullable = true)
 |    |    |-- relativeSampleSize: double (nullable = true)
 |-- discoverySamples: array (nullable = true)
 |    |-- element: struct (containsNull = false)
 |    |    |-- sampleSize: integer (nullable = true)
 |    |    |-- ancestry: string (nullable = true)
 |-- replicationSamples: array (nullable = true)
 |    |-- element: struct (containsNull = false)
 |    |    |-- sampleSize: integer (nullable = true)
 |    |    |-- ancestry: string (nullable = true)
 |-- qualityControls: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- analysisFlags: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- summarystatsLocation: string (nullable = true)
 |-- hasSumstats: boolean (nullable = true)
 |-- condition: string (nullable = true)
 |-- sumstatQCValues: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- QCCheckName: string (nullable = false)
 |    |    |-- QCCheckValue: float (nullable = false)

2023-01-20
2024-01-17
Contributors

Study Index

gentropy.dataset.study_index.StudyIndex dataclass ¶

aggregate_and_map_ancestries(discovery_samples: Column) -> Column classmethod ¶

annotate_sumstats_qc(sumstats_qc: SummaryStatisticsQC, threshold_mean_beta: float = 0.05, threshold_mean_diff_pz: float = 0.05, threshold_se_diff_pz: float = 0.05, threshold_min_gc_lambda: float = 0.7, threshold_max_gc_lambda: float = 2.5, threshold_min_n_variants: int = 2000000) -> StudyIndex ¶

deconvolute_studies() -> StudyIndex ¶

get_QC_column_name() -> str classmethod ¶

get_QC_mappings() -> dict[str, str] classmethod ¶

get_schema() -> StructType classmethod ¶

has_mapped_trait() -> Column ¶

has_summarystats() -> Column ¶

is_gwas() -> Column ¶

is_qtl() -> Column ¶

is_quality_flagged() -> Column ¶

study_type_lut() -> DataFrame ¶

validate_analysis_flags() -> StudyIndex ¶

validate_biosample(biosample_index: BiosampleIndex) -> StudyIndex ¶

validate_disease(disease_map: DataFrame) -> StudyIndex ¶

validate_study_type() -> StudyIndex ¶

validate_target(target_index: TargetIndex) -> StudyIndex ¶

validate_unique_study_id() -> StudyIndex ¶

Schema¶

`gentropy.dataset.study_index.StudyIndex` `dataclass` ¶

`aggregate_and_map_ancestries(discovery_samples: Column) -> Column` `classmethod` ¶

`annotate_sumstats_qc(sumstats_qc: SummaryStatisticsQC, threshold_mean_beta: float = 0.05, threshold_mean_diff_pz: float = 0.05, threshold_se_diff_pz: float = 0.05, threshold_min_gc_lambda: float = 0.7, threshold_max_gc_lambda: float = 2.5, threshold_min_n_variants: int = 2000000) -> StudyIndex` ¶

`deconvolute_studies() -> StudyIndex` ¶

`get_QC_column_name() -> str` `classmethod` ¶

`get_QC_mappings() -> dict[str, str]` `classmethod` ¶

`get_schema() -> StructType` `classmethod` ¶

`has_mapped_trait() -> Column` ¶

`has_summarystats() -> Column` ¶

`is_gwas() -> Column` ¶

`is_qtl() -> Column` ¶

`is_quality_flagged() -> Column` ¶

`study_type_lut() -> DataFrame` ¶

`validate_analysis_flags() -> StudyIndex` ¶

`validate_biosample(biosample_index: BiosampleIndex) -> StudyIndex` ¶

`validate_disease(disease_map: DataFrame) -> StudyIndex` ¶

`validate_study_type() -> StudyIndex` ¶

`validate_target(target_index: TargetIndex) -> StudyIndex` ¶

`validate_unique_study_id() -> StudyIndex` ¶