Skip to content

Study Index

gentropy.datasource.finngen.study_index.FinnGenStudyIndex

Study index dataset from FinnGen.

The following information is aggregated/extracted:

  • Study ID in the special format (e.g. FINNGEN_R10_*)
  • Trait name (for example, Amoebiasis)
  • Number of cases and controls
  • Link to the summary statistics location

Some fields are also populated as constants, such as study type and the initial sample size.

Source code in src/gentropy/datasource/finngen/study_index.py
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
class FinnGenStudyIndex:
    """Study index dataset from FinnGen.

    The following information is aggregated/extracted:

    - Study ID in the special format (e.g. FINNGEN_R10_*)
    - Trait name (for example, Amoebiasis)
    - Number of cases and controls
    - Link to the summary statistics location

    Some fields are also populated as constants, such as study type and the initial sample size.
    """

    # Endpoint that returns the phenotype table as JSON.
    finngen_phenotype_table_url: str = "https://r10.finngen.fi/api/phenos"
    # Release label; used as the `projectId` and as the prefix of every `studyId`.
    finngen_release_prefix: str = "FINNGEN_R10"
    # The summary statistics location is built as prefix + phenocode + suffix.
    finngen_summary_stats_url_prefix: str = (
        "gs://finngen-public-data-r10/summary_stats/finngen_R10_"
    )
    finngen_summary_stats_url_suffix: str = ".gz"

    @classmethod
    def from_source(
        cls: type[FinnGenStudyIndex],
        spark: SparkSession,
    ) -> StudyIndex:
        """Ingest study-level metadata from FinnGen.

        The phenotype table is downloaded from `finngen_phenotype_table_url`,
        parsed as JSON by Spark, and reshaped into the study index columns.

        Args:
            spark (SparkSession): Spark session object.

        Returns:
            StudyIndex: Parsed and annotated FinnGen study table.
        """
        # Close the HTTP response deterministically once the payload is read,
        # instead of leaving the connection open until garbage collection.
        with urlopen(cls.finngen_phenotype_table_url) as response:
            json_data = response.read().decode("utf-8")

        # The whole payload is handed to Spark as a single-element RDD of JSON text.
        rdd = spark.sparkContext.parallelize([json_data])
        raw_df = spark.read.json(rdd)
        return StudyIndex(
            _df=raw_df.select(
                f.concat(
                    f.lit(f"{cls.finngen_release_prefix}_"), f.col("phenocode")
                ).alias("studyId"),
                f.col("phenostring").alias("traitFromSource"),
                f.col("num_cases").cast("integer").alias("nCases"),
                f.col("num_controls").cast("integer").alias("nControls"),
                (f.col("num_cases") + f.col("num_controls"))
                .cast("integer")
                .alias("nSamples"),
                f.lit(cls.finngen_release_prefix).alias("projectId"),
                f.lit("gwas").alias("studyType"),
                f.lit(True).alias("hasSumstats"),
                # Constant, release-wide description; not derived from the per-study counts.
                f.lit("377,277 (210,870 females and 166,407 males)").alias(
                    "initialSampleSize"
                ),
                f.array(
                    f.struct(
                        f.lit(377277).cast("integer").alias("sampleSize"),
                        f.lit("Finnish").alias("ancestry"),
                    )
                ).alias("discoverySamples"),
                # Cohort label is consistent with GWAS Catalog curation.
                f.array(f.lit("FinnGen")).alias("cohorts"),
                f.concat(
                    f.lit(cls.finngen_summary_stats_url_prefix),
                    f.col("phenocode"),
                    f.lit(cls.finngen_summary_stats_url_suffix),
                ).alias("summarystatsLocation"),
            ).withColumn(
                # Derived from the constant `discoverySamples` column built above.
                "ldPopulationStructure",
                StudyIndex.aggregate_and_map_ancestries(f.col("discoverySamples")),
            ),
            _schema=StudyIndex.get_schema(),
        )

from_source(spark: SparkSession) -> StudyIndex classmethod

Ingests study-level metadata from FinnGen.

Parameters:

| Name  | Type         | Description           | Default  |
|-------|--------------|-----------------------|----------|
| spark | SparkSession | Spark session object. | required |

Returns:

| Name       | Type       | Description                                |
|------------|------------|--------------------------------------------|
| StudyIndex | StudyIndex | Parsed and annotated FinnGen study table.  |

Source code in src/gentropy/datasource/finngen/study_index.py
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
@classmethod
def from_source(
    cls: type[FinnGenStudyIndex],
    spark: SparkSession,
) -> StudyIndex:
    """Ingest study-level metadata from FinnGen.

    The phenotype table is downloaded from `cls.finngen_phenotype_table_url`,
    parsed as JSON by Spark, and reshaped into the study index columns.

    Args:
        spark (SparkSession): Spark session object.

    Returns:
        StudyIndex: Parsed and annotated FinnGen study table.
    """
    # Close the HTTP response deterministically once the payload is read,
    # instead of leaving the connection open until garbage collection.
    with urlopen(cls.finngen_phenotype_table_url) as response:
        json_data = response.read().decode("utf-8")

    # The whole payload is handed to Spark as a single-element RDD of JSON text.
    rdd = spark.sparkContext.parallelize([json_data])
    raw_df = spark.read.json(rdd)
    return StudyIndex(
        _df=raw_df.select(
            f.concat(
                f.lit(f"{cls.finngen_release_prefix}_"), f.col("phenocode")
            ).alias("studyId"),
            f.col("phenostring").alias("traitFromSource"),
            f.col("num_cases").cast("integer").alias("nCases"),
            f.col("num_controls").cast("integer").alias("nControls"),
            (f.col("num_cases") + f.col("num_controls"))
            .cast("integer")
            .alias("nSamples"),
            f.lit(cls.finngen_release_prefix).alias("projectId"),
            f.lit("gwas").alias("studyType"),
            f.lit(True).alias("hasSumstats"),
            # Constant, release-wide description; not derived from the per-study counts.
            f.lit("377,277 (210,870 females and 166,407 males)").alias(
                "initialSampleSize"
            ),
            f.array(
                f.struct(
                    f.lit(377277).cast("integer").alias("sampleSize"),
                    f.lit("Finnish").alias("ancestry"),
                )
            ).alias("discoverySamples"),
            # Cohort label is consistent with GWAS Catalog curation.
            f.array(f.lit("FinnGen")).alias("cohorts"),
            f.concat(
                f.lit(cls.finngen_summary_stats_url_prefix),
                f.col("phenocode"),
                f.lit(cls.finngen_summary_stats_url_suffix),
            ).alias("summarystatsLocation"),
        ).withColumn(
            # Derived from the constant `discoverySamples` column built above.
            "ldPopulationStructure",
            StudyIndex.aggregate_and_map_ancestries(f.col("discoverySamples")),
        ),
        _schema=StudyIndex.get_schema(),
    )