Skip to content

Study Index

gentropy.datasource.finngen.study_index.FinnGenStudyIndex

Study index dataset from FinnGen.

The following information is aggregated/extracted:

  • Study ID in the special format (e.g. FINNGEN_R11_*)
  • Trait name (for example, Amoebiasis)
  • Number of cases and controls
  • Link to the summary statistics location
  • EFO mapping from curated EFO mapping file

Some fields are also populated as constants, such as study type and the initial sample size.

Source code in src/gentropy/datasource/finngen/study_index.py
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
class FinnGenStudyIndex:
    """Study index dataset from FinnGen.

    The following information is aggregated/extracted:

    - Study ID in the special format (e.g. FINNGEN_R11_*)
    - Trait name (for example, Amoebiasis)
    - Number of cases and controls
    - Link to the summary statistics location
    - EFO mapping from curated EFO mapping file

    Some fields are also populated as constants, such as study type and the initial sample size.
    """

    # Literal columns added to every study row by `from_source`
    # (each key becomes a column name, each value a constant column).
    CONSTANTS = {
        "studyType": "gwas",
        "hasSumstats": True,
        "initialSampleSize": "500,348 (282,064 females and 218,284 males)",
        "pubmedId": "36653562",
    }

    @staticmethod
    def validate_release_prefix(release_prefix: str) -> FinngenPrefixMatch:
        """Validate release prefix passed to finngen StudyIndex.

        The whole string has to be of the form ``FINNGEN_R<digits>`` with at
        most one trailing underscore. The returned prefix never carries the
        trailing underscore.

        Args:
            release_prefix (str): Finngen release prefix, should be a string like FINNGEN_R*.

        Returns:
            FinngenPrefixMatch: Object containing valid prefix and release strings.

        Raises:
            ValueError: When an incorrect release prefix is provided.
        """
        pattern = re.compile(r"FINNGEN_(?P<release>R\d+)_?")
        # `fullmatch` rejects trailing garbage (e.g. "FINNGEN_R11_extra" or
        # "FINNGEN_R11__") that an unanchored `match` would let through.
        pattern_match = pattern.fullmatch(release_prefix)
        if not pattern_match:
            raise ValueError(
                f"Invalid FinnGen release prefix: {release_prefix}, use the format FINNGEN_R*"
            )
        release = pattern_match.group("release")
        # Rebuilding the prefix from the captured release drops the optional
        # trailing underscore.
        return FinngenPrefixMatch(prefix=f"FINNGEN_{release}", release=release)

    @classmethod
    def from_source(
        cls: type[FinnGenStudyIndex],
        spark: SparkSession,
        finngen_phenotype_table_url: str,
        finngen_release_prefix: str,
        finngen_summary_stats_url_prefix: str,
        finngen_summary_stats_url_suffix: str,
        sample_size: int,
    ) -> StudyIndex:
        """Ingest study-level metadata from FinnGen.

        The phenotype table is downloaded as JSON from the given URL, loaded
        into a Spark DataFrame and reshaped into the study index columns.

        Args:
            spark (SparkSession): Spark session object.
            finngen_phenotype_table_url (str): URL to the FinnGen phenotype table.
            finngen_release_prefix (str): FinnGen release prefix.
            finngen_summary_stats_url_prefix (str): FinnGen summary stats URL prefix.
            finngen_summary_stats_url_suffix (str): FinnGen summary stats URL suffix.
            sample_size (int): Number of individuals who participated in sample collection.

        Returns:
            StudyIndex: Parsed and annotated FinnGen study table.
        """
        # Use the response as a context manager so the connection is closed
        # as soon as the payload has been read.
        with urlopen(finngen_phenotype_table_url) as response:
            json_data = response.read().decode("utf-8")
        rdd = spark.sparkContext.parallelize([json_data])
        raw_df = spark.read.json(rdd)

        return StudyIndex(
            _df=raw_df.select(
                # Study ID is "<release prefix>_<phenocode>".
                f.concat_ws(
                    "_", f.lit(finngen_release_prefix), f.col("phenocode")
                ).alias("studyId"),
                f.col("phenostring").alias("traitFromSource"),
                f.col("num_cases").cast("integer").alias("nCases"),
                f.col("num_controls").cast("integer").alias("nControls"),
                (f.col("num_cases") + f.col("num_controls"))
                .cast("integer")
                .alias("nSamples"),
                # Single discovery sample: the caller-supplied sample size,
                # labelled with "Finnish" ancestry.
                f.array(
                    f.struct(
                        f.lit(sample_size).cast("integer").alias("sampleSize"),
                        f.lit("Finnish").alias("ancestry"),
                    )
                ).alias("discoverySamples"),
                # Cohort label is consistent with GWAS Catalog curation.
                f.array(f.lit("FinnGen")).alias("cohorts"),
                # Summary statistics location is "<prefix><phenocode><suffix>".
                f.concat(
                    f.lit(finngen_summary_stats_url_prefix),
                    f.col("phenocode"),
                    f.lit(finngen_summary_stats_url_suffix),
                ).alias("summarystatsLocation"),
                f.lit(finngen_release_prefix).alias("projectId"),
                *[f.lit(value).alias(key) for key, value in cls.CONSTANTS.items()],
            ).withColumn(
                "ldPopulationStructure",
                StudyIndex.aggregate_and_map_ancestries(f.col("discoverySamples")),
            ),
            _schema=StudyIndex.get_schema(),
        )

from_source(spark: SparkSession, finngen_phenotype_table_url: str, finngen_release_prefix: str, finngen_summary_stats_url_prefix: str, finngen_summary_stats_url_suffix: str, sample_size: int) -> StudyIndex classmethod

This function ingests study-level metadata from FinnGen.

Parameters:

Name Type Description Default
spark SparkSession

Spark session object.

required
finngen_phenotype_table_url str

URL to the FinnGen phenotype table.

required
finngen_release_prefix str

FinnGen release prefix.

required
finngen_summary_stats_url_prefix str

FinnGen summary stats URL prefix.

required
finngen_summary_stats_url_suffix str

FinnGen summary stats URL suffix.

required
sample_size int

Number of individuals who participated in sample collection.

required

Returns:

Name Type Description
StudyIndex StudyIndex

Parsed and annotated FinnGen study table.

Source code in src/gentropy/datasource/finngen/study_index.py
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
@classmethod
def from_source(
    cls: type[FinnGenStudyIndex],
    spark: SparkSession,
    finngen_phenotype_table_url: str,
    finngen_release_prefix: str,
    finngen_summary_stats_url_prefix: str,
    finngen_summary_stats_url_suffix: str,
    sample_size: int,
) -> StudyIndex:
    """Ingest study-level metadata from FinnGen.

    The phenotype table is downloaded as JSON from the given URL, loaded
    into a Spark DataFrame and reshaped into the study index columns.

    Args:
        spark (SparkSession): Spark session object.
        finngen_phenotype_table_url (str): URL to the FinnGen phenotype table.
        finngen_release_prefix (str): FinnGen release prefix.
        finngen_summary_stats_url_prefix (str): FinnGen summary stats URL prefix.
        finngen_summary_stats_url_suffix (str): FinnGen summary stats URL suffix.
        sample_size (int): Number of individuals who participated in sample collection.

    Returns:
        StudyIndex: Parsed and annotated FinnGen study table.
    """
    # Use the response as a context manager so the connection is closed
    # as soon as the payload has been read.
    with urlopen(finngen_phenotype_table_url) as response:
        json_data = response.read().decode("utf-8")
    rdd = spark.sparkContext.parallelize([json_data])
    raw_df = spark.read.json(rdd)

    return StudyIndex(
        _df=raw_df.select(
            # Study ID is "<release prefix>_<phenocode>".
            f.concat_ws(
                "_", f.lit(finngen_release_prefix), f.col("phenocode")
            ).alias("studyId"),
            f.col("phenostring").alias("traitFromSource"),
            f.col("num_cases").cast("integer").alias("nCases"),
            f.col("num_controls").cast("integer").alias("nControls"),
            (f.col("num_cases") + f.col("num_controls"))
            .cast("integer")
            .alias("nSamples"),
            # Single discovery sample: the caller-supplied sample size,
            # labelled with "Finnish" ancestry.
            f.array(
                f.struct(
                    f.lit(sample_size).cast("integer").alias("sampleSize"),
                    f.lit("Finnish").alias("ancestry"),
                )
            ).alias("discoverySamples"),
            # Cohort label is consistent with GWAS Catalog curation.
            f.array(f.lit("FinnGen")).alias("cohorts"),
            # Summary statistics location is "<prefix><phenocode><suffix>".
            f.concat(
                f.lit(finngen_summary_stats_url_prefix),
                f.col("phenocode"),
                f.lit(finngen_summary_stats_url_suffix),
            ).alias("summarystatsLocation"),
            f.lit(finngen_release_prefix).alias("projectId"),
            *[f.lit(value).alias(key) for key, value in cls.CONSTANTS.items()],
        ).withColumn(
            "ldPopulationStructure",
            StudyIndex.aggregate_and_map_ancestries(f.col("discoverySamples")),
        ),
        _schema=StudyIndex.get_schema(),
    )

validate_release_prefix(release_prefix: str) -> FinngenPrefixMatch staticmethod

Validate release prefix passed to finngen StudyIndex.

Parameters:

Name Type Description Default
release_prefix str

Finngen release prefix, should be a string like FINNGEN_R*.

required

Returns:

Name Type Description
FinngenPrefixMatch FinngenPrefixMatch

Object containing valid prefix and release strings.

Raises:

Type Description
ValueError

When an incorrect release prefix is provided.

This method ensures that the trailing underscore is removed from prefix.

Source code in src/gentropy/datasource/finngen/study_index.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
@staticmethod
def validate_release_prefix(release_prefix: str) -> FinngenPrefixMatch:
    """Validate release prefix passed to finngen StudyIndex.

    The whole string has to be of the form ``FINNGEN_R<digits>`` with at
    most one trailing underscore. The returned prefix never carries the
    trailing underscore.

    Args:
        release_prefix (str): Finngen release prefix, should be a string like FINNGEN_R*.

    Returns:
        FinngenPrefixMatch: Object containing valid prefix and release strings.

    Raises:
        ValueError: When an incorrect release prefix is provided.
    """
    pattern = re.compile(r"FINNGEN_(?P<release>R\d+)_?")
    # `fullmatch` rejects trailing garbage (e.g. "FINNGEN_R11_extra" or
    # "FINNGEN_R11__") that an unanchored `match` would let through.
    pattern_match = pattern.fullmatch(release_prefix)
    if not pattern_match:
        raise ValueError(
            f"Invalid FinnGen release prefix: {release_prefix}, use the format FINNGEN_R*"
        )
    release = pattern_match.group("release")
    # Rebuilding the prefix from the captured release drops the optional
    # trailing underscore.
    return FinngenPrefixMatch(prefix=f"FINNGEN_{release}", release=release)