Skip to content

Study index finngen

Bases: StudyIndex

Study index dataset from FinnGen.

The following information is aggregated/extracted:

  • Study ID in the special format (FINNGEN_R9_*)
  • Trait name (for example, Amoebiasis)
  • Number of cases and controls
  • Link to the summary statistics location

Some fields are also populated as constants, such as study type and the initial sample size.

Source code in src/otg/dataset/study_index.py
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
@dataclass
class StudyIndexFinnGen(StudyIndex):
    """FinnGen flavour of the study index dataset.

    Each row of the raw FinnGen study table becomes a study index record carrying:

    - A study ID built from the release prefix and the phenocode (for example, FINNGEN_R9_*)
    - The trait name as reported by the source (for example, Amoebiasis)
    - The number of cases and controls, together with their sum
    - The location of the summary statistics

    Study type, summary statistics availability and the initial sample size are filled in as constants.
    """

    @classmethod
    def get_schema(cls: type[StudyIndexFinnGen]) -> StructType:
        """Return the Spark schema of the StudyIndexFinnGen dataset.

        The parent class already provides the same schema; it is repeated here because abstract methods have to be implemented by every child class.

        Returns:
            StructType: Spark schema for the StudyIndexFinnGen dataset.
        """
        studies_schema = parse_spark_schema("studies.json")
        return studies_schema

    @classmethod
    def from_source(
        cls: type[StudyIndexFinnGen],
        finngen_studies: DataFrame,
        finngen_release_prefix: str,
        finngen_sumstat_url_prefix: str,
        finngen_sumstat_url_suffix: str,
    ) -> StudyIndexFinnGen:
        """Build the FinnGen study index from the raw study level metadata.

        Args:
            finngen_studies (DataFrame): FinnGen raw study table
            finngen_release_prefix (str): Release prefix pattern.
            finngen_sumstat_url_prefix (str): URL prefix for summary statistics location.
            finngen_sumstat_url_suffix (str): URL suffix for summary statistics location.

        Returns:
            StudyIndexFinnGen: Parsed and annotated FinnGen study table.
        """
        # Study identifier: "<release prefix>_<phenocode>".
        study_id = f.concat(
            f.lit(f"{finngen_release_prefix}_"), f.col("phenocode")
        ).alias("studyId")

        # The summary statistics location wraps the already prefixed study ID
        # (not the bare phenocode) between the URL prefix and suffix.
        sumstats_location = f.concat(
            f.lit(finngen_sumstat_url_prefix),
            f.col("studyId"),
            f.lit(finngen_sumstat_url_suffix),
        )

        # Renamed source columns first, followed by the constant annotations.
        selected = finngen_studies.select(
            study_id,
            f.col("phenostring").alias("traitFromSource"),
            f.col("num_cases").alias("nCases"),
            f.col("num_controls").alias("nControls"),
            f.lit(finngen_release_prefix).alias("projectId"),
            f.lit("gwas").alias("studyType"),
            f.lit(True).alias("hasSumstats"),
            f.lit("377,277 (210,870 females and 166,407 males)").alias(
                "initialSampleSize"
            ),
        )

        # Derived columns are appended after the selection, in this order.
        with_sample_count = selected.withColumn(
            "nSamples", f.col("nCases") + f.col("nControls")
        )
        annotated = with_sample_count.withColumn(
            "summarystatsLocation", sumstats_location
        )

        return cls(_df=annotated, _schema=cls.get_schema())

from_source(finngen_studies, finngen_release_prefix, finngen_sumstat_url_prefix, finngen_sumstat_url_suffix) classmethod

This function ingests study level metadata from FinnGen.

Parameters:

Name Type Description Default
finngen_studies DataFrame

FinnGen raw study table

required
finngen_release_prefix str

Release prefix pattern.

required
finngen_sumstat_url_prefix str

URL prefix for summary statistics location.

required
finngen_sumstat_url_suffix str

URL suffix for summary statistics location.

required

Returns:

Name Type Description
StudyIndexFinnGen StudyIndexFinnGen

Parsed and annotated FinnGen study table.

Source code in src/otg/dataset/study_index.py
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
@classmethod
def from_source(
    cls: type[StudyIndexFinnGen],
    finngen_studies: DataFrame,
    finngen_release_prefix: str,
    finngen_sumstat_url_prefix: str,
    finngen_sumstat_url_suffix: str,
) -> StudyIndexFinnGen:
    """Build the FinnGen study index from the raw study level metadata.

    Args:
        finngen_studies (DataFrame): FinnGen raw study table
        finngen_release_prefix (str): Release prefix pattern.
        finngen_sumstat_url_prefix (str): URL prefix for summary statistics location.
        finngen_sumstat_url_suffix (str): URL suffix for summary statistics location.

    Returns:
        StudyIndexFinnGen: Parsed and annotated FinnGen study table.
    """
    # Study identifier: "<release prefix>_<phenocode>".
    study_id = f.concat(
        f.lit(f"{finngen_release_prefix}_"), f.col("phenocode")
    ).alias("studyId")

    # The summary statistics location wraps the already prefixed study ID
    # (not the bare phenocode) between the URL prefix and suffix.
    sumstats_location = f.concat(
        f.lit(finngen_sumstat_url_prefix),
        f.col("studyId"),
        f.lit(finngen_sumstat_url_suffix),
    )

    # Renamed source columns first, followed by the constant annotations.
    selected = finngen_studies.select(
        study_id,
        f.col("phenostring").alias("traitFromSource"),
        f.col("num_cases").alias("nCases"),
        f.col("num_controls").alias("nControls"),
        f.lit(finngen_release_prefix).alias("projectId"),
        f.lit("gwas").alias("studyType"),
        f.lit(True).alias("hasSumstats"),
        f.lit("377,277 (210,870 females and 166,407 males)").alias(
            "initialSampleSize"
        ),
    )

    # Derived columns are appended after the selection, in this order.
    with_sample_count = selected.withColumn(
        "nSamples", f.col("nCases") + f.col("nControls")
    )
    annotated = with_sample_count.withColumn(
        "summarystatsLocation", sumstats_location
    )

    return cls(_df=annotated, _schema=cls.get_schema())

get_schema() classmethod

Provides the schema for the StudyIndexFinnGen dataset.

This method duplicates the parent class implementation, but by definition, the use of abstract methods requires that every child class implement them.

Returns:

Name Type Description
StructType StructType

Spark schema for the StudyIndexFinnGen dataset.

Source code in src/otg/dataset/study_index.py
446
447
448
449
450
451
452
453
454
455
@classmethod
def get_schema(cls: type[StudyIndexFinnGen]) -> StructType:
    """Return the Spark schema of the StudyIndexFinnGen dataset.

    The parent class already provides the same schema; it is repeated here because abstract methods have to be implemented by every child class.

    Returns:
        StructType: Spark schema for the StudyIndexFinnGen dataset.
    """
    studies_schema = parse_spark_schema("studies.json")
    return studies_schema

Schema

root
 |-- studyId: string (nullable = false)
 |-- projectId: string (nullable = false)
 |-- studyType: string (nullable = false)
 |-- traitFromSource: string (nullable = false)
 |-- traitFromSourceMappedIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- pubmedId: string (nullable = true)
 |-- publicationTitle: string (nullable = true)
 |-- publicationFirstAuthor: string (nullable = true)
 |-- publicationDate: string (nullable = true)
 |-- publicationJournal: string (nullable = true)
 |-- backgroundTraitFromSourceMappedIds: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- initialSampleSize: string (nullable = true)
 |-- nCases: long (nullable = true)
 |-- nControls: long (nullable = true)
 |-- nSamples: long (nullable = true)
 |-- discoverySamples: array (nullable = true)
 |    |-- element: struct (containsNull = false)
 |    |    |-- sampleSize: string (nullable = true)
 |    |    |-- ancestry: string (nullable = true)
 |-- replicationSamples: array (nullable = true)
 |    |-- element: struct (containsNull = false)
 |    |    |-- sampleSize: string (nullable = true)
 |    |    |-- ancestry: string (nullable = true)
 |-- summarystatsLocation: string (nullable = true)
 |-- hasSumstats: boolean (nullable = true)