Skip to content

gwas_catalog_study_curation

gentropy.gwas_catalog_study_curation.GWASCatalogStudyCurationStep

Annotate GWAS Catalog studies with additional curation and create a curation backlog.

Source code in src/gentropy/gwas_catalog_study_curation.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
class GWASCatalogStudyCurationStep:
    """Step that applies existing curation to GWAS Catalog studies and writes the curation backlog."""

    def __init__(
        self,
        session: Session,
        catalog_study_files: list[str],
        catalog_ancestry_files: list[str],
        catalog_sumstats_lut: str,
        gwas_catalog_study_curation_out: str,
        gwas_catalog_study_curation_file: str | None,
    ) -> None:
        """Read the GWAS Catalog inputs, apply existing curation and export the studies to curate.

        All the work happens in the constructor: the inputs are loaded through
        Spark, parsed into a study index, annotated with the existing curation
        and the resulting table is written out as a tab-separated file.

        Args:
            session (Session): Session object providing the Spark session.
            catalog_study_files (list[str]): Paths to the raw GWAS Catalog study files (TSV with header).
            catalog_ancestry_files (list[str]): Paths to the raw GWAS Catalog ancestry annotation files (TSV with header).
            catalog_sumstats_lut (str): Path to the GWAS Catalog summary statistics lookup table (TSV without header).
            gwas_catalog_study_curation_out (str): Destination path of the updated curation table.
            gwas_catalog_study_curation_file (str | None): Path to the existing curation table. Optional, may be None.
        """
        spark = session.spark
        field_separator = "\t"

        # The study and ancestry inputs may arrive as list-like config objects,
        # so they are copied into plain lists before being handed to Spark.
        studies_df = spark.read.csv(
            list(catalog_study_files), sep=field_separator, header=True
        )
        ancestries_df = spark.read.csv(
            list(catalog_ancestry_files), sep=field_separator, header=True
        )
        # Unlike the two inputs above, the lookup table is read without a header row.
        sumstats_df = spark.read.csv(
            catalog_sumstats_lut, sep=field_separator, header=False
        )

        # Existing curation (the path may be None, handling is delegated to the reader).
        existing_curation = read_curation_table(
            gwas_catalog_study_curation_file, session
        )

        # Parse the raw catalog tables into a study index.
        study_index = StudyIndexGWASCatalogParser.from_source(
            studies_df, ancestries_df, sumstats_df
        )
        # Carry over the curation that already exists.
        curated_index = study_index.annotate_from_study_curation(existing_curation)
        # Collect the studies that make up the curation table to export.
        curation_backlog = curated_index.extract_studies_for_curation(
            existing_curation
        )

        # The table is collected to pandas and written as a single TSV file.
        curation_backlog.toPandas().to_csv(
            gwas_catalog_study_curation_out, sep=field_separator, index=False
        )

__init__(session: Session, catalog_study_files: list[str], catalog_ancestry_files: list[str], catalog_sumstats_lut: str, gwas_catalog_study_curation_out: str, gwas_catalog_study_curation_file: str | None) -> None

Run step to annotate and create backlog.

Parameters:

Name Type Description Default
session Session

Session object.

required
catalog_study_files list[str]

List of raw GWAS Catalog study files.

required
catalog_ancestry_files list[str]

List of raw ancestry annotations files from GWAS Catalog.

required
catalog_sumstats_lut str

GWAS Catalog summary statistics lookup table.

required
gwas_catalog_study_curation_out str

Path for the updated curation table.

required
gwas_catalog_study_curation_file str | None

Path to the original curation table. Optional.

required
Source code in src/gentropy/gwas_catalog_study_curation.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
def __init__(
    self,
    session: Session,
    catalog_study_files: list[str],
    catalog_ancestry_files: list[str],
    catalog_sumstats_lut: str,
    gwas_catalog_study_curation_out: str,
    gwas_catalog_study_curation_file: str | None,
) -> None:
    """Annotate GWAS Catalog studies with existing curation and write the curation backlog.

    Args:
        session (Session): Session object providing the Spark session.
        catalog_study_files (list[str]): Paths to the raw GWAS Catalog study files (TSV with header).
        catalog_ancestry_files (list[str]): Paths to the raw GWAS Catalog ancestry annotation files (TSV with header).
        catalog_sumstats_lut (str): Path to the GWAS Catalog summary statistics lookup table (TSV without header).
        gwas_catalog_study_curation_out (str): Destination path of the updated curation table.
        gwas_catalog_study_curation_file (str | None): Path to the existing curation table. Optional, may be None.
    """
    # Load the three raw inputs; path collections are copied into plain lists first.
    study_paths = list(catalog_study_files)
    raw_studies = session.spark.read.csv(study_paths, sep="\t", header=True)

    ancestry_paths = list(catalog_ancestry_files)
    raw_ancestries = session.spark.read.csv(ancestry_paths, sep="\t", header=True)

    # The summary statistics lookup table is read without a header row.
    raw_sumstats = session.spark.read.csv(
        catalog_sumstats_lut, sep="\t", header=False
    )

    # Previously curated studies (the path may be None).
    previous_curation = read_curation_table(gwas_catalog_study_curation_file, session)

    # Build the study index from the raw tables.
    parsed_studies = StudyIndexGWASCatalogParser.from_source(
        raw_studies, raw_ancestries, raw_sumstats
    )
    # Add the existing curation, then pull out the studies for the curation table.
    annotated_studies = parsed_studies.annotate_from_study_curation(previous_curation)
    to_curate = annotated_studies.extract_studies_for_curation(previous_curation)

    # Save the table as a tab-separated file without the pandas index column.
    backlog_table = to_curate.toPandas()
    backlog_table.to_csv(gwas_catalog_study_curation_out, sep="\t", index=False)