Skip to content

gwas_catalog_study_curation

gentropy.gwas_catalog_study_curation.GWASCatalogStudyCurationStep

Annotate GWAS Catalog studies with additional curation and create a curation backlog.

Source code in src/gentropy/gwas_catalog_study_curation.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
class GWASCatalogStudyCurationStep:
    """Annotate GWAS Catalog studies with additional curation and create a curation backlog."""

    def __init__(
        self,
        session: Session,
        catalog_study_files: list[str],
        catalog_ancestry_files: list[str],
        gwas_catalog_study_curation_out: str,
        gwas_catalog_study_curation_file: str | None,
    ) -> None:
        """Run step to annotate and create backlog.

        Reads the raw study and ancestry tables as tab-separated files with a
        header row, optionally loads an existing curation table, and writes the
        resulting curation table as a tab-separated file.

        Args:
            session (Session): Session object.
            catalog_study_files (list[str]): List of raw GWAS catalog studies file.
            catalog_ancestry_files (list[str]): List of raw ancestry annotations files from GWAS Catalog.
            gwas_catalog_study_curation_out (str): Path for the updated curation table.
            gwas_catalog_study_curation_file (str | None): Path to the original curation table. Optional.

        Raises:
            ValueError: If the curation file is provided but not a CSV file or URL.
        """
        # Copy the inputs into plain lists before handing them to Spark.
        # NOTE(review): presumably the callers may pass list-like config
        # containers rather than real lists - confirm against the caller.
        catalog_studies = session.spark.read.csv(
            list(catalog_study_files), sep="\t", header=True
        )
        ancestry_lut = session.spark.read.csv(
            list(catalog_ancestry_files), sep="\t", header=True
        )

        # The curation table is optional: bind the name up front so that the
        # processing chain below does not hit an UnboundLocalError when no
        # curation file is given.
        # NOTE(review): assumes annotate_from_study_curation and
        # extract_studies_for_curation accept None - confirm.
        gwas_catalog_study_curation = None

        if gwas_catalog_study_curation_file:
            # The ".csv" suffix is tested first, so a URL ending in ".csv" is
            # loaded through from_csv rather than from_url.
            if gwas_catalog_study_curation_file.endswith(".csv"):
                gwas_catalog_study_curation = StudyIndexGWASCatalogOTCuration.from_csv(
                    session, gwas_catalog_study_curation_file
                )
            elif gwas_catalog_study_curation_file.startswith("http"):
                gwas_catalog_study_curation = StudyIndexGWASCatalogOTCuration.from_url(
                    session, gwas_catalog_study_curation_file
                )
            else:
                raise ValueError(
                    "Only CSV files or URLs are accepted as curation file."
                )

        # Process GWAS Catalog studies and get list of studies for curation:
        (
            StudyIndexGWASCatalogParser.from_source(catalog_studies, ancestry_lut)
            # Adding existing curation:
            .annotate_from_study_curation(gwas_catalog_study_curation)
            # Extract new studies for curation:
            .extract_studies_for_curation(gwas_catalog_study_curation)
            # Save table (collected to pandas, written tab-separated without index):
            .toPandas()
            .to_csv(gwas_catalog_study_curation_out, sep="\t", index=False)
        )

__init__(session: Session, catalog_study_files: list[str], catalog_ancestry_files: list[str], gwas_catalog_study_curation_out: str, gwas_catalog_study_curation_file: str | None) -> None

Run step to annotate and create backlog.

Parameters:

Name Type Description Default
session Session

Session object.

required
catalog_study_files list[str]

List of raw GWAS catalog studies file.

required
catalog_ancestry_files list[str]

List of raw ancestry annotations files from GWAS Catalog.

required
gwas_catalog_study_curation_out str

Path for the updated curation table.

required
gwas_catalog_study_curation_file str | None

Path to the original curation table. Optional.

required

Raises:

Type Description
ValueError

If the curation file is provided but not a CSV file or URL.

Source code in src/gentropy/gwas_catalog_study_curation.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
def __init__(
    self,
    session: Session,
    catalog_study_files: list[str],
    catalog_ancestry_files: list[str],
    gwas_catalog_study_curation_out: str,
    gwas_catalog_study_curation_file: str | None,
) -> None:
    """Run step to annotate and create backlog.

    Reads the raw study and ancestry tables as tab-separated files with a
    header row, optionally loads an existing curation table, and writes the
    resulting curation table as a tab-separated file.

    Args:
        session (Session): Session object.
        catalog_study_files (list[str]): List of raw GWAS catalog studies file.
        catalog_ancestry_files (list[str]): List of raw ancestry annotations files from GWAS Catalog.
        gwas_catalog_study_curation_out (str): Path for the updated curation table.
        gwas_catalog_study_curation_file (str | None): Path to the original curation table. Optional.

    Raises:
        ValueError: If the curation file is provided but not a CSV file or URL.
    """
    # Copy the inputs into plain lists before handing them to Spark.
    # NOTE(review): presumably the callers may pass list-like config
    # containers rather than real lists - confirm against the caller.
    catalog_studies = session.spark.read.csv(
        list(catalog_study_files), sep="\t", header=True
    )
    ancestry_lut = session.spark.read.csv(
        list(catalog_ancestry_files), sep="\t", header=True
    )

    # The curation table is optional: bind the name up front so that the
    # processing chain below does not hit an UnboundLocalError when no
    # curation file is given.
    # NOTE(review): assumes annotate_from_study_curation and
    # extract_studies_for_curation accept None - confirm.
    gwas_catalog_study_curation = None

    if gwas_catalog_study_curation_file:
        # The ".csv" suffix is tested first, so a URL ending in ".csv" is
        # loaded through from_csv rather than from_url.
        if gwas_catalog_study_curation_file.endswith(".csv"):
            gwas_catalog_study_curation = StudyIndexGWASCatalogOTCuration.from_csv(
                session, gwas_catalog_study_curation_file
            )
        elif gwas_catalog_study_curation_file.startswith("http"):
            gwas_catalog_study_curation = StudyIndexGWASCatalogOTCuration.from_url(
                session, gwas_catalog_study_curation_file
            )
        else:
            raise ValueError(
                "Only CSV files or URLs are accepted as curation file."
            )

    # Process GWAS Catalog studies and get list of studies for curation:
    (
        StudyIndexGWASCatalogParser.from_source(catalog_studies, ancestry_lut)
        # Adding existing curation:
        .annotate_from_study_curation(gwas_catalog_study_curation)
        # Extract new studies for curation:
        .extract_studies_for_curation(gwas_catalog_study_curation)
        # Save table (collected to pandas, written tab-separated without index):
        .toPandas()
        .to_csv(gwas_catalog_study_curation_out, sep="\t", index=False)
    )