Skip to content

GWAS Catalog

Bases: GWASCatalogStepConfig

GWAS Catalog step.

Source code in src/otg/gwas_catalog.py
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
@dataclass
class GWASCatalogStep(GWASCatalogStepConfig):
    """Pipeline step that ingests GWAS Catalog studies and associations.

    Input and output locations are taken from the attributes inherited from
    ``GWASCatalogStepConfig``; the step itself only adds the session used to
    reach Spark.
    """

    # NOTE(review): this default expression is evaluated once, when the class
    # body is executed, so every instance created without an explicit
    # ``session`` shares the same object - confirm this is intended.
    session: Session = Session()

    def run(self: GWASCatalogStep) -> None:
        """Build and write the GWAS Catalog study index and fine-mapped study-locus tables.

        The raw GWAS Catalog files are read, converted into a study index and
        a study-locus dataset, LD-annotated, clumped, fine-mapped with PICS
        and finally written out as parquet.
        """
        session = self.session

        # Hail runs on the session's Spark context; its log goes to /dev/null.
        hl.init(sc=session.spark.sparkContext, log="/dev/null")

        def read_tsv(path: str, header: bool):
            """Read a tab-separated file through the session's Spark reader."""
            return session.spark.read.csv(path, sep="\t", header=header)

        # ---- Inputs ----
        # Variant annotation dataset
        variant_annotation = VariantAnnotation.from_parquet(
            session, self.variant_annotation_path
        )
        # Raw study table
        raw_studies = read_tsv(self.catalog_studies_file, header=True)
        # Ancestry table
        ancestries = read_tsv(self.catalog_ancestry_file, header=True)
        # Summary statistics lookup table (read without a header row)
        sumstats = read_tsv(self.catalog_sumstats_lut, header=False)
        # Raw association table
        raw_associations = read_tsv(self.catalog_associations_file, header=True)
        # LD index dataset
        ld_index = LDIndex.from_parquet(session, self.ld_index_path)

        # ---- Transform ----
        # Build both datasets from the raw tables, then split them
        unsplit_study_index = StudyIndexGWASCatalog.from_source(
            raw_studies, ancestries, sumstats
        )
        unsplit_study_locus = StudyLocusGWASCatalog.from_source(
            raw_associations, variant_annotation
        )
        study_index, study_locus = GWASCatalogSplitter.split(
            unsplit_study_index, unsplit_study_locus
        )

        # Annotate LD information, then clump the associations
        ld_annotated = study_locus.annotate_ld(study_index, ld_index)
        clumped = ld_annotated.clump()

        # Fine-map the clumped study-locus with PICS and annotate credible sets
        finemapped = PICS.finemap(clumped).annotate_credible_sets()

        # ---- Write ----
        study_index.df.write.mode(session.write_mode).parquet(
            self.catalog_studies_out
        )
        finemapped.df.write.mode(session.write_mode).parquet(
            self.catalog_associations_out
        )

run()

Run GWAS Catalog ingestion step to extract GWASCatalog Study and StudyLocus tables.

Source code in src/otg/gwas_catalog.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
def run(self: GWASCatalogStep) -> None:
    """Produce the GWAS Catalog study index and fine-mapped study-locus outputs.

    Reads the raw GWAS Catalog tables plus the variant annotation and LD
    index datasets, derives the split study index and study-locus, applies
    LD annotation, clumping and PICS fine-mapping, and writes both results
    as parquet.
    """
    spark_session = self.session

    # Start Hail on the existing Spark context, with its log sent to /dev/null.
    hl.init(sc=spark_session.spark.sparkContext, log="/dev/null")

    def load_tsv(location: str, has_header: bool):
        """Load a tab-delimited file with the session's Spark CSV reader."""
        return spark_session.spark.read.csv(
            location, sep="\t", header=has_header
        )

    # Inputs, loaded in the same order as they are declared below.
    va = VariantAnnotation.from_parquet(
        spark_session, self.variant_annotation_path
    )
    studies_table = load_tsv(self.catalog_studies_file, True)
    ancestry_table = load_tsv(self.catalog_ancestry_file, True)
    # The summary statistics lookup table is read without a header row.
    sumstats_table = load_tsv(self.catalog_sumstats_lut, False)
    associations_table = load_tsv(self.catalog_associations_file, True)
    ld_index = LDIndex.from_parquet(spark_session, self.ld_index_path)

    # Study index and study-locus, built from the raw tables and then split.
    source_study_index = StudyIndexGWASCatalog.from_source(
        studies_table, ancestry_table, sumstats_table
    )
    source_study_locus = StudyLocusGWASCatalog.from_source(
        associations_table, va
    )
    study_index, study_locus = GWASCatalogSplitter.split(
        source_study_index, source_study_locus
    )

    # LD annotation followed by clumping of the associations.
    study_locus = study_locus.annotate_ld(study_index, ld_index)
    study_locus = study_locus.clump()

    # PICS fine-mapping of the clumped study-locus, with credible sets annotated.
    finemapped_study_locus = PICS.finemap(study_locus)
    finemapped_study_locus = finemapped_study_locus.annotate_credible_sets()

    # Outputs: both datasets are written as parquet using the session's write mode.
    study_index.df.write.mode(spark_session.write_mode).parquet(
        self.catalog_studies_out
    )
    finemapped_study_locus.df.write.mode(spark_session.write_mode).parquet(
        self.catalog_associations_out
    )

GWAS Catalog step requirements.

Attributes:

Name Type Description
catalog_studies_file str

Raw GWAS catalog studies file.

catalog_ancestry_file str

Ancestry annotations file from GWAS Catalog.

catalog_sumstats_lut str

GWAS Catalog summary statistics lookup table.

catalog_associations_file str

Raw GWAS catalog associations file.

variant_annotation_path str

Input variant annotation path.

ld_index_path str

Input LD index path.

min_r2 float

Minimum r2 to consider when considering variants within a window.

catalog_studies_out str

Output GWAS catalog studies path.

catalog_associations_out str

Output GWAS catalog associations path.

Source code in src/otg/config.py
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
@dataclass
class GWASCatalogStepConfig:
    """GWAS Catalog step requirements.

    Attributes:
        catalog_studies_file (str): Raw GWAS catalog studies file.
        catalog_ancestry_file (str): Ancestry annotations file from GWAS Catalog.
        catalog_sumstats_lut (str): GWAS Catalog summary statistics lookup table.
        catalog_associations_file (str): Raw GWAS catalog associations file.
        variant_annotation_path (str): Input variant annotation path.
        ld_index_path (str): Input LD index path.
        min_r2 (float): Minimum r2 to consider when considering variants within a window. Defaults to 0.5.
        catalog_studies_out (str): Output GWAS catalog studies path.
        catalog_associations_out (str): Output GWAS catalog associations path.
    """

    # Dotted import path of the step class this configuration belongs to
    # (presumably resolved by the configuration framework - confirm).
    _target_: str = "otg.gwas_catalog.GWASCatalogStep"
    catalog_studies_file: str = MISSING
    catalog_ancestry_file: str = MISSING
    catalog_sumstats_lut: str = MISSING
    catalog_associations_file: str = MISSING
    variant_annotation_path: str = MISSING
    ld_index_path: str = MISSING
    min_r2: float = 0.5
    catalog_studies_out: str = MISSING
    catalog_associations_out: str = MISSING