# Source code for obnb.data.annotation.disgenet

import pprint

import pandas as pd

from obnb.data.annotation.base import BaseAnnotationData
from obnb.typing import List, Optional


class DisGeNETAnnotation(BaseAnnotationData):
    """DisGeNET disease gene annotations.

    Disease gene associations are retrieved from disgenet.org. There are four
    different categories of annotation sources from DisGeNET (see below). By
    default, we only use the *Curated* and the *Inferred* data sources. Users
    can change the sources by passing the list of sources to the
    ``data_sources`` argument. (Note: ~70% of the disease-gene annotations in
    DisGeNET are only available in the *literature* data source).

    See the DisGeNET
    `data sources <https://www.disgenet.org/dbinfo>`_ documentation page for
    more information.

    - Curated (``CURATED``):
        - ``CGI`` Cancer Genome Interpreter
        - ``CLINGEN`` Clinical Genome Resource
        - ``CTD_human`` Comparative Toxicogenomics Database (Human)
        - ``GENOMICS_ENGLAND`` Genomics England PanelApp
        - ``ORPHANET`` Orphan drugs and rare diseases
        - ``PSYGENET`` Psychiatric disorders gene association network
        - ``UNIPROT`` UniProt/SwissProt database
    - Inferred (``INFERRED``):
        - ``CLINVAR`` ClinVar disease-gene information with supported
          evidences
        - ``HPO`` Human Phenotype Ontology
        - ``GWASCAT`` GWAS Catalog curated SNPs (p-val < 1e-6)
        - ``GWASDB`` GWASdb (p-val < 1e-6)
    - Animal models (``ANIMAL``):
        - ``CTD_mouse`` Comparative Toxicogenomics Database (Mouse)
        - ``CTD_rat`` Comparative Toxicogenomics Database (Rat)
        - ``MGD`` Mouse Genome Database
        - ``RGD`` Rat Genome Database
    - Literature (``LITERATURE``):
        - ``BEFREE`` Disease-gene association extracted from MEDLINE using
          BeFree
        - ``LHGDN`` Literature derived human disease network

    **[Last updated: 2023-01-14]**

    Args:
        root: Root directory of the data.
        data_sources: List of evidence types to be considered. If not set,
            then use the default channels (curated and inferred evidences).
            Each entry is matched verbatim against the ``;``-separated items
            of the ``source`` column of the raw annotation table.
        dsi_min: Minimum value of ``DSI`` below which the annotations are
            removed.
        dsi_max: Maximum value of ``DSI`` above which the annotations are
            removed.
        dpi_min: Minimum value of ``DPI`` below which the annotations are
            removed.
        dpi_max: Maximum value of ``DPI`` above which the annotations are
            removed.

    Notes:
        ``DSI`` and ``DPI`` stand for *Disease Specificity Index* and
        *Disease Pleiotropy Index*. The two metrics measure how specific a
        gene is associated to a particular disease (vs. being associated to
        many diseases) and how pleiotropic a gene is (i.e., does the gene
        contribute to a wide variety of disease types, according to MeSH
        disease classes). The exact definitions of ``DSI`` and ``DPI`` can be
        found in the DisGeNET
        `documentation <https://www.disgenet.org/dbinfo>`_ webpage.

    """

    # Name of the raw annotation table and the URL it is downloaded from.
    annotation_file_name = "all_gene_disease_associations.tsv"
    annotation_url = "https://www.disgenet.org/static/disgenet_ap1/files/downloads/all_gene_disease_associations.tsv.gz"

    def __init__(
        self,
        root: str,
        *,
        data_sources: Optional[List[str]] = None,
        dsi_min: Optional[float] = None,
        dsi_max: Optional[float] = None,
        dpi_min: Optional[float] = None,
        dpi_max: Optional[float] = None,
        **kwargs,
    ):
        """Initialize DisGeNET annotation data object.

        Args:
            root: Root directory of the data.
            data_sources: Evidence sources to keep; ``None`` selects the
                default curated and inferred sources.
            dsi_min: Lower ``DSI`` bound (inclusive); ``None`` disables it.
            dsi_max: Upper ``DSI`` bound (inclusive); ``None`` disables it.
            dpi_min: Lower ``DPI`` bound (inclusive); ``None`` disables it.
            dpi_max: Upper ``DPI`` bound (inclusive); ``None`` disables it.
            **kwargs: Forwarded unchanged to the base class initializer.

        """
        # The filter settings are stored before delegating to the base class.
        # NOTE(review): presumably the base initializer triggers
        # ``load_processed_data``, which reads these attributes -- confirm
        # before reordering.
        self._data_sources = data_sources
        self.dsi_min = dsi_min
        self.dsi_max = dsi_max
        self.dpi_min = dpi_min
        self.dpi_max = dpi_max
        super().__init__(root, **kwargs)

    @property
    def data_sources(self) -> List[str]:
        """Evidence sources used to subset the annotations.

        Returns:
            The list passed at construction time, or the default curated and
            inferred sources when none was given.

        """
        if self._data_sources is not None:
            return self._data_sources

        return [
            # Curated
            "CGI",
            "CLINGEN",
            "CTD_human",
            "GENOMICS_ENGLAND",
            "ORPHANET",
            "PSYGENET",
            "UNIPROT",
            # Inferred
            "CLINVAR",
            "GWASCAT",
            "GWASDB",
            "HPO",
        ]

    def load_processed_data(self):
        """Load the raw DisGeNET table and reduce it to gene-term pairs.

        The raw tab-separated table is filtered by evidence source and by the
        optional ``DSI`` / ``DPI`` bounds, then reduced to two string columns:
        ``gene_id`` (from ``geneId``) and ``term_id`` (``diseaseId`` prefixed
        with ``UMLS:``). The result is stored in ``self.data``.

        """
        path = self.raw_file_path(0)
        self.plogger.info(f"Loading raw annotation from {path}")
        annot_df = pd.read_csv(path, sep="\t")

        # Select specified channels: a row is kept when at least one of its
        # ';'-separated sources is among the requested ones.
        evidence_str = pprint.pformat(self.data_sources)
        self.plogger.info(f"Subsetting annotations to evidences:\n{evidence_str}")
        has_selected_source = (
            annot_df["source"]
            .str.split(";", expand=True)
            .isin(self.data_sources)
            .any(axis=1)
        )
        annot_df = annot_df[has_selected_source]

        # Filter by DSI and DPI scores. Bounds are inclusive; rows whose score
        # is missing fail the comparison and are dropped once a bound is set.
        score_bounds = (
            ("DSI", self.dsi_max, self.dsi_min),
            ("DPI", self.dpi_max, self.dpi_min),
        )
        for score, upper, lower in score_bounds:
            if upper is not None:
                self.plogger.info(f"Removing annotations above {score}: {upper}")
                annot_df = annot_df[annot_df[score] <= upper]
            if lower is not None:
                self.plogger.info(f"Removing annotations below {score}: {lower}")
                annot_df = annot_df[annot_df[score] >= lower]

        # Select relevant columns and rename to standardized column names
        annot_df = annot_df[["geneId", "diseaseId"]].reset_index(drop=True)
        annot_df.columns = ["gene_id", "term_id"]

        # Specify id prefixes
        annot_df["gene_id"] = annot_df["gene_id"].astype(str)
        annot_df["term_id"] = "UMLS:" + annot_df["term_id"].astype(str).values

        # Save attributes
        self.data = annot_df.copy()