Source code for obnb.data.annotation.gene_ontology

import pprint

import pandas as pd

from obnb.alltypes import List, Optional
from obnb.data.annotation.base import BaseAnnotationData


class GeneOntologyAnnotation(BaseAnnotationData):
    """Gene Ontology annotations.

    Gene ontology annotations are retrieved from geneontology.org. There are
    several categories of gene annotation evidences from the Gene Ontology.
    By default, we only use *Experimental evidences*, *Author Statements*, and
    the *Curator inferred* evidence types to ensure the quality of the
    annotations. See the gene ontology `evidence codes
    <http://geneontology.org/docs/guide-go-evidence-codes/>`_ documentation
    page for more information.

    - Experimental evidences (``EXPERIMENTAL``):
        - ``EXP`` Experiment
        - ``IDA`` Direct assay
        - ``IPI`` Physical interaction
        - ``IMP`` Mutant phenotype
        - ``IGI`` Genetic interaction
        - ``IEP`` Expression pattern
    - Phylogenetically-inferred (``PHYLOGENIC``):
        - ``IBA`` Biological aspect of ancestor
        - ``IBD`` Biological aspect of descendant
        - ``IKR`` Key residues
        - ``IRD`` Rapid divergence
    - Computational analysis (``COMPUTATIONAL``):
        - ``ISS`` Sequence or structural similarity
        - ``ISO`` Sequence orthology
        - ``ISA`` Sequence alignment
        - ``ISM`` Sequence model
        - ``IGC`` Genomic context
        - ``RCA`` Reviewed computational analysis
    - Author statements (``AUTHOR``):
        - ``TAS`` Traceable author statement
        - ``NAS`` Non-traceable author statement
    - Curator statements (``CURATOR``):
        - ``IC`` Inferred by curator
        - ``ND`` No biological data available
    - Electronic annotation evidences (``ELECTRONIC``):
        - ``IEA`` Electronic annotation

    **[Last updated: 2023-03-10]**

    Args:
        root: Root directory of the data.
        data_sources: List of evidence codes to be considered. If not set,
            then use the default channels (experimental evidences, author
            statements, and the ``IC`` curator-inferred code; ``ND`` is not
            part of the default).

    """

    annotation_file_name = "goa_human.gaf"
    annotation_url = "http://geneontology.org/gene-associations/goa_human.gaf.gz"

    def __init__(
        self,
        root: str,
        *,
        data_sources: Optional[List[str]] = None,
        **kwargs,
    ):
        """Initialize GeneOntology annotation data object."""
        # Must be set before the parent constructor runs: it is stored first
        # so that anything the base class triggers can read ``data_sources``.
        self._data_sources = data_sources
        super().__init__(root, **kwargs)

    @property
    def data_sources(self) -> List[str]:
        """Evidence codes used to filter the annotation table.

        Returns the user-supplied list when one was given at construction
        time; otherwise a freshly built list of the default evidence codes.
        """
        if self._data_sources is not None:
            return self._data_sources

        return [
            "EXP",  # Experiment
            "IDA",  # Direct Assay
            "IPI",  # Physical Interaction
            "IMP",  # Mutant Phenotype
            "IGI",  # Genetic Interaction
            "IEP",  # Expression Pattern
            "TAS",  # Traceable Author Statement
            "NAS",  # Non-traceable Author Statement
            "IC",  # Inferred by Curator
        ]

    def load_processed_data(self):
        """Load the raw GAF file and reduce it to (gene_id, term_id) pairs.

        Reads the annotation table from ``self.raw_file_path(0)``, keeps the
        rows whose evidence code (``ec`` column) is listed in
        :attr:`data_sources`, converts ``db_id`` to ``gene_id`` using the
        converter returned by ``self.get_gene_id_converter()``, and stores the
        resulting ``gene_id`` / ``term_id`` columns in ``self.data``.
        """
        path = self.raw_file_path(0)
        self.plogger.info(f"Loading raw annotation from {path}")

        # Load gene annotation data (gaf-version: 2.2)
        # http://geneontology.org/docs/go-annotation-file-gaf-format-2.2/
        #
        # GAF files carry no column-header row, only "!"-prefixed comment
        # lines, so header=None is required. With header=0 pandas would treat
        # the first annotation record as a header and discard it when
        # replacing it with ``names``.
        annot_df = pd.read_csv(
            path,
            sep="\t",
            comment="!",
            header=None,
            names=[
                "db",
                "db_id",
                "db_symbol",
                "qual",
                "go_id",
                "db_ref",
                "ec",
                "wof",
                "aspect",
                "eb_name",
                "db_syn",
                "db_type",
                "taxon",
                "date",
                "assigned_by",
                "annot_ext",
                "gene_prod_id",
            ],
            low_memory=False,
        )

        # Select specified channels
        evidence_str = pprint.pformat(self.data_sources)
        self.plogger.info(f"Subsetting annotations to evidences:\n{evidence_str}")
        ind = annot_df["ec"].isin(self.data_sources)
        self.plogger.info(f"{ind.sum():,} (out of {ind.shape[0]:,}) entries selected")
        # Explicit copy: columns are added to this frame below, and assigning
        # into a filtered selection triggers pandas' chained-assignment
        # warning.
        annot_df = annot_df[ind].copy()

        # Convert gene ids
        # NOTE(review): map_df's return value is unused, so it is presumably
        # adding the "gene_id" column to annot_df in place -- confirm against
        # the converter implementation.
        gene_id_converter = self.get_gene_id_converter()
        gene_id_converter.map_df(annot_df, "db_id", "gene_id")
        annot_df["term_id"] = annot_df["go_id"]

        # Save attributes
        self.data = annot_df[["gene_id", "term_id"]].copy()