# Source code for obnb.data.network.consensuspathdb

import itertools
import re

import numpy as np
import pandas as pd

from obnb.data.network.base import BaseURLSparseGraphData
from obnb.typing import List, Literal, Mapping, Optional, Union
from obnb.util.download import download_unzip


class ConsensusPathDB(BaseURLSparseGraphData):
    """The ConsensusPathDB interaction network.

    The `ConsensusPathDB <http://cpdb.molgen.mpg.de/>`_ integrates gene
    interaction evidences from many databases:

        - ``BIND``
        - ``BioCarta``
        - ``Biogrid``
        - ``CORUM``
        - ``DIP``
        - ``HPRD``
        - ``HumanCyc``
        - ``INOH``
        - ``InnateDB``
        - ``IntAct``
        - ``MINT``
        - ``MIPS-MPPI``
        - ``Manual upload``
        - ``MatrixDB``
        - ``NetPath``
        - ``PDB``
        - ``PDZBase``
        - ``PID``
        - ``PINdb``
        - ``PhosphoPOINT``
        - ``Reactome``
        - ``Spike``

    These sources cover a wide range of interaction types:

        - Protein interactions
        - Signaling reactions
        - Metabolic reactions
        - Gene regulations
        - Genetic interactions
        - Drug-target interactions
        - Biochemical pathways

    Check out the `ConsensusPathDB <http://cpdb.molgen.mpg.de/>`_ webpage for
    more information about the specific types of interactions provided by each
    source database.

    **[Last updated: 2023-02-13]**

    """

    url = "http://cpdb.molgen.mpg.de/download/ConsensusPathDB_human_PPI.gz"
    # An interaction is kept when its ``source_db`` field contains at least
    # one of these names as a substring (see ``download``).
    selected_sources: List[str] = [
        "BIND",
        "BioCarta",
        "Biogrid",
        "CORUM",
        "DIP",
        "HPRD",
        "HumanCyc",
        "INOH",
        "InnateDB",
        "IntAct",
        "MINT",
        "MIPS-MPPI",
        "Manual upload",
        "MatrixDB",
        "NetPath",
        "PDB",
        "PDZBase",
        "PID",
        "PINdb",
        "PhosphoPOINT",
        "Reactome",
        "Spike",
    ]

    def __init__(
        self,
        root: str,
        weighted: bool = True,
        directed: bool = False,
        largest_comp: bool = True,
        gene_id_converter: Optional[Union[Mapping[str, str], str]] = None,
        fill_value: Literal["mean", "max"] = "max",
        **kwargs,
    ):
        """Initialize the ConsensusPathDB object.

        Args:
            root: Forwarded unchanged to the base-class initializer.
            weighted: Forwarded unchanged to the base-class initializer.
            directed: Forwarded unchanged to the base-class initializer.
            largest_comp: Forwarded unchanged to the base-class initializer.
            gene_id_converter: Forwarded unchanged to the base-class
                initializer.
            fill_value: How interactions without a score get one in
                ``download``: ``"mean"`` or ``"max"`` of the scores that are
                present after filtering by source.
            **kwargs: Forwarded unchanged to the base-class initializer.

        """
        # Stored before the base initializer runs because ``download`` reads
        # it -- presumably the base class may trigger ``download`` during
        # its own initialization (TODO confirm against the base class).
        self.fill_value = fill_value
        super().__init__(
            root,
            weighted=weighted,
            directed=directed,
            largest_comp=largest_comp,
            gene_id_converter=gene_id_converter,
            **kwargs,
        )

    @property
    def raw_files(self) -> List[str]:
        """Raw file names: the cleaned edge list first, then the raw table."""
        return ["data_clean.txt", "data.txt"]

    def download(self):
        """Download the interaction table and save it as a clean edge list.

        The downloaded table (``raw_files[1]``) is filtered to the
        ``selected_sources``, missing scores are filled according to
        ``self.fill_value``, every interaction is expanded into all pairwise
        gene combinations, both directions of each pair are added, and for
        duplicated pairs only the largest score is kept. The result is
        written as a headerless, tab-separated file to ``raw_files[0]``.

        Raises:
            ValueError: If ``self.fill_value`` is neither ``"mean"`` nor
                ``"max"``.

        """
        # Validate the option first so a bad value fails before any
        # network transfer or parsing work is done.
        if self.fill_value not in ("mean", "max"):
            raise ValueError(
                f"Unknown fill value option {self.fill_value}, "
                "supported options are 'mean' and 'max'",
            )

        download_unzip(
            self.url,
            self.raw_dir,
            zip_type=self.download_zip_type,
            rename=self.raw_files[1],
            logger=self.plogger,
        )

        # Load interaction table
        # NOTE(review): with ``comment="#"`` pandas skips commented lines, so
        # ``header=0`` consumes the first *uncommented* line. If the header
        # line of the raw file itself starts with "#", the first data row is
        # discarded here -- confirm against the actual file layout.
        df = pd.read_csv(
            self.raw_file_path(1),
            sep="\t",
            comment="#",
            header=0,
            names=[
                "source_db",
                "publications",
                "uniprot_entry",
                "uniprot_id",
                "gene_name",
                "hgnc_id",
                "entrez",
                "ensg",
                "score",
            ],
        )

        # Filter by sources. Names are escaped so they are matched literally
        # and rows without any source annotation are dropped (``na=False``)
        # instead of producing an invalid boolean mask.
        source_pattern = "|".join(map(re.escape, self.selected_sources))
        df = df[df["source_db"].str.contains(source_pattern, na=False)]

        # Fill in missing interaction weights. ``assign`` builds a new frame,
        # which avoids the chained in-place ``fillna`` on a filtered frame
        # that is unreliable under pandas Copy-on-Write.
        nan_reducer = np.nanmean if self.fill_value == "mean" else np.nanmax
        fill_value = nan_reducer(df["score"].values)
        df = df.assign(score=df["score"].fillna(fill_value))

        # Construct interactions to undirected edges
        df = df[df["entrez"].notna()]
        gene_splitter = re.compile(r",|\.")
        edges = []
        for genes, score in df[["entrez", "score"]].values:
            # Empty strings show up when separators are adjacent or trailing
            genes = [gene for gene in gene_splitter.split(genes) if gene]
            if len(genes) < 2:  # fewer than two genes cannot form an edge
                continue

            # Prepare edge list: [(gene1, gene2, score), ...]
            edges.extend(
                pair + (score,) for pair in itertools.combinations(genes, 2)
            )
        # Explicit column labels keep the steps below valid even when no
        # edge survived the filtering above.
        edge_df = pd.DataFrame(edges, columns=[0, 1, 2])

        # Make undirected by filling in the connections from reversed direction
        edge_df = pd.concat((edge_df, edge_df.rename(columns={0: 1, 1: 0})))
        self.plogger.info(f"Converted interactions to edge list:\n{edge_df}")

        # Drop duplicated edges and keep the largest weight: after sorting by
        # score in descending order, the first occurrence of a pair is the
        # one with the largest score.
        edge_df = (
            edge_df.sort_values(2, ascending=False)
            .drop_duplicates([0, 1])
            .sort_values([0, 1])
            .reset_index(drop=True)
        )
        self.plogger.info(f"Dropped duplicates:\n{edge_df}")

        out_path = self.raw_file_path(0)
        edge_df.to_csv(out_path, sep="\t", index=False, header=None)
        self.plogger.info(f"Cleaned raw file saved to {out_path}")