"""ComPPI network data classes (module ``obnb.data.network.comppi``)."""

import io

import pandas as pd
import requests

from obnb.alltypes import Any, Dict, List, Mapping, Optional, Union
from obnb.data.network.base import BaseURLSparseGraphData


class ComPPI(BaseURLSparseGraphData):
    r"""The Compartmentalized Protein-Protein Interaction Database.

    The ComPPI database comes with interactomes with different contexts,
    including compartmentalization and species. To request a download from
    the webserver, a `POST` request is sent with the following options.

    - ``fDlSet``: What type of data to download, available options are

        - ``int``: Integrated protein-protein interactions across compartments.
        - ``comp``: Compartmentalized interactions.
        - ``protnloc``: Subcellular localization information of proteins (this
          is not interaction data).

    - ``fDlSpec``: What species to use, available options are

        - ``0``: H. sapiens (human).
        - ``1``: D. melanogaster (fruit fly).
        - ``2``: C. elegans (worm).
        - ``3``: S. cerevisiae (yeast).
        - ``all``: use all the above, the default option.

    - ``fDlMLoc``: What subcellular localization to use (do not specify when
      ``fDlSet`` is set to ``int``), available options are

        - ``0``: Cytosol.
        - ``1``: Mitochondrion.
        - ``2``: Nucleus.
        - ``3``: Extracellular.
        - ``4``: Secretory pathway.
        - ``5``: Membrane.
        - ``all``: Use all the above, the default option.

    Example:
        Request the integrated human interactome file and load it into a
        pandas dataframe ``df`` via

        >>> r = requests.post("https://comppi.linkgroup.hu/downloads",
        ...                   data={"fDlSet": "int", "fDlSpec": "0"})
        >>> df = pd.read_csv(io.BytesIO(r.content), sep="\t",
        ...                  compression="gzip")

    **[Last updated: 2023-11-17]**

    """

    CONFIG_KEYS: List[str] = BaseURLSparseGraphData.CONFIG_KEYS + [
        "selected_columns",
    ]
    url: str = "https://comppi.linkgroup.hu/downloads"
    # Keyword arguments unpacked into ``requests.post`` by ``download``;
    # subclasses set this to select the dataset (see the options above).
    # TODO: parse args and setup at init
    url_kwargs: Dict[str, Any] = {}
    # Columns of the downloaded table that are kept in the cleaned file.
    selected_columns: List[str] = ["Protein A", "Protein B", "Interaction Score"]

    def __init__(
        self,
        root: str,
        weighted: bool = True,
        directed: bool = False,
        largest_comp: bool = True,
        gene_id_converter: Optional[Union[Mapping[str, str], str]] = "HumanEntrez",
        **kwargs,
    ):
        """Initialize the ComPPI object.

        All arguments are forwarded unchanged to the base class initializer;
        this override only supplies ComPPI-specific defaults.

        """
        super().__init__(
            root,
            weighted=weighted,
            directed=directed,
            largest_comp=largest_comp,
            gene_id_converter=gene_id_converter,
            **kwargs,
        )

    @property
    def raw_files(self) -> List[str]:
        """Raw file names: the cleaned table first, the full table second."""
        return ["data_clean.txt", "data.txt"]

    def download(self):
        """Download data from URL.

        Sends a ``POST`` request to :attr:`url` with :attr:`url_kwargs`, reads
        the gzip-compressed tab-separated response into a dataframe, and
        writes two files: the full table (with header) and a header-less
        table restricted to :attr:`selected_columns`.

        Raises:
            requests.HTTPError: If the server answers with an error status.

        """
        self.plogger.info(
            f"Downloading data via POST from {self.url} with params: {self.url_kwargs}",
        )
        r = requests.post(self.url, **self.url_kwargs)
        # Fail early with a clear HTTP error instead of handing an error page
        # to the gzip/TSV parser below.
        r.raise_for_status()

        self.plogger.info("Finished downloading, start unpacking...")
        df = pd.read_csv(io.BytesIO(r.content), sep="\t", compression="gzip")
        df_clean = df[self.selected_columns]

        # Index 0 / 1 follow the order of names returned by ``raw_files``.
        clean_path, raw_path = self.raw_file_path(0), self.raw_file_path(1)
        df.to_csv(raw_path, sep="\t", index=False)
        self.plogger.info(f"Raw file saved to {raw_path}")
        # ``header=False`` is the documented way to omit the header row.
        df_clean.to_csv(clean_path, sep="\t", index=False, header=False)
        self.plogger.info(f"Cleaned raw file saved to {clean_path}")


class ComPPIHumanInt(ComPPI):
    """The ComPPI human integrated interaction network."""

    # POST form data: integrated interactions ("int") for species "0" (human).
    url_kwargs: Dict[str, Any] = {"data": {"fDlSet": "int", "fDlSpec": "0"}}