Source code for obnb.data.network.omnipath

import io

import pandas as pd
import requests

from obnb.data.network.base import BaseURLSparseGraphData
from obnb.typing import List, Mapping, Optional, Union
from obnb.util.download import stream_download


class OmniPath(BaseURLSparseGraphData):
    """The OmniPath intra- and inter-cellular signaling knowledge base.

    https://omnipathdb.org/

    - ``dorothea`` Interactions obtained from the DoRothEA database, which
      contains a comprehensive resource of TF-promoter interactions curated
      from over 18 sources. Only the interactions with confidence from A-D
      are included in the OmniPath database.
    - ``kinaseextra`` Additional kinase-substrate interactions from prior
      knowledge.
    - ``ligrecextra`` Ligand-receptor interactions from prior knowledge.
    - ``lncrna_mrna`` Interactions between long non-coding RNAs and mRNAs,
      curated from three literatures.
    - ``mirnatarget`` Micro RNA target interactions.
    - ``omnipath`` Interaction information from literature curation, high
      throughput experiments, and prior knowledge.
    - ``pathwayextra`` Pathway information from prior knowledge.
    - ``small_molecule`` Small molecule-protein interactions.
    - ``tf_mirna`` Transcription factor micro RNA interaction curated from
      two literature sources.
    - ``tf_target`` Transcription factor target curated from six literatures.
    - ``tfregulons`` Transcription factor regulon interactions.

    Note:
        ``Prior knowledge`` means annotations done by the authors without any
        literature references.

    """

    # Endpoint that ``download`` queries for interaction records.
    url: str = "https://omnipathdb.org/interactions"
    # Dataset names, sent comma-joined as the ``datasets`` query parameter.
    omnipath_datasets: List[str] = [
        "dorothea",
        "kinaseextra",
        "ligrecextra",
        "lncrna_mrna",
        "mirnatarget",
        "omnipath",
        "pathwayextra",
        "small_molecule",
        "tf_mirna",
        "tf_target",
        "tfregulons",
    ]
    # Extra field names, sent comma-joined as the ``fields`` query parameter.
    omnipath_fields: List[str] = [
        "curation_effort",
        "references",
        "sources",
        "type",
    ]
    # Columns of the downloaded table that are kept in the cleaned raw file.
    selected_columns: List[str] = ["source", "target"]

    def __init__(
        self,
        root,
        weighted: bool = False,
        directed: bool = False,  # FIX: should be True, but need to fix LCC first
        largest_comp: bool = True,
        gene_id_converter: Optional[Union[Mapping[str, str], str]] = "HumanEntrez",
        **kwargs,
    ):
        """Initialize the OmniPath object.

        Every argument is forwarded unchanged to the parent initializer; this
        class only supplies OmniPath-specific defaults.

        Args:
            root: Forwarded to the parent class (presumably the data root
                directory; confirm against ``BaseURLSparseGraphData``).
            weighted: Forwarded to the parent class; defaults to ``False``.
            directed: Forwarded to the parent class; defaults to ``False``
                (see the FIX note on the parameter).
            largest_comp: Forwarded to the parent class; defaults to ``True``.
            gene_id_converter: Either a mapping or a converter name, forwarded
                to the parent class; defaults to ``"HumanEntrez"``.
            **kwargs: Any further keyword arguments for the parent class.

        """
        super().__init__(
            root,
            weighted=weighted,
            directed=directed,
            largest_comp=largest_comp,
            gene_id_converter=gene_id_converter,
            **kwargs,
        )

    @property
    def raw_files(self) -> List[str]:
        """Return the raw file names.

        ``download`` writes the cleaned, header-less edge list to the first
        entry and the full downloaded table to the second entry.

        """
        return ["data_clean.txt", "data.txt"]

    def download(self):
        """Download data from URL.

        Requests the datasets in ``omnipath_datasets`` together with the
        fields in ``omnipath_fields`` as TSV, then writes two tab-separated
        files: the full table (with header) to ``raw_file_path(1)`` and the
        ``selected_columns`` subset (without header) to ``raw_file_path(0)``.

        """
        # Prepare URL and parameters
        datasets_str = ",".join(self.omnipath_datasets)
        fields_str = ",".join(self.omnipath_fields)
        params = {"datasets": datasets_str, "fields": fields_str, "format": "tsv"}
        self.plogger.info(f"Base url: {self.url}")
        self.plogger.info(f"URL parameters: {params}")

        # Construct the full URL (base URL plus encoded query string). The
        # session is only needed to prepare the request, so close it right
        # away instead of leaking it.
        req = requests.Request("GET", self.url, params=params)
        with requests.Session() as session:
            prepped_url = session.prepare_request(req).url

        # Download data from URL
        self.plogger.info(f"Start download data from {prepped_url}")
        _, content = stream_download(prepped_url)

        # Parse the downloaded bytes as a tab-separated table
        self.plogger.info("Finished downloading, start unpacking...")
        df = pd.read_csv(io.BytesIO(content), sep="\t")
        df_clean = df[self.selected_columns]

        # Save raw data
        clean_path, raw_path = self.raw_file_path(0), self.raw_file_path(1)
        df.to_csv(raw_path, sep="\t", index=False)
        self.plogger.info(f"Raw file saved to {raw_path}")
        # header=False is the documented way to omit the header row
        df_clean.to_csv(clean_path, sep="\t", index=False, header=False)
        self.plogger.info(f"Cleaned raw file saved to {clean_path}")