# Source code for pyBiodatafuse.annotators.mitocarta

# coding: utf-8

"""
Python file for MitoCarta ETL process.

The MitoCarta datasets are available from the MitoCarta 3.0 download page:

[MitoCarta Download Page](https://personal.broadinstitute.org/scalvo/MitoCarta3.0/)

The datasets you need can be downloaded from the following links:

**For Humans (Homo sapiens):**
- [Human.MitoCarta3.0.xls](https://personal.broadinstitute.org/scalvo/MitoCarta3.0/Human.MitoCarta3.0.xls)

**For Mice (Mus musculus):**
- [Mouse.MitoCarta3.0.xls](https://personal.broadinstitute.org/scalvo/MitoCarta3.0/Mouse.MitoCarta3.0.xls)

These files contain the MitoCarta data in a simple format for each species.
"""

import os
from datetime import datetime
from typing import Tuple

import pandas as pd
import requests

import pyBiodatafuse.constants as Cons
from pyBiodatafuse.utils import collapse_data_sources, get_identifier_of_interest


def download_mitocarta_dataset(
    mitocarta_file: str, filename: str, sheet_name: str = "A Human MitoCarta3.0"
) -> Tuple[pd.DataFrame, dict]:
    """Download, save, and read a MitoCarta dataset.

    The remote file is only fetched when ``filename`` does not exist yet;
    otherwise the existing local copy is read as is.

    :param mitocarta_file: The MitoCarta dataset to download, e.g. "Human.MitoCarta3.0.xls".
    :param filename: The local file path to save the downloaded dataset.
    :param sheet_name: The name of the sheet in the Excel file to read. Default is "A Human MitoCarta3.0".
    :returns: A MitoCarta DataFrame and dictionary of the MitoCarta metadata.
    :raises ValueError: If the server answers the download request with an HTTP error status.
    """
    url = f"{Cons.MITOCARTA_DOWNLOAD_URL}/{mitocarta_file}"

    # Download the MitoCarta dataset unless a local copy is already present
    if not os.path.exists(filename):
        # A timeout keeps an unresponsive server from blocking the caller forever
        response = requests.get(url, timeout=60)
        try:
            response.raise_for_status()
        except requests.HTTPError as e:
            # Chain the original error so the HTTP details stay in the traceback
            raise ValueError(f"Failed to download file. HTTP Error: {e}") from e

        with open(filename, "wb") as file:
            file.write(response.content)

    mitocarta_df = pd.read_excel(filename, sheet_name=sheet_name)

    # NOTE: the timestamp is taken when the file is read, also when a cached copy is used
    metadata_details = {
        "download date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "download link": url,
    }

    # Defensive guard kept from the original implementation: hand back an empty
    # DataFrame and flag it in the metadata when nothing could be read
    if mitocarta_df is None:
        metadata_details["note"] = "No data found in the downloaded file"
        return pd.DataFrame(), {"datasource": Cons.MITOCARTA, "metadata": metadata_details}

    return mitocarta_df, {"datasource": Cons.MITOCARTA, "metadata": metadata_details}


def process_mitocarta(mitocarta_df: pd.DataFrame, species: str = "hsapiens") -> pd.DataFrame:
    """Subset the MitoCarta table for a species and simplify its pathway labels.

    The input DataFrame is not modified; a new DataFrame is returned.

    :param mitocarta_df: The mitocarta dataset.
    :param species: The species to process the data for ("hsapiens" or "mmusculus"); defaults to "hsapiens".
    :returns: The selected and renamed MitoCarta columns, with the pathway column reduced to a single label per row.
    :raises ValueError: If species is not supported.
    """
    # Select relevant columns for inclusion in the graph
    if species == "hsapiens":
        selected_columns = Cons.MITO_SELECTED_COLUMNS["human"]
    elif species == "mmusculus":
        selected_columns = Cons.MITO_SELECTED_COLUMNS["mouse"]
    else:
        raise ValueError(f"Species {species} not supported.")

    # Rename on a fresh object instead of in place: modifying a column selection
    # in place triggers pandas' SettingWithCopyWarning
    mitocarta_subset = mitocarta_df[selected_columns].rename(
        columns=Cons.MITOCART_COL_MAPPER, errors="ignore"
    )

    # Keep the text after the last ">" and, within that, the part before the
    # first "|", stripped of surrounding whitespace
    mitocarta_subset[Cons.MITO_PATHWAYS] = (
        mitocarta_subset[Cons.MITO_PATHWAYS]
        .str.split(">")
        .str[-1]
        .str.split("|")
        .str[0]
        .str.strip()
    )

    return mitocarta_subset


def get_gene_mito_pathways(
    bridgedb_df: pd.DataFrame,
    mitocarta_file: str,
    filename: str,
    species: str = "hsapiens",
    sheet_name: str = "A Human MitoCarta3.0",
) -> Tuple[pd.DataFrame, dict]:
    """Get gene and mitochondrial pathways from MitoCarta.

    :param bridgedb_df: BridgeDb output for creating the list of gene ids to query.
    :param mitocarta_file: Name of the remote MitoCarta file to download.
    :param filename: The local file path to save the downloaded dataset.
    :param species: Species for which to process the data; defaults to "hsapiens".
    :param sheet_name: Excel sheet name to read from the file; defaults to "A Human MitoCarta3.0".
        NOTE(review): the default names the human sheet; presumably another sheet
        must be passed for other species - confirm against the workbook.
    :returns: A tuple containing the processed DataFrame and a metadata dictionary.
    """
    # Download dataset and get metadata
    mitocarta_df, mitocarta_metadata = download_mitocarta_dataset(
        mitocarta_file=mitocarta_file, filename=filename, sheet_name=sheet_name
    )

    # Subset the dataset according to species
    subset_df = process_mitocarta(mitocarta_df=mitocarta_df, species=species)

    # Merge the processed DataFrame with the original bridgedb_df
    data_df = get_identifier_of_interest(bridgedb_df, Cons.MITOCARTA_GENE_INPUT_ID)

    merged_df = collapse_data_sources(
        data_df=data_df,
        source_namespace=Cons.MITOCARTA_GENE_INPUT_ID,
        target_df=subset_df,
        common_cols=[Cons.TARGET_COL],
        target_specific_cols=Cons.MITOCART_OUTPUT,
        col_name=Cons.MITOCART_PATHWAY_COL,
    )

    return merged_df, mitocarta_metadata