Source code for pyBiodatafuse.human_homologs

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""Python file for querying Ensembl to get human homologs for mouse genes."""

import datetime
import warnings

import numpy as np
import pandas as pd
import requests

import pyBiodatafuse.constants as Cons
from pyBiodatafuse.utils import get_identifier_of_interest


def check_endpoint_ensembl(timeout: float = 10.0) -> bool:
    """Check if the endpoint of the Ensembl API is available.

    :param timeout: seconds to wait for the ping request before giving up.
    :returns: True if the ping endpoint answers with HTTP 200, else False
        (including when the request itself fails or times out).
    """
    try:
        response = requests.get(f"{Cons.ENSEMBL_ENDPOINT}/info/ping", timeout=timeout)
    except requests.RequestException:
        # An unreachable host, DNS failure or timeout means "not available";
        # report that as False instead of letting the exception escape.
        return False

    return response.status_code == 200
def check_version_ensembl() -> str:
    """Query the Ensembl REST API for its version information.

    :returns: the raw text body of the ``/info/rest`` response.
    """
    version_url = f"{Cons.ENSEMBL_ENDPOINT}/info/rest"
    json_headers = {"Content-Type": "application/json"}

    # The body is handed back as-is; no status check or parsing is done here.
    return requests.get(version_url, headers=json_headers).text
def get_human_homologs(row, timeout: float = 60.0):
    """Retrieve the human homolog of a mouse gene using the Ensembl API.

    :param row: row from input dataframe; its ``"target"`` entry is used as the
        gene identifier in the request URL.
    :param timeout: seconds to wait for the Ensembl response before giving up.
    :returns: a one-element list holding a dictionary that maps
        ``Cons.ENSEMBL_HOMOLOG_MAIN_LABEL`` to the ID of the first human homolog
        listed in the response, or to NaN when the request fails or no human
        homolog is listed.
    """
    # Result used for every "nothing usable came back" outcome.
    no_homolog = [{Cons.ENSEMBL_HOMOLOG_MAIN_LABEL: np.nan}]

    try:
        response = requests.get(
            f"{Cons.ENSEMBL_ENDPOINT}/homology/id/mouse/{row['target']}",
            headers={"Content-Type": "application/json"},
            params={"target_species": "homo_sapiens"},
            timeout=timeout,
        )
    except requests.RequestException:
        # Treat a transport failure like an HTTP error so that one bad request
        # does not abort a row-wise run over a whole dataframe.
        warnings.warn(
            "Ensembl homology request failed; recording the homolog as missing.",
            stacklevel=2,
        )
        return no_homolog

    if response.status_code != 200:
        return no_homolog

    try:
        data = response.json()
    except ValueError:
        # A 200 response whose body is not valid JSON carries no homolog.
        return no_homolog

    entries = data.get("data") if isinstance(data, dict) else None
    if not entries:
        return no_homolog

    # Only the first entry of "data" is inspected; the first human target wins.
    for homology in entries[0].get("homologies", []):
        target = homology.get("target", {})
        if target.get("species") == "homo_sapiens" and "id" in target:
            return [{Cons.ENSEMBL_HOMOLOG_MAIN_LABEL: target["id"]}]

    return no_homolog
def get_homologs(bridgedb_df):
    """Retrieve homologs for input DataFrame.

    :param bridgedb_df: input dataframe.
    :returns: a tuple of the dataframe including the human homologs and the
        metadata dictionary of the query; an empty DataFrame and an empty
        dictionary when the Ensembl endpoint is not available.
    """
    if not check_endpoint_ensembl():
        warnings.warn(
            f"{Cons.ENSEMBL} endpoint is not available. Unable to retrieve data.", stacklevel=2
        )
        return pd.DataFrame(), {}

    ensembl_version = check_version_ensembl()

    # Record the start time
    start_time = datetime.datetime.now()

    data_df = get_identifier_of_interest(bridgedb_df, Cons.ENSEMBL_GENE_INPUT_ID)
    data_df = data_df.reset_index(drop=True)
    gene_list = list(set(data_df[Cons.TARGET_COL].tolist()))

    # Get the human homologs (one request per row).
    if data_df.empty:
        # Nothing to look up: add the column without calling the API at all.
        data_df[Cons.ENSEMBL_HOMOLOG_COL] = pd.Series(dtype=object)
    else:
        data_df[Cons.ENSEMBL_HOMOLOG_COL] = data_df.apply(get_human_homologs, axis=1)

    # Record the end time
    end_time = datetime.datetime.now()

    # Metadata details
    # Get the current date and time
    current_date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # Calculate the time elapsed
    time_elapsed = str(end_time - start_time)
    # Number of new edges is reported as the number of rows in the result,
    # regardless of whether a homolog was found for the row.
    num_new_edges = data_df.shape[0]

    # Add the datasource, query, query time, and the date to metadata
    ensembl_metadata = {
        "datasource": Cons.ENSEMBL,
        "metadata": {"source_version": ensembl_version},
        "query": {
            "size": len(gene_list),
            "input_type": Cons.ENSEMBL_GENE_INPUT_ID,
            "number_of_added_edges": num_new_edges,
            "time": time_elapsed,
            "date": current_date,
            "url": Cons.ENSEMBL_ENDPOINT,
        },
    }

    return data_df, ensembl_metadata