#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Python file for querying IntAct (https://www.ebi.ac.uk/intact/)."""
import datetime
import json
import urllib.parse
import warnings
from typing import Dict, List
import numpy as np
import pandas as pd
import requests
from tqdm import tqdm
import pyBiodatafuse.constants as Cons
from pyBiodatafuse.logging_config import get_logger
from pyBiodatafuse.utils import get_identifier_of_interest, give_annotator_warning
# Module-level logger, created through the project's logging configuration helper.
logger = get_logger(__name__)
def check_endpoint_intact() -> bool:
    """Check if the IntAct API is reachable by making a test request.

    A network failure (connection error, timeout, ...) is logged and reported as an
    unavailable endpoint instead of being raised to the caller.

    :returns: True if the test request answers with HTTP 200, False otherwise.
    """
    test_url = f"{Cons.INTACT_ENDPOINT}/ws/interaction/findInteractions/P53"
    try:
        # The timeout keeps an unresponsive server from blocking the caller indefinitely.
        response = requests.get(test_url, timeout=30)
    except requests.exceptions.RequestException as e:
        logger.warning(f"IntAct endpoint check failed: {e}")
        return False
    return response.status_code == 200
# TODO: Wait for this function to be implemented in the IntAct API
def check_version_intact() -> dict:
    """Get version of IntAct API.

    A failed request, a body that is not valid JSON, or a JSON payload that is not an
    object is logged and reported as an unknown version.

    :returns: a dictionary containing the version information
    """
    try:
        version_call = requests.get(f"{Cons.INTACT_ENDPOINT}/version", timeout=10)
        version_call.raise_for_status()
        version_json = version_call.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError is the base class of json.JSONDecodeError, so this also covers
        # decode errors that are not requests exceptions.
        logger.error(f"Error getting IntAct version: {e}")
        return {"source_version": "unknown"}

    if not isinstance(version_json, dict):
        # Only a JSON object can carry a "version" key.
        logger.error(f"Error getting IntAct version: unexpected payload {version_json!r}")
        return {"source_version": "unknown"}

    return {"source_version": version_json.get("version", "unknown")}
def _normalise_intact_id(raw_id: object) -> object:
    """Bring one IntAct ``idA``/``idB`` value into ``namespace:identifier`` form.

    :param raw_id: value taken from an interaction record.
    :returns: the first space-separated token when the value already contains a colon,
        ``namespace:identifier`` when the value looks like ``identifier (namespace)``,
        and the value unchanged when it is not a non-empty string (e.g. a NaN placeholder)
        or carries no namespace token.
    """
    if not isinstance(raw_id, str) or not raw_id.strip():
        return raw_id
    tokens = raw_id.split(" ")
    if ":" in raw_id or len(tokens) < 2:
        return tokens[0]
    namespace = tokens[1].replace("(", "").replace(")", "")
    return f"{namespace}:{tokens[0]}"


def get_intact_interactions(gene_ids: List[str]) -> List[dict]:
    """Retrieve protein interactions for a list of genes from IntAct.

    A single request is made with ``pageSize=200``; further result pages are not fetched.
    Fields missing from a record are filled with NaN.

    :param gene_ids: List of gene identifiers.
    :returns: List of interactions for the given genes; an empty list when no identifiers
        are given, the request fails, the response is not usable JSON or there are no results.
    """
    if not gene_ids:
        return []

    joined_ids = " - ".join(gene_ids)
    encoded_ids = urllib.parse.quote(joined_ids)
    url = f"{Cons.INTACT_ENDPOINT}/ws/interaction/findInteractions/{encoded_ids}?pageSize=200"

    try:
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that cannot be decoded as JSON.
        logger.warning(f"Batch request failed for genes {gene_ids}: {e}")
        return []

    content = data.get("content", []) if isinstance(data, dict) else []
    if not content:
        return []

    # Output key -> field name in the IntAct response.
    interaction_info = {
        Cons.INTACT_INTERACTION_ID: "ac",
        Cons.INTACT_INTERACTOR_ID_A: "acA",
        Cons.INTACT_INTERACTOR_ID_B: "acB",
        Cons.INTACT_SCORE: "intactMiscore",
        Cons.INTACT_BIOLOGICAL_ROLE_A: "biologicalRoleA",
        Cons.INTACT_BIOLOGICAL_ROLE_B: "biologicalRoleB",
        Cons.INTACT_TYPE: "type",
        Cons.INTACT_DETECTION_METHOD: "detectionMethod",
        Cons.INTACT_HOST_ORGANISM: "hostOrganism",
        Cons.INTACT_INTERACTOR_A_NAME: "intactNameA",
        Cons.INTACT_INTERACTOR_B_NAME: "intactNameB",
        Cons.INTACT_INTERACTOR_A_SPECIES: "speciesA",
        Cons.INTACT_INTERACTOR_B_SPECIES: "speciesB",
        Cons.INTACT_MOLECULE_A: "moleculeA",
        Cons.INTACT_MOLECULE_B: "moleculeB",
        Cons.INTACT_ID_A: "idA",
        Cons.INTACT_ID_B: "idB",
        Cons.INTACT_PUBMED_PUBLICATION_ID: "publicationPubmedIdentifier",
    }

    interactions = [
        {key: item.get(field, np.nan) for key, field in interaction_info.items()}
        for item in content
    ]

    # Clean up the alternative ids of both interactors.
    for interaction in interactions:
        for id_key in (Cons.INTACT_ID_A, Cons.INTACT_ID_B):
            interaction[id_key] = _normalise_intact_id(interaction[id_key])

    return interactions
def get_protein_intact_acs(id_of_interest: str) -> List[str]:
    """Collect the IntAct ACs of the protein interactors found for an identifier.

    One request is sent to the ``findInteractor`` service with ``pageSize=100``; of the
    returned records only those whose ``interactorType`` is ``"protein"`` are kept.

    :param id_of_interest: input gene Ensembl identifier.
    :returns: the ``interactorAc`` values of the protein records, or an empty list when
        the request fails.
    """
    url = f"{Cons.INTACT_ENDPOINT}/ws/interactor/findInteractor/{id_of_interest}?pageSize=100"

    try:
        response = requests.get(url, timeout=120)
        response.raise_for_status()
        records = response.json().get("content", [])
        return [
            record.get("interactorAc")
            for record in records
            if record.get("interactorType") == "protein"
        ]
    except requests.exceptions.RequestException as e:
        logger.warning(f"Failed to get interactors for {id_of_interest}: {e}")
        return []
def _mentions_namespace(namespace: str, *values: object) -> bool:
    """Tell whether any string among ``values`` contains ``namespace`` (case-insensitive).

    Non-string values (NaN placeholders, None) are skipped instead of raising.
    """
    return any(isinstance(value, str) and namespace in value.lower() for value in values)


def get_filtered_interactions(
    batch_ids: List[str],
    valid_intact_acs: set,
    intact_ac_to_entity: dict,
    entity_to_input_id: dict,
    is_compound: bool,
    interaction_type: str = "gene_gene",
) -> Dict[str, List[dict]]:
    """Filter interactions based on data type.

    The interactions of the whole batch are fetched with one call to
    ``get_intact_interactions``. When ``interaction_type`` is one of
    ``Cons.INTACT_GENE_INTERACTION_TYPES`` and ``is_compound`` is False, the interactor
    AC fields are matched against ``valid_intact_acs``; otherwise the cleaned
    ``idA``/``idB`` fields are used. Each retained interaction is copied to every input
    ID it maps to, with the partner stored under ``"intact_link_to"``.

    :param batch_ids: List of input IDs.
    :param valid_intact_acs: Set of valid IntAct ACs.
    :param intact_ac_to_entity: Dictionary mapping IntAct ACs to entity.
    :param entity_to_input_id: Dictionary mapping entities to input IDs.
    :param is_compound: Boolean if the input datatype are compounds.
    :param interaction_type: Either 'gene_gene', 'gene_compound', 'compound_compound', 'compound_gene', or 'both'.
        If the input identifiers are genes, then 'both' will refer to 'gene_gene' and 'gene_compound'. If the
        input identifiers are compounds, then 'both' will refer to 'compound_compound' and 'compound_gene'.
    :returns: A dictionary of filtered interactions per input ID. An input ID without any
        retained interaction gets a single placeholder record whose values are all NaN.
    """
    results: Dict[str, List[dict]] = {idx: [] for idx in batch_ids}
    interactions = get_intact_interactions(batch_ids)

    # Which record fields act as primary / alternative identifiers does not depend on
    # the individual interaction, so it is decided once.
    if interaction_type in Cons.INTACT_GENE_INTERACTION_TYPES and not is_compound:
        key_a, key_b = Cons.INTACT_INTERACTOR_ID_A, Cons.INTACT_INTERACTOR_ID_B
        alt_key_a, alt_key_b = Cons.INTACT_ID_A, Cons.INTACT_ID_B
    else:
        key_a, key_b = Cons.INTACT_ID_A, Cons.INTACT_ID_B
        alt_key_a, alt_key_b = Cons.INTACT_INTERACTOR_ID_A, Cons.INTACT_INTERACTOR_ID_B

    for interaction in interactions:
        id_a = interaction.get(key_a)
        id_b = interaction.get(key_b)
        alt_ids_a = interaction.get(alt_key_a)
        alt_ids_b = interaction.get(alt_key_b)

        has_uniprot_a = _mentions_namespace("uniprotkb", id_a, alt_ids_a)
        has_uniprot_b = _mentions_namespace("uniprotkb", id_b, alt_ids_b)
        has_chebi_a = _mentions_namespace("chebi", id_a, alt_ids_a)
        has_chebi_b = _mentions_namespace("chebi", id_b, alt_ids_b)

        both_valid = id_a in valid_intact_acs and id_b in valid_intact_acs
        is_protein_pair = has_uniprot_a and has_uniprot_b and both_valid
        is_compound_pair = has_chebi_a and has_chebi_b and both_valid
        # Mixed pairs are not required to be part of the valid set.
        is_mixed_pair = (has_chebi_a and has_uniprot_b) or (has_chebi_b and has_uniprot_a)

        if interaction_type == "gene_gene":
            keep_interaction = is_protein_pair
        elif interaction_type in ("gene_compound", "compound_gene"):
            keep_interaction = is_mixed_pair
        elif interaction_type == "compound_compound":
            keep_interaction = is_compound_pair
        elif "both" in interaction_type:
            keep_interaction = is_protein_pair or is_mixed_pair or is_compound_pair
        else:
            keep_interaction = False

        if not keep_interaction:
            continue

        for idx in batch_ids:
            if id_a in valid_intact_acs and intact_ac_to_entity.get(id_a) == idx:
                partner_id = intact_ac_to_entity.get(id_b)
                partner_display_id = entity_to_input_id.get(partner_id, partner_id or alt_ids_b)
            elif id_b in valid_intact_acs and intact_ac_to_entity.get(id_b) == idx:
                partner_id = intact_ac_to_entity.get(id_a)
                partner_display_id = entity_to_input_id.get(partner_id, partner_id or alt_ids_a)
            else:
                continue

            # Copy so that the same interaction can carry a different partner per input ID.
            interaction_copy = dict(interaction)
            interaction_copy["intact_link_to"] = partner_display_id
            results[idx].append(interaction_copy)

    # Every input ID gets at least one record, so callers always receive a non-empty list.
    for gene_id in batch_ids:
        if not results[gene_id]:
            empty_entry = {key: np.nan for key in Cons.INTACT_OUTPUT_DICT}
            empty_entry["intact_link_to"] = np.nan
            results[gene_id] = [empty_entry]

    return results
# Public annotator entry point for gene identifiers.
def get_gene_interactions(bridgedb_df: pd.DataFrame, interaction_type: str = "both"):
    """Annotate genes with interaction data from IntAct.

    :param bridgedb_df: BridgeDb output for creating the list of gene ids to query.
    :param interaction_type: Either 'gene_gene', 'gene_compound' or 'both'. If the input is 'both', 'gene_gene' and
        'gene_compound' will be queried.
    :raises ValueError: If an invalid interaction_type is provided.
    :returns: a tuple (DataFrame containing the IntAct output, metadata dictionary). When the IntAct endpoint
        is unavailable, a warning is issued and an empty DataFrame and an empty dictionary are returned.
    """
    # Reject a bad argument first so the outcome does not depend on the state of the network.
    if interaction_type not in Cons.INTACT_GENE_INTERACTION_TYPES:
        raise ValueError(
            f"Invalid interaction_type: {interaction_type}. Must be {Cons.INTACT_GENE_INTERACTION_TYPES}."
        )

    if not check_endpoint_intact():
        warnings.warn("IntAct API endpoint is unavailable. Cannot retrieve data.", stacklevel=2)
        return pd.DataFrame(), {}

    start_time = datetime.datetime.now()

    data_df = get_identifier_of_interest(bridgedb_df, Cons.INTACT_GENE_INPUT_ID).reset_index(
        drop=True
    )

    ensembl_gene_list = list(set(data_df[Cons.TARGET_COL].tolist()))
    # Target identifier -> input identifier (the last row wins for a repeated target).
    ensembl_to_input_id = dict(zip(data_df[Cons.TARGET_COL], data_df[Cons.IDENTIFIER_COL]))

    # One interactor lookup per gene; the result is inverted to IntAct AC -> gene.
    intact_ac_to_ensembl = {
        ac: gene_id for gene_id in ensembl_gene_list for ac in get_protein_intact_acs(gene_id)
    }
    # Identical for every batch, so it is built once.
    valid_intact_acs = set(intact_ac_to_ensembl)

    all_results = {}
    batch_size = 10

    for i in tqdm(range(0, len(ensembl_gene_list), batch_size), desc="Querying IntAct for genes"):
        batch = ensembl_gene_list[i : i + batch_size]
        batch_results = get_filtered_interactions(
            batch,
            valid_intact_acs,
            intact_ac_to_ensembl,
            ensembl_to_input_id,
            is_compound=False,
            interaction_type=interaction_type,
        )
        all_results.update(batch_results)

    data_df[Cons.INTACT_INTERACT_COL] = data_df[Cons.TARGET_COL].map(all_results)

    end_time = datetime.datetime.now()
    time_elapsed = str(end_time - start_time)
    current_date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    intact_metadata = {
        "datasource": Cons.INTACT,
        "metadata": {"source_version": "unknown"},
        "query": {
            "size": len(ensembl_gene_list),
            "input_type": Cons.INTACT_GENE_INPUT_ID,
            "time": time_elapsed,
            "date": current_date,
            "url": Cons.INTACT_ENDPOINT,
        },
    }

    # Update metadata
    # Calculate the number of new nodes
    num_new_nodes = 0  # TODO: Implement this
    # Calculate the number of new edges
    # NOTE(review): this is the number of queried genes, not the number of interaction
    # records (get_compound_interactions sums the record counts) - confirm the intended metric.
    num_new_edges = len(all_results)

    # Check the intermediate_df
    if num_new_edges != len(data_df):
        give_annotator_warning(Cons.INTACT)

    # Add the number of new nodes and edges to metadata
    intact_metadata[Cons.QUERY][Cons.NUM_NODES] = num_new_nodes  # type: ignore
    intact_metadata[Cons.QUERY][Cons.NUM_EDGES] = num_new_edges  # type: ignore

    return data_df, intact_metadata
# Public annotator entry point for compound identifiers.
def get_compound_interactions(bridgedb_df: pd.DataFrame, interaction_type: str = "both"):
    """Annotate compounds with interaction data from IntAct.

    Only rows whose target identifier starts with ``CHEBI:`` are queried and returned.

    :param bridgedb_df: BridgeDb output for creating the list of compound ids to query.
    :param interaction_type: Either 'compound_compound', 'compound_gene' or 'both'. If the input is 'both',
        'compound_compound' and 'compound_gene' will be queried.
    :raises ValueError: If an invalid interaction_type is provided.
    :returns: a tuple (DataFrame containing the IntAct output, metadata dictionary). When the IntAct endpoint
        is unavailable, a warning is issued and an empty DataFrame and an empty dictionary are returned.
    """
    # Reject a bad argument first so the outcome does not depend on the state of the network.
    if interaction_type not in Cons.INTACT_COMPOUND_INTERACTION_TYPES:
        raise ValueError(
            f"Invalid interaction_type: {interaction_type}. Must be {Cons.INTACT_COMPOUND_INTERACTION_TYPES}."
        )

    if not check_endpoint_intact():
        warnings.warn("IntAct API endpoint is unavailable. Cannot retrieve data.", stacklevel=2)
        return pd.DataFrame(), {}

    start_time = datetime.datetime.now()

    data_df = get_identifier_of_interest(bridgedb_df, Cons.INTACT_COMPOUND_INPUT_ID).reset_index(
        drop=True
    )
    # Casting to str makes the mask strictly boolean even when a target is missing (NaN),
    # which boolean indexing would otherwise reject.
    is_chebi = data_df[Cons.TARGET_COL].astype(str).str.startswith("CHEBI:")
    data_df = data_df[is_chebi].reset_index(drop=True)

    chebi_list = list(set(data_df[Cons.TARGET_COL].tolist()))
    # Target identifier -> input identifier (the last row wins for a repeated target).
    chebi_to_input_id = dict(zip(data_df[Cons.TARGET_COL], data_df[Cons.IDENTIFIER_COL]))
    intact_ac_to_chebi = {
        chebi_id: chebi_id for chebi_id in chebi_list
    }  # intact id is same as chebi id
    # Identical for every batch, so it is built once.
    valid_intact_acs = set(chebi_list)

    all_results = {}
    batch_size = 10

    for i in tqdm(range(0, len(chebi_list), batch_size), desc="Querying IntAct for compounds"):
        batch = chebi_list[i : i + batch_size]
        batch_results = get_filtered_interactions(
            batch_ids=batch,
            valid_intact_acs=valid_intact_acs,
            intact_ac_to_entity=intact_ac_to_chebi,
            entity_to_input_id=chebi_to_input_id,
            is_compound=True,
            interaction_type=interaction_type,
        )
        all_results.update(batch_results)

    data_df[Cons.INTACT_COMPOUND_INTERACT_COL] = data_df[Cons.TARGET_COL].map(all_results)

    end_time = datetime.datetime.now()
    time_elapsed = str(end_time - start_time)
    current_date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    intact_metadata = {
        "datasource": Cons.INTACT,
        "metadata": {"source_version": "unknown"},
        "query": {
            "size": len(chebi_list),
            "input_type": Cons.INTACT_COMPOUND_INPUT_ID,
            "time": time_elapsed,
            "date": current_date,
            "url": Cons.INTACT_ENDPOINT,
        },
    }

    # Update metadata
    # Calculate the number of new nodes
    num_new_nodes = 0  # TODO: Implement this
    # Calculate the number of new edges (records per row, placeholder records included)
    num_new_edges = sum(data_df[Cons.INTACT_COMPOUND_INTERACT_COL].apply(len))

    # Check the intermediate_df
    if num_new_edges != len(data_df):
        give_annotator_warning(Cons.INTACT)

    # Add the number of new nodes and edges to metadata
    intact_metadata[Cons.QUERY][Cons.NUM_NODES] = num_new_nodes  # type: ignore
    intact_metadata[Cons.QUERY][Cons.NUM_EDGES] = num_new_edges  # type: ignore

    return data_df, intact_metadata