Source code for pyBiodatafuse.graph.rdf.utils

"""Provide utils for BDF RDF."""

import os

import numpy as np
import pandas as pd
from bioregistry import curie_from_iri, normalize_curie, parse_iri
from rdflib import BNode, Graph, Literal, URIRef
from rdflib.namespace import RDF, RDFS, SH, XSD
from shexer.consts import SHACL_TURTLE, TURTLE
from shexer.shaper import Shaper

from pyBiodatafuse.constants import (
    DATA_SOURCES,
    NAMESPACE_BINDINGS,
    NAMESPACE_SHAPES,
    NODE_TYPES,
    VOID_TYPES,
)
from pyBiodatafuse.logging_config import get_logger

logger = get_logger(__name__)



[docs]
def replace_na_none(item):
    """Replace NA-like values with None, recursing into containers.

    The following are treated as missing and become ``None``:

    * ``None`` and ``pandas.NA``;
    * strings equal (case-insensitively) to ``"na"``, ``"nan"`` or ``"none"``;
    * NaN floats, including NumPy floating scalars such as ``np.float32``
      that are not subclasses of the built-in ``float``.

    Lists, dict values and NumPy arrays are processed element by element.
    Dict keys are left untouched. NumPy arrays are rebuilt with
    ``dtype=object`` so that ``None`` can be stored.

    :param item: Item to process. Can be a string, float, list, dict, or numpy array.
    :return: Processed item with NA values replaced by None.
    """
    if item is None or item is pd.NA:
        return None
    if isinstance(item, str):
        return None if item.lower() in ("na", "nan", "none") else item
    # np.float64 subclasses float, but np.float32/np.float16 do not, so
    # np.floating is checked explicitly to catch every NaN scalar.
    if isinstance(item, (float, np.floating)):
        return None if pd.isna(item) else item
    if isinstance(item, list):
        return [replace_na_none(sub_item) for sub_item in item]
    if isinstance(item, dict):
        return {key: replace_na_none(value) for key, value in item.items()}
    if isinstance(item, np.ndarray):
        return np.array([replace_na_none(sub_item) for sub_item in item], dtype=object)
    return item




[docs]
def extract_curie(prefix, identifier):
    """Build a normalized CURIE from a prefix and an identifier.

    The two parts are joined as ``prefix:identifier`` and handed to
    bioregistry's ``normalize_curie``. When the result is falsy a warning is
    logged and that falsy result is still returned to the caller.

    :param prefix: Prefix string, such as a registry identifier.
    :param identifier: Identifier placed after the prefix.
    :return: The value produced by ``normalize_curie`` (falsy when normalization fails).
    """
    raw_curie = f"{prefix}:{identifier}"
    normalized = normalize_curie(raw_curie)
    if normalized:
        return normalized
    logger.warning("Could not normalize CURIE for %s:%s", prefix, identifier)
    return normalized




[docs]
def construct_uri(base_uri, identifier):
    """Join a base URI and an identifier with a single ``/`` into a URIRef.

    No normalization is applied: the two values are formatted as-is, so a
    trailing slash on ``base_uri`` is kept.

    :param base_uri: Base URI string for the RDF resource.
    :param identifier: Identifier placed after the slash.
    :return: A URIRef for ``<base_uri>/<identifier>``.
    """
    full_uri = f"{base_uri}/{identifier}"
    return URIRef(full_uri)




[docs]
def add_data_source_node(g: Graph, source: str) -> URIRef:
    """Register a data source in the RDF graph and return its node.

    The node IRI is looked up in ``DATA_SOURCES``. It is typed with both the
    ``data_source_node`` entry of ``NODE_TYPES`` and the ``dataset`` entry of
    ``VOID_TYPES`` (aligned with dataset_provenance.py), and labelled with the
    source name as an ``xsd:string`` literal.

    :param g: RDF graph to which the data source node will be added.
    :param source: String containing the name of the source of the data.
    :return: URIRef for the created data source node.
    """
    # TODO: fix
    # Any source name mentioning OpenTargets is collapsed to the plain key.
    source_key = "OpenTargets" if "OpenTargets" in source else source

    label = Literal(source_key, datatype=XSD.string)
    dataset_iri = URIRef(DATA_SOURCES[source_key])

    # Type lookups happen one at a time, immediately before each triple is added.
    for type_registry, type_key in ((NODE_TYPES, "data_source_node"), (VOID_TYPES, "dataset")):
        g.add((dataset_iri, RDF.type, URIRef(type_registry[type_key])))
    g.add((dataset_iri, RDFS.label, label))

    return dataset_iri




[docs]
def get_shapes(
    g,
    base_uri,
    path,
    threshold=0.000000001,
    graph_type="shex",  # "shex" or "shacl"
    uml_figure_path=None,
    print_string_output=True,
    additional_namespaces=None,
):
    """Use shexer (https://github.com/DaniFdezAlvarez/shexer) on the BDF graph to generate Shex or SHACL.

    The namespaces handed to shexer are a per-call copy of ``NAMESPACE_SHAPES``
    extended with ``base_uri`` (bound to the prefix ``"graph"``) and any
    ``additional_namespaces``; the module-level constant is not modified.

    :param g: RDF graph to generate shapes from.
    :param base_uri: The graph iri to be added to shaper namespaces.
    :param path: relative path in which the graph TTL will be saved, if provided.
        Missing parent directories are created and the output is appended to the file.
    :param threshold: float between [0,1] used to accept shapes based on frequency.
    :param graph_type: "shex" or "shacl" (case-insensitive), to specify which graph type to generate.
    :param uml_figure_path: str path (resolved against the current working directory)
        where the generated UML is stored.
    :param print_string_output: bool, print or not the generated TTL as a string.
    :param additional_namespaces: dictionary containing {namespace: prefix} pairs.
    :raises ValueError: If the graph type is not a valid string or not in ['shex', 'shacl'].
    :return: shaper shex or shacl graph
    """
    # Normalise first so that a non-string surfaces as the documented
    # ValueError instead of a bare AttributeError.
    try:
        graph_type = graph_type.lower()
    except AttributeError as exc:
        raise ValueError("graph_type must be a string.") from exc
    if graph_type not in ("shex", "shacl"):
        raise ValueError("Invalid graph_type specified. Choose 'shex' or 'shacl'.")

    # Work on a copy: writing into the shared NAMESPACE_SHAPES constant would
    # leak this call's base_uri and extra namespaces into every later call.
    namespaces = dict(NAMESPACE_SHAPES)
    namespaces[base_uri] = "graph"
    if additional_namespaces:
        namespaces.update(additional_namespaces)

    # Initialize Shaper with the given graph and namespaces
    shaper = Shaper(
        all_classes_mode=True,
        rdflib_graph=g,
        namespaces_dict=namespaces,
    )

    rdf_png_path = os.path.join(os.getcwd(), uml_figure_path) if uml_figure_path else None
    shaper_kwargs = {
        "string_output": True,
        "acceptance_threshold": threshold,
        "to_uml_path": rdf_png_path,
    }
    # SHACL is produced by the same shexer entry point with a different output format.
    if graph_type == "shacl":
        shaper_kwargs["output_format"] = SHACL_TURTLE
    graph_result = shaper.shex_graph(**shaper_kwargs)

    # Save the output to a file if path is provided
    if path and graph_result:
        dir_path = os.path.dirname(path)
        if dir_path:
            # exist_ok avoids the check-then-create race of a separate exists() test.
            os.makedirs(dir_path, exist_ok=True)
        # NOTE(review): append mode keeps whatever the file already contains;
        # presumably so shapes can follow previously written content - confirm.
        with open(path, "a", encoding="utf-8") as f:
            f.write(graph_result)

    # Optionally print the graph to the console if print_string_output is True
    if print_string_output:
        print(graph_result)

    return graph_result




[docs]
def get_shacl_prefixes(namespaces, path, new_uris, print_string_output):
    """
    Build a graph of SHACL prefix declarations and serialize it as Turtle.

    Declarations are emitted for ``new_uris`` first, then for the package's
    ``NAMESPACE_BINDINGS``, then for ``namespaces`` when it is non-empty. Every
    declaration is a blank node carrying ``sh:prefix`` and ``sh:namespace``,
    attached through ``sh:declare`` to a fresh blank node of its own.

    :param namespaces: Optional dictionary of prefix to namespace URI mappings to include in the SHACL declarations.
    :param path: Optional path to a file where the Turtle data will be written. If not provided, the data is not written to disk.
    :param new_uris: Dictionary of prefix to namespace URI mappings to include in the SHACL declarations.
    :param print_string_output: bool, print or not the generated TTL as a string.

    :return: A RDFLib Graph containing the SHACL prefix declarations.
    """
    graph = Graph()

    # Ordered list of {prefix: namespace} mappings to declare.
    prefix_maps = [new_uris, NAMESPACE_BINDINGS]
    if namespaces:
        prefix_maps.append(namespaces)

    for prefix_map in prefix_maps:
        for prefix, ns_uri in prefix_map.items():
            declaration = BNode()
            graph.add((declaration, SH.prefix, Literal(prefix)))
            graph.add((declaration, SH.namespace, Literal(ns_uri, datatype=XSD.anyURI)))
            graph.add((BNode(), SH.declare, declaration))

    ttl_data = graph.serialize(format="ttl")

    if path:
        # Writing is best-effort: a failure is logged and the graph is still returned.
        try:
            with open(path, "w", encoding="UTF-8") as out_file:
                out_file.write(ttl_data)
        except IOError as err:
            logger.error("Error writing to file %s: %s", path, err)

    if print_string_output:
        print(ttl_data)

    return graph




[docs]
def get_node_label(g, node):
    """
    Return the first ``rdfs:label`` value found for a node, if any.

    :param g: The RDF graph containing the data.
    :param node: The node whose label is to be retrieved.

    :return: The object of the first matching label triple, otherwise None.
    """
    label_values = (stmt[2] for stmt in g.triples((node, RDFS.label, None)))
    return next(label_values, None)




[docs]
def _namespace_from_uri_structure(uri: str) -> str:
    """Return *uri* cut after its last ``#`` if it has one, otherwise after its last ``/``."""
    separator = "#" if "#" in uri else "/"
    return uri.rsplit(separator, 1)[0] + separator


def discover_prefixes_from_graph(g: Graph) -> dict:
    """
    Discover namespace prefixes from all URIs in a graph using bioregistry.

    This function collects all URIs from subjects, predicates, and objects
    in the graph, then uses bioregistry to identify prefixes and their
    corresponding namespace URIs. Prefixes already present in
    ``NAMESPACE_BINDINGS`` are skipped.

    URIs are processed in sorted order. When several URIs resolve to the same
    prefix the last one processed determines the namespace, so sorting keeps
    the result reproducible across runs instead of depending on set
    iteration order.

    :param g: The RDF graph to analyze.
    :return: Dictionary mapping prefix names to namespace URIs.
    """
    logger.debug("Discovering prefixes from graph using bioregistry")

    # Collect all URIs from the graph (subjects, predicates, objects)
    all_uris = {str(term) for s, p, o in g for term in (s, p, o) if isinstance(term, URIRef)}

    logger.debug("Collected %d unique URIs from graph", len(all_uris))

    discovered_prefixes = {}

    for uri in sorted(all_uris):
        try:
            # Parse IRI to get prefix and local ID
            parsed = parse_iri(uri)
            if not parsed:
                continue
            prefix, local_id = parsed

            # Only URIs that bioregistry can turn into a CURIE are considered
            curie = curie_from_iri(uri)
            if not curie or ":" not in curie:
                continue

            if local_id in uri:
                # The namespace is everything before the last occurrence of the local ID
                namespace_uri = uri[: uri.rfind(local_id)]
            else:
                namespace_uri = _namespace_from_uri_structure(uri)

            # Only add if not already in NAMESPACE_BINDINGS
            if prefix not in NAMESPACE_BINDINGS:
                discovered_prefixes[prefix] = namespace_uri

        except Exception:
            # Deliberately broad: discovery is best-effort, so any failure on
            # the primary path falls back to the CURIE-only approach.
            try:
                curie = curie_from_iri(uri)
                if curie and ":" in curie:
                    prefix = curie.split(":", 1)[0]
                    namespace_uri = _namespace_from_uri_structure(uri)
                    # Only add if not already in NAMESPACE_BINDINGS
                    if prefix not in NAMESPACE_BINDINGS:
                        discovered_prefixes[prefix] = namespace_uri
            except Exception as e:
                logger.debug("Skipping URI parsing: %s", e)

    logger.debug("Discovered %d new prefixes via bioregistry", len(discovered_prefixes))
    return discovered_prefixes