# Source code for pyBiodatafuse.data_loader

# coding: utf-8

"""Python script to conver a list of identifiers to a dataframe."""

import re
from typing import Optional

import pandas as pd



# [docs]
def create_df_from_file(file_path: str) -> pd.DataFrame:
    """Load identifiers from a text file into a single-column DataFrame.

    The file content is split on commas and newlines; whitespace around each
    entry is trimmed and empty entries are skipped.

    :param file_path: path to the file containing the list of identifiers
    :returns: a DataFrame with one ``identifier`` column, one row per identifier
    """
    with open(file_path, "r") as handle:
        raw_text = handle.read()

    # Any run of commas and/or newlines acts as a single separator.
    trimmed = (piece.strip() for piece in re.split(r"[,\n]+", raw_text))
    rows = [token for token in trimmed if token]

    return pd.DataFrame(rows, columns=["identifier"])




# [docs]
def create_df_from_text(text_input: str) -> pd.DataFrame:
    """Turn newline-separated identifiers into a single-column DataFrame.

    Each line is trimmed of surrounding whitespace; lines that are empty
    after trimming are dropped.

    :param text_input: text containing the list of identifiers with each identifier on a new line.
    :returns: a DataFrame with one ``identifier`` column, one row per identifier
    """
    stripped_lines = map(str.strip, text_input.split("\n"))
    # filter(None, ...) discards the empty strings left by blank lines.
    rows = list(filter(None, stripped_lines))

    return pd.DataFrame(rows, columns=["identifier"])




# [docs]
def create_df_from_dea(file_path: str) -> pd.DataFrame:
    """Read a dataframe containing the result of the differential expression analysis (DEA).

    The reader is chosen from the (case-insensitive) file extension:
    ``xlsx``/``xls`` are read as Excel, ``csv`` as comma-separated and ``txt``
    as tab-separated text. The first column of the table is renamed to
    ``identifier``; all other columns are left untouched.

    :param file_path: path to the file containing the result of DEA
    :returns: the DEA dataframe with proper column name
    :raises ValueError: if the file cannot be read or its format is not supported
    """
    # Everything after the last dot is treated as the extension.
    file_extension = file_path.split(".")[-1].lower()

    if file_extension in ("xlsx", "xls"):
        try:
            # .xls files are read with the xlrd engine explicitly; .xlsx uses
            # pandas' default engine selection.
            if file_extension == "xls":
                df = pd.read_excel(file_path, engine="xlrd")
            else:
                df = pd.read_excel(file_path)
            return df.rename(columns={df.columns[0]: "identifier"})
        except Exception as e:
            # Callers get one exception type; chaining keeps the root cause.
            raise ValueError(f"Error reading Excel file: {str(e)}") from e

    if file_extension in ("csv", "txt"):
        try:
            delimiter = "," if file_extension == "csv" else "\t"
            df = pd.read_csv(file_path, sep=delimiter)
            return df.rename(columns={df.columns[0]: "identifier"})
        except Exception as e:
            raise ValueError(f"Error reading CSV/text file: {str(e)}") from e

    raise ValueError("Unsupported file format. Please provide an Excel, CSV, or TXT file.")




# [docs]
def filter_dea(
    data: pd.DataFrame,
    column_name: str,
    min_value: Optional[float] = None,
    max_value: Optional[float] = None,
    abs_value: Optional[float] = None,
) -> pd.DataFrame:
    """Filter the differential expression analysis (DEA) table.

    Exactly one filtering mode is applied:

    * ``abs_value``: keep rows where ``abs(column) >= abs_value``.
    * ``min_value`` only: keep rows where ``column >= min_value``.
    * ``max_value`` only: keep rows where ``column <= max_value``.
    * both ``min_value`` and ``max_value``: keep rows that satisfy *either*
      bound, listing the ``>= min_value`` rows first, followed by the
      remaining ``<= max_value`` rows. Each row appears at most once.

    :param data: DEA dataframe
    :param column_name: the column to filter
    :param min_value: the minimum value
    :param max_value: the maximum value
    :param abs_value: the absolute value (when filtering for LogFoldChange)
    :returns: the filtered DEA dataframe
    :raises ValueError: if the parameters are invalid (``abs_value`` combined
        with ``min_value``/``max_value``, or no criterion given at all)
    """
    if abs_value is not None:
        if min_value is not None or max_value is not None:
            raise ValueError(
                "When providing abs_value, min_value and max_value should not be specified"
            )
        return data[abs(data[column_name]) >= abs_value]

    if min_value is None and max_value is None:
        # Without any threshold there is nothing to compare against.
        raise ValueError(
            "At least one of min_value, max_value or abs_value should be specified"
        )

    values = data[column_name]

    # A single bound is a plain boolean selection; no concatenation needed.
    if max_value is None:
        return data[values >= min_value]
    if min_value is None:
        return data[values <= max_value]

    meets_min = values >= min_value
    # Exclude rows already selected by the min bound so that overlapping
    # bounds do not return the same row twice.
    meets_max_only = (values <= max_value) & ~meets_min

    return pd.concat([data[meets_min], data[meets_max_only]])