Source code for pyBiodatafuse.data_loader

# coding: utf-8

"""Python script to conver a list of identifiers to a dataframe."""

import re
from typing import Optional

import pandas as pd


[docs] def create_df_from_file(file_path: str) -> pd.DataFrame: """Create a DataFrame from a file containing a list of identifiers. :param file_path: path to the file containing the list of identifiers :returns: a DataFrame containing the list of identifiers """ # Initialize an empty list to store the data data = [] # Open the file and read its contents with open(file_path, "r") as file: content = file.read() # Split the content using regular expressions to handle multiple delimiters (',' and '\n') identifiers = [val.strip() for val in re.split(r"[,\n]+", content) if val.strip()] data.extend(identifiers) # Create a DataFrame using pandas df = pd.DataFrame(data, columns=["identifier"]) return df
[docs] def create_df_from_text(text_input: str) -> pd.DataFrame: """Create a DataFrame from a text containing a list of identifiers. :param text_input: text containing the list of identifiers with each identifier on a new line. :returns: a DataFrame containing the list of identifiers """ # Initialize an empty list to store the data data = [] # Split the text using newline characters to create a list of identifiers identifiers = [val.strip() for val in text_input.split("\n") if val.strip()] data.extend(identifiers) # Create a DataFrame using pandas df = pd.DataFrame(data, columns=["identifier"]) return df
[docs] def create_df_from_dea(file_path: str) -> pd.DataFrame: """Read a dataframe containing the result of the differential expression analysis (DEA). :param file_path: path to the file containing the result of DEA :returns: the DEA dataframe with proper column name :raises ValueError: if the file is not value """ # Get the file extension file_extension = file_path.split(".")[-1].lower() if file_extension == "xlsx": # Read Excel file (xlsx) try: df = pd.read_excel(file_path) df = df.rename(columns={df.columns[0]: "identifier"}) return df except Exception as e: raise ValueError(f"Error reading Excel file: {str(e)}") if file_extension == "xls": # Read Excel file (xls) try: df = pd.read_excel(file_path, engine="xlrd") df = df.rename(columns={df.columns[0]: "identifier"}) return df except Exception as e: raise ValueError(f"Error reading Excel file: {str(e)}") elif file_extension == "csv" or file_extension == "txt": # Read CSV or text file try: delimiter = "," if file_extension == "csv" else "\t" df = pd.read_csv(file_path, sep=delimiter) df = df.rename(columns={df.columns[0]: "identifier"}) return df except Exception as e: raise ValueError(f"Error reading CSV/text file: {str(e)}") else: raise ValueError("Unsupported file format. Please provide an Excel, CSV, or TXT file.")
[docs] def filter_dea( data: pd.DataFrame, column_name: str, min_value: Optional[float] = None, max_value: Optional[float] = None, abs_value: Optional[float] = None, ) -> pd.DataFrame: """Filter the differential expression analysis (DEA) table. :param data: DEA dataframe :param column_name: the column to filter :param min_value: the minimum value :param max_value: the maximum value :param abs_value: the absolute value (when filtering for LogFoldChange) :returns: the filtered DEA dataframe :raises ValueError: if the paramaters are invalid """ if (min_value is not None or max_value is not None) and abs_value is not None: raise ValueError( "When providing abs_value, min_value and max_value should not be specified" ) elif min_value is None and max_value is None: filtered_abs_df = data[abs(data[column_name]) >= abs_value] return filtered_abs_df if abs_value is None: filtered_max_df = pd.DataFrame() filtered_abs_df = pd.DataFrame() if min_value is not None: filtered_min_df = data[data[column_name] >= min_value] if max_value is not None: filtered_max_df = data[data[column_name] <= max_value] filtered_df = pd.concat([filtered_min_df, filtered_max_df]) return filtered_df