view table_pandas_rename_columns_regex.py @ 0:b722dba91064 draft default tip

planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/tables commit d0ff40eb2b536fec6c973c3a9ea8e7f31cd9a0d6
author recetox
date Wed, 29 Jan 2025 15:35:51 +0000
parents
children
line wrap: on
line source

import argparse
import logging
import re
from typing import List, Tuple


import pandas as pd
from utils import LoadDataAction, SplitColumnIndicesAction, StoreOutputAction


def rename_columns(
    df: pd.DataFrame, columns: List[int], regex_check: str, regex_replace: str
) -> pd.DataFrame:
    """
    Rename the columns at the given positions using a regex substitution.

    Columns are addressed strictly by position: only the selected columns are
    renamed, even when the dataframe contains duplicate column names or when a
    newly produced name equals the name of another column.

    Parameters:
    df (pd.DataFrame): The input dataframe; its column labels are updated in place.
    columns (List[int]): The 0-based indices of the columns to rename.
    regex_check (str): The regex pattern to search for in column names.
    regex_replace (str): The replacement template substituted for each match.

    Returns:
    pd.DataFrame: The same dataframe object, with the selected columns renamed.

    Raises:
    IndexError: If an index in `columns` is out of range.
    re.error: If the pattern or the replacement template is invalid.
    """
    try:
        original_names = list(df.columns)

        # Resolve every index first so an out-of-range position is reported
        # before any substitution is attempted.
        selected = [(index, original_names[index]) for index in columns]

        # Compute each new label from the ORIGINAL label at that position.
        # Renaming by label (df.rename) would also hit other columns that
        # share the label and would chain renames when a new name collides
        # with a column that is processed later.
        new_names = list(original_names)
        for index, name in selected:
            new_names[index] = re.sub(regex_check, regex_replace, name)

        # Single positional assignment; keep the name of the columns index.
        df.columns = pd.Index(new_names, name=df.columns.name)
        return df
    except IndexError as e:
        logging.error("Invalid column index: %s", e)
        raise
    except re.error as e:
        logging.error("Invalid regex pattern: %s", e)
        raise
    except Exception as e:
        logging.error("Error renaming columns: %s", e)
        raise


def main(
    input_dataset: pd.DataFrame,
    columns: List[int],
    regex_check: str,
    regex_replace: str,
    output_dataset: Tuple[callable, str],
) -> None:
    """
    Rename the selected columns of the dataset and write the result out.

    Parameters:
    input_dataset (pd.DataFrame): The dataframe whose columns are renamed.
    columns (List[int]): The 0-based indices of the columns to rename.
    regex_check (str): The regex pattern to check for in column names.
    regex_replace (str): The regex pattern to replace with in column names.
    output_dataset (Tuple[callable, str]): A pair of (writer, path); the writer
        is called as writer(dataframe, path) to store the result.

    Any exception is logged and then re-raised to the caller.
    """
    try:
        # Unpack first so a malformed output spec fails before any renaming.
        writer, destination = output_dataset
        renamed = rename_columns(input_dataset, columns, regex_check, regex_replace)
        writer(renamed, destination)
    except Exception as e:
        logging.error(f"Error in main function: {e}")
        raise


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Command-line options, registered in the order listed below so that the
    # generated --help output keeps its layout. Every option is mandatory.
    # NOTE(review): --columns is documented as 1-based here while the functions
    # in this file expect 0-based indices; presumably SplitColumnIndicesAction
    # performs the conversion — confirm in utils.
    option_specs = [
        (
            "--input_dataset",
            dict(
                nargs=2,
                action=LoadDataAction,
                help="Path to the input dataset and its file extension (csv, tsv, parquet)",
            ),
        ),
        (
            "--columns",
            dict(
                action=SplitColumnIndicesAction,
                help="Comma-separated list of 1-based indices of the columns to apply the transformation on",
            ),
        ),
        (
            "--regex_check",
            dict(
                type=str,
                help="Regex pattern to check for in column names",
            ),
        ),
        (
            "--regex_replace",
            dict(
                type=str,
                help="Regex pattern to replace with in column names",
            ),
        ),
        (
            "--output_dataset",
            dict(
                nargs=2,
                action=StoreOutputAction,
                help="Path to the output dataset and its file extension (csv, tsv, parquet)",
            ),
        ),
    ]

    cli = argparse.ArgumentParser(
        description="Apply regex-based transformations on multiple dataframe columns."
    )
    for flag, options in option_specs:
        cli.add_argument(flag, required=True, **options)

    parsed = cli.parse_args()
    main(
        input_dataset=parsed.input_dataset,
        columns=parsed.columns,
        regex_check=parsed.regex_check,
        regex_replace=parsed.regex_replace,
        output_dataset=parsed.output_dataset,
    )