Mercurial > repos > recetox > table_pandas_transform

diff table_pandas_rename_columns_regex.py @ 0:b722dba91064 draft default tip
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/tables commit d0ff40eb2b536fec6c973c3a9ea8e7f31cd9a0d6
author: recetox
date: Wed, 29 Jan 2025 15:35:51 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/table_pandas_rename_columns_regex.py	Wed Jan 29 15:35:51 2025 +0000
@@ -0,0 +1,117 @@
+import argparse
+import logging
+import re
+from typing import List, Tuple
+
+
+import pandas as pd
+from utils import LoadDataAction, SplitColumnIndicesAction, StoreOutputAction
+
+
+def rename_columns(
+    df: pd.DataFrame, columns: List[int], regex_check: str, regex_replace: str
+) -> pd.DataFrame:
+    """
+    Rename the columns at the given positions by regex substitution.
+
+    Renaming is positional, so duplicate labels never cause extra renames.
+
+    Parameters:
+    df (pd.DataFrame): The input dataframe; its columns are changed in place.
+    columns (List[int]): The 0-based indices of the columns to rename.
+    regex_check (str): The regex pattern to check for in column names.
+    regex_replace (str): The replacement string handed to the regex substitution.
+
+    Returns:
+    pd.DataFrame: The same dataframe object, with renamed columns.
+    """
+    try:
+        pattern = re.compile(regex_check)  # compiled once, before any mutation
+        old_labels = list(df.columns)  # snapshot: each index is renamed once
+        new_labels = list(old_labels)
+        for i in columns:
+            new_labels[i] = pattern.sub(regex_replace, old_labels[i])
+        df.columns = pd.Index(new_labels, name=df.columns.name)  # keep axis name
+        return df
+    except IndexError as e:
+        logging.error(f"Invalid column index: {e}")
+        raise
+    except re.error as e:
+        logging.error(f"Invalid regex pattern: {e}")
+        raise
+    except Exception as e:
+        logging.error(f"Error renaming columns: {e}")
+        raise
+
+
+def main(
+    input_dataset: pd.DataFrame,
+    columns: List[int],
+    regex_check: str,
+    regex_replace: str,
+    output_dataset: Tuple[callable, str],
+) -> None:
+    """
+    Rename the selected columns of a dataframe and hand the result to a writer.
+
+    Parameters:
+    input_dataset (pd.DataFrame): The dataframe whose columns are renamed.
+    columns (List[int]): The 0-based indices of the columns to rename.
+    regex_check, regex_replace (str): Pattern and replacement for the rename.
+    output_dataset (Tuple[callable, str]): The write function and the file path.
+    """
+    try:
+        writer, destination = output_dataset
+        renamed = rename_columns(input_dataset, columns, regex_check, regex_replace)
+        writer(renamed, destination)
+    except Exception as exc:
+        logging.error(f"Error in main function: {exc}")
+        raise
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO)
+
+    # Command line: input table, column selection, regex pair, output target.
+    cli = argparse.ArgumentParser(
+        description="Apply regex-based transformations on multiple dataframe columns."
+    )
+    # The custom argparse actions used below are imported from utils.
+    cli.add_argument(
+        "--input_dataset",
+        nargs=2,
+        action=LoadDataAction,
+        required=True,
+        help="Path to the input dataset and its file extension (csv, tsv, parquet)",
+    )
+    # NOTE(review): the help text says 1-based while rename_columns documents
+    # 0-based indices; presumably SplitColumnIndicesAction converts - confirm.
+    cli.add_argument(
+        "--columns",
+        action=SplitColumnIndicesAction,
+        required=True,
+        help="Comma-separated list of 1-based indices of the columns to apply the transformation on",
+    )
+    # The two regex options differ only in flag name and help text.
+    for flag, text in (
+        ("--regex_check", "Regex pattern to check for in column names"),
+        ("--regex_replace", "Regex pattern to replace with in column names"),
+    ):
+        cli.add_argument(flag, type=str, required=True, help=text)
+    cli.add_argument(
+        "--output_dataset",
+        nargs=2,
+        action=StoreOutputAction,
+        required=True,
+        help="Path to the output dataset and its file extension (csv, tsv, parquet)",
+    )
+
+    # Keyword arguments make the mapping from CLI options to main() explicit.
+    opts = cli.parse_args()
+    main(
+        input_dataset=opts.input_dataset,
+        columns=opts.columns,
+        regex_check=opts.regex_check,
+        regex_replace=opts.regex_replace,
+        output_dataset=opts.output_dataset,
+    )
author	recetox
date	Wed, 29 Jan 2025 15:35:51 +0000
parents
children