Mercurial > repos > iuc > table_compute
diff scripts/table_compute.py @ 0:1b0f96ed73f2 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/table_compute commit 1ee75135483d5db22c540bc043746cd986f85762"
author | iuc |
---|---|
date | Sat, 17 Aug 2019 16:25:37 -0400 |
parents | |
children | dddadbbac949 |
line wrap: on
line diff
#!/usr/bin/env python3
"""
Table Compute tool - a wrapper around pandas with parameter input validation.
"""

__version__ = "0.8"

import csv
import math
from sys import argv

import numpy as np
import pandas as pd
import userconfig as uc
from safety import Safety
# This should be generated in the same directory

# Version command should not need to copy the config
if len(argv) == 2 and argv[1] == "--version":
    print(__version__)
    # BUG FIX: reporting the version is a successful operation, so exit
    # with status 0. The previous exit(-1) became exit status 255, which
    # any caller inspecting the return code would interpret as a failure.
    exit(0)


class Utils:
    """Helpers that resolve user-supplied operator names to callables."""

    @staticmethod
    def getOneValueMathOp(op_name):
        """Return a one-argument math operator such as log, sqrt, etc.

        Looks the name up on the stdlib ``math`` module, so only real
        ``math`` functions resolve; anything else raises AttributeError.
        """
        return getattr(math, op_name)

    @staticmethod
    def getVectorPandaOp(op_name):
        """Return a DataFrame-level vector operator (e.g. ``pd.DataFrame.sum``)."""
        return getattr(pd.DataFrame, op_name)

    @staticmethod
    def getTwoValuePandaOp(op_name, pd_obj):
        """Return a two-operand dunder operator bound to ``pd_obj``'s type.

        E.g. ``op_name="lt"`` on a DataFrame yields ``pd.DataFrame.__lt__``,
        which works element-wise when called as ``op(df, value)``.
        """
        return getattr(type(pd_obj), "__" + op_name + "__")


# Math is imported but not directly used because users
# may specify a "math.<function>" when inserting a custom
# function. To remove linting errors, which break CI testing
# we will just use an arbitrary math statement here.
# Arbitrary use of math so linters do not flag the import (see note above).
__ = math.log


# Set decimal precision for both display and the final float_format write-out.
pd.options.display.precision = uc.Default["precision"]

user_mode = uc.Default["user_mode"]
user_mode_single = None
out_table = None
params = uc.Data["params"]

if user_mode == "single":
    # Read in TSV file. All reader_* settings come from the generated
    # userconfig module; only the first table section is used in this mode.
    data = pd.read_csv(
        uc.Data["tables"][0]["reader_file"],
        header=uc.Data["tables"][0]["reader_header"],
        index_col=uc.Data["tables"][0]["reader_row_col"],
        keep_default_na=uc.Default["narm"],
        sep='\t'
    )
    # Fix whitespace issues in index or column names (labels may be
    # non-strings, e.g. ints, when no header row was given).
    data.columns = [col.strip() if type(col) is str else col
                    for col in data.columns]
    data.index = [row.strip() if type(row) is str else row
                  for row in data.index]

    user_mode_single = params["user_mode_single"]

    if user_mode_single == "precision":
        # Useful for changing decimal precision on write out:
        # pass the table through unchanged; precision applies at to_csv.
        out_table = data

    elif user_mode_single == "select":
        cols_specified = params["select_cols_wanted"]
        rows_specified = params["select_rows_wanted"]

        # Select all indexes if empty array of values
        if not cols_specified:
            cols_specified = range(len(data.columns))
        if not rows_specified:
            rows_specified = range(len(data))

        # do not use duplicate indexes
        # e.g. [2,3,2,5,5,4,2] to [2,3,5,4]
        # NOTE: the params are named *_unique, so nodupes is their negation.
        nodupes_col = not params["select_cols_unique"]
        nodupes_row = not params["select_rows_unique"]

        if nodupes_col:
            # Order-preserving de-duplication (keeps first occurrence).
            cols_specified = [x for i, x in enumerate(cols_specified)
                              if x not in cols_specified[:i]]
        if nodupes_row:
            rows_specified = [x for i, x in enumerate(rows_specified)
                              if x not in rows_specified[:i]]

        out_table = data.iloc[rows_specified, cols_specified]

    elif user_mode_single == "filtersumval":
        # Keep rows/columns whose summary statistic, or per-element match
        # count, satisfies a comparison against a user value.
        mode = params["filtersumval_mode"]
        axis = params["filtersumval_axis"]
        operation = params["filtersumval_op"]
        compare_operation = params["filtersumval_compare"]
        value = params["filtersumval_against"]
        minmatch = params["filtersumval_minmatch"]

        if mode == "operation":
            # Perform axis operation (e.g. sum/mean along rows or columns)
            summary_op = Utils.getVectorPandaOp(operation)
            axis_summary = summary_op(data, axis=axis)
            # Perform vector comparison, yielding a boolean Series
            compare_op = Utils.getTwoValuePandaOp(
                compare_operation, axis_summary
            )
            axis_bool = compare_op(axis_summary, value)

        elif mode == "element":
            if operation.startswith("str_"):
                # String comparison: cast the whole table and value to str.
                data = data.astype("str")
                value = str(value)
                # Convert str_eq to eq
                operation = operation[4:]
            else:
                value = float(value)

            op = Utils.getTwoValuePandaOp(operation, data)
            bool_mat = op(data, value)
            # Keep labels where at least `minmatch` elements matched.
            axis_bool = np.sum(bool_mat, axis=axis) >= minmatch

        # axis == 0 summarised down columns, so the boolean selects columns;
        # otherwise it selects rows.
        out_table = data.loc[:, axis_bool] if axis == 0 else data.loc[axis_bool, :]

    elif user_mode_single == "matrixapply":
        # 0 - column, 1 - row
        axis = params["matrixapply_dimension"]
        # sd, mean, max, min, sum, median, summary
        operation = params["matrixapply_op"]

        if operation is None:
            use_custom = params["matrixapply_custom"]
            if use_custom:
                custom_func = params["matrixapply_custom_func"]

                def fun(vec):
                    """Dummy Function"""
                    # Placeholder so linters see `fun`; replaced by exec below.
                    return vec

                ss = Safety(custom_func, ['vec'], 'pd.Series')
                fun_string = ss.generateFunction()
                # NOTE(review): executes user-derived (Safety-validated)
                # code at module scope, rebinding `fun`.
                exec(fun_string)  # SUPER DUPER SAFE...

                out_table = data.apply(fun, axis)
            else:
                print("No operation given")
                exit(-1)
        else:
            op = getattr(pd.DataFrame, operation)
            out_table = op(data, axis)

    elif user_mode_single == "element":
        # Element-wise transformation, optionally restricted by a filter.
        # lt, gt, ge, etc.
        operation = params["element_op"]
        if operation is not None:
            op = Utils.getTwoValuePandaOp(operation, data)
            value = params["element_value"]
            try:
                # Could be numeric
                value = float(value)
            except ValueError:
                pass
            # generate filter matrix of True/False values
            bool_mat = op(data, value)
        else:
            # implement no filtering through a filter matrix filled with
            # True values.
            bool_mat = np.full(data.shape, True)

        # Get the main processing mode
        mode = params["element_mode"]
        if mode == "replace":
            # Replace every matching element with a constant.
            replacement_val = params["element_replace"]
            out_table = data.mask(bool_mat, replacement_val)
        elif mode == "modify":
            # Apply a one-argument math function only where the filter is True.
            mod_op = Utils.getOneValueMathOp(params["element_modify_op"])
            out_table = data.mask(
                bool_mat, data.where(bool_mat).applymap(mod_op)
            )
        elif mode == "scale":
            # Apply a two-operand operator (e.g. mul/add) with a scalar,
            # only where the filter is True.
            scale_op = Utils.getTwoValuePandaOp(
                params["element_scale_op"], data
            )
            scale_value = params["element_scale_value"]
            out_table = data.mask(
                bool_mat, scale_op(data.where(bool_mat), scale_value)
            )
        elif mode == "custom":
            element_customop = params["element_customop"]

            def fun(elem):
                """Dummy Function"""
                # Placeholder so linters see `fun`; replaced by exec below.
                return elem

            ss = Safety(element_customop, ['elem'])
            fun_string = ss.generateFunction()
            exec(fun_string)  # SUPER DUPER SAFE...

            out_table = data.mask(
                bool_mat, data.where(bool_mat).applymap(fun)
            )
        else:
            print("No such element mode!", mode)
            exit(-1)

    elif user_mode_single == "fulltable":
        # Whole-table reshaping or a custom table -> table function.
        general_mode = params["mode"]

        if general_mode == "melt":
            melt_ids = params["MELT"]["melt_ids"]
            melt_values = params["MELT"]["melt_values"]

            out_table = pd.melt(data, id_vars=melt_ids, value_vars=melt_values)
        elif general_mode == "pivot":
            pivot_index = params["PIVOT"]["pivot_index"]
            pivot_column = params["PIVOT"]["pivot_column"]
            pivot_values = params["PIVOT"]["pivot_values"]

            # pd.DataFrame.pivot raises if index/column pairs are duplicated;
            # presumably the tool UI guards against that — verify upstream.
            out_table = data.pivot(
                index=pivot_index, columns=pivot_column, values=pivot_values
            )
        elif general_mode == "custom":
            custom_func = params["fulltable_customop"]

            def fun(tableau):
                """Dummy Function"""
                # Placeholder so linters see `fun`; replaced by exec below.
                return tableau

            ss = Safety(custom_func, ['table'], 'pd.DataFrame')
            fun_string = ss.generateFunction()
            exec(fun_string)  # SUPER DUPER SAFE...

            out_table = fun(data)

    else:
        print("No such mode!", user_mode_single)
        exit(-1)


elif user_mode == "multiple":

    table_sections = uc.Data["tables"]

    if not table_sections:
        print("Multiple table sets not given!")
        exit(-1)

    # NOTE(review): read but never used below — confirm whether skipping
    # rows was meant to be wired into pd.read_csv.
    reader_skip = uc.Default["reader_skip"]

    # Data
    table = []
    # 1-based handlers for users "table1", "table2", etc.
    table_names = []
    # Actual 0-based references "table[0]", "table[1]", etc.
    table_names_real = []

    # Read and populate tables
    for x, t_sect in enumerate(table_sections):
        tmp = pd.read_csv(
            t_sect["file"],
            header=t_sect["header"],
            index_col=t_sect["row_names"],
            keep_default_na=uc.Default["narm"],
            sep="\t"
        )
        table.append(tmp)
        table_names.append("table" + str(x + 1))
        table_names_real.append("table[" + str(x) + "]")

    custom_op = params["fulltable_customop"]
    ss = Safety(custom_op, table_names, 'pd.DataFrame')
    fun_string = ss.generateFunction()
    # Change the argument to table
    fun_string = fun_string.replace("fun(table1):", "fun():")
    # table1 to table[1]
    # NOTE(review): plain substring replacement — with 10+ tables,
    # "table1" also matches inside "table10" and corrupts that name.
    for name, name_real in zip(table_names, table_names_real):
        fun_string = fun_string.replace(name, name_real)

    fun_string = fun_string.replace("fun():", "fun(table):")
    exec(fun_string)  # SUPER DUPER SAFE...
    out_table = fun(table)

else:
    print("No such mode!", user_mode)
    exit(-1)

# Every branch must have produced a pandas object, otherwise the custom
# user function returned something unwritable.
if not isinstance(out_table, (pd.DataFrame, pd.Series)):
    print('The specified operation did not result in a table to return.')
    raise RuntimeError(
        'The operation did not generate a pd.DataFrame or pd.Series to return.'
    )
out_parameters = {
    "sep": "\t",
    # "%%.%df" first expands to e.g. "%.6f", the float format for to_csv.
    "float_format": "%%.%df" % pd.options.display.precision,
    "header": uc.Default["out_headers_col"],
    "index": uc.Default["out_headers_row"]
}
if user_mode_single not in ('matrixapply', None):
    # Suppress quoting so string cells are written verbatim; matrixapply
    # output (and multiple mode) keeps pandas' default quoting.
    out_parameters["quoting"] = csv.QUOTE_NONE

out_table.to_csv(uc.Default["outtable"], **out_parameters)