Mercurial > repos > iuc > table_compute
comparison scripts/table_compute.py @ 0:1b0f96ed73f2 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/table_compute commit 1ee75135483d5db22c540bc043746cd986f85762"
| author | iuc |
|---|---|
| date | Sat, 17 Aug 2019 16:25:37 -0400 |
| parents | |
| children | dddadbbac949 |
Comparison legend: equal · deleted · inserted · replaced
| -1:000000000000 | 0:1b0f96ed73f2 |
|---|---|
| 1 #!/usr/bin/env python3 | |
| 2 """ | |
| 3 Table Compute tool - a wrapper around pandas with parameter input validation. | |
| 4 """ | |
| 5 | |
| 6 __version__ = "0.8" | |
| 7 | |
| 8 import csv | |
| 9 import math | |
| 10 from sys import argv | |
| 11 | |
| 12 import numpy as np | |
| 13 import pandas as pd | |
| 14 import userconfig as uc | |
| 15 from safety import Safety | |
| 16 # This should be generated in the same directory | |
| 17 | |
# Version command should not need to copy the config
if len(argv) == 2 and argv[1] == "--version":
    print(__version__)
    # Reporting the version is a successful query, so exit with status 0.
    # The previous exit(-1) signalled failure to callers that check the
    # exit status (e.g. Galaxy's version_command or shell scripts).
    exit(0)
| 22 | |
| 23 | |
class Utils:
    """Name-to-callable lookups used by the parameter-driven dispatch below."""

    @staticmethod
    def getOneValueMathOp(op_name):
        """Return the single-argument function named op_name from the math
        module (e.g. "log", "sqrt")."""
        return getattr(math, op_name)

    @staticmethod
    def getVectorPandaOp(op_name):
        """Return the pd.DataFrame method named op_name — an axis/vector
        operator such as "sum" or "mean"."""
        return getattr(pd.DataFrame, op_name)

    @staticmethod
    def getTwoValuePandaOp(op_name, pd_obj):
        """Return the binary dunder operator "__<op_name>__" resolved on
        pd_obj's class, so it works for both DataFrame and Series."""
        dunder_name = "__{}__".format(op_name)
        return getattr(type(pd_obj), dunder_name)
| 39 | |
| 40 | |
# Math is imported but not directly used because users
# may specify a "math.<function>" when inserting a custom
# function. To remove linting errors, which break CI testing
# we will just use an arbitrary math statement here.
__ = math.log


# Set decimal precision
pd.options.display.precision = uc.Default["precision"]

# Script-level state shared by the dispatch below:
#   user_mode        - "single" or "multiple" table processing
#   user_mode_single - sub-mode within "single" (set later; stays None otherwise)
#   out_table        - the result table every branch must produce
#   params           - user-supplied operation parameters from the generated config
user_mode = uc.Default["user_mode"]
user_mode_single = None
out_table = None
params = uc.Data["params"]
| 55 | |
# ---------------------------------------------------------------------------
# Main dispatch. uc.Default["user_mode"] selects single-table or
# multi-table processing; every branch must leave out_table bound to a
# pandas DataFrame/Series (validated before write-out below).
# ---------------------------------------------------------------------------
if user_mode == "single":
    # Read in TSV file
    data = pd.read_csv(
        uc.Data["tables"][0]["reader_file"],
        header=uc.Data["tables"][0]["reader_header"],
        index_col=uc.Data["tables"][0]["reader_row_col"],
        keep_default_na=uc.Default["narm"],
        sep='\t'
    )
    # Fix whitespace issues in index or column names
    data.columns = [col.strip() if type(col) is str else col
                    for col in data.columns]
    data.index = [row.strip() if type(row) is str else row
                  for row in data.index]

    user_mode_single = params["user_mode_single"]

    if user_mode_single == "precision":
        # Useful for changing decimal precision on write out
        out_table = data

    elif user_mode_single == "select":
        # Positional (iloc-based) row/column subsetting.
        cols_specified = params["select_cols_wanted"]
        rows_specified = params["select_rows_wanted"]

        # Select all indexes if empty array of values
        if not cols_specified:
            cols_specified = range(len(data.columns))
        if not rows_specified:
            rows_specified = range(len(data))

        # do not use duplicate indexes
        # e.g. [2,3,2,5,5,4,2] to [2,3,5,4]
        # (order-preserving dedupe: keep the first occurrence of each index)
        nodupes_col = not params["select_cols_unique"]
        nodupes_row = not params["select_rows_unique"]

        if nodupes_col:
            cols_specified = [x for i, x in enumerate(cols_specified)
                              if x not in cols_specified[:i]]
        if nodupes_row:
            rows_specified = [x for i, x in enumerate(rows_specified)
                              if x not in rows_specified[:i]]

        out_table = data.iloc[rows_specified, cols_specified]

    elif user_mode_single == "filtersumval":
        # Keep only the rows (or columns) whose summary/elementwise test
        # passes; axis_bool is the boolean keep-mask along the chosen axis.
        mode = params["filtersumval_mode"]
        axis = params["filtersumval_axis"]
        operation = params["filtersumval_op"]
        compare_operation = params["filtersumval_compare"]
        value = params["filtersumval_against"]
        minmatch = params["filtersumval_minmatch"]

        if mode == "operation":
            # Perform axis operation
            summary_op = Utils.getVectorPandaOp(operation)
            axis_summary = summary_op(data, axis=axis)
            # Perform vector comparison
            compare_op = Utils.getTwoValuePandaOp(
                compare_operation, axis_summary
            )
            axis_bool = compare_op(axis_summary, value)

        elif mode == "element":
            if operation.startswith("str_"):
                # String comparison: cast the whole table and the value.
                data = data.astype("str")
                value = str(value)
                # Convert str_eq to eq
                operation = operation[4:]
            else:
                value = float(value)

            op = Utils.getTwoValuePandaOp(operation, data)
            bool_mat = op(data, value)
            # Keep an axis entry when at least minmatch elements match.
            axis_bool = np.sum(bool_mat, axis=axis) >= minmatch

        # NOTE(review): if mode is neither "operation" nor "element",
        # axis_bool is never assigned and this line raises NameError —
        # presumably the config generator only emits those two values.
        out_table = data.loc[:, axis_bool] if axis == 0 else data.loc[axis_bool, :]

    elif user_mode_single == "matrixapply":
        # 0 - column, 1 - row
        axis = params["matrixapply_dimension"]
        # sd, mean, max, min, sum, median, summary
        operation = params["matrixapply_op"]

        if operation is None:
            use_custom = params["matrixapply_custom"]
            if use_custom:
                custom_func = params["matrixapply_custom_func"]

                # Placeholder so the name exists for linters; the exec
                # below rebinds "fun" to the user's generated function.
                def fun(vec):
                    """Dummy Function"""
                    return vec

                ss = Safety(custom_func, ['vec'], 'pd.Series')
                fun_string = ss.generateFunction()
                # NOTE(review): executes user-derived code; Safety is the
                # only guard — confirm it validates the expression.
                exec(fun_string)  # SUPER DUPER SAFE...

                out_table = data.apply(fun, axis)
            else:
                print("No operation given")
                exit(-1)
        else:
            op = getattr(pd.DataFrame, operation)
            out_table = op(data, axis)

    elif user_mode_single == "element":
        # Elementwise transformation of the cells selected by bool_mat.
        # lt, gt, ge, etc.
        operation = params["element_op"]
        if operation is not None:
            op = Utils.getTwoValuePandaOp(operation, data)
            value = params["element_value"]
            try:
                # Could be numeric
                value = float(value)
            except ValueError:
                pass
            # generate filter matrix of True/False values
            bool_mat = op(data, value)
        else:
            # implement no filtering through a filter matrix filled with
            # True values.
            bool_mat = np.full(data.shape, True)

        # Get the main processing mode
        mode = params["element_mode"]
        if mode == "replace":
            # Selected cells become the literal replacement value.
            replacement_val = params["element_replace"]
            out_table = data.mask(bool_mat, replacement_val)
        elif mode == "modify":
            # Apply a one-argument math function only where bool_mat holds;
            # unselected cells pass through unchanged via mask().
            mod_op = Utils.getOneValueMathOp(params["element_modify_op"])
            out_table = data.mask(
                bool_mat, data.where(bool_mat).applymap(mod_op)
            )
        elif mode == "scale":
            # Apply a binary operator (e.g. mul, add) with a scalar operand
            # to the selected cells only.
            scale_op = Utils.getTwoValuePandaOp(
                params["element_scale_op"], data
            )
            scale_value = params["element_scale_value"]
            out_table = data.mask(
                bool_mat, scale_op(data.where(bool_mat), scale_value)
            )
        elif mode == "custom":
            element_customop = params["element_customop"]

            # Placeholder rebound by the exec below.
            def fun(elem):
                """Dummy Function"""
                return elem

            ss = Safety(element_customop, ['elem'])
            fun_string = ss.generateFunction()
            # NOTE(review): executes user-derived code; Safety is the
            # only guard — confirm it validates the expression.
            exec(fun_string)  # SUPER DUPER SAFE...

            out_table = data.mask(
                bool_mat, data.where(bool_mat).applymap(fun)
            )
        else:
            print("No such element mode!", mode)
            exit(-1)

    elif user_mode_single == "fulltable":
        # Whole-table reshaping or a custom whole-table expression.
        general_mode = params["mode"]

        if general_mode == "melt":
            melt_ids = params["MELT"]["melt_ids"]
            melt_values = params["MELT"]["melt_values"]

            out_table = pd.melt(data, id_vars=melt_ids, value_vars=melt_values)
        elif general_mode == "pivot":
            pivot_index = params["PIVOT"]["pivot_index"]
            pivot_column = params["PIVOT"]["pivot_column"]
            pivot_values = params["PIVOT"]["pivot_values"]

            out_table = data.pivot(
                index=pivot_index, columns=pivot_column, values=pivot_values
            )
        elif general_mode == "custom":
            custom_func = params["fulltable_customop"]

            # Placeholder rebound by the exec below.
            def fun(tableau):
                """Dummy Function"""
                return tableau

            ss = Safety(custom_func, ['table'], 'pd.DataFrame')
            fun_string = ss.generateFunction()
            # NOTE(review): executes user-derived code; Safety is the
            # only guard — confirm it validates the expression.
            exec(fun_string)  # SUPER DUPER SAFE...

            out_table = fun(data)

        else:
            print("No such mode!", user_mode_single)
            exit(-1)


elif user_mode == "multiple":
    # Multi-table mode: read every configured table and evaluate one
    # user expression that may reference "table1", "table2", ...

    table_sections = uc.Data["tables"]

    if not table_sections:
        print("Multiple table sets not given!")
        exit(-1)

    # Read but never used in this revision.
    reader_skip = uc.Default["reader_skip"]

    # Data
    table = []
    # 1-based handlers for users "table1", "table2", etc.
    table_names = []
    # Actual 0-based references "table[0]", "table[1]", etc.
    table_names_real = []

    # Read and populate tables
    for x, t_sect in enumerate(table_sections):
        tmp = pd.read_csv(
            t_sect["file"],
            header=t_sect["header"],
            index_col=t_sect["row_names"],
            keep_default_na=uc.Default["narm"],
            sep="\t"
        )
        table.append(tmp)
        table_names.append("table" + str(x + 1))
        table_names_real.append("table[" + str(x) + "]")

    custom_op = params["fulltable_customop"]
    ss = Safety(custom_op, table_names, 'pd.DataFrame')
    fun_string = ss.generateFunction()
    # Change the argument to table
    # (drop the generated per-table parameter list first...)
    fun_string = fun_string.replace("fun(table1):", "fun():")
    # Rewrite user-facing names into list lookups:
    # "table1" -> "table[0]", "table2" -> "table[1]", etc.
    for name, name_real in zip(table_names, table_names_real):
        fun_string = fun_string.replace(name, name_real)

    # ...then take the whole list as the single argument.
    fun_string = fun_string.replace("fun():", "fun(table):")
    # NOTE(review): executes user-derived code; Safety is the only
    # guard — confirm it validates the expression.
    exec(fun_string)  # SUPER DUPER SAFE...
    out_table = fun(table)

else:
    print("No such mode!", user_mode)
    exit(-1)
| 295 | |
# ---------------------------------------------------------------------------
# Write-out: every dispatch branch above must have produced a pandas object.
# ---------------------------------------------------------------------------
if not isinstance(out_table, (pd.DataFrame, pd.Series)):
    print('The specified operation did not result in a table to return.')
    raise RuntimeError(
        'The operation did not generate a pd.DataFrame or pd.Series to return.'
    )
out_parameters = {
    "sep": "\t",
    # "%%.%df" renders e.g. "%.6f": fixed decimal places matching the
    # configured display precision.
    "float_format": "%%.%df" % pd.options.display.precision,
    "header": uc.Default["out_headers_col"],
    "index": uc.Default["out_headers_row"]
}
# Disable csv quoting for most modes; matrixapply and multi-table mode
# (where user_mode_single stays None) keep the csv default — presumably
# their output can embed separator characters; TODO confirm.
if user_mode_single not in ('matrixapply', None):
    out_parameters["quoting"] = csv.QUOTE_NONE

out_table.to_csv(uc.Default["outtable"], **out_parameters)
