comparison scripts/table_compute.py @ 0:1b0f96ed73f2 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/table_compute commit 1ee75135483d5db22c540bc043746cd986f85762"
author iuc
date Sat, 17 Aug 2019 16:25:37 -0400
parents
children dddadbbac949
comparison
equal deleted inserted replaced
-1:000000000000 0:1b0f96ed73f2
#!/usr/bin/env python3
"""
Table Compute tool - a wrapper around pandas with parameter input validation.
"""

__version__ = "0.8"

import csv
import math
from sys import argv

import numpy as np
import pandas as pd
import userconfig as uc
from safety import Safety
# This should be generated in the same directory

# Version command should not need to copy the config
if len(argv) == 2 and argv[1] == "--version":
    print(__version__)
    # A successful version query is not an error: exit with status 0
    # (the previous exit(-1) reported failure to the calling shell).
    exit(0)
23
class Utils:
    """Resolve operator names from the tool config into callables."""

    @staticmethod
    def getOneValueMathOp(op_name):
        """Look up a single-argument math function (log, sqrt, ...) by name."""
        return getattr(math, op_name)

    @staticmethod
    def getVectorPandaOp(op_name):
        """Look up a DataFrame-level vector operation (sum, mean, ...) by name."""
        return getattr(pd.DataFrame, op_name)

    @staticmethod
    def getTwoValuePandaOp(op_name, pd_obj):
        """Look up a binary dunder operator (add, lt, ...) on pd_obj's type."""
        dunder_name = "__{}__".format(op_name)
        return getattr(type(pd_obj), dunder_name)
39
40
41 # Math is imported but not directly used because users
42 # may specify a "math.<function>" when inserting a custom
43 # function. To remove linting errors, which break CI testing
44 # we will just use an arbitrary math statement here.
45 __ = math.log
46
47
48 # Set decimal precision
49 pd.options.display.precision = uc.Default["precision"]
50
51 user_mode = uc.Default["user_mode"]
52 user_mode_single = None
53 out_table = None
54 params = uc.Data["params"]
55
if user_mode == "single":
    # Read in TSV file
    data = pd.read_csv(
        uc.Data["tables"][0]["reader_file"],
        header=uc.Data["tables"][0]["reader_header"],
        index_col=uc.Data["tables"][0]["reader_row_col"],
        keep_default_na=uc.Default["narm"],
        sep='\t'
    )
    # Fix whitespace issues in index or column names; non-string labels
    # (e.g. integer positions) are passed through unchanged.
    data.columns = [col.strip() if type(col) is str else col
                    for col in data.columns]
    data.index = [row.strip() if type(row) is str else row
                  for row in data.index]

    user_mode_single = params["user_mode_single"]

    if user_mode_single == "precision":
        # Useful for changing decimal precision on write out:
        # the table passes through untouched and only the to_csv
        # float_format (set from uc.Default["precision"]) applies.
        out_table = data

    elif user_mode_single == "select":
        cols_specified = params["select_cols_wanted"]
        rows_specified = params["select_rows_wanted"]

        # Select all indexes if empty array of values
        if not cols_specified:
            cols_specified = range(len(data.columns))
        if not rows_specified:
            rows_specified = range(len(data))

        # do not use duplicate indexes
        # e.g. [2,3,2,5,5,4,2] to [2,3,5,4]
        # NOTE: "unique" params are inverted — a falsy select_*_unique
        # means duplicates are dropped (first occurrence kept, order preserved).
        nodupes_col = not params["select_cols_unique"]
        nodupes_row = not params["select_rows_unique"]

        if nodupes_col:
            cols_specified = [x for i, x in enumerate(cols_specified)
                              if x not in cols_specified[:i]]
        if nodupes_row:
            rows_specified = [x for i, x in enumerate(rows_specified)
                              if x not in rows_specified[:i]]

        # Positional selection: indices here are 0-based row/column positions.
        out_table = data.iloc[rows_specified, cols_specified]

    elif user_mode_single == "filtersumval":
        mode = params["filtersumval_mode"]
        axis = params["filtersumval_axis"]
        operation = params["filtersumval_op"]
        compare_operation = params["filtersumval_compare"]
        value = params["filtersumval_against"]
        minmatch = params["filtersumval_minmatch"]

        if mode == "operation":
            # Perform axis operation (e.g. sum/mean along the given axis),
            # yielding one summary value per row or column.
            summary_op = Utils.getVectorPandaOp(operation)
            axis_summary = summary_op(data, axis=axis)
            # Perform vector comparison of each summary value against `value`.
            compare_op = Utils.getTwoValuePandaOp(
                compare_operation, axis_summary
            )
            axis_bool = compare_op(axis_summary, value)

        elif mode == "element":
            # Element-wise compare, then keep rows/columns where at least
            # `minmatch` elements satisfy the comparison.
            if operation.startswith("str_"):
                # String comparison: cast the whole table to str first.
                data = data.astype("str")
                value = str(value)
                # Convert str_eq to eq
                operation = operation[4:]
            else:
                value = float(value)

            op = Utils.getTwoValuePandaOp(operation, data)
            bool_mat = op(data, value)
            axis_bool = np.sum(bool_mat, axis=axis) >= minmatch

        # axis == 0 summarised down columns, so the boolean mask selects
        # columns; otherwise it selects rows.
        out_table = data.loc[:, axis_bool] if axis == 0 else data.loc[axis_bool, :]

    elif user_mode_single == "matrixapply":
        # 0 - column, 1 - row
        axis = params["matrixapply_dimension"]
        # sd, mean, max, min, sum, median, summary
        operation = params["matrixapply_op"]

        if operation is None:
            use_custom = params["matrixapply_custom"]
            if use_custom:
                custom_func = params["matrixapply_custom_func"]

                def fun(vec):
                    """Dummy Function"""
                    return vec

                # Safety validates the user expression; the generated code
                # then rebinds `fun` via exec.
                # NOTE(review): exec of user-derived code — safety relies
                # entirely on the Safety class's validation.
                ss = Safety(custom_func, ['vec'], 'pd.Series')
                fun_string = ss.generateFunction()
                exec(fun_string)  # SUPER DUPER SAFE...

                out_table = data.apply(fun, axis)
            else:
                print("No operation given")
                exit(-1)
        else:
            op = getattr(pd.DataFrame, operation)
            out_table = op(data, axis)

    elif user_mode_single == "element":
        # lt, gt, ge, etc.
        operation = params["element_op"]
        if operation is not None:
            op = Utils.getTwoValuePandaOp(operation, data)
            value = params["element_value"]
            try:
                # Could be numeric
                value = float(value)
            except ValueError:
                pass
            # generate filter matrix of True/False values
            bool_mat = op(data, value)
        else:
            # implement no filtering through a filter matrix filled with
            # True values.
            bool_mat = np.full(data.shape, True)

        # Get the main processing mode
        mode = params["element_mode"]
        if mode == "replace":
            # Replace every element where bool_mat is True with the
            # given replacement value.
            replacement_val = params["element_replace"]
            out_table = data.mask(bool_mat, replacement_val)
        elif mode == "modify":
            # Apply a one-argument math function only to masked elements;
            # unmasked elements are kept unchanged by the outer mask().
            mod_op = Utils.getOneValueMathOp(params["element_modify_op"])
            out_table = data.mask(
                bool_mat, data.where(bool_mat).applymap(mod_op)
            )
        elif mode == "scale":
            # Apply a binary operator (e.g. mul/add) with a scalar to the
            # masked elements only.
            scale_op = Utils.getTwoValuePandaOp(
                params["element_scale_op"], data
            )
            scale_value = params["element_scale_value"]
            out_table = data.mask(
                bool_mat, scale_op(data.where(bool_mat), scale_value)
            )
        elif mode == "custom":
            element_customop = params["element_customop"]

            def fun(elem):
                """Dummy Function"""
                return elem

            # NOTE(review): exec of user-derived code — safety relies on
            # the Safety class's validation.
            ss = Safety(element_customop, ['elem'])
            fun_string = ss.generateFunction()
            exec(fun_string)  # SUPER DUPER SAFE...

            out_table = data.mask(
                bool_mat, data.where(bool_mat).applymap(fun)
            )
        else:
            print("No such element mode!", mode)
            exit(-1)

    elif user_mode_single == "fulltable":
        general_mode = params["mode"]

        if general_mode == "melt":
            # Wide-to-long reshape.
            melt_ids = params["MELT"]["melt_ids"]
            melt_values = params["MELT"]["melt_values"]

            out_table = pd.melt(data, id_vars=melt_ids, value_vars=melt_values)
        elif general_mode == "pivot":
            # Long-to-wide reshape.
            pivot_index = params["PIVOT"]["pivot_index"]
            pivot_column = params["PIVOT"]["pivot_column"]
            pivot_values = params["PIVOT"]["pivot_values"]

            out_table = data.pivot(
                index=pivot_index, columns=pivot_column, values=pivot_values
            )
        elif general_mode == "custom":
            custom_func = params["fulltable_customop"]

            def fun(tableau):
                """Dummy Function"""
                return tableau

            # NOTE(review): exec of user-derived code — safety relies on
            # the Safety class's validation.
            ss = Safety(custom_func, ['table'], 'pd.DataFrame')
            fun_string = ss.generateFunction()
            exec(fun_string)  # SUPER DUPER SAFE...

            out_table = fun(data)

        else:
            print("No such mode!", user_mode_single)
            exit(-1)


elif user_mode == "multiple":

    table_sections = uc.Data["tables"]

    if not table_sections:
        print("Multiple table sets not given!")
        exit(-1)

    # NOTE(review): reader_skip is read but never used below — presumably
    # intended for read_csv's skiprows; verify against the tool wrapper.
    reader_skip = uc.Default["reader_skip"]

    # Data
    table = []
    # 1-based handlers for users "table1", "table2", etc.
    table_names = []
    # Actual 0-based references "table[0]", "table[1]", etc.
    table_names_real = []

    # Read and populate tables
    for x, t_sect in enumerate(table_sections):
        tmp = pd.read_csv(
            t_sect["file"],
            header=t_sect["header"],
            index_col=t_sect["row_names"],
            keep_default_na=uc.Default["narm"],
            sep="\t"
        )
        table.append(tmp)
        table_names.append("table" + str(x + 1))
        table_names_real.append("table[" + str(x) + "]")

    custom_op = params["fulltable_customop"]
    ss = Safety(custom_op, table_names, 'pd.DataFrame')
    fun_string = ss.generateFunction()
    # Change the argument to table
    fun_string = fun_string.replace("fun(table1):", "fun():")
    # table1 to table[1]
    # NOTE(review): naive str.replace breaks with 10+ tables —
    # "table10" contains "table1", so it becomes "table[0]0" before
    # "table10" is ever matched. Needs word-boundary-aware replacement.
    for name, name_real in zip(table_names, table_names_real):
        fun_string = fun_string.replace(name, name_real)

    fun_string = fun_string.replace("fun():", "fun(table):")
    exec(fun_string)  # SUPER DUPER SAFE...
    out_table = fun(table)

else:
    print("No such mode!", user_mode)
    exit(-1)
295
# Guard: every branch above must have produced a pandas table to serialise.
if not isinstance(out_table, (pd.DataFrame, pd.Series)):
    print('The specified operation did not result in a table to return.')
    raise RuntimeError(
        'The operation did not generate a pd.DataFrame or pd.Series to return.'
    )

# Assemble keyword arguments for to_csv; float precision follows the
# display precision configured from uc.Default["precision"] earlier.
writer_options = dict(
    sep="\t",
    float_format="%%.%df" % pd.options.display.precision,
    header=uc.Default["out_headers_col"],
    index=uc.Default["out_headers_row"],
)
# matrixapply output — and the multiple-tables path, where
# user_mode_single remains None — keep default quoting; every other
# single-table mode writes with quoting disabled.
if user_mode_single not in ('matrixapply', None):
    writer_options["quoting"] = csv.QUOTE_NONE

out_table.to_csv(uc.Default["outtable"], **writer_options)