Mercurial > repos > iuc > filter_tabular
changeset 9:69b08fc9557c draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/query_tabular commit daa9af57fe07ee83a45ddc9f855716f9d14a8e12"
author | iuc |
---|---|
date | Sat, 12 Sep 2020 01:21:45 +0000 |
parents | 59278960a2e7 |
children | 9cc411a62277 |
files | filter_tabular.xml filters.py load_db.py macros.xml query_db.py test-data/filtered_customers_results.tsv |
diffstat | 6 files changed, 88 insertions(+), 8 deletions(-) [+] |
line wrap: on
line diff
--- a/filter_tabular.xml Thu Jan 23 07:36:21 2020 -0500 +++ b/filter_tabular.xml Sat Sep 12 01:21:45 2020 +0000 @@ -1,4 +1,4 @@ -<tool id="filter_tabular" name="Filter Tabular" version="2.0.1"> +<tool id="filter_tabular" name="Filter Tabular" version="3.1.0"> <description></description> <macros> @@ -101,6 +101,35 @@ <output name="output" file="filtered_pets_results.tsv"/> </test> + <test> + <param name="input" ftype="tabular" value="customers.tsv"/> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="regex"/> + <param name="regex_pattern" value="^\d+"/> + <param name="regex_action" value="include_find"/> + </conditional> + </repeat> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="replace"/> + <param name="column" value="c4"/> + <param name="regex_pattern" value=".*@(.*)"/> + <param name="regex_replace" value="\1"/> + <param name="add" value="after"/> + </conditional> + </repeat> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="replace"/> + <param name="column" value="c4"/> + <param name="regex_pattern" value=".*(com|org|net)$"/> + <param name="regex_replace" value="\1"/> + <param name="add" value="append"/> + </conditional> + </repeat> + <output name="output" file="filtered_customers_results.tsv"/> + </test> </tests> <help><![CDATA[ ==============
--- a/filters.py Thu Jan 23 07:36:21 2020 -0500 +++ b/filters.py Sat Sep 12 01:21:45 2020 +0000 @@ -32,9 +32,20 @@ p = filter_dict['pattern'] r = filter_dict['replace'] c = int(filter_dict['column']) - 1 - self.func = lambda i, l: '\t'.join( - [x if j != c else re.sub(p, r, x) - for j, x in enumerate(l.split('\t'))]) + if 'add' not in filter_dict\ + or filter_dict['add'] not in ['prepend', + 'append', + 'before', + 'after']: + self.func = lambda i, l: '\t'.join( + [x if j != c else re.sub(p, r, x) + for j, x in enumerate(l.split('\t'))]) + else: + a = 0 if filter_dict['add'] == 'prepend'\ + else min(0, c - 1) if filter_dict['add'] == 'before'\ + else c + 1 if filter_dict['add'] == 'after'\ + else None + self.func = lambda i, l: self.replace_add(l, p, r, c, a) elif filter_dict['filter'] == 'prepend_line_num': self.func = lambda i, l: '%d\t%s' % (i, l) elif filter_dict['filter'] == 'append_line_num': @@ -69,6 +80,14 @@ fields = line.split('\t') return '\t'.join([fields[x] for x in cols]) + def replace_add(self, line, pat, rep, col, pos): + fields = line.rstrip('\r\n').split('\t') + i = pos if pos else len(fields) + val = '' + if col < len(fields) and re.search(pat, fields[col]): + val = re.sub(pat, rep, fields[col]).replace('\t', ' ') + return '\t'.join(fields[:i] + [val] + fields[i:]) + def normalize(self, line, split_cols, sep): lines = [] fields = line.rstrip('\r\n').split('\t')
--- a/load_db.py Thu Jan 23 07:36:21 2020 -0500 +++ b/load_db.py Sat Sep 12 01:21:45 2020 +0000 @@ -176,7 +176,7 @@ def get_column_def(file_path, table_name, skip=0, comment_char='#', - column_names=None, max_lines=100, load_named_columns=False, + column_names=None, max_lines=1000, load_named_columns=False, firstlinenames=False, filters=None): col_pref = ['TEXT', 'REAL', 'INTEGER', None] col_types = [] @@ -272,8 +272,26 @@ if x else None for i, x in enumerate(fields)] c.execute(insert_stmt, vals) except Exception as e: - print('Failed at line: %d err: %s' % (linenum, e), + print('Load %s Failed line: %d err: %s' % (file_path, linenum, e), file=sys.stderr) + for i, val in enumerate(fields): + try: + col_func[i](val) + except Exception: + colType = getValueType(val) + col_func[i] = float if colType == 'REAL' else int if colType == 'INTEGER' else str + print('Changing %s from %s to %s' % (col_names[i], col_types[i], colType), + file=sys.stderr) + col_types[i] = colType + vals = [col_func[i](x) + if x else None for i, x in enumerate(fields)] + print('%s %s' % (insert_stmt, vals), + file=sys.stderr) + try: + c.execute(insert_stmt, vals) + except Exception as e: + print('Insert %s line: %d Failed err: %s' % (file_path, linenum, e), + file=sys.stderr) conn.commit() c.close() for i, index in enumerate(unique_indexes):
--- a/macros.xml Thu Jan 23 07:36:21 2020 -0500 +++ b/macros.xml Sat Sep 12 01:21:45 2020 +0000 @@ -40,6 +40,9 @@ #set $filter_dict['column'] = int(str($fi.filter.column).replace('c','')) #set $filter_dict['pattern'] = str($fi.filter.regex_pattern) #set $filter_dict['replace'] = str($fi.filter.regex_replace) + #if $fi.filter.add: + #set $filter_dict['add'] = str($fi.filter.add) + #end if #silent $input_filters.append($filter_dict) #elif str($fi.filter.filter_type).endswith('pend_line_num'): #set $filter_dict = dict() @@ -172,7 +175,7 @@ <when value="replace"> <param name="column" type="text" value="" label="enter column number to replace" help="example: 1 or c1 (selects the first column)"> - <validator type="regex" message="Column ordinal position separated by commas">^(c?[1-9]\d*)$</validator> + <validator type="regex" message="Column ordinal position">^(c?[1-9]\d*)$</validator> </param> <param name="regex_pattern" type="text" value="" label="regex pattern"> <sanitizer sanitize="False"/> @@ -180,6 +183,12 @@ <param name="regex_replace" type="text" value="" label="replacement expression"> <sanitizer sanitize="False"/> </param> + <param name="add" type="select" optional="true" label="Instead of replacing, Add as new column:"> + <option value="prepend">prepend to beginning of row</option> + <option value="append">append to the end of row</option> + <option value="before">insert before column field</option> + <option value="after">insert after column field</option> + </param> </when> <when value="normalize"> <param name="columns" type="text" value="" label="enter column numbers to normalize"> @@ -211,6 +220,7 @@ - by regex expression matching *include/exclude* lines the match the regex expression - select columns choose to include only selected columns in the order specified - regex replace value in column replace a field in a column using a regex substitution (good for date reformatting) + - regex replace value in column add a new column using a regex substitution of a column value - prepend a line number column each line has the ordinal value of the line read by this filter as the first column - append a line number column each line has the ordinal value of the line read by this filter as the last column - prepend a text column each line has the text string as the first column
--- a/query_db.py Thu Jan 23 07:36:21 2020 -0500 +++ b/query_db.py Sat Sep 12 01:21:45 2020 +0000 @@ -52,7 +52,7 @@ except Exception as exc: print("Warning: %s" % exc, file=sys.stderr) except Exception as e: - exit('Error: %s' % (e)) + exit('describe_tables Error: %s' % (e)) exit(0)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/filtered_customers_results.tsv Sat Sep 12 01:21:45 2020 +0000 @@ -0,0 +1,4 @@ +1 John Smith John.Smith@yahoo.com yahoo.com 1968-02-04 626 222-2222 com +2 Steven Goldfish goldfish@fishhere.net fishhere.net 1974-04-04 323 455-4545 net +3 Paula Brown pb@herowndomain.org herowndomain.org 1978-05-24 416 323-3232 org +4 James Smith jim@supergig.co.uk supergig.co.uk 1980-10-20 416 323-8888