Mercurial > repos > iuc > filter_tabular
changeset 13:4d5aae46f850 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/query_tabular commit 35576d64a12fa664d72559172c5960c09da2b632"
author | iuc |
---|---|
date | Thu, 19 Aug 2021 19:39:58 +0000 |
parents | 37cde8134c6a |
children | 557ec8d7087d |
files | filter_tabular.py filter_tabular.xml filters.py macros.xml test-data/filtered_IEDB.tsv test-data/psm_dbmod_output.tsv test-data/psm_dbmod_output1.tsv test-data/psm_report.tsv test-data/psm_report_out2.tsv |
diffstat | 9 files changed, 131 insertions(+), 27 deletions(-) [+] |
line wrap: on
line diff
--- a/filter_tabular.py Sat Jun 19 14:16:06 2021 +0000 +++ b/filter_tabular.py Thu Aug 19 19:39:58 2021 +0000 @@ -19,6 +19,8 @@ help='JSON array of filter specifications') parser.add_option('-o', '--output', dest='output', default=None, help='Output file for query results') + parser.add_option('-c', '--comment_char)', dest='comment_char', default=None, + help='Line comment character') parser.add_option('-v', '--verbose', dest='verbose', default=False, action='store_true', help='verbose') @@ -59,7 +61,7 @@ file=sys.stdout) try: - filter_file(inputFile, outputFile, filters=filters) + filter_file(inputFile, outputFile, comment_char=options.comment_char, filters=filters) except Exception as e: exit('Error: %s' % (e))
--- a/filter_tabular.xml Sat Jun 19 14:16:06 2021 +0000 +++ b/filter_tabular.xml Thu Aug 19 19:39:58 2021 +0000 @@ -1,4 +1,4 @@ -<tool id="filter_tabular" name="Filter Tabular" version="3.1.2"> +<tool id="filter_tabular" name="Filter Tabular" version="3.3.0"> <description></description> <macros> @@ -8,12 +8,14 @@ <command detect_errors="exit_code"><![CDATA[ python '$__tool_directory__/filter_tabular.py' -i '$input' + $comment_char -j '$filter_json' -o '$output' ]]></command> <configfiles> <configfile name="filter_json"> #import json +#set $dataset_name = $input.element_identifier @LINEFILTERS@ #if $input_filters: #echo $json.dumps($input_filters) @@ -22,6 +24,7 @@ </configfiles> <inputs> <param name="input" type="data" format="tabular" label="Tabular Dataset to filter"/> + <param name="comment_char" type="boolean" truevalue="--comment_char '#'" falsevalue="" checked="true" label="exclude lines starting with #"/> <expand macro="macro_line_filters" /> </inputs> <outputs> @@ -139,6 +142,30 @@ </repeat> <output name="output" file="filtered_customers_results.tsv"/> </test> + <test> + <param name="input" ftype="tabular" value="IEDB.tsv"/> + <param name="comment_char" value="False"/> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="replace"/> + <param name="column" value="c1"/> + <param name="regex_pattern" value="#ID"/> + <param name="regex_replace" value="ID"/> + </conditional> + </repeat> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="select_column_slices"/> + <param name="columns" value="6,0:6,8,-3"/> + </conditional> + </repeat> + <repeat name="linefilters"> + <conditional name="filter"> + <param name="filter_type" value="append_dataset_name"/> + </conditional> + </repeat> + <output name="output" file="filtered_IEDB.tsv"/> + </test> </tests> <help><![CDATA[ ==============
--- a/filters.py Sat Jun 19 14:16:06 2021 +0000 +++ b/filters.py Thu Aug 19 19:39:58 2021 +0000 @@ -4,6 +4,7 @@ import re import sys +from itertools import chain class LineFilter(object): @@ -13,6 +14,15 @@ self.func = lambda i, l: l.rstrip('\r\n') if l else None self.src_lines = [] self.src_line_cnt = 0 + + def xint(x): + if isinstance(x, int): + return x + try: + return int(x) + except Exception: + return x if x else None + if not filter_dict: return if filter_dict['filter'] == 'regex': @@ -28,6 +38,13 @@ elif filter_dict['filter'] == 'select_columns': cols = [int(c) - 1 for c in filter_dict['columns']] self.func = lambda i, l: self.select_columns(l, cols) + elif filter_dict['filter'] == 'select_column_slices': + cols = [x if isinstance(x, int) else [y if y is not None else None for y in [xint(k) for k in x.split(':')]] for x in [xint(c) for c in filter_dict['columns']]] + if all([isinstance(x, int) for x in cols]): + self.func = lambda i, l: self.select_columns(l, cols) + else: + cols = [slice(x[0], x[1], x[2] if len(x) > 2 else None) if isinstance(x, list) else x for x in cols] + self.func = lambda i, l: self.select_slices(l, cols) elif filter_dict['filter'] == 'replace': p = filter_dict['pattern'] r = filter_dict['replace'] @@ -80,6 +97,10 @@ fields = line.split('\t') return '\t'.join([fields[x] for x in cols]) + def select_slices(self, line, cols): + fields = line.split('\t') + return '\t'.join(chain.from_iterable([y if isinstance(y, list) else [y] for y in [fields[x] for x in cols]])) + def replace_add(self, line, pat, rep, col, pos): fields = line.rstrip('\r\n').split('\t') i = pos if pos is not None else len(fields)
--- a/macros.xml Sat Jun 19 14:16:06 2021 +0000 +++ b/macros.xml Thu Aug 19 19:39:58 2021 +0000 @@ -32,7 +32,12 @@ #elif $fi.filter.filter_type == 'select_columns': #set $filter_dict = dict() #set $filter_dict['filter'] = str($fi.filter.filter_type) - #set $filter_dict['columns'] = [int(str($ci).replace('c','')) for $ci in str($fi.filter.columns).split(',')] + #set $filter_dict['columns'] = [int($c) for $c in str($fi.filter.columns).replace('c','').split(',')] + #silent $input_filters.append($filter_dict) + #elif $fi.filter.filter_type == 'select_column_slices': + #set $filter_dict = dict() + #set $filter_dict['filter'] = str($fi.filter.filter_type) + #set $filter_dict['columns'] = [$c for $c in str($fi.filter.columns).split(',')] #silent $input_filters.append($filter_dict) #elif $fi.filter.filter_type == 'replace': #set $filter_dict = dict() @@ -53,6 +58,11 @@ #set $filter_dict['filter'] = str($fi.filter.filter_type) #set $filter_dict['column_text'] = str($fi.filter.column_text) #silent $input_filters.append($filter_dict) + #elif str($fi.filter.filter_type).endswith('pend_dataset_name'): + #set $filter_dict = dict() + #set $filter_dict['filter'] = str($fi.filter.filter_type).replace('dataset_name', 'text') + #set $filter_dict['column_text'] = $dataset_name + #silent $input_filters.append($filter_dict) #elif $fi.filter.filter_type == 'normalize': #set $filter_dict = dict() #set $filter_dict['filter'] = str($fi.filter.filter_type) @@ -129,9 +139,12 @@ <option value="comment">comment char</option> <option value="regex">by regex expression matching</option> <option value="select_columns">select columns</option> + <option value="select_column_slices">select columns by indices/slices</option> <option value="replace">regex replace value in column</option> <option value="prepend_line_num">prepend a line number column</option> <option value="append_line_num">append a line number column</option> + <option value="prepend_dataset_name">prepend a column with the dataset 
name</option> + <option value="append_dataset_name">append a column with the dataset name</option> <option value="prepend_text">prepend a column with the given text</option> <option value="append_text">append a column with the given text</option> <option value="normalize">normalize list columns, replicates row for each item in list</option> @@ -147,6 +160,8 @@ </when> <when value="prepend_line_num"/> <when value="append_line_num"/> + <when value="prepend_dataset_name"/> + <when value="append_dataset_name"/> <when value="prepend_text"> <param name="column_text" type="text" value="" label="text for column"> </param> @@ -172,6 +187,26 @@ <validator type="regex" message="Column ordinal positions separated by commas">^(c?[1-9]\d*)(,c?[1-9]\d*)*$</validator> </param> </when> + <when value="select_column_slices"> + <param name="columns" type="text" value="" label="enter indices or slices of the columns to keep"> + <help><![CDATA[ Python offset indexes or slices. Examples: + <ul> + <li>Column offset indexes: 0,3,1 (selects the first, fourth, and second columns)</li> + <li>Negative column numbers: -1,-2 (selects the last and second-to-last columns)</li> + <li>Python slices ( slice(start, stop[, step]) selects a range of columns): + <ul> + <li>0:3 or :3 (selects the first 3 columns)</li> + <li>3:5 (selects the fourth and fifth columns)</li> + <li>2: (selects all columns after the second)</li> + <li>-2: (selects the last 2 columns)</li> + <li>2::-1 (selects the first 3 columns in reverse order: third, second, first)</li> + </ul></li> + </ul> + ]]></help> + <validator type="regex" message="Column ordinal positions separated by commas">^(-?[1-9]\d*|((-?\d+)?:(-?\d*(:-?\d*)?)))(,(-?[1-9]\d*|((-?\d+)?:(-?\d*(:-?\d*)?))))*$</validator> + </param> + </when> + <when value="replace"> <param name="column" type="text" value="" label="enter column number to replace" help="example: 1 or c1 (selects the first column)"> @@ -211,21 +246,24 @@ <![CDATA[ **Input Line Filters** - As a tabular file is 
being read, line filters may be applied. - - :: + As a tabular file is being read, line filters may be applied: - - skip leading lines skip the first *number* of lines - - comment char omit any lines that start with the specified comment character - - by regex expression matching *include/exclude* lines the match the regex expression - - select columns choose to include only selected columns in the order specified - - regex replace value in column replace a field in a column using a regex substitution (good for date reformatting) - - regex replace value in column add a new column using a regex substitution of a column value - - prepend a line number column each line has the ordinal value of the line read by this filter as the first column - - append a line number column each line has the ordinal value of the line read by this filter as the last column - - prepend a text column each line has the text string as the first column - - append a text column each line has the text string as the last column - - normalize list columns replicates the line for each item in the specified list *columns* + - skip leading lines - skip the first *number* of lines + - comment char - omit any lines that start with the specified comment character + - by regex expression matching - *include/exclude* lines that match the regex expression + - select columns - choose to include only selected columns in the order specified + - select columns by indices/slices - *indices or slices* of the columns to keep (python_list_ indexing) + - regex replace value in column - replace a field in a column using a regex substitution (good for date reformatting) + - regex replace value in column - add a new column using a regex substitution of a column value + - prepend a line number column - each line has the *ordinal* value of the line read by this filter as the first column + - append a line number column - each line has the *ordinal* value of the line read by this filter as the last column + - prepend a 
text column - each line has the text string as the first column + - append a text column - each line has the text string as the last column + - prepend the dataset name - each line has the *dataset name* as the first column + - append the dataset name - each line has the *dataset name* as the last column + - normalize list columns - replicates the line for each item in the specified list *columns* + +.. _python_list: https://docs.python.org/3/library/stdtypes.html#common-sequence-operations ]]> </token> @@ -284,13 +322,13 @@ 0 Jane Doe 1978-05-24 5 3 1 James Smith 1980-10-20 Spot 6 4 - Filter 6 - append a line number column: + Filter 6 - select columns by indices/slices: '1:6' - 2 Paula Brown 1978-05-24 Rex dog 3 1 1 - 2 Paula Brown 1978-05-24 Fluff cat 3 1 2 - 1 Steven Jones 1974-04-04 Allie cat 4 2 3 - 0 Jane Doe 1978-05-24 5 3 4 - 1 James Smith 1980-10-20 Spot 6 4 5 + Paula Brown 1978-05-24 Rex dog + Paula Brown 1978-05-24 Fluff cat + Steven Jones 1974-04-04 Allie cat + Jane Doe 1978-05-24 + James Smith 1980-10-20 Spot ]]> </token>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/filtered_IEDB.tsv Thu Aug 19 19:39:58 2021 +0000 @@ -0,0 +1,17 @@ +peptide ID allele seq_num start end length percentile_rank comblib_sidney2008_rank IEDB.tsv +GMYCMVFLV PPAP2C HLA-A*02:01 1 3 11 9 0.2 0.5 IEDB.tsv +SFGMYCMVF PPAP2C HLA-A*23:01 1 1 9 9 0.5 - IEDB.tsv +MYCMVFLVK PPAP2C HLA-A*23:01 1 4 12 9 0.65 - IEDB.tsv +FGMYCMVFL PPAP2C HLA-A*02:01 1 2 10 9 2.3 1.3 IEDB.tsv +GMYCMVFLV PPAP2C HLA-A*23:01 1 3 11 9 4.95 - IEDB.tsv +FGMYCMVFL PPAP2C HLA-A*23:01 1 2 10 9 6.55 - IEDB.tsv +SFGMYCMVF PPAP2C HLA-A*02:01 1 1 9 9 45 91 IEDB.tsv +MYCMVFLVK PPAP2C HLA-A*02:01 1 4 12 9 54 86 IEDB.tsv +SLDMCISGL ADAMTSL1 HLA-A*02:01 1 1 9 9 1 1.7 IEDB.tsv +MCISGLCQL ADAMTSL1 HLA-A*23:01 1 4 12 9 6.65 - IEDB.tsv +MCISGLCQL ADAMTSL1 HLA-A*02:01 1 4 12 9 14 24 IEDB.tsv +SLDMCISGL ADAMTSL1 HLA-A*23:01 1 1 9 9 30.5 - IEDB.tsv +LDMCISGLC ADAMTSL1 HLA-A*02:01 1 2 10 9 42 71 IEDB.tsv +DMCISGLCQ ADAMTSL1 HLA-A*23:01 1 3 11 9 64.5 - IEDB.tsv +LDMCISGLC ADAMTSL1 HLA-A*23:01 1 2 10 9 76.0 - IEDB.tsv +DMCISGLCQ ADAMTSL1 HLA-A*02:01 1 3 11 9 97 97 IEDB.tsv
--- a/test-data/psm_dbmod_output.tsv Sat Jun 19 14:16:06 2021 +0000 +++ b/test-data/psm_dbmod_output.tsv Thu Aug 19 19:39:58 2021 +0000 @@ -1,4 +1,4 @@ -#scan m\/z Precursor m\/z Error Sequence Protein\(s\) confidence +#scan m\/z Precursor m\/z Error( \[ppm])? Sequence Protein\(s\) confidence 1 523.27\d* -4.42\d* PYANQPTVR NP_116558 99.9\d* 3 652.84\d* 4.02\d* SSWAGLQFPVGR NP_066544_R21W 99.9\d* 4 788.87\d* 1.27\d* AQACNLDQSGTNVAK NP_112092_rs7285167:R182C 99.9\d*
--- a/test-data/psm_dbmod_output1.tsv Sat Jun 19 14:16:06 2021 +0000 +++ b/test-data/psm_dbmod_output1.tsv Thu Aug 19 19:39:58 2021 +0000 @@ -1,4 +1,4 @@ -scan Sequence Protein\(s\) Position m\/z Precursor m\/z Error confidence +scan Sequence Protein\(s\) Position m\/z Precursor m\/z Error( \[ppm])? confidence 1 PYANQPTVR NP_116558 2 523.27\d* -4.42\d* 99.99\d* 3 SSWAGLQFPVGR NP_066544_R21W 19 652.84\d* 4.02\d* 99.99\d* 4 AQACNLDQSGTNVAK NP_112092_rs7285167:R182C 179 788.87\d* 1.27\d* 99.99\d*
--- a/test-data/psm_report.tsv Sat Jun 19 14:16:06 2021 +0000 +++ b/test-data/psm_report.tsv Thu Aug 19 19:39:58 2021 +0000 @@ -1,4 +1,3 @@ - Protein(s) Sequence AAs Before AAs After Position Modified Sequence Variable Modifications Fixed Modifications Spectrum File Spectrum Title Spectrum Scan Number RT m/z Measured Charge Identification Charge Theoretical Mass Isotope Number Precursor m/z Error [ppm] Localization Confidence Probabilistic PTM score D-score Confidence [%] Validation 1 NP_116558 PYANQPTVR M IT 2 NH2-PYANQPTVR-COOH trimmed_tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.mgf tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.04679.04679.2 4679 -1.0 523.272583 2+ 2+ 1044.53524305008 0 -4.4240452979909675 100.0 Doubtful 2 NP_443137, NP_443137_S1016F DANTQVHTLR YK; YK KM; KM 443; 443 NH2-DANTQVHTLR-COOH trimmed_tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.mgf tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.03894.03894.2 3894 -1.0 577.799622 2+ 2+ 1153.5839841476504 0 0.6117338355350196 95.0 Doubtful
--- a/test-data/psm_report_out2.tsv Sat Jun 19 14:16:06 2021 +0000 +++ b/test-data/psm_report_out2.tsv Thu Aug 19 19:39:58 2021 +0000 @@ -1,4 +1,4 @@ -Scan m\/z Precursor m\/z Error Sequence Protein\(s\) +Scan m\/z Precursor m\/z Error( \[ppm])? Sequence Protein\(s\) 1 523.27\d* -4.42\d* PYANQPTVR NP_116558 3 652.84\d* 4.02\d* SSWAGLQFPVGR NP_066544_R21W 4 788.87\d* 1.27\d* AQACNLDQSGTNVAK NP_112092_rs7285167:R182C