Mercurial > repos > iuc > filter_tabular

--- a/filter_tabular.py	Sat Jun 19 14:16:06 2021 +0000
+++ b/filter_tabular.py	Thu Aug 19 19:39:58 2021 +0000
@@ -19,6 +19,8 @@
                       help='JSON array of filter specifications')
     parser.add_option('-o', '--output', dest='output', default=None,
                       help='Output file for query results')
+    parser.add_option('-c', '--comment_char', dest='comment_char', default=None,
+                      help='Line comment character')  # long flag must match what the tool XML emits: --comment_char '#'
     parser.add_option('-v', '--verbose', dest='verbose', default=False,
                       action='store_true',
                       help='verbose')
@@ -59,7 +61,7 @@
                   file=sys.stdout)

     try:
-        filter_file(inputFile, outputFile, filters=filters)
+        filter_file(inputFile, outputFile, comment_char=options.comment_char, filters=filters)
     except Exception as e:
         exit('Error: %s' % (e))
--- a/filter_tabular.xml	Sat Jun 19 14:16:06 2021 +0000
+++ b/filter_tabular.xml	Thu Aug 19 19:39:58 2021 +0000
@@ -1,4 +1,4 @@
-<tool id="filter_tabular" name="Filter Tabular" version="3.1.2">
+<tool id="filter_tabular" name="Filter Tabular" version="3.3.0">
     <description></description>

     <macros>
@@ -8,12 +8,14 @@
     <command detect_errors="exit_code"><![CDATA[
         python '$__tool_directory__/filter_tabular.py'
         -i '$input'
+        $comment_char
         -j '$filter_json'
         -o '$output'
     ]]></command>
     <configfiles>
         <configfile name="filter_json">
 #import json
+#set $dataset_name = $input.element_identifier
 @LINEFILTERS@
 #if $input_filters:
 #echo $json.dumps($input_filters)
@@ -22,6 +24,7 @@
     </configfiles>
     <inputs>
         <param name="input" type="data" format="tabular" label="Tabular Dataset to filter"/>
+        <param name="comment_char" type="boolean" truevalue="--comment_char '#'" falsevalue="" checked="true" label="exclude lines starting with #"/>
         <expand macro="macro_line_filters" />
     </inputs>
     <outputs>
@@ -139,6 +142,30 @@
             </repeat>
             <output name="output" file="filtered_customers_results.tsv"/>
         </test>
+        <test>
+            <param name="input" ftype="tabular" value="IEDB.tsv"/>
+            <param name="comment_char" value="False"/>
+            <repeat name="linefilters">
+                <conditional name="filter">
+                    <param name="filter_type" value="replace"/>
+                    <param name="column" value="c1"/>
+                    <param name="regex_pattern" value="#ID"/>
+                    <param name="regex_replace" value="ID"/>
+                </conditional>
+            </repeat>
+            <repeat name="linefilters">
+                <conditional name="filter">
+                    <param name="filter_type" value="select_column_slices"/>
+                    <param name="columns" value="6,0:6,8,-3"/>
+                </conditional>
+            </repeat>
+            <repeat name="linefilters">
+                <conditional name="filter">
+                    <param name="filter_type" value="append_dataset_name"/>
+                </conditional>
+            </repeat>
+            <output name="output" file="filtered_IEDB.tsv"/>
+        </test>
     </tests>
     <help><![CDATA[
 ==============
--- a/filters.py	Sat Jun 19 14:16:06 2021 +0000
+++ b/filters.py	Thu Aug 19 19:39:58 2021 +0000
@@ -4,6 +4,7 @@

 import re
 import sys
+from itertools import chain


 class LineFilter(object):
@@ -13,6 +14,15 @@
         self.func = lambda i, l: l.rstrip('\r\n') if l else None
         self.src_lines = []
         self.src_line_cnt = 0
+
+        def xint(x):  # best-effort int coercion for column index/slice specs
+            if isinstance(x, int):
+                return x
+            try:
+                return int(x)
+            except Exception:
+                return x if x else None  # non-numeric text is returned unchanged; empty/falsy input becomes None
+
         if not filter_dict:
             return
         if filter_dict['filter'] == 'regex':
@@ -28,6 +38,13 @@
         elif filter_dict['filter'] == 'select_columns':
             cols = [int(c) - 1 for c in filter_dict['columns']]
             self.func = lambda i, l: self.select_columns(l, cols)
+        elif filter_dict['filter'] == 'select_column_slices':
+            cols = [x if isinstance(x, int) else [y if y is not None else None for y in [xint(k) for k in x.split(':')]] for x in [xint(c) for c in filter_dict['columns']]]
+            if all([isinstance(x, int) for x in cols]):
+                self.func = lambda i, l: self.select_columns(l, cols)
+            else:
+                cols = [slice(x[0], x[1], x[2] if len(x) > 2 else None) if isinstance(x, list) else x for x in cols]
+                self.func = lambda i, l: self.select_slices(l, cols)
         elif filter_dict['filter'] == 'replace':
             p = filter_dict['pattern']
             r = filter_dict['replace']
@@ -80,6 +97,10 @@
         fields = line.split('\t')
         return '\t'.join([fields[x] for x in cols])

+    def select_slices(self, line, cols):  # cols: int indexes and/or slice objects applied to the tab-split fields
+        fields = line.split('\t')
+        return '\t'.join(chain.from_iterable([y if isinstance(y, list) else [y] for y in [fields[x] for x in cols]]))  # a slice yields a list, a single index yields a str that is wrapped so both flatten uniformly
+
     def replace_add(self, line, pat, rep, col, pos):
         fields = line.rstrip('\r\n').split('\t')
         i = pos if pos is not None else len(fields)
--- a/macros.xml	Sat Jun 19 14:16:06 2021 +0000
+++ b/macros.xml	Thu Aug 19 19:39:58 2021 +0000
@@ -32,7 +32,12 @@
     #elif $fi.filter.filter_type == 'select_columns':
       #set $filter_dict = dict()
       #set $filter_dict['filter'] = str($fi.filter.filter_type)
-      #set $filter_dict['columns'] = [int(str($ci).replace('c','')) for $ci in str($fi.filter.columns).split(',')]
+      #set $filter_dict['columns'] = [int($c) for $c in str($fi.filter.columns).replace('c','').split(',')]
+      #silent $input_filters.append($filter_dict)
+    #elif $fi.filter.filter_type == 'select_column_slices':
+      #set $filter_dict = dict()
+      #set $filter_dict['filter'] = str($fi.filter.filter_type)
+      #set $filter_dict['columns'] = [$c for $c in str($fi.filter.columns).split(',')]
       #silent $input_filters.append($filter_dict)
     #elif $fi.filter.filter_type == 'replace':
       #set $filter_dict = dict()
@@ -53,6 +58,11 @@
       #set $filter_dict['filter'] = str($fi.filter.filter_type)
       #set $filter_dict['column_text'] = str($fi.filter.column_text)
       #silent $input_filters.append($filter_dict)
+    #elif str($fi.filter.filter_type).endswith('pend_dataset_name'):
+      #set $filter_dict = dict()
+      #set $filter_dict['filter'] = str($fi.filter.filter_type).replace('dataset_name', 'text')
+      #set $filter_dict['column_text'] = $dataset_name
+      #silent $input_filters.append($filter_dict)
     #elif $fi.filter.filter_type == 'normalize':
       #set $filter_dict = dict()
       #set $filter_dict['filter'] = str($fi.filter.filter_type)
@@ -129,9 +139,12 @@
                             <option value="comment">comment char</option>
                             <option value="regex">by regex expression matching</option>
                             <option value="select_columns">select columns</option>
+                            <option value="select_column_slices">select columns by indices/slices</option>
                             <option value="replace">regex replace value in column</option>
                             <option value="prepend_line_num">prepend a line number column</option>
                             <option value="append_line_num">append a line number column</option>
+                            <option value="prepend_dataset_name">prepend a column with the dataset name</option>
+                            <option value="append_dataset_name">append a column with the dataset name</option>
                             <option value="prepend_text">prepend a column with the given text</option>
                             <option value="append_text">append a column with the given text</option>
                             <option value="normalize">normalize list columns, replicates row for each item in list</option>
@@ -147,6 +160,8 @@
                         </when>
                         <when value="prepend_line_num"/>
                         <when value="append_line_num"/>
+                        <when value="prepend_dataset_name"/>
+                        <when value="append_dataset_name"/>
                         <when value="prepend_text">
                             <param name="column_text" type="text" value="" label="text for column">
                             </param>
@@ -172,6 +187,26 @@
                                 <validator type="regex" message="Column ordinal positions separated by commas">^(c?[1-9]\d*)(,c?[1-9]\d*)*$</validator>
                             </param>
                         </when>
+                        <when value="select_column_slices"> <!-- 0-based Python indices/slices; the validator accepts N, -N and start:stop[:step] with each part optional -->
+                            <param name="columns" type="text" value="" label="enter indices or slices of the columns to keep">
+                                <help><![CDATA[ Python offset indexes or slices.  Examples:
+                                 <ul>
+                                 <li>Column offset indexes: 0,3,1 (selects the first, fourth, and second columns)</li>
+                                 <li>Negative column numbers: -1,-2 (selects the last and second-to-last columns)</li>
+                                 <li>Python slices ( slice(start, stop[, step]) selects a range of columns):
+                                   <ul>
+                                     <li>0:3 or :3 (selects the first 3 columns)</li>
+                                     <li>3:5 (selects the fourth and fifth columns)</li>
+                                     <li>2: (selects all columns after the second)</li>
+                                     <li>-2: (selects the last 2 columns)</li>
+                                     <li>2::-1 (selects the first 3 columns in reverse order: third, second, first)</li>
+                                   </ul></li>
+                                 </ul>
+                                ]]></help>
+                                <validator type="regex" message="Column indices or slices separated by commas">^(-?\d+|(-?\d+)?:(-?\d+)?(:(-?\d+)?)?)(,(-?\d+|(-?\d+)?:(-?\d+)?(:(-?\d+)?)?))*$</validator>
+                            </param>
+                        </when>
+
                         <when value="replace">
                             <param name="column" type="text" value="" label="enter column number to replace"
                                 help="example: 1 or c1 (selects the first column)">
@@ -211,21 +246,24 @@
 <![CDATA[
 **Input Line Filters**

-  As a tabular file is being read, line filters may be applied.
-
-  ::
+  As a tabular file is being read, line filters may be applied:

-  - skip leading lines              skip the first *number* of lines
-  - comment char                    omit any lines that start with the specified comment character
-  - by regex expression matching    *include/exclude* lines the match the regex expression
-  - select columns                  choose to include only selected columns in the order specified
-  - regex replace value in column   replace a field in a column using a regex substitution (good for date reformatting)
-  - regex replace value in column   add a new column using a regex substitution of a column value
-  - prepend a line number column    each line has the ordinal value of the line read by this filter as the first column
-  - append a line number column     each line has the ordinal value of the line read by this filter as the last column
-  - prepend a text column           each line has the text string as the first column
-  - append a text column            each line has the text string as the last column
-  - normalize list columns          replicates the line for each item in the specified list *columns*
+    - skip leading lines                - skip the first *number* of lines
+    - comment char                      - omit any lines that start with the specified comment character
+    - by regex expression matching      - *include/exclude* lines that match the regex expression
+    - select columns                    - choose to include only selected columns in the order specified
+    - select columns by indices/slices  - *indices or slices* of the columns to keep (python_list_ indexing)
+    - regex replace value in column     - replace a field in a column using a regex substitution (good for date reformatting)
+    - regex replace value in column     - add a new column using a regex substitution of a column value
+    - prepend a line number column      - each line has the *ordinal* value of the line read by this filter as the first column
+    - append a line number column       - each line has the *ordinal* value of the line read by this filter as the last column
+    - prepend a text column             - each line has the text string as the first column
+    - append a text column              - each line has the text string as the last column
+    - prepend the dataset name          - each line has the *dataset name* as the first column
+    - append the dataset name           - each line has the *dataset name* as the last column
+    - normalize list columns            - replicates the line for each item in the specified list *columns*
+
+.. _python_list: https://docs.python.org/3/library/stdtypes.html#common-sequence-operations
 ]]>
   </token>

@@ -284,13 +322,13 @@
     0    Jane                Doe        1978-05-24                      5  3
     1    James               Smith      1980-10-20  Spot                6  4

-    Filter 6 - append a line number column:
+    Filter 6 - select columns by indices/slices: '1:6'

-    2    Paula               Brown      1978-05-24  Rex       dog       3  1  1
-    2    Paula               Brown      1978-05-24  Fluff     cat       3  1  2
-    1    Steven              Jones      1974-04-04  Allie     cat       4  2  3
-    0    Jane                Doe        1978-05-24                      5  3  4
-    1    James               Smith      1980-10-20  Spot                6  4  5
+    Paula               Brown      1978-05-24  Rex       dog
+    Paula               Brown      1978-05-24  Fluff     cat
+    Steven              Jones      1974-04-04  Allie     cat
+    Jane                Doe        1978-05-24
+    James               Smith      1980-10-20  Spot

 ]]>
   </token>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/filtered_IEDB.tsv	Thu Aug 19 19:39:58 2021 +0000
@@ -0,0 +1,17 @@
+peptide	ID	allele	seq_num	start	end	length	percentile_rank	comblib_sidney2008_rank	IEDB.tsv
+GMYCMVFLV	PPAP2C	HLA-A*02:01	1	3	11	9	0.2	0.5	IEDB.tsv
+SFGMYCMVF	PPAP2C	HLA-A*23:01	1	1	9	9	0.5	-	IEDB.tsv
+MYCMVFLVK	PPAP2C	HLA-A*23:01	1	4	12	9	0.65	-	IEDB.tsv
+FGMYCMVFL	PPAP2C	HLA-A*02:01	1	2	10	9	2.3	1.3	IEDB.tsv
+GMYCMVFLV	PPAP2C	HLA-A*23:01	1	3	11	9	4.95	-	IEDB.tsv
+FGMYCMVFL	PPAP2C	HLA-A*23:01	1	2	10	9	6.55	-	IEDB.tsv
+SFGMYCMVF	PPAP2C	HLA-A*02:01	1	1	9	9	45	91	IEDB.tsv
+MYCMVFLVK	PPAP2C	HLA-A*02:01	1	4	12	9	54	86	IEDB.tsv
+SLDMCISGL	ADAMTSL1	HLA-A*02:01	1	1	9	9	1	1.7	IEDB.tsv
+MCISGLCQL	ADAMTSL1	HLA-A*23:01	1	4	12	9	6.65	-	IEDB.tsv
+MCISGLCQL	ADAMTSL1	HLA-A*02:01	1	4	12	9	14	24	IEDB.tsv
+SLDMCISGL	ADAMTSL1	HLA-A*23:01	1	1	9	9	30.5	-	IEDB.tsv
+LDMCISGLC	ADAMTSL1	HLA-A*02:01	1	2	10	9	42	71	IEDB.tsv
+DMCISGLCQ	ADAMTSL1	HLA-A*23:01	1	3	11	9	64.5	-	IEDB.tsv
+LDMCISGLC	ADAMTSL1	HLA-A*23:01	1	2	10	9	76.0	-	IEDB.tsv
+DMCISGLCQ	ADAMTSL1	HLA-A*02:01	1	3	11	9	97	97	IEDB.tsv
--- a/test-data/psm_dbmod_output.tsv	Sat Jun 19 14:16:06 2021 +0000
+++ b/test-data/psm_dbmod_output.tsv	Thu Aug 19 19:39:58 2021 +0000
@@ -1,4 +1,4 @@
-#scan	m\/z	Precursor m\/z Error	Sequence	Protein\(s\)	confidence
+#scan	m\/z	Precursor m\/z Error( \[ppm])?	Sequence	Protein\(s\)	confidence
 1	523.27\d*	-4.42\d*	PYANQPTVR	NP_116558	99.9\d*
 3	652.84\d*	4.02\d*	SSWAGLQFPVGR	NP_066544_R21W	99.9\d*
 4	788.87\d*	1.27\d*	AQACNLDQSGTNVAK	NP_112092_rs7285167:R182C	99.9\d*
--- a/test-data/psm_dbmod_output1.tsv	Sat Jun 19 14:16:06 2021 +0000
+++ b/test-data/psm_dbmod_output1.tsv	Thu Aug 19 19:39:58 2021 +0000
@@ -1,4 +1,4 @@
-scan	Sequence	Protein\(s\)	Position	m\/z	Precursor m\/z Error	confidence
+scan	Sequence	Protein\(s\)	Position	m\/z	Precursor m\/z Error( \[ppm])?	confidence
 1	PYANQPTVR	NP_116558	2	523.27\d*	-4.42\d*	99.99\d*
 3	SSWAGLQFPVGR	NP_066544_R21W	19	652.84\d*	4.02\d*	99.99\d*
 4	AQACNLDQSGTNVAK	NP_112092_rs7285167:R182C	179	788.87\d*	1.27\d*	99.99\d*
--- a/test-data/psm_report.tsv	Sat Jun 19 14:16:06 2021 +0000
+++ b/test-data/psm_report.tsv	Thu Aug 19 19:39:58 2021 +0000
@@ -1,4 +1,3 @@
-
 	Protein(s)	Sequence	AAs Before	AAs After	Position	Modified Sequence	Variable Modifications	Fixed Modifications	Spectrum File	Spectrum Title	Spectrum Scan Number	RT	m/z	Measured Charge	Identification Charge	Theoretical Mass	Isotope Number	Precursor m/z Error [ppm]	Localization Confidence	Probabilistic PTM score	D-score	Confidence [%]	Validation
 1	NP_116558	PYANQPTVR	M	IT	2	NH2-PYANQPTVR-COOH			trimmed_tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.mgf	tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.04679.04679.2	4679	-1.0	523.272583	2+	2+	1044.53524305008	0	-4.4240452979909675				100.0	Doubtful
 2	NP_443137, NP_443137_S1016F	DANTQVHTLR	YK; YK	KM; KM	443; 443	NH2-DANTQVHTLR-COOH			trimmed_tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.mgf	tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.03894.03894.2	3894	-1.0	577.799622	2+	2+	1153.5839841476504	0	0.6117338355350196				95.0	Doubtful
--- a/test-data/psm_report_out2.tsv	Sat Jun 19 14:16:06 2021 +0000
+++ b/test-data/psm_report_out2.tsv	Thu Aug 19 19:39:58 2021 +0000
@@ -1,4 +1,4 @@
-Scan	m\/z	Precursor m\/z Error	Sequence	Protein\(s\)
+Scan	m\/z	Precursor m\/z Error( \[ppm])?	Sequence	Protein\(s\)
 1	523.27\d*	-4.42\d*	PYANQPTVR	NP_116558
 3	652.84\d*	4.02\d*	SSWAGLQFPVGR	NP_066544_R21W
 4	788.87\d*	1.27\d*	AQACNLDQSGTNVAK	NP_112092_rs7285167:R182C