Mercurial > repos > iuc > filter_tabular

--- a/filter_tabular.xml	Tue Jul 18 09:06:47 2017 -0400
+++ b/filter_tabular.xml	Fri Aug 18 16:47:57 2017 -0400
@@ -1,4 +1,4 @@
-<tool id="filter_tabular" name="Filter Tabular" version="1.0.0">
+<tool id="filter_tabular" name="Filter Tabular" version="2.0.0">
     <description></description>

     <macros>
--- a/filters.py	Tue Jul 18 09:06:47 2017 -0400
+++ b/filters.py	Fri Aug 18 16:47:57 2017 -0400
@@ -33,7 +33,8 @@
             r = filter_dict['replace']
             c = int(filter_dict['column']) - 1
             self.func = lambda i, l: '\t'.join(
-                [x if j != c else re.sub(p, r, x) for j, x in enumerate(l.split('\t'))])
+                [x if j != c else re.sub(p, r, x)
+                 for j, x in enumerate(l.split('\t'))])
         elif filter_dict['filter'] == 'prepend_line_num':
             self.func = lambda i, l: '%d\t%s' % (i, l)
         elif filter_dict['filter'] == 'append_line_num':
--- a/load_db.py	Tue Jul 18 09:06:47 2017 -0400
+++ b/load_db.py	Fri Aug 18 16:47:57 2017 -0400
@@ -2,11 +2,165 @@

 from __future__ import print_function

+import re
 import sys

 from filters import TabularReader


+SQLITE_KEYWORDS = [
+    'ABORT',
+    'ACTION',
+    'ADD',
+    'AFTER',
+    'ALL',
+    'ALTER',
+    'ANALYZE',
+    'AND',
+    'AS',
+    'ASC',
+    'ATTACH',
+    'AUTOINCREMENT',
+    'BEFORE',
+    'BEGIN',
+    'BETWEEN',
+    'BY',
+    'CASCADE',
+    'CASE',
+    'CAST',
+    'CHECK',
+    'COLLATE',
+    'COLUMN',
+    'COMMIT',
+    'CONFLICT',
+    'CONSTRAINT',
+    'CREATE',
+    'CROSS',
+    'CURRENT_DATE',
+    'CURRENT_TIME',
+    'CURRENT_TIMESTAMP',
+    'DATABASE',
+    'DEFAULT',
+    'DEFERRABLE',
+    'DEFERRED',
+    'DELETE',
+    'DESC',
+    'DETACH',
+    'DISTINCT',
+    'DROP',
+    'EACH',
+    'ELSE',
+    'END',
+    'ESCAPE',
+    'EXCEPT',
+    'EXCLUSIVE',
+    'EXISTS',
+    'EXPLAIN',
+    'FAIL',
+    'FOR',
+    'FOREIGN',
+    'FROM',
+    'FULL',
+    'GLOB',
+    'GROUP',
+    'HAVING',
+    'IF',
+    'IGNORE',
+    'IMMEDIATE',
+    'IN',
+    'INDEX',
+    'INDEXED',
+    'INITIALLY',
+    'INNER',
+    'INSERT',
+    'INSTEAD',
+    'INTERSECT',
+    'INTO',
+    'IS',
+    'ISNULL',
+    'JOIN',
+    'KEY',
+    'LEFT',
+    'LIKE',
+    'LIMIT',
+    'MATCH',
+    'NATURAL',
+    'NO',
+    'NOT',
+    'NOTNULL',
+    'NULL',
+    'OF',
+    'OFFSET',
+    'ON',
+    'OR',
+    'ORDER',
+    'OUTER',
+    'PLAN',
+    'PRAGMA',
+    'PRIMARY',
+    'QUERY',
+    'RAISE',
+    'RECURSIVE',
+    'REFERENCES',
+    'REGEXP',
+    'REINDEX',
+    'RELEASE',
+    'RENAME',
+    'REPLACE',
+    'RESTRICT',
+    'RIGHT',
+    'ROLLBACK',
+    'ROW',
+    'SAVEPOINT',
+    'SELECT',
+    'SET',
+    'TABLE',
+    'TEMP',
+    'TEMPORARY',
+    'THEN',
+    'TO',
+    'TRANSACTION',
+    'TRIGGER',
+    'UNION',
+    'UNIQUE',
+    'UPDATE',
+    'USING',
+    'VACUUM',
+    'VALUES',
+    'VIEW',
+    'VIRTUAL',
+    'WHEN',
+    'WHERE',
+    'WITH',
+    'WITHOUT'
+]
+
+
+def get_valid_column_name(name):
+    valid_name = name
+    if not name or not name.strip():
+        return None
+    elif name.upper() in SQLITE_KEYWORDS:
+        valid_name = '"%s"' % name
+    elif re.match('^[a-zA-Z]\w*$', name):
+        pass
+    elif re.match('^"[^"]+"$', name):
+        pass
+    elif re.match('^\[[^\[\]]*\]$', name):
+        pass
+    elif re.match("^`[^`]+`$", name):
+        pass
+    elif name.find('"') < 0:
+        valid_name = '"%s"' % name
+    elif name.find('[') < 0 and name.find(']') < 0:
+        valid_name = '[%s]' % name
+    elif name.find('`') < 0:
+        valid_name = '`%s`' % name
+    elif name.find("'") < 0:
+        valid_name = "'%s'" % name
+    return valid_name
+
+
 def getValueType(val):
     if val or 0. == val:
         try:
@@ -23,14 +177,19 @@

 def get_column_def(file_path, table_name, skip=0, comment_char='#',
                    column_names=None, max_lines=100, load_named_columns=False,
-                   filters=None):
+                   firstlinenames=False, filters=None):
     col_pref = ['TEXT', 'REAL', 'INTEGER', None]
     col_types = []
     col_idx = None
+    col_names = []
     try:
         tr = TabularReader(file_path, skip=skip, comment_char=comment_char,
                            col_idx=None, filters=filters)
         for linenum, fields in enumerate(tr):
+            if linenum == 0 and firstlinenames:
+                col_names = [get_valid_column_name(name) or 'c%d' % (i + 1)
+                             for i, name in enumerate(fields)]
+                continue
             if linenum > max_lines:
                 break
             try:
@@ -48,24 +207,24 @@
     for i, col_type in enumerate(col_types):
         if not col_type:
             col_types[i] = 'TEXT'
+    if not col_names:
+        col_names = ['c%d' % i for i in range(1, len(col_types) + 1)]
     if column_names:
-        col_names = []
         if load_named_columns:
             col_idx = []
+            cnames = []
             for i, cname in enumerate(
                     [cn.strip() for cn in column_names.split(',')]):
                 if cname != '':
                     col_idx.append(i)
-                    col_names.append(cname)
+                    cnames.append(cname)
             col_types = [col_types[i] for i in col_idx]
+            col_names = cnames
         else:
-            col_names = ['c%d' % i for i in range(1, len(col_types) + 1)]
             for i, cname in enumerate(
                     [cn.strip() for cn in column_names.split(',')]):
                 if cname and i < len(col_names):
                     col_names[i] = cname
-    else:
-        col_names = ['c%d' % i for i in range(1, len(col_types) + 1)]
     col_def = []
     for i, col_name in enumerate(col_names):
         col_def.append('%s %s' % (col_names[i], col_types[i]))
@@ -74,12 +233,14 @@

 def create_table(conn, file_path, table_name, skip=0, comment_char='#',
                  pkey_autoincr=None, column_names=None,
-                 load_named_columns=False, filters=None,
-                 unique_indexes=[], indexes=[]):
+                 load_named_columns=False, firstlinenames=False,
+                 filters=None, unique_indexes=[], indexes=[]):
     col_names, col_types, col_def, col_idx = \
         get_column_def(file_path, table_name, skip=skip,
                        comment_char=comment_char, column_names=column_names,
-                       load_named_columns=load_named_columns, filters=filters)
+                       load_named_columns=load_named_columns,
+                       firstlinenames=firstlinenames,
+                       filters=filters)
     col_func = [float if t == 'REAL' else int
                 if t == 'INTEGER' else str for t in col_types]
     table_def = 'CREATE TABLE %s (\n    %s%s\n);' % (
@@ -98,19 +259,13 @@
         c.execute(table_def)
         conn.commit()
         c.close()
-        for i, index in enumerate(unique_indexes):
-            index_name = 'idx_uniq_%s_%d' % (table_name, i)
-            index_columns = index.split(',')
-            create_index(conn, table_name, index_name, index_columns,
-                         unique=True)
-        for i, index in enumerate(indexes):
-            index_name = 'idx_%s_%d' % (table_name, i)
-            index_columns = index.split(',')
-            create_index(conn, table_name, index_name, index_columns)
+
         c = conn.cursor()
         tr = TabularReader(file_path, skip=skip, comment_char=comment_char,
                            col_idx=col_idx, filters=filters)
         for linenum, fields in enumerate(tr):
+            if linenum == 0 and firstlinenames:
+                continue
             data_lines += 1
             try:
                 vals = [col_func[i](x)
@@ -121,15 +276,28 @@
                       file=sys.stderr)
         conn.commit()
         c.close()
+        for i, index in enumerate(unique_indexes):
+            index_name = 'idx_uniq_%s_%d' % (table_name, i)
+            index_columns = index.split(',')
+            create_index(conn, table_name, index_name, index_columns,
+                         unique=True)
+        for i, index in enumerate(indexes):
+            index_name = 'idx_%s_%d' % (table_name, i)
+            index_columns = index.split(',')
+            create_index(conn, table_name, index_name, index_columns)
     except Exception as e:
         exit('Error: %s' % (e))


 def create_index(conn, table_name, index_name, index_columns, unique=False):
-    index_def = "CREATE %s INDEX %s on %s(%s)" % (
+    index_def = 'CREATE %s INDEX %s on %s(%s)' % (
                 'UNIQUE' if unique else '', index_name,
                 table_name, ','.join(index_columns))
-    c = conn.cursor()
-    c.execute(index_def)
-    conn.commit()
-    c.close()
+    try:
+        c = conn.cursor()
+        c.execute(index_def)
+        conn.commit()
+        c.close()
+    except Exception as e:
+        print('Failed: %s err: %s' % (index_def, e), file=sys.stderr)
+        raise(e)
--- a/macros.xml	Tue Jul 18 09:06:47 2017 -0400
+++ b/macros.xml	Fri Aug 18 16:47:57 2017 -0400
@@ -60,6 +60,52 @@
   #end for
 ]]>
   </token>
+  <token name="@RESULT_HEADER@">
+<![CDATA[
+    #if $query_result.header == 'yes':
+        #if $query_result.header_prefix:
+            #set $header_prefix = chr(int(str($query_result.header_prefix)))
+            --comment_char='$header_prefix'
+        #end if
+    #else
+        --no_header
+    #end if
+]]>
+  </token>
+  <xml name="comment_char_options">
+        <option value="62">&gt;</option>
+        <option value="64">@</option>
+        <option value="43">+</option>
+        <option value="60">&lt;</option>
+        <option value="42">*</option>
+        <option value="45">-</option>
+        <option value="61">=</option>
+        <option value="124">|</option>
+        <option value="63">?</option>
+        <option value="36">$</option>
+        <option value="46">.</option>
+        <option value="58">:</option>
+        <option value="38">&amp;</option>
+        <option value="37">%</option>
+        <option value="94">^</option>
+        <option value="35" selected="true">&#35;</option>
+        <option value="33">!</option>
+  </xml>
+  <xml name="result_results_header_line">
+      <conditional name="query_result">
+          <param name="header" type="select" label="include query result column headers">
+              <option value="yes">Yes</option>
+              <option value="no">No</option>
+          </param>
+          <when value="yes">
+              <param name="header_prefix" type="select" optional="true" label="Prefix character for column_header line">
+                  <option value="">no comment character prefix</option>
+                  <expand macro="comment_char_options" />
+              </param>
+          </when>
+          <when value="no"/>
+      </conditional>
+  </xml>
   <xml name="macro_line_filters">
                 <repeat name="linefilters" title="Filter Tabular Input Lines">
                     <conditional name="filter">
@@ -81,23 +127,7 @@
                         </when>
                         <when value="comment">
                             <param name="comment_char" type="select" display="checkboxes" multiple="True" label="Ignore lines beginning with these characters" help="lines beginning with these are skipped">
-                                <option value="62">&gt;</option>
-                                <option value="64">@</option>
-                                <option value="43">+</option>
-                                <option value="60">&lt;</option>
-                                <option value="42">*</option>
-                                <option value="45">-</option>
-                                <option value="61">=</option>
-                                <option value="124">|</option>
-                                <option value="63">?</option>
-                                <option value="36">$</option>
-                                <option value="46">.</option>
-                                <option value="58">:</option>
-                                <option value="38">&amp;</option>
-                                <option value="37">%</option>
-                                <option value="94">^</option>
-                                <option value="35">&#35;</option>
-                                <option value="33">!</option>
+                                <expand macro="comment_char_options" />
                             </param>
                         </when>
                         <when value="prepend_line_num"/>
--- a/query_db.py	Tue Jul 18 09:06:47 2017 -0400
+++ b/query_db.py	Fri Aug 18 16:47:57 2017 -0400
@@ -56,12 +56,12 @@
     exit(0)


-def run_query(conn, query, outputFile, no_header=False):
+def run_query(conn, query, outputFile, no_header=False, comment_char='#'):
     cur = conn.cursor()
     results = cur.execute(query)
     if not no_header:
-        outputFile.write("#%s\n" % '\t'.join(
-            [str(col[0]) for col in cur.description]))
+        outputFile.write("%s%s\n" % (comment_char, '\t'.join(
+            [str(col[0]) for col in cur.description])))
     for i, row in enumerate(results):
         outputFile.write("%s\n" % '\t'.join(
             [str(val) if val is not None else '' for val in row]))
--- a/query_tabular.py	Tue Jul 18 09:06:47 2017 -0400
+++ b/query_tabular.py	Fri Aug 18 16:47:57 2017 -0400
@@ -56,6 +56,8 @@
     parser.add_option('-n', '--no_header', dest='no_header', default=False,
                       action='store_true',
                       help='Include a column headers line')
+    parser.add_option('-c', '--comment_char', dest='comment_char', default='',
+                      help='comment character to prefix column header line')
     parser.add_option('-o', '--output', dest='output', default=None,
                       help='Output file for query results')
     (options, args) = parser.parse_args()
@@ -80,6 +82,8 @@
             table['comment_char'] if 'comment_char' in table else None
         column_names =\
             table['column_names'] if 'column_names' in table else None
+        firstlinenames =\
+            table['firstlinenames'] if 'firstlinenames' in table else False
         if column_names:
             load_named_columns =\
                 table['load_named_columns']\
@@ -93,6 +97,7 @@
             table['pkey_autoincr'] if 'pkey_autoincr' in table else None
         create_table(get_connection(options.sqlitedb), path, table_name,
                      pkey_autoincr=pkey_autoincr,
+                     firstlinenames=firstlinenames,
                      column_names=column_names,
                      skip=comment_lines,
                      comment_char=comment_char,
@@ -128,7 +133,8 @@
     else:
         try:
             run_query(get_connection(options.sqlitedb), query, outputFile,
-                      no_header=options.no_header)
+                      no_header=options.no_header,
+                      comment_char=options.comment_char)
         except Exception as e:
             exit('Error: %s' % (e))
--- a/sqlite_to_tabular.py	Tue Jul 18 09:06:47 2017 -0400
+++ b/sqlite_to_tabular.py	Fri Aug 18 16:47:57 2017 -0400
@@ -21,6 +21,8 @@
     parser.add_option('-n', '--no_header', dest='no_header', default=False,
                       action='store_true',
                       help='Include a column headers line')
+    parser.add_option('-c', '--comment_char', dest='comment_char', default='',
+                      help='comment character to prefix column header line')
     parser.add_option('-o', '--output', dest='output', default=None,
                       help='Output file for query results')
     (options, args) = parser.parse_args()
@@ -51,7 +53,8 @@
     else:
         try:
             run_query(get_connection(options.sqlitedb), query, outputFile,
-                      no_header=options.no_header)
+                      no_header=options.no_header,
+                      comment_char=options.comment_char)
         except Exception as e:
             exit('Error: %s' % (e))
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/psm_report.tsv	Fri Aug 18 16:47:57 2017 -0400
@@ -0,0 +1,10 @@
+
+	Protein(s)	Sequence	AAs Before	AAs After	Position	Modified Sequence	Variable Modifications	Fixed Modifications	Spectrum File	Spectrum Title	Spectrum Scan Number	RT	m/z	Measured Charge	Identification Charge	Theoretical Mass	Isotope Number	Precursor m/z Error [ppm]	Localization Confidence	Probabilistic PTM score	D-score	Confidence [%]	Validation
+1	NP_116558	PYANQPTVR	M	IT	2	NH2-PYANQPTVR-COOH			trimmed_tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.mgf	tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.04679.04679.2	4679	-1.0	523.272583	2+	2+	1044.53524305008	0	-4.4240452979909675				100.0	Doubtful
+2	NP_443137, NP_443137_S1016F	DANTQVHTLR	YK; YK	KM; KM	443; 443	NH2-DANTQVHTLR-COOH			trimmed_tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.mgf	tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.03894.03894.2	3894	-1.0	577.799622	2+	2+	1153.5839841476504	0	0.6117338355350196				95.0	Doubtful
+3	NP_066544_R21W	SSWAGLQFPVGR	TR	VH	19	NH2-SSWAGLQFPVGR-COOH			trimmed_tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.mgf	tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.20932.20932.2	20932	-1.0	652.843567	2+	2+	1303.6673198487902	0	4.029478922381296				100.0	Doubtful
+4	NP_112092_rs7285167:R182C	AQACNLDQSGTNVAK	AR	VM	179	NH2-AQAC<cmm>NLDQSGTNVAK-COOH		Carbamidomethylation of C(4)	trimmed_tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.mgf	tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.05257.05257.2	5257	-1.0	788.87384	2+	2+	1575.7311189571503	0	1.2727711831177586				100.0	Doubtful
+5	NP_001273432_F75C, NP_001273433_F77C, NP_001273434_F77C, NP_006635_F75C	ACNDPFIQK	GR; GR; GR; GR	EK; EK; EK; EK	74; 76; 76; 74	NH2-AC<cmm>NDPFIQK-COOH		Carbamidomethylation of C(2)	trimmed_tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.mgf	tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.09560.09560.2	9560	-1.0	546.760803	2+	2+	1091.50697969691	0	0.06709466946167521				94.73684210526316	Doubtful
+6	NP_001028196, NP_001244919, NP_036311	LQLLPESFICK	KR; KR; KR	EK; EK; EK	198; 163; 277	NH2-LQLLPESFIC<cmm>K-COOH		Carbamidomethylation of C(10)	trimmed_tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.mgf	tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.22345.22345.2	22345	-1.0	674.370911	2+	2+	1346.7268088708604	0	0.3412036795550375				99.38650306748467	Doubtful
+7	NP_001909_rs72973763:I239V,rs12021720:S384G	DMTVPVLVSKPPVFTGK	PK	DK	234	NH2-DMTVPVLVSKPPVFTGK-COOH			trimmed_tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.mgf	tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.20464.20464.2	20464	-1.0	908.008545	2+	2+	1814.0011927988103	0	0.7402290254293575				94.73684210526316	Doubtful
+8	NP_001265138, NP_001265138_rs2274136:A858T,F879L, NP_001265139, NP_001265139_rs2274136:A785T,F806L, NP_005115, NP_005115_rs2274136:A827T,F848L	FGVSSESKPEEVK	FK; FK; FK; FK; FK; FK	KD; KD; KD; KD; KD; KD	992; 992; 919; 919; 961; 961	NH2-FGVSSESKPEEVK-COOH			trimmed_tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.mgf	tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.06665.06665.2	6665	-1.0	711.858643	2+	2+	1421.7038244968105	0	-0.7666055674357796				100.0	Doubtful
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/psm_report_out1.tsv	Fri Aug 18 16:47:57 2017 -0400
@@ -0,0 +1,9 @@
+Scan	Sequence	Protein(s)	Position	m/z	Precursor m/z Error
+1	PYANQPTVR	NP_116558	2	523.272583	-4.42404529799
+2	DANTQVHTLR	NP_443137, NP_443137_S1016F	443; 443	577.799622	0.611733835535
+3	SSWAGLQFPVGR	NP_066544_R21W	19	652.843567	4.02947892238
+4	AQACNLDQSGTNVAK	NP_112092_rs7285167:R182C	179	788.87384	1.27277118312
+5	ACNDPFIQK	NP_001273432_F75C, NP_001273433_F77C, NP_001273434_F77C, NP_006635_F75C	74; 76; 76; 74	546.760803	0.0670946694617
+6	LQLLPESFICK	NP_001028196, NP_001244919, NP_036311	198; 163; 277	674.370911	0.341203679555
+7	DMTVPVLVSKPPVFTGK	NP_001909_rs72973763:I239V,rs12021720:S384G	234	908.008545	0.740229025429
+8	FGVSSESKPEEVK	NP_001265138, NP_001265138_rs2274136:A858T,F879L, NP_001265139, NP_001265139_rs2274136:A785T,F806L, NP_005115, NP_005115_rs2274136:A827T,F848L	992; 992; 919; 919; 961; 961	711.858643	-0.766605567436
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/psm_report_out2.tsv	Fri Aug 18 16:47:57 2017 -0400
@@ -0,0 +1,5 @@
+Scan	m/z	Precursor m/z Error	Sequence	Protein(s)
+1	523.272583	-4.42404529799	PYANQPTVR	NP_116558
+3	652.843567	4.02947892238	SSWAGLQFPVGR	NP_066544_R21W
+4	788.87384	1.27277118312	AQACNLDQSGTNVAK	NP_112092_rs7285167:R182C
+7	908.008545	0.740229025429	DMTVPVLVSKPPVFTGK	NP_001909_rs72973763:I239V,rs12021720:S384G