Mercurial > repos > iuc > filter_tabular
changeset 1:cd2a99849f8b draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/query_tabular commit 81f69ad5f39223059c40501e55ac777d3feca845
author | iuc |
---|---|
date | Fri, 18 Aug 2017 16:47:57 -0400 |
parents | 6fbd9d25ceef |
children | 4a9b1cb3639b |
files | filter_tabular.xml filters.py load_db.py macros.xml query_db.py query_tabular.py sqlite_to_tabular.py test-data/psm_report.tsv test-data/psm_report_out1.tsv test-data/psm_report_out2.tsv |
diffstat | 10 files changed, 279 insertions(+), 47 deletions(-) [+] |
line wrap: on
line diff
--- a/filter_tabular.xml Tue Jul 18 09:06:47 2017 -0400 +++ b/filter_tabular.xml Fri Aug 18 16:47:57 2017 -0400 @@ -1,4 +1,4 @@ -<tool id="filter_tabular" name="Filter Tabular" version="1.0.0"> +<tool id="filter_tabular" name="Filter Tabular" version="2.0.0"> <description></description> <macros>
--- a/filters.py Tue Jul 18 09:06:47 2017 -0400 +++ b/filters.py Fri Aug 18 16:47:57 2017 -0400 @@ -33,7 +33,8 @@ r = filter_dict['replace'] c = int(filter_dict['column']) - 1 self.func = lambda i, l: '\t'.join( - [x if j != c else re.sub(p, r, x) for j, x in enumerate(l.split('\t'))]) + [x if j != c else re.sub(p, r, x) + for j, x in enumerate(l.split('\t'))]) elif filter_dict['filter'] == 'prepend_line_num': self.func = lambda i, l: '%d\t%s' % (i, l) elif filter_dict['filter'] == 'append_line_num':
--- a/load_db.py Tue Jul 18 09:06:47 2017 -0400 +++ b/load_db.py Fri Aug 18 16:47:57 2017 -0400 @@ -2,11 +2,165 @@ from __future__ import print_function +import re import sys from filters import TabularReader +SQLITE_KEYWORDS = [ + 'ABORT', + 'ACTION', + 'ADD', + 'AFTER', + 'ALL', + 'ALTER', + 'ANALYZE', + 'AND', + 'AS', + 'ASC', + 'ATTACH', + 'AUTOINCREMENT', + 'BEFORE', + 'BEGIN', + 'BETWEEN', + 'BY', + 'CASCADE', + 'CASE', + 'CAST', + 'CHECK', + 'COLLATE', + 'COLUMN', + 'COMMIT', + 'CONFLICT', + 'CONSTRAINT', + 'CREATE', + 'CROSS', + 'CURRENT_DATE', + 'CURRENT_TIME', + 'CURRENT_TIMESTAMP', + 'DATABASE', + 'DEFAULT', + 'DEFERRABLE', + 'DEFERRED', + 'DELETE', + 'DESC', + 'DETACH', + 'DISTINCT', + 'DROP', + 'EACH', + 'ELSE', + 'END', + 'ESCAPE', + 'EXCEPT', + 'EXCLUSIVE', + 'EXISTS', + 'EXPLAIN', + 'FAIL', + 'FOR', + 'FOREIGN', + 'FROM', + 'FULL', + 'GLOB', + 'GROUP', + 'HAVING', + 'IF', + 'IGNORE', + 'IMMEDIATE', + 'IN', + 'INDEX', + 'INDEXED', + 'INITIALLY', + 'INNER', + 'INSERT', + 'INSTEAD', + 'INTERSECT', + 'INTO', + 'IS', + 'ISNULL', + 'JOIN', + 'KEY', + 'LEFT', + 'LIKE', + 'LIMIT', + 'MATCH', + 'NATURAL', + 'NO', + 'NOT', + 'NOTNULL', + 'NULL', + 'OF', + 'OFFSET', + 'ON', + 'OR', + 'ORDER', + 'OUTER', + 'PLAN', + 'PRAGMA', + 'PRIMARY', + 'QUERY', + 'RAISE', + 'RECURSIVE', + 'REFERENCES', + 'REGEXP', + 'REINDEX', + 'RELEASE', + 'RENAME', + 'REPLACE', + 'RESTRICT', + 'RIGHT', + 'ROLLBACK', + 'ROW', + 'SAVEPOINT', + 'SELECT', + 'SET', + 'TABLE', + 'TEMP', + 'TEMPORARY', + 'THEN', + 'TO', + 'TRANSACTION', + 'TRIGGER', + 'UNION', + 'UNIQUE', + 'UPDATE', + 'USING', + 'VACUUM', + 'VALUES', + 'VIEW', + 'VIRTUAL', + 'WHEN', + 'WHERE', + 'WITH', + 'WITHOUT' +] + + +def get_valid_column_name(name): + valid_name = name + if not name or not name.strip(): + return None + elif name.upper() in SQLITE_KEYWORDS: + valid_name = '"%s"' % name + elif re.match('^[a-zA-Z]\w*$', name): + pass + elif re.match('^"[^"]+"$', name): + pass + elif re.match('^\[[^\[\]]*\]$', name): + pass + elif re.match("^`[^`]+`$", name): + pass + elif name.find('"') < 0: + valid_name = '"%s"' % name + elif name.find('[') < 0 and name.find(']') < 0: + valid_name = '[%s]' % name + elif name.find('`') < 0: + valid_name = '`%s`' % name + elif name.find("'") < 0: + valid_name = "'%s'" % name + return valid_name + + def getValueType(val): if val or 0. == val: try: @@ -23,14 +177,19 @@ def get_column_def(file_path, table_name, skip=0, comment_char='#', column_names=None, max_lines=100, load_named_columns=False, - filters=None): + firstlinenames=False, filters=None): col_pref = ['TEXT', 'REAL', 'INTEGER', None] col_types = [] col_idx = None + col_names = [] try: tr = TabularReader(file_path, skip=skip, comment_char=comment_char, col_idx=None, filters=filters) for linenum, fields in enumerate(tr): + if linenum == 0 and firstlinenames: + col_names = [get_valid_column_name(name) or 'c%d' % (i + 1) + for i, name in enumerate(fields)] + continue if linenum > max_lines: break try: @@ -48,24 +207,24 @@ for i, col_type in enumerate(col_types): if not col_type: col_types[i] = 'TEXT' + if not col_names: + col_names = ['c%d' % i for i in range(1, len(col_types) + 1)] if column_names: - col_names = [] if load_named_columns: col_idx = [] + cnames = [] for i, cname in enumerate( [cn.strip() for cn in column_names.split(',')]): if cname != '': col_idx.append(i) - col_names.append(cname) + cnames.append(cname) col_types = [col_types[i] for i in col_idx] + col_names = cnames else: - col_names = ['c%d' % i for i in range(1, len(col_types) + 1)] for i, cname in enumerate( [cn.strip() for cn in column_names.split(',')]): if cname and i < len(col_names): col_names[i] = cname - else: - col_names = ['c%d' % i for i in range(1, len(col_types) + 1)] col_def = [] for i, col_name in enumerate(col_names): col_def.append('%s %s' % (col_names[i], col_types[i])) @@ -74,12 +233,14 @@ def create_table(conn, file_path, table_name, skip=0, comment_char='#', pkey_autoincr=None, column_names=None, - load_named_columns=False, filters=None, - unique_indexes=[], indexes=[]): + load_named_columns=False, firstlinenames=False, + filters=None, unique_indexes=[], indexes=[]): col_names, col_types, col_def, col_idx = \ get_column_def(file_path, table_name, skip=skip, comment_char=comment_char, column_names=column_names, - load_named_columns=load_named_columns, filters=filters) + load_named_columns=load_named_columns, + firstlinenames=firstlinenames, + filters=filters) col_func = [float if t == 'REAL' else int if t == 'INTEGER' else str for t in col_types] table_def = 'CREATE TABLE %s (\n %s%s\n);' % ( @@ -98,19 +259,13 @@ c.execute(table_def) conn.commit() c.close() - for i, index in enumerate(unique_indexes): - index_name = 'idx_uniq_%s_%d' % (table_name, i) - index_columns = index.split(',') - create_index(conn, table_name, index_name, index_columns, - unique=True) - for i, index in enumerate(indexes): - index_name = 'idx_%s_%d' % (table_name, i) - index_columns = index.split(',') - create_index(conn, table_name, index_name, index_columns) + c = conn.cursor() tr = TabularReader(file_path, skip=skip, comment_char=comment_char, col_idx=col_idx, filters=filters) for linenum, fields in enumerate(tr): + if linenum == 0 and firstlinenames: + continue data_lines += 1 try: vals = [col_func[i](x) @@ -121,15 +276,28 @@ file=sys.stderr) conn.commit() c.close() + for i, index in enumerate(unique_indexes): + index_name = 'idx_uniq_%s_%d' % (table_name, i) + index_columns = index.split(',') + create_index(conn, table_name, index_name, index_columns, + unique=True) + for i, index in enumerate(indexes): + index_name = 'idx_%s_%d' % (table_name, i) + index_columns = index.split(',') + create_index(conn, table_name, index_name, index_columns) except Exception as e: exit('Error: %s' % (e)) def create_index(conn, table_name, index_name, index_columns, unique=False): - index_def = "CREATE %s INDEX %s on %s(%s)" % ( + index_def = 'CREATE %s INDEX %s on %s(%s)' % ( 'UNIQUE' if unique else '', index_name, table_name, ','.join(index_columns)) - c = conn.cursor() - c.execute(index_def) - conn.commit() - c.close() + try: + c = conn.cursor() + c.execute(index_def) + conn.commit() + c.close() + except Exception as e: + print('Failed: %s err: %s' % (index_def, e), file=sys.stderr) + raise(e)
--- a/macros.xml Tue Jul 18 09:06:47 2017 -0400 +++ b/macros.xml Fri Aug 18 16:47:57 2017 -0400 @@ -60,6 +60,52 @@ #end for ]]> </token> + <token name="@RESULT_HEADER@"> +<![CDATA[ + #if $query_result.header == 'yes': + #if $query_result.header_prefix: + #set $header_prefix = chr(int(str($query_result.header_prefix))) + --comment_char='$header_prefix' + #end if + #else + --no_header + #end if +]]> + </token> + <xml name="comment_char_options"> + <option value="62">></option> + <option value="64">@</option> + <option value="43">+</option> + <option value="60"><</option> + <option value="42">*</option> + <option value="45">-</option> + <option value="61">=</option> + <option value="124">|</option> + <option value="63">?</option> + <option value="36">$</option> + <option value="46">.</option> + <option value="58">:</option> + <option value="38">&</option> + <option value="37">%</option> + <option value="94">^</option> + <option value="35" selected="true">#</option> + <option value="33">!</option> + </xml> + <xml name="result_results_header_line"> + <conditional name="query_result"> + <param name="header" type="select" label="include query result column headers"> + <option value="yes">Yes</option> + <option value="no">No</option> + </param> + <when value="yes"> + <param name="header_prefix" type="select" optional="true" label="Prefix character for column_header line"> + <option value="">no comment character prefix</option> + <expand macro="comment_char_options" /> + </param> + </when> + <when value="no"/> + </conditional> + </xml> <xml name="macro_line_filters"> <repeat name="linefilters" title="Filter Tabular Input Lines"> <conditional name="filter"> @@ -81,23 +127,7 @@ </when> <when value="comment"> <param name="comment_char" type="select" display="checkboxes" multiple="True" label="Ignore lines beginning with these characters" help="lines beginning with these are skipped"> - <option value="62">></option> - <option value="64">@</option> - <option value="43">+</option> - <option value="60"><</option> - <option value="42">*</option> - <option value="45">-</option> - <option value="61">=</option> - <option value="124">|</option> - <option value="63">?</option> - <option value="36">$</option> - <option value="46">.</option> - <option value="58">:</option> - <option value="38">&</option> - <option value="37">%</option> - <option value="94">^</option> - <option value="35">#</option> - <option value="33">!</option> + <expand macro="comment_char_options" /> </param> </when> <when value="prepend_line_num"/>
--- a/query_db.py Tue Jul 18 09:06:47 2017 -0400 +++ b/query_db.py Fri Aug 18 16:47:57 2017 -0400 @@ -56,12 +56,12 @@ exit(0) -def run_query(conn, query, outputFile, no_header=False): +def run_query(conn, query, outputFile, no_header=False, comment_char='#'): cur = conn.cursor() results = cur.execute(query) if not no_header: - outputFile.write("#%s\n" % '\t'.join( - [str(col[0]) for col in cur.description])) + outputFile.write("%s%s\n" % (comment_char, '\t'.join( + [str(col[0]) for col in cur.description]))) for i, row in enumerate(results): outputFile.write("%s\n" % '\t'.join( [str(val) if val is not None else '' for val in row]))
--- a/query_tabular.py Tue Jul 18 09:06:47 2017 -0400 +++ b/query_tabular.py Fri Aug 18 16:47:57 2017 -0400 @@ -56,6 +56,8 @@ parser.add_option('-n', '--no_header', dest='no_header', default=False, action='store_true', help='Include a column headers line') + parser.add_option('-c', '--comment_char', dest='comment_char', default='', + help='comment character to prefix column header line') parser.add_option('-o', '--output', dest='output', default=None, help='Output file for query results') (options, args) = parser.parse_args() @@ -80,6 +82,8 @@ table['comment_char'] if 'comment_char' in table else None column_names =\ table['column_names'] if 'column_names' in table else None + firstlinenames =\ + table['firstlinenames'] if 'firstlinenames' in table else False if column_names: load_named_columns =\ table['load_named_columns']\ @@ -93,6 +97,7 @@ table['pkey_autoincr'] if 'pkey_autoincr' in table else None create_table(get_connection(options.sqlitedb), path, table_name, pkey_autoincr=pkey_autoincr, + firstlinenames=firstlinenames, column_names=column_names, skip=comment_lines, comment_char=comment_char, @@ -128,7 +133,8 @@ else: try: run_query(get_connection(options.sqlitedb), query, outputFile, - no_header=options.no_header) + no_header=options.no_header, + comment_char=options.comment_char) except Exception as e: exit('Error: %s' % (e))
--- a/sqlite_to_tabular.py Tue Jul 18 09:06:47 2017 -0400 +++ b/sqlite_to_tabular.py Fri Aug 18 16:47:57 2017 -0400 @@ -21,6 +21,8 @@ parser.add_option('-n', '--no_header', dest='no_header', default=False, action='store_true', help='Include a column headers line') + parser.add_option('-c', '--comment_char', dest='comment_char', default='', + help='comment character to prefix column header line') parser.add_option('-o', '--output', dest='output', default=None, help='Output file for query results') (options, args) = parser.parse_args() @@ -51,7 +53,8 @@ else: try: run_query(get_connection(options.sqlitedb), query, outputFile, - no_header=options.no_header) + no_header=options.no_header, + comment_char=options.comment_char) except Exception as e: exit('Error: %s' % (e))
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/psm_report.tsv Fri Aug 18 16:47:57 2017 -0400 @@ -0,0 +1,10 @@ + + Protein(s) Sequence AAs Before AAs After Position Modified Sequence Variable Modifications Fixed Modifications Spectrum File Spectrum Title Spectrum Scan Number RT m/z Measured Charge Identification Charge Theoretical Mass Isotope Number Precursor m/z Error [ppm] Localization Confidence Probabilistic PTM score D-score Confidence [%] Validation +1 NP_116558 PYANQPTVR M IT 2 NH2-PYANQPTVR-COOH trimmed_tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.mgf tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.04679.04679.2 4679 -1.0 523.272583 2+ 2+ 1044.53524305008 0 -4.4240452979909675 100.0 Doubtful +2 NP_443137, NP_443137_S1016F DANTQVHTLR YK; YK KM; KM 443; 443 NH2-DANTQVHTLR-COOH trimmed_tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.mgf tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.03894.03894.2 3894 -1.0 577.799622 2+ 2+ 1153.5839841476504 0 0.6117338355350196 95.0 Doubtful +3 NP_066544_R21W SSWAGLQFPVGR TR VH 19 NH2-SSWAGLQFPVGR-COOH trimmed_tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.mgf tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.20932.20932.2 20932 -1.0 652.843567 2+ 2+ 1303.6673198487902 0 4.029478922381296 100.0 Doubtful +4 NP_112092_rs7285167:R182C AQACNLDQSGTNVAK AR VM 179 NH2-AQAC<cmm>NLDQSGTNVAK-COOH Carbamidomethylation of C(4) trimmed_tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.mgf tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.05257.05257.2 5257 -1.0 788.87384 2+ 2+ 1575.7311189571503 0 1.2727711831177586 100.0 Doubtful +5 NP_001273432_F75C, NP_001273433_F77C, NP_001273434_F77C, NP_006635_F75C ACNDPFIQK GR; GR; GR; GR EK; EK; EK; EK 74; 76; 76; 74 NH2-AC<cmm>NDPFIQK-COOH Carbamidomethylation of C(2) trimmed_tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.mgf tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.09560.09560.2 9560 -1.0 546.760803 2+ 2+ 1091.50697969691 0 0.06709466946167521 94.73684210526316 Doubtful +6 NP_001028196, NP_001244919, NP_036311 LQLLPESFICK KR; KR; KR EK; EK; EK 198; 163; 277 NH2-LQLLPESFIC<cmm>K-COOH Carbamidomethylation of C(10) trimmed_tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.mgf tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.22345.22345.2 22345 -1.0 674.370911 2+ 2+ 1346.7268088708604 0 0.3412036795550375 99.38650306748467 Doubtful +7 NP_001909_rs72973763:I239V,rs12021720:S384G DMTVPVLVSKPPVFTGK PK DK 234 NH2-DMTVPVLVSKPPVFTGK-COOH trimmed_tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.mgf tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.20464.20464.2 20464 -1.0 908.008545 2+ 2+ 1814.0011927988103 0 0.7402290254293575 94.73684210526316 Doubtful +8 NP_001265138, NP_001265138_rs2274136:A858T,F879L, NP_001265139, NP_001265139_rs2274136:A785T,F806L, NP_005115, NP_005115_rs2274136:A827T,F848L FGVSSESKPEEVK FK; FK; FK; FK; FK; FK KD; KD; KD; KD; KD; KD 992; 992; 919; 919; 961; 961 NH2-FGVSSESKPEEVK-COOH trimmed_tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.mgf tgriffin_cguerrer_20150826_RP_MCF7_hipH_frac_12n28.06665.06665.2 6665 -1.0 711.858643 2+ 2+ 1421.7038244968105 0 -0.7666055674357796 100.0 Doubtful
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/psm_report_out1.tsv Fri Aug 18 16:47:57 2017 -0400 @@ -0,0 +1,9 @@ +Scan Sequence Protein(s) Position m/z Precursor m/z Error +1 PYANQPTVR NP_116558 2 523.272583 -4.42404529799 +2 DANTQVHTLR NP_443137, NP_443137_S1016F 443; 443 577.799622 0.611733835535 +3 SSWAGLQFPVGR NP_066544_R21W 19 652.843567 4.02947892238 +4 AQACNLDQSGTNVAK NP_112092_rs7285167:R182C 179 788.87384 1.27277118312 +5 ACNDPFIQK NP_001273432_F75C, NP_001273433_F77C, NP_001273434_F77C, NP_006635_F75C 74; 76; 76; 74 546.760803 0.0670946694617 +6 LQLLPESFICK NP_001028196, NP_001244919, NP_036311 198; 163; 277 674.370911 0.341203679555 +7 DMTVPVLVSKPPVFTGK NP_001909_rs72973763:I239V,rs12021720:S384G 234 908.008545 0.740229025429 +8 FGVSSESKPEEVK NP_001265138, NP_001265138_rs2274136:A858T,F879L, NP_001265139, NP_001265139_rs2274136:A785T,F806L, NP_005115, NP_005115_rs2274136:A827T,F848L 992; 992; 919; 919; 961; 961 711.858643 -0.766605567436
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/psm_report_out2.tsv Fri Aug 18 16:47:57 2017 -0400 @@ -0,0 +1,5 @@ +Scan m/z Precursor m/z Error Sequence Protein(s) +1 523.272583 -4.42404529799 PYANQPTVR NP_116558 +3 652.843567 4.02947892238 SSWAGLQFPVGR NP_066544_R21W +4 788.87384 1.27277118312 AQACNLDQSGTNVAK NP_112092_rs7285167:R182C +7 908.008545 0.740229025429 DMTVPVLVSKPPVFTGK NP_001909_rs72973763:I239V,rs12021720:S384G