0
+ − 1 #!/usr/bin/env python
+ − 2 # This tool takes a tab-delimited textfile as input and creates another column in the file which is the result of
+ − 3 # a computation performed on every row in the original file. The tool will skip over invalid lines within the file,
+ − 4 # informing the user about the number of lines skipped.
+ − 5
+ − 6 import sys, re
+ − 7 # These functions may be used in compute expression:
+ − 8 from math import log,exp,sqrt,ceil,floor
+ − 9
+ − 10
+ − 11 assert sys.version_info[:2] >= ( 2, 4 )
+ − 12
+ − 13 def stop_err( msg ):
+ − 14 sys.stderr.write( msg )
+ − 15 sys.exit()
+ − 16
+ − 17 inp_file = sys.argv[1]
+ − 18 out_file = sys.argv[2]
+ − 19 expr = sys.argv[3]
+ − 20 round_result = sys.argv[4]
+ − 21 try:
+ − 22 in_columns = int( sys.argv[5] )
+ − 23 except:
+ − 24 stop_err( "Missing or invalid 'columns' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data." )
+ − 25 if in_columns < 2:
+ − 26 # To be considered tabular, data must fulfill requirements of the sniff.is_column_based() method.
+ − 27 stop_err( "Missing or invalid 'columns' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data." )
+ − 28 try:
+ − 29 in_column_types = sys.argv[6].split( ',' )
+ − 30 except:
+ − 31 stop_err( "Missing or invalid 'column_types' metadata value, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data." )
+ − 32 if len( in_column_types ) != in_columns:
+ − 33 stop_err( "The 'columns' metadata setting does not conform to the 'column_types' metadata setting, click the pencil icon in the history item and select the Auto-detect option to correct it. This tool can only be used with tab-delimited data." )
+ − 34
+ − 35 # Unescape if input has been escaped
+ − 36 mapped_str = {
+ − 37 '__lt__': '<',
+ − 38 '__le__': '<=',
+ − 39 '__eq__': '==',
+ − 40 '__ne__': '!=',
+ − 41 '__gt__': '>',
+ − 42 '__ge__': '>=',
+ − 43 '__sq__': '\'',
+ − 44 '__dq__': '"',
+ − 45 }
+ − 46 for key, value in mapped_str.items():
+ − 47 expr = expr.replace( key, value )
+ − 48
+ − 49 operators = 'is|not|or|and'
+ − 50 builtin_and_math_functions = 'abs|all|any|bin|chr|cmp|complex|divmod|float|hex|int|len|long|max|min|oct|ord|pow|range|reversed|round|sorted|str|sum|type|unichr|unicode|log|exp|sqrt|ceil|floor'
+ − 51 string_and_list_methods = [ name for name in dir('') + dir([]) if not name.startswith('_') ]
+ − 52 whitelist = "^([c0-9\+\-\*\/\(\)\.\'\"><=,:! ]|%s|%s|%s)*$" % (operators, builtin_and_math_functions, '|'.join(string_and_list_methods))
+ − 53 if not re.compile(whitelist).match(expr):
+ − 54 stop_err("Invalid expression")
+ − 55
+ − 56 # Prepare the column variable names and wrappers for column data types
+ − 57 cols, type_casts = [], []
+ − 58 for col in range( 1, in_columns + 1 ):
+ − 59 col_name = "c%d" % col
+ − 60 cols.append( col_name )
+ − 61 col_type = in_column_types[ col - 1 ].strip()
+ − 62 if round_result == 'no' and col_type == 'int':
+ − 63 col_type = 'float'
+ − 64 type_cast = "%s(%s)" % ( col_type, col_name )
+ − 65 type_casts.append( type_cast )
+ − 66
+ − 67 col_str = ', '.join( cols ) # 'c1, c2, c3, c4'
+ − 68 type_cast_str = ', '.join( type_casts ) # 'str(c1), int(c2), int(c3), str(c4)'
+ − 69 assign = "%s = line.split( '\\t' )" % col_str
+ − 70 wrap = "%s = %s" % ( col_str, type_cast_str )
+ − 71 skipped_lines = 0
+ − 72 first_invalid_line = 0
+ − 73 invalid_line = None
+ − 74 lines_kept = 0
+ − 75 total_lines = 0
+ − 76 out = open( out_file, 'wt' )
+ − 77
+ − 78 # Read input file, skipping invalid lines, and perform computation that will result in a new column
+ − 79 code = '''
+ − 80 for i, line in enumerate( file( inp_file ) ):
+ − 81 total_lines += 1
+ − 82 line = line.rstrip( '\\r\\n' )
+ − 83 if not line or line.startswith( '#' ):
+ − 84 skipped_lines += 1
+ − 85 if not invalid_line:
+ − 86 first_invalid_line = i + 1
+ − 87 invalid_line = line
+ − 88 continue
+ − 89 try:
+ − 90 %s
+ − 91 %s
+ − 92 new_val = %s
+ − 93 if round_result == "yes":
+ − 94 new_val = int( round( new_val ) )
+ − 95 new_line = line + '\\t' + str( new_val )
+ − 96 print >> out, new_line
+ − 97 lines_kept += 1
+ − 98 except:
+ − 99 skipped_lines += 1
+ − 100 if not invalid_line:
+ − 101 first_invalid_line = i + 1
+ − 102 invalid_line = line
+ − 103 ''' % ( assign, wrap, expr )
+ − 104
+ − 105 valid_expr = True
+ − 106 try:
+ − 107 exec code
+ − 108 except Exception, e:
+ − 109 out.close()
+ − 110 if str( e ).startswith( 'invalid syntax' ):
+ − 111 valid_expr = False
+ − 112 stop_err( 'Expression "%s" likely invalid. See tool tips, syntax and examples.' % expr )
+ − 113 else:
+ − 114 stop_err( str( e ) )
+ − 115
+ − 116 if valid_expr:
+ − 117 out.close()
+ − 118 valid_lines = total_lines - skipped_lines
+ − 119 print 'Creating column %d with expression %s' % ( in_columns + 1, expr )
+ − 120 if valid_lines > 0:
+ − 121 print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines )
+ − 122 else:
+ − 123 print 'Possible invalid expression "%s" or non-existent column referenced. See tool tips, syntax and examples.' % expr
+ − 124 if skipped_lines > 0:
+ − 125 print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )