diff column_regex_substitution.py @ 0:12b740c4cbc1 draft default tip

planemo upload for repository https://github.com/blankenberg/tools-blankenberg/tree/master/tools/column_regex_substitution commit 78936dc6be1747303d4cbfd80d09e4cfd1cbf292
author blankenberg
date Fri, 07 Sep 2018 10:29:30 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/column_regex_substitution.py	Fri Sep 07 10:29:30 2018 -0400
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+#Dan Blankenberg
+
+"""
+A script for using regex substitutions on columns.
+"""
+
+import optparse
+import re
+import sys
+import string
+
+# Version string reported by the --version option.
+VERSION = "0.0.1"
+
+# Characters stripped from both ends of each requested column value before it
+# is converted with int(): every printable character except the digits and ','.
+# This lets a value such as "c3" be read as 3. The order of the characters in
+# this string is irrelevant because it is only used as a strip() character set.
+COLUMN_STRIP_VALUES = "".join( set( string.printable ) - set( string.digits ) - set(',') )
+
+def get_provided_columns( provided_value, column_offset ):
+    """
+    Parse a comma-separated column specification into sorted column indexes.
+
+    Each comma-separated item has the characters in COLUMN_STRIP_VALUES
+    (everything printable except digits and ',') stripped from both ends, is
+    converted to an integer and has column_offset added to it, e.g. "c3,c1"
+    with an offset of -1 yields [0, 2].
+
+    provided_value: string such as "1,3" or "c1,c3", or None when no columns
+                    were requested.
+    column_offset:  integer added to every parsed column number.
+
+    Returns the indexes sorted in ascending order, or None when provided_value
+    is None, empty, or contains any item that is not a number after stripping
+    (a single bad item invalidates the whole specification).
+    """
+    try:
+        rval = sorted( int( value.strip( COLUMN_STRIP_VALUES ) ) + column_offset
+                       for value in provided_value.split( ',' ) )
+    except ( AttributeError, TypeError, ValueError ):
+        # AttributeError: provided_value is None (option not given).
+        # ValueError: an item had no digits left, or digits split by other characters.
+        # TypeError: column_offset cannot be added to an integer.
+        # Other exceptions (e.g. KeyboardInterrupt) are deliberately not swallowed.
+        return None
+    return rval or None
+
+
+def __main__():
+    """
+    Command-line entry point: apply a regex substitution to columns of a file.
+
+    Reads --input line by line, drops the first --skip lines, splits each
+    remaining line on --delimiter (the whole line is one field when no
+    delimiter, or an empty one, is given), replaces every match of --pattern
+    with --replacement in the selected --columns (all fields when --columns is
+    missing or cannot be parsed) and writes the re-joined line, terminated by
+    a newline, to --output.
+
+    Exits with status 0 after printing the version when --version is given and
+    with status 1 after printing the usage when --pattern, --replacement,
+    --input or --output is missing.
+    """
+    parser = optparse.OptionParser()
+    parser.add_option('--pattern', action='store', default=None,
+                      help='pattern string')
+    parser.add_option('--replacement', action='store', default=None,
+                      help='replacement string')
+    parser.add_option('--input', action='store', default=None,
+                      help='Filename of input file')
+    parser.add_option('--output', action='store', default=None,
+                      help='Filename of output file')
+    parser.add_option('--delimiter', action='store', default=None,
+                      help='column delimiter')
+    parser.add_option('--columns', action='store', default=None,
+                      help='columns to operate on')
+    parser.add_option('--column_offset', action='store', default=0,
+                      help='offset to apply to columns index to force to zero-based')
+    parser.add_option('--skip', action='store', default=0,
+                      help='Number of lines to skip')
+    parser.add_option('--version', action='store_true', default=False,
+                      help='Show version')
+
+    (options, args) = parser.parse_args()
+
+    # Single-argument parenthesised print behaves the same as the print
+    # statement on Python 2 and is also valid syntax on Python 3.
+    if options.version:
+        print( "blankenberg_python_regex_substitution %s" % ( VERSION ) )
+        sys.exit(0)
+
+    # --input is required as well: without it open() below would fail with a
+    # TypeError instead of showing the usage.
+    if None in [ options.pattern, options.replacement, options.input, options.output ]:
+        parser.print_help()
+        sys.exit(1)
+
+    pattern = options.pattern
+    replacement = options.replacement
+    # Values given on the command line arrive as strings; convert them once.
+    # Comparing the line number with an unconverted --skip string is always
+    # true on Python 2, which silently dropped every input line.
+    column_offset = int( options.column_offset )
+    skip = int( options.skip )
+    print( "Pattern: %s\nReplacement: %s" % ( repr( pattern ), repr( replacement ) ) )
+    pattern = re.compile( pattern )
+    provided_columns = get_provided_columns( options.columns, column_offset )
+    if provided_columns:
+        # Report the columns as the user supplied them, i.e. without the offset.
+        column_str = ", ".join( str( x - column_offset ) for x in provided_columns )
+    else:
+        column_str = 'all'
+    print( "With delimiter %s, on columns: %s" % ( repr( options.delimiter ), column_str ) )
+    # An empty delimiter cannot be used with str.split(), so it is handled like
+    # a missing one: the whole line is treated as a single field.
+    if not options.delimiter:
+        split_func = lambda x: [ x.rstrip( '\n\r' ) ]
+        join_char = ""
+    else:
+        split_func = lambda x: x.rstrip( '\n\r' ).split( options.delimiter )
+        join_char = options.delimiter
+    # Text mode so that lines are str objects, matching the str pattern,
+    # delimiter and strip characters used on them.
+    with open( options.input, 'r' ) as fin:
+        with open( options.output, 'w' ) as fout:
+            for i, line in enumerate( fin ):
+                if i < skip:
+                    # Skipped lines are not copied to the output.
+                    continue
+                fields = split_func( line )
+                field_count = len( fields )
+                if provided_columns:
+                    columns = provided_columns
+                else:
+                    columns = range( field_count )
+                for j in columns:
+                    if j < 0:
+                        # A negative index would silently address a field
+                        # counted from the end of the line; ignore it.
+                        continue
+                    if j >= field_count:
+                        # provided_columns is sorted ascending, so every
+                        # remaining index is out of range for this line too.
+                        break
+                    fields[ j ] = pattern.sub( replacement, fields[ j ] )
+                fout.write( "%s\n" % ( join_char.join( fields ) ) )
+
+# Run the command-line entry point only when this file is executed as a
+# script, not when it is imported as a module.
+if __name__ == "__main__":
+    __main__()