# HG changeset patch
# User blankenberg
# Date 1536330570 14400
# Node ID 12b740c4cbc12ba4a48cd3b29a904f4457cc70cb
# planemo upload for repository https://github.com/blankenberg/tools-blankenberg/tree/master/tools/column_regex_substitution commit 78936dc6be1747303d4cbfd80d09e4cfd1cbf292
# diff -r 000000000000 -r 12b740c4cbc1 column_regex_substitution.py
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/column_regex_substitution.py Fri Sep 07 10:29:30 2018 -0400
#!/usr/bin/env python
#Dan Blankenberg

"""
A script for using regex substitutions on columns.
"""

import optparse
import re
import sys
import string

VERSION = "0.0.1"

# Every printable character except the digits and the comma.  Used with
# str.strip() to peel decoration off both ends of a column token
# (e.g. "c2" or " 2 " -> "2").
COLUMN_STRIP_VALUES = "".join( set( string.printable ) - set( string.digits ) - set(',') )


def get_provided_columns( provided_value, column_offset ):
    """
    Parse a comma-separated column specification into sorted column indexes.

    provided_value -- string such as "1,3" or "c1,c3", or None when no
                      columns were requested.
    column_offset  -- integer added to every parsed column number.

    Returns the sorted list of offset-adjusted indexes, or None when the
    specification is missing, empty, or contains a token that is not an
    integer once the non-digit characters are stripped from its ends.
    """
    if not provided_value:
        # Nothing requested; the caller treats None as "all columns".
        return None
    try:
        rval = sorted( int( token.strip( COLUMN_STRIP_VALUES ) ) + column_offset
                       for token in provided_value.split( ',' ) )
    except ( AttributeError, TypeError, ValueError ):
        # Unparseable specification: keep the original best-effort
        # behaviour of falling back to None rather than aborting.
        return None
    return rval or None


def _build_parser():
    """Create the optparse parser describing the command-line options."""
    parser = optparse.OptionParser()
    parser.add_option('--pattern', action='store', default=None,
                      help='pattern string')
    parser.add_option('--replacement', action='store', default=None,
                      help='replacement string')
    parser.add_option('--input', action='store', default=None,
                      help='Filename of input file')
    parser.add_option('--output', action='store', default=None,
                      help='Filename of output file')
    parser.add_option('--delimiter', action='store', default=None,
                      help='column delimiter')
    parser.add_option('--columns', action='store', default=None,
                      help='columns to operate on')
    # type='int' so the values are integers even when given on the command
    # line; an untyped option would hand back a string.
    parser.add_option('--column_offset', action='store', type='int', default=0,
                      help='offset to apply to columns index to force to zero-based')
    parser.add_option('--skip', action='store', type='int', default=0,
                      help='Number of lines to skip')
    parser.add_option('--version', action='store_true', default=False,
                      help='Show version')
    return parser


def _substitute_fields( fields, pattern, replacement, columns ):
    """
    Apply the compiled pattern to the selected entries of fields, in place.

    columns is an ascending list of zero-based indexes, or None to process
    every field.  Returns the same list object for convenience.
    """
    field_count = len( fields )
    if columns is None:
        columns = range( field_count )
    for j in columns:
        if j >= field_count:
            # columns is sorted ascending, so every later index is out of
            # range for this line as well.
            break
        fields[ j ] = pattern.sub( replacement, fields[ j ] )
    return fields


def __main__():
    """
    Command-line entry point.

    Reads --input line by line, drops the first --skip lines, applies the
    --pattern / --replacement substitution to the selected columns of every
    remaining line and writes the result to --output.  Prints the help text
    and exits with status 1 when a required option is missing.
    """
    parser = _build_parser()
    (options, args) = parser.parse_args()

    if options.version:
        print( "blankenberg_python_regex_substitution %s" % ( VERSION ) )
        sys.exit(0)

    # --input is required too: without it open() below would fail with an
    # unhelpful TypeError.
    if None in [ options.pattern, options.replacement, options.input, options.output ]:
        parser.print_help()
        sys.exit(1)

    pattern = options.pattern
    replacement = options.replacement
    column_offset = options.column_offset
    print( "Pattern: %s\nReplacement: %s" % ( repr( pattern ), repr( replacement ) ) )
    pattern = re.compile( pattern )
    provided_columns = get_provided_columns( options.columns, column_offset )
    if provided_columns:
        # Report the columns in the numbering the user supplied.
        column_str = ", ".join( str( x - column_offset ) for x in provided_columns )
    else:
        column_str = 'all'
    print( "With delimiter %s, on columns: %s" % ( repr( options.delimiter ), column_str ) )
    if options.delimiter is None:
        # No delimiter: the whole line is treated as a single field.
        split_func = lambda x: [ x.rstrip( '\n\r' ) ]
        join_char = ""
    else:
        split_func = lambda x: x.rstrip( '\n\r' ).split( options.delimiter )
        join_char = options.delimiter
    # Text mode so the lines are str and match the str pattern and
    # delimiter on Python 3 as well as Python 2.
    with open( options.input, 'r' ) as fin:
        with open( options.output, 'w' ) as fout:
            for i, line in enumerate( fin ):
                if i < options.skip:
                    # Skipped lines are not copied to the output.
                    continue
                fields = _substitute_fields( split_func( line ), pattern, replacement, provided_columns )
                fout.write( "%s\n" % ( join_char.join( fields ) ) )


if __name__ == "__main__":
    __main__()

# ---------------------------------------------------------------------------
# Remainder of the changeset, kept verbatim as comments.  The XML markup of
# the tool wrapper appears to have been lost before this text was captured.
#
# diff -r 000000000000 -r 12b740c4cbc1 column_regex_substitution.xml
# --- /dev/null Thu Jan 01 00:00:00 1970 +0000
# +++ b/column_regex_substitution.xml Fri Sep 07 10:29:30 2018 -0400
# @@ -0,0 +1,57 @@
# + + by regular expressions + + python + + + + + + python column_regex_substitution.py --version + + + + + + + + + + + + + + + + + + + + + + + + + `_.
+ +]]> + + + diff -r 000000000000 -r 12b740c4cbc1 test-data/column_regex_substitution_in.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/column_regex_substitution_in.tabular Fri Sep 07 10:29:30 2018 -0400 @@ -0,0 +1,10 @@ + Firmicutes Proteobacteria Actinobacteria Deinococcus-Thermus Bacteroidetes Synergistetes Plactomycetes +expected 50 30 10 5 5 0 0 +AA53D 99 61 2 2 16 0 0 +D08KJ/ABH4G 856 905 8 33 12 1 0 +AD891/AD87D 1315 1289 9 43 18 1 0 +AFJRV 268 219 0 4 7 0 1 +AG3A0 448 274 3 6 30 0 0 +AG22P 531 450 0 12 11 0 0 +AH4HV 363 336 2 5 16 0 0 +AHHBT 414 190 1 6 7 0 0 diff -r 000000000000 -r 12b740c4cbc1 test-data/column_regex_substitution_out.tabular --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/column_regex_substitution_out.tabular Fri Sep 07 10:29:30 2018 -0400 @@ -0,0 +1,10 @@ +. Firmicutes Proteobacteria Actinobacteria Deinococcus-Thermus Bacteroidetes Synergistetes Plactomycetes +expected 50 30 10 5 5 0 0 +AA53D 99 61 2 2 16 0 0 +D08KJ/ABH4G 856 905 8 33 12 1 0 +AD891/AD87D 1315 1289 9 43 18 1 0 +AFJRV 268 219 0 4 7 0 1 +AG3A0 448 274 3 6 30 0 0 +AG22P 531 450 0 12 11 0 0 +AH4HV 363 336 2 5 16 0 0 +AHHBT 414 190 1 6 7 0 0