Mercurial > repos > iuc > column_remove_by_header
diff column_remove_by_header.py @ 1:2040e4c2750a draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/column_remove_by_header commit 6a87239f5139390963930673f36b869cde76fdf9
author | iuc |
---|---|
date | Sat, 16 Jul 2022 06:55:49 +0000 |
parents | 372967836e98 |
children |
line wrap: on
line diff
--- a/column_remove_by_header.py Wed Apr 12 17:17:29 2017 -0400
+++ b/column_remove_by_header.py Sat Jul 16 06:55:49 2022 +0000
@@ -1,35 +1,64 @@
 #!/usr/bin/env python
 
-import subprocess
-import sys
-
-AWK_CMD = """BEGIN{FS="%s"; OFS="%s";} {print %s;}"""
+import argparse
 
-input_filename = sys.argv[1]
-output_filename = sys.argv[2]
-delimiter = sys.argv[3]
-keep_columns = sys.argv[4]
-strip_characters = sys.argv[5]
-
-if keep_columns == "--keep":
-    keep_columns = True
-else:
-    keep_columns = False
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "-i", "--input", required=True, help="Tabular Input File Name"
+)
+parser.add_argument(
+    "-o", "--output", required=True, help="Tabular Output File"
+)
+parser.add_argument(
+    "-c", "--columns", dest="names", nargs="+",
+    help="Column headers to operate on"
+)
+parser.add_argument("-d", "--delimiter", default='\t', help="Column delimiter")
+parser.add_argument(
+    "-k", "--keep", action="store_true",
+    help="Drop non-selected columns instead of selected ones"
+)
+parser.add_argument(
+    "-s", "--strip_chars", default=None,
+    help="Ignore these leading characters when extracting the name of the "
+    "first line"
+)
+parser.add_argument(
+    "--unicode-escaped-cols", action="store_true",
+    help="Indicate that the --columns names use unicode escape sequences "
+    "that should be decoded back before comparing them to the input file "
+    "header"
+)
+args = parser.parse_args()
 
-names = []
-for name in sys.argv[6:]:
-    names.append( name )
+# The delimiter can only be parsed reliably from the input if it's from
+# the ASCII range of characters
+try:
+    bytes_delimiter = args.delimiter.encode(encoding="ascii")
+except UnicodeEncodeError:
+    raise ValueError("Only ASCII characters are allowed as column delimiters")
+# handle unicode escape sequences in --columns argument
+if args.unicode_escaped_cols:
+    names = [n.encode().decode('unicode_escape') for n in args.names]
+else:
+    names = args.names
 
-header = None
-with open( input_filename, 'r' ) as fh:
-    header = fh.readline().strip( '\r\n' )
-header = header.split( delimiter )
-columns = []
-for i, key in enumerate( header, 1 ):
-    if i == 1 and strip_characters:
-        key = key.lstrip( strip_characters )
-    if ( keep_columns and key in names ) or ( not keep_columns and key not in names ):
-        columns.append( i )
-print( "Kept", len( columns ), "of", len( header ), "columns." )
-awk_cmd = AWK_CMD % ( delimiter, delimiter, ",".join( map( lambda x: "$%s" % x, columns ) ) )
-sys.exit( subprocess.call( [ 'gawk', awk_cmd, input_filename ], stdout=open( output_filename, 'wb+' ), shell=False ) )
+with open(args.input, "r", encoding="utf-8", errors="surrogateescape") as fh:
+    header_cols = fh.readline().strip("\n").split(args.delimiter)
+columns = set()
+for i, key in enumerate(header_cols):
+    if i == 0 and args.strip_chars:
+        key = key.lstrip(args.strip_chars)
+    if (args.keep and key in names) or (not args.keep and key not in names):
+        columns.add(i)
+print("Kept", len(columns), "of", len(header_cols), "columns.")
+
+with open(args.input, "rb") as i:
+    with open(args.output, "wb") as o:
+        for line in i:
+            fields = [
+                f for idx, f in enumerate(
+                    line.rstrip(b"\r\n").split(bytes_delimiter)
+                ) if idx in columns
+            ]
+            o.write(bytes_delimiter.join(fields) + b"\n")