column_remove_by_header: column_remove_by

comparison column_remove_by_header.py @ 1:2040e4c2750a draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/column_remove_by_header commit 6a87239f5139390963930673f36b869cde76fdf9

author	iuc
date	Sat, 16 Jul 2022 06:55:49 +0000
parents	372967836e98
children

comparison

equal deleted inserted replaced

-:372967836e98
+:2040e4c2750a
 #!/usr/bin/env python
-import subprocess
+import argparse
-import sys
-AWK_CMD = """BEGIN{FS="%s"; OFS="%s";} {print %s;}"""
+parser = argparse.ArgumentParser()
+parser.add_argument(
+"-i", "--input", required=True, help="Tabular Input File Name"
+)
+parser.add_argument(
+"-o", "--output", required=True, help="Tabular Output File"
+)
+parser.add_argument(
+"-c", "--columns", dest="names", nargs="+",
+help="Column headers to operate on"
+)
+parser.add_argument("-d", "--delimiter", default='\t', help="Column delimiter")
+parser.add_argument(
+"-k", "--keep", action="store_true",
+help="Drop non-selected columns instead of selected ones"
+)
+parser.add_argument(
+"-s", "--strip_chars", default=None,
+help="Ignore these leading characters when extracting the name of the "
+"first line"
+)
+parser.add_argument(
+"--unicode-escaped-cols", action="store_true",
+help="Indicate that the --columns names use unicode escape sequences "
+"that should be decoded back before comparing them to the input file "
+"header"
+)
+args = parser.parse_args()
-input_filename = sys.argv[1]
+# The delimiter can only be parsed reliably from the input if it's from
-output_filename = sys.argv[2]
+# the ASCII range of characters
-delimiter = sys.argv[3]
+try:
-keep_columns = sys.argv[4]
+bytes_delimiter = args.delimiter.encode(encoding="ascii")
-strip_characters = sys.argv[5]
+except UnicodeEncodeError:
+raise ValueError("Only ASCII characters are allowed as column delimiters")
+# handle unicode escape sequences in --columns argument
+if args.unicode_escaped_cols:
+names = [n.encode().decode('unicode_escape') for n in args.names]
+else:
+names = args.names
-if keep_columns == "--keep":
+with open(args.input, "r", encoding="utf-8", errors="surrogateescape") as fh:
-keep_columns = True
+header_cols = fh.readline().strip("\n").split(args.delimiter)
-else:
+columns = set()
-keep_columns = False
+for i, key in enumerate(header_cols):
+if i == 0 and args.strip_chars:
+key = key.lstrip(args.strip_chars)
+if (args.keep and key in names) or (not args.keep and key not in names):
+columns.add(i)
+print("Kept", len(columns), "of", len(header_cols), "columns.")
-names = []
+with open(args.input, "rb") as i:
-for name in sys.argv[6:]:
+with open(args.output, "wb") as o:
-names.append( name )
+for line in i:
+fields = [
-header = None
+f for idx, f in enumerate(
-with open( input_filename, 'r' ) as fh:
+line.rstrip(b"\r\n").split(bytes_delimiter)
-header = fh.readline().strip( '\r\n' )
+) if idx in columns
-header = header.split( delimiter )
+]
-columns = []
+o.write(bytes_delimiter.join(fields) + b"\n")
-for i, key in enumerate( header, 1 ):
-if i == 1 and strip_characters:
-key = key.lstrip( strip_characters )
-if ( keep_columns and key in names ) or ( not keep_columns and key not in names ):
-columns.append( i )
-print( "Kept", len( columns ), "of", len( header ), "columns." )
-awk_cmd = AWK_CMD % ( delimiter, delimiter, ",".join( map( lambda x: "$%s" % x, columns ) ) )
-sys.exit( subprocess.call( [ 'gawk', awk_cmd, input_filename ], stdout=open( output_filename, 'wb+' ), shell=False ) )

Mercurial > repos > iuc > column_remove_by_header

comparison column_remove_by_header.py @ 1:2040e4c2750a draft default tip