diff column_remove_by_header.py @ 1:2040e4c2750a draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/column_remove_by_header commit 6a87239f5139390963930673f36b869cde76fdf9
author iuc
date Sat, 16 Jul 2022 06:55:49 +0000
parents 372967836e98
children
line wrap: on
line diff
--- a/column_remove_by_header.py	Wed Apr 12 17:17:29 2017 -0400
+++ b/column_remove_by_header.py	Sat Jul 16 06:55:49 2022 +0000
@@ -1,35 +1,75 @@
 #!/usr/bin/env python
 
-import subprocess
-import sys
-
-AWK_CMD = """BEGIN{FS="%s"; OFS="%s";} {print %s;}"""
+"""Remove (or keep) columns of a delimited text file selected by header name.
+
+The first line of the input is treated as the header; every line, the
+header included, is then written out restricted to the selected columns.
+"""
+
+import argparse
 
-input_filename = sys.argv[1]
-output_filename = sys.argv[2]
-delimiter = sys.argv[3]
-keep_columns = sys.argv[4]
-strip_characters = sys.argv[5]
-
-if keep_columns == "--keep":
-    keep_columns = True
-else:
-    keep_columns = False
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "-i", "--input", required=True, help="Tabular Input File Name"
+)
+parser.add_argument(
+    "-o", "--output", required=True, help="Tabular Output File"
+)
+# default=[] keeps a run without --columns working (nothing selected)
+# instead of failing later with a TypeError on `key in None`.
+parser.add_argument(
+    "-c", "--columns", dest="names", nargs="+", default=[],
+    help="Column headers to operate on"
+)
+parser.add_argument("-d", "--delimiter", default='\t', help="Column delimiter")
+parser.add_argument(
+    "-k", "--keep", action="store_true",
+    help="Drop non-selected columns instead of selected ones"
+)
+parser.add_argument(
+    "-s", "--strip_chars", default=None,
+    help="Ignore these leading characters when extracting the name of the "
+         "first line"
+)
+parser.add_argument(
+    "--unicode-escaped-cols", action="store_true",
+    help="Indicate that the --columns names use unicode escape sequences "
+         "that should be decoded back before comparing them to the input file "
+         "header"
+)
+args = parser.parse_args()
 
-names = []
-for name in sys.argv[6:]:
-    names.append( name )
+# The delimiter can only be parsed reliably from the input if it's from
+# the ASCII range of characters
+try:
+    bytes_delimiter = args.delimiter.encode(encoding="ascii")
+except UnicodeEncodeError:
+    raise ValueError("Only ASCII characters are allowed as column delimiters")
+# handle unicode escape sequences in --columns argument
+if args.unicode_escaped_cols:
+    names = {n.encode().decode('unicode_escape') for n in args.names}
+else:
+    names = set(args.names)
 
-header = None
-with open( input_filename, 'r' ) as fh:
-    header = fh.readline().strip( '\r\n' )
-header = header.split( delimiter )
-columns = []
-for i, key in enumerate( header, 1 ):
-    if i == 1 and strip_characters:
-        key = key.lstrip( strip_characters )
-    if ( keep_columns and key in names ) or ( not keep_columns and key not in names ):
-        columns.append( i )
-print( "Kept", len( columns ), "of", len( header ), "columns." )
-awk_cmd = AWK_CMD % ( delimiter, delimiter, ",".join( map( lambda x: "$%s" % x, columns ) ) )
-sys.exit( subprocess.call( [ 'gawk', awk_cmd, input_filename ], stdout=open( output_filename, 'wb+' ), shell=False ) )
+# Only the header is decoded as text; surrogateescape lets undecodable
+# bytes pass through instead of raising while reading it.
+with open(args.input, "r", encoding="utf-8", errors="surrogateescape") as fh:
+    header_cols = fh.readline().strip("\n").split(args.delimiter)
+# 0-based indices of the columns that will be written to the output
+columns = set()
+for i, key in enumerate(header_cols):
+    if i == 0 and args.strip_chars:
+        key = key.lstrip(args.strip_chars)
+    if (args.keep and key in names) or (not args.keep and key not in names):
+        columns.add(i)
+print("Kept", len(columns), "of", len(header_cols), "columns.")
+
+# Rows are filtered as raw bytes, so their content is never decoded.
+with open(args.input, "rb") as in_fh, open(args.output, "wb") as out_fh:
+    for line in in_fh:
+        fields = [
+            f for idx, f in enumerate(
+                line.rstrip(b"\r\n").split(bytes_delimiter)
+            ) if idx in columns
+        ]
+        out_fh.write(bytes_delimiter.join(fields) + b"\n")