comparison column_remove_by_header.py @ 1:2040e4c2750a draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/column_remove_by_header commit 6a87239f5139390963930673f36b869cde76fdf9
author iuc
date Sat, 16 Jul 2022 06:55:49 +0000
parents 372967836e98
children
comparison
equal deleted inserted replaced
0:372967836e98 1:2040e4c2750a
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 import subprocess 3 import argparse
4 import sys
5 4
6 AWK_CMD = """BEGIN{FS="%s"; OFS="%s";} {print %s;}""" 5 parser = argparse.ArgumentParser()
6 parser.add_argument(
7 "-i", "--input", required=True, help="Tabular Input File Name"
8 )
9 parser.add_argument(
10 "-o", "--output", required=True, help="Tabular Output File"
11 )
12 parser.add_argument(
13 "-c", "--columns", dest="names", nargs="+",
14 help="Column headers to operate on"
15 )
16 parser.add_argument("-d", "--delimiter", default='\t', help="Column delimiter")
17 parser.add_argument(
18 "-k", "--keep", action="store_true",
19 help="Drop non-selected columns instead of selected ones"
20 )
21 parser.add_argument(
22 "-s", "--strip_chars", default=None,
23 help="Ignore these leading characters when extracting the name of the "
24 "first line"
25 )
26 parser.add_argument(
27 "--unicode-escaped-cols", action="store_true",
28 help="Indicate that the --columns names use unicode escape sequences "
29 "that should be decoded back before comparing them to the input file "
30 "header"
31 )
32 args = parser.parse_args()
7 33
8 input_filename = sys.argv[1] 34 # The delimiter can only be parsed reliably from the input if it's from
9 output_filename = sys.argv[2] 35 # the ASCII range of characters
10 delimiter = sys.argv[3] 36 try:
11 keep_columns = sys.argv[4] 37 bytes_delimiter = args.delimiter.encode(encoding="ascii")
12 strip_characters = sys.argv[5] 38 except UnicodeEncodeError:
39 raise ValueError("Only ASCII characters are allowed as column delimiters")
40 # handle unicode escape sequences in --columns argument
41 if args.unicode_escaped_cols:
42 names = [n.encode().decode('unicode_escape') for n in args.names]
43 else:
44 names = args.names
13 45
14 if keep_columns == "--keep": 46 with open(args.input, "r", encoding="utf-8", errors="surrogateescape") as fh:
15 keep_columns = True 47 header_cols = fh.readline().strip("\n").split(args.delimiter)
16 else: 48 columns = set()
17 keep_columns = False 49 for i, key in enumerate(header_cols):
50 if i == 0 and args.strip_chars:
51 key = key.lstrip(args.strip_chars)
52 if (args.keep and key in names) or (not args.keep and key not in names):
53 columns.add(i)
54 print("Kept", len(columns), "of", len(header_cols), "columns.")
18 55
19 names = [] 56 with open(args.input, "rb") as i:
20 for name in sys.argv[6:]: 57 with open(args.output, "wb") as o:
21 names.append( name ) 58 for line in i:
22 59 fields = [
23 header = None 60 f for idx, f in enumerate(
24 with open( input_filename, 'r' ) as fh: 61 line.rstrip(b"\r\n").split(bytes_delimiter)
25 header = fh.readline().strip( '\r\n' ) 62 ) if idx in columns
26 header = header.split( delimiter ) 63 ]
27 columns = [] 64 o.write(bytes_delimiter.join(fields) + b"\n")
28 for i, key in enumerate( header, 1 ):
29 if i == 1 and strip_characters:
30 key = key.lstrip( strip_characters )
31 if ( keep_columns and key in names ) or ( not keep_columns and key not in names ):
32 columns.append( i )
33 print( "Kept", len( columns ), "of", len( header ), "columns." )
34 awk_cmd = AWK_CMD % ( delimiter, delimiter, ",".join( map( lambda x: "$%s" % x, columns ) ) )
35 sys.exit( subprocess.call( [ 'gawk', awk_cmd, input_filename ], stdout=open( output_filename, 'wb+' ), shell=False ) )