Mercurial > repos > iuc > column_remove_by_header
comparison column_remove_by_header.py @ 1:2040e4c2750a draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/column_remove_by_header commit 6a87239f5139390963930673f36b869cde76fdf9
author | iuc |
---|---|
date | Sat, 16 Jul 2022 06:55:49 +0000 |
parents | 372967836e98 |
children |
comparison
equal
deleted
inserted
replaced
0:372967836e98 | 1:2040e4c2750a |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 | 2 |
3 import subprocess | 3 import argparse |
4 import sys | |
5 | 4 |
6 AWK_CMD = """BEGIN{FS="%s"; OFS="%s";} {print %s;}""" | 5 parser = argparse.ArgumentParser() |
6 parser.add_argument( | |
7 "-i", "--input", required=True, help="Tabular Input File Name" | |
8 ) | |
9 parser.add_argument( | |
10 "-o", "--output", required=True, help="Tabular Output File" | |
11 ) | |
12 parser.add_argument( | |
13 "-c", "--columns", dest="names", nargs="+", | |
14 help="Column headers to operate on" | |
15 ) | |
16 parser.add_argument("-d", "--delimiter", default='\t', help="Column delimiter") | |
17 parser.add_argument( | |
18 "-k", "--keep", action="store_true", | |
19 help="Drop non-selected columns instead of selected ones" | |
20 ) | |
21 parser.add_argument( | |
22 "-s", "--strip_chars", default=None, | |
23 help="Ignore these leading characters when extracting the name of the " | |
24 "first line" | |
25 ) | |
26 parser.add_argument( | |
27 "--unicode-escaped-cols", action="store_true", | |
28 help="Indicate that the --columns names use unicode escape sequences " | |
29 "that should be decoded back before comparing them to the input file " | |
30 "header" | |
31 ) | |
32 args = parser.parse_args() | |
7 | 33 |
8 input_filename = sys.argv[1] | 34 # The delimiter can only be parsed reliably from the input if it's from |
9 output_filename = sys.argv[2] | 35 # the ASCII range of characters |
10 delimiter = sys.argv[3] | 36 try: |
11 keep_columns = sys.argv[4] | 37 bytes_delimiter = args.delimiter.encode(encoding="ascii") |
12 strip_characters = sys.argv[5] | 38 except UnicodeEncodeError: |
39 raise ValueError("Only ASCII characters are allowed as column delimiters") | |
40 # handle unicode escape sequences in --columns argument | |
41 if args.unicode_escaped_cols: | |
42 names = [n.encode().decode('unicode_escape') for n in args.names] | |
43 else: | |
44 names = args.names | |
13 | 45 |
14 if keep_columns == "--keep": | 46 with open(args.input, "r", encoding="utf-8", errors="surrogateescape") as fh: |
15 keep_columns = True | 47 header_cols = fh.readline().strip("\n").split(args.delimiter) |
16 else: | 48 columns = set() |
17 keep_columns = False | 49 for i, key in enumerate(header_cols): |
50 if i == 0 and args.strip_chars: | |
51 key = key.lstrip(args.strip_chars) | |
52 if (args.keep and key in names) or (not args.keep and key not in names): | |
53 columns.add(i) | |
54 print("Kept", len(columns), "of", len(header_cols), "columns.") | |
18 | 55 |
19 names = [] | 56 with open(args.input, "rb") as i: |
20 for name in sys.argv[6:]: | 57 with open(args.output, "wb") as o: |
21 names.append( name ) | 58 for line in i: |
22 | 59 fields = [ |
23 header = None | 60 f for idx, f in enumerate( |
24 with open( input_filename, 'r' ) as fh: | 61 line.rstrip(b"\r\n").split(bytes_delimiter) |
25 header = fh.readline().strip( '\r\n' ) | 62 ) if idx in columns |
26 header = header.split( delimiter ) | 63 ] |
27 columns = [] | 64 o.write(bytes_delimiter.join(fields) + b"\n") |
28 for i, key in enumerate( header, 1 ): | |
29 if i == 1 and strip_characters: | |
30 key = key.lstrip( strip_characters ) | |
31 if ( keep_columns and key in names ) or ( not keep_columns and key not in names ): | |
32 columns.append( i ) | |
33 print( "Kept", len( columns ), "of", len( header ), "columns." ) | |
34 awk_cmd = AWK_CMD % ( delimiter, delimiter, ",".join( map( lambda x: "$%s" % x, columns ) ) ) | |
35 sys.exit( subprocess.call( [ 'gawk', awk_cmd, input_filename ], stdout=open( output_filename, 'wb+' ), shell=False ) ) |