annotate column_remove_by_header.py @ 1:2040e4c2750a draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/column_remove_by_header commit 6a87239f5139390963930673f36b869cde76fdf9
author iuc
date Sat, 16 Jul 2022 06:55:49 +0000
parents 372967836e98
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
372967836e98 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/column_remove_by_header commit 2150a3264364471090b650bdffde9f9c0b47ac39
iuc
parents:
diff changeset
1 #!/usr/bin/env python
372967836e98 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/column_remove_by_header commit 2150a3264364471090b650bdffde9f9c0b47ac39
iuc
parents:
diff changeset
2
1
2040e4c2750a planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/column_remove_by_header commit 6a87239f5139390963930673f36b869cde76fdf9
iuc
parents: 0
diff changeset
3 import argparse
0
372967836e98 planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/column_remove_by_header commit 2150a3264364471090b650bdffde9f9c0b47ac39
iuc
parents:
diff changeset
4
1
2040e4c2750a planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/column_remove_by_header commit 6a87239f5139390963930673f36b869cde76fdf9
iuc
parents: 0
diff changeset
def _decode_column_names(raw_names, unicode_escaped):
    """Return the requested column names as a set ready for membership tests.

    ``raw_names`` is the value of ``--columns``; it is ``None`` when the
    option was not given, in which case an empty set is returned.

    When ``unicode_escaped`` is true, backslash escape sequences in each name
    are decoded.  The ``unicode_escape`` codec decodes its input as Latin-1,
    so the name is first encoded as Latin-1 with ``backslashreplace``:
    characters up to U+00FF round-trip unchanged and everything else is turned
    into a ``\\uXXXX`` / ``\\UXXXXXXXX`` escape that the codec restores.
    Encoding as UTF-8 here would corrupt any literal non-ASCII character.
    """
    if not raw_names:
        return set()
    if not unicode_escaped:
        return set(raw_names)
    return {
        name.encode("latin-1", "backslashreplace").decode("unicode_escape")
        for name in raw_names
    }


def _select_column_indices(header_cols, names, keep, strip_chars):
    """Return the set of column indices that are written to the output.

    With ``keep`` true the indices of header cells found in ``names`` are
    returned; otherwise the indices of header cells *not* in ``names``.
    If ``strip_chars`` is non-empty, those leading characters are ignored on
    the first header cell only (e.g. a leading ``#``) when comparing.
    """
    keep = bool(keep)
    selected = set()
    for idx, key in enumerate(header_cols):
        if idx == 0 and strip_chars:
            key = key.lstrip(strip_chars)
        if (key in names) == keep:
            selected.add(idx)
    return selected


def main(argv=None):
    """Copy a delimited file, removing (or keeping only) columns by header.

    ``argv`` is the list of command-line arguments; ``None`` makes argparse
    read ``sys.argv[1:]``.

    Raises ``ValueError`` if the delimiter contains non-ASCII characters.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-i", "--input", required=True, help="Tabular Input File Name"
    )
    parser.add_argument(
        "-o", "--output", required=True, help="Tabular Output File"
    )
    parser.add_argument(
        "-c", "--columns", dest="names", nargs="+",
        help="Column headers to operate on"
    )
    parser.add_argument("-d", "--delimiter", default='\t', help="Column delimiter")
    parser.add_argument(
        "-k", "--keep", action="store_true",
        help="Drop non-selected columns instead of selected ones"
    )
    parser.add_argument(
        "-s", "--strip_chars", default=None,
        help="Ignore these leading characters when extracting the name of the "
        "first line"
    )
    parser.add_argument(
        "--unicode-escaped-cols", action="store_true",
        help="Indicate that the --columns names use unicode escape sequences "
        "that should be decoded back before comparing them to the input file "
        "header"
    )
    args = parser.parse_args(argv)

    # The rows are split as raw bytes, so the delimiter can only be handled
    # reliably if it is from the ASCII range of characters.
    try:
        bytes_delimiter = args.delimiter.encode(encoding="ascii")
    except UnicodeEncodeError as err:
        raise ValueError(
            "Only ASCII characters are allowed as column delimiters"
        ) from err

    names = _decode_column_names(args.names, args.unicode_escaped_cols)

    # Only the header is decoded as text; surrogateescape lets undecodable
    # bytes through instead of failing on them.
    with open(args.input, "r", encoding="utf-8", errors="surrogateescape") as fh:
        header_cols = fh.readline().strip("\n").split(args.delimiter)
    columns = _select_column_indices(
        header_cols, names, args.keep, args.strip_chars
    )
    print("Kept", len(columns), "of", len(header_cols), "columns.")

    # Second pass in binary mode: every line (header included) is filtered
    # by column index and written back with a "\n" line ending.
    with open(args.input, "rb") as src, open(args.output, "wb") as dst:
        for line in src:
            cells = line.rstrip(b"\r\n").split(bytes_delimiter)
            kept = [cell for idx, cell in enumerate(cells) if idx in columns]
            dst.write(bytes_delimiter.join(kept) + b"\n")


if __name__ == "__main__":
    main()