Mercurial > repos > bgruening > split_file_to_collection
comparison split_file_to_collection.py @ 8:6cbe2f30c2d7 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
author | bgruening |
---|---|
date | Sun, 12 Jul 2020 10:27:06 -0400 |
parents | 0046692724f9 |
children | baabc30154cd |
comparison
equal
deleted
inserted
replaced
7:0046692724f9 | 8:6cbe2f30c2d7 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 | 2 |
3 import argparse | 3 import argparse |
4 import math | 4 import math |
5 import os | 5 import os |
6 import random | |
6 import re | 7 import re |
7 import random | |
8 | 8 |
9 # configuration of the splitting for specific file types | 9 # configuration of the splitting for specific file types |
10 # - regular expression matching the record separator ('' if not splitting by regex but by number of lines) | 10 # - regular expression matching the record separator ('' if not splitting by regex but by number of lines) |
11 # - number of lines to split after (0 if not splitting by number of lines but regex) | 11 # - number of lines to split after (0 if not splitting by number of lines but regex) |
12 # - a boolean indicating if the record separator is at the end of the record | 12 # - a boolean indicating if the record separator is at the end of the record |
13 # | 13 # |
14 # new file types can be added by appending to this dict, | 14 # new file types can be added by appending to this dict, |
15 # updating the parser, and adding a new type option in the Galaxy wrapper | 15 # updating the parser, and adding a new type option in the Galaxy wrapper |
16 FILETYPES = {'fasta': ('^>', 0, False), | 16 FILETYPES = {'fasta': (r'^>', 0, False), |
17 'fastq': ('', 4, False), | 17 'fastq': (r'', 4, False), |
18 'tabular': ('', 1, False), | 18 'tabular': (r'', 1, False), |
19 'txt': ('', 1, False), | 19 'txt': (r'', 1, False), |
20 'mgf': ('^BEGIN IONS', 0, False), | 20 'mgf': (r'^BEGIN IONS', 0, False), |
21 'sdf': ('\$\$\$\$', 0, True), | 21 'sdf': (r'\$\$\$\$', 0, True), |
22 } | 22 } |
23 | 23 |
24 | 24 |
25 def main(): | 25 def main(): |
26 ps = parser_cli() | 26 ps = parser_cli() |
39 if top < 0: | 39 if top < 0: |
40 raise ValueError("Number of header lines cannot be negative") | 40 raise ValueError("Number of header lines cannot be negative") |
41 | 41 |
42 ftype = args["ftype"] | 42 ftype = args["ftype"] |
43 | 43 |
44 assert ftype != "generic" or args["generic_re"] != None, "--generic_re needs to be given for generic input" | 44 assert ftype != "generic" or args["generic_re"] is not None, "--generic_re needs to be given for generic input" |
45 | 45 |
46 if args["ftype"] == "tabular" and args["by"] == "col": | 46 if args["ftype"] == "tabular" and args["by"] == "col": |
47 args["match"] = replace_mapped_chars(args["match"]) | 47 args["match"] = replace_mapped_chars(args["match"]) |
48 args["sub"] = replace_mapped_chars(args["sub"]) | 48 args["sub"] = replace_mapped_chars(args["sub"]) |
49 split_by_column(args, in_file, out_dir, top) | 49 split_by_column(args, in_file, out_dir, top) |
59 parser.add_argument('--in', '-i', required=True, help="The input file") | 59 parser.add_argument('--in', '-i', required=True, help="The input file") |
60 parser.add_argument('--out_dir', '-o', default=os.getcwd(), help="The output directory", required=True) | 60 parser.add_argument('--out_dir', '-o', default=os.getcwd(), help="The output directory", required=True) |
61 parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files") | 61 parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files") |
62 parser.add_argument('--file_ext', '-e', help="If not splitting by column," + | 62 parser.add_argument('--file_ext', '-e', help="If not splitting by column," + |
63 " the extension of the new files (without a period)") | 63 " the extension of the new files (without a period)") |
64 parser.add_argument('--ftype', '-f', help="The type of the file to split", required = True, | 64 parser.add_argument('--ftype', '-f', help="The type of the file to split", required=True, |
65 choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"]) | 65 choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"]) |
66 parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)", | 66 parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)", |
67 default = "row", choices = ["col", "row"]) | 67 default="row", choices=["col", "row"]) |
68 parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files.") | 68 parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files.") |
69 parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true') | 69 parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true') |
70 parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " + | 70 parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " + |
71 "If not provided and args[\"rand\"]==True, then date is used", type=int) | 71 "If not provided and args[\"rand\"]==True, then date is used", type=int) |
72 group = parser.add_mutually_exclusive_group() | 72 group = parser.add_mutually_exclusive_group() |
73 group.add_argument('--numnew', '-n', type=int, default = 1, | 73 group.add_argument('--numnew', '-n', type=int, default=1, |
74 help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.") | 74 help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.") |
75 group.add_argument('--chunksize', '-k', type=int, default = 0, | 75 group.add_argument('--chunksize', '-k', type=int, default=0, |
76 help="Number of records by file. Not valid for splitting on a column") | 76 help="Number of records by file. Not valid for splitting on a column") |
77 parser.add_argument('--batch', action='store_true', | 77 parser.add_argument('--batch', action='store_true', |
78 help="Distribute files to collection while maintaining order. Ignored if splitting on column.") | 78 help="Distribute files to collection while maintaining order. Ignored if splitting on column.") |
79 generic = parser.add_argument_group('Arguments controlling generic splitting') | 79 generic = parser.add_argument_group('Arguments controlling generic splitting') |
80 group = generic.add_mutually_exclusive_group() | 80 group = generic.add_mutually_exclusive_group() |
81 group.add_argument('--generic_re', '-g', default="", help="Regular expression indicating the start of a new record (only for generic)", required = False) | 81 group.add_argument('--generic_re', '-g', default="", help="Regular expression indicating the start of a new record (only for generic)", required=False) |
82 group.add_argument('--generic_num', type=int, default=0, help="Length of records in number of lines (only for generic)", required = False) | 82 group.add_argument('--generic_num', type=int, default=0, help="Length of records in number of lines (only for generic)", required=False) |
83 generic.add_argument('--split_after', '-p', action='store_true', | 83 generic.add_argument('--split_after', '-p', action='store_true', |
84 help="Split between records after separator (default is before). " + | 84 help="Split between records after separator (default is before). " + |
85 "Only for generic splitting by regex - specific ftypes are always split in the default way") | 85 "Only for generic splitting by regex - specific ftypes are always split in the default way") |
86 bycol = parser.add_argument_group('If splitting on a column') | 86 bycol = parser.add_argument_group('If splitting on a column') |
87 bycol.add_argument('--match', '-m', default = "(.*)", help="The regular expression to match id column entries") | 87 bycol.add_argument('--match', '-m', default="(.*)", help="The regular expression to match id column entries") |
88 bycol.add_argument('--sub', '-s', default = r'\1', | 88 bycol.add_argument('--sub', '-s', default=r'\1', |
89 help="The regular expression to substitute in for the matched pattern.") | 89 help="The regular expression to substitute in for the matched pattern.") |
90 bycol.add_argument('--id_column', '-c', default="1", | 90 bycol.add_argument('--id_column', '-c', default="1", |
91 help="Column that is used to name output files. Indexed starting from 1.", type=int) | 91 help="Column that is used to name output files. Indexed starting from 1.", type=int) |
92 return parser | 92 return parser |
93 | |
94 | |
95 def close_files(file_list): | |
96 # finally, close all files | |
97 for open_file in file_list: | |
98 open_file.close() | |
99 | 93 |
100 | 94 |
101 def replace_mapped_chars(pattern): | 95 def replace_mapped_chars(pattern): |
102 """ | 96 """ |
103 handles special escaped characters when coming from galaxy | 97 handles special escaped characters when coming from galaxy |
124 else: | 118 else: |
125 random.seed() | 119 random.seed() |
126 | 120 |
127 # batched division (maintains order) | 121 # batched division (maintains order) |
128 batch = args["batch"] | 122 batch = args["batch"] |
129 | 123 |
130 # determine | 124 # determine |
131 # - the number of records that should be stored per file | 125 # - the number of records that should be stored per file |
132 # (done always, even if used only for batch mode) | 126 # (done always, even if used only for batch mode) |
133 # - if the separator is at the start / end of the record | 127 # - if the separator is at the start / end of the record |
134 n_per_file = math.inf | 128 n_per_file = math.inf |
135 if chunksize != 0 or batch: # needs to be calculated if either batch or chunksize are selected | 129 if chunksize != 0 or batch: # needs to be calculated if either batch or chunksize are selected |
136 with open(in_file) as f: | 130 with open(in_file) as f: |
137 # read header lines | 131 # read header lines |
138 for i in range(top): | 132 for i in range(top): |
139 f.readline() | 133 f.readline() |
140 n_records = 0 | 134 n_records = 0 |
148 n_records += 1 | 142 n_records += 1 |
149 | 143 |
150 # if there are fewer records than desired files | 144 # if there are fewer records than desired files |
151 numnew = min(numnew, n_records) | 145 numnew = min(numnew, n_records) |
152 # approx. number of records per file | 146 # approx. number of records per file |
153 if chunksize == 0: # i.e. no chunking | 147 if chunksize == 0: # i.e. no chunking |
154 n_per_file = n_records // numnew | 148 n_per_file = n_records // numnew |
155 else: | 149 else: |
156 numnew = n_records // chunksize | 150 numnew = n_records // chunksize |
157 n_per_file = chunksize | 151 n_per_file = chunksize |
158 | 152 |
163 if custom_new_file_name is None: | 157 if custom_new_file_name is None: |
164 new_file_base = os.path.splitext(os.path.basename(in_file)) | 158 new_file_base = os.path.splitext(os.path.basename(in_file)) |
165 else: | 159 else: |
166 new_file_base = [custom_new_file_name, custom_new_file_ext] | 160 new_file_base = [custom_new_file_name, custom_new_file_ext] |
167 | 161 |
168 newfiles = [ | 162 newfile_names = [os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) for count in range(0, numnew)] |
169 open(os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) , "w") | |
170 for count in range(0, numnew) | |
171 ] | |
172 # bunch o' counters | 163 # bunch o' counters |
173 # index to list of new files | 164 # index to list of new files |
174 if rand: | 165 if rand: |
175 new_file_counter = int(math.floor(random.random() * numnew)) | 166 new_file_counter = int(math.floor(random.random() * numnew)) |
176 else: | 167 else: |
177 new_file_counter = 0 | 168 new_file_counter = 0 |
169 new_file = open(newfile_names[new_file_counter], "a") | |
178 # to contain header specified by top | 170 # to contain header specified by top |
179 header = "" | 171 header = "" |
180 # keep track of the files that have been opened so far | 172 # keep track of the files that have been opened so far |
181 fresh_files = set(range(numnew)) | 173 fresh_files = set(range(numnew)) |
182 | 174 |
199 if record == "": | 191 if record == "": |
200 record += line | 192 record += line |
201 else: | 193 else: |
202 # if is in fresh_files, write header and drop from freshFiles | 194 # if is in fresh_files, write header and drop from freshFiles |
203 if new_file_counter in fresh_files: | 195 if new_file_counter in fresh_files: |
204 newfiles[new_file_counter].write(header) | 196 new_file.write(header) |
205 fresh_files.remove(new_file_counter) | 197 fresh_files.remove(new_file_counter) |
206 | 198 |
207 if sep_at_end: | 199 if sep_at_end: |
208 record += line | 200 record += line |
209 # write record to file | 201 # write record to file |
210 newfiles[new_file_counter].write(record) | 202 new_file.write(record) |
211 if not sep_at_end: | 203 if not sep_at_end: |
212 record = line | 204 record = line |
213 else: | 205 else: |
214 record = "" | 206 record = "" |
215 | 207 |
216 # change destination file | 208 # change destination file |
217 if rand: | 209 if rand: |
218 new_file_counter = int(math.floor(random.random() * numnew)) | 210 new_file_counter = int(math.floor(random.random() * numnew)) |
211 new_file.close() | |
212 new_file = open(newfile_names[new_file_counter], "a") | |
219 elif batch: | 213 elif batch: |
220 # number of records read per file | 214 # number of records read per file |
221 records_in_file += 1 | 215 records_in_file += 1 |
222 # have we reached the max for each file? | 216 # have we reached the max for each file? |
223 # if so, switch file | 217 # if so, switch file |
224 if records_in_file >= n_per_file: | 218 if records_in_file >= n_per_file: |
225 new_file_counter = (new_file_counter + 1) % numnew | 219 new_file_counter = (new_file_counter + 1) % numnew |
226 records_in_file = 0 # reset to 0 | 220 records_in_file = 0 # reset to 0 |
221 new_file.close() | |
222 new_file = open(newfile_names[new_file_counter], "a") | |
227 else: | 223 else: |
228 new_file_counter = (new_file_counter + 1) % numnew | 224 new_file_counter = (new_file_counter + 1) % numnew |
225 new_file.close() | |
226 new_file = open(newfile_names[new_file_counter], "a") | |
229 # if beginning of line is not record sep, we must be inside a record | 227 # if beginning of line is not record sep, we must be inside a record |
230 # so just append | 228 # so just append |
231 else: | 229 else: |
232 record += line | 230 record += line |
233 # after loop, write final record to file | 231 # after loop, write final record to file |
234 newfiles[new_file_counter].write(record) | 232 new_file.write(record) |
235 | 233 new_file.close() |
236 # close new files | |
237 close_files(newfiles) | |
238 | 234 |
239 | 235 |
240 def split_by_column(args, in_file, out_dir, top): | 236 def split_by_column(args, in_file, out_dir, top): |
241 | 237 |
242 # shift to 0-based indexing | 238 # shift to 0-based indexing |
249 raise | 245 raise |
250 | 246 |
251 sub = args["sub"] | 247 sub = args["sub"] |
252 | 248 |
253 # set of file names | 249 # set of file names |
254 new_files = dict() | 250 files = set() |
255 | 251 |
256 # keep track of how many lines have been read | 252 # keep track of how many lines have been read |
257 n_read = 0 | 253 n_read = 0 |
258 header = "" | 254 header = "" |
259 with open(in_file) as file: | 255 with open(in_file) as file: |
272 # use regex to get new file name | 268 # use regex to get new file name |
273 out_file_name = re.sub(match, sub, id_col_val) | 269 out_file_name = re.sub(match, sub, id_col_val) |
274 out_file_path = os.path.join(out_dir, out_file_name) | 270 out_file_path = os.path.join(out_dir, out_file_name) |
275 | 271 |
276 # write | 272 # write |
277 if out_file_name not in new_files.keys(): | 273 with open(out_file_path, "a") as current_new_file: |
278 # open file (new, so not already open) | 274 if out_file_name not in files: |
279 current_new_file = open(out_file_path, "w") | 275 current_new_file.write(header) |
280 current_new_file.write(header) | 276 files.add(out_file_name) |
281 current_new_file.write(line) | 277 current_new_file.write(line) |
282 # add to dict | |
283 new_files[out_file_name] = current_new_file | |
284 else: | |
285 # file is already open, so just write to it | |
286 new_files[out_file_name].write(line) | |
287 | |
288 # finally, close all files | |
289 close_files(new_files.values()) | |
290 | 278 |
291 | 279 |
292 if __name__ == "__main__": | 280 if __name__ == "__main__": |
293 main() | 281 main() |