comparison split_file_to_collection.py @ 8:6cbe2f30c2d7 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
author bgruening
date Sun, 12 Jul 2020 10:27:06 -0400
parents 0046692724f9
children baabc30154cd
comparison
equal deleted inserted replaced
7:0046692724f9 8:6cbe2f30c2d7
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 import argparse 3 import argparse
4 import math 4 import math
5 import os 5 import os
6 import random
6 import re 7 import re
7 import random
8 8
9 # configuration of the splitting for specific file types 9 # configuration of the splitting for specific file types
10 # - regular expression matching the record separator ('' if not splitting by regex but by number of lines) 10 # - regular expression matching the record separator ('' if not splitting by regex but by number of lines)
11 # - number of lines to split after (0 if not splitting by number of lines but regex) 11 # - number of lines to split after (0 if not splitting by number of lines but regex)
12 # - a boolean indicating if the record separator is at the end of the record 12 # - a boolean indicating if the record separator is at the end of the record
13 # 13 #
14 # new file types can be added by appending to this dict, 14 # new file types can be added by appending to this dict,
15 # updating the parser, and adding a new type option in the Galaxy wrapper 15 # updating the parser, and adding a new type option in the Galaxy wrapper
16 FILETYPES = {'fasta': ('^>', 0, False), 16 FILETYPES = {'fasta': (r'^>', 0, False),
17 'fastq': ('', 4, False), 17 'fastq': (r'', 4, False),
18 'tabular': ('', 1, False), 18 'tabular': (r'', 1, False),
19 'txt': ('', 1, False), 19 'txt': (r'', 1, False),
20 'mgf': ('^BEGIN IONS', 0, False), 20 'mgf': (r'^BEGIN IONS', 0, False),
21 'sdf': ('\$\$\$\$', 0, True), 21 'sdf': (r'\$\$\$\$', 0, True),
22 } 22 }
23 23
24 24
25 def main(): 25 def main():
26 ps = parser_cli() 26 ps = parser_cli()
39 if top < 0: 39 if top < 0:
40 raise ValueError("Number of header lines cannot be negative") 40 raise ValueError("Number of header lines cannot be negative")
41 41
42 ftype = args["ftype"] 42 ftype = args["ftype"]
43 43
44 assert ftype != "generic" or args["generic_re"] != None, "--generic_re needs to be given for generic input" 44 assert ftype != "generic" or args["generic_re"] is not None, "--generic_re needs to be given for generic input"
45 45
46 if args["ftype"] == "tabular" and args["by"] == "col": 46 if args["ftype"] == "tabular" and args["by"] == "col":
47 args["match"] = replace_mapped_chars(args["match"]) 47 args["match"] = replace_mapped_chars(args["match"])
48 args["sub"] = replace_mapped_chars(args["sub"]) 48 args["sub"] = replace_mapped_chars(args["sub"])
49 split_by_column(args, in_file, out_dir, top) 49 split_by_column(args, in_file, out_dir, top)
59 parser.add_argument('--in', '-i', required=True, help="The input file") 59 parser.add_argument('--in', '-i', required=True, help="The input file")
60 parser.add_argument('--out_dir', '-o', default=os.getcwd(), help="The output directory", required=True) 60 parser.add_argument('--out_dir', '-o', default=os.getcwd(), help="The output directory", required=True)
61 parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files") 61 parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files")
62 parser.add_argument('--file_ext', '-e', help="If not splitting by column," + 62 parser.add_argument('--file_ext', '-e', help="If not splitting by column," +
63 " the extension of the new files (without a period)") 63 " the extension of the new files (without a period)")
64 parser.add_argument('--ftype', '-f', help="The type of the file to split", required = True, 64 parser.add_argument('--ftype', '-f', help="The type of the file to split", required=True,
65 choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"]) 65 choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"])
66 parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)", 66 parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)",
67 default = "row", choices = ["col", "row"]) 67 default="row", choices=["col", "row"])
68 parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files.") 68 parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files.")
69 parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true') 69 parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true')
70 parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " + 70 parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " +
71 "If not provided and args[\"rand\"]==True, then date is used", type=int) 71 "If not provided and args[\"rand\"]==True, then date is used", type=int)
72 group = parser.add_mutually_exclusive_group() 72 group = parser.add_mutually_exclusive_group()
73 group.add_argument('--numnew', '-n', type=int, default = 1, 73 group.add_argument('--numnew', '-n', type=int, default=1,
74 help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.") 74 help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.")
75 group.add_argument('--chunksize', '-k', type=int, default = 0, 75 group.add_argument('--chunksize', '-k', type=int, default=0,
76 help="Number of records by file. Not valid for splitting on a column") 76 help="Number of records by file. Not valid for splitting on a column")
77 parser.add_argument('--batch', action='store_true', 77 parser.add_argument('--batch', action='store_true',
78 help="Distribute files to collection while maintaining order. Ignored if splitting on column.") 78 help="Distribute files to collection while maintaining order. Ignored if splitting on column.")
79 generic = parser.add_argument_group('Arguments controling generic splitting') 79 generic = parser.add_argument_group('Arguments controling generic splitting')
80 group = generic.add_mutually_exclusive_group() 80 group = generic.add_mutually_exclusive_group()
81 group.add_argument('--generic_re', '-g', default="", help="Regular expression indicating the start of a new record (only for generic)", required = False) 81 group.add_argument('--generic_re', '-g', default="", help="Regular expression indicating the start of a new record (only for generic)", required=False)
82 group.add_argument('--generic_num', type=int, default=0, help="Length of records in number of lines (only for generic)", required = False) 82 group.add_argument('--generic_num', type=int, default=0, help="Length of records in number of lines (only for generic)", required=False)
83 generic.add_argument('--split_after', '-p', action='store_true', 83 generic.add_argument('--split_after', '-p', action='store_true',
84 help="Split between records after separator (default is before). " + 84 help="Split between records after separator (default is before). " +
85 "Only for generic splitting by regex - specific ftypes are always split in the default way") 85 "Only for generic splitting by regex - specific ftypes are always split in the default way")
86 bycol = parser.add_argument_group('If splitting on a column') 86 bycol = parser.add_argument_group('If splitting on a column')
87 bycol.add_argument('--match', '-m', default = "(.*)", help="The regular expression to match id column entries") 87 bycol.add_argument('--match', '-m', default="(.*)", help="The regular expression to match id column entries")
88 bycol.add_argument('--sub', '-s', default = r'\1', 88 bycol.add_argument('--sub', '-s', default=r'\1',
89 help="The regular expression to substitute in for the matched pattern.") 89 help="The regular expression to substitute in for the matched pattern.")
90 bycol.add_argument('--id_column', '-c', default="1", 90 bycol.add_argument('--id_column', '-c', default="1",
91 help="Column that is used to name output files. Indexed starting from 1.", type=int) 91 help="Column that is used to name output files. Indexed starting from 1.", type=int)
92 return parser 92 return parser
93
94
95 def close_files(file_list):
96 # finally, close all files
97 for open_file in file_list:
98 open_file.close()
99 93
100 94
101 def replace_mapped_chars(pattern): 95 def replace_mapped_chars(pattern):
102 """ 96 """
103 handles special escaped characters when coming from galaxy 97 handles special escaped characters when coming from galaxy
124 else: 118 else:
125 random.seed() 119 random.seed()
126 120
127 # batched division (maintains order) 121 # batched division (maintains order)
128 batch = args["batch"] 122 batch = args["batch"]
129 123
130 # determine 124 # determine
131 # - the number of records that should be stored per file 125 # - the number of records that should be stored per file
132 # (done always, even if used only for batch mode) 126 # (done always, even if used only for batch mode)
133 # - if the separator is at the start / end of the record 127 # - if the separator is at the start / end of the record
134 n_per_file = math.inf 128 n_per_file = math.inf
135 if chunksize != 0 or batch: # needs to be calculated if either batch or chunksize are selected 129 if chunksize != 0 or batch: # needs to be calculated if either batch or chunksize are selected
136 with open(in_file) as f: 130 with open(in_file) as f:
137 # read header lines 131 # read header lines
138 for i in range(top): 132 for i in range(top):
139 f.readline() 133 f.readline()
140 n_records = 0 134 n_records = 0
148 n_records += 1 142 n_records += 1
149 143
150 # if there are fewer records than desired files 144 # if there are fewer records than desired files
151 numnew = min(numnew, n_records) 145 numnew = min(numnew, n_records)
152 # approx. number of records per file 146 # approx. number of records per file
153 if chunksize == 0: # i.e. no chunking 147 if chunksize == 0: # i.e. no chunking
154 n_per_file = n_records // numnew 148 n_per_file = n_records // numnew
155 else: 149 else:
156 numnew = n_records // chunksize 150 numnew = n_records // chunksize
157 n_per_file = chunksize 151 n_per_file = chunksize
158 152
163 if custom_new_file_name is None: 157 if custom_new_file_name is None:
164 new_file_base = os.path.splitext(os.path.basename(in_file)) 158 new_file_base = os.path.splitext(os.path.basename(in_file))
165 else: 159 else:
166 new_file_base = [custom_new_file_name, custom_new_file_ext] 160 new_file_base = [custom_new_file_name, custom_new_file_ext]
167 161
168 newfiles = [ 162 newfile_names = [os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) for count in range(0, numnew)]
169 open(os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) , "w")
170 for count in range(0, numnew)
171 ]
172 # bunch o' counters 163 # bunch o' counters
173 # index to list of new files 164 # index to list of new files
174 if rand: 165 if rand:
175 new_file_counter = int(math.floor(random.random() * numnew)) 166 new_file_counter = int(math.floor(random.random() * numnew))
176 else: 167 else:
177 new_file_counter = 0 168 new_file_counter = 0
169 new_file = open(newfile_names[new_file_counter], "a")
178 # to contain header specified by top 170 # to contain header specified by top
179 header = "" 171 header = ""
180 # keep track of the files that have been opened so far 172 # keep track of the files that have been opened so far
181 fresh_files = set(range(numnew)) 173 fresh_files = set(range(numnew))
182 174
199 if record == "": 191 if record == "":
200 record += line 192 record += line
201 else: 193 else:
202 # if the current file is in fresh_files, write the header and drop it from fresh_files 194 # if the current file is in fresh_files, write the header and drop it from fresh_files
203 if new_file_counter in fresh_files: 195 if new_file_counter in fresh_files:
204 newfiles[new_file_counter].write(header) 196 new_file.write(header)
205 fresh_files.remove(new_file_counter) 197 fresh_files.remove(new_file_counter)
206 198
207 if sep_at_end: 199 if sep_at_end:
208 record += line 200 record += line
209 # write record to file 201 # write record to file
210 newfiles[new_file_counter].write(record) 202 new_file.write(record)
211 if not sep_at_end: 203 if not sep_at_end:
212 record = line 204 record = line
213 else: 205 else:
214 record = "" 206 record = ""
215 207
216 # change destination file 208 # change destination file
217 if rand: 209 if rand:
218 new_file_counter = int(math.floor(random.random() * numnew)) 210 new_file_counter = int(math.floor(random.random() * numnew))
211 new_file.close()
212 new_file = open(newfile_names[new_file_counter], "a")
219 elif batch: 213 elif batch:
220 # number of records read per file 214 # number of records read per file
221 records_in_file += 1 215 records_in_file += 1
222 # have we reached the max for each file? 216 # have we reached the max for each file?
223 # if so, switch file 217 # if so, switch file
224 if records_in_file >= n_per_file: 218 if records_in_file >= n_per_file:
225 new_file_counter = (new_file_counter + 1) % numnew 219 new_file_counter = (new_file_counter + 1) % numnew
226 records_in_file = 0 # reset to 0 220 records_in_file = 0 # reset to 0
221 new_file.close()
222 new_file = open(newfile_names[new_file_counter], "a")
227 else: 223 else:
228 new_file_counter = (new_file_counter + 1) % numnew 224 new_file_counter = (new_file_counter + 1) % numnew
225 new_file.close()
226 new_file = open(newfile_names[new_file_counter], "a")
229 # if beginning of line is not record sep, we must be inside a record 227 # if beginning of line is not record sep, we must be inside a record
230 # so just append 228 # so just append
231 else: 229 else:
232 record += line 230 record += line
233 # after loop, write final record to file 231 # after loop, write final record to file
234 newfiles[new_file_counter].write(record) 232 new_file.write(record)
235 233 new_file.close()
236 # close new files
237 close_files(newfiles)
238 234
239 235
240 def split_by_column(args, in_file, out_dir, top): 236 def split_by_column(args, in_file, out_dir, top):
241 237
242 # shift to 0-based indexing 238 # shift to 0-based indexing
249 raise 245 raise
250 246
251 sub = args["sub"] 247 sub = args["sub"]
252 248
253 # set of file names 249 # set of file names
254 new_files = dict() 250 files = set()
255 251
256 # keep track of how many lines have been read 252 # keep track of how many lines have been read
257 n_read = 0 253 n_read = 0
258 header = "" 254 header = ""
259 with open(in_file) as file: 255 with open(in_file) as file:
272 # use regex to get new file name 268 # use regex to get new file name
273 out_file_name = re.sub(match, sub, id_col_val) 269 out_file_name = re.sub(match, sub, id_col_val)
274 out_file_path = os.path.join(out_dir, out_file_name) 270 out_file_path = os.path.join(out_dir, out_file_name)
275 271
276 # write 272 # write
277 if out_file_name not in new_files.keys(): 273 with open(out_file_path, "a") as current_new_file:
278 # open file (new, so not already open) 274 if out_file_name not in files:
279 current_new_file = open(out_file_path, "w") 275 current_new_file.write(header)
280 current_new_file.write(header) 276 files.add(out_file_name)
281 current_new_file.write(line) 277 current_new_file.write(line)
282 # add to dict
283 new_files[out_file_name] = current_new_file
284 else:
285 # file is already open, so just write to it
286 new_files[out_file_name].write(line)
287
288 # finally, close all files
289 close_files(new_files.values())
290 278
291 279
292 if __name__ == "__main__": 280 if __name__ == "__main__":
293 main() 281 main()