comparison split_file_to_collection.py @ 5:e77b954f0da5 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
author bgruening
date Fri, 11 Oct 2019 18:24:43 -0400
parents 0850f2dfba13
children d57735dd27b0
comparison
equal deleted inserted replaced
4:0850f2dfba13 5:e77b954f0da5
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 import argparse 3 import argparse
4 import math
4 import os 5 import os
5 import re 6 import re
6 import random 7 import random
7 import math 8
8 9 # configuration of the splitting for specific file types
9 10 # - regular expression matching the record separator ('' if not splitting by regex but by number of lines)
10 """ 11 # - number of lines to split after (0 if not splitting by number of lines but regex)
11 regexes that indicate the *beginning* of a record 12 # - a boolean indicating if the record separator is at the end of the record
12 new file types can be added by appending to this dict, 13 #
13 updating the parser, and adding a new type option in the Galaxy wrapper 14 # new file types can be added by appending to this dict,
14 """ 15 # updating the parser, and adding a new type option in the Galaxy wrapper
15 FILETYPES = {'fasta': '^>', 16 FILETYPES = {'fasta': ('^>', 0, False),
16 'fastq': '^@', 17 'fastq': ('', 4, False),
17 'tabular': '^.*', 18 'tabular': ('', 1, False),
18 'txt': '^.*', 19 'txt': ('', 1, False),
19 'mgf': '^BEGIN IONS', 20 'mgf': ('^BEGIN IONS', 0, False),
20 'sdf': '\$\$\$\$', 21 'sdf': ('\$\$\$\$', 0, True),
21 } 22 }
22 23
23 24
24 def main(): 25 def main():
25 ps = parser_cli() 26 ps = parser_cli()
44 45
45 if args["ftype"] == "tabular" and args["by"] == "col": 46 if args["ftype"] == "tabular" and args["by"] == "col":
46 args["match"] = replace_mapped_chars(args["match"]) 47 args["match"] = replace_mapped_chars(args["match"])
47 args["sub"] = replace_mapped_chars(args["sub"]) 48 args["sub"] = replace_mapped_chars(args["sub"])
48 split_by_column(args, in_file, out_dir, top) 49 split_by_column(args, in_file, out_dir, top)
49 50 else:
50 else: 51 args["generic_re"] = replace_mapped_chars(args["generic_re"])
51 split_by_record(args, in_file, out_dir, top, ftype) 52 split_by_record(args, in_file, out_dir, top, ftype)
52 53
53 54
54 def parser_cli(): 55 def parser_cli():
55 parser = argparse.ArgumentParser(description="split a file into multiple files. " + 56 parser = argparse.ArgumentParser(description="split a file into multiple files. " +
60 parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files") 61 parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files")
61 parser.add_argument('--file_ext', '-e', help="If not splitting by column," + 62 parser.add_argument('--file_ext', '-e', help="If not splitting by column," +
62 " the extension of the new files (without a period)") 63 " the extension of the new files (without a period)")
63 parser.add_argument('--ftype', '-f', help="The type of the file to split", required = True, 64 parser.add_argument('--ftype', '-f', help="The type of the file to split", required = True,
64 choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"]) 65 choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"])
65 parser.add_argument('--generic_re', '-g', help="Regular expression indicating the start of a new record (only for generic)", required = False)
66 parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)", 66 parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)",
67 default = "row", choices = ["col", "row"]) 67 default = "row", choices = ["col", "row"])
68 parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files. " + 68 parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files.")
69 "(tabular only).")
70 parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true') 69 parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true')
71 parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " + 70 parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " +
72 "If not provided and args[\"rand\"]==True, then date is used", type=int) 71 "If not provided and args[\"rand\"]==True, then date is used", type=int)
73 parser.add_argument('--numnew', '-n', type=int, default = 1, 72 group = parser.add_mutually_exclusive_group()
73 group.add_argument('--numnew', '-n', type=int, default = 1,
74 help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.") 74 help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.")
75 parser.add_argument('--chunksize', '-k', type=int, default = 0, 75 group.add_argument('--chunksize', '-k', type=int, default = 0,
76 help="Number of records by file. Not valid for splitting on a column") 76 help="Number of records by file. Not valid for splitting on a column")
77 parser.add_argument('--batch', action='store_true', 77 parser.add_argument('--batch', action='store_true',
78 help="Distribute files to collection while maintaining order. Ignored if splitting on column.") 78 help="Distribute files to collection while maintaining order. Ignored if splitting on column.")
79 parser.add_argument('--split_after', '-p', action='store_true', 79 generic = parser.add_argument_group('Arguments controling generic splitting')
80 help="Split between records after separator (default is before)." + 80 group = generic.add_mutually_exclusive_group()
81 "Only for generic - specific ftypes are always split in the default way") 81 group.add_argument('--generic_re', '-g', default="", help="Regular expression indicating the start of a new record (only for generic)", required = False)
82 group.add_argument('--generic_num', type=int, default=0, help="Length of records in number of lines (only for generic)", required = False)
83 generic.add_argument('--split_after', '-p', action='store_true',
84 help="Split between records after separator (default is before). " +
85 "Only for generic splitting by regex - specific ftypes are always split in the default way")
82 bycol = parser.add_argument_group('If splitting on a column') 86 bycol = parser.add_argument_group('If splitting on a column')
83 bycol.add_argument('--match', '-m', default = "(.*)", help="The regular expression to match id column entries") 87 bycol.add_argument('--match', '-m', default = "(.*)", help="The regular expression to match id column entries")
84 bycol.add_argument('--sub', '-s', default = r'\1', 88 bycol.add_argument('--sub', '-s', default = r'\1',
85 help="The regular expression to substitute in for the matched pattern.") 89 help="The regular expression to substitute in for the matched pattern.")
86 bycol.add_argument('--id_column', '-c', default="1", 90 bycol.add_argument('--id_column', '-c', default="1",
103 pattern = pattern.replace(value, key) 107 pattern = pattern.replace(value, key)
104 return pattern 108 return pattern
105 109
106 110
107 def split_by_record(args, in_file, out_dir, top, ftype): 111 def split_by_record(args, in_file, out_dir, top, ftype):
108 # get record separator for given filetype 112 # get configuration (record separator, number of lines per record, separator at end of record) for given filetype
109 sep = re.compile(FILETYPES.get(ftype, args["generic_re"])) 113 sep, num, sep_at_end = FILETYPES.get(ftype, (args["generic_re"], args["generic_num"], args["split_after"]))
114 sep = re.compile(sep)
110 115
111 chunksize = args["chunksize"] 116 chunksize = args["chunksize"]
112 numnew = args["numnew"] 117 numnew = args["numnew"]
113 118
114 # random division 119 # random division
119 else: 124 else:
120 random.seed() 125 random.seed()
121 126
122 # batched division (maintains order) 127 # batched division (maintains order)
123 batch = args["batch"] 128 batch = args["batch"]
124
125 129
130 # determine
131 # - the number of records that should be stored per file
132 # (done always, even if used only for batch mode)
133 # - if the separator is at the start / end of the record
134 n_per_file = math.inf
126 if chunksize != 0 or batch: # needs to be calculated if either batch or chunksize are selected 135 if chunksize != 0 or batch: # needs to be calculated if either batch or chunksize are selected
127 # define n_per_file so we don't get a warning about ref before assignment
128 n_per_file = math.inf
129
130 # number of records
131 with open(in_file) as f: 136 with open(in_file) as f:
132 i = 0 137 # read header lines
138 for i in range(top):
139 f.readline()
140 n_records = 0
133 for line in f: 141 for line in f:
134 if re.match(sep, line) is not None: 142 if (num == 0 and re.match(sep, line) is not None) or (num > 0 and n_records % num == 0):
135 i+=1 143 n_records += 1
136 n_records = i + 1 144 last_line_matched = True
137 if top: 145 else:
138 n_records -= top # don't count the top lines 146 last_line_matched = False
139 147 if sep_at_end and not last_line_matched:
148 n_records += 1
149
150 # if there are fewer records than desired files
151 numnew = min(numnew, n_records)
152 # approx. number of records per file
140 if chunksize == 0: # i.e. no chunking 153 if chunksize == 0: # i.e. no chunking
141 # approx. number of lines per file
142 n_per_file = n_records // numnew 154 n_per_file = n_records // numnew
143 else: 155 else:
144 # approx. number of lines per file
145 numnew = n_records // chunksize 156 numnew = n_records // chunksize
146 n_per_file = chunksize 157 n_per_file = chunksize
147
148
149
150 158
151 # make new files 159 # make new files
152 # strip extension of old file and add number 160 # strip extension of old file and add number
153 custom_new_file_name = args["file_names"] 161 custom_new_file_name = args["file_names"]
154 custom_new_file_ext = "." + args["file_ext"] 162 custom_new_file_ext = "." + args["file_ext"]
159 167
160 newfiles = [ 168 newfiles = [
161 open(os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) , "w") 169 open(os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) , "w")
162 for count in range(0, numnew) 170 for count in range(0, numnew)
163 ] 171 ]
164
165 # bunch o' counters 172 # bunch o' counters
166 # index to list of new files 173 # index to list of new files
167 new_file_counter = 0 174 if rand:
168 175 new_file_counter = int(math.floor(random.random() * numnew))
169 # used for top 176 else:
170 # number of lines read so far 177 new_file_counter = 0
171 n_read = 0
172 # to contain header specified by top 178 # to contain header specified by top
173 header = "" 179 header = ""
174 # keep track of the files that have been opened so far 180 # keep track of the files that have been opened so far
175 fresh_files = {i for i in range(0, numnew)} 181 fresh_files = set(range(numnew))
176 182
177 # keep track in loop of number of records in each file 183 # keep track in loop of number of records in each file
178 # only used in batch 184 # only used in batch
179 records_in_file = 0 185 records_in_file = 0
180 186
181 # open file 187 # open file
182 with open(in_file, "r") as file: 188 with open(in_file, "r") as f:
189 # read header
190 for i in range(top):
191 header += f.readline()
192
183 record = "" 193 record = ""
184 for line in file: 194 for line_no, line in enumerate(f):
185 n_read += 1
186 if n_read <= top:
187 header += line
188 continue
189 # check if beginning of line is record sep 195 # check if beginning of line is record sep
190 # if beginning of line is record sep, either start record or finish one 196 # if beginning of line is record sep, either start record or finish one
191 if re.match(sep, line) is not None: 197 if (num == 0 and re.match(sep, line) is not None) or (num > 0 and line_no % num == 0):
192 # this only happens first time through 198 # this only happens first time through
193 if record == "": 199 if record == "":
194 record += line 200 record += line
195 else: 201 else:
196 # if the file is in fresh_files, write header and drop it from fresh_files 202 # if the file is in fresh_files, write header and drop it from fresh_files
197 if new_file_counter in fresh_files: 203 if new_file_counter in fresh_files:
198 newfiles[new_file_counter].write(header) 204 newfiles[new_file_counter].write(header)
199 fresh_files.remove(new_file_counter) 205 fresh_files.remove(new_file_counter)
200 206
201 if ftype != "sdf" and args["split_after"] == False: 207 if sep_at_end:
202 # write record to file 208 record += line
203 newfiles[new_file_counter].write(record) 209 # write record to file
204 210 newfiles[new_file_counter].write(record)
205 # if not the first time through, we assign the new record 211 if not sep_at_end:
206 record = line 212 record = line
207 213 else:
208 else: # for sdf we want to write the line to the record before starting a new one
209 record += line
210 newfiles[new_file_counter].write(record)
211 record = "" 214 record = ""
212 215
213 # change destination file 216 # change destination file
214 if rand: 217 if rand:
215 new_file_counter = int(math.floor(random.random() * numnew)) 218 new_file_counter = int(math.floor(random.random() * numnew))
216 elif batch: 219 elif batch:
217 # number of records read per file 220 # number of records read per file
227 # so just append 230 # so just append
228 else: 231 else:
229 record += line 232 record += line
230 # after loop, write final record to file 233 # after loop, write final record to file
231 newfiles[new_file_counter].write(record) 234 newfiles[new_file_counter].write(record)
235
232 # close new files 236 # close new files
233 close_files(newfiles) 237 close_files(newfiles)
234 238
235 239
236 def split_by_column(args, in_file, out_dir, top): 240 def split_by_column(args, in_file, out_dir, top):