comparison split_file_to_collection.py @ 0:de3c2c88e710 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
author bgruening
date Tue, 17 Jul 2018 14:37:13 -0400
parents
children d150ac3d853d
comparison
equal deleted inserted replaced
-1:000000000000 0:de3c2c88e710
1 #!/usr/bin/env python
2
3 import argparse
4 import os
5 import re
6 import random
7 import math
8
9
"""
regexes that indicate the *beginning* of a record
new file types can be added by appending to this dict,
updating the parser, and adding a new type option in the Galaxy wrapper
"""
# Maps file type name -> regex matching the first line of each record.
# Note 'tabular' matches every line, i.e. each line is its own record.
FILETYPES = {'fasta': '^>',
             'fastq': '^@',
             'tabular': '^.*',
             'mgf': '^BEGIN IONS'}
19
20
def main():
    """Entry point: parse and validate CLI arguments, then dispatch to the
    column-based or record-based splitter."""
    args = vars(parser_cli().parse_args())

    in_file = args["in"]
    out_dir = args["out_dir"]
    top = args["top"]
    ftype = args["ftype"]

    # validate before doing any work
    if not os.path.isfile(in_file):
        raise FileNotFoundError('Input file does not exist')

    if not os.path.isdir(out_dir):
        raise FileNotFoundError('out_dir is not a directory')

    if top < 0:
        raise ValueError("Number of header lines cannot be negative")

    # splitting on a column value only makes sense for tabular input;
    # everything else is split record-by-record
    if ftype == "tabular" and args["by"] == "col":
        args["match"] = replace_mapped_chars(args["match"])
        args["sub"] = replace_mapped_chars(args["sub"])
        split_by_column(args, in_file, out_dir, top)
    else:
        split_by_record(args, in_file, out_dir, top, ftype)
47
48
def parser_cli():
    """Build and return the argparse parser for this tool.

    Returns:
        argparse.ArgumentParser: configured parser; call ``parse_args`` on it.
    """
    parser = argparse.ArgumentParser(description="split a file into multiple files. " +
                                     "Can split on the column of a tabular file, " +
                                     "with custom and useful names based on column value.")
    parser.add_argument('--in', '-i', required=True, help="The input file")
    # BUGFIX: was declared with both required=True and a default, which are
    # contradictory (the default could never apply). The option is now
    # optional and genuinely defaults to the current working directory.
    parser.add_argument('--out_dir', '-o', default=os.getcwd(), help="The output directory")
    parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files")
    parser.add_argument('--file_ext', '-e', help="If not splitting by column," +
                        " the extension of the new files (without a period)")
    parser.add_argument('--ftype', '-f', help="The type of the file to split", required=True,
                        choices=["mgf", "fastq", "fasta", "tabular"])
    parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)",
                        default="row", choices=["col", "row"])
    parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files. " +
                        "(tabular only).")
    parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true')
    parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " +
                        "If not provided and args[\"rand\"]==True, then date is used", type=int)
    parser.add_argument('--numnew', '-n', type=int, default=1,
                        help="Number of output files desired. Not valid for splitting on a column")
    parser.add_argument('--batch', action='store_true',
                        help="Distribute files to collection while maintaining order. Ignored if splitting on column.")

    bycol = parser.add_argument_group('If splitting on a column')
    bycol.add_argument('--match', '-m', default="(.*)", help="The regular expression to match id column entries")
    bycol.add_argument('--sub', '-s', default=r'\1',
                       help="The regular expression to substitute in for the matched pattern.")
    bycol.add_argument('--id_column', '-c', default="1",
                       help="Column that is used to name output files. Indexed starting from 1.", type=int)
    return parser
79
80
def close_files(file_list):
    """Close every open file handle in *file_list*."""
    for handle in file_list:
        handle.close()
85
86
def replace_mapped_chars(pattern):
    """Undo Galaxy's escaping of special characters in *pattern*.

    Galaxy passes single quotes and backslashes through as named
    placeholders; map each placeholder back to its literal character.
    """
    # Order matters only for reproducibility; it mirrors the original mapping.
    for literal, placeholder in (("'", "__sq__"), ("\\", "__backslash__")):
        pattern = pattern.replace(placeholder, literal)
    return pattern
95
96
def split_by_record(args, in_file, out_dir, top, ftype):
    """Split *in_file* into ``args["numnew"]`` files, record by record.

    Records are recognized by the start-of-record regex ``FILETYPES[ftype]``.
    Records are distributed round-robin by default, randomly with --rand, or
    in contiguous batches with --batch. The first *top* lines are copied as a
    header into every output file that receives at least one record.

    Args:
        args (dict): parsed CLI arguments (numnew, rand, seed, batch,
            file_names, file_ext).
        in_file (str): path to the input file.
        out_dir (str): existing directory for the output files.
        top (int): number of leading header lines to replicate.
        ftype (str): key into FILETYPES selecting the record separator.
    """
    # get record separator for given filetype
    sep = re.compile(FILETYPES[ftype])

    numnew = args["numnew"]

    # random division
    rand = args["rand"]
    seed = args["seed"]
    if seed:
        random.seed(seed)
    else:
        random.seed()

    # batched division (maintains order)
    batch = args["batch"]
    # define n_per_file so we don't get a warning about ref before assignment
    n_per_file = math.inf
    if batch:
        # count record-separator lines to estimate the number of records
        with open(in_file) as f:
            i = 0
            for line in f:
                if re.match(sep, line) is not None:
                    i += 1
        n_records = i + 1
        if top:
            n_records -= top  # don't count the top lines

        # approx. number of records per file
        n_per_file = n_records // numnew

    # make new files
    # strip extension of old file and add number
    custom_new_file_name = args["file_names"]
    custom_new_file_ext = args["file_ext"]
    if custom_new_file_name is None:
        new_file_base = os.path.splitext(os.path.basename(in_file))
    else:
        # BUGFIX: the extension was previously built unconditionally as
        # '.' + args["file_ext"], which raised TypeError whenever
        # --file_ext was omitted. Build it only when one was supplied.
        ext = "." + custom_new_file_ext if custom_new_file_ext else ""
        new_file_base = [custom_new_file_name, ext]

    newfiles = [
        open(os.path.join(out_dir, new_file_base[0] + "_" + str(count) + new_file_base[1]), "w")
        for count in range(0, numnew)
    ]

    # bunch o' counters
    # index to list of new files
    new_file_counter = 0

    # used for top
    # number of lines read so far
    n_read = 0
    # to contain header specified by top
    header = ""
    # keep track of the files that have not yet received their header
    fresh_files = {i for i in range(0, numnew)}

    # keep track in loop of number of records in each file
    # only used in batch
    records_in_file = 0

    # open file
    with open(in_file, "r") as file:
        record = ""
        for line in file:
            n_read += 1
            if n_read <= top:
                header += line
                continue
            # check if beginning of line is record sep
            # if beginning of line is record sep, either start record or finish one
            if re.match(sep, line) is not None:
                # this only happens first time through
                if record == "":
                    record += line
                else:
                    # if is in fresh_files, write header and drop from fresh_files
                    if new_file_counter in fresh_files:
                        newfiles[new_file_counter].write(header)
                        fresh_files.remove(new_file_counter)

                    # write record to file
                    newfiles[new_file_counter].write(record)

                    # if not the first time through, we assign the new record
                    record = line

                    # change destination file
                    if rand:
                        new_file_counter = int(math.floor(random.random() * numnew))
                    elif batch:
                        # number of records read per file
                        records_in_file += 1
                        # have we reached the max for each file?
                        # if so, switch file
                        if records_in_file >= n_per_file:
                            new_file_counter = (new_file_counter + 1) % numnew
                            records_in_file = 0  # reset to 0
                    else:
                        new_file_counter = (new_file_counter + 1) % numnew
            # if beginning of line is not record sep, we must be inside a record
            # so just append
            else:
                record += line
        # after loop, write final record to file
        # BUGFIX: write the header first if this file has not received
        # anything yet (the last file previously missed the --top header)
        if record and new_file_counter in fresh_files:
            newfiles[new_file_counter].write(header)
            fresh_files.remove(new_file_counter)
        newfiles[new_file_counter].write(record)
    # close new files
    close_files(newfiles)
206
207
def split_by_column(args, in_file, out_dir, top):
    """Split a tabular file into one output file per distinct id-column value.

    The value in column ``args["id_column"]`` (1-based) is transformed by the
    match/sub regex pair to produce each output file name. The first *top*
    lines are copied as a header into every output file.

    Args:
        args (dict): parsed CLI arguments (id_column, match, sub).
        in_file (str): path to the tabular input file.
        out_dir (str): existing directory for the output files.
        top (int): number of leading header lines to replicate.

    Raises:
        re.error: if the supplied match pattern is not valid regex.
    """
    # shift to 0-based indexing
    id_col = int(args["id_column"]) - 1

    try:
        match = re.compile(args["match"])
    except re.error:
        print("ERROR: Match (-m) supplied is not valid regex.")
        raise

    sub = args["sub"]

    # map: output file name -> open file handle
    new_files = dict()

    # keep track of how many lines have been read
    n_read = 0
    header = ""
    with open(in_file) as file:
        for line in file:
            # if still in top, save to header
            n_read += 1
            if n_read <= top:
                header += line
                continue
            # split into columns, on tab (str.split is equivalent to the
            # former re.split(r'\t', ...) and avoids the regex machinery)
            fields = line.strip('\n').split('\t')

            # get id column value and derive the output file name from it
            id_col_val = fields[id_col]
            out_file_name = re.sub(match, sub, id_col_val)
            out_file_path = os.path.join(out_dir, out_file_name)

            # write
            if out_file_name not in new_files:
                # first record for this name: open the file (kept open for
                # the rest of the run) and write the header before the line
                current_new_file = open(out_file_path, "w")
                current_new_file.write(header)
                current_new_file.write(line)
                # add to dict
                new_files[out_file_name] = current_new_file
            else:
                # file is already open, so just write to it
                new_files[out_file_name].write(line)

    # finally, close all files
    close_files(new_files.values())
258
259
# run the CLI entry point only when executed as a script
if __name__ == "__main__":
    main()