comparison split_file_to_collection.py @ 9:baabc30154cd draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 37305ce5e6ef1b91f6058eff5b128064f76fbe45
author bgruening
date Thu, 23 Nov 2023 20:02:01 +0000
parents 6cbe2f30c2d7
children 2dae863c8f42
comparison
equal deleted inserted replaced
8:6cbe2f30c2d7 9:baabc30154cd
11 # - number of lines to split after (0 if not splitting by number of lines but regex) 11 # - number of lines to split after (0 if not splitting by number of lines but regex)
12 # - a boolean indicating if the record separator is at the end of the record 12 # - a boolean indicating if the record separator is at the end of the record
13 # 13 #
14 # new file types can be added by appending to this dict, 14 # new file types can be added by appending to this dict,
15 # updating the parser, and adding a new type option in the Galaxy wrapper 15 # updating the parser, and adding a new type option in the Galaxy wrapper
16 FILETYPES = {'fasta': (r'^>', 0, False), 16 FILETYPES = {
17 'fastq': (r'', 4, False), 17 "fasta": (r"^>", 0, False),
18 'tabular': (r'', 1, False), 18 "fastq": (r"", 4, False),
19 'txt': (r'', 1, False), 19 "tabular": (r"", 1, False),
20 'mgf': (r'^BEGIN IONS', 0, False), 20 "txt": (r"", 1, False),
21 'sdf': (r'\$\$\$\$', 0, True), 21 "mgf": (r"^BEGIN IONS", 0, False),
22 } 22 "sdf": (r"\$\$\$\$", 0, True),
23 }
23 24
24 25
25 def main(): 26 def main():
26 ps = parser_cli() 27 ps = parser_cli()
27 args = vars(ps.parse_args()) 28 args = vars(ps.parse_args())
28 29
29 # get args and validate 30 # get args and validate
30 in_file = args["in"] 31 in_file = args["in"]
31 if not os.path.isfile(args["in"]): 32 if not os.path.isfile(args["in"]):
32 raise FileNotFoundError('Input file does not exist') 33 raise FileNotFoundError("Input file does not exist")
33 34
34 out_dir = args["out_dir"] 35 out_dir = args["out_dir"]
35 if not os.path.isdir(args["out_dir"]): 36 if not os.path.isdir(args["out_dir"]):
36 raise FileNotFoundError('out_dir is not a directory') 37 raise FileNotFoundError("out_dir is not a directory")
37 38
38 top = args["top"] 39 top = args["top"]
39 if top < 0: 40 if top < 0:
40 raise ValueError("Number of header lines cannot be negative") 41 raise ValueError("Number of header lines cannot be negative")
41 42
42 ftype = args["ftype"] 43 ftype = args["ftype"]
43 44
44 assert ftype != "generic" or args["generic_re"] is not None, "--generic_re needs to be given for generic input" 45 assert (
46 ftype != "generic" or args["generic_re"] is not None
47 ), "--generic_re needs to be given for generic input"
45 48
46 if args["ftype"] == "tabular" and args["by"] == "col": 49 if args["ftype"] == "tabular" and args["by"] == "col":
47 args["match"] = replace_mapped_chars(args["match"]) 50 args["match"] = replace_mapped_chars(args["match"])
48 args["sub"] = replace_mapped_chars(args["sub"]) 51 args["sub"] = replace_mapped_chars(args["sub"])
49 split_by_column(args, in_file, out_dir, top) 52 split_by_column(args, in_file, out_dir, top)
51 args["generic_re"] = replace_mapped_chars(args["generic_re"]) 54 args["generic_re"] = replace_mapped_chars(args["generic_re"])
52 split_by_record(args, in_file, out_dir, top, ftype) 55 split_by_record(args, in_file, out_dir, top, ftype)
53 56
54 57
55 def parser_cli(): 58 def parser_cli():
56 parser = argparse.ArgumentParser(description="split a file into multiple files. " + 59 parser = argparse.ArgumentParser(
57 "Can split on the column of a tabular file, " + 60 description="split a file into multiple files. "
58 "with custom and useful names based on column value.") 61 + "Can split on the column of a tabular file, "
59 parser.add_argument('--in', '-i', required=True, help="The input file") 62 + "with custom and useful names based on column value."
60 parser.add_argument('--out_dir', '-o', default=os.getcwd(), help="The output directory", required=True) 63 )
61 parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files") 64 parser.add_argument("--in", "-i", required=True, help="The input file")
62 parser.add_argument('--file_ext', '-e', help="If not splitting by column," + 65 parser.add_argument(
63 " the extension of the new files (without a period)") 66 "--out_dir",
64 parser.add_argument('--ftype', '-f', help="The type of the file to split", required=True, 67 "-o",
65 choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"]) 68 default=os.getcwd(),
66 parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)", 69 help="The output directory",
67 default="row", choices=["col", "row"]) 70 required=True,
68 parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files.") 71 )
69 parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true') 72 parser.add_argument(
70 parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " + 73 "--file_names",
71 "If not provided and args[\"rand\"]==True, then date is used", type=int) 74 "-a",
75 help="If not splitting by column, the base name of the new files",
76 )
77 parser.add_argument(
78 "--file_ext",
79 "-e",
80 help="If not splitting by column,"
81 + " the extension of the new files (without a period)",
82 )
83 parser.add_argument(
84 "--ftype",
85 "-f",
86 help="The type of the file to split",
87 required=True,
88 choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"],
89 )
90 parser.add_argument(
91 "--by",
92 "-b",
93 help="Split by line or by column (tabular only)",
94 default="row",
95 choices=["col", "row"],
96 )
97 parser.add_argument(
98 "--top",
99 "-t",
100 type=int,
101 default=0,
102 help="Number of header lines to carry over to new files.",
103 )
104 parser.add_argument(
105 "--rand",
106 "-r",
107 help="Divide records randomly into new files",
108 action="store_true",
109 )
110 parser.add_argument(
111 "--seed",
112 "-x",
113 help="Provide a seed for the random number generator. "
114 + 'If not provided and args["rand"]==True, then date is used',
115 type=int,
116 )
72 group = parser.add_mutually_exclusive_group() 117 group = parser.add_mutually_exclusive_group()
73 group.add_argument('--numnew', '-n', type=int, default=1, 118 group.add_argument(
74 help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.") 119 "--numnew",
75 group.add_argument('--chunksize', '-k', type=int, default=0, 120 "-n",
76 help="Number of records by file. Not valid for splitting on a column") 121 type=int,
77 parser.add_argument('--batch', action='store_true', 122 default=1,
78 help="Distribute files to collection while maintaining order. Ignored if splitting on column.") 123 help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.",
79 generic = parser.add_argument_group('Arguments controling generic splitting') 124 )
125 group.add_argument(
126 "--chunksize",
127 "-k",
128 type=int,
129 default=0,
130 help="Number of records by file. Not valid for splitting on a column",
131 )
132 parser.add_argument(
133 "--batch",
134 action="store_true",
135 help="Distribute files to collection while maintaining order. Ignored if splitting on column.",
136 )
137 generic = parser.add_argument_group("Arguments controling generic splitting")
80 group = generic.add_mutually_exclusive_group() 138 group = generic.add_mutually_exclusive_group()
81 group.add_argument('--generic_re', '-g', default="", help="Regular expression indicating the start of a new record (only for generic)", required=False) 139 group.add_argument(
82 group.add_argument('--generic_num', type=int, default=0, help="Length of records in number of lines (only for generic)", required=False) 140 "--generic_re",
83 generic.add_argument('--split_after', '-p', action='store_true', 141 "-g",
84 help="Split between records after separator (default is before). " + 142 default="",
85 "Only for generic splitting by regex - specific ftypes are always split in the default way") 143 help="Regular expression indicating the start of a new record (only for generic)",
86 bycol = parser.add_argument_group('If splitting on a column') 144 required=False,
87 bycol.add_argument('--match', '-m', default="(.*)", help="The regular expression to match id column entries") 145 )
88 bycol.add_argument('--sub', '-s', default=r'\1', 146 group.add_argument(
89 help="The regular expression to substitute in for the matched pattern.") 147 "--generic_num",
90 bycol.add_argument('--id_column', '-c', default="1", 148 type=int,
91 help="Column that is used to name output files. Indexed starting from 1.", type=int) 149 default=0,
150 help="Length of records in number of lines (only for generic)",
151 required=False,
152 )
153 generic.add_argument(
154 "--split_after",
155 "-p",
156 action="store_true",
157 help="Split between records after separator (default is before). "
158 + "Only for generic splitting by regex - specific ftypes are always split in the default way",
159 )
160 bycol = parser.add_argument_group("If splitting on a column")
161 bycol.add_argument(
162 "--match",
163 "-m",
164 default="(.*)",
165 help="The regular expression to match id column entries",
166 )
167 bycol.add_argument(
168 "--sub",
169 "-s",
170 default=r"\1",
171 help="The regular expression to substitute in for the matched pattern.",
172 )
173 bycol.add_argument(
174 "--id_column",
175 "-c",
176 default="1",
177 help="Column that is used to name output files. Indexed starting from 1.",
178 type=int,
179 )
92 return parser 180 return parser
93 181
94 182
95 def replace_mapped_chars(pattern): 183 def replace_mapped_chars(pattern):
96 """ 184 """
97 handles special escaped characters when coming from galaxy 185 handles special escaped characters when coming from galaxy
98 """ 186 """
99 mapped_chars = {'\'': '__sq__', '\\': '__backslash__'} 187 mapped_chars = {"'": "__sq__", "\\": "__backslash__"}
100 for key, value in mapped_chars.items(): 188 for key, value in mapped_chars.items():
101 pattern = pattern.replace(value, key) 189 pattern = pattern.replace(value, key)
102 return pattern 190 return pattern
103 191
104 192
105 def split_by_record(args, in_file, out_dir, top, ftype): 193 def split_by_record(args, in_file, out_dir, top, ftype):
106 # get configuration (record separator, start at end) for given filetype 194 # get configuration (record separator, start at end) for given filetype
107 sep, num, sep_at_end = FILETYPES.get(ftype, (args["generic_re"], args["generic_num"], args["split_after"])) 195 sep, num, sep_at_end = FILETYPES.get(
196 ftype, (args["generic_re"], args["generic_num"], args["split_after"])
197 )
108 sep = re.compile(sep) 198 sep = re.compile(sep)
109 199
110 chunksize = args["chunksize"] 200 chunksize = args["chunksize"]
111 numnew = args["numnew"] 201 numnew = args["numnew"]
112 202
124 # determine 214 # determine
125 # - the number of records that should be stored per file 215 # - the number of records that should be stored per file
126 # (done always, even if used only for batch mode) 216 # (done always, even if used only for batch mode)
127 # - if the separator is a the start / end of the record 217 # - if the separator is a the start / end of the record
128 n_per_file = math.inf 218 n_per_file = math.inf
129 if chunksize != 0 or batch: # needs to be calculated if either batch or chunksize are selected 219 if (
220 chunksize != 0 or batch
221 ): # needs to be calculated if either batch or chunksize are selected
130 with open(in_file) as f: 222 with open(in_file) as f:
131 # read header lines 223 # read header lines
132 for i in range(top): 224 for i in range(top):
133 f.readline() 225 f.readline()
134 n_records = 0 226 n_records = 0
227 last_line_matched = False
135 for line in f: 228 for line in f:
136 if (num == 0 and re.match(sep, line) is not None) or (num > 0 and n_records % num == 0): 229 if (num == 0 and re.match(sep, line) is not None) or (
230 num > 0 and n_records % num == 0
231 ):
137 n_records += 1 232 n_records += 1
138 last_line_matched = True 233 last_line_matched = True
139 else: 234 else:
140 last_line_matched = False 235 last_line_matched = False
141 if sep_at_end and not last_line_matched: 236 if sep_at_end and not last_line_matched:
145 numnew = min(numnew, n_records) 240 numnew = min(numnew, n_records)
146 # approx. number of records per file 241 # approx. number of records per file
147 if chunksize == 0: # i.e. no chunking 242 if chunksize == 0: # i.e. no chunking
148 n_per_file = n_records // numnew 243 n_per_file = n_records // numnew
149 else: 244 else:
150 numnew = n_records // chunksize 245 numnew = max(n_records // chunksize, 1) # should not be less than 1
151 n_per_file = chunksize 246 n_per_file = chunksize
152 247
153 # make new files 248 # make new files
154 # strip extension of old file and add number 249 # strip extension of old file and add number
155 custom_new_file_name = args["file_names"] 250 custom_new_file_name = args["file_names"]
157 if custom_new_file_name is None: 252 if custom_new_file_name is None:
158 new_file_base = os.path.splitext(os.path.basename(in_file)) 253 new_file_base = os.path.splitext(os.path.basename(in_file))
159 else: 254 else:
160 new_file_base = [custom_new_file_name, custom_new_file_ext] 255 new_file_base = [custom_new_file_name, custom_new_file_ext]
161 256
162 newfile_names = [os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) for count in range(0, numnew)] 257 newfile_names = [
258 os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1]))
259 for count in range(0, numnew)
260 ]
163 # bunch o' counters 261 # bunch o' counters
164 # index to list of new files 262 # index to list of new files
165 if rand: 263 if rand:
166 new_file_counter = int(math.floor(random.random() * numnew)) 264 new_file_counter = int(math.floor(random.random() * numnew))
167 else: 265 else:
184 282
185 record = "" 283 record = ""
186 for line_no, line in enumerate(f): 284 for line_no, line in enumerate(f):
187 # check if beginning of line is record sep 285 # check if beginning of line is record sep
188 # if beginning of line is record sep, either start record or finish one 286 # if beginning of line is record sep, either start record or finish one
189 if (num == 0 and re.match(sep, line) is not None) or (num > 0 and line_no % num == 0): 287 if (num == 0 and re.match(sep, line) is not None) or (
288 num > 0 and line_no % num == 0
289 ):
190 # this only happens first time through 290 # this only happens first time through
191 if record == "": 291 if record == "":
192 record += line 292 record += line
193 else: 293 else:
194 # if is in fresh_files, write header and drop from freshFiles 294 # if is in fresh_files, write header and drop from freshFiles
258 n_read += 1 358 n_read += 1
259 if n_read <= top: 359 if n_read <= top:
260 header += line 360 header += line
261 continue 361 continue
262 # split into columns, on tab 362 # split into columns, on tab
263 fields = re.split(r'\t', line.strip('\n')) 363 fields = re.split(r"\t", line.strip("\n"))
264 364
265 # get id column value 365 # get id column value
266 id_col_val = fields[id_col] 366 id_col_val = fields[id_col]
267 367
268 # use regex to get new file name 368 # use regex to get new file name