comparison split_file_to_collection.py @ 9:baabc30154cd draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 37305ce5e6ef1b91f6058eff5b128064f76fbe45
| field | value |
|---|---|
| author | bgruening |
| date | Thu, 23 Nov 2023 20:02:01 +0000 |
| parents | 6cbe2f30c2d7 |
| children | 2dae863c8f42 |
| 8:6cbe2f30c2d7 (before) | 9:baabc30154cd (after) |
|---|---|
| 11 # - number of lines to split after (0 if not splitting by number of lines but regex) | 11 # - number of lines to split after (0 if not splitting by number of lines but regex) | 
| 12 # - a boolean indicating if the record separator is at the end of the record | 12 # - a boolean indicating if the record separator is at the end of the record | 
| 13 # | 13 # | 
| 14 # new file types can be added by appending to this dict, | 14 # new file types can be added by appending to this dict, | 
| 15 # updating the parser, and adding a new type option in the Galaxy wrapper | 15 # updating the parser, and adding a new type option in the Galaxy wrapper | 
| 16 FILETYPES = {'fasta': (r'^>', 0, False), | 16 FILETYPES = { | 
| 17 'fastq': (r'', 4, False), | 17 "fasta": (r"^>", 0, False), | 
| 18 'tabular': (r'', 1, False), | 18 "fastq": (r"", 4, False), | 
| 19 'txt': (r'', 1, False), | 19 "tabular": (r"", 1, False), | 
| 20 'mgf': (r'^BEGIN IONS', 0, False), | 20 "txt": (r"", 1, False), | 
| 21 'sdf': (r'\$\$\$\$', 0, True), | 21 "mgf": (r"^BEGIN IONS", 0, False), | 
| 22 } | 22 "sdf": (r"\$\$\$\$", 0, True), | 
| | 23 } |
| 23 | 24 | 
| 24 | 25 | 
| 25 def main(): | 26 def main(): | 
| 26 ps = parser_cli() | 27 ps = parser_cli() | 
| 27 args = vars(ps.parse_args()) | 28 args = vars(ps.parse_args()) | 
| 28 | 29 | 
| 29 # get args and validate | 30 # get args and validate | 
| 30 in_file = args["in"] | 31 in_file = args["in"] | 
| 31 if not os.path.isfile(args["in"]): | 32 if not os.path.isfile(args["in"]): | 
| 32 raise FileNotFoundError('Input file does not exist') | 33 raise FileNotFoundError("Input file does not exist") | 
| 33 | 34 | 
| 34 out_dir = args["out_dir"] | 35 out_dir = args["out_dir"] | 
| 35 if not os.path.isdir(args["out_dir"]): | 36 if not os.path.isdir(args["out_dir"]): | 
| 36 raise FileNotFoundError('out_dir is not a directory') | 37 raise FileNotFoundError("out_dir is not a directory") | 
| 37 | 38 | 
| 38 top = args["top"] | 39 top = args["top"] | 
| 39 if top < 0: | 40 if top < 0: | 
| 40 raise ValueError("Number of header lines cannot be negative") | 41 raise ValueError("Number of header lines cannot be negative") | 
| 41 | 42 | 
| 42 ftype = args["ftype"] | 43 ftype = args["ftype"] | 
| 43 | 44 | 
| 44 assert ftype != "generic" or args["generic_re"] is not None, "--generic_re needs to be given for generic input" | 45 assert ( | 
| | 46 ftype != "generic" or args["generic_re"] is not None |
| | 47 ), "--generic_re needs to be given for generic input" |
| 45 | 48 | 
| 46 if args["ftype"] == "tabular" and args["by"] == "col": | 49 if args["ftype"] == "tabular" and args["by"] == "col": | 
| 47 args["match"] = replace_mapped_chars(args["match"]) | 50 args["match"] = replace_mapped_chars(args["match"]) | 
| 48 args["sub"] = replace_mapped_chars(args["sub"]) | 51 args["sub"] = replace_mapped_chars(args["sub"]) | 
| 49 split_by_column(args, in_file, out_dir, top) | 52 split_by_column(args, in_file, out_dir, top) | 
| 51 args["generic_re"] = replace_mapped_chars(args["generic_re"]) | 54 args["generic_re"] = replace_mapped_chars(args["generic_re"]) | 
| 52 split_by_record(args, in_file, out_dir, top, ftype) | 55 split_by_record(args, in_file, out_dir, top, ftype) | 
| 53 | 56 | 
| 54 | 57 | 
| 55 def parser_cli(): | 58 def parser_cli(): | 
| 56 parser = argparse.ArgumentParser(description="split a file into multiple files. " + | 59 parser = argparse.ArgumentParser( | 
| 57 "Can split on the column of a tabular file, " + | 60 description="split a file into multiple files. " | 
| 58 "with custom and useful names based on column value.") | 61 + "Can split on the column of a tabular file, " | 
| 59 parser.add_argument('--in', '-i', required=True, help="The input file") | 62 + "with custom and useful names based on column value." | 
| 60 parser.add_argument('--out_dir', '-o', default=os.getcwd(), help="The output directory", required=True) | 63 ) | 
| 61 parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files") | 64 parser.add_argument("--in", "-i", required=True, help="The input file") | 
| 62 parser.add_argument('--file_ext', '-e', help="If not splitting by column," + | 65 parser.add_argument( | 
| 63 " the extension of the new files (without a period)") | 66 "--out_dir", | 
| 64 parser.add_argument('--ftype', '-f', help="The type of the file to split", required=True, | 67 "-o", | 
| 65 choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"]) | 68 default=os.getcwd(), | 
| 66 parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)", | 69 help="The output directory", | 
| 67 default="row", choices=["col", "row"]) | 70 required=True, | 
| 68 parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files.") | 71 ) | 
| 69 parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true') | 72 parser.add_argument( | 
| 70 parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " + | 73 "--file_names", | 
| 71 "If not provided and args[\"rand\"]==True, then date is used", type=int) | 74 "-a", | 
| | 75 help="If not splitting by column, the base name of the new files", |
| | 76 ) |
| | 77 parser.add_argument( |
| | 78 "--file_ext", |
| | 79 "-e", |
| | 80 help="If not splitting by column," |
| | 81 + " the extension of the new files (without a period)", |
| | 82 ) |
| | 83 parser.add_argument( |
| | 84 "--ftype", |
| | 85 "-f", |
| | 86 help="The type of the file to split", |
| | 87 required=True, |
| | 88 choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"], |
| | 89 ) |
| | 90 parser.add_argument( |
| | 91 "--by", |
| | 92 "-b", |
| | 93 help="Split by line or by column (tabular only)", |
| | 94 default="row", |
| | 95 choices=["col", "row"], |
| | 96 ) |
| | 97 parser.add_argument( |
| | 98 "--top", |
| | 99 "-t", |
| | 100 type=int, |
| | 101 default=0, |
| | 102 help="Number of header lines to carry over to new files.", |
| | 103 ) |
| | 104 parser.add_argument( |
| | 105 "--rand", |
| | 106 "-r", |
| | 107 help="Divide records randomly into new files", |
| | 108 action="store_true", |
| | 109 ) |
| | 110 parser.add_argument( |
| | 111 "--seed", |
| | 112 "-x", |
| | 113 help="Provide a seed for the random number generator. " |
| | 114 + 'If not provided and args["rand"]==True, then date is used', |
| | 115 type=int, |
| | 116 ) |
| 72 group = parser.add_mutually_exclusive_group() | 117 group = parser.add_mutually_exclusive_group() | 
| 73 group.add_argument('--numnew', '-n', type=int, default=1, | 118 group.add_argument( | 
| 74 help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.") | 119 "--numnew", | 
| 75 group.add_argument('--chunksize', '-k', type=int, default=0, | 120 "-n", | 
| 76 help="Number of records by file. Not valid for splitting on a column") | 121 type=int, | 
| 77 parser.add_argument('--batch', action='store_true', | 122 default=1, | 
| 78 help="Distribute files to collection while maintaining order. Ignored if splitting on column.") | 123 help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.", | 
| 79 generic = parser.add_argument_group('Arguments controling generic splitting') | 124 ) | 
| | 125 group.add_argument( |
| | 126 "--chunksize", |
| | 127 "-k", |
| | 128 type=int, |
| | 129 default=0, |
| | 130 help="Number of records by file. Not valid for splitting on a column", |
| | 131 ) |
| | 132 parser.add_argument( |
| | 133 "--batch", |
| | 134 action="store_true", |
| | 135 help="Distribute files to collection while maintaining order. Ignored if splitting on column.", |
| | 136 ) |
| | 137 generic = parser.add_argument_group("Arguments controling generic splitting") |
| 80 group = generic.add_mutually_exclusive_group() | 138 group = generic.add_mutually_exclusive_group() | 
| 81 group.add_argument('--generic_re', '-g', default="", help="Regular expression indicating the start of a new record (only for generic)", required=False) | 139 group.add_argument( | 
| 82 group.add_argument('--generic_num', type=int, default=0, help="Length of records in number of lines (only for generic)", required=False) | 140 "--generic_re", | 
| 83 generic.add_argument('--split_after', '-p', action='store_true', | 141 "-g", | 
| 84 help="Split between records after separator (default is before). " + | 142 default="", | 
| 85 "Only for generic splitting by regex - specific ftypes are always split in the default way") | 143 help="Regular expression indicating the start of a new record (only for generic)", | 
| 86 bycol = parser.add_argument_group('If splitting on a column') | 144 required=False, | 
| 87 bycol.add_argument('--match', '-m', default="(.*)", help="The regular expression to match id column entries") | 145 ) | 
| 88 bycol.add_argument('--sub', '-s', default=r'\1', | 146 group.add_argument( | 
| 89 help="The regular expression to substitute in for the matched pattern.") | 147 "--generic_num", | 
| 90 bycol.add_argument('--id_column', '-c', default="1", | 148 type=int, | 
| 91 help="Column that is used to name output files. Indexed starting from 1.", type=int) | 149 default=0, | 
| | 150 help="Length of records in number of lines (only for generic)", |
| | 151 required=False, |
| | 152 ) |
| | 153 generic.add_argument( |
| | 154 "--split_after", |
| | 155 "-p", |
| | 156 action="store_true", |
| | 157 help="Split between records after separator (default is before). " |
| | 158 + "Only for generic splitting by regex - specific ftypes are always split in the default way", |
| | 159 ) |
| | 160 bycol = parser.add_argument_group("If splitting on a column") |
| | 161 bycol.add_argument( |
| | 162 "--match", |
| | 163 "-m", |
| | 164 default="(.*)", |
| | 165 help="The regular expression to match id column entries", |
| | 166 ) |
| | 167 bycol.add_argument( |
| | 168 "--sub", |
| | 169 "-s", |
| | 170 default=r"\1", |
| | 171 help="The regular expression to substitute in for the matched pattern.", |
| | 172 ) |
| | 173 bycol.add_argument( |
| | 174 "--id_column", |
| | 175 "-c", |
| | 176 default="1", |
| | 177 help="Column that is used to name output files. Indexed starting from 1.", |
| | 178 type=int, |
| | 179 ) |
| 92 return parser | 180 return parser | 
| 93 | 181 | 
| 94 | 182 | 
| 95 def replace_mapped_chars(pattern): | 183 def replace_mapped_chars(pattern): | 
| 96 """ | 184 """ | 
| 97 handles special escaped characters when coming from galaxy | 185 handles special escaped characters when coming from galaxy | 
| 98 """ | 186 """ | 
| 99 mapped_chars = {'\'': '__sq__', '\\': '__backslash__'} | 187 mapped_chars = {"'": "__sq__", "\\": "__backslash__"} | 
| 100 for key, value in mapped_chars.items(): | 188 for key, value in mapped_chars.items(): | 
| 101 pattern = pattern.replace(value, key) | 189 pattern = pattern.replace(value, key) | 
| 102 return pattern | 190 return pattern | 
| 103 | 191 | 
| 104 | 192 | 
| 105 def split_by_record(args, in_file, out_dir, top, ftype): | 193 def split_by_record(args, in_file, out_dir, top, ftype): | 
| 106 # get configuration (record separator, start at end) for given filetype | 194 # get configuration (record separator, start at end) for given filetype | 
| 107 sep, num, sep_at_end = FILETYPES.get(ftype, (args["generic_re"], args["generic_num"], args["split_after"])) | 195 sep, num, sep_at_end = FILETYPES.get( | 
| | 196 ftype, (args["generic_re"], args["generic_num"], args["split_after"]) |
| | 197 ) |
| 108 sep = re.compile(sep) | 198 sep = re.compile(sep) | 
| 109 | 199 | 
| 110 chunksize = args["chunksize"] | 200 chunksize = args["chunksize"] | 
| 111 numnew = args["numnew"] | 201 numnew = args["numnew"] | 
| 112 | 202 | 
| 124 # determine | 214 # determine | 
| 125 # - the number of records that should be stored per file | 215 # - the number of records that should be stored per file | 
| 126 # (done always, even if used only for batch mode) | 216 # (done always, even if used only for batch mode) | 
| 127 # - if the separator is a the start / end of the record | 217 # - if the separator is a the start / end of the record | 
| 128 n_per_file = math.inf | 218 n_per_file = math.inf | 
| 129 if chunksize != 0 or batch: # needs to be calculated if either batch or chunksize are selected | 219 if ( | 
| | 220 chunksize != 0 or batch |
| | 221 ): # needs to be calculated if either batch or chunksize are selected |
| 130 with open(in_file) as f: | 222 with open(in_file) as f: | 
| 131 # read header lines | 223 # read header lines | 
| 132 for i in range(top): | 224 for i in range(top): | 
| 133 f.readline() | 225 f.readline() | 
| 134 n_records = 0 | 226 n_records = 0 | 
| | 227 last_line_matched = False |
| 135 for line in f: | 228 for line in f: | 
| 136 if (num == 0 and re.match(sep, line) is not None) or (num > 0 and n_records % num == 0): | 229 if (num == 0 and re.match(sep, line) is not None) or ( | 
| | 230 num > 0 and n_records % num == 0 |
| | 231 ): |
| 137 n_records += 1 | 232 n_records += 1 | 
| 138 last_line_matched = True | 233 last_line_matched = True | 
| 139 else: | 234 else: | 
| 140 last_line_matched = False | 235 last_line_matched = False | 
| 141 if sep_at_end and not last_line_matched: | 236 if sep_at_end and not last_line_matched: | 
| 145 numnew = min(numnew, n_records) | 240 numnew = min(numnew, n_records) | 
| 146 # approx. number of records per file | 241 # approx. number of records per file | 
| 147 if chunksize == 0: # i.e. no chunking | 242 if chunksize == 0: # i.e. no chunking | 
| 148 n_per_file = n_records // numnew | 243 n_per_file = n_records // numnew | 
| 149 else: | 244 else: | 
| 150 numnew = n_records // chunksize | 245 numnew = max(n_records // chunksize, 1) # should not be less than 1 | 
| 151 n_per_file = chunksize | 246 n_per_file = chunksize | 
| 152 | 247 | 
| 153 # make new files | 248 # make new files | 
| 154 # strip extension of old file and add number | 249 # strip extension of old file and add number | 
| 155 custom_new_file_name = args["file_names"] | 250 custom_new_file_name = args["file_names"] | 
| 157 if custom_new_file_name is None: | 252 if custom_new_file_name is None: | 
| 158 new_file_base = os.path.splitext(os.path.basename(in_file)) | 253 new_file_base = os.path.splitext(os.path.basename(in_file)) | 
| 159 else: | 254 else: | 
| 160 new_file_base = [custom_new_file_name, custom_new_file_ext] | 255 new_file_base = [custom_new_file_name, custom_new_file_ext] | 
| 161 | 256 | 
| 162 newfile_names = [os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) for count in range(0, numnew)] | 257 newfile_names = [ | 
| | 258 os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) |
| | 259 for count in range(0, numnew) |
| | 260 ] |
| 163 # bunch o' counters | 261 # bunch o' counters | 
| 164 # index to list of new files | 262 # index to list of new files | 
| 165 if rand: | 263 if rand: | 
| 166 new_file_counter = int(math.floor(random.random() * numnew)) | 264 new_file_counter = int(math.floor(random.random() * numnew)) | 
| 167 else: | 265 else: | 
| 184 | 282 | 
| 185 record = "" | 283 record = "" | 
| 186 for line_no, line in enumerate(f): | 284 for line_no, line in enumerate(f): | 
| 187 # check if beginning of line is record sep | 285 # check if beginning of line is record sep | 
| 188 # if beginning of line is record sep, either start record or finish one | 286 # if beginning of line is record sep, either start record or finish one | 
| 189 if (num == 0 and re.match(sep, line) is not None) or (num > 0 and line_no % num == 0): | 287 if (num == 0 and re.match(sep, line) is not None) or ( | 
| | 288 num > 0 and line_no % num == 0 |
| | 289 ): |
| 190 # this only happens first time through | 290 # this only happens first time through | 
| 191 if record == "": | 291 if record == "": | 
| 192 record += line | 292 record += line | 
| 193 else: | 293 else: | 
| 194 # if is in fresh_files, write header and drop from freshFiles | 294 # if is in fresh_files, write header and drop from freshFiles | 
| 258 n_read += 1 | 358 n_read += 1 | 
| 259 if n_read <= top: | 359 if n_read <= top: | 
| 260 header += line | 360 header += line | 
| 261 continue | 361 continue | 
| 262 # split into columns, on tab | 362 # split into columns, on tab | 
| 263 fields = re.split(r'\t', line.strip('\n')) | 363 fields = re.split(r"\t", line.strip("\n")) | 
| 264 | 364 | 
| 265 # get id column value | 365 # get id column value | 
| 266 id_col_val = fields[id_col] | 366 id_col_val = fields[id_col] | 
| 267 | 367 | 
| 268 # use regex to get new file name | 368 # use regex to get new file name | 
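---

The `FILETYPES` dict (new lines 16-23) maps each supported type to a triple: the record-separator regex, the number of lines per record (0 when splitting on the regex instead), and whether the separator closes a record rather than opening one. Per the comment at lines 14-15, new types are registered by extending this dict, the `--ftype` choices, and the Galaxy wrapper. A minimal sketch of such an extension; the `genbank` entry and its regex are hypothetical illustrations, not part of the tool:

```python
FILETYPES = {
    # (record-separator regex, lines per record or 0, separator-at-end flag)
    "fasta": (r"^>", 0, False),        # a '>' header line starts each record
    "fastq": (r"", 4, False),          # fixed-size records of four lines
    "sdf": (r"\$\$\$\$", 0, True),     # a '$$$$' line closes each record
    # hypothetical addition, for illustration only:
    "genbank": (r"^LOCUS", 0, False),  # a LOCUS line would start each record
}
```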
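
`replace_mapped_chars` (new lines 183-190) undoes Galaxy's parameter escaping, restoring the single quotes and backslashes that the wrapper encodes as `__sq__` and `__backslash__`. A self-contained usage sketch; the `--match` value shown is made up:

```python
def replace_mapped_chars(pattern):
    """Restore characters that Galaxy escapes in tool parameters."""
    mapped_chars = {"'": "__sq__", "\\": "__backslash__"}
    for key, value in mapped_chars.items():
        pattern = pattern.replace(value, key)
    return pattern

# a hypothetical --match value as delivered by the Galaxy wrapper:
print(replace_mapped_chars(r"__sq__(\w+)__backslash__d__sq__"))  # '(\w+)\d'
```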
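
Beyond the black-style reformatting, this revision carries two behavioural changes. New line 227 initialises `last_line_matched` before the record-counting loop, which appears to guard against an `UnboundLocalError` when `sep_at_end` is checked on an input with no data lines after the header. New line 245 clamps `numnew` to at least 1, so a `--chunksize` larger than the record count still yields one output file instead of none. A worked sketch of the second fix:

```python
n_records, chunksize = 5, 10                  # fewer records than the chunk size
numnew_old = n_records // chunksize           # 0: no output files at all
numnew_new = max(n_records // chunksize, 1)   # 1: all records land in one file
```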
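
Output names (new lines 257-260) are built from the input's base name plus a zero-padded counter, so collection elements sort naturally. For example, with a hypothetical input path:

```python
import os

new_file_base = os.path.splitext(os.path.basename("/data/input.fasta"))
# -> ("input", ".fasta")
names = ["%s_%06d%s" % (new_file_base[0], n, new_file_base[1]) for n in range(3)]
# -> ['input_000000.fasta', 'input_000001.fasta', 'input_000002.fasta']
```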
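
When splitting a tabular file on a column, the `--match`/`--sub` pair rewrites the id-column value into an output file name, per the argparse help text above. The substitution call itself falls outside the hunks shown here, so this is only a sketch of the documented behaviour, with hypothetical user-supplied patterns:

```python
import re

match, sub = r"(.*)_R[12]", r"\1"      # hypothetical --match and --sub values
id_col_val = "sampleA_R1"              # value taken from the id column
print(re.sub(match, sub, id_col_val))  # sampleA
```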
