Mercurial > repos > bgruening > split_file_to_collection
comparison split_file_to_collection.py @ 5:e77b954f0da5 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
| author | bgruening |
|---|---|
| date | Fri, 11 Oct 2019 18:24:43 -0400 |
| parents | 0850f2dfba13 |
| children | d57735dd27b0 |
comparison
equal
deleted
inserted
replaced
| 4:0850f2dfba13 | 5:e77b954f0da5 |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 | 2 |
| 3 import argparse | 3 import argparse |
| 4 import math | |
| 4 import os | 5 import os |
| 5 import re | 6 import re |
| 6 import random | 7 import random |
| 7 import math | 8 |
| 8 | 9 # configuration of the splitting for specific file types |
| 9 | 10 # - regular expression matching the record separator ('' if not splitting by regex but by number of lines) |
| 10 """ | 11 # - number of lines to split after (0 if not splitting by number of lines but regex) |
| 11 regexes that indicate the *beginning* of a record | 12 # - a boolean indicating if the record separator is at the end of the record |
| 12 new file types can be added by appending to this dict, | 13 # |
| 13 updating the parser, and adding a new type option in the Galaxy wrapper | 14 # new file types can be added by appending to this dict, |
| 14 """ | 15 # updating the parser, and adding a new type option in the Galaxy wrapper |
| 15 FILETYPES = {'fasta': '^>', | 16 FILETYPES = {'fasta': ('^>', 0, False), |
| 16 'fastq': '^@', | 17 'fastq': ('', 4, False), |
| 17 'tabular': '^.*', | 18 'tabular': ('', 1, False), |
| 18 'txt': '^.*', | 19 'txt': ('', 1, False), |
| 19 'mgf': '^BEGIN IONS', | 20 'mgf': ('^BEGIN IONS', 0, False), |
| 20 'sdf': '\$\$\$\$', | 21 'sdf': ('\$\$\$\$', 0, True), |
| 21 } | 22 } |
| 22 | 23 |
| 23 | 24 |
| 24 def main(): | 25 def main(): |
| 25 ps = parser_cli() | 26 ps = parser_cli() |
| 44 | 45 |
| 45 if args["ftype"] == "tabular" and args["by"] == "col": | 46 if args["ftype"] == "tabular" and args["by"] == "col": |
| 46 args["match"] = replace_mapped_chars(args["match"]) | 47 args["match"] = replace_mapped_chars(args["match"]) |
| 47 args["sub"] = replace_mapped_chars(args["sub"]) | 48 args["sub"] = replace_mapped_chars(args["sub"]) |
| 48 split_by_column(args, in_file, out_dir, top) | 49 split_by_column(args, in_file, out_dir, top) |
| 49 | 50 else: |
| 50 else: | 51 args["generic_re"] = replace_mapped_chars(args["generic_re"]) |
| 51 split_by_record(args, in_file, out_dir, top, ftype) | 52 split_by_record(args, in_file, out_dir, top, ftype) |
| 52 | 53 |
| 53 | 54 |
| 54 def parser_cli(): | 55 def parser_cli(): |
| 55 parser = argparse.ArgumentParser(description="split a file into multiple files. " + | 56 parser = argparse.ArgumentParser(description="split a file into multiple files. " + |
| 60 parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files") | 61 parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files") |
| 61 parser.add_argument('--file_ext', '-e', help="If not splitting by column," + | 62 parser.add_argument('--file_ext', '-e', help="If not splitting by column," + |
| 62 " the extension of the new files (without a period)") | 63 " the extension of the new files (without a period)") |
| 63 parser.add_argument('--ftype', '-f', help="The type of the file to split", required = True, | 64 parser.add_argument('--ftype', '-f', help="The type of the file to split", required = True, |
| 64 choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"]) | 65 choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"]) |
| 65 parser.add_argument('--generic_re', '-g', help="Regular expression indicating the start of a new record (only for generic)", required = False) | |
| 66 parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)", | 66 parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)", |
| 67 default = "row", choices = ["col", "row"]) | 67 default = "row", choices = ["col", "row"]) |
| 68 parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files. " + | 68 parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files.") |
| 69 "(tabular only).") | |
| 70 parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true') | 69 parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true') |
| 71 parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " + | 70 parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " + |
| 72 "If not provided and args[\"rand\"]==True, then date is used", type=int) | 71 "If not provided and args[\"rand\"]==True, then date is used", type=int) |
| 73 parser.add_argument('--numnew', '-n', type=int, default = 1, | 72 group = parser.add_mutually_exclusive_group() |
| 73 group.add_argument('--numnew', '-n', type=int, default = 1, | |
| 74 help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.") | 74 help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.") |
| 75 parser.add_argument('--chunksize', '-k', type=int, default = 0, | 75 group.add_argument('--chunksize', '-k', type=int, default = 0, |
| 76 help="Number of records by file. Not valid for splitting on a column") | 76 help="Number of records by file. Not valid for splitting on a column") |
| 77 parser.add_argument('--batch', action='store_true', | 77 parser.add_argument('--batch', action='store_true', |
| 78 help="Distribute files to collection while maintaining order. Ignored if splitting on column.") | 78 help="Distribute files to collection while maintaining order. Ignored if splitting on column.") |
| 79 parser.add_argument('--split_after', '-p', action='store_true', | 79 generic = parser.add_argument_group('Arguments controling generic splitting') |
| 80 help="Split between records after separator (default is before)." + | 80 group = generic.add_mutually_exclusive_group() |
| 81 "Only for generic - specific ftypes are always split in the default way") | 81 group.add_argument('--generic_re', '-g', default="", help="Regular expression indicating the start of a new record (only for generic)", required = False) |
| 82 group.add_argument('--generic_num', type=int, default=0, help="Length of records in number of lines (only for generic)", required = False) | |
| 83 generic.add_argument('--split_after', '-p', action='store_true', | |
| 84 help="Split between records after separator (default is before). " + | |
| 85 "Only for generic splitting by regex - specific ftypes are always split in the default way") | |
| 82 bycol = parser.add_argument_group('If splitting on a column') | 86 bycol = parser.add_argument_group('If splitting on a column') |
| 83 bycol.add_argument('--match', '-m', default = "(.*)", help="The regular expression to match id column entries") | 87 bycol.add_argument('--match', '-m', default = "(.*)", help="The regular expression to match id column entries") |
| 84 bycol.add_argument('--sub', '-s', default = r'\1', | 88 bycol.add_argument('--sub', '-s', default = r'\1', |
| 85 help="The regular expression to substitute in for the matched pattern.") | 89 help="The regular expression to substitute in for the matched pattern.") |
| 86 bycol.add_argument('--id_column', '-c', default="1", | 90 bycol.add_argument('--id_column', '-c', default="1", |
| 103 pattern = pattern.replace(value, key) | 107 pattern = pattern.replace(value, key) |
| 104 return pattern | 108 return pattern |
| 105 | 109 |
| 106 | 110 |
| 107 def split_by_record(args, in_file, out_dir, top, ftype): | 111 def split_by_record(args, in_file, out_dir, top, ftype): |
| 108 # get record separator for given filetype | 112 # get configuration (record separator, start at end) for given filetype |
| 109 sep = re.compile(FILETYPES.get(ftype, args["generic_re"])) | 113 sep, num, sep_at_end = FILETYPES.get(ftype, (args["generic_re"], args["generic_num"], args["split_after"])) |
| 114 sep = re.compile(sep) | |
| 110 | 115 |
| 111 chunksize = args["chunksize"] | 116 chunksize = args["chunksize"] |
| 112 numnew = args["numnew"] | 117 numnew = args["numnew"] |
| 113 | 118 |
| 114 # random division | 119 # random division |
| 119 else: | 124 else: |
| 120 random.seed() | 125 random.seed() |
| 121 | 126 |
| 122 # batched division (maintains order) | 127 # batched division (maintains order) |
| 123 batch = args["batch"] | 128 batch = args["batch"] |
| 124 | |
| 125 | 129 |
| 130 # determine | |
| 131 # - the number of records that should be stored per file | |
| 132 # (done always, even if used only for batch mode) | |
| 133 # - if the separator is at the start / end of the record | |
| 134 n_per_file = math.inf | |
| 126 if chunksize != 0 or batch: # needs to be calculated if either batch or chunksize are selected | 135 if chunksize != 0 or batch: # needs to be calculated if either batch or chunksize are selected |
| 127 # define n_per_file so we don't get a warning about ref before assignment | |
| 128 n_per_file = math.inf | |
| 129 | |
| 130 # number of records | |
| 131 with open(in_file) as f: | 136 with open(in_file) as f: |
| 132 i = 0 | 137 # read header lines |
| 138 for i in range(top): | |
| 139 f.readline() | |
| 140 n_records = 0 | |
| 133 for line in f: | 141 for line in f: |
| 134 if re.match(sep, line) is not None: | 142 if (num == 0 and re.match(sep, line) is not None) or (num > 0 and n_records % num == 0): |
| 135 i+=1 | 143 n_records += 1 |
| 136 n_records = i + 1 | 144 last_line_matched = True |
| 137 if top: | 145 else: |
| 138 n_records -= top # don't count the top lines | 146 last_line_matched = False |
| 139 | 147 if sep_at_end and not last_line_matched: |
| 148 n_records += 1 | |
| 149 | |
| 150 # if there are fewer records than desired files | |
| 151 numnew = min(numnew, n_records) | |
| 152 # approx. number of records per file | |
| 140 if chunksize == 0: # i.e. no chunking | 153 if chunksize == 0: # i.e. no chunking |
| 141 # approx. number of lines per file | |
| 142 n_per_file = n_records // numnew | 154 n_per_file = n_records // numnew |
| 143 else: | 155 else: |
| 144 # approx. number of lines per file | |
| 145 numnew = n_records // chunksize | 156 numnew = n_records // chunksize |
| 146 n_per_file = chunksize | 157 n_per_file = chunksize |
| 147 | |
| 148 | |
| 149 | |
| 150 | 158 |
| 151 # make new files | 159 # make new files |
| 152 # strip extension of old file and add number | 160 # strip extension of old file and add number |
| 153 custom_new_file_name = args["file_names"] | 161 custom_new_file_name = args["file_names"] |
| 154 custom_new_file_ext = "." + args["file_ext"] | 162 custom_new_file_ext = "." + args["file_ext"] |
| 159 | 167 |
| 160 newfiles = [ | 168 newfiles = [ |
| 161 open(os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) , "w") | 169 open(os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) , "w") |
| 162 for count in range(0, numnew) | 170 for count in range(0, numnew) |
| 163 ] | 171 ] |
| 164 | |
| 165 # bunch o' counters | 172 # bunch o' counters |
| 166 # index to list of new files | 173 # index to list of new files |
| 167 new_file_counter = 0 | 174 if rand: |
| 168 | 175 new_file_counter = int(math.floor(random.random() * numnew)) |
| 169 # used for top | 176 else: |
| 170 # number of lines read so far | 177 new_file_counter = 0 |
| 171 n_read = 0 | |
| 172 # to contain header specified by top | 178 # to contain header specified by top |
| 173 header = "" | 179 header = "" |
| 174 # keep track of the files that have been opened so far | 180 # keep track of the files that have been opened so far |
| 175 fresh_files = {i for i in range(0, numnew)} | 181 fresh_files = set(range(numnew)) |
| 176 | 182 |
| 177 # keep track in loop of number of records in each file | 183 # keep track in loop of number of records in each file |
| 178 # only used in batch | 184 # only used in batch |
| 179 records_in_file = 0 | 185 records_in_file = 0 |
| 180 | 186 |
| 181 # open file | 187 # open file |
| 182 with open(in_file, "r") as file: | 188 with open(in_file, "r") as f: |
| 189 # read header | |
| 190 for i in range(top): | |
| 191 header += f.readline() | |
| 192 | |
| 183 record = "" | 193 record = "" |
| 184 for line in file: | 194 for line_no, line in enumerate(f): |
| 185 n_read += 1 | |
| 186 if n_read <= top: | |
| 187 header += line | |
| 188 continue | |
| 189 # check if beginning of line is record sep | 195 # check if beginning of line is record sep |
| 190 # if beginning of line is record sep, either start record or finish one | 196 # if beginning of line is record sep, either start record or finish one |
| 191 if re.match(sep, line) is not None: | 197 if (num == 0 and re.match(sep, line) is not None) or (num > 0 and line_no % num == 0): |
| 192 # this only happens first time through | 198 # this only happens first time through |
| 193 if record == "": | 199 if record == "": |
| 194 record += line | 200 record += line |
| 195 else: | 201 else: |
| 196 # if is in fresh_files, write header and drop from freshFiles | 202 # if is in fresh_files, write header and drop from freshFiles |
| 197 if new_file_counter in fresh_files: | 203 if new_file_counter in fresh_files: |
| 198 newfiles[new_file_counter].write(header) | 204 newfiles[new_file_counter].write(header) |
| 199 fresh_files.remove(new_file_counter) | 205 fresh_files.remove(new_file_counter) |
| 200 | 206 |
| 201 if ftype != "sdf" and args["split_after"] == False: | 207 if sep_at_end: |
| 202 # write record to file | 208 record += line |
| 203 newfiles[new_file_counter].write(record) | 209 # write record to file |
| 204 | 210 newfiles[new_file_counter].write(record) |
| 205 # if not the first time through, we assign the new record | 211 if not sep_at_end: |
| 206 record = line | 212 record = line |
| 207 | 213 else: |
| 208 else: # for sdf we want to write the line to the record before starting a new one | |
| 209 record += line | |
| 210 newfiles[new_file_counter].write(record) | |
| 211 record = "" | 214 record = "" |
| 212 | 215 |
| 213 # change destination file | 216 # change destination file |
| 214 if rand: | 217 if rand: |
| 215 new_file_counter = int(math.floor(random.random() * numnew)) | 218 new_file_counter = int(math.floor(random.random() * numnew)) |
| 216 elif batch: | 219 elif batch: |
| 217 # number of records read per file | 220 # number of records read per file |
| 227 # so just append | 230 # so just append |
| 228 else: | 231 else: |
| 229 record += line | 232 record += line |
| 230 # after loop, write final record to file | 233 # after loop, write final record to file |
| 231 newfiles[new_file_counter].write(record) | 234 newfiles[new_file_counter].write(record) |
| 235 | |
| 232 # close new files | 236 # close new files |
| 233 close_files(newfiles) | 237 close_files(newfiles) |
| 234 | 238 |
| 235 | 239 |
| 236 def split_by_column(args, in_file, out_dir, top): | 240 def split_by_column(args, in_file, out_dir, top): |
