split_file_to_collection: split_file_to

comparison split_file_to_collection.py @ 5:e77b954f0da5 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"

author	bgruening
date	Fri, 11 Oct 2019 18:24:43 -0400
parents	0850f2dfba13
children	d57735dd27b0

comparison

equal deleted inserted replaced

-:0850f2dfba13
+:e77b954f0da5
 #!/usr/bin/env python
 import argparse
+import math
 import os
 import re
 import random
-import math
+# configuration of the splitting for specific file types
+# - regular expression matching the record separator ('' if not splitting by regex but by number of lines)
-"""
+# - number of lines to split after (0 if not splitting by number of lines but regex)
-regexes that indicate the *beginning* of a record
+# - a boolean indicating if the record separator is at the end of the record
-new file types can be added by appending to this dict,
+#
-updating the parser, and adding a new type option in the Galaxy wrapper
+# new file types can be added by appending to this dict,
-"""
+# updating the parser, and adding a new type option in the Galaxy wrapper
-FILETYPES = {'fasta': '^>',
+FILETYPES = {'fasta': ('^>', 0,  False),
-'fastq': '^@',
+'fastq': ('', 4, False),
-'tabular': '^.*',
+'tabular': ('', 1, False),
-'txt': '^.*',
+'txt': ('', 1, False),
-'mgf': '^BEGIN IONS',
+'mgf': ('^BEGIN IONS', 0, False),
-'sdf': '\$\$\$\$',
+'sdf': ('\$\$\$\$', 0, True),
 }
 def main():
 ps = parser_cli()
 if args["ftype"] == "tabular" and args["by"] == "col":
 args["match"] = replace_mapped_chars(args["match"])
 args["sub"] = replace_mapped_chars(args["sub"])
 split_by_column(args, in_file, out_dir, top)
+else:
-else:
+args["generic_re"] = replace_mapped_chars(args["generic_re"])
 split_by_record(args, in_file, out_dir, top, ftype)
 def parser_cli():
 parser = argparse.ArgumentParser(description="split a file into multiple files. " +
 parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files")
 parser.add_argument('--file_ext', '-e', help="If not splitting by column," +
 " the extension of the new files (without a period)")
 parser.add_argument('--ftype', '-f', help="The type of the file to split", required = True,
 choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"])
-parser.add_argument('--generic_re', '-g', help="Regular expression indicating the start of a new record (only for generic)", required = False)
 parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)",
 default = "row", choices = ["col", "row"])
-parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files. " +
+parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files.")
-"(tabular only).")
 parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true')
 parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " +
 "If not provided and args[\"rand\"]==True, then date is used", type=int)
-parser.add_argument('--numnew', '-n', type=int, default = 1,
+group = parser.add_mutually_exclusive_group()
+group.add_argument('--numnew', '-n', type=int, default = 1,
 help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.")
-parser.add_argument('--chunksize', '-k', type=int, default = 0,
+group.add_argument('--chunksize', '-k', type=int, default = 0,
 help="Number of records by file. Not valid for splitting on a column")
 parser.add_argument('--batch', action='store_true',
 help="Distribute files to collection while maintaining order. Ignored if splitting on column.")
-parser.add_argument('--split_after', '-p', action='store_true',
+generic = parser.add_argument_group('Arguments controling generic splitting')
-help="Split between records after separator (default is before)." +
+group = generic.add_mutually_exclusive_group()
-"Only for generic - specific ftypes are always split in the default way")
+group.add_argument('--generic_re', '-g', default="", help="Regular expression indicating the start of a new record (only for generic)", required = False)
+group.add_argument('--generic_num', type=int, default=0, help="Length of records in number of lines (only for generic)", required = False)
+generic.add_argument('--split_after', '-p', action='store_true',
+help="Split between records after separator (default is before). " +
+"Only for generic splitting by regex - specific ftypes are always split in the default way")
 bycol = parser.add_argument_group('If splitting on a column')
 bycol.add_argument('--match', '-m', default = "(.*)", help="The regular expression to match id column entries")
 bycol.add_argument('--sub', '-s', default = r'\1',
 help="The regular expression to substitute in for the matched pattern.")
 bycol.add_argument('--id_column', '-c', default="1",
 pattern = pattern.replace(value, key)
 return pattern
 def split_by_record(args, in_file, out_dir, top, ftype):
-# get record separator for given filetype
+# get configuration (record separator, lines per record, separator at end) for given filetype
-sep = re.compile(FILETYPES.get(ftype, args["generic_re"]))
+sep, num, sep_at_end = FILETYPES.get(ftype, (args["generic_re"], args["generic_num"], args["split_after"]))
+sep = re.compile(sep)
 chunksize = args["chunksize"]
 numnew = args["numnew"]
 # random division
 else:
 random.seed()
 # batched division (maintains order)
 batch = args["batch"]
+# determine
+# - the number of records that should be stored per file
+#   (done always, even if used only for batch mode)
+# - if the separator is at the start / end of the record
+n_per_file = math.inf
 if chunksize != 0 or batch: # needs to be calculated if either batch or chunksize are selected
-# define n_per_file so we don't get a warning about ref before assignment
-n_per_file = math.inf
-# number of records
 with open(in_file) as f:
-i = 0
+# read header lines
+for i in range(top):
+f.readline()
+n_records = 0
 for line in f:
-if re.match(sep, line) is not None:
+if (num == 0 and re.match(sep, line) is not None) or (num > 0 and n_records % num == 0):
-i+=1
+n_records += 1
-n_records = i + 1
+last_line_matched = True
-if top:
+else:
-n_records -= top  # don't count the top lines
+last_line_matched = False
+if sep_at_end and not last_line_matched:
+n_records += 1
+# if there are fewer records than desired files
+numnew = min(numnew, n_records)
+# approx. number of records per file
 if chunksize == 0: # i.e. no chunking
-# approx. number of lines per file
 n_per_file = n_records // numnew
 else:
-# approx. number of lines per file
 numnew = n_records // chunksize
 n_per_file = chunksize
 # make new files
 # strip extension of old file and add number
 custom_new_file_name = args["file_names"]
 custom_new_file_ext = "." + args["file_ext"]
 newfiles = [
 open(os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) , "w")
 for count in range(0, numnew)
 ]
 # bunch o' counters
 # index to list of new files
-new_file_counter = 0
+if rand:
+new_file_counter = int(math.floor(random.random() * numnew))
-# used for top
+else:
-# number of lines read so far
+new_file_counter = 0
-n_read = 0
 # to contain header specified by top
 header = ""
 # keep track of the files that have been opened so far
-fresh_files = {i for i in range(0, numnew)}
+fresh_files = set(range(numnew))
 # keep track in loop of number of records in each file
 # only used in batch
 records_in_file = 0
 # open file
-with open(in_file, "r") as file:
+with open(in_file, "r") as f:
+# read header
+for i in range(top):
+header += f.readline()
 record = ""
-for line in file:
+for line_no, line in enumerate(f):
-n_read += 1
-if n_read <= top:
-header += line
-continue
 # check if beginning of line is record sep
 # if beginning of line is record sep, either start record or finish one
-if re.match(sep, line) is not None:
+if (num == 0 and re.match(sep, line) is not None) or (num > 0 and line_no % num == 0):
 # this only happens first time through
 if record == "":
 record += line
 else:
 # if is in fresh_files, write header and drop from freshFiles
 if new_file_counter in fresh_files:
 newfiles[new_file_counter].write(header)
 fresh_files.remove(new_file_counter)
-if ftype != "sdf" and args["split_after"] == False:
+if sep_at_end:
-# write record to file
+record += line
-newfiles[new_file_counter].write(record)
+# write record to file
+newfiles[new_file_counter].write(record)
-# if not the first time through, we assign the new record
+if not sep_at_end:
 record = line
+else:
-else:  # for sdf we want to write the line to the record before starting a new one
-record += line
-newfiles[new_file_counter].write(record)
 record = ""
 # change destination file
 if rand:
 new_file_counter = int(math.floor(random.random() * numnew))
 elif batch:
 # number of records read per file
 # so just append
 else:
 record += line
 # after loop, write final record to file
 newfiles[new_file_counter].write(record)
 # close new files
 close_files(newfiles)
 def split_by_column(args, in_file, out_dir, top):

Mercurial > repos > bgruening > split_file_to_collection

comparison split_file_to_collection.py @ 5:e77b954f0da5 draft