# HG changeset patch
# User bgruening
# Date 1570832683 14400
# Node ID e77b954f0da5ce39519feee9b9a8067bad490eb2
# Parent  0850f2dfba13d0f6b1391c37c592de4d1294116d
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219

diff -r 0850f2dfba13 -r e77b954f0da5 split_file_to_collection.py
--- a/split_file_to_collection.py	Wed Oct 09 07:34:49 2019 -0400
+++ b/split_file_to_collection.py	Fri Oct 11 18:24:43 2019 -0400
@@ -1,23 +1,24 @@
 #!/usr/bin/env python
 
 import argparse
+import math
 import os
 import re
 import random
-import math
-
-"""
-regexes that indicate the *beginning* of a record
-new file types can be added by appending to this dict,
-updating the parser, and adding a new type option in the Galaxy wrapper
-"""
-FILETYPES = {'fasta': '^>',
-             'fastq': '^@',
-             'tabular': '^.*',
-             'txt': '^.*',
-             'mgf': '^BEGIN IONS',
-             'sdf': '\$\$\$\$',
+# configuration of the splitting for specific file types:
+# - regular expression matching the record separator ('' if not splitting by regex but by number of lines)
+# - number of lines to split after (0 if not splitting by number of lines but by regex)
+# - a boolean indicating if the record separator is at the end of the record
+#
+# new file types can be added by appending to this dict,
+# updating the parser, and adding a new type option in the Galaxy wrapper
+FILETYPES = {'fasta': ('^>', 0, False),
+             'fastq': ('', 4, False),
+             'tabular': ('', 1, False),
+             'txt': ('', 1, False),
+             'mgf': ('^BEGIN IONS', 0, False),
+             'sdf': ('\$\$\$\$', 0, True),
              }
@@ -46,8 +47,8 @@
         args["match"] = replace_mapped_chars(args["match"])
         args["sub"] = replace_mapped_chars(args["sub"])
         split_by_column(args, in_file, out_dir, top)
-
     else:
+        args["generic_re"] = replace_mapped_chars(args["generic_re"])
         split_by_record(args, in_file, out_dir, top, ftype)
@@ -62,23 +63,26 @@
                         " the extension of the new files (without a period)")
     parser.add_argument('--ftype', '-f', help="The type of the file to split", required = True,
                         choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"])
-    parser.add_argument('--generic_re', '-g', help="Regular expression indicating the start of a new record (only for generic)", required = False)
     parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)",
                         default = "row", choices = ["col", "row"])
-    parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files. " +
-                        "(tabular only).")
+    parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files.")
    parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true')
    parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " +
                        "If not provided and args[\"rand\"]==True, then date is used", type=int)
-    parser.add_argument('--numnew', '-n', type=int, default = 1,
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument('--numnew', '-n', type=int, default = 1,
                         help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.")
-    parser.add_argument('--chunksize', '-k', type=int, default = 0,
+    group.add_argument('--chunksize', '-k', type=int, default = 0,
                         help="Number of records per file. Not valid for splitting on a column")
     parser.add_argument('--batch', action='store_true',
                         help="Distribute files to collection while maintaining order. Ignored if splitting on column.")
-    parser.add_argument('--split_after', '-p', action='store_true',
-                        help="Split between records after separator (default is before)." +
-                        "Only for generic - specific ftypes are always split in the default way")
+    generic = parser.add_argument_group('Arguments controlling generic splitting')
+    group = generic.add_mutually_exclusive_group()
+    group.add_argument('--generic_re', '-g', default="", help="Regular expression indicating the start of a new record (only for generic)", required = False)
+    group.add_argument('--generic_num', type=int, default=0, help="Length of records in number of lines (only for generic)", required = False)
+    generic.add_argument('--split_after', '-p', action='store_true',
+                        help="Split between records after separator (default is before). " +
+                        "Only for generic splitting by regex - specific ftypes are always split in the default way")
     bycol = parser.add_argument_group('If splitting on a column')
     bycol.add_argument('--match', '-m', default = "(.*)", help="The regular expression to match id column entries")
     bycol.add_argument('--sub', '-s', default = r'\1',
@@ -105,8 +109,9 @@
 
 
 def split_by_record(args, in_file, out_dir, top, ftype):
-    # get record separator for given filetype
-    sep = re.compile(FILETYPES.get(ftype, args["generic_re"]))
+    # get configuration (record separator, number of lines, separator at end) for the given filetype
+    sep, num, sep_at_end = FILETYPES.get(ftype, (args["generic_re"], args["generic_num"], args["split_after"]))
+    sep = re.compile(sep)
 
     chunksize = args["chunksize"]
     numnew = args["numnew"]
@@ -121,33 +126,36 @@
 
     # batched division (maintains order)
     batch = args["batch"]
-
+    # determine
+    # - the number of records that should be stored per file
+    #   (always done, even if only used for batch mode)
+    # - if the separator is at the start / end of the record
+    n_per_file = math.inf
     if chunksize != 0 or batch:  # needs to be calculated if either batch or chunksize are selected
-        # define n_per_file so we don't get a warning about ref before assignment
-        n_per_file = math.inf
-
-        # number of records
         with open(in_file) as f:
-            i = 0
+            # read header lines
+            for i in range(top):
+                f.readline()
+            n_records = 0
             for line in f:
-                if re.match(sep, line) is not None:
-                    i+=1
-            n_records = i + 1
-        if top:
-            n_records -= top  # don't count the top lines
-
+                if (num == 0 and re.match(sep, line) is not None) or (num > 0 and n_records % num == 0):
+                    n_records += 1
+                    last_line_matched = True
+                else:
+                    last_line_matched = False
+            if sep_at_end and not last_line_matched:
+                n_records += 1
+
+        # if there are fewer records than desired files
+        numnew = min(numnew, n_records)
+        # approx. number of records per file
         if chunksize == 0:  # i.e. no chunking
-            # approx. number of lines per file
             n_per_file = n_records // numnew
         else:
-            # approx. number of lines per file
             numnew = n_records // chunksize
             n_per_file = chunksize
-
-
-
     # make new files
     # strip extension of old file and add number
     custom_new_file_name = args["file_names"]
@@ -161,34 +169,32 @@
                 open(os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])), "w")
                 for count in range(0, numnew)
             ]
 
-    # bunch o' counters
     # index to list of new files
-    new_file_counter = 0
-
-    # used for top
-    # number of lines read so far
-    n_read = 0
+    if rand:
+        new_file_counter = int(math.floor(random.random() * numnew))
+    else:
+        new_file_counter = 0
 
     # to contain header specified by top
     header = ""
 
     # keep track of the files that have been opened so far
-    fresh_files = {i for i in range(0, numnew)}
+    fresh_files = set(range(numnew))
 
     # keep track in loop of number of records in each file
     # only used in batch
     records_in_file = 0
 
     # open file
-    with open(in_file, "r") as file:
+    with open(in_file, "r") as f:
+        # read header
+        for i in range(top):
+            header += f.readline()
+
         record = ""
-        for line in file:
-            n_read += 1
-            if n_read <= top:
-                header += line
-                continue
+        for line_no, line in enumerate(f):
             # check if beginning of line is record sep
             # if beginning of line is record sep, either start record or finish one
-            if re.match(sep, line) is not None:
+            if (num == 0 and re.match(sep, line) is not None) or (num > 0 and line_no % num == 0):
                 # this only happens first time through
                 if record == "":
                     record += line
@@ -198,18 +204,15 @@
                         newfiles[new_file_counter].write(header)
                         fresh_files.remove(new_file_counter)
 
-                    if ftype != "sdf" and args["split_after"] == False:
-                        # write record to file
-                        newfiles[new_file_counter].write(record)
-
-                        # if not the first time through, we assign the new record
+                    if sep_at_end:
+                        record += line
+                    # write record to file
+                    newfiles[new_file_counter].write(record)
+                    if not sep_at_end:
                         record = line
-
-                    else:  # for sdf we want to write the line to the record before starting a new one
-                        record += line
-                        newfiles[new_file_counter].write(record)
+                    else:
                         record = ""
-
+
                     # change destination file
                     if rand:
                         new_file_counter = int(math.floor(random.random() * numnew))
@@ -229,6 +232,7 @@
             record += line
 
     # after loop, write final record to file
     newfiles[new_file_counter].write(record)
+    # close new files
     close_files(newfiles)
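To make the reworked FILETYPES scheme above easier to follow before moving on to the wrapper changes: each entry is a (separator regex, lines per record, separator-at-end) triple, and the two code paths of split_by_record both branch on the same condition. The following minimal, self-contained sketch is not part of the patch; iter_records is a hypothetical helper written purely for illustration of that condition, under the assumption that input lines keep their trailing newlines.

    import re

    # same shape as the patch's FILETYPES entries:
    # (separator regex, lines per record, separator ends the record?)
    FILETYPES = {'fasta': ('^>', 0, False),
                 'fastq': ('', 4, False),
                 'sdf': (r'\$\$\$\$', 0, True)}


    def iter_records(lines, regex, num, sep_at_end):
        """Yield one record (a string of full lines) at a time."""
        sep = re.compile(regex)
        record = ""
        for line_no, line in enumerate(lines):
            # a boundary is either a regex match (num == 0)
            # or every num-th line (num > 0)
            boundary = (num == 0 and sep.match(line) is not None) or \
                       (num > 0 and line_no % num == 0)
            if boundary and record and not sep_at_end:
                # separator opens the next record (fasta, mgf): flush first
                yield record
                record = ""
            record += line
            if boundary and sep_at_end:
                # separator closes the current record (sdf): flush with it
                yield record
                record = ""
        if record:
            yield record


    lines = [">a\n", "SEQ1\n", ">b\n", "SEQ2\n"]
    print(list(iter_records(lines, *FILETYPES['fasta'])))
    # ['>a\nSEQ1\n', '>b\nSEQ2\n']

With num == 4 and an empty regex the same loop groups FASTQ lines four at a time, which is why the patch can treat fixed-length and regex-delimited records uniformly. The wrapper changes for exposing this follow.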
diff -r 0850f2dfba13 -r e77b954f0da5 split_file_to_collection.xml
--- a/split_file_to_collection.xml	Wed Oct 09 07:34:49 2019 -0400
+++ b/split_file_to_collection.xml	Fri Oct 11 18:24:43 2019 -0400
@@ -1,4 +1,4 @@
[The changed <tool> opening tag (the version bump for this release) was stripped during extraction; of this hunk only the description text "to dataset collection" survives.]
@@ -77,9 +77,13 @@
         #end if
     #else
         #if $split_parms.select_ftype == "generic"
-            --generic_re '$split_parms.generic_regex'
-            #if $split_parms.split_after == 'true':
-                --split_after
+            #if $split_parms.split_method.select_split_method == "regex"
+                --generic_re '$split_parms.split_method.generic_regex'
+                #if $split_parms.split_method.split_after == 'true':
+                    --split_after
+                #end if
+            #else
+                --generic_num $split_parms.split_method.record_length
             #end if
         #end if
     #if $split_parms.select_mode.mode == "numnew":
[The remaining hunks of this file, @@ -163,14 +167,25 @@ through @@ -454,7 +506,8 @@, consist of XML elements whose tags were stripped during extraction and cannot be reconstructed here. Judging from the command section above and the hunk pattern (one added line per existing test, plus a larger new block), they appear to define the new split_method conditional (regex vs. record length) in the inputs and to update the tool's tests accordingly.]
@@ -463,10 +516,11 @@
 
 This tool splits a data set consisting of records into multiple data sets within a collection.
 A record can be for instance simply a line, a FASTA sequence (header + sequence), a FASTQ sequence
-(headers + sequence + qualities), etc. The important property is that the beginning of a new record
-can be specified by a regular expression, e.g. ".*" for lines, ">.*" for FASTA, or "@.*" for FASTQ.
-The tool has presets for text, tabular data sets (which are split by line), FASTA, FASTQ, SDF and MGF.
-For other data types the text delimiting records can be specified manually using the generic splitter.
+(headers + sequence + qualities), etc. The important property is that the records either have a
+specific length (e.g. 4 lines for FASTQ) or that the beginning/end of a record
+can be specified by a regular expression, e.g. ".*" for lines or ">.*" for FASTA.
+The tool has presets for text and tabular data sets (which are split after each line), FASTA (new records start with ">.*"), FASTQ (records consist of 4 lines), MGF (records start with "^BEGIN IONS"), and SDF (records end with "$$$$").
+For other data types the text delimiting records or the number of lines making up a record can be specified manually using the generic splitter.
 
 If the generic splitter is used, an option is also available to split records either before or after
 the separator. If a preset filetype is used, this is selected automatically (after for SDF, before for all others).
diff -r 0850f2dfba13 -r e77b954f0da5 test-data/rand_0.fasta
--- a/test-data/rand_0.fasta	Wed Oct 09 07:34:49 2019 -0400
+++ b/test-data/rand_0.fasta	Fri Oct 11 18:24:43 2019 -0400
@@ -1,5 +1,5 @@
->seq1
-PROTEIN0
+>seq3
+ANOTHERPROTEIN
 >seq4
 ASFWEFOIN
 >seq5
diff -r 0850f2dfba13 -r e77b954f0da5 test-data/rand_1.fasta
--- a/test-data/rand_1.fasta	Wed Oct 09 07:34:49 2019 -0400
+++ b/test-data/rand_1.fasta	Fri Oct 11 18:24:43 2019 -0400
@@ -1,4 +1,4 @@
+>seq1
+PROTEIN0
 >seq2
 PROTEIN
->seq3
-ANOTHERPROTEIN
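A closing note on the argument-parser change in split_file_to_collection.py: --numnew/--chunksize and --generic_re/--generic_num are now rejected when passed together. The behavior can be sketched in isolation as follows; this is a standalone illustration, not code from the repository, and only the option names and group layout are taken from the patch:

    import argparse

    parser = argparse.ArgumentParser()
    # number of files vs. records per file: pick at most one
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--numnew', '-n', type=int, default=1)
    group.add_argument('--chunksize', '-k', type=int, default=0)
    # generic splitting: by regex or by record length, not both
    generic = parser.add_argument_group('Arguments controlling generic splitting')
    regex_or_num = generic.add_mutually_exclusive_group()
    regex_or_num.add_argument('--generic_re', '-g', default="")
    regex_or_num.add_argument('--generic_num', type=int, default=0)

    print(parser.parse_args(['--chunksize', '100']))  # accepted
    # parser.parse_args(['-n', '3', '-k', '100'])
    # exits with an error like:
    #   argument --chunksize/-k: not allowed with argument --numnew/-n

This makes argparse enforce what the old --numnew help text could only document ("will be ignored if both are set"), so conflicting invocations now fail loudly instead of silently preferring one option.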