Mercurial > repos > bgruening > split_file_to_collection
changeset 5:e77b954f0da5 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
| author | bgruening | 
|---|---|
| date | Fri, 11 Oct 2019 18:24:43 -0400 | 
| parents | 0850f2dfba13 | 
| children | d57735dd27b0 | 
| files | split_file_to_collection.py split_file_to_collection.xml test-data/rand_0.fasta test-data/rand_1.fasta | 
| diffstat | 4 files changed, 144 insertions(+), 86 deletions(-) [+] | 
line wrap: on
 line diff
--- a/split_file_to_collection.py Wed Oct 09 07:34:49 2019 -0400 +++ b/split_file_to_collection.py Fri Oct 11 18:24:43 2019 -0400 @@ -1,23 +1,24 @@ #!/usr/bin/env python import argparse +import math import os import re import random -import math - -""" -regexes that indicate the *beginning* of a record -new file types can be added by appending to this dict, -updating the parser, and adding a new type option in the Galaxy wrapper -""" -FILETYPES = {'fasta': '^>', - 'fastq': '^@', - 'tabular': '^.*', - 'txt': '^.*', - 'mgf': '^BEGIN IONS', - 'sdf': '\$\$\$\$', +# configuration of the splitting for specific file types +# - regular expression matching the record separator ('' if not splitting by regex but by number of lines) +# - number of lines to split after (0 if not splitting by number of lines but regex) +# - a boolean indicating if the record separator is at the end of the record +# +# new file types can be added by appending to this dict, +# updating the parser, and adding a new type option in the Galaxy wrapper +FILETYPES = {'fasta': ('^>', 0, False), + 'fastq': ('', 4, False), + 'tabular': ('', 1, False), + 'txt': ('', 1, False), + 'mgf': ('^BEGIN IONS', 0, False), + 'sdf': ('\$\$\$\$', 0, True), } @@ -46,8 +47,8 @@ args["match"] = replace_mapped_chars(args["match"]) args["sub"] = replace_mapped_chars(args["sub"]) split_by_column(args, in_file, out_dir, top) - else: + args["generic_re"] = replace_mapped_chars(args["generic_re"]) split_by_record(args, in_file, out_dir, top, ftype) @@ -62,23 +63,26 @@ " the extension of the new files (without a period)") parser.add_argument('--ftype', '-f', help="The type of the file to split", required = True, choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"]) - parser.add_argument('--generic_re', '-g', help="Regular expression indicating the start of a new record (only for generic)", required = False) parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)", default = "row", 
choices = ["col", "row"]) - parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files. " + - "(tabular only).") + parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files.") parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true') parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " + "If not provided and args[\"rand\"]==True, then date is used", type=int) - parser.add_argument('--numnew', '-n', type=int, default = 1, + group = parser.add_mutually_exclusive_group() + group.add_argument('--numnew', '-n', type=int, default = 1, help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.") - parser.add_argument('--chunksize', '-k', type=int, default = 0, + group.add_argument('--chunksize', '-k', type=int, default = 0, help="Number of records by file. Not valid for splitting on a column") parser.add_argument('--batch', action='store_true', help="Distribute files to collection while maintaining order. Ignored if splitting on column.") - parser.add_argument('--split_after', '-p', action='store_true', - help="Split between records after separator (default is before)." + - "Only for generic - specific ftypes are always split in the default way") + generic = parser.add_argument_group('Arguments controling generic splitting') + group = generic.add_mutually_exclusive_group() + group.add_argument('--generic_re', '-g', default="", help="Regular expression indicating the start of a new record (only for generic)", required = False) + group.add_argument('--generic_num', type=int, default=0, help="Length of records in number of lines (only for generic)", required = False) + generic.add_argument('--split_after', '-p', action='store_true', + help="Split between records after separator (default is before). 
" + + "Only for generic splitting by regex - specific ftypes are always split in the default way") bycol = parser.add_argument_group('If splitting on a column') bycol.add_argument('--match', '-m', default = "(.*)", help="The regular expression to match id column entries") bycol.add_argument('--sub', '-s', default = r'\1', @@ -105,8 +109,9 @@ def split_by_record(args, in_file, out_dir, top, ftype): - # get record separator for given filetype - sep = re.compile(FILETYPES.get(ftype, args["generic_re"])) + # get configuration (record separator, start at end) for given filetype + sep, num, sep_at_end = FILETYPES.get(ftype, (args["generic_re"], args["generic_num"], args["split_after"])) + sep = re.compile(sep) chunksize = args["chunksize"] numnew = args["numnew"] @@ -121,33 +126,36 @@ # batched division (maintains order) batch = args["batch"] - + # determine + # - the number of records that should be stored per file + # (done always, even if used only for batch mode) + # - if the separator is a the start / end of the record + n_per_file = math.inf if chunksize != 0 or batch: # needs to be calculated if either batch or chunksize are selected - # define n_per_file so we don't get a warning about ref before assignment - n_per_file = math.inf - - # number of records with open(in_file) as f: - i = 0 + # read header lines + for i in range(top): + f.readline() + n_records = 0 for line in f: - if re.match(sep, line) is not None: - i+=1 - n_records = i + 1 - if top: - n_records -= top # don't count the top lines - + if (num == 0 and re.match(sep, line) is not None) or (num > 0 and n_records % num == 0): + n_records += 1 + last_line_matched = True + else: + last_line_matched = False + if sep_at_end and not last_line_matched: + n_records += 1 + + # if there are fewer records than desired files + numnew = min(numnew, n_records) + # approx. number of records per file if chunksize == 0: # i.e. no chunking - # approx. 
number of lines per file n_per_file = n_records // numnew else: - # approx. number of lines per file numnew = n_records // chunksize n_per_file = chunksize - - - # make new files # strip extension of old file and add number custom_new_file_name = args["file_names"] @@ -161,34 +169,32 @@ open(os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) , "w") for count in range(0, numnew) ] - # bunch o' counters # index to list of new files - new_file_counter = 0 - - # used for top - # number of lines read so far - n_read = 0 + if rand: + new_file_counter = int(math.floor(random.random() * numnew)) + else: + new_file_counter = 0 # to contain header specified by top header = "" # keep track of the files that have been opened so far - fresh_files = {i for i in range(0, numnew)} + fresh_files = set(range(numnew)) # keep track in loop of number of records in each file # only used in batch records_in_file = 0 # open file - with open(in_file, "r") as file: + with open(in_file, "r") as f: + # read header + for i in range(top): + header += f.readline() + record = "" - for line in file: - n_read += 1 - if n_read <= top: - header += line - continue + for line_no, line in enumerate(f): # check if beginning of line is record sep # if beginning of line is record sep, either start record or finish one - if re.match(sep, line) is not None: + if (num == 0 and re.match(sep, line) is not None) or (num > 0 and line_no % num == 0): # this only happens first time through if record == "": record += line @@ -198,18 +204,15 @@ newfiles[new_file_counter].write(header) fresh_files.remove(new_file_counter) - if ftype != "sdf" and args["split_after"] == False: - # write record to file - newfiles[new_file_counter].write(record) - - # if not the first time through, we assign the new record + if sep_at_end: + record += line + # write record to file + newfiles[new_file_counter].write(record) + if not sep_at_end: record = line - - else: # for sdf we want to write the line to the 
record before starting a new one - record += line - newfiles[new_file_counter].write(record) + else: record = "" - + # change destination file if rand: new_file_counter = int(math.floor(random.random() * numnew)) @@ -229,6 +232,7 @@ record += line # after loop, write final record to file newfiles[new_file_counter].write(record) + # close new files close_files(newfiles)
--- a/split_file_to_collection.xml Wed Oct 09 07:34:49 2019 -0400 +++ b/split_file_to_collection.xml Fri Oct 11 18:24:43 2019 -0400 @@ -1,4 +1,4 @@ -<tool id="split_file_to_collection" name="Split file" version="0.3.0"> +<tool id="split_file_to_collection" name="Split file" version="0.4.0"> <description>to dataset collection</description> <macros> <xml name="regex_sanitizer"> @@ -77,9 +77,13 @@ #end if #else #if $split_parms.select_ftype == "generic" - --generic_re '$split_parms.generic_regex' - #if $split_parms.split_after == 'true': - --split_after + #if $split_parms.split_method.select_split_method == "regex" + --generic_re '$split_parms.split_method.generic_regex' + #if $split_parms.split_method.split_after == 'true': + --split_after + #end if + #else + --generic_num $split_parms.split_method.record_length #end if #end if #if $split_parms.select_mode.mode == "numnew": @@ -163,14 +167,25 @@ </when> <when value="generic"> <param name="input" type="data" format="txt" label="File to split"/> - <param name="generic_regex" type="text" label="Regex to match record separator" value="^.*"> - <expand macro="regex_sanitizer"/> - </param> + <conditional name="split_method"> + <param name="select_split_method" type="select" label="Method to split files"> + <option value="regex">Specify record separator as regular expression</option> + <option value="number">Specify number of lines after which a record ends</option> + </param> + <when value="regex"> + <param name="generic_regex" type="text" label="Regex to match record separator" value="^.*"> + <expand macro="regex_sanitizer"/> + </param> + <param name="split_after" type="select" value="false" label="Split records before or after the separator?" 
help="If before, the separator will appear at the start of each record; if after, at the end"> + <option value="false" selected="true">Before</option> + <option value="true">After</option> + </param> + </when> + <when value="number"> + <param name="record_length" type="integer" value="1" label="Record length" help="The number of lines after which each record ends"/> + </when> + </conditional> <expand macro="numnew_fname"/> - <param name="split_after" type="select" value="false" label="Split records before or after the separator?" help="If before, the separator will appear at the start of each record; if after, at the end"> - <option value="false" selected="true">Before</option> - <option value="true">After</option> - </param> </when> </conditional> </inputs> @@ -205,6 +220,7 @@ </collection> </outputs> <tests> + <!-- 1 --> <test> <param name="input" value="test.tabular" ftype="tabular"/> <param name="select_ftype" value="tabular"/> @@ -219,6 +235,7 @@ <element name="foo3.tab" file="foo3.tab" ftype="tabular"/> </output_collection> </test> + <!-- 2 --> <test> <param name="input" value="test.tabular" ftype="tabular"/> <param name="select_ftype" value="tabular"/> @@ -232,6 +249,7 @@ <element name="test_000001.tabular" file="test_1.tabular" ftype="tabular"/> </output_collection> </test> + <!-- 3 --> <test> <param name="input" value="test.tabular" ftype="tabular"/> <param name="select_ftype" value="tabular"/> @@ -246,6 +264,7 @@ <element name="batch_tab_000001.tabular" file="batch_tab_1.tabular" ftype="tabular"/> </output_collection> </test> + <!-- 4 --> <test> <param name="input" value="test.tabular" ftype="tabular"/> <param name="select_ftype" value="tabular"/> @@ -260,6 +279,7 @@ <element name="batch_tab_000001.tabular" file="batch_tab_1.tabular" ftype="tabular"/> </output_collection> </test> + <!-- 5 --> <test> <param name="select_ftype" value="txt"/> <param name="input" value="karyotype.txt" ftype="txt"/> @@ -295,6 +315,7 @@ <element name="chr_000023.txt" 
file="chr_000023.txt" ftype="txt"/> </output_collection> </test> + <!-- 6 --> <test> <param name="input" value="psm.tabular" ftype="tabular"/> <param name="select_ftype" value="tabular"/> @@ -310,6 +331,7 @@ <element name="file4.tab" file="file4.tab" ftype="tabular"/> </output_collection> </test> + <!-- 7 splitting of mgf --> <test> <param name="input" value="demo758Dacentroid.mgf" ftype="mgf"/> <param name="select_ftype" value="mgf"/> @@ -322,6 +344,7 @@ <element name="demo_000002.mgf" file="demo_2.mgf" ftype="mgf"/> </output_collection> </test> + <!-- 8 splitting of fasta + desired number of files--> <test> <param name="input" value="test.fasta" ftype="fasta"/> <param name="select_ftype" value="fasta"/> @@ -333,6 +356,7 @@ <element name="test_000001.fasta" file="test_1.fasta" ftype="fasta"/> </output_collection> </test> + <!-- 9 splitting of fasta + desired chunksize --> <test> <param name="input" value="test.fasta" ftype="fasta"/> <param name="select_ftype" value="fasta"/> @@ -344,6 +368,7 @@ <element name="test_000001.fasta" file="test_1.fasta" ftype="fasta"/> </output_collection> </test> + <!-- 10 splitting of fastq, specify desired number of files --> <test> <param name="input" value="test.fastq" ftype="fastq"/> <param name="select_ftype" value="fastq"/> @@ -355,6 +380,23 @@ <element name="test_000001.fastq" file="test_1.fastq" ftype="fastq"/> </output_collection> </test> + <!-- 11 splitting of fastq, specify desired number of files + same as previous test, but by specifying the number of lines per record + explicitely (not using the preset of the python script) --> + <test> + <param name="input" value="test.fastq" ftype="fastq"/> + <param name="select_ftype" value="generic"/> + <param name="select_split_method" value="number"/> + <param name="record_length" value="4"/> + <param name="mode" value="numnew"/> + <param name="numnew" value="2"/> + <param name="newfilenames" value="test"/> + <output_collection name="list_output_generic" type="list"> + <element 
name="test_000000" file="test_0.fastq" ftype="fastq"/> + <element name="test_000001" file="test_1.fastq" ftype="fastq"/> + </output_collection> + </test> + <!-- splitting of fasta w random assignment and specific filename prefix --> <test> <param name="input" value="test.fasta" ftype="fasta"/> <param name="select_ftype" value="fasta"/> @@ -368,6 +410,7 @@ <element name="rand_000001.fasta" file="rand_1.fasta" ftype="fasta"/> </output_collection> </test> + <!-- splitting of fasta w batch assignment and specific filename prefix --> <test> <param name="input" value="test.fasta" ftype="fasta"/> <param name="select_ftype" value="fasta"/> @@ -380,6 +423,7 @@ <element name="fasta_batch_000001.fasta" file="fasta_batch_1.fasta" ftype="fasta"/> </output_collection> </test> + <!-- splitting of txt w default (alternating assignment) --> <test> <param name="input" value="test.tabular" ftype="txt"/> <param name="select_ftype" value="txt"/> @@ -391,9 +435,11 @@ <element name="test_000001.txt" file="test_1.tabular" ftype="txt" lines_diff="1"/> </output_collection> </test> + <!-- generic-regex splitting (of txt) w default assignement (alternating) --> <test> <param name="input" value="test.tabular" ftype="txt"/> <param name="select_ftype" value="generic"/> + <param name="select_split_method" value="regex"/> <param name="generic_regex" value="^.*"/> <param name="mode" value="numnew"/> <param name="numnew" value="2"/> @@ -403,9 +449,11 @@ <element name="test_000001" file="test_1.tabular" ftype="txt" lines_diff="1"/> </output_collection> </test> + <!-- generic-regex splitting (of a fasta) w random assignment --> <test> <param name="input" value="test.fasta" ftype="fasta"/> <param name="select_ftype" value="generic"/> + <param name="select_split_method" value="regex"/> <param name="generic_regex" value="^>.*"/> <param name="mode" value="numnew"/> <param name="numnew" value="2"/> @@ -417,6 +465,7 @@ <element name="rand_000001" file="rand_1.fasta" ftype="fasta"/> </output_collection> 
</test> + <!-- sdf + specify desired number of files --> <test> <param name="input" value="3_molecules.sdf" ftype="sdf"/> <param name="select_ftype" value="sdf"/> @@ -430,6 +479,7 @@ <element name="mol_000002.sdf" file="mol_2.sdf" ftype="sdf"/> </output_collection> </test> + <!-- sdf + specify desired number of records per file (chunksize) --> <test> <param name="input" value="3_molecules.sdf" ftype="sdf"/> <param name="select_ftype" value="sdf"/> @@ -443,10 +493,12 @@ <element name="mol_000002.sdf" file="mol_2.sdf" ftype="sdf"/> </output_collection> </test> + <!-- test split_after (by splitting fasta files after non-header lines) --> <test> <param name="input" value="test.fasta" ftype="fasta"/> <param name="select_ftype" value="generic"/> - <param name="generic_regex" value="^>.*"/> + <param name="select_split_method" value="regex"/> + <param name="generic_regex" value="^[^>].*"/> <param name="split_after" value="true"/> <param name="mode" value="numnew"/> <param name="numnew" value="2"/> @@ -454,7 +506,8 @@ <param name="allocate" value="random"/> <param name="seed" value="1010"/> <output_collection name="list_output_generic" type="list"> - <element name="rand_000001" file="split_after.fasta" ftype="fasta"/> + <element name="rand_000000" file="rand_0.fasta" ftype="fasta"/> + <element name="rand_000001" file="rand_1.fasta" ftype="fasta"/> </output_collection> </test> </tests> @@ -463,10 +516,11 @@ This tool splits a data set consisting of records into multiple data sets within a collection. A record can be for instance simply a line, a FASTA sequence (header + sequence), a FASTQ sequence -(headers + sequence + qualities), etc. The important property is that the beginning of a new record -can be specified by a regular expression, e.g. ".*" for lines, ">.*" for FASTA, or "@.*" for FASTQ. -The tool has presets for text, tabular data sets (which are split by line), FASTA, FASTQ, SDF and MGF. 
-For other data types the text delimiting records can be specified manually using the generic splitter. +(headers + sequence + qualities), etc. The important property is that the records either have a +specific length (e.g. 4 lines for FASTQ) or that the beginning/end of a new record +can be specified by a regular expression, e.g. ".*" for lines or ">.*" for FASTA. +The tool has presets for text, tabular data sets (which are split after each line), FASTA (new records start with ">.*"), FASTQ (records consist of 4 lines), MGF (records start with "^BEGIN IONS") and SDF (records end with "$$$$"). +For other data types the text delimiting records or the number of lines making up a record can be specified manually using the generic splitter. If the generic splitter is used, an option is also available to split records either before or after the separator. If a preset filetype is used, this is selected automatically (after for SDF, before for all others).
