Mercurial > repos > bgruening > split_file_to_collection
comparison split_file_to_collection.py @ 8:6cbe2f30c2d7 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
author | bgruening |
---|---|
date | Sun, 12 Jul 2020 10:27:06 -0400 |
parents | 0046692724f9 |
children | baabc30154cd |
comparison
equal
deleted
inserted
replaced
7:0046692724f9 | 8:6cbe2f30c2d7 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 | 2 |
3 import argparse | 3 import argparse |
4 import math | 4 import math |
5 import os | 5 import os |
6 import random | |
6 import re | 7 import re |
7 import random | |
8 | 8 |
9 # configuration of the splitting for specific file types | 9 # configuration of the splitting for specific file types |
10 # - regular expression matching the record separator ('' if not splitting by regex but by number of lines) | 10 # - regular expression matching the record separator ('' if not splitting by regex but by number of lines) |
11 # - number of lines to split after (0 if not splitting by number of lines but regex) | 11 # - number of lines to split after (0 if not splitting by number of lines but regex) |
12 # - a boolean indicating if the record separator is at the end of the record | 12 # - a boolean indicating if the record separator is at the end of the record |
13 # | 13 # |
14 # new file types can be added by appending to this dict, | 14 # new file types can be added by appending to this dict, |
15 # updating the parser, and adding a new type option in the Galaxy wrapper | 15 # updating the parser, and adding a new type option in the Galaxy wrapper |
16 FILETYPES = {'fasta': ('^>', 0, False), | 16 FILETYPES = {'fasta': (r'^>', 0, False), |
17 'fastq': ('', 4, False), | 17 'fastq': (r'', 4, False), |
18 'tabular': ('', 1, False), | 18 'tabular': (r'', 1, False), |
19 'txt': ('', 1, False), | 19 'txt': (r'', 1, False), |
20 'mgf': ('^BEGIN IONS', 0, False), | 20 'mgf': (r'^BEGIN IONS', 0, False), |
21 'sdf': ('\$\$\$\$', 0, True), | 21 'sdf': (r'\$\$\$\$', 0, True), |
22 } | 22 } |
23 | 23 |
24 | 24 |
25 def main(): | 25 def main(): |
26 ps = parser_cli() | 26 ps = parser_cli() |
39 if top < 0: | 39 if top < 0: |
40 raise ValueError("Number of header lines cannot be negative") | 40 raise ValueError("Number of header lines cannot be negative") |
41 | 41 |
42 ftype = args["ftype"] | 42 ftype = args["ftype"] |
43 | 43 |
44 assert ftype != "generic" or args["generic_re"] != None, "--generic_re needs to be given for generic input" | 44 assert ftype != "generic" or args["generic_re"] is not None, "--generic_re needs to be given for generic input" |
45 | 45 |
46 if args["ftype"] == "tabular" and args["by"] == "col": | 46 if args["ftype"] == "tabular" and args["by"] == "col": |
47 args["match"] = replace_mapped_chars(args["match"]) | 47 args["match"] = replace_mapped_chars(args["match"]) |
48 args["sub"] = replace_mapped_chars(args["sub"]) | 48 args["sub"] = replace_mapped_chars(args["sub"]) |
49 split_by_column(args, in_file, out_dir, top) | 49 split_by_column(args, in_file, out_dir, top) |
59 parser.add_argument('--in', '-i', required=True, help="The input file") | 59 parser.add_argument('--in', '-i', required=True, help="The input file") |
60 parser.add_argument('--out_dir', '-o', default=os.getcwd(), help="The output directory", required=True) | 60 parser.add_argument('--out_dir', '-o', default=os.getcwd(), help="The output directory", required=True) |
61 parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files") | 61 parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files") |
62 parser.add_argument('--file_ext', '-e', help="If not splitting by column," + | 62 parser.add_argument('--file_ext', '-e', help="If not splitting by column," + |
63 " the extension of the new files (without a period)") | 63 " the extension of the new files (without a period)") |
64 parser.add_argument('--ftype', '-f', help="The type of the file to split", required = True, | 64 parser.add_argument('--ftype', '-f', help="The type of the file to split", required=True, |
65 choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"]) | 65 choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"]) |
66 parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)", | 66 parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)", |
67 default = "row", choices = ["col", "row"]) | 67 default="row", choices=["col", "row"]) |
68 parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files.") | 68 parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files.") |
69 parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true') | 69 parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true') |
70 parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " + | 70 parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " + |
71 "If not provided and args[\"rand\"]==True, then date is used", type=int) | 71 "If not provided and args[\"rand\"]==True, then date is used", type=int) |
72 group = parser.add_mutually_exclusive_group() | 72 group = parser.add_mutually_exclusive_group() |
73 group.add_argument('--numnew', '-n', type=int, default = 1, | 73 group.add_argument('--numnew', '-n', type=int, default=1, |
74 help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.") | 74 help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.") |
75 group.add_argument('--chunksize', '-k', type=int, default = 0, | 75 group.add_argument('--chunksize', '-k', type=int, default=0, |
76 help="Number of records by file. Not valid for splitting on a column") | 76 help="Number of records by file. Not valid for splitting on a column") |
77 parser.add_argument('--batch', action='store_true', | 77 parser.add_argument('--batch', action='store_true', |
78 help="Distribute files to collection while maintaining order. Ignored if splitting on column.") | 78 help="Distribute files to collection while maintaining order. Ignored if splitting on column.") |
79 generic = parser.add_argument_group('Arguments controlling generic splitting') | 79 generic = parser.add_argument_group('Arguments controlling generic splitting') |
80 group = generic.add_mutually_exclusive_group() | 80 group = generic.add_mutually_exclusive_group() |
81 group.add_argument('--generic_re', '-g', default="", help="Regular expression indicating the start of a new record (only for generic)", required = False) | 81 group.add_argument('--generic_re', '-g', default="", help="Regular expression indicating the start of a new record (only for generic)", required=False) |
82 group.add_argument('--generic_num', type=int, default=0, help="Length of records in number of lines (only for generic)", required = False) | 82 group.add_argument('--generic_num', type=int, default=0, help="Length of records in number of lines (only for generic)", required=False) |
83 generic.add_argument('--split_after', '-p', action='store_true', | 83 generic.add_argument('--split_after', '-p', action='store_true', |
84 help="Split between records after separator (default is before). " + | 84 help="Split between records after separator (default is before). " + |
85 "Only for generic splitting by regex - specific ftypes are always split in the default way") | 85 "Only for generic splitting by regex - specific ftypes are always split in the default way") |
86 bycol = parser.add_argument_group('If splitting on a column') | 86 bycol = parser.add_argument_group('If splitting on a column') |
87 bycol.add_argument('--match', '-m', default = "(.*)", help="The regular expression to match id column entries") | 87 bycol.add_argument('--match', '-m', default="(.*)", help="The regular expression to match id column entries") |
88 bycol.add_argument('--sub', '-s', default = r'\1', | 88 bycol.add_argument('--sub', '-s', default=r'\1', |
89 help="The regular expression to substitute in for the matched pattern.") | 89 help="The regular expression to substitute in for the matched pattern.") |
90 bycol.add_argument('--id_column', '-c', default="1", | 90 bycol.add_argument('--id_column', '-c', default="1", |
91 help="Column that is used to name output files. Indexed starting from 1.", type=int) | 91 help="Column that is used to name output files. Indexed starting from 1.", type=int) |
92 return parser | 92 return parser |
93 | |
94 | |
95 def close_files(file_list): | |
96 # finally, close all files | |
97 for open_file in file_list: | |
98 open_file.close() | |
99 | 93 |
100 | 94 |
101 def replace_mapped_chars(pattern): | 95 def replace_mapped_chars(pattern): |
102 """ | 96 """ |
103 handles special escaped characters when coming from galaxy | 97 handles special escaped characters when coming from galaxy |
124 else: | 118 else: |
125 random.seed() | 119 random.seed() |
126 | 120 |
127 # batched division (maintains order) | 121 # batched division (maintains order) |
128 batch = args["batch"] | 122 batch = args["batch"] |
129 | 123 |
130 # determine | 124 # determine |
131 # - the number of records that should be stored per file | 125 # - the number of records that should be stored per file |
132 # (done always, even if used only for batch mode) | 126 # (done always, even if used only for batch mode) |
133 # - if the separator is at the start / end of the record | 127 # - if the separator is at the start / end of the record |
134 n_per_file = math.inf | 128 n_per_file = math.inf |
135 if chunksize != 0 or batch: # needs to be calculated if either batch or chunksize are selected | 129 if chunksize != 0 or batch: # needs to be calculated if either batch or chunksize are selected |
136 with open(in_file) as f: | 130 with open(in_file) as f: |
137 # read header lines | 131 # read header lines |
138 for i in range(top): | 132 for i in range(top): |
139 f.readline() | 133 f.readline() |
140 n_records = 0 | 134 n_records = 0 |
148 n_records += 1 | 142 n_records += 1 |
149 | 143 |
150 # if there are fewer records than desired files | 144 # if there are fewer records than desired files |
151 numnew = min(numnew, n_records) | 145 numnew = min(numnew, n_records) |
152 # approx. number of records per file | 146 # approx. number of records per file |
153 if chunksize == 0: # i.e. no chunking | 147 if chunksize == 0: # i.e. no chunking |
154 n_per_file = n_records // numnew | 148 n_per_file = n_records // numnew |
155 else: | 149 else: |
156 numnew = n_records // chunksize | 150 numnew = n_records // chunksize |
157 n_per_file = chunksize | 151 n_per_file = chunksize |
158 | 152 |
163 if custom_new_file_name is None: | 157 if custom_new_file_name is None: |
164 new_file_base = os.path.splitext(os.path.basename(in_file)) | 158 new_file_base = os.path.splitext(os.path.basename(in_file)) |
165 else: | 159 else: |
166 new_file_base = [custom_new_file_name, custom_new_file_ext] | 160 new_file_base = [custom_new_file_name, custom_new_file_ext] |
167 | 161 |
168 newfiles = [ | 162 newfile_names = [os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) for count in range(0, numnew)] |
169 open(os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) , "w") | |
170 for count in range(0, numnew) | |
171 ] | |
172 # bunch o' counters | 163 # bunch o' counters |
173 # index to list of new files | 164 # index to list of new files |
174 if rand: | 165 if rand: |
175 new_file_counter = int(math.floor(random.random() * numnew)) | 166 new_file_counter = int(math.floor(random.random() * numnew)) |
176 else: | 167 else: |
177 new_file_counter = 0 | 168 new_file_counter = 0 |
169 new_file = open(newfile_names[new_file_counter], "a") | |
178 # to contain header specified by top | 170 # to contain header specified by top |
179 header = "" | 171 header = "" |
180 # keep track of the files that have been opened so far | 172 # keep track of the files that have been opened so far |
181 fresh_files = set(range(numnew)) | 173 fresh_files = set(range(numnew)) |
182 | 174 |
199 if record == "": | 191 if record == "": |
200 record += line | 192 record += line |
201 else: | 193 else: |
202 # if is in fresh_files, write header and drop from freshFiles | 194 # if is in fresh_files, write header and drop from freshFiles |
203 if new_file_counter in fresh_files: | 195 if new_file_counter in fresh_files: |
204 newfiles[new_file_counter].write(header) | 196 new_file.write(header) |
205 fresh_files.remove(new_file_counter) | 197 fresh_files.remove(new_file_counter) |
206 | 198 |
207 if sep_at_end: | 199 if sep_at_end: |
208 record += line | 200 record += line |
209 # write record to file | 201 # write record to file |
210 newfiles[new_file_counter].write(record) | 202 new_file.write(record) |
211 if not sep_at_end: | 203 if not sep_at_end: |
212 record = line | 204 record = line |
213 else: | 205 else: |
214 record = "" | 206 record = "" |
215 | 207 |
216 # change destination file | 208 # change destination file |
217 if rand: | 209 if rand: |
218 new_file_counter = int(math.floor(random.random() * numnew)) | 210 new_file_counter = int(math.floor(random.random() * numnew)) |
211 new_file.close() | |
212 new_file = open(newfile_names[new_file_counter], "a") | |
219 elif batch: | 213 elif batch: |
220 # number of records read per file | 214 # number of records read per file |
221 records_in_file += 1 | 215 records_in_file += 1 |
222 # have we reached the max for each file? | 216 # have we reached the max for each file? |
223 # if so, switch file | 217 # if so, switch file |
224 if records_in_file >= n_per_file: | 218 if records_in_file >= n_per_file: |
225 new_file_counter = (new_file_counter + 1) % numnew | 219 new_file_counter = (new_file_counter + 1) % numnew |
226 records_in_file = 0 # reset to 0 | 220 records_in_file = 0 # reset to 0 |
221 new_file.close() | |
222 new_file = open(newfile_names[new_file_counter], "a") | |
227 else: | 223 else: |
228 new_file_counter = (new_file_counter + 1) % numnew | 224 new_file_counter = (new_file_counter + 1) % numnew |
225 new_file.close() | |
226 new_file = open(newfile_names[new_file_counter], "a") | |
229 # if beginning of line is not record sep, we must be inside a record | 227 # if beginning of line is not record sep, we must be inside a record |
230 # so just append | 228 # so just append |
231 else: | 229 else: |
232 record += line | 230 record += line |
233 # after loop, write final record to file | 231 # after loop, write final record to file |
234 newfiles[new_file_counter].write(record) | 232 new_file.write(record) |
235 | 233 new_file.close() |
236 # close new files | |
237 close_files(newfiles) | |
238 | 234 |
239 | 235 |
240 def split_by_column(args, in_file, out_dir, top): | 236 def split_by_column(args, in_file, out_dir, top): |
241 | 237 |
242 # shift to 0-based indexing | 238 # shift to 0-based indexing |
249 raise | 245 raise |
250 | 246 |
251 sub = args["sub"] | 247 sub = args["sub"] |
252 | 248 |
253 # set of file names | 249 # set of file names |
254 new_files = dict() | 250 files = set() |
255 | 251 |
256 # keep track of how many lines have been read | 252 # keep track of how many lines have been read |
257 n_read = 0 | 253 n_read = 0 |
258 header = "" | 254 header = "" |
259 with open(in_file) as file: | 255 with open(in_file) as file: |
272 # use regex to get new file name | 268 # use regex to get new file name |
273 out_file_name = re.sub(match, sub, id_col_val) | 269 out_file_name = re.sub(match, sub, id_col_val) |
274 out_file_path = os.path.join(out_dir, out_file_name) | 270 out_file_path = os.path.join(out_dir, out_file_name) |
275 | 271 |
276 # write | 272 # write |
277 if out_file_name not in new_files.keys(): | 273 with open(out_file_path, "a") as current_new_file: |
278 # open file (new, so not already open) | 274 if out_file_name not in files: |
279 current_new_file = open(out_file_path, "w") | 275 current_new_file.write(header) |
280 current_new_file.write(header) | 276 files.add(out_file_name) |
281 current_new_file.write(line) | 277 current_new_file.write(line) |
282 # add to dict | |
283 new_files[out_file_name] = current_new_file | |
284 else: | |
285 # file is already open, so just write to it | |
286 new_files[out_file_name].write(line) | |
287 | |
288 # finally, close all files | |
289 close_files(new_files.values()) | |
290 | 278 |
291 | 279 |
292 if __name__ == "__main__": | 280 if __name__ == "__main__": |
293 main() | 281 main() |