diff split_file_to_collection.py @ 8:6cbe2f30c2d7 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit a1f1f00463f734ba43f4e5c13c63fe5297ee128e-dirty"
author bgruening
date Sun, 12 Jul 2020 10:27:06 -0400
parents 0046692724f9
children baabc30154cd
line wrap: on
line diff
--- a/split_file_to_collection.py	Fri Jul 10 13:41:00 2020 -0400
+++ b/split_file_to_collection.py	Sun Jul 12 10:27:06 2020 -0400
@@ -3,8 +3,8 @@
 import argparse
 import math
 import os
+import random
 import re
-import random
 
 # configuration of the splitting for specific file types
 # - regular expression matching the record separator ('' if not splitting by regex but by number of lines)
@@ -13,12 +13,12 @@
 #
 # new file types can be added by appending to this dict,
 # updating the parser, and adding a new type option in the Galaxy wrapper
-FILETYPES = {'fasta': ('^>', 0,  False),
-             'fastq': ('', 4, False),
-             'tabular': ('', 1, False),
-             'txt': ('', 1, False),
-             'mgf': ('^BEGIN IONS', 0, False),
-             'sdf': ('\$\$\$\$', 0, True),
+FILETYPES = {'fasta': (r'^>', 0,  False),
+             'fastq': (r'', 4, False),
+             'tabular': (r'', 1, False),
+             'txt': (r'', 1, False),
+             'mgf': (r'^BEGIN IONS', 0, False),
+             'sdf': (r'\$\$\$\$', 0, True),
              }
 
 
@@ -41,7 +41,7 @@
 
     ftype = args["ftype"]
 
-    assert ftype != "generic" or args["generic_re"] != None, "--generic_re needs to be given for generic input"
+    assert ftype != "generic" or args["generic_re"] is not None, "--generic_re needs to be given for generic input"
 
     if args["ftype"] == "tabular" and args["by"] == "col":
         args["match"] = replace_mapped_chars(args["match"])
@@ -61,43 +61,37 @@
     parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files")
     parser.add_argument('--file_ext', '-e', help="If not splitting by column," +
                                                  " the extension of the new files (without a period)")
-    parser.add_argument('--ftype', '-f', help="The type of the file to split", required = True,
-        choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"])
+    parser.add_argument('--ftype', '-f', help="The type of the file to split", required=True,
+                        choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"])
     parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)",
-        default = "row", choices = ["col", "row"])
+                        default="row", choices=["col", "row"])
     parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files.")
     parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true')
     parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " +
                                              "If not provided and args[\"rand\"]==True, then date is used", type=int)
     group = parser.add_mutually_exclusive_group()
-    group.add_argument('--numnew', '-n', type=int, default = 1,
-                        help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.")
-    group.add_argument('--chunksize', '-k', type=int, default = 0,
-                        help="Number of records by file. Not valid for splitting on a column")
+    group.add_argument('--numnew', '-n', type=int, default=1,
+                       help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.")
+    group.add_argument('--chunksize', '-k', type=int, default=0,
+                       help="Number of records by file. Not valid for splitting on a column")
     parser.add_argument('--batch', action='store_true',
                         help="Distribute files to collection while maintaining order. Ignored if splitting on column.")
     generic = parser.add_argument_group('Arguments controling generic splitting')
     group = generic.add_mutually_exclusive_group()
-    group.add_argument('--generic_re', '-g', default="", help="Regular expression indicating the start of a new record (only for generic)", required = False)
-    group.add_argument('--generic_num', type=int, default=0, help="Length of records in number of lines (only for generic)", required = False)
+    group.add_argument('--generic_re', '-g', default="", help="Regular expression indicating the start of a new record (only for generic)", required=False)
+    group.add_argument('--generic_num', type=int, default=0, help="Length of records in number of lines (only for generic)", required=False)
     generic.add_argument('--split_after', '-p', action='store_true',
-                        help="Split between records after separator (default is before). " + 
-                        "Only for generic splitting by regex - specific ftypes are always split in the default way")
+                         help="Split between records after separator (default is before). " +
+                         "Only for generic splitting by regex - specific ftypes are always split in the default way")
     bycol = parser.add_argument_group('If splitting on a column')
-    bycol.add_argument('--match', '-m', default = "(.*)", help="The regular expression to match id column entries")
-    bycol.add_argument('--sub', '-s', default = r'\1',
+    bycol.add_argument('--match', '-m', default="(.*)", help="The regular expression to match id column entries")
+    bycol.add_argument('--sub', '-s', default=r'\1',
                        help="The regular expression to substitute in for the matched pattern.")
     bycol.add_argument('--id_column', '-c', default="1",
                        help="Column that is used to name output files. Indexed starting from 1.", type=int)
     return parser
 
 
-def close_files(file_list):
-    # finally, close all files
-    for open_file in file_list:
-        open_file.close()
-
-
 def replace_mapped_chars(pattern):
     """
     handles special escaped characters when coming from galaxy
@@ -126,13 +120,13 @@
 
     # batched division (maintains order)
     batch = args["batch"]
-    
+
     # determine
-    # - the number of records that should be stored per file 
+    # - the number of records that should be stored per file
     #   (done always, even if used only for batch mode)
     # - if the separator is a the start / end of the record
     n_per_file = math.inf
-    if chunksize != 0 or batch: # needs to be calculated if either batch or chunksize are selected
+    if chunksize != 0 or batch:  # needs to be calculated if either batch or chunksize is selected
         with open(in_file) as f:
             # read header lines
             for i in range(top):
@@ -150,7 +144,7 @@
         # if there are fewer records than desired files
         numnew = min(numnew, n_records)
         # approx. number of records per file
-        if chunksize == 0: # i.e. no chunking
+        if chunksize == 0:  # i.e. no chunking
             n_per_file = n_records // numnew
         else:
             numnew = n_records // chunksize
@@ -165,16 +159,14 @@
     else:
         new_file_base = [custom_new_file_name, custom_new_file_ext]
 
-    newfiles = [
-        open(os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) , "w")
-        for count in range(0, numnew)
-    ]
+    newfile_names = [os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) for count in range(0, numnew)]
     # bunch o' counters
     # index to list of new files
     if rand:
         new_file_counter = int(math.floor(random.random() * numnew))
     else:
         new_file_counter = 0
+    new_file = open(newfile_names[new_file_counter], "a")
     # to contain header specified by top
     header = ""
     # keep track of the files that have been opened so far
@@ -201,13 +193,13 @@
                 else:
                     # if is in fresh_files, write header and drop from freshFiles
                     if new_file_counter in fresh_files:
-                        newfiles[new_file_counter].write(header)
+                        new_file.write(header)
                         fresh_files.remove(new_file_counter)
-                    
+
                     if sep_at_end:
                         record += line
                     # write record to file
-                    newfiles[new_file_counter].write(record)
+                    new_file.write(record)
                     if not sep_at_end:
                         record = line
                     else:
@@ -216,6 +208,8 @@
                     # change destination file
                     if rand:
                         new_file_counter = int(math.floor(random.random() * numnew))
+                        new_file.close()
+                        new_file = open(newfile_names[new_file_counter], "a")
                     elif batch:
                         # number of records read per file
                         records_in_file += 1
@@ -224,17 +218,19 @@
                         if records_in_file >= n_per_file:
                             new_file_counter = (new_file_counter + 1) % numnew
                             records_in_file = 0  # reset to 0
+                            new_file.close()
+                            new_file = open(newfile_names[new_file_counter], "a")
                     else:
                         new_file_counter = (new_file_counter + 1) % numnew
+                        new_file.close()
+                        new_file = open(newfile_names[new_file_counter], "a")
             # if beginning of line is not record sep, we must be inside a record
             # so just append
             else:
                 record += line
         # after loop, write final record to file
-        newfiles[new_file_counter].write(record)
-
-    # close new files
-    close_files(newfiles)
+        new_file.write(record)
+        new_file.close()
 
 
 def split_by_column(args, in_file, out_dir, top):
@@ -251,7 +247,7 @@
     sub = args["sub"]
 
     # set of file names
-    new_files = dict()
+    files = set()
 
     # keep track of how many lines have been read
     n_read = 0
@@ -274,19 +270,11 @@
             out_file_path = os.path.join(out_dir, out_file_name)
 
             # write
-            if out_file_name not in new_files.keys():
-                # open file (new, so not already open)
-                current_new_file = open(out_file_path, "w")
-                current_new_file.write(header)
+            with open(out_file_path, "a") as current_new_file:
+                if out_file_name not in files:
+                    current_new_file.write(header)
+                    files.add(out_file_name)
                 current_new_file.write(line)
-                # add to dict
-                new_files[out_file_name] = current_new_file
-            else:
-                # file is already open, so just write to it
-                new_files[out_file_name].write(line)
-
-    # finally, close all files
-    close_files(new_files.values())
 
 
 if __name__ == "__main__":