diff split_file_to_collection.py @ 4:0850f2dfba13 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 872590086696cfbc248519507ecb9063926297ad"
author bgruening
date Wed, 09 Oct 2019 07:34:49 -0400
parents 2ddc36385d7a
children e77b954f0da5
line wrap: on
line diff
--- a/split_file_to_collection.py	Tue Sep 10 12:31:15 2019 -0400
+++ b/split_file_to_collection.py	Wed Oct 09 07:34:49 2019 -0400
@@ -16,7 +16,9 @@
              'fastq': '^@',
              'tabular': '^.*',
              'txt': '^.*',
-             'mgf': '^BEGIN IONS'}
+             'mgf': '^BEGIN IONS',
+             'sdf': '\$\$\$\$',
+             }
 
 
 def main():
@@ -59,7 +61,7 @@
     parser.add_argument('--file_ext', '-e', help="If not splitting by column," +
                                                  " the extension of the new files (without a period)")
     parser.add_argument('--ftype', '-f', help="The type of the file to split", required = True,
-        choices=["mgf", "fastq", "fasta", "tabular", "txt", "generic"])
+        choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"])
     parser.add_argument('--generic_re', '-g', help="Regular expression indicating the start of a new record (only for generic)", required = False)
     parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)",
         default = "row", choices = ["col", "row"])
@@ -69,10 +71,14 @@
     parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " +
                                              "If not provided and args[\"rand\"]==True, then date is used", type=int)
     parser.add_argument('--numnew', '-n', type=int, default = 1,
-                        help="Number of output files desired. Not valid for splitting on a column")
+                        help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.")
+    parser.add_argument('--chunksize', '-k', type=int, default = 0,
+                        help="Number of records by file. Not valid for splitting on a column")
     parser.add_argument('--batch', action='store_true',
                         help="Distribute files to collection while maintaining order. Ignored if splitting on column.")
-
+    parser.add_argument('--split_after', '-p', action='store_true',
+                        help="Split between records after separator (default is before)." + 
+                         "Only for generic - specific ftypes are always split in the default way")
     bycol = parser.add_argument_group('If splitting on a column')
     bycol.add_argument('--match', '-m', default = "(.*)", help="The regular expression to match id column entries")
     bycol.add_argument('--sub', '-s', default = r'\1',
@@ -102,6 +108,7 @@
     # get record separator for given filetype
     sep = re.compile(FILETYPES.get(ftype, args["generic_re"]))
 
+    chunksize = args["chunksize"]
     numnew = args["numnew"]
 
     # random division
@@ -114,9 +121,12 @@
 
     # batched division (maintains order)
     batch = args["batch"]
-    # define n_per_file so we don't get a warning about ref before assignment
-    n_per_file = math.inf
-    if batch:
+
+    
+    if chunksize != 0 or batch: # needs to be calculated if either batch or chunksize are selected
+        # define n_per_file so we don't get a warning about ref before assignment
+        n_per_file = math.inf
+
         # number of records
         with open(in_file) as f:
             i = 0
@@ -126,9 +136,17 @@
             n_records = i + 1
         if top:
             n_records -= top  # don't count the top lines
+        
+        if chunksize == 0: # i.e. no chunking
+            # approx. number of lines per file
+            n_per_file = n_records // numnew
+        else:
+            # number of output files implied by the requested chunk size (floor division)
+            numnew = n_records // chunksize
+            n_per_file = chunksize
 
-        # approx. number of lines per file
-        n_per_file = n_records // numnew
+
+
 
     # make new files
     # strip extension of old file and add number
@@ -179,13 +197,19 @@
                     if new_file_counter in fresh_files:
                         newfiles[new_file_counter].write(header)
                         fresh_files.remove(new_file_counter)
-
-                    # write record to file
-                    newfiles[new_file_counter].write(record)
+                    
+                    if ftype != "sdf" and args["split_after"] == False:
+                        # write record to file
+                        newfiles[new_file_counter].write(record)
 
-                    # if not the first time through, we assign the new record
-                    record = line
-
+                        # if not the first time through, we assign the new record
+                        record = line
+                                                
+                    else:  # for sdf we want to write the line to the record before starting a new one
+                        record += line
+                        newfiles[new_file_counter].write(record)
+                        record = ""
+                        
                     # change destination file
                     if rand:
                         new_file_counter = int(math.floor(random.random() * numnew))