Mercurial > repos > bgruening > split_file_to_collection

--- a/split_file_to_collection.py	Sun Jul 12 10:27:06 2020 -0400
+++ b/split_file_to_collection.py	Thu Nov 23 20:02:01 2023 +0000
@@ -13,13 +13,14 @@
 #
 # new file types can be added by appending to this dict,
 # updating the parser, and adding a new type option in the Galaxy wrapper
-FILETYPES = {'fasta': (r'^>', 0,  False),
-             'fastq': (r'', 4, False),
-             'tabular': (r'', 1, False),
-             'txt': (r'', 1, False),
-             'mgf': (r'^BEGIN IONS', 0, False),
-             'sdf': (r'\$\$\$\$', 0, True),
-             }
+FILETYPES = {
+    "fasta": (r"^>", 0, False),
+    "fastq": (r"", 4, False),
+    "tabular": (r"", 1, False),
+    "txt": (r"", 1, False),
+    "mgf": (r"^BEGIN IONS", 0, False),
+    "sdf": (r"\$\$\$\$", 0, True),
+}


 def main():
@@ -29,11 +30,11 @@
     # get args and validate
     in_file = args["in"]
     if not os.path.isfile(args["in"]):
-        raise FileNotFoundError('Input file does not exist')
+        raise FileNotFoundError("Input file does not exist")

     out_dir = args["out_dir"]
     if not os.path.isdir(args["out_dir"]):
-        raise FileNotFoundError('out_dir is not a directory')
+        raise FileNotFoundError("out_dir is not a directory")

     top = args["top"]
     if top < 0:
@@ -41,7 +42,9 @@

     ftype = args["ftype"]

-    assert ftype != "generic" or args["generic_re"] is not None, "--generic_re needs to be given for generic input"
+    assert (
+        ftype != "generic" or args["generic_re"] is not None
+    ), "--generic_re needs to be given for generic input"

     if args["ftype"] == "tabular" and args["by"] == "col":
         args["match"] = replace_mapped_chars(args["match"])
@@ -53,42 +56,127 @@


 def parser_cli():
-    parser = argparse.ArgumentParser(description="split a file into multiple files. " +
-                                                 "Can split on the column of a tabular file, " +
-                                                 "with custom and useful names based on column value.")
-    parser.add_argument('--in', '-i', required=True, help="The input file")
-    parser.add_argument('--out_dir', '-o', default=os.getcwd(), help="The output directory", required=True)
-    parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files")
-    parser.add_argument('--file_ext', '-e', help="If not splitting by column," +
-                                                 " the extension of the new files (without a period)")
-    parser.add_argument('--ftype', '-f', help="The type of the file to split", required=True,
-                        choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"])
-    parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)",
-                        default="row", choices=["col", "row"])
-    parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files.")
-    parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true')
-    parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " +
-                                             "If not provided and args[\"rand\"]==True, then date is used", type=int)
+    parser = argparse.ArgumentParser(
+        description="split a file into multiple files. "
+        + "Can split on the column of a tabular file, "
+        + "with custom and useful names based on column value."
+    )
+    parser.add_argument("--in", "-i", required=True, help="The input file")
+    parser.add_argument(
+        "--out_dir",
+        "-o",
+        default=os.getcwd(),
+        help="The output directory",
+        required=True,
+    )
+    parser.add_argument(
+        "--file_names",
+        "-a",
+        help="If not splitting by column, the base name of the new files",
+    )
+    parser.add_argument(
+        "--file_ext",
+        "-e",
+        help="If not splitting by column,"
+        + " the extension of the new files (without a period)",
+    )
+    parser.add_argument(
+        "--ftype",
+        "-f",
+        help="The type of the file to split",
+        required=True,
+        choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"],
+    )
+    parser.add_argument(
+        "--by",
+        "-b",
+        help="Split by line or by column (tabular only)",
+        default="row",
+        choices=["col", "row"],
+    )
+    parser.add_argument(
+        "--top",
+        "-t",
+        type=int,
+        default=0,
+        help="Number of header lines to carry over to new files.",
+    )
+    parser.add_argument(
+        "--rand",
+        "-r",
+        help="Divide records randomly into new files",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--seed",
+        "-x",
+        help="Provide a seed for the random number generator. "
+        + 'If not provided and args["rand"]==True, then date is used',
+        type=int,
+    )
     group = parser.add_mutually_exclusive_group()
-    group.add_argument('--numnew', '-n', type=int, default=1,
-                       help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.")
-    group.add_argument('--chunksize', '-k', type=int, default=0,
-                       help="Number of records by file. Not valid for splitting on a column")
-    parser.add_argument('--batch', action='store_true',
-                        help="Distribute files to collection while maintaining order. Ignored if splitting on column.")
-    generic = parser.add_argument_group('Arguments controling generic splitting')
+    group.add_argument(
+        "--numnew",
+        "-n",
+        type=int,
+        default=1,
+        help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.",
+    )
+    group.add_argument(
+        "--chunksize",
+        "-k",
+        type=int,
+        default=0,
+        help="Number of records by file. Not valid for splitting on a column",
+    )
+    parser.add_argument(
+        "--batch",
+        action="store_true",
+        help="Distribute files to collection while maintaining order. Ignored if splitting on column.",
+    )
+    generic = parser.add_argument_group("Arguments controling generic splitting")
     group = generic.add_mutually_exclusive_group()
-    group.add_argument('--generic_re', '-g', default="", help="Regular expression indicating the start of a new record (only for generic)", required=False)
-    group.add_argument('--generic_num', type=int, default=0, help="Length of records in number of lines (only for generic)", required=False)
-    generic.add_argument('--split_after', '-p', action='store_true',
-                         help="Split between records after separator (default is before). " +
-                         "Only for generic splitting by regex - specific ftypes are always split in the default way")
-    bycol = parser.add_argument_group('If splitting on a column')
-    bycol.add_argument('--match', '-m', default="(.*)", help="The regular expression to match id column entries")
-    bycol.add_argument('--sub', '-s', default=r'\1',
-                       help="The regular expression to substitute in for the matched pattern.")
-    bycol.add_argument('--id_column', '-c', default="1",
-                       help="Column that is used to name output files. Indexed starting from 1.", type=int)
+    group.add_argument(
+        "--generic_re",
+        "-g",
+        default="",
+        help="Regular expression indicating the start of a new record (only for generic)",
+        required=False,
+    )
+    group.add_argument(
+        "--generic_num",
+        type=int,
+        default=0,
+        help="Length of records in number of lines (only for generic)",
+        required=False,
+    )
+    generic.add_argument(
+        "--split_after",
+        "-p",
+        action="store_true",
+        help="Split between records after separator (default is before). "
+        + "Only for generic splitting by regex - specific ftypes are always split in the default way",
+    )
+    bycol = parser.add_argument_group("If splitting on a column")
+    bycol.add_argument(
+        "--match",
+        "-m",
+        default="(.*)",
+        help="The regular expression to match id column entries",
+    )
+    bycol.add_argument(
+        "--sub",
+        "-s",
+        default=r"\1",
+        help="The regular expression to substitute in for the matched pattern.",
+    )
+    bycol.add_argument(
+        "--id_column",
+        "-c",
+        default="1",
+        help="Column that is used to name output files. Indexed starting from 1.",
+        type=int,
+    )
     return parser


@@ -96,7 +184,7 @@
     """
     handles special escaped characters when coming from galaxy
     """
-    mapped_chars = {'\'': '__sq__', '\\': '__backslash__'}
+    mapped_chars = {"'": "__sq__", "\\": "__backslash__"}
     for key, value in mapped_chars.items():
         pattern = pattern.replace(value, key)
     return pattern
@@ -104,7 +192,9 @@

 def split_by_record(args, in_file, out_dir, top, ftype):
     # get configuration (record separator, start at end) for given filetype
-    sep, num, sep_at_end = FILETYPES.get(ftype, (args["generic_re"], args["generic_num"], args["split_after"]))
+    sep, num, sep_at_end = FILETYPES.get(
+        ftype, (args["generic_re"], args["generic_num"], args["split_after"])
+    )
     sep = re.compile(sep)

     chunksize = args["chunksize"]
@@ -126,14 +216,19 @@
     #   (done always, even if used only for batch mode)
     # - if the separator is a the start / end of the record
     n_per_file = math.inf
-    if chunksize != 0 or batch:  # needs to be calculated if either batch or chunksize are selected
+    if (
+        chunksize != 0 or batch
+    ):  # needs to be calculated if either batch or chunksize are selected
         with open(in_file) as f:
             # read header lines
             for i in range(top):
                 f.readline()
             n_records = 0
+            last_line_matched = False
             for line in f:
-                if (num == 0 and re.match(sep, line) is not None) or (num > 0 and n_records % num == 0):
+                if (num == 0 and re.match(sep, line) is not None) or (
+                    num > 0 and n_records % num == 0
+                ):
                     n_records += 1
                     last_line_matched = True
                 else:
@@ -147,7 +242,7 @@
         if chunksize == 0:  # i.e. no chunking
             n_per_file = n_records // numnew
         else:
-            numnew = n_records // chunksize
+            numnew = max(n_records // chunksize, 1)  # should not be less than 1
             n_per_file = chunksize

     # make new files
@@ -159,7 +254,10 @@
     else:
         new_file_base = [custom_new_file_name, custom_new_file_ext]

-    newfile_names = [os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) for count in range(0, numnew)]
+    newfile_names = [
+        os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1]))
+        for count in range(0, numnew)
+    ]
     # bunch o' counters
     # index to list of new files
     if rand:
@@ -186,7 +284,9 @@
         for line_no, line in enumerate(f):
             # check if beginning of line is record sep
             # if beginning of line is record sep, either start record or finish one
-            if (num == 0 and re.match(sep, line) is not None) or (num > 0 and line_no % num == 0):
+            if (num == 0 and re.match(sep, line) is not None) or (
+                num > 0 and line_no % num == 0
+            ):
                 # this only happens first time through
                 if record == "":
                     record += line
@@ -260,7 +360,7 @@
                 header += line
                 continue
             # split into columns, on tab
-            fields = re.split(r'\t', line.strip('\n'))
+            fields = re.split(r"\t", line.strip("\n"))

             # get id column value
             id_col_val = fields[id_col]
--- a/split_file_to_collection.xml	Sun Jul 12 10:27:06 2020 -0400
+++ b/split_file_to_collection.xml	Thu Nov 23 20:02:01 2023 +0000
@@ -1,4 +1,4 @@
-<tool id="split_file_to_collection" name="Split file" version="0.5.0">
+<tool id="split_file_to_collection" name="Split file" version="0.5.1">
     <description>to dataset collection</description>
     <macros>
         <xml name="regex_sanitizer">