comparison split_file_to_collection.py @ 9:baabc30154cd draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 37305ce5e6ef1b91f6058eff5b128064f76fbe45
| field | value |
|---|---|
| author | bgruening |
| date | Thu, 23 Nov 2023 20:02:01 +0000 |
| parents | 6cbe2f30c2d7 |
| children | 2dae863c8f42 |
| 8:6cbe2f30c2d7 (before) | 9:baabc30154cd (after) |
|---|---|
| 11 # - number of lines to split after (0 if not splitting by number of lines but regex) | 11 # - number of lines to split after (0 if not splitting by number of lines but regex) | 
| 12 # - a boolean indicating if the record separator is at the end of the record | 12 # - a boolean indicating if the record separator is at the end of the record | 
| 13 # | 13 # | 
| 14 # new file types can be added by appending to this dict, | 14 # new file types can be added by appending to this dict, | 
| 15 # updating the parser, and adding a new type option in the Galaxy wrapper | 15 # updating the parser, and adding a new type option in the Galaxy wrapper | 
| 16 FILETYPES = {'fasta': (r'^>', 0, False), | 16 FILETYPES = { | 
| 17 'fastq': (r'', 4, False), | 17 "fasta": (r"^>", 0, False), | 
| 18 'tabular': (r'', 1, False), | 18 "fastq": (r"", 4, False), | 
| 19 'txt': (r'', 1, False), | 19 "tabular": (r"", 1, False), | 
| 20 'mgf': (r'^BEGIN IONS', 0, False), | 20 "txt": (r"", 1, False), | 
| 21 'sdf': (r'\$\$\$\$', 0, True), | 21 "mgf": (r"^BEGIN IONS", 0, False), | 
| 22 } | 22 "sdf": (r"\$\$\$\$", 0, True), | 
| | 23 } |
| 23 | 24 | 
| 24 | 25 | 
| 25 def main(): | 26 def main(): | 
| 26 ps = parser_cli() | 27 ps = parser_cli() | 
| 27 args = vars(ps.parse_args()) | 28 args = vars(ps.parse_args()) | 
| 28 | 29 | 
| 29 # get args and validate | 30 # get args and validate | 
| 30 in_file = args["in"] | 31 in_file = args["in"] | 
| 31 if not os.path.isfile(args["in"]): | 32 if not os.path.isfile(args["in"]): | 
| 32 raise FileNotFoundError('Input file does not exist') | 33 raise FileNotFoundError("Input file does not exist") | 
| 33 | 34 | 
| 34 out_dir = args["out_dir"] | 35 out_dir = args["out_dir"] | 
| 35 if not os.path.isdir(args["out_dir"]): | 36 if not os.path.isdir(args["out_dir"]): | 
| 36 raise FileNotFoundError('out_dir is not a directory') | 37 raise FileNotFoundError("out_dir is not a directory") | 
| 37 | 38 | 
| 38 top = args["top"] | 39 top = args["top"] | 
| 39 if top < 0: | 40 if top < 0: | 
| 40 raise ValueError("Number of header lines cannot be negative") | 41 raise ValueError("Number of header lines cannot be negative") | 
| 41 | 42 | 
| 42 ftype = args["ftype"] | 43 ftype = args["ftype"] | 
| 43 | 44 | 
| 44 assert ftype != "generic" or args["generic_re"] is not None, "--generic_re needs to be given for generic input" | 45 assert ( | 
| | 46 ftype != "generic" or args["generic_re"] is not None |
| | 47 ), "--generic_re needs to be given for generic input" |
| 45 | 48 | 
| 46 if args["ftype"] == "tabular" and args["by"] == "col": | 49 if args["ftype"] == "tabular" and args["by"] == "col": | 
| 47 args["match"] = replace_mapped_chars(args["match"]) | 50 args["match"] = replace_mapped_chars(args["match"]) | 
| 48 args["sub"] = replace_mapped_chars(args["sub"]) | 51 args["sub"] = replace_mapped_chars(args["sub"]) | 
| 49 split_by_column(args, in_file, out_dir, top) | 52 split_by_column(args, in_file, out_dir, top) | 
| 51 args["generic_re"] = replace_mapped_chars(args["generic_re"]) | 54 args["generic_re"] = replace_mapped_chars(args["generic_re"]) | 
| 52 split_by_record(args, in_file, out_dir, top, ftype) | 55 split_by_record(args, in_file, out_dir, top, ftype) | 
| 53 | 56 | 
| 54 | 57 | 
| 55 def parser_cli(): | 58 def parser_cli(): | 
| 56 parser = argparse.ArgumentParser(description="split a file into multiple files. " + | 59 parser = argparse.ArgumentParser( | 
| 57 "Can split on the column of a tabular file, " + | 60 description="split a file into multiple files. " | 
| 58 "with custom and useful names based on column value.") | 61 + "Can split on the column of a tabular file, " | 
| 59 parser.add_argument('--in', '-i', required=True, help="The input file") | 62 + "with custom and useful names based on column value." | 
| 60 parser.add_argument('--out_dir', '-o', default=os.getcwd(), help="The output directory", required=True) | 63 ) | 
| 61 parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files") | 64 parser.add_argument("--in", "-i", required=True, help="The input file") | 
| 62 parser.add_argument('--file_ext', '-e', help="If not splitting by column," + | 65 parser.add_argument( | 
| 63 " the extension of the new files (without a period)") | 66 "--out_dir", | 
| 64 parser.add_argument('--ftype', '-f', help="The type of the file to split", required=True, | 67 "-o", | 
| 65 choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"]) | 68 default=os.getcwd(), | 
| 66 parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)", | 69 help="The output directory", | 
| 67 default="row", choices=["col", "row"]) | 70 required=True, | 
| 68 parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files.") | 71 ) | 
| 69 parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true') | 72 parser.add_argument( | 
| 70 parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " + | 73 "--file_names", | 
| 71 "If not provided and args[\"rand\"]==True, then date is used", type=int) | 74 "-a", | 
| | 75 help="If not splitting by column, the base name of the new files", |
| | 76 ) |
| | 77 parser.add_argument( |
| | 78 "--file_ext", |
| | 79 "-e", |
| | 80 help="If not splitting by column," |
| | 81 + " the extension of the new files (without a period)", |
| | 82 ) |
| | 83 parser.add_argument( |
| | 84 "--ftype", |
| | 85 "-f", |
| | 86 help="The type of the file to split", |
| | 87 required=True, |
| | 88 choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"], |
| | 89 ) |
| | 90 parser.add_argument( |
| | 91 "--by", |
| | 92 "-b", |
| | 93 help="Split by line or by column (tabular only)", |
| | 94 default="row", |
| | 95 choices=["col", "row"], |
| | 96 ) |
| | 97 parser.add_argument( |
| | 98 "--top", |
| | 99 "-t", |
| | 100 type=int, |
| | 101 default=0, |
| | 102 help="Number of header lines to carry over to new files.", |
| | 103 ) |
| | 104 parser.add_argument( |
| | 105 "--rand", |
| | 106 "-r", |
| | 107 help="Divide records randomly into new files", |
| | 108 action="store_true", |
| | 109 ) |
| | 110 parser.add_argument( |
| | 111 "--seed", |
| | 112 "-x", |
| | 113 help="Provide a seed for the random number generator. " |
| | 114 + 'If not provided and args["rand"]==True, then date is used', |
| | 115 type=int, |
| | 116 ) |
| 72 group = parser.add_mutually_exclusive_group() | 117 group = parser.add_mutually_exclusive_group() | 
| 73 group.add_argument('--numnew', '-n', type=int, default=1, | 118 group.add_argument( | 
| 74 help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.") | 119 "--numnew", | 
| 75 group.add_argument('--chunksize', '-k', type=int, default=0, | 120 "-n", | 
| 76 help="Number of records by file. Not valid for splitting on a column") | 121 type=int, | 
| 77 parser.add_argument('--batch', action='store_true', | 122 default=1, | 
| 78 help="Distribute files to collection while maintaining order. Ignored if splitting on column.") | 123 help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.", | 
| 79 generic = parser.add_argument_group('Arguments controling generic splitting') | 124 ) | 
| | 125 group.add_argument( |
| | 126 "--chunksize", |
| | 127 "-k", |
| | 128 type=int, |
| | 129 default=0, |
| | 130 help="Number of records by file. Not valid for splitting on a column", |
| | 131 ) |
| | 132 parser.add_argument( |
| | 133 "--batch", |
| | 134 action="store_true", |
| | 135 help="Distribute files to collection while maintaining order. Ignored if splitting on column.", |
| | 136 ) |
| | 137 generic = parser.add_argument_group("Arguments controling generic splitting") |
| 80 group = generic.add_mutually_exclusive_group() | 138 group = generic.add_mutually_exclusive_group() | 
| 81 group.add_argument('--generic_re', '-g', default="", help="Regular expression indicating the start of a new record (only for generic)", required=False) | 139 group.add_argument( | 
| 82 group.add_argument('--generic_num', type=int, default=0, help="Length of records in number of lines (only for generic)", required=False) | 140 "--generic_re", | 
| 83 generic.add_argument('--split_after', '-p', action='store_true', | 141 "-g", | 
| 84 help="Split between records after separator (default is before). " + | 142 default="", | 
| 85 "Only for generic splitting by regex - specific ftypes are always split in the default way") | 143 help="Regular expression indicating the start of a new record (only for generic)", | 
| 86 bycol = parser.add_argument_group('If splitting on a column') | 144 required=False, | 
| 87 bycol.add_argument('--match', '-m', default="(.*)", help="The regular expression to match id column entries") | 145 ) | 
| 88 bycol.add_argument('--sub', '-s', default=r'\1', | 146 group.add_argument( | 
| 89 help="The regular expression to substitute in for the matched pattern.") | 147 "--generic_num", | 
| 90 bycol.add_argument('--id_column', '-c', default="1", | 148 type=int, | 
| 91 help="Column that is used to name output files. Indexed starting from 1.", type=int) | 149 default=0, | 
| | 150 help="Length of records in number of lines (only for generic)", |
| | 151 required=False, |
| | 152 ) |
| | 153 generic.add_argument( |
| | 154 "--split_after", |
| | 155 "-p", |
| | 156 action="store_true", |
| | 157 help="Split between records after separator (default is before). " |
| | 158 + "Only for generic splitting by regex - specific ftypes are always split in the default way", |
| | 159 ) |
| | 160 bycol = parser.add_argument_group("If splitting on a column") |
| | 161 bycol.add_argument( |
| | 162 "--match", |
| | 163 "-m", |
| | 164 default="(.*)", |
| | 165 help="The regular expression to match id column entries", |
| | 166 ) |
| | 167 bycol.add_argument( |
| | 168 "--sub", |
| | 169 "-s", |
| | 170 default=r"\1", |
| | 171 help="The regular expression to substitute in for the matched pattern.", |
| | 172 ) |
| | 173 bycol.add_argument( |
| | 174 "--id_column", |
| | 175 "-c", |
| | 176 default="1", |
| | 177 help="Column that is used to name output files. Indexed starting from 1.", |
| | 178 type=int, |
| | 179 ) |
| 92 return parser | 180 return parser | 
| 93 | 181 | 
| 94 | 182 | 
| 95 def replace_mapped_chars(pattern): | 183 def replace_mapped_chars(pattern): | 
| 96 """ | 184 """ | 
| 97 handles special escaped characters when coming from galaxy | 185 handles special escaped characters when coming from galaxy | 
| 98 """ | 186 """ | 
| 99 mapped_chars = {'\'': '__sq__', '\\': '__backslash__'} | 187 mapped_chars = {"'": "__sq__", "\\": "__backslash__"} | 
| 100 for key, value in mapped_chars.items(): | 188 for key, value in mapped_chars.items(): | 
| 101 pattern = pattern.replace(value, key) | 189 pattern = pattern.replace(value, key) | 
| 102 return pattern | 190 return pattern | 
| 103 | 191 | 
| 104 | 192 | 
| 105 def split_by_record(args, in_file, out_dir, top, ftype): | 193 def split_by_record(args, in_file, out_dir, top, ftype): | 
| 106 # get configuration (record separator, start at end) for given filetype | 194 # get configuration (record separator, start at end) for given filetype | 
| 107 sep, num, sep_at_end = FILETYPES.get(ftype, (args["generic_re"], args["generic_num"], args["split_after"])) | 195 sep, num, sep_at_end = FILETYPES.get( | 
| | 196 ftype, (args["generic_re"], args["generic_num"], args["split_after"]) |
| | 197 ) |
| 108 sep = re.compile(sep) | 198 sep = re.compile(sep) | 
| 109 | 199 | 
| 110 chunksize = args["chunksize"] | 200 chunksize = args["chunksize"] | 
| 111 numnew = args["numnew"] | 201 numnew = args["numnew"] | 
| 112 | 202 | 
| 124 # determine | 214 # determine | 
| 125 # - the number of records that should be stored per file | 215 # - the number of records that should be stored per file | 
| 126 # (done always, even if used only for batch mode) | 216 # (done always, even if used only for batch mode) | 
| 127 # - if the separator is a the start / end of the record | 217 # - if the separator is a the start / end of the record | 
| 128 n_per_file = math.inf | 218 n_per_file = math.inf | 
| 129 if chunksize != 0 or batch: # needs to be calculated if either batch or chunksize are selected | 219 if ( | 
| | 220 chunksize != 0 or batch |
| | 221 ): # needs to be calculated if either batch or chunksize are selected |
| 130 with open(in_file) as f: | 222 with open(in_file) as f: | 
| 131 # read header lines | 223 # read header lines | 
| 132 for i in range(top): | 224 for i in range(top): | 
| 133 f.readline() | 225 f.readline() | 
| 134 n_records = 0 | 226 n_records = 0 | 
| | 227 last_line_matched = False |
| 135 for line in f: | 228 for line in f: | 
| 136 if (num == 0 and re.match(sep, line) is not None) or (num > 0 and n_records % num == 0): | 229 if (num == 0 and re.match(sep, line) is not None) or ( | 
| | 230 num > 0 and n_records % num == 0 |
| | 231 ): |
| 137 n_records += 1 | 232 n_records += 1 | 
| 138 last_line_matched = True | 233 last_line_matched = True | 
| 139 else: | 234 else: | 
| 140 last_line_matched = False | 235 last_line_matched = False | 
| 141 if sep_at_end and not last_line_matched: | 236 if sep_at_end and not last_line_matched: | 
| 145 numnew = min(numnew, n_records) | 240 numnew = min(numnew, n_records) | 
| 146 # approx. number of records per file | 241 # approx. number of records per file | 
| 147 if chunksize == 0: # i.e. no chunking | 242 if chunksize == 0: # i.e. no chunking | 
| 148 n_per_file = n_records // numnew | 243 n_per_file = n_records // numnew | 
| 149 else: | 244 else: | 
| 150 numnew = n_records // chunksize | 245 numnew = max(n_records // chunksize, 1) # should not be less than 1 | 
| 151 n_per_file = chunksize | 246 n_per_file = chunksize | 
| 152 | 247 | 
| 153 # make new files | 248 # make new files | 
| 154 # strip extension of old file and add number | 249 # strip extension of old file and add number | 
| 155 custom_new_file_name = args["file_names"] | 250 custom_new_file_name = args["file_names"] | 
| 157 if custom_new_file_name is None: | 252 if custom_new_file_name is None: | 
| 158 new_file_base = os.path.splitext(os.path.basename(in_file)) | 253 new_file_base = os.path.splitext(os.path.basename(in_file)) | 
| 159 else: | 254 else: | 
| 160 new_file_base = [custom_new_file_name, custom_new_file_ext] | 255 new_file_base = [custom_new_file_name, custom_new_file_ext] | 
| 161 | 256 | 
| 162 newfile_names = [os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) for count in range(0, numnew)] | 257 newfile_names = [ | 
| | 258 os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) |
| | 259 for count in range(0, numnew) |
| | 260 ] |
| 163 # bunch o' counters | 261 # bunch o' counters | 
| 164 # index to list of new files | 262 # index to list of new files | 
| 165 if rand: | 263 if rand: | 
| 166 new_file_counter = int(math.floor(random.random() * numnew)) | 264 new_file_counter = int(math.floor(random.random() * numnew)) | 
| 167 else: | 265 else: | 
| 184 | 282 | 
| 185 record = "" | 283 record = "" | 
| 186 for line_no, line in enumerate(f): | 284 for line_no, line in enumerate(f): | 
| 187 # check if beginning of line is record sep | 285 # check if beginning of line is record sep | 
| 188 # if beginning of line is record sep, either start record or finish one | 286 # if beginning of line is record sep, either start record or finish one | 
| 189 if (num == 0 and re.match(sep, line) is not None) or (num > 0 and line_no % num == 0): | 287 if (num == 0 and re.match(sep, line) is not None) or ( | 
| | 288 num > 0 and line_no % num == 0 |
| | 289 ): |
| 190 # this only happens first time through | 290 # this only happens first time through | 
| 191 if record == "": | 291 if record == "": | 
| 192 record += line | 292 record += line | 
| 193 else: | 293 else: | 
| 194 # if is in fresh_files, write header and drop from freshFiles | 294 # if is in fresh_files, write header and drop from freshFiles | 
| 258 n_read += 1 | 358 n_read += 1 | 
| 259 if n_read <= top: | 359 if n_read <= top: | 
| 260 header += line | 360 header += line | 
| 261 continue | 361 continue | 
| 262 # split into columns, on tab | 362 # split into columns, on tab | 
| 263 fields = re.split(r'\t', line.strip('\n')) | 363 fields = re.split(r"\t", line.strip("\n")) | 
| 264 | 364 | 
| 265 # get id column value | 365 # get id column value | 
| 266 id_col_val = fields[id_col] | 366 id_col_val = fields[id_col] | 
| 267 | 367 | 
| 268 # use regex to get new file name | 368 # use regex to get new file name | 
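---

The `FILETYPES` dict (new lines 16-23) maps each supported type to a triple: the record-separator regex, the number of lines per record (0 when splitting on the regex instead), and whether the separator closes a record rather than opening one. Per the comment at lines 14-15, new types are registered by extending this dict, the `--ftype` choices, and the Galaxy wrapper. A minimal sketch of such an extension; the `genbank` entry and its regex are hypothetical illustrations, not part of the tool:

```python
FILETYPES = {
    # (record-separator regex, lines per record or 0, separator-at-end flag)
    "fasta": (r"^>", 0, False),        # a '>' header line starts each record
    "fastq": (r"", 4, False),          # fixed-size records of four lines
    "sdf": (r"\$\$\$\$", 0, True),     # a '$$$$' line closes each record
    # hypothetical addition, for illustration only:
    "genbank": (r"^LOCUS", 0, False),  # a LOCUS line would start each record
}
```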
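
`replace_mapped_chars` (new lines 183-190) undoes Galaxy's parameter escaping, restoring the single quotes and backslashes that the wrapper encodes as `__sq__` and `__backslash__`. A self-contained usage sketch; the `--match` value shown is made up:

```python
def replace_mapped_chars(pattern):
    """Restore characters that Galaxy escapes in tool parameters."""
    mapped_chars = {"'": "__sq__", "\\": "__backslash__"}
    for key, value in mapped_chars.items():
        pattern = pattern.replace(value, key)
    return pattern

# a hypothetical --match value as delivered by the Galaxy wrapper:
print(replace_mapped_chars(r"__sq__(\w+)__backslash__d__sq__"))  # '(\w+)\d'
```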
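
Beyond the black-style reformatting, this revision carries two behavioural changes. New line 227 initialises `last_line_matched` before the record-counting loop, which appears to guard against an `UnboundLocalError` when `sep_at_end` is checked on an input with no data lines after the header. New line 245 clamps `numnew` to at least 1, so a `--chunksize` larger than the record count still yields one output file instead of none. A worked sketch of the second fix:

```python
n_records, chunksize = 5, 10                  # fewer records than the chunk size
numnew_old = n_records // chunksize           # 0: no output files at all
numnew_new = max(n_records // chunksize, 1)   # 1: all records land in one file
```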
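
Output names (new lines 257-260) are built from the input's base name plus a zero-padded counter, so collection elements sort naturally. For example, with a hypothetical input path:

```python
import os

new_file_base = os.path.splitext(os.path.basename("/data/input.fasta"))
# -> ("input", ".fasta")
names = ["%s_%06d%s" % (new_file_base[0], n, new_file_base[1]) for n in range(3)]
# -> ['input_000000.fasta', 'input_000001.fasta', 'input_000002.fasta']
```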
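
When splitting a tabular file on a column, the `--match`/`--sub` pair rewrites the id-column value into an output file name, per the argparse help text above. The substitution call itself falls outside the hunks shown here, so this is only a sketch of the documented behaviour, with hypothetical user-supplied patterns:

```python
import re

match, sub = r"(.*)_R[12]", r"\1"      # hypothetical --match and --sub values
id_col_val = "sampleA_R1"              # value taken from the id column
print(re.sub(match, sub, id_col_val))  # sampleA
```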
