diff split_file_to_collection.py @ 0:de3c2c88e710 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
| author | bgruening |
|---|---|
| date | Tue, 17 Jul 2018 14:37:13 -0400 |
| parents | |
| children | d150ac3d853d |
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/split_file_to_collection.py Tue Jul 17 14:37:13 2018 -0400
@@ -0,0 +1,261 @@

```python
#!/usr/bin/env python

import argparse
import math
import os
import random
import re

# Regexes that indicate the *beginning* of a record.
# New file types can be added by appending to this dict,
# updating the parser, and adding a new type option in the Galaxy wrapper.
FILETYPES = {'fasta': '^>',
             'fastq': '^@',
             'tabular': '^.*',
             'mgf': '^BEGIN IONS'}


def main():
    ps = parser_cli()
    args = vars(ps.parse_args())

    # get args and validate
    in_file = args["in"]
    if not os.path.isfile(in_file):
        raise FileNotFoundError('Input file does not exist')

    out_dir = args["out_dir"]
    if not os.path.isdir(out_dir):
        raise FileNotFoundError('out_dir is not a directory')

    top = args["top"]
    if top < 0:
        raise ValueError("Number of header lines cannot be negative")

    ftype = args["ftype"]

    if ftype == "tabular" and args["by"] == "col":
        args["match"] = replace_mapped_chars(args["match"])
        args["sub"] = replace_mapped_chars(args["sub"])
        split_by_column(args, in_file, out_dir, top)
    else:
        split_by_record(args, in_file, out_dir, top, ftype)


def parser_cli():
    parser = argparse.ArgumentParser(description="Split a file into multiple files. "
                                     "Can split on the column of a tabular file, "
                                     "with custom and useful names based on column value.")
    parser.add_argument('--in', '-i', required=True, help="The input file")
    parser.add_argument('--out_dir', '-o', default=os.getcwd(), help="The output directory")
    parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files")
    parser.add_argument('--file_ext', '-e', help="If not splitting by column, "
                        "the extension of the new files (without a period)")
    parser.add_argument('--ftype', '-f', required=True, choices=["mgf", "fastq", "fasta", "tabular"],
                        help="The type of the file to split")
    parser.add_argument('--by', '-b', default="row", choices=["col", "row"],
                        help="Split by line or by column (tabular only)")
    parser.add_argument('--top', '-t', type=int, default=0,
                        help="Number of header lines to carry over to new files (tabular only)")
    parser.add_argument('--rand', '-r', action='store_true',
                        help="Divide records randomly into new files")
    parser.add_argument('--seed', '-x', type=int,
                        help="Seed for the random number generator; "
                             "if omitted and --rand is set, the system time is used")
    parser.add_argument('--numnew', '-n', type=int, default=1,
                        help="Number of output files desired. Not valid for splitting on a column")
    parser.add_argument('--batch', action='store_true',
                        help="Distribute records to the collection while maintaining order. "
                             "Ignored if splitting on a column.")

    bycol = parser.add_argument_group('If splitting on a column')
    bycol.add_argument('--match', '-m', default="(.*)",
                       help="The regular expression to match id column entries")
    bycol.add_argument('--sub', '-s', default=r'\1',
                       help="The regular expression to substitute in for the matched pattern.")
    bycol.add_argument('--id_column', '-c', default="1", type=int,
                       help="Column that is used to name output files. Indexed starting from 1.")
    return parser
```
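These record-start patterns drive all row-based splitting. A minimal illustration (not part of the tool) of how they classify lines; note that the fastq pattern is a heuristic, since `@` can also legitimately begin a quality line:

```python
import re

FILETYPES = {'fasta': '^>', 'fastq': '^@', 'tabular': '^.*', 'mgf': '^BEGIN IONS'}

# A '>' header starts a FASTA record; sequence lines do not.
assert re.match(FILETYPES['fasta'], '>seq1 some description')
assert not re.match(FILETYPES['fasta'], 'ACGTACGT')
# Every line of a tabular file is its own record.
assert re.match(FILETYPES['tabular'], 'id\tvalue\tscore')
# MGF spectra are bracketed by BEGIN IONS / END IONS.
assert re.match(FILETYPES['mgf'], 'BEGIN IONS')
```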
```python
def close_files(file_list):
    # finally, close all files
    for open_file in file_list:
        open_file.close()


def replace_mapped_chars(pattern):
    """
    Handles special characters that Galaxy escapes before passing them in.
    """
    mapped_chars = {'\'': '__sq__', '\\': '__backslash__'}
    for key, value in mapped_chars.items():
        pattern = pattern.replace(value, key)
    return pattern


def split_by_record(args, in_file, out_dir, top, ftype):
    # get record separator for given filetype
    sep = re.compile(FILETYPES[ftype])

    numnew = args["numnew"]

    # random division
    rand = args["rand"]
    seed = args["seed"]
    if seed is not None:
        random.seed(seed)
    else:
        random.seed()

    # batched division (maintains order)
    batch = args["batch"]
    # define n_per_file so we don't get a warning about use before assignment
    n_per_file = math.inf
    if batch:
        # count the records in the input (each one starts with a sep match)
        n_records = 0
        with open(in_file) as f:
            for line in f:
                if re.match(sep, line) is not None:
                    n_records += 1
        if top:
            n_records -= top  # don't count the top (header) lines

        # approx. number of records per file
        n_per_file = n_records // numnew

    # make new files: strip the extension of the old file and add a number,
    # unless a custom base name was supplied
    custom_new_file_name = args["file_names"]
    if custom_new_file_name is None:
        new_file_base = os.path.splitext(os.path.basename(in_file))
    else:
        custom_new_file_ext = "." + args["file_ext"] if args["file_ext"] else ""
        new_file_base = [custom_new_file_name, custom_new_file_ext]

    newfiles = [
        open(os.path.join(out_dir, new_file_base[0] + "_" + str(count) + new_file_base[1]), "w")
        for count in range(numnew)
    ]

    # index into the list of new files
    new_file_counter = 0

    # number of lines read so far (used for top)
    n_read = 0
    # will contain the header specified by top
    header = ""
    # files that have not yet received their header
    fresh_files = set(range(numnew))

    # number of records written to the current file (only used in batch mode)
    records_in_file = 0

    with open(in_file, "r") as file:
        record = ""
        for line in file:
            n_read += 1
            if n_read <= top:
                header += line
                continue
            # if the line starts a new record, either begin the first record
            # or flush the finished one
            if re.match(sep, line) is not None:
                # this only happens the first time through
                if record == "":
                    record += line
                else:
                    # write the header first if this file is still fresh
                    if new_file_counter in fresh_files:
                        newfiles[new_file_counter].write(header)
                        fresh_files.remove(new_file_counter)

                    # write record to file
                    newfiles[new_file_counter].write(record)

                    # start the new record
                    record = line

                    # change destination file
                    if rand:
                        new_file_counter = int(math.floor(random.random() * numnew))
                    elif batch:
                        records_in_file += 1
                        # switch files once the current one has its share
                        if records_in_file >= n_per_file:
                            new_file_counter = (new_file_counter + 1) % numnew
                            records_in_file = 0
                    else:
                        new_file_counter = (new_file_counter + 1) % numnew
            else:
                # inside a record, so just append
                record += line

    # after the loop, write the final record (and the header,
    # if the destination file has not received it yet)
    if new_file_counter in fresh_files:
        newfiles[new_file_counter].write(header)
        fresh_files.remove(new_file_counter)
    newfiles[new_file_counter].write(record)

    # close new files
    close_files(newfiles)
```
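The crux of `split_by_record` is how each record's destination is chosen: interleaved round-robin by default, contiguous batches of roughly `n_records // numnew` with `--batch`, or a uniformly random file with `--rand`. A standalone sketch of the two deterministic strategies (`route` is a hypothetical helper, not part of the tool):

```python
# Minimal sketch: which output file (by index) each record lands in.
def route(n_records, numnew, batch):
    counter, in_current, per_file = 0, 0, n_records // numnew
    destinations = []
    for _ in range(n_records):
        destinations.append(counter)
        if batch:
            in_current += 1
            if in_current >= per_file:  # current file has its share
                counter = (counter + 1) % numnew
                in_current = 0
        else:
            counter = (counter + 1) % numnew
    return destinations

print(route(6, 3, batch=False))  # [0, 1, 2, 0, 1, 2] -- interleaved
print(route(6, 3, batch=True))   # [0, 0, 1, 1, 2, 2] -- order preserved
```

With `--batch` the original record order is preserved across the collection, which matters when downstream tools reassemble the pieces.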
```python
def split_by_column(args, in_file, out_dir, top):
    # shift to 0-based indexing
    id_col = int(args["id_column"]) - 1

    try:
        match = re.compile(args["match"])
    except re.error:
        print("ERROR: Match (-m) supplied is not valid regex.")
        raise

    sub = args["sub"]

    # map from new file names to open file handles
    new_files = dict()

    # keep track of how many lines have been read
    n_read = 0
    header = ""
    with open(in_file) as file:
        for line in file:
            # if still in the header, save it
            n_read += 1
            if n_read <= top:
                header += line
                continue

            # split into columns, on tab
            fields = re.split(r'\t', line.strip('\n'))

            # get the id column value
            id_col_val = fields[id_col]

            # use the regex to derive the new file name
            out_file_name = re.sub(match, sub, id_col_val)
            out_file_path = os.path.join(out_dir, out_file_name)

            if out_file_name not in new_files:
                # not seen before, so open the file and write the header
                current_new_file = open(out_file_path, "w")
                current_new_file.write(header)
                current_new_file.write(line)
                # add to dict
                new_files[out_file_name] = current_new_file
            else:
                # file is already open, so just write to it
                new_files[out_file_name].write(line)

    # finally, close all files
    close_files(new_files.values())


if __name__ == "__main__":
    main()
```
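For column-based splitting, the output file name is simply `re.sub(match, sub, value)` applied to the id-column value. An illustrative session (the patterns and values here are hypothetical examples, not defaults beyond the first line):

```python
import re

# Default behaviour: '(.*)' with '\1' uses the column value verbatim.
print(re.sub(r'(.*)', r'\1', 'sampleA'))                    # sampleA
# A custom pattern can extract part of the value and add an extension.
print(re.sub(r'sample(\w+)', r'group_\1.tab', 'sampleA'))   # group_A.tab
```

Because every distinct substitution result opens its own handle, inputs with many distinct id values will hold many files open at once until `close_files` runs.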