comparison split_file_to_collection.py @ 2:d150ac3d853d draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 06ffe450bafa280eee8a4331c9cfc9e1ece7c522"
author bgruening
date Wed, 28 Aug 2019 10:55:25 -0400
parents de3c2c88e710
children 2ddc36385d7a
comparison
equal deleted inserted replaced
1:750c1684d47c 2:d150ac3d853d
13 updating the parser, and adding a new type option in the Galaxy wrapper 13 updating the parser, and adding a new type option in the Galaxy wrapper
14 """ 14 """
15 FILETYPES = {'fasta': '^>', 15 FILETYPES = {'fasta': '^>',
16 'fastq': '^@', 16 'fastq': '^@',
17 'tabular': '^.*', 17 'tabular': '^.*',
18 'txt': '^.*',
18 'mgf': '^BEGIN IONS'} 19 'mgf': '^BEGIN IONS'}
19 20
20 21
21 def main(): 22 def main():
22 ps = parser_cli() 23 ps = parser_cli()
34 top = args["top"] 35 top = args["top"]
35 if top < 0: 36 if top < 0:
36 raise ValueError("Number of header lines cannot be negative") 37 raise ValueError("Number of header lines cannot be negative")
37 38
38 ftype = args["ftype"] 39 ftype = args["ftype"]
40
41 assert ftype != "generic" or args["generic_re"] != None, "--generic_re needs to be given for generic input"
39 42
40 if args["ftype"] == "tabular" and args["by"] == "col": 43 if args["ftype"] == "tabular" and args["by"] == "col":
41 args["match"] = replace_mapped_chars(args["match"]) 44 args["match"] = replace_mapped_chars(args["match"])
42 args["sub"] = replace_mapped_chars(args["sub"]) 45 args["sub"] = replace_mapped_chars(args["sub"])
43 split_by_column(args, in_file, out_dir, top) 46 split_by_column(args, in_file, out_dir, top)
54 parser.add_argument('--out_dir', '-o', default=os.getcwd(), help="The output directory", required=True) 57 parser.add_argument('--out_dir', '-o', default=os.getcwd(), help="The output directory", required=True)
55 parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files") 58 parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files")
56 parser.add_argument('--file_ext', '-e', help="If not splitting by column," + 59 parser.add_argument('--file_ext', '-e', help="If not splitting by column," +
57 " the extension of the new files (without a period)") 60 " the extension of the new files (without a period)")
58 parser.add_argument('--ftype', '-f', help="The type of the file to split", required = True, 61 parser.add_argument('--ftype', '-f', help="The type of the file to split", required = True,
59 choices=["mgf", "fastq", "fasta", "tabular"]) 62 choices=["mgf", "fastq", "fasta", "tabular", "txt", "generic"])
63 parser.add_argument('--generic_re', '-g', help="Regular expression indicating the start of a new record (only for generic)", required = False)
60 parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)", 64 parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)",
61 default = "row", choices = ["col", "row"]) 65 default = "row", choices = ["col", "row"])
62 parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files. " + 66 parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files. " +
63 "(tabular only).") 67 "(tabular only).")
64 parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true') 68 parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true')
94 return pattern 98 return pattern
95 99
96 100
97 def split_by_record(args, in_file, out_dir, top, ftype): 101 def split_by_record(args, in_file, out_dir, top, ftype):
98 # get record separator for given filetype 102 # get record separator for given filetype
99 sep = re.compile(FILETYPES[ftype]) 103 sep = re.compile(FILETYPES.get(ftype, args["generic_re"]))
100 104
101 numnew = args["numnew"] 105 numnew = args["numnew"]
102 106
103 # random division 107 # random division
104 rand = args["rand"] 108 rand = args["rand"]