Mercurial > repos > bgruening > split_file_to_collection
comparison split_file_to_collection.py @ 4:0850f2dfba13 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 872590086696cfbc248519507ecb9063926297ad"
author | bgruening |
---|---|
date | Wed, 09 Oct 2019 07:34:49 -0400 |
parents | 2ddc36385d7a |
children | e77b954f0da5 |
comparison
equal
deleted
inserted
replaced
3:2ddc36385d7a | 4:0850f2dfba13 |
---|---|
14 """ | 14 """ |
15 FILETYPES = {'fasta': '^>', | 15 FILETYPES = {'fasta': '^>', |
16 'fastq': '^@', | 16 'fastq': '^@', |
17 'tabular': '^.*', | 17 'tabular': '^.*', |
18 'txt': '^.*', | 18 'txt': '^.*', |
19 'mgf': '^BEGIN IONS'} | 19 'mgf': '^BEGIN IONS', |
| | 20 'sdf': '\$\$\$\$', |
| | 21 } |
20 | 22 |
21 | 23 |
22 def main(): | 24 def main(): |
23 ps = parser_cli() | 25 ps = parser_cli() |
24 args = vars(ps.parse_args()) | 26 args = vars(ps.parse_args()) |
57 parser.add_argument('--out_dir', '-o', default=os.getcwd(), help="The output directory", required=True) | 59 parser.add_argument('--out_dir', '-o', default=os.getcwd(), help="The output directory", required=True) |
58 parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files") | 60 parser.add_argument('--file_names', '-a', help="If not splitting by column, the base name of the new files") |
59 parser.add_argument('--file_ext', '-e', help="If not splitting by column," + | 61 parser.add_argument('--file_ext', '-e', help="If not splitting by column," + |
60 " the extension of the new files (without a period)") | 62 " the extension of the new files (without a period)") |
61 parser.add_argument('--ftype', '-f', help="The type of the file to split", required = True, | 63 parser.add_argument('--ftype', '-f', help="The type of the file to split", required = True, |
62 choices=["mgf", "fastq", "fasta", "tabular", "txt", "generic"]) | 64 choices=["mgf", "fastq", "fasta", "sdf", "tabular", "txt", "generic"]) |
63 parser.add_argument('--generic_re', '-g', help="Regular expression indicating the start of a new record (only for generic)", required = False) | 65 parser.add_argument('--generic_re', '-g', help="Regular expression indicating the start of a new record (only for generic)", required = False) |
64 parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)", | 66 parser.add_argument('--by', '-b', help="Split by line or by column (tabular only)", |
65 default = "row", choices = ["col", "row"]) | 67 default = "row", choices = ["col", "row"]) |
66 parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files. " + | 68 parser.add_argument('--top', '-t', type=int, default=0, help="Number of header lines to carry over to new files. " + |
67 "(tabular only).") | 69 "(tabular only).") |
68 parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true') | 70 parser.add_argument('--rand', '-r', help="Divide records randomly into new files", action='store_true') |
69 parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " + | 71 parser.add_argument('--seed', '-x', help="Provide a seed for the random number generator. " + |
70 "If not provided and args[\"rand\"]==True, then date is used", type=int) | 72 "If not provided and args[\"rand\"]==True, then date is used", type=int) |
71 parser.add_argument('--numnew', '-n', type=int, default = 1, | 73 parser.add_argument('--numnew', '-n', type=int, default = 1, |
72 help="Number of output files desired. Not valid for splitting on a column") | 74 help="Number of output files desired. Not valid for splitting on a column. Not compatible with chunksize and will be ignored if both are set.") |
| | 75 parser.add_argument('--chunksize', '-k', type=int, default = 0, |
| | 76 help="Number of records by file. Not valid for splitting on a column") |
73 parser.add_argument('--batch', action='store_true', | 77 parser.add_argument('--batch', action='store_true', |
74 help="Distribute files to collection while maintaining order. Ignored if splitting on column.") | 78 help="Distribute files to collection while maintaining order. Ignored if splitting on column.") |
75 | 79 parser.add_argument('--split_after', '-p', action='store_true', |
| | 80 help="Split between records after separator (default is before)." + |
| | 81 "Only for generic - specific ftypes are always split in the default way") |
76 bycol = parser.add_argument_group('If splitting on a column') | 82 bycol = parser.add_argument_group('If splitting on a column') |
77 bycol.add_argument('--match', '-m', default = "(.*)", help="The regular expression to match id column entries") | 83 bycol.add_argument('--match', '-m', default = "(.*)", help="The regular expression to match id column entries") |
78 bycol.add_argument('--sub', '-s', default = r'\1', | 84 bycol.add_argument('--sub', '-s', default = r'\1', |
79 help="The regular expression to substitute in for the matched pattern.") | 85 help="The regular expression to substitute in for the matched pattern.") |
80 bycol.add_argument('--id_column', '-c', default="1", | 86 bycol.add_argument('--id_column', '-c', default="1", |
100 | 106 |
101 def split_by_record(args, in_file, out_dir, top, ftype): | 107 def split_by_record(args, in_file, out_dir, top, ftype): |
102 # get record separator for given filetype | 108 # get record separator for given filetype |
103 sep = re.compile(FILETYPES.get(ftype, args["generic_re"])) | 109 sep = re.compile(FILETYPES.get(ftype, args["generic_re"])) |
104 | 110 |
| | 111 chunksize = args["chunksize"] |
105 numnew = args["numnew"] | 112 numnew = args["numnew"] |
106 | 113 |
107 # random division | 114 # random division |
108 rand = args["rand"] | 115 rand = args["rand"] |
109 seed = args["seed"] | 116 seed = args["seed"] |
112 else: | 119 else: |
113 random.seed() | 120 random.seed() |
114 | 121 |
115 # batched division (maintains order) | 122 # batched division (maintains order) |
116 batch = args["batch"] | 123 batch = args["batch"] |
117 # define n_per_file so we don't get a warning about ref before assignment | 124 |
118 n_per_file = math.inf | 125 |
119 if batch: | 126 if chunksize != 0 or batch: # needs to be calculated if either batch or chunksize are selected |
| | 127 # define n_per_file so we don't get a warning about ref before assignment |
| | 128 n_per_file = math.inf |
| | 129 |
120 # number of records | 130 # number of records |
121 with open(in_file) as f: | 131 with open(in_file) as f: |
122 i = 0 | 132 i = 0 |
123 for line in f: | 133 for line in f: |
124 if re.match(sep, line) is not None: | 134 if re.match(sep, line) is not None: |
125 i+=1 | 135 i+=1 |
126 n_records = i + 1 | 136 n_records = i + 1 |
127 if top: | 137 if top: |
128 n_records -= top # don't count the top lines | 138 n_records -= top # don't count the top lines |
129 | 139 |
130 # approx. number of lines per file | 140 if chunksize == 0: # i.e. no chunking |
131 n_per_file = n_records // numnew | 141 # approx. number of lines per file |
| | 142 n_per_file = n_records // numnew |
| | 143 else: |
| | 144 # approx. number of lines per file |
| | 145 numnew = n_records // chunksize |
| | 146 n_per_file = chunksize |
| | 147 |
| | 148 |
| | 149 |
132 | 150 |
133 # make new files | 151 # make new files |
134 # strip extension of old file and add number | 152 # strip extension of old file and add number |
135 custom_new_file_name = args["file_names"] | 153 custom_new_file_name = args["file_names"] |
136 custom_new_file_ext = "." + args["file_ext"] | 154 custom_new_file_ext = "." + args["file_ext"] |
177 else: | 195 else: |
178 # if is in fresh_files, write header and drop from freshFiles | 196 # if is in fresh_files, write header and drop from freshFiles |
179 if new_file_counter in fresh_files: | 197 if new_file_counter in fresh_files: |
180 newfiles[new_file_counter].write(header) | 198 newfiles[new_file_counter].write(header) |
181 fresh_files.remove(new_file_counter) | 199 fresh_files.remove(new_file_counter) |
182 | 200 |
183 # write record to file | 201 if ftype != "sdf" and args["split_after"] == False: |
184 newfiles[new_file_counter].write(record) | 202 # write record to file |
185 | 203 newfiles[new_file_counter].write(record) |
186 # if not the first time through, we assign the new record | 204 |
187 record = line | 205 # if not the first time through, we assign the new record |
188 | 206 record = line |
| | 207 |
| | 208 else: # for sdf we want to write the line to the record before starting a new one |
| | 209 record += line |
| | 210 newfiles[new_file_counter].write(record) |
| | 211 record = "" |
| | 212 |
189 # change destination file | 213 # change destination file |
190 if rand: | 214 if rand: |
191 new_file_counter = int(math.floor(random.random() * numnew)) | 215 new_file_counter = int(math.floor(random.random() * numnew)) |
192 elif batch: | 216 elif batch: |
193 # number of records read per file | 217 # number of records read per file |