comparison split_file_to_collection.xml @ 5:e77b954f0da5 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
author bgruening
date Fri, 11 Oct 2019 18:24:43 -0400
parents 0850f2dfba13
children 6cbe2f30c2d7
comparison
equal deleted inserted replaced
4:0850f2dfba13 5:e77b954f0da5
1 <tool id="split_file_to_collection" name="Split file" version="0.3.0"> 1 <tool id="split_file_to_collection" name="Split file" version="0.4.0">
2 <description>to dataset collection</description> 2 <description>to dataset collection</description>
3 <macros> 3 <macros>
4 <xml name="regex_sanitizer"> 4 <xml name="regex_sanitizer">
5 <sanitizer> 5 <sanitizer>
6 <valid> 6 <valid>
75 --batch 75 --batch
76 #end if 76 #end if
77 #end if 77 #end if
78 #else 78 #else
79 #if $split_parms.select_ftype == "generic" 79 #if $split_parms.select_ftype == "generic"
80 --generic_re '$split_parms.generic_regex' 80 #if $split_parms.split_method.select_split_method == "regex"
81 #if $split_parms.split_after == 'true': 81 --generic_re '$split_parms.split_method.generic_regex'
82 --split_after 82 #if $split_parms.split_method.split_after == 'true':
83 --split_after
84 #end if
85 #else
86 --generic_num $split_parms.split_method.record_length
83 #end if 87 #end if
84 #end if 88 #end if
85 #if $split_parms.select_mode.mode == "numnew": 89 #if $split_parms.select_mode.mode == "numnew":
86 --numnew '$split_parms.select_mode.numnew' 90 --numnew '$split_parms.select_mode.numnew'
87 #else 91 #else
161 <param name="input" type="data" format="txt" label="Text file to split"/> 165 <param name="input" type="data" format="txt" label="Text file to split"/>
162 <expand macro="numnew_fname"/> 166 <expand macro="numnew_fname"/>
163 </when> 167 </when>
164 <when value="generic"> 168 <when value="generic">
165 <param name="input" type="data" format="txt" label="File to split"/> 169 <param name="input" type="data" format="txt" label="File to split"/>
166 <param name="generic_regex" type="text" label="Regex to match record separator" value="^.*"> 170 <conditional name="split_method">
167 <expand macro="regex_sanitizer"/> 171 <param name="select_split_method" type="select" label="Method to split files">
168 </param> 172 <option value="regex">Specify record separator as regular expression</option>
173 <option value="number">Specify number of lines after which a record ends</option>
174 </param>
175 <when value="regex">
176 <param name="generic_regex" type="text" label="Regex to match record separator" value="^.*">
177 <expand macro="regex_sanitizer"/>
178 </param>
179 <param name="split_after" type="select" value="false" label="Split records before or after the separator?" help="If before, the separator will appear at the start of each record; if after, at the end">
180 <option value="false" selected="true">Before</option>
181 <option value="true">After</option>
182 </param>
183 </when>
184 <when value="number">
185 <param name="record_length" type="integer" value="1" label="Record length" help="The number of lines after which each record ends"/>
186 </when>
187 </conditional>
169 <expand macro="numnew_fname"/> 188 <expand macro="numnew_fname"/>
170 <param name="split_after" type="select" value="false" label="Split records before or after the separator?" help="If before, the separator will appear at the start of each record; if after, at the end">
171 <option value="false" selected="true">Before</option>
172 <option value="true">After</option>
173 </param>
174 </when> 189 </when>
175 </conditional> 190 </conditional>
176 </inputs> 191 </inputs>
177 <outputs> 192 <outputs>
178 <collection name="list_output_tab" type="list" label="${tool.name} on ${on_string}"> 193 <collection name="list_output_tab" type="list" label="${tool.name} on ${on_string}">
203 <discover_datasets pattern="__name_and_ext__" directory="out" visible="false"/> 218 <discover_datasets pattern="__name_and_ext__" directory="out" visible="false"/>
204 <filter>split_parms['select_ftype'] == "generic"</filter> 219 <filter>split_parms['select_ftype'] == "generic"</filter>
205 </collection> 220 </collection>
206 </outputs> 221 </outputs>
207 <tests> 222 <tests>
223 <!-- 1 -->
208 <test> 224 <test>
209 <param name="input" value="test.tabular" ftype="tabular"/> 225 <param name="input" value="test.tabular" ftype="tabular"/>
210 <param name="select_ftype" value="tabular"/> 226 <param name="select_ftype" value="tabular"/>
211 <param name="select_split_by" value="col"/> 227 <param name="select_split_by" value="col"/>
212 <param name="id_col" value="1"/> 228 <param name="id_col" value="1"/>
217 <element name="foo.tab" file="foo.tab" ftype="tabular"/> 233 <element name="foo.tab" file="foo.tab" ftype="tabular"/>
218 <element name="foo2.tab" file="foo2.tab" ftype="tabular"/> 234 <element name="foo2.tab" file="foo2.tab" ftype="tabular"/>
219 <element name="foo3.tab" file="foo3.tab" ftype="tabular"/> 235 <element name="foo3.tab" file="foo3.tab" ftype="tabular"/>
220 </output_collection> 236 </output_collection>
221 </test> 237 </test>
238 <!-- 2 -->
222 <test> 239 <test>
223 <param name="input" value="test.tabular" ftype="tabular"/> 240 <param name="input" value="test.tabular" ftype="tabular"/>
224 <param name="select_ftype" value="tabular"/> 241 <param name="select_ftype" value="tabular"/>
225 <param name="select_split_by" value="row"/> 242 <param name="select_split_by" value="row"/>
226 <param name="top" value="2"/> 243 <param name="top" value="2"/>
230 <output_collection name="list_output_tab" type="list"> 247 <output_collection name="list_output_tab" type="list">
231 <element name="test_000000.tabular" file="test_0.tabular" ftype="tabular"/> 248 <element name="test_000000.tabular" file="test_0.tabular" ftype="tabular"/>
232 <element name="test_000001.tabular" file="test_1.tabular" ftype="tabular"/> 249 <element name="test_000001.tabular" file="test_1.tabular" ftype="tabular"/>
233 </output_collection> 250 </output_collection>
234 </test> 251 </test>
252 <!-- 3 -->
235 <test> 253 <test>
236 <param name="input" value="test.tabular" ftype="tabular"/> 254 <param name="input" value="test.tabular" ftype="tabular"/>
237 <param name="select_ftype" value="tabular"/> 255 <param name="select_ftype" value="tabular"/>
238 <param name="select_split_by" value="row"/> 256 <param name="select_split_by" value="row"/>
239 <param name="top" value="2"/> 257 <param name="top" value="2"/>
244 <output_collection name="list_output_tab" type="list"> 262 <output_collection name="list_output_tab" type="list">
245 <element name="batch_tab_000000.tabular" file="batch_tab_0.tabular" ftype="tabular"/> 263 <element name="batch_tab_000000.tabular" file="batch_tab_0.tabular" ftype="tabular"/>
246 <element name="batch_tab_000001.tabular" file="batch_tab_1.tabular" ftype="tabular"/> 264 <element name="batch_tab_000001.tabular" file="batch_tab_1.tabular" ftype="tabular"/>
247 </output_collection> 265 </output_collection>
248 </test> 266 </test>
267 <!-- 4 -->
249 <test> 268 <test>
250 <param name="input" value="test.tabular" ftype="tabular"/> 269 <param name="input" value="test.tabular" ftype="tabular"/>
251 <param name="select_ftype" value="tabular"/> 270 <param name="select_ftype" value="tabular"/>
252 <param name="select_split_by" value="row"/> 271 <param name="select_split_by" value="row"/>
253 <param name="top" value="2"/> 272 <param name="top" value="2"/>
258 <output_collection name="list_output_tab" type="list"> 277 <output_collection name="list_output_tab" type="list">
259 <element name="batch_tab_000000.tabular" file="batch_tab_0.tabular" ftype="tabular"/> 278 <element name="batch_tab_000000.tabular" file="batch_tab_0.tabular" ftype="tabular"/>
260 <element name="batch_tab_000001.tabular" file="batch_tab_1.tabular" ftype="tabular"/> 279 <element name="batch_tab_000001.tabular" file="batch_tab_1.tabular" ftype="tabular"/>
261 </output_collection> 280 </output_collection>
262 </test> 281 </test>
282 <!-- 5 -->
263 <test> 283 <test>
264 <param name="select_ftype" value="txt"/> 284 <param name="select_ftype" value="txt"/>
265 <param name="input" value="karyotype.txt" ftype="txt"/> 285 <param name="input" value="karyotype.txt" ftype="txt"/>
266 <param name="mode" value="numnew"/> 286 <param name="mode" value="numnew"/>
267 <param name="numnew" value="24"/> 287 <param name="numnew" value="24"/>
293 <element name="chr_000021.txt" file="chr_000021.txt" ftype="txt"/> 313 <element name="chr_000021.txt" file="chr_000021.txt" ftype="txt"/>
294 <element name="chr_000022.txt" file="chr_000022.txt" ftype="txt"/> 314 <element name="chr_000022.txt" file="chr_000022.txt" ftype="txt"/>
295 <element name="chr_000023.txt" file="chr_000023.txt" ftype="txt"/> 315 <element name="chr_000023.txt" file="chr_000023.txt" ftype="txt"/>
296 </output_collection> 316 </output_collection>
297 </test> 317 </test>
318 <!-- 6 -->
298 <test> 319 <test>
299 <param name="input" value="psm.tabular" ftype="tabular"/> 320 <param name="input" value="psm.tabular" ftype="tabular"/>
300 <param name="select_ftype" value="tabular"/> 321 <param name="select_ftype" value="tabular"/>
301 <param name="select_split_by" value="col"/> 322 <param name="select_split_by" value="col"/>
302 <param name="id_col" value="10"/> 323 <param name="id_col" value="10"/>
308 <element name="file2.tab" file="file2.tab" ftype="tabular"/> 329 <element name="file2.tab" file="file2.tab" ftype="tabular"/>
309 <element name="file3.tab" file="file3.tab" ftype="tabular"/> 330 <element name="file3.tab" file="file3.tab" ftype="tabular"/>
310 <element name="file4.tab" file="file4.tab" ftype="tabular"/> 331 <element name="file4.tab" file="file4.tab" ftype="tabular"/>
311 </output_collection> 332 </output_collection>
312 </test> 333 </test>
334 <!-- 7 splitting of mgf -->
313 <test> 335 <test>
314 <param name="input" value="demo758Dacentroid.mgf" ftype="mgf"/> 336 <param name="input" value="demo758Dacentroid.mgf" ftype="mgf"/>
315 <param name="select_ftype" value="mgf"/> 337 <param name="select_ftype" value="mgf"/>
316 <param name="mode" value="numnew"/> 338 <param name="mode" value="numnew"/>
317 <param name="numnew" value="3"/> 339 <param name="numnew" value="3"/>
320 <element name="demo_000000.mgf" file="demo_0.mgf" ftype="mgf"/> 342 <element name="demo_000000.mgf" file="demo_0.mgf" ftype="mgf"/>
321 <element name="demo_000001.mgf" file="demo_1.mgf" ftype="mgf"/> 343 <element name="demo_000001.mgf" file="demo_1.mgf" ftype="mgf"/>
322 <element name="demo_000002.mgf" file="demo_2.mgf" ftype="mgf"/> 344 <element name="demo_000002.mgf" file="demo_2.mgf" ftype="mgf"/>
323 </output_collection> 345 </output_collection>
324 </test> 346 </test>
347 <!-- 8 splitting of fasta + desired number of files-->
325 <test> 348 <test>
326 <param name="input" value="test.fasta" ftype="fasta"/> 349 <param name="input" value="test.fasta" ftype="fasta"/>
327 <param name="select_ftype" value="fasta"/> 350 <param name="select_ftype" value="fasta"/>
328 <param name="mode" value="numnew"/> 351 <param name="mode" value="numnew"/>
329 <param name="numnew" value="2"/> 352 <param name="numnew" value="2"/>
331 <output_collection name="list_output_fasta" type="list"> 354 <output_collection name="list_output_fasta" type="list">
332 <element name="test_000000.fasta" file="test_0.fasta" ftype="fasta"/> 355 <element name="test_000000.fasta" file="test_0.fasta" ftype="fasta"/>
333 <element name="test_000001.fasta" file="test_1.fasta" ftype="fasta"/> 356 <element name="test_000001.fasta" file="test_1.fasta" ftype="fasta"/>
334 </output_collection> 357 </output_collection>
335 </test> 358 </test>
359 <!-- 9 splitting of fasta + desired chunksize -->
336 <test> 360 <test>
337 <param name="input" value="test.fasta" ftype="fasta"/> 361 <param name="input" value="test.fasta" ftype="fasta"/>
338 <param name="select_ftype" value="fasta"/> 362 <param name="select_ftype" value="fasta"/>
339 <param name="mode" value="chunk"/> 363 <param name="mode" value="chunk"/>
340 <param name="chunksize" value="3"/> 364 <param name="chunksize" value="3"/>
342 <output_collection name="list_output_fasta" type="list"> 366 <output_collection name="list_output_fasta" type="list">
343 <element name="test_000000.fasta" file="test_0.fasta" ftype="fasta"/> 367 <element name="test_000000.fasta" file="test_0.fasta" ftype="fasta"/>
344 <element name="test_000001.fasta" file="test_1.fasta" ftype="fasta"/> 368 <element name="test_000001.fasta" file="test_1.fasta" ftype="fasta"/>
345 </output_collection> 369 </output_collection>
346 </test> 370 </test>
371 <!-- 10 splitting of fastq, specify desired number of files -->
347 <test> 372 <test>
348 <param name="input" value="test.fastq" ftype="fastq"/> 373 <param name="input" value="test.fastq" ftype="fastq"/>
349 <param name="select_ftype" value="fastq"/> 374 <param name="select_ftype" value="fastq"/>
350 <param name="mode" value="numnew"/> 375 <param name="mode" value="numnew"/>
351 <param name="numnew" value="2"/> 376 <param name="numnew" value="2"/>
353 <output_collection name="list_output_fastq" type="list"> 378 <output_collection name="list_output_fastq" type="list">
354 <element name="test_000000.fastq" file="test_0.fastq" ftype="fastq"/> 379 <element name="test_000000.fastq" file="test_0.fastq" ftype="fastq"/>
355 <element name="test_000001.fastq" file="test_1.fastq" ftype="fastq"/> 380 <element name="test_000001.fastq" file="test_1.fastq" ftype="fastq"/>
356 </output_collection> 381 </output_collection>
357 </test> 382 </test>
383 <!-- 11 splitting of fastq, specify desired number of files
384 same as previous test, but by specifying the number of lines per record
385 explicitly (not using the preset of the python script) -->
386 <test>
387 <param name="input" value="test.fastq" ftype="fastq"/>
388 <param name="select_ftype" value="generic"/>
389 <param name="select_split_method" value="number"/>
390 <param name="record_length" value="4"/>
391 <param name="mode" value="numnew"/>
392 <param name="numnew" value="2"/>
393 <param name="newfilenames" value="test"/>
394 <output_collection name="list_output_generic" type="list">
395 <element name="test_000000" file="test_0.fastq" ftype="fastq"/>
396 <element name="test_000001" file="test_1.fastq" ftype="fastq"/>
397 </output_collection>
398 </test>
399 <!-- splitting of fasta w random assignment and specific filename prefix -->
358 <test> 400 <test>
359 <param name="input" value="test.fasta" ftype="fasta"/> 401 <param name="input" value="test.fasta" ftype="fasta"/>
360 <param name="select_ftype" value="fasta"/> 402 <param name="select_ftype" value="fasta"/>
361 <param name="mode" value="numnew"/> 403 <param name="mode" value="numnew"/>
362 <param name="numnew" value="2"/> 404 <param name="numnew" value="2"/>
366 <output_collection name="list_output_fasta" type="list"> 408 <output_collection name="list_output_fasta" type="list">
367 <element name="rand_000000.fasta" file="rand_0.fasta" ftype="fasta"/> 409 <element name="rand_000000.fasta" file="rand_0.fasta" ftype="fasta"/>
368 <element name="rand_000001.fasta" file="rand_1.fasta" ftype="fasta"/> 410 <element name="rand_000001.fasta" file="rand_1.fasta" ftype="fasta"/>
369 </output_collection> 411 </output_collection>
370 </test> 412 </test>
413 <!-- splitting of fasta w batch assignment and specific filename prefix -->
371 <test> 414 <test>
372 <param name="input" value="test.fasta" ftype="fasta"/> 415 <param name="input" value="test.fasta" ftype="fasta"/>
373 <param name="select_ftype" value="fasta"/> 416 <param name="select_ftype" value="fasta"/>
374 <param name="mode" value="numnew"/> 417 <param name="mode" value="numnew"/>
375 <param name="numnew" value="2"/> 418 <param name="numnew" value="2"/>
378 <output_collection name="list_output_fasta" type="list"> 421 <output_collection name="list_output_fasta" type="list">
379 <element name="fasta_batch_000000.fasta" file="fasta_batch_0.fasta" ftype="fasta"/> 422 <element name="fasta_batch_000000.fasta" file="fasta_batch_0.fasta" ftype="fasta"/>
380 <element name="fasta_batch_000001.fasta" file="fasta_batch_1.fasta" ftype="fasta"/> 423 <element name="fasta_batch_000001.fasta" file="fasta_batch_1.fasta" ftype="fasta"/>
381 </output_collection> 424 </output_collection>
382 </test> 425 </test>
426 <!-- splitting of txt w default (alternating assignment) -->
383 <test> 427 <test>
384 <param name="input" value="test.tabular" ftype="txt"/> 428 <param name="input" value="test.tabular" ftype="txt"/>
385 <param name="select_ftype" value="txt"/> 429 <param name="select_ftype" value="txt"/>
386 <param name="mode" value="numnew"/> 430 <param name="mode" value="numnew"/>
387 <param name="numnew" value="2"/> 431 <param name="numnew" value="2"/>
389 <output_collection name="list_output_txt" type="list"> 433 <output_collection name="list_output_txt" type="list">
390 <element name="test_000000.txt" file="test_0.tabular" ftype="txt" lines_diff="1"/> 434 <element name="test_000000.txt" file="test_0.tabular" ftype="txt" lines_diff="1"/>
391 <element name="test_000001.txt" file="test_1.tabular" ftype="txt" lines_diff="1"/> 435 <element name="test_000001.txt" file="test_1.tabular" ftype="txt" lines_diff="1"/>
392 </output_collection> 436 </output_collection>
393 </test> 437 </test>
438 <!-- generic-regex splitting (of txt) w default assignment (alternating) -->
394 <test> 439 <test>
395 <param name="input" value="test.tabular" ftype="txt"/> 440 <param name="input" value="test.tabular" ftype="txt"/>
396 <param name="select_ftype" value="generic"/> 441 <param name="select_ftype" value="generic"/>
442 <param name="select_split_method" value="regex"/>
397 <param name="generic_regex" value="^.*"/> 443 <param name="generic_regex" value="^.*"/>
398 <param name="mode" value="numnew"/> 444 <param name="mode" value="numnew"/>
399 <param name="numnew" value="2"/> 445 <param name="numnew" value="2"/>
400 <param name="newfilenames" value="test"/> 446 <param name="newfilenames" value="test"/>
401 <output_collection name="list_output_generic" type="list"> 447 <output_collection name="list_output_generic" type="list">
402 <element name="test_000000" file="test_0.tabular" ftype="txt" lines_diff="1"/> 448 <element name="test_000000" file="test_0.tabular" ftype="txt" lines_diff="1"/>
403 <element name="test_000001" file="test_1.tabular" ftype="txt" lines_diff="1"/> 449 <element name="test_000001" file="test_1.tabular" ftype="txt" lines_diff="1"/>
404 </output_collection> 450 </output_collection>
405 </test> 451 </test>
452 <!-- generic-regex splitting (of a fasta) w random assignment -->
406 <test> 453 <test>
407 <param name="input" value="test.fasta" ftype="fasta"/> 454 <param name="input" value="test.fasta" ftype="fasta"/>
408 <param name="select_ftype" value="generic"/> 455 <param name="select_ftype" value="generic"/>
456 <param name="select_split_method" value="regex"/>
409 <param name="generic_regex" value="^>.*"/> 457 <param name="generic_regex" value="^>.*"/>
410 <param name="mode" value="numnew"/> 458 <param name="mode" value="numnew"/>
411 <param name="numnew" value="2"/> 459 <param name="numnew" value="2"/>
412 <param name="newfilenames" value="rand"/> 460 <param name="newfilenames" value="rand"/>
413 <param name="allocate" value="random"/> 461 <param name="allocate" value="random"/>
415 <output_collection name="list_output_generic" type="list"> 463 <output_collection name="list_output_generic" type="list">
416 <element name="rand_000000" file="rand_0.fasta" ftype="fasta"/> 464 <element name="rand_000000" file="rand_0.fasta" ftype="fasta"/>
417 <element name="rand_000001" file="rand_1.fasta" ftype="fasta"/> 465 <element name="rand_000001" file="rand_1.fasta" ftype="fasta"/>
418 </output_collection> 466 </output_collection>
419 </test> 467 </test>
468 <!-- sdf + specify desired number of files -->
420 <test> 469 <test>
421 <param name="input" value="3_molecules.sdf" ftype="sdf"/> 470 <param name="input" value="3_molecules.sdf" ftype="sdf"/>
422 <param name="select_ftype" value="sdf"/> 471 <param name="select_ftype" value="sdf"/>
423 <param name="mode" value="numnew"/> 472 <param name="mode" value="numnew"/>
424 <param name="numnew" value="10"/> 473 <param name="numnew" value="10"/>
428 <element name="mol_000000.sdf" file="mol_0.sdf" ftype="sdf"/> 477 <element name="mol_000000.sdf" file="mol_0.sdf" ftype="sdf"/>
429 <element name="mol_000001.sdf" file="mol_1.sdf" ftype="sdf"/> 478 <element name="mol_000001.sdf" file="mol_1.sdf" ftype="sdf"/>
430 <element name="mol_000002.sdf" file="mol_2.sdf" ftype="sdf"/> 479 <element name="mol_000002.sdf" file="mol_2.sdf" ftype="sdf"/>
431 </output_collection> 480 </output_collection>
432 </test> 481 </test>
482 <!-- sdf + specify desired number of records per file (chunksize) -->
433 <test> 483 <test>
434 <param name="input" value="3_molecules.sdf" ftype="sdf"/> 484 <param name="input" value="3_molecules.sdf" ftype="sdf"/>
435 <param name="select_ftype" value="sdf"/> 485 <param name="select_ftype" value="sdf"/>
436 <param name="mode" value="chunk"/> 486 <param name="mode" value="chunk"/>
437 <param name="chunksize" value="1"/> 487 <param name="chunksize" value="1"/>
441 <element name="mol_000000.sdf" file="mol_0.sdf" ftype="sdf"/> 491 <element name="mol_000000.sdf" file="mol_0.sdf" ftype="sdf"/>
442 <element name="mol_000001.sdf" file="mol_1.sdf" ftype="sdf"/> 492 <element name="mol_000001.sdf" file="mol_1.sdf" ftype="sdf"/>
443 <element name="mol_000002.sdf" file="mol_2.sdf" ftype="sdf"/> 493 <element name="mol_000002.sdf" file="mol_2.sdf" ftype="sdf"/>
444 </output_collection> 494 </output_collection>
445 </test> 495 </test>
496 <!-- test split_after (by splitting fasta files after non-header lines) -->
446 <test> 497 <test>
447 <param name="input" value="test.fasta" ftype="fasta"/> 498 <param name="input" value="test.fasta" ftype="fasta"/>
448 <param name="select_ftype" value="generic"/> 499 <param name="select_ftype" value="generic"/>
449 <param name="generic_regex" value="^>.*"/> 500 <param name="select_split_method" value="regex"/>
501 <param name="generic_regex" value="^[^>].*"/>
450 <param name="split_after" value="true"/> 502 <param name="split_after" value="true"/>
451 <param name="mode" value="numnew"/> 503 <param name="mode" value="numnew"/>
452 <param name="numnew" value="2"/> 504 <param name="numnew" value="2"/>
453 <param name="newfilenames" value="rand"/> 505 <param name="newfilenames" value="rand"/>
454 <param name="allocate" value="random"/> 506 <param name="allocate" value="random"/>
455 <param name="seed" value="1010"/> 507 <param name="seed" value="1010"/>
456 <output_collection name="list_output_generic" type="list"> 508 <output_collection name="list_output_generic" type="list">
457 <element name="rand_000001" file="split_after.fasta" ftype="fasta"/> 509 <element name="rand_000000" file="rand_0.fasta" ftype="fasta"/>
510 <element name="rand_000001" file="rand_1.fasta" ftype="fasta"/>
458 </output_collection> 511 </output_collection>
459 </test> 512 </test>
460 </tests> 513 </tests>
461 <help><![CDATA[ 514 <help><![CDATA[
462 **Split file into a dataset collection** 515 **Split file into a dataset collection**
463 516
464 This tool splits a data set consisting of records into multiple data sets within a collection. 517 This tool splits a data set consisting of records into multiple data sets within a collection.
465 A record can be for instance simply a line, a FASTA sequence (header + sequence), a FASTQ sequence 518 A record can be for instance simply a line, a FASTA sequence (header + sequence), a FASTQ sequence
466 (headers + sequence + qualities), etc. The important property is that the beginning of a new record 519 (headers + sequence + qualities), etc. The important property is that the records either have a
467 can be specified by a regular expression, e.g. ".*" for lines, ">.*" for FASTA, or "@.*" for FASTQ. 520 specific length (e.g. 4 lines for FASTQ) or that the beginning/end of a new record
468 The tool has presets for text, tabular data sets (which are split by line), FASTA, FASTQ, SDF and MGF. 521 can be specified by a regular expression, e.g. ".*" for lines or ">.*" for FASTA.
469 For other data types the text delimiting records can be specified manually using the generic splitter. 522 The tool has presets for text, tabular data sets (which are split after each line), FASTA (new records start with ">.*"), FASTQ (records consist of 4 lines), SDF (records end with "^$$$$") and MGF (records start with "^BEGIN IONS").
523 For other data types the text delimiting records or the number of lines making up a record can be specified manually using the generic splitter.
470 If the generic splitter is used, an option is also available to split records either before or after the 524 If the generic splitter is used, an option is also available to split records either before or after the
471 separator. If a preset filetype is used, this is selected automatically (after for SDF, before for all 525 separator. If a preset filetype is used, this is selected automatically (after for SDF, before for all
472 others). 526 others).
473 527
474 If splitting by line (or by some other item, like a FASTA entry or an MGF record), the splitting can be either done alternatingly, in original record order, or at random. 528 If splitting by line (or by some other item, like a FASTA entry or an MGF record), the splitting can be either done alternatingly, in original record order, or at random.