Mercurial > repos > bgruening > split_file_to_collection
comparison split_file_to_collection.xml @ 2:d150ac3d853d draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 06ffe450bafa280eee8a4331c9cfc9e1ece7c522"
author | bgruening |
---|---|
date | Wed, 28 Aug 2019 10:55:25 -0400 |
parents | 750c1684d47c |
children | 2ddc36385d7a |
comparison
equal
deleted
inserted
replaced
1:750c1684d47c | 2:d150ac3d853d |
---|---|
1 <tool id="split_file_to_collection" name="Split file" version="0.1.1"> | 1 <tool id="split_file_to_collection" name="Split file" version="0.2.0"> |
2 <description>to dataset collection</description> | 2 <description>to dataset collection</description> |
3 <macros> | 3 <macros> |
4 <xml name="regex_sanitizer"> | |
5 <sanitizer> | |
6 <valid> | |
7 <add preset="string.printable"/> | |
8 <remove value="\" /> | |
9 <remove value="'" /> | |
10 </valid> | |
11 <mapping initial="none"> | |
12 <add source="\" target="__backslash__" /> | |
13 <add source="'" target="__sq__"/> | |
14 </mapping> | |
15 </sanitizer> | |
16 </xml> | |
4 <xml name="numnew_fname"> | 17 <xml name="numnew_fname"> |
5 <param name="numnew" type="integer" label="Number of new files" min="1" value="1"/> | 18 <param name="numnew" type="integer" label="Number of new files" min="1" value="1"/> |
6 <param name="newfilenames" type="text" label="Base name for new files in collection" | 19 <param name="newfilenames" type="text" label="Base name for new files in collection" |
7 help="This will increment automatically - if input is 'file', then output is 'file0', 'file1', etc." value="split_file"/> | 20 help="This will increment automatically - if input is 'file', then output is 'file0', 'file1', etc." value="split_file"/> |
8 <conditional name="select_allocate"> | 21 <conditional name="select_allocate"> |
46 #if $split_parms.split_by.select_allocate.allocate == "batch": | 59 #if $split_parms.split_by.select_allocate.allocate == "batch": |
47 --batch | 60 --batch |
48 #end if | 61 #end if |
49 #end if | 62 #end if |
50 #else | 63 #else |
64 #if $split_parms.select_ftype == "generic" | |
65 --generic_re '$split_parms.generic_regex' | |
66 #end if | |
51 --numnew '$split_parms.numnew' | 67 --numnew '$split_parms.numnew' |
52 #if $split_parms.select_allocate.allocate == "random": | 68 #if $split_parms.select_allocate.allocate == "random": |
53 --rand | 69 --rand |
54 --seed '$split_parms.select_allocate.seed' | 70 --seed '$split_parms.select_allocate.seed' |
55 #end if | 71 #end if |
61 --file_names '$split_parms.split_by.newfilenames' | 77 --file_names '$split_parms.split_by.newfilenames' |
62 --file_ext '$split_parms.select_ftype' | 78 --file_ext '$split_parms.select_ftype' |
63 #end if | 79 #end if |
64 #if $split_parms.select_ftype != "tabular": | 80 #if $split_parms.select_ftype != "tabular": |
65 --file_names '$split_parms.newfilenames' | 81 --file_names '$split_parms.newfilenames' |
66 --file_ext '$split_parms.select_ftype' | 82 #if $split_parms.select_ftype == "generic" |
83 --file_ext '$split_parms.input.ext' | |
84 #else | |
85 --file_ext '$split_parms.select_ftype' | |
86 #end if | |
67 #end if | 87 #end if |
68 ]]></command> | 88 ]]></command> |
69 <inputs> | 89 <inputs> |
70 <conditional name="split_parms"> | 90 <conditional name="split_parms"> |
71 <param name="select_ftype" type="select" label="Select the file type to split"> | 91 <param name="select_ftype" type="select" label="Select the file type to split"> |
72 <option value="mgf">MGF</option> | 92 <option value="mgf">MGF</option> |
73 <option value="fastq">FASTQ</option> | 93 <option value="fastq">FASTQ</option> |
74 <option value="tabular">Tabular</option> | 94 <option value="tabular">Tabular</option> |
75 <option value="fasta">FASTA</option> | 95 <option value="fasta">FASTA</option> |
96 <option value="txt">Text files</option> | |
97 <option value="generic">Generic</option> | |
76 </param> | 98 </param> |
77 <when value="tabular"> | 99 <when value="tabular"> |
78 <param name="input" type="data" format="tabular" label="Tabular file to split"/> | 100 <param name="input" type="data" format="tabular" label="Tabular file to split"/> |
79 <param name="top" type="integer" value="0" min="0" label="Number of header lines to transfer to new files"/> | 101 <param name="top" type="integer" value="0" min="0" label="Number of header lines to transfer to new files"/> |
80 <conditional name="split_by"> | 102 <conditional name="split_by"> |
83 <option value="col">By column</option> | 105 <option value="col">By column</option> |
84 </param> | 106 </param> |
85 <when value="col"> | 107 <when value="col"> |
86 <param name="id_col" type="data_column" label="Column to split on" data_ref="input"/> | 108 <param name="id_col" type="data_column" label="Column to split on" data_ref="input"/> |
87 <param name="match_regex" type="text" label="Regex to match contents of id column" value="(.*)"> | 109 <param name="match_regex" type="text" label="Regex to match contents of id column" value="(.*)"> |
88 <sanitizer> | 110 <expand macro="regex_sanitizer"/> |
89 <valid> | 111 </param> |
90 <add preset="string.printable"/> | |
91 <remove value="\" /> | |
92 <remove value="'" /> | |
93 </valid> | |
94 <mapping initial="none"> | |
95 <add source="\" target="__backslash__" /> | |
96 <add source="'" target="__sq__"/> | |
97 </mapping> | |
98 </sanitizer> | |
99 </param> | |
100 <param name="sub_regex" type="text" label="Pattern to replace match with" value="\1"> | 112 <param name="sub_regex" type="text" label="Pattern to replace match with" value="\1"> |
101 <sanitizer> | 113 <expand macro="regex_sanitizer"/> |
102 <valid> | |
103 <add preset="string.printable"/> | |
104 <remove value="\" /> | |
105 <remove value="'" /> | |
106 </valid> | |
107 <mapping initial="none"> | |
108 <add source="\" target="__backslash__" /> | |
109 <add source="'" target="__sq__"/> | |
110 </mapping> | |
111 </sanitizer> | |
112 </param> | 114 </param> |
113 </when> | 115 </when> |
114 <when value="row"> | 116 <when value="row"> |
115 <expand macro="numnew_fname"/> | 117 <expand macro="numnew_fname"/> |
116 </when> | 118 </when> |
126 </when> | 128 </when> |
127 <when value="fasta"> | 129 <when value="fasta"> |
128 <param name="input" type="data" format="fasta" label="FASTA file to split"/> | 130 <param name="input" type="data" format="fasta" label="FASTA file to split"/> |
129 <expand macro="numnew_fname"/> | 131 <expand macro="numnew_fname"/> |
130 </when> | 132 </when> |
133 <when value="txt"> | |
134 <param name="input" type="data" format="txt" label="Text file to split"/> | |
135 <expand macro="numnew_fname"/> | |
136 </when> | |
137 <when value="generic"> | |
138 <param name="input" type="data" format="txt" label="File to split"/> | |
139 <param name="generic_regex" type="text" label="Regex to match record separator" value="^.*"> | |
140 <expand macro="regex_sanitizer"/> | |
141 </param> | |
142 <expand macro="numnew_fname"/> | |
143 </when> | |
131 </conditional> | 144 </conditional> |
132 </inputs> | 145 </inputs> |
133 <outputs> | 146 <outputs> |
134 <collection name="list_output_tab" type="list" label="${tool.name} on ${on_string}: output collection"> | 147 <collection name="list_output_tab" type="list" label="${tool.name} on ${on_string}"> |
135 <discover_datasets pattern="__name__" directory="out" visible="false" format="tabular"/> | 148 <discover_datasets pattern="__name__" directory="out" visible="false" format="tabular"/> |
136 <filter>split_parms['select_ftype'] == "tabular"</filter> | 149 <filter>split_parms['select_ftype'] == "tabular"</filter> |
137 </collection> | 150 </collection> |
138 <collection name="list_output_mgf" type="list" label="${tool.name} on ${on_string}: output collection"> | 151 <collection name="list_output_mgf" type="list" label="${tool.name} on ${on_string}"> |
139 <discover_datasets pattern="__name__" directory="out" visible="false" format="mgf"/> | 152 <discover_datasets pattern="__name__" directory="out" visible="false" format="mgf"/> |
140 <filter>split_parms['select_ftype'] == "mgf"</filter> | 153 <filter>split_parms['select_ftype'] == "mgf"</filter> |
141 </collection> | 154 </collection> |
142 <collection name="list_output_fasta" type="list" label="${tool.name} on ${on_string}: output collection"> | 155 <collection name="list_output_fasta" type="list" label="${tool.name} on ${on_string}"> |
143 <discover_datasets pattern="__name__" directory="out" visible="false" format="fasta"/> | 156 <discover_datasets pattern="__name__" directory="out" visible="false" format="fasta"/> |
144 <filter>split_parms['select_ftype'] == "fasta"</filter> | 157 <filter>split_parms['select_ftype'] == "fasta"</filter> |
145 </collection> | 158 </collection> |
146 <collection name="list_output_fastq" type="list" label="${tool.name} on ${on_string}: output collection"> | 159 <collection name="list_output_fastq" type="list" label="${tool.name} on ${on_string}"> |
147 <discover_datasets pattern="__name__" directory="out" visible="false" format="fastq"/> | 160 <discover_datasets pattern="__name__" directory="out" visible="false" format="fastq"/> |
148 <filter>split_parms['select_ftype'] == "fastq"</filter> | 161 <filter>split_parms['select_ftype'] == "fastq"</filter> |
162 </collection> | |
163 <collection name="list_output_txt" type="list" label="${tool.name} on ${on_string}"> | |
164 <discover_datasets pattern="__name__" directory="out" visible="false" format="txt"/> | |
165 <filter>split_parms['select_ftype'] == "txt"</filter> | |
166 </collection> | |
167 <collection name="list_output_generic" type="list" label="${tool.name} on ${on_string}"> | |
168 <discover_datasets pattern="__name_and_ext__" directory="out" visible="false"/> | |
169 <filter>split_parms['select_ftype'] == "generic"</filter> | |
149 </collection> | 170 </collection> |
150 </outputs> | 171 </outputs> |
151 <tests> | 172 <tests> |
152 <test> | 173 <test> |
153 <param name="input" value="test.tabular" ftype="tabular"/> | 174 <param name="input" value="test.tabular" ftype="tabular"/> |
255 <output_collection name="list_output_fasta" type="list"> | 276 <output_collection name="list_output_fasta" type="list"> |
256 <element name="fasta_batch_0.fasta" file="fasta_batch_0.fasta" ftype="fasta"/> | 277 <element name="fasta_batch_0.fasta" file="fasta_batch_0.fasta" ftype="fasta"/> |
257 <element name="fasta_batch_1.fasta" file="fasta_batch_1.fasta" ftype="fasta"/> | 278 <element name="fasta_batch_1.fasta" file="fasta_batch_1.fasta" ftype="fasta"/> |
258 </output_collection> | 279 </output_collection> |
259 </test> | 280 </test> |
281 <test> | |
282 <param name="input" value="test.tabular" ftype="txt"/> | |
283 <param name="select_ftype" value="txt"/> | |
284 <param name="numnew" value="2"/> | |
285 <param name="newfilenames" value="test"/> | |
286 <output_collection name="list_output_txt" type="list"> | |
287 <element name="test_0.txt" file="test_0.tabular" ftype="txt" lines_diff="1"/> | |
288 <element name="test_1.txt" file="test_1.tabular" ftype="txt" lines_diff="1"/> | |
289 </output_collection> | |
290 </test> | |
291 <test> | |
292 <param name="input" value="test.tabular" ftype="txt"/> | |
293 <param name="select_ftype" value="generic"/> | |
294 <param name="generic_regex" value="^.*"/> | |
295 <param name="numnew" value="2"/> | |
296 <param name="newfilenames" value="test"/> | |
297 <output_collection name="list_output_generic" type="list"> | |
298 <element name="test_0" file="test_0.tabular" ftype="txt" lines_diff="1"/> | |
299 <element name="test_1" file="test_1.tabular" ftype="txt" lines_diff="1"/> | |
300 </output_collection> | |
301 </test> | |
302 <test> | |
303 <param name="input" value="test.fasta" ftype="fasta"/> | |
304 <param name="select_ftype" value="generic"/> | |
305 <param name="generic_regex" value="^>.*"/> | |
306 <param name="numnew" value="2"/> | |
307 <param name="newfilenames" value="rand"/> | |
308 <param name="allocate" value="random"/> | |
309 <param name="seed" value="1010"/> | |
310 <output_collection name="list_output_generic" type="list"> | |
311 <element name="rand_0" file="rand_0.fasta" ftype="fasta"/> | |
312 <element name="rand_1" file="rand_1.fasta" ftype="fasta"/> | |
313 </output_collection> | |
314 </test> | |
315 <test> | |
316 <param name="input" value="3_molecules.sdf" ftype="sdf"/> | |
317 <param name="select_ftype" value="generic"/> | |
318 <param name="generic_regex" value="^\$\$\$\$.*"/> | |
319 <param name="numnew" value="1000"/> | |
320 <param name="newfilenames" value="mol"/> | |
321 <param name="allocate" value="batch"/> | |
322 <output_collection name="list_output_generic" type="list"> | |
323 <element name="mol_0" file="mol_0.sdf" ftype="sdf"/> | |
324 <element name="mol_1" file="mol_1.sdf" ftype="sdf"/> | |
325 <element name="mol_2" file="mol_2.sdf" ftype="sdf"/> | |
326 </output_collection> | |
327 </test> | |
260 </tests> | 328 </tests> |
261 <help><![CDATA[ | 329 <help><![CDATA[ |
262 **Split file into a dataset collection** | 330 **Split file into a dataset collection** |
263 | 331 |
264 This tool can split five types of files into a separate files within a dataset collection: MGF, FASTA, FASTQ, and tabular. | 332 This tool splits a data set consisting of records into multiple data sets within a collection. |
333 A record can be for instance simply a line, a FASTA sequence (header + sequence), a FASTQ sequence | |
334 (headers + sequence + qualities), etc. The important property is that the beginning of a new record | |
335 can be specified by a regular expression, e.g. ".*" for lines, ">.*" for FASTA, or "@.*" for FASTQ. | |
336 The tool has presets for text, tabular data sets (which are split by line), FASTA, FASTQ, and MGF. | |
337 For other data types the text delimiting records can be specified manually using the generic splitter. | |
338 | |
339 If splitting by line (or by some other item, like a FASTA entry or an MGF record), the splitting can be done either alternating, in original record order, or at random. | |
340 | |
341 If t records are to be distributed to n new data sets, then the i-th record goes to data set | |
342 | |
343 * floor(i / t * n) (for batch), | |
344 * i % n (for alternating), or | |
345 * a random data set | |
346 | |
347 For instance, t=5 records are distributed as follows on n=2 data sets | |
348 | |
349 = === === ==== | |
350 i bat alt rand | |
351 = === === ==== | |
352 0 0 0 0 | |
353 1 0 1 1 | |
354 2 0 0 1 | |
355 3 1 1 0 | |
356 4 1 0 0 | |
357 = === === ==== | |
358 | |
359 If the five records are distributed on n=3 data sets: | |
360 | |
361 = === === ==== | |
362 i bat alt rand | |
363 = === === ==== | |
364 0 0 0 0 | |
365 1 0 1 1 | |
366 2 1 2 2 | |
367 3 1 0 0 | |
368 4 2 1 1 | |
369 = === === ==== | |
370 | |
371 Note that there are no guarantees when splitting at random that every result file will be non-empty, so downstream tools should be able to gracefully handle empty files. | |
372 | |
265 If a tabular file is used as input, you may choose to split by line or by column. If split by column, a new file is created for each unique value in the column. | 373 If a tabular file is used as input, you may choose to split by line or by column. If split by column, a new file is created for each unique value in the column. |
266 In addition, (Python) regular expressions may be used to transform the value in the column to a new value. Caution should be used with this feature, as it could transform all values to the same value, or other unexpected behavior. | 374 In addition, (Python) regular expressions may be used to transform the value in the column to a new value. Caution should be used with this feature, as it could transform all values to the same value, or other unexpected behavior. |
267 The default regular expression uses each value in the column without modifying it. | 375 The default regular expression uses each value in the column without modifying it. |
268 | |
269 If splitting by line (or by some other item, like a FASTA entry or an MGF section), the splitting can be either done sequentially or at random. | |
270 Note that there are no guarantees when splitting at random that every result file will be non-empty, so downstream tools should be able to gracefully handle empty files. | |
271 | |
272 **Note** | |
273 | |
274 Due to current limitations with dataset collections, a log file is produced when running this tool. It will usually be empty, but if the tool fails, any errors will be printed to the log file. | |
275 ]]></help> | 376 ]]></help> |
276 <citations> | 377 <citations> |
277 <citation type="bibtex"> | 378 <citation type="bibtex"> |
278 @misc{githubsplit, | 379 @misc{githubsplit, |
279 author = {Easterly, Caleb}, | 380 author = {Easterly, Caleb}, |