Mercurial > repos > bgruening > split_file_to_collection
comparison split_file_to_collection.xml @ 0:de3c2c88e710 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
author | bgruening |
---|---|
date | Tue, 17 Jul 2018 14:37:13 -0400 |
parents | |
children | 750c1684d47c |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:de3c2c88e710 |
---|---|
1 <tool id="split_file_to_collection" name="Split file" version="0.1.1"> | |
2 <description>to dataset collection</description> | |
<!-- Reusable parameter group "numnew_fname": the number of output files, the base
     name of the output files, and the method used to allocate records to them.
     It is expanded for row-wise tabular splitting and for MGF, FASTQ and FASTA input.
     The seed is only requested when allocation is "random"; the "batch" and "byrow"
     branches take no extra parameters. -->
3 <macros> | |
4 <xml name="numnew_fname"> | |
5 <param name="numnew" type="integer" label="Number of new files" min="1" value="1"/> | |
6 <param name="newfilenames" type="text" label="Base name for new files in collection" | |
7 help="This will increment automatically - if input is 'file', then output is 'file_0', 'file_1', etc., each followed by the file type as extension (for example 'file_0.fasta')." value="split_file"/> | |
8 <conditional name="select_allocate"> | |
9 <param name="allocate" type="select" label="Method to allocate records to new files" help="See the information section for a diagram"> | |
10 <option value="random">At random</option> | |
11 <option value="batch">Maintain record order</option> | |
12 <option value="byrow" selected="true">Alternate output files</option> | |
13 </param> | |
14 <when value="random"> | |
15 <param name="seed" type="integer" label="Random number seed" help="For reproducibility, set this to some arbitrary integer (e.g. '1010')" value="1010"/> | |
16 </when> | |
17 <when value="batch"> | |
18 </when> | |
19 <when value="byrow"> | |
20 </when> | |
21 </conditional> | |
22 </xml> | |
23 </macros> | |
<!-- Only declared dependency: Python 3.5, used by the command section to run
     split_file_to_collection.py. -->
24 <requirements> | |
25 <requirement type="package" version="3.5">python</requirement> | |
26 </requirements> | |
<!-- Cheetah command template. It creates ./out, runs split_file_to_collection.py on the
     selected input and redirects standard output to the "log" dataset.
     Parameter paths mirror the nesting in the inputs section:
       * tabular, split by column: id column plus match and substitution regexes;
       * tabular, split by row:    values live under split_parms.split_by (expanded macro);
       * mgf / fastq / fasta:      values live directly under split_parms (expanded macro).
     The random seed is read from the "select_allocate" conditional in both cases; the
     tabular branch previously referenced a non-existent "rand" element, which made
     row-wise random splitting of tabular files fail when the template was evaluated. -->
27 <command detect_errors="aggressive"><![CDATA[ | |
28 mkdir ./out && | |
29 python '$__tool_directory__/split_file_to_collection.py' | |
30 --out ./out | |
31 --in '$split_parms.input' | |
32 --ftype '$split_parms.select_ftype' | |
33 #if $split_parms.select_ftype == "tabular": | |
34 --top '$split_parms.top' | |
35 --by '$split_parms.split_by.select_split_by' | |
36 #if $split_parms.split_by.select_split_by == "col": | |
37 --id_column '$split_parms.split_by.id_col' | |
38 --match '$split_parms.split_by.match_regex' | |
39 --sub '$split_parms.split_by.sub_regex' | |
40 #else | |
41 --numnew '$split_parms.split_by.numnew' | |
42 #if $split_parms.split_by.select_allocate.allocate == "random": | |
43 --rand | |
44 --seed '$split_parms.split_by.select_allocate.seed' | |
45 #end if | |
46 #if $split_parms.split_by.select_allocate.allocate == "batch": | |
47 --batch | |
48 #end if | |
49 #end if | |
50 #else | |
51 --numnew '$split_parms.numnew' | |
52 #if $split_parms.select_allocate.allocate == "random": | |
53 --rand | |
54 --seed '$split_parms.select_allocate.seed' | |
55 #end if | |
56 #if $split_parms.select_allocate.allocate == "batch": | |
57 --batch | |
58 #end if | |
59 #end if | |
60 #if ($split_parms.select_ftype == "tabular" and $split_parms.split_by.select_split_by == "row"): | |
61 --file_names '$split_parms.split_by.newfilenames' | |
62 --file_ext '$split_parms.select_ftype' | |
63 #end if | |
64 #if $split_parms.select_ftype != "tabular": | |
65 --file_names '$split_parms.newfilenames' | |
66 --file_ext '$split_parms.select_ftype' | |
67 #end if | |
68 > '$log' | |
69 ]]></command> | |
<!-- All inputs hang off one conditional, "split_parms", keyed on the file type.
     Every branch defines its own "input" dataset restricted to the matching format. -->
70 <inputs> | |
71 <conditional name="split_parms"> | |
72 <param name="select_ftype" type="select" label="Select the file type to split"> | |
73 <option value="mgf">MGF</option> | |
74 <option value="fastq">FASTQ</option> | |
75 <option value="tabular">Tabular</option> | |
76 <option value="fasta">FASTA</option> | |
77 </param> | |
<!-- Tabular input: header line count plus a nested choice between splitting by row
     and splitting by the values of one column. -->
78 <when value="tabular"> | |
79 <param name="input" type="data" format="tabular" label="Tabular file to split"/> | |
80 <param name="top" type="integer" value="0" min="0" label="Number of header lines to transfer to new files"/> | |
81 <conditional name="split_by"> | |
82 <param name="select_split_by" type="select" label="Split by row or by a column?"> | |
83 <option value="row">By row</option> | |
84 <option value="col">By column</option> | |
85 </param> | |
<!-- Split by column: the column to split on and a match/replace regex pair.
     Both regex sanitizers accept printable characters but map backslash and single
     quote to the tokens __backslash__ and __sq__ instead of passing them literally.
     NOTE(review): presumably the Python script converts these tokens back; that
     script is not visible here, so confirm. -->
86 <when value="col"> | |
87 <param name="id_col" type="data_column" label="Column to split on" data_ref="input"/> | |
88 <param name="match_regex" type="text" label="Regex to match contents of id column" value="(.*)"> | |
89 <sanitizer> | |
90 <valid> | |
91 <add preset="string.printable"/> | |
92 <remove value="\" /> | |
93 <remove value="'" /> | |
94 </valid> | |
95 <mapping initial="none"> | |
96 <add source="\" target="__backslash__" /> | |
97 <add source="'" target="__sq__"/> | |
98 </mapping> | |
99 </sanitizer> | |
100 </param> | |
101 <param name="sub_regex" type="text" label="Pattern to replace match with" value="\1"> | |
102 <sanitizer> | |
103 <valid> | |
104 <add preset="string.printable"/> | |
105 <remove value="\" /> | |
106 <remove value="'" /> | |
107 </valid> | |
108 <mapping initial="none"> | |
109 <add source="\" target="__backslash__" /> | |
110 <add source="'" target="__sq__"/> | |
111 </mapping> | |
112 </sanitizer> | |
113 </param> | |
114 </when> | |
<!-- Split by row: reuses the shared "numnew_fname" macro, so its parameters are
     addressed as split_parms.split_by.* in the command template. -->
115 <when value="row"> | |
116 <expand macro="numnew_fname"/> | |
117 </when> | |
118 </conditional> | |
119 </when> | |
<!-- MGF, FASTQ and FASTA: an input dataset plus the shared "numnew_fname" macro,
     whose parameters are addressed directly as split_parms.* in the command template. -->
120 <when value="mgf"> | |
121 <param name="input" type="data" format="mgf" label="MGF file to split"/> | |
122 <expand macro="numnew_fname"/> | |
123 </when> | |
124 <when value="fastq"> | |
125 <param name="input" type="data" format="fastq" label="FASTQ file to split"/> | |
126 <expand macro="numnew_fname"/> | |
127 </when> | |
128 <when value="fasta"> | |
129 <param name="input" type="data" format="fasta" label="FASTA file to split"/> | |
130 <expand macro="numnew_fname"/> | |
131 </when> | |
132 </conditional> | |
133 </inputs> | |
<!-- One list collection is declared per file type; each filter keeps only the one
     matching the selected "select_ftype". Every collection discovers its elements
     from the "out" directory, using the file name as the element name, and differs
     from the others only in the assigned format.
     The "log" dataset receives the standard output of the command. -->
134 <outputs> | |
135 <collection name="list_output_tab" type="list" label="${tool.name} on ${on_string}: output collection"> | |
136 <discover_datasets pattern="__name__" directory="out" visible="false" format="tabular"/> | |
137 <filter>split_parms['select_ftype'] == "tabular"</filter> | |
138 </collection> | |
139 <collection name="list_output_mgf" type="list" label="${tool.name} on ${on_string}: output collection"> | |
140 <discover_datasets pattern="__name__" directory="out" visible="false" format="mgf"/> | |
141 <filter>split_parms['select_ftype'] == "mgf"</filter> | |
142 </collection> | |
143 <collection name="list_output_fasta" type="list" label="${tool.name} on ${on_string}: output collection"> | |
144 <discover_datasets pattern="__name__" directory="out" visible="false" format="fasta"/> | |
145 <filter>split_parms['select_ftype'] == "fasta"</filter> | |
146 </collection> | |
147 <collection name="list_output_fastq" type="list" label="${tool.name} on ${on_string}: output collection"> | |
148 <discover_datasets pattern="__name__" directory="out" visible="false" format="fastq"/> | |
149 <filter>split_parms['select_ftype'] == "fastq"</filter> | |
150 </collection> | |
151 <data name="log" format="txt" label="${tool.name} on ${on_string}: log" /> | |
152 </outputs> | |
<!-- Functional tests. Each test compares the elements of the produced collection
     with reference files of the same name.
     NOTE(review): random allocation is only exercised for FASTA input; no test
     covers random allocation when splitting a tabular file by row. -->
153 <tests> | |
<!-- Tabular, split on column 1, value rewritten from "(.*)\.mgf" to "\1.tab", 2 header lines. -->
154 <test> | |
155 <param name="input" value="test.tabular" ftype="tabular"/> | |
156 <param name="select_ftype" value="tabular"/> | |
157 <param name="select_split_by" value="col"/> | |
158 <param name="id_col" value="1"/> | |
159 <param name="match_regex" value="(.*)\.mgf"/> | |
160 <param name="sub_regex" value="\1.tab"/> | |
161 <param name="top" value="2"/> | |
162 <output_collection name="list_output_tab" type="list"> | |
163 <element name="foo.tab" file="foo.tab" ftype="tabular"/> | |
164 <element name="foo2.tab" file="foo2.tab" ftype="tabular"/> | |
165 <element name="foo3.tab" file="foo3.tab" ftype="tabular"/> | |
166 </output_collection> | |
167 </test> | |
<!-- Tabular, split by row into 2 files with the allocation method left at its default, 2 header lines. -->
168 <test> | |
169 <param name="input" value="test.tabular" ftype="tabular"/> | |
170 <param name="select_ftype" value="tabular"/> | |
171 <param name="select_split_by" value="row"/> | |
172 <param name="top" value="2"/> | |
173 <param name="numnew" value="2"/> | |
174 <param name="newfilenames" value="test"/> | |
175 <output_collection name="list_output_tab" type="list"> | |
176 <element name="test_0.tabular" file="test_0.tabular" ftype="tabular"/> | |
177 <element name="test_1.tabular" file="test_1.tabular" ftype="tabular"/> | |
178 </output_collection> | |
179 </test> | |
<!-- Tabular, split by row into 2 files using "batch" allocation, 2 header lines. -->
180 <test> | |
181 <param name="input" value="test.tabular" ftype="tabular"/> | |
182 <param name="select_ftype" value="tabular"/> | |
183 <param name="select_split_by" value="row"/> | |
184 <param name="top" value="2"/> | |
185 <param name="numnew" value="2"/> | |
186 <param name="newfilenames" value="batch_tab"/> | |
187 <param name="allocate" value="batch"/> | |
188 <output_collection name="list_output_tab" type="list"> | |
189 <element name="batch_tab_0.tabular" file="batch_tab_0.tabular" ftype="tabular"/> | |
190 <element name="batch_tab_1.tabular" file="batch_tab_1.tabular" ftype="tabular"/> | |
191 </output_collection> | |
192 </test> | |
<!-- Tabular (psm.tabular), split on column 10 with the same regex rewrite, 1 header line. -->
193 <test> | |
194 <param name="input" value="psm.tabular" ftype="tabular"/> | |
195 <param name="select_ftype" value="tabular"/> | |
196 <param name="select_split_by" value="col"/> | |
197 <param name="id_col" value="10"/> | |
198 <param name="match_regex" value="(.*)\.mgf"/> | |
199 <param name="sub_regex" value="\1.tab"/> | |
200 <param name="top" value="1"/> | |
201 <output_collection name="list_output_tab" type="list"> | |
202 <element name="file1.tab" file="file1.tab" ftype="tabular"/> | |
203 <element name="file2.tab" file="file2.tab" ftype="tabular"/> | |
204 <element name="file3.tab" file="file3.tab" ftype="tabular"/> | |
205 <element name="file4.tab" file="file4.tab" ftype="tabular"/> | |
206 </output_collection> | |
207 </test> | |
<!-- MGF, split into 3 files with the allocation method left at its default. -->
208 <test> | |
209 <param name="input" value="demo758Dacentroid.mgf" ftype="mgf"/> | |
210 <param name="select_ftype" value="mgf"/> | |
211 <param name="numnew" value="3"/> | |
212 <param name="newfilenames" value="demo"/> | |
213 <output_collection name="list_output_mgf" type="list"> | |
214 <element name="demo_0.mgf" file="demo_0.mgf" ftype="mgf"/> | |
215 <element name="demo_1.mgf" file="demo_1.mgf" ftype="mgf"/> | |
216 <element name="demo_2.mgf" file="demo_2.mgf" ftype="mgf"/> | |
217 </output_collection> | |
218 </test> | |
<!-- FASTA, split into 2 files with the allocation method left at its default. -->
219 <test> | |
220 <param name="input" value="test.fasta" ftype="fasta"/> | |
221 <param name="select_ftype" value="fasta"/> | |
222 <param name="numnew" value="2"/> | |
223 <param name="newfilenames" value="test"/> | |
224 <output_collection name="list_output_fasta" type="list"> | |
225 <element name="test_0.fasta" file="test_0.fasta" ftype="fasta"/> | |
226 <element name="test_1.fasta" file="test_1.fasta" ftype="fasta"/> | |
227 </output_collection> | |
228 </test> | |
<!-- FASTQ, split into 2 files with the allocation method left at its default. -->
229 <test> | |
230 <param name="input" value="test.fastq" ftype="fastq"/> | |
231 <param name="select_ftype" value="fastq"/> | |
232 <param name="numnew" value="2"/> | |
233 <param name="newfilenames" value="test"/> | |
234 <output_collection name="list_output_fastq" type="list"> | |
235 <element name="test_0.fastq" file="test_0.fastq" ftype="fastq"/> | |
236 <element name="test_1.fastq" file="test_1.fastq" ftype="fastq"/> | |
237 </output_collection> | |
238 </test> | |
<!-- FASTA, split into 2 files using "random" allocation with seed 1010. -->
239 <test> | |
240 <param name="input" value="test.fasta" ftype="fasta"/> | |
241 <param name="select_ftype" value="fasta"/> | |
242 <param name="numnew" value="2"/> | |
243 <param name="newfilenames" value="rand"/> | |
244 <param name="allocate" value="random"/> | |
245 <param name="seed" value="1010"/> | |
246 <output_collection name="list_output_fasta" type="list"> | |
247 <element name="rand_0.fasta" file="rand_0.fasta" ftype="fasta"/> | |
248 <element name="rand_1.fasta" file="rand_1.fasta" ftype="fasta"/> | |
249 </output_collection> | |
250 </test> | |
<!-- FASTA, split into 2 files using "batch" allocation. -->
251 <test> | |
252 <param name="input" value="test.fasta" ftype="fasta"/> | |
253 <param name="select_ftype" value="fasta"/> | |
254 <param name="numnew" value="2"/> | |
255 <param name="newfilenames" value="fasta_batch"/> | |
256 <param name="allocate" value="batch"/> | |
257 <output_collection name="list_output_fasta" type="list"> | |
258 <element name="fasta_batch_0.fasta" file="fasta_batch_0.fasta" ftype="fasta"/> | |
259 <element name="fasta_batch_1.fasta" file="fasta_batch_1.fasta" ftype="fasta"/> | |
260 </output_collection> | |
261 </test> | |
262 </tests> | |
263 <help><![CDATA[ | |
264 **Split file into a dataset collection** | |
265 | |
266 This tool can split four types of files into separate files within a dataset collection: MGF, FASTA, FASTQ, and tabular. | |
267 If a tabular file is used as input, you may choose to split by line or by column. If split by column, a new file is created for each unique value in the column. | |
268 In addition, (Python) regular expressions may be used to transform the value in the column to a new value. Caution should be used with this feature, as it could transform all values to the same value, or other unexpected behavior. | |
269 The default regular expression uses each value in the column without modifying it. | |
270 | |
271 If splitting by line (or by some other item, like a FASTA entry or an MGF section), the splitting can be done either sequentially or at random. | |
272 Note that there are no guarantees when splitting at random that every result file will be non-empty, so downstream tools should be able to gracefully handle empty files. | |
273 | |
274 **Note** | |
275 | |
276 Due to current limitations with dataset collections, a log file is produced when running this tool. It will usually be empty, but if the tool fails, any errors will be printed to the log file. | |
277 ]]></help> | |
<!-- BibTeX entry crediting the tool.
     NOTE(review): the URL below points at galaxyproteomics/tools-galaxyp, while the
     upload header of this changeset names bgruening/galaxytools as the source
     repository; confirm which location should be cited. -->
278 <citations> | |
279 <citation type="bibtex"> | |
280 @misc{githubsplit, | |
281 author = {Easterly, Caleb}, | |
282 year = {2018}, | |
283 title = {split_file_to_collection: a Galaxy tool}, | |
284 publisher = {GitHub}, | |
285 journal = {GitHub repository}, | |
286 url = {https://github.com/galaxyproteomics/tools-galaxyp/tools/split_file_to_collection}, | |
287 }</citation> | |
288 </citations> | |
289 </tool> |