comparison split_file_to_collection.xml @ 0:de3c2c88e710 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 85015046a6d8a9dc0f4b54611986676aceeeadd7
author bgruening
date Tue, 17 Jul 2018 14:37:13 -0400
parents
children 750c1684d47c
comparison
equal deleted inserted replaced
-1:000000000000 0:de3c2c88e710
1 <tool id="split_file_to_collection" name="Split file" version="0.1.1">
2 <description>to dataset collection</description>
3 <macros>
4 <xml name="numnew_fname"> <!-- Shared inputs for record-wise splitting: number of outputs, base name and allocation method -->
5 <param name="numnew" type="integer" label="Number of new files" min="1" value="1"/>
6 <param name="newfilenames" type="text" label="Base name for new files in collection"
7 help="This will increment automatically - if input is 'file', then output is 'file_0', 'file_1', etc., followed by the file type extension (e.g. 'file_0.fasta')." value="split_file"/>
8 <conditional name="select_allocate"> <!-- Only the random method needs an extra parameter (the seed) -->
9 <param name="allocate" type="select" label="Method to allocate records to new files" help="See the information section for a diagram">
10 <option value="random">At random</option>
11 <option value="batch">Maintain record order</option>
12 <option value="byrow" selected="true">Alternate output files</option>
13 </param>
14 <when value="random">
15 <param name="seed" type="integer" label="Random number seed" help="For reproducibility, set this to some arbitrary integer (e.g. '1010')" value="1010"/>
16 </when>
17 <when value="batch"> <!-- no additional parameters -->
18 </when>
19 <when value="byrow"> <!-- no additional parameters -->
20 </when>
21 </conditional>
22 </xml>
23 </macros>
24 <requirements> <!-- Runtime dependency: the command section runs split_file_to_collection.py with python -->
25 <requirement type="package" version="3.5">python</requirement>
26 </requirements>
27 <command detect_errors="aggressive"><![CDATA[
28 mkdir ./out && ## output files are written here; the collections discover their elements from this directory
29 python '$__tool_directory__/split_file_to_collection.py'
30 --out ./out
31 --in '$split_parms.input'
32 --ftype '$split_parms.select_ftype'
33 #if $split_parms.select_ftype == "tabular":
34 --top '$split_parms.top'
35 --by '$split_parms.split_by.select_split_by'
36 #if $split_parms.split_by.select_split_by == "col":
37 --id_column '$split_parms.split_by.id_col'
38 --match '$split_parms.split_by.match_regex'
39 --sub '$split_parms.split_by.sub_regex'
40 #else
41 --numnew '$split_parms.split_by.numnew'
42 #if $split_parms.split_by.select_allocate.allocate == "random":
43 --rand
44 --seed '$split_parms.split_by.select_allocate.seed' ## the seed param is defined inside the select_allocate conditional of the numnew_fname macro
45 #end if
46 #if $split_parms.split_by.select_allocate.allocate == "batch":
47 --batch
48 #end if
49 #end if
50 #else
51 --numnew '$split_parms.numnew'
52 #if $split_parms.select_allocate.allocate == "random":
53 --rand
54 --seed '$split_parms.select_allocate.seed'
55 #end if
56 #if $split_parms.select_allocate.allocate == "batch":
57 --batch
58 #end if
59 #end if
60 #if ($split_parms.select_ftype == "tabular" and $split_parms.split_by.select_split_by == "row"):
61 --file_names '$split_parms.split_by.newfilenames'
62 --file_ext '$split_parms.select_ftype'
63 #end if
64 #if $split_parms.select_ftype != "tabular":
65 --file_names '$split_parms.newfilenames'
66 --file_ext '$split_parms.select_ftype'
67 #end if
68 > '$log' ## standard output of the script goes to the log dataset
69 ]]></command>
70 <inputs>
71 <conditional name="split_parms"> <!-- Top-level switch: the remaining inputs depend on the selected file type -->
72 <param name="select_ftype" type="select" label="Select the file type to split">
73 <option value="mgf">MGF</option>
74 <option value="fastq">FASTQ</option>
75 <option value="tabular">Tabular</option>
76 <option value="fasta">FASTA</option>
77 </param>
78 <when value="tabular"> <!-- Tabular input: header line count plus a choice between splitting by row or by column -->
79 <param name="input" type="data" format="tabular" label="Tabular file to split"/>
80 <param name="top" type="integer" value="0" min="0" label="Number of header lines to transfer to new files"/>
81 <conditional name="split_by">
82 <param name="select_split_by" type="select" label="Split by row or by a column?">
83 <option value="row">By row</option>
84 <option value="col">By column</option>
85 </param>
86 <when value="col"> <!-- Column mode: id column plus a match/substitute regex pair passed to the script as match and sub -->
87 <param name="id_col" type="data_column" label="Column to split on" data_ref="input"/>
88 <param name="match_regex" type="text" label="Regex to match contents of id column" value="(.*)">
89 <sanitizer> <!-- Accept printable characters, but map backslash and single quote to the __backslash__ and __sq__ tokens -->
90 <valid>
91 <add preset="string.printable"/>
92 <remove value="&#92;" />
93 <remove value="&apos;" />
94 </valid>
95 <mapping initial="none">
96 <add source="&#92;" target="__backslash__" />
97 <add source="&apos;" target="__sq__"/>
98 </mapping>
99 </sanitizer>
100 </param>
101 <param name="sub_regex" type="text" label="Pattern to replace match with" value="\1">
102 <sanitizer> <!-- Same sanitizer rules as match_regex -->
103 <valid>
104 <add preset="string.printable"/>
105 <remove value="&#92;" />
106 <remove value="&apos;" />
107 </valid>
108 <mapping initial="none">
109 <add source="&#92;" target="__backslash__" />
110 <add source="&apos;" target="__sq__"/>
111 </mapping>
112 </sanitizer>
113 </param>
114 </when>
115 <when value="row"> <!-- Row mode reuses the shared numnew_fname inputs, nested under split_by -->
116 <expand macro="numnew_fname"/>
117 </when>
118 </conditional>
119 </when>
120 <when value="mgf"> <!-- Non-tabular types: input dataset plus the shared numnew_fname inputs directly under split_parms -->
121 <param name="input" type="data" format="mgf" label="MGF file to split"/>
122 <expand macro="numnew_fname"/>
123 </when>
124 <when value="fastq">
125 <param name="input" type="data" format="fastq" label="FASTQ file to split"/>
126 <expand macro="numnew_fname"/>
127 </when>
128 <when value="fasta">
129 <param name="input" type="data" format="fasta" label="FASTA file to split"/>
130 <expand macro="numnew_fname"/>
131 </when>
132 </conditional>
133 </inputs>
134 <outputs> <!-- One list collection per file type, each filtered on the selected type, plus a log dataset -->
135 <collection name="list_output_tab" type="list" label="${tool.name} on ${on_string}: output collection">
136 <discover_datasets pattern="__name__" directory="out" visible="false" format="tabular"/>
137 <filter>split_parms['select_ftype'] == "tabular"</filter>
138 </collection>
139 <collection name="list_output_mgf" type="list" label="${tool.name} on ${on_string}: output collection">
140 <discover_datasets pattern="__name__" directory="out" visible="false" format="mgf"/>
141 <filter>split_parms['select_ftype'] == "mgf"</filter>
142 </collection>
143 <collection name="list_output_fasta" type="list" label="${tool.name} on ${on_string}: output collection">
144 <discover_datasets pattern="__name__" directory="out" visible="false" format="fasta"/>
145 <filter>split_parms['select_ftype'] == "fasta"</filter>
146 </collection>
147 <collection name="list_output_fastq" type="list" label="${tool.name} on ${on_string}: output collection">
148 <discover_datasets pattern="__name__" directory="out" visible="false" format="fastq"/>
149 <filter>split_parms['select_ftype'] == "fastq"</filter>
150 </collection>
151 <data name="log" format="txt" label="${tool.name} on ${on_string}: log" /> <!-- Receives the redirected standard output of the command -->
152 </outputs>
153 <tests> <!-- NOTE(review): no test covers tabular input split by row with random allocation -->
154 <test> <!-- Tabular, split on column 1 with a match/sub regex pair and 2 header lines; expects three .tab elements -->
155 <param name="input" value="test.tabular" ftype="tabular"/>
156 <param name="select_ftype" value="tabular"/>
157 <param name="select_split_by" value="col"/>
158 <param name="id_col" value="1"/>
159 <param name="match_regex" value="(.*)\.mgf"/>
160 <param name="sub_regex" value="\1.tab"/>
161 <param name="top" value="2"/>
162 <output_collection name="list_output_tab" type="list">
163 <element name="foo.tab" file="foo.tab" ftype="tabular"/>
164 <element name="foo2.tab" file="foo2.tab" ftype="tabular"/>
165 <element name="foo3.tab" file="foo3.tab" ftype="tabular"/>
166 </output_collection>
167 </test>
168 <test> <!-- Tabular, split by row into 2 files with the default allocation method -->
169 <param name="input" value="test.tabular" ftype="tabular"/>
170 <param name="select_ftype" value="tabular"/>
171 <param name="select_split_by" value="row"/>
172 <param name="top" value="2"/>
173 <param name="numnew" value="2"/>
174 <param name="newfilenames" value="test"/>
175 <output_collection name="list_output_tab" type="list">
176 <element name="test_0.tabular" file="test_0.tabular" ftype="tabular"/>
177 <element name="test_1.tabular" file="test_1.tabular" ftype="tabular"/>
178 </output_collection>
179 </test>
180 <test> <!-- Tabular, split by row into 2 files with batch allocation -->
181 <param name="input" value="test.tabular" ftype="tabular"/>
182 <param name="select_ftype" value="tabular"/>
183 <param name="select_split_by" value="row"/>
184 <param name="top" value="2"/>
185 <param name="numnew" value="2"/>
186 <param name="newfilenames" value="batch_tab"/>
187 <param name="allocate" value="batch"/>
188 <output_collection name="list_output_tab" type="list">
189 <element name="batch_tab_0.tabular" file="batch_tab_0.tabular" ftype="tabular"/>
190 <element name="batch_tab_1.tabular" file="batch_tab_1.tabular" ftype="tabular"/>
191 </output_collection>
192 </test>
193 <test> <!-- Tabular, split on column 10 with 1 header line; expects four .tab elements -->
194 <param name="input" value="psm.tabular" ftype="tabular"/>
195 <param name="select_ftype" value="tabular"/>
196 <param name="select_split_by" value="col"/>
197 <param name="id_col" value="10"/>
198 <param name="match_regex" value="(.*)\.mgf"/>
199 <param name="sub_regex" value="\1.tab"/>
200 <param name="top" value="1"/>
201 <output_collection name="list_output_tab" type="list">
202 <element name="file1.tab" file="file1.tab" ftype="tabular"/>
203 <element name="file2.tab" file="file2.tab" ftype="tabular"/>
204 <element name="file3.tab" file="file3.tab" ftype="tabular"/>
205 <element name="file4.tab" file="file4.tab" ftype="tabular"/>
206 </output_collection>
207 </test>
208 <test> <!-- MGF split into 3 files with the default allocation method -->
209 <param name="input" value="demo758Dacentroid.mgf" ftype="mgf"/>
210 <param name="select_ftype" value="mgf"/>
211 <param name="numnew" value="3"/>
212 <param name="newfilenames" value="demo"/>
213 <output_collection name="list_output_mgf" type="list">
214 <element name="demo_0.mgf" file="demo_0.mgf" ftype="mgf"/>
215 <element name="demo_1.mgf" file="demo_1.mgf" ftype="mgf"/>
216 <element name="demo_2.mgf" file="demo_2.mgf" ftype="mgf"/>
217 </output_collection>
218 </test>
219 <test> <!-- FASTA split into 2 files with the default allocation method -->
220 <param name="input" value="test.fasta" ftype="fasta"/>
221 <param name="select_ftype" value="fasta"/>
222 <param name="numnew" value="2"/>
223 <param name="newfilenames" value="test"/>
224 <output_collection name="list_output_fasta" type="list">
225 <element name="test_0.fasta" file="test_0.fasta" ftype="fasta"/>
226 <element name="test_1.fasta" file="test_1.fasta" ftype="fasta"/>
227 </output_collection>
228 </test>
229 <test> <!-- FASTQ split into 2 files with the default allocation method -->
230 <param name="input" value="test.fastq" ftype="fastq"/>
231 <param name="select_ftype" value="fastq"/>
232 <param name="numnew" value="2"/>
233 <param name="newfilenames" value="test"/>
234 <output_collection name="list_output_fastq" type="list">
235 <element name="test_0.fastq" file="test_0.fastq" ftype="fastq"/>
236 <element name="test_1.fastq" file="test_1.fastq" ftype="fastq"/>
237 </output_collection>
238 </test>
239 <test> <!-- FASTA split into 2 files with random allocation and seed 1010 -->
240 <param name="input" value="test.fasta" ftype="fasta"/>
241 <param name="select_ftype" value="fasta"/>
242 <param name="numnew" value="2"/>
243 <param name="newfilenames" value="rand"/>
244 <param name="allocate" value="random"/>
245 <param name="seed" value="1010"/>
246 <output_collection name="list_output_fasta" type="list">
247 <element name="rand_0.fasta" file="rand_0.fasta" ftype="fasta"/>
248 <element name="rand_1.fasta" file="rand_1.fasta" ftype="fasta"/>
249 </output_collection>
250 </test>
251 <test> <!-- FASTA split into 2 files with batch allocation -->
252 <param name="input" value="test.fasta" ftype="fasta"/>
253 <param name="select_ftype" value="fasta"/>
254 <param name="numnew" value="2"/>
255 <param name="newfilenames" value="fasta_batch"/>
256 <param name="allocate" value="batch"/>
257 <output_collection name="list_output_fasta" type="list">
258 <element name="fasta_batch_0.fasta" file="fasta_batch_0.fasta" ftype="fasta"/>
259 <element name="fasta_batch_1.fasta" file="fasta_batch_1.fasta" ftype="fasta"/>
260 </output_collection>
261 </test>
262 </tests>
263 <help><![CDATA[
264 **Split file into a dataset collection**
265
266 This tool can split four types of files into separate files within a dataset collection: MGF, FASTA, FASTQ, and tabular.
267 If a tabular file is used as input, you may choose to split by line or by column. If split by column, a new file is created for each unique value in the column.
268 In addition, (Python) regular expressions may be used to transform the value in the column to a new value. Caution should be used with this feature, as it could transform all values to the same value, or other unexpected behavior.
269 The default regular expression uses each value in the column without modifying it.
270
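271 If splitting by line (or by some other item, like a FASTA entry or an MGF section), records can be allocated to the new files in one of three ways: in batches that maintain the original record order, by alternating between the output files, or at random.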
272 Note that there are no guarantees when splitting at random that every result file will be non-empty, so downstream tools should be able to gracefully handle empty files.
273
274 **Note**
275
276 Due to current limitations with dataset collections, a log file is produced when running this tool. It will usually be empty, but if the tool fails, any errors will be printed to the log file.
277 ]]></help>
278 <citations> <!-- BibTeX entry for the tool itself. NOTE(review): the url has no tree/branch segment and names a different repository than the upload header of this changeset; confirm it resolves -->
279 <citation type="bibtex">
280 @misc{githubsplit,
281 author = {Easterly, Caleb},
282 year = {2018},
283 title = {split_file_to_collection: a Galaxy tool},
284 publisher = {GitHub},
285 journal = {GitHub repository},
286 url = {https://github.com/galaxyproteomics/tools-galaxyp/tools/split_file_to_collection},
287 }</citation>
288 </citations>
289 </tool>