comparison ncbi_acc_download.xml @ 0:1c58de56d587 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_acc_download commit 6747338e8e02cb87c4f3b9cdea0b761f236a02d1"
author iuc
date Wed, 04 Dec 2019 07:01:37 -0500
parents
children e063168e0a81
comparison
equal deleted inserted replaced
-1:000000000000 0:1c58de56d587
1 <tool id="ncbi_acc_download" name="NCBI Accession Download" version="@TOOL_VERSION@+galaxy0">
2 <description>Download sequences from GenBank/RefSeq by accession through the NCBI ENTREZ API</description>
3 <macros> <!-- presumably macros.xml defines the @TOOL_VERSION@ token used in this file; it is not visible here, confirm there -->
4 <import>macros.xml</import>
5 </macros>
6 <requirements> <!-- the ncbi-acc-download package is requested at the same @TOOL_VERSION@ that forms the tool version -->
7 <requirement type="package" version="@TOOL_VERSION@">ncbi-acc-download</requirement>
8 </requirements>
9 <command detect_errors="exit_code"><![CDATA[
10 #if $query_source.select == "accession_file":
11 { grep -v "^[[:space:]]*$" '$query_source.accession_file' > accessions || ## keep non-blank lines only; grep exits non-zero when no line is left
12 { echo "No accession numbers in input. Aborting." 1>&2; exit 1; } } &&
13 #else if $query_source.select == "accession_list":
14 echo '$query_source.accession_list' | sed -r 's/(\,|__cn__)/\n/g' | grep -v "^[[:space:]]*$" > accessions && ## split on commas and on the __cn__ token, one accession per line
15 #end if
16 mkdir outdir && ## downloads are written here; the outputs section discovers files in outdir
17 cd outdir &&
18 ignore_errors=$ignore_failed && ## 0 = abort on the first failed download, 1 = record the accession and continue
19 while read -r accession; do
20 echo "Downloading accession number: " "\$accession" " ..." >> ../error.log &&
21 ncbi-acc-download
22 --molecule '${molecule.select}'
23 --format '${molecule.format}'
24 #if $molecule.format != 'featuretable' and $molecule.format != 'gff3':
25 --extended-validation all
26 #end if
27 "\${accession}";
28 failure=\$?;
29 if [ \$failure -ne 0 ]; then
30 echo " failed." >> ../error.log;
31 if [ \$ignore_errors -ne 0 ]; then
32 echo "\$accession" >> ../failed.txt;
33 else
34 exit 1;
35 fi;
36 else
37 echo " done." >> ../error.log;
38 fi;
39 sleep 2; ## fixed pause between consecutive requests
40 done < ../accessions 2> >(tee -a ../error.log >&2); ## stderr of the whole loop is also appended to the log
41 ]]></command>
42 <inputs>
43 <conditional name="query_source"> <!-- where the accession numbers come from: a dataset or text typed into the form -->
44 <param name="select" type="select" label="Select source for IDs">
45 <option value="accession_file">File containing Accessions (one per line)</option>
46 <option value="accession_list">Direct Entry</option>
47 </param>
48 <when value="accession_file">
49 <param label="Accession File" name="accession_file" type="data" format="txt,tabular"/>
50 </when>
51 <when value="accession_list">
52 <param label="ID List" name="accession_list" type="text" area="true" help="Newline/Comma separated list of IDs">
53 <validator type="expression" message="ID list cannot be empty">value.strip()</validator> <!-- fails when the text is empty or whitespace only -->
54 </param>
55 </when>
56 </conditional>
57 <conditional name="molecule"> <!-- the file formats on offer depend on the selected molecule type -->
58 <param name="select" type="select" label="Molecule Type">
59 <option value="nucleotide" selected="true">Nucleotide</option>
60 <option value="protein">Protein</option>
61 </param>
62 <when value="nucleotide">
63 <param name="format" type="select" label="File Format">
64 <option value="fasta" selected="true">FASTA</option>
65 <option value="genbank">GenBank</option>
66 <option value="featuretable">Feature Table</option>
67 <option value="gff3">GFF3</option>
68 </param>
69 </when>
70 <when value="protein"> <!-- protein records are offered as FASTA only -->
71 <param name="format" type="select" label="File Format">
72 <option value="fasta" selected="true">FASTA</option>
73 </param>
74 </when>
75 </conditional>
76 <param name="ignore_failed" type="select" display="radio"
77 label="How to handle download failures"> <!-- the selected value (0 or 1) is assigned to the shell variable ignore_errors in the command block -->
78 <option value="0">Abort with error on first failure</option>
79 <option value="1">Add accession to failed list and continue</option>
80 </param>
81 </inputs>
82 <outputs>
83 <collection name="output" type="list" label="${tool.name} on ${on_string}: Downloaded Files"> <!-- files in outdir are matched by extension; the part before the extension becomes the element name -->
84 <discover_datasets pattern="(?P&lt;name&gt;.+)\.fa$" directory="outdir" format="fasta"/>
85 <discover_datasets pattern="(?P&lt;name&gt;.+)\.gbk$" directory="outdir" format="genbank"/>
86 <discover_datasets pattern="(?P&lt;name&gt;.+)\.gff$" directory="outdir" format="gff"/> <!-- NOTE(review): the matching input option is gff3 but the datatype assigned here is gff; confirm this is intended (the tests assert ftype gff) -->
87 <discover_datasets pattern="(?P&lt;name&gt;.+)\.ft$" directory="outdir" format="txt"/>
88 </collection>
89 <data name="error_log" from_work_dir="error.log" label="${tool.name} on ${on_string}: Log" format="txt"/> <!-- progress messages and stderr written by the command block -->
90 <data name="failed_accessions" from_work_dir="failed.txt" label="${tool.name} on ${on_string}: Failed accessions" format="txt">
91 <filter>str(ignore_failed)=='1'</filter> <!-- only created when the user chose to continue after failed downloads -->
92 </data>
93 </outputs>
94 <tests>
95 <test> <!-- nucleotide FASTA, accessions read from accessions_1.tsv; expects elements CP011064 and CP021680 -->
96 <conditional name="molecule">
97 <param name="select" value="nucleotide"/>
98 <param name="format" value="fasta"/>
99 </conditional>
100 <conditional name="query_source">
101 <param name="select" value="accession_file" />
102 <param name="accession_file" value="accessions_1.tsv"/>
103 </conditional>
104 <output_collection name="output" type="list">
105 <element name="CP011064" ftype="fasta">
106 <assert_contents>
107 <has_line line=">CP011064.1 Escherichia coli str. Sanji plasmid pSJ_94, complete sequence" />
108 </assert_contents>
109 </element>
110 <element name="CP021680" ftype="fasta">
111 <assert_contents>
112 <has_line line=">CP021680.1 Escherichia coli strain AR_0162 plasmid tig00002623, complete sequence" />
113 </assert_contents>
114 </element>
115 </output_collection>
116 </test>
117 <test> <!-- nucleotide GenBank format, accessions read from accessions_1.tsv; checks the DEFINITION line of each record -->
118 <conditional name="molecule">
119 <param name="select" value="nucleotide"/>
120 <param name="format" value="genbank"/>
121 </conditional>
122 <conditional name="query_source">
123 <param name="select" value="accession_file" />
124 <param name="accession_file" value="accessions_1.tsv"/>
125 </conditional>
126 <output_collection name="output" type="list">
127 <element name="CP011064" ftype="genbank">
128 <assert_contents>
129 <has_line line="DEFINITION Escherichia coli str. Sanji plasmid pSJ_94, complete sequence." />
130 </assert_contents>
131 </element>
132 <element name="CP021680" ftype="genbank">
133 <assert_contents>
134 <has_line line="DEFINITION Escherichia coli strain AR_0162 plasmid tig00002623, complete" />
135 </assert_contents>
136 </element>
137 </output_collection>
138 </test>
139 <test> <!-- nucleotide GFF3 format, accessions read from accessions_1.tsv; checks the sequence-region header of each file -->
140 <conditional name="molecule">
141 <param name="select" value="nucleotide"/>
142 <param name="format" value="gff3"/>
143 </conditional>
144 <conditional name="query_source">
145 <param name="select" value="accession_file" />
146 <param name="accession_file" value="accessions_1.tsv"/>
147 </conditional>
148 <output_collection name="output" type="list">
149 <element name="CP011064" ftype="gff">
150 <assert_contents>
151 <has_line line="##sequence-region CP011064.1 1 94712" />
152 </assert_contents>
153 </element>
154 <element name="CP021680" ftype="gff">
155 <assert_contents>
156 <has_line line="##sequence-region CP021680.1 1 23332" />
157 </assert_contents>
158 </element>
159 </output_collection>
160 </test>
161 <test> <!-- nucleotide feature table format, accessions read from accessions_1.tsv; checks the Feature header line of each file -->
162 <conditional name="molecule">
163 <param name="select" value="nucleotide"/>
164 <param name="format" value="featuretable"/>
165 </conditional>
166 <conditional name="query_source">
167 <param name="select" value="accession_file" />
168 <param name="accession_file" value="accessions_1.tsv"/>
169 </conditional>
170 <output_collection name="output" type="list">
171 <element name="CP011064" ftype="txt">
172 <assert_contents>
173 <has_line line=">Feature gb|CP011064.1|" />
174 </assert_contents>
175 </element>
176 <element name="CP021680" ftype="txt">
177 <assert_contents>
178 <has_line line=">Feature gb|CP021680.1|" />
179 </assert_contents>
180 </element>
181 </output_collection>
182 </test>
183 <test> <!-- nucleotide FASTA, two accessions entered directly as a comma separated list -->
184 <conditional name="molecule">
185 <param name="select" value="nucleotide"/>
186 <param name="format" value="fasta"/>
187 </conditional>
188 <conditional name="query_source">
189 <param name="select" value="accession_list" />
190 <param name="accession_list" value="CP011064,CP021680"/>
191 </conditional>
192 <output_collection name="output" type="list">
193 <element name="CP011064" ftype="fasta">
194 <assert_contents>
195 <has_line line=">CP011064.1 Escherichia coli str. Sanji plasmid pSJ_94, complete sequence" />
196 </assert_contents>
197 </element>
198 <element name="CP021680" ftype="fasta">
199 <assert_contents>
200 <has_line line=">CP021680.1 Escherichia coli strain AR_0162 plasmid tig00002623, complete sequence" />
201 </assert_contents>
202 </element>
203 </output_collection>
204 </test>
205 <test> <!-- ignore_failed=1 with one bad accession (CP0XXXXX) between two good ones: both good records are expected and the bad one must appear in the failed list -->
206 <conditional name="molecule">
207 <param name="select" value="nucleotide"/>
208 <param name="format" value="fasta"/>
209 </conditional>
210 <conditional name="query_source">
211 <param name="select" value="accession_list" />
212 <param name="accession_list" value="CP011064,CP0XXXXX,CP021680"/>
213 </conditional>
214 <param name="ignore_failed" value="1" />
215 <output_collection name="output" type="list">
216 <element name="CP011064" ftype="fasta">
217 <assert_contents>
218 <has_line line=">CP011064.1 Escherichia coli str. Sanji plasmid pSJ_94, complete sequence" />
219 </assert_contents>
220 </element>
221 <element name="CP021680" ftype="fasta">
222 <assert_contents>
223 <has_line line=">CP021680.1 Escherichia coli strain AR_0162 plasmid tig00002623, complete sequence" />
224 </assert_contents>
225 </element>
226 </output_collection>
227 <output name="failed_accessions">
228 <assert_contents>
229 <has_line line="CP0XXXXX" />
230 </assert_contents>
231 </output>
232 </test>
233 <test> <!-- ignore_failed=1 with a single bad accession: the job is expected to succeed and list CP0XXXXX as failed -->
234 <conditional name="molecule">
235 <param name="select" value="nucleotide"/>
236 <param name="format" value="fasta"/>
237 </conditional>
238 <conditional name="query_source">
239 <param name="select" value="accession_list" />
240 <param name="accession_list" value="CP0XXXXX"/>
241 </conditional>
242 <param name="ignore_failed" value="1" />
243 <output name="failed_accessions">
244 <assert_contents>
245 <has_line line="CP0XXXXX" />
246 </assert_contents>
247 </output>
248 </test>
249 <test expect_failure="true"> <!-- ignore_failed=0 with a bad accession in the list: the job itself is expected to fail -->
250 <conditional name="molecule">
251 <param name="select" value="nucleotide"/>
252 <param name="format" value="fasta"/>
253 </conditional>
254 <conditional name="query_source">
255 <param name="select" value="accession_list" />
256 <param name="accession_list" value="CP011064,CP0XXXXX,CP021680"/>
257 </conditional>
258 <param name="ignore_failed" value="0" />
259 </test>
260 <test> <!-- nucleotide FASTA, two accessions entered directly and separated by a newline (&#10;) instead of a comma -->
261 <conditional name="molecule">
262 <param name="select" value="nucleotide"/>
263 <param name="format" value="fasta"/>
264 </conditional>
265 <conditional name="query_source">
266 <param name="select" value="accession_list" />
267 <param name="accession_list" value="CP011064&#10;CP021680"/>
268 </conditional>
269 <output_collection name="output" type="list">
270 <element name="CP011064" ftype="fasta">
271 <assert_contents>
272 <has_line line=">CP011064.1 Escherichia coli str. Sanji plasmid pSJ_94, complete sequence" />
273 </assert_contents>
274 </element>
275 <element name="CP021680" ftype="fasta">
276 <assert_contents>
277 <has_line line=">CP021680.1 Escherichia coli strain AR_0162 plasmid tig00002623, complete sequence" />
278 </assert_contents>
279 </element>
280 </output_collection>
281 </test>
282 <test> <!-- protein FASTA for a single accession (NP_003192) entered directly -->
283 <conditional name="molecule">
284 <param name="select" value="protein"/>
285 <param name="format" value="fasta"/>
286 </conditional>
287 <conditional name="query_source">
288 <param name="select" value="accession_list" />
289 <param name="accession_list" value="NP_003192"/>
290 </conditional>
291 <output_collection name="output" type="list">
292 <element name="NP_003192" ftype="fasta">
293 <assert_contents>
294 <has_line line=">NP_003192.1 transcription factor A, mitochondrial isoform 1 precursor [Homo sapiens]" />
295 </assert_contents>
296 </element>
297 </output_collection>
298 </test>
299 </tests>
300 <help><![CDATA[
301 **What it does**

302 Given a file containing a list of NCBI accession numbers or a direct entry of accession numbers in the tool text input box, this tool will download the corresponding sequence records via the NCBI API.
303
304 **Limitations**

305 - For protein sequence downloads, only FASTA format is supported.
306 - To avoid rate limits imposed by the NCBI API, records are downloaded sequentially with a delay between requests. This may make it impractical to use this tool to download many (>100) records.
307
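308 **Output**

309 A collection of sequence records in the desired format, a log of the download attempts and, when download failures are set to be ignored, a list of the accessions that could not be downloaded.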
310 ]]></help>
311 <citations> <!-- NOTE(review): no citation is declared; add one here if a citable reference for ncbi-acc-download exists -->
312 </citations>
313 </tool>