Mercurial > repos > iuc > ncbi_acc_download
comparison ncbi_acc_download.xml @ 0:1c58de56d587 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ncbi_acc_download commit 6747338e8e02cb87c4f3b9cdea0b761f236a02d1"
author | iuc |
---|---|
date | Wed, 04 Dec 2019 07:01:37 -0500 |
parents | |
children | e063168e0a81 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:1c58de56d587 |
---|---|
1 <tool id="ncbi_acc_download" name="NCBI Accession Download" version="@TOOL_VERSION@+galaxy0"> | |
2 <description>Download sequences from GenBank/RefSeq by accession through the NCBI ENTREZ API</description> | |
3 <macros> | |
4 <import>macros.xml</import> | |
5 </macros> | |
6 <requirements> | |
7 <requirement type="package" version="@TOOL_VERSION@">ncbi-acc-download</requirement> | |
8 </requirements> | |
<!-- Command template (Cheetah directives are expanded before the shell runs it).
     Step 1 writes the cleaned accession list to ./accessions, taken either from the
     selected dataset or from the text box. Step 2 runs ncbi-acc-download once per
     accession inside outdir/, appending progress messages and the loop's stderr to
     error.log. On a failed download the job either exits with status 1 or, when
     ignore_failed is 1, appends the accession to failed.txt and continues. -->
9 <command detect_errors="exit_code"><![CDATA[ | |
## Blank and whitespace-only lines are dropped with [[:space:]]: inside a grep bracket
## expression "\t" means "backslash or t", so the former "[ \t]" class never matched tabs.
## The dataset path is single-quoted so unusual characters in it cannot split the argument.
10 #if $query_source.select == "accession_file": | |
11 { grep -v "^[[:space:]]*$" '$query_source.accession_file' > accessions || | |
12 { echo "No accession numbers in input. Aborting." 1>&2; exit 1; } } && | |
13 #else if $query_source.select == "accession_list": | |
## Commas and the __cn__ token (presumably Galaxy's encoding of a newline in text input;
## confirm) are turned into line breaks. If nothing is left after filtering, abort with the
## same message as the file branch instead of stopping silently on grep's exit status.
14 { echo '$query_source.accession_list' | sed -r 's/(\,|__cn__)/\n/g' | grep -v "^[[:space:]]*$" > accessions || { echo "No accession numbers in input. Aborting." 1>&2; exit 1; } } && | |
15 #end if | |
16 mkdir outdir && | |
17 cd outdir && | |
18 ignore_errors=$ignore_failed && | |
## read -r keeps backslashes in a line literal.
19 while read -r accession; do | |
20 echo "Downloading accession number: " "\$accession" " ..." >> ../error.log && | |
21 ncbi-acc-download | |
22 --molecule '${molecule.select}' | |
23 --format '${molecule.format}' | |
## Extended validation is requested for every format except featuretable and gff3.
24 #if $molecule.format != 'featuretable' and $molecule.format != 'gff3': | |
25 --extended-validation all | |
26 #end if | |
## NOTE(review): the accession is deliberately left unquoted here, as in the original, so a
## line holding several whitespace-separated IDs is still passed as several arguments.
27 \${accession}; | |
28 failure=\$?; | |
29 if [ \$failure -ne 0 ]; then | |
30 echo " failed." >> ../error.log; | |
31 if [ \$ignore_errors -ne 0 ]; then | |
32 echo "\$accession" >> ../failed.txt; | |
33 else | |
34 exit 1; | |
35 fi; | |
36 else | |
37 echo " done." >> ../error.log; | |
38 fi; | |
## Fixed pause between consecutive requests.
39 sleep 2; | |
## stderr of the whole loop is copied into error.log and still forwarded to stderr.
40 done < ../accessions 2> >(tee -a ../error.log >&2); | |
41 ]]></command> | |
<!-- Tool form: where the accessions come from, which molecule type and file format
     to request, and what to do when a download fails. -->
42 <inputs> | |
<!-- Accession source: an uploaded dataset or a free-text list. -->
43 <conditional name="query_source"> | |
44 <param name="select" type="select" label="Select source for IDs"> | |
45 <option value="accession_file">File containing Accessions (one per line)</option> | |
46 <option value="accession_list">Direct Entry</option> | |
47 </param> | |
48 <when value="accession_file"> | |
49 <param name="accession_file" type="data" format="txt,tabular" label="Accession File"/> | |
50 </when> | |
51 <when value="accession_list"> | |
52 <param name="accession_list" type="text" area="true" label="ID List" help="Newline/Comma separated list of IDs"> | |
<!-- Rejects input that is empty once surrounding whitespace is stripped. -->
53 <validator message="ID list cannot be empty" type="expression">value.strip()</validator> | |
54 </param> | |
55 </when> | |
56 </conditional> | |
<!-- Molecule type; the set of selectable file formats depends on it. -->
57 <conditional name="molecule"> | |
58 <param name="select" type="select" label="Molecule Type"> | |
59 <option value="nucleotide" selected="true">Nucleotide</option> | |
60 <option value="protein">Protein</option> | |
61 </param> | |
62 <when value="nucleotide"> | |
63 <param name="format" type="select" label="File Format"> | |
64 <option value="fasta" selected="true">FASTA</option> | |
65 <option value="genbank">GenBank</option> | |
66 <option value="featuretable">Feature Table</option> | |
67 <option value="gff3">GFF3</option> | |
68 </param> | |
69 </when> | |
70 <when value="protein"> | |
71 <param name="format" type="select" label="File Format"> | |
72 <option value="fasta" selected="true">FASTA</option> | |
73 </param> | |
74 </when> | |
75 </conditional> | |
<!-- The chosen value (0 or 1) is copied into the shell variable ignore_errors by the command. -->
76 <param name="ignore_failed" type="select" | |
77 display="radio" label="How to handle download failures"> | |
78 <option value="0">Abort with error on first failure</option> | |
79 <option value="1">Add accession to failed list and continue</option> | |
80 </param> | |
81 </inputs> | |
<!-- Outputs: a list collection of the downloaded records discovered in outdir/ (one
     rule per file extension), the run log, and, only when ignore_failed is 1, the
     list of accessions that could not be downloaded. -->
82 <outputs> | |
83 <collection name="output" type="list" label="${tool.name} on ${on_string}: Downloaded Files"> | |
<!-- The named group "name" becomes the element identifier. The angle brackets of the
     group are written as entities because a literal "<" is not allowed inside an
     attribute value; the parsed pattern is unchanged. -->
84 <discover_datasets pattern="(?P&lt;name&gt;.+)\.fa$" directory="outdir" format="fasta"/> | |
85 <discover_datasets pattern="(?P&lt;name&gt;.+)\.gbk$" directory="outdir" format="genbank"/> | |
86 <discover_datasets pattern="(?P&lt;name&gt;.+)\.gff$" directory="outdir" format="gff"/> | |
87 <discover_datasets pattern="(?P&lt;name&gt;.+)\.ft$" directory="outdir" format="txt"/> | |
88 </collection> | |
89 <data name="error_log" from_work_dir="error.log" label="${tool.name} on ${on_string}: Log" format="txt"/> | |
90 <data name="failed_accessions" from_work_dir="failed.txt" label="${tool.name} on ${on_string}: Failed accessions" format="txt"> | |
<!-- Only offered when the user chose to continue after failures. -->
91 <filter>str(ignore_failed)=='1'</filter> | |
92 </data> | |
93 </outputs> | |
<!-- Functional tests. Each test selects a molecule type / format and an accession
     source, then checks the discovered collection elements (and, where relevant, the
     failed-accession list) for an expected line. -->
94 <tests> | |
<!-- Nucleotide FASTA, accessions read from the test file accessions_1.tsv. -->
95 <test> | |
96 <conditional name="molecule"> | |
97 <param name="select" value="nucleotide"/> | |
98 <param name="format" value="fasta"/> | |
99 </conditional> | |
100 <conditional name="query_source"> | |
101 <param name="select" value="accession_file" /> | |
102 <param name="accession_file" value="accessions_1.tsv"/> | |
103 </conditional> | |
104 <output_collection name="output" type="list"> | |
105 <element name="CP011064" ftype="fasta"> | |
106 <assert_contents> | |
107 <has_line line=">CP011064.1 Escherichia coli str. Sanji plasmid pSJ_94, complete sequence" /> | |
108 </assert_contents> | |
109 </element> | |
110 <element name="CP021680" ftype="fasta"> | |
111 <assert_contents> | |
112 <has_line line=">CP021680.1 Escherichia coli strain AR_0162 plasmid tig00002623, complete sequence" /> | |
113 </assert_contents> | |
114 </element> | |
115 </output_collection> | |
116 </test> | |
<!-- Nucleotide GenBank, same accession file.
     NOTE(review): has_line is an exact match; GenBank records normally pad the DEFINITION
     keyword with more than one space, so confirm the spacing in the expected lines below. -->
117 <test> | |
118 <conditional name="molecule"> | |
119 <param name="select" value="nucleotide"/> | |
120 <param name="format" value="genbank"/> | |
121 </conditional> | |
122 <conditional name="query_source"> | |
123 <param name="select" value="accession_file" /> | |
124 <param name="accession_file" value="accessions_1.tsv"/> | |
125 </conditional> | |
126 <output_collection name="output" type="list"> | |
127 <element name="CP011064" ftype="genbank"> | |
128 <assert_contents> | |
129 <has_line line="DEFINITION Escherichia coli str. Sanji plasmid pSJ_94, complete sequence." /> | |
130 </assert_contents> | |
131 </element> | |
132 <element name="CP021680" ftype="genbank"> | |
133 <assert_contents> | |
134 <has_line line="DEFINITION Escherichia coli strain AR_0162 plasmid tig00002623, complete" /> | |
135 </assert_contents> | |
136 </element> | |
137 </output_collection> | |
138 </test> | |
<!-- Nucleotide GFF3, same accession file; elements are collected with ftype gff. -->
139 <test> | |
140 <conditional name="molecule"> | |
141 <param name="select" value="nucleotide"/> | |
142 <param name="format" value="gff3"/> | |
143 </conditional> | |
144 <conditional name="query_source"> | |
145 <param name="select" value="accession_file" /> | |
146 <param name="accession_file" value="accessions_1.tsv"/> | |
147 </conditional> | |
148 <output_collection name="output" type="list"> | |
149 <element name="CP011064" ftype="gff"> | |
150 <assert_contents> | |
151 <has_line line="##sequence-region CP011064.1 1 94712" /> | |
152 </assert_contents> | |
153 </element> | |
154 <element name="CP021680" ftype="gff"> | |
155 <assert_contents> | |
156 <has_line line="##sequence-region CP021680.1 1 23332" /> | |
157 </assert_contents> | |
158 </element> | |
159 </output_collection> | |
160 </test> | |
<!-- Nucleotide feature table, same accession file; elements are collected with ftype txt. -->
161 <test> | |
162 <conditional name="molecule"> | |
163 <param name="select" value="nucleotide"/> | |
164 <param name="format" value="featuretable"/> | |
165 </conditional> | |
166 <conditional name="query_source"> | |
167 <param name="select" value="accession_file" /> | |
168 <param name="accession_file" value="accessions_1.tsv"/> | |
169 </conditional> | |
170 <output_collection name="output" type="list"> | |
171 <element name="CP011064" ftype="txt"> | |
172 <assert_contents> | |
173 <has_line line=">Feature gb|CP011064.1|" /> | |
174 </assert_contents> | |
175 </element> | |
176 <element name="CP021680" ftype="txt"> | |
177 <assert_contents> | |
178 <has_line line=">Feature gb|CP021680.1|" /> | |
179 </assert_contents> | |
180 </element> | |
181 </output_collection> | |
182 </test> | |
<!-- Direct entry, comma-separated list of two accessions. -->
183 <test> | |
184 <conditional name="molecule"> | |
185 <param name="select" value="nucleotide"/> | |
186 <param name="format" value="fasta"/> | |
187 </conditional> | |
188 <conditional name="query_source"> | |
189 <param name="select" value="accession_list" /> | |
190 <param name="accession_list" value="CP011064,CP021680"/> | |
191 </conditional> | |
192 <output_collection name="output" type="list"> | |
193 <element name="CP011064" ftype="fasta"> | |
194 <assert_contents> | |
195 <has_line line=">CP011064.1 Escherichia coli str. Sanji plasmid pSJ_94, complete sequence" /> | |
196 </assert_contents> | |
197 </element> | |
198 <element name="CP021680" ftype="fasta"> | |
199 <assert_contents> | |
200 <has_line line=">CP021680.1 Escherichia coli strain AR_0162 plasmid tig00002623, complete sequence" /> | |
201 </assert_contents> | |
202 </element> | |
203 </output_collection> | |
204 </test> | |
<!-- Direct entry with CP0XXXXX in the middle and ignore_failed set to 1: the other two
     records must still be collected and CP0XXXXX must appear in the failed list. -->
205 <test> | |
206 <conditional name="molecule"> | |
207 <param name="select" value="nucleotide"/> | |
208 <param name="format" value="fasta"/> | |
209 </conditional> | |
210 <conditional name="query_source"> | |
211 <param name="select" value="accession_list" /> | |
212 <param name="accession_list" value="CP011064,CP0XXXXX,CP021680"/> | |
213 </conditional> | |
214 <param name="ignore_failed" value="1" /> | |
215 <output_collection name="output" type="list"> | |
216 <element name="CP011064" ftype="fasta"> | |
217 <assert_contents> | |
218 <has_line line=">CP011064.1 Escherichia coli str. Sanji plasmid pSJ_94, complete sequence" /> | |
219 </assert_contents> | |
220 </element> | |
221 <element name="CP021680" ftype="fasta"> | |
222 <assert_contents> | |
223 <has_line line=">CP021680.1 Escherichia coli strain AR_0162 plasmid tig00002623, complete sequence" /> | |
224 </assert_contents> | |
225 </element> | |
226 </output_collection> | |
227 <output name="failed_accessions"> | |
228 <assert_contents> | |
229 <has_line line="CP0XXXXX" /> | |
230 </assert_contents> | |
231 </output> | |
232 </test> | |
<!-- Only CP0XXXXX is requested with ignore_failed set to 1: the failed list must contain it. -->
233 <test> | |
234 <conditional name="molecule"> | |
235 <param name="select" value="nucleotide"/> | |
236 <param name="format" value="fasta"/> | |
237 </conditional> | |
238 <conditional name="query_source"> | |
239 <param name="select" value="accession_list" /> | |
240 <param name="accession_list" value="CP0XXXXX"/> | |
241 </conditional> | |
242 <param name="ignore_failed" value="1" /> | |
243 <output name="failed_accessions"> | |
244 <assert_contents> | |
245 <has_line line="CP0XXXXX" /> | |
246 </assert_contents> | |
247 </output> | |
248 </test> | |
<!-- Same mixed list with ignore_failed set to 0: the job itself is expected to fail. -->
249 <test expect_failure="true"> | |
250 <conditional name="molecule"> | |
251 <param name="select" value="nucleotide"/> | |
252 <param name="format" value="fasta"/> | |
253 </conditional> | |
254 <conditional name="query_source"> | |
255 <param name="select" value="accession_list" /> | |
256 <param name="accession_list" value="CP011064,CP0XXXXX,CP021680"/> | |
257 </conditional> | |
258 <param name="ignore_failed" value="0" /> | |
259 </test> | |
<!-- Direct entry with the two accessions separated by whitespace rather than a comma.
     NOTE(review): the separator shown here is a single space; confirm whether the test
     file actually encodes a newline at this position. -->
260 <test> | |
261 <conditional name="molecule"> | |
262 <param name="select" value="nucleotide"/> | |
263 <param name="format" value="fasta"/> | |
264 </conditional> | |
265 <conditional name="query_source"> | |
266 <param name="select" value="accession_list" /> | |
267 <param name="accession_list" value="CP011064 CP021680"/> | |
268 </conditional> | |
269 <output_collection name="output" type="list"> | |
270 <element name="CP011064" ftype="fasta"> | |
271 <assert_contents> | |
272 <has_line line=">CP011064.1 Escherichia coli str. Sanji plasmid pSJ_94, complete sequence" /> | |
273 </assert_contents> | |
274 </element> | |
275 <element name="CP021680" ftype="fasta"> | |
276 <assert_contents> | |
277 <has_line line=">CP021680.1 Escherichia coli strain AR_0162 plasmid tig00002623, complete sequence" /> | |
278 </assert_contents> | |
279 </element> | |
280 </output_collection> | |
281 </test> | |
<!-- Protein FASTA for a single accession entered directly. -->
282 <test> | |
283 <conditional name="molecule"> | |
284 <param name="select" value="protein"/> | |
285 <param name="format" value="fasta"/> | |
286 </conditional> | |
287 <conditional name="query_source"> | |
288 <param name="select" value="accession_list" /> | |
289 <param name="accession_list" value="NP_003192"/> | |
290 </conditional> | |
291 <output_collection name="output" type="list"> | |
292 <element name="NP_003192" ftype="fasta"> | |
293 <assert_contents> | |
294 <has_line line=">NP_003192.1 transcription factor A, mitochondrial isoform 1 precursor [Homo sapiens]" /> | |
295 </assert_contents> | |
296 </element> | |
297 </output_collection> | |
298 </test> | |
299 </tests> | |
300 <help><![CDATA[ | |
301 **What it does** | |
302 Given a file containing a list of NCBI accession numbers or a direct entry of accession numbers in the tool text input box, this tool will download the corresponding sequence records via the NCBI API. | |
303 | |
304 **Limitations** | |
305 - For protein sequence downloads, only FASTA format is supported. | |
306 - To avoid rate limits imposed by the NCBI API, records are downloaded sequentially with a short delay (2 seconds) between requests. This may make it impractical to use this tool to download many (>100) records. | |
307 | |
308 **Output** | |
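309 A collection of sequence records in the desired format, together with a log of the download run. If you choose to continue after download failures, a list of the accessions that could not be downloaded is produced as well. | |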
310 ]]></help> | |
<!-- NOTE(review): no citation entries are declared for this tool; add one here if the
     wrapped ncbi-acc-download package has a citable reference. -->
311 <citations> | |
312 </citations> | |
313 </tool> |