Mercurial > repos > iuc > sra_tools
comparison fastq_dump.xml @ 7:c7620aa7e1f0 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/sra-tools commit d1347141d384ed404f674d7ce408b6769e763ea1
author | iuc |
---|---|
date | Wed, 10 May 2017 10:45:41 -0400 |
parents | 30775c836c77 |
children | 1920e0508831 |
comparison
equal
deleted
inserted
replaced
6:30775c836c77 | 7:c7620aa7e1f0 |
---|---|
1 <tool id="fastq_dump" name="Extract reads" version="@VERSION@.1"> | 1 <tool id="fastq_dump" name="Extract reads in Fastq/a" version="@VERSION@.2"> |
2 <description>in FASTQ/A format from NCBI SRA.</description> | 2 <description>format from NCBI SRA</description> |
3 <macros> | 3 <macros> |
4 <import>sra_macros.xml</import> | 4 <import>sra_macros.xml</import> |
5 </macros> | 5 </macros> |
6 <expand macro="requirements"/> | 6 <expand macro="requirements"/> |
7 <version_command>fastq-dump --version</version_command> | 7 <version_command>fastq-dump --version</version_command> |
8 <command detect_errors="exit_code"> | 8 <command detect_errors="exit_code"> |
9 <![CDATA[ | 9 <![CDATA[ |
10 | 10 |
11 #if $input.input_select=="file_list": | 11 #if $input.input_select=="file_list": |
12 for acc in `cat $input.file_list` ; | 12 |
13 do | 13 for acc in `cat $input.file_list` ; |
14 do | |
15 | |
14 #elif $input.input_select=="accession_number": | 16 #elif $input.input_select=="accession_number": |
15 acc="$input.accession" && | 17 |
18 ## Stripping leading and trailing spaces in case user typed them in | |
19 acc="${input.accession}" && | |
20 | |
16 #end if | 21 #end if |
17 | 22 |
18 #if $input.input_select=="file_list" or $input.input_select=="accession_number": | 23 #if $input.input_select=="file_list" or $input.input_select=="accession_number": |
19 [ ""\$acc" =~ ^[E|S|D]RR[0-9]{1,}$" ] && ( | 24 |
25 [ ""\$acc" =~ ^[E|S|D]RR[0-9]{1,}$" ] && ( | |
26 | |
20 #end if | 27 #end if |
21 | 28 |
22 ## Need to set the home directory to the current working directory, | 29 ## Need to set the home directory to the current working directory, |
23 ## else the tool tries to write to home/.ncbi and fails when used | 30 ## else the tool tries to write to home/.ncbi and fails when used |
24 ## with a cluster manager. | 31 ## with a cluster manager. |
72 --matepair-distance "$adv.matepairDist" | 79 --matepair-distance "$adv.matepairDist" |
73 #end if | 80 #end if |
74 $adv.clip | 81 $adv.clip |
75 $adv.skip_technical | 82 $adv.skip_technical |
76 | 83 |
77 #if str( $outputformat ) == "fasta": | 84 #if str( $outputformat ) == "fastqsanger.gz": |
78 --fasta | 85 --gzip |
86 #elif str( $outputformat ) == "fastqsanger.bz2": | |
87 --bzip2 | |
79 #end if | 88 #end if |
80 #if $input.input_select=="file": | 89 #if $input.input_select=="file": |
81 --stdout | 90 --stdout |
82 "$input.file" > "$output_file" | 91 "$input.file" > "$output_file" |
83 #elif $input.input_select=="file_list": | 92 |
84 "\$acc" | 93 #elif $input.input_select=="accession_number": |
85 #else: | 94 --stdout |
86 --stdout | |
87 "\$acc" > "$output_accession" ) | 95 "\$acc" > "$output_accession" ) |
88 #end if | 96 #end if |
89 | 97 |
90 #if $input.input_select=="file_list": | 98 #if $input.input_select=="file_list": |
91 ) ; done | 99 ) ; done |
92 | 100 |
93 ; | 101 ; |
94 | 102 |
95 | 103 for i in `ls *.fast* | cut -f 1 -d '_' | uniq` ; do |
96 | 104 count=`ls \$i* | wc -l` ; |
97 | 105 data=(\$(ls -d \$i*)); |
98 | |
99 for i in `ls *.fast* | cut -f 1 -d '_' | uniq` ; do | |
100 count=`ls \$i* | wc -l` ; | |
101 data=(\$(ls -d \$i*)); | |
102 | 106 |
103 if [ "\$count" -eq 2 ]; then | 107 if [ "\$count" -eq 2 ]; then |
104 mv "\${data[0]}" "\${data[0]}"_forward.$outputformat; mv "\${data[1]}" "\${data[1]}"_reverse.$outputformat ; | 108 mv "\${data[0]}" "\${data[0]}"_forward.$outputformat; mv "\${data[1]}" "\${data[1]}"_reverse.$outputformat ; |
105 elif [ "\$count" -eq 1 ]; then | 109 elif [ "\$count" -eq 1 ]; then |
106 mv "\${data[0]}" "\${data[0]}"__single.$outputformat ; | 110 mv "\${data[0]}" "\${data[0]}"__single.$outputformat ; |
107 fi; | 111 fi; |
108 done | 112 done |
109 | 113 |
110 | 114 |
111 #end if | 115 #end if |
112 | 116 |
113 | 117 |
114 ]]> | 118 ]]> |
115 </command> | 119 </command> |
116 <inputs> | 120 <inputs> |
117 <expand macro="input_conditional"/> | 121 <expand macro="input_conditional"/> |
118 <param name="outputformat" type="select" label="select output format"> | 122 <param name="outputformat" type="select" display="radio" label="Select output format" help="Compression will greatly reduce the amount of space occupied by downloaded data. Downstream applications such as short-read mappers will accept compressed data as input. Consider this example: an uncompressed 400 Mb fastq dataset compresses to 100 Mb or 80 Mb by gzip or bzip2, respectively. " argument="--gzip --bzip2"> |
119 <option value="fastqsanger">fastq</option> | 123 <option value="fastqsanger.gz">gzip compressed fastq</option> |
120 <option value="fasta">fasta</option> | 124 <option value="fastqsanger">Uncompressed fastq</option> |
125 <option value="fastqsanger.bz2">bzip2 compressed fastq</option> | |
121 </param> | 126 </param> |
122 <section name="adv" title="Advanced Options" expanded="False"> | 127 <section name="adv" title="Advanced Options" expanded="False"> |
123 <param name="minID" type="integer" label="minimum spot ID" optional="true"/> | 128 <param name="minID" type="integer" label="Minimum spot ID" optional="true" help="Minimum spot id to be dumped." argument="--minSpotId"/> |
124 <param name="maxID" type="integer" label="maximum spot ID" optional="true"/> | 129 <param name="maxID" type="integer" label="Maximum spot ID" optional="true" help="Maximum spot id to be dumped." argument="--maxSpotId"/> |
125 <param name="minlen" type="integer" label="minimum read length" optional="true"/> | 130 <param name="minlen" type="integer" label="Minimum read length" optional="true" help="Filter by sequence length. Will dump only reads longer or equal to this value." argument="--minReadLen"/> |
126 <param name="split" type="boolean" checked="true" truevalue="--split-spot" falsevalue=""> | 131 <param name="split" type="boolean" checked="true" truevalue="--split-spot" falsevalue="" label="Split spot by read pairs" help="Split spots into individual reads." argument="--split-spot"/> |
127 <label>split spot by read pairs</label> | |
128 </param> | |
129 <expand macro="alignments"/> | 132 <expand macro="alignments"/> |
130 <expand macro="region"/> | 133 <expand macro="region"/> |
131 <expand macro="matepairDist"/> | 134 <expand macro="matepairDist"/> |
132 <param name="readfilter" type="select" value=""> | 135 <param name="readfilter" type="select" value="" label="filter by value" argument="--read-filter"> |
133 <label>filter by value</label> | |
134 <option value="">None</option> | 136 <option value="">None</option> |
135 <option value="pass">pass</option> | 137 <option value="pass">pass</option> |
136 <option value="reject">reject</option> | 138 <option value="reject">reject</option> |
137 <option value="criteria">criteria</option> | 139 <option value="criteria">criteria</option> |
138 <option value="redacted">redacted</option> | 140 <option value="redacted">redacted</option> |
139 </param> | 141 </param> |
140 <param name="spotgroups" type="text" label="filter by spot-groups" optional="true"/> | 142 <param name="spotgroups" type="text" label="Filter by spot-groups" optional="true" argument="--spot-groups"/> |
141 <param name="clip" type="boolean" truevalue="--clip" falsevalue=""> | 143 <param name="clip" type="boolean" truevalue="--clip" falsevalue="" argument="--clip" label="Apply left and right clips" /> |
142 <label>apply left and right clips</label> | 144 <param name="skip_technical" type="boolean" truevalue="--skip-technical" falsevalue="" checked="False" label="Dump only biological reads" argument="--skip-technical"/> |
143 </param> | |
144 <param name="skip_technical" type="boolean" truevalue="--skip-technical" falsevalue="" checked="False" label="Dump only biological reads"/> | |
145 </section> | 145 </section> |
146 </inputs> | 146 </inputs> |
147 <outputs> | 147 <outputs> |
148 <collection name="list_paired" type="list:paired" label="Pair-end Fast(q|a)"> | 148 <collection name="list_paired" type="list:paired" label="Pair-end data (fastq-dump)"> |
149 <filter>input['input_select'] == "file_list"</filter> | 149 <filter>input['input_select'] == "file_list"</filter> |
150 | |
150 <!-- Use named regex group to grab pattern | 151 <!-- Use named regex group to grab pattern |
151 <identifier_0>_<identifier_1>.fq. Here identifier_0 is the list | 152 <identifier_0>_<identifier_1>.fq. Here identifier_0 is the list |
152 identifier in the nested collection and identifier_1 is either | 153 identifier in the nested collection and identifier_1 is either |
153 forward or reverse (for instance samp1_forward.fq). | 154 forward or reverse (for instance samp1_forward.fq). |
154 --> | 155 --> |
155 <discover_datasets pattern="(?P<identifier_0>[^_]+)_\d+.fastq_(?P<identifier_1>[^_]+)\.fastq" ext="fastqsanger" visible="false" /> | 156 |
156 <discover_datasets pattern="(?P<identifier_0>[^_]+)_\d+.fasta_(?P<identifier_1>[^_]+)\.fasta" ext="fasta" visible="false" /> | 157 <discover_datasets pattern="(?P<identifier_0>[^_]+)_\d+.fastq_(?P<identifier_1>[^_]+)\.fastqsanger" ext="fastqsanger" /> |
157 </collection> | 158 <discover_datasets pattern="(?P<identifier_0>[^_]+)_\d+.fastq.gz_(?P<identifier_1>[^_]+)\.fastqsanger.gz" ext="fastqsanger.gz" /> |
158 <collection name="output_collection" type='list' label="Single-end Fast(q|a)"> | 159 <discover_datasets pattern="(?P<identifier_0>[^_]+)_\d+.fastq.bz2_(?P<identifier_1>[^_]+)\.fastqsanger.bz2" ext="fastqsanger.bz2" /> |
159 <filter>input['input_select'] == "file_list"</filter> | 160 </collection> |
160 <discover_datasets pattern="(?P<designation>.+)_\d+.fastq__single\.fastq" directory="." ext='fastqsanger'/> | 161 <collection name="output_collection" type='list' label="Single-end data (fastq-dump)"> |
161 <discover_datasets pattern="(?P<designation>.+)_\d+.fasta__single\.fasta" directory="." ext='fasta'/> | 162 <filter>input['input_select'] == "file_list"</filter> |
162 </collection> | 163 <discover_datasets pattern="(?P<designation>.+)_\d+.fastq__single\.fastqsanger" directory="." ext='fastqsanger'/> |
163 <data format="fastqsanger" name="output_accession" > | 164 <discover_datasets pattern="(?P<designation>.+)_\d+.fastq.gz__single\.fastqsanger.gz" directory="." ext='fastqsanger.gz'/> |
164 <filter>input['input_select'] == "accession_number"</filter> | 165 <discover_datasets pattern="(?P<designation>.+)_\d+.fastq.bz2__single\.fastqsanger.bz2" directory="." ext='fastqsanger.bz2'/> |
165 <change_format> | 166 </collection> |
166 <when input="outputformat" value="fasta" format="fasta"/> | 167 <data format="fastqsanger" name="output_accession" label="${input.accession} (fastq-dump)"> |
167 </change_format> | 168 <filter>input['input_select'] == "accession_number"</filter> |
168 </data> | 169 <change_format> |
169 <data format="fastqsanger" name="output_file" label="${input.file.name}.${outputformat}"> | 170 <when input="outputformat" value="fastqsanger.gz" format="fastqsanger.gz"/> |
170 <filter>input['input_select'] == "file"</filter> | 171 <when input="outputformat" value="fastqsanger.bz2" format="fastqsanger.bz2"/> |
171 <change_format> | 172 </change_format> |
172 <when input="outputformat" value="fasta" format="fasta"/> | 173 </data> |
173 </change_format> | 174 <data format="fastqsanger" name="output_file" label="${input.file.name} (fastq-dump)"> |
174 </data> | 175 <filter>input['input_select'] == "file"</filter> |
176 <change_format> | |
177 <when input="outputformat" value="fastqsanger.gz" format="fastqsanger.gz"/> | |
178 <when input="outputformat" value="fastqsanger.bz2" format="fastqsanger.bz2"/> | |
179 </change_format> | |
180 </data> | |
175 </outputs> | 181 </outputs> |
176 <tests> | 182 <tests> |
177 <test> | 183 <test> |
178 <param name="input_select" value="accession_number"/> | 184 <param name="input_select" value="accession_number"/> |
179 <param name="outputformat" value="fastqsanger"/> | 185 <param name="outputformat" value="fastqsanger"/> |
180 <param name="accession" value="SRR044777"/> | 186 <param name="accession" value="SRR044777"/> |
181 <param name="skip_technical" value="True"/> | 187 <param name="skip_technical" value="True"/> |
182 <output name="output_accession"> | 188 <output name="output_accession"> |
183 <assert_contents> | 189 <assert_contents> |
184 <not_has_text text="rRNA_primer"/> | 190 <not_has_text text="rRNA_primer"/> |
185 <has_text text="F47USSH02GNP1D" /> | 191 <has_text text="F47USSH02GNP1D" /> |
186 </assert_contents> | 192 </assert_contents> |
187 </output> | 193 </output> |
188 </test> | 194 </test> |
189 <test> | 195 <test> |
190 <param name="input_select" value="accession_number"/> | 196 <param name="input_select" value="accession_number"/> |
191 <param name="outputformat" value="fastqsanger"/> | 197 <param name="outputformat" value="fastqsanger.gz"/> |
192 <param name="accession" value="SRR925743"/> | 198 <param name="accession" value="SRR925743"/> |
193 <param name="maxID" value="5"/> | 199 <param name="maxID" value="5"/> |
194 <output name="output_accession" file="fastq_dump_result.fastq" ftype="fastqsanger"/> | 200 <output name="output_accession" file="fastq_dump_result.fastq.gz" decompress="True"/> |
195 </test> | 201 </test> |
196 <test> | 202 <test> |
197 <param name="input_select" value="file_list"/> | 203 <param name="input_select" value="accession_number"/> |
198 <param name="outputformat" value="fastqsanger"/> | 204 <param name="outputformat" value="fastqsanger"/> |
199 <param name="file_list" value="list_pe"/> | 205 <param name="accession" value="SRR925743"/> |
200 <param name="maxID" value="5"/> | 206 <param name="maxID" value="5"/> |
201 <output_collection name="list_paired" type="list:paired"> | 207 <output name="output_accession" file="fastq_dump_result.fastq" ftype="fastqsanger"/> |
202 <element name="DRR015708"> | 208 </test> |
203 <element name="forward" file="DRR015708_forward.fastqsanger"> | 209 <test> |
204 </element> | 210 <param name="input_select" value="file_list"/> |
205 <element name="reverse" file="DRR015708_reverse.fastqsanger"> | 211 <param name="outputformat" value="fastqsanger"/> |
206 </element> | 212 <param name="file_list" value="list_pe"/> |
207 </element> | 213 <param name="maxID" value="5"/> |
208 </output_collection> | 214 <output_collection name="list_paired" type="list:paired"> |
209 </test> | 215 <element name="DRR015708"> |
210 <test> | 216 <element name="forward" file="DRR015708_forward.fastqsanger"> |
211 <param name="input_select" value="file_list"/> | 217 </element> |
212 <param name="outputformat" value="fastqsanger"/> | 218 <element name="reverse" file="DRR015708_reverse.fastqsanger"> |
213 <param name="file_list" value="list_pe2"/> | 219 </element> |
214 <param name="maxID" value="5"/> | 220 </element> |
215 <output_collection name="list_paired" type="list:paired"> | 221 </output_collection> |
216 <element name="ERR027433"> | 222 </test> |
217 <element name="forward" file="ERR027433_forward.fastqsanger"> | 223 <test> |
218 </element> | 224 <param name="input_select" value="file_list"/> |
219 <element name="reverse" file="ERR027433_reverse.fastqsanger"> | 225 <param name="outputformat" value="fastqsanger"/> |
220 </element> | 226 <param name="file_list" value="list_pe2"/> |
221 </element> | 227 <param name="maxID" value="5"/> |
222 </output_collection> | 228 <output_collection name="list_paired" type="list:paired"> |
223 </test> | 229 <element name="ERR027433"> |
224 <test> | 230 <element name="forward" file="ERR027433_forward.fastqsanger"> |
225 <param name="input_select" value="file_list"/> | 231 </element> |
226 <param name="outputformat" value="fastqsanger"/> | 232 <element name="reverse" file="ERR027433_reverse.fastqsanger"> |
227 <param name="file_list" value="list_se"/> | 233 </element> |
228 <param name="maxID" value="5"/> | 234 </element> |
229 <output_collection name="output_collection" type="list"> | 235 </output_collection> |
230 <element name="SRR1993644" file="SRR1993644.fastqsanger"/> | 236 </test> |
231 </output_collection> | 237 <test> |
232 </test> | 238 <param name="input_select" value="file_list"/> |
239 <param name="outputformat" value="fastqsanger"/> | |
240 <param name="file_list" value="list_se"/> | |
241 <param name="maxID" value="5"/> | |
242 <output_collection name="output_collection" type="list"> | |
243 <element name="SRR1993644" file="SRR1993644.fastqsanger"/> | |
244 </output_collection> | |
245 </test> | |
233 </tests> | 246 </tests> |
234 <help> | 247 <help><![CDATA[ |
235 This tool extracts reads from SRA archives using fastq-dump. | 248 **What it does?** |
236 The fastq-dump program is developed at NCBI, and is available at | 249 |
237 http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=software. | 250 This tool extracts data (in fastq_ format) from the Short Read Archive (SRA) at the National Center for Biotechnology Information (NCBI). It is based on the fastq-dump_ utility of the SRA Toolkit. |
238 | 251 |
239 NB: Single-end or pair-end collections may be empty if given SRRs LibraryLayout contains only either SINGLE or PAIRED respectively | 252 **How to use it?** |
240 @SRATOOLS_ATTRRIBUTION@ | 253 |
254 There are three ways in which you can download data: | |
255 | |
256 1. Data for single accession | |
257 2. Multiple datasets using a list of accessions | |
258 3. Extract data from already uploaded SRA dataset | |
259 | |
260 Below we discuss each in detail. | |
261 | |
262 ------ | |
263 | |
264 **Uploading data for a single accession** | |
265 | |
| 266 When you type a single accession number (e.g., `SRR1582967`) into the **Accession** box and click **Execute**, the tool will fetch the data for you. It is important to keep the following in mind: |
267 | |
| 268 - if data is paired-end (or mate-pair), the tool will generate a single *interleaved* dataset, in which forward and reverse mates are alternating (see an example dataset below) |
269 - if data is single ended, a standard single fastq dataset will be produced | |
270 | |
271 ----- | |
272 | |
273 **Uploading multiple datasets using a list of accessions** | |
274 | |
| 275 A more realistic scenario is when you want to upload a number of datasets at once. To do this you need a list of accessions, with exactly one accession per line (see below for information on how to generate such a file). Once you have this file: |
276 | |
277 1. Upload it into your history using Galaxy's upload tool | |
278 2. Once the list of accessions is uploaded choose *List of SRA accessions, one per line* from **select input type** dropdown | |
279 3. Choose uploaded file within the **sra accession list** field | |
280 4. Click **Execute** | |
281 | |
282 .. class:: warningmark | |
283 | |
| 284 Fastq datasets produced by this option will be saved in Galaxy's history as a collection_ - a single history element containing multiple datasets. In fact, two collections will be produced: one containing paired-end data and another containing single-end data. The single-end or paired-end collection may be empty if the accessions provided in the list contain only PAIRED or SINGLE data, respectively. |
285 | |
286 ----- | |
287 | |
288 **Extract data from already uploaded SRA dataset** | |
289 | |
| 290 If an SRA dataset is present in the history, it can be converted into a fastq dataset by setting the **select input type** drop-down to *SRA archive in current history*. Just as when extracting data for a single accession number, the following applies: |
291 | |
| 292 - if data is paired-end (or mate-pair), the tool will generate a single *interleaved* dataset, in which forward and reverse mates are alternating (see example below). |
293 - if data is single ended, a standard fastq dataset will be produced | |
294 | |
295 @ACCESSION_LIST_HOWTO@ | |
296 | |
297 ----- | |
298 | |
299 **Paired-end (and mate-pair) data in fastq format** | |
300 | |
301 Paired end datasets can be represented as two individual datasets: | |
302 | |
303 First dataset:: | |
304 | |
305 @1/1 | |
306 AGGGATGTGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTA | |
307 + | |
308 EGGEGGGDFGEEEAEECGDEGGFEEGEFGBEEDDECFEFDD@CDD<ED | |
309 @2/1 | |
310 AGGGATGTGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTA | |
311 + | |
312 HHHHHHEGFHEEFEEHEEHHGGEGGGGEFGFGGGGHHHHFBEEEEEFG | |
313 | |
314 Second dataset:: | |
315 | |
316 @1/2 | |
317 CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC | |
318 + | |
319 GHHHDFDFGFGEGFBGEGGEGEGGGHGFGHFHFHHHHHHHEF?EFEFF | |
320 @2/2 | |
321 CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC | |
322 + | |
323 HHHHHHHHHHHHHGHHHHHHGHHHHHHHHHHHFHHHFHHHHHHHHHHH | |
324 | |
325 Or a single *interleaved* dataset:: | |
326 | |
327 @1/1 | |
328 AGGGATGTGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTA | |
329 + | |
330 EGGEGGGDFGEEEAEECGDEGGFEEGEFGBEEDDECFEFDD@CDD<ED | |
331 @1/2 | |
332 CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC | |
333 + | |
334 GHHHDFDFGFGEGFBGEGGEGEGGGHGFGHFHFHHHHHHHEF?EFEFF | |
335 @2/1 | |
336 AGGGATGTGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTA | |
337 + | |
338 HHHHHHEGFHEEFEEHEEHHGGEGGGGEFGFGGGGHHHHFBEEEEEFG | |
339 @2/2 | |
340 CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC | |
341 + | |
342 HHHHHHHHHHHHHGHHHHHHGHHHHHHHHHHHFHHHFHHHHHHHHHHH | |
343 | |
344 ---- | |
345 | |
346 | |
347 .. _fastq: https://en.wikipedia.org/wiki/FASTQ_format | |
348 .. _fastq-dump: https://ncbi.github.io/sra-tools/fastq-dump.html | |
349 .. _collection: https://galaxyproject.org/tutorials/collections/ | |
350 .. _link: http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=studies | |
351 | |
352 @SRATOOLS_ATTRRIBUTION@ | |
353 | |
354 ]]> | |
241 </help> | 355 </help> |
242 <expand macro="citation"/> | 356 <expand macro="citation"/> |
243 </tool> | 357 </tool> |