sra_tools: fastq_dump.xml comparison

comparison fastq_dump.xml @ 7:c7620aa7e1f0 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/sra-tools commit d1347141d384ed404f674d7ce408b6769e763ea1

author	iuc
date	Wed, 10 May 2017 10:45:41 -0400
parents	30775c836c77
children	1920e0508831

comparison

equal deleted inserted replaced

-:30775c836c77
+:c7620aa7e1f0
-<tool id="fastq_dump" name="Extract reads" version="@VERSION@.1">
+<tool id="fastq_dump" name="Extract reads in Fastq/a" version="@VERSION@.2">
-<description>in FASTQ/A format from NCBI SRA.</description>
+<description>format from NCBI SRA</description>
 <macros>
 <import>sra_macros.xml</import>
 </macros>
 <expand macro="requirements"/>
 <version_command>fastq-dump --version</version_command>
 <command detect_errors="exit_code">
 <![CDATA[
 #if $input.input_select=="file_list":
-for acc in `cat $input.file_list` ;
-do
+for acc in `cat $input.file_list` ;
+do
 #elif $input.input_select=="accession_number":
-acc="$input.accession" &&
+## NOTE(review): no stripping of leading/trailing spaces happens on the next line; presumably the accession is sanitized where the input is defined - confirm
+acc="${input.accession}" &&
 #end if
 #if $input.input_select=="file_list" or $input.input_select=="accession_number":
-[ ""\$acc" =~ ^[E|S|D]RR[0-9]{1,}$" ] && (
+[ ""\$acc" =~ ^[E|S|D]RR[0-9]{1,}$" ] && (
 #end if
 ## Need to set the home directory to the current working directory,
 ## else the tool tries to write to home/.ncbi and fails when used
 ## with a cluster manager.
 --matepair-distance "$adv.matepairDist"
 #end if
 $adv.clip
 $adv.skip_technical
-#if str( $outputformat ) == "fasta":
+#if str( $outputformat ) == "fastqsanger.gz":
---fasta
+--gzip
+#elif str( $outputformat ) == "fastqsanger.bz2":
+--bzip2
 #end if
 #if $input.input_select=="file":
 --stdout
 "$input.file" > "$output_file"
-#elif $input.input_select=="file_list":
-"\$acc"
+#elif $input.input_select=="accession_number":
-#else:
+--stdout
---stdout
 "\$acc" > "$output_accession" )
 #end if
 #if $input.input_select=="file_list":
 ) ; done
 ;
+for i in `ls *.fast* | cut -f 1 -d '_' | uniq` ; do
+count=`ls \$i* | wc -l` ;
+data=(\$(ls -d \$i*));
-for i in `ls *.fast* | cut -f 1 -d '_' | uniq` ; do
-count=`ls \$i* | wc -l` ;
-data=(\$(ls -d \$i*));
 if [ "\$count" -eq 2 ]; then
 mv "\${data[0]}" "\${data[0]}"_forward.$outputformat;  mv "\${data[1]}" "\${data[1]}"_reverse.$outputformat ;
 elif [ "\$count" -eq 1 ]; then
 mv "\${data[0]}" "\${data[0]}"__single.$outputformat ;
 fi;
 done
 #end if
 ]]>
 </command>
 <inputs>
 <expand macro="input_conditional"/>
-<param name="outputformat" type="select" label="select output format">
+<param name="outputformat" type="select" display="radio" label="Select output format" help="Compression will greatly reduce the amount of space occupied by downloaded data. Downstream applications such as short-read mappers will accept compressed data as input. Consider this example: an uncompressed 400 Mb fastq dataset compresses to 100 Mb or 80 Mb with gzip or bzip2, respectively. " argument="--gzip --bzip2">
-<option value="fastqsanger">fastq</option>
+<option value="fastqsanger.gz">gzip compressed fastq</option>
-<option value="fasta">fasta</option>
+<option value="fastqsanger">Uncompressed fastq</option>
+<option value="fastqsanger.bz2">bzip2 compressed fastq</option>
 </param>
 <section name="adv" title="Advanced Options" expanded="False">
-<param name="minID" type="integer" label="minimum spot ID" optional="true"/>
+<param name="minID" type="integer" label="Minimum spot ID" optional="true" help="Minimum spot id to be dumped." argument="--minSpotId"/>
-<param name="maxID" type="integer" label="maximum spot ID" optional="true"/>
+<param name="maxID" type="integer" label="Maximum spot ID" optional="true" help="Maximum spot id to be dumped." argument="--maxSpotId"/>
-<param name="minlen" type="integer" label="minimum read length" optional="true"/>
+<param name="minlen" type="integer" label="Minimum read length" optional="true" help="Filter by sequence length. Will dump only reads longer or equal to this value." argument="--minReadLen"/>
-<param name="split" type="boolean" checked="true" truevalue="--split-spot" falsevalue="">
+<param name="split" type="boolean" checked="true" truevalue="--split-spot" falsevalue="" label="Split spot by read pairs" help="Split spots into individual reads." argument="--split-spot"/>
-<label>split spot by read pairs</label>
-</param>
 <expand macro="alignments"/>
 <expand macro="region"/>
 <expand macro="matepairDist"/>
-<param name="readfilter" type="select" value="">
+<param name="readfilter" type="select" value="" label="filter by value" argument="--read-filter">
-<label>filter by value</label>
 <option value="">None</option>
 <option value="pass">pass</option>
 <option value="reject">reject</option>
 <option value="criteria">criteria</option>
 <option value="redacted">redacted</option>
 </param>
-<param name="spotgroups" type="text" label="filter by spot-groups" optional="true"/>
+<param name="spotgroups" type="text" label="Filter by spot-groups" optional="true" argument="--spot-groups"/>
-<param name="clip" type="boolean" truevalue="--clip" falsevalue="">
+<param name="clip" type="boolean" truevalue="--clip" falsevalue="" argument="--clip" label="Apply left and right clips" />
-<label>apply left and right clips</label>
+<param name="skip_technical" type="boolean" truevalue="--skip-technical" falsevalue="" checked="False" label="Dump only biological reads" argument="--skip-technical"/>
-</param>
-<param name="skip_technical" type="boolean" truevalue="--skip-technical" falsevalue="" checked="False" label="Dump only biological reads"/>
 </section>
 </inputs>
 <outputs>
-<collection name="list_paired" type="list:paired" label="Pair-end Fast(q|a)">
+<collection name="list_paired" type="list:paired" label="Pair-end data (fastq-dump)">
 <filter>input['input_select'] == "file_list"</filter>
 <!-- Use named regex groups to match file names of the form
 <identifier_0>_<N>.fastq_<identifier_1>.<ext>. Here identifier_0 is the list
 identifier in the nested collection and identifier_1 is either
 forward or reverse (for instance DRR015708_1.fastq_forward.fastqsanger).
 -->
-<discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_\d+.fastq_(?P&lt;identifier_1&gt;[^_]+)\.fastq" ext="fastqsanger" visible="false" />
-<discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_\d+.fasta_(?P&lt;identifier_1&gt;[^_]+)\.fasta" ext="fasta" visible="false" />
+<discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_\d+.fastq_(?P&lt;identifier_1&gt;[^_]+)\.fastqsanger" ext="fastqsanger" />
-</collection>
+<discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_\d+.fastq.gz_(?P&lt;identifier_1&gt;[^_]+)\.fastqsanger.gz" ext="fastqsanger.gz" />
-<collection name="output_collection" type='list' label="Single-end Fast(q|a)">
+<discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_\d+.fastq.bz2_(?P&lt;identifier_1&gt;[^_]+)\.fastqsanger.bz2" ext="fastqsanger.bz2" />
-<filter>input['input_select'] == "file_list"</filter>
+</collection>
-<discover_datasets pattern="(?P&lt;designation&gt;.+)_\d+.fastq__single\.fastq" directory="." ext='fastqsanger'/>
+<collection name="output_collection" type='list' label="Single-end data (fastq-dump)">
-<discover_datasets pattern="(?P&lt;designation&gt;.+)_\d+.fasta__single\.fasta" directory="." ext='fasta'/>
+<filter>input['input_select'] == "file_list"</filter>
-</collection>
+<discover_datasets pattern="(?P&lt;designation&gt;.+)_\d+.fastq__single\.fastqsanger" directory="." ext='fastqsanger'/>
-<data format="fastqsanger" name="output_accession" >
+<discover_datasets pattern="(?P&lt;designation&gt;.+)_\d+.fastq.gz__single\.fastqsanger.gz" directory="." ext='fastqsanger.gz'/>
-<filter>input['input_select'] == "accession_number"</filter>
+<discover_datasets pattern="(?P&lt;designation&gt;.+)_\d+.fastq.bz2__single\.fastqsanger.bz2" directory="." ext='fastqsanger.bz2'/>
-<change_format>
+</collection>
-<when input="outputformat" value="fasta" format="fasta"/>
+<data format="fastqsanger" name="output_accession" label="${input.accession} (fastq-dump)">
-</change_format>
+<filter>input['input_select'] == "accession_number"</filter>
-</data>
+<change_format>
-<data format="fastqsanger" name="output_file" label="${input.file.name}.${outputformat}">
+<when input="outputformat" value="fastqsanger.gz" format="fastqsanger.gz"/>
-<filter>input['input_select'] == "file"</filter>
+<when input="outputformat" value="fastqsanger.bz2" format="fastqsanger.bz2"/>
-<change_format>
+</change_format>
-<when input="outputformat" value="fasta" format="fasta"/>
+</data>
-</change_format>
+<data format="fastqsanger" name="output_file" label="${input.file.name} (fastq-dump)">
-</data>
+<filter>input['input_select'] == "file"</filter>
+<change_format>
+<when input="outputformat" value="fastqsanger.gz" format="fastqsanger.gz"/>
+<when input="outputformat" value="fastqsanger.bz2" format="fastqsanger.bz2"/>
+</change_format>
+</data>
 </outputs>
 <tests>
 <test>
 <param name="input_select" value="accession_number"/>
 <param name="outputformat" value="fastqsanger"/>
 <param name="accession" value="SRR044777"/>
 <param name="skip_technical" value="True"/>
 <output name="output_accession">
 <assert_contents>
 <not_has_text text="rRNA_primer"/>
 <has_text text="F47USSH02GNP1D" />
 </assert_contents>
 </output>
 </test>
 <test>
 <param name="input_select" value="accession_number"/>
-<param name="outputformat" value="fastqsanger"/>
+<param name="outputformat" value="fastqsanger.gz"/>
 <param name="accession" value="SRR925743"/>
 <param name="maxID" value="5"/>
-<output name="output_accession" file="fastq_dump_result.fastq" ftype="fastqsanger"/>
+<output name="output_accession" file="fastq_dump_result.fastq.gz" decompress="True"/>
 </test>
 <test>
-<param name="input_select" value="file_list"/>
+<param name="input_select" value="accession_number"/>
 <param name="outputformat" value="fastqsanger"/>
-<param name="file_list" value="list_pe"/>
+<param name="accession" value="SRR925743"/>
 <param name="maxID" value="5"/>
-<output_collection name="list_paired" type="list:paired">
+<output name="output_accession" file="fastq_dump_result.fastq" ftype="fastqsanger"/>
-<element name="DRR015708">
+</test>
-<element name="forward" file="DRR015708_forward.fastqsanger">
+<test>
-</element>
+<param name="input_select" value="file_list"/>
-<element name="reverse" file="DRR015708_reverse.fastqsanger">
+<param name="outputformat" value="fastqsanger"/>
-</element>
+<param name="file_list" value="list_pe"/>
-</element>
+<param name="maxID" value="5"/>
-</output_collection>
+<output_collection name="list_paired" type="list:paired">
-</test>
+<element name="DRR015708">
-<test>
+<element name="forward" file="DRR015708_forward.fastqsanger">
-<param name="input_select" value="file_list"/>
+</element>
-<param name="outputformat" value="fastqsanger"/>
+<element name="reverse" file="DRR015708_reverse.fastqsanger">
-<param name="file_list" value="list_pe2"/>
+</element>
-<param name="maxID" value="5"/>
+</element>
-<output_collection name="list_paired" type="list:paired">
+</output_collection>
-<element name="ERR027433">
+</test>
-<element name="forward" file="ERR027433_forward.fastqsanger">
+<test>
-</element>
+<param name="input_select" value="file_list"/>
-<element name="reverse" file="ERR027433_reverse.fastqsanger">
+<param name="outputformat" value="fastqsanger"/>
-</element>
+<param name="file_list" value="list_pe2"/>
-</element>
+<param name="maxID" value="5"/>
-</output_collection>
+<output_collection name="list_paired" type="list:paired">
-</test>
+<element name="ERR027433">
-<test>
+<element name="forward" file="ERR027433_forward.fastqsanger">
-<param name="input_select" value="file_list"/>
+</element>
-<param name="outputformat" value="fastqsanger"/>
+<element name="reverse" file="ERR027433_reverse.fastqsanger">
-<param name="file_list" value="list_se"/>
+</element>
-<param name="maxID" value="5"/>
+</element>
-<output_collection name="output_collection" type="list">
+</output_collection>
-<element name="SRR1993644" file="SRR1993644.fastqsanger"/>
+</test>
-</output_collection>
+<test>
-</test>
+<param name="input_select" value="file_list"/>
+<param name="outputformat" value="fastqsanger"/>
+<param name="file_list" value="list_se"/>
+<param name="maxID" value="5"/>
+<output_collection name="output_collection" type="list">
+<element name="SRR1993644" file="SRR1993644.fastqsanger"/>
+</output_collection>
+</test>
 </tests>
-<help>
+<help><![CDATA[
-This tool extracts reads from SRA archives using fastq-dump.
+**What it does**
-The fastq-dump program is developed at NCBI, and is available at
-http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=software.
+This tool extracts data (in fastq_ format) from the Short Read Archive (SRA) at the National Center for Biotechnology Information (NCBI). It is based on the fastq-dump_ utility of the SRA Toolkit.
-NB: Single-end or pair-end collections may be empty if given SRRs LibraryLayout contains only either SINGLE or PAIRED respectively
+**How to use it**
-@SRATOOLS_ATTRRIBUTION@
+There are three ways in which you can download data:
+1. Data for a single accession
+2. Multiple datasets using a list of accessions
+3. Extract data from an already uploaded SRA dataset
+Below we discuss each in detail.
+------
+**Uploading data for a single accession**
+When you type a single accession number (e.g., `SRR1582967`) into the **Accession** box and click **Execute**, the tool will fetch the data for you. It is important to keep the following in mind:
+- if the data is paired-end (or mate-pair), the tool will generate a single *interleaved* dataset, in which forward and reverse mates alternate (see the example dataset below)
+- if the data is single-end, a standard single fastq dataset will be produced
+-----
+**Uploading multiple datasets using a list of accessions**
+A more realistic scenario is when you want to upload a number of datasets at once. To do this you need a list of accessions, with only one accession per line (see below for information on how to generate such a file). Once you have this file:
+1. Upload it into your history using Galaxy's upload tool
+2. Once the list of accessions is uploaded, choose *List of SRA accessions, one per line* from the **select input type** dropdown
+3. Choose the uploaded file in the **sra accession list** field
+4. Click **Execute**
+.. class:: warningmark
+Fastq datasets produced by this option will be saved in Galaxy's history as a collection_ - a single history element containing multiple datasets. In fact, two collections will be produced: one containing paired-end data and another containing single-end data. Single-end or pair-end collections may be empty if the accessions provided in the list contain only SINGLE or PAIRED data, respectively.
+-----
+**Extract data from already uploaded SRA dataset**
+If an SRA dataset is present in the history, it can be converted into a fastq dataset by setting the **select input type** drop-down to *SRA archive in current history*. Just as when extracting data for a single accession number, the following applies:
+- if the data is paired-end (or mate-pair), the tool will generate a single *interleaved* dataset, in which forward and reverse mates alternate (see the example below)
+- if the data is single-end, a standard fastq dataset will be produced
+@ACCESSION_LIST_HOWTO@
+-----
+**Paired-end (and mate-pair) data in fastq format**
+Paired-end datasets can be represented as two individual datasets:
+First dataset::
+@1/1
+AGGGATGTGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTA
++
+EGGEGGGDFGEEEAEECGDEGGFEEGEFGBEEDDECFEFDD@CDD<ED
+@2/1
+AGGGATGTGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTA
++
+HHHHHHEGFHEEFEEHEEHHGGEGGGGEFGFGGGGHHHHFBEEEEEFG
+Second dataset::
+@1/2
+CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
++
+GHHHDFDFGFGEGFBGEGGEGEGGGHGFGHFHFHHHHHHHEF?EFEFF
+@2/2
+CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
++
+HHHHHHHHHHHHHGHHHHHHGHHHHHHHHHHHFHHHFHHHHHHHHHHH
+Or a single *interleaved* dataset::
+@1/1
+AGGGATGTGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTA
++
+EGGEGGGDFGEEEAEECGDEGGFEEGEFGBEEDDECFEFDD@CDD<ED
+@1/2
+CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
++
+GHHHDFDFGFGEGFBGEGGEGEGGGHGFGHFHFHHHHHHHEF?EFEFF
+@2/1
+AGGGATGTGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTA
++
+HHHHHHEGFHEEFEEHEEHHGGEGGGGEFGFGGGGHHHHFBEEEEEFG
+@2/2
+CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
++
+HHHHHHHHHHHHHGHHHHHHGHHHHHHHHHHHFHHHFHHHHHHHHHHH
+----
+.. _fastq: https://en.wikipedia.org/wiki/FASTQ_format
+.. _fastq-dump: https://ncbi.github.io/sra-tools/fastq-dump.html
+.. _collection: https://galaxyproject.org/tutorials/collections/
+.. _link: http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=studies
+@SRATOOLS_ATTRRIBUTION@
+]]>
 </help>
 <expand macro="citation"/>
 </tool>

Mercurial > repos > iuc > sra_tools

comparison fastq_dump.xml @ 7:c7620aa7e1f0 draft