Mercurial > repos > iuc > sra_tools
view fastq_dump.xml @ 13:c38286ea7047 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/sra-tools commit 686710a7d313b828f1daed20c4055479727f2d91
author | iuc |
---|---|
date | Wed, 17 Oct 2018 11:44:12 -0400 |
parents | 6c60903f70ac |
children | f5ea3ce9b9b0 |
line wrap: on
line source
<tool id="fastq_dump" name="Download and Extract Reads in FASTA/Q" version="@VERSION@.3">
    <!--
        Galaxy wrapper around the SRA Toolkit fastq-dump utility.
        Reads come from one of three sources selected by input.input_select
        (values used below: "file", "accession_number", "file_list").
        Shared macros and tokens are imported from sra_macros.xml.
    -->
    <description>format from NCBI SRA</description>
    <macros>
        <import>sra_macros.xml</import>
    </macros>
    <expand macro="requirements"/>
    <version_command>fastq-dump --version</version_command>
    <command detect_errors="exit_code"><![CDATA[
        @SET_ACCESSIONS@
        ## Need to set the home directory to the current working directory,
        ## else the tool tries to write to home/.ncbi and fails when used
        ## with a cluster manager.
        export HOME=\$PWD &&
        vdb-config --restore-defaults &&
        ## Start the fastq-dump invocation: either on an SRA archive already in
        ## the history, or on an accession held in the shell variable acc.
        #if $input.input_select == "file":
            fastq-dump --log-level fatal --accession '${input.file.name}'
        #else:
            vdb-config -s "/repository/user/main/public/root=\$PWD" &&
            ## Do not use prefetch if region is specified, to avoid downloading
            ## the complete sra file.
            #if ( str( $adv.region ) == "" ) and ( str( $adv.minID ) == "" ) and ( str( $adv.maxID ) == "" ):
                ASCP_PATH=`command -v ascp` &&
                ASCP_KEY=`dirname \$ASCP_PATH`/asperaweb_id_dsa.openssh || true &&
                prefetch -X 200G --ascp-path "\$ASCP_PATH|\$ASCP_KEY" "\$acc" &&
                ## Duplicate vdb-config, in case settings changed between prefetch and
                ## dump command.
                vdb-config -s "/repository/user/main/public/root=\$PWD" &&
            #end if
            fastq-dump --accession "\$acc"
            --split-files
        #end if
        ## Everything below is appended to whichever fastq-dump call was started above.
        --defline-seq '@\$sn[_\$rn]/\$ri'
        --defline-qual '+'
        $adv.split
        #if str( $adv.alignments ) == "aligned":
            --aligned
        #end if
        #if str( $adv.alignments ) == "unaligned":
            --unaligned
        #end if
        #if str( $adv.minID ) != "":
            --minSpotId "$adv.minID"
        #end if
        #if str( $adv.maxID ) != "":
            --maxSpotId "$adv.maxID"
        #end if
        #if str( $adv.minlen ) != "":
            --minReadLen "$adv.minlen"
        #end if
        #if str( $adv.readfilter ) != "":
            --read-filter "$adv.readfilter"
        #end if
        #if str( $adv.region ) != "":
            --aligned-region "$adv.region"
        #end if
        #if str( $adv.spotgroups ) != "":
            --spot-groups "$adv.spotgroups"
        #end if
        #if str( $adv.matepairDist ) != "":
            --matepair-distance "$adv.matepairDist"
        #end if
        $adv.clip
        $adv.skip_technical
        #if str( $outputformat ) == "fastqsanger.gz":
            --gzip
        #elif str( $outputformat ) == "fastqsanger.bz2":
            --bzip2
        #end if
        ## NOTE(review): the closing parentheses and the "done" below presumably
        ## terminate shell constructs opened by the accession-setup token at the
        ## top of this command (defined in sra_macros.xml); confirm there.
        #if $input.input_select=="file":
            --stdout
            "$input.file" > "$output_file"
        #elif $input.input_select=="accession_number":
            --stdout
            "\$acc" > "$output_accession" )
        #end if
        #if $input.input_select=="file_list":
            ) ; done ;
            ## Group the dumped files by the part of the name before the first
            ## underscore, then tag them: two files become forward/reverse, one
            ## file becomes single. The discover_datasets patterns in the outputs
            ## section rely on exactly these suffixes.
            for i in `ls *.fast* | cut -f 1 -d '_' | uniq` ; do
                count=`ls \$i* | wc -l` ;
                data=(\$(ls -d \$i*));
                if [ "\$count" -eq 2 ]; then
                    mv "\${data[0]}" "\${data[0]}"_forward.$outputformat;
                    mv "\${data[1]}" "\${data[1]}"_reverse.$outputformat ;
                elif [ "\$count" -eq 1 ]; then
                    mv "\${data[0]}" "\${data[0]}"__single.$outputformat ;
                fi;
            done
        #end if
    ]]>
    </command>
    <inputs>
        <expand macro="input_conditional"/>
        <param name="outputformat" type="select" display="radio" label="Select output format" help="Compression will greatly reduce the amount of space occupied by downloaded data. Downstream applications such as short-read mappers will accept compressed data as input. Consider this example: an uncompressed 400 Mb fastq dataset compresses to 100 Mb or 80 Mb with gzip or bzip2, respectively." argument="--gzip --bzip2">
            <option value="fastqsanger.gz">gzip compressed fastq</option>
            <option value="fastqsanger">Uncompressed fastq</option>
            <option value="fastqsanger.bz2">bzip2 compressed fastq</option>
        </param>
        <section name="adv" title="Advanced Options" expanded="False">
            <param name="minID" type="integer" label="Minimum spot ID" optional="true" help="Minimum spot id to be dumped." argument="--minSpotId"/>
            <param name="maxID" type="integer" label="Maximum spot ID" optional="true" help="Maximum spot id to be dumped." argument="--maxSpotId"/>
            <param name="minlen" type="integer" label="Minimum read length" optional="true" help="Filter by sequence length. Will dump only reads longer or equal to this value." argument="--minReadLen"/>
            <param name="split" type="boolean" checked="true" truevalue="--split-spot" falsevalue="" label="Split spot by read pairs" help="Split spots into individual reads." argument="--split-spot"/>
            <expand macro="alignments"/>
            <expand macro="region"/>
            <expand macro="matepairDist"/>
            <param name="readfilter" type="select" value="" label="filter by value" argument="--read-filter">
                <option value="">None</option>
                <option value="pass">pass</option>
                <option value="reject">reject</option>
                <option value="criteria">criteria</option>
                <option value="redacted">redacted</option>
            </param>
            <param name="spotgroups" type="text" label="Filter by spot-groups" optional="true" argument="--spot-groups"/>
            <param name="clip" type="boolean" truevalue="--clip" falsevalue="" argument="--clip" label="Apply left and right clips"/>
            <param name="skip_technical" type="boolean" truevalue="--skip-technical" falsevalue="" checked="False" label="Dump only biological reads" argument="--skip-technical"/>
        </section>
    </inputs>
    <outputs>
        <!-- Collections are only produced for the "file_list" input type. -->
        <collection name="list_paired" type="list:paired" label="Pair-end data (fastq-dump)">
            <filter>input['input_select'] == "file_list"</filter>
            <!--
                The rename loop at the end of the command leaves paired files named
                like ACCESSION_N.fastq_forward.fastqsanger (with .gz or .bz2 variants).
                The named regex groups map them into the nested collection:
                identifier_0 is the outer list identifier (text before the first
                underscore) and identifier_1 is "forward" or "reverse".
                The angle brackets of the named groups must stay escaped, because a
                literal left angle bracket is not allowed inside an XML attribute value.
            -->
            <discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_\d+\.fastq_(?P&lt;identifier_1&gt;[^_]+)\.fastqsanger" ext="fastqsanger"/>
            <discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_\d+\.fastq\.gz_(?P&lt;identifier_1&gt;[^_]+)\.fastqsanger\.gz" ext="fastqsanger.gz"/>
            <discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_\d+\.fastq\.bz2_(?P&lt;identifier_1&gt;[^_]+)\.fastqsanger\.bz2" ext="fastqsanger.bz2"/>
        </collection>
        <collection name="output_collection" type="list" label="Single-end data (fastq-dump)">
            <filter>input['input_select'] == "file_list"</filter>
            <!-- Single-end files carry the "__single" tag added by the rename loop in the command. -->
            <discover_datasets pattern="(?P&lt;designation&gt;.+)_\d+\.fastq__single\.fastqsanger" directory="." ext="fastqsanger"/>
            <discover_datasets pattern="(?P&lt;designation&gt;.+)_\d+\.fastq\.gz__single\.fastqsanger\.gz" directory="." ext="fastqsanger.gz"/>
            <discover_datasets pattern="(?P&lt;designation&gt;.+)_\d+\.fastq\.bz2__single\.fastqsanger\.bz2" directory="." ext="fastqsanger.bz2"/>
        </collection>
        <!-- Single-dataset outputs; the datatype follows the selected output format. -->
        <data format="fastqsanger" name="output_accession" label="${input.accession} (fastq-dump)">
            <filter>input['input_select'] == "accession_number"</filter>
            <change_format>
                <when input="outputformat" value="fastqsanger.gz" format="fastqsanger.gz"/>
                <when input="outputformat" value="fastqsanger.bz2" format="fastqsanger.bz2"/>
            </change_format>
        </data>
        <data format="fastqsanger" name="output_file" label="${input.file.name} (fastq-dump)">
            <filter>input['input_select'] == "file"</filter>
            <change_format>
                <when input="outputformat" value="fastqsanger.gz" format="fastqsanger.gz"/>
                <when input="outputformat" value="fastqsanger.bz2" format="fastqsanger.bz2"/>
            </change_format>
        </data>
    </outputs>
    <tests>
        <!-- Single accession, biological reads only. -->
        <test>
            <param name="input_select" value="accession_number"/>
            <param name="outputformat" value="fastqsanger"/>
            <param name="accession" value="SRR044777"/>
            <param name="skip_technical" value="True"/>
            <output name="output_accession">
                <assert_contents>
                    <not_has_text text="rRNA_primer"/>
                    <has_text text="F47USSH02GNP1D"/>
                </assert_contents>
            </output>
        </test>
        <!-- Single accession, gzip output, limited to the first spots. -->
        <test>
            <param name="input_select" value="accession_number"/>
            <param name="outputformat" value="fastqsanger.gz"/>
            <param name="accession" value="SRR925743"/>
            <param name="maxID" value="5"/>
            <output name="output_accession" file="fastq_dump_result.fastq.gz" decompress="True"/>
        </test>
        <!-- Single accession, uncompressed output, limited to the first spots. -->
        <test>
            <param name="input_select" value="accession_number"/>
            <param name="outputformat" value="fastqsanger"/>
            <param name="accession" value="SRR925743"/>
            <param name="maxID" value="5"/>
            <output name="output_accession" file="fastq_dump_result.fastq" ftype="fastqsanger"/>
        </test>
        <!-- Accession list yielding a paired collection. -->
        <test>
            <param name="input_select" value="file_list"/>
            <param name="outputformat" value="fastqsanger"/>
            <param name="file_list" value="list_pe"/>
            <param name="maxID" value="5"/>
            <output_collection name="list_paired" type="list:paired">
                <element name="DRR015708">
                    <element name="forward" file="DRR015708_forward.fastqsanger">
                    </element>
                    <element name="reverse" file="DRR015708_reverse.fastqsanger">
                    </element>
                </element>
            </output_collection>
        </test>
        <test>
            <param name="input_select" value="file_list"/>
            <param name="outputformat" value="fastqsanger"/>
            <param name="file_list" value="list_pe2"/>
            <param name="maxID" value="5"/>
            <output_collection name="list_paired" type="list:paired">
                <element name="ERR027433">
                    <element name="forward" file="ERR027433_forward.fastqsanger">
                    </element>
                    <element name="reverse" file="ERR027433_reverse.fastqsanger">
                    </element>
                </element>
            </output_collection>
        </test>
        <!-- Accession list yielding a single-end collection. -->
        <test>
            <param name="input_select" value="file_list"/>
            <param name="outputformat" value="fastqsanger"/>
            <param name="file_list" value="list_se"/>
            <param name="maxID" value="5"/>
            <output_collection name="output_collection" type="list">
                <element name="SRR1993644" file="SRR1993644.fastqsanger"/>
            </output_collection>
        </test>
    </tests>
    <help><![CDATA[
**What it does?**

This tool extracts data (in fastq_ format) from the Short Read Archive (SRA) at the National Center for Biotechnology Information (NCBI). It is based on the fastq-dump_ utility of the SRA Toolkit.

**How to use it?**

There are three ways in which you can download data:

1. Data for single accession
2. Multiple datasets using a list of accessions
3. Extract data from already uploaded SRA dataset

Below we discuss each in detail.

------

**Uploading data for a single accession**

When you type a single accession number (e.g., `SRR1582967`) into **Accession** box and click **Execute** the tool will fetch data for you. It is important to keep the following in mind:

- if data is paired-ended (or mate-paired) the tool will generate a single *interleaved* dataset, in which forward and reverse mates are alternating (see an example dataset below)
- if data is single ended, a standard single fastq dataset will be produced

-----

**Uploading multiple datasets using a list of accessions**

A more realistic scenario is when you want to upload a number of datasets at once. To do this you need a list of accessions, where there is only one accession per line (see below for information on how to generate such a file). Once you have this file:

1. Upload it into your history using Galaxy's upload tool
2. Once the list of accessions is uploaded choose *List of SRA accessions, one per line* from **select input type** dropdown
3. Choose uploaded file within the **sra accession list** field
4. Click **Execute**

.. class:: warningmark

Fastq datasets produced by this option will be saved in Galaxy's history as a collection_ - a single history element containing multiple datasets. In fact, two collections will be produced: one containing paired-end data and another containing single-end data. Single-end or pair-end collections may be empty if the accessions provided in the list contain only SINGLE or PAIRED data, respectively.

-----

**Extract data from already uploaded SRA dataset**

If an SRA dataset is present in the history, it can be converted into fastq dataset by setting **select input type** drop-down to *SRA archive in current history*. Just like in the case of extracting data for single accession number the following applies:

- if data is paired-ended (or mate-pair) the tool will generate a single *interleaved* dataset, in which forward and reverse mates are alternating (see example below).
- if data is single ended, a standard fastq dataset will be produced

@ACCESSION_LIST_HOWTO@

-----

**Paired-end (and mate-pair) data in fastq format**

Paired end datasets can be represented as two individual datasets:

First dataset::

    @1/1
    AGGGATGTGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTA
    +
    EGGEGGGDFGEEEAEECGDEGGFEEGEFGBEEDDECFEFDD@CDD<ED
    @2/1
    AGGGATGTGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTA
    +
    HHHHHHEGFHEEFEEHEEHHGGEGGGGEFGFGGGGHHHHFBEEEEEFG

Second dataset::

    @1/2
    CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
    +
    GHHHDFDFGFGEGFBGEGGEGEGGGHGFGHFHFHHHHHHHEF?EFEFF
    @2/2
    CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
    +
    HHHHHHHHHHHHHGHHHHHHGHHHHHHHHHHHFHHHFHHHHHHHHHHH

Or a single *interleaved* dataset::

    @1/1
    AGGGATGTGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTA
    +
    EGGEGGGDFGEEEAEECGDEGGFEEGEFGBEEDDECFEFDD@CDD<ED
    @1/2
    CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
    +
    GHHHDFDFGFGEGFBGEGGEGEGGGHGFGHFHFHHHHHHHEF?EFEFF
    @2/1
    AGGGATGTGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTA
    +
    HHHHHHEGFHEEFEEHEEHHGGEGGGGEFGFGGGGHHHHFBEEEEEFG
    @2/2
    CCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAAC
    +
    HHHHHHHHHHHHHGHHHHHHGHHHHHHHHHHHFHHHFHHHHHHHHHHH

----

.. _fastq: https://en.wikipedia.org/wiki/FASTQ_format
.. _fastq-dump: https://ncbi.github.io/sra-tools/fastq-dump.html
.. _collection: https://galaxyproject.org/tutorials/collections/
.. _link: http://www.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=studies

@SRATOOLS_ATTRRIBUTION@
    ]]>
    </help>
    <expand macro="citation"/>
</tool>