<!--
view fasterq_dump.xml @ 40:8848455c0270 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/sra-tools commit 7632986831e24fcd189a57d824b9e6b1d5dc5c4c
author iuc
date Mon, 26 Aug 2024 14:59:28 +0000
parents f8054ea1c365
children
line wrap: on
line source
-->

<tool id="fasterq_dump" name="Faster Download and Extract Reads in FASTQ" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <!-- Shown after the tool name, completing it as a sentence. -->
    <description>format from NCBI SRA</description>
    <!-- The macros and tokens referenced throughout this wrapper are imported from macros.xml. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="edam_ontology"/>
    <expand macro="bio_tools"/>
    <expand macro="requirements"/>
    <!-- Report the fasterq-dump version with newline characters stripped from the output. -->
    <version_command>fasterq-dump --version | tr -d $'\n'</version_command>
    <command detect_errors="exit_code"><![CDATA[
    ## Dump the requested reads to FASTQ with fasterq-dump, retrying on failure,
    ## then gzip the results into the directories scanned by the output
    ## collections: "output" (paired and single-end data) and "outputOther"
    ## (everything else).
    ## NOTE(review): the per-accession loop and the retry counters are presumably
    ## set up by the SET_ACCESSIONS and CONFIGURE_RETRY tokens from macros.xml;
    ## the loop is closed near the end of this block. Confirm against the macros.

    ## Enable pipefail when the shell supports it, so that the status of the
    ## fasterq-dump | tee pipeline reflects a fasterq-dump failure.
    set -o | grep -q pipefail && set -o pipefail;
    @COPY_CONFIGFILE@
    @CONFIGURE_RETRY@
    @SET_ACCESSIONS@
    while [ \$SRA_PREFETCH_ATTEMPT -le \$SRA_PREFETCH_RETRIES ] ; do
        fasterq-dump "\$acc"
        -e \${GALAXY_SLOTS:-1}
        -t "\${TMPDIR}"
        --seq-defline '$adv.seq_defline'
        --qual-defline '+'
        $adv.split
        #if str( $adv.minlen ) != "":
            --min-read-len "$adv.minlen"
        #end if
        $adv.skip_technical 2>&1 | tee -a '$log';
        ## Save the pipeline status immediately: the test commands below
        ## overwrite the shell status, so reading it again in the else branch
        ## would report the result of the test instead of the dump.
        dump_status=\$? ;
        ## An attempt counts as successful only when the dump exited cleanly
        ## and at least one FASTQ file was written.
        if [ "\$dump_status" -eq 0 ] && [ \$(ls *.fastq | wc -l) -ge 1 ]; then
            break ;
        else
            echo "Prefetch attempt \$SRA_PREFETCH_ATTEMPT of \$SRA_PREFETCH_RETRIES exited with code \$dump_status" ;
            SRA_PREFETCH_ATTEMPT=`expr \$SRA_PREFETCH_ATTEMPT + 1` ;
            sleep 1 ;
        fi ;
    done &&
    mkdir -p output &&
    mkdir -p outputOther &&
    count="\$(ls *.fastq | wc -l)" &&
    echo "There are \$count fastq files" &&
    data=(\$(ls *.fastq)) &&
    ## Route the FASTQ files by how many were produced and by the split mode.
    if [ "\$count" -eq 1 ]; then
        ## A single file is reported as single-end data.
        @COMPRESS@ "\${data[0]}" > output/"\${acc}"__single.fastqsanger.gz &&
        rm "\${data[0]}";
    elif [ "$adv.split" = "--split-3" ]; then
        ## split-3: the optional unsuffixed file goes to the other collection,
        ## the _1 and _2 files become the forward and reverse mates.
        if [ -e "\${acc}".fastq ]; then
            @COMPRESS@ "\${acc}".fastq > outputOther/"\${acc}"__single.fastqsanger.gz;
        fi &&
        @COMPRESS@ "\${acc}"_1.fastq > output/"\${acc}"_forward.fastqsanger.gz &&
        @COMPRESS@ "\${acc}"_2.fastq > output/"\${acc}"_reverse.fastqsanger.gz &&
        rm "\${acc}"*.fastq;
    elif [ "\$count" -eq 2 ]; then
        ## Two files are treated as a pair only when technical reads were
        ## skipped; otherwise they are kept under their own names in the other
        ## collection (appending "sanger.gz" turns ".fastq" into ".fastqsanger.gz").
        #if $adv.skip_technical:
            @COMPRESS@ "\${data[0]}" > output/"\${acc}"_forward.fastqsanger.gz &&
            @COMPRESS@ "\${data[1]}" > output/"\${acc}"_reverse.fastqsanger.gz &&
        #else
            @COMPRESS@ "\${data[0]}" > outputOther/"\${data[0]}"sanger.gz &&
            @COMPRESS@ "\${data[1]}" > outputOther/"\${data[1]}"sanger.gz &&
        #end if
        rm "\${data[0]}" &&
        rm "\${data[1]}";
    else
        ## Any other number of files: keep each one under its own name.
        for file in "\${data[@]}"; do
            @COMPRESS@ "\$file" > outputOther/"\$file"sanger.gz &&
            rm "\$file";
        done;
    fi;
    
    ## Close the constructs opened for the accession loop; nothing is closed
    ## when an SRA file was selected as input.
    #if $input.input_select != "sra_file":
        ); done;
    #end if
    echo "Done with all accessions."
    ]]>
    </command>
    <expand macro="configfile_hack"/>
    <inputs>
        <!-- Input selection is expanded from the input_conditional macro. -->
        <expand macro="input_conditional"/>
        <section name="adv"
                 title="Advanced Options"
                 expanded="False">
            <!-- Presumably defines the seq_defline parameter read by the command; confirm in macros.xml. -->
            <expand macro="defline"
                    defline_param="--seq-defline"
                    defline_default="@$ac.$sn/$ri"/>
            <!-- Optional: the command passes the minimum read length option only when a value is set. -->
            <param name="minlen"
                   argument="--min-read-len"
                   type="integer"
                   optional="true"
                   label="Minimum read length"
                   help="Filter by sequence length. Will dump only reads longer or equal to this value."/>
            <!-- The selected value is inserted verbatim into the fasterq-dump command line. -->
            <param name="split"
                   type="select"
                   display="radio"
                   label="Select how to split the spots"
                   help="This option will only be used when there are multiple reads per spot (for example paired-end).">
                <option value="--split-3">--split-3: write properly paired biological reads into different files and single reads in another file</option>
                <option value="--split-files">--split-files: write reads into different files (forward and reverse may not match if one read is empty)</option>
                <option value="--split-spot">--split-spot: split spots into reads (only one output file)</option>
                <option value="--concatenate-reads">--concatenate-reads: writes whole spots into one file</option>
            </param>
            <!-- Checked by default; each state maps to its own command-line flag. -->
            <param name="skip_technical"
                   argument="--skip-technical/--include-technical"
                   type="boolean"
                   truevalue="--skip-technical"
                   falsevalue="--include-technical"
                   checked="True"
                   label="Dump only biological reads"
                   help="Will not be used if --split-3 is selected."/>
        </section>
    </inputs>
    <outputs>
        <!-- Combined stdout/stderr of every fasterq-dump attempt (the command appends to it with tee). -->
        <data name="log" format="txt" label="fasterq-dump log"/>
        <collection name="list_paired" type="list:paired" label="Pair-end data (fasterq-dump)">
            <!-- Named regex groups pick up files called
                 <identifier_0>_<identifier_1>.fastqsanger.gz from the output
                 directory. identifier_0 is the list identifier in the nested
                 collection and identifier_1 is either forward or reverse
                 (for instance samp1_forward.fastqsanger.gz).
            -->
            <discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_(?P&lt;identifier_1&gt;[^_]+)\.fastqsanger\.gz" directory="output" ext="fastqsanger.gz"/>
        </collection>
        <!-- Files named <designation>__single.fastqsanger.gz in the output directory. -->
        <collection name="output_collection" type="list" label="Single-end data (fasterq-dump)">
            <discover_datasets pattern="(?P&lt;designation&gt;.+)__single\.fastqsanger\.gz" directory="output" ext="fastqsanger.gz"/>
        </collection>
        <!-- Every .fastqsanger.gz file written to the outputOther directory. -->
        <collection name="output_collection_other" type="list" label="Other data (fasterq-dump)">
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.fastqsanger\.gz" directory="outputOther" ext="fastqsanger.gz"/>
        </collection>
    </outputs>
    <tests>
        <!-- Single accession with default advanced options: expects one forward/reverse pair in list_paired. -->
        <test expect_num_outputs="4">
            <param name="input_select" value="accession_number"/>
            <param name="accession" value="ERR086330"/>
            <output_collection name="list_paired" type="list:paired" count="1">
                <element name="ERR086330">
                    <element name="forward" file="ERR086330_1.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
                    <element name="reverse" file="ERR086330_2.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
                </element>
            </output_collection>
        </test>
        <!-- Single accession in split-files mode with skip_technical set to False: expects both files in the other collection. -->
        <test expect_num_outputs="4">
            <param name="input_select" value="accession_number"/>
            <param name="accession" value="SRR002702"/>
            <param name="split" value="--split-files"/>
            <param name="skip_technical" value="False"/>
            <output_collection name="output_collection_other" type="list" count="2">
                <element name="SRR002702_1" file="SRR002702_1.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
                <element name="SRR002702_2" file="SRR002702_2.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
            </output_collection>
        </test>
        <!-- Two comma-separated accessions with default options: expects one pair per accession. -->
        <test expect_num_outputs="4">
            <param name="input_select" value="accession_number"/>
            <param name="accession" value="ERR086330, SRR11953971"/>
            <output_collection name="list_paired" type="list:paired" count="2">
                <element name="ERR086330">
                    <element name="forward" file="ERR086330_1.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
                    <element name="reverse" file="ERR086330_2.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
                </element>
                <element name="SRR11953971">
                    <element name="forward" file="SRR11953971_1.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
                    <element name="reverse" file="SRR11953971_2.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
                </element>
            </output_collection>
        </test>
        <!-- SRA file input in split-files mode with skip_technical set to True: expects one pair, compared against the _2 and _4 reference files. -->
        <test expect_num_outputs="4">
            <param name="input_select" value="sra_file"/>
            <param name="sra_file" value="SRR522874.sra"/>
            <param name="split" value="--split-files"/>
            <param name="skip_technical" value="True"/>
            <output_collection name="list_paired" type="list:paired" count="1">
                <element name="SRR522874.sra">
                    <element name="forward" file="SRR522874.sra_2.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
                    <element name="reverse" file="SRR522874.sra_4.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
                </element>
            </output_collection>
        </test>
        <!-- Same SRA file with skip_technical set to False: expects four elements in the other collection. -->
        <test expect_num_outputs="4">
            <param name="input_select" value="sra_file"/>
            <param name="sra_file" value="SRR522874.sra"/>
            <param name="split" value="--split-files"/>
            <param name="skip_technical" value="False"/>
            <output_collection name="output_collection_other" type="list" count="4">
                <element name="SRR522874_1" file="SRR522874.sra_1.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
                <element name="SRR522874_2" file="SRR522874.sra_2.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
                <element name="SRR522874_3" file="SRR522874.sra_3.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
                <element name="SRR522874_4" file="SRR522874.sra_4.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
            </output_collection>
        </test>
        <!-- Accession list file with minlen set to 21: expects one element in each of the three collections. -->
        <test expect_num_outputs="4">
            <param name="input_select" value="file_list"/>
            <param name="file_list" value="list_sra"/>
            <param name="minlen" value="21"/>
            <output_collection name="output_collection_other" type="list" count="1">
                <element name="SRR522874__single" file="SRR522874.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
            </output_collection>
            <output_collection name="list_paired" type="list:paired" count="1">
                <element name="SRR522874">
                    <element name="forward" file="SRR522874_1.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
                    <element name="reverse" file="SRR522874_2.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
                </element>
            </output_collection>
            <output_collection name="output_collection" type="list" count="1">
                <element name="SRR002702" file="SRR002702_2.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
            </output_collection>
        </test>
        <!-- Accession list supplied with the sra_manifest.tabular datatype: expects one pair. -->
        <test expect_num_outputs="4">
            <param name="input_select" value="file_list"/>
            <param name="file_list" value="sra_manifest.tabular" ftype="sra_manifest.tabular"/>
            <output_collection name="list_paired" type="list:paired" count="1">
                <element name="SRR11953971">
                    <element name="forward" file="SRR11953971_1.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
                    <element name="reverse" file="SRR11953971_2.fastq.gz" ftype="fastqsanger.gz" decompress="True"/>
                </element>
            </output_collection>
        </test>
        <!-- Same manifest with a custom seq_defline: only a minimum output size is asserted for each mate. -->
        <test expect_num_outputs="4">
            <param name="input_select" value="file_list"/>
            <param name="file_list" value="sra_manifest.tabular" ftype="sra_manifest.tabular"/>
            <section name="adv">
                <param name="seq_defline" value="@$ac.$si/$ri $sn length=$rl"/>
            </section>
            <output_collection name="list_paired" type="list:paired" count="1">
                <element name="SRR11953971">
                    <element name="forward" ftype="fastqsanger.gz" decompress="True">
                        <assert_contents>
                            <!-- TODO replace has_size by has_line -->
                            <has_size min="56206"/>
                            <!-- <has_line line="@SRR11953971.2053/1 2053 length=101"/> -->
                        </assert_contents>
                    </element>
                    <element name="reverse" ftype="fastqsanger.gz" decompress="True">
                        <assert_contents>
                            <has_size min="59843"/>
                            <!-- <has_line line="@SRR11953971.2053/2 2053 length=101"/> -->
                        </assert_contents>
                    </element>
                </element>
            </output_collection>
        </test>
    </tests>
    <help><![CDATA[
**What it does**

This tool extracts data (in fastq_ format) from the Short Read Archive (SRA) at the National Center for Biotechnology Information (NCBI). It is based on the fasterq-dump_ utility of the SRA Toolkit.  The following applies:

 - if data is paired-end (or mate-pair), the tool will generate a collection of file pairs, in which each element will be a pair of fastq_ files containing forward and reverse mates.
 - if data is single-end, each element of the collection will be a single fastq_ dataset.


@HOW_TO_USE_IT@

-----

**Output**

In every case, the fastq datasets produced will be saved in Galaxy's history as a collection_ - a single history element containing multiple datasets. In fact, regardless of the experimental design, three collections will be produced: one containing paired-end data, another containing single-end data, and a third one containing reads which could not be classified.
Some collections may be empty if the accessions provided in the list do not contain one of the types of data.

.. class:: warningmark

When you decide to dump technical reads (i.e. *Dump only biological reads* is set to *No* in Advanced Options), you will probably find your PAIRED data in the other data collection, as it is impossible to determine whether the two reads are both biological or one biological and one technical.

.. class:: warningmark

By default, only biological reads are dumped, and in the case of a PAIRED dataset only the spots which have both reads will be in the paired-end collection. The remaining single reads will be in the other collection.
To keep all reads, and potentially not have the same number of reads in forward and reverse, use the --split-files option under *Select how to split the spots* in Advanced Options.

@ACCESSION_LIST_HOWTO@

-----


.. _fastq: https://en.wikipedia.org/wiki/FASTQ_format
.. _fasterq-dump: https://github.com/ncbi/sra-tools/wiki/HowTo:-fasterq-dump
.. _collection: https://galaxyproject.org/tutorials/collections/
.. _link: https://trace.ncbi.nlm.nih.gov/Traces/index.html?view=run_browser&display=reads

@SRATOOLS_ATTRRIBUTION@
]]>
    </help>
    <expand macro="citation"/>
</tool>