<!-- Mercurial > repos > iuc > getorganelle -->

<tool id="get_organelle_from_reads" name="Get organelle from reads" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@">
    <macros>
        <import>macros.xml</import>
        <!-- Seed and gene reference inputs reused by every organelle type.
             The "optional" token is substituted into each param's optional attribute. -->
        <xml name="seed_and_genes" tokens="optional">
            <param argument="-s"
                   type="data"
                   format="fasta"
                   optional="@OPTIONAL@"
                   label="Seed sequence(s)"
                   help="Fasta file to use as initial seed. A seed sequence in GetOrganelle is only used for identifying initial organelle reads."/>
            <param argument="--genes"
                   type="data"
                   format="fasta"
                   optional="@OPTIONAL@"
                   label="Gene sequence(s)"
                   help="Fasta file containing protein coding genes and ribosomal RNAs extracted from a reference genome that you want to assemble."/>
        </xml>
    </macros>
    <!-- Cross-reference to the bio.tools registry entry -->
    <xrefs>
        <xref type="bio.tools">getorganelle</xref>
    </xrefs>
    <!-- Package dependency, pinned to the @TOOL_VERSION@ token -->
    <requirements>
        <requirement version="@TOOL_VERSION@" type="package">getorganelle</requirement>
    </requirements>
    <command detect_errors="exit_code"><![CDATA[
        ## Symlink the input reads into the job directory. Link names are built
        ## from the element identifier: every character that is not a word
        ## character, a dash or a dot is replaced by an underscore, and a
        ## .fastq or .fastq.gz suffix is appended depending on the datatype.
        #import re
        #set ext = '.fastq'
        #if $fastq_input.fastq_input_selector == 'single':
            #if $fastq_input.fastq_input1.ext.endswith('.gz'):
                #set ext = '.fastq.gz'
            #end if
            #set $in1 = $fastq_input.fastq_input1
            #set $in1_name = re.sub('[^\w\-\.]', '_', str($fastq_input.fastq_input1.element_identifier)) + $ext
            ln -s '$in1' '$in1_name' &&
        #else if $fastq_input.fastq_input_selector == 'paired':
            #if $fastq_input.fastq_input1.ext.endswith('.gz'):
                #set ext = '.fastq.gz'
            #end if
            #set $in1 = $fastq_input.fastq_input1
            #set $in2 = $fastq_input.fastq_input2
            #set $in1_name = re.sub('[^\w\-\.]', '_', str($fastq_input.fastq_input1.element_identifier)) + $ext
            #set $in2_name = re.sub('[^\w\-\.]', '_', str($fastq_input.fastq_input2.element_identifier)) + $ext
            ln -s '$in1' '$in1_name' &&
            ln -s '$in2' '$in2_name' &&
        #else if $fastq_input.fastq_input_selector == 'paired_collection':
            #if $fastq_input.paired_input.forward.ext.endswith('.gz'):
                #set ext = '.fastq.gz'
            #end if
            #set $in1 = $fastq_input.paired_input.forward
            #set $in2 = $fastq_input.paired_input.reverse
            ## Both mates share the collection identifier, so R1 and R2 tell them apart
            #set $in1_name = "%s_R1%s" % (re.sub('[^\w\-\.]', '_', str($fastq_input.paired_input.element_identifier)),$ext)
            #set $in2_name = "%s_R2%s" % (re.sub('[^\w\-\.]', '_', str($fastq_input.paired_input.element_identifier)),$ext)
            ln -s '$in1' '$in1_name' &&
            ln -s '$in2' '$in2_name' &&
        #end if
        ## GetOrganelle cmd
        get_organelle_from_reads.py
        #if str($fastq_input.fastq_input_selector) == "single":
            -u '$in1_name'
        #else:
            ## "paired" and "paired_collection" both supply two mate files
            -1 '$in1_name' -2 '$in2_name'
        #end if
        -o results_directory
        ## Seed and gene files are declared mandatory for the "anonym" type and
        ## optional for all other types, so one presence check covers every case.
        -F '$organelle_type_input.F'
        #if $organelle_type_input.s
            -s '$organelle_type_input.s'
        #end if
        #if $organelle_type_input.genes
            --genes '$organelle_type_input.genes'
        #end if
        --config-dir '$config_dir.fields.path'
        #if $settings.advanced == "advanced":
            ## An unset optional integer renders as an empty string. The options
            ## for which 0 is a meaningful value are tested against the empty
            ## string rather than by truthiness, otherwise an explicit 0 would be
            ## silently dropped and the "inf" branches could never be reached.
            #if str($settings.max_reads) != '':
                #if int(str($settings.max_reads)) == 0:
                    --max-reads inf
                #else:
                    --max-reads $settings.max_reads
                #end if
            #end if
            #if str($settings.reduce_reads_for_coverage) != '':
                #if int(str($settings.reduce_reads_for_coverage)) == 0:
                    --reduce-reads-for-coverage inf
                #else:
                    --reduce-reads-for-coverage $settings.reduce_reads_for_coverage
                #end if
            #end if
            ## The remaining integers keep the truthiness test: an unset value
            ## and 0 are both omitted from the command line.
            #if $settings.w:
                -w $settings.w
            #end if
            #if $settings.R:
                -R $settings.R
            #end if
            #if $settings.M:
                -M $settings.M
            #end if
            #if $settings.J:
                -J $settings.J
            #end if
            ## 0 is passed through here: the parameter help documents it as the
            ## way to disable pre-grouping.
            #if str($settings.P) != '':
                -P $settings.P
            #end if
            #if str($settings.max_extending_len) != '':
                #if int(str($settings.max_extending_len)) == 0:
                    --max-extending-len inf
                #else:
                    --max-extending-len $settings.max_extending_len
                #end if
            #end if
            ## Free-text value, therefore quoted
            #if $settings.k:
                -k '$settings.k'
            #end if
            ## Boolean flags expand to the flag itself or to nothing
            $settings.fast
            $settings.memory_save
            $settings.larger_auto_ws
        #end if
    ]]></command>
    <inputs>
        <!-- Read source: one single-end file, two separate mate files, or one paired collection -->
        <conditional name="fastq_input">
            <param name="fastq_input_selector" type="select" label="Single-end or paired reads">
                <option selected="true" value="single">Single-end</option>
                <option value="paired">Paired</option>
                <option value="paired_collection">Paired collection</option>
            </param>
            <when value="single">
                <param name="fastq_input1" type="data" format="fastqsanger,fastqsanger.gz" label="Single-end fastq"/>
            </when>
            <when value="paired">
                <param name="fastq_input1" type="data" format="fastqsanger,fastqsanger.gz" label="Forward fastq"/>
                <param name="fastq_input2" type="data" format="fastqsanger,fastqsanger.gz" label="Reverse fastq"/>
            </when>
            <when value="paired_collection">
                <param name="paired_input" type="data_collection" collection_type="paired" format="fastqsanger,fastqsanger.gz" label="Paired collection fastq"/>
            </when>
        </conditional>
        <!-- Organelle type (-F). Every type offers the seed/gene inputs;
             only "anonym" expands them with optional="false". -->
        <conditional name="organelle_type_input">
            <param argument="-F" type="select" label="Organelle type">
                <option selected="true" value="embplant_pt">Plant chloroplast</option>
                <option value="embplant_mt">Plant mitochondrial</option>
                <option value="embplant_nr">Plant ribosomal</option>
                <option value="fungus_mt">Fungus mitochondrial</option>
                <option value="fungus_nr">Fungus nuclear</option>
                <option value="animal_mt">Animal mitochondrial</option>
                <option value="other_mt">Other mitochondrial</option>
                <option value="anonym">Anonymous</option>
            </param>
            <when value="embplant_pt"><expand macro="seed_and_genes" optional="true"/></when>
            <when value="embplant_mt"><expand macro="seed_and_genes" optional="true"/></when>
            <when value="embplant_nr"><expand macro="seed_and_genes" optional="true"/></when>
            <when value="fungus_mt"><expand macro="seed_and_genes" optional="true"/></when>
            <when value="fungus_nr"><expand macro="seed_and_genes" optional="true"/></when>
            <when value="animal_mt"><expand macro="seed_and_genes" optional="true"/></when>
            <when value="other_mt"><expand macro="seed_and_genes" optional="true"/></when>
            <when value="anonym"><expand macro="seed_and_genes" optional="false"/></when>
        </conditional>
        <!-- Reference data entry taken from the "getorganelle" data table;
             the validator reports an error when that table offers no options. -->
        <param argument="--config-dir"
               type="select"
               label="Reference data"
               help="Contact the administrator of this Galaxy instance if you miss reference data">
            <options from_data_table="getorganelle"/>
            <validator type="no_options" message="No reference annotation is available for getorganelle"/>
        </param>
        <!-- Advanced settings. All parameters in the "advanced" branch are optional.
             The min attributes enforce the lower bounds stated in the help texts:
             0 is accepted where it is documented as INF or as "disabled",
             and 1 is the smallest step for -J and -M. -->
        <conditional name="settings">
            <param name="advanced" type="select" label="Specify advanced parameters">
                <option value="simple" selected="true">No, use program defaults.</option>
                <option value="advanced">Yes, see full parameter list.</option>
            </param>
            <when value="simple"/>
            <when value="advanced">
                <!-- Selects which additional result files are exposed as outputs -->
                <param name="out" type="select" label="Select output files" multiple="true" display="checkboxes">
                    <option value="raw_assembly_graph">Raw assembly graph</option>
                    <option value="simplified_graph">Simplified graph</option>
                    <option value="contig_labels">Contig labels</option>
                    <option value="seed_fastq">Seed fastq</option>
                    <option value="seed_sam">Seed SAM</option>
                    <option value="spades_log">Spades log</option>
                </param>
                <param type="integer" argument="--max-reads" min="0" optional="true" label="Maximum number of reads" help="Maximum number of reads to be used. An input VALUE of 0 will be treated as infinity (INF). Default: 1.5E7 (plant chloroplast, plant ribosomal, fungus mitochondrial, fungus nuclear) 7.5E7 (Plant mitochondrial) 3E8 (Animal mitochondrial)"/>
                <param type="integer" argument="--reduce-reads-for-coverage" min="0" optional="true" label="Reduce reads for coverage" help="Maximum number of reads to be used according to target-hitting base coverage. If the estimated target-hitting base coverage is too high and over this VALUE, GetOrganelle automatically reduce the number of reads to generate a final assembly with base coverage close to this VALUE. This design could greatly save computational resources in many situations. A mean base coverage over 500 is extremely sufficient for most cases. This VALUE must be larger than 10. An input VALUE of 0 will be treated as infinity (INF). Default: 500."/>
                <param type="integer" argument="-w" optional="true" label="Word size" help="Word size for pre-grouping and extending process. This script would try to guess (auto-estimate) a proper word size using an empirical function based on average read length, reads quality, target genome coverage, and other variables that might influence the extending process. Default: estimated automatically"/>
                <param type="integer" argument="-R" optional="true" label="Max rounds" help="Maximum number of extending rounds (suggested: >=2). Default: 15 (-F embplant_pt), 30 (-F embplant_mt/other_pt), 10 (-F embplant_nr/animal_mt/fungus_mt/fungus_nr), inf (-P 0)."/>
                <param type="text" argument="-k" optional="true" label="Spades kmer settings" help="Use the same format as in SPAdes. Illegal kmer values would be automatically discarded by GetOrganelle. Default: 21,55,85,115"/>
                <param type="integer" argument="-J" min="1" optional="true" label="Jump step" help="The length of step for checking words in reads during extending process (integer >= 1). When you have reads of high quality, the larger the number is, the faster the extension will be, the more risk of missing reads in low coverage area. Choose 1 to choose the slowest but safest extension strategy. Default: 3"/>
                <param type="integer" argument="-M" min="1" optional="true" label="Mesh size" help="The length of step for building words from seeds during extending process (integer >= 1). When you have reads of high quality, the larger the number is, the faster the extension will be, the more risk of missing reads in low coverage area. Another usage of this mesh size is to choose a larger mesh size coupled with a smaller word size, which makes smaller word size feasible when memory is limited. Choose 1 to choose the slowest but safest extension strategy. Default: 2"/>
                <param type="integer" argument="-P" min="0" optional="true" label="Pre-grouped" help="The maximum number (integer) of high-covered reads to be pre-grouped before extending process. pre_grouping is suggested when the whole genome coverage is shallow but the organ genome coverage is deep. The default value is 2E5. For personal computer with 8G memory, we suggest no more than 3E5. A larger number (ex. 6E5) would run faster but exhaust memory in the first few minutes. Choose 0 to disable this process."/>
                <param type="integer" argument="--max-extending-len" min="0" optional="true" label="Max extending length" help="Maximum extending length(s) derived from the seed(s). A single value could be a non-negative number. An input VALUE of 0 will be treated as infinity (INF) and is the default. This is designed for properly stopping the extending from getting too long and saving computational resources. However, empirically, a maximum extending length value larger than 6000 would not be helpful for saving computational resources. This value would not be precise in controlling output size, especially when pre-group (followed by '-P') is turn on. A sequence of a closely related species would be preferred for estimating a better maximum extending length value. If you are using limited loci, e.g. rbcL gene as the seed for assembling the whole plastome (with extending length ca. 75000 >> 6000), you should set maximum extending length to inf. Default: inf."/>
                <param type="boolean" argument="--fast" optional="true" label="Fast" truevalue="--fast" falsevalue="" help="='-R 10 -t 4 -J 5 -M 7 --max-n-words 3E7 --larger-auto-ws --disentangle-time-limit 360' This option is suggested for homogeneously and highly covered data (very fine data). You can overwrite the value of a specific option listed above by adding that option along with the '--fast' flag. You could try GetOrganelle with this option for a list of samples and run a second time without this option for the rest with incomplete results."/>
                <param type="boolean" argument="--memory-save" optional="true" label="Memory save" truevalue="--memory-save" falsevalue="" help="='--out-per-round -P 0 --remove-duplicates 0' You can overwrite the value of a specific option listed above by adding that option along with the '--memory-save' flag. A larger '-R' value is suggested when '--memory-save' is chosen."/>
                <param type="boolean" argument="--larger-auto-ws" optional="true" label="Larger auto ws" truevalue="--larger-auto-ws" falsevalue="" help="By using this flag, the empirical function for estimating W would tend to produce a relative larger W, which would speed up the matching in extending, reduce the memory cost in extending, but increase the risk of broken final graph. Suggested when the data is good with high and homogenous coverage."/>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <!-- Always produced: log file, path sequences and the selected graph -->
        <data format="txt" name="logfile" from_work_dir="results_directory/get_org.log.txt" label='${tool.name} on ${on_string}: Logfile'/>
        <collection name="path_sequence_list" type="list" label='${tool.name} on ${on_string}: Path sequences'>
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.path_sequence.fasta" format="fasta" directory="results_directory" />
        </collection>
        <data format="gfa1" name="selected_graph" from_work_dir="results_directory/*.selected_graph.gfa" label='${tool.name} on ${on_string}: Selected graph'/>
        <!-- Optional outputs, created only when ticked under the advanced settings.
             The "settings['out'] and" guard covers the case where no box is ticked:
             the selection is then empty, the membership test would raise, and a
             filter that raises does not hide its output. -->
        <data format="fastg" name="raw_assembly_graph" from_work_dir="results_directory/extended_K*.assembly_graph.fastg" label='${tool.name} on ${on_string}: Raw assembly graph'>
            <filter>settings['advanced'] == 'advanced' and settings['out'] and "raw_assembly_graph" in settings['out']</filter>
        </data>
        <data format="fastg" name="simplified_graph" from_work_dir="results_directory/extended_K*.assembly_graph.fastg.extend-*.fastg" label='${tool.name} on ${on_string}: Simplified graph'>
            <filter>settings['advanced'] == 'advanced' and settings['out'] and "simplified_graph" in settings['out']</filter>
        </data>
        <data format="tsv" name="assembly_csv" from_work_dir="results_directory/extended_K*.assembly_graph.fastg.extend-*.csv" label='${tool.name} on ${on_string}: Contig labels'>
            <filter>settings['advanced'] == 'advanced' and settings['out'] and "contig_labels" in settings['out']</filter>
        </data>
        <data format="fastq" name="seed_fastq" from_work_dir="results_directory/seed/*.fq" label='${tool.name} on ${on_string}: Seed fastq'>
            <filter>settings['advanced'] == 'advanced' and settings['out'] and "seed_fastq" in settings['out']</filter>
        </data>
        <data format="sam" name="seed_sam" from_work_dir="results_directory/seed/*.sam" label='${tool.name} on ${on_string}: Seed sam'>
            <filter>settings['advanced'] == 'advanced' and settings['out'] and "seed_sam" in settings['out']</filter>
        </data>
        <data format="txt" name="spades_log" from_work_dir="results_directory/extended_spades/spades.log" label='${tool.name} on ${on_string}: Spades log'>
            <filter>settings['advanced'] == 'advanced' and settings['out'] and "spades_log" in settings['out']</filter>
        </data>
    </outputs>
    <tests>
        <!-- Paired files with advanced settings: five optional outputs are
             requested on top of the three that are always produced -->
        <test expect_num_outputs="8">
            <param name="config_dir" value="getorganelle_refdata"/>
            <conditional name="fastq_input">
                <param name="fastq_input_selector" value="paired"/>
                <param name="fastq_input1" value="Arabidopsis_simulated.10k.1.fq.gz" ftype="fastqsanger.gz"/>
                <param name="fastq_input2" value="Arabidopsis_simulated.10k.2.fq.gz" ftype="fastqsanger.gz"/>
            </conditional>
            <conditional name="organelle_type_input">
                <param name="F" value="embplant_pt"/>
            </conditional>
            <conditional name="settings">
                <param name="advanced" value="advanced"/>
                <param name="out" value="raw_assembly_graph,simplified_graph,contig_labels,seed_fastq,seed_sam"/>
            </conditional>
            <assert_stdout>
                <has_text text="Thank you!" />
            </assert_stdout>
            <output name="logfile" >
                <assert_contents>
                    <has_text text="Thank you!" />
                </assert_contents>
            </output>
            <output_collection name="path_sequence_list" type="list" count="2">
                <element name="embplant_pt.K115.complete.graph1.1" ftype="fasta">
                    <assert_contents>
                        <has_n_lines n="2"/>
                    </assert_contents>
                </element>
                <element name="embplant_pt.K115.complete.graph1.2" ftype="fasta">
                    <assert_contents>
                        <has_n_lines n="2"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output name="selected_graph" >
                <assert_contents>
                    <has_n_lines n="7" />
                </assert_contents>
            </output>
            <output name="raw_assembly_graph" >
                <assert_contents>
                    <has_text text=">EDGE" />
                </assert_contents>
            </output>
            <output name="simplified_graph" >
                <assert_contents>
                    <has_text text=">EDGE" />
                </assert_contents>
            </output>
            <output name="assembly_csv" >
                <assert_contents>
                    <has_text text="EDGE" />
                </assert_contents>
            </output>
            <output name="seed_fastq" >
                <assert_contents>
                    <has_text text="@" />
                </assert_contents>
            </output>
            <output name="seed_sam" >
                <assert_contents>
                    <has_text text="@HD" />
                </assert_contents>
            </output>
        </test>
        <!-- Ensure paired collection works; also check output filters -->
        <test expect_num_outputs="3">
            <param name="config_dir" value="getorganelle_refdata"/>
            <conditional name="fastq_input">
                <param name="fastq_input_selector" value="paired_collection"/>
                <param name="paired_input">
                    <collection type="paired">
                        <element name="forward" value="Arabidopsis_simulated.10k.1.fq.gz" ftype="fastqsanger.gz" />
                        <element name="reverse" value="Arabidopsis_simulated.10k.2.fq.gz" ftype="fastqsanger.gz" />
                    </collection>
                </param>
            </conditional>
            <conditional name="organelle_type_input">
                <param name="F" value="embplant_pt"/>
            </conditional>
            <conditional name="settings">
                <param name="advanced" value="simple"/>
            </conditional>
            <assert_stdout>
                <has_text text="Thank you!" />
            </assert_stdout>
            <output name="logfile" >
                <assert_contents>
                    <has_text text="Thank you!" />
                </assert_contents>
            </output>
            <output_collection name="path_sequence_list" type="list" count="2">
                <element name="embplant_pt.K115.complete.graph1.1" ftype="fasta">
                    <assert_contents>
                        <has_n_lines n="2"/>
                    </assert_contents>
                </element>
                <element name="embplant_pt.K115.complete.graph1.2" ftype="fasta">
                    <assert_contents>
                        <has_n_lines n="2"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output name="selected_graph" >
                <assert_contents>
                    <has_n_lines n="7" />
                </assert_contents>
            </output>
        </test>
    </tests>
    <!-- User-facing help text; the CDATA content is left untouched -->
    <help><![CDATA[
        GetOrganelle assembles organelle genomes from genome skimming data.
        See https://github.com/Kinggerm/GetOrganelle for more information.
        ]]></help>
    <!-- DOI cited for this tool -->
    <citations>
        <citation type="doi">10.1186/s13059-020-02154-5</citation>
    </citations>
</tool>
<!--
author	iuc
date	Tue, 16 May 2023 09:45:40 +0000
parents	06bcf65179fb
children
-->