diff gfastats.xml @ 0:5799092ffdff draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/gfastats commit 115f451c7c9e7e30fd1b8df26bfc5362832a6eb7"
author bgruening
date Wed, 09 Mar 2022 10:29:20 +0000
parents
children 2b8b4cacb83d
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gfastats.xml	Wed Mar 09 10:29:20 2022 +0000
@@ -0,0 +1,339 @@
+<tool id="gfastats" name="gfastats" version="@TOOL_VERSION@+galaxy@SUFFIX_VERSION@" profile="20.01">
+    <description>the swiss army knife for genome assembly</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <expand macro="biotools"/>
+    <version_command>gfastats --version</version_command>
+    <command detect_errors="exit_code"><![CDATA[
+        ## Positional arguments come first: the input file, then the optional
+        ## expected genome size (assembly report only), then the optional target.
+        gfastats
+        '$input_file'
+        #if $mode_condition.selector == 'statistics'
+            #if $mode_condition.statistics_condition.selector == 'assembly'
+                $mode_condition.statistics_condition.expected_genomesize
+            #end if
+        #end if
+        #if $target_condition.target_option == 'true'
+            ## Quoted because the sanitizer lets spaces and '|' through; unquoted,
+            ## the shell would split the value or start a pipeline.
+            #if $target_condition.target_sequence
+                '$target_condition.target_sequence'
+            #end if
+            #if $target_condition.include_bed
+                --include-bed '$target_condition.include_bed'
+            #end if
+            #if $target_condition.exclude_bed
+                --exclude-bed '$target_condition.exclude_bed'
+            #end if
+        #end if
+        #if $mode_condition.selector == 'manipulation'
+            #if $mode_condition.swiss_army_knife
+                -k '$mode_condition.swiss_army_knife'
+            #end if
+            #if $mode_condition.sort
+                --sort $mode_condition.sort
+            #end if
+            $mode_condition.homopolymer_compress
+            ## The file extension passed to -o selects the output format.
+            -o dataset.$mode_condition.output_condition.out_format
+            ## line_length is only defined for the two FASTA output formats.
+            #if $mode_condition.output_condition.out_format == 'fasta' or $mode_condition.output_condition.out_format == 'fasta.gz'
+                #if $mode_condition.output_condition.line_length
+                    --line-length $mode_condition.output_condition.line_length
+                #end if
+            #end if
+            ## Give the result a fixed name so the output can be collected via from_work_dir.
+            && mv dataset* output_dataset
+        #else
+            #if $mode_condition.statistics_condition.selector == 'size'
+                --out-size $mode_condition.statistics_condition.out_size
+            #else if $mode_condition.statistics_condition.selector == 'coordinates'
+                --out-coord $mode_condition.statistics_condition.out_coord
+            #else if $mode_condition.statistics_condition.selector == 'assembly'
+                --nstar-report
+            #else
+                --seq-report
+                $mode_condition.statistics_condition.out_sequence
+            #end if
+            ## Statistics are written to stdout and captured in the stats dataset.
+            $mode_condition.tabular > '$stats'
+        #end if
+    ]]></command>
+    <inputs>
+        <!-- Main input dataset; each accepted format is listed in plain and gzip-compressed form. -->
+        <param name="input_file" argument="--fasta" type="data" label="Input file"
+            format="fasta,fastq,fastqsanger,gfa1,fasta.gz,fastq.gz,fastqsanger.gz,gfa1.gz"/>
+        <!-- Optional restriction of the run to a target sequence and/or BED intervals. -->
+        <conditional name="target_condition">
+            <param name="target_option" type="select" label="Specify target sequences">
+                <option value="false">Disabled</option>
+                <option value="true">Enabled</option>
+            </param>
+            <when value="false"/>
+            <when value="true">
+                <param name="target_sequence" type="text" value="" label="Target sequence" help="Target specific sequence by header, optionally with coordinates: header[:start-end]">
+                    <sanitizer invalid_char="">
+                        <valid initial="string.digits,string.letters">
+                            <add value=":"/>
+                            <add value="-"/>
+                            <add value="_"/>
+                            <add value="|"/>
+                            <add value=" "/>
+                        </valid>
+                    </sanitizer>
+                    <!-- The hyphen is last in the class so it is a literal, not a range operator;
+                         the anchors make the whole value match the sanitizer's whitelist. -->
+                    <validator type="regex" message="Only letters, digits, ':', '-', '_', '|' and spaces are allowed">^[0-9A-Za-z:_| -]+$</validator>
+                </param>
+                <param argument="--include-bed" type="data" optional="true"
+                    format="bed" label="Include specific intervals"
+                    help="Generates output on a subset list of headers or coordinates in 0-based bed format. It can be combined with --exclude-bed. Optional"/>
+                <param argument="--exclude-bed" type="data" format="bed" optional="true"
+                    label="Exclude specific intervals"
+                    help="Exclude a subset of headers or coordinates in 0-based bed format. It can be combined with --include-bed. Optional"/>
+            </when>
+        </conditional>
+        <!-- Top-level switch between reporting statistics and rewriting the assembly. -->
+        <conditional name="mode_condition">
+            <param name="selector" type="select" label="Tool mode">
+                <option value="statistics">Summary statistics generation</option>
+                <option value="manipulation">Genome assembly manipulation</option>
+            </param>
+            <when value="manipulation">
+                <param argument="--swiss-army-knife" type="data"
+                    format="txt" label="SAK input file" optional="true"
+                    help="Set of instructions provided as an ordered list"/>
+                <!-- The chosen value is used as the output file extension in the command. -->
+                <conditional name="output_condition">
+                    <param argument="--out-format" type="select"
+                        label="Output format" help="Outputs selected sequences.">
+                        <option value="fasta">FASTA</option>
+                        <option value="fasta.gz">FASTA.gz</option>
+                        <option value="fastq">FASTQ</option>
+                        <option value="fastq.gz" selected="true">FASTQ.gz</option>
+                        <option value="gfa">GFA</option>
+                        <option value="gfa.gz">GFA.gz</option>
+                    </param>
+                    <!-- Only the FASTA formats expose the length_macro parameters. -->
+                    <when value="fasta">
+                        <expand macro="length_macro"/>
+                    </when>
+                    <when value="fasta.gz">
+                        <expand macro="length_macro"/>
+                    </when>
+                    <when value="fastq"/>
+                    <when value="fastq.gz"/>
+                    <when value="gfa"/>
+                    <when value="gfa.gz"/>
+                </conditional>
+                <!-- The empty value means no sort flag is added to the command. -->
+                <param argument="--sort" type="select" label="Sort sequences" help="Specify how to sort the sequences. Ascending/descending use the sequence/path header.">
+                    <option value="" selected="true">Disabled</option>
+                    <option value="ascending">Ascending</option>
+                    <option value="descending">Descending</option>
+                    <option value="largest">Largest</option>
+                    <option value="smallest">Smallest</option>
+                </param>
+                <param argument="--homopolymer-compress" type="boolean" truevalue="--homopolymer-compress" falsevalue="" checked="false"
+                    label="Homopolymer compression" help="Compress all the homopolymers in the input"/>
+            </when>
+            <when value="statistics">
+                <!-- Each report mode maps to one gfastats report flag in the command. -->
+                <conditional name="statistics_condition">
+                    <param name="selector" type="select" label="Report mode">
+                        <option value="assembly" selected="true">Genome assembly statistics (--nstar-report)</option>
+                        <option value="size">Scaffold, contig or gap sizes (--out-size)</option>
+                        <option value="coordinates">AGP, contig or gap coordinates (--out-coord)</option>
+                        <option value="sequence">Sequence statistics (--seq-report)</option>
+                    </param>
+                    <when value="size">
+                        <param argument="--out-size" type="select" label="Feature for reporting sizes"
+                            help="Generate a tabular file with the sequence sizes">
+                            <option value="s">Scaffolds</option>
+                            <option value="c">Contigs</option>
+                            <option value="g">Gaps</option>
+                        </param>
+                    </when>
+                    <when value="coordinates">
+                        <param argument="--out-coord" type="select" label="BED coordinates feature"
+                            help="Generates bed coordinates of given feature. Default: agp">
+                            <option value="a">AGP</option>
+                            <option value="c">Contigs</option>
+                            <option value="g">Gaps</option>
+                        </param>
+                    </when>
+                    <when value="assembly">
+                        <!-- Passed to gfastats as a positional argument right after the input file. -->
+                        <param name="expected_genomesize" type="integer" min="0" optional="true"
+                            label="Expected genome size" help="Estimated genome size. This parameter is optional, but required for NG* statistics."/>
+                    </when>
+                    <when value="sequence">
+                        <param argument="--out-sequence" type="boolean" truevalue="--out-sequence" falsevalue="" checked="false"
+                            label="Report actual sequence" help="It reports also the actual sequence"/>
+                    </when>
+                </conditional>
+                <param argument="--tabular" type="boolean" truevalue="--tabular" falsevalue="" checked="true"
+                    label="Tabular-format output" help="Generate output in tabular format"/>
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <!-- Statistics report captured from stdout; only produced in statistics mode. -->
+        <data name="stats" format="tabular" label="${tool.name} on ${on_string}: stats">
+            <filter>mode_condition['selector'] == 'statistics'</filter>
+            <change_format>
+                <!-- NOTE(review): "tabular" is declared inside the mode_condition conditional and is a
+                     boolean whose falsevalue is the empty string. Confirm that this unqualified
+                     reference and the literal "false" can ever match; Test 07 currently expects the
+                     output to remain tabular when the option is switched off. -->
+                <when input="tabular" value="false" format="txt"/>
+            </change_format>
+        </data>
+        <!-- Rewritten assembly, picked up from the work dir; the datatype follows the chosen out_format. -->
+        <data name="output" format="fastq" from_work_dir="output_dataset" label="${tool.name} on ${on_string}: edited sequences">
+            <filter>mode_condition['selector'] == 'manipulation'</filter>
+            <change_format>
+                <when input="mode_condition.output_condition.out_format" value="fasta" format="fasta"/>
+                <when input="mode_condition.output_condition.out_format" value="fasta.gz" format="fasta.gz"/>
+                <when input="mode_condition.output_condition.out_format" value="fastq" format="fastq"/>
+                <when input="mode_condition.output_condition.out_format" value="fastq.gz" format="fastq.gz"/>
+                <when input="mode_condition.output_condition.out_format" value="gfa" format="gfa1"/>
+                <when input="mode_condition.output_condition.out_format" value="gfa.gz" format="gfa1.gz"/>
+            </change_format>
+        </data>
+    </outputs>
+    <tests>
+        <!-- Test 01: manipulation mode with a target sequence and a SAK instruction file, FASTA.gz output.
+             The selector of target_condition is target_option; setting it is what enables targeting.
+             NOTE(review): regenerate test_01.fasta.gz if it was produced while targeting was not applied. -->
+        <test expect_num_outputs="1">
+            <param name="input_file" value="dataset_01.fastq.gz"/>
+            <conditional name="target_condition">
+                <param name="target_option" value="true"/>
+                <param name="target_sequence" value="S1_1"/>
+            </conditional>
+            <conditional name="mode_condition">
+                <param name="selector" value="manipulation"/>
+                <param name="swiss_army_knife" value="swiss_army.sak"/>
+                <conditional name="output_condition">
+                    <param name="out_format" value="fasta.gz"/>
+                </conditional>
+            </conditional>
+            <output name="output" value="test_01.fasta.gz" ftype="fasta.gz"/>
+        </test>
+        <!-- Test 02: contig size report restricted to a target sequence.
+             The selector of target_condition is target_option; setting it is what enables targeting.
+             NOTE(review): regenerate test_02_stats.tabular if it was produced while targeting was not applied. -->
+        <test expect_num_outputs="1">
+            <param name="input_file" value="dataset_01.fastq.gz"/>
+            <conditional name="target_condition">
+                <param name="target_option" value="true"/>
+                <param name="target_sequence" value="S1_1"/>
+            </conditional>
+            <conditional name="mode_condition">
+                <param name="selector" value="statistics"/>
+                <conditional name="statistics_condition">
+                    <param name="selector" value="size"/>
+                    <param name="out_size" value="c"/>
+                </conditional>
+            </conditional>
+            <output name="stats" value="test_02_stats.tabular" ftype="tabular"/>
+        </test>
+        <!-- Test 03: per-sequence statistics report on a gzip-compressed FASTA input, other options left at their defaults. -->
+        <test expect_num_outputs="1">
+            <param name="input_file" value="dataset_02.fasta.gz"/>
+            <conditional name="mode_condition">
+                <param name="selector" value="statistics"/>
+                <conditional name="statistics_condition">
+                    <param name="selector" value="sequence"/>
+                </conditional>
+            </conditional>
+            <output name="stats" value="test_03_stats.tabular" ftype="tabular"/>
+        </test>
+        <!-- Test 04: assembly statistics report on a FASTA input with an expected genome size supplied. -->
+        <test expect_num_outputs="1">
+            <param name="input_file" value="dataset_03.fasta"/>
+            <conditional name="mode_condition">
+                <param name="selector" value="statistics"/>
+                <conditional name="statistics_condition">
+                    <param name="selector" value="assembly"/>
+                    <param name="expected_genomesize" value="600000"/>
+                </conditional>
+            </conditional>
+            <output name="stats" value="test_04_stats.tabular" ftype="tabular"/>
+        </test>
+        <!-- Test 05: coordinates report with the AGP feature selected, run on a GFA input. -->
+        <test expect_num_outputs="1">
+            <param name="input_file" value="dataset_04.gfa"/>
+            <conditional name="mode_condition">
+                <param name="selector" value="statistics"/>
+                <conditional name="statistics_condition">
+                    <param name="selector" value="coordinates"/>
+                    <param name="out_coord" value="a"/>
+                </conditional>
+            </conditional>
+            <output name="stats" value="test_05_stats.tabular" ftype="tabular"/>
+        </test>
+        <!-- Test 06: manipulation mode used purely as a converter, GFA input written out as FASTA.gz. -->
+        <test expect_num_outputs="1">
+            <param name="input_file" value="dataset_04.gfa"/>
+            <conditional name="mode_condition">
+                <param name="selector" value="manipulation"/>
+                <conditional name="output_condition">
+                    <param name="out_format" value="fasta.gz"/>
+                </conditional>
+            </conditional>
+            <output name="output" value="test_06.fasta.gz" ftype="fasta.gz"/>
+        </test>
+        <!-- Test 07: assembly statistics report with the tabular option switched off and no expected genome size.
+             The output is still asserted to have the tabular datatype. -->
+        <test expect_num_outputs="1">
+            <param name="input_file" value="dataset_03.fasta"/>
+            <conditional name="mode_condition">
+                <param name="selector" value="statistics"/>
+                <conditional name="statistics_condition">
+                    <param name="selector" value="assembly"/>
+                </conditional>
+                <param name="tabular" value="false"/>
+            </conditional>
+            <output name="stats" value="test_07_stats.tabular" ftype="tabular"/>
+        </test>
+        <!-- Test 08: manipulation mode with ascending sort and homopolymer compression, FASTA.gz output. -->
+        <test expect_num_outputs="1">
+            <param name="input_file" value="dataset_01.fastq.gz"/>
+            <conditional name="mode_condition">
+                <param name="selector" value="manipulation"/>
+                <conditional name="output_condition">
+                    <param name="out_format" value="fasta.gz"/>
+                </conditional>
+                <param name="sort" value="ascending"/>
+                <param name="homopolymer_compress" value="true"/>
+            </conditional>
+            <output name="output" value="test_08.fasta.gz" ftype="fasta.gz"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+
+.. class:: infomark
+
+**Purpose**
+
+gfastats is a single fast and exhaustive tool for summary statistics and simultaneous genome assembly file manipulation. gfastats also allows seamless format conversion.
+
+
+.. class:: infomark
+
+**Metrics details**
+
+Typical fast* metrics include:
+
+- Scaffold, contig and gap size
+- Number of scaffolds, contigs and gaps
+- Total length of scaffolds, contigs and gaps
+- Scaffold, contig, gap N50 and statistics (full N*/NG* statistics with the --nstar-report flag)
+- Area under the curve (AuN/AuNG) values for scaffolds, contigs and gaps
+- Average scaffold, contig, gap size
+- Largest scaffold, contig and gap
+- Base composition and GC content
+- Soft-masked base counts (lower case bases)
+
+
+Typical gfa metrics include:
+
+- Number of nodes and edges
+- Average degree
+- Number of connected components, and length of the largest connected component
+- Number of dead ends
+- Number of disconnected components, and their total length
+
+
+.. class:: infomark
+
+**Assembly manipulation**
+
+gfastats allows extensive assembly manipulation at the sequence level. Manipulation is achieved using a set of instructions provided as an ordered list in a file to the option **swiss army knife**. See the `instruction wiki <https://github.com/vgl-hub/gfastats/tree/main/instructions>`_ for a full list of instructions.
+
+  ]]></help>
+    <expand macro="citations" />
+</tool>