comparison hypo.xml @ 0:d7c48cf1bf50 draft default tip

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/hypo commit 2151cc2f0b32a242d8a18537f0bdfb92b907548a"
author iuc
date Mon, 15 Nov 2021 16:48:46 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:d7c48cf1bf50
1 <tool id="hypo" name="HyPo" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.05">
2 <description>super fast and accurate polisher for long read genome assemblies</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <expand macro='requirements' />
7 <expand macro='xrefs'/>
8 <command detect_errors="exit_code"><![CDATA[
9 #for $i, $fastq in enumerate($reads_short):
10 #if $fastq.ext.endswith(".gz")
11 #set $ext='.fastq.gz'
12 #else
13 #set $ext='.fastq'
14 #end if
15 ln -s '$fastq' 'read_$i$ext' &&
16 echo 'read_$i$ext' >> short_reads.txt &&
17 #end for
18 hypo
19 --reads-short @short_reads.txt
20 --draft '$draft'
21 --bam-sr '$bam_sr'
22 #if $bam_lr
23 --bam-lr '$bam_lr'
24 #end if
25 --coverage-short $coverage_short
26 --size-ref '$size_ref'
27 --kind-sr $kind_sr
28
29 #if $advanced_options.processing_size
30 --processing-size $advanced_options.processing_size
31 #end if
32 --match-sr $advanced_options.match_sr
33 --mismatch-sr $advanced_options.mismatch_sr
34 --gap-sr $advanced_options.gap_sr
35 --match-lr $advanced_options.match_lr
36 --mismatch-lr $advanced_options.mismatch_lr
37 --gap-lr $advanced_options.gap_lr
38 --ned-th $advanced_options.ned_th
39 --qual-map-th $advanced_options.qual_map_th
40 -o '$out_fasta'
41 -t \${GALAXY_SLOTS:-4}
42 #if $advanced_options.log == 'true'
43 > '$out_log'
44 #end if
45 ]]></command>
46 <inputs>
47 <param argument="--reads-short" type="data" format="fastq,fastq.gz" multiple="true" label="Illumina FASTQ files" />
48 <param argument="--draft" type="data" format="fasta,fasta.gz,fastq,fastq.gz" label="Draft genome assembly"/>
49 <param argument="--bam-sr" type="data" format="bam" label="BAM with illumina read alignments"
50 help="Input file name containing the alignments of short reads against the draft (must have CIGAR information)"/>
51 <param argument="--bam-lr" type="data" format="bam" optional="true" label="BAM with ONT reads aligned"
52 help="Input file name containing the alignments of long reads against the draft (must have CIGAR information).
53 Optional (only Short reads polishing will be performed if this argument is not given)"/>
54 <param argument="--coverage-short" type="integer" value="" min="0" label="Approximate mean coverage of the short reads"/>
55 <param argument="--size-ref" type="text" label="Approximate size of the genome" help="A number can be followed by units k/m/g; e.g. 10m, 2.3g.">
56 <sanitizer invalid_char="">
57 <valid initial="string.letters,string.digits">
58 <add value="." />
59 </valid>
60 </sanitizer>
61 <validator type="regex">[A-Za-z0-9\.]+</validator>
62 </param>
63 <param argument="--kind-sr" type="select" label="Type of short reads">
64 <option value="sr" selected="true">Corresponding to NGS reads like Illumina reads (sr)</option>
65 <option value="ccs">Corresponding to HiFi reads like PacBio CCS reads (ccs)</option>
66 </param>
67 <section name="advanced_options" title="Advanced options">
68 <param argument="--match-sr" type="integer" value="5" label="Score for matching bases for short reads" help="Default value is 5"/>
69 <param argument="--mismatch-sr" type="integer" value="-4" label="Score for mismatching bases for short reads" help="Default value is -4"/>
70 <param argument="--gap-sr" type="integer" value="-8" max="0" label="Gap penalty for short reads" help="Default value is -8 (must be negative)"/>
71 <param argument="--match-lr" type="integer" min="0" value="3" label="Score for matching bases for long reads" help="Default value is 3"/>
72 <param argument="--mismatch-lr" type="integer" value="-5" label="Score for mismatching bases for long reads" help="Default value is -5"/>
73 <param argument="--gap-lr" type="integer" value="-4" max="0" label="Gap penalty for long reads" help="Default value is -4 (must be negative)"/>
74 <param argument="--ned-th" type="integer" value="20" label="Threshold for NED"
75 help="Threshold for Normalised Edit Distance of long arms allowed in a window (in percentage). Higher number means more arms allowed which
76 may slow down the execution. Default value is 20" />
77 <param argument="--qual-map-th" type="integer" value="2" label="Threshold for mapping quality of reads"
78 help="The reads with mapping quality below this threshold will not be taken into consideration. Default value is 2" />
79 <param argument="--processing-size" type="integer" optional="true" label="Number of contigs to be processed in one batch" help="Lower value means less memory usage but slower speed. By default, all the contigs in the draft"/>
80 <param name="log" type="boolean" truevalue="true" falsevalue="false" label="Generate log file"/>
81 </section>
82 </inputs>
83 <outputs>
84 <data name="out_fasta" format="fasta" label="${tool.name} on ${on_string}: polished assembly"/>
85 <data name="out_log" format="txt" label="${tool.name} on ${on_string}: log file">
86 <filter>advanced_options['log']</filter>
87 </data>
88 </outputs>
89 <tests>
90 <!--Test 01-->
91 <test expect_num_outputs="1">
92 <param name="draft" value="draft_genome.fasta" ftype="fasta"/>
93 <param name="reads_short" value="Illumina_01.fastq.gz,Illumina_02.fastq.gz" ftype="fastq.gz"/>
94 <param name="size_ref" value="10k" />
95 <param name="coverage_short" value="35"/>
96 <param name="bam_sr" value="short_reads.bam" ftype="bam" />
97 <output name="out_fasta" file="test_01.fasta" ftype="fasta" />
98 </test>
99 <!--Test 02: test long-reads BAM-->
100 <test expect_num_outputs="1">
101 <param name="draft" value="draft_genome.fasta" ftype="fasta"/>
102 <param name="reads_short" value="Illumina_01.fastq.gz,Illumina_02.fastq.gz" ftype="fastq.gz"/>
103 <param name="size_ref" value="10k" />
104 <param name="coverage_short" value="35"/>
105 <param name="bam_sr" value="short_reads.bam" ftype="bam" />
106 <param name="bam_lr" value="long_reads.bam" ftype="bam" />
107 <output name="out_fasta" file="test_02.fasta" ftype="fasta" />
108 </test>
109 <!--Test 03: test ccs option in type of short reads-->
110 <test expect_num_outputs="1">
111 <param name="draft" value="draft_genome.fasta" ftype="fasta"/>
112 <param name="reads_short" value="Illumina_01.fastq.gz,Illumina_02.fastq.gz" ftype="fastq.gz"/>
113 <param name="size_ref" value="10k" />
114 <param name="coverage_short" value="35"/>
115 <param name="bam_sr" value="short_reads.bam" ftype="bam" />
116 <param name="bam_lr" value="long_reads.bam" ftype="bam" />
117 <param name="kind_sr" value="ccs"/>
118 <output name="out_fasta" file="test_03.fasta" ftype="fasta" />
119 </test>
120 <!--Test 04: test processing-size parameter -->
121 <test expect_num_outputs="1">
122 <param name="draft" value="draft_genome.fasta" ftype="fasta"/>
123 <param name="reads_short" value="Illumina_01.fastq.gz,Illumina_02.fastq.gz" ftype="fastq.gz"/>
124 <param name="size_ref" value="10k" />
125 <param name="coverage_short" value="35"/>
126 <param name="bam_sr" value="short_reads.bam" ftype="bam" />
127 <param name="bam_lr" value="long_reads.bam" ftype="bam" />
128 <section name="advanced_options">
129 <param name="processing_size" value="2"/>
130 </section>
131 <output name="out_fasta" file="test_04.fasta" ftype="fasta" />
132 </test>
133 <!--Test 05: test log option (set inside the advanced_options section)-->
134 <test expect_num_outputs="2">
135 <param name="draft" value="draft_genome.fasta" ftype="fasta"/>
136 <param name="reads_short" value="Illumina_01.fastq.gz,Illumina_02.fastq.gz" ftype="fastq.gz"/>
137 <param name="size_ref" value="10k" />
138 <param name="coverage_short" value="35"/>
139 <param name="bam_sr" value="short_reads.bam" ftype="bam" />
140 <section name="advanced_options"><param name="log" value="true"/></section>
141 <output name="out_fasta" file="test_05.fasta" ftype="fasta" />
142 <output name="out_log" ftype="txt">
143 <assert_contents>
144 <has_text text="No. of k-mers below min. threshold"/>
145 <has_text text="Info: Value of K chosen for the given genome size"/>
146 </assert_contents>
147 </output>
148 </test>
149 <!--Test 06: advanced all options-->
150 <test expect_num_outputs="1">
151 <param name="draft" value="draft_genome.fasta" ftype="fasta"/>
152 <param name="reads_short" value="Illumina_01.fastq.gz,Illumina_02.fastq.gz" ftype="fastq.gz"/>
153 <param name="size_ref" value="10k" />
154 <param name="coverage_short" value="35"/>
155 <param name="bam_sr" value="short_reads.bam" ftype="bam" />
156 <param name="bam_lr" value="long_reads.bam" ftype="bam" />
157 <section name="advanced_options">
158 <param name="match_sr" value="4"/>
159 <param name="mismatch_sr" value="-2"/>
160 <param name="gap_sr" value="-10"/>
161 <param name="match_lr" value="3"/>
162 <param name="mismatch_lr" value="-7"/>
163 <param name="gap_lr" value="-15"/>
164 <param name="ned_th" value="10"/>
165 <param name="qual_map_th" value="4"/>
166 </section>
167 <output name="out_fasta" file="test_06.fasta" ftype="fasta" />
168 </test>
169 <!--Test 07: test fastq files-->
170 <test expect_num_outputs="1">
171 <param name="draft" value="draft_genome.fasta" ftype="fasta"/>
172 <param name="reads_short" value="Illumina_01.fastq,Illumina_02.fastq" ftype="fastq"/>
173 <param name="size_ref" value="10k" />
174 <param name="coverage_short" value="35"/>
175 <param name="bam_sr" value="short_reads.bam" ftype="bam" />
176 <output name="out_fasta" file="test_07.fasta" ftype="fasta" />
177 </test>
178 </tests>
179 <help><![CDATA[
180
181 .. class:: infomark
182
183 **Purpose**
184
185 HyPo - a Hybrid Polisher - utilizes short as well as long reads within a single run to polish a long reads assembly of small and large genomes.
186 It exploits unique genomic kmers to selectively polish segments of contigs using partial order alignment of selective read-segments.
187 As demonstrated on human genome assemblies, Hypo generates a significantly more accurate polished assembly in about one-third of the time and with
188 about half the memory requirements in comparison to contemporary widely used polishers like Racon.
189
190 Please note that "short reads" doesn't necessarily have to be NGS short reads; HiFi genomic reads (e.g. CCS) like those generated from PacBio SequelII
191 could also be used instead. The requirement is that those reads should be highly accurate (>98% accuracy).
192
193 -------------------
194
195 .. class:: infomark
196
197 **Input files**
198
199 Hypo requires the following as input:
200 * Short reads/HiFi reads (in FASTA/FASTQ format; can be compressed)
201 * Draft contigs (in FASTA/FASTQ format; can be compressed)
202 * Alignments between short reads (or HiFi reads) and the draft (should contain CIGAR). If long reads are also to be used for polishing, then alignments between long reads and the draft.
203 * Expected mean coverage of short reads (or HiFi reads) and approximate size of the genome.
204
205 In what follows, short reads can be replaced with HiFi reads.
206
207 -------------------
208
209 .. class:: infomark
210
211 **How it works**
212
213 Broadly, we (conceptually) divide a draft (uncorrected) contig into two types of regions (segments): strong and weak.
214
215 Strong regions are those which have strong evidence (support) of their correctness and
216 thus do not need polishing. Weak regions, on the other hand, will be polished using POA. Each weak region will
217 be polished using either short reads or long reads; short reads taking precedence over long reads. To identify
218 strong regions, we make use of solid kmers (expected unique genomic kmers). Strong regions also play a role in
219 selecting the read-segments to polish their neighbouring weak regions. Furthermore, our approach takes into account
220 that the long reads and thus the assemblies generated from them are prone to homopolymer errors as mentioned in the beginning.
221
222 ]]></help>
223 <expand macro="citations" />
224 </tool>