Mercurial > repos > iuc > hypo
comparison hypo.xml @ 0:d7c48cf1bf50 draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/hypo commit 2151cc2f0b32a242d8a18537f0bdfb92b907548a"
author | iuc |
---|---|
date | Mon, 15 Nov 2021 16:48:46 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d7c48cf1bf50 |
---|---|
1 <tool id="hypo" name="HyPo" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="20.05"> | |
2 <description>super fast and accurate polisher for long read genome assemblies</description> | |
3 <macros> | |
4 <import>macros.xml</import> | |
5 </macros> | |
6 <expand macro='requirements' /> | |
7 <expand macro='xrefs'/> | |
8 <command detect_errors="exit_code"><![CDATA[ | |
9 #for $i, $fastq in enumerate($reads_short): | |
10 #if $fastq.ext.endswith(".gz") | |
11 #set $ext='.fastq.gz' | |
12 #else | |
13 #set $ext='.fastq' | |
14 #end if | |
15 ln -s '$fastq' 'read_$i$ext' && | |
16 echo 'read_$i$ext' >> short_reads.txt && | |
17 #end for | |
18 hypo | |
19 --reads-short @short_reads.txt | |
20 --draft '$draft' | |
21 --bam-sr '$bam_sr' | |
22 #if $bam_lr | |
23 --bam-lr '$bam_lr' | |
24 #end if | |
25 --coverage-short $coverage_short | |
26 --size-ref '$size_ref' | |
27 --kind-sr $kind_sr | |
28 | |
29 #if $advanced_options.processing_size | |
30 --processing-size $advanced_options.processing_size | |
31 #end if | |
32 --match-sr $advanced_options.match_sr | |
33 --mismatch-sr $advanced_options.mismatch_sr | |
34 --gap-sr $advanced_options.gap_sr | |
35 --match-lr $advanced_options.match_lr | |
36 --mismatch-lr $advanced_options.mismatch_lr | |
37 --gap-lr $advanced_options.gap_lr | |
38 --ned-th $advanced_options.ned_th | |
39 --qual-map-th $advanced_options.qual_map_th | |
40 -o '$out_fasta' | |
41 -t \${GALAXY_SLOTS:-4} | |
42 #if $advanced_options.log == 'true' | |
43 > '$out_log' | |
44 #end if | |
45 ]]></command> | |
46 <inputs> | |
47 <param argument="--reads-short" type="data" format="fastq,fastq.gz" multiple="true" label="Illumina FASTQ files" /> | |
48 <param argument="--draft" type="data" format="fasta,fasta.gz,fastq,fastq.gz" label="Draft genome assembly"/> | |
49 <param argument="--bam-sr" type="data" format="bam" label="BAM with illumina read alignments" | |
50 help="Input file name containing the alignments of short reads against the draft (must have CIGAR information)"/> | |
51 <param argument="--bam-lr" type="data" format="bam" optional="true" label="BAM with ONT reads aligned" | |
52 help="Input file name containing the alignments of long reads against the draft (must have CIGAR information). | |
53 Optional (only Short reads polishing will be performed if this argument is not given)"/> | |
54 <param argument="--coverage-short" type="integer" value="" min="0" label="Approximate mean coverage of the short reads"/> | |
55 <param argument="--size-ref" type="text" label="Approximate size of the genome" help="A number can be followed by units k/m/g; e.g. 10m, 2.3g."> | |
56 <sanitizer invalid_char=""> | |
57 <valid initial="string.letters,string.digits"> | |
58 <add value="." /> | |
59 </valid> | |
60 </sanitizer> | |
61 <validator type="regex">[A-Za-z0-9\.]+</validator> | |
62 </param> | |
63 <param argument="--kind-sr" type="select" label="Type of short reads"> | |
64 <option value="sr" selected="true">Corresponding to NGS reads like Illumina reads (sr)</option> | |
65 <option value="ccs">Corresponding to HiFi reads like PacBio CCS reads (ccs)</option> | |
66 </param> | |
67 <section name="advanced_options" title="Advanced options"> | |
68 <param argument="--match-sr" type="integer" value="5" label="Score for matching bases for short reads" help="Default value is 5"/> | |
69 <param argument="--mismatch-sr" type="integer" value="-4" label="Score for mismatching bases for short reads" help="Default value is -4"/> | |
70 <param argument="--gap-sr" type="integer" value="-8" max="0" label="Gap penalty for short reads" help="Default value is -8 (must be negative)"/> | |
71 <param argument="--match-lr" type="integer" min="0" value="3" label="Score for matching bases for long reads" help="Default value is 3"/> | |
72 <param argument="--mismatch-lr" type="integer" value="-5" label="Score for mismatching bases for long reads" help="Default value is -5"/> | |
73 <param argument="--gap-lr" type="integer" value="-4" max="0" label="Gap penalty for long reads" help="Default value is -4 (must be negative)"/> | |
74 <param argument="--ned-th" type="integer" value="20" label="Threshold for NED" | |
75 help="Threshold for Normalised Edit Distance of long arms allowed in a window (in percentage). Higher number means more arms allowed which | |
76 may slow down the execution. Default value is 20" /> | |
77 <param argument="--qual-map-th" type="integer" value="2" label="Threshold for mapping quality of reads" | |
78 help="The reads with mapping quality below this threshold will not be taken into consideration. Default value is 2" /> | |
79 <param argument="--processing-size" type="integer" optional="true" label="Number of contigs to be processed in one batch" help="Lower value means less memory usage but slower speed. By default, all the contigs in the draft are processed in a single batch"/> | |
80 <param name="log" type="boolean" truevalue="true" falsevalue="false" label="Generate log file"/> | |
81 </section> | |
82 </inputs> | |
83 <outputs> | |
84 <data name="out_fasta" format="fasta" label="${tool.name} on ${on_string}: polished assembly"/> | |
85 <data name="out_log" format="txt" label="${tool.name} on ${on_string}: log file"> | |
86 <filter>advanced_options['log']</filter> | |
87 </data> | |
88 </outputs> | |
89 <tests> | |
90 <!--Test 01--> | |
91 <test expect_num_outputs="1"> | |
92 <param name="draft" value="draft_genome.fasta" ftype="fasta"/> | |
93 <param name="reads_short" value="Illumina_01.fastq.gz,Illumina_02.fastq.gz" ftype="fastq.gz"/> | |
94 <param name="size_ref" value="10k" /> | |
95 <param name="coverage_short" value="35"/> | |
96 <param name="bam_sr" value="short_reads.bam" ftype="bam" /> | |
97 <output name="out_fasta" file="test_01.fasta" ftype="fasta" /> | |
98 </test> | |
99 <!--Test 02: test long-reads BAM--> | |
100 <test expect_num_outputs="1"> | |
101 <param name="draft" value="draft_genome.fasta" ftype="fasta"/> | |
102 <param name="reads_short" value="Illumina_01.fastq.gz,Illumina_02.fastq.gz" ftype="fastq.gz"/> | |
103 <param name="size_ref" value="10k" /> | |
104 <param name="coverage_short" value="35"/> | |
105 <param name="bam_sr" value="short_reads.bam" ftype="bam" /> | |
106 <param name="bam_lr" value="long_reads.bam" ftype="bam" /> | |
107 <output name="out_fasta" file="test_02.fasta" ftype="fasta" /> | |
108 </test> | |
109 <!--Test 03: test ccs option in type of short reads (param name is kind_sr, derived from argument="--kind-sr"; expected output must come from a ccs run)--> | |
110 <test expect_num_outputs="1"> | |
111 <param name="draft" value="draft_genome.fasta" ftype="fasta"/> | |
112 <param name="reads_short" value="Illumina_01.fastq.gz,Illumina_02.fastq.gz" ftype="fastq.gz"/> | |
113 <param name="size_ref" value="10k" /> | |
114 <param name="coverage_short" value="35"/> | |
115 <param name="bam_sr" value="short_reads.bam" ftype="bam" /> | |
116 <param name="bam_lr" value="long_reads.bam" ftype="bam" /> | |
117 <param name="kind_sr" value="ccs"/> | |
118 <output name="out_fasta" file="test_03.fasta" ftype="fasta" /> | |
119 </test> | |
120 <!--Test 04: test processing-size parameter (param name is processing_size, derived from argument="--processing-size")--> | |
121 <test expect_num_outputs="1"> | |
122 <param name="draft" value="draft_genome.fasta" ftype="fasta"/> | |
123 <param name="reads_short" value="Illumina_01.fastq.gz,Illumina_02.fastq.gz" ftype="fastq.gz"/> | |
124 <param name="size_ref" value="10k" /> | |
125 <param name="coverage_short" value="35"/> | |
126 <param name="bam_sr" value="short_reads.bam" ftype="bam" /> | |
127 <param name="bam_lr" value="long_reads.bam" ftype="bam" /> | |
128 <section name="advanced_options"> | |
129 <param name="processing_size" value="2"/> | |
130 </section> | |
131 <output name="out_fasta" file="test_04.fasta" ftype="fasta" /> | |
132 </test> | |
133 <!--Test 05: test log option--> | |
134 <test expect_num_outputs="2"> | |
135 <param name="draft" value="draft_genome.fasta" ftype="fasta"/> | |
136 <param name="reads_short" value="Illumina_01.fastq.gz,Illumina_02.fastq.gz" ftype="fastq.gz"/> | |
137 <param name="size_ref" value="10k" /> | |
138 <param name="coverage_short" value="35"/> | |
139 <param name="bam_sr" value="short_reads.bam" ftype="bam" /> | |
140 <param name="log" value="true"/> | |
141 <output name="out_fasta" file="test_05.fasta" ftype="fasta" /> | |
142 <output name="out_log" ftype="txt"> | |
143 <assert_contents> | |
144 <has_text text="No. of k-mers below min. threshold"/> | |
145 <has_text text="Info: Value of K chosen for the given genome size"/> | |
146 </assert_contents> | |
147 </output> | |
148 </test> | |
149 <!--Test 06: advanced all options--> | |
150 <test expect_num_outputs="1"> | |
151 <param name="draft" value="draft_genome.fasta" ftype="fasta"/> | |
152 <param name="reads_short" value="Illumina_01.fastq.gz,Illumina_02.fastq.gz" ftype="fastq.gz"/> | |
153 <param name="size_ref" value="10k" /> | |
154 <param name="coverage_short" value="35"/> | |
155 <param name="bam_sr" value="short_reads.bam" ftype="bam" /> | |
156 <param name="bam_lr" value="long_reads.bam" ftype="bam" /> | |
157 <section name="advanced_options"> | |
158 <param name="match_sr" value="4"/> | |
159 <param name="mismatch_sr" value="-2"/> | |
160 <param name="gap_sr" value="-10"/> | |
161 <param name="match_lr" value="3"/> | |
162 <param name="mismatch_lr" value="-7"/> | |
163 <param name="gap_lr" value="-15"/> | |
164 <param name="ned_th" value="10"/> | |
165 <param name="qual_map_th" value="4"/> | |
166 </section> | |
167 <output name="out_fasta" file="test_06.fasta" ftype="fasta" /> | |
168 </test> | |
169 <!--Test 07: test fastq files--> | |
170 <test expect_num_outputs="1"> | |
171 <param name="draft" value="draft_genome.fasta" ftype="fasta"/> | |
172 <param name="reads_short" value="Illumina_01.fastq,Illumina_02.fastq" ftype="fastq"/> | |
173 <param name="size_ref" value="10k" /> | |
174 <param name="coverage_short" value="35"/> | |
175 <param name="bam_sr" value="short_reads.bam" ftype="bam" /> | |
176 <output name="out_fasta" file="test_07.fasta" ftype="fasta" /> | |
177 </test> | |
178 </tests> | |
179 <help><![CDATA[ | |
180 | |
181 .. class:: infomark | |
182 | |
183 **Purpose** | |
184 | |
185 HyPo - a Hybrid Polisher - utilizes short as well as long reads within a single run to polish a long reads assembly of small and large genomes. | |
186 It exploits unique genomic kmers to selectively polish segments of contigs using partial order alignment of selective read-segments. | |
187 As demonstrated on human genome assemblies, Hypo generates significantly more accurate polished assembly in about one-third time with | |
188 about half the memory requirements in comparison to contemporary widely used polishers like Racon. | |
189 | |
190 Please note that "short reads" doesn't necessarily have to be NGS short reads; HiFi genomic reads (e.g. CCS) like those generated from PacBio SequelII | |
191 could also be used instead. The requirement is that those reads should be highly accurate (>98% accuracy). | |
192 | |
193 ------------------- | |
194 | |
195 .. class:: infomark | |
196 | |
197 **Input files** | |
198 | |
199 Hypo requires the following as input: | |
200 * Short reads/HiFi reads (in FASTA/FASTQ format; can be compressed) | |
201 * Draft contigs (in FASTA/FASTQ format; can be compressed) | |
202 * Alignments between short reads (or HiFi reads) and the draft (should contain CIGAR). If long reads are also to be used for polishing, then alignments between long reads and the draft are required as well. | |
203 * Expected mean coverage of short reads (or HiFi reads) and approximate size of the genome. | |
204 | |
205 In what follows, short reads can be replaced with HiFi reads. | |
206 | |
207 ------------------- | |
208 | |
209 .. class:: infomark | |
210 | |
211 **How it works** | |
212 | |
213 Broadly, we (conceptually) divide a draft (uncorrected) contig into two types of regions (segments): strong and weak. | |
214 | |
215 Strong regions are those which have strong evidence (support) of their correctness and | |
216 thus do not need polishing. Weak regions, on the other hand, will be polished using POA. Each weak region will | |
217 be polished using either short reads or long reads; short reads taking precedence over long reads. To identify | |
218 strong regions, we make use of solid kmers (expected unique genomic kmers). Strong regions also play a role in | |
219 selecting the read-segments to polish their neighbouring weak regions. Furthermore, our approach takes into account | |
220 that the long reads and thus the assemblies generated from them are prone to homopolymer errors as mentioned in the beginning. | |
221 | |
222 ]]></help> | |
223 <expand macro="citations" /> | |
224 </tool> |