comparison trycycler_reconcile_msa.xml @ 0:4ac81df20c74 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/trycycler commit 9d7c4277b0f96aacd466f2d497e08edcca3fa238"
author iuc
date Thu, 11 Feb 2021 19:26:05 +0000
parents
children b64696992ee9
comparison
equal deleted inserted replaced
-1:000000000000 0:4ac81df20c74
1 <tool id="trycycler_reconcile_msa" name="Trycycler reconcile/msa" version="@TOOL_VERSION@" profile="21.01">
2 <description>reconcile the contigs within each cluster and perform a multiple sequence alignment</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <expand macro="edam_ontology"/>
7 <expand macro="requirements"/>
8 <version_command>trycycler --version</version_command>
9 <command detect_errors="exit_code"><![CDATA[
## Stage the cluster FASTA under selected_cluster/, run trycycler reconcile and then
## trycycler msa on that cluster directory, and finally move the two result files
## onto the Galaxy output datasets.
10 #import re
## Replace every character outside word characters, '-' and '.' so the element
## identifier can be used as a file name.
11 #set $name = re.sub('[^\w\-\.]', '_', str($input_cluster.element_identifier))
## NOTE(review): str.strip(".fasta") removes any leading/trailing '.', 'f', 'a', 's', 't'
## characters, not the ".fasta" suffix. Left unchanged here; confirm what trycycler.py
## expects the cluster folder to be called before switching to a real suffix removal.
12 #set $folder = $name.strip(".fasta")
13 #set $fullpath = "/".join(["selected_cluster",$folder])
14 mkdir -p "${fullpath}/1_contigs" &&
15 ln -s "${input_cluster}" "selected_cluster/${name}" &&
16 python3 "$__tool_directory__"/trycycler.py reconcile "selected_cluster/${name}" &&
17 trycycler reconcile --cluster_dir "${fullpath}"
18 --reads '$reads'
19 #if $linear
20 --linear
21 #end if
22 --max_mash_dist $initial_ckeck.max_mash_dist
23 --max_length_diff $initial_ckeck.max_length_diff
24 --max_add_seq $circularisation.max_add_seq
25 --max_add_seq_percent $circularisation.max_add_seq_percent
26 --max_trim_seq $circularisation.max_trim_seq
27 --max_trim_seq_percent $circularisation.max_trim_seq_percent
28 --min_identity $final_check.min_identity
29 --max_indel_size $final_check.max_indel_size
30 --threads \${GALAXY_SLOTS:-2} &&
31 trycycler msa --cluster_dir "$fullpath"
32 --kmer $msa.kmer
33 --step $msa.step
34 --lookahead $msa.lookahead
35 --threads \${GALAXY_SLOTS:-2} &&
36 mv '${fullpath}/2_all_seqs.fasta' '$reconciled_cluster' &&
37 mv '${fullpath}/3_msa.fasta' '$aligned_cluster'
38 ]]></command>
39 <inputs>
<!-- Inputs: one cluster multi-FASTA and the long reads, the linear flag, and four
     sections of thresholds that are passed to trycycler reconcile and trycycler msa. -->
40 <param name="input_cluster" type="data"
41 format="fasta" label="Cluster multi-FASTA dataset"
42 help="The input should be an independent cluster generated by the *trycycler cluster* tool" />
43 <param name="reads" type="data"
44 format="fastq,fastq.gz" label="Long-read datasets"
45 help="Long reads (FASTQ format) used to generate the assemblies" />
46 <param argument="--linear" type="boolean"
47 truevalue="--linear" falsevalue=""
48 label="Input contigs are not circular"
49 help="Use this option if your input contigs are not circular. It will disable the circularisation-correction steps in Trycycler reconcile." />
<!-- NOTE(review): 'initial_ckeck' looks like a typo of 'initial_check'. The name is kept
     because the command template reads its parameters through this exact section name. -->
50 <section name='initial_ckeck'
51 title='Reconcile initial check options'
52 expanded='true'>
53 <param argument="--max_mash_dist" type="float" min="0" max="0.2"
54 value="0.02" label="Max Mash distance"
55 help="If any of the sequences have a pairwise Mash distance of more than this (default = 0.02), then the contigs will fail the initial check." />
56 <param argument="--max_length_diff" type="float" min="1" max="2"
57 value="1.1" label="Max relative length factor"
58 help="If any of the sequences have a pairwise relative length factor of more than this, then the contigs will fail the initial check. For example, if set to 1.1 (the default), then no contig can be more than 10% longer than any other." />
59 </section>
60 <section name="circularisation"
61 title="Reconcile circularization options"
62 expanded='true'>
63 <param argument="--max_add_seq" type="integer" min="0" max="4000"
64 value="1000" label="Max number of bp to add to circularize"
65 help="If they are set to 1000, then Trycycler will be willing to add up to 1000 bp to circularise it. Any contig which requires more than 1000 bp added to circularise will cause Trycycler reconcile to fail." />
66 <param argument="--max_add_seq_percent" type="integer" min="0" max="10"
67 value="5" label="Max percentage of a contig length to add to circularize"
68 help="If they are set to 5, then Trycycler will be willing to add up to 5% of a contig's length to circularise it. Any contig which requires more than 5% of its length added to circularise will cause Trycycler reconcile to fail." />
69 <param argument="--max_trim_seq" type="integer" min="0" max="100000"
70 value="50000" label="Max number of bp to trim to circularize"
71 help="If they are set to 50000, then Trycycler will be willing to remove up to 50000 bp to circularise it. Any contig which requires more than 50000 bp removed to circularise will cause Trycycler reconcile to fail." />
72 <param argument="--max_trim_seq_percent" type="integer" min="0" max="20"
73 value="10" label="Max percentage of a contig length to trim to circularize"
74 help="If they are set to 10, then Trycycler will be willing to remove up to 10% of a contig's length to circularise it. Any contig which requires more than 10% of its length removed to circularise will cause Trycycler reconcile to fail." />
75 </section>
76 <section name="final_check"
77 title="Reconcile final check"
78 expanded="true">
79 <param argument="--min_identity" type="integer" min="70" max="100"
80 value="98" label="Min global alignment percentage identity"
81 help="If any of the sequences have a pairwise global alignment percent identity of less than this (default = 98), then the contigs will fail the final check." />
82 <param argument="--max_indel_size" type="integer" min="100" max="400"
83 value="250" label="Max alignment indel size"
84 help="If any of the sequences have a pairwise alignment indel size of more than this (default = 250), then the contigs will fail the final check." />
85 </section>
86 <section name="msa"
87 title="Multiple sequence alignment (MSA) options"
88 expanded="true">
89 <param argument="--kmer" type="integer" min="20" max="45"
90 value="32" label="k-mer size"
91 help="The k-mer size used for sequence partitioning (default = 32)" />
92 <param argument="--step" type="integer" min="500" max="1500"
93 value="1000" label="Step size"
94 help="The step size used for sequence partitioning (default = 1000)." />
<!-- The help text now states the default this form actually submits (value="1000").
     NOTE(review): confirm whether 1000 or the previously documented 10000 is intended. -->
95 <param argument="--lookahead" type="integer" min="500" max="1500"
96 value="1000" label="Look-ahead margin"
97 help="The look-ahead margin used for sequence partitioning (default = 1000)." />
98 </section>
99
100 </inputs>
101 <outputs>
<!-- Two FASTA datasets: the reconciled sequences and the multiple sequence alignment.
     The command template moves 2_all_seqs.fasta and 3_msa.fasta onto these datasets itself.
     NOTE(review): from_work_dir names 'selected_cluster' and 'selected_clusters', neither
     of which is one of the files moved by the command; confirm these attributes are needed. -->
102 <data name="reconciled_cluster" format="fasta" label="Trycycler reconcile on ${input_cluster.element_identifier}" from_work_dir="selected_cluster"/>
103 <data name="aligned_cluster" format="fasta" label="Trycycler msa on ${input_cluster.element_identifier}" from_work_dir="selected_clusters"/>
104 </outputs>
105 <tests>
<!-- Five runs on the same cluster and reads: one with defaults, four overriding a
     different mix of reconcile and msa thresholds.
     NOTE(review): the tests address a section called 'initial_check', while the inputs
     declare it as 'initial_ckeck', so these overrides may not bind to the tool parameters.
     The value 0.3 used for max_mash_dist is also above the declared maximum of 0.2.
     Confirm before renaming, since the expected outputs may have to be regenerated. -->
106 <test>
107 <param name='input_cluster' value='cluster_01.fasta'/>
108 <param name="reads" value="reads.fastq.gz"/>
109 <output name='reconciled_cluster' file='reconciled_cluster_01.fasta'/>
110 <output name='aligned_cluster' file='aligned_cluster_01.fasta'/>
111 </test>
112 <test>
113 <param name='input_cluster' value='cluster_01.fasta'/>
114 <param name="reads" value="reads.fastq.gz"/>
115 <section name="initial_check">
116 <param name="max_mash_dist" value="0.3"/>
117 </section>
118 <section name="circularisation">
119 <param name="max_add_seq_percent" value="7"/>
120 <param name="max_trim_seq" value="47000"/>
121 </section>
122 <section name="final_check">
123 <param name="max_indel_size" value="230"/>
124 </section>
125 <section name="msa">
126 <param name="kmer" value="30"/>
127 </section>
128 <output name='reconciled_cluster' file='reconciled_cluster_02.fasta'/>
129 <output name='aligned_cluster' file='aligned_cluster_02.fasta'/>
130 </test>
131 <test>
132 <param name='input_cluster' value='cluster_01.fasta'/>
133 <param name="reads" value="reads.fastq.gz"/>
134 <section name="initial_check">
135 <param name="max_mash_dist" value="0.3"/>
136 </section>
137 <section name="circularisation">
138 <param name="max_add_seq" value="900"/>
139 <param name="max_trim_seq" value="45000"/>
140 </section>
141 <section name="final_check">
142 <param name="min_identity" value="97"/>
143 </section>
144 <section name="msa">
145 <param name="step" value="1100"/>
146 </section>
147 <output name='reconciled_cluster' file='reconciled_cluster_03.fasta'/>
148 <output name='aligned_cluster' file='aligned_cluster_03.fasta'/>
149 </test>
150 <test>
151 <param name='input_cluster' value='cluster_01.fasta'/>
152 <param name="reads" value="reads.fastq.gz"/>
153 <section name="initial_check">
154 <param name="max_length_diff" value="1.2"/>
155 </section>
156 <section name="circularisation">
157 <param name="max_add_seq" value="920"/>
158 <param name="max_trim_seq_percent" value="12"/>
159 </section>
160 <section name="final_check">
161 <param name="min_identity" value="95"/>
162 <param name="max_indel_size" value="230"/>
163 </section>
164 <section name="msa">
165 <param name="kmer" value="33"/>
166 </section>
167 <output name='reconciled_cluster' file='reconciled_cluster_04.fasta'/>
168 <output name='aligned_cluster' file='aligned_cluster_04.fasta'/>
169 </test>
170 <test>
171 <param name='input_cluster' value='cluster_01.fasta'/>
172 <param name="reads" value="reads.fastq.gz"/>
173 <section name="initial_check">
174 <param name="max_mash_dist" value="0.3"/>
175 </section>
176 <section name="circularisation">
<!-- The declared parameter is max_add_seq_percent; 'max_add_seq_percentage' did not exist. -->
177 <param name="max_add_seq_percent" value="8"/>
178 <param name="max_trim_seq" value="45300"/>
179 </section>
180 <section name="final_check">
181 <param name="min_identity" value="97"/>
182 </section>
183 <section name="msa">
184 <param name="step" value="1100"/>
185 <param name="lookahead" value="980"/>
186 </section>
187 <output name='reconciled_cluster' file='reconciled_cluster_05.fasta'/>
188 <output name='aligned_cluster' file='aligned_cluster_05.fasta'/>
189 </test>
190
191 </tests>
192 <help><![CDATA[
193
194 .. class:: infomark
195
196 **Purpose**
197
198 This tool integrates two Trycycler commands: **Trycycler reconcile** and **Trycycler msa**.
199
200 The **Trycycler reconcile** tool carries out four routines:
201
202 ::
203
204 * Perform an initial check to make sure the contigs look sufficiently similar to each other.
205 * Ensure that all contig sequences are on the same strand.
206 * If the replicon is circular it fixes any circularisation issues (i.e. add/remove sequence at each contig's start/end as necessary)
207 * Perform a final alignment check to make sure the normalised/circularised contigs are sufficiently similar to each other for the next step
208
209 After that, **Trycycler msa** takes the reconciled contig sequences and runs a multiple sequence alignment.
210
211 ----
212
213 .. class:: infomark
214
215 **Input**
216
217 This tool requires the clustered contigs generated by the **Trycycler cluster** tool, as well as the long-read dataset that was used with **Trycycler cluster**.
218
219
220 ----
221
222 .. class:: infomark
223
224 **Output**
225
226 **Trycycler reconcile/msa** generates two datasets:
227
228 ::
229
230 * A multi-FASTA file dataset for each contig ready for multiple sequence alignment.
231 * A FASTA-formatted multiple sequence alignment for each contig ready for use in generating a consensus.
232
233 ----
234
235 .. class:: infomark
236
237 **Manual intervention**
238
239 Trycycler reconcile may not complete successfully, in which case you will have to intervene and run it again. Often this simply means excluding whichever contig is causing the problem, usually due to significant length differences between contigs, or particularly bad pairwise identity or large insertion/deletion. It can be done by using the **Filter sequences by length** tool.
240
241 Throwing out troublesome contigs at this step is normal. The set of redundant input assemblies is prepared precisely so that losing one or two of them is not a problem.
242
243 You should aim to have around four to eight contigs left after running Trycycler reconcile. Less than that (two or three) will not provide as many variants for the next steps and may affect your consensus sequence quality. More than that (nine or more) is fine but probably won't be of any extra benefit. If you have too few contigs for your cluster, you might want to consider going back to the start of the pipeline and generating more input assemblies. If you have plenty of contigs, you can delete some of the worst ones and run Trycycler reconcile again. Use the final check to guide you: delete the contigs with the lowest identities and largest indels relative to the other contigs.
244
245 Unlike in previous steps of **Trycycler**, the msa step should be hands-off. I.e. no manual intervention is required – just run it and wait for it to finish.
246
247
248 ----
249
250 .. class:: infomark
251
252 @PIPELINE@
253 ]]></help>
254 <expand macro='citations'/>
255 </tool>