comparison trycycler_cluster.xml @ 0:c767a45616d0 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/trycycler commit 9d7c4277b0f96aacd466f2d497e08edcca3fa238"
author iuc
date Thu, 11 Feb 2021 19:26:49 +0000
parents
children 189e837009c9
comparison
equal deleted inserted replaced
-1:000000000000 0:c767a45616d0
1 <tool id='trycycler_cluster' name='Trycycler cluster' version='@TOOL_VERSION@' profile='21.01'>
2 <description>cluster the contigs of your input assemblies into per-replicon groups</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <expand macro='edam_ontology'/>
7 <expand macro='requirements'/>
8 <version_command>trycycler --version</version_command>
<!-- Command template (Cheetah + shell): stages the input assemblies, runs `trycycler cluster`, moves the distance matrix and the tree to their Galaxy outputs, then runs the bundled trycycler.py helper. -->
9 <command detect_errors='exit_code'><![CDATA[
10 #import re
## Create the Trycycler output directory and a staging directory for the input assemblies.
11 mkdir -p initial_clusters assemblies &&
## Symlink every input assembly into the staging directory under a sanitised name.
12 #for $input_file in $assemblies
## Every character that is not a word character, hyphen or dot is replaced by an underscore.
## NOTE(review): two inputs whose identifiers sanitise to the same name would make `ln -s` fail - confirm identifiers are unique.
13 #set $name = re.sub('[^\w\-\.]', '_', str($input_file.element_identifier))
14 ln -s '$input_file' 'assemblies/$name' &&
15 #end for
## The shell glob hands all staged assemblies to Trycycler; the thread count comes from GALAXY_SLOTS and falls back to 2 when that variable is unset or empty.
16 trycycler cluster --assemblies assemblies/*
17 --reads '$reads'
18 --min_contig_len $min_contig_len
19 --min_contig_depth $min_contig_depth
20 --distance $distance
21 --threads \${GALAXY_SLOTS:-2}
22 --out_dir 'initial_clusters' &&
## Move the distance matrix and the tree to the paths Galaxy assigned to the two plain outputs.
23 mv initial_clusters/contigs.phylip '$output_phylip' &&
24 mv initial_clusters/contigs.newick '$output_newick' &&
## NOTE(review): trycycler.py is not visible here; presumably it rearranges the per-cluster contigs inside initial_clusters so they can be discovered as collection elements - confirm against the script.
25 python3 '$__tool_directory__/trycycler.py' 'cluster' 'initial_clusters'
26 ]]></command>
27 <inputs>
<!-- Two datasets (the assemblies to cluster and the long reads) followed by three numeric thresholds. The thresholds are declared with `argument`, and the command template reads them as $min_contig_len, $min_contig_depth and $distance. -->
28 <param name='assemblies' type='data'
29 format='fasta,fasta.gz' multiple='true' label='Assembled sequences datasets'
30 help='Input assemblies whose contigs will be clustered (multiple FASTA files)' />
31 <param name='reads' type='data'
32 format='fastq,fastq.gz' label='Long-read datasets'
33 help='Long reads (FASTQ format) used to generate the assemblies' />
34 <param argument='--min_contig_len' type='integer'
35 min='100' max='5000' value='1000' label='Minimum contig length'
36 help='Contigs shorter than this are thrown out on the assumption that they are either incomplete or spurious. The default value is 1000, as plasmids smaller than that are very rare.' />
37 <param argument='--min_contig_depth' type='float'
38 min='0.01' max='1' value='0.1' label='Minimum contig depth'
39 help='This controls how Trycycler filters out contigs with a low read depth. It is a multiple of the mean read depth for the assembly. For example, if an assembly has a mean depth of 90x and this setting is 0.1 (the default), then any contig with depth lower than 9x will be removed.'/>
40 <param argument='--distance' type='float'
41 min='0.001' max='0.1' value='0.01' label='Mash distance threshold'
42 help='This is the Mash distance threshold used when defining clusters, and the default threshold is 0.01. Smaller thresholds (e.g. 0.005) can result in a larger number of tighter clusters. Larger thresholds (e.g. 0.02) can result in a smaller number of looser clusters.' />
43 </inputs>
44 <outputs>
<!-- The two plain datasets receive the files that the command template moves out of initial_clusters (contigs.phylip and contigs.newick). -->
45 <data name='output_phylip' format='phylip' label='${tool.name} on ${on_string}: phylip'/>
46 <data name='output_newick' format='newick' label='${tool.name} on ${on_string}: newick'/>
<!-- The list collection is filled by discovering files left in the initial_clusters working directory, using Galaxy's built-in __designation_and_ext__ pattern. -->
47 <collection name='initial_clusters' type='list' label='${tool.name} on ${on_string}'>
48 <discover_datasets pattern='__designation_and_ext__' format='fasta' directory='initial_clusters'/>
49 </collection>
50 </outputs>
51 <tests>
<!-- All four tests use the same four gzipped assemblies and the same read set. Each expects two clusters in the collection and compares only cluster_01, allowing up to 20 differing lines. -->
<!-- Test 1: default thresholds. -->
52 <test>
53 <param name='assemblies' value='assembly_00.fasta.gz,assembly_01.fasta.gz,assembly_02.fasta.gz,assembly_03.fasta.gz'/>
54 <param name='reads' value='reads.fastq.gz'/>
55 <output name='output_phylip' file='contigs_01.phylip'/>
56 <output name='output_newick' file='contigs_01.newick'/>
57 <output_collection name='initial_clusters' type='list' count='2'>
58 <element name='cluster_01' file='cluster_01.fasta' ftype='fasta' lines_diff='20'/>
59 </output_collection>
60 </test>
<!-- Test 2: min_contig_len 900, min_contig_depth 0.05, distance 0.05. -->
61 <test>
62 <param name='assemblies' value='assembly_00.fasta.gz,assembly_01.fasta.gz,assembly_02.fasta.gz,assembly_03.fasta.gz'/>
63 <param name='reads' value='reads.fastq.gz'/>
64 <param name='min_contig_len' value='900'/>
65 <param name='min_contig_depth' value='0.05'/>
66 <param name='distance' value='0.05'/>
67 <output name='output_phylip' file='contigs_02.phylip'/>
68 <output name='output_newick' file='contigs_02.newick'/>
69 <output_collection name='initial_clusters' type='list' count='2'>
70 <element name='cluster_01' file='cluster_02.fasta' ftype='fasta' lines_diff='20'/>
71 </output_collection>
72 </test>
<!-- Test 3: min_contig_len 850, min_contig_depth 0.01 (the declared minimum), distance 0.09. -->
73 <test>
74 <param name='assemblies' value='assembly_00.fasta.gz,assembly_01.fasta.gz,assembly_02.fasta.gz,assembly_03.fasta.gz'/>
75 <param name='reads' value='reads.fastq.gz'/>
76 <param name='min_contig_len' value='850'/>
77 <param name='min_contig_depth' value='0.01'/>
78 <param name='distance' value='0.09'/>
79 <output name='output_phylip' file='contigs_03.phylip'/>
80 <output name='output_newick' file='contigs_03.newick'/>
81 <output_collection name='initial_clusters' type='list' count='2'>
82 <element name='cluster_01' file='cluster_03.fasta' ftype='fasta' lines_diff='20'/>
83 </output_collection>
84 </test>
<!-- Test 4: min_contig_len 1100 (above the default), min_contig_depth 0.02, distance 0.07. -->
85 <test>
86 <param name='assemblies' value='assembly_00.fasta.gz,assembly_01.fasta.gz,assembly_02.fasta.gz,assembly_03.fasta.gz'/>
87 <param name='reads' value='reads.fastq.gz'/>
88 <param name='min_contig_len' value='1100'/>
89 <param name='min_contig_depth' value='0.02'/>
90 <param name='distance' value='0.07'/>
91 <output name='output_phylip' file='contigs_04.phylip'/>
92 <output name='output_newick' file='contigs_04.newick'/>
93 <output_collection name='initial_clusters' type='list' count='2'>
94 <element name='cluster_01' file='cluster_04.fasta' ftype='fasta' lines_diff='20'/>
95 </output_collection>
96 </test>
97 </tests>
98 <help><![CDATA[
99 .. class:: infomark
100
101 **Purpose**
102
103 The *Trycycler cluster* tool carries out complete-linkage clustering of all contig sequences based on their `Mash distance <https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0997-x/>`_, a fast sequence distance estimator that uses the MinHash algorithm. It also serves to exclude any spurious, incomplete or badly misassembled contigs.
104
105 If your contigs do not form clear clusters, that indicates that the input assemblies are inconsistent and unreliable. If you find yourself in this situation (struggling to identify which clusters are good and which are bad), then you probably need to get better long-read data (longer and/or deeper) and try again.
106
107 ----
108
109 .. class:: infomark
110
111 **Input**
112
113 This tool requires two different inputs: a set of multiple separate assemblies and a long-read set.
114
115 ----
116
117 .. class:: infomark
118
119 **Output**
120
121 **Trycycler cluster** generates three outputs:
122
123 \* A matrix of the Mash distances between contigs (phylip output).
124
125 \* A `FastME tree <https://academic.oup.com/mbe/article/32/10/2798/1212138>`_ of the contigs built from the distance matrix. It can be visualised in the **newick display** tool.
126
127 \* A collection list which contains the clusters.
128
129 \
130
131 **Choose your clusters**
132
133 After running **Trycycler cluster**, you need to extract the cluster datasets from the collection by using the **Extract Element from a collection based on a name** tool. It is up to you to choose which of the clusters are good and which are bad. This can be somewhat subjective, so there is not an exact procedure for you to follow.
134
135 Generally speaking, a good cluster contains many contigs (ideally one from each assembly) which are all very close to each other and have realistic read depths. A bad cluster contains a small number of contigs (maybe just one) which might have low read depths. The tree can be useful in making these decisions, though interpret it with a grain of salt, as the contig sequences are not necessarily related in a tree-like manner.
136
137 If you have prior knowledge about what your genome should look like, that information can be quite useful in deciding which clusters are good. E.g. if you happened to know that your genome contains a 150 kbp plasmid, then you can expect one of your good clusters to have contigs of about that size.
138
139 You might also decide at this point that the default value for --distance (0.01) was not quite right. E.g. if your tree contains two very close clusters that you think should actually be one cluster, you can run Trycycler cluster again with a larger distance threshold. Another thing to keep in mind: contamination can happen. This occurs most often as cross-barcode contamination, where a contig in one assembly actually belongs to a different genome from the same multiplexed sequencing run.
140
141
142 ----
143
144 .. class:: infomark
145
146 @PIPELINE@
147 ]]></help>
148 <expand macro='citations'/>
149 </tool>