view trycycler_cluster.xml @ 1:189e837009c9 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/trycycler commit e88166111fa3b6c57c870ea4cff6e012a1b1a912"
author iuc
date Sat, 13 Feb 2021 17:32:01 +0000
parents c767a45616d0
children
line wrap: on
line source

<tool id='trycycler_cluster' name='Trycycler cluster' version='@TOOL_VERSION@' profile='20.01'>
    <description>cluster the contigs of your input assemblies into per-replicon groups</description>
    <!-- Shared definitions come from macros.xml. The macros expanded below (edam_ontology, requirements)
         and the @TOOL_VERSION@ token used in the tool version are presumably defined there:
         confirm against that file. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro='edam_ontology' />
    <expand macro='requirements' />
    <!-- Asks the trycycler executable to print its own version. -->
    <version_command>trycycler --version</version_command>
    <!-- Command template (Cheetah + shell):
         1. symlink every selected assembly into ./assemblies under a sanitised, unique file name;
         2. run "trycycler cluster" on those links and the long reads, writing into ./initial_clusters;
         3. move the distance matrix and the tree onto the two declared output datasets;
         4. call the bundled trycycler.py helper on ./initial_clusters (its behaviour is defined in
            that script, which is not visible here).
         Every step is chained with a shell AND, so the first failing step fails the job. -->
    <command detect_errors='exit_code'><![CDATA[
        #import re
        mkdir -p initial_clusters assemblies &&
        ## Names already used for symlinks. Two datasets may share an identifier (or sanitise to the
        ## same string); without de-duplication the second ln would fail with "File exists".
        #set $used_names = []
        #for $input_file in $assemblies
            ## Replace every character that is not a word character, dash or dot.
            #set $base_name = re.sub('[^\w\-\.]', '_', str($input_file.element_identifier))
            #set $name = $base_name
            #set $copy_index = 1
            ## Only colliding names are altered; a prefix is used so the file extension is kept.
            #while $name in $used_names
                #set $name = 'copy%d_%s' % ($copy_index, $base_name)
                #set $copy_index += 1
            #end while
            #silent $used_names.append($name)
            ln -s '$input_file' 'assemblies/$name' &&
        #end for
        trycycler cluster --assemblies assemblies/*
            --reads '$reads'
            --min_contig_len $min_contig_len
            --min_contig_depth $min_contig_depth
            --distance $distance
            --threads \${GALAXY_SLOTS:-2}
            --out_dir 'initial_clusters' &&
        mv initial_clusters/contigs.phylip '$output_phylip' &&
        mv initial_clusters/contigs.newick '$output_newick' &&
        python3 '$__tool_directory__/trycycler.py' 'cluster' 'initial_clusters'
    ]]></command>
    <!-- User-facing parameters. The two data inputs are referenced by name in the command template;
         the three numeric parameters take their names from the matching trycycler option and are
         range-checked through min/max before the command is built. -->
    <inputs>
        <param name='assemblies'
               type='data'
               format='fasta,fasta.gz'
               multiple='true'
               label='Assembled sequences datasets'
               help='Input assemblies whose contigs will be clustered (multiple FASTA files)' />
        <param name='reads'
               type='data'
               format='fastq,fastq.gz'
               label='Long-read datasets'
               help='Long reads (FASTQ format) used to generate the assemblies' />
        <param argument='--min_contig_len'
               type='integer'
               min='100'
               max='5000'
               value='1000'
               label='Minimum contig length'
               help='Contigs shorter than this are thrown out on the assumption that they are either incomplete or spurious. The default value is 1000, as plasmids smaller than that are very rare.' />
        <param argument='--min_contig_depth'
               type='float'
               min='0.01'
               max='1'
               value='0.1'
               label='Minimum contig depth'
               help='This controls how Trycycler filters out contigs with a low read depth. It is a multiple of the mean read depth for the assembly. For example, if an assembly has a mean depth of 90x and this setting is 0.1 (the default), then any contig with depth lower than 9x will be removed.' />
        <param argument='--distance'
               type='float'
               min='0.001'
               max='0.1'
               value='0.01'
               label='Mash distance threshold'
               help='This is the Mash distance threshold used when defining clusters, and the default threshold is 0.01. Smaller thresholds (e.g. 0.005) can result in a larger number of tighter clusters. Larger thresholds (e.g. 0.02) can result in a smaller number of looser clusters.' />
    </inputs>
    <!-- Declared results: two single datasets, which the command template fills by moving
         contigs.phylip and contigs.newick onto them, plus one list collection gathered from
         whatever files remain in the initial_clusters directory. -->
    <outputs>
        <!-- Distance matrix. -->
        <data name='output_phylip'
              format='phylip'
              label='${tool.name} on ${on_string}: phylip' />
        <!-- Tree. -->
        <data name='output_newick'
              format='newick'
              label='${tool.name} on ${on_string}: newick' />
        <!-- Cluster files discovered after the job has run. -->
        <collection name='initial_clusters'
                    type='list'
                    label='${tool.name} on ${on_string}'>
            <discover_datasets pattern='__designation_and_ext__'
                               format='fasta'
                               directory='initial_clusters' />
        </collection>
    </outputs>
    <!-- All four tests use the same four gzipped assemblies and the same read set. Each one expects
         a two-element list collection and compares only the cluster_01 element with a fixture,
         tolerating up to 20 differing lines. They differ in the three numeric parameters. -->
    <tests>
        <!-- Test 1: numeric parameters left at their declared defaults. -->
        <test>
            <param name='assemblies' value='assembly_00.fasta.gz,assembly_01.fasta.gz,assembly_02.fasta.gz,assembly_03.fasta.gz' />
            <param name='reads' value='reads.fastq.gz' />
            <output name='output_phylip' file='contigs_01.phylip' />
            <output name='output_newick' file='contigs_01.newick' />
            <output_collection name='initial_clusters' type='list' count='2'>
                <element name='cluster_01' file='cluster_01.fasta' ftype='fasta' lines_diff='20' />
            </output_collection>
        </test>
        <!-- Test 2: min_contig_len 900, min_contig_depth 0.05, distance 0.05. -->
        <test>
            <param name='assemblies' value='assembly_00.fasta.gz,assembly_01.fasta.gz,assembly_02.fasta.gz,assembly_03.fasta.gz' />
            <param name='reads' value='reads.fastq.gz' />
            <param name='min_contig_len' value='900' />
            <param name='min_contig_depth' value='0.05' />
            <param name='distance' value='0.05' />
            <output name='output_phylip' file='contigs_02.phylip' />
            <output name='output_newick' file='contigs_02.newick' />
            <output_collection name='initial_clusters' type='list' count='2'>
                <element name='cluster_01' file='cluster_02.fasta' ftype='fasta' lines_diff='20' />
            </output_collection>
        </test>
        <!-- Test 3: min_contig_len 850, min_contig_depth 0.01 (the declared minimum), distance 0.09. -->
        <test>
            <param name='assemblies' value='assembly_00.fasta.gz,assembly_01.fasta.gz,assembly_02.fasta.gz,assembly_03.fasta.gz' />
            <param name='reads' value='reads.fastq.gz' />
            <param name='min_contig_len' value='850' />
            <param name='min_contig_depth' value='0.01' />
            <param name='distance' value='0.09' />
            <output name='output_phylip' file='contigs_03.phylip' />
            <output name='output_newick' file='contigs_03.newick' />
            <output_collection name='initial_clusters' type='list' count='2'>
                <element name='cluster_01' file='cluster_03.fasta' ftype='fasta' lines_diff='20' />
            </output_collection>
        </test>
        <!-- Test 4: min_contig_len 1100, min_contig_depth 0.02, distance 0.07. -->
        <test>
            <param name='assemblies' value='assembly_00.fasta.gz,assembly_01.fasta.gz,assembly_02.fasta.gz,assembly_03.fasta.gz' />
            <param name='reads' value='reads.fastq.gz' />
            <param name='min_contig_len' value='1100' />
            <param name='min_contig_depth' value='0.02' />
            <param name='distance' value='0.07' />
            <output name='output_phylip' file='contigs_04.phylip' />
            <output name='output_newick' file='contigs_04.newick' />
            <output_collection name='initial_clusters' type='list' count='2'>
                <element name='cluster_01' file='cluster_04.fasta' ftype='fasta' lines_diff='20' />
            </output_collection>
        </test>
    </tests>
    <help><![CDATA[
.. class:: infomark

**Purpose**
                    
The *Trycycler cluster* tool carries out complete-linkage clustering of all contig sequences based on their `Mash distance <https://genomebiology.biomedcentral.com/articles/10.1186/s13059-016-0997-x/>`_, a fast sequence distance estimator that uses the MinHash algorithm. It also serves to exclude any spurious, incomplete or badly misassembled contigs.

If your contigs do not form clear clusters, that indicates that the input assemblies are inconsistent and unreliable. If you find yourself in this situation (struggling to identify which clusters are good and which are bad), then you probably need to get better long-read data (longer and/or deeper) and try again.            
    
----
                    
.. class:: infomark
                    
**Input**

This tool requires two different inputs: a set of multiple separate assemblies and a long-read set.

----
                    
.. class:: infomark
                    
**Output**
                    
**Trycycler cluster** generates three outputs:
        
\* A matrix of the Mash distances between contigs (phylip output).

\* A `FastME tree <https://academic.oup.com/mbe/article/32/10/2798/1212138>`_ of the contigs built from the distance matrix. It can be visualised in the **newick display** tool.

\* A collection list which contains the clusters.

\

**Choose your clusters** 

After running **Trycycler cluster**, you need to extract the cluster datasets from the collection by using the **Extract Element from a collection based on a name** tool. It is up to you to choose which of the clusters are good and which are bad. This can be somewhat subjective, so there is not an exact procedure for you to follow.

Generally speaking, a good cluster contains many contigs (ideally one from each assembly) which are all very close to each other and have realistic read depths. A bad cluster contains a small number of contigs (maybe just one) which might have low read depths. The tree can be useful in making these decisions, though interpret it with a grain of salt, as the contig sequences are not necessarily related in a tree-like manner.

If you have prior knowledge about what your genome should look like, that information can be quite useful in deciding which clusters are good. E.g. if you happened to know that your genome contains a 150 kbp plasmid, then you can expect one of your good clusters to have contigs of about that size.

You might also decide at this point that the default value for --distance (0.01) was not quite right. E.g. if your tree contains two very close clusters that you think should actually be one cluster, you can run Trycycler cluster again with a larger distance threshold. Another thing to keep in mind: contamination can happen. This most often occurs through cross-barcode contamination, where a contig in one assembly actually belongs to a different genome from the same multiplexed sequencing run.


----                                                                                                    
                                                                                                        
.. class:: infomark

@PIPELINE@
    ]]>    </help>
    <expand macro='citations' />
</tool>