view trycycler_reconcile_msa.xml @ 7:ed312479d9eb draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/trycycler commit 9da13f57f93e69fd463f6245af45674fe011b861
author iuc
date Wed, 06 Nov 2024 13:30:32 +0000
parents cb1e3db43020
children
line wrap: on
line source

<tool id="trycycler_reconcile_msa" name="Trycycler reconcile/msa" version="@TOOL_VERSION@" profile="20.01">
    <description>reconcile the contigs within each cluster and perform a multiple sequence alignment</description>
    <!-- Shared definitions (the macros expanded below and in the citations) are imported from macros.xml. -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <!-- EDAM annotations and software requirements are supplied by macros of the same names. -->
    <expand macro="edam_ontology" />
    <expand macro="requirements" />
    <!-- Command whose output is recorded as the version of the underlying trycycler executable. -->
    <version_command>trycycler --version</version_command>
    <!-- Stages the cluster FASTA under selected_cluster/, runs the bundled trycycler.py helper,
         then chains "trycycler reconcile" and "trycycler msa" and moves their result files
         onto the two tool outputs. A non-zero exit code of any step marks the job as failed. -->
    <command detect_errors="exit_code"><![CDATA[
        #import re
        ## Build a shell-safe file name: every character other than word characters, '-' and '.'
        ## in "<element_identifier>.fasta" is replaced with '_'.
        #set $name = re.sub('[^\w\-\.]', '_', str($input_cluster.element_identifier)+".fasta")
        ## NOTE(review): str.strip(".fasta") removes any of the characters . f a s t from BOTH ends
        ## of the name; it does not remove the ".fasta" suffix. An identifier that begins with one
        ## of those letters therefore yields a truncated directory name. Left unchanged because the
        ## helper script trycycler.py may derive the directory the same way - confirm before
        ## replacing this with a real suffix removal.
        #set $folder = $name.strip(".fasta")
        #set $fullpath = "/".join(["selected_cluster",$folder])
        ## Create the cluster directory layout and link the input next to it.
        mkdir -p "${fullpath}/1_contigs" &&
        ln -s "${input_cluster}" "selected_cluster/${name}" &&
        ## Helper shipped with the tool; presumably it fills 1_contigs from the multi-FASTA (not visible here).
        python3 "$__tool_directory__"/trycycler.py reconcile "selected_cluster/${name}" &&
        trycycler reconcile --cluster_dir "${fullpath}"
            --reads '$reads'
            #if $linear
                --linear
            #end if
            ## "initial_ckeck" is the (misspelled) section name declared in the inputs.
            --max_mash_dist $initial_ckeck.max_mash_dist
            --max_length_diff $initial_ckeck.max_length_diff
            --max_add_seq $circularisation.max_add_seq
            --max_add_seq_percent $circularisation.max_add_seq_percent
            --max_trim_seq $circularisation.max_trim_seq
            --max_trim_seq_percent $circularisation.max_trim_seq_percent
            --min_identity $final_check.min_identity
            --min_1kbp_identity $final_check.min_1kbp_identity
            --threads \${GALAXY_SLOTS:-2} &&
        trycycler msa --cluster_dir "${fullpath}"
            --kmer $msa.kmer
            --step $msa.step
            --lookahead $msa.lookahead
            --threads \${GALAXY_SLOTS:-2} &&
        ## Hand the reconcile and msa result files over to the Galaxy outputs.
        mv '${fullpath}/2_all_seqs.fasta' '$reconciled_cluster' &&
        mv '${fullpath}/3_msa.fasta' '$aligned_cluster'
    ]]>    </command>
    <!-- User-facing parameters: the cluster FASTA, the long reads, and one section per group of
         trycycler reconcile / msa options. Parameter names mirror the command-line flags. -->
    <inputs>
        <param name="input_cluster" type="data" format="fasta" label="Cluster multi-FASTA dataset" help="The input should be an independent cluster generated by the *trycycler cluster* tool" />
        <param name="reads" type="data" format="fastq,fastq.gz" label="Long-read datasets" help="Long reads (FASTQ format) used to generate the assemblies" />
        <param argument="--linear" type="boolean" truevalue="--linear" falsevalue="" label="Input contigs are not circular" help="Use this option if your input contigs are not circular. It will disable the circularisation-correction steps in Trycycler reconcile." />
        <!-- NOTE: "initial_ckeck" is a misspelling of "initial_check". The name is kept because the
             command template references it and renaming would change the parameter path of saved workflows. -->
        <section name="initial_ckeck" title="Reconcile initial check options" expanded="true">
            <param argument="--max_mash_dist" type="float" min="0" max="0.2" value="0.02" label="Max Mash distance" help="If any of the sequences have a pairwise Mash distance of more than this (default = 0.02), then the contigs will fail the initial check." />
            <param argument="--max_length_diff" type="float" min="1" max="2" value="1.1" label="Max relative length factor" help="If any of the sequences have a pairwise relative length factor of more than this, then the contigs will fail the initial check. For example, if set to 1.1 (the default), then no contig can be more than 10% longer than any other." />
        </section>
        <section name="circularisation" title="Reconcile circularization options" expanded="true">
            <param argument="--max_add_seq" type="integer" min="0" max="4000" value="1000" label="Max number of bp to add to circularize" help="If set to 1000, Trycycler will be willing to add up to 1000 bp to a contig to circularise it. Any contig which requires more than 1000 bp added to circularise will cause Trycycler reconcile to fail." />
            <param argument="--max_add_seq_percent" type="integer" min="0" max="10" value="5" label="Max percentage of a contig length to add to circularize" help="If set to 5, Trycycler will be willing to add up to 5% of a contig's length to circularise it. Any contig which requires more than 5% of its length added to circularise will cause Trycycler reconcile to fail." />
            <param argument="--max_trim_seq" type="integer" min="0" max="100000" value="50000" label="Max number of bp to trim to circularize" help="If set to 50000, Trycycler will be willing to remove up to 50000 bp from a contig to circularise it. Any contig which requires more than 50000 bp removed to circularise will cause Trycycler reconcile to fail." />
            <param argument="--max_trim_seq_percent" type="integer" min="0" max="20" value="10" label="Max percentage of a contig length to trim to circularize" help="If set to 10, Trycycler will be willing to remove up to 10% of a contig's length to circularise it. Any contig which requires more than 10% of its length removed to circularise will cause Trycycler reconcile to fail." />
        </section>
        <section name="final_check" title="Reconcile final check" expanded="true">
            <param argument="--min_identity" type="integer" min="70" max="100" value="98" label="Min global alignment percentage identity" help="If any of the sequences have a pairwise global alignment percent identity of less than this (default = 98), then the contigs will fail the final check." />
            <param argument="--min_1kbp_identity" type="integer" min="0" value="25" label="Minimum allowed pairwise 1kbp window identity" help="If any of the sequences have lower pairwise window identity, it is discarded." />
        </section>
        <section name="msa" title="Multiple sequence alignment (MSA) options" expanded="true">
            <param argument="--kmer" type="integer" min="20" max="45" value="32" label="k-mer size" help="The k-mer size used for sequence partitioning (default = 32)" />
            <param argument="--step" type="integer" min="500" max="1500" value="1000" label="Step size" help="The step size used for sequence partitioning (default = 1000)." />
            <param argument="--lookahead" type="integer" min="500" max="1500" value="1000" label="Look-ahead margin" help="The look-ahead margin used for sequence partitioning (this wrapper defaults to 1000; Trycycler's own command-line default is 10000)." />
        </section>
    </inputs>
    <!-- Both datasets are written by the command itself, which moves 2_all_seqs.fasta and
         3_msa.fasta onto these output paths. No from_work_dir is declared: the previous values
         pointed at a directory ("selected_cluster") and at a path that is never created
         ("selected_clusters"), neither of which is a result file. -->
    <outputs>
        <data name="reconciled_cluster" format="fasta" label="Trycycler reconcile on ${input_cluster.element_identifier}" />
        <data name="aligned_cluster" format="fasta" label="Trycycler msa on ${input_cluster.element_identifier}" />
    </outputs>
    <!-- Functional tests. Section and parameter names must match the declarations in the inputs:
         the initial-check section is declared as "initial_ckeck" (sic) and the percentage option
         as "max_add_seq_percent"; values must stay inside the declared min/max ranges
         (max_mash_dist is limited to 0.2). -->
    <tests>
        <!-- Test 1: all options left at their defaults. -->
        <test>
            <param name="input_cluster" value="cluster_01.fasta" />
            <param name="reads" value="reads.fastq.gz" />
            <output name="reconciled_cluster" file="reconciled_cluster_01.fasta" />
            <output name="aligned_cluster" file="aligned_cluster_01.fasta" />
        </test>
        <!-- Test 2: relaxed Mash distance, custom add/trim limits, window identity and k-mer size. -->
        <test>
            <param name="input_cluster" value="cluster_01.fasta" />
            <param name="reads" value="reads.fastq.gz" />
            <section name="initial_ckeck">
                <param name="max_mash_dist" value="0.03" />
            </section>
            <section name="circularisation">
                <param name="max_add_seq_percent" value="7" />
                <param name="max_trim_seq" value="47000" />
            </section>
            <section name="final_check">
                <param name="min_1kbp_identity" value="30" />
            </section>
            <section name="msa">
                <param name="kmer" value="30" />
            </section>
            <output name="reconciled_cluster" file="reconciled_cluster_02.fasta" />
            <output name="aligned_cluster" file="aligned_cluster_02.fasta" />
        </test>
        <!-- Test 3: custom absolute add/trim limits, global identity and step size. -->
        <test>
            <param name="input_cluster" value="cluster_01.fasta" />
            <param name="reads" value="reads.fastq.gz" />
            <section name="initial_ckeck">
                <param name="max_mash_dist" value="0.03" />
            </section>
            <section name="circularisation">
                <param name="max_add_seq" value="900" />
                <param name="max_trim_seq" value="45000" />
            </section>
            <section name="final_check">
                <param name="min_identity" value="97" />
            </section>
            <section name="msa">
                <param name="step" value="1100" />
            </section>
            <output name="reconciled_cluster" file="reconciled_cluster_03.fasta" />
            <output name="aligned_cluster" file="aligned_cluster_03.fasta" />
        </test>
        <!-- Test 4: relaxed length factor, trim percentage, both identity thresholds and k-mer size. -->
        <test>
            <param name="input_cluster" value="cluster_01.fasta" />
            <param name="reads" value="reads.fastq.gz" />
            <section name="initial_ckeck">
                <param name="max_length_diff" value="1.2" />
            </section>
            <section name="circularisation">
                <param name="max_add_seq" value="920" />
                <param name="max_trim_seq_percent" value="12" />
            </section>
            <section name="final_check">
                <param name="min_identity" value="95" />
                <param name="min_1kbp_identity" value="25" />
            </section>
            <section name="msa">
                <param name="kmer" value="33" />
            </section>
            <output name="reconciled_cluster" file="reconciled_cluster_04.fasta" />
            <output name="aligned_cluster" file="aligned_cluster_04.fasta" />
        </test>
        <!-- Test 5: add percentage, trim limit, identity, step size and look-ahead margin. -->
        <test>
            <param name="input_cluster" value="cluster_01.fasta" />
            <param name="reads" value="reads.fastq.gz" />
            <section name="initial_ckeck">
                <param name="max_mash_dist" value="0.03" />
            </section>
            <section name="circularisation">
                <param name="max_add_seq_percent" value="8" />
                <param name="max_trim_seq" value="45300" />
            </section>
            <section name="final_check">
                <param name="min_identity" value="97" />
            </section>
            <section name="msa">
                <param name="step" value="1100" />
                <param name="lookahead" value="980" />
            </section>
            <output name="reconciled_cluster" file="reconciled_cluster_05.fasta" />
            <output name="aligned_cluster" file="aligned_cluster_05.fasta" />
        </test>
        <test>
            <!-- Same settings as test 5, but the input file name resembles a collection element identifier (without extension). -->
            <param name="input_cluster" value="cluster_01" />
            <param name="reads" value="reads.fastq.gz" />
            <section name="initial_ckeck">
                <param name="max_mash_dist" value="0.03" />
            </section>
            <section name="circularisation">
                <param name="max_add_seq_percent" value="8" />
                <param name="max_trim_seq" value="45300" />
            </section>
            <section name="final_check">
                <param name="min_identity" value="97" />
            </section>
            <section name="msa">
                <param name="step" value="1100" />
                <param name="lookahead" value="980" />
            </section>
            <output name="reconciled_cluster" file="reconciled_cluster_05.fasta" />
            <output name="aligned_cluster" file="aligned_cluster_05.fasta" />
        </test>
    </tests>
    <help><![CDATA[

.. class:: infomark

**Purpose**

This tool integrates two Trycycler commands: **Trycycler reconcile** and **Trycycler msa**.

The **Trycycler reconcile** tool carries out four routines:

    ::

        * Perform an initial check to make sure the contigs look sufficiently similar to each other.
        * Ensure that all contig sequences are on the same strand.
        * If the replicon is circular it fixes any circularisation issues (i.e. add/remove sequence at each contig's start/end as necessary)
        * Perform a final alignment check to make sure the normalised/circularised contigs are sufficiently similar to each other for the next step

After that, **Trycycler msa** takes the reconciled contig sequences and runs a multiple sequence alignment.

----
                    
.. class:: infomark
                    
**Input**

This tool requires the clustered contigs generated by the **Trycycler cluster** tool, as well as the long-read dataset that was used with **Trycycler cluster**.


----
                    
.. class:: infomark
                    
**Output**
                    
**Trycycler reconcile/msa** generates two datasets:

    ::

        * A multi-FASTA file dataset for each contig ready for multiple sequence alignment.
        * A FASTA-formatted multiple sequence alignment for each contig ready for use in generating a consensus.

----

.. class:: infomark

**Manual intervention**

Trycycler reconcile may not complete successfully, in which case you will have to intervene and run it again. Often this simply means excluding whichever contig is causing the problem, usually due to significant length differences between contigs, or particularly bad pairwise identity or large insertion/deletion. It can be done by using the **Filter sequences by length** tool.

Throwing out troublesome contigs at this step is normal. The set of input assemblies is deliberately redundant so that losing one or two contigs is not a problem.

You should aim to have around four to eight contigs left after running Trycycler reconcile. Less than that (two or three) will not provide as many variants for the next steps and may affect your consensus sequence quality. More than that (nine or more) is fine but probably won't be of any extra benefit. If you have too few contigs for your cluster, you might want to consider going back to the start of the pipeline and generating more input assemblies. If you have plenty of contigs, you can delete some of the worst ones and run Trycycler reconcile again. Use the final check to guide you: delete the contigs with the lowest identities and largest indels relative to the other contigs.

Unlike in previous steps of **Trycycler**, the msa step should be hands-off. I.e. no manual intervention is required – just run it and wait for it to finish.


----                                                                                                    
                                                                                                        
.. class:: infomark

@PIPELINE@
    ]]>    </help>
    <expand macro='citations' />
</tool>