view integron_finder.xml @ 1:4768f7f8e93f draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/blob/master/tools/integron_finder commit 04aa903b53ba0596d486d2ce4a1b4f1d7125846f
author iuc
date Fri, 17 Feb 2023 18:36:09 +0000
parents 1ae00120dd24
children 392f100eebcd
line wrap: on
line source

<tool id="integron_finder" name="Integron Finder" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description> is a program that detects integrons in DNA sequences</description>
    <!-- Shared definitions live in macro.xml (not part of this file). The macros
         expanded in this tool (edam_info, xrefs, requirements, citations) are
         expected to be defined there; presumably the @TOOL_VERSION@,
         @VERSION_SUFFIX@, @PROFILE@ and @THREADS@ tokens are as well. -->
    <macros>
       <import>macro.xml</import>
    </macros>
    <expand macro="edam_info"/>
    <expand macro="xrefs"/>
    <expand macro="requirements"/>
    <!--
        Command template (Cheetah): runs integron_finder on the selected replicon
        file, then renames the result directory to a fixed name.

        - Boolean parameters ($local_max, $promoter_attI, $gbk, $pdf and the
          booleans inside the settings sections) expand to their truevalue flag
          or to an empty string, so they are inserted without a condition.
        - $type_replicon, the topology file and the attC covariance model are
          only added when a value was supplied.
        - The four numeric attC settings are always passed.
        - The trailing mv renames the directory matching
          Results_Integron_Finder_* to Results_Integron_Finder, which is the
          path the outputs section reads from (from_work_dir and the
          discover_datasets directory attribute).
        - @THREADS@ is a token that is not defined in this file; presumably it
          is provided by macro.xml.
    -->
    <command detect_errors="aggressive"><![CDATA[
        integron_finder
        '$sequence'
        --cpu @THREADS@
        --keep-tmp
        $local_max
        #if $type_replicon
          $type_replicon
        #end if
        #if $topology_file
            --topology-file '$topology_file'
        #end if
        $promoter_attI
        -dt $settings.attc_settings.dist_thresh
        --calin-threshold $settings.attc_settings.calin_threshold
        --max-attc-size $settings.attc_settings.max_attc_size
        --min-attc-size $settings.attc_settings.min_attc_size
        $settings.attc_settings.keep_palindromes
        #if $settings.attc_settings.covar_matrix
            --attc-model '$settings.attc_settings.covar_matrix'
        #end if
        $settings.protein_settings.no_proteins
        $settings.protein_settings.union_integrases
        $settings.protein_settings.func_annot
        $gbk
        $pdf
        && mv Results_Integron_Finder_* Results_Integron_Finder
    ]]></command>
    <inputs>
        <!-- Main input: the FASTA file that is scanned for integrons. -->
        <param name="sequence" type="data" format="fasta" label="Replicon file" help="Replicon can be an entire chromosome, a contig, PCR fragments...">
            <!-- Only accept datasets whose sequence-count metadata is at most 10000. -->
            <validator type="expression" message="Integron Finder has problems with large multi-FASTA files. Please assemble your sequences or split them into files with at most 10,000 sequences."><![CDATA[value is not None and value.metadata.sequences <= 10000]]></validator>
        </param>
        <param name="local_max" argument="--local-max" type="boolean" checked="false" truevalue="--local-max" falsevalue="" label="Thorough local detection" help="This option allows a more sensitive search. It will be slower (depending on the number of hits) if integrons are found, but will be as fast if nothing is detected, and it will not increase the false positive rate." />
        <!-- Optional select: the chosen option value is itself the command-line flag;
             when nothing is selected no topology flag is emitted. -->
        <param name="type_replicon" type="select" optional="true" label="Default replicon topology" help="Set the default topology for replicons: linear or circular (default: no topology)">
            <option value="--linear">linear (--linear)</option>
            <option value="--circ">circular (--circ)</option>
        </param>
        <param name="topology_file" argument="--topology-file" type="data" format="txt" optional="true" label="Select a topology file from your history"/>
        <param name="promoter_attI" argument="--promoter-attI" type="boolean" checked="false" truevalue="--promoter-attI" falsevalue="" label="Search also for promoter and attI sites?" />
        <!-- The two switches below also control whether the corresponding output
             collections (genbank_out, pdf_out) are created. -->
        <param argument="--gbk" type="boolean" checked="false" truevalue="--gbk" falsevalue="" label="GenBank output?" help="Generate a GenBank file with the sequence annotated with the same annotations as the .integrons file."/>
        <param argument="--pdf" type="boolean" checked="false" truevalue="--pdf" falsevalue="" label="PDF output?" help="For each complete integron, a simple graphic of the region is depicted (in PDF format)"/>
        <section name="settings" title="Advanced Parameters" expanded="false">
            <!-- attC site detection and clustering settings. -->
            <section name="attc_settings" title="Attc options" expanded="false">
                <param name="dist_thresh" argument="--distance-thresh" type="integer" value="4000" min="0" label="Threshold for clustering (in bases)" help="By default, to cluster an array of attC sites and an integron integrase, they must be less than 4 kb apart. You can change this value here." />
                <param name="calin_threshold" argument="--calin-threshold" type="integer" value="2" min="0" label="Threshold to filter CALIN" help="Keep 'CALIN' only if the number of attC sites >= calin-threshold" />
                <param name="max_attc_size" argument="--max-attc-size" type="integer" value="200" min="0" label="Maximum value for attC size"/>
                <param name="min_attc_size" argument="--min-attc-size" type="integer" value="40" min="0" label="Minimum value for attC size" />
                <param name="keep_palindromes" argument="--keep-palindromes" type="boolean" checked="false" truevalue="--keep-palindromes" falsevalue="" label="Keep palindromes with the highest evalue" help="For a given hit, if the palindromic version is found, don't remove the one with the highest evalue"/>
                <param name="covar_matrix" argument="--attc-model" type="data" optional="true" format="txt" label="Covariance Matrix" />
            </section>
            <!-- Integrase / protein annotation settings. -->
            <section name="protein_settings" title="Protein options" expanded="false">
                <param name="no_proteins" argument="--no-proteins" type="boolean" checked="false" truevalue="--no-proteins" falsevalue="" label="Just look for attC sites" help="When enabled, it does not annotate CDS and does not find integrase."/>
                <param name="union_integrases" argument="--union-integrases" type="boolean" checked="false" truevalue="--union-integrases" falsevalue="" label="Use the union of the hits" help="Instead of taking the intersection of hits from the Phage_int profile (Tyr recombinases) and the integron_integrase profile, use the union of the hits" />
                <param name="func_annot" argument="--func-annot" type="boolean" checked="false" truevalue="--func-annot" falsevalue="" label="Annotate cassettes given HMM profiles" />
            </section>
        </section>
        <!-- Not passed to integron_finder: only used by the output filter of the log dataset. -->
        <param name="no_logfile" type="boolean" checked="false" truevalue="true" falsevalue="false" label="Remove log file"/>
    </inputs>
    <outputs>
        <!-- All outputs are read from Results_Integron_Finder/, the directory the
             command section renames the result directory to. -->

        <!-- GenBank files: only collected when the gbk option is enabled.
             Uses the Galaxy "genbank" datatype extension for the discovered files. -->
        <collection name="genbank_out" type="list" label="Genbank files from [$tool.name] on $[on_string]">
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.gbk" format="genbank" visible="false" directory="Results_Integron_Finder/" />
            <filter>gbk</filter>
        </collection>
        <!-- Log file: dropped from the history when "Remove log file" is checked. -->
        <data name="integron_log" format="txt" from_work_dir="Results_Integron_Finder/integron_finder.out" label="Log from [$tool.name] on $[on_string]">
            <filter>no_logfile == False</filter>
        </data>
        <!-- The two tables are always produced; their file names are matched with a glob. -->
        <data name="integrons_table" format="tsv" from_work_dir="Results_Integron_Finder/*.integrons" label="Integrons annotations from [$tool.name] on $[on_string]"/>
        <data name="summary" format="tsv" from_work_dir="Results_Integron_Finder/*.summary" label="Summary from [$tool.name] on $[on_string]"/>
        <!-- PDF graphics: only collected when the pdf option is enabled. -->
        <collection name="pdf_out" type="list" label="Graphic from [$tool.name] on $[on_string]">
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.pdf" format="pdf" visible="false" directory="Results_Integron_Finder/" />
            <filter>pdf</filter>
        </collection>
    </outputs>
    <tests>
        <!-- Test 1: default parameters; the log file is kept, so three outputs
             are expected (log, integrons table, summary). -->
        <test expect_num_outputs="3">
            <param name="sequence" value="input.fasta"/>
            <output name="integron_log" value="integron_log" lines_diff="3" />
            <output name="integrons_table" value="test1_integrons_table.tsv" lines_diff="3"/>
            <output name="summary" value="summary.tsv" lines_diff="3"/>
        </test>
        <!-- Test 2: thorough local detection with linear default topology.
             From here on the log file is removed, leaving two outputs. -->
        <test expect_num_outputs="2">
            <param name="sequence" value="input.fasta"/>
            <param name="local_max" value="true"/>
            <param name="type_replicon" value="--linear"/>
            <param name="no_logfile" value="true"/>
            <output name="integrons_table" value="test2_integrons_table.tsv" lines_diff="3" />
            <output name="summary" value="summary.tsv" lines_diff="4" />
        </test>
        <!-- Test 3: circular default topology. -->
        <test expect_num_outputs="2">
            <param name="sequence" value="input.fasta"/>
            <param name="type_replicon" value="--circ"/>
            <param name="no_logfile" value="true"/>
            <output name="integrons_table" value="test3_integrons_table.tsv" lines_diff="3" />
            <output name="summary" value="summary.tsv" lines_diff="3" />
        </test>
        <!-- Test 4: topology supplied through a topology file. -->
        <test expect_num_outputs="2">
            <param name="sequence" value="input.fasta"/>
            <param name="topology_file" value="topology.txt"/>
            <param name="no_logfile" value="true"/>
            <output name="integrons_table" value="test4_integrons_table.tsv" lines_diff="3" />
            <output name="summary" value="summary.tsv" lines_diff="5" />
        </test>
        <!-- Test 5: promoter and attI site search enabled. -->
        <test expect_num_outputs="2">
            <param name="sequence" value="input.fasta"/>
            <param name="promoter_attI" value="true"/>
            <param name="no_logfile" value="true"/>
            <output name="integrons_table" value="test5_integrons_table.tsv" lines_diff="3" />
            <output name="summary" value="summary.tsv" lines_diff="3" />
        </test>
        <!-- Test 6: GenBank and PDF outputs enabled; both collections are created
             in addition to the two tables, giving four outputs. -->
        <test expect_num_outputs="4">
            <param name="sequence" value="input.fasta"/>
            <param name="gbk" value="true"/>
            <param name="pdf" value="true"/>
            <param name="no_logfile" value="true"/>
            <output_collection name="genbank_out" type="list">
                <element name="ACBA.007.P01_13">
                    <assert_contents>
                        <has_text text="MKTATAPLPPLRSVKVLDQLRERIRYLHYSLRTEQAYVNWVRAFI"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="pdf_out" type="list">
                <element name="ACBA.007.P01_13_1">
                    <assert_contents>
                        <has_text text=">"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output name="integrons_table" value="test6_integrons_table.tsv" lines_diff="3" />
            <output name="summary" value="summary.tsv" lines_diff="3" />
        </test>
        <!-- Test 7: non-default attC thresholds and size limits. -->
        <test expect_num_outputs="2">
            <param name="sequence" value="input.fasta"/>
            <param name="no_logfile" value="true"/>
            <section name="settings">
                <section name="attc_settings">
                    <param name="dist_thresh" value="2000"/>
                    <param name="calin_threshold" value="3"/>
                    <param name="max_attc_size" value="188"/>
                    <param name="min_attc_size" value="30"/>
                    <param name="keep_palindromes" value=""/>
                </section>
            </section>
            <output name="integrons_table" value="test7_integrons_table.tsv" lines_diff="3" />
            <output name="summary" value="summary.tsv" lines_diff="3" />
        </test>
        <!-- Test 8: user-supplied attC covariance model. -->
        <test expect_num_outputs="2">
            <param name="sequence" value="input.fasta"/>
            <param name="no_logfile" value="true"/>
            <section name="settings">
                <section name="attc_settings">
                    <param name="covar_matrix" value="covar.txt"/>
                </section>
            </section>
            <output name="integrons_table" value="test8_integrons_table.tsv" lines_diff="10" />
            <output name="summary" value="summary.tsv" lines_diff="3" />
        </test>
        <!-- Test 9: attC-only search (no protein annotation); compared against
             its own summary fixture. -->
        <test expect_num_outputs="2">
            <param name="sequence" value="input.fasta"/>
            <param name="no_logfile" value="true"/>
            <section name="settings">
                <section name="protein_settings">
                    <param name="no_proteins" value="true"/>
                </section>
            </section>
            <output name="integrons_table" value="test9_integrons_table.tsv" lines_diff="3" />
            <output name="summary" value="test9_summary.tsv" lines_diff="3" />
        </test>
        <!-- Test 10: union of integrase hits plus functional annotation of cassettes. -->
        <test expect_num_outputs="2">
            <param name="sequence" value="input.fasta"/>
            <param name="no_logfile" value="true"/>
            <section name="settings">
                <section name="protein_settings">
                    <param name="union_integrases" value="true" />
                    <param name="func_annot" value="true"/>
                </section>
            </section>
            <output name="integrons_table" value="test10_integrons_table.tsv" lines_diff="3" />
            <output name="summary" value="summary.tsv" lines_diff="3" />
        </test>
    </tests>
    <help><![CDATA[

How does it work?
==================

- First, IntegronFinder annotates the DNA sequence's CDS with Prodigal.

- Second, IntegronFinder independently detects the integron integrase and *attC*
  recombination sites. The integron integrase is detected by using the intersection
  of two HMM profiles:

  - one specific to tyrosine recombinases (PF00589)
  - one specific to the integron integrase, near the patch III domain of tyrosine recombinases.

The *attC* recombination site is detected with a covariance model (CM), which
models the secondary structure in addition to the few conserved sequence
positions.


- Third, the results are integrated, and IntegronFinder distinguishes 3 types of
  elements:

  - complete integron
      Integron with integron integrase nearby *attC* site(s)
  - In0 element
      Integron integrase only, without any *attC* site nearby
  - CALIN element
      Cluster of *attC* sites Lacking INtegrase nearby.
      A rule of thumb to avoid false positives is to filter out singleton
      *attC* sites.

IntegronFinder can also annotate gene cassettes (CDS nearby *attC* sites) using
Resfams, a database of HMM profiles aiming at annotating antibiotic resistance
genes. This database is provided, but users can add any other HMM profile
database of interest.

When available, IntegronFinder annotates the promoters and attI sites by pattern
matching.
    ]]></help>
    <expand macro="citations"/>
</tool>