Mercurial > repos > galaxy-australia > panaroo

<tool id="panaroo" name="Panaroo" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>A Bacterial Pangenome Analysis Pipeline</description>
    <macros>
          <import>macros.xml</import>
    </macros>
    <expand macro="edam_ontology"/>
    <expand macro="biotools"/>
    <expand macro="requirements"/>
    <command detect_errors="exit_code"><![CDATA[
      ## Without pipefail the optional "| tee" further down would report tee's exit
      ## status instead of Panaroo's, hiding a failed run from detect_errors="exit_code".
      set -o pipefail &&
      mkdir outdir &&
      #import re
      #set input_directory = 'input_directory'
      mkdir $input_directory &&
      ## Link every collection element into one directory under a sanitised name
      ## (anything other than whitespace, word characters, '-', '\' and '.' becomes '_').
      #for $gff in $gff_input_collection:
            #set identifier = re.sub('[^\s\w\-\\.]','_',str($gff.element_identifier))
            ## Panaroo is fed through the *.gff glob below, so an element identifier
            ## without that suffix would otherwise be skipped silently.
            #if not $identifier.endswith('.gff')
                  #set identifier = $identifier + '.gff'
            #end if
            ln -fs '$gff' '$input_directory/$identifier' &&
      #end for

      panaroo
      --clean-mode '$mode'
      #if str($gen_code) != 'None':
            --codon-table '$gen_code'
      #end if
      #if str($advanced.adv_options_selector) == "set":
            ## Boolean flags expand to their truevalue or to an empty string.
            $advanced.remove_invalid_genes

            ## Matching
            --threshold '$advanced.matching_option.threshold'
            --family_threshold '$advanced.matching_option.family_threshold'
            --len_dif_percent '$advanced.matching_option.len_dif_percent'
            $advanced.matching_option.merge_paralogs

            ## Refind
            --search_radius '$advanced.refind_option.search_radius'
            --refind_prop_match '$advanced.refind_option.refind_prop_match'
            --refind-mode '$advanced.refind_option.refind_mode'

            ## Graph correction
            --min_trailing_support '$advanced.graph_correction_option.min_trailing_support'
            --trailing_recursive '$advanced.graph_correction_option.trailing_recursive'
            --edge_support_threshold '$advanced.graph_correction_option.edge_support_threshold'
            ## Previously declared in the form but never forwarded to Panaroo.
            --length_outlier_support_proportion '$advanced.graph_correction_option.len_outlier_proportion'
            --remove_by_consensus '$advanced.graph_correction_option.remove_by_consensus'
            --high_var_flag '$advanced.graph_correction_option.high_var_flag'
            --min_edge_support_sv '$advanced.graph_correction_option.min_edge_support_sv'
            $advanced.graph_correction_option.all_seq_in_graph
            $advanced.graph_correction_option.no_clean_edges

            ## Gene alignment: the select value 'None' means "no alignment requested".
            #if str($advanced.gene_alignment_option.alignment) != 'None'
                  --alignment '$advanced.gene_alignment_option.alignment'
                  --aligner '$advanced.gene_alignment_option.aligner'
                  ## Previously declared in the form but never forwarded to Panaroo.
                  --core_threshold '$advanced.gene_alignment_option.core_threshold'
                  $advanced.gene_alignment_option.codons
            #end if
            #if $advanced.gene_alignment_option.core_subset
                  --core_subset '$advanced.gene_alignment_option.core_subset'
            #end if
            ## Compare as a string so that an explicit 0 is still passed on.
            #if str($advanced.gene_alignment_option.core_entropy_filter) != ''
                  --core_entropy_filter '$advanced.gene_alignment_option.core_entropy_filter'
            #end if
      #end if
      -i $input_directory/*.gff
      -o outdir
      -t \${GALAXY_SLOTS:-8}
      #if $log_out
            2>&1 | tee '$log'
      #end if
      ## Rename two results so that their discovered element names neither collide
      ## (gene_presence_absence.csv vs .Rtab) nor get typed as plain text (.txt FASTA).
      && mv outdir/gene_presence_absence.Rtab outdir/gene_presence_absence_rtab.Rtab &&
      mv outdir/combined_protein_cdhit_out.txt outdir/combined_protein_cdhit_out.fa
    ]]></command>
    <inputs>
        <!-- One collection element per input GFF annotation -->
        <param name="gff_input_collection" type="data_collection" format="gff" collection_type="list" label="GFF Input Collection" help="A collection of input GFF files"/>
        <param name="mode" type="select" label="The stringency mode for Panaroo to run" help="Each of these modes can be fine tuned using the additional parameters in the 'Graph correction' section.">
            <option value="strict">Strict</option>
            <option value="moderate">Moderate</option>
            <option value="sensitive">Sensitive</option>
        </param>
        <param name="gen_code" type="select" label="The Codon table used for translation" help="Default: 11.Bacteria and Archaea">
            <expand macro="genetic_code"/>
        </param>
        <param name="log_out" type="boolean" label="Output log file?" truevalue="yes" falsevalue="no"/>

        <!-- Everything below is only sent to Panaroo when the selector is "set" -->
        <conditional name="advanced">
            <param name="adv_options_selector" type="select" label="Set Advanced Options?" help="Fine Tuning of Panaroo algorithmic parameters">
                <option value="set">Set</option>
                <option value="do_not_set" selected="True">Do not set</option>
            </param>
            <when value="set">
                <param argument="--remove-invalid-genes" type="boolean" truevalue="--remove-invalid-genes" falsevalue="" label="Remove Invalid Genes" help="Removes annotations that do not conform to the expected Prokka format."/>

                <!-- Options for Matching -->
                <section name="matching_option" title="Matching" expanded="false">
                    <param argument="--threshold" type="float" value="0.98" min="0" max="1" label="Sequence identity threshold" help="default: 0.98"/>
                    <param argument="--family_threshold" type="float" value="0.7" min="0" max="1" label="Protein family sequence identity threshold" help="default: 0.7"/>
                    <param argument="--len_dif_percent" type="float" value="0.98" min="0" max="1" label="Length difference cutoff" help="default: 0.98"/>
                    <!-- argument spelled like the flag in truevalue; the derived name stays merge_paralogs -->
                    <param argument="--merge_paralogs" type="boolean" truevalue="--merge_paralogs" falsevalue="" label="Merge Paralogs"/>
                </section>

                <!-- Options for Refind -->
                <section name="refind_option" title="Refind" expanded="false">
                    <param argument="--search_radius" type="integer" value="5000" label="Refinding Search radius" help="The distance in nucleotides surrounding the neighbour of an accessory gene in which to search for it"/>
                    <param argument="--refind_prop_match" type="float" value="0.2" min="0" max="1" label="Refinding Proportion Match" help="The proportion of an accessory gene that must be found in order to consider it a match"/>
                    <!-- argument spelled like the flag used on the command line; the derived name stays refind_mode -->
                    <param argument="--refind-mode" type="select" label="Refind Mode" help="Set the stringency mode at which to re-find genes">
                        <option value="default" selected="True">Default</option>
                        <option value="strict">Strict</option>
                        <option value="off">Off</option>
                    </param>
                </section>

                <!-- Graph Correction -->
                <section name="graph_correction_option" title="Graph Correction" expanded="false">
                    <param argument="--min_trailing_support" type="integer" value="2" label="Minimum trailing support" help="Minimum cluster size to keep a gene called at the end of a contig"/>
                    <param argument="--trailing_recursive" type="integer" value="1" label="Trailing Recursive" help="Number of times to perform recursive trimming of low support nodes near the end of contigs"/>
                    <param argument="--edge_support_threshold" type="float" value="1" label="Edge support threshold" help="Minimum support required to keep an edge that has been flagged as a possible mis-assembly."/>
                    <!-- name is pinned explicitly so the command template reference stays valid while the label shows the real flag -->
                    <param name="len_outlier_proportion" argument="--length_outlier_support_proportion" type="float" value="0.01" min="0" max="1" label="Length outlier support proportion" help="--length_outlier_support_proportion"/>
                    <param argument="--remove_by_consensus" type="boolean" truevalue="True" falsevalue="False" label="Remove consensus" help="If a gene is called in the same region with similar sequence a minority of the time, remove it."/>
                    <param argument="--high_var_flag" type="integer" value="5" label="Highly variable gene region" help="Minimum number of nested cycles to call a highly variable gene region."/>
                    <param argument="--min_edge_support_sv" type="integer" value="2" label="Minimum edge support structural variants" help="Minimum edge support required to call structural variants in the presence/absence sv file"/>
                    <param argument="--all_seq_in_graph" type="boolean" truevalue="--all_seq_in_graph" falsevalue="" label="Retains all DNA sequence" help="Retains all DNA sequence for each gene cluster in the graph output."/>
                    <param argument="--no_clean_edges" type="boolean" truevalue="--no_clean_edges" falsevalue="" label="No Clean Edges" help="Turn off edge filtering in the final output graph."/>
                </section>

                <!-- Gene Alignment: the value "None" switches alignment off and selects the default output collection -->
                <section name="gene_alignment_option" title="Gene Alignment" expanded="false">
                    <param argument="--alignment" type="select" label="Output alignments of core genes or all genes.">
                        <option value="None" selected="True">None</option>
                        <option value="core">Core genome alignment</option>
                        <option value="pan">Pan-genome alignment</option>
                    </param>
                    <param argument="--aligner" type="select" label="Specify an aligner" help="--aligner [mafft|prank|clustal][default: mafft]">
                        <option value="mafft" selected="True">MAFFT</option>
                        <option value="prank">PRANK</option>
                        <option value="clustal">Clustal</option>
                    </param>
                    <param argument="--codons" type="boolean" label="Generate codon alignments by aligning sequences at the protein level" truevalue="--codons" falsevalue="" help="Generate codon alignments by aligning sequences at the protein level"/>
                    <param argument="--core_threshold" type="float" value="0.95" min="0" max="1" label="Core Threshold" help="Core-genome sample threshold"/>
                    <param argument="--core_subset" type="integer" min="1" optional="true" label="Subset of the core genome to these many genes" help="Randomly subset the core genome to these many genes. Default is all genes."/>
                    <!-- The command template passes this value whenever advanced options are set, so the form default (0.1) is what Panaroo receives -->
                    <param argument="--core_entropy_filter" type="float" value="0.1" label="Core Entropy Filter" help="Manually set the Block Mapping and Gathering with Entropy (BMGE) filter. Panaroo itself would choose this with the Tukey outlier method, but this tool always passes the value entered here (0.1 unless changed)."/>
                </section>
            </when>
            <when value="do_not_set"/>
        </conditional>
    </inputs>
    <outputs>
        <!-- Default result collection: produced whenever no gene alignment was requested,
             i.e. advanced options are off, or they are on with alignment left at "None".
             Elements are discovered in outdir by file extension. -->
        <collection name="output" type="list" label="${tool.name} on ${on_string}: Pangenome default output">
            <filter>advanced['adv_options_selector'] != 'set' or advanced['gene_alignment_option']['alignment'] == 'None'</filter>
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.fasta$" directory="outdir" format="fasta" visible="false"/>
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.fa$" directory="outdir" format="fasta" visible="false"/>
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.txt$" directory="outdir" format="txt" visible="false"/>
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.clstr$" directory="outdir" format="txt" visible="false"/>
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.gml$" directory="outdir" format="txt" visible="false"/>
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.csv$" directory="outdir" format="csv" visible="false"/>
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.Rtab$" directory="outdir" format="tabular" visible="false"/>
        </collection>

        <!-- Result collection used instead of the default one when an alignment was requested;
             it additionally collects the .aln and .embl files from outdir. -->
        <collection name="output_pangenome" type="list" label="${tool.name} on ${on_string}: Pangenome alignment output">
            <filter>advanced['adv_options_selector'] == 'set' and advanced['gene_alignment_option']['alignment'] != 'None'</filter>
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.clstr$" directory="outdir" format="txt" visible="false"/>
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.txt$" directory="outdir" format="txt" visible="false"/>
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.gml$" directory="outdir" format="txt" visible="false"/>
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.Rtab$" directory="outdir" format="tabular" visible="false"/>
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.csv$" directory="outdir" format="csv" visible="false"/>
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.fasta$" directory="outdir" format="fasta" visible="false"/>
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.fa$" directory="outdir" format="fasta" visible="false"/>
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.aln$" directory="outdir" format="fasta" visible="false"/>
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.embl$" directory="outdir" format="embl" visible="false"/>
        </collection>

        <!-- Per-gene alignments (.fas) from outdir/aligned_gene_sequences; same condition as the alignment collection -->
        <collection name="output_pangenome_fasta" type="list" label="${tool.name} on ${on_string}: Pangenome Alignment Gene Sequences">
            <filter>advanced['adv_options_selector'] == 'set' and advanced['gene_alignment_option']['alignment'] != 'None'</filter>
            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.fas$" directory="outdir/aligned_gene_sequences" format="fasta" visible="false"/>
        </collection>

        <!-- Captured stdout/stderr of the Panaroo run; only created when the log option is switched on -->
        <data name="log" format="txt" label="${tool.name} on ${on_string}: Panaroo Log">
            <filter>log_out</filter>
        </data>
    </outputs>
    <tests>
      <!-- Test 1 : Testing Panaroo with default parameters -->
      <test expect_num_outputs="2">
          <!-- Two outputs are expected: the default collection plus the log dataset (log_out is on) -->
          <param name="gff_input_collection">
              <collection type="list">
                  <element name="10_small.gff" value="10_small.gff"/>
                  <element name="11_small.gff" value="11_small.gff"/>
              </collection>
          </param>
          <param name="mode" value="strict"/>
          <param name="gen_code" value="11"/>
          <param name="log_out" value="yes"/>
          <conditional name="advanced">
              <param name="adv_options_selector" value="do_not_set"/>
          </conditional>
          <output_collection name="output" count="13" type="list">
              <!-- FASTA results are pinned by line count -->
              <element name="combined_DNA_CDS" ftype="fasta">
                  <assert_contents>
                      <has_n_lines n="18206"/>
                  </assert_contents>
              </element>
              <element name="combined_protein_CDS" ftype="fasta">
                  <assert_contents>
                      <has_n_lines n="7048"/>
                  </assert_contents>
              </element>
              <element name="combined_protein_cdhit_out" ftype="fasta">
                  <assert_contents>
                      <has_n_lines n="5119"/>
                  </assert_contents>
              </element>
              <element name="pan_genome_reference" ftype="fasta">
                  <assert_contents>
                      <has_n_lines n="5055"/>
                  </assert_contents>
              </element>
              <!-- &#009; is a tab and &#60; is a less-than sign -->
              <element name="summary_statistics" ftype="txt">
                  <assert_contents>
                      <has_line line="Core genes&#009;(99% &#60;= strains &#60;= 100%)&#009;251"/>
                      <has_line line="Total genes&#009;(0% &#60;= strains &#60;= 100%)&#009;251"/>
                  </assert_contents>
              </element>
              <!-- Tables are spot-checked for known identifiers and gene names -->
              <element name="gene_data" ftype="csv">
                  <assert_contents>
                      <has_text text="KPLBOJCC_00001"/>
                      <has_text text="NCFNLLIC_00549"/>
                  </assert_contents>
              </element>
              <element name="gene_presence_absence" ftype="csv">
                  <assert_contents>
                      <has_text text="dcd"/>
                      <has_text text="trmB"/>
                      <has_text text="betI_1"/>
                  </assert_contents>
              </element>
              <element name="gene_presence_absence_roary" ftype="csv">
                  <assert_contents>
                      <has_text text="kstR2_1"/>
                      <has_text text="ybgJ"/>
                  </assert_contents>
              </element>
              <element name="struct_presence_absence" ftype="tabular">
                  <assert_contents>
                      <has_line_matching expression="Gene\s+10_small\s+11_small"/>
                  </assert_contents>
              </element>
          </output_collection>
          <output name="log">
              <assert_contents>
                  <has_text text="total seq: 979"/>
              </assert_contents>
          </output>
      </test>

         <!-- Test 2 : Testing Panaroo with Advanced filtering option along with Alignment turned off -->
         <test expect_num_outputs="2">
             <!-- Advanced options on, alignment left at "None": still the default collection plus the log -->
             <param name="gff_input_collection">
                 <collection type="list">
                     <element name="10_small.gff" value="10_small.gff"/>
                     <element name="11_small.gff" value="11_small.gff"/>
                 </collection>
             </param>
             <param name="mode" value="strict"/>
             <param name="gen_code" value="11"/>
             <param name="log_out" value="yes"/>
             <conditional name="advanced">
                 <param name="adv_options_selector" value="set"/>
                 <section name="gene_alignment_option">
                     <param name="alignment" value="None"/>
                 </section>
             </conditional>
             <output_collection name="output" count="13" type="list">
                 <!-- FASTA results are pinned by line count -->
                 <element name="combined_DNA_CDS" ftype="fasta">
                     <assert_contents>
                         <has_n_lines n="18206"/>
                     </assert_contents>
                 </element>
                 <element name="combined_protein_CDS" ftype="fasta">
                     <assert_contents>
                         <has_n_lines n="7048"/>
                     </assert_contents>
                 </element>
                 <element name="combined_protein_cdhit_out" ftype="fasta">
                     <assert_contents>
                         <has_n_lines n="5119"/>
                     </assert_contents>
                 </element>
                 <element name="pan_genome_reference" ftype="fasta">
                     <assert_contents>
                         <has_n_lines n="13120"/>
                     </assert_contents>
                 </element>
                 <!-- &#009; is a tab and &#60; is a less-than sign -->
                 <element name="summary_statistics" ftype="txt">
                     <assert_contents>
                         <has_line line="Core genes&#009;(99% &#60;= strains &#60;= 100%)&#009;251"/>
                         <has_line line="Shell genes&#009;(15% &#60;= strains &#60; 95%)&#009;475"/>
                         <has_line line="Total genes&#009;(0% &#60;= strains &#60;= 100%)&#009;726"/>
                     </assert_contents>
                 </element>
                 <!-- Tables are checked for shape and spot-checked for known identifiers and gene names -->
                 <element name="gene_data" ftype="csv">
                     <assert_contents>
                         <has_n_lines n="980"/>
                         <has_n_columns sep="," n="8"/>
                         <has_text text="KPLBOJCC_00003"/>
                         <has_text text="NCFNLLIC_00003"/>
                     </assert_contents>
                 </element>
                 <element name="gene_presence_absence" ftype="csv">
                     <assert_contents>
                         <has_text text="recB"/>
                         <has_text text="recC"/>
                         <has_text text="rpoB"/>
                     </assert_contents>
                 </element>
                 <element name="gene_presence_absence_roary" ftype="csv">
                     <assert_contents>
                         <has_text text="ctpI_2"/>
                         <has_text text="amiD_1"/>
                     </assert_contents>
                 </element>
                 <element name="struct_presence_absence" ftype="tabular">
                     <assert_contents>
                         <has_line_matching expression="Gene\s+10_small\s+11_small"/>
                     </assert_contents>
                 </element>
             </output_collection>
             <output name="log">
                 <assert_contents>
                     <has_text text="total seq: 979"/>
                 </assert_contents>
             </output>
         </test>
      <!-- Test 3 : Testing Panaroo with Advanced Filtering options along with MAFFT core alignment -->
      <test expect_num_outputs="3">
          <!-- Core alignment requested: alignment collection, per-gene FASTA collection and the log -->
          <param name="gff_input_collection">
              <collection type="list">
                  <element name="10_small.gff" value="10_small.gff"/>
                  <element name="11_small.gff" value="11_small.gff"/>
              </collection>
          </param>
          <param name="mode" value="strict"/>
          <param name="gen_code" value="11"/>
          <param name="log_out" value="yes"/>
          <conditional name="advanced">
              <param name="adv_options_selector" value="set"/>
              <section name="gene_alignment_option">
                  <param name="alignment" value="core"/>
                  <param name="aligner" value="mafft"/>
              </section>
          </conditional>
          <output_collection name="output_pangenome" count="18" type="list">
              <!-- &#009; is a tab and &#60; is a less-than sign -->
              <element name="summary_statistics" ftype="txt">
                  <assert_contents>
                      <has_line line="Core genes&#009;(99% &#60;= strains &#60;= 100%)&#009;251"/>
                      <has_line line="Shell genes&#009;(15% &#60;= strains &#60; 95%)&#009;475"/>
                      <has_line line="Total genes&#009;(0% &#60;= strains &#60;= 100%)&#009;726"/>
                  </assert_contents>
              </element>
              <element name="alignment_entropy" ftype="csv">
                  <assert_contents>
                      <has_text text="stf0.aln,0.0"/>
                      <has_text text="bglB.aln,0.0"/>
                  </assert_contents>
              </element>
              <!-- FASTA results are pinned by line count -->
              <element name="combined_DNA_CDS" ftype="fasta">
                  <assert_contents>
                      <has_n_lines n="18206"/>
                  </assert_contents>
              </element>
              <element name="combined_protein_cdhit_out" ftype="fasta">
                  <assert_contents>
                      <has_n_lines n="5119"/>
                  </assert_contents>
              </element>
              <element name="combined_protein_CDS" ftype="fasta">
                  <assert_contents>
                      <has_n_lines n="7048"/>
                  </assert_contents>
              </element>
              <element name="struct_presence_absence" ftype="tabular">
                  <assert_contents>
                      <has_line_matching expression="Gene\s+10_small\s+11_small"/>
                  </assert_contents>
              </element>
              <!-- Tables are checked for shape and spot-checked for known identifiers and gene names -->
              <element name="gene_data" ftype="csv">
                  <assert_contents>
                      <has_n_lines n="980"/>
                      <has_n_columns sep="," n="8"/>
                      <has_text text="KPLBOJCC_00003"/>
                      <has_text text="NCFNLLIC_00003"/>
                  </assert_contents>
              </element>
              <element name="pan_genome_reference" ftype="fasta">
                  <assert_contents>
                      <has_n_lines n="13120"/>
                  </assert_contents>
              </element>
              <element name="gene_presence_absence" ftype="csv">
                  <assert_contents>
                      <has_text text="recB"/>
                      <has_text text="recC"/>
                      <has_text text="rpoB"/>
                  </assert_contents>
              </element>
              <element name="gene_presence_absence_roary" ftype="csv">
                  <assert_contents>
                      <has_text text="ctpI_2"/>
                      <has_text text="amiD_1"/>
                  </assert_contents>
              </element>
              <!-- Alignment files are compared by size with a tolerance rather than by exact content -->
              <element name="core_gene_alignment_filtered" ftype="fasta">
                  <assert_contents>
                      <has_size value="569962" delta="1000"/>
                  </assert_contents>
              </element>
              <element name="core_gene_alignment" ftype="fasta">
                  <assert_contents>
                      <has_size value="569962" delta="1000"/>
                  </assert_contents>
              </element>
              <element name="core_alignment_header" ftype="embl">
                  <assert_contents>
                      <has_text text="ID   Genome standard; DNA; PRO; 1234 BP."/>
                      <has_text text="hisB_1.aln"/>
                  </assert_contents>
              </element>
              <element name="core_alignment_filtered_header" ftype="embl">
                  <assert_contents>
                      <has_n_lines n="760" delta="10"/>
                  </assert_contents>
              </element>
          </output_collection>
          <!-- Only the number of per-gene alignments is checked -->
          <output_collection name="output_pangenome_fasta" count="251"/>
          <output name="log">
              <assert_contents>
                  <has_text text="total seq: 979"/>
              </assert_contents>
          </output>
      </test>
    </tests>
    <help><![CDATA[
Panaroo_ is a pangenome analysis tool specifically designed to analyze bacterial genomes. Panaroo builds a full graphical representation of the pangenome, where nodes are clusters of orthologous genes (COGs) and two nodes are connected by an edge if they are adjacent on a contig in any sample from the population. Using this graphical representation, Panaroo corrects for errors introduced during annotation by collapsing diverse gene families, filtering contamination, merging fragmented gene segments and refinding missing genes.

**INPUTS**

  - A list of gff3 files (from Prokka) in a collection.

**OUTPUTS**

  - combined_protein_cdhit_out.txt
  - combined_protein_cdhit_out.txt.clstr
  - pre_filt_graph.gml
  - gene_data.csv
  - combined_protein_CDS.fasta
  - combined_DNA_CDS.fasta
  - gene_presence_absence_rtab.Rtab
  - gene_presence_absence_roary.csv
  - gene_presence_absence.csv
  - summary_statistics.txt
  - pan_genome_reference.fa
  - struct_presence_absence.Rtab
  - final_graph.gml

**OUTPUTS with Advanced parameters**

  - combined_protein_cdhit_out.txt
  - combined_protein_cdhit_out.txt.clstr
  - pre_filt_graph.gml
  - gene_data.csv
  - combined_protein_CDS.fasta
  - combined_DNA_CDS.fasta
  - gene_presence_absence_rtab.Rtab
  - gene_presence_absence_roary.csv
  - gene_presence_absence.csv
  - summary_statistics.txt
  - pan_genome_reference.fa
  - struct_presence_absence.Rtab
  - final_graph.gml
  - core_gene_alignment
  - core_gene_alignment_filtered
  - core_alignment_filtered_header
  - core_alignment_header
  - a collection of Pangenome Aligned Fasta files

.. _Panaroo: https://gthlab.au/panaroo/#/gettingstarted/quickstart

    ]]></help>
    <citations>
        <citation type="doi">10.1186/s13059-020-02090-4</citation>
    </citations>
</tool>
author	iuc
date	Fri, 11 Apr 2025 11:24:05 +0000
parents	01864c78c5a5
children