Mercurial > repos > iuc > gtdbtk_classify_wf
diff gtdbtk_classify_wf.xml @ 0:c4db8c4de66f draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gtdbtk commit 8487d2c73793be0afa5b34388b122e686ac8a094
author | iuc |
---|---|
date | Tue, 13 Dec 2022 09:48:28 +0000 |
parents | |
children | dbf1798c0dcc |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gtdbtk_classify_wf.xml Tue Dec 13 09:48:28 2022 +0000 @@ -0,0 +1,156 @@ +<tool id="gtdbtk_classify_wf" name="GTDB-Tk Classify genomes" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> + <description>by placement in GTDB reference tree</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements"/> + <command detect_errors="exit_code"><![CDATA[ +#import re + +mkdir input_dir && +mkdir output_dir && +mkdir output_tsv_dir && +mkdir output_newick_dir && +mkdir output_fasta_dir && +#for $i in $input: + ## gtdbtk uses the file extension to determine the input format. + #set ext = "." + $i.ext + #set input_identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier)) + $ext + ln -s '${i}' input_dir/'${input_identifier}' && +#end for +export GTDBTK_DATA_PATH=$gtdbtk_db.fields.path && +gtdbtk classify_wf +--genome_dir input_dir +--extension '$ext' +--out_dir output_dir +--cpus \${GALAXY_SLOTS:-4} +--min_perc_aa $advanced.min_perc_aa +$advanced.force +--min_af $advanced.min_af +#if str($advanced.output_process_log) == 'yes': + && cat output_dir/gtdbtk.warnings.log output_dir/gtdbtk.log > '$process_log' +#end if + ]]></command> + <inputs> + <param name="input" type="data" format="fasta,fasta.gz" multiple="true" label="Fasta (Genome) files"/> + <param name="gtdbtk_db" type="select" label="GTDB-Tk database"> + <options from_data_table="gtdbtk_database"> + <validator type="no_options" message="No locally cached GTDB-Tk database is available"/> + </options> + </param> + <section name="advanced" title="Advanced options"> + <param argument="--min_perc_aa" type="integer" min="0" max="100" value="10" label="Exclude genomes that do not have at least this percentage of AA in the MSA" help="Inclusive bound"/> + <param argument="--force" type="boolean" truevalue="--force" falsevalue="" checked="false" label="Continue processing if an error occurs on a single genome?"/> + <param argument="--min_af" type="float" min="0" max="1" value="0.65" label="Minimum alignment fraction to consider closest genome"/> + <param name="output_process_log" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="Output process log file?"/> + </section> + </inputs> + <outputs> + <data name="process_log" format="txt" label="${tool.name} on ${on_string} (process log)"> + <filter>advanced['output_process_log'] == 'yes'</filter> + </data> + <collection name="output_tsv" type="list" format="tsv" label="${tool.name} on ${on_string} (tsv)"> + <discover_datasets pattern="(?P<designation>.+)\.tsv" ext="tsv" directory="output_dir"/> + </collection> + <collection name="output_newick" type="list" format="newick" label="${tool.name} on ${on_string} (newick)"> + <discover_datasets pattern="(?P<designation>.+)\.tree" ext="newick" directory="output_dir"/> + </collection> + <collection name="output_fasta" type="list" format="fasta" label="${tool.name} on ${on_string} (fasta)"> + <discover_datasets pattern="(?P<designation>.+)\.fasta" ext="fasta" directory="output_dir"/> + </collection> + </outputs> + <tests> + <!-- The commented test here is valid if we could store the GTDB-Tk database --> + <!-- + <test expect_num_outputs="3"> + <param name="input" value="genome_1.fna.gz" ftype="fasta.gz"/> + <param name="gtdbtk_db" value="gtdbtk202"/> + <output_collection name="output_tsv" type="list" count="6"> + <element name="gtdbtk.ar122.filtered" ftype="tsv"> + <assert_contents> + <has_size value="0"/> + </assert_contents> + </element> + <element name="gtdbtk.ar122.markers_summary" ftype="tsv"> + <assert_contents> + <has_text text="number_unique_genes"/> + </assert_contents> + </element> + <element name="gtdbtk.ar122.summary" ftype="tsv"> + <assert_contents> + <has_text text="genome_1_fna_gz"/> + </assert_contents> + </element> + <element name="gtdbtk.bac120.markers_summary" ftype="tsv"> + <assert_contents> + <has_text text="genome_1_fna_gz"/> + </assert_contents> + </element> + <element name="gtdbtk.failed_genomes" ftype="tsv"> + <assert_contents> + <has_size value="0"/> + </assert_contents> + </element> + <element name="gtdbtk.translation_table_summary" ftype="tsv"> + <assert_contents> + <has_text text="genome_1_fna_gz"/> + </assert_contents> + </element> + </output_collection> + <output_collection name="output_newick" type="list" count="1"> + <element name="gtdbtk.ar122.classify" ftype="newick"> + <assert_contents> + <has_text text="GB_GCA_"/> + </assert_contents> + </element> + </output_collection> + <output_collection name="output_fasta" type="list" count="2"> + <element name="gtdbtk.ar122.msa" ftype="fasta"> + <assert_contents> + <has_text text="GB_GCA_000008085"/> + </assert_contents> + </element> + <element name="gtdbtk.ar122.user_msa" ftype="fasta"> + <assert_contents> + <has_text text="genome_1_fna_gz"/> + </assert_contents> + </element> + </output_collection> + </test> + --> + <!-- GTDB-Tk databases are far too large to test currently --> + <test expect_failure="true"> + <param name="input" value="genome_1.fna.gz" ftype="fasta.gz"/> + <param name="gtdbtk_db" value="gtdbtk202"/> + <assert_stderr> + <has_text text="Fatal error: Exit code 1"/> + </assert_stderr> + </test> + </tests> + <help><![CDATA[ +**What it does** + +GTDB-Tk is a software toolkit for assigning objective taxonomic classifications to bacterial and archaeal genomes +based on the Genome Database Taxonomy GTDB. It is designed to work with recent advances that allow hundreds or +thousands of metagenome-assembled genomes (MAGs) to be obtained directly from environmental samples. It can also +be applied to isolate and single-cell genomes. + +This tool accepts one or more fasta (genome) files and determines taxonomic classification of genomes by +maximum-likelihood (ML) placement. The classification workflow consists of three steps: identify, align, and +classify. + +The identify step calls genes using Prodigal, and uses HMM models and the HMMER package to identify the 120 bacterial +and 122 archaeal marker genes used for phylogenetic inference. Multiple sequence alignments (MSA) are obtained by +aligning marker genes to their respective HMM model. + +The align step concatenates the aligned marker genes and filters the concatenated MSA to approximately 5,000 +amino acids. + +Finally, the classify step uses pplacer to find the maximum-likelihood placement of each genome in the GTDB-Tk +reference tree. GTDB-Tk classifies each genome based on its placement in the reference tree, its relative evolutionary +divergence, and/or average nucleotide identity (ANI) to reference genomes. + +Results can be impacted by a lack of marker genes or contamination. + ]]></help> + <expand macro="citations"/> +</tool>