Mercurial > repos > iuc > gtdbtk_classify_wf
changeset 0:c4db8c4de66f draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/gtdbtk commit 8487d2c73793be0afa5b34388b122e686ac8a094
author | iuc |
---|---|
date | Tue, 13 Dec 2022 09:48:28 +0000 |
parents | |
children | dbf1798c0dcc |
files | gtdbtk_classify_wf.xml macros.xml test-data/genome_1.fna.gz test-data/gtdbtk_database.loc tool-data/gtdbtk_database.loc.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test |
diffstat | 7 files changed, 236 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/gtdbtk_classify_wf.xml Tue Dec 13 09:48:28 2022 +0000 @@ -0,0 +1,156 @@ +<tool id="gtdbtk_classify_wf" name="GTDB-Tk Classify genomes" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> + <description>by placement in GTDB reference tree</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements"/> + <command detect_errors="exit_code"><![CDATA[ +#import re + +mkdir input_dir && +mkdir output_dir && +mkdir output_tsv_dir && +mkdir output_newick_dir && +mkdir output_fasta_dir && +#for $i in $input: + ## gtdbtk uses the file extension to determine the input format. + #set ext = "." + $i.ext + #set input_identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier)) + $ext + ln -s '${i}' input_dir/'${input_identifier}' && +#end for +export GTDBTK_DATA_PATH=$gtdbtk_db.fields.path && +gtdbtk classify_wf +--genome_dir input_dir +--extension '$ext' +--out_dir output_dir +--cpus \${GALAXY_SLOTS:-4} +--min_perc_aa $advanced.min_perc_aa +$advanced.force +--min_af $advanced.min_af +#if str($advanced.output_process_log) == 'yes': + && cat output_dir/gtdbtk.warnings.log output_dir/gtdbtk.log > '$process_log' +#end if + ]]></command> + <inputs> + <param name="input" type="data" format="fasta,fasta.gz" multiple="true" label="Fasta (Genome) files"/> + <param name="gtdbtk_db" type="select" label="GTDB-Tk database"> + <options from_data_table="gtdbtk_database"> + <validator type="no_options" message="No locally cached GTDB-Tk database is available"/> + </options> + </param> + <section name="advanced" title="Advanced options"> + <param argument="--min_perc_aa" type="integer" min="0" max="100" value="10" label="Exclude genomes that do not have at least this percentage of AA in the MSA" help="Inclusive bound"/> + <param argument="--force" type="boolean" truevalue="--force" falsevalue="" checked="false" label="Continue processing if an error occurs on a single genome?"/> + <param argument="--min_af" type="float" min="0" max="1" value="0.65" label="Minimum alignment fraction to consider closest genome"/> + <param name="output_process_log" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="Output process log file?"/> + </section> + </inputs> + <outputs> + <data name="process_log" format="txt" label="${tool.name} on ${on_string} (process log)"> + <filter>advanced['output_process_log'] == 'yes'</filter> + </data> + <collection name="output_tsv" type="list" format="tsv" label="${tool.name} on ${on_string} (tsv)"> + <discover_datasets pattern="(?P<designation>.+)\.tsv" ext="tsv" directory="output_dir"/> + </collection> + <collection name="output_newick" type="list" format="newick" label="${tool.name} on ${on_string} (newick)"> + <discover_datasets pattern="(?P<designation>.+)\.tree" ext="newick" directory="output_dir"/> + </collection> + <collection name="output_fasta" type="list" format="fasta" label="${tool.name} on ${on_string} (fasta)"> + <discover_datasets pattern="(?P<designation>.+)\.fasta" ext="fasta" directory="output_dir"/> + </collection> + </outputs> + <tests> + <!-- The commented test here is valid if we could store the GTDB-Tk database --> + <!-- + <test expect_num_outputs="3"> + <param name="input" value="genome_1.fna.gz" ftype="fasta.gz"/> + <param name="gtdbtk_db" value="gtdbtk202"/> + <output_collection name="output_tsv" type="list" count="6"> + <element name="gtdbtk.ar122.filtered" ftype="tsv"> + <assert_contents> + <has_size value="0"/> + </assert_contents> + </element> + <element name="gtdbtk.ar122.markers_summary" ftype="tsv"> + <assert_contents> + <has_text text="number_unique_genes"/> + </assert_contents> + </element> + <element name="gtdbtk.ar122.summary" ftype="tsv"> + <assert_contents> + <has_text text="genome_1_fna_gz"/> + </assert_contents> + </element> + <element name="gtdbtk.bac120.markers_summary" ftype="tsv"> + <assert_contents> + <has_text text="genome_1_fna_gz"/> + </assert_contents> + </element> + <element name="gtdbtk.failed_genomes" ftype="tsv"> + <assert_contents> + <has_size value="0"/> + </assert_contents> + </element> + <element name="gtdbtk.translation_table_summary" ftype="tsv"> + <assert_contents> + <has_text text="genome_1_fna_gz"/> + </assert_contents> + </element> + </output_collection> + <output_collection name="output_newick" type="list" count="1"> + <element name="gtdbtk.ar122.classify" ftype="newick"> + <assert_contents> + <has_text text="GB_GCA_"/> + </assert_contents> + </element> + </output_collection> + <output_collection name="output_fasta" type="list" count="2"> + <element name="gtdbtk.ar122.msa" ftype="fasta"> + <assert_contents> + <has_text text="GB_GCA_000008085"/> + </assert_contents> + </element> + <element name="gtdbtk.ar122.user_msa" ftype="fasta"> + <assert_contents> + <has_text text="genome_1_fna_gz"/> + </assert_contents> + </element> + </output_collection> + </test> + --> + <!-- GTDB-Tk databases are far too large to test currently --> + <test expect_failure="true"> + <param name="input" value="genome_1.fna.gz" ftype="fasta.gz"/> + <param name="gtdbtk_db" value="gtdbtk202"/> + <assert_stderr> + <has_text text="Fatal error: Exit code 1"/> + </assert_stderr> + </test> + </tests> + <help><![CDATA[ +**What it does** + +GTDB-Tk is a software toolkit for assigning objective taxonomic classifications to bacterial and archaeal genomes +based on the Genome Database Taxonomy GTDB. It is designed to work with recent advances that allow hundreds or +thousands of metagenome-assembled genomes (MAGs) to be obtained directly from environmental samples. It can also +be applied to isolate and single-cell genomes. + +This tool accepts one or more fasta (genome) files and determines taxonomic classification of genomes by +maximum-likelihood (ML) placement. The classification workflow consists of three steps: identify, align, and +classify. + +The identify step calls genes using Prodigal, and uses HMM models and the HMMER package to identify the 120 bacterial +and 122 archaeal marker genes used for phylogenetic inference. Multiple sequence alignments (MSA) are obtained by +aligning marker genes to their respective HMM model. + +The align step concatenates the aligned marker genes and filters the concatenated MSA to approximately 5,000 +amino acids. + +Finally, the classify step uses pplacer to find the maximum-likelihood placement of each genome in the GTDB-Tk +reference tree. GTDB-Tk classifies each genome based on its placement in the reference tree, its relative evolutionary +divergence, and/or average nucleotide identity (ANI) to reference genomes. + +Results can be impacted by a lack of marker genes or contamination. + ]]></help> + <expand macro="citations"/> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Tue Dec 13 09:48:28 2022 +0000 @@ -0,0 +1,15 @@ +<macros> + <token name="@TOOL_VERSION@">1.7.0</token> + <token name="@VERSION_SUFFIX@">0</token> + <token name="@PROFILE@">20.09</token> + <xml name="requirements"> + <requirements> + <requirement type="package" version="@TOOL_VERSION@">gtdbtk</requirement> + </requirements> + </xml> + <xml name="citations"> + <citations> + <citation type="doi">10.1093/bioinformatics/btz848</citation> + </citations> + </xml> +</macros>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/gtdbtk_database.loc Tue Dec 13 09:48:28 2022 +0000 @@ -0,0 +1,26 @@ +# This is a sample file distributed with Galaxy that enables tools +# to use a directory of GTDB-Tk databases. The gtdbtk_databases.loc +# file has this format (longer white space characters are TAB characters): +# +# <unique_build_id> <display_name> <directory_path> +# +# So, for example, if you have the gtdbtk 202 stored in +# /depot/data2/galaxy/gtdbtk/202/, +# then the gtdbtk_databases.loc entry would look like this: +# +# release202 gtdbtk database release 202 /depot/data2/galaxy/gtdbtk/release202 +# +# and your /depot/data2/galaxy/gtdbtk/release202 directory +# would contain GTDB-Tk database files for release 202, sommething like this: +# +#drwxr-sr-x 3 gvk G-824019 4096 Apr 20 2021 fastani/ +#-rw-r--r-- 1 gvk G-824019 4810764 Apr 22 2021 manifest.tsv +#drwxr-sr-x 4 gvk G-824019 4096 Apr 21 2021 markers/ +#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 masks/ +#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 metadata/ +#drwxr-sr-x 2 gvk G-824019 4096 Apr 21 2021 mrca_red/ +#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 msa/ +#drwxr-sr-x 4 gvk G-824019 4096 Apr 21 2021 pplacer/ +#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 radii/ +#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 taxonomy/ +gtdbtk202 GTDB-Tk database v202 ${__HERE__}/gtdbtk202
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/gtdbtk_database.loc.sample Tue Dec 13 09:48:28 2022 +0000 @@ -0,0 +1,25 @@ +# This is a sample file distributed with Galaxy that enables tools +# to use a directory of GTDB-Tk databases. The gtdbtk_databases.loc +# file has this format (longer white space characters are TAB characters): +# +# <unique_build_id> <display_name> <directory_path> +# +# So, for example, if you have the gtdbtk 202 stored in +# /depot/data2/galaxy/gtdbtk/202/, +# then the gtdbtk_databases.loc entry would look like this: +# +# release202 gtdbtk database release 202 /depot/data2/galaxy/gtdbtk/release202 +# +# and your /depot/data2/galaxy/gtdbtk/release202 directory +# would contain GTDB-Tk database files for release 202, sommething like this: +# +#drwxr-sr-x 3 gvk G-824019 4096 Apr 20 2021 fastani/ +#-rw-r--r-- 1 gvk G-824019 4810764 Apr 22 2021 manifest.tsv +#drwxr-sr-x 4 gvk G-824019 4096 Apr 21 2021 markers/ +#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 masks/ +#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 metadata/ +#drwxr-sr-x 2 gvk G-824019 4096 Apr 21 2021 mrca_red/ +#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 msa/ +#drwxr-sr-x 4 gvk G-824019 4096 Apr 21 2021 pplacer/ +#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 radii/ +#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 taxonomy/
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Tue Dec 13 09:48:28 2022 +0000 @@ -0,0 +1,7 @@ +<tables> + <!-- Locations of GTDB-Tk database versions 202 and higher --> + <table name="gtdbtk_database" comment_char="#"> + <columns>value, name, path</columns> + <file path="tool-data/gtdbtk_database.loc" /> + </table> +</tables>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.test Tue Dec 13 09:48:28 2022 +0000 @@ -0,0 +1,7 @@ +<tables> + <!-- Location of databases for gtdbtk version 202 and higher --> + <table name="gtdbtk_database" comment_char="#"> + <columns>value, name, path</columns> + <file path="${__HERE__}/test-data/gtdbtk_database.loc" /> + </table> +</tables>