Mercurial > repos > iuc > cat_prepare
diff macros.xml @ 0:b6c5e7343617 draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/cat commit 863ad85836c80811d1d6b82eaf3ce903b273368a"
author | iuc |
---|---|
date | Tue, 10 Dec 2019 16:07:39 -0500 |
parents | |
children | 8315b5cebb82 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Tue Dec 10 16:07:39 2019 -0500 @@ -0,0 +1,389 @@ +<macros> + <token name="@VERSION@">5.0.3</token> + <xml name="requirements"> + <requirements> + <requirement type="package" version="@VERSION@">cat</requirement> + <yield/> + </requirements> + </xml> + <xml name="version_command"> + <version_command><![CDATA[CAT --version]]></version_command> + </xml> + <token name="@DATABASE_FOLDER@">CAT_database</token> + <token name="@TAXONOMY_FOLDER@">taxonomy</token> + <xml name="cat_db"> + <conditional name="db"> + <param name="db_src" type="select" label="CAT database (--database_folder,--taxonomy_folder) from"> + <option value="cached">local cached database</option> + <option value="history">history</option> + </param> + <when value="cached"> + <param name="cat_builtin" type="select" label="Use a built-in CAT database" help="If the CAT database of interest is not listed, contact your Galaxy administrator"> + <options from_data_table="cat_database"> + <filter type="sort_by" column="2" /> + <validator type="no_options" message="No CAT database is available." /> + </options> + </param> + </when> + <when value="history"> + <param name="cat_db" type="data" format="txt" label="A history dataset from CAT prepare tool"/> + </when> + </conditional> + </xml> + <token name="@CAT_DB@"><![CDATA[ + #if $db.db_src == 'cached': + --database_folder '$db.cat_builtin.fields.database_folder' + --taxonomy_folder '$db.cat_builtin.fields.taxonomy_folder' + #else + #import os.path + #set $catdb = $db.cat_db.extra_files_path + --database_folder '$os.path.join($catdb,"@DATABASE_FOLDER@")' + --taxonomy_folder '$os.path.join($catdb,"@TAXONOMY_FOLDER@")' + #end if +]]></token> + <token name="@CAT_TAXONOMY@"><![CDATA[ + #if $db.db_src == 'cached': + --taxonomy_folder '$db.cat_builtin.fields.taxonomy_folder' + #else + #import os.path + #set $catdb = $db.cat_db.extra_files_path + --taxonomy_folder '$os.path.join($catdb,"@TAXONOMY_FOLDER@")' + #end if +]]></token> + <xml name="test_catdb"> + <conditional name="db"> + <param name="db_src" value="cached"/> + <param name="cat_builtin" value="CAT_prepare_test"/> + </conditional> + </xml> + <xml name="use_intermediates"> + <conditional name="previous"> + <param name="use_previous" type="select" label="Use previous prodigal gene prediction and diamond alignment"> + <help>predicted_proteins.faa and alignment.diamond from previous CAT run.</help> + <option value="yes">Yes</option> + <option value="no" selected="true">No</option> + </param> + <when value="yes"> + <param argument="--proteins_fasta" type="data" format="fasta" label="prodigal predicted proteins fasta"/> + <param argument="--diamond_alignment" type="data" format="tabular" label="alignment.diamond file"/> + </when> + <when value="no"/> + </conditional> + </xml> + <token name="@USE_INTERMEDIATES@"><![CDATA[ + #if $previous.use_previous == 'yes' + --proteins_fasta '$previous.proteins_fasta' + --diamond_alignment '$previous.diamond_alignment' + #end if + --out_prefix 'cat_output' +]]></token> + <xml name="custom_settings"> + <param argument="--range" type="integer" value="10" min="0" max="49" label="range"/> + <param argument="--fraction" type="float" value="0.5" min="0" max="0.99" label="fraction"/> + </xml> + <token name="@CUSTOM_SETTINGS@"><![CDATA[ + --range '$range' + --fraction '$fraction' +]]></token> + <xml name="diamond_options"> + <conditional name="diamond"> + <param name="set_diamond_opts" type="select" label="Set advanced diamond options"> + <option value="yes">Yes</option> + <option value="no" selected="true">No</option> + </param> + <when value="yes"> + <param argument="--sensitive" type="boolean" truevalue="--sensitive" falsevalue="" checked="false" + label="Run DIAMOND in sensitive mode (considerably slower)"/> + <param argument="--block_size" type="float" value="2.0" min="1" max="10" label="DIAMOND block-size parameter." + help="lower will decrease memory and temporary disk space usage, higher will increase performance."/> + <param argument="--index_chunks" type="integer" value="4" min="1" max="10" label="DIAMOND index-chunks parameter" + help="Set to 1 on high memory machines. The parameter has no effect on temporary disk space usage."/> + <param argument="--top" type="integer" value="50" min="1" max="50" label="DIAMOND top parameter" + help="Governs hits within range of best hit that are written to the alignment file. This implies you know what you are doing."/> + </when> + <when value="no"/> + </conditional> + </xml> + <token name="@DIAMOND_OPTIONS@"><![CDATA[ + #if $diamond.set_diamond_opts == 'yes': + $diamond.sensitive + --block_size '$diamond.block_size' + --index_chunks '$diamond.index_chunks' + #if $diamond.top < 50: + --I_know_what_Im_doing + --top '$diamond.top' + #end if + #end if +]]></token> + + <xml name="add_names_options"> + <param argument="--only_official" type="boolean" truevalue="--only_official" falsevalue="" checked="true" + label="Only output official level names."/> + <param argument="--exclude_scores" type="boolean" truevalue="--exclude_scores" falsevalue="" checked="false" + label="Exclude bit-score support scores in the lineage."/> + </xml> + <token name="@ADD_NAMES_OPTIONS@"><![CDATA[ + $only_official $exclude_scores +]]></token> + <xml name="add_names"> + <conditional name="names"> + <param name="add_names" type="select" label="CAT add_names for" + help="annotate with taxonomic names."> + <option value="no">No</option> + <option value="orf2lca">ORF2LCA.names.txt</option> + <option value="classification">classification.names.txt</option> + <option value="both">ORF2LCA.names.txt and classification.names.txt</option> + </param> + <when value="no"/> + <when value="orf2lca"> + <expand macro="add_names_options"/> + </when> + <when value="classification"> + <expand macro="add_names_options"/> + </when> + <when value="both"> + <expand macro="add_names_options"/> + </when> + </conditional> + </xml> + <token name="@TXT2TSV@">${__tool_directory__}/tabpad.py</token> + <token name="@ADD_NAMES@"><![CDATA[ + #if $names.add_names in ['classification','both']: + && CAT add_names $names.only_official $names.exclude_scores + @CAT_TAXONOMY@ + #if $bcat == 'CAT' + -i 'cat_output.contig2classification.tsv' + #else + -i 'cat_output.bin2classification.tsv' + #end if + -o 'classification_names.txt' + && ${__tool_directory__}/tabpad.py -i 'classification_names.txt' -o '$classification_names' + #end if + #if $names.add_names in ['orf2lca','both']: + && CAT add_names $names.only_official $names.exclude_scores + @CAT_TAXONOMY@ + -i 'cat_output.ORF2LCA.tsv' + -o 'orf2lca_names.txt' + && ${__tool_directory__}/tabpad.py -i 'orf2lca_names.txt' -o '$orf2lca_names' + #end if +]]></token> + <xml name="summarise"> + <param name="summarise" type="select" label="CAT summarise report" + help="Report the number of assignments to each taxonomic name"> + <option value="no">No</option> + <option value="classification">classification.summary.txt</option> + </param> + </xml> + <token name="@SUMMARISE@"><![CDATA[ + #if $summarise in ['classification']: + #if $names.add_names in ['classification','both'] and $names.only_official: + #set $summary_input = $classification_names + #else + #set $summary_input = 'classification_offical_names' + && CAT add_names --only_official + @CAT_TAXONOMY@ + #if $bcat == 'CAT' + -i 'cat_output.contig2classification.tsv' + #else + -i 'cat_output.bin2classification.tsv' + #end if + -o '$summary_input' + #end if + && CAT summarise + #if $bcat == 'CAT' + -c '$contigs_fasta' + #end if + -i '$summary_input' + -o 'classification_summary.txt' + && ${__tool_directory__}/tabpad.py -i 'classification_summary.txt' -o '$classification_summary' + #end if +]]></token> + + <xml name="select_outputs"> + <param name="select_outputs" type="select" multiple="true" optional="false" label="Select outputs"> + <option value="log" selected="true">log</option> + <option value="predicted_proteins_faa" selected="true">Prodigal predicted_proteins.faa</option> + <option value="predicted_proteins_gff">Prodigal predicted_proteins.gff</option> + <option value="alignment_diamond">Diamond blastp alignment.diamond</option> + <option value="orf2lca" selected="true">ORF2LCA.txt (taxonomic assignment per predicted ORF)</option> + <yield/> + </param> + </xml> + <xml name="select_cat_outputs"> + <param name="bcat" type="hidden" value="CAT"/> + <param name="seqtype" type="hidden" value="contig"/> + <param name="sum_titles" type="hidden" value="contigs,number of ORFs,number of positions"/> + <param name="bin_col" type="hidden" value=""/> + <expand macro="select_outputs"> + <option value="contig2classification" selected="true">contig2classification.txt (taxonomic assignment per contig)</option> + </expand> + </xml> + <xml name="select_bat_outputs"> + <param name="bcat" type="hidden" value="BAT"/> + <param name="seqtype" type="hidden" value="bin"/> + <param name="sum_titles" type="hidden" value="bins"/> + <param name="bin_col" type="hidden" value="bin,"/> + <expand macro="select_outputs"> + <option value="bin2classification" selected="true">bin2classification.txt (taxonomic assignment per metagenome assembly)</option> + </expand> + </xml> + <xml name="outputs"> + <data name="log" format="txt" label="${bcat}.log" from_work_dir="cat_output.log"> + <filter>'log' in select_outputs or not select_outputs</filter> + </data> + <data name="predicted_proteins_faa" format="fasta" label="${bcat}.predicted_proteins.faa" from_work_dir="cat_output.predicted_proteins.faa"> + <filter>'predicted_proteins_faa' in select_outputs and previous['use_previous'] == 'no'</filter> + </data> + <data name="predicted_proteins_gff" format="gff" label="${bcat}.predicted_proteins.gff" from_work_dir="cat_output.predicted_proteins.gff"> + <filter>'predicted_proteins_gff' in select_outputs and previous['use_previous'] == 'no'</filter> + </data> + <data name="alignment_diamond" format="tabular" label="${bcat}.alignment.diamond" from_work_dir="cat_output.alignment.diamond"> + <filter>'alignment_diamond' in select_outputs and previous['use_previous'] == 'no'</filter> + <actions> + <action name="comment_lines" type="metadata" default="1" /> + <action name="column_names" type="metadata" default="qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore" /> + </actions> + </data> + <data name="orf2lca" format="tabular" label="${bcat}.ORF2LCA.txt" from_work_dir="cat_output.ORF2LCA.tsv"> + <filter>'orf2lca' in select_outputs</filter> + <actions> + <action name="comment_lines" type="metadata" default="1" /> + <action name="column_names" type="metadata" default="ORF,${bin_col}lineage,bit-score" /> + </actions> + </data> + <data name="contig2classification" format="tabular" label="${bcat}.contig2classification.txt" from_work_dir="cat_output.contig2classification.tsv"> + <filter>'contig2classification' in select_outputs</filter> + <actions> + <action name="comment_lines" type="metadata" default="1" /> + <action name="column_names" type="metadata" default="contig,classification,reason,lineage,lineage scores" /> + </actions> + </data> + <data name="bin2classification" format="tabular" label="${bcat}.bin2classification.txt" from_work_dir="cat_output.bin2classification.tsv"> + <filter>'bin2classification' in select_outputs</filter> + <actions> + <action name="comment_lines" type="metadata" default="1" /> + <action name="column_names" type="metadata" default="bin,classification,reason,lineage,lineage scores" /> + </actions> + </data> + <data name="orf2lca_names" format="tabular" label="${bcat}.ORF2LCA.names.txt"> + <filter>names['add_names'] in ['both','orf2lca']</filter> + <actions> + <action name="comment_lines" type="metadata" default="1" /> + <action name="column_names" type="metadata" default="ORF,${bin_col}lineage,bit-score,superkingdom,phylum,class,order,family,genus,species" /> + </actions> + </data> + <data name="classification_names" format="tabular" label="${bcat}.${seqtype}2classification.names.txt"> + <filter>names['add_names'] in ['both','classification']</filter> + <actions> + <action name="comment_lines" type="metadata" default="1" /> + <action name="column_names" type="metadata" default="${seqtype},classification,reason,lineage,lineage scores,superkingdom,phylum,class,order,family,genus,species" /> + </actions> + </data> + <data name="classification_summary" format="tabular" label="${bcat}.${seqtype}2classification.summary.txt"> + <filter>'classification' in summarise</filter> + <actions> + <action name="comment_lines" type="metadata" default="4" /> + <action name="column_names" type="metadata" default="rank,clade,number of ${sum_titles}" /> + </actions> + </data> + </xml> + <token name="@COMMON_HELP@"><![CDATA[ +The Contig Annotation Tool (CAT) and Bin Annotation Tool (BAT) workflows are described at: https://github.com/dutilh/CAT + + - CAT contigs/CAT bins - runs Prodigal_ prokaryotic protein prediction on the fasta input. + - CAT contigs/CAT bins - runs Diamond_ to align predicted proteins to the reference proteins in the CAT database. + - CAT contigs/CAT bins - assigns taxonomic classification to fasta entries and ORFs based on alignments. + - CAT add_names - annotates outputs with taxonomic names. + - CAT summerise - reports number of assignments to each taxonomic name. + +A CAT database can either be installed by data_manager_cat or in the local history by CAT prepare tool. + +.. _Prodigal: https://github.com/hyattpd/Prodigal +.. _Diamond: https://github.com/bbuchfink/diamond + +]]></token> + <token name="@OUTPUTS_HELP@"><![CDATA[ + +**OUTPUTS** + +Any of the files produced by the CAT workflow are available as outputs + - Prodigal + + - predicted_proteins.faa + - predicted_proteins.gff + + - Diamond + + - alignment.diamond + + - CAT contigs/bins + + - contigs/bin2classification.txt + - ORF2LCA.txt + + - CAT add_names (optional) + + - contigs/bin2classification.names.txt + - ORF2LCA.names.txt + + - CAT summarise (optional) + + - contigs/bin2classification.summary.txt + + +]]></token> + + <token name="@OPTIONS_HELP@"><![CDATA[ + +Optional arguments: + -r, --range cut-off range after alignment [0-49] (default: 10). + -f, --fraction fraction of bit-score support for each classification + [0-0.99] (default: 0.5). + -p, --proteins_fasta + Path to predicted proteins fasta file. If supplied, + CAT will skip the protein prediction step. + -a, --diamond_alignment + Path to DIAMOND alignment table. If supplied, CAT will + skip the DIAMOND alignment step and directly classify + the sequences. A predicted proteins fasta file should + also be supplied with argument [-p / --proteins]. + + +DIAMOND specific optional arguments: + --sensitive Run DIAMOND in sensitive mode (default: not enabled). + + --block_size DIAMOND block-size parameter (default: 2.0). Lower + numbers will decrease memory and temporary disk space + usage. + + --index_chunks + DIAMOND index-chunks parameter (default: 4). Set to 1 + on high memory machines. The parameter has no effect + on temporary disk space usage. + + --top + DIAMOND top parameter [0-50] (default: 50). Governs + hits within range of best hit that are written to the + alignment file. This is not the [-r / --range] + parameter! + + +Setting the DIAMOND --top parameter + +You can speed up DIAMOND considerably, and at the same time greatly reduce disk usage, by setting the DIAMOND --top parameter to lower values. This will govern hits within range of the best hit that are written to the alignment file. + +You have to be very carefull to 1) not confuse this parameter with the r / --range parameter, which does a similar cut-off but after alignment and 2) be aware that if you want to run CAT or BAT again afterwards with different values of the -r / --range parameter, your options will be limited to the range you have chosen with --top earlier, because all hits that fall outside this range will not be included in the alignment file. Importantly, CAT and BAT currently do not warn you if you choose -r / --range in a second run higher than --top in a previous one, so it's up to you to remember this! + +If you have understood all this, or you do not plan to tune -r / --range at all afterwards, you can enjoy a huge speedup with much smaller alignment files! For CAT you can for example set --top 11 and for BAT --top 6. + +]]></token> + <xml name="citations"> + <citations> + <citation type="doi">https://doi.org/10.1101/072868</citation> + <citation type="doi">https://doi.org/10.1186/s13059-019-1817-x</citation> + <citation type="doi">https://doi.org/10.1038/nmeth.3176</citation> + <citation type="doi">https://doi.org/10.1186/1471-2105-11-119</citation> + <yield /> + </citations> + </xml> +</macros>