comparison antismash.xml @ 1:593bb8f5488b draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/antismash commit 654a4f3b3a1602cec2510d51fb953fd456427e08
author bgruening
date Wed, 07 Feb 2018 06:22:58 -0500
parents 5db064bbb3be
children 3f0077c88c16
comparison
equal deleted inserted replaced
0:5db064bbb3be 1:593bb8f5488b
1 <tool id="antismash" name="Secondary Metabolites" version="2.0.2.2"> 1 <?xml version='1.0' encoding='utf-8'?>
2 <description>and Antibiotics Analysis (antiSMASH)</description> 2 <tool id="antismash" name="Antismash" version="4.0.2" profile="17.01">
3 <description>allows the genome-wide identification, annotation and analysis of secondary metabolite biosynthesis gene clusters</description>
3 <requirements> 4 <requirements>
4 <requirement type="package" version="3.0">hmmer</requirement> 5 <requirement type="package" version="4.0.2">antismash</requirement>
5 <requirement type="package">hmmer</requirement>
6 <requirement type="package" version="2.2.28">blast+</requirement>
7 <requirement type="package">blast+</requirement>
8 <requirement type="package" version="3.8.31">muscle</requirement>
9 <requirement type="package">muscle</requirement>
10 <requirement type="package" version="1.4.0-post-1">straight.plugin</requirement>
11 <requirement type="package">straight.plugin</requirement>
12 <requirement type="package" version="1.62">biopython</requirement>
13 <requirement type="package">biopython</requirement>
14 <requirement type="package" version="1.2.6">pyquery</requirement>
15 <requirement type="package">pyquery</requirement>
16 <requirement type="package" version="0.1.2">helperlibs</requirement>
17 <requirement type="package">helperlibs</requirement>
18 <requirement type="package" version="0.9">cssselect</requirement>
19 <requirement type="package">cssselect</requirement>
20 <requirement type="package" version="2.0.2">antismash</requirement>
21 <requirement type="package">antismash</requirement>
22 <requirement type="package">glimmer</requirement>
23 </requirements> 6 </requirements>
24 <command> 7 <version_command>antismash --version</version_command>
8 <command detect_errors="aggressive">
9 <![CDATA[
25 #import os, glob 10 #import os, glob
26 #set $outputfolder = $html.files_path 11 #set $htmloutputfolder = $html.files_path
27 #if str($infile.ext) == 'genbank': 12 #if str($infile.ext) == 'genbank':
28 #set $file_extension = 'gb' 13 #set $file_extension = 'gb'
29 #else: 14 #else:
30 ## TODO add embl as input file 15 #set $file_extension = $infile.ext
31 #set $file_extension = 'gb'
32 #end if 16 #end if
33 17
34 ln -s $infile #echo 'input_tempfile.' + $file_extension#; 18 ln -s '$infile' input_tempfile.$file_extension &&
35 mkdir -p $outputfolder; 19
36 run_antismash.py 20 ## create html folder
21 mkdir -p $htmloutputfolder &&
22
23 antismash
37 --cpus "\${GALAXY_SLOTS:-12}" 24 --cpus "\${GALAXY_SLOTS:-12}"
38 --enable $types 25 --taxon '${taxon}'
39 --input-type 'nucl' 26 --input-type '${input_type}'
40 $smcogs
41 $clusterblast
42 $subclusterblast
43 $inclusive
44 $full_hmmer
45 $full_blast
46 $eukaryotic
47 27
28 ${clusterblast}
29 ${subclusterblast}
30 ${smcogs}
31 ${inclusive}
32 ${borderpredict}
33 ${tta}
34 ${asf}
35 ${full_hmmer}
48 36
49 #if str($pfam_database) != "None": 37 input_tempfile.$file_extension &&
50 --pfamdir $pfam_database.fields.path
51 #end if
52 38
53 ##--debug 39 ## copy all content to html folder
40 cp input_tempfile/index.html '${html}' 2> /dev/null &&
41 cp -r input_tempfile/* '${htmloutputfolder}'
54 42
55 --disable-embl 43 ]]>
56 --outputfolder $outputfolder
57
58 #echo 'input_tempfile.' + $file_extension#
59
60 ## leave out the start and end features, it can be easily replaced with Galaxy tools
61 ##--from START Start analysis at nucleotide specified
62 ##--to END
63
64 2>&#38;1
65
66 ##
67 ## shuffling files to create the correct outputs for Galaxy
68 ##
69
70 ## html output
71 ;
72 cp #echo os.path.join($outputfolder, 'index.html')# $html 2> /dev/null
73
74 ## gene clusters
75 #if 'geneclusterprots_tabular' in str($outputs).split(','):
76 ;
77 cp #echo os.path.join($outputfolder, 'geneclusters.txt')# $geneclusterprots_tabular 2> /dev/null
78 #end if
79
80 #if 'geneclusterprots_fasta' in str($outputs).split(','):
81 ;
82 cp #echo os.path.join($outputfolder, '*_genecluster_proteins.fa')# $geneclusterprots_fasta 2> /dev/null
83 #end if
84
85
86 ##SVG images
87 #if 'archive_svgs' in str($outputs).split(','):
88 ;
89 cd #echo os.path.join($outputfolder, 'svg')#
90 #if $clusterblast:
91 ;
92 tar cfz $archive_svgs *_all.svg genecluster* 2> /dev/null
93 #else:
94 ;
95 tar cfz $archive_svgs genecluster*
96 #end if
97 #end if
98
99 ##all files in a archive
100 #if 'archive' in str($outputs).split(','):
101 ;
102 cd $outputfolder;
103 tar cf $archive *.zip 2> /dev/null
104 #end if
105
106 ## genbank
107 #if 'gb' in str($outputs).split(','):
108 ;
109 cat #echo os.path.join($outputfolder, '*.gbk')# > $genbank 2> /dev/null
110 #end if
111
112 </command> 44 </command>
113 <inputs> 45 <inputs>
114 <param name="infile" type="data" format="genbank" label="Nucleotide sequence file in GenBank format"/> 46 <param name="infile" type="data" format="genbank,fasta,embl" label="Sequence file in GenBank,EMBL or FASTA format"/>
115 47
116 <param name="eukaryotic" type="select" label="Origin of DNA"> 48 <param argument="--taxon" type="select" label="Origin of DNA">
117 <option value="" selected="True">Prokaryotic</option> 49 <option value="bacteria" selected="True">Bacteria</option>
118 <option value="--eukaryotic">Eukaryotic</option> 50 <option value="fungi">Fungi</option>
119 </param> 51 </param>
120 52
121 <param name="clusterblast" type="boolean" label="BLAST identified clusters against known clusters" 53 <param argument="--input_type" type="select" label="Origin of DNA">
122 help="(--clusterblast)" 54 <option value="nucl" selected="True">Nucleotide</option>
123 truevalue="--clusterblast" falsevalue="" checked="True" /> 55 <option value="prot">Amino-acid</option>
124 <param name="subclusterblast" type="boolean" label="Subcluster BLAST analysis"
125 help="(--subclusterblast)"
126 truevalue="--subclusterblast" falsevalue="" checked="false" />
127 <param name="smcogs" type="boolean" label="Analysis of secondary metabolism gene families (smCOGs)"
128 falsevalue="" truevalue="--smcogs" checked="True" />
129
130 <param name="full_blast" type="boolean" label="Run a whole-genome BLAST analysis"
131 help="(--full-blast)"
132 truevalue="--full-blast" falsevalue="" checked="False" />
133 <param name="full_hmmer" type="boolean" label="Run a whole-genome Pfam analysis"
134 help="(--full-hmmer)"
135 truevalue="--full-hmmer" falsevalue="" checked="false" />
136
137 <param name="inclusive" type="boolean" label="Use Cimermancic et al. algorithm for cluster detection"
138 help="(--inclusive)"
139 truevalue="--inclusive" falsevalue="" checked="false" />
140
141 <param name="pfam_database" type="select" optional="true" label="Pfam database" help="Pfam Covariance models">
142 <options from_file="antismash.loc">
143 <column name="value" index="0"/>
144 <column name="name" index="1"/>
145 <column name="path" index="2"/>
146 </options>
147 </param> 56 </param>
148 57
149 <param name="types" type="select" display="checkboxes" multiple="true" label="Gene cluster types to search"> 58 <param argument="--clusterblast" type="boolean" truevalue="--clusterblast" falsevalue="" checked="False"
150 <option value="t1pks" selected="True">type I polyketide synthases</option> 59 label="BLAST identified clusters against known clusters"
151 <option value="t2pks" selected="True">type II polyketide synthases</option> 60 help="Compare identified clusters against a database of antiSMASH-predicted clusters." />
152 <option value="t3pks" selected="True">type III polyketide synthases</option> 61 <param argument="--subclusterblast" type="boolean" truevalue="--subclusterblast" falsevalue="" checked="True"
153 <option value="t4pks" selected="True">type IV polyketide synthases</option> 62 label="Subcluster BLAST analysis"
154 <option value="transatpks" selected="True">trans-AT PKS</option> 63 help="Compare identified clusters against known subclusters responsible for synthesising precursors." />
155 <option value="nrps" selected="True">nonribosomal peptide synthetases</option> 64 <param argument="--knownclusterblast" type="boolean" truevalue="--knownclusterblast" falsevalue="" checked="True"
156 <option value="terpene" selected="True">terpene synthases</option> 65 label="KnowCluster BLAST analysis"
157 <option value="lantipeptide" selected="True">lantipeptides</option> 66 help="Compare identified clusters against known gene clusters from the MIBiG database."/>
158 <option value="bacteriocin" selected="True">bacteriocins</option> 67 <param argument="--smcogs" type="boolean" checked="True" truevalue="--smcogs" falsevalue=""
159 <option value="blactam" selected="True">beta-lactams</option> 68 label="Analysis of secondary metabolism gene families (smCOGs)"
160 <option value="amglyccycl" selected="True">aminoglycosides / aminocyclitols</option> 69 help="Look for sec. met. clusters of orthologous groups."/>
161 <option value="aminocoumarin" selected="True">aminocoumarins</option> 70 <param argument="--inclusive" type="boolean" truevalue="--inclusive" falsevalue="" checked="False"
162 <option value="siderophore" selected="True">siderophores</option> 71 label="Inclusive ClusterFinder algorithm"
163 <option value="ectoine" selected="True">ectoines</option> 72 help="Use inclusive ClusterFinder algorithm for additional cluster detection."/>
164 <option value="butyrolactone" selected="True">butyrolactones</option> 73 <param argument="--borderpredict" type="boolean" truevalue="--borderpredict" falsevalue="" checked="False"
165 <option value="indole" selected="True">indoles</option> 74 label="Predict gene cluster borders with ClusterFinder"
166 <option value="nucleoside" selected="True">nucleosides</option> 75 help="Use ClusterFinder algorithm to predict gene cluster borders."/>
167 <option value="phosphoglycolipid" selected="True">phosphoglycolipids</option> 76 <param argument="--asf" type="boolean" truevalue="--asf" falsevalue="" checked="True"
168 <option value="oligosaccharide" selected="True">oligosaccharides</option> 77 label="Run active site finder module" />
169 <option value="furan" selected="True">furans</option> 78 <param argument="--tta" type="boolean" truevalue="--tta" falsevalue="" checked="False"
170 <option value="hserlactone" selected="True">hserlactones</option> 79 label="Run TTA codon detection module" />
171 <option value="thiopeptide" selected="True">thiopeptides</option> 80 <param argument="--full_hmmer" type="boolean" truevalue="--full-hmmer" falsevalue="" checked="False"
172 <option value="phenazine" selected="True">phenazines</option> 81 label="Run a whole-genome Pfam analysis" />
173 <option value="phosphonate" selected="True">phosphonates</option>
174 <option value="other" selected="True">others</option>
175 </param>
176 82
177 <param name="outputs" type="select" multiple="true" label="Additional outputs"> 83 <param name="outputs" type="select" multiple="true" label="Outputs">
178 <option value="geneclusterprots_fasta" selected="True">Gene cluster proteins (FASTA)</option> 84 <option value="html" selected="True">HTML file</option>
179 <option value="geneclusterprots_tabular">Gene cluster proteins (Tabular)</option> 85 <option value="all">All results</option>
180 <option value="archive_svgs">All clusters as image (compressed)</option> 86 <option value="embl">EMBL files</option>
181 <option value="archive">All files compressed</option> 87 <option value="gb">GenBank files</option>
182 <option value="gb">Annotated genome (GenBank)</option> 88 <option value="genecluster_tabular">Gene clusters</option>
183 </param> 89 </param>
184 90
185 </inputs> 91 </inputs>
186 <outputs> 92 <outputs>
187 <data format="fasta" name="geneclusterprots_fasta" label="${tool.name} on ${on_string} (Gen Cluster Proteins)"> 93 <collection type="list" name="genecluster_tabular" label="${tool.name} on ${on_string} (Gene Cluster)">
188 <filter>'geneclusterprots_fasta' in outputs</filter> 94 <discover_datasets pattern="(?P&lt;designation&gt;.*)\.txt" directory="input_tempfile" ext="txt" visible="false" />
189 </data> 95 <filter>'genecluster_tabular' in outputs</filter>
190 <data format="tabular" name="geneclusterprots_tabular" label="${tool.name} on ${on_string} (Gen Cluster Proteins)"> 96 </collection>
191 <filter>'geneclusterprots_tabular' in outputs</filter> 97 <collection name="genbank" type="list" label="${tool.name} on ${on_string} (GenBank)">
192 </data> 98 <discover_datasets pattern="(?P&lt;designation&gt;.*)\.gbk" directory="input_tempfile" ext="genbank" visible="false" />
193 <data format="tar" name="archive" label="${tool.name} on ${on_string} (all files compressed)"> 99 <filter>'gb' in outputs</filter>
194 <filter>'archive' in outputs</filter> 100 </collection>
195 </data> 101 <collection name="embl" type="list" label="${tool.name} on ${on_string} (EMBL)">
196 <data format="tar.gz" name="archive_svgs" label="${tool.name} on ${on_string} (SVG images)"> 102 <discover_datasets pattern="(?P&lt;designation&gt;.*)\.gbk" directory="input_tempfile" ext="embl" visible="false" />
197 <filter>'archive_svgs' in outputs</filter> 103 <filter>'embl' in outputs</filter>
198 </data> 104 </collection>
199 <data format="html" name="html" label="${tool.name} on ${on_string} (html report)"> 105 <collection name="archive" type="list" label="${tool.name} on ${on_string} (all files compressed)">
200 <!-- html is default output at any time. 106 <discover_datasets pattern="(?P&lt;designation&gt;.*)\.zip" directory="input_tempfile" ext="zip" visible="false" />
201 <filter>'html' in outputs</filter> 107 <filter>'all' in outputs</filter>
202 --> 108 </collection>
203 </data> 109 <data format="html" name="html" label="${tool.name} on ${on_string} (html report)" />
204 <data name="genbank" format="genbank" label="${tool.name} on ${on_string} (genbank)">
205 <filter>'gb' in outputs</filter>
206 </data>
207 </outputs> 110 </outputs>
208 <help> 111 <tests>
209 112 <test>
210 .. class:: infomark 113 <param name="infile" value="sequence.fasta"/>
114 <output name="html" file="index.html"/>
115 </test>
116 <test>
117 <param name="infile" value="sequence.gb"/>
118 <param name="outputs" value="html,gb"/>
119 <output_collection name="genbank" type="list">
120 <element name="ARBH01000003.1.cluster001" file="ARBH01000003.1.cluster001" ftype="genbank" compare="sim_size" />
121 <element name="ARBH01000003.1.final" file="ARBH01000003.1.final" ftype="genbank"/>
122 </output_collection>
123 <output name="html" file="index.2.html"/>
124 </test>
125 </tests>
126 <help>
127 <![CDATA[
211 128
212 **What it does** 129 **What it does**
213 130
214 antiSMASH allows the rapid genome-wide identification, annotation and analysis of secondary metabolite biosynthesis gene clusters in bacterial and fungal genomes. 131 AntiSMASH allows the rapid genome-wide identification, annotation and analysis of secondary metabolite biosynthesis gene clusters in bacterial and fungal genomes.
215 It integrates and cross-links with a large number of in silico secondary metabolite analysis tools that have been published earlier. 132 It integrates and cross-links with a large number of in silico secondary metabolite analysis tools that have been published earlier.
216 133
134 antiSMASH is powered by several open source tools: NCBI BLAST+, HMMer 3, Muscle 3, Glimmer 3, FastTree, TreeGraph 2, Indigo-depict, PySVG and JQuery SVG.
217 135
218 **Input** 136 **Input**
219 137
220 The ideal input for antiSMASH is an annotated nucleotide file in Genbank format. If no annotation is available, 138 The ideal input for antiSMASH is an annotated nucleotide file in Genbank format or EMBL format.
221 we recommend running your sequence through an annotation pipeline like RAST are the one included in Galaxy. 139 You can either upload a GenBank/EMBL file manually, or simply enter the GenBank/RefSeq accession number of your sequence for antiSMASH to upload it.
140 If no annotation is available, we recommend running your sequence through an annotation pipeline like RAST to obtain GBK/EMBL files with high-quality annotations.
222 141
142 Alternatively, you can provide a FASTA file containing a single sequence. antiSMASH will generate a preliminary annotation using Prodigal, and use that to run the rest of the analysis.
143 You can also provide gene annotations in GFF3 format. Input files should be properly formatted.
144 If you are creating your GBK/EMBL/FASTA file manually, be sure to do so in a plain text editor like Notepad or Emacs, and save your files as "All files (.)", ending with the correct extension (for example ".fasta", ".gbk", or ".embl").
223 145
224 There are several optional analyses that may or may not be run on your sequence. 146 There are several optional analyses that may or may not be run on your sequence. Highly recommended is the Gene Cluster Blast Comparative Analysis, which runs BlastP using each amino acid sequence from a detected gene cluster as a query on a large database of predicted protein sequences from secondary metabolite biosynthetic gene clusters, and pools the results to identify the gene clusters that are most homologous to the gene cluster that was detected in your query nucleotide sequence.
225 Highly recommended is the Gene Cluster Blast Comparative Analysis, which runs BlastP using each amino acid sequence from a detected gene cluster as a 147 This analysis is selected by default.
226 query on a large database of predicted protein sequences from secondary metabolite biosynthetic gene clusters, and pools the results to identify
227 the gene clusters that are most homologous to the gene cluster that was detected in your query nucleotide sequence.
228 148
149 Also available is the analysis of secondary metabolism gene families (smCOGs). This analysis attempts to allocate each gene in the detected gene clusters to a secondary metabolism-specific gene family using profile hidden Markov models specific for the conserved sequence region characteristic of this family.
150 Additionally, a phylogenetic tree is constructed of each gene together with the (max. 100) sequences of the smCOG seed alignment. This analysis is selected by default.
229 151
230 Also available is the analysis of secondary metabolism gene families (smCOGs). 152 **Output**
231 This analysis attempts to allocate each gene in the detected gene clusters to a secondary metabolism-specific gene
232 family using profile hidden Markov models specific for the conserved sequence region characteristic of this family.
233 Additionally, a phylogenetic tree is constructed of each gene together with the (max. 100) sequences of the smCOG seed alignment.
234 153
154 The output of the antiSMASH analysis pipeline is organized in an interactive HTML page with SVG graphics, and different parts of the analysis are displayed in different panels for every gene cluster.
235 155
236 For the most thorough genome analysis, we provide genome-wide PFAM HMM analysis of all genes in the genome through modules of the CLUSEAN pipeline. 156 In the upper right, a small list of buttons offers further functionality. The house-shaped button will get you back on the antiSMASH start page.
237 Of course, some regions important to secondary metabolism may have been missed in the gene cluster identification stage 157 The question-mark button will get you to this help page. The exclamation-mark button leads to a page explaining about antiSMASH.
238 (e.g. because they represent the biosynthetic pathway of a yet unknown secondary metabolite). 158 The downward-pointing arrow will open a menu offering to download the complete set of results from the antiSMASH run, a summary Excel file, and the summary EMBL/GenBank output file.
239 Therefore, when genome-wide PFAM HMM analysis is selected, the PFAM frequencies are also used to find all genome regions in which PFAM domains typical for secondary metabolism are overrepresented. 159 The EMBL/GenBank file can be viewed in a genome browser such as Artemis.
240 160
241 161 ]]>
242 **References** 162 </help>
243 163 <citations>
244 Marnix H. Medema, Kai Blin, Peter Cimermancic, Victor de Jager, Piotr Zakrzewski, Michael A. Fischbach, Tilmann Weber, 164 <citation type="doi">10.1093/nar/gkv437</citation>
245 Rainer Breitling and Eriko Takano (2011). antiSMASH: Rapid identification, annotation and analysis of secondary metabolite biosynthesis gene clusters. Nucleic Acids Research, doi: 10.1093/nar/gkr466. 165 </citations>
246
247 http://antismash.secondarymetabolites.org/help.html
248
249
250 Bjoern A. Gruening: https://github.com/bgruening/galaxytools/tree/master/antismash
251
252 </help>
253 </tool> 166 </tool>