Mercurial > repos > iuc > proteinortho
comparison proteinortho.xml @ 0:4850f0d15f01 draft
"planemo upload for repository https://gitlab.com/paulklemm_PHD/proteinortho commit 889335c0a31f156c3f90d4c2048cb4df155a53b2"
author | iuc |
---|---|
date | Tue, 18 Feb 2020 17:57:28 -0500 |
parents | |
children | 26abc7846e6f |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4850f0d15f01 |
---|---|
1 <tool id="proteinortho" name="Proteinortho" version="@TOOL_VERSION@+galaxy@WRAPPER_VERSION@"> | |
2 <description>detects orthologous proteins/genes within different species</description> | |
3 <macros> | |
4 <import>proteinortho_macros.xml</import> | |
5 </macros> | |
6 <expand macro="requirements"/> | |
7 <expand macro="version_command"/> | |
8 <command detect_errors="exit_code"><![CDATA[ | |
9 ## Symlink every dataset under a sanitised copy of its element identifier: proteinortho uses the file names as species names in its output | |
10 #import re | |
11 #for $f in $input_files# | |
12 ln -sf '$f' '${re.sub('[^\w\-_.]', '_', f.element_identifier)}' && | |
13 #end for | |
14 #if $synteny.synteny_options == "specified": | |
15 #for $f in $synteny.input_files_syn# | |
16 ln -sf '$f' '${re.sub('[^\w\-_.]', '_', f.element_identifier)}' && | |
17 #end for# | |
18 #end if | |
19 proteinortho | |
20 --project=result | |
21 --cpus="\${GALAXY_SLOTS:-4}" | |
22 --ram="\${GALAXY_MEMORY_MB:-16000}" | |
23 #if $more_options.selfblast: | |
24 $more_options.selfblast | |
25 #end if | |
26 #if $more_options.singles: | |
27 $more_options.singles | |
28 #end if | |
29 --p=$p | |
30 --e=$evalue --conn=$conn ## the conn input (clustering threshold) is forwarded here so the user's setting is honoured | |
31 #if str($more_options.cov) != "": | |
32 --cov=$more_options.cov ## guards compare the string form so that an explicit 0 is still passed on | |
33 #end if | |
34 #if str($more_options.sim) != "": | |
35 --sim=`LC_NUMERIC=C awk "BEGIN {printf \"%.2f\",$more_options.sim/100}"` | |
36 #end if | |
37 #if str($more_options.identity) != "": | |
38 --identity=$more_options.identity ## percent identity has its own flag; it must not be sent as a second --cov | |
39 #end if | |
40 #if $more_options.isoform != "no": | |
41 --isoform=$more_options.isoform | |
42 #end if | |
43 #if $synteny.synteny_options == "specified": | |
44 --synteny | |
45 --dups=$synteny.dups | |
46 --cs=$synteny.cs | |
47 --alpha=$synteny.alpha | |
48 #end if | |
49 #for $f in $input_files# | |
50 ${re.sub('[^\w\-_.]', '_', f.element_identifier)} | |
51 #end for# | |
52 #if $synteny.synteny_options == "specified": | |
53 #for $f in $synteny.input_files_syn# | |
54 ${re.sub('[^\w\-_.]', '_', f.element_identifier)} | |
55 #end for# | |
56 #end if | |
57 2> >(sed -E "s/.\[([0-9]{1,2}(;[0-9]{1,2})?)?[mGK]//g" 1>&2) ## strip terminal colour/cursor escape sequences from stderr | |
58 #if $synteny.synteny_options == "specified": | |
59 && | |
60 mv result.poff-graph result.proteinortho-graph && | |
61 mv result.poff.tsv result.proteinortho.tsv && | |
62 mv result.poff.html result.proteinortho.html ; | |
63 #end if | |
64 ]]></command> | |
65 <inputs> <!-- params declared with "argument" are referenced in the command by that flag without the leading dashes (p, evalue, conn, ...) --> | |
66 <param name="input_files" format="fasta" type="data" multiple="true" min="2" label="Select the input fasta files (at least 2)" help="The input fasta files. At least 2 are needed!"/> | |
67 <param argument="--p" type="select" label="Similarity comparison algorithm" help="In the first step of proteinortho an all-versus-all reciprocal best hit graph is built from the input files (using this algorithm)."> | |
68 <option value="diamond" selected="true">diamond (aminoacid sequences)</option> | |
69 <option value="autoblast">auto detect NCBI-BLAST (protein and nucleotide sequences)</option> | |
70 <option value="blastp">NCBI-BLASTP+ (protein sequences)</option> | |
71 <option value="blastn">NCBI-BLASTN+ (nucleotide sequences)</option> | |
72 <option value="lastp">Last (aminoacid sequences)</option> | |
73 <option value="lastn">Last (nucleotide sequences)</option> | |
74 <option value="blatp">BLAT (aminoacid sequences)</option> | |
75 <option value="blatn">BLAT (nucleotide sequences)</option> | |
76 </param> | |
77 <param argument="--evalue" type="float" value="0.001" min="0" label="E-value threshold of the blast algorithm" help="This is the main parameter for the generation of the reciprocal best hit graph. Larger values result in more false positives (connections between proteins)."/> | |
78 <param argument="--conn" type="float" value="0.1" min="0." max="10." label="Minimal algebraic connectivity" help="This is the main parameter for the clustering step. With larger values more splits are done, resulting in more and smaller clusters."/> | |
79 <section name="more_options" title="Additional Options" expanded="False"> <!-- cov, sim and identity are entered in percent; the command rescales sim to a fraction --> | |
80 <param argument="--cov" type="integer" value="50" min="0" max="100" label="Minimal coverage of best blast alignments in %"/> | |
81 <param argument="--sim" type="integer" value="95" min="0" max="100" label="Minimal sequence similarity in %"/> | |
82 <param argument="--identity" type="integer" value="25" min="0" max="100" label="Minimal percent identity of best blast hits in %"/> | |
83 <param argument="--selfblast" type="boolean" checked="false" truevalue="--selfblast" falsevalue="" label="Apply selfblast, detects paralogs without orthologs "/> | |
84 <param argument="--singles" type="boolean" checked="false" truevalue="--singles" falsevalue="" label="Report singleton genes without any hit "/> | |
85 <param argument="--isoform" type="select" label="Use isoform information" help="The reciprocal best hit graph is built using isoform information (isoforms are treated as equivalent). For ncbi : simply add the additional files to the input (file names need to match). For uniprot : the isoforms need to contain the word isoform and the corresponding identifier. For trinity simply use the trinity output format."> | |
86 <option value="no" selected="true">Don't use isoform information</option> | |
87 <option value="ncbi">ncbi style (..._additional.fasta)</option> | |
88 <option value="uniprot">uniprot style (...isoform of...)</option> | |
89 <option value="trinity">trinity style (...i4)</option> | |
90 </param> | |
91 </section> | |
92 <conditional name="synteny"> <!-- "specified" makes the command add the synteny flags and the GFF inputs below --> | |
93 <param name="synteny_options" type="select" label="Activate synteny feature (POFF)" help="To enhance the prediction accuracy, the relative order of genes (synteny) can be used as additional feature for the discrimination of orthologs. For more details see doi:10.1371/journal.pone.0105015."> | |
94 <option value="no" selected="true">no</option> | |
95 <option value="specified">yes</option> | |
96 </param> | |
97 <when value="no"/> | |
98 <when value="specified"> | |
99 <param argument="--dups" type="integer" value="0" min="0" max="100" label="Number of reiterations for adjacencies heuristic, to determine duplicated regions"/> | |
100 <param argument="--cs" type="integer" value="3" min="0" max="100" label="Size of a maximum common substring (MCS) for adjacency matches"/> | |
101 <param argument="--alpha" type="float" value="0.5" min="0." max="1." label="Weight of adjacencies vs. sequence similarity"/> | |
102 <param name="input_files_syn" type="data" format="gff" multiple="true" min="2" label="Select the GFF3 files matching the input fasta files" help="The GFF3 files need matching names with the input fasta files. If you provide mybacteria123.faa or mybacteria123.fasta ... then you need to provide mybacteria123.gff here accordingly. The attributes column (#9) must contain the attribute Name=GENE IDENTIFIER where GENE IDENTIFIER corresponds to the respective (protein) identifier in the FASTA input. For example see https://gitlab.com/paulklemm_PHD/proteinortho/-/blob/master/test/C.gff"/> | |
103 </when> | |
104 </conditional> | |
105 </inputs> | |
106 <outputs> <!-- each dataset is collected from a result.* file in the job working directory --> | |
107 <data name="blastgraph" from_work_dir="result.blast-graph" format="tabular" label="${tool.name} on ${on_string}: RBH graph"/> | |
108 <data name="proteinortho" from_work_dir="result.proteinortho.tsv" format="tabular" label="${tool.name} on ${on_string}: orthology-groups"/> | |
109 <data name="proteinorthograph" from_work_dir="result.proteinortho-graph" format="tabular" label="${tool.name} on ${on_string}: orthology-pairs"/> | |
110 </outputs> | |
111 <tests> | |
112 <test expect_num_outputs="3"> <!-- defaults only (diamond, synteny off): spot-checks all three outputs --> | |
113 <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/> | |
114 <output name="proteinortho"> | |
115 <assert_contents> | |
116 <has_text text="# Species	Genes	Alg.-Conn."/> | |
117 <has_text text="2	5	0.16"/> | |
118 <has_text text="M_640,M_642,M_649"/> | |
119 </assert_contents> | |
120 </output> | |
121 <output name="blastgraph"> | |
122 <assert_contents> | |
123 <has_text text="L_10	E_10	"/> | |
124 </assert_contents> | |
125 </output> | |
126 <output name="proteinorthograph"> | |
127 <assert_contents> | |
128 <has_text text="L_11	E_11	"/> | |
129 </assert_contents> | |
130 </output> | |
131 </test> | |
132 <test expect_num_outputs="3"> <!-- non-default evalue, conn, cov, sim and identity, with selfblast and singles switched on --> | |
133 <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/> | |
134 <param name="evalue" value="1"/> | |
135 <param name="conn" value="1"/> | |
136 <param name="cov" value="42"/> | |
137 <param name="sim" value="42"/> | |
138 <param name="identity" value="42"/> | |
139 <param name="selfblast" value="true"/> | |
140 <param name="singles" value="true"/> | |
141 <output name="proteinortho"> | |
142 <assert_contents> | |
143 <has_text text="# Species	Genes	Alg.-Conn."/> | |
144 <has_text text="1	1	0"/> | |
145 <has_text text="	C_177	"/> | |
146 </assert_contents> | |
147 </output> | |
148 <output name="blastgraph"> | |
149 <assert_contents> | |
150 <has_text text="C_1	C_1	"/> | |
151 </assert_contents> | |
152 </output> | |
153 <output name="proteinorthograph"> | |
154 <assert_contents> | |
155 <has_text text="C_12	C_21	"/> | |
156 </assert_contents> | |
157 </output> | |
158 </test> | |
159 <test expect_num_outputs="3"> <!-- synteny (POFF) run with one GFF per fasta; asserts only the two outputs renamed from result.poff.* --> | |
160 <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/> | |
161 <param name="input_files_syn" value="L.gff,C.gff,C2.gff,E.gff,M.gff"/> | |
162 <param name="synteny_options" value="specified"/> | |
163 <output name="proteinortho"> | |
164 <assert_contents> | |
165 <has_text text="# Species	Genes	Alg.-Conn."/> | |
166 <has_text text="4	5	0.144"/> | |
167 <has_text text="E_313,E_315"/> | |
168 </assert_contents> | |
169 </output> | |
170 <output name="proteinorthograph"> | |
171 <assert_contents> | |
172 <has_text text="M_313	L_313	"/> | |
173 </assert_contents> | |
174 </output> | |
175 </test> | |
176 <test expect_num_outputs="3"> <!-- NCBI BLASTP+ backend; the algorithm input is addressed by its name "p" --> | |
177 <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/> | |
178 <param name="p" value="blastp"/> | |
179 <output name="proteinortho"> | |
180 <assert_contents> | |
181 <has_text text="# Species	Genes	Alg.-Conn."/> | |
182 <has_text text="2	5	0.16"/> | |
183 <has_text text="M_640,M_642,M_649"/> | |
184 </assert_contents> | |
185 </output> | |
186 <output name="blastgraph"> | |
187 <assert_contents> | |
188 <has_text text="M_3	L_3	"/> | |
189 </assert_contents> | |
190 </output> | |
191 <output name="proteinorthograph"> | |
192 <assert_contents> | |
193 <has_text text="M_317	L_317	"/> | |
194 </assert_contents> | |
195 </output> | |
196 </test> | |
197 <test expect_num_outputs="3"> <!-- auto-detected NCBI BLAST flavour; the algorithm input is addressed by its name "p" --> | |
198 <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/> | |
199 <param name="p" value="autoblast"/> | |
200 <output name="proteinortho"> | |
201 <assert_contents> | |
202 <has_text text="# Species	Genes	Alg.-Conn."/> | |
203 <has_text text="2	5	0.16"/> | |
204 <has_text text="M_640,M_642,M_649"/> | |
205 </assert_contents> | |
206 </output> | |
207 <output name="blastgraph"> | |
208 <assert_contents> | |
209 <has_text text="M_3	L_3	"/> | |
210 </assert_contents> | |
211 </output> | |
212 <output name="proteinorthograph"> | |
213 <assert_contents> | |
214 <has_text text="M_317	L_317	"/> | |
215 </assert_contents> | |
216 </output> | |
217 </test> | |
218 <test expect_num_outputs="3"> <!-- LAST (protein) backend; the algorithm input is addressed by its name "p" --> | |
219 <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/> | |
220 <param name="p" value="lastp"/> | |
221 <output name="proteinortho"> | |
222 <assert_contents> | |
223 <has_text text="# Species	Genes	Alg.-Conn."/> | |
224 <has_text text="2	5	0.16"/> | |
225 <has_text text="M_640,M_642,M_649"/> | |
226 </assert_contents> | |
227 </output> | |
228 <output name="blastgraph"> | |
229 <assert_contents> | |
230 <has_text text="M_636	E_317	"/> | |
231 </assert_contents> | |
232 </output> | |
233 <output name="proteinorthograph"> | |
234 <assert_contents> | |
235 <has_text text="E_11	C_11	"/> | |
236 </assert_contents> | |
237 </output> | |
238 </test> | |
239 <test expect_num_outputs="3"> <!-- BLAT (protein) backend: selects blatp so this case no longer repeats the blastp test --> | |
240 <param name="input_files" value="L.fasta,C.fasta,C2.fasta,E.fasta,M.fasta"/> | |
241 <param name="p" value="blatp"/> | |
242 <output name="proteinortho"> | |
243 <assert_contents> | |
244 <has_text text="# Species	Genes	Alg.-Conn."/> | |
245 <has_text text="2	5	0.16"/> | |
246 <has_text text="M_640,M_642,M_649"/> | |
247 </assert_contents> | |
248 </output> | |
249 <output name="blastgraph"> | |
250 <assert_contents> | |
251 <has_text text="E_10	C_10	"/> | |
252 </assert_contents> | |
253 </output> | |
254 <output name="proteinorthograph"> | |
255 <assert_contents> | |
256 <has_text text="E_10	C_10	"/> | |
257 </assert_contents> | |
258 </output> | |
259 </test> | |
260 </tests> | |
261 <help><![CDATA[Proteinortho with POFF - An orthology detection tool | |
262 | |
263 **What it does** | |
264 | |
265 Proteinortho is a tool to detect orthologous proteins/genes within different species (at least 2). | |
266 | |
267 | It compares similarities of given gene/protein sequences and clusters them to find significant groups. | |
268 | The algorithm was designed to handle large-scale data and can be applied to hundreds of species at once. | |
269 | Details can be found in (doi:10.1186/1471-2105-12-124). | |
270 | To enhance the prediction accuracy, the relative order of genes (synteny) can be used as additional feature for the discrimination of orthologs. The corresponding extension, namely PoFF (details see doi:10.1371/journal.pone.0105015), is already built into Proteinortho. | |
271 | |
272 ---- | |
273 | |
274 **Proteinortho in a nutshell** | |
275 | |
276 ---- | |
277 | |
278 * **(i) Build adaptive reciprocal best hit graph (RBH)** | |
279 | |
280 | Using the blast algorithm (diamond,blast,blat,...) all input sequences are compared against each other. | |
281 | If two proteins find each other with respect to multiple criteria like minimal evalue, similarity compared to the best hit, ... then an edge is drawn between the two proteins. | |
282 | The result of this step is outputted to RBH | |
283 | |
284 * **(ii) Cluster the RBH** | |
285 | |
286 | Using two clustering algorithms, edges are removed that weakly connect two connected components to reduce false positive hits. | |
287 | The resulting connected components are written to the orthology-groups and orthology-pairs outputs. | |
288 | |
289 ---- | |
290 | |
291 **Proteinortho output files** | |
292 | |
293 ---- | |
294 | |
295 * **RBH** | |
296 | |
297 | The result of the (i) step, the reciprocal best hit graph. | |
298 | First a comment line announces 2 species (# ecoli.faa human.faa), then each line corresponds to a reciprocal best hit between 2 proteins/genes of the announced species. The output format is shown below. | |
299 | *seqidA*,*seqidB* = the 2 ids/names of the proteins involved | |
300 | *evalue_ab* = evalue with seqidA as query and seqidB as part of the database | |
301 | *bitscore_ab* = bitscore with seqidA as query ... | |
302 | *evalue_ba* = evalue with seqidB as query ... | |
303 | ... | |
304 | |
305 .. csv-table:: | |
306 | |
307 seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba | |
308 | |
309 ---- | |
310 | |
311 * **orthology-groups** | |
312 | |
313 | The result of the (ii) step, the clustered reciprocal best hit graph or the orthology groups. | |
314 | Every line corresponds to an orthology group of proteins/genes. | |
315 | The first 3 columns characterize general properties of that group: number of species, number of proteins/genes and the algebraic connectivity. The higher the algebraic connectivity the more edges are there and the better the group is connected to itself in general. | |
316 | Then a column for each species follows containing the proteins of that species. If a species contributes with more than one protein to a group of orthologs, then they are ordered by connectivity. | |
317 | |
318 .. csv-table:: | |
319 | |
320 Species,Genes,Alg.-Conn. | |
321 | |
322 ---- | |
323 | |
324 * **orthology-pairs** | |
325 | |
326 | The same as orthology-groups but every edge is printed one-by-one here. The output is formatted the same as the RBH graph: | |
327 | |
328 .. csv-table:: | |
329 | |
330 seqidA,seqidB,evalue_ab,bitscore_ab,evalue_ba,bitscore_ba | |
331 | |
332 ---- | |
333 | |
334 **Proteinortho-Tools for downstream analysis** | |
335 | |
336 * `proteinortho grab proteins` : find gene(s)/protein(s) in a given fasta file and retrieve their sequence(s). You can also use an orthology-groups file. | |
337 * `proteinortho summary` : Summarizes the orthology-pairs/RBH files to determine how the species are connected to each other. | |
338 | |
339 More information can be found on GitLab: https://gitlab.com/paulklemm_PHD/proteinortho | |
340 ]]> | |
341 </help> | |
342 <expand macro="citations"/> | |
343 </tool> |