comparison pathwaymatcher.xml @ 0:f66af2b04a98 draft

planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/pathwaymatcher commit c12a99d3da62c83b779175b3c9022e7d5622053a
author galaxyp
date Wed, 20 Jun 2018 14:21:10 -0400
parents
children 5d0c44bc354d
comparison
equal deleted inserted replaced
-1:000000000000 0:f66af2b04a98
1 <tool id="reactome_pathwaymatcher" name="Pathway Matcher" version="@PATHWAYMATCHER_VERSION@.@TOOL_SUBVERSION@">
2 <description>
3 PathwayMatcher is a software tool to search for pathways related to a list of proteins in Reactome.
4 </description>
5 <macros> <!-- Version tokens and reusable XML snippets for this tool. -->
6 <token name="@PATHWAYMATCHER_VERSION@">1.8</token> <!-- Used in the tool version attribute and as the pathwaymatcher requirement version. -->
7 <token name="@TOOL_SUBVERSION@">0</token> <!-- Appended to the tool version attribute after the PathwayMatcher version. The macro below is the FASTA input shared by the peptide and modifiedpeptide input types. -->
8 <xml name="input_fasta">
9 <param format="fasta" name="input_database" type="data" label="Protein Database"
10 help="Select FASTA database from history"/>
11 </xml>
12 </macros>
13 <requirements> <!-- Packages that must be available when the command runs. -->
14 <requirement type="package" version="@PATHWAYMATCHER_VERSION@">pathwaymatcher</requirement> <!-- Version is taken from the @PATHWAYMATCHER_VERSION@ token. -->
15 <requirement type="package" version="3.0">zip</requirement> <!-- NOTE(review): the command template of this tool never calls zip; confirm this requirement is still needed. -->
16 </requirements>
17 <stdio> <!-- Rules that mark the job as failed based on its exit status or its output. -->
18 <exit_code range="1:" level="fatal" description="Job Failed" /> <!-- Any exit status of 1 or higher is fatal. -->
19 <regex match="java.*Exception" level="fatal" description="Java Exception"/> <!-- Fails the job when a Java exception name shows up in the job output. -->
20 <regex match="Could not create the Java virtual machine" level="fatal" description="JVM Error"/> <!-- Fails the job when the JVM reports it could not start. -->
21 </stdio>
22 <command>
23 <![CDATA[
24 #from datetime import datetime
25 #import json
26 #import os
27 #set $exp_str = "Galaxy_Experiment_%s" % datetime.now().strftime("%Y%m%d%H%M%S")
28 #set $samp_str = "Sample_%s" % datetime.now().strftime("%Y%m%d%H%M%S")
29 #set $temp_stderr = "pathwaym_stderr"
30 #set $bin_dir = "bin"
31 ## The timestamps above use %S (seconds); lowercase %s is not a portable strftime code.
32 mkdir output;
33 cwd=`pwd`;
34 export HOME=\$cwd;
35 ## HOME is pointed at the job working directory before PathwayMatcher starts.
36 #####################
37 ## Pathway Matcher ##
38 #####################
39 (pathwaymatcher src.main.java.no.uib.pap.pathwaymatcher.PathwayMatcher
40 ## One "-t <type> -i <file>" group is emitted per entry of the "input_types" repeat.
41 #for $i, $s in enumerate($input_types)
42
43 ## GENETIC VARIANTS
44
45 #if $s.input_type.input_type_selector == "rsid"
46 -t rsid -i '${s.input_type.input_rsid}'
47 #end if
48
49 #if $s.input_type.input_type_selector == "chrbp"
50 -t chrbp -i '${s.input_type.input_chrbp}'
51 #end if
52
53 #if $s.input_type.input_type_selector == "vcf"
54 -t vcf -i '${s.input_type.input_vcf}'
55 #end if
56
57 ## GENES
58
59 #if $s.input_type.input_type_selector == "gene"
60 -t gene -i '${s.input_type.input_gene}'
61 #end if
62
63 ## PEPTIDES (also need the FASTA database and the PTM position range)
64
65 #if $s.input_type.input_type_selector == "peptide"
66 -t peptide -i '${s.input_type.input_peptide}'
67 -f '${s.input_type.input_database}'
68 -r '${s.input_type.ptm_range}'
69 #end if
70
71 #if $s.input_type.input_type_selector == "modifiedpeptide"
72 -t modifiedpeptide -i '${s.input_type.input_modifiedpeptide}'
73 -f '${s.input_type.input_database}'
74 -r '${s.input_type.ptm_range}'
75 #end if
76
77 ## PROTEINS
78
79 #if $s.input_type.input_type_selector == "uniprot"
80 -t uniprot -i '${s.input_type.input_uniprot}'
81 #end if
82
83 #if $s.input_type.input_type_selector == "ensembl"
84 -t ensembl -i '${s.input_type.input_ensembl}'
85 #end if
86
87 ## PROTEOFORMS (the match criteria is forwarded with -m when one is set)
88
89 #if $s.input_type.input_type_selector == "proteoforms"
90
91 #if $s.input_type.proteoform_match_criteria:
92 -t proteoform -m '${s.input_type.proteoform_match_criteria}' -i '${s.input_type.input_proteoforms}'
93 #else:
94 -t proteoform -i '${s.input_type.input_proteoforms}'
95 #end if
96
97 #end if
98
99 #end for
100
101 ## OUTPUT OPTIONS
102 ## The select holds the strings "0"/"1"; both are non-empty, so compare explicitly instead of testing truthiness.
103 #if str($output_options.search_top_level_info) == "1":
104 -tlp
105 #end if
106
107 #set $output_graphs_list = str($output_options.output_graphs).split(',')
108
109 #if 'gg' in $output_graphs_list:
110 -gg
111 #end if
112
113 #if 'gu' in $output_graphs_list:
114 -gu
115 #end if
116
117 #if 'gp' in $output_graphs_list:
118 -gp
119 #end if
120
121 2>> $temp_stderr);
122 exit_code_for_galaxy=\$?;
123 ## We create a folder to contain graphs files.
124 #if $output_options.output_graphs:
125 mkdir "graphs";
126 #end if
127
128 #if 'gg' in $output_graphs_list:
129 mv -t "graphs" "geneExternalEdges.tsv" "geneInternalEdges.tsv" "geneVertices.tsv" ;
130 #end if
131
132 #if 'gu' in $output_graphs_list:
133 mv -t "graphs" "proteinExternalEdges.tsv" "proteinInternalEdges.tsv" "proteinVertices.tsv";
134 #end if
135
136 #if 'gp' in $output_graphs_list:
137 mv -t "graphs" "proteoformExternalEdges.tsv" "proteoformInternalEdges.tsv" "proteoformVertices.tsv";
138 #end if
139
140 ## The exit status was captured right after PathwayMatcher (line 122), so the mkdir/mv steps above cannot overwrite it.
141 cat $temp_stderr 2>&1;
142 (exit \$exit_code_for_galaxy)
143 ]]>
144 </command>
145 <inputs>
146
147 <repeat name="input_types" title="Input" min="1">
148 <conditional name="input_type">
149 <param name="input_type_selector" type="select" label="Input type"
150 help="">
151 <option value="rsid">Genetic variants - SNP rsId list</option>
152 <option value="chrbp">Genetic variants - Chromosomes and base pairs</option>
153 <option value="vcf">Genetic variants - Variant Call Format Specification</option>
154 <option value="gene">Genes</option>
155 <option value="peptide">Peptides - Simple list</option>
156 <option value="modifiedpeptide">Peptides - Peptide List with PTM types and sites</option>
157 <option value="uniprot">Proteins - UniProt Accession list</option>
158 <option value="ensembl">Proteins - Ensembl identifier list</option>
159 <option value="proteoforms">Proteoforms</option>
160 </param>
161
162 <!-- Genetic variants -->
163 <when value="rsid">
164 <param format="txt" name="input_rsid" type="data" label="SNP rsId list"
165 help="The file contains one rsid identifier as defined in dbSNP[1] on each row.
166 The list must be ordered by chromosome and base pair (bp). The list must not have duplicates.
167 All rsids must appear in the human assembly GRCh37.p13. "/>
168 </when>
169
170 <when value="chrbp">
171 <param format="txt" name="input_chrbp" type="data" label="Chromosomes and base pairs"
172 help="Genetic variants can also be represented using the chromosome and the base pair numbers.
173 The input should be sorted by chromosome number and then by base pair. "/>
174 </when>
175
176 <when value="vcf">
177 <param format="vcf" name="input_vcf" type="data" label="Variant Call Format Specification"
178 help="The input follows the Variant Call Format Specification[2] v4.3.
179 It also allows the possibility to specify only the first 4 columns in the data section of the file:
180 CHROM, POS, ID, REF. "/>
181 </when>
182
183 <!-- Genes -->
184 <when value="gene">
185 <param format="txt" name="input_gene" type="data" label="Genes"
186 help="File with a one gene name in each line. Genes follow the HUGO gene nomenclature[3]."/>
187 </when>
188
189 <!-- Peptides -->
190 <when value="peptide">
191 <param format="txt" name="input_peptide" type="data" label="Simple list"
192 help="File with a one peptide sequence in each line."/>
193
194 <expand macro="input_fasta" />
195
196 <param name="ptm_range" type="integer" value="0" label="PTM position range" optional="true"
197 help="Plus minus positions for the same PTM site."/>
198 </when>
199
200 <when value="modifiedpeptide">
201 <param format="txt" name="input_modifiedpeptide" type="data" label="Peptide List with PTM types and sites"
202 help="Each line of the file corresponds to a single peptide with post-translational modifications."/>
203
204 <expand macro="input_fasta" />
205
206 <param name="ptm_range" type="integer" value="0" label="PTM position range" optional="true"
207 help="Plus minus positions for the same PTM site."/>
208 </when>
209
210 <!-- Proteins -->
211 <when value="uniprot">
212 <param format="txt" name="input_uniprot" type="data" label="UniProt Accession list"
213 help="File with a one Uniprot Accession [4] in each line."/>
214 </when>
215
216 <when value="ensembl">
217 <param format="txt" name="input_ensembl" type="data" label="Ensembl identifier list"
218 help="File with a one Ensembl identifier [5] in each line."/>
219 </when>
220
221 <!-- Proteoforms -->
222 <when value="proteoforms"> <!-- Inputs shown when the "Proteoforms" input type is selected. -->
223 <param format="txt" name="input_proteoforms" type="data" label="Proteoforms"
224 help="A proteoform defines a specific state of a protein.
225 It is composed by the protein UniProt accession, isoform and set of post translational modifications.
226 The input file contains one line for each proteoform. Each PTM is specified using a modification
227 identifier and a site, separated by ':' (colon). For example: '00046:133'.
228 The identifier is a 5 digit id from the PSI-MOD Protein Modification Ontology [6]."/>
229 <!-- The selected criteria is passed to PathwayMatcher with -m by the command template; SUPERSET is preselected. -->
230 <param name="proteoform_match_criteria" type="select" label="Proteoform match criteria">
231 <option value="STRICT">STRICT</option>
232 <option value="ONE">ONE</option>
233 <option value="SUPERSET" selected="True">SUPERSET</option>
234 <option value="SUBSET">SUBSET</option>
235 </param>
236 </when>
237
238 </conditional>
239
240 </repeat>
241
242 <section name="output_options" expanded="true" title="Output options">
243
244 <param name="search_top_level_info" type="select" label="Add search top level info">
245 <option value="0" selected="True">False</option>
246 <option value="1">True</option>
247 </param>
248
249 <param name="output_graphs" type="select" display="checkboxes" multiple="True" label="Connection graphs"
250 help="Generates a zipped file with connection graphs as an additional output when executing the pathway search and analysis.
251 The graph can use genes, proteins or proteoforms as vertices.">
252 <option value="gg">Genes</option>
253 <option value="gu">Proteins</option>
254 <option value="gp">Proteoforms</option>
255 </param>
256
257 </section>
258
259 </inputs>
260 <outputs> <!-- Two TSV results taken from the working directory, plus an optional list collection of graph files. -->
261 <data name="search" format="tsv" from_work_dir="search.tsv" label="${tool.name} - search on ${on_string}" />
262 <data name="analysis" format="tsv" from_work_dir="analysis.tsv" label="${tool.name} - analysis on ${on_string}" />
263 <collection name="graphs_files" type="list" label="${tool.name} - graphs on ${on_string}" >
264 <filter>output_options['output_graphs'] != None</filter> <!-- The collection is only produced when the "Connection graphs" selection is not None. -->
265 <discover_datasets pattern="__name_and_ext__" directory="graphs" ext="tsv"/> <!-- Picks up the files that the command template moves into the "graphs" directory. -->
266 </collection>
267 </outputs>
268
269
270 <tests>
271
272 <!-- Test that genes search works -->
273 <test>
274 <repeat name="input_types">
275 <conditional name="input_type">
276 <param name="input_type_selector" value="gene"/>
277 <param name="input_gene" value="genes.txt" ftype="txt" />
278 </conditional>
279 </repeat>
280 <output name="search" file="genes_search.tsv" ftype="tsv" compare="sim_size" delta="3000" />
281 </test>
282
283 <!-- Test graphs from proteoforms -->
284 <test>
285 <repeat name="input_types">
286 <conditional name="input_type">
287 <param name="input_type_selector" value="proteoforms"/>
288 <param name="input_proteoforms" value="proteoforms.txt" ftype="txt" />
289 </conditional>
290 </repeat>
291 <param name="output_graphs" value="gg,gu,gp" />
292 <output_collection name="graphs_files" type="list">
293 <element name="geneExternalEdges" ftype="tsv" file="proteoforms_graphs/geneExternalEdges.tsv" compare="sim_size" delta="1000" />
294 <element name="geneInternalEdges" ftype="tsv" file="proteoforms_graphs/geneInternalEdges.tsv" compare="sim_size" delta="1000"/>
295 <element name="geneVertices" ftype="tsv" file="proteoforms_graphs/geneVertices.tsv" compare="sim_size" delta="1000"/>
296 <element name="proteinExternalEdges" ftype="tsv" file="proteoforms_graphs/proteinExternalEdges.tsv" compare="sim_size" delta="10000"/>
297 <element name="proteinInternalEdges" ftype="tsv" file="proteoforms_graphs/proteinInternalEdges.tsv" compare="sim_size" delta="1000"/>
298 <element name="proteinVertices" ftype="tsv" file="proteoforms_graphs/proteinVertices.tsv" compare="sim_size" delta="1000"/>
299 <element name="proteoformExternalEdges" ftype="tsv" file="proteoforms_graphs/proteoformExternalEdges.tsv" compare="sim_size" delta="1000"/>
300 <element name="proteoformInternalEdges" ftype="tsv" file="proteoforms_graphs/proteoformInternalEdges.tsv" compare="sim_size" delta="1000"/>
301 <element name="proteoformVertices" ftype="tsv" file="proteoforms_graphs/proteoformVertices.tsv" compare="sim_size" delta="1000"/>
302 </output_collection>
303 </test>
304
305 </tests>
306 <help>
307
308 .. class:: infomark
309
310 **Introduction**
311
312 Biological pathways are an excellent resource to analyze the causes and consequences of certain phenotypes.
313 Most of the components of the pathways are proteins. When searching for relevant pathways to perform analysis
314 of a patient sample proteins, it is very common to lose information due to lack of precision in the search.
315
316 This leads to result sets with many extra selected pathways that are not really related to the input sample.
317
318 .. class:: infomark
319
320 **What it does**
321
322 We present a more fine grained approach to search, not only with the gene names, but also with post translational
323 modifications of the proteins, such as phosphorylation.
324
325 Ultimately, any omics dataset with its mutations and
326 modifications will be mapped directly to the functional knowledgebases allowing the functional interpretation by
327 researchers and clinicians.
328
329 The reference database used is Reactome, a free, open source, curated and peer reviewed database of biological reactions that contains the quality data needed for this type of fine grained search. It can be readily queried with omics datasets, and we are improving its features by extending the matching of clinical data to the biological pathways. Not only will the gene names be used, but also mutations or post translational modifications such as phosphorylation.
330
331
332 .. class:: infomark
333
334 **Inputs and outputs**
335
336 PathwayMatcher can search for reactions and pathways with various input types, and generates mapping files to the database.
337
338 The input can be:
339
340 - Genetic variants
341 - Genes
342 - Peptides
343 - Proteins
344 - Proteoforms
345
346 The output of PathwayMatcher is composed of two files, the Reaction and Pathway mapping and the statistical analysis of the relevant pathways.
347
348 .. class:: infomark
349
350 The information included with this tool is a brief summary of the main documentation available in PathwayMatcher_.
351
352 Specific information about PathwayMatcher's Input_ and Output_ may also be found there.
353
354
355 .. class:: infomark
356
357 **References**
358
359 [1] dbSNP_
360
361 [2] VCF v4.3:
362 http://samtools.github.io/hts-specs/VCFv4.3.pdf
363
364 [3] genenames.org: the HGNC resources in 2015. Nucleic Acids Res. 2015 Jan;43(Database issue):D1079-85. doi: 10.1093/nar/gku1071:
365 https://www.ncbi.nlm.nih.gov/pubmed/25361968
366
367 [4] UniProt: the universal protein knowledgebase. Nucleic Acids Res. 45: D158-D169 (2017):
368 http://dx.doi.org/10.1093/nar/gkw1099
369
370 [5] Ensembl:
371 https://www.ensembl.org/info/genome/stable_ids/index.html
372
373 [6] The PSI-MOD community standard for representation of protein modification data. Nature Biotechnology 26, 864 - 866 (2008):
374 http://www.nature.com/nbt/journal/v26/n8/full/nbt0808-864.html
375
376 .. _dbSNP: https://www.ncbi.nlm.nih.gov/projects/SNP/
377 .. _PathwayMatcher: https://github.com/LuisFranciscoHS/PathwayMatcher
378 .. _Input: https://github.com/LuisFranciscoHS/PathwayMatcher/wiki/Input
379 .. _Output: https://github.com/LuisFranciscoHS/PathwayMatcher/wiki/Output
380
381 </help>
382
383 </tool>