1
|
1 <tool id="secimtools_add_kegg_anno_info" name="Link Name to KEGGID" version="@WRAPPER_VERSION@">
|
|
<description></description>
<!-- Shared definitions come from macros.xml; the "requirements" macro expanded
     below is presumably declared there (confirm against macros.xml). -->
<macros>
    <import>macros.xml</import>
</macros>
<expand macro="requirements"/>
<!-- An exit status of 1 is reported as a fatal error with the description given here. -->
<stdio>
    <exit_code level="fatal" range="1" description="Repeated Unique IDs"/>
</stdio>
|
|
<!-- Builds the add_kegg_anno_info.py command line.
     The gene (-ga/-gid/-gn) and metabolite (-ma/-mid/-mn) argument groups are
     emitted according to the value of dataSets.whichDataSet, which is one of
     "geneDataset", "metDataset" or "geneDataset,metDataset".
     Every substituted value is single-quoted so that column names or paths
     containing whitespace or shell metacharacters reach the script as a
     single argument. -->
<command detect_errors="exit_code"><![CDATA[
    ## Split the selector value so each dataset kind can be tested on its own.
    #set $selected = str($dataSets.whichDataSet).split(',')
    add_kegg_anno_info.py
        -s='$species'
        #if 'geneDataset' in $selected:
            -ga='$dataSets.geneAnnot'
            -gid='$dataSets.geneUniqId'
            -gn='$dataSets.geneName'
        #end if
        #if 'metDataset' in $selected:
            -ma='$dataSets.metAnnot'
            -mid='$dataSets.metUniqId'
            -mn='$dataSets.metName'
        #end if
        -go='$geneOutput'
        -mo='$metOutput'
]]></command>
|
|
<inputs>
    <!-- Species code; passed to the script as -s. -->
    <param name="species" type="select" label="Select Species from the list">
        <option value="hsa">Homo sapiens</option>
        <option value="mmu">Mus musculus</option>
        <option value="rno">Rattus norvegicus</option>
        <option value="dme">Drosophila melanogaster</option>
        <option value="ath">Arabidopsis thaliana</option>
        <option value="sce">Saccharomyces cerevisiae</option>
        <option value="eco">Escherichia coli</option>
    </param>
    <!-- Selects which annotation dataset(s) to link; each case exposes the
         dataset, unique-ID column and name column for the chosen kind(s). -->
    <conditional name="dataSets">
        <param name="whichDataSet"
               type="select"
               display="radio"
               label="Select Annotation Dataset(s)">
            <option value="geneDataset,metDataset" selected="true">Gene Expression + Metabolomic Annotation Datasets</option>
            <option value="geneDataset">Gene Expression Annotation Dataset</option>
            <option value="metDataset">Metabolomic Annotation Dataset</option>
            <validator type="no_options" message="You must select at least one option."/>
        </param>
        <!-- Gene expression annotation only. -->
        <when value="geneDataset">
            <param name="geneAnnot"
                   type="data"
                   format="tabular"
                   label="Select the Gene Expression Annotation dataset from your History"/>
            <param name="geneUniqId"
                   type="text"
                   size="30"
                   value=""
                   label="Gene Unique FeatureID"
                   help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs."/>
            <param name="geneName"
                   type="text"
                   size="30"
                   value=""
                   label="Gene Symbol"
                   help="Name of the column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs."/>
        </when>
        <!-- Metabolomic annotation only. -->
        <when value="metDataset">
            <param name="metAnnot"
                   type="data"
                   format="tabular"
                   label="Select the Metabolomic Annotation dataset from your History"/>
            <param name="metUniqId"
                   type="text"
                   size="30"
                   value=""
                   label="Metabolite Unique FeatureID"
                   help="Name of the column in your Metabolomic Annotation dataset that contains the unique FeatureIDs."/>
            <param name="metName"
                   type="text"
                   size="30"
                   value=""
                   label="Metabolite Names"
                   help="Name of the column in your Metabolomic Annotation dataset that has metabolite names to use for linking to KEGGIDs."/>
        </when>
        <!-- Both annotation datasets. -->
        <when value="geneDataset,metDataset">
            <param name="geneAnnot"
                   type="data"
                   format="tabular"
                   label="Select the Gene Expression Annotation dataset from your History"/>
            <param name="geneUniqId"
                   type="text"
                   size="30"
                   value=""
                   label="Gene Unique FeatureID"
                   help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs."/>
            <param name="geneName"
                   type="text"
                   size="30"
                   value=""
                   label="Gene Symbol"
                   help="Name of the Column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs."/>
            <param name="metAnnot"
                   type="data"
                   format="tabular"
                   label="Select the Metabolomic Annotation dataset from your History"/>
            <param name="metUniqId"
                   type="text"
                   size="30"
                   value=""
                   label="Metabolite Unique FeatureID"
                   help="Name of the Column in your Metabolomic Annotation dataset that contains the unique FeatureIDs."/>
            <param name="metName"
                   type="text"
                   size="30"
                   value=""
                   label="Metabolite Names"
                   help="Name of the Column in your Metabolomic Annotation dataset that has metabolite names to use for linking to KEGGIDs."/>
        </when>
    </conditional>
</inputs>
|
|
<outputs>
    <!-- Gene link table: kept when the gene dataset is part of the selection. -->
    <data name="geneOutput"
          format="tabular"
          label="${tool.name} on ${on_string}: Gene to KEGGID link">
        <filter>dataSets['whichDataSet'] in ('geneDataset', 'geneDataset,metDataset')</filter>
    </data>
    <!-- Metabolite link table: kept when the metabolite dataset is part of the selection. -->
    <data name="metOutput"
          format="tabular"
          label="${tool.name} on ${on_string}: Metabolite to KEGGID link">
        <filter>dataSets['whichDataSet'] in ('metDataset', 'geneDataset,metDataset')</filter>
    </data>
</outputs>
|
|
<tests>
    <!-- Runs with the default dataset selection ("geneDataset,metDataset"),
         so both annotation inputs are supplied and both outputs are compared
         against the expected files. -->
    <test>
        <param name="species" value="rno"/>
        <param name="geneAnnot" value="ensembl2symbol_annotation_file_01fhl.tsv"/>
        <param name="geneUniqId" value="UniqueID"/>
        <param name="geneName" value="GeneSymbol"/>
        <param name="metAnnot" value="metabolite_annotation_file_01fhl.tsv"/>
        <param name="metUniqId" value="UniqueID"/>
        <param name="metName" value="MetName"/>
        <!-- Expected results are declared as outputs (not params) so they are actually verified. -->
        <output name="geneOutput" file="gene_link_kegg_annotation_file_01fhl.tsv"/>
        <output name="metOutput" file="metabolite_link_kegg_annotation_file_01fhl.tsv"/>
    </test>
</tests>
|
|
92 <help><![CDATA[
|
|
93
|
|
94 **Tool Description**
|
|
95
|
|
96 This tool takes an annotation dataset containing metabolite compound names or gene symbols and links them to identifiers in KEGG (KEGGIDs)
|
|
97 creating either a (a) Gene to KEGGID Link or a (b) Metabolite to KEGGID Link dataset. For gene expression data, the tool is designed to
|
|
98 take the output from the 'Map ENSEMBLIDs to Gene Symbols' tool as input. If your input dataset contains a Selected column, the tool will
|
|
99 link GeneSymbols to KEGGIDs where Selected = 'Yes'. Input Files without a Selected column must have a column containing unique FeatureIDs.
|
|
100 This tool takes an annotation dataset containing unique FeatureIDs, ENSEMBLIDs (for gene expression data) and GeneSymbol/MetaboliteName
|
|
101 and adds the following columns: 1) Name_in_KEGG, the name found in KEGG, 2) Matched, a column indicating whether a match was found in KEGG,
|
|
102 3) KEGGID, the KEGG identifier for the Match, 4) Score, a similarity score representing match similarity (calculated using the Python
|
|
103 SequenceMatcher class from difflib) and 5) a Tie column to indicate if a gene symbol or metabolite name matched more than one KEGGID.
|
|
104
|
|
105 User-specified metabolite names are linked to KEGGIDs by identifying the best match using the following procedure. Common metabolite prefixes
|
|
106 are removed (cis-, trans-, d- , l- , (s)-, alpha-, beta-, alpha, beta, alpha-d-, beta-d-, alpha-l-, beta-l-, l-beta-, l-alpha-, d-beta-, d-alpha-).
|
|
107 If the metabolite name given is an acid, then the name is modified to the conjugate base by replacing “ic acid”, “icacid” or “ic_acid” with “ate”.
|
|
108 If amino acids are given in 1-letter or 3-letter abbreviations, names are modified to the full amino acid name. The following commonly used lipid
|
|
109 abbreviations are modified to reflect the full names (SM = sphingomyelin, lysopc = lysophosphatidylcholine, PC = phosphatidylcholine,
|
|
110 PE = phosphatidylethanolamine and LysoPE = lysophosphatidylethanolamine). Similarly, abbreviations for other commonly assayed metabolites are
|
|
111 modified to reflect the full names (cit = citrate, orn = ornithine, thyr = thyroxine and boc = butoxycarbonyl). The code allows the addition of
|
|
112 more synonyms. The user-specified metabolite names are retained in the output dataset for comparisons with the KEGG database.
|
|
113
|
|
114 Each parsed metabolite name is compared to metabolite names in KEGG. The best match in KEGG based on similarity score is returned. The similarity
|
|
115 score (Score column) is based on the longest contiguous matching subsequence that does not contain 'junk' elements where 'junk' elements are defined
|
|
116 as duplicates making up more than 1% of a sequence with minimum length of 200 (Python SequenceMatcher class from difflib).
|
|
117
|
|
118 Selected = Yes for the match with the highest similarity score.
|
|
119
|
|
120 For metabolite names where the best match is tied with at least one other compound in KEGG, all matches are returned. A tie is determined as follows:
|
|
121 if the Score is greater than 95% for 2 or more matches in the metabolite name then:
|
|
122 1) the Tie column = 'Yes' and a warning message will appear
|
|
123 2) the Selected column is sorted alphabetically on the Name_in_KEGG column. Note that the user-specified FeatureID and MetaboliteName may not be unique in the resulting output dataset.
|
|
124
|
|
125 --------------------------------------------------------------------------------
|
|
126
|
|
127 **INPUT**
|
|
128
|
|
129 **Annotation File**
|
|
130
|
|
131 +-------------+--------------+-----+
|
|
132 | FeatureID | Name | ... |
|
|
133 +=============+==============+=====+
|
|
134 | FeatureID_1 | one | ... |
|
|
135 +-------------+--------------+-----+
|
|
136 | FeatureID_2 | two | ... |
|
|
137 +-------------+--------------+-----+
|
|
138 | FeatureID_3 | three | ... |
|
|
139 +-------------+--------------+-----+
|
|
140 | FeatureID_4 | four | ... |
|
|
141 +-------------+--------------+-----+
|
|
142 | ... | ... | ... |
|
|
143 +-------------+--------------+-----+
|
|
144
|
|
145 **NOTE:** This dataset must contain at least two columns, a column of FeatureIDs and a column containing names (e.g. gene symbol or compound names) to use for linking to KEGGIDs. Other columns may be present in the dataset. The user can use a Gene Expression Annotation dataset, a Metabolomic Annotation dataset or both.
|
|
146
|
|
147 **Unique FeatureID**
|
|
148
|
|
149 Name of the column in your gene expression or metabolomic Annotation dataset that contains the Unique FeatureIDs.
|
|
150
|
|
151 **Gene Symbol or Metabolite Names**
|
|
152
|
|
153 Name of the column in your gene expression or metabolomic Annotation dataset with the names to use for matching to KEGGIDs.
|
|
154
|
|
155 --------------------------------------------------------------------------------
|
|
156
|
|
157 **OUTPUT**
|
|
158
|
|
159 For each input Annotation file, a TSV file containing the following columns is generated:
|
|
160
|
|
161 (1) **unique FeatureID:** column from the Annotation dataset containing the unique FeatureIDs.
|
|
162 (2) **Name:** column from Annotation dataset used for matching in KEGG.
|
|
163 (3) **Feature_Type:** column indicating whether matching was for metabolites or genes.
|
|
164 (4) **Matched:** column indicating whether a match in KEGG was found. Yes/No
|
|
165 (5) **Name_in_KEGG:** column containing the KEGG name for the match.
|
|
166 (6) **KEGGID:** column containing the KEGG identifier for the match.
|
|
167 (7) **Similarity:** value indicating the similarity between the given feature and the match in KEGG. Ranges from 0 to 1.
|
|
168 (8) **Tie:** in cases where multiple matches are found for a given feature, Tie = yes if the similarity is greater than 95%.
|
|
169 (9) **Selected:** for features with multiple matches and different similarity scores, the 'Selected' column = yes for the match with the highest similarity score. For features with multiple matches and the same similarity score, the 'Selected' column = yes based on the alphabetical order of the returned match.
|
|
170
|
|
171
|
|
172 **Example Metabolite to KEGGID Link Table**
|
|
173
|
|
174 +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+
|
|
175 | FeatureID | Name | Feature_Type | Matched | Name_in_KEGG | KEGG_ID | Similarity | Tie | Selected |
|
|
176 +=============+============+==============+=========+==============+==========+============+=====+==========+
|
|
177 | FeatureID_1 | one | Metabolite | Yes | one* | cpd:... | 1.0 | No | Yes |
|
|
178 +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+
|
|
179 | FeatureID_2 | two | Metabolite | Yes | two* | cpd:... | 1.0 | No | Yes |
|
|
180 +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+
|
|
181 | FeatureID_3 | two | Metabolite | Yes | three* | cpd:... | 0.87 | No | No |
|
|
182 +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+
|
|
183 | FeatureID_4 | four | Metabolite | No | NA | NA | NA | NA | NA |
|
|
184 +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+
|
|
185 | ... | ... | ... | ... | ... | ... | ... | ... | ... |
|
|
186 +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+
|
|
187
|
|
188 **NOTE:** Warning messages appear in cases of a Tie (greater than 95% similarity). Please check these results carefully.
|
|
189
|
|
190 ]]>
|
|
191 </help>
|
|
<!-- References shown for this tool. Every entry is BibTeX, so each citation
     carries type="bibtex"; author lists use " and " as the separator, as
     BibTeX requires. -->
<citations>
    <citation type="bibtex">@ARTICLE{Kirpich17secimtools,
    author = {Alexander S. Kirpich and Miguel Ibarra and Oleksandr Moskalenko and Justin M. Fear and Joseph Gerken and Xinlei Mi and Ali Ashrafi and Alison M. Morse and Lauren M. McIntyre},
    title = {SECIMTools: A suite of Metabolomics Data Analysis Tools},
    journal = {BMC Bioinformatics},
    year = {in press}
    }</citation>
    <citation type="bibtex">
    @article{garcia2010paintomics,
    title={Paintomics: a web based tool for the joint visualization of transcriptomics and metabolomics data},
    author={Garc{\'\i}a-Alcalde, Fernando and Garc{\'\i}a-L{\'o}pez, Federico and Dopazo, Joaqu{\'\i}n and Conesa, Ana},
    journal={Bioinformatics},
    volume={27},
    number={1},
    pages={137--139},
    year={2010},
    publisher={Oxford University Press}
    }</citation>
    <citation type="bibtex">@article{wu2014mygene,
    title={MyGene.info: gene annotation query as a service},
    author={Wu, Chunlei and Mark, Adam and Su, Andrew I},
    journal={bioRxiv},
    pages={009332},
    year={2014},
    publisher={Cold Spring Harbor Laboratory}
    }</citation>
</citations>
|
|
219 </tool>
|