comparison add_kegg_anno_info.xml @ 1:ec9ee8edb84d draft

Initial upload of 21.6.10 release.
author malex
date Fri, 18 Jun 2021 20:23:19 +0000
parents
children 2c218a253d56
comparison
equal deleted inserted replaced
0:864fc6430432 1:ec9ee8edb84d
1 <tool id="secimtools_add_kegg_anno_info" name="Link Name to KEGGID" version="@WRAPPER_VERSION@">
2 <description></description> <!-- Short tool description; intentionally left empty in this wrapper. -->
3 <macros>
4 <import>macros.xml</import> <!-- Pulls in the macro definitions from macros.xml. -->
5 </macros>
6 <expand macro="requirements" /> <!-- Expands the "requirements" macro; its definition is not visible in this file. -->
7 <stdio>
8 <exit_code range="1" level="fatal" description="Repeated Unique IDs"/> <!-- Exit status 1 is reported as a fatal error with this description. NOTE(review): the command tag also sets detect_errors="exit_code"; confirm which of the two settings Galaxy applies. -->
9 </stdio>
10 <command detect_errors="exit_code"><![CDATA[
11 add_kegg_anno_info.py
12 -s='$species'
13 #if $dataSets.whichDataSet == "geneDataset":
14 -ga='$dataSets.geneAnnot'
15 -gid='$dataSets.geneUniqId'
16 -gn='$dataSets.geneName'
17 #end if
18 #if $dataSets.whichDataSet == "metDataset":
19 -ma='$dataSets.metAnnot'
20 -mid='$dataSets.metUniqId'
21 -mn='$dataSets.metName'
22 #end if
23 #if $dataSets.whichDataSet == "geneDataset,metDataset":
24 -ga='$dataSets.geneAnnot'
25 -gid='$dataSets.geneUniqId'
26 -gn='$dataSets.geneName'
27 -ma='$dataSets.metAnnot'
28 -mid='$dataSets.metUniqId'
29 -mn='$dataSets.metName'
30 #end if
31 -go='$geneOutput'
32 -mo='$metOutput'
33 ]]></command> <!-- Builds the add_kegg_anno_info.py call: gene and/or metabolite arguments are emitted according to dataSets.whichDataSet. Every substituted value is single-quoted so that paths or column names containing spaces or shell metacharacters reach the script as a single argument. -->
34 <inputs> <!-- Species selector plus a conditional choosing gene, metabolite, or both annotation datasets. Column-name text fields carry an empty_field validator so a blank name is rejected in the form instead of being passed to the script. -->
35 <param name="species" type="select" label="Select Species from the list" >
36 <option value="hsa">Homo sapiens</option>
37 <option value="mmu">Mus musculus</option>
38 <option value="rno">Rattus norvegicus</option>
39 <option value="dme">Drosophila melanogaster</option>
40 <option value="ath">Arabidopsis thaliana</option>
41 <option value="sce">Saccharomyces cerevisiae</option>
42 <option value="eco">Escherichia coli</option>
43 </param>
44 <conditional name="dataSets">
45 <param name="whichDataSet" type="select" display="radio" label="Select Annotation Dataset(s)">
46 <option value="geneDataset,metDataset" selected="true">Gene Expression + Metabolomic Annotation Datasets</option>
47 <option value="geneDataset">Gene Expression Annotation Dataset</option>
48 <option value="metDataset">Metabolomic Annotation Dataset</option>
49 <validator type="no_options" message="You must select at least one option." />
50 </param>
51 <when value="geneDataset">
52 <param name="geneAnnot" type="data" format="tabular" label="Select the Gene Expression Annotation dataset from your History"/>
53 <param name="geneUniqId" type="text" size="30" value="" label="Gene Unique FeatureID" help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs."><validator type="empty_field" message="A column name is required."/></param>
54 <param name="geneName" type="text" size="30" value="" label="Gene Symbol" help="Name of the column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs."><validator type="empty_field" message="A column name is required."/></param>
55 </when>
56 <when value="metDataset">
57 <param name="metAnnot" type="data" format="tabular" label="Select the Metabolomic Annotation dataset from your History" />
58 <param name="metUniqId" type="text" size="30" value="" label="Metabolite Unique FeatureID" help="Name of the column in your Metabolomic Annotation dataset that contains the unique FeatureIDs."><validator type="empty_field" message="A column name is required."/></param>
59 <param name="metName" type="text" size="30" value="" label="Metabolite Names" help="Name of the column in your Metabolomic Annotation dataset that has metabolite names to use for linking to KEGGIDs."><validator type="empty_field" message="A column name is required."/></param>
60 </when>
61 <when value="geneDataset,metDataset">
62 <param name="geneAnnot" type="data" format="tabular" label="Select the Gene Expression Annotation dataset from your History"/>
63 <param name="geneUniqId" type="text" size="30" value="" label="Gene Unique FeatureID" help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs."><validator type="empty_field" message="A column name is required."/></param>
64 <param name="geneName" type="text" size="30" value="" label="Gene Symbol" help="Name of the Column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs."><validator type="empty_field" message="A column name is required."/></param>
65 <param name="metAnnot" type="data" format="tabular" label="Select the Metabolomic Annotation dataset from your History" />
66 <param name="metUniqId" type="text" size="30" value="" label="Metabolite Unique FeatureID" help="Name of the Column in your Metabolomic Annotation dataset that contains the unique FeatureIDs."><validator type="empty_field" message="A column name is required."/></param>
67 <param name="metName" type="text" size="30" value="" label="Metabolite Names" help="Name of the Column in your Metabolomic Annotation dataset that has metabolite names to use for linking to KEGGIDs."><validator type="empty_field" message="A column name is required."/></param>
68 </when>
69 </conditional>
70 </inputs>
71 <outputs> <!-- Each output is kept only when its dataset type is part of the selected dataSets.whichDataSet value. -->
72 <data name="geneOutput" format="tabular" label="${tool.name} on ${on_string}: Gene to KEGGID link">
73 <filter>dataSets['whichDataSet'] in ('geneDataset', 'geneDataset,metDataset')</filter>
74 </data>
75 <data name="metOutput" format="tabular" label="${tool.name} on ${on_string}: Metabolite to KEGGID link">
76 <filter>dataSets['whichDataSet'] in ('metDataset', 'geneDataset,metDataset')</filter>
77 </data>
78 </outputs>
79 <tests>
80 <test> <!-- Runs the default "gene + metabolite" branch for rat (rno); expected results are declared as output elements so both produced datasets are compared against the reference files. -->
81 <param name="species" value="rno"/>
82 <param name="geneAnnot" value="ensembl2symbol_annotation_file_01fhl.tsv"/>
83 <param name="geneUniqId" value="UniqueID"/>
84 <param name="geneName" value="GeneSymbol"/>
85 <param name="metAnnot" value="metabolite_annotation_file_01fhl.tsv"/>
86 <param name="metUniqId" value="UniqueID"/>
87 <param name="metName" value="MetName"/>
88 <output name="geneOutput" file="gene_link_kegg_annotation_file_01fhl.tsv"/>
89 <output name="metOutput" file="metabolite_link_kegg_annotation_file_01fhl.tsv"/>
90 </test>
91 </tests>
92 <help><![CDATA[
93
94 **Tool Description**
95
96 This tool takes an annotation dataset containing metabolite compound names or gene symbols and links them to identifiers in KEGG (KEGGIDs)
97 creating either a (a) Gene to KEGGID Link or a (b) Metabolite to KEGGID Link dataset. For gene expression data, the tool is designed to
98 take the output from the 'Map ENSEMBLIDs to Gene Symbols' tool as input. If your input dataset contains a Selected column, the tool will
99 link GeneSymbols to KEGGIDs where Selected = 'Yes'. Input Files without a Selected column must have a column containing unique FeatureIDs.
100 This tool takes an annotation dataset containing unique FeatureIDs, ENSEMBLIDs (for gene expression data) and GeneSymbol/MetaboliteName
101 and adds the following columns: 1) Name_in_KEGG, the name found in KEGG, 2) Matched, a column indicating whether a match was found in KEGG,
102 3) KEGGID, the KEGG identifier for the Match, 4) Score, a similarity score representing match similarity (calculated using the Python
103 SequenceMatcher class from difflib) and 5) a Tie column to indicate if a gene symbol or metabolite name matched more than one KEGGID.
104
105 User-specified metabolite names are linked to KEGGIDs by identifying the best match using the following procedure. Common metabolite prefixes
106 are removed (cis-, trans-, d- , l- , (s)-, alpha-, beta-, alpha, beta, alpha-d-, beta-d-, alpha-l-, beta-l-, l-beta-, l-alpha-, d-beta-, d-alpha-).
107 If the metabolite name given is an acid, then the name is modified to the conjugate base by replacing “ic acid”, “icacid” or “ic_acid” with “ate”.
108 If amino acids are given in 1-letter or 3-letter abbreviations, names are modified to the full amino acid name. The following commonly used lipid
109 abbreviations are modified to reflect the full names (SM = sphingomyelin, lysopc = lysophosphatidylcholine, PC = phosphatidylcholine,
110 PE = phosphatidylethanolamine and LysoPE = lysophosphatidylethanolamine). Similarly, abbreviations for other commonly assayed metabolites are
111 modified to reflect the full names (cit = citrate, orn = ornithine, thyr = thyroxine and boc = butoxycarbonyl). The code allows the addition of
112 more synonyms. The user-specified metabolite names are retained in the output dataset for comparisons with the KEGG database.
113
114 Each parsed metabolite name is compared to metabolite names in KEGG. The best match in KEGG based on similarity score is returned. The similarity
115 score (Score column) is based on the longest contiguous matching subsequence that does not contain 'junk' elements where 'junk' elements are defined
116 as duplicates making up more than 1% of a sequence with minimum length of 200 (python SequenceMatcher class from difflib).
117
118 Selected = Yes for the match with the highest similarity score.
119
120 For metabolite names where the best match is tied with at least one other compound in KEGG, all matches are returned. A tie is determined as follows:
121 if the Score is greater than 95% for 2 or more KEGG matches to a metabolite name, then:
122 1) the Tie column = 'Yes' and a warning message will appear
123 2) when tied matches share the same Score, Selected = 'Yes' is assigned by alphabetical order of the Name_in_KEGG column. Note that the user-specified FeatureID and MetaboliteName may not be unique in the resulting output dataset.
124
125 --------------------------------------------------------------------------------
126
127 **INPUT**
128
129 **Annotation File**
130
131 +-------------+--------------+-----+
132 | FeatureID | Name | ... |
133 +=============+==============+=====+
134 | FeatureID_1 | one | ... |
135 +-------------+--------------+-----+
136 | FeatureID_2 | two | ... |
137 +-------------+--------------+-----+
138 | FeatureID_3 | three | ... |
139 +-------------+--------------+-----+
140 | FeatureID_4 | four | ... |
141 +-------------+--------------+-----+
142 | ... | ... | ... |
143 +-------------+--------------+-----+
144
145 **NOTE:** This dataset must contain at least two columns, a column of FeatureIDs and a column containing names (e.g. gene symbol or compound names) to use for linking to KEGGIDs. Other columns may be present in the dataset. The user can use a Gene Expression Annotation dataset, a Metabolomic Annotation dataset or both.
146
147 **Unique FeatureID**
148
149 Name of the column in your gene expression or metabolomic Annotation dataset that contains the Unique FeatureIDs.
150
151 **Gene Symbol or Metabolite Names**
152
153 Name of the column in your gene expression or metabolomic Annotation dataset with the names to use for matching to KEGGIDs.
154
155 --------------------------------------------------------------------------------
156
157 **OUTPUT**
158
159 For each input Annotation file, a TSV file containing the following columns is generated:
160
161 (1) **unique FeatureID:** column from the Annotation dataset containing the unique FeatureIDs.
162 (2) **Name:** column from Annotation dataset used for matching in KEGG.
163 (3) **Feature_Type:** column indicating whether matching was for metabolites or genes.
164 (4) **Matched:** column indicating whether a match in KEGG was found. Yes/No
165 (5) **Name_in_KEGG:** column containing the KEGG name for the match.
166 (6) **KEGGID:** column containing the KEGG identifier for the match.
167 (7) **Similarity:** value indicating the similarity between the given feature and the match in KEGG. Ranges from 0 to 1.
168 (8) **Tie:** in cases where multiple matches are found for a given feature, Tie = yes if the similarity is greater than 95%.
169 (9) **Selected:** for features with multiple matches and different similarity scores, the 'Selected' column = yes for the match with the highest similarity score. For features with multiple matches and the same similarity score, the 'Selected' column = yes based on the alphabetical order of the returned match.
170
171
172 **Example Metabolite to KEGGID Link Table**
173
174 +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+
175 | FeatureID | Name | Feature_Type | Matched | Name_in_KEGG | KEGG_ID | Similarity | Tie | Selected |
176 +=============+============+==============+=========+==============+==========+============+=====+==========+
177 | FeatureID_1 | one | Metabolite | Yes | one* | cpd:... | 1.0 | No | Yes |
178 +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+
179 | FeatureID_2 | two | Metabolite | Yes | two* | cpd:... | 1.0 | No | Yes |
180 +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+
181 | FeatureID_3 | two | Metabolite | Yes | three* | cpd:... | 0.87 | No | No |
182 +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+
183 | FeatureID_4 | four | Metabolite | No | NA | NA | NA | NA | NA |
184 +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+
185 | ... | ... | ... | ... | ... | ... | ... | ... | ... |
186 +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+
187
188 **NOTE:** Warning messages appear in cases of a Tie (greater than 95% similarity). Please check these results carefully.
189
190 ]]>
191 </help>
192 <citations> <!-- All entries are BibTeX and each citation states its type explicitly; author lists use the BibTeX "and" separator. -->
193 <citation type="bibtex">@ARTICLE{Kirpich17secimtools,
194 author = {Alexander S. Kirpich and Miguel Ibarra and Oleksandr Moskalenko and Justin M. Fear and Joseph Gerken and Xinlei Mi and Ali Ashrafi and Alison M. Morse and Lauren M. McIntyre},
195 title = {SECIMTools: A suite of Metabolomics Data Analysis Tools},
196 journal = {BMC Bioinformatics},
197 year = {in press}
198 }</citation>
199 <citation type="bibtex">
200 @article{garcia2010paintomics,
201 title={Paintomics: a web based tool for the joint visualization of transcriptomics and metabolomics data},
202 author={Garc{\'\i}a-Alcalde, Fernando and Garc{\'\i}a-L{\'o}pez, Federico and Dopazo, Joaqu{\'\i}n and Conesa, Ana},
203 journal={Bioinformatics},
204 volume={27},
205 number={1},
206 pages={137--139},
207 year={2010},
208 publisher={Oxford University Press}
209 }</citation>
210 <citation type="bibtex">@article{wu2014mygene,
211 title={MyGene. info: gene annotation query as a service},
212 author={Wu, Chunlei and Mark, Adam and Su, Andrew I},
213 journal={bioRxiv},
214 pages={009332},
215 year={2014},
216 publisher={Cold Spring Harbor Laboratory}
217 }</citation>
218 </citations>
219 </tool>