comparison add_kegg_anno_info.xml @ 2:2c218a253d56 draft default tip

"planemo upload for repository https://github.com/secimTools/gait-gm/tree/main/galaxy commit 758394addb95b09e794132a23a1f7e95ba39df0b"
author malex
date Thu, 29 Jul 2021 20:48:10 +0000
parents ec9ee8edb84d
children
comparison
equal deleted inserted replaced
1:ec9ee8edb84d 2:2c218a253d56
2 <description></description> 2 <description></description>
3 <macros> 3 <macros>
4 <import>macros.xml</import> 4 <import>macros.xml</import>
5 </macros> 5 </macros>
6 <expand macro="requirements" /> 6 <expand macro="requirements" />
7 <stdio> 7 <stdio>
8 <exit_code range="1" level="fatal" description="Repeated Unique IDs"/> 8 <exit_code range="1" level="fatal" description="Repeated Unique IDs"/>
9 </stdio> 9 </stdio>
10 <command detect_errors="exit_code"><![CDATA[ 10 <command detect_errors="exit_code"><![CDATA[
11 add_kegg_anno_info.py 11 add_kegg_anno_info.py
12 -s=$species 12 -s=$species
13 #if $dataSets.whichDataSet == "geneDataset": 13 #if $dataSets.whichDataSet == "geneDataset":
14 -ga=$dataSets.geneAnnot 14 -ga=$dataSets.geneAnnot
38 <option value="rno">Rattus norvegicus</option> 38 <option value="rno">Rattus norvegicus</option>
39 <option value="dme">Drosophila melanogaster</option> 39 <option value="dme">Drosophila melanogaster</option>
40 <option value="ath">Arabidopsis thaliana</option> 40 <option value="ath">Arabidopsis thaliana</option>
41 <option value="sce">Saccharomyces cerevisiae</option> 41 <option value="sce">Saccharomyces cerevisiae</option>
42 <option value="eco">Escherichia coli</option> 42 <option value="eco">Escherichia coli</option>
43 <option value="cel">Caenorhabditis elegans</option>
43 </param> 44 </param>
44 <conditional name="dataSets"> 45 <conditional name="dataSets">
45 <param name="whichDataSet" type="select" display="radio" label="Select Annotation Dataset(s)"> 46 <param name="whichDataSet" type="select" display="radio" label="Select Annotation Dataset(s)">
46 <option value="geneDataset,metDataset" selected="true">Gene Expression + Metabolomic Annotation Datasets</option> 47 <option value="geneDataset,metDataset" selected="true">Gene Expression + Metabolomic Annotation Datasets</option>
47 <option value="geneDataset">Gene Expression Annotation Dataset</option> 48 <option value="geneDataset">Gene Expression Annotation Dataset</option>
48 <option value="metDataset">Metabolomic Annotation Dataset</option> 49 <option value="metDataset">Metabolomic Annotation Dataset</option>
49 <validator type="no_options" message="You must select at least one option." /> 50 <validator type="no_options" message="You must select at least one option." />
50 </param> 51 </param>
51 <when value="geneDataset"> 52 <when value="geneDataset">
52 <param name="geneAnnot" type="data" format="tabular" label="Select the Gene Expression Annotation dataset from your History"/> 53 <param name="geneAnnot" type="data" format="tabular" label="Select the Gene Expression Annotation dataset from your History"/>
53 <param name="geneUniqId" type="text" size="30" value="" label="Gene Unique FeatureID" help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs."/> 54 <param name="geneUniqId" type="text" size="30" value="" label="Gene Unique FeatureID" help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs."/>
54 <param name="geneName" type="text" size="30" value="" label="Gene Symbol" help="Name of the column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs."/> 55 <param name="geneName" type="text" size="30" value="" label="Gene Symbol" help="Name of the column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs."/>
55 </when> 56 </when>
56 <when value="metDataset"> 57 <when value="metDataset">
57 <param name="metAnnot" type="data" format="tabular" label="Select the Metabolomic Annotation dataset from your History" /> 58 <param name="metAnnot" type="data" format="tabular" label="Select the Metabolomic Annotation dataset from your History" />
58 <param name="metUniqId" type="text" size="30" value="" label="Metabolite Unique FeatureID" help="Name of the column in your Metabolomic Annotation dataset that contains the unique FeatureIDs."/> 59 <param name="metUniqId" type="text" size="30" value="" label="Metabolite Unique FeatureID" help="Name of the column in your Metabolomic Annotation dataset that contains the unique FeatureIDs."/>
59 <param name="metName" type="text" size="30" value="" label="Metabolite Names" help="Name of the column in your Metabolomic Annotation dataset that has metabolite names to use for linking to KEGGIDs."/> 60 <param name="metName" type="text" size="30" value="" label="Metabolite Names" help="Name of the column in your Metabolomic Annotation dataset that has metabolite names to use for linking to KEGGIDs."/>
60 </when> 61 </when>
61 <when value="geneDataset,metDataset"> 62 <when value="geneDataset,metDataset">
62 <param name="geneAnnot" type="data" format="tabular" label="Select the Gene Expression Annotation dataset from your History"/> 63 <param name="geneAnnot" type="data" format="tabular" label="Select the Gene Expression Annotation dataset from your History"/>
63 <param name="geneUniqId" type="text" size="30" value="" label="Gene Unique FeatureID" help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs."/> 64 <param name="geneUniqId" type="text" size="30" value="" label="Gene Unique FeatureID" help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs."/>
64 <param name="geneName" type="text" size="30" value="" label="Gene Symbol" help="Name of the Column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs."/> 65 <param name="geneName" type="text" size="30" value="" label="Gene Symbol" help="Name of the Column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs."/>
65 <param name="metAnnot" type="data" format="tabular" label="Select the Metabolomic Annotation dataset from your History" /> 66 <param name="metAnnot" type="data" format="tabular" label="Select the Metabolomic Annotation dataset from your History" />
66 <param name="metUniqId" type="text" size="30" value="" label="Metabolite Unique FeatureID" help="Name of the Column in your Metabolomic Annotation dataset that contains the unique FeatureIDs."/> 67 <param name="metUniqId" type="text" size="30" value="" label="Metabolite Unique FeatureID" help="Name of the Column in your Metabolomic Annotation dataset that contains the unique FeatureIDs."/>
67 <param name="metName" type="text" size="30" value="" label="Metabolite Names" help="Name of the Column in your Metabolomic Annotation dataset that has metabolite names to use for linking to KEGGIDs."/> 68 <param name="metName" type="text" size="30" value="" label="Metabolite Names" help="Name of the Column in your Metabolomic Annotation dataset that has metabolite names to use for linking to KEGGIDs."/>
68 </when> 69 </when>
69 </conditional> 70 </conditional>
70 </inputs> 71 </inputs>
71 <outputs> 72 <outputs>
77 </data> 78 </data>
78 </outputs> 79 </outputs>
79 <tests> 80 <tests>
80 <test> 81 <test>
81 <param name="species" value="rno"/> 82 <param name="species" value="rno"/>
82 <param name="geneAnnot" value="ensembl2symbol_annotation_file_01fhl.tsv"/> 83 <param name="geneAnnot" value="ensembl2symbol_annotation.tsv"/>
83 <param name="geneUniqId" value="UniqueID"/> 84 <param name="geneUniqId" value="UniqueID"/>
84 <param name="geneName" value="GeneSymbol"/> 85 <param name="geneName" value="GeneName"/>
85 <param name="metAnnot" value="metabolite_annotation_file_01fhl.tsv"/> 86 <param name="metAnnot" value="metabolite_annotation.tsv"/>
86 <param name="metUniqId" value="UniqueID"/> 87 <param name="metUniqId" value="UniqueID"/>
87 <param name="metName" value="MetName"/> 88 <param name="metName" value="MetName"/>
88 <param name="geneOutput" value="gene_link_kegg_annotation_file_01fhl.tsv"/> 89 <output name="geneOutput" value="gene_to_keggId_link.tsv" compare="diff" lines_diff="100000"/>
89 <param name="metOutput" value="metabolite_link_kegg_annotation_file_01fhl.tsv"/> 90 <output name="metOutput" value="metabolite_to_keggId_link.tsv" compare="diff" lines_diff="10000"/>
90 </test> 91 </test>
91 </tests> 92 </tests>
92 <help><![CDATA[ 93 <help><![CDATA[
93 94
94 **Tool Description** 95 **Tool Description**
95 96
96 This tool takes an annotation dataset containing metabolite compound names or gene symbols and links them to identifiers in KEGG (KEGGIDs) 97 This tool takes an annotation dataset containing metabolite compound names or gene symbols and links them to identifiers in KEGG (KEGGIDs)
97 creating either a (a) Gene to KEGGID Link or a (b) Metabolite to KEGGID Link dataset. For gene expression data, the tool is designed to 98 creating either:
98 take the output from the 'Map ENSEMBLIDs to Gene Symbols' tool as input. If your input dataset contains a Selected column, the tool will 99 (a) a Gene to KEGGID Link or
99 link GeneSymbols to KEGGIDs where Selected = 'Yes'. Input Files without a Selected column must have a column containing unique FeatureIDs. 100 (b) a Metabolite to KEGGID Link dataset.
100 This tool takes an annotation dataset containing unique FeatureIDs, ENSEMBLIDs (for gene expression data) and GeneSymbol/MetaboliteName 101
101 and adds the following columns: 1) Name_in_KEGG, the name found in KEGG, 2) Matched, a column indicating whether a match was found in KEGG, 102 For gene expression data, the tool is designed to
102 3) KEGGID, the KEGG identifier for the Match, 4) Score, a similarity score representing match similarity (caluclated using the python internal 103 take the output from the 'Map ENSEMBLIDs to Gene Symbols' tool as input. If your input dataset
103 function SequenceMatcher from difflib (check) and 5) a Tie column to indicate if a gene symbol or metabolite name matched more than one KEGGID. 104 contains a Selected column, the tool will
104 105 link GeneSymbols to KEGGIDs where Selected = 'Yes'. Input Files without a Selected column must
105 User-specified metabolite names are linked to KEGGIDs by identifying the best match using the following procedure. Common metabolite prefixes 106 have a column containing unique FeatureIDs.
106 are removed (cis-, trans-, d- , l- , (s)-, alpha-, beta-, alpha, beta, alpha-d-, beta-d-, alpha-l-, beta-l-, l-beta-, l-alpha-, d-beta-, d-alpha-). 107 This tool takes an annotation dataset containing unique FeatureIDs, ENSEMBLIDs (for gene
107 If the metabolite name given is an acid, then the name is modified to the conjugate base by replacing “ic acid”, “icacid” or “ic_acid” with “ate”. 108 expression data) and GeneSymbol/MetaboliteName
108 If amino acids are given in 1-letter or 3-letter abbreviations, names are modified to the full amino acid name. The following commonly used lipid 109 and adds the following columns:
109 abbreviations are modified to reflect the full names (SM = sphingomyelin, lysopc = lysophosphatidylcholine, PC = phosphatidylcholine, 110
110 PE = phosphatidylethanolamine and LysoPE = lysophosphatidylethanolamine). Similarly, abbreviations for other commonly assayed metabolites are 111 1) Name_in_KEGG, the name found in KEGG
111 modified to reflect the full names (cit = citrate, orn = ornithine, thyr = thyroxine and boc = butoxycarbonyl). The code allows the addition of 112 2) Matched, a column indicating whether a match was found in KEGG,
113 3) KEGGID, the KEGG identifier for the Match
114 4) Score, a similarity score representing match similarity (calculated using the Python SequenceMatcher class from difflib)
115 5) a Tie column to indicate if a gene symbol or metabolite name matched more than one KEGGID.
116
117 User-specified metabolite names are linked to KEGGIDs by identifying the best match using the following procedure: Common metabolite prefixes
118 are removed (cis-, trans-, d- , l- , (s)-, alpha-, beta-, alpha, beta, alpha-d-, beta-d-, alpha-l-, beta-l-, l-beta-, l-alpha-, d-beta-, d-alpha-).
119 If the metabolite name given is an acid, then the name is modified to the conjugate base by replacing “ic acid”, “icacid” or “ic_acid” with “ate”.
120 If amino acids are given in 1-letter or 3-letter abbreviations, names are modified to the full amino acid name. The following commonly used lipid
121 abbreviations are modified to reflect the full names (SM = sphingomyelin, lysopc = lysophosphatidylcholine, PC = phosphatidylcholine,
122 PE = phosphatidylethanolamine and LysoPE = lysophosphatidylethanolamine). Similarly, abbreviations for other commonly assayed metabolites are
123 modified to reflect the full names (cit = citrate, orn = ornithine, thyr = thyroxine and boc = butoxycarbonyl). The code allows the addition of
112 more synonyms. The user-specified metabolite names are retained in the output dataset for comparisons with the KEGG database. 124 more synonyms. The user-specified metabolite names are retained in the output dataset for comparisons with the KEGG database.
113 125
114 Each parsed metabolite name is compared to metabolite names in KEGG. The best match in KEGG based on similarity score is returned. The similarity 126 Each parsed metabolite name is compared to metabolite names in KEGG. The best match in KEGG based on similarity score is returned. The similarity
115 score (Score column) is based on the longest contiguous matching subsequence that does not contain 'junk' elements where 'junk' elements are defined 127 score (Score column) is based on the longest contiguous matching subsequence that does not contain 'junk' elements, where 'junk' elements are defined
116 as duplicates making up more than 1% of a sequence with a minimum length of 200 (Python SequenceMatcher class from difflib). 128 as duplicates making up more than 1% of a sequence with a minimum length of 200 (Python SequenceMatcher class from difflib).
117 129
118 Selected = Yes for the match with the highest similarity score. 130 Selected = Yes for the match with the highest similarity score.
119 131
120 For metabolite names where the best match is tied with at least one other compound in KEGG, all matches are returned. A tie is determined as follows: 132 For metabolite names where the best match is tied with at least one other compound in KEGG, all matches are returned. A tie is determined as follows:
121 if the Score is greater than 95% for 2 or more matches in the metabolite name then: 133 if the Score is greater than 95% for 2 or more matches in the metabolite name then:
122 1) the Tie column = 'Yes' and a warning message will appear 134 1) the Tie column = 'Yes' and a warning message will appear
123 2) Selected = Yes is assigned based on the alphabetical order of the Name_in_KEGG column. Note that the user-specified FeatureID and MetaboliteName may not be unique in the resulting output dataset. 135 2) Selected = Yes is assigned based on the alphabetical order of the Name_in_KEGG column. Note that the user-specified FeatureID and MetaboliteName may not be unique in the resulting output dataset.
124 136
125 -------------------------------------------------------------------------------- 137 --------------------------------------------------------------------------------
144 156
145 **NOTE:** This dataset must contain at least two columns, a column of FeatureIDs and a column containing names (e.g. gene symbol or compound names) to use for linking to KEGGIDs. Other columns may be present in the dataset. The user can use a Gene Expression Annotation dataset, a Metabolomic Annotation dataset or both. 157 **NOTE:** This dataset must contain at least two columns, a column of FeatureIDs and a column containing names (e.g. gene symbol or compound names) to use for linking to KEGGIDs. Other columns may be present in the dataset. The user can use a Gene Expression Annotation dataset, a Metabolomic Annotation dataset or both.
146 158
147 **Unique FeatureID** 159 **Unique FeatureID**
148 160
149 Name of the column in your gene expression or metabolomic Annotation dataset that contains the Unique FeatureIDs. 161 Name of the column in your gene expression or metabolomic Annotation dataset that contains the Unique FeatureIDs.
150 162
151 **Gene Symbol or Metabolite Names** 163 **Gene Symbol or Metabolite Names**
152 164
153 Name of the column in your gene expression or metabolomic Annotation dataset with the names to use for matching to KEGGIDs. 165 Name of the column in your gene expression or metabolomic Annotation dataset with the names to use for matching to KEGGIDs.
154 166
155 -------------------------------------------------------------------------------- 167 --------------------------------------------------------------------------------
156 168
157 **OUTPUT** 169 **OUTPUT**
158 170
163 (3) **Feature_Type:** column indicating whether matching was for metabolites or genes. 175 (3) **Feature_Type:** column indicating whether matching was for metabolites or genes.
164 (4) **Matched:** column indicating whether a match in KEGG was found. Yes/No 176 (4) **Matched:** column indicating whether a match in KEGG was found. Yes/No
165 (5) **Name_in_KEGG:** column containing the KEGG name for the match. 177 (5) **Name_in_KEGG:** column containing the KEGG name for the match.
166 (6) **KEGGID:** column containing the KEGG identifier for the match. 178 (6) **KEGGID:** column containing the KEGG identifier for the match.
167 (7) **Similarity:** value indicating the similarity between the given feature and the match in KEGG. Ranges from 0 to 1. 179 (7) **Similarity:** value indicating the similarity between the given feature and the match in KEGG. Ranges from 0 to 1.
168 (8) **Tie:** in cases where multiple matches are found for a given feature, Tie = yes if the similarity is greater than 95%. 180 (8) **Tie:** in cases where multiple matches are found for a given feature, Tie = yes if the similarity is greater than 95%.
169 (9) **Selected:** for features with multiple matches and different similarity scores, the 'Selected' column = yes for the match with the highest similarity score. For features with multiple matches and the same similarity score, the 'Selected' column = yes based on the alphabetical order of the returned match. 181 (9) **Selected:** for features with multiple matches and different similarity scores, the 'Selected' column = yes for the match with the highest similarity score. For features with multiple matches and the same similarity score, the 'Selected' column = yes based on the alphabetical order of the returned match.
170 182
171 183
172 **Example Metabolite to KEGGID Link Table** 184 **Example Metabolite to KEGGID Link Table**
173 185
192 <citations> 204 <citations>
193 <citation type="bibtex">@ARTICLE{Kirpich17secimtools, 205 <citation type="bibtex">@ARTICLE{Kirpich17secimtools,
194 author = {Alexander S. Kirpich and Miguel Ibarra and Oleksandr Moskalenko and Justin M. Fear and Joseph Gerken and Xinlei Mi and Ali Ashrafi and Alison M. Morse and Lauren M. McIntyre}, 206 author = {Alexander S. Kirpich and Miguel Ibarra and Oleksandr Moskalenko and Justin M. Fear and Joseph Gerken and Xinlei Mi and Ali Ashrafi and Alison M. Morse and Lauren M. McIntyre},
195 title = {SECIMTools: A suite of Metabolomics Data Analysis Tools}, 207 title = {SECIMTools: A suite of Metabolomics Data Analysis Tools},
196 journal = {BMC Bioinformatics}, 208 journal = {BMC Bioinformatics},
197 year = {in press} 209 year = {2018}
198 }</citation> 210 }</citation>
199 <citation type="bibtex"> 211 <citation type="bibtex">@article{Mor2021GaitGM,
200 @article{garcia2010paintomics, 212 title={GAIT-GM integrative cross-omics analyses reveal cholinergic defects in a C. elegans model of Parkinson's disease},
201 title={Paintomics: a web based tool for the joint visualization of transcriptomics and metabolomics data}, 213 author={Mor, DE and Huertas, F and Morse, AM and Kaletsky, R and Murphy, CT and Kalia, V and Miller, GW and Moskalenko, O and Conesa, A and McIntyre, LM},
202 author={Garc{\'\i}a-Alcalde, Fernando and Garc{\'\i}a-L{\'o}pez, Federico and Dopazo, Joaqu{\'\i}n and Conesa, Ana}, 214 journal={BMC Genomics},
203 journal={Bioinformatics}, 215 year={submitted},
204 volume={27},
205 number={1},
206 pages={137--139},
207 year={2010},
208 publisher={Oxford University Press}
209 }</citation>
210 <citation>@article{wu2014mygene,
211 title={MyGene. info: gene annotation query as a service},
212 author={Wu, Chunlei and Mark, Adam and Su, Andrew I},
213 journal={bioRxiv},
214 pages={009332},
215 year={2014},
216 publisher={Cold Spring Harbor Laboratory}
217 }</citation> 216 }</citation>
218 </citations> 217 </citations>
219 </tool> 218 </tool>