Mercurial > repos > malex > gait_gm
comparison add_kegg_anno_info.xml @ 2:2c218a253d56 draft default tip
"planemo upload for repository https://github.com/secimTools/gait-gm/tree/main/galaxy commit 758394addb95b09e794132a23a1f7e95ba39df0b"
author | malex |
---|---|
date | Thu, 29 Jul 2021 20:48:10 +0000 |
parents | ec9ee8edb84d |
children |
comparison
equal
deleted
inserted
replaced
1:ec9ee8edb84d | 2:2c218a253d56 |
---|---|
2 <description></description> | 2 <description></description> |
3 <macros> | 3 <macros> |
4 <import>macros.xml</import> | 4 <import>macros.xml</import> |
5 </macros> | 5 </macros> |
6 <expand macro="requirements" /> | 6 <expand macro="requirements" /> |
7 <stdio> | 7 <stdio> |
8 <exit_code range="1" level="fatal" description="Repeated Unique IDs"/> | 8 <exit_code range="1" level="fatal" description="Repeated Unique IDs"/> |
9 </stdio> | 9 </stdio> |
10 <command detect_errors="exit_code"><![CDATA[ | 10 <command detect_errors="exit_code"><![CDATA[ |
11 add_kegg_anno_info.py | 11 add_kegg_anno_info.py |
12 -s=$species | 12 -s=$species |
13 #if $dataSets.whichDataSet == "geneDataset": | 13 #if $dataSets.whichDataSet == "geneDataset": |
14 -ga=$dataSets.geneAnnot | 14 -ga=$dataSets.geneAnnot |
38 <option value="rno">Rattus norvegicus</option> | 38 <option value="rno">Rattus norvegicus</option> |
39 <option value="dme">Drosophila melanogaster</option> | 39 <option value="dme">Drosophila melanogaster</option> |
40 <option value="ath">Arabidopsis thaliana</option> | 40 <option value="ath">Arabidopsis thaliana</option> |
41 <option value="sce">Saccharomyces cerevisiae</option> | 41 <option value="sce">Saccharomyces cerevisiae</option> |
42 <option value="eco">Escherichia coli</option> | 42 <option value="eco">Escherichia coli</option> |
43 <option value="cel">Caenorhabditis elegans</option> | |
43 </param> | 44 </param> |
44 <conditional name="dataSets"> | 45 <conditional name="dataSets"> |
45 <param name="whichDataSet" type="select" display="radio" label="Select Annotation Dataset(s)"> | 46 <param name="whichDataSet" type="select" display="radio" label="Select Annotation Dataset(s)"> |
46 <option value="geneDataset,metDataset" selected="true">Gene Expression + Metabolomic Annotation Datasets</option> | 47 <option value="geneDataset,metDataset" selected="true">Gene Expression + Metabolomic Annotation Datasets</option> |
47 <option value="geneDataset">Gene Expression Annotation Dataset</option> | 48 <option value="geneDataset">Gene Expression Annotation Dataset</option> |
48 <option value="metDataset">Metabolomic Annotation Dataset</option> | 49 <option value="metDataset">Metabolomic Annotation Dataset</option> |
49 <validator type="no_options" message="You must select at least one option." /> | 50 <validator type="no_options" message="You must select at least one option." /> |
50 </param> | 51 </param> |
51 <when value="geneDataset"> | 52 <when value="geneDataset"> |
52 <param name="geneAnnot" type="data" format="tabular" label="Select the Gene Expression Annotation dataset from your History"/> | 53 <param name="geneAnnot" type="data" format="tabular" label="Select the Gene Expression Annotation dataset from your History"/> |
53 <param name="geneUniqId" type="text" size="30" value="" label="Gene Unique FeatureID" help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs."/> | 54 <param name="geneUniqId" type="text" size="30" value="" label="Gene Unique FeatureID" help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs."/> |
54 <param name="geneName" type="text" size="30" value="" label="Gene Symbol" help="Name of the column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs."/> | 55 <param name="geneName" type="text" size="30" value="" label="Gene Symbol" help="Name of the column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs."/> |
55 </when> | 56 </when> |
56 <when value="metDataset"> | 57 <when value="metDataset"> |
57 <param name="metAnnot" type="data" format="tabular" label="Select the Metabolomic Annotation dataset from your History" /> | 58 <param name="metAnnot" type="data" format="tabular" label="Select the Metabolomic Annotation dataset from your History" /> |
58 <param name="metUniqId" type="text" size="30" value="" label="Metabolite Unique FeatureID" help="Name of the column in your Metabolomic Annotation dataset that contains the unique FeatureIDs."/> | 59 <param name="metUniqId" type="text" size="30" value="" label="Metabolite Unique FeatureID" help="Name of the column in your Metabolomic Annotation dataset that contains the unique FeatureIDs."/> |
59 <param name="metName" type="text" size="30" value="" label="Metabolite Names" help="Name of the column in your Metabolomic Annotation dataset that has metabolite names to use for linking to KEGGIDs."/> | 60 <param name="metName" type="text" size="30" value="" label="Metabolite Names" help="Name of the column in your Metabolomic Annotation dataset that has metabolite names to use for linking to KEGGIDs."/> |
60 </when> | 61 </when> |
61 <when value="geneDataset,metDataset"> | 62 <when value="geneDataset,metDataset"> |
62 <param name="geneAnnot" type="data" format="tabular" label="Select the Gene Expression Annotation dataset from your History"/> | 63 <param name="geneAnnot" type="data" format="tabular" label="Select the Gene Expression Annotation dataset from your History"/> |
63 <param name="geneUniqId" type="text" size="30" value="" label="Gene Unique FeatureID" help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs."/> | 64 <param name="geneUniqId" type="text" size="30" value="" label="Gene Unique FeatureID" help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs."/> |
64 <param name="geneName" type="text" size="30" value="" label="Gene Symbol" help="Name of the Column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs."/> | 65 <param name="geneName" type="text" size="30" value="" label="Gene Symbol" help="Name of the Column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs."/> |
65 <param name="metAnnot" type="data" format="tabular" label="Select the Metabolomic Annotation dataset from your History" /> | 66 <param name="metAnnot" type="data" format="tabular" label="Select the Metabolomic Annotation dataset from your History" /> |
66 <param name="metUniqId" type="text" size="30" value="" label="Metabolite Unique FeatureID" help="Name of the Column in your Metabolomic Annotation dataset that contains the unique FeatureIDs."/> | 67 <param name="metUniqId" type="text" size="30" value="" label="Metabolite Unique FeatureID" help="Name of the Column in your Metabolomic Annotation dataset that contains the unique FeatureIDs."/> |
67 <param name="metName" type="text" size="30" value="" label="Metabolite Names" help="Name of the Column in your Metabolomic Annotation dataset that has metabolite names to use for linking to KEGGIDs."/> | 68 <param name="metName" type="text" size="30" value="" label="Metabolite Names" help="Name of the Column in your Metabolomic Annotation dataset that has metabolite names to use for linking to KEGGIDs."/> |
68 </when> | 69 </when> |
69 </conditional> | 70 </conditional> |
70 </inputs> | 71 </inputs> |
71 <outputs> | 72 <outputs> |
77 </data> | 78 </data> |
78 </outputs> | 79 </outputs> |
79 <tests> | 80 <tests> |
80 <test> | 81 <test> |
81 <param name="species" value="rno"/> | 82 <param name="species" value="rno"/> |
82 <param name="geneAnnot" value="ensembl2symbol_annotation_file_01fhl.tsv"/> | 83 <param name="geneAnnot" value="ensembl2symbol_annotation.tsv"/> |
83 <param name="geneUniqId" value="UniqueID"/> | 84 <param name="geneUniqId" value="UniqueID"/> |
84 <param name="geneName" value="GeneSymbol"/> | 85 <param name="geneName" value="GeneName"/> |
85 <param name="metAnnot" value="metabolite_annotation_file_01fhl.tsv"/> | 86 <param name="metAnnot" value="metabolite_annotation.tsv"/> |
86 <param name="metUniqId" value="UniqueID"/> | 87 <param name="metUniqId" value="UniqueID"/> |
87 <param name="metName" value="MetName"/> | 88 <param name="metName" value="MetName"/> |
88 <param name="geneOutput" value="gene_link_kegg_annotation_file_01fhl.tsv"/> | 89 <output name="geneOutput" value="gene_to_keggId_link.tsv" compare="diff" lines_diff="100000"/> |
89 <param name="metOutput" value="metabolite_link_kegg_annotation_file_01fhl.tsv"/> | 90 <output name="metOutput" value="metabolite_to_keggId_link.tsv" compare="diff" lines_diff="10000"/> |
90 </test> | 91 </test> |
91 </tests> | 92 </tests> |
92 <help><![CDATA[ | 93 <help><![CDATA[ |
93 | 94 |
94 **Tool Description** | 95 **Tool Description** |
95 | 96 |
96 This tool takes an annotation dataset containing metabolite compound names or gene symbols and links them to identifiers in KEGG (KEGGIDs) | 97 This tool takes an annotation dataset containing metabolite compound names or gene symbols and links them to identifiers in KEGG (KEGGIDs) |
97 creating either a (a) Gene to KEGGID Link or a (b) Metabolite to KEGGID Link dataset. For gene expression data, the tool is designed to | 98 creating either: |
98 take the output from the 'Map ENSEMBLIDs to Gene Symbols' tool as input. If your input dataset contains a Selected column, the tool will | 99 (a) a Gene to KEGGID Link or |
99 link GeneSymbols to KEGGIDs where Selected = 'Yes'. Input Files without a Selected column must have a column containing unique FeatureIDs. | 100 (b) a Metabolite to KEGGID Link dataset. |
100 This tool takes an annotation dataset containing unique FeatureIDs, ENSEMBLIDs (for gene expression data) and GeneSymbol/MetaboliteName | 101 |
101 and adds the following columns: 1) Name_in_KEGG, the name found in KEGG, 2) Matched, a column indicating whether a match was found in KEGG, | 102 For gene expression data, the tool is designed to |
102 3) KEGGID, the KEGG identifier for the Match, 4) Score, a similarity score representing match similarity (caluclated using the python internal | 103 take the output from the 'Map ENSEMBLIDs to Gene Symbols' tool as input. If your input dataset |
103 function SequenceMatcher from difflib (check) and 5) a Tie column to indicate if a gene symbol or metabolite name matched more than one KEGGID. | 104 contains a Selected column, the tool will |
104 | 105 link GeneSymbols to KEGGIDs where Selected = 'Yes'. Input Files without a Selected column must |
105 User-specified metabolite names are linked to KEGGIDs by identifying the best match using the following procedure. Common metabolite prefixes | 106 have a column containing unique FeatureIDs. |
106 are removed (cis-, trans-, d- , l- , (s)-, alpha-, beta-, alpha, beta, alpha-d-, beta-d-, alpha-l-, beta-l-, l-beta-, l-alpha-, d-beta-, d-alpha-). | 107 This tool takes an annotation dataset containing unique FeatureIDs, ENSEMBLIDs (for gene |
107 If the metabolite name given is an acid, then the name is modified to the conjugate base by replacing “ic acid”, “icacid” or “ic_acid” with “ate”. | 108 expression data) and GeneSymbol/MetaboliteName |
108 If amino acids are given in 1-letter or 3-letter abbreviations, names are modified to the full amino acid name. The following commonly used lipid | 109 and adds the following columns: |
109 abbreviations are modified to reflect the full names (SM = sphingomyelin, lysopc = lysophosphatidylcholine, PC = phosphatidylcholine, | 110 |
110 PE = phosphatidylethanolamine and LysoPE = lysophosphatidylethanolamine). Similarly, abbreviations for other commonly assayed metabolites are | 111 1) Name_in_KEGG, the name found in KEGG |
111 modified to reflect the full names (cit = citrate, orn = ornithine, thyr = thyroxine and boc = butoxycarbonyl). The code allows the addition of | 112 2) Matched, a column indicating whether a match was found in KEGG, |
113 3) KEGGID, the KEGG identifier for the Match | |
114 4) Score, a similarity score representing match similarity (calculated using the SequenceMatcher class from Python's difflib module) | |
115 5) a Tie column to indicate if a gene symbol or metabolite name matched more than one KEGGID. | |
116 | |
117 User-specified metabolite names are linked to KEGGIDs by identifying the best match using the following procedure: Common metabolite prefixes | |
118 are removed (cis-, trans-, d- , l- , (s)-, alpha-, beta-, alpha, beta, alpha-d-, beta-d-, alpha-l-, beta-l-, l-beta-, l-alpha-, d-beta-, d-alpha-). | |
119 If the metabolite name given is an acid, then the name is modified to the conjugate base by replacing “ic acid”, “icacid” or “ic_acid” with “ate”. | |
120 If amino acids are given in 1-letter or 3-letter abbreviations, names are modified to the full amino acid name. The following commonly used lipid | |
121 abbreviations are modified to reflect the full names (SM = sphingomyelin, lysopc = lysophosphatidylcholine, PC = phosphatidylcholine, | |
122 PE = phosphatidylethanolamine and LysoPE = lysophosphatidylethanolamine). Similarly, abbreviations for other commonly assayed metabolites are | |
123 modified to reflect the full names (cit = citrate, orn = ornithine, thyr = thyroxine and boc = butoxycarbonyl). The code allows the addition of | |
112 more synonyms. The user-specified metabolite names are retained in the output dataset for comparisons with the KEGG database. | 124 more synonyms. The user-specified metabolite names are retained in the output dataset for comparisons with the KEGG database. |
113 | 125 |
114 Each parsed metabolite name is compared to metabolite names in KEGG. The best match in KEGG based on similarity score is returned. The similarity | 126 Each parsed metabolite name is compared to metabolite names in KEGG. The best match in KEGG based on similarity score is returned. The similarity |
115 score (Score column) is based on the longest contiguous matching subsequence that does not contain 'junk' elements where 'junk' elements are defined | 127 score (Score column) is based on the longest contiguous matching subsequence that does not contain 'junk' elements, where 'junk' elements are defined |
116 as duplicates making up more than 1% of a sequence with minimum length of 200 (python SequenceMatcher class from difflib). | 128 as duplicates making up more than 1% of a sequence with minimum length of 200 (python SequenceMatcher class from difflib). |
117 | 129 |
118 Selected = Yes for the match with the highest similarity score. | 130 Selected = Yes for the match with the highest similarity score. |
119 | 131 |
120 For metabolite names where the best match is tied with at least one other compound in KEGG, all matches are returned. A tie is determined as follows: | 132 For metabolite names where the best match is tied with at least one other compound in KEGG, all matches are returned. A tie is determined as follows: |
121 if the Score is greater than 95% for 2 or more matches to the metabolite name, then: | 133 if the Score is greater than 95% for 2 or more matches to the metabolite name, then: |
122 1) the Tie column = 'Yes' and a warning message will appear | 134 1) the Tie column = 'Yes' and a warning message will appear |
123 2) the Selected column is sorted alphabetically on the Name_in_KEGG column. Note that the user-specified FeatureID and MetaboliteName may not be unique in the resulting output dataset. | 135 2) the Selected column is sorted alphabetically on the Name_in_KEGG column. Note that the user-specified FeatureID and MetaboliteName may not be unique in the resulting output dataset. |
124 | 136 |
125 -------------------------------------------------------------------------------- | 137 -------------------------------------------------------------------------------- |
144 | 156 |
145 **NOTE:** This dataset must contain at least two columns, a column of FeatureIDs and a column containing names (e.g. gene symbol or compound names) to use for linking to KEGGIDs. Other columns may be present in the dataset. The user can use a Gene Expression Annotation dataset, a Metabolomic Annotation dataset or both. | 157 **NOTE:** This dataset must contain at least two columns, a column of FeatureIDs and a column containing names (e.g. gene symbol or compound names) to use for linking to KEGGIDs. Other columns may be present in the dataset. The user can use a Gene Expression Annotation dataset, a Metabolomic Annotation dataset or both. |
146 | 158 |
147 **Unique FeatureID** | 159 **Unique FeatureID** |
148 | 160 |
149 Name of the column in your gene expression or metabolomic Annotation dataset that contains the Unique FeatureIDs. | 161 Name of the column in your gene expression or metabolomic Annotation dataset that contains the Unique FeatureIDs. |
150 | 162 |
151 **Gene Symbol or Metabolite Names** | 163 **Gene Symbol or Metabolite Names** |
152 | 164 |
153 Name of the column in your gene expression or metabolomic Annotation dataset with the names to use for matching to KEGGIDs. | 165 Name of the column in your gene expression or metabolomic Annotation dataset with the names to use for matching to KEGGIDs. |
154 | 166 |
155 -------------------------------------------------------------------------------- | 167 -------------------------------------------------------------------------------- |
156 | 168 |
157 **OUTPUT** | 169 **OUTPUT** |
158 | 170 |
163 (3) **Feature_Type:** column indicating whether matching was for metabolites or genes. | 175 (3) **Feature_Type:** column indicating whether matching was for metabolites or genes. |
164 (4) **Matched:** column indicating whether a match in KEGG was found. Yes/No | 176 (4) **Matched:** column indicating whether a match in KEGG was found. Yes/No |
165 (5) **Name_in_KEGG:** column containing the KEGG name for the match. | 177 (5) **Name_in_KEGG:** column containing the KEGG name for the match. |
166 (6) **KEGGID:** column containing the KEGG identifier for the match. | 178 (6) **KEGGID:** column containing the KEGG identifier for the match. |
167 (7) **Similarity:** value indicating the similarity between the given feature and the match in KEGG. Ranges from 0 to 1. | 179 (7) **Similarity:** value indicating the similarity between the given feature and the match in KEGG. Ranges from 0 to 1. |
168 (8) **Tie:** in cases where multiple matches are found for a given feature, Tie = yes if the similarity is greater than 95%. | 180 (8) **Tie:** in cases where multiple matches are found for a given feature, Tie = yes if the similarity is greater than 95%. |
169 (9) **Selected:** for features with multiple matches and different similarity scores, the 'Selected' column = yes for the match with the highest similarity score. For features with multiple matches and the same similarity score, the 'Selected' column = yes based on the alphabetical order of the returned match. | 181 (9) **Selected:** for features with multiple matches and different similarity scores, the 'Selected' column = yes for the match with the highest similarity score. For features with multiple matches and the same similarity score, the 'Selected' column = yes based on the alphabetical order of the returned match. |
170 | 182 |
171 | 183 |
172 **Example Metabolite to KEGGID Link Table** | 184 **Example Metabolite to KEGGID Link Table** |
173 | 185 |
192 <citations> | 204 <citations> |
193 <citation type="bibtex">@ARTICLE{Kirpich17secimtools, | 205 <citation type="bibtex">@ARTICLE{Kirpich17secimtools, |
194 author = {Alexander S. Kirpich, Miguel Ibarra, Oleksandr Moskalenko, Justin M. Fear, Joseph Gerken, Xinlei Mi, Ali Ashrafi, Alison M. Morse, Lauren M. McIntyre}, | 206 author = {Alexander S. Kirpich, Miguel Ibarra, Oleksandr Moskalenko, Justin M. Fear, Joseph Gerken, Xinlei Mi, Ali Ashrafi, Alison M. Morse, Lauren M. McIntyre}, |
195 title = {SECIMTools: A suite of Metabolomics Data Analysis Tools}, | 207 title = {SECIMTools: A suite of Metabolomics Data Analysis Tools}, |
196 journal = {BMC Bioinformatics}, | 208 journal = {BMC Bioinformatics}, |
197 year = {in press} | 209 year = {2018} |
198 }</citation> | 210 }</citation> |
199 <citation type="bibtex"> | 211 <citation type="bibtex">@article{Mor2021GaitGM, |
200 @article{garcia2010paintomics, | 212 title={GAIT-GM integrative cross-omics analyses reveal cholinergic defects in a C. elegans model of Parkinson's disease}, |
201 title={Paintomics: a web based tool for the joint visualization of transcriptomics and metabolomics data}, | 213 author={Mor, DE and Huertas, F and Morse, AM and Kaletsky, R and Murphy, CT and Kalia, V and Miller, GW and Moskalenko, O and Conesa, A and McIntyre, LM}, |
202 author={Garc{\'\i}a-Alcalde, Fernando and Garc{\'\i}a-L{\'o}pez, Federico and Dopazo, Joaqu{\'\i}n and Conesa, Ana}, | 214 journal={BMC Genomics}, |
203 journal={Bioinformatics}, | 215 year={submitted}, |
204 volume={27}, | |
205 number={1}, | |
206 pages={137--139}, | |
207 year={2010}, | |
208 publisher={Oxford University Press} | |
209 }</citation> | |
210 <citation>@article{wu2014mygene, | |
211 title={MyGene. info: gene annotation query as a service}, | |
212 author={Wu, Chunlei and Mark, Adam and Su, Andrew I}, | |
213 journal={bioRxiv}, | |
214 pages={009332}, | |
215 year={2014}, | |
216 publisher={Cold Spring Harbor Laboratory} | |
217 }</citation> | 216 }</citation> |
218 </citations> | 217 </citations> |
219 </tool> | 218 </tool> |