Mercurial > repos > malex > gait_gm
diff add_kegg_anno_info.xml @ 2:2c218a253d56 draft default tip
"planemo upload for repository https://github.com/secimTools/gait-gm/tree/main/galaxy commit 758394addb95b09e794132a23a1f7e95ba39df0b"
| field | value |
|---|---|
| author | malex |
| date | Thu, 29 Jul 2021 20:48:10 +0000 |
| parents | ec9ee8edb84d |
| children | |
line wrap: on
line diff
--- a/add_kegg_anno_info.xml Fri Jun 18 20:23:19 2021 +0000 +++ b/add_kegg_anno_info.xml Thu Jul 29 20:48:10 2021 +0000 @@ -4,9 +4,9 @@ <import>macros.xml</import> </macros> <expand macro="requirements" /> - <stdio> - <exit_code range="1" level="fatal" description="Repeated Unique IDs"/> - </stdio> + <stdio> + <exit_code range="1" level="fatal" description="Repeated Unique IDs"/> + </stdio> <command detect_errors="exit_code"><![CDATA[ add_kegg_anno_info.py -s=$species @@ -40,18 +40,19 @@ <option value="ath">Arabidopsis thaliana</option> <option value="sce">Saccharomyces cerevisiae</option> <option value="eco">Escherichia coli</option> + <option value="cel">Caenorhabditis elegans</option> </param> <conditional name="dataSets"> <param name="whichDataSet" type="select" display="radio" label="Select Annotation Dataset(s)"> <option value="geneDataset,metDataset" selected="true">Gene Expression + Metabolomic Annotation Datasets</option> - <option value="geneDataset">Gene Expression Annotation Dataset</option> + <option value="geneDataset">Gene Expression Annotation Dataset</option> <option value="metDataset">Metabolomic Annotation Dataset</option> - <validator type="no_options" message="You must select at least one option." /> + <validator type="no_options" message="You must select at least one option." 
/> </param> <when value="geneDataset"> <param name="geneAnnot" type="data" format="tabular" label="Select the Gene Expression Annotation dataset from your History"/> - <param name="geneUniqId" type="text" size="30" value="" label="Gene Unique FeatureID" help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs."/> - <param name="geneName" type="text" size="30" value="" label="Gene Symbol" help="Name of the column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs."/> + <param name="geneUniqId" type="text" size="30" value="" label="Gene Unique FeatureID" help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs."/> + <param name="geneName" type="text" size="30" value="" label="Gene Symbol" help="Name of the column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs."/> </when> <when value="metDataset"> <param name="metAnnot" type="data" format="tabular" label="Select the Metabolomic Annotation dataset from your History" /> @@ -60,10 +61,10 @@ </when> <when value="geneDataset,metDataset"> <param name="geneAnnot" type="data" format="tabular" label="Select the Gene Expression Annotation dataset from your History"/> - <param name="geneUniqId" type="text" size="30" value="" label="Gene Unique FeatureID" help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs."/> - <param name="geneName" type="text" size="30" value="" label="Gene Symbol" help="Name of the Column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs."/> + <param name="geneUniqId" type="text" size="30" value="" label="Gene Unique FeatureID" help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs."/> + <param name="geneName" type="text" size="30" value="" label="Gene Symbol" 
help="Name of the Column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs."/> <param name="metAnnot" type="data" format="tabular" label="Select the Metabolomic Annotation dataset from your History" /> - <param name="metUniqId" type="text" size="30" value="" label="Metabolite Unique FeatureID" help="Name of the Column in your Metabolomic Annotation dataset that contains the unique FeatureIDs."/> + <param name="metUniqId" type="text" size="30" value="" label="Metabolite Unique FeatureID" help="Name of the Column in your Metabolomic Annotation dataset that contains the unique FeatureIDs."/> <param name="metName" type="text" size="30" value="" label="Metabolite Names" help="Name of the Column in your Metabolomic Annotation dataset that has metabolite names to use for linking to KEGGIDs."/> </when> </conditional> @@ -79,45 +80,56 @@ <tests> <test> <param name="species" value="rno"/> - <param name="geneAnnot" value="ensembl2symbol_annotation_file_01fhl.tsv"/> + <param name="geneAnnot" value="ensembl2symbol_annotation.tsv"/> <param name="geneUniqId" value="UniqueID"/> - <param name="geneName" value="GeneSymbol"/> - <param name="metAnnot" value="metabolite_annotation_file_01fhl.tsv"/> + <param name="geneName" value="GeneName"/> + <param name="metAnnot" value="metabolite_annotation.tsv"/> <param name="metUniqId" value="UniqueID"/> <param name="metName" value="MetName"/> - <param name="geneOutput" value="gene_link_kegg_annotation_file_01fhl.tsv"/> - <param name="metOutput" value="metabolite_link_kegg_annotation_file_01fhl.tsv"/> + <output name="geneOutput" value="gene_to_keggId_link.tsv" compare="diff" lines_diff="100000"/> + <output name="metOutput" value="metabolite_to_keggId_link.tsv" compare="diff" lines_diff="10000"/> </test> </tests> <help><![CDATA[ **Tool Description** - This tool takes an annotation dataset containing metabolite compound names or gene symbols and links them to identifiers in KEGG (KEGGIDs) - creating 
either a (a) Gene to KEGGID Link or a (b) Metabolite to KEGGID Link dataset. For gene expression data, the tool is designed to - take the output from the 'Map ENSEMBLIDs to Gene Symbols' tool as input. If your input dataset contains a Selected column, the tool will - link GeneSymbols to KEGGIDs where Selected = 'Yes'. Input Files without a Selected column must have a column containing unique FeatureIDs. - This tool takes an annotation dataset containing unique FeatureIDs, ENSEMBLIDs (for gene expression data) and GeneSymbol/MetaboliteName - and adds the following columns: 1) Name_in_KEGG, the name found in KEGG, 2) Matched, a column indicating whether a match was found in KEGG, - 3) KEGGID, the KEGG identifier for the Match, 4) Score, a similarity score representing match similarity (caluclated using the python internal - function SequenceMatcher from difflib (check) and 5) a Tie column to indicate if a gene symbol or metabolite name matched more than one KEGGID. +This tool takes an annotation dataset containing metabolite compound names or gene symbols and links them to identifiers in KEGG (KEGGIDs) +creating either: +(a) a Gene to KEGGID Link or +(b) a Metabolite to KEGGID Link dataset. + +For gene expression data, the tool is designed to +take the output from the 'Map ENSEMBLIDs to Gene Symbols' tool as input. If your input dataset +contains a Selected column, the tool will +link GeneSymbols to KEGGIDs where Selected = 'Yes'. Input Files without a Selected column must +have a column containing unique FeatureIDs. +This tool takes an annotation dataset containing unique FeatureIDs, ENSEMBLIDs (for gene +expression data) and GeneSymbol/MetaboliteName +and adds the following columns: - User-specified metabolite names are linked to KEGGIDs by identifying the best match using the following procedure. 
Common metabolite prefixes - are removed (cis-, trans-, d- , l- , (s)-, alpha-, beta-, alpha, beta, alpha-d-, beta-d-, alpha-l-, beta-l-, l-beta-, l-alpha-, d-beta-, d-alpha-). - If the metabolite name given is an acid, then the name is modified to the conjugate base by replacing “ic acid”, “icacid” or “ic_acid” with “ate”. - If amino acids are given in 1-letter or 3-letter abbreviations, names are modified to the full amino acid name. The following commonly used lipid - abbreviations are modified to reflect the full names (SM = sphingomyelin, lysopc = lysophosphatidylcholine, PC = phosphatidylcholine, - PE = phosphatidylethanolamine and LysoPE = lysophosphatidylethanolamine). Similarly, abbreviations for other commonly assayed metabolites are - modified to reflect the full names (cit = citrate, orn = ornithine, thyr = thyroxine and boc = butoxycarbonyl). The code allows the addition of + 1) Name_in_KEGG, the name found in KEGG + 2) Matched, a column indicating whether a match was found in KEGG, + 3) KEGGID, the KEGG identifier for the Match + 4) Score, a similarity score representing match similarity (calculated using the python internal function SequenceMatcher from difflib (check) + 5) a Tie column to indicate if a gene symbol or metabolite name matched more than one KEGGID. + + User-specified metabolite names are linked to KEGGIDs by identifying the best match using the following procedure: Common metabolite prefixes + are removed (cis-, trans-, d- , l- , (s)-, alpha-, beta-, alpha, beta, alpha-d-, beta-d-, alpha-l-, beta-l-, l-beta-, l-alpha-, d-beta-, d-alpha-). + If the metabolite name given is an acid, then the name is modified to the conjugate base by replacing “ic acid”, “icacid” or “ic_acid” with “ate”. + If amino acids are given in 1-letter or 3-letter abbreviations, names are modified to the full amino acid name. 
The following commonly used lipid + abbreviations are modified to reflect the full names (SM = sphingomyelin, lysopc = lysophosphatidylcholine, PC = phosphatidylcholine, + PE = phosphatidylethanolamine and LysoPE = lysophosphatidylethanolamine). Similarly, abbreviations for other commonly assayed metabolites are + modified to reflect the full names (cit = citrate, orn = ornithine, thyr = thyroxine and boc = butoxycarbonyl). The code allows the addition of more synonyms. The user-specified metabolite names are retained in the output dataset for comparisons with the KEGG database. - - Each parsed metabolite name is compared to metabolite names in KEGG. The best match in KEGG based on similarity score is returned. The similarity - score (Score column) is based on the longest contiguous matching subsequence that does not contain 'junk' elements where 'junk' elements are defined + + Each parsed metabolite name is compared to metabolite names in KEGG. The best match in KEGG based on similarity score is returned. The similarity + score (Score column) is based on the longest contiguous matching subsequence that does not contain 'junk' elements, where 'junk' elements are defined as duplicates making up more than 1% of a sequence with minimum length of 200 (python SequenceMatcher class from difflib) - Selected = Yes for the match with the highest similarity score. + Selected = Yes for the match with the highest similarity score. - For metabolite names where the best match is tied with at least one other compound in KEGG, all matches are returned. A tie is determined as follows: + For metabolite names where the best match is tied with at least one other compound in KEGG, all matches are returned. A tie is determined as follows: if the Score is greater than 95% for 2 or more matches in the metabolite name then: 1) the Tie column = 'Yes' and a warning message will appear 2) the Selected column is sorted alphabetically on the Name_in_KEGG column. 
Note that the user-specified FeatureID and MetaboliteName may not be unique in the resulting output dataset. @@ -146,11 +158,11 @@ **Unique FeatureID** - Name of the column in your gene expression or metabolomic Annotation dataset that contains the Unique FeatureIDs. +Name of the column in your gene expression or metabolomic Annotation dataset that contains the Unique FeatureIDs. **Gene Symbol or Metabolite Names** - Name of the column in your gene expression or metabolomic Annotation dataset with the names to use for matching to KEGGIDs. +Name of the column in your gene expression or metabolomic Annotation dataset with the names to use for matching to KEGGIDs. -------------------------------------------------------------------------------- @@ -165,7 +177,7 @@ (5) **Name_in_KEGG:** column containing the KEGG name for the match. (6) **KEGGID:** column containing the KEGG identifier for the match. (7) **Similarity:** value indicating the similarity between the given feature and the match in KEGG. Ranges from 0 to 1. - (8) **Tie:** in cases where multiple matches are found for a given feature, Tie = yes if the similarity is greater than 95%. + (8) **Tie:** in cases where multiple matches are found for a given feature, Tie = yes if the similarity is greater than 95%. (9) **Selected:** for features with multiple matches and different similarity scores, the 'Selected' column = yes for the match with the highest similarity score. For features with multiple matches and the same similarity score, the 'Selected' column = yes based on the alphabetical order of the returned match. @@ -194,26 +206,13 @@ author = {Alexander S. Kirpich, Miguel Ibarra, Oleksandr Moskalenko, Justin M. Fear, Joseph Gerken, Xinlei Mi, Ali Ashrafi, Alison M. Morse, Lauren M. 
McIntyre}, title = {SECIMTools: A suite of Metabolomics Data Analysis Tools}, journal = {BMC Bioinformatics}, - year = {in press} + year = {2018} }</citation> - <citation type="bibtex"> - @article{garcia2010paintomics, - title={Paintomics: a web based tool for the joint visualization of transcriptomics and metabolomics data}, - author={Garc{\'\i}a-Alcalde, Fernando and Garc{\'\i}a-L{\'o}pez, Federico and Dopazo, Joaqu{\'\i}n and Conesa, Ana}, - journal={Bioinformatics}, - volume={27}, - number={1}, - pages={137--139}, - year={2010}, - publisher={Oxford University Press} - }</citation> - <citation>@article{wu2014mygene, - title={MyGene. info: gene annotation query as a service}, - author={Wu, Chunlei and Mark, Adam and Su, Andrew I}, - journal={bioRxiv}, - pages={009332}, - year={2014}, - publisher={Cold Spring Harbor Laboratory} + <citation type="bibtex">@article{Mor2021GaitGM, + title={GAIT-GM integrative cross-omics analyses reveal cholinergic defects in a C. elegans model of Parkinson's disease}, + author={Mor, DE and Huertas, F and Morse, AM and Kaletsky, R and Murphy, CT and Kalia, V and Miller, GW and Moskalenko, O and Conesa, A and McIntyre, LM}, + journal={BMC Genomics}, + year={submitted}, }</citation> </citations> </tool>