gait_gm: add_kegg_anno_info.xml comparison

comparison add_kegg_anno_info.xml @ 2:2c218a253d56 draft default tip

"planemo upload for repository https://github.com/secimTools/gait-gm/tree/main/galaxy commit 758394addb95b09e794132a23a1f7e95ba39df0b"

author	malex
date	Thu, 29 Jul 2021 20:48:10 +0000
parents	ec9ee8edb84d
children

comparison

equal deleted inserted replaced

-:ec9ee8edb84d
+:2c218a253d56
 <description></description>
 <macros>
 <import>macros.xml</import>
 </macros>
 <expand macro="requirements" />
 <stdio>
 <exit_code range="1" level="fatal" description="Repeated Unique IDs"/>
 </stdio>
 <command detect_errors="exit_code"><![CDATA[
 add_kegg_anno_info.py
 -s=$species
 #if $dataSets.whichDataSet == "geneDataset":
 -ga=$dataSets.geneAnnot
 <option value="rno">Rattus norvegicus</option>
 <option value="dme">Drosophila melanogaster</option>
 <option value="ath">Arabidopsis thaliana</option>
 <option value="sce">Saccharomyces cerevisiae</option>
 <option value="eco">Escherichia coli</option>
+<option value="cel">Caenorhabditis elegans</option>
 </param>
 <!-- Conditional "dataSets": the radio selector "whichDataSet" chooses which annotation dataset(s) the user supplies; each <when> branch below declares the inputs shown for one choice. -->
 <conditional name="dataSets">
 <param name="whichDataSet" type="select" display="radio" label="Select Annotation Dataset(s)">
 <option value="geneDataset,metDataset" selected="true">Gene Expression + Metabolomic Annotation Datasets</option>
 <option value="geneDataset">Gene Expression Annotation Dataset</option>
 <option value="metDataset">Metabolomic Annotation Dataset</option>
 <validator type="no_options" message="You must select at least one option." />
 </param>
 <!-- Gene-only branch: the gene annotation dataset plus the names of its unique-FeatureID and gene-symbol columns. -->
 <when value="geneDataset">
 <param name="geneAnnot" type="data" format="tabular" label="Select the Gene Expression Annotation dataset from your History"/>
 <param name="geneUniqId" type="text" size="30" value="" label="Gene Unique FeatureID" help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs."/>
 <param name="geneName" type="text" size="30" value="" label="Gene Symbol" help="Name of the column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs."/>
 </when>
 <!-- Metabolite-only branch: the metabolite annotation dataset plus the names of its unique-FeatureID and metabolite-name columns. -->
 <when value="metDataset">
 <param name="metAnnot" type="data" format="tabular" label="Select the Metabolomic Annotation dataset from your History" />
 <param name="metUniqId" type="text" size="30" value="" label="Metabolite Unique FeatureID" help="Name of the column in your Metabolomic Annotation dataset that contains the unique FeatureIDs."/>
 <param name="metName" type="text" size="30" value="" label="Metabolite Names" help="Name of the column in your Metabolomic Annotation dataset that has metabolite names to use for linking to KEGGIDs."/>
 </when>
 <!-- Combined branch (the default selection): repeats the gene and metabolite inputs of the two single-dataset branches. -->
 <!-- NOTE(review): the help text for geneName, metUniqId and metName in this branch capitalizes "Column", unlike the single-dataset branches; consider harmonizing. -->
 <when value="geneDataset,metDataset">
 <param name="geneAnnot" type="data" format="tabular" label="Select the Gene Expression Annotation dataset from your History"/>
 <param name="geneUniqId" type="text" size="30" value="" label="Gene Unique FeatureID" help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs."/>
 <param name="geneName" type="text" size="30" value="" label="Gene Symbol" help="Name of the Column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs."/>
 <param name="metAnnot" type="data" format="tabular" label="Select the Metabolomic Annotation dataset from your History" />
 <param name="metUniqId" type="text" size="30" value="" label="Metabolite Unique FeatureID" help="Name of the Column in your Metabolomic Annotation dataset that contains the unique FeatureIDs."/>
 <param name="metName" type="text" size="30" value="" label="Metabolite Names" help="Name of the Column in your Metabolomic Annotation dataset that has metabolite names to use for linking to KEGGIDs."/>
 </when>
 </conditional>
 </inputs>
 <outputs>
 </data>
 </outputs>
 <tests>
 <test>
 <param name="species" value="rno"/>
-<param name="geneAnnot" value="ensembl2symbol_annotation_file_01fhl.tsv"/>
+<param name="geneAnnot" value="ensembl2symbol_annotation.tsv"/>
 <param name="geneUniqId" value="UniqueID"/>
-<param name="geneName" value="GeneSymbol"/>
+<param name="geneName" value="GeneName"/>
-<param name="metAnnot" value="metabolite_annotation_file_01fhl.tsv"/>
+<param name="metAnnot" value="metabolite_annotation.tsv"/>
 <param name="metUniqId" value="UniqueID"/>
 <param name="metName" value="MetName"/>
-<param name="geneOutput" value="gene_link_kegg_annotation_file_01fhl.tsv"/>
+<output name="geneOutput" value="gene_to_keggId_link.tsv" compare="diff" lines_diff="100000"/>
-<param name="metOutput" value="metabolite_link_kegg_annotation_file_01fhl.tsv"/>
+<output name="metOutput" value="metabolite_to_keggId_link.tsv" compare="diff" lines_diff="10000"/>
 </test>
 </tests>
 <help><![CDATA[
 **Tool Description**
 This tool takes an annotation dataset containing metabolite compound names or gene symbols and links them to identifiers in KEGG (KEGGIDs)
-creating either a (a) Gene to KEGGID Link or  a (b) Metabolite to KEGGID Link dataset. For gene expression data, the tool is designed to
+creating either:
-take the output from the 'Map ENSEMBLIDs to Gene Symbols' tool as input. If your input dataset contains a Selected column, the tool will
+(a) a Gene to KEGGID Link or
-link GeneSymbols to KEGGIDs where Selected = 'Yes'. Input Files without a Selected column must have a column containing unique FeatureIDs.
+(b) a Metabolite to KEGGID Link dataset.
-This tool takes an annotation dataset containing unique FeatureIDs, ENSEMBLIDs (for gene expression data) and GeneSymbol/MetaboliteName
-and adds the following columns:  1) Name_in_KEGG, the name found in KEGG, 2) Matched, a column indicating whether a match was found in KEGG,
+For gene expression data, the tool is designed to
-3) KEGGID, the KEGG identifier for the Match, 4) Score, a similarity score representing match similarity (caluclated using the python internal
+take the output from the 'Map ENSEMBLIDs to Gene Symbols' tool as input. If your input dataset
-function SequenceMatcher from difflib (check) and 5) a Tie column to indicate if a gene symbol or metabolite name matched more than one KEGGID.
+contains a Selected column, the tool will
+link GeneSymbols to KEGGIDs where Selected = 'Yes'. Input Files without a Selected column must
-User-specified metabolite names are linked to KEGGIDs by identifying the best match using the following procedure.  Common metabolite prefixes
+have a column containing unique FeatureIDs.
-are removed (cis-, trans-, d- , l- , (s)-, alpha-, beta-, alpha, beta, alpha-d-, beta-d-, alpha-l-, beta-l-, l-beta-, l-alpha-, d-beta-, d-alpha-).
+This tool takes an annotation dataset containing unique FeatureIDs, ENSEMBLIDs (for gene
-If the metabolite name given is an acid, then the name is modified to the conjugate base by replacing  “ic acid”, “icacid” or “ic_acid” with  “ate”.
+expression data) and GeneSymbol/MetaboliteName
-If amino acids are given in 1-letter or 3-letter abbreviations, names are modified to the full amino acid name.  The following commonly used lipid
+and adds the following columns:
-abbreviations are modified to reflect the full names (SM = sphingomyelin, lysopc  = lysophosphatidylcholine, PC = phosphatidylcholine,
-PE = phosphatidylethanolamine and LysoPE = lysophosphatidylethanolamine).  Similarly, abbreviations for other commonly assayed metabolites are
+1) Name_in_KEGG, the name found in KEGG
-modified to reflect the full names (cit = citrate, orn = ornithine, thyr = thyroxine and boc = butoxycarbonyl). The code allows the addition of
+2) Matched, a column indicating whether a match was found in KEGG
+3) KEGGID, the KEGG identifier for the Match
+4) Score, a similarity score representing match similarity (calculated using the Python SequenceMatcher class from difflib)
+5) a Tie column to indicate if a gene symbol or metabolite name matched more than one KEGGID.
+User-specified metabolite names are linked to KEGGIDs by identifying the best match using the following procedure:  Common metabolite prefixes
+are removed (cis-, trans-, d- , l- , (s)-, alpha-, beta-, alpha, beta, alpha-d-, beta-d-, alpha-l-, beta-l-, l-beta-, l-alpha-, d-beta-, d-alpha-).
+If the metabolite name given is an acid, then the name is modified to the conjugate base by replacing  “ic acid”, “icacid” or “ic_acid” with  “ate”.
+If amino acids are given in 1-letter or 3-letter abbreviations, names are modified to the full amino acid name.  The following commonly used lipid
+abbreviations are modified to reflect the full names (SM = sphingomyelin, lysopc  = lysophosphatidylcholine, PC = phosphatidylcholine,
+PE = phosphatidylethanolamine and LysoPE = lysophosphatidylethanolamine).  Similarly, abbreviations for other commonly assayed metabolites are
+modified to reflect the full names (cit = citrate, orn = ornithine, thyr = thyroxine and boc = butoxycarbonyl). The code allows the addition of
 more synonyms. The user-specified metabolite names are retained in the output dataset for comparisons with the KEGG database.
 Each parsed metabolite name is compared to metabolite names in KEGG. The best match in KEGG based on similarity score is returned.  The similarity
-score (Score column) is based on the longest contiguous matching subsequence that does not contain 'junk' elements where 'junk' elements are defined
+score (Score column) is based on the longest contiguous matching subsequence that does not contain 'junk' elements, where 'junk' elements are defined
 as duplicates making up more than 1% of a sequence with minimum length of 200 (python SequenceMatcher class from difflib).
 Selected = Yes for the match with the highest similarity score.
 For metabolite names where the best match is tied with at least one other compound in KEGG, all matches are returned. A tie is determined as follows:
 if the Score is greater than 95% for 2 or more matches to the metabolite name, then:
 1) the Tie column = 'Yes' and a warning message will appear
 2) Selected = 'Yes' is assigned to the first match in alphabetical order of the Name_in_KEGG column. Note that the user-specified FeatureID and MetaboliteName may not be unique in the resulting output dataset.
 --------------------------------------------------------------------------------
 **NOTE:** This dataset must contain at least two columns, a column of FeatureIDs and a column containing names (e.g. gene symbol or compound names) to use for linking to KEGGIDs. Other columns may be present in the dataset. The user can use a Gene Expression Annotation dataset, a Metabolomic Annotation dataset or both.
 **Unique FeatureID**
 Name of the column in your gene expression or metabolomic Annotation dataset that contains the Unique FeatureIDs.
 **Gene Symbol or Metabolite Names**
 Name of the column in your gene expression or metabolomic Annotation dataset with the names to use for matching to KEGGIDs.
 --------------------------------------------------------------------------------
 **OUTPUT**
 (3) **Feature_Type:**  column indicating whether matching was for metabolites or genes.
 (4) **Matched:**  column indicating whether a match in KEGG was found.  Yes/No
 (5) **Name_in_KEGG:**  column containing the KEGG name for the match.
 (6) **KEGGID:**  column containing the KEGG identifier for the match.
 (7) **Similarity:**  value indicating the similarity between the given feature and the match in KEGG.  Ranges from 0 to 1.
 (8) **Tie:**  in cases where multiple matches are found for a given feature, Tie = yes if the similarity is greater than 95%.
 (9) **Selected:**  for features with multiple matches and different similarity scores, the 'Selected' column = yes for the match with the highest similarity score. For features with multiple matches and the same similarity score, the 'Selected' column = yes based on the alphabetical order of the returned match.
 **Example Metabolite to KEGGID Link Table**
 <citations>
 <citation type="bibtex">@ARTICLE{Kirpich17secimtools,
 author = {Alexander S. Kirpich, Miguel Ibarra, Oleksandr Moskalenko, Justin M. Fear, Joseph Gerken, Xinlei Mi, Ali Ashrafi, Alison M. Morse, Lauren M. McIntyre},
 title = {SECIMTools: A suite of Metabolomics Data Analysis Tools},
 journal = {BMC Bioinformatics},
-year = {in press}
+year = {2018}
 }</citation>
-<citation type="bibtex">
+<citation type="bibtex">@article{Mor2021GaitGM,
-@article{garcia2010paintomics,
+title={GAIT-GM integrative cross-omics analyses reveal cholinergic defects in a C. elegans model of Parkinson's disease},
-title={Paintomics: a web based tool for the joint visualization of transcriptomics and metabolomics data},
+author={Mor, DE and Huertas, F and Morse, AM and Kaletsky, R and Murphy, CT and Kalia, V and Miller, GW and Moskalenko, O and Conesa, A and McIntyre, LM},
-author={Garc{\'\i}a-Alcalde, Fernando and Garc{\'\i}a-L{\'o}pez, Federico and Dopazo, Joaqu{\'\i}n and Conesa, Ana},
+journal={BMC Genomics},
-journal={Bioinformatics},
+year={submitted},
-volume={27},
-number={1},
-pages={137--139},
-year={2010},
-publisher={Oxford University Press}
-}</citation>
-<citation>@article{wu2014mygene,
-title={MyGene. info: gene annotation query as a service},
-author={Wu, Chunlei and Mark, Adam and Su, Andrew I},
-journal={bioRxiv},
-pages={009332},
-year={2014},
-publisher={Cold Spring Harbor Laboratory}
 }</citation>
 </citations>
 </tool>

Mercurial > repos > malex > gait_gm

comparison add_kegg_anno_info.xml @ 2:2c218a253d56 draft default tip