diff add_kegg_anno_info.xml @ 2:2c218a253d56 draft default tip

"planemo upload for repository https://github.com/secimTools/gait-gm/tree/main/galaxy commit 758394addb95b09e794132a23a1f7e95ba39df0b"
author malex
date Thu, 29 Jul 2021 20:48:10 +0000
parents ec9ee8edb84d
children
line wrap: on
line diff
--- a/add_kegg_anno_info.xml	Fri Jun 18 20:23:19 2021 +0000
+++ b/add_kegg_anno_info.xml	Thu Jul 29 20:48:10 2021 +0000
@@ -4,9 +4,9 @@
       <import>macros.xml</import>
   </macros>
   <expand macro="requirements" />
-  <stdio> 
-    <exit_code range="1" level="fatal" description="Repeated Unique IDs"/> 
-  </stdio> 
+  <stdio>
+    <exit_code range="1" level="fatal" description="Repeated Unique IDs"/>
+  </stdio>
   <command detect_errors="exit_code"><![CDATA[
   add_kegg_anno_info.py
     -s=$species
@@ -40,18 +40,19 @@
      <option value="ath">Arabidopsis thaliana</option>
      <option value="sce">Saccharomyces cerevisiae</option>
      <option value="eco">Escherichia coli</option>
+     <option value="cel">Caenorhabditis elegans</option>
     </param>
     <conditional name="dataSets">
       <param name="whichDataSet" type="select" display="radio" label="Select Annotation Dataset(s)">
         <option value="geneDataset,metDataset" selected="true">Gene Expression + Metabolomic Annotation Datasets</option>
-        <option value="geneDataset">Gene Expression Annotation Dataset</option> 
+        <option value="geneDataset">Gene Expression Annotation Dataset</option>
         <option value="metDataset">Metabolomic Annotation Dataset</option>
-        <validator type="no_options" message="You must select at least one option." /> 
+        <validator type="no_options" message="You must select at least one option." />
       </param>
       <when value="geneDataset">
         <param name="geneAnnot" type="data" format="tabular" label="Select the Gene Expression Annotation dataset from your History"/>
-        <param name="geneUniqId" type="text" size="30" value="" label="Gene Unique FeatureID" help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs."/> 
-        <param name="geneName" type="text" size="30" value="" label="Gene Symbol" help="Name of the column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs."/> 
+        <param name="geneUniqId" type="text" size="30" value="" label="Gene Unique FeatureID" help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs."/>
+        <param name="geneName" type="text" size="30" value="" label="Gene Symbol" help="Name of the column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs."/>
       </when>
       <when value="metDataset">
         <param name="metAnnot" type="data" format="tabular" label="Select the Metabolomic Annotation dataset from your History" />
@@ -60,10 +61,10 @@
       </when>
       <when value="geneDataset,metDataset">
         <param name="geneAnnot" type="data" format="tabular" label="Select the Gene Expression Annotation dataset from your History"/>
-        <param name="geneUniqId" type="text" size="30" value="" label="Gene Unique FeatureID" help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs."/> 
-        <param name="geneName" type="text" size="30" value="" label="Gene Symbol" help="Name of the Column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs."/> 
+        <param name="geneUniqId" type="text" size="30" value="" label="Gene Unique FeatureID" help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs."/>
+        <param name="geneName" type="text" size="30" value="" label="Gene Symbol" help="Name of the Column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs."/>
         <param name="metAnnot" type="data" format="tabular" label="Select the Metabolomic Annotation dataset from your History" />
-        <param name="metUniqId" type="text" size="30" value="" label="Metabolite Unique FeatureID" help="Name of the Column in your Metabolomic Annotation dataset that contains the unique FeatureIDs."/> 
+        <param name="metUniqId" type="text" size="30" value="" label="Metabolite Unique FeatureID" help="Name of the Column in your Metabolomic Annotation dataset that contains the unique FeatureIDs."/>
         <param name="metName" type="text" size="30" value="" label="Metabolite Names" help="Name of the Column in your Metabolomic Annotation dataset that has metabolite names to use for linking to KEGGIDs."/>
       </when>
     </conditional>
@@ -79,45 +80,56 @@
   <tests>
     <test>
       <param name="species" value="rno"/>
-      <param name="geneAnnot" value="ensembl2symbol_annotation_file_01fhl.tsv"/>
+      <param name="geneAnnot" value="ensembl2symbol_annotation.tsv"/>
       <param name="geneUniqId" value="UniqueID"/>
-      <param name="geneName" value="GeneSymbol"/>
-      <param name="metAnnot" value="metabolite_annotation_file_01fhl.tsv"/>
+      <param name="geneName" value="GeneName"/>
+      <param name="metAnnot" value="metabolite_annotation.tsv"/>
       <param name="metUniqId" value="UniqueID"/>
       <param name="metName" value="MetName"/>
-      <param name="geneOutput" value="gene_link_kegg_annotation_file_01fhl.tsv"/>
-      <param name="metOutput" value="metabolite_link_kegg_annotation_file_01fhl.tsv"/>
+      <output name="geneOutput" value="gene_to_keggId_link.tsv" compare="diff" lines_diff="100000"/>
+      <output name="metOutput" value="metabolite_to_keggId_link.tsv" compare="diff" lines_diff="10000"/>
     </test>
   </tests>
   <help><![CDATA[
 
 **Tool Description**
 
-  This tool takes an annotation dataset containing metabolite compound names or gene symbols and links them to identifiers in KEGG (KEGGIDs) 
-  creating either a (a) Gene to KEGGID Link or  a (b) Metabolite to KEGGID Link dataset. For gene expression data, the tool is designed to 
-  take the output from the 'Map ENSEMBLIDs to Gene Symbols' tool as input. If your input dataset contains a Selected column, the tool will 
-  link GeneSymbols to KEGGIDs where Selected = 'Yes'. Input Files without a Selected column must have a column containing unique FeatureIDs. 
-  This tool takes an annotation dataset containing unique FeatureIDs, ENSEMBLIDs (for gene expression data) and GeneSymbol/MetaboliteName 
-  and adds the following columns:  1) Name_in_KEGG, the name found in KEGG, 2) Matched, a column indicating whether a match was found in KEGG, 
-  3) KEGGID, the KEGG identifier for the Match, 4) Score, a similarity score representing match similarity (caluclated using the python internal 
-  function SequenceMatcher from difflib (check) and 5) a Tie column to indicate if a gene symbol or metabolite name matched more than one KEGGID.
+This tool takes an annotation dataset containing metabolite compound names or gene symbols and links them to identifiers in KEGG (KEGGIDs)
+creating either:
+(a) a Gene to KEGGID Link or
+(b) a Metabolite to KEGGID Link dataset.
+
+For gene expression data, the tool is designed to
+take the output from the 'Map ENSEMBLIDs to Gene Symbols' tool as input. If your input dataset
+contains a Selected column, the tool will
+link GeneSymbols to KEGGIDs where Selected = 'Yes'. Input Files without a Selected column must
+have a column containing unique FeatureIDs.
+This tool takes an annotation dataset containing unique FeatureIDs, ENSEMBLIDs (for gene
+expression data) and GeneSymbol/MetaboliteName
+and adds the following columns:
 
-  User-specified metabolite names are linked to KEGGIDs by identifying the best match using the following procedure.  Common metabolite prefixes 
-  are removed (cis-, trans-, d- , l- , (s)-, alpha-, beta-, alpha, beta, alpha-d-, beta-d-, alpha-l-, beta-l-, l-beta-, l-alpha-, d-beta-, d-alpha-).  
-  If the metabolite name given is an acid, then the name is modified to the conjugate base by replacing  “ic acid”, “icacid” or “ic_acid” with  “ate”.  
-  If amino acids are given in 1-letter or 3-letter abbreviations, names are modified to the full amino acid name.  The following commonly used lipid 
-  abbreviations are modified to reflect the full names (SM = sphingomyelin, lysopc  = lysophosphatidylcholine, PC = phosphatidylcholine, 
-  PE = phosphatidylethanolamine and LysoPE = lysophosphatidylethanolamine).  Similarly, abbreviations for other commonly assayed metabolites are 
-  modified to reflect the full names (cit = citrate, orn = ornithine, thyr = thyroxine and boc = butoxycarbonyl). The code allows the addition of 
+  1) Name_in_KEGG, the name found in KEGG,
+  2) Matched, a column indicating whether a match was found in KEGG,
+  3) KEGGID, the KEGG identifier for the match,
+  4) Score, a similarity score representing match similarity (calculated using the SequenceMatcher class from Python's difflib module), and
+  5) Tie, a column indicating whether a gene symbol or metabolite name matched more than one KEGGID.
+
+  User-specified metabolite names are linked to KEGGIDs by identifying the best match using the following procedure:  Common metabolite prefixes
+  are removed (cis-, trans-, d- , l- , (s)-, alpha-, beta-, alpha, beta, alpha-d-, beta-d-, alpha-l-, beta-l-, l-beta-, l-alpha-, d-beta-, d-alpha-).
+  If the metabolite name given is an acid, then the name is modified to the conjugate base by replacing  “ic acid”, “icacid” or “ic_acid” with  “ate”.
+  If amino acids are given in 1-letter or 3-letter abbreviations, names are modified to the full amino acid name.  The following commonly used lipid
+  abbreviations are modified to reflect the full names (SM = sphingomyelin, lysopc  = lysophosphatidylcholine, PC = phosphatidylcholine,
+  PE = phosphatidylethanolamine and LysoPE = lysophosphatidylethanolamine).  Similarly, abbreviations for other commonly assayed metabolites are
+  modified to reflect the full names (cit = citrate, orn = ornithine, thyr = thyroxine and boc = butoxycarbonyl). The code allows the addition of
   more synonyms. The user-specified metabolite names are retained in the output dataset for comparisons with the KEGG database.
- 
-  Each parsed metabolite name is compared to metabolite names in KEGG. The best match in KEGG based on similarity score is returned.  The similarity 
-  score (Score column) is based on the longest contiguous matching subsequence that does not contain 'junk' elements where 'junk' elements are defined 
+
+  Each parsed metabolite name is compared to metabolite names in KEGG. The best match in KEGG based on similarity score is returned.  The similarity
+  score (Score column) is based on the longest contiguous matching subsequence that does not contain 'junk' elements, where 'junk' elements are defined
   as duplicates making up more than 1% of a sequence with minimum length of 200 (Python SequenceMatcher class from difflib).
 
-  Selected = Yes for the match with the highest similarity score.  
+  Selected = Yes for the match with the highest similarity score.
 
-  For metabolite names where the best match is tied with at least one other compound in KEGG, all matches are returned. A tie is determined as follows:  
+  For metabolite names where the best match is tied with at least one other compound in KEGG, all matches are returned. A tie is determined as follows:
   if the Score is greater than 95% for 2 or more matches in the metabolite name then:
   1) the Tie column = 'Yes' and a warning message will appear
   2) the Selected column is sorted alphabetically on the Name_in_KEGG column. Note that the user-specified FeatureID and MetaboliteName may not be unique in the resulting output dataset.
@@ -146,11 +158,11 @@
 
 **Unique FeatureID**
 
-  Name of the column in your gene expression or metabolomic Annotation dataset that contains the Unique FeatureIDs.
+Name of the column in your gene expression or metabolomic Annotation dataset that contains the Unique FeatureIDs.
 
 **Gene Symbol or Metabolite Names**
 
-  Name of the column in your gene expression or metabolomic Annotation dataset with the names to use for matching to KEGGIDs.
+Name of the column in your gene expression or metabolomic Annotation dataset with the names to use for matching to KEGGIDs.
 
 --------------------------------------------------------------------------------
 
@@ -165,7 +177,7 @@
   (5) **Name_in_KEGG:**  column containing the KEGG name for the match.
   (6) **KEGGID:**  column containing the KEGG identifier for the match.
   (7) **Similarity:**  value indicating the similarity between the given feature and the match in KEGG.  Ranges from 0 to 1.
-  (8) **Tie:**  in cases where multiple matches are found for a given feature, Tie = yes if the similarity is greater than 95%. 
+  (8) **Tie:**  in cases where multiple matches are found for a given feature, Tie = yes if the similarity is greater than 95%.
   (9) **Selected:**  for features with multiple matches and different similarity scores, the 'Selected' column = yes for the match with the highest similarity score. For features with multiple matches and the same similarity score, the 'Selected' column = yes based on the alphabetical order of the returned match.
 
 
@@ -194,26 +206,13 @@
     author = {Alexander S. Kirpich, Miguel Ibarra, Oleksandr Moskalenko, Justin M. Fear, Joseph Gerken, Xinlei Mi, Ali Ashrafi, Alison M. Morse, Lauren M. McIntyre},
     title = {SECIMTools: A suite of Metabolomics Data Analysis Tools},
     journal = {BMC Bioinformatics},
-    year = {in press}
+    year = {2018}
     }</citation>
-    <citation type="bibtex">
-    @article{garcia2010paintomics,
-    title={Paintomics: a web based tool for the joint visualization of transcriptomics and metabolomics data},
-    author={Garc{\'\i}a-Alcalde, Fernando and Garc{\'\i}a-L{\'o}pez, Federico and Dopazo, Joaqu{\'\i}n and Conesa, Ana},
-    journal={Bioinformatics},
-    volume={27},
-    number={1},
-    pages={137--139},
-    year={2010},
-    publisher={Oxford University Press}
-    }</citation>
-    <citation>@article{wu2014mygene,
-    title={MyGene. info: gene annotation query as a service},
-    author={Wu, Chunlei and Mark, Adam and Su, Andrew I},
-    journal={bioRxiv},
-    pages={009332},
-    year={2014},
-    publisher={Cold Spring Harbor Laboratory}
+    <citation type="bibtex">@article{Mor2021GaitGM,
+    title={GAIT-GM integrative cross-omics analyses reveal cholinergic defects in a C. elegans model of Parkinson's disease},
+    author={Mor, DE and Huertas, F and Morse, AM and Kaletsky, R and Murphy, CT and Kalia, V and Miller, GW and Moskalenko, O and Conesa, A and McIntyre, LM},
+    journal={BMC Genomics},
+    year={submitted},
     }</citation>
   </citations>
 </tool>