view add_kegg_anno_info.xml @ 1:ec9ee8edb84d draft

Initial upload of 21.6.10 release.
author malex
date Fri, 18 Jun 2021 20:23:19 +0000
parents
children 2c218a253d56
line wrap: on
line source

<tool id="secimtools_add_kegg_anno_info" name="Link Name to KEGGID" version="@WRAPPER_VERSION@">
  <!-- Intentionally empty: no short description is given for this tool. -->
  <description/>
  <!-- Shared definitions are imported from macros.xml; the "requirements"
       macro expanded below is presumably defined there. -->
  <macros>
    <import>macros.xml</import>
  </macros>
  <expand macro="requirements"/>
  <!-- Exit status 1 is flagged as fatal and described as "Repeated Unique IDs".
       NOTE(review): <command> also sets detect_errors="exit_code"; confirm
       which of the two mechanisms the targeted Galaxy release applies. -->
  <stdio>
    <exit_code level="fatal" range="1" description="Repeated Unique IDs"/>
  </stdio>
  <command detect_errors="exit_code"><![CDATA[
  ## Runs add_kegg_anno_info.py for the selected species and annotation dataset(s).
  ## Every substituted value is single-quoted so that a column name or path
  ## containing whitespace reaches the script as one argument.
  #set $selection = str($dataSets.whichDataSet)
  add_kegg_anno_info.py
    -s='$species'
    ## Gene arguments: used by the gene-only and the combined selection.
    #if $selection in ("geneDataset", "geneDataset,metDataset"):
      -ga='$dataSets.geneAnnot'
      -gid='$dataSets.geneUniqId'
      -gn='$dataSets.geneName'
    #end if
    ## Metabolite arguments: used by the metabolite-only and the combined selection.
    #if $selection in ("metDataset", "geneDataset,metDataset"):
      -ma='$dataSets.metAnnot'
      -mid='$dataSets.metUniqId'
      -mn='$dataSets.metName'
    #end if
    -go='$geneOutput'
    -mo='$metOutput'
  ]]></command>
  <inputs>
    <!-- Species code handed to the script through the -s option. -->
    <param name="species" type="select" label="Select Species from the list">
      <option value="hsa">Homo sapiens</option>
      <option value="mmu">Mus musculus</option>
      <option value="rno">Rattus norvegicus</option>
      <option value="dme">Drosophila melanogaster</option>
      <option value="ath">Arabidopsis thaliana</option>
      <option value="sce">Saccharomyces cerevisiae</option>
      <option value="eco">Escherichia coli</option>
    </param>
    <!-- Selects which annotation dataset(s) are supplied. Each branch exposes
         the dataset together with the names of its FeatureID column and of
         the column used for matching. The combined branch repeats the
         parameters of the two single-dataset branches with identical labels
         and help text. -->
    <conditional name="dataSets">
      <param name="whichDataSet" type="select" display="radio" label="Select Annotation Dataset(s)">
        <option value="geneDataset,metDataset" selected="true">Gene Expression + Metabolomic Annotation Datasets</option>
        <option value="geneDataset">Gene Expression Annotation Dataset</option>
        <option value="metDataset">Metabolomic Annotation Dataset</option>
        <validator type="no_options" message="You must select at least one option." />
      </param>
      <when value="geneDataset">
        <param name="geneAnnot" type="data" format="tabular" label="Select the Gene Expression Annotation dataset from your History"/>
        <param name="geneUniqId" type="text" size="30" value="" label="Gene Unique FeatureID" help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs."/>
        <param name="geneName" type="text" size="30" value="" label="Gene Symbol" help="Name of the column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs."/>
      </when>
      <when value="metDataset">
        <param name="metAnnot" type="data" format="tabular" label="Select the Metabolomic Annotation dataset from your History" />
        <param name="metUniqId" type="text" size="30" value="" label="Metabolite Unique FeatureID" help="Name of the column in your Metabolomic Annotation dataset that contains the unique FeatureIDs."/>
        <param name="metName" type="text" size="30" value="" label="Metabolite Names" help="Name of the column in your Metabolomic Annotation dataset that has metabolite names to use for linking to KEGGIDs."/>
      </when>
      <when value="geneDataset,metDataset">
        <param name="geneAnnot" type="data" format="tabular" label="Select the Gene Expression Annotation dataset from your History"/>
        <param name="geneUniqId" type="text" size="30" value="" label="Gene Unique FeatureID" help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs."/>
        <param name="geneName" type="text" size="30" value="" label="Gene Symbol" help="Name of the column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs."/>
        <param name="metAnnot" type="data" format="tabular" label="Select the Metabolomic Annotation dataset from your History" />
        <param name="metUniqId" type="text" size="30" value="" label="Metabolite Unique FeatureID" help="Name of the column in your Metabolomic Annotation dataset that contains the unique FeatureIDs."/>
        <param name="metName" type="text" size="30" value="" label="Metabolite Names" help="Name of the column in your Metabolomic Annotation dataset that has metabolite names to use for linking to KEGGIDs."/>
      </when>
    </conditional>
  </inputs>
  <outputs>
    <!-- Gene link table: kept for the gene-only and the combined selection. -->
    <data name="geneOutput" format="tabular" label="${tool.name} on ${on_string}: Gene to KEGGID link">
      <filter>dataSets['whichDataSet'] in ('geneDataset', 'geneDataset,metDataset')</filter>
    </data>
    <!-- Metabolite link table: kept for the metabolite-only and the combined selection. -->
    <data name="metOutput" format="tabular" label="${tool.name} on ${on_string}: Metabolite to KEGGID link">
      <filter>dataSets['whichDataSet'] in ('metDataset', 'geneDataset,metDataset')</filter>
    </data>
  </outputs>
  <tests>
    <!-- Exercises the default "geneDataset,metDataset" selection: whichDataSet
         is not set here, so the option marked selected="true" applies and both
         outputs are produced. Expected files are declared with <output> so
         that they are compared against the tool results; declaring them as
         <param> only set input values and verified nothing. -->
    <test>
      <param name="species" value="rno"/>
      <param name="geneAnnot" value="ensembl2symbol_annotation_file_01fhl.tsv"/>
      <param name="geneUniqId" value="UniqueID"/>
      <param name="geneName" value="GeneSymbol"/>
      <param name="metAnnot" value="metabolite_annotation_file_01fhl.tsv"/>
      <param name="metUniqId" value="UniqueID"/>
      <param name="metName" value="MetName"/>
      <output name="geneOutput" file="gene_link_kegg_annotation_file_01fhl.tsv"/>
      <output name="metOutput" file="metabolite_link_kegg_annotation_file_01fhl.tsv"/>
    </test>
  </tests>
  <help><![CDATA[

**Tool Description**

  This tool takes an annotation dataset containing metabolite compound names or gene symbols and links them to identifiers in KEGG (KEGGIDs) 
  creating either a (a) Gene to KEGGID Link or  a (b) Metabolite to KEGGID Link dataset. For gene expression data, the tool is designed to 
  take the output from the 'Map ENSEMBLIDs to Gene Symbols' tool as input. If your input dataset contains a Selected column, the tool will 
  link GeneSymbols to KEGGIDs where Selected = 'Yes'. Input Files without a Selected column must have a column containing unique FeatureIDs. 
  This tool takes an annotation dataset containing unique FeatureIDs, ENSEMBLIDs (for gene expression data) and GeneSymbol/MetaboliteName 
  and adds the following columns:  1) Name_in_KEGG, the name found in KEGG, 2) Matched, a column indicating whether a match was found in KEGG, 
  3) KEGGID, the KEGG identifier for the Match, 4) Score, a similarity score representing match similarity (calculated using the Python 
  class SequenceMatcher from difflib) and 5) a Tie column to indicate if a gene symbol or metabolite name matched more than one KEGGID.

  User-specified metabolite names are linked to KEGGIDs by identifying the best match using the following procedure.  Common metabolite prefixes 
  are removed (cis-, trans-, d- , l- , (s)-, alpha-, beta-, alpha, beta, alpha-d-, beta-d-, alpha-l-, beta-l-, l-beta-, l-alpha-, d-beta-, d-alpha-).  
  If the metabolite name given is an acid, then the name is modified to the conjugate base by replacing  “ic acid”, “icacid” or “ic_acid” with  “ate”.  
  If amino acids are given in 1-letter or 3-letter abbreviations, names are modified to the full amino acid name.  The following commonly used lipid 
  abbreviations are modified to reflect the full names (SM = sphingomyelin, lysopc  = lysophosphatidylcholine, PC = phosphatidylcholine, 
  PE = phosphatidylethanolamine and LysoPE = lysophosphatidylethanolamine).  Similarly, abbreviations for other commonly assayed metabolites are 
  modified to reflect the full names (cit = citrate, orn = ornithine, thyr = thyroxine and boc = butoxycarbonyl). The code allows the addition of 
  more synonyms. The user-specified metabolite names are retained in the output dataset for comparisons with the KEGG database.
 
  Each parsed metabolite name is compared to metabolite names in KEGG. The best match in KEGG based on similarity score is returned.  The similarity 
  score (Score column) is based on the longest contiguous matching subsequence that does not contain 'junk' elements where 'junk' elements are defined 
  as duplicates making up more than 1% of a sequence with minimum length of 200 (python SequenceMatcher class from difflib).

  Selected = Yes for the match with the highest similarity score.  

  For metabolite names where the best match is tied with at least one other compound in KEGG, all matches are returned. A tie is determined as follows:  
  if the Score is greater than 95% for 2 or more matches to the metabolite name then:

  1) the Tie column = 'Yes' and a warning message will appear
  2) the Selected column is set based on the alphabetical order of the Name_in_KEGG column. Note that the user-specified FeatureID and MetaboliteName may not be unique in the resulting output dataset.

--------------------------------------------------------------------------------

**INPUT**

**Annotation File**

  +-------------+--------------+-----+
  | FeatureID   | Name         | ... |
  +=============+==============+=====+
  | FeatureID_1 | one          | ... |
  +-------------+--------------+-----+
  | FeatureID_2 | two          | ... |
  +-------------+--------------+-----+
  | FeatureID_3 | three        | ... |
  +-------------+--------------+-----+
  | FeatureID_4 | four         | ... |
  +-------------+--------------+-----+
  | ...         | ...          | ... |
  +-------------+--------------+-----+

    **NOTE:** This dataset must contain at least two columns, a column of FeatureIDs and a column containing names (e.g. gene symbol or compound names) to use for linking to KEGGIDs. Other columns may be present in the dataset. The user can use a Gene Expression Annotation dataset, a Metabolomic Annotation dataset or both.

**Unique FeatureID**

  Name of the column in your gene expression or metabolomic Annotation dataset that contains the Unique FeatureIDs.

**Gene Symbol or Metabolite Names**

  Name of the column in your gene expression or metabolomic Annotation dataset with the names to use for matching to KEGGIDs.

--------------------------------------------------------------------------------

**OUTPUT**

For each input Annotation file, a TSV file containing the following columns is generated:

  (1) **unique FeatureID:**  column from the Annotation dataset containing the unique FeatureIDs.
  (2) **Name:**  column from Annotation dataset used for matching in KEGG.
  (3) **Feature_Type:**  column indicating whether matching was for metabolites or genes.
  (4) **Matched:**  column indicating whether a match in KEGG was found.  Yes/No
  (5) **Name_in_KEGG:**  column containing the KEGG name for the match.
  (6) **KEGGID:**  column containing the KEGG identifier for the match.
  (7) **Similarity:**  value indicating the similarity between the given feature and the match in KEGG.  Ranges from 0 to 1.
  (8) **Tie:**  in cases where multiple matches are found for a given feature, Tie = yes if the similarity is greater than 95%. 
  (9) **Selected:**  for features with multiple matches and different similarity scores, the 'Selected' column = yes for the match with the highest similarity score. For features with multiple matches and the same similarity score, the 'Selected' column = yes based on the alphabetical order of the returned match.


**Example Metabolite to KEGGID Link Table**

  +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+
  | FeatureID   | Name       | Feature_Type | Matched | Name_in_KEGG | KEGG_ID  | Similarity | Tie | Selected |
  +=============+============+==============+=========+==============+==========+============+=====+==========+
  | FeatureID_1 | one        | Metabolite   | Yes     | one*         | cpd:...  | 1.0        | No  | Yes      |
  +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+
  | FeatureID_2 | two        | Metabolite   | Yes     | two*         | cpd:...  | 1.0        | No  | Yes      |
  +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+
  | FeatureID_3 | two        | Metabolite   | Yes     | three*       | cpd:...  | 0.87       | No  | No       |
  +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+
  | FeatureID_4 | four       | Metabolite   | No      | NA           | NA       | NA         | NA  | NA       |
  +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+
  | ...         | ...        | ...          | ...     | ...          | ...      | ...        | ... | ...      |
  +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+

    **NOTE:** Warning messages appear in cases of a Tie (greater than 95% similarity). Please check these results carefully.

  ]]>
  </help>
  <citations>
    <!-- Every entry is BibTeX and is declared as such with type="bibtex".
         BibTeX separates author names with "and", not commas. -->
    <citation type="bibtex">@ARTICLE{Kirpich17secimtools,
    author = {Alexander S. Kirpich and Miguel Ibarra and Oleksandr Moskalenko and Justin M. Fear and Joseph Gerken and Xinlei Mi and Ali Ashrafi and Alison M. Morse and Lauren M. McIntyre},
    title = {SECIMTools: A suite of Metabolomics Data Analysis Tools},
    journal = {BMC Bioinformatics},
    year = {in press}
    }</citation>
    <citation type="bibtex">
    @article{garcia2010paintomics,
    title={Paintomics: a web based tool for the joint visualization of transcriptomics and metabolomics data},
    author={Garc{\'\i}a-Alcalde, Fernando and Garc{\'\i}a-L{\'o}pez, Federico and Dopazo, Joaqu{\'\i}n and Conesa, Ana},
    journal={Bioinformatics},
    volume={27},
    number={1},
    pages={137--139},
    year={2010},
    publisher={Oxford University Press}
    }</citation>
    <citation type="bibtex">@article{wu2014mygene,
    title={MyGene. info: gene annotation query as a service},
    author={Wu, Chunlei and Mark, Adam and Su, Andrew I},
    journal={bioRxiv},
    pages={009332},
    year={2014},
    publisher={Cold Spring Harbor Laboratory}
    }</citation>
  </citations>
</tool>