view add_kegg_anno_info.xml @ 2:2c218a253d56 draft default tip

"planemo upload for repository https://github.com/secimTools/gait-gm/tree/main/galaxy commit 758394addb95b09e794132a23a1f7e95ba39df0b"
author malex
date Thu, 29 Jul 2021 20:48:10 +0000
parents ec9ee8edb84d
children
line wrap: on
line source

<tool id="secimtools_add_kegg_anno_info" name="Link Name to KEGGID" version="@WRAPPER_VERSION@">
  <!-- No short description is shown next to the tool name. -->
  <description></description>
  <!-- Shared definitions are pulled from macros.xml (not shown here); presumably it
       defines the "requirements" macro expanded below and the @WRAPPER_VERSION@
       token used in the tool version - TODO confirm against macros.xml. -->
  <macros>
      <import>macros.xml</import>
  </macros>
  <expand macro="requirements" />
  <!-- Exit code 1 from the script is reported as a fatal error with the
       description "Repeated Unique IDs". -->
  <stdio>
    <exit_code range="1" level="fatal" description="Repeated Unique IDs"/>
  </stdio>
  <command detect_errors="exit_code"><![CDATA[
  ## Builds the add_kegg_anno_info.py call for the selected annotation dataset(s).
  ## Every interpolated value is single-quoted so that column names or paths
  ## containing spaces reach the script as one argument.
  ## The selector value is "geneDataset", "metDataset" or "geneDataset,metDataset";
  ## splitting on the comma yields the list of chosen dataset kinds.
  #set $chosen = str($dataSets.whichDataSet).split(',')
  add_kegg_anno_info.py
    -s='$species'
    #if 'geneDataset' in $chosen:
      -ga='$dataSets.geneAnnot'
      -gid='$dataSets.geneUniqId'
      -gn='$dataSets.geneName'
    #end if
    #if 'metDataset' in $chosen:
      -ma='$dataSets.metAnnot'
      -mid='$dataSets.metUniqId'
      -mn='$dataSets.metName'
    #end if
    ## NOTE(review): both output options are always passed, even when one of the
    ## outputs is filtered out for the current selection - confirm the script
    ## tolerates that before making them conditional.
    -go='$geneOutput'
    -mo='$metOutput'
  ]]></command>
  <inputs>
    <!-- KEGG organism code passed to the script via -s. -->
    <param name="species" type="select" label="Select Species from the list" >
     <option value="hsa">Homo sapiens</option>
     <option value="mmu">Mus musculus</option>
     <option value="rno">Rattus norvegicus</option>
     <option value="dme">Drosophila melanogaster</option>
     <option value="ath">Arabidopsis thaliana</option>
     <option value="sce">Saccharomyces cerevisiae</option>
     <option value="eco">Escherichia coli</option>
     <option value="cel">Caenorhabditis elegans</option>
    </param>
    <!-- Chooses which annotation dataset(s) to link; each case exposes the dataset
         plus the names of its FeatureID column and its name/symbol column.
         Column names are required: an empty value would otherwise be passed to
         the script as an empty option value. -->
    <conditional name="dataSets">
      <param name="whichDataSet" type="select" display="radio" label="Select Annotation Dataset(s)">
        <option value="geneDataset,metDataset" selected="true">Gene Expression + Metabolomic Annotation Datasets</option>
        <option value="geneDataset">Gene Expression Annotation Dataset</option>
        <option value="metDataset">Metabolomic Annotation Dataset</option>
        <validator type="no_options" message="You must select at least one option." />
      </param>
      <when value="geneDataset">
        <param name="geneAnnot" type="data" format="tabular" label="Select the Gene Expression Annotation dataset from your History"/>
        <param name="geneUniqId" type="text" size="30" value="" label="Gene Unique FeatureID" help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs.">
          <validator type="empty_field" message="Enter the name of the unique FeatureID column."/>
        </param>
        <param name="geneName" type="text" size="30" value="" label="Gene Symbol" help="Name of the column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs.">
          <validator type="empty_field" message="Enter the name of the Gene Symbol column."/>
        </param>
      </when>
      <when value="metDataset">
        <param name="metAnnot" type="data" format="tabular" label="Select the Metabolomic Annotation dataset from your History" />
        <param name="metUniqId" type="text" size="30" value="" label="Metabolite Unique FeatureID" help="Name of the column in your Metabolomic Annotation dataset that contains the unique FeatureIDs.">
          <validator type="empty_field" message="Enter the name of the unique FeatureID column."/>
        </param>
        <param name="metName" type="text" size="30" value="" label="Metabolite Names" help="Name of the column in your Metabolomic Annotation dataset that has metabolite names to use for linking to KEGGIDs.">
          <validator type="empty_field" message="Enter the name of the metabolite name column."/>
        </param>
      </when>
      <when value="geneDataset,metDataset">
        <param name="geneAnnot" type="data" format="tabular" label="Select the Gene Expression Annotation dataset from your History"/>
        <param name="geneUniqId" type="text" size="30" value="" label="Gene Unique FeatureID" help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs.">
          <validator type="empty_field" message="Enter the name of the unique FeatureID column."/>
        </param>
        <param name="geneName" type="text" size="30" value="" label="Gene Symbol" help="Name of the Column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs.">
          <validator type="empty_field" message="Enter the name of the Gene Symbol column."/>
        </param>
        <param name="metAnnot" type="data" format="tabular" label="Select the Metabolomic Annotation dataset from your History" />
        <param name="metUniqId" type="text" size="30" value="" label="Metabolite Unique FeatureID" help="Name of the Column in your Metabolomic Annotation dataset that contains the unique FeatureIDs.">
          <validator type="empty_field" message="Enter the name of the unique FeatureID column."/>
        </param>
        <param name="metName" type="text" size="30" value="" label="Metabolite Names" help="Name of the Column in your Metabolomic Annotation dataset that has metabolite names to use for linking to KEGGIDs.">
          <validator type="empty_field" message="Enter the name of the metabolite name column."/>
        </param>
      </when>
    </conditional>
  </inputs>
  <outputs>
    <!-- Gene link table: kept only when a gene annotation dataset was selected
         (alone or together with the metabolomic one). -->
    <data name="geneOutput" format="tabular" label="${tool.name} on ${on_string}: Gene to KEGGID link">
      <filter>dataSets['whichDataSet'] in ('geneDataset', 'geneDataset,metDataset')</filter>
    </data>
    <!-- Metabolite link table: kept only when a metabolomic annotation dataset
         was selected (alone or together with the gene one). -->
    <data name="metOutput" format="tabular" label="${tool.name} on ${on_string}: Metabolite to KEGGID link">
      <filter>dataSets['whichDataSet'] in ('metDataset', 'geneDataset,metDataset')</filter>
    </data>
  </outputs>
  <tests>
    <!-- Single test for Rattus norvegicus (rno) supplying both a gene and a
         metabolite annotation file. No whichDataSet value is given, so the test
         relies on the selector's default option (gene + metabolomic datasets). -->
    <test>
      <param name="species" value="rno"/>
      <param name="geneAnnot" value="ensembl2symbol_annotation.tsv"/>
      <param name="geneUniqId" value="UniqueID"/>
      <param name="geneName" value="GeneName"/>
      <param name="metAnnot" value="metabolite_annotation.tsv"/>
      <param name="metUniqId" value="UniqueID"/>
      <param name="metName" value="MetName"/>
      <!-- NOTE(review): the very large lines_diff tolerances make these comparisons
           extremely loose; presumably this is because the KEGG lookups change over
           time - confirm, and consider a content-based assertion instead. -->
      <output name="geneOutput" value="gene_to_keggId_link.tsv" compare="diff" lines_diff="100000"/>
      <output name="metOutput" value="metabolite_to_keggId_link.tsv" compare="diff" lines_diff="10000"/>
    </test>
  </tests>
  <help><![CDATA[

**Tool Description**

This tool takes an annotation dataset containing metabolite compound names or gene symbols and links them to identifiers in KEGG (KEGGIDs)
creating either:
(a) a Gene to KEGGID Link or
(b) a Metabolite to KEGGID Link dataset.

For gene expression data, the tool is designed to
take the output from the 'Map ENSEMBLIDs to Gene Symbols' tool as input. If your input dataset
contains a Selected column, the tool will
link GeneSymbols to KEGGIDs where Selected = 'Yes'. Input Files without a Selected column must
have a column containing unique FeatureIDs.
This tool takes an annotation dataset containing unique FeatureIDs, ENSEMBLIDs (for gene
expression data) and GeneSymbol/MetaboliteName
and adds the following columns:

  1) Name_in_KEGG, the name found in KEGG
  2) Matched, a column indicating whether a match was found in KEGG,
  3) KEGGID, the KEGG identifier for the Match
  4) Score, a similarity score representing match similarity (calculated using the Python SequenceMatcher class from difflib)
  5) a Tie column to indicate if a gene symbol or metabolite name matched more than one KEGGID.

  User-specified metabolite names are linked to KEGGIDs by identifying the best match using the following procedure:  Common metabolite prefixes
  are removed (cis-, trans-, d- , l- , (s)-, alpha-, beta-, alpha, beta, alpha-d-, beta-d-, alpha-l-, beta-l-, l-beta-, l-alpha-, d-beta-, d-alpha-).
  If the metabolite name given is an acid, then the name is modified to the conjugate base by replacing  “ic acid”, “icacid” or “ic_acid” with  “ate”.
  If amino acids are given in 1-letter or 3-letter abbreviations, names are modified to the full amino acid name.  The following commonly used lipid
  abbreviations are modified to reflect the full names (SM = sphingomyelin, lysopc  = lysophosphatidylcholine, PC = phosphatidylcholine,
  PE = phosphatidylethanolamine and LysoPE = lysophosphatidylethanolamine).  Similarly, abbreviations for other commonly assayed metabolites are
  modified to reflect the full names (cit = citrate, orn = ornithine, thyr = thyroxine and boc = butoxycarbonyl). The code allows the addition of
  more synonyms. The user-specified metabolite names are retained in the output dataset for comparisons with the KEGG database.

  Each parsed metabolite name is compared to metabolite names in KEGG. The best match in KEGG based on similarity score is returned.  The similarity
  score (Score column) is based on the longest contiguous matching subsequence that does not contain 'junk' elements, where 'junk' elements are defined
  as duplicates making up more than 1% of a sequence with minimum length of 200 (python SequenceMatcher class from difflib).

  Selected = Yes for the match with the highest similarity score.

  For metabolite names where the best match is tied with at least one other compound in KEGG, all matches are returned. A tie is determined as follows:
  if the Score is greater than 95% for 2 or more matches in the metabolite name then:
  1) the Tie column = 'Yes' and a warning message will appear
  2) the Selected column is sorted alphabetically on the Name_in_KEGG column. Note that the user-specified FeatureID and MetaboliteName may not be unique in the resulting output dataset.

--------------------------------------------------------------------------------

**INPUT**

**Annotation File**

  +-------------+--------------+-----+
  | FeatureID   | Name         | ... |
  +=============+==============+=====+
  | FeatureID_1 | one          | ... |
  +-------------+--------------+-----+
  | FeatureID_2 | two          | ... |
  +-------------+--------------+-----+
  | FeatureID_3 | three        | ... |
  +-------------+--------------+-----+
  | FeatureID_4 | four         | ... |
  +-------------+--------------+-----+
  | ...         | ...          | ... |
  +-------------+--------------+-----+

    **NOTE:** This dataset must contain at least two columns, a column of FeatureIDs and a column containing names (e.g. gene symbol or compound names) to use for linking to KEGGIDs. Other columns may be present in the dataset. The user can use a Gene Expression Annotation dataset, a Metabolomic Annotation dataset or both.

**Unique FeatureID**

Name of the column in your gene expression or metabolomic Annotation dataset that contains the Unique FeatureIDs.

**Gene Symbol or Metabolite Names**

Name of the column in your gene expression or metabolomic Annotation dataset with the names to use for matching to KEGGIDs.

--------------------------------------------------------------------------------

**OUTPUT**

For each input Annotation file, a TSV file containing the following columns is generated:

  (1) **unique FeatureID:**  column from the Annotation dataset containing the unique FeatureIDs.
  (2) **Name:**  column from Annotation dataset used for matching in KEGG.
  (3) **Feature_Type:**  column indicating whether matching was for metabolites or genes.
  (4) **Matched:**  column indicating whether a match in KEGG was found.  Yes/No
  (5) **Name_in_KEGG:**  column containing the KEGG name for the match.
  (6) **KEGGID:**  column containing the KEGG identifier for the match.
  (7) **Similarity:**  value indicating the similarity between the given feature and the match in KEGG.  Ranges from 0 to 1.
  (8) **Tie:**  in cases where multiple matches are found for a given feature, Tie = yes if the similarity is greater than 95%.
  (9) **Selected:**  for features with multiple matches and different similarity scores, the 'Selected' column = yes for the match with the highest similarity score. For features with multiple matches and the same similarity score, the 'Selected' column = yes based on the alphabetical order of the returned match.


**Example Metabolite to KEGGID Link Table**

  +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+
  | FeatureID   | Name       | Feature_Type | Matched | Name_in_KEGG | KEGG_ID  | Similarity | Tie | Selected |
  +=============+============+==============+=========+==============+==========+============+=====+==========+
  | FeatureID_1 | one        | Metabolite   | Yes     | one*         | cpd:...  | 1.0        | No  | Yes      |
  +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+
  | FeatureID_2 | two        | Metabolite   | Yes     | two*         | cpd:...  | 1.0        | No  | Yes      |
  +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+
  | FeatureID_3 | two        | Metabolite   | Yes     | three*       | cpd:...  | 0.87       | No  | No       |
  +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+
  | FeatureID_4 | four       | Metabolite   | No      | NA           | NA       | NA         | NA  | NA       |
  +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+
  | ...         | ...        | ...          | ...     | ...          | ...      | ...        | ... | ...      |
  +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+

    **NOTE:** Warning messages appear in cases of a Tie (greater than 95% similarity). Please check these results carefully.

  ]]>
  </help>
  <!-- BibTeX citations for the tool suite. Authors are separated with " and ",
       as BibTeX requires; a comma-separated list is read as one malformed name. -->
  <citations>
    <citation type="bibtex">@ARTICLE{Kirpich17secimtools,
    author = {Alexander S. Kirpich and Miguel Ibarra and Oleksandr Moskalenko and Justin M. Fear and Joseph Gerken and Xinlei Mi and Ali Ashrafi and Alison M. Morse and Lauren M. McIntyre},
    title = {SECIMTools: A suite of Metabolomics Data Analysis Tools},
    journal = {BMC Bioinformatics},
    year = {2018}
    }</citation>
    <citation type="bibtex">@article{Mor2021GaitGM,
    title={GAIT-GM integrative cross-omics analyses reveal cholinergic defects in a C. elegans model of Parkinson's disease},
    author={Mor, DE and Huertas, F and Morse, AM and Kaletsky, R and Murphy, CT and Kalia, V and Miller, GW and Moskalenko, O and Conesa, A and McIntyre, LM},
    journal={BMC Genomics},
    year={submitted},
    }</citation>
  </citations>
</tool>