gait_gm: add_kegg_anno_info.xml comparison

comparison add_kegg_anno_info.xml @ 1:ec9ee8edb84d draft

Initial upload of 21.6.10 release.

author	malex
date	Fri, 18 Jun 2021 20:23:19 +0000
parents
children	2c218a253d56

comparison

equal deleted inserted replaced

-:864fc6430432
+:ec9ee8edb84d
+<tool id="secimtools_add_kegg_anno_info" name="Link Name to KEGGID" version="@WRAPPER_VERSION@">
+<description></description>
+<!-- Shared definitions are imported from macros.xml; the "requirements" macro expanded below is presumably defined there (verify against macros.xml). -->
+<macros>
+<import>macros.xml</import>
+</macros>
+<expand macro="requirements" />
+<!-- An exit code of 1 is treated as a fatal error and reported with the description "Repeated Unique IDs". -->
+<stdio>
+<exit_code range="1" level="fatal" description="Repeated Unique IDs"/>
+</stdio>
+<!-- Builds the add_kegg_anno_info.py command line.
+     Gene arguments (-ga/-gid/-gn) are emitted for the gene-only and the combined selection,
+     metabolite arguments (-ma/-mid/-mn) for the metabolite-only and the combined selection;
+     gene arguments always come first, as before.
+     Every substituted value is single-quoted so that dataset paths and column names containing
+     spaces or shell metacharacters reach the script as exactly one argument each. -->
+<command detect_errors="exit_code"><![CDATA[
+#set $selection = str($dataSets.whichDataSet)
+add_kegg_anno_info.py
+-s='$species'
+#if $selection in ("geneDataset", "geneDataset,metDataset"):
+-ga='$dataSets.geneAnnot'
+-gid='$dataSets.geneUniqId'
+-gn='$dataSets.geneName'
+#end if
+#if $selection in ("metDataset", "geneDataset,metDataset"):
+-ma='$dataSets.metAnnot'
+-mid='$dataSets.metUniqId'
+-mn='$dataSets.metName'
+#end if
+-go='$geneOutput'
+-mo='$metOutput'
+]]></command>
+<inputs>
+<!-- Species selector; the chosen option value (e.g. "hsa") is what the command passes to the script via -s. -->
+<param name="species" type="select" label="Select Species from the list" >
+<option value="hsa">Homo sapiens</option>
+<option value="mmu">Mus musculus</option>
+<option value="rno">Rattus norvegicus</option>
+<option value="dme">Drosophila melanogaster</option>
+<option value="ath">Arabidopsis thaliana</option>
+<option value="sce">Saccharomyces cerevisiae</option>
+<option value="eco">Escherichia coli</option>
+</param>
+<!-- whichDataSet decides which annotation inputs are requested: gene only, metabolite only, or both (the default).
+     The combined branch repeats the parameters of the two single-dataset branches under the same names. -->
+<conditional name="dataSets">
+<param name="whichDataSet" type="select" display="radio" label="Select Annotation Dataset(s)">
+<option value="geneDataset,metDataset" selected="true">Gene Expression + Metabolomic Annotation Datasets</option>
+<option value="geneDataset">Gene Expression Annotation Dataset</option>
+<option value="metDataset">Metabolomic Annotation Dataset</option>
+<validator type="no_options" message="You must select at least one option." />
+</param>
+<!-- Gene-only branch: annotation dataset plus the names of its unique-ID and gene-symbol columns. -->
+<when value="geneDataset">
+<param name="geneAnnot" type="data" format="tabular" label="Select the Gene Expression Annotation dataset from your History"/>
+<param name="geneUniqId" type="text" size="30" value="" label="Gene Unique FeatureID" help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs."/>
+<param name="geneName" type="text" size="30" value="" label="Gene Symbol" help="Name of the column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs."/>
+</when>
+<!-- Metabolite-only branch: annotation dataset plus the names of its unique-ID and metabolite-name columns. -->
+<when value="metDataset">
+<param name="metAnnot" type="data" format="tabular" label="Select the Metabolomic Annotation dataset from your History" />
+<param name="metUniqId" type="text" size="30" value="" label="Metabolite Unique FeatureID" help="Name of the column in your Metabolomic Annotation dataset that contains the unique FeatureIDs."/>
+<param name="metName" type="text" size="30" value="" label="Metabolite Names" help="Name of the column in your Metabolomic Annotation dataset that has metabolite names to use for linking to KEGGIDs."/>
+</when>
+<!-- Combined branch: all six gene and metabolite parameters. -->
+<when value="geneDataset,metDataset">
+<param name="geneAnnot" type="data" format="tabular" label="Select the Gene Expression Annotation dataset from your History"/>
+<param name="geneUniqId" type="text" size="30" value="" label="Gene Unique FeatureID" help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs."/>
+<param name="geneName" type="text" size="30" value="" label="Gene Symbol" help="Name of the Column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs."/>
+<param name="metAnnot" type="data" format="tabular" label="Select the Metabolomic Annotation dataset from your History" />
+<param name="metUniqId" type="text" size="30" value="" label="Metabolite Unique FeatureID" help="Name of the Column in your Metabolomic Annotation dataset that contains the unique FeatureIDs."/>
+<param name="metName" type="text" size="30" value="" label="Metabolite Names" help="Name of the Column in your Metabolomic Annotation dataset that has metabolite names to use for linking to KEGGIDs."/>
+</when>
+</conditional>
+</inputs>
+<outputs>
+<!-- Each output is kept only when its dataset type was selected:
+     geneOutput for the gene-only or combined choice, metOutput for the metabolite-only or combined choice. -->
+<data format="tabular" name="geneOutput" label="${tool.name} on ${on_string}: Gene to KEGGID link">
+<filter>(dataSets['whichDataSet'] == 'geneDataset') or (dataSets['whichDataSet'] == 'geneDataset,metDataset')</filter>
+</data>
+<data format="tabular" name="metOutput" label="${tool.name} on ${on_string}: Metabolite to KEGGID link">
+<filter>(dataSets['whichDataSet'] == 'metDataset') or (dataSets['whichDataSet'] == 'geneDataset,metDataset')</filter>
+</data>
+</outputs>
+<tests>
+<!-- Combined gene + metabolite run on the rat (rno) fixtures.
+     Expected results are declared with <output ... file="..."/> so that both generated datasets are
+     actually compared against the stored files; declaring them as <param> made them unknown inputs
+     and left the outputs unchecked. The dataset choice is set explicitly and the conditional's
+     parameters are nested under <conditional name="dataSets"> to match the <inputs> structure. -->
+<test>
+<param name="species" value="rno"/>
+<conditional name="dataSets">
+<param name="whichDataSet" value="geneDataset,metDataset"/>
+<param name="geneAnnot" value="ensembl2symbol_annotation_file_01fhl.tsv"/>
+<param name="geneUniqId" value="UniqueID"/>
+<param name="geneName" value="GeneSymbol"/>
+<param name="metAnnot" value="metabolite_annotation_file_01fhl.tsv"/>
+<param name="metUniqId" value="UniqueID"/>
+<param name="metName" value="MetName"/>
+</conditional>
+<output name="geneOutput" file="gene_link_kegg_annotation_file_01fhl.tsv"/>
+<output name="metOutput" file="metabolite_link_kegg_annotation_file_01fhl.tsv"/>
+</test>
+</tests>
+<help><![CDATA[
+**Tool Description**
+This tool takes an annotation dataset containing metabolite compound names or gene symbols and links them to identifiers in KEGG (KEGGIDs)
+creating either a (a) Gene to KEGGID Link or  a (b) Metabolite to KEGGID Link dataset. For gene expression data, the tool is designed to
+take the output from the 'Map ENSEMBLIDs to Gene Symbols' tool as input. If your input dataset contains a Selected column, the tool will
+link GeneSymbols to KEGGIDs where Selected = 'Yes'. Input Files without a Selected column must have a column containing unique FeatureIDs.
+This tool takes an annotation dataset containing unique FeatureIDs, ENSEMBLIDs (for gene expression data) and GeneSymbol/MetaboliteName
+and adds the following columns:  1) Name_in_KEGG, the name found in KEGG, 2) Matched, a column indicating whether a match was found in KEGG,
+3) KEGGID, the KEGG identifier for the Match, 4) Score, a similarity score representing match similarity (calculated using the Python
+SequenceMatcher class from difflib) and 5) a Tie column to indicate if a gene symbol or metabolite name matched more than one KEGGID.
+User-specified metabolite names are linked to KEGGIDs by identifying the best match using the following procedure.  Common metabolite prefixes
+are removed (cis-, trans-, d- , l- , (s)-, alpha-, beta-, alpha, beta, alpha-d-, beta-d-, alpha-l-, beta-l-, l-beta-, l-alpha-, d-beta-, d-alpha-).
+If the metabolite name given is an acid, then the name is modified to the conjugate base by replacing  “ic acid”, “icacid” or “ic_acid” with  “ate”.
+If amino acids are given in 1-letter or 3-letter abbreviations, names are modified to the full amino acid name.  The following commonly used lipid
+abbreviations are modified to reflect the full names (SM = sphingomyelin, lysopc  = lysophosphatidylcholine, PC = phosphatidylcholine,
+PE = phosphatidylethanolamine and LysoPE = lysophosphatidylethanolamine).  Similarly, abbreviations for other commonly assayed metabolites are
+modified to reflect the full names (cit = citrate, orn = ornithine, thyr = thyroxine and boc = butoxycarbonyl). The code allows the addition of
+more synonyms. The user-specified metabolite names are retained in the output dataset for comparisons with the KEGG database.
+Each parsed metabolite name is compared to metabolite names in KEGG. The best match in KEGG based on similarity score is returned.  The similarity
+score (Score column) is based on the longest contiguous matching subsequence that does not contain 'junk' elements where 'junk' elements are defined
+as duplicates making up more than 1% of a sequence with minimum length of 200 (python SequenceMatcher class from difflib).
+Selected = Yes for the match with the highest similarity score.
+For metabolite names where the best match is tied with at least one other compound in KEGG, all matches are returned. A tie is determined as follows:
+if the Score is greater than 95% for 2 or more matches in the metabolite name then:
+1) the Tie column = 'Yes' and a warning message will appear
+2) the Selected column is sorted alphabetically on the Name_in_KEGG column. Note that the user-specified FeatureID and MetaboliteName may not be unique in the resulting output dataset.
+--------------------------------------------------------------------------------
+**INPUT**
+**Annotation File**
++-------------+--------------+-----+
+| FeatureID   | Name         | ... |
++=============+==============+=====+
+| FeatureID_1 | one          | ... |
++-------------+--------------+-----+
+| FeatureID_2 | two          | ... |
++-------------+--------------+-----+
+| FeatureID_3 | three        | ... |
++-------------+--------------+-----+
+| FeatureID_4 | four         | ... |
++-------------+--------------+-----+
+| ...         | ...          | ... |
++-------------+--------------+-----+
+**NOTE:** This dataset must contain at least two columns, a column of FeatureIDs and a column containing names (e.g. gene symbol or compound names) to use for linking to KEGGIDs. Other columns may be present in the dataset. The user can use a Gene Expression Annotation dataset, a Metabolomic Annotation dataset or both.
+**Unique FeatureID**
+Name of the column in your gene expression or metabolomic Annotation dataset that contains the Unique FeatureIDs.
+**Gene Symbol or Metabolite Names**
+Name of the column in your gene expression or metabolomic Annotation dataset with the names to use for matching to KEGGIDs.
+--------------------------------------------------------------------------------
+**OUTPUT**
+For each input Annotation file, a TSV file containing the following columns is generated:
+(1) **unique FeatureID:**  column from the Annotation dataset containing the unique FeatureIDs.
+(2) **Name:**  column from Annotation dataset used for matching in KEGG.
+(3) **Feature_Type:**  column indicating whether matching was for metabolites or genes.
+(4) **Matched:**  column indicating whether a match in KEGG was found.  Yes/No
+(5) **Name_in_KEGG:**  column containing the KEGG name for the match.
+(6) **KEGGID:**  column containing the KEGG identifier for the match.
+(7) **Similarity:**  value indicating the similarity between the given feature and the match in KEGG.  Ranges from 0 to 1.
+(8) **Tie:**  in cases where multiple matches are found for a given feature, Tie = yes if the similarity is greater than 95%.
+(9) **Selected:**  for features with multiple matches and different similarity scores, the 'Selected' column = yes for the match with the highest similarity score. For features with multiple matches and the same similarity score, the 'Selected' column = yes based on the alphabetical order of the returned match.
+**Example Metabolite to KEGGID Link Table**
++-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+
+| FeatureID   | Name       | Feature_Type | Matched | Name_in_KEGG | KEGG_ID  | Similarity | Tie | Selected |
++=============+============+==============+=========+==============+==========+============+=====+==========+
+| FeatureID_1 | one        | Metabolite   | Yes     | one*         | cpd:...  | 1.0        | No  | Yes      |
++-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+
+| FeatureID_2 | two        | Metabolite   | Yes     | two*         | cpd:...  | 1.0        | No  | Yes      |
++-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+
+| FeatureID_3 | two        | Metabolite   | Yes     | three*       | cpd:...  | 0.87       | No  | No       |
++-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+
+| FeatureID_4 | four       | Metabolite   | No      | NA           | NA       | NA         | NA  | NA       |
++-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+
+| ...         | ...        | ...          | ...     | ...          | ...      | ...        | ... | ...      |
++-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+
+**NOTE:** Warning messages appear in cases of a Tie (greater than 95% similarity). Please check these results carefully.
+]]>
+</help>
+<citations>
+<!-- All three entries are BibTeX records, so each <citation> declares type="bibtex".
+     Authors are separated with " and ", the only separator BibTeX recognises between names.
+     NOTE(review): the SECIMTools entry still carries year = {in press}; update it once the final
+     publication details are confirmed. -->
+<citation type="bibtex">@ARTICLE{Kirpich17secimtools,
+author = {Alexander S. Kirpich and Miguel Ibarra and Oleksandr Moskalenko and Justin M. Fear and Joseph Gerken and Xinlei Mi and Ali Ashrafi and Alison M. Morse and Lauren M. McIntyre},
+title = {SECIMTools: A suite of Metabolomics Data Analysis Tools},
+journal = {BMC Bioinformatics},
+year = {in press}
+}</citation>
+<citation type="bibtex">
+@article{garcia2010paintomics,
+title={Paintomics: a web based tool for the joint visualization of transcriptomics and metabolomics data},
+author={Garc{\'\i}a-Alcalde, Fernando and Garc{\'\i}a-L{\'o}pez, Federico and Dopazo, Joaqu{\'\i}n and Conesa, Ana},
+journal={Bioinformatics},
+volume={27},
+number={1},
+pages={137--139},
+year={2010},
+publisher={Oxford University Press}
+}</citation>
+<citation type="bibtex">@article{wu2014mygene,
+title={MyGene. info: gene annotation query as a service},
+author={Wu, Chunlei and Mark, Adam and Su, Andrew I},
+journal={bioRxiv},
+pages={009332},
+year={2014},
+publisher={Cold Spring Harbor Laboratory}
+}</citation>
+</citations>
+</tool>

Mercurial > repos > malex > gait_gm

comparison add_kegg_anno_info.xml @ 1:ec9ee8edb84d draft