Mercurial > repos > malex > gait_gm
diff add_kegg_anno_info.xml @ 1:ec9ee8edb84d draft
Initial upload of 21.6.10 release.
author | malex |
---|---|
date | Fri, 18 Jun 2021 20:23:19 +0000 |
parents | |
children | 2c218a253d56 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/add_kegg_anno_info.xml Fri Jun 18 20:23:19 2021 +0000 @@ -0,0 +1,219 @@ +<tool id="secimtools_add_kegg_anno_info" name="Link Name to KEGGID" version="@WRAPPER_VERSION@"> + <description></description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> + <stdio> + <exit_code range="1" level="fatal" description="Repeated Unique IDs"/> + </stdio> + <command detect_errors="exit_code"><![CDATA[ + add_kegg_anno_info.py + -s=$species + #if $dataSets.whichDataSet == "geneDataset": + -ga=$dataSets.geneAnnot + -gid=$dataSets.geneUniqId + -gn=$dataSets.geneName + #end if + #if $dataSets.whichDataSet == "metDataset": + -ma=$dataSets.metAnnot + -mid=$dataSets.metUniqId + -mn=$dataSets.metName + #end if + #if $dataSets.whichDataSet == "geneDataset,metDataset": + -ga=$dataSets.geneAnnot + -gid=$dataSets.geneUniqId + -gn=$dataSets.geneName + -ma=$dataSets.metAnnot + -mid=$dataSets.metUniqId + -mn=$dataSets.metName + #end if + -go=$geneOutput + -mo=$metOutput + ]]></command> + <inputs> + <param name="species" type="select" label="Select Species from the list" > + <option value="hsa">Homo sapiens</option> + <option value="mmu">Mus musculus</option> + <option value="rno">Rattus norvegicus</option> + <option value="dme">Drosophila melanogaster</option> + <option value="ath">Arabidopsis thaliana</option> + <option value="sce">Saccharomyces cerevisiae</option> + <option value="eco">Escherichia coli</option> + </param> + <conditional name="dataSets"> + <param name="whichDataSet" type="select" display="radio" label="Select Annotation Dataset(s)"> + <option value="geneDataset,metDataset" selected="true">Gene Expression + Metabolomic Annotation Datasets</option> + <option value="geneDataset">Gene Expression Annotation Dataset</option> + <option value="metDataset">Metabolomic Annotation Dataset</option> + <validator type="no_options" message="You must select at least one option." /> + </param> + <when value="geneDataset"> + <param name="geneAnnot" type="data" format="tabular" label="Select the Gene Expression Annotation dataset from your History"/> + <param name="geneUniqId" type="text" size="30" value="" label="Gene Unique FeatureID" help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs."/> + <param name="geneName" type="text" size="30" value="" label="Gene Symbol" help="Name of the column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs."/> + </when> + <when value="metDataset"> + <param name="metAnnot" type="data" format="tabular" label="Select the Metabolomic Annotation dataset from your History" /> + <param name="metUniqId" type="text" size="30" value="" label="Metabolite Unique FeatureID" help="Name of the column in your Metabolomic Annotation dataset that contains the unique FeatureIDs."/> + <param name="metName" type="text" size="30" value="" label="Metabolite Names" help="Name of the column in your Metabolomic Annotation dataset that has metabolite names to use for linking to KEGGIDs."/> + </when> + <when value="geneDataset,metDataset"> + <param name="geneAnnot" type="data" format="tabular" label="Select the Gene Expression Annotation dataset from your History"/> + <param name="geneUniqId" type="text" size="30" value="" label="Gene Unique FeatureID" help="Name of the column in your Gene Expression Annotation dataset that contains the unique FeatureIDs."/> + <param name="geneName" type="text" size="30" value="" label="Gene Symbol" help="Name of the Column in your Gene Expression Annotation dataset that contains Gene Symbols to use for linking to KEGGIDs."/> + <param name="metAnnot" type="data" format="tabular" label="Select the Metabolomic Annotation dataset from your History" /> + <param name="metUniqId" type="text" size="30" value="" label="Metabolite Unique FeatureID" help="Name of the Column in your Metabolomic Annotation dataset that contains the unique FeatureIDs."/> + <param name="metName" type="text" size="30" value="" label="Metabolite Names" help="Name of the Column in your Metabolomic Annotation dataset that has metabolite names to use for linking to KEGGIDs."/> + </when> + </conditional> + </inputs> + <outputs> + <data format="tabular" name="geneOutput" label="${tool.name} on ${on_string}: Gene to KEGGID link"> + <filter>(dataSets['whichDataSet'] == 'geneDataset') or (dataSets['whichDataSet'] == 'geneDataset,metDataset')</filter> + </data> + <data format="tabular" name="metOutput" label="${tool.name} on ${on_string}: Metabolite to KEGGID link"> + <filter>(dataSets['whichDataSet'] == 'metDataset') or (dataSets['whichDataSet'] == 'geneDataset,metDataset')</filter> + </data> + </outputs> + <tests> + <test> + <param name="species" value="rno"/> + <param name="geneAnnot" value="ensembl2symbol_annotation_file_01fhl.tsv"/> + <param name="geneUniqId" value="UniqueID"/> + <param name="geneName" value="GeneSymbol"/> + <param name="metAnnot" value="metabolite_annotation_file_01fhl.tsv"/> + <param name="metUniqId" value="UniqueID"/> + <param name="metName" value="MetName"/> + <param name="geneOutput" value="gene_link_kegg_annotation_file_01fhl.tsv"/> + <param name="metOutput" value="metabolite_link_kegg_annotation_file_01fhl.tsv"/> + </test> + </tests> + <help><![CDATA[ + +**Tool Description** + + This tool takes an annotation dataset containing metabolite compound names or gene symbols and links them to identifiers in KEGG (KEGGIDs) + creating either a (a) Gene to KEGGID Link or a (b) Metabolite to KEGGID Link dataset. For gene expression data, the tool is designed to + take the output from the 'Map ENSEMBLIDs to Gene Symbols' tool as input. If your input dataset contains a Selected column, the tool will + link GeneSymbols to KEGGIDs where Selected = 'Yes'. Input Files without a Selected column must have a column containing unique FeatureIDs. + This tool takes an annotation dataset containing unique FeatureIDs, ENSEMBLIDs (for gene expression data) and GeneSymbol/MetaboliteName + and adds the following columns: 1) Name_in_KEGG, the name found in KEGG, 2) Matched, a column indicating whether a match was found in KEGG, + 3) KEGGID, the KEGG identifier for the Match, 4) Score, a similarity score representing match similarity (caluclated using the python internal + function SequenceMatcher from difflib (check) and 5) a Tie column to indicate if a gene symbol or metabolite name matched more than one KEGGID. + + User-specified metabolite names are linked to KEGGIDs by identifying the best match using the following procedure. Common metabolite prefixes + are removed (cis-, trans-, d- , l- , (s)-, alpha-, beta-, alpha, beta, alpha-d-, beta-d-, alpha-l-, beta-l-, l-beta-, l-alpha-, d-beta-, d-alpha-). + If the metabolite name given is an acid, then the name is modified to the conjugate base by replacing “ic acid”, “icacid” or “ic_acid” with “ate”. + If amino acids are given in 1-letter or 3-letter abbreviations, names are modified to the full amino acid name. The following commonly used lipid + abbreviations are modified to reflect the full names (SM = sphingomyelin, lysopc = lysophosphatidylcholine, PC = phosphatidylcholine, + PE = phosphatidylethanolamine and LysoPE = lysophosphatidylethanolamine). Similarly, abbreviations for other commonly assayed metabolites are + modified to reflect the full names (cit = citrate, orn = ornithine, thyr = thyroxine and boc = butoxycarbonyl). The code allows the addition of + more synonyms. The user-specified metabolite names are retained in the output dataset for comparisons with the KEGG database. + + Each parsed metabolite name is compared to metabolite names in KEGG. The best match in KEGG based on similarity score is returned. The similarity + score (Score column) is based on the longest contiguous matching subsequence that does not contain 'junk' elements where 'junk' elements are defined + as duplicates making up more than 1% of a sequence with minimum length of 200 (python SequenceMatcher class from difflib) + + Selected = Yes for the match with the highest similarity score. + + For metabolite names where the best match is tied with at least one other compound in KEGG, all matches are returned. A tie is determined as follows: + if the Score is greater than 95% for 2 or more matches in the metabolite name then: + 1) the Tie column = 'Yes' and a warning message will appear + 2) the Selected column is sorted alphabetically on the Name_in_KEGG column. Note that the user-specified FeatureID and MetaboliteName may not be unique in the resulting output dataset. + +-------------------------------------------------------------------------------- + +**INPUT** + +**Annotation File** + + +-------------+--------------+-----+ + | FeatureID | Name | ... | + +=============+==============+=====+ + | FeatureID_1 | one | ... | + +-------------+--------------+-----+ + | FeatureID_2 | two | ... | + +-------------+--------------+-----+ + | FeatureID_3 | three | ... | + +-------------+--------------+-----+ + | FeatureID_4 | four | ... | + +-------------+--------------+-----+ + | ... | ... | ... | + +-------------+--------------+-----+ + + **NOTE:** This dataset must contain at least two columns, a column of FeatureIDs and a column containing names (e.g. gene symbol or compound names) to use for linking to KEGGIDs. Other columns may be present in the dataset. The user can use a Gene Expression Annotation dataset, a Metabolomic Annotation dataset or both. + +**Unique FeatureID** + + Name of the column in your gene expression or metabolomic Annotation dataset that contains the Unique FeatureIDs. + +**Gene Symbol or Metabolite Names** + + Name of the column in your gene expression or metabolomic Annotation dataset with the names to use for matching to KEGGIDs. + +-------------------------------------------------------------------------------- + +**OUTPUT** + +For each input Annotation file, a TSV file containing the following columns is generated: + + (1) **unique FeatureID:** column from the Annotation dataset containing the unique FeatureIDs. + (2) **Name:** column from Annotation dataset used for matching in KEGG. + (3) **Feature_Type:** column indicating whether matching was for metabolites or genes. + (4) **Matched:** column indicating whether a match in KEGG was found. Yes/No + (5) **Name_in_KEGG:** column containing the KEGG name for the match. + (6) **KEGGID:** column containing the KEGG identifier for the match. + (7) **Similarity:** value indicating the similarity between the given feature and the match in KEGG. Ranges from 0 to 1. + (8) **Tie:** in cases where multiple matches are found for a given feature, Tie = yes if the similarity is greater than 95%. + (9) **Selected:** for features with multiple matches and different similarity scores, the 'Selected' column = yes for the match with the highest similarity score. For features with multiple matches and the same similarity score, the 'Selected' column = yes based on the alphabetical order of the returned match. + + +**Example Metabolite to KEGGID Link Table** + + +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+ + | FeatureID | Name | Feature_Type | Matched | Name_in_KEGG | KEGG_ID | Similarity | Tie | Selected | + +=============+============+==============+=========+==============+==========+============+=====+==========+ + | FeatureID_1 | one | Metabolite | Yes | one* | cpd:... | 1.0 | No | Yes | + +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+ + | FeatureID_2 | two | Metabolite | Yes | two* | cpd:... | 1.0 | No | Yes | + +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+ + | FeatureID_3 | two | Metabolite | Yes | three* | cpd:... | 0.87 | No | No | + +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+ + | FeatureID_4 | four | Metabolite | No | NA | NA | NA | NA | NA | + +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+ + | ... | ... | ... | ... | ... | ... | ... | ... | ... | + +-------------+------------+--------------+---------+--------------+----------+------------+-----+----------+ + + **NOTE:** Warning messages appear in cases of a Tie (greater than 95% similarity). Please check these results carefully. + + ]]> + </help> + <citations> + <citation type="bibtex">@ARTICLE{Kirpich17secimtools, + author = {Alexander S. Kirpich, Miguel Ibarra, Oleksandr Moskalenko, Justin M. Fear, Joseph Gerken, Xinlei Mi, Ali Ashrafi, Alison M. Morse, Lauren M. McIntyre}, + title = {SECIMTools: A suite of Metabolomics Data Analysis Tools}, + journal = {BMC Bioinformatics}, + year = {in press} + }</citation> + <citation type="bibtex"> + @article{garcia2010paintomics, + title={Paintomics: a web based tool for the joint visualization of transcriptomics and metabolomics data}, + author={Garc{\'\i}a-Alcalde, Fernando and Garc{\'\i}a-L{\'o}pez, Federico and Dopazo, Joaqu{\'\i}n and Conesa, Ana}, + journal={Bioinformatics}, + volume={27}, + number={1}, + pages={137--139}, + year={2010}, + publisher={Oxford University Press} + }</citation> + <citation>@article{wu2014mygene, + title={MyGene. info: gene annotation query as a service}, + author={Wu, Chunlei and Mark, Adam and Su, Andrew I}, + journal={bioRxiv}, + pages={009332}, + year={2014}, + publisher={Cold Spring Harbor Laboratory} + }</citation> + </citations> +</tool>