view term_mapper.xml @ 13:a4c4656ad9e0

some improvements/fixes?
author pieter.lukasse@wur.nl
date Sat, 28 Mar 2015 21:21:32 +0100
parents 89264646e458
children
line wrap: on
line source

<tool name="TermMapperTool" id="TermMapperTool1" version="0.0.2">
	<description>use cross-reference lookup tables to annotate results</description>
	<!-- 
	   For remote debugging, start your listener on port 8000 and use the following as the command interpreter:
	       java -jar -Xdebug -Xrunjdwp:transport=dt_socket,address=D0100564.wurnet.nl:8000 
	    -->
	     <!--  Similar to the "join two datasets" tool http://galaxy.wur.nl/galaxy_production/root?tool_id=join1 
	           but this one probably offers more powerful features, such as supporting multiple ';'-separated codes in key fields 
	           and the termColName(s) feature, which supports direct hierarchy-like annotation. -->
	<!--
	   Command line passed to TermMapperTool.jar (Cheetah template).
	   Notes:
	     * The boolean inputs are declared with truevalue="Yes" / falsevalue="No", so they are
	       tested through their string form (str(...) == "Yes") instead of being compared to True.
	     * Free-text values and file paths are quoted so that values containing spaces
	       (for example the default new column name "Mapped terms") stay a single argument.
	-->
	<command interpreter="java -jar ">
	    TermMapperTool.jar 
		## target file and its ID column
		-inputFileName "$inputFileName"
		-inputIdColumnName "$inputIdColumnName"
		#if str($inputIdCol.inputIdHasPrefix) == "Yes"
			-inputIdPrefix "$inputIdCol.inputIdPrefix"
		#end if

		## lookup table, its ID column and its term column(s)
		-mappingFileName "$mappingFileName"
		-mappingFileIdColName "$mappingFileIdColName"
		#if str($mappingIdCol.mappingIdHasPrefix) == "Yes"
			-mappingIdPrefix "$mappingIdCol.mappingIdPrefix"
		#end if
		-mappingFileTermColName "$mappingFileTermColName"

		-outputFileName "$outputFileName"

		## optional observations file; only requested when the user enabled it
		#if str($genObservations.genObservationsFile) == "Yes"
			-outputObservationsFileName "$outputObservationsFileName"
			-quantifColumn "$genObservations.quantifColumn"
			-multipleMappingSolution $genObservations.multipleMappingSolution
			-filterZeros $genObservations.filterZeros
		#end if

		-mappedTermsColName "$mappedTermsColName"
		-numberOfHeaderLines $numberOfHeaderLines

		-htmlReportFile "$htmlReportFile"
		-htmlReportFilesPath "$htmlReportFile.files_path"

	</command>
	
	<inputs>

		<!-- =================== target file part ============== -->
		<!-- File to be annotated and the column in it that holds the identification codes. -->
		<param name="inputFileName" type="data" format="tabular,csv,txt" label="Target file (TSV/CSV)" />

		<param name="inputIdColumnName" type="text" size="50" value="" label="ID column name"
			help="Name of the column containing the identification codes (in the given input file)"/>

		<!-- Optional fixed prefix found in the ID values of the target file. -->
		<conditional name="inputIdCol">
			<param name="inputIdHasPrefix" type="boolean" truevalue="Yes" falsevalue="No" checked="false"
				label="ID values have a prefix"/>
			<when value="Yes">
				<param name="inputIdPrefix" type="text" size="50" value="" label="Prefix in ID column"
					help="Fill in if any prefix is found in the ID column values (e.g. in some
						files the value is preceded by a fixed value like for example 'lipidmaps:LMFA00000007' instead of just 'LMFA00000007' - in this
						example one would fill in 'lipidmaps:' as prefix)"/>
			</when>
			<when value="No"/>
		</conditional>

		<!-- =================== cross-reference part ============== -->
		<param name="mappingFileName" type="data" format="tabular,csv" label="Lookup table (TSV/CSV)"
			help="Simple mapping file between the coding scheme used to another scheme"/>
		<param name="numberOfHeaderLines" type="select" label="Number of header lines in mapping file"
			help="If this is '0', use the column numbers starting from 1 as the 'names' in the parameters below.">
			<option value="0">0</option>
			<option value="1" selected="true">1</option>
		</param>

		<param name="mappingFileIdColName" type="text" size="50" value="" label="ID column name or number (in lookup table)"
			help="Name (or number) of the ID column for the lookup"/>

		<!-- Optional fixed prefix found in the ID values of the lookup table. -->
		<conditional name="mappingIdCol">
			<param name="mappingIdHasPrefix" type="boolean" truevalue="Yes" falsevalue="No" checked="false"
				label="ID values have a prefix"/>
			<when value="Yes">
				<param name="mappingIdPrefix" type="text" size="50" value="" label="Prefix in ID column"
					help="Fill in if any prefix is found in the ID column values (e.g. in some
						files the value is preceded by a fixed value like for example 'lipidmaps:LMFA00000007' instead of just 'LMFA00000007' - in this
						example one would fill in 'lipidmaps:' as prefix)"/>
			</when>
			<when value="No"/>
		</conditional>

		<param name="mappingFileTermColName" type="text" size="50" value="" label="Term column name(s) or number(s)"
			help="Name(s) or number(s) of the column(s) containing the term(s) in the lookup table (and which will be transferred to the target file based on ID match in 'ID column name').
				For using multiple term column names, set the names separated by comma (,).
				If multiple columns are specified, the algorithm will look for an annotation in the first one, if none
				found it will try the second one, and so forth. "/>

		<!-- =================== output part ============== -->
		<param name="mappedTermsColName" type="text" size="50" value="Mapped terms" label="Name to give to the new column"
			help="Name to give to the new column that will be added to the target file. This new column is the one
				that will contain the respectively mapped terms."/>

		<!-- Optional second output: the term observations file and its settings. -->
		<conditional name="genObservations">
			<param name="genObservationsFile" type="boolean" truevalue="Yes" falsevalue="No" checked="false"
				label="Generate also observations file"/>
			<when value="Yes">
				<param name="quantifColumn" type="text" size="50" value=""
					label="(Optional) Values column name"
					help="Name of the column containing the quantification values (in the given input file)"/>
				<param name="multipleMappingSolution" type="select"
					label="(when using values column above) What to do when multiple items map to the same term"
					help="When e.g. two Uniprot codes map to the same KEGG code, which quantification value to use">
					<option value="not" selected="true">Do nothing, leave as is</option>
					<option value="max">Use max value</option>
					<option value="min">Use min value</option>
					<option value="avg">Use avg value</option>
				</param>
				<param name="filterZeros" type="boolean" checked="false"
					label="Filter zeros"
					help="Filter out the items that have quantification value = 0"/>
			</when>
			<when value="No"/>
		</conditional>

	</inputs>
	<outputs>
		<!-- Annotated copy of the target file: tabular when the target file is tabular, csv otherwise.
		     Template directives (#if/#else) are not evaluated inside the outputs section, so the format
		     switch is expressed with change_format instead of two data elements with the same name. -->
		<data name="outputFileName" format="csv" label="${tool.name} on ${on_string}: annotated file ">
			<change_format>
				<when input_dataset="inputFileName" attribute="ext" value="tabular" format="tabular"/>
			</change_format>
		</data>
		<!-- Term observations file: only created when "Generate also observations file" is checked. -->
		<data name="outputObservationsFileName" format="tabular" label="${tool.name} on ${on_string}: term observations file (TSV)">
			<filter>genObservations['genObservationsFile']</filter>
		</data>
		<!-- HTML report; its files_path directory is passed to the tool as -htmlReportFilesPath. -->
		<data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - HTML report"/>
	</outputs>
	<tests>
		<!-- Placeholder only: no functional test is defined yet (still to find out how to use this section). -->
		<test/>
	</tests>
  <help>
  
.. class:: infomark

  
This tool is responsible for annotating the given target file 
with the terms given in a lookup table. This lookup table maps the items found in the target file
(e.g. protein identifications coded in common protein coding formats such as UniProt )
to their respective terms (e.g. GO terms). It enables users to use the cross-reference 
information now available from different repositories (like uniprot and KEGG - see for example
http://www.uniprot.org/taxonomy/ or http://www.genome.jp/linkdb/ )
to map their data to other useful coding schemes or to ontologies and functional annotations.  

.. class:: infomark

**NB:** Currently the tool will do "smart parsing" of hierarchy based fields in the target file ID column. 
 This means that if the column contains a ".", the trailing part of the ID after the "." is ignored if the full
 ID does not get a match in the lookup table while the part before the "." does. 
 
.. class:: infomark

Examples of usage:

  annotate protein identifications with Gene Ontology[GO] terms
  
  annotate metabolite CAS identifications with chebi codes
  
  add KEGG gene codes to a file containing UNIPROT codes
  
  add KEGG compound codes to a file containing chebi codes
  
  etc
 
As an example for transcripts and proteins, users can check http://www.uniprot.org/taxonomy/ to
see if their organism has been mapped to GO terms by Uniprot. For example the link 
http://www.uniprot.org/uniprot/?query=taxonomy:2850 will show the Uniprot repository and cross-references
for the taxonomy 2850.
When the organism being studied is not available, then other strategies 
could be tried (like Blast2GO for example).

Despite the specific examples above, this tool is generic and can be used to map any 
values to new terms according to a given lookup table.
  
.. class:: infomark

*Omics cross-reference resources on the web:*

LinkDB: http://www.genome.jp/linkdb/

*Ready to use metabolomics links:*

http://rest.genome.jp/link/compound/chebi

http://rest.genome.jp/link/compound/lipidmaps

http://rest.genome.jp/link/compound/lipidbank

http://rest.genome.jp/link/compound/hmdb


*Ready to use proteomics links:*

http://rest.genome.jp/link/uniprot/pti  (Phaeodactylum Tricornutum)
http://rest.genome.jp/link/pti/uniprot

http://rest.genome.jp/link/uniprot/hsa  (Homo Sapiens)

(for the organism code list see: http://www.genome.jp/kegg/catalog/org_list.html )


Uniprot to GO

http://www.uniprot.org/taxonomy/

http://www.uniprot.org/uniprot/?sort=&amp;desc=&amp;query=proteome:UP000000759&amp;fil=&amp;format=tab&amp;force=yes&amp;columns=id,go-id  (Phaeodactylum Tricornutum)


-----

**Output**

The tool reads the given input file and, for each line, adds a new column 
containing the terms found for the ID in that line. The output file is therefore the same as the 
input file plus one extra terms column (multiple terms are separated by ; ).

-----

**Link to ontology viewer**

A second summarized "terms observations" file can also be generated.
In case the terms are ontology terms, this file can be used for visualizing the results
in the ontology viewer "OntologyAndObservationsViewer". 

  </help>
</tool>