7
+ − 1 <tool name="TermMapperTool" id="TermMapperTool1" version="0.0.2">
+ − 2 <description>use cross-reference lookup tables to annotate results</description>
+ − 3 <!--
+ − 4 For remote debugging start your listener on port 8000 and use the following as command interpreter:
+ − 5 java -jar -Xdebug -Xrunjdwp:transport=dt_socket,address=D0100564.wurnet.nl:8000
+ − 6 -->
+ − 7 <!-- similar to "join two datasets" tool http://galaxy.wur.nl/galaxy_production/root?tool_id=join1
+ − 8 but this one probably has more powerful features, like supporting multiple ';' codes in key fields
+ − 9 and the feature in termColName(s) that supports direct hierarchy-like annotation -->
+ − 10 <!-- Builds the TermMapperTool.jar command line from the inputs declared below. The two ID prefix options are emitted only inside their "HasPrefix" guards, and the observations options only inside the genObservationsFile guard. NOTE(review): the guards compare boolean params declared with truevalue="Yes" against True; confirm this evaluates as intended on the target Galaxy version. --> <command interpreter="java -jar ">
+ − 11 TermMapperTool.jar
+ − 12 -inputFileName $inputFileName
+ − 13 -inputIdColumnName "$inputIdColumnName"
+ − 14 #if $inputIdCol.inputIdHasPrefix == True
+ − 15 -inputIdPrefix "$inputIdCol.inputIdPrefix"
+ − 16 #end if
+ − 17
+ − 18 -mappingFileName $mappingFileName
+ − 19 -mappingFileIdColName "$mappingFileIdColName"
+ − 20
+ − 21 #if $mappingIdCol.mappingIdHasPrefix == True
+ − 22 -mappingIdPrefix "$mappingIdCol.mappingIdPrefix"
+ − 23 #end if
+ − 24
+ − 25 -mappingFileTermColName "$mappingFileTermColName"
+ − 26
+ − 27 -outputFileName $outputFileName
+ − 28
+ − 29 #if $genObservations.genObservationsFile == True
+ − 30 -outputObservationsFileName $outputObservationsFileName
+ − 31 -quantifColumn "$genObservations.quantifColumn"
9
+ − 32 -multipleMappingSolution $genObservations.multipleMappingSolution
+ − 33 -filterZeros $genObservations.filterZeros
7
+ − 34 #end if
+ − 35
+ − 36 -mappedTermsColName $mappedTermsColName
8
+ − 37 -numberOfHeaderLines $numberOfHeaderLines
9
+ − 38
+ − 39 -htmlReportFile $htmlReportFile
+ − 40 -htmlReportFilesPath $htmlReportFile.files_path
7
+ − 41
+ − 42 </command>
+ − 43
+ − 44 <inputs>
+ − 45
9
+ − 46 <param name="inputFileName" type="data" format="tabular,csv,txt" label="Target file (TSV/CSV)" />
7
+ − 47
+ − 48 <param name="inputIdColumnName" type="text" size="50" value="" label="ID column name"
+ − 49 help="Name of the column containing the identification codes (in the given input file)"/>
+ − 50
+ − 51 <conditional name="inputIdCol">
+ − 52 <param name="inputIdHasPrefix" type="boolean" truevalue="Yes" falsevalue="No" checked="false"
+ − 53 label="ID values have a prefix"/>
+ − 54 <when value="Yes">
+ − 55 <param name="inputIdPrefix" type="text" size="50" value="" label="Prefix in ID column"
+ − 56 help="Fill in if any prefix is found in the ID column values (e.g. in some
+ − 57 files the value is preceded by a fixed value like for example 'lipidmaps:LMFA00000007' instead of just 'LMFA00000007' - in this
+ − 58 example one would fill in 'lipidmaps:' as prefix)"/>
+ − 59 </when>
+ − 60 <when value="No">
+ − 61 </when>
+ − 62 </conditional>
+ − 63
+ − 64 <!-- =================== cross-reference part ============== -->
+ − 65 <param name="mappingFileName" type="data" format="tabular,csv" label="Lookup table (TSV/CSV)" help="Simple mapping file between the coding scheme used to another scheme"/>
8
+ − 66 <!-- Number of header lines in the mapping file; forwarded to the jar as -numberOfHeaderLines. With 0 header lines, columns are referred to by number instead of name. --> <param name="numberOfHeaderLines" type="select" label="Number of header lines in mapping file"
+ − 67 help="If this is '0', use the column numbers starting from 1 as the 'names' in the parameters below.">
+ − 68 <option value="0" >0</option>
+ − 69 <option value="1" selected="true">1</option>
+ − 70 </param>
+ − 71
+ − 72
9
+ − 73 <param name="mappingFileIdColName" type="text" size="50" value="" label="ID column name or number (in lookup table)" help="Name (or number) of the ID column for the lookup"/>
7
+ − 74
+ − 75 <conditional name="mappingIdCol">
+ − 76 <param name="mappingIdHasPrefix" type="boolean" truevalue="Yes" falsevalue="No" checked="false"
+ − 77 label="ID values have a prefix"/>
+ − 78 <when value="Yes">
+ − 79 <param name="mappingIdPrefix" type="text" size="50" value="" label="Prefix in ID column"
+ − 80 help="Fill in if any prefix is found in the ID column values (e.g. in some
+ − 81 files the value is preceded by a fixed value like for example 'lipidmaps:LMFA00000007' instead of just 'LMFA00000007' - in this
+ − 82 example one would fill in 'lipidmaps:' as prefix)"/>
+ − 83 </when>
+ − 84 <when value="No">
+ − 85 </when>
+ − 86 </conditional>
+ − 87
+ − 88 <!-- Term column(s) of the lookup table; forwarded to the jar as -mappingFileTermColName. Several columns may be given, separated by commas, and are tried in order. --> <param name="mappingFileTermColName" type="text" size="50" value="" label="Term column name(s) or number(s)"
+ − 89 help="Name(s) or number(s) of the column(s) containing the term(s) in the lookup table (and which will be transferred to the target file based on ID match in 'ID column name').
+ − 90 For using multiple term column names, set the names separated by comma (,).
+ − 91 If multiple columns are specified, the algorithm will look for an annotation in the first one, if none
+ − 92 found it will try the second one, and so forth. "/>
+ − 93
+ − 94
9
+ − 95 <param name="mappedTermsColName" type="text" size="50" value="Mapped terms" label="Name to give to the new column"
7
+ − 96 help="Name to give to the new column that will be added to the target file. This new column is the one
+ − 97 that will contain the respectively mapped terms."/>
+ − 98
+ − 99 <conditional name="genObservations">
+ − 100 <param name="genObservationsFile" type="boolean" truevalue="Yes" falsevalue="No" checked="false"
+ − 101 label="Generate also observations file"/>
+ − 102 <when value="Yes">
+ − 103 <param name="quantifColumn" type="text" size="50" value=""
+ − 104 label="(Optional) Values column name"
+ − 105 help="Name of the column containing the quantification values (in the given input file)"/>
9
+ − 106 <param name="multipleMappingSolution" type="select"
+ − 107 label="(when using values column above) What to do when multiple items map to the same term"
+ − 108 help="When e.g. two Uniprot codes map to the same KEGG code, which quantification value to use">
+ − 109 <option value="not" selected="true">Do nothing, leave as is</option>
+ − 110 <option value="max" >Use max value</option>
+ − 111 <option value="min">Use min value</option>
+ − 112 <option value="avg">Use avg value</option>
+ − 113 </param>
+ − 114 <param name="filterZeros" type="boolean" checked="false"
+ − 115 label="Filter zeros"
+ − 116 help="Filter out the items that have quantification value = 0"/>
7
+ − 117 </when>
+ − 118 <when value="No">
+ − 119 </when>
+ − 120 </conditional>
9
+ − 121
8
+ − 122
7
+ − 123 </inputs>
+ − 124 <!-- Declares the annotated output file (tabular or csv), the term observations file and the HTML report. NOTE(review): the "#if" / "#else" / "#end if" lines below are plain text inside this element; it is not evident that Galaxy evaluates Cheetah directives in the outputs section, in which case both outputFileName declarations and outputObservationsFileName are always parsed. Confirm, and consider change_format / filter instead. --> <outputs>
+ − 125 #if isinstance( $inputFileName.datatype, $__app__.datatypes_registry.get_datatype_by_extension('tabular').__class__):
+ − 126 <data name="outputFileName" format="tabular" label="${tool.name} on ${on_string}: annotated file " ></data>
+ − 127 #else:
+ − 128 <data name="outputFileName" format="csv" label="${tool.name} on ${on_string}: annotated file " ></data>
+ − 129 #end if
9
+ − 130 #if $genObservations.genObservationsFile == True :
+ − 131 <data name="outputObservationsFileName" format="tabular" label="${tool.name} on ${on_string}: term observations file (TSV)"></data>
+ − 132 #end if
+ − 133 <data name="htmlReportFile" format="html" label="${tool.name} on ${on_string} - HTML report"/>
7
+ − 134 </outputs>
+ − 135 <tests>
+ − 136 <!-- find out how to use -->
+ − 137 <test>
+ − 138 </test>
+ − 139 </tests>
+ − 140 <help>
+ − 141
+ − 142 .. class:: infomark
+ − 143
+ − 144
+ − 145 This tool is responsible for annotating the given target file
+ − 146 with the terms given in a lookup table. This lookup table maps the items found in the target file
+ − 147 (e.g. protein identifications coded in common protein coding formats such as UniProt )
+ − 148 to their respective terms (e.g. GO terms). It enables users to use the cross-reference
+ − 149 information now available from different repositories (like uniprot and KEGG - see for example
+ − 150 http://www.uniprot.org/taxonomy/ or http://www.genome.jp/linkdb/ )
+ − 151 to map their data to other useful coding schemes or to ontologies and functional annotations.
+ − 152
+ − 153 .. class:: infomark
+ − 154
+ − 155 **NB:** Currently the tool will do "smart parsing" of hierarchy-based fields in the target file ID column.
+ − 156 This means that if an ID in the column contains a ".", the trailing part of the ID after the "." is ignored if the full
+ − 157 ID does not get a match in the lookup table while the part before the "." does.
+ − 158
+ − 159 .. class:: infomark
+ − 160
+ − 161 Examples of usage:
+ − 162
+ − 163 annotate protein identifications with Gene Ontology[GO] terms
+ − 164
+ − 165 annotate metabolite CAS identifications with chebi codes
+ − 166
+ − 167 add KEGG gene codes to a file containing UNIPROT codes
+ − 168
+ − 169 add KEGG compound codes to a file containing chebi codes
+ − 170
+ − 171 etc
+ − 172
+ − 173 As an example for transcripts and proteins, users can check http://www.uniprot.org/taxonomy/ to
+ − 174 see if their organism has been mapped to GO terms by Uniprot. For example the link
+ − 175 http://www.uniprot.org/uniprot/?query=taxonomy:2850 will show the Uniprot repository and cross-references
+ − 176 for the taxonomy 2850.
+ − 177 When the organism being studied is not available, then other strategies
+ − 178 could be tried (like Blast2GO for example).
+ − 179
+ − 180 Despite the specific examples above, this tool is generic and can be used to map any
+ − 181 values to new terms according to a given lookup table.
+ − 182
+ − 183 .. class:: infomark
+ − 184
+ − 185 *Omics cross-reference resources on the web:*
+ − 186
+ − 187 LinkDB: http://www.genome.jp/linkdb/
+ − 188
+ − 189 *Ready to use metabolomics links:*
+ − 190
+ − 191 http://rest.genome.jp/link/compound/chebi
+ − 192
+ − 193 http://rest.genome.jp/link/compound/lipidmaps
+ − 194
+ − 195 http://rest.genome.jp/link/compound/lipidbank
+ − 196
+ − 197 http://rest.genome.jp/link/compound/hmdb
+ − 198
+ − 199
+ − 200 *Ready to use proteomics links:*
+ − 201
9
+ − 202 http://rest.genome.jp/link/uniprot/pti (Phaeodactylum Tricornutum)
+ − 203 http://rest.genome.jp/link/pti/uniprot
7
+ − 204
+ − 205 http://rest.genome.jp/link/uniprot/hsa (Homo Sapiens)
+ − 206
+ − 207 (for organism code list see: http://www.genome.jp/kegg/catalog/org_list.html )
+ − 208
+ − 209
+ − 210 Uniprot to GO
+ − 211
+ − 212 http://www.uniprot.org/taxonomy/
+ − 213
9
+ − 214 http://www.uniprot.org/uniprot/?sort=&amp;desc=&amp;query=proteome:UP000000759&amp;fil=&amp;format=tab&amp;force=yes&amp;columns=id,go-id (Phaeodactylum Tricornutum)
+ − 215
7
+ − 216
+ − 217 -----
+ − 218
+ − 219 **Output**
+ − 220
+ − 221 This tool reads the given input file and, for each line, adds a new column
+ − 222 containing the terms found for the ID in that line. So the output file is the same as the
+ − 223 input file plus an extra terms column (terms separated by ; ).
+ − 224
+ − 225 -----
+ − 226
+ − 227 **Link to ontology viewer**
+ − 228
+ − 229 A second summarized "terms observations" file can also be generated.
+ − 230 In case the terms are ontology terms, this file can be used for visualizing the results
+ − 231 in the ontology viewer "OntologyAndObservationsViewer".
+ − 232
+ − 233 </help>
+ − 234 </tool>