# HG changeset patch
# User pieter.lukasse@wur.nl
# Date 1426764143 -3600
# Node ID 4393f982d18f711ea89526a0070717c5f63b9742
# Parent 41f122255d149b7db2a8c0bee8f84f8251535db1
reorganized sources
diff -r 41f122255d14 -r 4393f982d18f GCMS/__init__.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/GCMS/__init__.py Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,6 @@
+'''
+Module containing Galaxy tools for the GC/MS pipeline
+Created on Mar 6, 2012
+
+@author: marcelk
+'''
diff -r 41f122255d14 -r 4393f982d18f GCMS/combine_output.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/GCMS/combine_output.py Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,253 @@
+#!/usr/bin/env python
+# encoding: utf-8
+'''
+Module to combine output from two GCMS Galaxy tools (RankFilter and CasLookup)
+'''
+
+import csv
+import re
+import sys
+import math
+import pprint
+
+__author__ = "Marcel Kempenaar"
+__contact__ = "brs@nbic.nl"
+__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
+__license__ = "MIT"
+
+def _process_data(in_csv):
+ '''
+ Generic method to parse a tab-separated file returning a dictionary with named columns
+ @param in_csv: input filename to be parsed
+ '''
+ data = list(csv.reader(open(in_csv, 'rU'), delimiter='\t'))
+ header = data.pop(0)
+ # Create dictionary with column name as key
+ output = {}
+ for index in xrange(len(header)):
+ output[header[index]] = [row[index] for row in data]
+ return output
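+
+# Example (hypothetical data): for an input file containing the two lines
+#   ID<TAB>Name
+#   1-0.9-100-10.0-5<TAB>Alanine
+# _process_data returns {'ID': ['1-0.9-100-10.0-5'], 'Name': ['Alanine']},
+# i.e. one list of column values per header name.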
+
+
+def _merge_data(rankfilter, caslookup):
+ '''
+ Merges data from both input dictionaries based on the Centrotype field. This method will
+ build up a new list containing the merged hits as the items.
+ @param rankfilter: dictionary holding RankFilter output in the form of N lists (one list per attribute name)
+ @param caslookup: dictionary holding CasLookup output in the form of N lists (one list per attribute name)
+ '''
+    # TODO: test for correct input files -> rankfilter and caslookup internal lists should have the same lengths:
+    if len(rankfilter['ID']) != len(caslookup['Centrotype']):
+        raise Exception('rankfilter and caslookup files should have the same number of rows/records')
+
+ merged = []
+ processed = {}
+ for compound_id_idx in xrange(len(rankfilter['ID'])):
+ compound_id = rankfilter['ID'][compound_id_idx]
+        if compound_id not in processed:
+ # keep track of processed items to not repeat them
+ processed[compound_id] = compound_id
+ # get centrotype nr
+ centrotype = compound_id.split('-')[0]
+ # Get the indices for current compound ID in both data-structures for proper matching
+ rindex = [index for index, value in enumerate(rankfilter['ID']) if value == compound_id]
+ cindex = [index for index, value in enumerate(caslookup['Centrotype']) if value == centrotype]
+
+ merged_hits = []
+ # Combine hits
+ for hit in xrange(len(rindex)):
+ # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do
+ # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its
+ # corresponding value in the rankfilter or caslookup tables; i.e.
+ # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute
+ # represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index)
+ rf_record = dict(zip(rankfilter.keys(), [rankfilter[key][rindex[hit]] for key in rankfilter.keys()]))
+ cl_record = dict(zip(caslookup.keys(), [caslookup[key][cindex[hit]] for key in caslookup.keys()]))
+
+ merged_hit = _add_hit(rf_record, cl_record)
+ merged_hits.append(merged_hit)
+
+ merged.append(merged_hits)
+
+ return merged, len(rindex)
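+
+# Example shape (hypothetical): for two centrotypes with two hits each, _merge_data returns
+# ([[hit1a, hit1b], [hit2a, hit2b]], 2), where each hit is the list built by _add_hit() and
+# the second value is the number of hits found for the last processed compound (assumed to
+# be the same for all compounds; see the validation in _save_data).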
+
+
+def _add_hit(rankfilter, caslookup):
+ '''
+    Combines single records from both the RankFilter and CasLookup tools
+ @param rankfilter: record (dictionary) of one compound in the RankFilter output
+ @param caslookup: matching record (dictionary) of one compound in the CasLookup output
+ '''
+ # The ID in the RankFilter output contains the following 5 fields:
+ rf_id = rankfilter['ID'].split('-')
+ try:
+ name, formula = _remove_formula(rankfilter['Name'])
+ hit = [rf_id[0], # Centrotype
+ rf_id[1], # cent.Factor
+ rf_id[2], # scan nr
+ rf_id[3], # R.T. (umin)
+ rf_id[4], # nr. Peaks
+ # Appending other fields
+ rankfilter['R.T.'],
+ name,
+ caslookup['FORMULA'] if not formula else formula,
+ rankfilter['Library'].strip(),
+ rankfilter['CAS'].strip(),
+ rankfilter['Forward'],
+ rankfilter['Reverse'],
+ ((float(rankfilter['Forward']) + float(rankfilter['Reverse'])) / 2),
+ rankfilter['RIexp'],
+ caslookup['RI'],
+ rankfilter['RIsvr'],
+ # Calculate absolute differences
+ math.fabs(float(rankfilter['RIexp']) - float(rankfilter['RIsvr'])),
+ math.fabs(float(caslookup['RI']) - float(rankfilter['RIexp'])),
+ caslookup['Regression.Column.Name'],
+ caslookup['min'],
+ caslookup['max'],
+ caslookup['nr.duplicates'],
+ caslookup['Column.phase.type'],
+ caslookup['Column.name'],
+ rankfilter['Rank'],
+ rankfilter['%rel.err'],
+ rankfilter['Synonyms']]
+    except KeyError as error:
+        print "Problem reading in data from input file(s):"
+        print "Respective CasLookup entry: \n", pprint.pformat(caslookup), "\n"
+        print "Respective RankFilter entry: \n", pprint.pformat(rankfilter), "\n"
+        raise error
+
+ return hit
+
+
+def _remove_formula(name):
+ '''
+ The RankFilter Name field often contains the Formula as well, this function removes it from the Name
+ @param name: complete name of the compound from the RankFilter output
+ '''
+ name = name.split()
+ poss_formula = name[-1]
+ match = re.match("^(([A-Z][a-z]{0,2})(\d*))+$", poss_formula)
+ if match:
+ return ' '.join(name[:-1]), poss_formula
+ else:
+ return ' '.join(name), False
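+
+# Example (hypothetical input): _remove_formula('Alanine C3H7NO2') returns
+# ('Alanine', 'C3H7NO2'), while _remove_formula('Alanine') returns ('Alanine', False)
+# since the last token does not look like a chemical formula.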
+
+
+def _get_default_caslookup():
+ '''
+    The CasLookup tool might not have found all compounds in the searched library;
+    this default dict will then be used when combining with the RankFilter output
+ '''
+ return {'FORMULA': 'N/A',
+ 'RI': '0.0',
+ 'Regression.Column.Name': 'None',
+ 'min': '0.0',
+ 'max': '0.0',
+ 'nr.duplicates': '0',
+ 'Column.phase.type': 'N/A',
+ 'Column.name': 'N/A'}
+
+
+def _save_data(data, nhits, out_csv_single, out_csv_multi):
+ '''
+ Writes tab-separated data to file
+    @param data: list of merged datasets (one list of hits per centrotype)
+    @param nhits: (max) number of hits per centrotype
+    @param out_csv_single: output csv file receiving one hit per line
+    @param out_csv_multi: output csv file receiving all hits of a centrotype on one line
+ '''
+ # Columns we don't repeat:
+ header_part1 = ['Centrotype',
+ 'cent.Factor',
+ 'scan nr.',
+ 'R.T. (umin)',
+ 'nr. Peaks',
+ 'R.T.']
+ # These are the headers/columns we repeat in case of
+ # combining hits in one line (see alternative_headers method below):
+ header_part2 = [
+ 'Name',
+ 'FORMULA',
+ 'Library',
+ 'CAS',
+ 'Forward',
+ 'Reverse',
+ 'Avg. (Forward, Reverse)',
+ 'RIexp',
+ 'RI',
+ 'RIsvr',
+ 'RIexp - RIsvr',
+ 'RI - RIexp',
+ 'Regression.Column.Name',
+ 'min',
+ 'max',
+ 'nr.duplicates',
+ 'Column.phase.type',
+ 'Column.name',
+ 'Rank',
+ '%rel.err',
+ 'Synonyms']
+
+ # Open output file for writing
+ outfile_single_handle = open(out_csv_single, 'wb')
+ outfile_multi_handle = open(out_csv_multi, 'wb')
+ output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
+ output_multi_handle = csv.writer(outfile_multi_handle, delimiter="\t")
+
+ # Write headers
+ output_single_handle.writerow(header_part1 + header_part2)
+ output_multi_handle.writerow(header_part1 + header_part2 + alternative_headers(header_part2, nhits-1))
+ # Combine all hits for each centrotype into one line
+ line = []
+ for centrotype_idx in xrange(len(data)):
+ i = 0
+ for hit in data[centrotype_idx]:
+ if i==0:
+ line.extend(hit)
+ else:
+ line.extend(hit[6:])
+ i = i+1
+            # small validation (if this fails, it is a programming error):
+            if i > nhits:
+                raise Exception('Error: more hits than expected for centrotype_idx ' + str(centrotype_idx))
+ output_multi_handle.writerow(line)
+ line = []
+
+ # Write one line for each centrotype
+ for centrotype_idx in xrange(len(data)):
+ for hit in data[centrotype_idx]:
+ output_single_handle.writerow(hit)
+
+def alternative_headers(header_part2, nr_alternative_hits):
+ '''
+ This method will iterate over the header names and add the string 'ALT#_' before each,
+ where # is the number of the alternative, according to number of alternative hits we want to add
+ to final csv/tsv
+ '''
+ result = []
+ for i in xrange(nr_alternative_hits):
+ for header_name in header_part2:
+ result.append("ALT" + str(i+1) + "_" + header_name)
+ return result
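+
+# Example: alternative_headers(['Name', 'CAS'], 2) returns
+# ['ALT1_Name', 'ALT1_CAS', 'ALT2_Name', 'ALT2_CAS'].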
+
+def main():
+ '''
+ Combine Output main function
+ It will merge the result files from "RankFilter" and "Lookup RI for CAS numbers"
+ NB: the caslookup_result_file will typically have fewer lines than
+ rankfilter_result_file, so the merge has to consider this as well. The final file
+ should have the same nr of lines as rankfilter_result_file.
+ '''
+ rankfilter_result_file = sys.argv[1]
+ caslookup_result_file = sys.argv[2]
+ output_single_csv = sys.argv[3]
+ output_multi_csv = sys.argv[4]
+
+ # Read RankFilter and CasLookup output files
+ rankfilter = _process_data(rankfilter_result_file)
+ caslookup = _process_data(caslookup_result_file)
+ merged, nhits = _merge_data(rankfilter, caslookup)
+ _save_data(merged, nhits, output_single_csv, output_multi_csv)
+
+
+if __name__ == '__main__':
+ main()
diff -r 41f122255d14 -r 4393f982d18f GCMS/combine_output.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/GCMS/combine_output.xml Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,36 @@
+
+ Combine output data from the RankFilter and CasLookup tools
+
+ combine_output.py $rankfilter_in $caslookup_in $out_single $out_multi
+
+
+
+
+
+
+
+
+
+
+
+Combines the output files from the 'RankFilter' and 'Lookup RI for CAS' tools into two tab-separated files.
+
+The files produced contain either all hits for a compound on a single line (Single) or on separate lines
+(Multi).
+
+.. class:: infomark
+
+**Notes**
+
+The input data should be produced by the RankFilter and 'Lookup RI for CAS' tools provided on this Galaxy server with the
+original headers kept intact. Processing steps include:
+
+ - Columns are added showing the average of the Forward/Reverse values, and the RIexp - RIsvr and RI - RIexp differences
+ - The ID column of the RankFilter tool output is split into 'Centrotype', 'cent.Factor', 'scan nr.', 'R.T. (umin)'
+   and 'nr. Peaks' fields (see the example below).
+ - The formula is split off the 'Name' field in the RankFilter output
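+
+For illustration, a hypothetical ID value such as 74-0.94-1184-806.45-12 would be split on the
+'-' separator into Centrotype=74, cent.Factor=0.94, scan nr.=1184, R.T. (umin)=806.45 and
+nr. Peaks=12 (example values are made up).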
+
+
+
diff -r 41f122255d14 -r 4393f982d18f GCMS/create_model.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/GCMS/create_model.xml Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,78 @@
+
+ Generate coefficients to enable the regression from one GC-column
+ to another GC-column
+ Rscripts/ridb-regression.R
+ $ridb
+ $out_model
+ $out_log
+ $min_residuals
+ $range_mod
+ $pvalue
+ $rsquared
+ $method
+ $plot
+ #if $plot
+ $model_graphics
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ (plot)
+
+
+
+
+
+Calculates regression models for all permutations (ordered pairs) of GC columns contained in the
+selected RI database file. The model is based on either a 3rd degree polynomial or a standard
+linear fit.
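+
+As a sketch of the model form (the authoritative definition is in Rscripts/ridb-regression.R):
+the linear model maps an RI value x on one column to a + b*x on another, while the 3rd degree
+polynomial maps it to a + b*x + c*x^2 + d*x^3, as applied by _apply_linear_regression and
+_apply_poly_regression in library_lookup.py.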
+
+The *Minimum number of residuals* option will only allow regression if the columns it is based
+on have at least that number of shared data points (i.e. compounds measured on both columns).
+
+Filtering is possible by setting an upper limit for the *p-value* and / or a lower limit for
+the *R squared* value. The produced logfile will state how many models have been discarded due
+to this filtering. The output model file also includes the p-value and R squared value for
+each created model.
+
+Graphical output of the models is available by selecting the plot option which shows the
+data points used for the model as well as the fit itself and the range of data that will
+be usable.
+
+.. class:: infomark
+
+**Notes**
+
+The output file produced by this tool is required as input for the CasLookup tool when
+choosing to apply regression while looking up hits in the RIDB.
+
+
diff -r 41f122255d14 -r 4393f982d18f GCMS/library_lookup.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/GCMS/library_lookup.py Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,327 @@
+'''
+Logic for searching a Retention Index database file given output from NIST
+'''
+import match_library
+import re
+import sys
+import csv
+
+__author__ = "Marcel Kempenaar"
+__contact__ = "brs@nbic.nl"
+__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
+__license__ = "MIT"
+
+def create_lookup_table(library_file, column_type_name, statphase):
+ '''
+ Creates a dictionary holding the contents of the library to be searched
+ @param library_file: library to read
+    @param column_type_name: the column's type name
+    @param statphase: the column's stationary phase
+ '''
+ (data, header) = match_library.read_library(library_file)
+ # Test for presence of required columns
+ if ('columntype' not in header or
+ 'columnphasetype' not in header or
+ 'cas' not in header):
+ raise IOError('Missing columns in ', library_file)
+
+ column_type_column = header.index("columntype")
+ statphase_column = header.index("columnphasetype")
+ cas_column = header.index("cas")
+
+ filtered_library = [line for line in data if line[column_type_column] == column_type_name
+ and line[statphase_column] == statphase]
+ lookup_dict = {}
+ for element in filtered_library:
+ # Here the cas_number is set to the numeric part of the cas_column value, so if the
+ # cas_column value is 'C1433' then cas_number will be '1433'
+ cas_number = str(re.findall(r'\d+', (element[cas_column]).strip())[0])
+ try:
+ lookup_dict[cas_number].append(element)
+ except KeyError:
+ lookup_dict[cas_number] = [element]
+ return lookup_dict
+
+
+def _preferred(hits, pref, ctype, polar, model, method):
+ '''
+ Returns all entries in the lookup_dict that have the same column name, type and polarity
+ as given by the user, uses regression if selected given the model and method to use. The
+ regression is applied on the column with the best R-squared value in the model
+ @param hits: all entries in the lookup_dict for the given CAS number
+ @param pref: preferred GC-column, can be one or more names
+ @param ctype: column type (capillary etc.)
+ @param polar: polarity (polar / non-polar etc.)
+ @param model: data loaded from file containing regression models
+ @param method: supported regression method (i.e. poly(nomial) or linear)
+ '''
+ match = []
+ for column in pref:
+ for hit in hits:
+ if hit[4] == ctype and hit[5] == polar and hit[6] == column:
+ # Create copy of found hit since it will be altered downstream
+ match.extend(hit)
+ return match, False
+
+ # No hit found for current CAS number, return if not performing regression
+ if not model:
+ return False, False
+
+ # Perform regression
+ for column in pref:
+ if column not in model:
+ break
+ # Order regression candidates by R-squared value (last element)
+ order = sorted(model[column].items(), key=lambda col: col[1][-1])
+ # Create list of regression candidate column names
+ regress_columns = list(reversed([column for (column, _) in order]))
+ # Names of available columns
+ available = [hit[6] for hit in hits]
+
+ # TODO: combine Rsquared and number of datapoints to get the best regression match
+ '''
+ # Iterate regression columns (in order) and retrieve their models
+ models = {}
+ for col in regress_columns:
+ if col in available:
+ hit = list(hits[available.index(col)])
+ if hit[4] == ctype:
+ # models contains all model data including residuals [-2] and rsquared [-1]
+ models[pref[0]] = model[pref[0]][hit[6]]
+ # Get the combined maximum for residuals and rsquared
+ best_match = models[]
+ # Apply regression
+ if method == 'poly':
+ regressed = _apply_poly_regression(best_match, hit[6], float(hit[3]), model)
+ if regressed:
+ hit[3] = regressed
+ else:
+ return False, False
+ else:
+ hit[3] = _apply_linear_regression(best_match, hit[6], float(hit[3]), model)
+ match.extend(hit)
+ return match, hit[6]
+ '''
+
+ for col in regress_columns:
+ if col in available:
+ hit = list(hits[available.index(col)])
+ if hit[4] == ctype:
+ # Perform regression using a column for which regression is possible
+ if method == 'poly':
+ # Polynomial is only possible within a set border, if the RI falls outside
+ # of this border, skip this lookup
+ regressed = _apply_poly_regression(pref[0], hit[6], float(hit[3]), model)
+ if regressed:
+ hit[3] = regressed
+ else:
+ return False, False
+ else:
+ hit[3] = _apply_linear_regression(pref[0], hit[6], float(hit[3]), model)
+ match.extend(hit)
+ return match, hit[6]
+
+ return False, False
+
+
+
+def default_hit(row, cas_nr, compound_id):
+ '''
+ This method will return a "default"/empty hit for cases where the
+    method _preferred() returns False (i.e. an RI could not be found
+    for the given CAS nr, not even via regression).
+ '''
+ return [
+ #'CAS',
+ 'C' + cas_nr,
+ #'NAME',
+ '',
+ #'FORMULA',
+ '',
+ #'RI',
+ '0.0',
+ #'Column.type',
+ '',
+ #'Column.phase.type',
+ '',
+ #'Column.name',
+ '',
+ #'phase.coding',
+ ' ',
+ #'CAS_column.Name',
+ '',
+ #'Centrotype', -> NOTE THAT compound_id is not ALWAYS centrotype...depends on MsClust algorithm used...for now only one MsClust algorithm is used so it is not an issue, but this should be updated/corrected once that changes
+ compound_id,
+ #'Regression.Column.Name',
+ '',
+ #'min',
+ '',
+ #'max',
+ '',
+ #'nr.duplicates',
+ '']
+
+
+def format_result(lookup_dict, nist_tabular_filename, pref, ctype, polar, model, method):
+ '''
+ Looks up the compounds in the library lookup table and formats the results
+ @param lookup_dict: dictionary containing the library to be searched
+ @param nist_tabular_filename: NIST output file to be matched
+ @param pref: (list of) column-name(s) to look for
+ @param ctype: column type of interest
+ @param polar: polarity of the used column
+ @param model: data loaded from file containing regression models
+ @param method: supported regression method (i.e. poly(nomial) or linear)
+ '''
+ (nist_tabular_list, header_clean) = match_library.read_library(nist_tabular_filename)
+ # Retrieve indices of the CAS and compound_id columns (exit if not present)
+ try:
+ casi = header_clean.index("cas")
+ idi = header_clean.index("id")
+    except ValueError:
+        raise IOError("'cas' or 'id' column not found in header of library file")
+
+ data = []
+ for row in nist_tabular_list:
+ casf = str(row[casi].replace('-', '').strip())
+ compound_id = str(row[idi].split('-')[0])
+ if casf in lookup_dict:
+ found_hit, regress = _preferred(lookup_dict[casf], pref, ctype, polar, model, method)
+ if found_hit:
+ # Keep cas nr as 'C'+ numeric part:
+ found_hit[0] = 'C' + casf
+ # Add compound id
+ found_hit.insert(9, compound_id)
+ # Add information on regression process
+ found_hit.insert(10, regress if regress else 'None')
+ # Replace column index references with actual number of duplicates
+ dups = len(found_hit[-1].split(','))
+ if dups > 1:
+ found_hit[-1] = str(dups + 1)
+ else:
+ found_hit[-1] = '0'
+ data.append(found_hit)
+ found_hit = ''
+ else:
+ data.append(default_hit(row, casf, compound_id))
+ else:
+ data.append(default_hit(row, casf, compound_id))
+
+ casf = ''
+ compound_id = ''
+ found_hit = []
+ dups = []
+ return data
+
+
+def _save_data(content, outfile):
+ '''
+ Write to output file
+ @param content: content to write
+ @param outfile: file to write to
+ '''
+ # header
+ header = ['CAS',
+ 'NAME',
+ 'FORMULA',
+ 'RI',
+ 'Column.type',
+ 'Column.phase.type',
+ 'Column.name',
+ 'phase.coding',
+ 'CAS_column.Name',
+ 'Centrotype',
+ 'Regression.Column.Name',
+ 'min',
+ 'max',
+ 'nr.duplicates']
+ output_handle = csv.writer(open(outfile, 'wb'), delimiter="\t")
+ output_handle.writerow(header)
+ for entry in content:
+ output_handle.writerow(entry)
+
+
+def _read_model(model_file):
+ '''
+ Creates an easy to search dictionary for getting the regression parameters
+ for each valid combination of GC-columns
+ @param model_file: filename containing the regression models
+ '''
+ regress = list(csv.reader(open(model_file, 'rU'), delimiter='\t'))
+ if len(regress.pop(0)) > 9:
+ method = 'poly'
+ else:
+ method = 'linear'
+
+ model = {}
+ # Create new dictionary for each GC-column
+ for line in regress:
+ model[line[0]] = {}
+
+ # Add data
+ for line in regress:
+ if method == 'poly':
+ model[line[0]][line[1]] = [float(col) for col in line[2:11]]
+ else: # linear
+ model[line[0]][line[1]] = [float(col) for col in line[2:9]]
+
+ return model, method
+
+
+def _apply_poly_regression(column1, column2, retention_index, model):
+ '''
+ Calculates a new retention index (RI) value using a given 3rd-degree polynomial
+ model based on data from GC columns 1 and 2
+ @param column1: name of the selected GC-column
+ @param column2: name of the GC-column to use for regression
+ @param retention_index: RI to convert
+ @param model: dictionary containing model information for all GC-columns
+ '''
+ coeff = model[column1][column2]
+ # If the retention index to convert is within range of the data the model is based on, perform regression
+ if coeff[4] < retention_index < coeff[5]:
+ return (coeff[3] * (retention_index ** 3) + coeff[2] * (retention_index ** 2) +
+ (retention_index * coeff[1]) + coeff[0])
+ else:
+ return False
+
+
+def _apply_linear_regression(column1, column2, retention_index, model):
+ '''
+ Calculates a new retention index (RI) value using a given linear model based on data
+ from GC columns 1 and 2
+ @param column1: name of the selected GC-column
+ @param column2: name of the GC-column to use for regression
+ @param retention_index: RI to convert
+ @param model: dictionary containing model information for all GC-columns
+ '''
+ # TODO: No use of limits
+ coeff = model[column1][column2]
+ return coeff[1] * retention_index + coeff[0]
+
+
+def main():
+ '''
+ Library Lookup main function
+ '''
+ library_file = sys.argv[1]
+ nist_tabular_filename = sys.argv[2]
+ ctype = sys.argv[3]
+ polar = sys.argv[4]
+ outfile = sys.argv[5]
+ pref = sys.argv[6:-1]
+ regress = sys.argv[-1]
+
+ if regress != 'False':
+ model, method = _read_model(regress)
+ else:
+ model, method = False, None
+
+ lookup_dict = create_lookup_table(library_file, ctype, polar)
+ data = format_result(lookup_dict, nist_tabular_filename, pref, ctype, polar, model, method)
+
+ _save_data(data, outfile)
+
+
+if __name__ == "__main__":
+ main()
diff -r 41f122255d14 -r 4393f982d18f GCMS/library_lookup.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/GCMS/library_lookup.xml Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,75 @@
+
+ Look up or estimate the RI using a library of known RI values per CAS number
+
+ library_lookup.py
+ $library_file
+ $input
+ "$col_type"
+ "$polarity"
+ $output
+ #for $ctype in $pref
+ ${ctype.columntype}
+ #end for
+ $regression.model
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Performs a lookup of the RI values by matching CAS numbers from the given NIST identifications file to a library.
+If a direct match is NOT found for the preferred column name, a regression can be done to find
+the theoretical RI value based on known RI values for the CAS number on other column types (see step 4).
+If there is no match for the CAS number on any column type, then the record is not given an RI.
+
+
+
+
+
+
diff -r 41f122255d14 -r 4393f982d18f GCMS/readme.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/GCMS/readme.txt Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,25 @@
+==Description==
+Galaxy tools for analyzing a PDF result file generated by NIST MS Search: a rank is calculated for
+each putative metabolite and the result is matched against a Retention Index Database by CAS number
+
+
+==Dependencies==
+- Galaxy
+- pdftotext (part of the 'poppler-utils' package (Ubuntu and Debian))
+- Python (>= 2.6) with the following package(s):
+ - numpy
+- R
+
+
+==Installation==
+
+- Optionally run the included Python unittests to verify the dependencies:
+ nosetests galaxy-dist/tools/GCMS
+
+
+
+==License==
+
+Licensing info can be found in the files of the /LICENSE_AND_README folder located
+in the root of this (PRIMS-metabolomics) project.
+
diff -r 41f122255d14 -r 4393f982d18f METEXPtools/__init__.py
diff -r 41f122255d14 -r 4393f982d18f METEXPtools/export_to_metexp_tabular.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/METEXPtools/export_to_metexp_tabular.py Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,247 @@
+#!/usr/bin/env python
+# encoding: utf-8
+'''
+Module to combine output from the GCMS Galaxy tools RankFilter, CasLookup and MsClust
+into a tabular file that can be uploaded to the MetExp database.
+
+RankFilter and CasLookup output are already combined by combine_output.py, so here we use
+that result. In addition, one of the MsClust quantification files, containing the respective
+spectra details, is combined with it as well.
+
+Extra calculations performed:
+- The column MW is also added here and is derived from the column FORMULA found
+  in the combined RankFilter/CasLookup result.
+
+So in total here we merge 2 files and calculate one new column.
+'''
+from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611
+import csv
+import re
+import sys
+from collections import OrderedDict
+
+__author__ = "Pieter Lukasse"
+__contact__ = "pieter.lukasse@wur.nl"
+__copyright__ = "Copyright, 2013, Plant Research International, WUR"
+__license__ = "Apache v2"
+
+def _process_data(in_csv, delim='\t'):
+ '''
+ Generic method to parse a tab-separated file returning a dictionary with named columns
+ @param in_csv: input filename to be parsed
+ '''
+ data = list(csv.reader(open(in_csv, 'rU'), delimiter=delim))
+ header = data.pop(0)
+ # Create dictionary with column name as key
+ output = OrderedDict()
+ for index in xrange(len(header)):
+ output[header[index]] = [row[index] for row in data]
+ return output
+
+ONE_TO_ONE = 'one_to_one'
+N_TO_ONE = 'n_to_one'
+
+def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, metadata, relation_type=ONE_TO_ONE):
+ '''
+ Merges data from both input dictionaries based on the link fields. This method will
+ build up a new list containing the merged hits as the items.
+ @param set1: dictionary holding set1 in the form of N lists (one list per attribute name)
+ @param set2: dictionary holding set2 in the form of N lists (one list per attribute name)
+ '''
+ # TODO test for correct input files -> same link_field values should be there
+ # (test at least number of unique link_field values):
+ #
+ # if (len(set1[link_field_set1]) != len(set2[link_field_set2])):
+ # raise Exception('input files should have the same nr of key values ')
+
+
+ merged = []
+ processed = {}
+ for link_field_set1_idx in xrange(len(set1[link_field_set1])):
+ link_field_set1_value = set1[link_field_set1][link_field_set1_idx]
+        if link_field_set1_value not in processed:
+ # keep track of processed items to not repeat them
+ processed[link_field_set1_value] = link_field_set1_value
+
+ # Get the indices for current link_field_set1_value in both data-structures for proper matching
+ set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value]
+ set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value)==True ]
+ # Validation :
+ if len(set2index) == 0:
+ # means that corresponding data could not be found in set2, then throw error
+ raise Exception("Datasets not compatible, merge not possible. " + link_field_set1 + "=" +
+ link_field_set1_value + " only found in first dataset. ")
+
+ merged_hits = []
+ # Combine hits
+ for hit in xrange(len(set1index)):
+ # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do
+ # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its
+ # corresponding value in the sets; i.e.
+ # set1[key] => returns the list/array with size = nrrows, with the values for the attribute
+ # represented by "key".
+ # set1index[hit] => points to the row nr=hit (hit is a rownr/index)
+ # So set1[x][set1index[n]] = set1.attributeX.instanceN
+ #
+ # It just ensures the entry is made available as a plain named array for easy access.
+ rf_record = OrderedDict(zip(set1.keys(), [set1[key][set1index[hit]] for key in set1.keys()]))
+ if relation_type == ONE_TO_ONE :
+ cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[hit]] for key in set2.keys()]))
+ else:
+ # is N to 1:
+ cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[0]] for key in set2.keys()]))
+
+ merged_hit = merge_function(rf_record, cl_record, metadata)
+ merged_hits.append(merged_hit)
+
+ merged.append(merged_hits)
+
+ return merged
+
+
+def _compare_records(key1, key2):
+    '''
+    In this case the compare method is trivial: records match when both keys
+    contain the same value
+    '''
+    return key1 == key2
+
+
+
+def _merge_records(rank_caslookup_combi, msclust_quant_record, metadata):
+ '''
+    Combines single records from both the RankFilter+CasLookup combi file and the MsClust file
+
+ @param rank_caslookup_combi: rankfilter and caslookup combined record (see combine_output.py)
+ @param msclust_quant_record: msclust quantification + spectrum record
+ '''
+ record = []
+ for column in rank_caslookup_combi:
+ record.append(rank_caslookup_combi[column])
+
+ for column in msclust_quant_record:
+ record.append(msclust_quant_record[column])
+
+ for column in metadata:
+ record.append(metadata[column])
+
+ # add MOLECULAR MASS (MM)
+ molecular_mass = get_molecular_mass(rank_caslookup_combi['FORMULA'])
+ # limit to two decimals:
+ record.append("{0:.2f}".format(molecular_mass))
+
+ # add MOLECULAR WEIGHT (MW) - TODO - calculate this
+ record.append('0.0')
+
+ # level of identification and Location of reference standard
+ record.append('0')
+ record.append('')
+
+ return record
+
+
+def get_molecular_mass(formula):
+ '''
+ Calculates the molecular mass (MM).
+ E.g. MM of H2O = (relative)atomic mass of H x2 + (relative)atomic mass of O
+ '''
+
+    # Each element is represented by a capital letter, optionally followed by a
+    # lower case letter, with optional digits giving the count of that element:
+    element_pattern = re.compile(r"([A-Z][a-z]?)(\d*)")
+
+ total_mass = 0
+ for (element_name, count) in element_pattern.findall(formula):
+ if count == "":
+ count = 1
+ else:
+ count = int(count)
+        element_mass = float(elements_and_masses_map[element_name])  # Python's built-in float has double precision
+ total_mass += element_mass * count
+
+ return total_mass
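+
+# Worked example (relative atomic masses taken from the bundled
+# static_resources/elements_and_masses.tab, where H = 1.01 and O = 16):
+# get_molecular_mass('H2O') = 2 * 1.01 + 1 * 16 = 18.02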
+
+
+
+def _save_data(data, headers, out_csv):
+ '''
+ Writes tab-separated data to file
+ @param data: dictionary containing merged dataset
+ @param out_csv: output csv file
+ '''
+
+ # Open output file for writing
+ outfile_single_handle = open(out_csv, 'wb')
+ output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
+
+ # Write headers
+ output_single_handle.writerow(headers)
+
+ # Write
+ for item_idx in xrange(len(data)):
+ for hit in data[item_idx]:
+ output_single_handle.writerow(hit)
+
+
+def _get_map_for_elements_and_masses(elements_and_masses):
+ '''
+ This method will read out the column 'Chemical symbol' and make a map
+ of this, storing the column 'Relative atomic mass' as its value
+ '''
+    result_map = {}
+    for index, entry in enumerate(elements_and_masses['Chemical symbol']):
+        result_map[entry] = elements_and_masses['Relative atomic mass'][index]
+
+    return result_map
+
+
+def init_elements_and_masses_map():
+ '''
+ Initializes the lookup map containing the elements and their respective masses
+ '''
+ elements_and_masses = _process_data(resource_filename(__name__, "static_resources/elements_and_masses.tab"))
+ global elements_and_masses_map
+ elements_and_masses_map = _get_map_for_elements_and_masses(elements_and_masses)
+
+
+def main():
+ '''
+ Combine Output main function
+
+    RankFilter and CasLookup output are already combined by combine_output.py, so here we use
+    that result. The MsClust spectra file (.MSP) and one of the MsClust quantification
+    files are then combined with the combine_output.py result as well.
+ '''
+ rankfilter_and_caslookup_combined_file = sys.argv[1]
+ msclust_quantification_and_spectra_file = sys.argv[2]
+ output_csv = sys.argv[3]
+ # metadata
+ metadata = OrderedDict()
+ metadata['organism'] = sys.argv[4]
+ metadata['tissue'] = sys.argv[5]
+ metadata['experiment_name'] = sys.argv[6]
+ metadata['user_name'] = sys.argv[7]
+ metadata['column_type'] = sys.argv[8]
+
+ # Read RankFilter and CasLookup output files
+ rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file)
+ msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, ',')
+
+ # Read elements and masses to use for the MW/MM calculation :
+ init_elements_and_masses_map()
+
+ merged = _merge_data(rankfilter_and_caslookup_combined, 'Centrotype',
+ msclust_quantification_and_spectra, 'centrotype',
+ _compare_records, _merge_records, metadata,
+ N_TO_ONE)
+ headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys() + metadata.keys() + ['MM','MW', 'Level of identification', 'Location of reference standard']
+ _save_data(merged, headers, output_csv)
+
+
+if __name__ == '__main__':
+ main()
diff -r 41f122255d14 -r 4393f982d18f METEXPtools/export_to_metexp_tabular.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/METEXPtools/export_to_metexp_tabular.xml Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,75 @@
+
+ Create tabular file for loading into METabolomics EXPlorer database
+
+ export_to_metexp_tabular.py
+ $rankfilter_and_caslookup_combi
+ $msclust_quant_file
+ $output_result
+ "$organism"
+ "$tissue"
+ "$experiment_name"
+ "$user_name"
+ "$column_type"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+Tool to combine output from the tools RankFilter, CasLookup and MsClust
+into a tabular file that can be uploaded to the METabolomics EXPlorer (MetExp) database.
+
+RankFilter and CasLookup output are already combined by the 'RIQC-Combine RankFilter and CasLookup'
+tool, so this tool uses that result.
+
+**Notes**
+
+Extra calculations performed:
+- The columns MM and MW are added here, derived from the column FORMULA found in the combined RankFilter/CasLookup result (see the example below).
+
+So in total here we merge 2 files and calculate one new column.
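+
+For example (using the relative atomic masses from the bundled elements_and_masses.tab, where
+H = 1.01 and O = 16), a FORMULA of H2O gives MM = 2 x 1.01 + 16 = 18.02. Note that MW is
+currently written as 0.0 (still to be calculated, see export_to_metexp_tabular.py).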
+
+
+
+
diff -r 41f122255d14 -r 4393f982d18f METEXPtools/query_metexp.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/METEXPtools/query_metexp.py Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,282 @@
+#!/usr/bin/env python
+# encoding: utf-8
+'''
+Module to query a set of identifications against the METabolomics EXPlorer database.
+
+It will take the input file and for each record it will query the
+molecular mass in the selected MetExp DB. If one or more compounds are found in the
+MetExp DB then extra information regarding these compounds is added to the output file.
+
+The output file is thus the input file enriched with information about
+related items found in the selected MetExp DB.
+'''
+import csv
+import sys
+import fileinput
+import urllib2
+import time
+from collections import OrderedDict
+
+__author__ = "Pieter Lukasse"
+__contact__ = "pieter.lukasse@wur.nl"
+__copyright__ = "Copyright, 2014, Plant Research International, WUR"
+__license__ = "Apache v2"
+
+def _process_file(in_xsv, delim='\t'):
+ '''
+ Generic method to parse a tab-separated file returning a dictionary with named columns
+    @param in_xsv: input filename to be parsed
+ '''
+ data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim))
+ return _process_data(data)
+
+def _process_data(data):
+
+ header = data.pop(0)
+ # Create dictionary with column name as key
+ output = OrderedDict()
+ for index in xrange(len(header)):
+ output[header[index]] = [row[index] for row in data]
+ return output
+
+
+def _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method):
+ '''
+    This method will iterate over the records in input_data and
+    will enrich them with the related information found (if any) in the
+    MetExp Database.
+
+ # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies
+ '''
+ merged = []
+
+ for i in xrange(len(input_data[input_data.keys()[0]])):
+ # Get the record in same dictionary format as input_data, but containing
+ # a value at each column instead of a list of all values of all records:
+ input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))
+
+ # read the molecular mass and formula:
+ cas_id = input_data_record[casid_col]
+ formula = input_data_record[formula_col]
+ molecular_mass = input_data_record[molecular_mass_col]
+
+ # search for related records in MetExp:
+ data_found = None
+ if cas_id != "undef":
+ # 1- search for other experiments where this CAS id has been found:
+ query_link = metexp_dblink + "/find_entries/query?cas_nr="+ cas_id + "&method=" + separation_method
+ data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
+ data_type_found = "CAS"
+ if data_found == None:
+ # 2- search for other experiments where this FORMULA has been found:
+ query_link = metexp_dblink + "/find_entries/query?molecule_formula="+ formula + "&method=" + separation_method
+ data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
+ data_type_found = "FORMULA"
+ if data_found == None:
+ # 3- search for other experiments where this MM has been found:
+ query_link = metexp_dblink + "/find_entries/query?molecule_mass="+ molecular_mass + "&method=" + separation_method
+ data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
+ data_type_found = "MM"
+
+ if data_found == None:
+ # If still nothing found, just add empty columns
+ extra_cols = ['', '','','','','','','']
+ else:
+ # Add info found:
+ extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)
+
+ # Take all data and merge it into a "flat"/simple array of values:
+ field_values_list = _merge_data(input_data_record, extra_cols)
+
+ merged.append(field_values_list)
+
+ # return the merged/enriched records:
+ return merged
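+
+# Example (hypothetical values): with metexp_dblink = "http://metexp.example.org" and
+# separation_method = "GC", the first query fired for cas_id "C1433" would be
+#   http://metexp.example.org/find_entries/query?cas_nr=C1433&method=GC&_format_result=tsv
+# with the formula and molecular-mass queries used as fallbacks when no hits are returned.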
+
+
+def _get_extra_info_and_link_cols(data_found, data_type_found, query_link):
+ '''
+ This method will go over the data found and will return a
+ list with the following items:
+ - Experiment details where hits have been found :
+ 'organism', 'tissue','experiment_name','user_name','column_type'
+ - Link that executes same query
+
+ '''
+ # set() makes a unique list:
+ organism_set = []
+ tissue_set = []
+ experiment_name_set = []
+ user_name_set = []
+ column_type_set = []
+ cas_nr_set = []
+
+ if 'organism' in data_found:
+ organism_set = set(data_found['organism'])
+ if 'tissue' in data_found:
+ tissue_set = set(data_found['tissue'])
+ if 'experiment_name' in data_found:
+ experiment_name_set = set(data_found['experiment_name'])
+ if 'user_name' in data_found:
+ user_name_set = set(data_found['user_name'])
+ if 'column_type' in data_found:
+ column_type_set = set(data_found['column_type'])
+ if 'CAS' in data_found:
+ cas_nr_set = set(data_found['CAS'])
+
+
+ result = [data_type_found,
+ _to_xsv(organism_set),
+ _to_xsv(tissue_set),
+ _to_xsv(experiment_name_set),
+ _to_xsv(user_name_set),
+ _to_xsv(column_type_set),
+ _to_xsv(cas_nr_set),
+ #To let Excel interpret as link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"):
+ "=HYPERLINK(\""+ query_link + "\", \"Link to entries found in DB \")"]
+ return result
+
+
+def _to_xsv(data_set):
+ result = ""
+ for item in data_set:
+ result = result + str(item) + "|"
+ return result
+
+
+def _fire_query_and_return_dict(url):
+ '''
+ This method will fire the query as a web-service call and
+    return the results as a dictionary with named columns (or None when nothing is found)
+ '''
+
+ try:
+ data = urllib2.urlopen(url).read()
+
+ # transform to dictionary:
+ result = []
+ data_rows = data.split("\n")
+
+ # check if there is any data in the response:
+ if len(data_rows) <= 1 or data_rows[1].strip() == '':
+ # means there is only the header row...so no hits:
+ return None
+
+ for data_row in data_rows:
+ if not data_row.strip() == '':
+ row_as_list = _str_to_list(data_row, delimiter='\t')
+ result.append(row_as_list)
+
+ # return result processed into a dict:
+ return _process_data(result)
+
+ except urllib2.HTTPError, e:
+ raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason)
+ except urllib2.URLError, e:
+ raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if MetExp service [" + url + "] is accessible from your Galaxy server. ")
+
+def _str_to_list(data_row, delimiter='\t'):
+ result = []
+ for column in data_row.split(delimiter):
+ result.append(column)
+ return result
+
+
+# alternative: ?
+# s = requests.Session()
+# s.verify = False
+# #s.auth = (token01, token02)
+# resp = s.get(url, params={'name': 'anonymous'}, stream=True)
+# content = resp.content
+# # transform to dictionary:
+
+
+
+
+def _merge_data(input_data_record, extra_cols):
+ '''
+ Adds the extra information to the existing data record and returns
+ the combined new record.
+ '''
+ record = []
+ for column in input_data_record:
+ record.append(input_data_record[column])
+
+
+ # add extra columns
+ for column in extra_cols:
+ record.append(column)
+
+ return record
+
+
+def _save_data(data_rows, headers, out_csv):
+ '''
+ Writes tab-separated data to file
+ @param data_rows: dictionary containing merged/enriched dataset
+ @param out_csv: output csv file
+ '''
+
+ # Open output file for writing
+ outfile_single_handle = open(out_csv, 'wb')
+ output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
+
+ # Write headers
+ output_single_handle.writerow(headers)
+
+ # Write one line for each row
+ for data_row in data_rows:
+ output_single_handle.writerow(data_row)
+
+def _get_metexp_URL(metexp_dblink_file):
+ '''
+ Read out and return the URL stored in the given file.
+ '''
+ file_input = fileinput.input(metexp_dblink_file)
+ try:
+ for line in file_input:
+ if line[0] != '#':
+ # just return the first line that is not a comment line:
+ return line
+ finally:
+ file_input.close()
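+
+# Example (hypothetical file content): a metexp_dblink_file could look like
+#   # lines starting with '#' are comments
+#   http://metexp.example.org
+# in which case this function returns the first non-comment line.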
+
+
+def main():
+ '''
+ MetExp Query main function
+
+ The input file can be any tabular file, as long as it contains a column for the molecular mass
+ and one for the formula of the respective identification. These two columns are then
+    used to query against the MetExp database.
+ '''
+ seconds_start = int(round(time.time()))
+
+ input_file = sys.argv[1]
+ casid_col = sys.argv[2]
+ formula_col = sys.argv[3]
+ molecular_mass_col = sys.argv[4]
+ metexp_dblink_file = sys.argv[5]
+ separation_method = sys.argv[6]
+ output_result = sys.argv[7]
+
+ # Parse metexp_dblink_file to find the URL to the MetExp service:
+ metexp_dblink = _get_metexp_URL(metexp_dblink_file)
+
+ # Parse tabular input file into dictionary/array:
+ input_data = _process_file(input_file)
+
+ # Query data against MetExp DB :
+ enriched_data = _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method)
+ headers = input_data.keys() + ['METEXP hits for ','METEXP hits: organisms', 'METEXP hits: tissues',
+ 'METEXP hits: experiments','METEXP hits: user names','METEXP hits: column types', 'METEXP hits: CAS nrs', 'Link to METEXP hits']
+
+ _save_data(enriched_data, headers, output_result)
+
+ seconds_end = int(round(time.time()))
+ print "Took " + str(seconds_end - seconds_start) + " seconds"
+
+
+
+if __name__ == '__main__':
+ main()
diff -r 41f122255d14 -r 4393f982d18f METEXPtools/query_metexp.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/METEXPtools/query_metexp.xml Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,69 @@
+
+ Query a set of identifications against the METabolomics EXPlorer database
+
+ query_metexp.py
+ $input_file
+ "$casid_col"
+ "$formula_col"
+ "$molecular_mass_col"
+ "$metexp_dblink_file"
+ $separation_method
+ $output_result
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+This tool will query a set of identifications against the METabolomics EXPlorer database.
+
+It will take the input file and for each record it will query the
+molecular mass in the selected MetExp DB. If one or more compounds are found in the
+MetExp DB then extra information regarding these compounds is added to the output file.
+
+The output file is thus the input file enriched with information about
+related items found in the selected MetExp DB.
+
+**Notes**
+
+The input file can be any tabular file, as long as it contains a column for the molecular mass
+and one for the formula of the respective identification.
+
+
+
+
diff -r 41f122255d14 -r 4393f982d18f METEXPtools/static_resources/README.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/METEXPtools/static_resources/README.txt Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,3 @@
+This folder and respective files should be deployed together with the following tools:
+
+ - ../export_to_metexp_tabular.py
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f METEXPtools/static_resources/elements_and_masses.tab
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/METEXPtools/static_resources/elements_and_masses.tab Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,104 @@
+Name Atomic number Chemical symbol Relative atomic mass
+Hydrogen 1 H 1.01
+Helium 2 He 4
+Lithium 3 Li 6.94
+Beryllium 4 Be 9.01
+Boron 5 B 10.81
+Carbon 6 C 12.01
+Nitrogen 7 N 14.01
+Oxygen 8 O 16
+Fluorine 9 F 19
+Neon 10 Ne 20.18
+Sodium 11 Na 22.99
+Magnesium 12 Mg 24.31
+Aluminum 13 Al 26.98
+Silicon 14 Si 28.09
+Phosphorus 15 P 30.98
+Sulfur 16 S 32.06
+Chlorine 17 Cl 35.45
+Argon 18 Ar 39.95
+Potassium 19 K 39.1
+Calcium 20 Ca 40.08
+Scandium 21 Sc 44.96
+Titanium 22 Ti 47.9
+Vanadium 23 V 50.94
+Chromium 24 Cr 52
+Manganese 25 Mn 54.94
+Iron 26 Fe 55.85
+Cobalt 27 Co 58.93
+Nickel 28 Ni 58.71
+Copper 29 Cu 63.54
+Zinc 30 Zn 65.37
+Gallium 31 Ga 69.72
+Germanium 32 Ge 72.59
+Arsenic 33 As 74.99
+Selenium 34 Se 78.96
+Bromine 35 Br 79.91
+Krypton 36 Kr 83.8
+Rubidium 37 Rb 85.47
+Strontium 38 Sr 87.62
+Yttrium 39 Y 88.91
+Zirconium 40 Zr 91.22
+Niobium 41 Nb 92.91
+Molybdenum 42 Mo 95.94
+Technetium 43 Tc 96.91
+Ruthenium 44 Ru 101.07
+Rhodium 45 Rh 102.9
+Palladium 46 Pd 106.4
+Silver 47 Ag 107.87
+Cadmium 48 Cd 112.4
+Indium 49 In 114.82
+Tin 50 Sn 118.69
+Antimony 51 Sb 121.75
+Tellurium 52 Te 127.6
+Iodine 53 I 126.9
+Xenon 54 Xe 131.3
+Cesium 55 Cs 132.9
+Barium 56 Ba 137.34
+Lanthanum 57 La 138.91
+Cerium 58 Ce 140.12
+Praseodymium 59 Pr 140.91
+Neodymium 60 Nd 144.24
+Promethium 61 Pm 144.91
+Samarium 62 Sm 150.35
+Europium 63 Eu 151.96
+Gadolinium 64 Gd 157.25
+Terbium 65 Tb 158.92
+Dysprosium 66 Dy 162.5
+Holmium 67 Ho 164.93
+Erbium 68 Er 167.26
+Thulium 69 Tm 168.93
+Ytterbium 70 Yb 173.04
+Lutetium 71 Lu 174.97
+Hafnium 72 Hf 178.49
+Tantalum 73 Ta 180.95
+Wolfram 74 W 183.85
+Rhenium 75 Re 186.2
+Osmium 76 Os 190.2
+Iridium 77 Ir 192.22
+Platinum 78 Pt 195.09
+Gold 79 Au 196.97
+Mercury 80 Hg 200.59
+Thallium 81 Tl 204.37
+Lead 82 Pb 207.19
+Bismuth 83 Bi 208.98
+Polonium 84 Po 208.98
+Astatine 85 At 209.99
+Radon 86 Rn 222.02
+Francium 87 Fr 223.02
+Radium 88 Ra 226
+Actinium 89 Ac 227.03
+Thorium 90 Th 232.04
+Protactinium 91 Pa 231.04
+Uranium 92 U 238.03
+Neptunium 93 Np 237
+Plutonium 94 Pu 242
+Americium 95 Am 243.06
+Curium 96 Cm 247.07
+Berkelium 97 Bk 247.07
+Californium 98 Cf 251.08
+Einsteinium 99 Es 254.09
+Fermium 100 Fm 257.1
+Mendelevium 101 Md 257.1
+Nobelium 102 No 255.09
+Lawrencium 103 Lr 256.1
diff -r 41f122255d14 -r 4393f982d18f MS/__init__.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/MS/__init__.py Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,6 @@
+'''
+Module containing Galaxy tools for the LC or GC/MS pipeline
+Created on Mar , 2014
+
+@author: pieter lukasse
+'''
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f MS/query_mass_repos.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/MS/query_mass_repos.py Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,289 @@
+#!/usr/bin/env python
+# encoding: utf-8
+'''
+Module to query a set of accurate mass values detected by high-resolution mass spectrometers
+against various repositories/services such as METabolomics EXPlorer database or the
+MFSearcher service (http://webs2.kazusa.or.jp/mfsearcher/).
+
+It will take the input file and for each record it will query the
+molecular mass in the selected repository/service. If one or more compounds are found
+then extra information regarding these compounds is added to the output file.
+
+The output file is thus the input file enriched with information about
+related items found in the selected repository/service.
+
+The service should implement the following interface:
+
+http://service_url/mass?targetMs=500&margin=1&marginUnit=ppm&output=txth (txth means there is guaranteed to be a header line before the data)
+
+The output should be tab separated and should contain the following columns (in this order)
+db-name molecular-formula dbe formula-weight id description
+
+
+'''
+import csv
+import sys
+import fileinput
+import urllib2
+import time
+from collections import OrderedDict
+
+__author__ = "Pieter Lukasse"
+__contact__ = "pieter.lukasse@wur.nl"
+__copyright__ = "Copyright, 2014, Plant Research International, WUR"
+__license__ = "Apache v2"
+
+def _process_file(in_xsv, delim='\t'):
+ '''
+ Generic method to parse a tab-separated file returning a dictionary with named columns
+    @param in_xsv: input filename to be parsed
+ '''
+ data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim))
+ return _process_data(data)
+
+def _process_data(data):
+
+ header = data.pop(0)
+ # Create dictionary with column name as key
+ output = OrderedDict()
+ for index in xrange(len(header)):
+ output[header[index]] = [row[index] for row in data]
+ return output
+
+
+def _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit):
+    '''
+    This method will iterate over the records in input_data and
+    will enrich them with the related information found (if any) in the
+    chosen repository/service.
+
+ # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies
+ '''
+ merged = []
+
+ for i in xrange(len(input_data[input_data.keys()[0]])):
+ # Get the record in same dictionary format as input_data, but containing
+ # a value at each column instead of a list of all values of all records:
+ input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))
+
+ # read the molecular mass :
+ molecular_mass = input_data_record[molecular_mass_col]
+
+
+ # search for related records in repository/service:
+ data_found = None
+ if molecular_mass != "":
+ molecular_mass = float(molecular_mass)
+
+ # 1- search for data around this MM:
+ query_link = repository_dblink + "/mass?targetMs=" + str(molecular_mass) + "&margin=" + str(error_margin) + "&marginUnit=" + margin_unit + "&output=txth"
+
+ data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
+ data_type_found = "MM"
+
+
+ if data_found == None:
+ # If still nothing found, just add empty columns
+ extra_cols = ['', '','','','','']
+ else:
+ # Add info found:
+ extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)
+
+ # Take all data and merge it into a "flat"/simple array of values:
+ field_values_list = _merge_data(input_data_record, extra_cols)
+
+ merged.append(field_values_list)
+
+ # return the merged/enriched records:
+ return merged
+
+
+def _get_extra_info_and_link_cols(data_found, data_type_found, query_link):
+ '''
+ This method will go over the data found and will return a
+ list with the following items:
+ - details of hits found :
+ db-name molecular-formula dbe formula-weight id description
+ - Link that executes same query
+
+ '''
+
+ # set() makes a unique list:
+ db_name_set = []
+ molecular_formula_set = []
+ id_set = []
+ description_set = []
+
+
+ if 'db-name' in data_found:
+ db_name_set = set(data_found['db-name'])
+ elif '# db-name' in data_found:
+ db_name_set = set(data_found['# db-name'])
+ if 'molecular-formula' in data_found:
+ molecular_formula_set = set(data_found['molecular-formula'])
+ if 'id' in data_found:
+ id_set = set(data_found['id'])
+ if 'description' in data_found:
+ description_set = set(data_found['description'])
+
+ result = [data_type_found,
+ _to_xsv(db_name_set),
+ _to_xsv(molecular_formula_set),
+ _to_xsv(id_set),
+ _to_xsv(description_set),
+ #To let Excel interpret as link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"):
+ "=HYPERLINK(\""+ query_link + "\", \"Link to entries found in DB \")"]
+ return result
+
+
+def _to_xsv(data_set):
+ result = ""
+ for item in data_set:
+ result = result + str(item) + "|"
+ return result
+
+
+def _fire_query_and_return_dict(url):
+ '''
+ This method will fire the query as a web-service call and
+    return the results as a dictionary with named columns (or None when nothing is found)
+ '''
+
+ try:
+ data = urllib2.urlopen(url).read()
+
+ # transform to dictionary:
+ result = []
+ data_rows = data.split("\n")
+
+        # remove comment lines, if any (keep only the comment line that contains the word
+        # "molecular-formula", i.e. the header line as returned by the Kazusa service):
+ data_rows_to_remove = []
+ for data_row in data_rows:
+ if data_row == "" or (data_row[0] == '#' and "molecular-formula" not in data_row):
+ data_rows_to_remove.append(data_row)
+
+ for data_row in data_rows_to_remove:
+ data_rows.remove(data_row)
+
+ # check if there is any data in the response:
+ if len(data_rows) <= 1 or data_rows[1].strip() == '':
+ # means there is only the header row...so no hits:
+ return None
+
+ for data_row in data_rows:
+ if not data_row.strip() == '':
+ row_as_list = _str_to_list(data_row, delimiter='\t')
+ result.append(row_as_list)
+
+ # return result processed into a dict:
+ return _process_data(result)
+
+ except urllib2.HTTPError, e:
+ raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason)
+ except urllib2.URLError, e:
+ raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if service [" + url + "] is accessible from your Galaxy server. ")
+
+def _str_to_list(data_row, delimiter='\t'):
+ result = []
+ for column in data_row.split(delimiter):
+ result.append(column)
+ return result
+
+
+# alternative: ?
+# s = requests.Session()
+# s.verify = False
+# #s.auth = (token01, token02)
+# resp = s.get(url, params={'name': 'anonymous'}, stream=True)
+# content = resp.content
+# # transform to dictionary:
+
+
+
+
+def _merge_data(input_data_record, extra_cols):
+ '''
+ Adds the extra information to the existing data record and returns
+ the combined new record.
+ '''
+ record = []
+ for column in input_data_record:
+ record.append(input_data_record[column])
+
+
+ # add extra columns
+ for column in extra_cols:
+ record.append(column)
+
+ return record
+
+
+def _save_data(data_rows, headers, out_csv):
+ '''
+ Writes tab-separated data to file
+ @param data_rows: dictionary containing merged/enriched dataset
+ @param out_csv: output csv file
+ '''
+
+ # Open output file for writing
+ outfile_single_handle = open(out_csv, 'wb')
+ output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
+
+ # Write headers
+ output_single_handle.writerow(headers)
+
+ # Write one line for each row
+ for data_row in data_rows:
+ output_single_handle.writerow(data_row)
+
+def _get_repository_URL(repository_file):
+ '''
+ Read out and return the URL stored in the given file.
+ '''
+ file_input = fileinput.input(repository_file)
+ try:
+ for line in file_input:
+ if line[0] != '#':
+ # just return the first line that is not a comment line:
+ return line
+ finally:
+ file_input.close()
+
+
+def main():
+ '''
+ Query main function
+
+ The input file can be any tabular file, as long as it contains a column for the molecular mass.
+ This column is then used to query against the chosen repository/service Database.
+ '''
+ seconds_start = int(round(time.time()))
+
+ input_file = sys.argv[1]
+ molecular_mass_col = sys.argv[2]
+ repository_file = sys.argv[3]
+ error_margin = float(sys.argv[4])
+ margin_unit = sys.argv[5]
+ output_result = sys.argv[6]
+
+ # Parse repository_file to find the URL to the service:
+ repository_dblink = _get_repository_URL(repository_file)
+
+ # Parse tabular input file into dictionary/array:
+ input_data = _process_file(input_file)
+
+ # Query data against repository :
+ enriched_data = _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit)
+    headers = input_data.keys() + ['SEARCH hits for ','SEARCH hits: db-names', 'SEARCH hits: molecular-formulas ',
+                   'SEARCH hits: ids','SEARCH hits: descriptions', 'Link to SEARCH hits'] #TODO - add min and max formula weight columns
+
+ _save_data(enriched_data, headers, output_result)
+
+ seconds_end = int(round(time.time()))
+ print "Took " + str(seconds_end - seconds_start) + " seconds"
+
+
+
+if __name__ == '__main__':
+ main()
diff -r 41f122255d14 -r 4393f982d18f MS/query_mass_repos.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/MS/query_mass_repos.xml Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,106 @@
+
+ Query multiple public repositories for elemental compositions from accurate mass values detected by high-resolution mass spectrometers
+
+ query_mass_repos.py
+ $input_file
+ "$molecular_mass_col"
+ "$repository_file"
+ $error_margin
+ $margin_unit
+ $output_result
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+This tool will query multiple public repositories such as PRI-MetExp or http://webs2.kazusa.or.jp/mfsearcher
+for elemental compositions from accurate mass values detected by high-resolution mass spectrometers.
+
+It takes the input file and, for each record, queries the molecular mass in the
+selected repository. If one or more compounds are found in the repository, extra
+information about the (mass-based) matching elemental composition formulas is added to the output file.
+
+The output file is thus the input file enriched with information about
+related items found in the selected repository.
+
+**Notes**
+
+The input file can be any tabular file, as long as it contains a column for the molecular mass.
+
+**Services that can be queried**
+
+================= =========================================================================
+Database Description
+----------------- -------------------------------------------------------------------------
+PRI-MetExp LC-MS and GC-MS data from experiments from the metabolomics group at
+ Plant Research International. NB: restricted access to employees with
+ access key.
+ExactMassDB A database of possible elemental compositions consisting of at most
+ C: 100, H: 200, O: 50, N: 10, P: 10, and S: 10 atoms, that satisfy the
+ Senior and the Lewis valence rules.
+ (via /mfsearcher/exmassdb/)
+ExactMassDB-HR2 HR2, which is one of the fastest tools for calculation of elemental
+ compositions, filters some elemental compositions according to
+ the Seven Golden Rules (Kind and Fiehn, 2007). The ExactMassDB-HR2
+ database returns the same result as does HR2 with the same atom kind
+ and number condition as that used in construction of the ExactMassDB.
+ (via /mfsearcher/exmassdb-hr2/)
+Pep1000 A database of possible linear polypeptides that are
+ constructed with 20 kinds of amino acids and having molecular
+ weights smaller than 1000.
+ (via /mfsearcher/pep1000/)
+KEGG Re-calculated compound data from KEGG. Weekly updated.
+ (via /mfsearcher/kegg/)
+KNApSAcK Re-calculated compound data from KNApSAcK.
+ (via /mfsearcher/knapsack/)
+Flavonoid Viewer Re-calculated compound data from Flavonoid Viewer.
+ (via /mfsearcher/flavonoidviewer/)
+LipidMAPS Re-calculated compound data from LIPID MAPS.
+ (via /mfsearcher/lipidmaps/)
+HMDB Re-calculated compound data from Human Metabolome Database (HMDB)
+ Version 3.5.
+ (via /mfsearcher/hmdb/)
+PubChem Re-calculated compound data from PubChem. Monthly updated.
+ (via /mfsearcher/pubchem/)
+================= =========================================================================
+
+Sources for table above: PRI-MetExp and http://webs2.kazusa.or.jp/mfsearcher
+
+
+
diff -r 41f122255d14 -r 4393f982d18f __init__.py
diff -r 41f122255d14 -r 4393f982d18f combine_output.py
--- a/combine_output.py Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,253 +0,0 @@
-#!/usr/bin/env python
-# encoding: utf-8
-'''
-Module to combine output from two GCMS Galaxy tools (RankFilter and CasLookup)
-'''
-
-import csv
-import re
-import sys
-import math
-import pprint
-
-__author__ = "Marcel Kempenaar"
-__contact__ = "brs@nbic.nl"
-__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
-__license__ = "MIT"
-
-def _process_data(in_csv):
- '''
- Generic method to parse a tab-separated file returning a dictionary with named columns
- @param in_csv: input filename to be parsed
- '''
- data = list(csv.reader(open(in_csv, 'rU'), delimiter='\t'))
- header = data.pop(0)
- # Create dictionary with column name as key
- output = {}
- for index in xrange(len(header)):
- output[header[index]] = [row[index] for row in data]
- return output
-
-
-def _merge_data(rankfilter, caslookup):
- '''
- Merges data from both input dictionaries based on the Centrotype field. This method will
- build up a new list containing the merged hits as the items.
- @param rankfilter: dictionary holding RankFilter output in the form of N lists (one list per attribute name)
- @param caslookup: dictionary holding CasLookup output in the form of N lists (one list per attribute name)
- '''
- # TODO: test for correct input files -> rankfilter and caslookup internal lists should have the same lengths:
- if (len(rankfilter['ID']) != len(caslookup['Centrotype'])):
- raise Exception('rankfilter and caslookup files should have the same nr of rows/records ')
-
- merged = []
- processed = {}
- for compound_id_idx in xrange(len(rankfilter['ID'])):
- compound_id = rankfilter['ID'][compound_id_idx]
- if not compound_id in processed :
- # keep track of processed items to not repeat them
- processed[compound_id] = compound_id
- # get centrotype nr
- centrotype = compound_id.split('-')[0]
- # Get the indices for current compound ID in both data-structures for proper matching
- rindex = [index for index, value in enumerate(rankfilter['ID']) if value == compound_id]
- cindex = [index for index, value in enumerate(caslookup['Centrotype']) if value == centrotype]
-
- merged_hits = []
- # Combine hits
- for hit in xrange(len(rindex)):
- # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do
- # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its
- # corresponding value in the rankfilter or caslookup tables; i.e.
- # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute
- # represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index)
- rf_record = dict(zip(rankfilter.keys(), [rankfilter[key][rindex[hit]] for key in rankfilter.keys()]))
- cl_record = dict(zip(caslookup.keys(), [caslookup[key][cindex[hit]] for key in caslookup.keys()]))
-
- merged_hit = _add_hit(rf_record, cl_record)
- merged_hits.append(merged_hit)
-
- merged.append(merged_hits)
-
- return merged, len(rindex)
-
-
-def _add_hit(rankfilter, caslookup):
- '''
- Combines single records from both the RankFilter- and CasLookup-tools
- @param rankfilter: record (dictionary) of one compound in the RankFilter output
- @param caslookup: matching record (dictionary) of one compound in the CasLookup output
- '''
- # The ID in the RankFilter output contains the following 5 fields:
- rf_id = rankfilter['ID'].split('-')
- try:
- name, formula = _remove_formula(rankfilter['Name'])
- hit = [rf_id[0], # Centrotype
- rf_id[1], # cent.Factor
- rf_id[2], # scan nr
- rf_id[3], # R.T. (umin)
- rf_id[4], # nr. Peaks
- # Appending other fields
- rankfilter['R.T.'],
- name,
- caslookup['FORMULA'] if not formula else formula,
- rankfilter['Library'].strip(),
- rankfilter['CAS'].strip(),
- rankfilter['Forward'],
- rankfilter['Reverse'],
- ((float(rankfilter['Forward']) + float(rankfilter['Reverse'])) / 2),
- rankfilter['RIexp'],
- caslookup['RI'],
- rankfilter['RIsvr'],
- # Calculate absolute differences
- math.fabs(float(rankfilter['RIexp']) - float(rankfilter['RIsvr'])),
- math.fabs(float(caslookup['RI']) - float(rankfilter['RIexp'])),
- caslookup['Regression.Column.Name'],
- caslookup['min'],
- caslookup['max'],
- caslookup['nr.duplicates'],
- caslookup['Column.phase.type'],
- caslookup['Column.name'],
- rankfilter['Rank'],
- rankfilter['%rel.err'],
- rankfilter['Synonyms']]
- except KeyError as error:
- print "Problem reading in data from input file(s):\n",
- print "Respective CasLookup entry: \n", pprint.pprint(caslookup), "\n"
- print "Respective RankFilter entry: \n", pprint.pprint(rankfilter), "\n"
- raise error
-
- return hit
-
-
-def _remove_formula(name):
- '''
- The RankFilter Name field often contains the Formula as well; this function removes it from the Name
- @param name: complete name of the compound from the RankFilter output
- '''
- name = name.split()
- poss_formula = name[-1]
- match = re.match("^(([A-Z][a-z]{0,2})(\d*))+$", poss_formula)
- if match:
- return ' '.join(name[:-1]), poss_formula
- else:
- return ' '.join(name), False
-
-
-def _get_default_caslookup():
- '''
- The CasLookup tool might not have found all compounds in the searched library;
- this default dict is then used when combining with the RankFilter output
- '''
- return {'FORMULA': 'N/A',
- 'RI': '0.0',
- 'Regression.Column.Name': 'None',
- 'min': '0.0',
- 'max': '0.0',
- 'nr.duplicates': '0',
- 'Column.phase.type': 'N/A',
- 'Column.name': 'N/A'}
-
-
-def _save_data(data, nhits, out_csv_single, out_csv_multi):
- '''
- Writes tab-separated data to file
- @param data: dictionary containing merged dataset
- @param out_csv: output csv file
- '''
- # Columns we don't repeat:
- header_part1 = ['Centrotype',
- 'cent.Factor',
- 'scan nr.',
- 'R.T. (umin)',
- 'nr. Peaks',
- 'R.T.']
- # These are the headers/columns we repeat in case of
- # combining hits in one line (see alternative_headers method below):
- header_part2 = [
- 'Name',
- 'FORMULA',
- 'Library',
- 'CAS',
- 'Forward',
- 'Reverse',
- 'Avg. (Forward, Reverse)',
- 'RIexp',
- 'RI',
- 'RIsvr',
- 'RIexp - RIsvr',
- 'RI - RIexp',
- 'Regression.Column.Name',
- 'min',
- 'max',
- 'nr.duplicates',
- 'Column.phase.type',
- 'Column.name',
- 'Rank',
- '%rel.err',
- 'Synonyms']
-
- # Open output file for writing
- outfile_single_handle = open(out_csv_single, 'wb')
- outfile_multi_handle = open(out_csv_multi, 'wb')
- output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
- output_multi_handle = csv.writer(outfile_multi_handle, delimiter="\t")
-
- # Write headers
- output_single_handle.writerow(header_part1 + header_part2)
- output_multi_handle.writerow(header_part1 + header_part2 + alternative_headers(header_part2, nhits-1))
- # Combine all hits for each centrotype into one line
- line = []
- for centrotype_idx in xrange(len(data)):
- i = 0
- for hit in data[centrotype_idx]:
- if i==0:
- line.extend(hit)
- else:
- line.extend(hit[6:])
- i = i+1
- # small validation (if error, it is a programming error):
- if i > nhits:
- raise Exception('Error: more hits than expected for centrotype_idx ' + str(centrotype_idx))
- output_multi_handle.writerow(line)
- line = []
-
- # Write one line for each centrotype
- for centrotype_idx in xrange(len(data)):
- for hit in data[centrotype_idx]:
- output_single_handle.writerow(hit)
-
-def alternative_headers(header_part2, nr_alternative_hits):
- '''
- This method will iterate over the header names and add the string 'ALT#_' before each,
- where # is the number of the alternative, according to number of alternative hits we want to add
- to final csv/tsv
- '''
- result = []
- for i in xrange(nr_alternative_hits):
- for header_name in header_part2:
- result.append("ALT" + str(i+1) + "_" + header_name)
- return result
-
-def main():
- '''
- Combine Output main function
- It will merge the result files from "RankFilter" and "Lookup RI for CAS numbers"
- NB: the caslookup_result_file will typically have fewer lines than
- rankfilter_result_file, so the merge has to consider this as well. The final file
- should have the same nr of lines as rankfilter_result_file.
- '''
- rankfilter_result_file = sys.argv[1]
- caslookup_result_file = sys.argv[2]
- output_single_csv = sys.argv[3]
- output_multi_csv = sys.argv[4]
-
- # Read RankFilter and CasLookup output files
- rankfilter = _process_data(rankfilter_result_file)
- caslookup = _process_data(caslookup_result_file)
- merged, nhits = _merge_data(rankfilter, caslookup)
- _save_data(merged, nhits, output_single_csv, output_multi_csv)
-
-
-if __name__ == '__main__':
- main()
diff -r 41f122255d14 -r 4393f982d18f combine_output.xml
--- a/combine_output.xml Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,36 +0,0 @@
-
- Perform a combination of output data from the RankFilter and CasLookup tools
-
- combine_output.py $rankfilter_in $caslookup_in $out_single $out_multi
-
-
-
-
-
-
-
-
-
-
-
-Performs a combination of output files from the 'RankFilter' and 'Lookup RI for CAS' tools into two tab-separated files.
-
-The files produced contain either all hits for a compound on a single line (Single) or on separate lines
-(Multi).
-
-.. class:: infomark
-
-**Notes**
-
-The input data should be produced by the RankFilter and 'Lookup RI for CAS' tools provided on this Galaxy server with the
-original headers kept intact. Processing steps include:
-
- - Columns are added showing the average Forward/Reverse value and the RIexp - RIsvr and RI - RIexp differences
- - The ID column of the RankFilter tool output is split into 'Centrotype', 'cent.Factor', 'scan nr.', 'R.T. (umin)'
- and 'nr. Peaks' fields.
- - The formula is split off the 'Name' field in the RankFilter output
-
-
-
diff -r 41f122255d14 -r 4393f982d18f create_model.xml
--- a/create_model.xml Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,78 +0,0 @@
-
- Generate coefficients to enable the regression from one GC-column
- to another GC-column
- Rscripts/ridb-regression.R
- $ridb
- $out_model
- $out_log
- $min_residuals
- $range_mod
- $pvalue
- $rsquared
- $method
- $plot
- #if $plot
- $model_graphics
- #end if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- (plot)
-
-
-
-
-
-Calculates regression models for each ordered pair of GC columns contained in the selected
-RI database file. Each model is based on either a 3rd-degree polynomial or a standard
-linear fit.
-
-The *Minimum number of residuals* option only allows a model if the pair of columns it is
-based on shares at least that number of data points for the same compounds.
-
-Filtering is possible by setting an upper limit for the *p-value* and / or a lower limit for
-the *R squared* value. The produced logfile will state how many models have been discarded due
-to this filtering. The output model file also includes the p-value and R squared value for
-each created model.
-
-Graphical output of the models is available by selecting the plot option which shows the
-data points used for the model as well as the fit itself and the range of data that will
-be usable.
-
-.. class:: infomark
-
-**Notes**
-
-The output file produced by this tool is required as input for the CasLookup tool when
-selecting to apply regression when finding hits in the RIDB.
-
-
diff -r 41f122255d14 -r 4393f982d18f export_to_metexp_tabular.py
--- a/export_to_metexp_tabular.py Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,247 +0,0 @@
-#!/usr/bin/env python
-# encoding: utf-8
-'''
-Module to combine output from the GCMS Galaxy tools RankFilter, CasLookup and MsClust
-into a tabular file that can be uploaded to the MetExp database.
-
-RankFilter and CasLookup output are already combined by combine_output.py, so here we use
-that result. In addition, one of the MsClust quantification files, containing the
-respective spectra details, is combined with it.
-
-Extra calculations performed:
-- The column MW is also added here; it is derived from the column FORMULA found
- in the RankFilter/CasLookup combined result.
-
-So in total we merge 2 files here and calculate one new column.
-'''
-from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611
-import csv
-import re
-import sys
-from collections import OrderedDict
-
-__author__ = "Pieter Lukasse"
-__contact__ = "pieter.lukasse@wur.nl"
-__copyright__ = "Copyright, 2013, Plant Research International, WUR"
-__license__ = "Apache v2"
-
-def _process_data(in_csv, delim='\t'):
- '''
- Generic method to parse a tab-separated file returning a dictionary with named columns
- @param in_csv: input filename to be parsed
- '''
- data = list(csv.reader(open(in_csv, 'rU'), delimiter=delim))
- header = data.pop(0)
- # Create dictionary with column name as key
- output = OrderedDict()
- for index in xrange(len(header)):
- output[header[index]] = [row[index] for row in data]
- return output
-
-ONE_TO_ONE = 'one_to_one'
-N_TO_ONE = 'n_to_one'
-
-def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, metadata, relation_type=ONE_TO_ONE):
- '''
- Merges data from both input dictionaries based on the link fields. This method will
- build up a new list containing the merged hits as the items.
- @param set1: dictionary holding set1 in the form of N lists (one list per attribute name)
- @param set2: dictionary holding set2 in the form of N lists (one list per attribute name)
- '''
- # TODO test for correct input files -> same link_field values should be there
- # (test at least number of unique link_field values):
- #
- # if (len(set1[link_field_set1]) != len(set2[link_field_set2])):
- # raise Exception('input files should have the same nr of key values ')
-
-
- merged = []
- processed = {}
- for link_field_set1_idx in xrange(len(set1[link_field_set1])):
- link_field_set1_value = set1[link_field_set1][link_field_set1_idx]
- if not link_field_set1_value in processed :
- # keep track of processed items to not repeat them
- processed[link_field_set1_value] = link_field_set1_value
-
- # Get the indices for current link_field_set1_value in both data-structures for proper matching
- set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value]
- set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value)]
- # Validation :
- if len(set2index) == 0:
- # means that corresponding data could not be found in set2, then throw error
- raise Exception("Datasets not compatible, merge not possible. " + link_field_set1 + "=" +
- link_field_set1_value + " only found in first dataset. ")
-
- merged_hits = []
- # Combine hits
- for hit in xrange(len(set1index)):
- # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do
- # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its
- # corresponding value in the sets; i.e.
- # set1[key] => returns the list/array with size = nrrows, with the values for the attribute
- # represented by "key".
- # set1index[hit] => points to the row nr=hit (hit is a rownr/index)
- # So set1[x][set1index[n]] = set1.attributeX.instanceN
- #
- # It just ensures the entry is made available as a plain named array for easy access.
- rf_record = OrderedDict(zip(set1.keys(), [set1[key][set1index[hit]] for key in set1.keys()]))
- if relation_type == ONE_TO_ONE :
- cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[hit]] for key in set2.keys()]))
- else:
- # is N to 1:
- cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[0]] for key in set2.keys()]))
-
- merged_hit = merge_function(rf_record, cl_record, metadata)
- merged_hits.append(merged_hit)
-
- merged.append(merged_hits)
-
- return merged
-
-
-def _compare_records(key1, key2):
- '''
- In this case the comparison is trivial: both keys are expected to contain the
- same value when the records match.
- '''
- return key1 == key2
-
-
-
-def _merge_records(rank_caslookup_combi, msclust_quant_record, metadata):
- '''
- Combines single records from both the RankFilter+CasLookup combi file and from MsClust file
-
- @param rank_caslookup_combi: rankfilter and caslookup combined record (see combine_output.py)
- @param msclust_quant_record: msclust quantification + spectrum record
- '''
- record = []
- for column in rank_caslookup_combi:
- record.append(rank_caslookup_combi[column])
-
- for column in msclust_quant_record:
- record.append(msclust_quant_record[column])
-
- for column in metadata:
- record.append(metadata[column])
-
- # add MOLECULAR MASS (MM)
- molecular_mass = get_molecular_mass(rank_caslookup_combi['FORMULA'])
- # limit to two decimals:
- record.append("{0:.2f}".format(molecular_mass))
-
- # add MOLECULAR WEIGHT (MW) - TODO - calculate this
- record.append('0.0')
-
- # level of identification and Location of reference standard
- record.append('0')
- record.append('')
-
- return record
-
-
-def get_molecular_mass(formula):
- '''
- Calculates the molecular mass (MM).
- E.g. MM of H2O = (relative)atomic mass of H x2 + (relative)atomic mass of O
- '''
-
- # Each element is represented by a capital letter, followed optionally by
- # lower case, with one or more digits as for how many elements:
- element_pattern = re.compile("([A-Z][a-z]?)(\d*)")
-
- total_mass = 0
- for (element_name, count) in element_pattern.findall(formula):
- if count == "":
- count = 1
- else:
- count = int(count)
- element_mass = float(elements_and_masses_map[element_name]) # Python's built-in float is a C double, which is precise enough here
- total_mass += element_mass * count
-
- return total_mass
-
-
-
-def _save_data(data, headers, out_csv):
- '''
- Writes tab-separated data to file
- @param data: dictionary containing merged dataset
- @param out_csv: output csv file
- '''
-
- # Open output file for writing
- outfile_single_handle = open(out_csv, 'wb')
- output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
-
- # Write headers
- output_single_handle.writerow(headers)
-
- # Write
- for item_idx in xrange(len(data)):
- for hit in data[item_idx]:
- output_single_handle.writerow(hit)
-
-
-def _get_map_for_elements_and_masses(elements_and_masses):
- '''
- This method will read out the column 'Chemical symbol' and make a map
- of this, storing the column 'Relative atomic mass' as its value
- '''
- result_map = {}
- for index, entry in enumerate(elements_and_masses['Chemical symbol']):
- result_map[entry] = elements_and_masses['Relative atomic mass'][index]
-
- return result_map
-
-
-def init_elements_and_masses_map():
- '''
- Initializes the lookup map containing the elements and their respective masses
- '''
- elements_and_masses = _process_data(resource_filename(__name__, "static_resources/elements_and_masses.tab"))
- global elements_and_masses_map
- elements_and_masses_map = _get_map_for_elements_and_masses(elements_and_masses)
-
-
-def main():
- '''
- Combine Output main function
-
- RankFilter, CasLookup are already combined by combine_output.py so here we will use
- this result. Furthermore here the MsClust spectra file (.MSP) and one of the MsClust
- quantification files are to be combined with combine_output.py result as well.
- '''
- rankfilter_and_caslookup_combined_file = sys.argv[1]
- msclust_quantification_and_spectra_file = sys.argv[2]
- output_csv = sys.argv[3]
- # metadata
- metadata = OrderedDict()
- metadata['organism'] = sys.argv[4]
- metadata['tissue'] = sys.argv[5]
- metadata['experiment_name'] = sys.argv[6]
- metadata['user_name'] = sys.argv[7]
- metadata['column_type'] = sys.argv[8]
-
- # Read RankFilter and CasLookup output files
- rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file)
- msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, ',')
-
- # Read elements and masses to use for the MW/MM calculation :
- init_elements_and_masses_map()
-
- merged = _merge_data(rankfilter_and_caslookup_combined, 'Centrotype',
- msclust_quantification_and_spectra, 'centrotype',
- _compare_records, _merge_records, metadata,
- N_TO_ONE)
- headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys() + metadata.keys() + ['MM','MW', 'Level of identification', 'Location of reference standard']
- _save_data(merged, headers, output_csv)
-
-
-if __name__ == '__main__':
- main()
diff -r 41f122255d14 -r 4393f982d18f export_to_metexp_tabular.xml
--- a/export_to_metexp_tabular.xml Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,75 +0,0 @@
-
- Create tabular file for loading into METabolomics EXPlorer database
-
- export_to_metexp_tabular.py
- $rankfilter_and_caslookup_combi
- $msclust_quant_file
- $output_result
- "$organism"
- "$tissue"
- "$experiment_name"
- "$user_name"
- "$column_type"
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.. class:: infomark
-
-Tool to combine output from the tools RankFilter, CasLookup and MsClust
-into a tabular file that can be uploaded to the METabolomics EXPlorer (MetExp) database.
-
-RankFilter, CasLookup are already combined by 'RIQC-Combine RankFilter and CasLookup' tool so here we will use
-this result.
-
-**Notes**
-
-Extra calculations performed:
-- The columns MM and MW are also added here and are derived from the column FORMULA found in RankFilter, CasLookup combined result.
-
-So in total here we merge 2 files and calculate one new column.
-
-
-
-
diff -r 41f122255d14 -r 4393f982d18f library_lookup.py
--- a/library_lookup.py Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,327 +0,0 @@
-'''
-Logic for searching a Retention Index database file given output from NIST
-'''
-import match_library
-import re
-import sys
-import csv
-
-__author__ = "Marcel Kempenaar"
-__contact__ = "brs@nbic.nl"
-__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
-__license__ = "MIT"
-
-def create_lookup_table(library_file, column_type_name, statphase):
- '''
- Creates a dictionary holding the contents of the library to be searched
- @param library_file: library to read
- @param column_type_name: the columns type name
- @param statphase: the columns stationary phase
- '''
- (data, header) = match_library.read_library(library_file)
- # Test for presence of required columns
- if ('columntype' not in header or
- 'columnphasetype' not in header or
- 'cas' not in header):
- raise IOError('Missing columns in ', library_file)
-
- column_type_column = header.index("columntype")
- statphase_column = header.index("columnphasetype")
- cas_column = header.index("cas")
-
- filtered_library = [line for line in data if line[column_type_column] == column_type_name
- and line[statphase_column] == statphase]
- lookup_dict = {}
- for element in filtered_library:
- # Here the cas_number is set to the numeric part of the cas_column value, so if the
- # cas_column value is 'C1433' then cas_number will be '1433'
- cas_number = str(re.findall(r'\d+', (element[cas_column]).strip())[0])
- try:
- lookup_dict[cas_number].append(element)
- except KeyError:
- lookup_dict[cas_number] = [element]
- return lookup_dict
-
-
-def _preferred(hits, pref, ctype, polar, model, method):
- '''
- Returns all entries in the lookup_dict that have the same column name, type and polarity
- as given by the user, uses regression if selected given the model and method to use. The
- regression is applied on the column with the best R-squared value in the model
- @param hits: all entries in the lookup_dict for the given CAS number
- @param pref: preferred GC-column, can be one or more names
- @param ctype: column type (capillary etc.)
- @param polar: polarity (polar / non-polar etc.)
- @param model: data loaded from file containing regression models
- @param method: supported regression method (i.e. poly(nomial) or linear)
- '''
- match = []
- for column in pref:
- for hit in hits:
- if hit[4] == ctype and hit[5] == polar and hit[6] == column:
- # Create copy of found hit since it will be altered downstream
- match.extend(hit)
- return match, False
-
- # No hit found for current CAS number, return if not performing regression
- if not model:
- return False, False
-
- # Perform regression
- for column in pref:
- if column not in model:
- break
- # Order regression candidates by R-squared value (last element)
- order = sorted(model[column].items(), key=lambda col: col[1][-1])
- # Create list of regression candidate column names
- regress_columns = list(reversed([column for (column, _) in order]))
- # Names of available columns
- available = [hit[6] for hit in hits]
-
- # TODO: combine Rsquared and number of datapoints to get the best regression match
- '''
- # Iterate regression columns (in order) and retrieve their models
- models = {}
- for col in regress_columns:
- if col in available:
- hit = list(hits[available.index(col)])
- if hit[4] == ctype:
- # models contains all model data including residuals [-2] and rsquared [-1]
- models[pref[0]] = model[pref[0]][hit[6]]
- # Get the combined maximum for residuals and rsquared
- best_match = models[]
- # Apply regression
- if method == 'poly':
- regressed = _apply_poly_regression(best_match, hit[6], float(hit[3]), model)
- if regressed:
- hit[3] = regressed
- else:
- return False, False
- else:
- hit[3] = _apply_linear_regression(best_match, hit[6], float(hit[3]), model)
- match.extend(hit)
- return match, hit[6]
- '''
-
- for col in regress_columns:
- if col in available:
- hit = list(hits[available.index(col)])
- if hit[4] == ctype:
- # Perform regression using a column for which regression is possible
- if method == 'poly':
- # Polynomial is only possible within a set border, if the RI falls outside
- # of this border, skip this lookup
- regressed = _apply_poly_regression(pref[0], hit[6], float(hit[3]), model)
- if regressed:
- hit[3] = regressed
- else:
- return False, False
- else:
- hit[3] = _apply_linear_regression(pref[0], hit[6], float(hit[3]), model)
- match.extend(hit)
- return match, hit[6]
-
- return False, False
-
-
-
-def default_hit(row, cas_nr, compound_id):
- '''
- This method will return a "default"/empty hit for cases where the
- method _preferred() returns False (i.e. an RI could not be found
- for the given cas nr, not even via regression).
- '''
- return [
- #'CAS',
- 'C' + cas_nr,
- #'NAME',
- '',
- #'FORMULA',
- '',
- #'RI',
- '0.0',
- #'Column.type',
- '',
- #'Column.phase.type',
- '',
- #'Column.name',
- '',
- #'phase.coding',
- ' ',
- #'CAS_column.Name',
- '',
- #'Centrotype', -> NOTE THAT compound_id is not ALWAYS centrotype...depends on MsClust algorithm used...for now only one MsClust algorithm is used so it is not an issue, but this should be updated/corrected once that changes
- compound_id,
- #'Regression.Column.Name',
- '',
- #'min',
- '',
- #'max',
- '',
- #'nr.duplicates',
- '']
-
-
-def format_result(lookup_dict, nist_tabular_filename, pref, ctype, polar, model, method):
- '''
- Looks up the compounds in the library lookup table and formats the results
- @param lookup_dict: dictionary containing the library to be searched
- @param nist_tabular_filename: NIST output file to be matched
- @param pref: (list of) column-name(s) to look for
- @param ctype: column type of interest
- @param polar: polarity of the used column
- @param model: data loaded from file containing regression models
- @param method: supported regression method (i.e. poly(nomial) or linear)
- '''
- (nist_tabular_list, header_clean) = match_library.read_library(nist_tabular_filename)
- # Retrieve indices of the CAS and compound_id columns (exit if not present)
- try:
- casi = header_clean.index("cas")
- idi = header_clean.index("id")
- except ValueError:
- raise IOError("'cas' or 'id' column not found in header of library file")
-
- data = []
- for row in nist_tabular_list:
- casf = str(row[casi].replace('-', '').strip())
- compound_id = str(row[idi].split('-')[0])
- if casf in lookup_dict:
- found_hit, regress = _preferred(lookup_dict[casf], pref, ctype, polar, model, method)
- if found_hit:
- # Keep cas nr as 'C'+ numeric part:
- found_hit[0] = 'C' + casf
- # Add compound id
- found_hit.insert(9, compound_id)
- # Add information on regression process
- found_hit.insert(10, regress if regress else 'None')
- # Replace column index references with actual number of duplicates
- dups = len(found_hit[-1].split(','))
- if dups > 1:
- found_hit[-1] = str(dups + 1)
- else:
- found_hit[-1] = '0'
- data.append(found_hit)
- found_hit = ''
- else:
- data.append(default_hit(row, casf, compound_id))
- else:
- data.append(default_hit(row, casf, compound_id))
-
- casf = ''
- compound_id = ''
- found_hit = []
- dups = []
- return data
-
-
-def _save_data(content, outfile):
- '''
- Write to output file
- @param content: content to write
- @param outfile: file to write to
- '''
- # header
- header = ['CAS',
- 'NAME',
- 'FORMULA',
- 'RI',
- 'Column.type',
- 'Column.phase.type',
- 'Column.name',
- 'phase.coding',
- 'CAS_column.Name',
- 'Centrotype',
- 'Regression.Column.Name',
- 'min',
- 'max',
- 'nr.duplicates']
- output_handle = csv.writer(open(outfile, 'wb'), delimiter="\t")
- output_handle.writerow(header)
- for entry in content:
- output_handle.writerow(entry)
-
-
-def _read_model(model_file):
- '''
- Creates an easy to search dictionary for getting the regression parameters
- for each valid combination of GC-columns
- @param model_file: filename containing the regression models
- '''
- regress = list(csv.reader(open(model_file, 'rU'), delimiter='\t'))
- if len(regress.pop(0)) > 9:
- method = 'poly'
- else:
- method = 'linear'
-
- model = {}
- # Create new dictionary for each GC-column
- for line in regress:
- model[line[0]] = {}
-
- # Add data
- for line in regress:
- if method == 'poly':
- model[line[0]][line[1]] = [float(col) for col in line[2:11]]
- else: # linear
- model[line[0]][line[1]] = [float(col) for col in line[2:9]]
-
- return model, method
-
-
-def _apply_poly_regression(column1, column2, retention_index, model):
- '''
- Calculates a new retention index (RI) value using a given 3rd-degree polynomial
- model based on data from GC columns 1 and 2
- @param column1: name of the selected GC-column
- @param column2: name of the GC-column to use for regression
- @param retention_index: RI to convert
- @param model: dictionary containing model information for all GC-columns
- '''
- coeff = model[column1][column2]
- # If the retention index to convert is within range of the data the model is based on, perform regression
- if coeff[4] < retention_index < coeff[5]:
- return (coeff[3] * (retention_index ** 3) + coeff[2] * (retention_index ** 2) +
- (retention_index * coeff[1]) + coeff[0])
- else:
- return False
-
-
-def _apply_linear_regression(column1, column2, retention_index, model):
- '''
- Calculates a new retention index (RI) value using a given linear model based on data
- from GC columns 1 and 2
- @param column1: name of the selected GC-column
- @param column2: name of the GC-column to use for regression
- @param retention_index: RI to convert
- @param model: dictionary containing model information for all GC-columns
- '''
- # TODO: No use of limits
- coeff = model[column1][column2]
- return coeff[1] * retention_index + coeff[0]
-
-
-def main():
- '''
- Library Lookup main function
- '''
- library_file = sys.argv[1]
- nist_tabular_filename = sys.argv[2]
- ctype = sys.argv[3]
- polar = sys.argv[4]
- outfile = sys.argv[5]
- pref = sys.argv[6:-1]
- regress = sys.argv[-1]
-
- if regress != 'False':
- model, method = _read_model(regress)
- else:
- model, method = False, None
-
- lookup_dict = create_lookup_table(library_file, ctype, polar)
- data = format_result(lookup_dict, nist_tabular_filename, pref, ctype, polar, model, method)
-
- _save_data(data, outfile)
-
-
-if __name__ == "__main__":
- main()
diff -r 41f122255d14 -r 4393f982d18f library_lookup.xml
--- a/library_lookup.xml Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,75 +0,0 @@
-
- Look up or estimate the RI using a library of known RI values indexed by CAS number
-
- library_lookup.py
- $library_file
- $input
- "$col_type"
- "$polarity"
- $output
- #for $ctype in $pref
- ${ctype.columntype}
- #end for
- $regression.model
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-Performs a lookup of the RI values by matching CAS numbers from the given NIST identifications file to a library.
-If a direct match is NOT found for the preferred column name, a regression can be done to find
-the theoretical RI value based on known RI values for the CAS number on other column types (see step 4).
-If there is no match for the CAS number on any column type, then the record is not given an RI.
-
-
-
-
-
-
diff -r 41f122255d14 -r 4393f982d18f match_library.py
--- a/match_library.py Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,133 +0,0 @@
-'''
-Contains functions that are called from Galaxy to populate lists/checkboxes with selectable items
-'''
-import csv
-import glob
-import os
-
-
-__author__ = "Marcel Kempenaar"
-__contact__ = "brs@nbic.nl"
-__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
-__license__ = "MIT"
-
-def get_column_type(library_file):
- '''
- Returns a Galaxy formatted list of tuples containing all possibilities for the
- GC-column types. Used by the library_lookup.xml tool
- @param library_file: given library file from which the list of GC-column types is extracted
- '''
- if library_file == "":
- galaxy_output = [("", "", False)]
- else:
- (data, header) = read_library(library_file)
-
- if 'columntype' not in header:
- raise IOError('Missing columns in ', library_file)
-
- # Filter data on column type
- column_type = header.index("columntype")
- amounts_in_list_dict = count_occurrence([row[column_type] for row in data])
- galaxy_output = [(str(a) + "(" + str(b) + ")", a, False) for a, b in amounts_in_list_dict.items()]
-
- return(galaxy_output)
-
-
-def filter_column(library_file, column_type_name):
- '''
- Filters the Retention Index database on column type
- @param library_file: file containing the database
- @param column_type_name: column type to filter on
- '''
- if library_file == "":
- galaxy_output = [("", "", False)]
- else:
- (data, header) = read_library(library_file)
-
- if ('columntype' not in header or
- 'columnphasetype' not in header):
- raise IOError('Missing columns in ', library_file)
-
- column_type = header.index("columntype")
- statphase = header.index("columnphasetype")
-
- # Filter data on column type name
- statphase_list = [line[statphase] for line in data if line[column_type] == column_type_name]
- amounts_in_list_dict = count_occurrence(statphase_list)
- galaxy_output = [(str(a) + "(" + str(b) + ")", a, False) for a, b in amounts_in_list_dict.items()]
-
- return(sorted(galaxy_output))
-
-
-def filter_column2(library_file, column_type_name, statphase):
- '''
- Filters the Retention Index database on column type
- @param library_file: file containing the database
- @param column_type_name: column type to filter on
- @param statphase: stationary phase of the column to filter on
- '''
- if library_file == "":
- galaxy_output = [("", "", False)]
- else:
- (data, header) = read_library(library_file)
-
- if ('columntype' not in header or
- 'columnphasetype' not in header or
- 'columnname' not in header):
- raise IOError('Missing columns in ', library_file)
-
- column_type_column = header.index("columntype")
- statphase_column = header.index("columnphasetype")
- column_name_column = header.index("columnname")
-
- # Filter data on given column type name and stationary phase
- statphase_list = [line[column_name_column] for line in data if line[column_type_column] == column_type_name and
- line[statphase_column] == statphase]
- amounts_in_list_dict = count_occurrence(statphase_list)
- galaxy_output = [(str(a) + "(" + str(b) + ")", a, False) for a, b in amounts_in_list_dict.items()]
-
- return(sorted(galaxy_output))
-
-
-def read_library(filename):
- '''
- Reads a CSV file and returns its contents and a normalized header
- @param filename: file to read
- '''
- data = list(csv.reader(open(filename, 'rU'), delimiter='\t'))
- header_clean = [i.lower().strip().replace(".", "").replace("%", "") for i in data.pop(0)]
- return(data, header_clean)
-
-
-
-def get_directory_files(dir_name):
- '''
- Reads the given directory and returns the files found as a list of
- (file name, full path, selected) tuples, so that it can fill a Galaxy
- drop-down combo box.
- '''
- files = glob.glob(dir_name + "/*.*")
- if len(files) == 0:
- # Configuration error: no library files were found in dir_name:
- galaxy_output = [("Configuration error: expected file not found in /" + dir_name, "", False)]
- else:
- galaxy_output = [(str(get_file_name_no_ext(file_name)), str(os.path.abspath(file_name)), False) for file_name in files]
- return(galaxy_output)
-
-def get_file_name_no_ext(full_name):
- '''
- returns just the last part of the name
- '''
- simple_name = os.path.basename(full_name)
- base, ext = os.path.splitext(simple_name)
- return base
-
-
-def count_occurrence(data_list):
- '''
- Counts occurrences in a list and returns a dict with item:occurrence
- @param data_list: list to count items from
- '''
- return dict((key, data_list.count(key)) for key in set(data_list))
diff -r 41f122255d14 -r 4393f982d18f metaMS/INFO_structure_of_LC.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metaMS/INFO_structure_of_LC.txt Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,532 @@
+str(LC)
+List of 4
+ $ PeakTable :'data.frame': 1828 obs. of 12 variables:
+ ..$ ChemSpiderID : chr [1:1828] "" "" "" "" ...
+ ..$ compound : chr [1:1828] "" "" "" "" ...
+ ..$ pcgroup : num [1:1828] 187 226 226 226 226 226 226 306 154 154 ...
+ ..$ adduct : chr [1:1828] "" "" "" "" ...
+ ..$ isotopes : chr [1:1828] "" "" "" "" ...
+ ..$ mz : num [1:1828] 253 233 214 215 298 ...
+ ..$ rt : num [1:1828] 1.27 1.57 1.59 1.59 1.59 ...
+ ..$ CLASS2 : num [1:1828] 2 2 2 2 2 2 2 2 2 2 ...
+ ..$ STDmix_GC_01 : num [1:1828] 0 0 0 0 0 0 0 0 0 0 ...
+ ..$ STDmix_GC_02 : num [1:1828] 0 0 0 0 0 0 0 0 0 0 ...
+ ..$ STDmix_RP_pos01: num [1:1828] 144.5 32.6 58.5 38.4 152.3 ...
+ ..$ STDmix_RP_pos02: num [1:1828] 343.4 54.5 79.1 64.9 210.8 ...
+ $ xset :Formal class 'xsAnnotate' [package "CAMERA"] with 15 slots
+ .. ..@ groupInfo : num [1:1828, 1:13] 30.2 30.5 31.4 32.4 33.4 ...
+ .. .. ..- attr(*, "dimnames")=List of 2
+ .. .. .. ..$ : NULL
+ .. .. .. ..$ : chr [1:13] "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", "CLASS1", "CLASS2", "STDmix_GC_01", "STDmix_GC_02", "STDmix_RP_pos01", "STDmix_RP_pos02"
+
+ and groups() function:
+ "mzmed", "mzmin", "mzmax", "rtmed", "rtmin", "rtmax", "npeaks", "CLASS1", "CLASS2"
+
+ .. ..@ pspectra :List of 340
+ .. .. ..$ : num [1:12] 2 4 5 10 11 13 14 15 16 38 ...
+ .. .. ..$ : num [1:11] 3 12 45 76 99 ...
+ .. .. ..$ : num [1:15] 7 8 9 113 162 ...
+ .. .. ..$ : Named num 1
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : num [1:5] 6 150 473 478 802
+ .. .. ..$ : num [1:5] 18 28 40 64 85
+ .. .. ..$ : num [1:5] 89 528 533 596 872
+ .. .. ..$ : num [1:2] 57 197
+ .. .. ..$ : num [1:3] 174 708 841
+ .. .. ..$ : Named num 23
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : num [1:4] 21 34 496 1497
+ .. .. ..$ : num [1:3] 29 30 44
+ .. .. ..$ : num [1:5] 283 423 452 486 527
+ .. .. ..$ : num [1:6] 93 126 251 296 349 406
+ .. .. ..$ : num [1:3] 301 479 664
+ .. .. ..$ : num [1:5] 47 105 111 191 237
+ .. .. ..$ : num [1:5] 185 562 683 923 1548
+ .. .. ..$ : num [1:2] 469 809
+ .. .. ..$ : num [1:9] 24 146 548 550 1009 ...
+ .. .. ..$ : num [1:33] 65 72 134 139 196 200 235 299 355 361 ...
+ .. .. ..$ : num [1:2] 135 234
+ .. .. ..$ : num [1:4] 95 326 330 465
+ .. .. ..$ : num [1:2] 384 699
+ .. .. ..$ : Named num 388
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : Named num 249
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : num [1:4] 425 485 636 698
+ .. .. ..$ : Named num 482
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : num [1:2] 310 429
+ .. .. ..$ : num [1:97] 33 51 67 96 103 106 114 129 136 140 ...
+ .. .. ..$ : Named num 436
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : num [1:3] 583 790 1029
+ .. .. ..$ : Named num 593
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : num [1:2] 558 745
+ .. .. ..$ : Named num 543
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : num [1:44] 43 60 101 121 124 179 211 214 257 263 ...
+ .. .. ..$ : num [1:2] 243 941
+ .. .. ..$ : num [1:4] 657 996 1286 1344
+ .. .. ..$ : num [1:21] 145 195 199 217 230 268 271 279 665 743 ...
+ .. .. ..$ : Named num 836
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : Named num 694
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : Named num 578
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : Named num 612
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : Named num 684
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : num [1:18] 31 163 194 395 441 ...
+ .. .. ..$ : num [1:2] 79 282
+ .. .. ..$ : Named num 297
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : num [1:10] 167 169 281 284 750 ...
+ .. .. ..$ : num [1:30] 165 796 812 821 829 ...
+ .. .. ..$ : num [1:4] 112 566 733 1552
+ .. .. ..$ : num [1:33] 42 63 68 90 92 102 125 130 133 137 ...
+ .. .. ..$ : num [1:14] 183 188 435 960 1036 ...
+ .. .. ..$ : Named num 763
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : Named num 704
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : num [1:2] 693 797
+ .. .. ..$ : Named num 643
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : num [1:45] 62 100 115 128 132 178 327 366 430 492 ...
+ .. .. ..$ : Named num 838
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : Named num 653
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : num [1:9] 182 186 1035 1040 1045 ...
+ .. .. ..$ : Named num 250
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : num [1:2] 144 935
+ .. .. ..$ : num [1:3] 32 1259 1394
+ .. .. ..$ : Named num 77
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : num [1:27] 19 46 49 107 110 157 159 233 265 314 ...
+ .. .. ..$ : num [1:4] 253 410 777 991
+ .. .. ..$ : Named num 1031
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : Named num 978
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : num [1:22] 887 894 1023 1028 1037 ...
+ .. .. ..$ : num [1:5] 258 312 1669 1670 1694
+ .. .. ..$ : num [1:19] 319 730 735 888 895 ...
+ .. .. ..$ : num [1:22] 55 164 541 617 649 656 661 668 753 766 ...
+ .. .. ..$ : num [1:7] 513 545 770 773 1080 ...
+ .. .. ..$ : num [1:4] 70 295 634 1665
+ .. .. ..$ : num [1:3] 625 1447 1453
+ .. .. ..$ : Named num 287
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : num [1:2] 964 970
+ .. .. ..$ : num [1:153] 20 22 36 37 39 53 56 58 59 61 ...
+ .. .. ..$ : num [1:9] 317 323 420 886 1022 ...
+ .. .. ..$ : Named num 149
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : num [1:6] 487 549 609 623 728 ...
+ .. .. ..$ : num [1:6] 315 476 1265 1272 1634 ...
+ .. .. ..$ : num [1:5] 378 383 495 734 1063
+ .. .. ..$ : num [1:10] 645 818 823 914 919 ...
+ .. .. ..$ : num [1:7] 815 1388 1457 1539 1555 ...
+ .. .. ..$ : Named num 1019
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : num [1:6] 391 398 491 742 982 ...
+ .. .. ..$ : Named num 546
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : num [1:12] 794 804 811 820 828 ...
+ .. .. ..$ : num [1:24] 1089 1475 1481 1484 1491 ...
+ .. .. ..$ : num [1:2] 148 151
+ .. .. ..$ : num [1:33] 74 104 108 147 181 189 224 232 260 298 ...
+ .. .. ..$ : num [1:17] 293 332 394 447 449 552 619 622 680 805 ...
+ .. .. ..$ : Named num 309
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : Named num 207
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : Named num 584
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : num [1:2] 637 865
+ .. .. ..$ : Named num 143
+ .. .. .. ..- attr(*, "names")= chr ""
+ .. .. ..$ : num [1:18] 1737 1738 1741 1757 1758 ...
+ .. .. ..$ : num [1:29] 261 352 360 363 402 408 416 419 456 507 ...
+ .. .. .. [list output truncated]
+ .. ..@ psSamples : int [1:340] 1 1 1 2 2 2 2 1 2 1 ...
+ .. ..@ isotopes :List of 1828
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ :List of 4
+ .. .. .. ..$ y : num 1
+ .. .. .. ..$ iso : num 0
+ .. .. .. ..$ charge: Named num 1
+ .. .. .. .. ..- attr(*, "names")= chr "charge"
+ .. .. .. ..$ val : Named num 0
+ .. .. .. .. ..- attr(*, "names")= chr "intrinsic"
+ .. .. ..$ :List of 4
+ .. .. .. ..$ y : num 1
+ .. .. .. ..$ iso : Named num 1
+ .. .. .. .. ..- attr(*, "names")= chr "iso"
+ .. .. .. ..$ charge: Named num 1
+ .. .. .. .. ..- attr(*, "names")= chr "charge"
+ .. .. .. ..$ val : Named num 0
+ .. .. .. .. ..- attr(*, "names")= chr "intrinsic"
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ :List of 4
+ .. .. .. ..$ y : num 2
+ .. .. .. ..$ iso : num 0
+ .. .. .. ..$ charge: Named num 1
+ .. .. .. .. ..- attr(*, "names")= chr "charge"
+ .. .. .. ..$ val : Named num 0
+ .. .. .. .. ..- attr(*, "names")= chr "intrinsic"
+ .. .. ..$ : NULL
+ .. .. ..$ :List of 4
+ .. .. .. ..$ y : num 2
+ .. .. .. ..$ iso : Named num 1
+ .. .. .. .. ..- attr(*, "names")= chr "iso"
+ .. .. .. ..$ charge: Named num 1
+ .. .. .. .. ..- attr(*, "names")= chr "charge"
+ .. .. .. ..$ val : Named num 0
+ .. .. .. .. ..- attr(*, "names")= chr "intrinsic"
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. ..$ : NULL
+ .. .. .. [list output truncated]
+ .. ..@ derivativeIons : list()
+ .. ..@ formula : list()
+ .. ..@ sample : num NA
+ .. ..@ xcmsSet :Formal class 'xcmsSet' [package "xcms"] with 12 slots
+ .. .. .. ..@ peaks : num [1:9826, 1:13] 30.2 30.5 31.4 32.4 33.3 ...
+ .. .. .. .. ..- attr(*, "dimnames")=List of 2
+ .. .. .. .. .. ..$ : NULL
+ .. .. .. .. .. ..$ : chr [1:13] "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "into", "intf", "maxo", "maxf", "i", "sn", "sample" (found with dput)
+ From http://www.bioconductor.org/packages/release/bioc/manuals/xcms/man/xcms.pdf:
+
+ mz weighted (by intensity) mean of peak m/z across scans
+mzmin m/z of minimum step
+mzmax m/z of maximum step
+rt retention time of peak midpoint
+rtmin leading edge of peak retention time
+rtmax trailing edge of peak retention time
+into integrated area of original (raw) peak
+intf integrated area of filtered peak
+maxo maximum intensity of original (raw) peak
+maxf maximum intensity of filtered peak
+i rank of peak identified in merged EIC (<= max)
+sn signal to noise ratio of the peak
+
+ .. .. .. ..@ groups : num [1:1828, 1:9] 30.2 30.5 31.4 32.4 33.4 ...
+ .. .. .. .. ..- attr(*, "dimnames")=List of 2
+ .. .. .. .. .. ..$ : NULL
+ .. .. .. .. .. ..$ : chr [1:9] "mzmed" "mzmin" "mzmax" "rtmed" ...
+ .. .. .. ..@ groupidx :List of 1828
+ .. .. .. .. ..$ : int [1:4] 1 674 9671 9749
+ .. .. .. .. ..$ : int [1:4] 2 675 9672 9750
+ .. .. .. .. ..$ : int [1:4] 3 676 9673 9751
+ .. .. .. .. ..$ : int [1:4] 4 677 9674 9752
+ .. .. .. .. ..$ : int [1:4] 6 678 9675 9753
+ .. .. .. .. ..$ : int [1:4] 9 683 9676 9754
+ .. .. .. .. ..$ : int [1:4] 11 684 9677 9755
+ .. .. .. .. ..$ : int [1:4] 13 686 9678 9756
+ .. .. .. .. ..$ : int [1:4] 17 688 9679 9757
+ .. .. .. .. ..$ : int [1:4] 18 689 9680 9758
+ .. .. .. .. ..$ : int [1:4] 19 691 9681 9759
+ .. .. .. .. ..$ : int [1:4] 21 693 9682 9760
+ .. .. .. .. ..$ : int [1:4] 23 695 9683 9761
+ .. .. .. .. ..$ : int [1:4] 24 699 9684 9762
+ .. .. .. .. ..$ : int [1:4] 26 700 9685 9763
+ .. .. .. .. ..$ : int [1:4] 28 702 9686 9764
+ .. .. .. .. ..$ : int [1:4] 1353 3930 6183 7926
+ .. .. .. .. ..$ : int [1:4] 36 707 9687 9765
+ .. .. .. .. ..$ : int [1:4] 1354 3931 6184 7927
+ .. .. .. .. ..$ : int [1:4] 1355 3932 6185 7928
+ .. .. .. .. ..$ : int [1:4] 39 712 9688 9766
+ .. .. .. .. ..$ : int [1:4] 1356 3934 6186 7929
+ .. .. .. .. ..$ : int [1:4] 40 714 9689 9767
+ .. .. .. .. ..$ : int [1:4] 1358 3935 6187 7930
+ .. .. .. .. ..$ : int [1:4] 1359 3936 6188 7931
+ .. .. .. .. ..$ : int [1:4] 1361 3938 6189 7932
+ .. .. .. .. ..$ : int [1:4] 1360 3937 6190 7933
+ .. .. .. .. ..$ : int [1:4] 45 722 9690 9768
+ .. .. .. .. ..$ : int [1:4] 54 735 9691 9769
+ .. .. .. .. ..$ : int [1:4] 57 737 9692 9770
+ .. .. .. .. ..$ : int [1:4] 1365 3939 6191 7934
+ .. .. .. .. ..$ : int [1:4] 1364 3940 6192 7935
+ .. .. .. .. ..$ : int [1:4] 1369 3941 6193 7936
+ .. .. .. .. ..$ : int [1:4] 58 739 9693 9771
+ .. .. .. .. ..$ : int [1:4] 1370 3942 6194 7937
+ .. .. .. .. ..$ : int [1:4] 1371 3943 6195 7938
+ .. .. .. .. ..$ : int [1:4] 1374 3944 6196 7939
+ .. .. .. .. ..$ : int [1:4] 65 748 9694 9772
+ .. .. .. .. ..$ : int [1:4] 1375 3946 6197 7940
+ .. .. .. .. ..$ : int [1:4] 71 755 9695 9773
+ .. .. .. .. ..$ : int [1:5] 72 756 1376 1377 3947
+ .. .. .. .. ..$ : int [1:4] 1378 3948 6198 7941
+ .. .. .. .. ..$ : int [1:4] 1379 3950 6199 7942
+ .. .. .. .. ..$ : int [1:4] 74 758 9696 9774
+ .. .. .. .. ..$ : int [1:4] 1381 3951 6200 7943
+ .. .. .. .. ..$ : int [1:4] 1382 3952 6201 7944
+ .. .. .. .. ..$ : int [1:4] 78 761 9697 9775
+ .. .. .. .. ..$ : int [1:4] 1383 3955 6202 7945
+ .. .. .. .. ..$ : int [1:4] 1384 3956 6203 7946
+ .. .. .. .. ..$ : int [1:4] 1385 3957 6204 7947
+ .. .. .. .. ..$ : int [1:4] 1387 3958 6205 7948
+ .. .. .. .. ..$ : int [1:4] 1388 3959 6206 7949
+ .. .. .. .. ..$ : int [1:4] 1389 3960 6207 7950
+ .. .. .. .. ..$ : int [1:4] 1390 3962 6208 7951
+ .. .. .. .. ..$ : int [1:4] 1391 3963 6209 7952
+ .. .. .. .. ..$ : int [1:4] 1392 3965 6210 7953
+ .. .. .. .. ..$ : int [1:4] 90 776 9698 9776
+ .. .. .. .. ..$ : int [1:4] 1393 3966 6211 7954
+ .. .. .. .. ..$ : int [1:4] 1394 3967 6212 7955
+ .. .. .. .. ..$ : int [1:4] 1395 3968 6213 7956
+ .. .. .. .. ..$ : int [1:4] 1397 3970 6214 7957
+ .. .. .. .. ..$ : int [1:4] 1399 3972 6215 7958
+ .. .. .. .. ..$ : int [1:4] 1398 3971 6216 7959
+ .. .. .. .. ..$ : int [1:4] 101 788 9699 9777
+ .. .. .. .. ..$ : int [1:4] 100 790 9700 9778
+ .. .. .. .. ..$ : int [1:4] 1401 3973 6217 7960
+ .. .. .. .. ..$ : int [1:4] 1403 3975 6218 7961
+ .. .. .. .. ..$ : int [1:4] 1402 3974 6219 7962
+ .. .. .. .. ..$ : int [1:4] 1404 3976 6220 7963
+ .. .. .. .. ..$ : int [1:4] 1405 3977 6221 7964
+ .. .. .. .. ..$ : int [1:4] 1406 3978 6222 7965
+ .. .. .. .. ..$ : int [1:4] 1407 3979 6223 7966
+ .. .. .. .. ..$ : int [1:4] 1408 3980 6224 7967
+ .. .. .. .. ..$ : int [1:4] 1409 3981 6225 7968
+ .. .. .. .. ..$ : int [1:4] 1412 3982 6226 7969
+ .. .. .. .. ..$ : int [1:4] 1413 3983 6227 7970
+ .. .. .. .. ..$ : int [1:4] 1414 3984 6228 7971
+ .. .. .. .. ..$ : int [1:4] 1415 3985 6229 7972
+ .. .. .. .. ..$ : int [1:4] 1417 3987 6230 7973
+ .. .. .. .. ..$ : int [1:4] 1416 3988 6231 7974
+ .. .. .. .. ..$ : int [1:4] 1419 3989 6232 7975
+ .. .. .. .. ..$ : int [1:4] 1421 3991 6233 7976
+ .. .. .. .. ..$ : int [1:4] 1420 3990 6234 7977
+ .. .. .. .. ..$ : int [1:4] 1422 3992 6235 7978
+ .. .. .. .. ..$ : int [1:4] 115 802 9701 9779
+ .. .. .. .. ..$ : int [1:4] 1423 3993 6236 7979
+ .. .. .. .. ..$ : int [1:4] 1425 3995 6237 7980
+ .. .. .. .. ..$ : int [1:4] 1426 3996 6238 7981
+ .. .. .. .. ..$ : int [1:4] 120 807 9702 9780
+ .. .. .. .. ..$ : int [1:4] 1427 3997 6239 7982
+ .. .. .. .. ..$ : int [1:4] 1430 3998 6240 7983
+ .. .. .. .. ..$ : int [1:4] 1431 3999 6241 7984
+ .. .. .. .. ..$ : int [1:4] 123 812 9703 9781
+ .. .. .. .. ..$ : int [1:4] 1432 4000 6242 7985
+ .. .. .. .. ..$ : int [1:4] 1433 4001 6243 7986
+ .. .. .. .. ..$ : int [1:4] 1434 4002 6244 7987
+ .. .. .. .. ..$ : int [1:4] 1436 4005 6245 7988
+ .. .. .. .. ..$ : int [1:4] 1435 4004 6246 7989
+ .. .. .. .. ..$ : int [1:4] 1437 4006 6247 7990
+ .. .. .. .. .. [list output truncated]
+ .. .. .. ..@ filled : int [1:3644] 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 ...
+ .. .. .. ..@ phenoData :'data.frame': 4 obs. of 1 variable:
+ .. .. .. .. ..$ class: Factor w/ 2 levels "CLASS1","CLASS2": 1 1 2 2
+ .. .. .. ..@ rt :List of 2
+ .. .. .. .. ..$ raw :List of 4
+ .. .. .. .. .. ..$ : num [1:10136] 194 194 194 194 195 ...
+ .. .. .. .. .. ..$ : num [1:10138] 194 194 194 194 195 ...
+ .. .. .. .. .. ..$ : num [1:2614] 3.46 4.75 6.04 7.32 8.61 ...
+ .. .. .. .. .. ..$ : num [1:2613] 3.47 4.77 6.06 7.37 8.66 ...
+ .. .. .. .. ..$ corrected:List of 4
+ .. .. .. .. .. ..$ : num [1:10136] 168 169 169 169 169 ...
+ .. .. .. .. .. ..$ : num [1:10138] 157 158 158 158 158 ...
+ .. .. .. .. .. ..$ : num [1:2614] 40.5 41.8 43.1 44.4 45.7 ...
+ .. .. .. .. .. ..$ : num [1:2613] 40.7 42 43.3 44.6 45.9 ...
+ .. .. .. ..@ filepaths : chr [1:4] "E:/workspace/PRIMS-metabolomics/python-tools/tools/metaMS/test/zipOut/CLASS1/STDmix_GC_01.CDF" "E:/workspace/PRIMS-metabolomics/python-tools/tools/metaMS/test/zipOut/CLASS1/STDmix_GC_02.CDF" "E:/workspace/PRIMS-metabolomics/python-tools/tools/metaMS/test/zipOut/CLASS2/STDmix_RP_pos01.CDF" "E:/workspace/PRIMS-metabolomics/python-tools/tools/metaMS/test/zipOut/CLASS2/STDmix_RP_pos02.CDF"
+ .. .. .. ..@ profinfo :List of 2
+ .. .. .. .. ..$ method: chr "bin"
+ .. .. .. .. ..$ step : num 0.05
+ .. .. .. ..@ dataCorrection : int(0)
+ .. .. .. ..@ polarity : chr(0)
+ .. .. .. ..@ progressInfo :List of 12
+ .. .. .. .. ..$ group.density : num 0
+ .. .. .. .. ..$ group.mzClust : num 0
+ .. .. .. .. ..$ group.nearest : num 0
+ .. .. .. .. ..$ findPeaks.centWave : num 0
+ .. .. .. .. ..$ findPeaks.massifquant : num 0
+ .. .. .. .. ..$ findPeaks.matchedFilter: num 0
+ .. .. .. .. ..$ findPeaks.MS1 : num 0
+ .. .. .. .. ..$ findPeaks.MSW : num 0
+ .. .. .. .. ..$ retcor.obiwarp : num 0
+ .. .. .. .. ..$ retcor.peakgroups : num 0
+ .. .. .. .. ..$ fillPeaks.chrom : num 0
+ .. .. .. .. ..$ fillPeaks.MSW : num 0
+ .. .. .. ..@ progressCallback:function (progress)
+ .. ..@ ruleset : NULL
+ .. ..@ annoID : logi[0 , 1:4]
+ .. .. ..- attr(*, "dimnames")=List of 2
+ .. .. .. ..$ : NULL
+ .. .. .. ..$ : chr [1:4] "id" "grpID" "ruleID" "parentID"
+ .. ..@ annoGrp : logi[0 , 1:4]
+ .. .. ..- attr(*, "dimnames")=List of 2
+ .. .. .. ..$ : NULL
+ .. .. .. ..$ : chr [1:4] "id" "mass" "ips" "psgrp"
+ .. ..@ isoID : num [1:315, 1:4] 14 90 107 121 134 137 148 152 157 170 ...
+ .. .. ..- attr(*, "dimnames")=List of 2
+ .. .. .. ..$ : NULL
+ .. .. .. ..$ : chr [1:4] "mpeak" "isopeak" "iso" "charge"
+ .. ..@ polarity : chr ""
+ .. ..@ runParallel :List of 1
+ .. .. ..$ enable: num 0
+ .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots
+ .. .. .. ..@ .Data:List of 1
+ .. .. .. .. ..$ : int [1:3] 1 13 333
+ $ Annotation:List of 5
+ ..$ annotation.table :'data.frame': 27 obs. of 13 variables:
+ .. ..$ feature : int [1:27] 190 193 195 206 208 209 212 367 368 371 ...
+ .. ..$ db_position : int [1:27] 21 22 23 21 23 24 22 1 2 4 ...
+ .. ..$ ChemSpiderID: int [1:27] 10463792 10463792 10463792 10463792 10463792 10463792 10463792 8711 8711 8711 ...
+ .. ..$ mz : num [1:27] 137 348 696 137 696 ...
+ .. ..$ rt : num [1:27] 10.8 10.8 10.8 11.2 11.2 ...
+ .. ..$ I : num [1:27] 10660 3754.5 32.4 5127 58.2 ...
+ .. ..$ compound : Factor w/ 4 levels "D-(+)-Catechin",..: 4 4 4 4 4 4 4 1 1 1 ...
+ .. ..$ db_mz : num [1:27] 137 348 696 137 696 ...
+ .. ..$ db_rt : num [1:27] 10.8 10.8 10.8 10.8 10.8 ...
+ .. ..$ db_I : num [1:27] 33.4 4478.7 58.2 33.4 58.2 ...
+ .. ..$ db_ann : Factor w/ 1 level "automatic": 1 1 1 1 1 1 1 1 1 1 ...
+ .. ..$ mz.err : num [1:27] 0.005 0.005 0.005 0.005 0.005 0.005 0.005 0.005 0.005 0.005 ...
+ .. ..$ clid : int [1:27] 1 1 1 2 2 2 2 2 2 2 ...
+ ..$ compounds : chr [1:4] "adenosine 2'-monophosphate" "D-(+)-Catechin" "malvidin-3-glucoside" "rutin "
+ ..$ ChemSpiderIDs : int [1:4] 10463792 8711 391785 4444362
+ ..$ multiple.annotations: num(0)
+ ..$ ann.features : int [1:27] 190 193 195 206 208 209 212 367 368 371 ...
+ $ Settings :Formal class 'metaMSsettings' [package "metaMS"] with 30 slots
+ .. ..@ protocolName : chr "Synapt.QTOF.RP"
+ .. ..@ chrom : chr "LC"
+ .. ..@ PeakPicking :List of 5
+ .. .. ..$ method : chr "matchedFilter"
+ .. .. ..$ step : num 0.05
+ .. .. ..$ fwhm : num 20
+ .. .. ..$ snthresh: num 4
+ .. .. ..$ max : num 50
+ .. ..@ Alignment :List of 9
+ .. .. ..$ min.class.fraction: num 0.3
+ .. .. ..$ min.class.size : num 3
+ .. .. ..$ mzwid : num 0.1
+ .. .. ..$ bws : num [1:2] 30 10
+ .. .. ..$ missingratio : num 0.2
+ .. .. ..$ extraratio : num 0.1
+ .. .. ..$ retcormethod : chr "linear"
+ .. .. ..$ retcorfamily : chr "symmetric"
+ .. .. ..$ fillPeaks : logi TRUE
+ .. ..@ CAMERA :List of 3
+ .. .. ..$ perfwhm : num 0.6
+ .. .. ..$ cor_eic_th: num 0.7
+ .. .. ..$ ppm : num 5
+ .. ..@ match2DB.rtdiff : num 1.5
+ .. ..@ match2DB.minfeat : num 2
+ .. ..@ match2DB.rtval : num 0.1
+ .. ..@ match2DB.mzdiff : num 0.005
+ .. ..@ match2DB.ppm : num 5
+ .. ..@ match2DB.simthresh : num(0)
+ .. ..@ match2DB.timeComparison : chr(0)
+ .. ..@ match2DB.RIdiff : num(0)
+ .. ..@ DBconstruction.minfeat : num(0)
+ .. ..@ DBconstruction.rttol : num(0)
+ .. ..@ DBconstruction.mztol : num(0)
+ .. ..@ DBconstruction.minintens : num(0)
+ .. ..@ DBconstruction.intensityMeasure : chr(0)
+ .. ..@ DBconstruction.DBthreshold : num(0)
+ .. ..@ matchIrrelevants.irrelevantClasses: chr(0)
+ .. ..@ matchIrrelevants.simthresh : num(0)
+ .. ..@ matchIrrelevants.timeComparison : chr(0)
+ .. ..@ matchIrrelevants.RIdiff : num(0)
+ .. ..@ matchIrrelevants.rtdiff : num(0)
+ .. ..@ betweenSamples.min.class.fraction : num(0)
+ .. ..@ betweenSamples.min.class.size : num(0)
+ .. ..@ betweenSamples.timeComparison : chr(0)
+ .. ..@ betweenSamples.rtdiff : num(0)
+ .. ..@ betweenSamples.RIdiff : num(0)
+ .. ..@ betweenSamples.simthresh : num(0)
diff -r 41f122255d14 -r 4393f982d18f metaMS/README.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metaMS/README.txt Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,70 @@
+Wrappers for:
+- the metaMS R package by Ron Wehrens.
+- the xcms package.
+
+Wrappers written by Pieter Lukasse.
+
+
+Installation (when updating: first close all open vignettes [i.e. the PDF manuals]!)
+
+In R:
+source("http://bioconductor.org/biocLite.R")
+biocLite("metaMS")
+biocLite("multtest")
+#biocLite("R2HTML")
+# for "multi-threading" (actually starts multiple R processes for parallel processing):
+install.packages("snow")
+install.packages("Cairo")
+
+======Run metaMS_cmd_pick_and_group.r with:=================
+
+E:\workspace\PRIMS-metabolomics\python-tools\tools\metaMS>Rscript metaMS_cmd_pick_and_group.r test/extdata.zip test/example_settings.txt test/out/peakTable.txt test/out/xsAnnotatePrep.rdata positive test/out/html_peaks.html test/out
+
+
+======Run metaMS_cmd_annotate.r with:=================
+
+E:\workspace\PRIMS-metabolomics\python-tools\tools\metaMS>Rscript metaMS_cmd_annotate.r test/LCDBtest.RData test/out/xsAnnotatePrep.rdata test/example_settings.txt test/out/annotationTable.txt "0" test/out/html_annot.html test/out
+
+
+======Run xcms_differential_analysis.r with:=================
+
+E:\workspace\PRIMS-metabolomics\python-tools\tools\metaMS>Rscript xcms_differential_analysis.r test/out/xsAnnotatePrep.rdata "CLASS1" "CLASS2" 10 test/out2/outtable.tsv test/out2/html/html.html test/out2/html
+
+
+======Run xcms_get_alignment_eic.r / xcms_get_mass_eic.r with:=================
+
+
+E:\workspace\PRIMS-metabolomics\python-tools\tools\metaMS>Rscript xcms_get_alignment_eic.r test/out/xsAnnotatePrep.rdata 10 300 3 STDmix_GC_01,STDmix_GC_02 test/out3/html/html.html test/out3/html
+
+OR
+
+E:\workspace\PRIMS-metabolomics\python-tools\tools\metaMS>Rscript xcms_get_mass_eic.r test/out/xsAnnotatePrep.rdata 10 3000 -1 -1 "77.98,231.96" 5 STDmix_GC_01,STDmix_GC_02 Yes raw test/out4/html/html.html test/out4/html
+
+
+
+!!!!!!!!!!!!!!!!Troubleshooting:!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+
+NetCDF is required. If the following is found in the installation.log:
+
+In file included from rnetCDF.c:2:0:
+rnetCDF.h:1:20: fatal error: netcdf.h: No such file or directory
+compilation terminated.
+
+
+then metaMS has not been installed, and running the tool will fail with an error.
+
+
+Possible solution:
+> Install the -dev versions of those packages to get the headers that are
+  required to compile the package. In this case you need libnetcdf-dev, udunits-bin and libudunits2-dev
+  (from http://stackoverflow.com/questions/11319698/how-to-install-r-packages-rnetcdf-and-ncdf-on-ubuntu)
+So:
+>>sudo apt-get install libnetcdf-dev udunits-bin libudunits2-dev
+
+Cairo / "no X11" mode is required.
+
+Possible solution:
+> Install Cairo (http://www.cairographics.org/) and/or set CAIRO_CFLAGS/LIBS accordingly.
+So:
+>>sudo apt-get install libcairo2-dev
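+
+# Optional quick check in R (a sketch) that Cairo-based png devices work without X11:
+png("cairo_test.png", type="cairo")
+plot(1:10)
+dev.off()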
diff -r 41f122255d14 -r 4393f982d18f metaMS/__init__.py
diff -r 41f122255d14 -r 4393f982d18f metaMS/metaMS_cmd_annotate.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metaMS/metaMS_cmd_annotate.r Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,86 @@
+## read args:
+args <- commandArgs(TRUE)
+## the constructed DB, e.g. "E:/Rworkspace/metaMS/data/LCDBtest.RData"
+args.constructedDB <- args[1]
+## data file in xset format:
+args.xsetData <- args[2]
+## settings file, e.g. "E:/Rworkspace/metaMS/data/settings.r", should contain assignment to an object named "customMetaMSsettings"
+args.settings <- args[3]
+
+## output file names, e.g. "E:/Rworkspace/metaMS/data/out.txt"
+args.outAnnotationTable <- args[4]
+
+args.mass_error_function <- args[5]
+if (args.mass_error_function == "0")
+ args.mass_error_function <- NULL
+## report files
+args.htmlReportFile <- args[6]
+args.htmlReportFile.files_path <- args[7]
+
+if (length(args) == 8)
+{
+ args.outLogFile <- args[8]
+ # suppress messages:
+ # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888
+ msg <- file(args.outLogFile, open="wt")
+ sink(msg, type="message")
+ sink(msg, type="output")
+}
+
+cat("\nSettings used===============:\n")
+cat(readChar(args.settings, 1e5))
+
+
+tryCatch(
+ {
+ library(metaMS)
+
+ ## load the constructed DB :
+ tempEnv <- new.env()
+ testDB <- load(args.constructedDB, envir=tempEnv)
+ xsetData <- readRDS(args.xsetData)
+
+ ## load settings "script" into "customMetaMSsettings"
+ source(args.settings, local=tempEnv)
+ message(paste(" loaded : ", args.settings))
+
+        # NB: testDB[1] is the name of the (first) object loaded from the constructed DB file.
+        # Trigger runLC:
+ LC <- runLC(xset=xsetData, settings = tempEnv[["customMetaMSsettings"]], DB = tempEnv[[testDB[1]]]$DB, errf=args.mass_error_function, nSlaves=20, returnXset = TRUE)
+
+ # write out runLC annotation results:
+ write.table(LC$PeakTable, args.outAnnotationTable, sep="\t", row.names=FALSE)
+
+ # the used constructed DB (write to log):
+ cat("\nConstructed DB info===============:\n")
+ str(tempEnv[[testDB[1]]]$Info)
+ cat("\nConstructed DB table===============:\n")
+ if (length(args) == 8)
+ {
+ write.table(tempEnv[[testDB[1]]]$DB, args.outLogFile, append=TRUE, row.names=FALSE)
+ write.table(tempEnv[[testDB[1]]]$Reftable, args.outLogFile, sep="\t", append=TRUE, row.names=FALSE)
+ }
+
+ message("\nGenerating report.........")
+ # report
+ dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE)
+    html <- "<html><body>"
+ # TODO add (nr and mass error) and group size
+
+ message("\nPlotting figures... ")
+ figureName <- paste(args.htmlReportFile.files_path, "/figure_retcor.png", sep="")
+    html <- paste(html, "<img src='figure_retcor.png' />", sep="")
+ png( figureName, type="cairo", width=1100,height=600 )
+ retcor(LC$xset@xcmsSet, method="peakgroups", plottype = "mdevden")
+ html <- paste(html,"*NB: retention time correction plot based on 'peakgroups' option with default settings. This is not the plot matching the exact settings used in the run,
+ but just intended to give a rough estimate of the retention time shifts present in the data. A more accurate plot will be available once
+ this option is added in metaMS API. ", sep="")
+ devname = dev.off()
+
+
+ gt <- groups(LC$xset@xcmsSet)
+ groupidx1 <- which(gt[,"rtmed"] > 0 & gt[,"rtmed"] < 3000 & gt[,"npeaks"] > 3)
+
+    html <- paste(html, "</body></html>")
+ message("finished generating report")
+ write(html,file=args.htmlReportFile)
+ # unlink(args.htmlReportFile)
+ cat("\nWarnings================:\n")
+ str( warnings() )
+ },
+ error=function(cond) {
+ sink(NULL, type="message") # default setting
+ sink(stderr(), type="output")
+ message("\nERROR: ===========\n")
+ print(cond)
+ }
+ )
diff -r 41f122255d14 -r 4393f982d18f metaMS/metams_lcms_annotate.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metaMS/metams_lcms_annotate.xml Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,121 @@
+
+ Runs metaMS process for LC/MS feature annotation
+
+ R_bioc_metams
+
+
+ metaMS_cmd_annotate.r
+ $constructed_db
+ $xsetData
+ $customMetaMSsettings
+ $outputFile
+ #if $mzTol.mzTolType == "fixed"
+ 0
+ #else
+ "$mzTol.mass_error_function"
+ #end if
+ $htmlReportFile
+ $htmlReportFile.files_path
+ $outputLog
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+## start comment
+ ## metaMS process settings
+ customMetaMSsettings <- metaMSsettings(protocolName = "${protocolName}",
+ chrom = "LC")
+metaSetting(customMetaMSsettings, "match2DB") <- list(
+ rtdiff = ${rtdiff},
+ rtval = ${rtval},
+ #if $mzTol.mzTolType == "fixed"
+ mzdiff = ${mzTol.mzdiff},
+ #else
+ ppm = ${mzTol.ppm},
+ #end if
+ minfeat = ${minfeat})
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+Runs metaMS process for LC/MS feature annotation based on matching to an existing 'standards' DB.
+The figure below shows the main parts of this metaMS process.
+
+.. image:: $PATH_TO_IMAGES/metaMS_annotate.png
+
+
+.. class:: infomark
+
+The implemented annotation strategy can be broken down into the following steps:
+
+1. *Feature-wise Annotation:* Each feature detected by runLC is matched against the database. If
+the mass error function is provided, the appropriate m/z tolerance is calculated; otherwise a fixed
+tolerance is used (mzdiff). The retention time tolerance is fixed and should be selected on the
+basis of the characteristics of each chromatographic method (rtdiff). Multiple annotations - i.e.
+features which are associated with more than one compound - are possible. This outcome does not
+indicate a problem per se, but is an inherent drawback of co-elution.
+
+2. *Annotation Validation:* The annotated features are organized in 'pseudospectra' collecting all
+the experimental features which are assigned to a specific compound. A specific annotation is
+confirmed only if more than minfeat features which differ in retention time by less than rtval are
+present in a pseudospectrum. As a general rule rtval should be narrower than rtdiff. The
+latter, indeed, accounts for shifts in retention time between the injection of the standards and the
+metabolomics experiment under investigation. This shift can be rather large, considering that the
+standards are not commonly re-analyzed each time. On the other hand, rtval represents the shift
+between the ions of the same compound within the same batch of injections, and therefore it only
+has to account for the smaller shifts occurring during peak picking and alignment.
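+
+For illustration, a minimal settings object could look like the sketch below (the values shown
+are the ones used for the test data in this changeset, not recommendations):
+
+customMetaMSsettings <- metaMSsettings(protocolName = "Synapt.QTOF.RP", chrom = "LC")
+metaSetting(customMetaMSsettings, "match2DB") <- list(
+    rtdiff  = 1.5,    # allowed RT shift between the standards run and the current experiment
+    rtval   = 0.1,    # allowed RT spread within one pseudospectrum (narrower than rtdiff)
+    mzdiff  = 0.005,  # fixed m/z tolerance, used when no mass error function is given
+    minfeat = 2)      # minimum number of matching features to confirm an annotation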
+
+
+
+
+ 10.1016/j.jchromb.2014.02.051
+
+
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f metaMS/metams_lcms_pick_and_group.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metaMS/metams_lcms_pick_and_group.xml Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,301 @@
+
+ Runs metaMS process for LC/MS feature picking, aligning and grouping
+
+ R_bioc_metams
+
+
+ metaMS_cmd_pick_and_group.r
+ $data_files
+ $customMetaMSsettings
+ $outputFile
+ $xsetOut
+ $polarity
+ $htmlReportFile
+ $htmlReportFile.files_path
+ $outputLog
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+## ====================================
+ ## metaMS process settings
+ customMetaMSsettings <- metaMSsettings(protocolName = "${protocolName}",
+ chrom = "LC",
+ PeakPicking = list(
+ method = "${peakPicking.method}",
+ #if $peakPicking.method == "matchedFilter"
+ fwhm = ${peakPicking.fwhm},
+ sigma = ${peakPicking.fwhm}/${peakPicking.sigma_denom},
+ max = ${peakPicking.max},
+ snthresh = ${peakPicking.snthresh},
+ step = ${peakPicking.step},
+ steps = ${peakPicking.steps},
+ mzdiff = ${peakPicking.mzdiff}),
+ #else
+ ppm = ${peakPicking.ppm},
+ peakwidth = c(${peakPicking.peakwidth}),
+ snthresh = ${peakPicking.snthresh},
+ prefilter = c(${peakPicking.prefilter}),
+ mzCenterFun = "${peakPicking.mzCenterFun}",
+ integrate = ${peakPicking.integrate},
+ mzdiff = ${peakPicking.mzdiff},
+ fitgauss = ${peakPicking.fitgauss},
+ noise = ${peakPicking.noise}),
+ #end if
+ Alignment = list(
+ min.class.fraction = ${min_class_fraction},
+ min.class.size = ${min_class_size},
+ mzwid = ${mzwid},
+ bws = c(${bws}),
+ retcormethod = "${retcor.retcormethod}",
+ #if $retcor.retcormethod == "peakgroups"
+ smooth = "${retcor.smooth}",
+ missingratio = ${retcor.missingratio},
+ extraratio = ${retcor.extraratio},
+ retcorfamily = "${retcor.retcorfamily}",
+ #else
+ ##repeating the method as workaround/ backwards compatibility (can remove this one after fix from metaMS):
+ method = "${retcor.retcormethod}",
+ profStep = ${retcor.profStep},
+ #end if
+ fillPeaks = ${fillPeaks}),
+ CAMERA = list(
+ perfwhm = ${perfwhm},
+ cor_eic_th = ${cor_eic_th},
+ ppm= ${ppm},
+ graphMethod= "${groupCorr_graphMethod}",
+ pval= ${groupCorr_pval},
+ calcCiS= ${groupCorr_calcCiS},
+ calcIso= ${groupCorr_calcIso},
+ calcCaS= ${groupCorr_calcCaS},
+ maxcharge= ${findIsotopes_maxcharge},
+ maxiso= ${findIsotopes_maxiso},
+ minfrac= ${findIsotopes_minfrac},
+ multiplier= ${findAdducts_multiplier}
+ ))
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+Runs metaMS process for LC/MS feature picking, aligning and grouping.
+This part of the metaMS process makes use of the XCMS and CAMERA tools and algorithms.
+CAMERA is used for automatic deconvolution/annotation of LC/ESI-MS data.
+The figure below shows the main parts of the metaMS process wrapped by this tool.
+
+.. image:: $PATH_TO_IMAGES/metaMS_pick_align_camera.png
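+
+As an illustration, the test data in this changeset was produced with settings equivalent to the
+following sketch (values copied from the logged Settings object):
+
+customMetaMSsettings <- metaMSsettings(protocolName = "Synapt.QTOF.RP", chrom = "LC",
+    PeakPicking = list(method = "matchedFilter", step = 0.05, fwhm = 20, snthresh = 4, max = 50),
+    Alignment   = list(min.class.fraction = 0.3, min.class.size = 3, mzwid = 0.1, bws = c(30, 10),
+                       missingratio = 0.2, extraratio = 0.1, retcormethod = "linear",
+                       retcorfamily = "symmetric", fillPeaks = TRUE),
+    CAMERA      = list(perfwhm = 0.6, cor_eic_th = 0.7, ppm = 5))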
+
+
+From CAMERA documentation:
+
+.. image:: $PATH_TO_IMAGES/CAMERA_results.png
+
+**References**
+
+If you use this Galaxy tool in work leading to a scientific publication please
+cite the following papers:
+
+Wehrens, R.; Weingart, G.; Mattivi, F. (2014).
+metaMS: an open-source pipeline for GC-MS-based untargeted metabolomics.
+Journal of Chromatography B, 966: 109-116.
+doi: 10.1016/j.jchromb.2014.02.051
+handle: http://hdl.handle.net/10449/24012
+
+Wrapper by Pieter Lukasse.
+
+
+
+
+ 10.1016/j.jchromb.2014.02.051
+
+
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f metaMS/next.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metaMS/next.r Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,9 @@
+# next script, useful for plotting the CAMERA groups pseudo-spectra:
+plotPsSpectrum(LC$xset, 320, maxlabel=10)
+# 320 is the group id
+plotEICs(LC$xset, 35, maxlabel=5)
+
+
+# could use this for the items identified by the "annotate" step.
+# could also integrate this into the getEIC tool (add option to give a group list instead of an m/z list)...but then rt
+#   and other options are also not so relevant...maybe a separate tool is clearer
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f metaMS/tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metaMS/tool_dependencies.xml Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,13 @@
+
+
+
+
+
+
+
+ This dependency:
+ Ensures R 3.1.1 installation is triggered (via dependency).
+ Ensures Bioconductor 3.0 and package metaMS, multtest and snow are installed.
+
+
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f metaMS/xcms_differential_analysis.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metaMS/xcms_differential_analysis.r Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,85 @@
+## read args:
+args <- commandArgs(TRUE)
+#cat("args <- \"\"\n")
+## a xcms xset saved as .RData
+args.xsetData <- args[1]
+#cat(paste("args.xsetData <- \"", args[1], "\"\n", sep=""))
+
+args.class1 <- args[2]
+args.class2 <- args[3]
+#cat(paste("args.class1 <- \"", args[2], "\"\n", sep=""))
+#cat(paste("args.class2 <- \"", args[3], "\"\n", sep=""))
+
+args.topcount <- strtoi(args[4])
+#cat(paste("args.topcount <- ", args[4], "\n", sep=""))
+
+args.outTable <- args[5]
+
+## report files
+args.htmlReportFile <- args[6]
+args.htmlReportFile.files_path <- args[7]
+#cat(paste("args.htmlReportFile <- \"", args[6], "\"\n", sep=""))
+#cat(paste("args.htmlReportFile.files_path <- \"", args[7], "\"\n", sep=""))
+
+
+if (length(args) == 8)
+{
+ args.outLogFile <- args[8]
+ # suppress messages:
+ # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888
+ msg <- file(args.outLogFile, open="wt")
+ sink(msg, type="message")
+ sink(msg, type="output")
+}
+
+tryCatch(
+ {
+ library(metaMS)
+ library(xcms)
+ #library("R2HTML")
+
+ # load the xset data :
+ xsetData <- readRDS(args.xsetData)
+ # if here to support both scenarios:
+ if ("xcmsSet" %in% slotNames(xsetData) )
+ {
+ xsetData <- xsetData@xcmsSet
+ }
+
+
+ # info: levels(xcmsSet@phenoData$class) also gives access to the class names
+ dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE)
+ # set cairo as default for png plots:
+ png = function (...) grDevices::png(...,type='cairo')
+ # run diffreport
+ reporttab <- diffreport(xsetData, args.class1, args.class2, paste(args.htmlReportFile.files_path,"/fig", sep=""), args.topcount, metlin = 0.15, h=480, w=640)
+
+ # write out tsv table:
+ write.table(reporttab, args.outTable, sep="\t", row.names=FALSE)
+
+ message("\nGenerating report.........")
+
+    cat("<html><body><h1>Differential analysis report</h1>", file= args.htmlReportFile)
+ #HTML(reporttab[1:args.topcount,], file= args.htmlReportFile)
+ figuresPath <- paste(args.htmlReportFile.files_path, "/fig_eic", sep="")
+ message(figuresPath)
+ listOfFiles <- list.files(path = figuresPath)
+    for (i in seq_along(listOfFiles))
+ {
+ figureName <- listOfFiles[i]
+ # maybe we still need to copy the figures to the args.htmlReportFile.files_path
+        cat(paste("<img src='fig_eic/", figureName, "' />", sep=""), file= args.htmlReportFile, append=TRUE)
+        cat(paste("<br/>", sep=""), file= args.htmlReportFile, append=TRUE)
+ }
+
+ message("finished generating report")
+ cat("\nWarnings================:\n")
+ str( warnings() )
+ },
+ error=function(cond) {
+ sink(NULL, type="message") # default setting
+ sink(stderr(), type="output")
+ message("\nERROR: ===========\n")
+ print(cond)
+ }
+ )
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f metaMS/xcms_differential_analysis.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metaMS/xcms_differential_analysis.xml Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,55 @@
+
+    Runs xcms diffreport function for differential analysis
+
+ R_bioc_metams
+
+
+ xcms_differential_analysis.r
+ $xsetData
+ "$class1"
+ "$class2"
+ $topcount
+ $outTable
+ $htmlReportFile
+ $htmlReportFile.files_path
+ $outLogFile
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+Runs xcms diffreport for showing the most significant differences between two sets/classes of samples. This tool also creates extracted ion chromatograms (EICs) for
+the most significant differences. The figure below shows an example of such an EIC.
+
+.. image:: $PATH_TO_IMAGES/diffreport.png
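+
+Under the hood the wrapper essentially performs the following calls (a sketch; the file name and
+class labels are examples):
+
+xsetData <- readRDS("xsAnnotatePrep.rdata")
+if ("xcmsSet" %in% slotNames(xsetData)) xsetData <- xsetData@xcmsSet
+reporttab <- diffreport(xsetData, "CLASS1", "CLASS2", "out/fig", 10, metlin = 0.15, h = 480, w = 640)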
+
+
+
+
+
+
+ 10.1021/ac051437y
+
+
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f metaMS/xcms_get_alignment_eic.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metaMS/xcms_get_alignment_eic.r Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,153 @@
+# shows all alignment results in a given RT region
+## read args:
+args <- commandArgs(TRUE)
+# xset data:
+args.xsetData <- args[1]
+
+args.rtStart <- strtoi(args[2])
+args.rtEnd <- strtoi(args[3])
+
+# limit max diff to 600 and minNrSamples to at least 2 :
+if (args.rtEnd - args.rtStart > 600)
+ stop("The RT window should be <= 600")
+
+args.minNrSamples <- strtoi(args[4]) #min nr samples
+if (args.minNrSamples < 2)
+ stop("Set 'Minimum number of samples' to 2 or higher")
+
+
+args.sampleNames <- strsplit(args[5], ",")[[1]]
+# trim leading and trailing spaces:
+args.sampleNames <- gsub("^\\s+|\\s+$", "", args.sampleNames)
+
+## report files
+args.htmlReportFile <- args[6]
+args.htmlReportFile.files_path <- args[7]
+
+
+if (length(args) == 8)
+{
+ args.outLogFile <- args[8]
+ # suppress messages:
+ # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888
+ msg <- file(args.outLogFile, open="wt")
+ sink(msg, type="message")
+ sink(msg, type="output")
+}
+
+
+
+tryCatch(
+ {
+ library(metaMS)
+
+ # load the xset data :
+ xsetData <- readRDS(args.xsetData)
+ # if here to support both scenarios:
+ if ("xcmsSet" %in% slotNames(xsetData) )
+ {
+ xsetData <- xsetData@xcmsSet
+ }
+
+ # report
+ dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE)
+ message(paste("\nGenerating report.........in ", args.htmlReportFile.files_path))
+
+    write(paste("<html><body><h1>Extracted Ion Chromatograms (EIC) of alignments with peaks in ", args.minNrSamples, " or more samples</h1>", sep=""),
+          file=args.htmlReportFile)
+
+ gt <- groups(xsetData)
+ message("\nParsed groups... ")
+    groupidx1 <- which(gt[,"rtmed"] > args.rtStart & gt[,"rtmed"] < args.rtEnd & gt[,"npeaks"] >= args.minNrSamples) # TODO: npeaks counts peaks over all samples, not only the selected ones
+
+ if (length(groupidx1) > 0)
+ {
+ message("\nGetting EIC... ")
+ eiccor <- getEIC(xsetData, groupidx = c(groupidx1), sampleidx=args.sampleNames)
+ #eicraw <- getEIC(xsetData, groupidx = c(groupidx1), sampleidx=args.sampleNames, rt = "raw")
+
+ #sampleNamesIdx <- which(sampnames(LC$xset@xcmsSet) %in% args.sampleNames, arr.ind = TRUE)
+      # or (from the Bioconductor code for getEIC); NB: this assumes the sample indexes in the data follow the order of the sample names!!
+ sampNames <- sampnames(xsetData)
+ sampleNamesIdx <- match( args.sampleNames, sampNames)
+ message(paste("Samples: ", sampleNamesIdx))
+
+      #TODO html <- paste(html, "")
+ message(paste("\nPlotting figures... "))
+
+ #get the mz list (interestingly, this [,"mz"] is relatively slow):
+ peakMzList <- xsetData@peaks[,"mz"]
+ peakSampleList <- xsetData@peaks[,"sample"]
+ #signal to noise list:
+ peakSnList <- xsetData@peaks[,"sn"]
+
+ message(paste("Total nr of peaks: ", length(peakMzList)))
+
+ for (i in 1:length(groupidx1))
+ {
+ groupMembers <- xsetData@groupidx[[groupidx1[i]]]
+
+ groupMzList <- ""
+ groupSampleList <- ""
+ finalGroupSize <- 0
+
+ for (j in 1:length(groupMembers))
+ {
+ #get only the peaks from the selected samples:
+ memberSample <- peakSampleList[groupMembers[j]]
+ memberSn <- peakSnList[groupMembers[j]]
+ if (!is.na(memberSn) && memberSample %in% sampleNamesIdx)
+ {
+ message(paste("Group: ", groupidx1[i], " / Member sample: ", memberSample))
+ memberMz <- peakMzList[groupMembers[j]]
+
+
+ if (finalGroupSize == 0)
+ {
+ groupMzList <- memberMz
+ groupSampleList <- sampNames[memberSample]
+ } else {
+ groupMzList <- paste(groupMzList,",",memberMz, sep="")
+ groupSampleList <- paste(groupSampleList,",",sampNames[memberSample], sep="")
+ }
+
+ finalGroupSize <- finalGroupSize +1
+ }
+ }
+ # here we do the real check on group size and only the groups that have enough
+ # members with signal to noise > 0 will be plotted here:
+ if (finalGroupSize >= args.minNrSamples)
+ {
+ message(paste("Plotting figure ",i, " of max ", length(groupidx1)," figures... "))
+
+ figureName <- paste(args.htmlReportFile.files_path, "/figure", i,".png", sep="")
+            write(paste("<img src='figure", i, ".png' />", sep=""),
+                  file=args.htmlReportFile, append=TRUE)
+
+ png( figureName, type="cairo", width=800 )
+ plot(eiccor, xsetData, groupidx = i)
+ devname = dev.off()
+
+            write(paste("<p>Alignment id: [", groupidx1[i], "]. m/z list of peaks in alignment with signal/noise <> NA: ", groupMzList,
+                        "<br/>m/z values found in the following samples respectively: ", groupSampleList, "</p>", sep=""),
+                  file=args.htmlReportFile, append=TRUE)
+ }
+ }
+
+ }
+
+    write("</body></html>",
+          file=args.htmlReportFile, append=TRUE)
+ message("finished generating report")
+ # unlink(args.htmlReportFile)
+ cat("\nWarnings================:\n")
+ str( warnings() )
+ },
+ error=function(cond) {
+ sink(NULL, type="message") # default setting
+ sink(stderr(), type="output")
+ message("\nERROR: ===========\n")
+ print(cond)
+ }
+ )
diff -r 41f122255d14 -r 4393f982d18f metaMS/xcms_get_alignment_eic.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metaMS/xcms_get_alignment_eic.xml Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,73 @@
+
+ Extracts alignment EICs from feature alignment data
+
+ R_bioc_metams
+
+
+ xcms_get_alignment_eic.r
+ $xsetData
+ $rtStart
+ $rtEnd
+ $minNrSamples
+ "$sampleNames"
+ $htmlReportFile
+ $htmlReportFile.files_path
+ $outLogFile
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+This tool finds the alignments in the specified RT window and extracts alignment EICs from feature alignment data using XCMS's getEIC method.
+It produces an HTML report showing the EIC plots and the mass list of each alignment. The figure below shows an example of such an EIC plot, also showing the difference
+between two classes, with extra alignment information beneath it.
+
+.. image:: $PATH_TO_IMAGES/diffreport.png
+
+Alignment id: 1709. m/z list of peaks in alignment:
+614.002922098482,613.998019830021,614.000382307519,613.998229980469,613.998229980469
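+
+The selection logic is essentially the following (a sketch; rtStart, rtEnd and minNrSamples are
+the tool parameters):
+
+gt  <- groups(xsetData)
+idx <- which(gt[,"rtmed"] > rtStart & gt[,"rtmed"] < rtEnd & gt[,"npeaks"] >= minNrSamples)
+eic <- getEIC(xsetData, groupidx = idx, sampleidx = sampleNames)
+plot(eic, xsetData, groupidx = 1)  # one such plot per alignment group ends up in the report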
+
+
+
+
+ 10.1021/ac051437y
+
+
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f metaMS/xcms_get_mass_eic.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metaMS/xcms_get_mass_eic.r Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,162 @@
+## read args:
+args <- commandArgs(TRUE)
+# xset data:
+args.xsetData <- args[1]
+
+args.rtStart <- strtoi(args[2])
+args.rtEnd <- strtoi(args[3])
+
+args.mzStart <- as.double(args[4])
+args.mzEnd <- as.double(args[5])
+# there are 2 options: specify a mz range or a mz list:
+if (args.mzStart < 0)
+{
+ args.mzList <- as.double(strsplit(args[6], ",")[[1]])
+ cat(typeof(as.double(strsplit(args[6], ",")[[1]])))
+ args.mzTolPpm <- as.double(args[7])
+ # calculate mzends based on ppm tol:
+ mzListEnd <- c()
+ mzListStart <- c()
+ for (i in 1:length(args.mzList))
+ {
+ mzEnd <- args.mzList[i] + args.mzList[i]*args.mzTolPpm/1000000.0
+ mzStart <- args.mzList[i] - args.mzList[i]*args.mzTolPpm/1000000.0
+ mzListEnd <- c(mzListEnd, mzEnd)
+ mzListStart <- c(mzListStart, mzStart)
+ }
+ str(mzListStart)
+ str(mzListEnd)
+} else {
+ mzListEnd <- c(args.mzEnd)
+ mzListStart <- c(args.mzStart)
+}
+
+args.sampleNames <- strsplit(args[8], ",")[[1]]
+# trim leading and trailing spaces:
+args.sampleNames <- gsub("^\\s+|\\s+$", "", args.sampleNames)
+
+args.combineSamples <- args[9]
+args.rtPlotMode <- args[10]
+
+## report files
+args.htmlReportFile <- args[11]
+args.htmlReportFile.files_path <- args[12]
+
+
+if (length(args) == 13)
+{
+ args.outLogFile <- args[13]
+ # suppress messages:
+ # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888
+ msg <- file(args.outLogFile, open="wt")
+ sink(msg, type="message")
+ sink(msg, type="output")
+}
+
+# TODO - add option to plot masses in the same plot (e.g. when given on the same input line) or in separate plots
+# TODO2 - let it run in parallel
+
+tryCatch(
+ {
+ library(metaMS)
+
+ # load the xset data :
+ xsetData <- readRDS(args.xsetData)
+ # if here to support both scenarios:
+ if ("xcmsSet" %in% slotNames(xsetData) )
+ {
+ xsetData <- xsetData@xcmsSet
+ }
+
+ # report
+ dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE)
+ message(paste("\nGenerating report.........in ", args.htmlReportFile.files_path))
+
+    html <- "<html><body><h1>Extracted Ion Chromatograms (EIC) matching criteria</h1>"
+
+ if (args.combineSamples == "No")
+ {
+ if (length(args.sampleNames) > 1 && length(mzListStart) > 1 && length(args.sampleNames) != length(mzListStart))
+ stop(paste("The number of sample names should match the number of m/z values in the list. Found ", length(mzListStart),
+ " masses while ", length(args.sampleNames), " sample names were given."))
+
+ iterSize <- length(args.sampleNames)
+ # these can be set to 1 or 0 just as a trick to iterate OR not over the items. If the respective list is of length 1, only the first item should be used
+ fixSampleIdx <- 1
+ fixMzListIdx <- 1
+ if (length(args.sampleNames) == 1)
+ {
+ fixSampleIdx <- 0
+ iterSize <- length(mzListStart)
+ }
+ if (length(mzListStart) == 1)
+ {
+ fixMzListIdx <- 0
+ }
+ lineColors <- rainbow(iterSize)
+ for (i in 0:(iterSize-1))
+ {
+ message("\nGetting EIC... ")
+ eiccor <- getEIC(xsetData,
+ mzrange=matrix(c(mzListStart[i*fixMzListIdx+1],mzListEnd[i*fixMzListIdx+1]),nrow=1,ncol=2,byrow=TRUE),
+ rtrange=matrix(c(args.rtStart,args.rtEnd),nrow=1,ncol=2,byrow=TRUE),
+ sampleidx=c(args.sampleNames[i*fixSampleIdx+1]), rt=args.rtPlotMode)
+
+ message("\nPlotting figures... ")
+ figureName <- paste(args.htmlReportFile.files_path, "/figure", i,".png", sep="")
+            html <- paste(html, "<img src='figure", i, ".png' />", sep="")
+ png( figureName, type="cairo", width=1100,height=250 )
+ #plot(eiccor, col=lineColors[i+1])
+ # black is better in this case:
+ plot(eiccor)
+ legend('topright', # places a legend at the appropriate place
+ legend=c(args.sampleNames[i*fixSampleIdx+1]), # puts text in the legend
+ lty=c(1,1), # gives the legend appropriate symbols (lines)
+ lwd=c(2.5,2.5))
+
+ devname = dev.off()
+ }
+
+ } else {
+ for (i in 1:length(mzListStart))
+ {
+ message("\nGetting EIC... ")
+ eiccor <- getEIC(xsetData,
+ mzrange=matrix(c(mzListStart[i],mzListEnd[i]),nrow=1,ncol=2,byrow=TRUE),
+ rtrange=matrix(c(args.rtStart,args.rtEnd),nrow=1,ncol=2,byrow=TRUE),
+ sampleidx=args.sampleNames, rt = args.rtPlotMode)
+
+ #set size, set option (plot per sample, plot per mass)
+
+ message("\nPlotting figures... ")
+ figureName <- paste(args.htmlReportFile.files_path, "/figure", i,".png", sep="")
+            html <- paste(html, "<img src='figure", i, ".png' />", sep="")
+ png( figureName, type="cairo", width=1100,height=450 )
+ lineColors <- rainbow(length(args.sampleNames))
+ plot(eiccor, col=lineColors)
+ legend('topright', # places a legend at the appropriate place
+ legend=args.sampleNames, # puts text in the legend
+ lty=c(1,1), # gives the legend appropriate symbols (lines)
+ lwd=c(2.5,2.5),
+ col=lineColors)
+ devname = dev.off()
+ }
+ }
+ if (args.rtPlotMode == "corrected")
+ {
+        html <- paste(html, "<p>*rt values are corrected ones</p>")
+ }
+    html <- paste(html, "</body></html>")
+ message("finished generating report")
+ write(html,file=args.htmlReportFile)
+ # unlink(args.htmlReportFile)
+ cat("\nWarnings================:\n")
+ str( warnings() )
+ },
+ error=function(cond) {
+ sink(NULL, type="message") # default setting
+ sink(stderr(), type="output")
+ message("\nERROR: ===========\n")
+ print(cond)
+ }
+ )
diff -r 41f122255d14 -r 4393f982d18f metaMS/xcms_get_mass_eic.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metaMS/xcms_get_mass_eic.xml Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,117 @@
+
+ Extracts EICs for a given list of masses
+
+ R_bioc_metams
+
+
+ xcms_get_mass_eic.r
+ $xsetData
+ $rtStart
+ $rtEnd
+ #if $massParameters.massParametersType == "window"
+ $massParameters.mzStart
+ $massParameters.mzEnd
+ -1
+ "."
+ #else
+ -1
+ -1
+ "$massParameters.mzList"
+ $massParameters.mzTolPpm
+ #end if
+ "$sampleNames"
+ $combineSamples
+ $rtPlotMode
+ $htmlReportFile
+ $htmlReportFile.files_path
+ $outLogFile
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+This tool will plot EICs for a given list of masses (or a mass window) using XCMS's getEIC method.
+It produces an HTML report showing the EIC plots, one for each given mass. The figure below shows an example of such an EIC plot.
+
+.. image:: $PATH_TO_IMAGES/massEIC.png
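+
+In m/z list mode each mass is expanded to a window of +/- (mz * ppm / 1e6); e.g. a 5 ppm tolerance
+around m/z 500 gives 500 +/- 0.0025. The underlying call is roughly (a sketch):
+
+mz  <- 500; tol <- mz * 5 / 1e6
+eic <- getEIC(xsetData, mzrange = matrix(c(mz - tol, mz + tol), nrow = 1, ncol = 2),
+              rtrange = matrix(c(rtStart, rtEnd), nrow = 1, ncol = 2),
+              sampleidx = sampleNames, rt = "corrected")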
+
+
+
+
+ 10.1021/ac051437y
+
+
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f metaMS_cmd_annotate.r
--- a/metaMS_cmd_annotate.r Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,86 +0,0 @@
-## read args:
-args <- commandArgs(TRUE)
-## the constructed DB, e.g. "E:/Rworkspace/metaMS/data/LCDBtest.RData"
-args.constructedDB <- args[1]
-## data file in xset format:
-args.xsetData <- args[2]
-## settings file, e.g. "E:/Rworkspace/metaMS/data/settings.r", should contain assignment to an object named "customMetaMSsettings"
-args.settings <- args[3]
-
-## output file names, e.g. "E:/Rworkspace/metaMS/data/out.txt"
-args.outAnnotationTable <- args[4]
-
-args.mass_error_function <- args[5]
-if (args.mass_error_function == "0")
- args.mass_error_function <- NULL
-## report files
-args.htmlReportFile <- args[6]
-args.htmlReportFile.files_path <- args[7]
-
-if (length(args) == 8)
-{
- args.outLogFile <- args[8]
- # suppress messages:
- # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888
- msg <- file(args.outLogFile, open="wt")
- sink(msg, type="message")
- sink(msg, type="output")
-}
-
-cat("\nSettings used===============:\n")
-cat(readChar(args.settings, 1e5))
-
-
-tryCatch(
- {
- library(metaMS)
-
- ## load the constructed DB :
- tempEnv <- new.env()
- testDB <- load(args.constructedDB, envir=tempEnv)
- xsetData <- readRDS(args.xsetData)
-
- ## load settings "script" into "customMetaMSsettings"
- source(args.settings, local=tempEnv)
- message(paste(" loaded : ", args.settings))
-
- # Just to highlight: if you want to use more than one
- # trigger runLC:
- LC <- runLC(xset=xsetData, settings = tempEnv[["customMetaMSsettings"]], DB = tempEnv[[testDB[1]]]$DB, errf=args.mass_error_function, nSlaves=20, returnXset = TRUE)
-
- # write out runLC annotation results:
- write.table(LC$PeakTable, args.outAnnotationTable, sep="\t", row.names=FALSE)
-
- # the used constructed DB (write to log):
- cat("\nConstructed DB info===============:\n")
- str(tempEnv[[testDB[1]]]$Info)
- cat("\nConstructed DB table===============:\n")
- if (length(args) == 8)
- {
- write.table(tempEnv[[testDB[1]]]$DB, args.outLogFile, append=TRUE, row.names=FALSE)
- write.table(tempEnv[[testDB[1]]]$Reftable, args.outLogFile, sep="\t", append=TRUE, row.names=FALSE)
- }
-
- message("\nGenerating report.........")
- # report
- dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE)
-    html <- "<html><body>"
- # TODO add (nr and mass error) and group size
-
- message("\nPlotting figures... ")
- figureName <- paste(args.htmlReportFile.files_path, "/figure_retcor.png", sep="")
-    html <- paste(html, "<img src='figure_retcor.png' />", sep="")
- png( figureName, type="cairo", width=1100,height=600 )
- retcor(LC$xset@xcmsSet, method="peakgroups", plottype = "mdevden")
- html <- paste(html,"*NB: retention time correction plot based on 'peakgroups' option with default settings. This is not the plot matching the exact settings used in the run,
- but just intended to give a rough estimate of the retention time shifts present in the data. A more accurate plot will be available once
- this option is added in metaMS API. ", sep="")
- devname = dev.off()
-
-
- gt <- groups(LC$xset@xcmsSet)
- groupidx1 <- which(gt[,"rtmed"] > 0 & gt[,"rtmed"] < 3000 & gt[,"npeaks"] > 3)
-
-    html <- paste(html, "</body></html>")
- message("finished generating report")
- write(html,file=args.htmlReportFile)
- # unlink(args.htmlReportFile)
- cat("\nWarnings================:\n")
- str( warnings() )
- },
- error=function(cond) {
- sink(NULL, type="message") # default setting
- sink(stderr(), type="output")
- message("\nERROR: ===========\n")
- print(cond)
- }
- )
diff -r 41f122255d14 -r 4393f982d18f metams_lcms_annotate.xml
--- a/metams_lcms_annotate.xml Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,121 +0,0 @@
-
- Runs metaMS process for LC/MS feature annotation
-
- R_bioc_metams
-
-
- metaMS_cmd_annotate.r
- $constructed_db
- $xsetData
- $customMetaMSsettings
- $outputFile
- #if $mzTol.mzTolType == "fixed"
- 0
- #else
- "$mzTol.mass_error_function"
- #end if
- $htmlReportFile
- $htmlReportFile.files_path
- $outputLog
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-## start comment
- ## metaMS process settings
- customMetaMSsettings <- metaMSsettings(protocolName = "${protocolName}",
- chrom = "LC")
-metaSetting(customMetaMSsettings, "match2DB") <- list(
- rtdiff = ${rtdiff},
- rtval = ${rtval},
- #if $mzTol.mzTolType == "fixed"
- mzdiff = ${mzTol.mzdiff},
- #else
- ppm = ${mzTol.ppm},
- #end if
- minfeat = ${minfeat})
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.. class:: infomark
-
-Runs metaMS process for LC/MS feature annotation based on matching to an existing 'standards' DB.
-The figure below shows the main parts of this metaMS process.
-
-.. image:: $PATH_TO_IMAGES/metaMS_annotate.png
-
-
-.. class:: infomark
-
-The implemented annotation strategy can be broken down in the following steps:
-
-1. *Feature wise Annotation:* Each feature detected by runLC is matched against the database. If
-the mass error function is provided, the appropriate m/z tolerance is calculated, otherwise a fixed
-tolerance is used (mzdiff). The retention time tolerance is fixed and should be selected on the
-bases of the characteristics of each chromatographic method (rtdiff). Multiple annotations - i.e.
-features which are associated to more than one compound - are possible. This outcome does not
-indicate a problem per se, but is an inherent drawback of co-elution.
-
-2. *Annotation Validation:* The annotated features are organized in 'pseudospectra' collecting all
-the experimental features which are assigned to a specific compound. A specific annotation is
-confirmed only if more than minfeat features which differ in retention time less than rtval are
-present in a pseudospectrum. As a general rule rtval should be narrower than rtdiff. The
-latter, indeed, accounts for shifts in retention time between the injection of the standards and the
-metabolomics experiment under investigation. This time can be rather long, considering that the
-standards are not commonly re-analyzed each time. On the other hand, rtval represents the shift
-between the ions of the same compound within the same batch of injections and therefore it has
-only to account for the smaller shifts occurring during peak picking and alignment.
-
-
-
-
- 10.1016/j.jchromb.2014.02.051
-
-
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f metams_lcms_pick_and_group.xml
--- a/metams_lcms_pick_and_group.xml Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,301 +0,0 @@
-
- Runs metaMS process for LC/MS feature picking, aligning and grouping
-
- R_bioc_metams
-
-
- metaMS_cmd_pick_and_group.r
- $data_files
- $customMetaMSsettings
- $outputFile
- $xsetOut
- $polarity
- $htmlReportFile
- $htmlReportFile.files_path
- $outputLog
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-## ====================================
- ## metaMS process settings
- customMetaMSsettings <- metaMSsettings(protocolName = "${protocolName}",
- chrom = "LC",
- PeakPicking = list(
- method = "${peakPicking.method}",
- #if $peakPicking.method == "matchedFilter"
- fwhm = ${peakPicking.fwhm},
- sigma = ${peakPicking.fwhm}/${peakPicking.sigma_denom},
- max = ${peakPicking.max},
- snthresh = ${peakPicking.snthresh},
- step = ${peakPicking.step},
- steps = ${peakPicking.steps},
- mzdiff = ${peakPicking.mzdiff}),
- #else
- ppm = ${peakPicking.ppm},
- peakwidth = c(${peakPicking.peakwidth}),
- snthresh = ${peakPicking.snthresh},
- prefilter = c(${peakPicking.prefilter}),
- mzCenterFun = "${peakPicking.mzCenterFun}",
- integrate = ${peakPicking.integrate},
- mzdiff = ${peakPicking.mzdiff},
- fitgauss = ${peakPicking.fitgauss},
- noise = ${peakPicking.noise}),
- #end if
- Alignment = list(
- min.class.fraction = ${min_class_fraction},
- min.class.size = ${min_class_size},
- mzwid = ${mzwid},
- bws = c(${bws}),
- retcormethod = "${retcor.retcormethod}",
- #if $retcor.retcormethod == "peakgroups"
- smooth = "${retcor.smooth}",
- missingratio = ${retcor.missingratio},
- extraratio = ${retcor.extraratio},
- retcorfamily = "${retcor.retcorfamily}",
- #else
- ##repeating the method as workaround/ backwards compatibility (can remove this one after fix from metaMS):
- method = "${retcor.retcormethod}",
- profStep = ${retcor.profStep},
- #end if
- fillPeaks = ${fillPeaks}),
- CAMERA = list(
- perfwhm = ${perfwhm},
- cor_eic_th = ${cor_eic_th},
- ppm= ${ppm},
- graphMethod= "${groupCorr_graphMethod}",
- pval= ${groupCorr_pval},
- calcCiS= ${groupCorr_calcCiS},
- calcIso= ${groupCorr_calcIso},
- calcCaS= ${groupCorr_calcCaS},
- maxcharge= ${findIsotopes_maxcharge},
- maxiso= ${findIsotopes_maxiso},
- minfrac= ${findIsotopes_minfrac},
- multiplier= ${findAdducts_multiplier}
- ))
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.. class:: infomark
-
-Runs metaMS process for LC/MS feature picking, aligning and grouping.
-This part of the metaMS process makes use of the XCMS and CAMERA tools and algorithms.
-CAMERA is used for automatic deconvolution/annotation of LC/ESI-MS data.
-The figure below shows the main parts of the metaMS process wrapped by this tool.
-
-.. image:: $PATH_TO_IMAGES/metaMS_pick_align_camera.png
-
-
-From CAMERA documentation:
-
-.. image:: $PATH_TO_IMAGES/CAMERA_results.png
-
-**References**
-
-If you use this Galaxy tool in work leading to a scientific publication please
-cite the following papers:
-
-Wehrens, R.; Weingart, G.; Mattivi, F. (2014).
-metaMS: an open-source pipeline for GC-MS-based untargeted metabolomics.
-Journal of chromatography B: biomedical sciences and applications, 996 (1): 109-116.
-doi: 10.1016/j.jchromb.2014.02.051
-handle: http://hdl.handle.net/10449/24012
-
-Wrapper by Pieter Lukasse.
-
-
-
-
- 10.1016/j.jchromb.2014.02.051
-
-
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f next.r
--- a/next.r Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,9 +0,0 @@
-# next script, useful for plotting the CAMERA groups pseudo-spectra:
-plotPsSpectrum(LC$xset, 320, maxlabel=10)
-# 320 is the group id
-plotEICs(LC$xset, 35, maxlabel=5)
-
-
-# could use this for the items identified by the "annotate" step.
-# could also integrate this into the getEIC tool (add option to give a group list instead of an m/z list)...but then rt
-#   and other options are also not so relevant...maybe a separate tool is clearer
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f query_mass_repos.py
--- a/query_mass_repos.py Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,289 +0,0 @@
-#!/usr/bin/env python
-# encoding: utf-8
-'''
-Module to query a set of accurate mass values detected by high-resolution mass spectrometers
-against various repositories/services such as METabolomics EXPlorer database or the
-MFSearcher service (http://webs2.kazusa.or.jp/mfsearcher/).
-
-It will take the input file and for each record it will query the
-molecular mass in the selected repository/service. If one or more compounds are found
-then extra information regarding these compounds is added to the output file.
-
-The output file is thus the input file enriched with information about
-related items found in the selected repository/service.
-
-The service should implement the following interface:
-
-http://service_url/mass?targetMs=500&margin=1&marginUnit=ppm&output=txth (txth means there is guaranteed to be a header line before the data)
-
-The output should be tab separated and should contain the following columns (in this order)
-db-name molecular-formula dbe formula-weight id description
-
-
-'''
-import csv
-import sys
-import fileinput
-import urllib2
-import time
-from collections import OrderedDict
-
-__author__ = "Pieter Lukasse"
-__contact__ = "pieter.lukasse@wur.nl"
-__copyright__ = "Copyright, 2014, Plant Research International, WUR"
-__license__ = "Apache v2"
-
-def _process_file(in_xsv, delim='\t'):
- '''
- Generic method to parse a tab-separated file returning a dictionary with named columns
- @param in_csv: input filename to be parsed
- '''
- data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim))
- return _process_data(data)
-
-def _process_data(data):
-
- header = data.pop(0)
- # Create dictionary with column name as key
- output = OrderedDict()
- for index in xrange(len(header)):
- output[header[index]] = [row[index] for row in data]
- return output
-
-
-def _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit):
-
- '''
- This method will iterate over the record in the input_data and
- will enrich them with the related information found (if any) in the
- chosen repository/service
-
- # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies
- '''
- merged = []
-
- for i in xrange(len(input_data[input_data.keys()[0]])):
- # Get the record in same dictionary format as input_data, but containing
- # a value at each column instead of a list of all values of all records:
- input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))
-
- # read the molecular mass :
- molecular_mass = input_data_record[molecular_mass_col]
-
-
- # search for related records in repository/service:
- data_found = None
- if molecular_mass != "":
- molecular_mass = float(molecular_mass)
-
- # 1- search for data around this MM:
- query_link = repository_dblink + "/mass?targetMs=" + str(molecular_mass) + "&margin=" + str(error_margin) + "&marginUnit=" + margin_unit + "&output=txth"
-
- data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
- data_type_found = "MM"
-
-
- if data_found == None:
- # If still nothing found, just add empty columns
- extra_cols = ['', '','','','','']
- else:
- # Add info found:
- extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)
-
- # Take all data and merge it into a "flat"/simple array of values:
- field_values_list = _merge_data(input_data_record, extra_cols)
-
- merged.append(field_values_list)
-
- # return the merged/enriched records:
- return merged
-
-
-def _get_extra_info_and_link_cols(data_found, data_type_found, query_link):
- '''
- This method will go over the data found and will return a
- list with the following items:
- - details of hits found :
- db-name molecular-formula dbe formula-weight id description
- - Link that executes same query
-
- '''
-
- # set() makes a unique list:
- db_name_set = []
- molecular_formula_set = []
- id_set = []
- description_set = []
-
-
- if 'db-name' in data_found:
- db_name_set = set(data_found['db-name'])
- elif '# db-name' in data_found:
- db_name_set = set(data_found['# db-name'])
- if 'molecular-formula' in data_found:
- molecular_formula_set = set(data_found['molecular-formula'])
- if 'id' in data_found:
- id_set = set(data_found['id'])
- if 'description' in data_found:
- description_set = set(data_found['description'])
-
- result = [data_type_found,
- _to_xsv(db_name_set),
- _to_xsv(molecular_formula_set),
- _to_xsv(id_set),
- _to_xsv(description_set),
- #To let Excel interpret as link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"):
- "=HYPERLINK(\""+ query_link + "\", \"Link to entries found in DB \")"]
- return result
-
-
-def _to_xsv(data_set):
- result = ""
- for item in data_set:
- result = result + str(item) + "|"
- return result
-
-
-def _fire_query_and_return_dict(url):
- '''
- This method will fire the query as a web-service call and
- return the results as a list of dictionary objects
- '''
-
- try:
- data = urllib2.urlopen(url).read()
-
- # transform to dictionary:
- result = []
- data_rows = data.split("\n")
-
- # remove comment lines if any (only leave the one that has "molecular-formula" word in it...compatible with kazusa service):
- data_rows_to_remove = []
- for data_row in data_rows:
- if data_row == "" or (data_row[0] == '#' and "molecular-formula" not in data_row):
- data_rows_to_remove.append(data_row)
-
- for data_row in data_rows_to_remove:
- data_rows.remove(data_row)
-
- # check if there is any data in the response:
- if len(data_rows) <= 1 or data_rows[1].strip() == '':
- # means there is only the header row...so no hits:
- return None
-
- for data_row in data_rows:
- if not data_row.strip() == '':
- row_as_list = _str_to_list(data_row, delimiter='\t')
- result.append(row_as_list)
-
- # return result processed into a dict:
- return _process_data(result)
-
- except urllib2.HTTPError, e:
- raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason)
- except urllib2.URLError, e:
- raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if service [" + url + "] is accessible from your Galaxy server. ")
-
-def _str_to_list(data_row, delimiter='\t'):
- result = []
- for column in data_row.split(delimiter):
- result.append(column)
- return result
-
-
-# alternative: ?
-# s = requests.Session()
-# s.verify = False
-# #s.auth = (token01, token02)
-# resp = s.get(url, params={'name': 'anonymous'}, stream=True)
-# content = resp.content
-# # transform to dictionary:
-
-
-
-
-def _merge_data(input_data_record, extra_cols):
- '''
- Adds the extra information to the existing data record and returns
- the combined new record.
- '''
- record = []
- for column in input_data_record:
- record.append(input_data_record[column])
-
-
- # add extra columns
- for column in extra_cols:
- record.append(column)
-
- return record
-
-
-def _save_data(data_rows, headers, out_csv):
- '''
- Writes tab-separated data to file
- @param data_rows: dictionary containing merged/enriched dataset
- @param out_csv: output csv file
- '''
-
- # Open output file for writing
- outfile_single_handle = open(out_csv, 'wb')
- output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
-
- # Write headers
- output_single_handle.writerow(headers)
-
- # Write one line for each row
- for data_row in data_rows:
- output_single_handle.writerow(data_row)
-
-def _get_repository_URL(repository_file):
- '''
- Read out and return the URL stored in the given file.
- '''
- file_input = fileinput.input(repository_file)
- try:
- for line in file_input:
- if line[0] != '#':
- # just return the first line that is not a comment line:
- return line
- finally:
- file_input.close()
-
-
-def main():
- '''
- Query main function
-
- The input file can be any tabular file, as long as it contains a column for the molecular mass.
- This column is then used to query against the chosen repository/service Database.
- '''
- seconds_start = int(round(time.time()))
-
- input_file = sys.argv[1]
- molecular_mass_col = sys.argv[2]
- repository_file = sys.argv[3]
- error_margin = float(sys.argv[4])
- margin_unit = sys.argv[5]
- output_result = sys.argv[6]
-
- # Parse repository_file to find the URL to the service:
- repository_dblink = _get_repository_URL(repository_file)
-
- # Parse tabular input file into dictionary/array:
- input_data = _process_file(input_file)
-
- # Query data against repository :
- enriched_data = _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit)
- headers = input_data.keys() + ['SEARCH hits for ','SEARCH hits: db-names', 'SEARCH hits: molecular-formulas ',
- 'SEARCH hits: ids','SEARCH hits: descriptions', 'Link to SEARCH hits'] #TODO - add min and max formula weigth columns
-
- _save_data(enriched_data, headers, output_result)
-
- seconds_end = int(round(time.time()))
- print "Took " + str(seconds_end - seconds_start) + " seconds"
-
-
-
-if __name__ == '__main__':
- main()
diff -r 41f122255d14 -r 4393f982d18f query_mass_repos.xml
--- a/query_mass_repos.xml Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,106 +0,0 @@
-
- Query multiple public repositories for elemental compositions from accurate mass values detected by high-resolution mass spectrometers
-
- query_mass_repos.py
- $input_file
- "$molecular_mass_col"
- "$repository_file"
- $error_margin
- $margin_unit
- $output_result
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.. class:: infomark
-
-This tool will query multiple public repositories such as PRI-MetExp or http://webs2.kazusa.or.jp/mfsearcher
-for elemental compositions from accurate mass values detected by high-resolution mass spectrometers.
-
-It will take the input file and, for each record, query the
-molecular mass in the selected repository. If one or more compounds are found in the
-repository, extra information regarding the (mass-based) matching elemental composition formulas is added to the output file.
-
-The output file is thus the input file enriched with information about
-related items found in the selected repository.
-
-**Notes**
-
-The input file can be any tabular file, as long as it contains a column for the molecular mass.
-
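-For example (illustrative column names and values), a minimal input file could look like::
-
-  compound_id    molecular_mass
-  c1             180.0634
-  c2             132.0423
-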
-**Services that can be queried**
-
-================= =========================================================================
-Database Description
------------------ -------------------------------------------------------------------------
-PRI-MetExp LC-MS and GC-MS data from experiments from the metabolomics group at
- Plant Research International. NB: restricted access to employees with
- access key.
-ExactMassDB        A database of possible elemental compositions (consisting of up to
-                   C: 100, H: 200, O: 50, N: 10, P: 10, and S: 10) that satisfy the
-                   Senior and the Lewis valence rules.
-                   (via /mfsearcher/exmassdb/)
-ExactMassDB-HR2 HR2, which is one of the fastest tools for calculation of elemental
- compositions, filters some elemental compositions according to
- the Seven Golden Rules (Kind and Fiehn, 2007). The ExactMassDB-HR2
- database returns the same result as does HR2 with the same atom kind
- and number condition as that used in construction of the ExactMassDB.
- (via /mfsearcher/exmassdb-hr2/)
-Pep1000 A database of possible linear polypeptides that are
- constructed with 20 kinds of amino acids and having molecular
- weights smaller than 1000.
- (via /mfsearcher/pep1000/)
-KEGG Re-calculated compound data from KEGG. Weekly updated.
- (via /mfsearcher/kegg/)
-KNApSAcK Re-calculated compound data from KNApSAcK.
- (via /mfsearcher/knapsack/)
-Flavonoid Viewer   Re-calculated compound data from Flavonoid Viewer.
-                   (via /mfsearcher/flavonoidviewer/)
-LipidMAPS Re-calculated compound data from LIPID MAPS.
- (via /mfsearcher/lipidmaps/)
-HMDB Re-calculated compound data from Human Metabolome Database (HMDB)
- Version 3.5.
- (via /mfsearcher/hmdb/)
-PubChem Re-calculated compound data from PubChem. Monthly updated.
- (via /mfsearcher/pubchem/)
-================= =========================================================================
-
-Sources for table above: PRI-MetExp and http://webs2.kazusa.or.jp/mfsearcher
-
-
-
diff -r 41f122255d14 -r 4393f982d18f query_metexp.py
--- a/query_metexp.py Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,282 +0,0 @@
-#!/usr/bin/env python
-# encoding: utf-8
-'''
-Module to query a set of identifications against the METabolomics EXPlorer database.
-
-It will take the input file and for each record it will query the
-molecular mass in the selected MetExp DB. If one or more compounds are found in the
-MetExp DB then extra information regarding these compounds is added to the output file.
-
-The output file is thus the input file enriched with information about
-related items found in the selected MetExp DB.
-'''
-import csv
-import sys
-import fileinput
-import urllib2
-import time
-from collections import OrderedDict
-
-__author__ = "Pieter Lukasse"
-__contact__ = "pieter.lukasse@wur.nl"
-__copyright__ = "Copyright, 2014, Plant Research International, WUR"
-__license__ = "Apache v2"
-
-def _process_file(in_xsv, delim='\t'):
-    '''
-    Generic method to parse a delimited file, returning a dictionary with named columns
-    @param in_xsv: input filename to be parsed
-    @param delim: column delimiter (default: tab)
-    '''
- data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim))
- return _process_data(data)
-
-def _process_data(data):
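-    '''
-    Converts a list of rows (first row = header) into a dictionary with one
-    entry per column name, each entry holding the list of that column's values,
-    e.g. [['ID', 'mass'], ['a', '1.0'], ['b', '2.0']] becomes
-    OrderedDict([('ID', ['a', 'b']), ('mass', ['1.0', '2.0'])]).
-    '''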
-
- header = data.pop(0)
- # Create dictionary with column name as key
- output = OrderedDict()
- for index in xrange(len(header)):
- output[header[index]] = [row[index] for row in data]
- return output
-
-
-def _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method):
- '''
-    This method will iterate over the records in input_data and
-    enrich them with the related information found (if any) in the
- MetExp Database.
-
- # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies
- '''
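-    # A possible multi-threaded variant (sketch, untested; query_one_record is a
-    # hypothetical helper that performs the three lookups for a single record):
-    #   from multiprocessing.dummy import Pool  # thread pool
-    #   pool = Pool(4)
-    #   merged = pool.map(query_one_record, records)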
- merged = []
-
- for i in xrange(len(input_data[input_data.keys()[0]])):
- # Get the record in same dictionary format as input_data, but containing
- # a value at each column instead of a list of all values of all records:
- input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))
-
- # read the molecular mass and formula:
- cas_id = input_data_record[casid_col]
- formula = input_data_record[formula_col]
- molecular_mass = input_data_record[molecular_mass_col]
-
- # search for related records in MetExp:
- data_found = None
- if cas_id != "undef":
- # 1- search for other experiments where this CAS id has been found:
- query_link = metexp_dblink + "/find_entries/query?cas_nr="+ cas_id + "&method=" + separation_method
- data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
- data_type_found = "CAS"
-        if data_found is None:
-            # 2- search for other experiments where this FORMULA has been found:
-            query_link = metexp_dblink + "/find_entries/query?molecule_formula=" + formula + "&method=" + separation_method
-            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
-            data_type_found = "FORMULA"
-        if data_found is None:
-            # 3- search for other experiments where this MM has been found:
-            query_link = metexp_dblink + "/find_entries/query?molecule_mass=" + molecular_mass + "&method=" + separation_method
-            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
-            data_type_found = "MM"
-
-        if data_found is None:
-            # If still nothing found, just add empty columns (one per extra header column):
-            extra_cols = [''] * 8
- else:
- # Add info found:
- extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)
-
- # Take all data and merge it into a "flat"/simple array of values:
- field_values_list = _merge_data(input_data_record, extra_cols)
-
- merged.append(field_values_list)
-
- # return the merged/enriched records:
- return merged
-
-
-def _get_extra_info_and_link_cols(data_found, data_type_found, query_link):
- '''
- This method will go over the data found and will return a
- list with the following items:
- - Experiment details where hits have been found :
- 'organism', 'tissue','experiment_name','user_name','column_type'
- - Link that executes same query
-
- '''
-    # each of the following defaults to an empty list and is replaced by a set()
-    # (which removes duplicates) when the corresponding column is present:
- organism_set = []
- tissue_set = []
- experiment_name_set = []
- user_name_set = []
- column_type_set = []
- cas_nr_set = []
-
- if 'organism' in data_found:
- organism_set = set(data_found['organism'])
- if 'tissue' in data_found:
- tissue_set = set(data_found['tissue'])
- if 'experiment_name' in data_found:
- experiment_name_set = set(data_found['experiment_name'])
- if 'user_name' in data_found:
- user_name_set = set(data_found['user_name'])
- if 'column_type' in data_found:
- column_type_set = set(data_found['column_type'])
- if 'CAS' in data_found:
- cas_nr_set = set(data_found['CAS'])
-
-
- result = [data_type_found,
- _to_xsv(organism_set),
- _to_xsv(tissue_set),
- _to_xsv(experiment_name_set),
- _to_xsv(user_name_set),
- _to_xsv(column_type_set),
- _to_xsv(cas_nr_set),
- #To let Excel interpret as link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"):
- "=HYPERLINK(\""+ query_link + "\", \"Link to entries found in DB \")"]
- return result
-
-
-def _to_xsv(data_set):
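-    # joins all items with a trailing "|" separator, e.g. set(['a', 'b']) -> "a|b|"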
- result = ""
- for item in data_set:
- result = result + str(item) + "|"
- return result
-
-
-def _fire_query_and_return_dict(url):
- '''
- This method will fire the query as a web-service call and
- return the results as a list of dictionary objects
- '''
-
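-    # The service is expected to answer with TSV text: a header row plus one row
-    # per hit (assumption based on the parsing below); a header-only response
-    # means there were no hits.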
- try:
- data = urllib2.urlopen(url).read()
-
- # transform to dictionary:
- result = []
- data_rows = data.split("\n")
-
- # check if there is any data in the response:
- if len(data_rows) <= 1 or data_rows[1].strip() == '':
- # means there is only the header row...so no hits:
- return None
-
- for data_row in data_rows:
- if not data_row.strip() == '':
- row_as_list = _str_to_list(data_row, delimiter='\t')
- result.append(row_as_list)
-
- # return result processed into a dict:
- return _process_data(result)
-
-    except urllib2.HTTPError, e:
-        raise Exception("HTTP error for URL: " + url + " : %s - " % e.code + e.reason)
-    except urllib2.URLError, e:
-        raise Exception("Network error: %s" % e.reason.args[1] + ". Administrator: please check if the MetExp service [" + url + "] is accessible from your Galaxy server.")
-
-def _str_to_list(data_row, delimiter='\t'):
-    return data_row.split(delimiter)
-
-
-# alternative implementation using the requests library (sketch, untested):
-# import requests
-# s = requests.Session()
-# s.verify = False
-# #s.auth = (token01, token02)
-# resp = s.get(url, params={'name': 'anonymous'}, stream=True)
-# content = resp.content
-# # then split the content into rows and transform to a dictionary,
-# # as done in _fire_query_and_return_dict above
-
-
-
-
-def _merge_data(input_data_record, extra_cols):
-    '''
-    Adds the extra information to the existing data record and returns
-    the combined new record.
-    '''
-    record = [input_data_record[column] for column in input_data_record]
-    # add extra columns
-    record.extend(extra_cols)
-    return record
-
-
-def _save_data(data_rows, headers, out_csv):
-    '''
-    Writes tab-separated data to file
-    @param data_rows: list of records (one list of values per record) of the merged/enriched dataset
-    @param headers: list of column names to write as the first row
-    @param out_csv: output csv file
-    '''
-
- # Open output file for writing
- outfile_single_handle = open(out_csv, 'wb')
- output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
-
- # Write headers
- output_single_handle.writerow(headers)
-
- # Write one line for each row
- for data_row in data_rows:
- output_single_handle.writerow(data_row)
-
-def _get_metexp_URL(metexp_dblink_file):
-    '''
-    Read out and return the URL stored in the given file.
-    '''
-    file_input = fileinput.input(metexp_dblink_file)
-    try:
-        for line in file_input:
-            if line[0] != '#':
-                # return the first line that is not a comment line,
-                # stripped of its trailing newline:
-                return line.strip()
-    finally:
-        file_input.close()
-
-
-def main():
- '''
-    MetExp Query main function
-
-    The input file can be any tabular file, as long as it contains columns for the CAS number,
-    the molecular formula and the molecular mass of the respective identification. These columns are then
-    used to query against the MetExp Database.
- '''
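-    # Example invocation (argument order as parsed below; the column names and
-    # separation method are illustrative):
-    #   python query_metexp.py input.tsv CAS FORMULA MM metexp_link.txt GC output.tsv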
- seconds_start = int(round(time.time()))
-
- input_file = sys.argv[1]
- casid_col = sys.argv[2]
- formula_col = sys.argv[3]
- molecular_mass_col = sys.argv[4]
- metexp_dblink_file = sys.argv[5]
- separation_method = sys.argv[6]
- output_result = sys.argv[7]
-
- # Parse metexp_dblink_file to find the URL to the MetExp service:
- metexp_dblink = _get_metexp_URL(metexp_dblink_file)
-
- # Parse tabular input file into dictionary/array:
- input_data = _process_file(input_file)
-
- # Query data against MetExp DB :
- enriched_data = _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method)
- headers = input_data.keys() + ['METEXP hits for ','METEXP hits: organisms', 'METEXP hits: tissues',
- 'METEXP hits: experiments','METEXP hits: user names','METEXP hits: column types', 'METEXP hits: CAS nrs', 'Link to METEXP hits']
-
- _save_data(enriched_data, headers, output_result)
-
- seconds_end = int(round(time.time()))
- print "Took " + str(seconds_end - seconds_start) + " seconds"
-
-
-
-if __name__ == '__main__':
- main()
diff -r 41f122255d14 -r 4393f982d18f query_metexp.xml
--- a/query_metexp.xml Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,69 +0,0 @@
-
-    Query a set of identifications against the METabolomics EXPlorer database
-
- query_metexp.py
- $input_file
- "$casid_col"
- "$formula_col"
- "$molecular_mass_col"
- "$metexp_dblink_file"
- $separation_method
- $output_result
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.. class:: infomark
-
-This tool will query a set of identifications against the METabolomics EXPlorer database.
-
-It will take the input file and for each record it will query the
-molecular mass in the selected MetExp DB. If one or more compounds are found in the
-MetExp DB then extra information regarding these compounds is added to the output file.
-
-The output file is thus the input file enriched with information about
-related items found in the selected MetExp DB.
-
-**Notes**
-
-The input file can be any tabular file, as long as it contains columns for the CAS number,
-the molecular formula and the molecular mass of the respective identification.
-
-
-
-
diff -r 41f122255d14 -r 4393f982d18f rankfilterGCMS_tabular.xml
--- a/rankfilterGCMS_tabular.xml Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,80 +0,0 @@
-
- Convert Retention Time to Retention Index
- rankfilter_GCMS/rankfilter.py $input_file
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- sample = ${sample}
- calibration = ${calibration}
- lib_data = ${lib_data}
- window = ${window}
- analysis_type = ${analysis_type}
- tabular = True
- onefile = ${onefile}
- model = ${model}
-
-
-
-This tool estimates the experimental RI (RIexp) by building an RI(RT) function based on the
-given calibration file.
-
-It also determines the estimated RI (RIsvr) by looking up, for each entry of the given input file (Sample File),
-its respective RIsvr value in the given global lookup library, based on its CAS number
-(this step is also called the "RankFilter analysis" - see reference below; the Sample File may come from either NIST or AMDIS).
-This yields a prediction of the RI for
-a compound according to the "RankFilter procedure" (RIsvr).
-
-Output is a tab separated file in which four columns are added:
-
- - **Rank** Calculated rank
- - **RIexp** Experimental Retention Index (RI)
- - **RIsvr** Calculated RI based on support vector regression (SVR)
-  - **%rel.err** Relative RI error (%rel.err = 100 * (RIsvr − RIexp) / RIexp)
-
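-For example (illustrative values): RIexp = 1180 and RIsvr = 1200 give %rel.err = 100 * (1200 − 1180) / 1180 ≈ 1.7.
-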
-.. class:: infomark
-
-**Notes**
-
- - The layout of the Calibration file should include the following columns: 'MW', 'R.T.' and 'RI'.
-  - Selecting 'Polynomial' in the model parameter will calculate a 3rd degree polynomial model that will
-    be used to convert from retention time (RT) to retention index (RI).
-
------
-
-**References**
-
-  - **RankFilter**: Mihaleva et al. (2009) *Automated procedure for candidate compound selection in GC-MS
-    metabolomics based on prediction of Kovats retention index*. Bioinformatics, 25, pp. 787–794
-
-
diff -r 41f122255d14 -r 4393f982d18f rankfilter_GCMS/nistpdf_to_tabular.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilter_GCMS/nistpdf_to_tabular.py Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,45 @@
+"""
+Copyright (C) 2013, Pieter Lukasse, Plant Research International, Wageningen
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this software except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+"""
+
+import sys
+import pdfread
+from subprocess import check_call
+
+# NB: THIS TOOL SHOULD BE DEPRECATED ONCE NIST-Wrapper tool is stable and released. NIST-Wrapper tool already produces the necessary tabular file.
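+# Usage (as wired up in nistpdf_to_tabular.xml): nistpdf_to_tabular.py <input.pdf> <output.tab> <output_err>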
+
+def convert_pdftotext(filename, output_file):
+ '''
+ Converts PDF file to text
+ @param filename: PDF file to parse
+ @param output_file: output text file for the hits
+ '''
+
+ # "-layout" option in pdftotext call below: Maintain (as best as possible) the original physical layout of the text. The
+ # default is to 'undo' physical layout (columns, hyphenation, etc.) and output
+ # the text in reading order.
+ try:
+ call(["pdftotext", "-layout", filename, output_file])
+ except:
+ raise Exception("Error while trying to convert PDF to text")
+
+
+
+
+if __name__ == '__main__':
+ pdf_as_text = sys.argv[1]+".txt"
+ convert_pdftotext(sys.argv[1], pdf_as_text)
+ pdfread.convert_pdftotext2tabular(pdf_as_text, sys.argv[2], sys.argv[3], False)
diff -r 41f122255d14 -r 4393f982d18f rankfilter_GCMS/nistpdf_to_tabular.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilter_GCMS/nistpdf_to_tabular.xml Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,14 @@
+
+ Convert NIST text to tabular format
+ nistpdf_to_tabular.py $input $output $output_err
+
+
+
+
+
+
+
+
+ This tool converts NIST identification report output (PDF) to a tabular format needed for further use with the RIQC tools.
+
+
diff -r 41f122255d14 -r 4393f982d18f rankfilter_GCMS/pdftotabular.py
--- a/rankfilter_GCMS/pdftotabular.py Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,44 +0,0 @@
-"""
-Copyright (C) 2013, Pieter Lukasse, Plant Research International, Wageningen
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this software except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-
-"""
-
-import sys
-import pdfread
-from subprocess import check_call
-
-
-def convert_pdftotext(filename, output_file):
- '''
- Converts PDF file to text
- @param filename: PDF file to parse
- @param output_file: output text file for the hits
- '''
-
- # "-layout" option in pdftotext call below: Maintain (as best as possible) the original physical layout of the text. The
- # default is to 'undo' physical layout (columns, hyphenation, etc.) and output
- # the text in reading order.
- try:
- call(["pdftotext", "-layout", filename, output_file])
- except:
- raise Exception("Error while trying to convert PDF to text")
-
-
-
-
-if __name__ == '__main__':
- pdf_as_text = sys.argv[1]+".txt"
- convert_pdftotext(sys.argv[1], pdf_as_text)
- pdfread.convert_pdftotext2tabular(pdf_as_text, sys.argv[2], sys.argv[3], False)
diff -r 41f122255d14 -r 4393f982d18f rankfilter_GCMS/rankfilterGCMS_tabular.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilter_GCMS/rankfilterGCMS_tabular.xml Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,80 @@
+
+ Convert Retention Time to Retention Index
+ rankfilter.py $input_file
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ sample = ${sample}
+ calibration = ${calibration}
+ lib_data = ${lib_data}
+ window = ${window}
+ analysis_type = ${analysis_type}
+ tabular = True
+ onefile = ${onefile}
+ model = ${model}
+
+
+
+This tool estimates the experimental RI (RIexp) by building an RI(RT) function based on the
+given calibration file.
+
+It also determines the estimated RI (RIsvr) by looking up, for each entry of the given input file (Sample File),
+its respective RIsvr value in the given global lookup library, based on its CAS number
+(this step is also called the "RankFilter analysis" - see reference below; the Sample File may come from either NIST or AMDIS).
+This yields a prediction of the RI for
+a compound according to the "RankFilter procedure" (RIsvr).
+
+Output is a tab separated file in which four columns are added:
+
+ - **Rank** Calculated rank
+ - **RIexp** Experimental Retention Index (RI)
+ - **RIsvr** Calculated RI based on support vector regression (SVR)
+  - **%rel.err** Relative RI error (%rel.err = 100 * (RIsvr − RIexp) / RIexp)
+
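+For example (illustrative values): RIexp = 1180 and RIsvr = 1200 give %rel.err = 100 * (1200 − 1180) / 1180 ≈ 1.7.
+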
+.. class:: infomark
+
+**Notes**
+
+ - The layout of the Calibration file should include the following columns: 'MW', 'R.T.' and 'RI'.
+  - Selecting 'Polynomial' in the model parameter will calculate a 3rd degree polynomial model that will
+    be used to convert from retention time (RT) to retention index (RI).
+
+-----
+
+**References**
+
+  - **RankFilter**: Mihaleva et al. (2009) *Automated procedure for candidate compound selection in GC-MS
+    metabolomics based on prediction of Kovats retention index*. Bioinformatics, 25, pp. 787–794
+
+
diff -r 41f122255d14 -r 4393f982d18f rankfilter_GCMS/select_on_rank.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilter_GCMS/select_on_rank.py Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,21 @@
+import csv
+import sys
+
+__author__ = "Marcel Kempenaar"
+__contact__ = "brs@nbic.nl"
+__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
+__license__ = "MIT"
+
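+# Usage (as wired up in select_on_rank.xml): select_on_rank.py <input.tsv> <output.tsv> "<rank1,rank2,...>"
+# Keeps only the rows whose Rank column equals one of the given values.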
+in_file = sys.argv[1]
+out_file = sys.argv[2]
+to_select_list = [select.strip() for select in sys.argv[3].split(',') if len(select) > 0]
+
+data = list(csv.reader(open(in_file, 'rb'), delimiter='\t'))
+header = data.pop(0)
+header_clean = [i.lower().strip().replace(".", "").replace("%", "") for i in header]
+rank = header_clean.index("rank")
+
+writer = csv.writer(open(out_file, 'wb'), delimiter='\t')
+writer.writerow(header)
+for select in to_select_list:
+ writer.writerows([i for i in data if i[rank] == select])
diff -r 41f122255d14 -r 4393f982d18f rankfilter_GCMS/select_on_rank.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilter_GCMS/select_on_rank.xml Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,15 @@
+
+ Filter on the Rank field in the RankFilter output file
+ select_on_rank.py $input $output "$rank"
+
+
+
+
+
+
+
+
+This tool removes all entries with non-selected rank values from the input file (the supported input file is a RankFilter output file).
+
+
diff -r 41f122255d14 -r 4393f982d18f rankfilter_text2tabular.xml
--- a/rankfilter_text2tabular.xml Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,14 +0,0 @@
-
- Convert NIST text to tabular format
- rankfilter_GCMS/pdftotabular.py $input $output $output_err
-
-
-
-
-
-
-
-
- This tool converts NIST identification report output (PDF) to a tabular format needed for further use with the RIQC tools.
-
-
diff -r 41f122255d14 -r 4393f982d18f select_on_rank.py
--- a/select_on_rank.py Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,21 +0,0 @@
-import csv
-import sys
-
-__author__ = "Marcel Kempenaar"
-__contact__ = "brs@nbic.nl"
-__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
-__license__ = "MIT"
-
-in_file = sys.argv[1]
-out_file = sys.argv[2]
-to_select_list = [select.strip() for select in sys.argv[3].split(',') if len(select) > 0]
-
-data = list(csv.reader(open(in_file, 'rb'), delimiter='\t'))
-header = data.pop(0)
-header_clean = [i.lower().strip().replace(".", "").replace("%", "") for i in header]
-rank = header_clean.index("rank")
-
-writer = csv.writer(open(out_file, 'wb'), delimiter='\t')
-writer.writerow(header)
-for select in to_select_list:
- writer.writerows([i for i in data if i[rank] == select])
diff -r 41f122255d14 -r 4393f982d18f select_on_rank.xml
--- a/select_on_rank.xml Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,15 +0,0 @@
-
- Filter on the Rank field in the RankFilter output file
- select_on_rank.py $input $output "$rank"
-
-
-
-
-
-
-
-
-This tool removes all entries with non-selected rank values from the input file (the supported input file is a RankFilter output file).
-
-
diff -r 41f122255d14 -r 4393f982d18f static_resources/README.txt
--- a/static_resources/README.txt Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,3 +0,0 @@
-This folder and respective files should be deployed together with the following tools:
-
- - ../export_to_metexp_tabular.py
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f static_resources/elements_and_masses.tab
--- a/static_resources/elements_and_masses.tab Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,104 +0,0 @@
-Name Atomic number Chemical symbol Relative atomic mass
-Hydrogen 1 H 1.01
-Helium 2 He 4
-Lithium 3 Li 6.94
-Beryllium 4 Be 9.01
-Boron 5 B 10.81
-Carbon 6 C 12.01
-Nitrogen 7 N 14.01
-Oxygen 8 O 16
-Fluorine 9 F 19
-Neon 10 Ne 20.18
-Sodium 11 Na 22.99
-Magnesium 12 Mg 24.31
-Aluminum 13 Al 26.98
-Silicon 14 Si 28.09
-Phosphorus 15 P 30.98
-Sulfur 16 S 32.06
-Chlorine 17 Cl 35.45
-Argon 18 Ar 39.95
-Potassium 19 K 39.1
-Calcium 20 Ca 40.08
-Scandium 21 Sc 44.96
-Titanium 22 Ti 47.9
-Vanadium 23 V 50.94
-Chromium 24 Cr 52
-Manganese 25 Mn 54.94
-Iron 26 Fe 55.85
-Cobalt 27 Co 58.93
-Nickel 28 Ni 58.71
-Copper 29 Cu 63.54
-Zinc 30 Zn 65.37
-Gallium 31 Ga 69.72
-Germanium 32 Ge 72.59
-Arsenic 33 As 74.99
-Selenium 34 Se 78.96
-Bromine 35 Br 79.91
-Krypton 36 Kr 83.8
-Rubidium 37 Rb 85.47
-Strontium 38 Sr 87.62
-Yttrium 39 Y 88.91
-Zirconium 40 Zr 91.22
-Niobium 41 Nb 92.91
-Molybdenum 42 Mo 95.94
-Technetium 43 Tc 96.91
-Ruthenium 44 Ru 101.07
-Rhodium 45 Rh 102.9
-Palladium 46 Pd 106.4
-Silver 47 Ag 107.87
-Cadmium 48 Cd 112.4
-Indium 49 In 114.82
-Tin 50 Sn 118.69
-Antimony 51 Sb 121.75
-Tellurium 52 Te 127.6
-Iodine 53 I 126.9
-Xenon 54 Xe 131.3
-Cesium 55 Cs 132.9
-Barium 56 Ba 137.34
-Lanthanum 57 La 138.91
-Cerium 58 Ce 140.12
-Praseodymium 59 Pr 140.91
-Neodymium 60 Nd 144.24
-Promethium 61 Pm 144.91
-Samarium 62 Sm 150.35
-Europium 63 Eu 151.96
-Gadolinium 64 Gd 157.25
-Terbium 65 Tb 158.92
-Dysprosium 66 Dy 162.5
-Holmium 67 Ho 164.93
-Erbium 68 Er 167.26
-Thulium 69 Tm 168.93
-Ytterbium 70 Yb 173.04
-Lutetium 71 Lu 174.97
-Hafnium 72 Hf 178.49
-Tantalum 73 Ta 180.95
-Wolfram 74 W 183.85
-Rhenium 75 Re 186.2
-Osmium 76 Os 190.2
-Iridium 77 Ir 192.22
-Platinum 78 Pt 195.09
-Gold 79 Au 196.97
-Mercury 80 Hg 200.59
-Thallium 81 Tl 204.37
-Lead 82 Pb 207.19
-Bismuth 83 Bi 208.98
-Polonium 84 Po 208.98
-Astatine 85 At 209.99
-Radon 86 Rn 222.02
-Francium 87 Fr 223.02
-Radium 88 Ra 226
-Actinium 89 Ac 227.03
-Thorium 90 Th 232.04
-Protactinium 91 Pa 231.04
-Uranium 92 U 238.03
-Neptunium 93 Np 237
-Plutonium 94 Pu 242
-Americium 95 Am 243.06
-Curium 96 Cm 247.07
-Berkelium 97 Bk 247.07
-Californium 98 Cf 251.08
-Einsteinium 99 Es 254.09
-Fermium 100 Fm 257.1
-Mendelevium 101 Md 257.1
-Nobelium 102 No 255.09
-Lawrencium 103 Lr 256.1
diff -r 41f122255d14 -r 4393f982d18f tool_dependencies.xml
--- a/tool_dependencies.xml Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,13 +0,0 @@
-
-
-
-
-
-
-
-        This dependency:
-        Ensures that the R 3.1.1 installation is triggered (via dependency).
-        Ensures that Bioconductor 3.0 and the packages metaMS, multtest and snow are installed.
-
-
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f xcms_differential_analysis.r
--- a/xcms_differential_analysis.r Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,85 +0,0 @@
-## read args:
-args <- commandArgs(TRUE)
-#cat("args <- \"\"\n")
-## a xcms xset saved as .RData
-args.xsetData <- args[1]
-#cat(paste("args.xsetData <- \"", args[1], "\"\n", sep=""))
-
-args.class1 <- args[2]
-args.class2 <- args[3]
-#cat(paste("args.class1 <- \"", args[2], "\"\n", sep=""))
-#cat(paste("args.class2 <- \"", args[3], "\"\n", sep=""))
-
-args.topcount <- strtoi(args[4])
-#cat(paste("args.topcount <- ", args[4], "\n", sep=""))
-
-args.outTable <- args[5]
-
-## report files
-args.htmlReportFile <- args[6]
-args.htmlReportFile.files_path <- args[7]
-#cat(paste("args.htmlReportFile <- \"", args[6], "\"\n", sep=""))
-#cat(paste("args.htmlReportFile.files_path <- \"", args[7], "\"\n", sep=""))
-
-
-if (length(args) == 8)
-{
- args.outLogFile <- args[8]
- # suppress messages:
- # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888
- msg <- file(args.outLogFile, open="wt")
- sink(msg, type="message")
- sink(msg, type="output")
-}
-
-tryCatch(
- {
- library(metaMS)
- library(xcms)
- #library("R2HTML")
-
- # load the xset data :
- xsetData <- readRDS(args.xsetData)
- # if here to support both scenarios:
- if ("xcmsSet" %in% slotNames(xsetData) )
- {
- xsetData <- xsetData@xcmsSet
- }
-
-
- # info: levels(xcmsSet@phenoData$class) also gives access to the class names
- dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE)
- # set cairo as default for png plots:
- png = function (...) grDevices::png(...,type='cairo')
- # run diffreport
- reporttab <- diffreport(xsetData, args.class1, args.class2, paste(args.htmlReportFile.files_path,"/fig", sep=""), args.topcount, metlin = 0.15, h=480, w=640)
-
- # write out tsv table:
- write.table(reporttab, args.outTable, sep="\t", row.names=FALSE)
-
- message("\nGenerating report.........")
-
- cat("
Differential analysis report
", file= args.htmlReportFile)
- #HTML(reporttab[1:args.topcount,], file= args.htmlReportFile)
- figuresPath <- paste(args.htmlReportFile.files_path, "/fig_eic", sep="")
- message(figuresPath)
- listOfFiles <- list.files(path = figuresPath)
- for (i in 1:length(listOfFiles))
- {
- figureName <- listOfFiles[i]
- # maybe we still need to copy the figures to the args.htmlReportFile.files_path
- cat(paste("", sep=""), file= args.htmlReportFile, append=TRUE)
- cat(paste("", sep=""), file= args.htmlReportFile, append=TRUE)
- }
-
- message("finished generating report")
- cat("\nWarnings================:\n")
- str( warnings() )
- },
- error=function(cond) {
- sink(NULL, type="message") # default setting
- sink(stderr(), type="output")
- message("\nERROR: ===========\n")
- print(cond)
- }
- )
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f xcms_differential_analysis.xml
--- a/xcms_differential_analysis.xml Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,55 +0,0 @@
-
-    Runs the xcms diffreport function for differential analysis
-
- R_bioc_metams
-
-
- xcms_differential_analysis.r
- $xsetData
- "$class1"
- "$class2"
- $topcount
- $outTable
- $htmlReportFile
- $htmlReportFile.files_path
- $outLogFile
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.. class:: infomark
-
-Runs the xcms diffreport function to show the most significant differences between two sets/classes of samples. This tool also creates extracted ion chromatograms (EICs) for
-the most significant differences. The figure below shows an example of such an EIC.
-
-.. image:: $PATH_TO_IMAGES/diffreport.png
-
-
-
-
-
-
- 10.1021/ac051437y
-
-
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f xcms_get_alignment_eic.r
--- a/xcms_get_alignment_eic.r Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,153 +0,0 @@
-# shows all alignment results in an RT region
-## read args:
-args <- commandArgs(TRUE)
-# xset data:
-args.xsetData <- args[1]
-
-args.rtStart <- strtoi(args[2])
-args.rtEnd <- strtoi(args[3])
-
-# limit max diff to 600 and minNrSamples to at least 2 :
-if (args.rtEnd - args.rtStart > 600)
- stop("The RT window should be <= 600")
-
-args.minNrSamples <- strtoi(args[4]) #min nr samples
-if (args.minNrSamples < 2)
- stop("Set 'Minimum number of samples' to 2 or higher")
-
-
-args.sampleNames <- strsplit(args[5], ",")[[1]]
-# trim leading and trailing spaces:
-args.sampleNames <- gsub("^\\s+|\\s+$", "", args.sampleNames)
-
-## report files
-args.htmlReportFile <- args[6]
-args.htmlReportFile.files_path <- args[7]
-
-
-if (length(args) == 8)
-{
- args.outLogFile <- args[8]
- # suppress messages:
- # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888
- msg <- file(args.outLogFile, open="wt")
- sink(msg, type="message")
- sink(msg, type="output")
-}
-
-
-
-tryCatch(
- {
- library(metaMS)
-
- # load the xset data :
- xsetData <- readRDS(args.xsetData)
- # if here to support both scenarios:
- if ("xcmsSet" %in% slotNames(xsetData) )
- {
- xsetData <- xsetData@xcmsSet
- }
-
- # report
- dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE)
- message(paste("\nGenerating report.........in ", args.htmlReportFile.files_path))
-
- write(paste("
Extracted Ion Chromatograms (EIC) of alignments with peaks in ",args.minNrSamples, " or more samples
"),
- file=args.htmlReportFile)
-
- gt <- groups(xsetData)
- message("\nParsed groups... ")
-    groupidx1 <- which(gt[,"rtmed"] > args.rtStart & gt[,"rtmed"] < args.rtEnd & gt[,"npeaks"] >= args.minNrSamples) # TODO: this should be restricted to the selected samples only
-
- if (length(groupidx1) > 0)
- {
- message("\nGetting EIC... ")
- eiccor <- getEIC(xsetData, groupidx = c(groupidx1), sampleidx=args.sampleNames)
- #eicraw <- getEIC(xsetData, groupidx = c(groupidx1), sampleidx=args.sampleNames, rt = "raw")
-
- #sampleNamesIdx <- which(sampnames(LC$xset@xcmsSet) %in% args.sampleNames, arr.ind = TRUE)
- #or (from bioconductor code for getEIC): NB: this is assuming sample indexes used in data are ordered after the order of sample names!!
- sampNames <- sampnames(xsetData)
- sampleNamesIdx <- match( args.sampleNames, sampNames)
- message(paste("Samples: ", sampleNamesIdx))
-
- #TODO html <- paste(html, "
")
- message(paste("\nPlotting figures... "))
-
- #get the mz list (interestingly, this [,"mz"] is relatively slow):
- peakMzList <- xsetData@peaks[,"mz"]
- peakSampleList <- xsetData@peaks[,"sample"]
- #signal to noise list:
- peakSnList <- xsetData@peaks[,"sn"]
-
- message(paste("Total nr of peaks: ", length(peakMzList)))
-
- for (i in 1:length(groupidx1))
- {
- groupMembers <- xsetData@groupidx[[groupidx1[i]]]
-
- groupMzList <- ""
- groupSampleList <- ""
- finalGroupSize <- 0
-
- for (j in 1:length(groupMembers))
- {
- #get only the peaks from the selected samples:
- memberSample <- peakSampleList[groupMembers[j]]
- memberSn <- peakSnList[groupMembers[j]]
- if (!is.na(memberSn) && memberSample %in% sampleNamesIdx)
- {
- message(paste("Group: ", groupidx1[i], " / Member sample: ", memberSample))
- memberMz <- peakMzList[groupMembers[j]]
-
-
- if (finalGroupSize == 0)
- {
- groupMzList <- memberMz
- groupSampleList <- sampNames[memberSample]
- } else {
- groupMzList <- paste(groupMzList,",",memberMz, sep="")
- groupSampleList <- paste(groupSampleList,",",sampNames[memberSample], sep="")
- }
-
- finalGroupSize <- finalGroupSize +1
- }
- }
- # here we do the real check on group size and only the groups that have enough
- # members with signal to noise > 0 will be plotted here:
- if (finalGroupSize >= args.minNrSamples)
- {
- message(paste("Plotting figure ",i, " of max ", length(groupidx1)," figures... "))
-
- figureName <- paste(args.htmlReportFile.files_path, "/figure", i,".png", sep="")
- write(paste("", sep="") ,
- file=args.htmlReportFile, append=TRUE)
-
- png( figureName, type="cairo", width=800 )
- plot(eiccor, xsetData, groupidx = i)
- devname = dev.off()
-
- write(paste("
Alignment id: [", groupidx1[i], "]. m/z list of peaks in alignment with signal/noise <> NA: ", groupMzList,"
m/z values found in the following samples respectively: ", groupSampleList,"
", sep="") ,
- file=args.htmlReportFile, append=TRUE)
- }
- }
-
- }
-
- write("",
- file=args.htmlReportFile, append=TRUE)
- message("finished generating report")
- # unlink(args.htmlReportFile)
- cat("\nWarnings================:\n")
- str( warnings() )
- },
- error=function(cond) {
- sink(NULL, type="message") # default setting
- sink(stderr(), type="output")
- message("\nERROR: ===========\n")
- print(cond)
- }
- )
diff -r 41f122255d14 -r 4393f982d18f xcms_get_alignment_eic.xml
--- a/xcms_get_alignment_eic.xml Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,73 +0,0 @@
-
- Extracts alignment EICs from feature alignment data
-
- R_bioc_metams
-
-
- xcms_get_alignment_eic.r
- $xsetData
- $rtStart
- $rtEnd
- $minNrSamples
- "$sampleNames"
- $htmlReportFile
- $htmlReportFile.files_path
- $outLogFile
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.. class:: infomark
-
-This tool finds the alignments in the specified RT window and extracts alignment EICs from feature alignment data using XCMS's getEIC method.
-It produces an HTML report showing the EIC plots and the mass list of each alignment. The figure below shows an example of such an EIC plot, also showing the difference between
-two classes, with extra alignment information beneath it.
-
-.. image:: $PATH_TO_IMAGES/diffreport.png
-
-Alignment id: 1709. m/z list of peaks in alignment:
-614.002922098482,613.998019830021,614.000382307519,613.998229980469,613.998229980469
-
-
-
-
- 10.1021/ac051437y
-
-
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f xcms_get_mass_eic.r
--- a/xcms_get_mass_eic.r Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,162 +0,0 @@
-## read args:
-args <- commandArgs(TRUE)
-# xset data:
-args.xsetData <- args[1]
-
-args.rtStart <- strtoi(args[2])
-args.rtEnd <- strtoi(args[3])
-
-args.mzStart <- as.double(args[4])
-args.mzEnd <- as.double(args[5])
-# there are 2 options: specify a mz range or a mz list:
-if (args.mzStart < 0)
-{
- args.mzList <- as.double(strsplit(args[6], ",")[[1]])
- cat(typeof(as.double(strsplit(args[6], ",")[[1]])))
- args.mzTolPpm <- as.double(args[7])
- # calculate mzends based on ppm tol:
- mzListEnd <- c()
- mzListStart <- c()
- for (i in 1:length(args.mzList))
- {
- mzEnd <- args.mzList[i] + args.mzList[i]*args.mzTolPpm/1000000.0
- mzStart <- args.mzList[i] - args.mzList[i]*args.mzTolPpm/1000000.0
- mzListEnd <- c(mzListEnd, mzEnd)
- mzListStart <- c(mzListStart, mzStart)
- }
- str(mzListStart)
- str(mzListEnd)
-} else {
- mzListEnd <- c(args.mzEnd)
- mzListStart <- c(args.mzStart)
-}
-
-args.sampleNames <- strsplit(args[8], ",")[[1]]
-# trim leading and trailing spaces:
-args.sampleNames <- gsub("^\\s+|\\s+$", "", args.sampleNames)
-
-args.combineSamples <- args[9]
-args.rtPlotMode <- args[10]
-
-## report files
-args.htmlReportFile <- args[11]
-args.htmlReportFile.files_path <- args[12]
-
-
-if (length(args) == 13)
-{
- args.outLogFile <- args[13]
- # suppress messages:
- # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888
- msg <- file(args.outLogFile, open="wt")
- sink(msg, type="message")
- sink(msg, type="output")
-}
-
-# TODO - add option to plot masses in the same plot (if given on the same line, or similar) or in separate plots
-# TODO2 - let it run in parallel
-
-tryCatch(
- {
- library(metaMS)
-
- # load the xset data :
- xsetData <- readRDS(args.xsetData)
- # if here to support both scenarios:
- if ("xcmsSet" %in% slotNames(xsetData) )
- {
- xsetData <- xsetData@xcmsSet
- }
-
- # report
- dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE)
- message(paste("\nGenerating report.........in ", args.htmlReportFile.files_path))
-
- html <- "
Extracted Ion Chromatograms (EIC) matching criteria
"
-
- if (args.combineSamples == "No")
- {
- if (length(args.sampleNames) > 1 && length(mzListStart) > 1 && length(args.sampleNames) != length(mzListStart))
- stop(paste("The number of sample names should match the number of m/z values in the list. Found ", length(mzListStart),
- " masses while ", length(args.sampleNames), " sample names were given."))
-
- iterSize <- length(args.sampleNames)
-        # trick: a multiplier of 1 iterates over the items, while 0 keeps reusing the
-        # first item (used when the respective list has length 1):
- fixSampleIdx <- 1
- fixMzListIdx <- 1
- if (length(args.sampleNames) == 1)
- {
- fixSampleIdx <- 0
- iterSize <- length(mzListStart)
- }
- if (length(mzListStart) == 1)
- {
- fixMzListIdx <- 0
- }
- lineColors <- rainbow(iterSize)
- for (i in 0:(iterSize-1))
- {
- message("\nGetting EIC... ")
- eiccor <- getEIC(xsetData,
- mzrange=matrix(c(mzListStart[i*fixMzListIdx+1],mzListEnd[i*fixMzListIdx+1]),nrow=1,ncol=2,byrow=TRUE),
- rtrange=matrix(c(args.rtStart,args.rtEnd),nrow=1,ncol=2,byrow=TRUE),
- sampleidx=c(args.sampleNames[i*fixSampleIdx+1]), rt=args.rtPlotMode)
-
- message("\nPlotting figures... ")
- figureName <- paste(args.htmlReportFile.files_path, "/figure", i,".png", sep="")
- html <- paste(html," ", sep="")
- png( figureName, type="cairo", width=1100,height=250 )
- #plot(eiccor, col=lineColors[i+1])
- # black is better in this case:
- plot(eiccor)
- legend('topright', # places a legend at the appropriate place
- legend=c(args.sampleNames[i*fixSampleIdx+1]), # puts text in the legend
- lty=c(1,1), # gives the legend appropriate symbols (lines)
- lwd=c(2.5,2.5))
-
- devname = dev.off()
- }
-
- } else {
- for (i in 1:length(mzListStart))
- {
- message("\nGetting EIC... ")
- eiccor <- getEIC(xsetData,
- mzrange=matrix(c(mzListStart[i],mzListEnd[i]),nrow=1,ncol=2,byrow=TRUE),
- rtrange=matrix(c(args.rtStart,args.rtEnd),nrow=1,ncol=2,byrow=TRUE),
- sampleidx=args.sampleNames, rt = args.rtPlotMode)
-
- #set size, set option (plot per sample, plot per mass)
-
- message("\nPlotting figures... ")
- figureName <- paste(args.htmlReportFile.files_path, "/figure", i,".png", sep="")
- html <- paste(html,"", sep="")
- png( figureName, type="cairo", width=1100,height=450 )
- lineColors <- rainbow(length(args.sampleNames))
- plot(eiccor, col=lineColors)
- legend('topright', # places a legend at the appropriate place
- legend=args.sampleNames, # puts text in the legend
- lty=c(1,1), # gives the legend appropriate symbols (lines)
- lwd=c(2.5,2.5),
- col=lineColors)
- devname = dev.off()
- }
- }
- if (args.rtPlotMode == "corrected")
- {
- html <- paste(html,"
*rt values are corrected ones
")
- }
- html <- paste(html,"")
- message("finished generating report")
- write(html,file=args.htmlReportFile)
- # unlink(args.htmlReportFile)
- cat("\nWarnings================:\n")
- str( warnings() )
- },
- error=function(cond) {
- sink(NULL, type="message") # default setting
- sink(stderr(), type="output")
- message("\nERROR: ===========\n")
- print(cond)
- }
- )
diff -r 41f122255d14 -r 4393f982d18f xcms_get_mass_eic.xml
--- a/xcms_get_mass_eic.xml Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,117 +0,0 @@
-
- Extracts EICs for a given list of masses
-
- R_bioc_metams
-
-
- xcms_get_mass_eic.r
- $xsetData
- $rtStart
- $rtEnd
- #if $massParameters.massParametersType == "window"
- $massParameters.mzStart
- $massParameters.mzEnd
- -1
- "."
- #else
- -1
- -1
- "$massParameters.mzList"
- $massParameters.mzTolPpm
- #end if
- "$sampleNames"
- $combineSamples
- $rtPlotMode
- $htmlReportFile
- $htmlReportFile.files_path
- $outLogFile
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-.. class:: infomark
-
-This tool will plot EICs for a given list of masses (or a mass window) using XCMS's getEIC method.
-It produces an HTML report showing the EIC plots, one for each given mass. The figure below shows an example of such an EIC plot.
-
-.. image:: $PATH_TO_IMAGES/massEIC.png
-
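-For example, with a mass tolerance of 5 ppm, a mass of 500.0 is searched in the window 500.0 ± 500.0 * 5 / 1,000,000, i.e. from 499.9975 to 500.0025 (mirroring the ppm window computed in xcms_get_mass_eic.r).
-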
-
-
-
- 10.1021/ac051437y
-
-
\ No newline at end of file