# HG changeset patch
# User pieter.lukasse@wur.nl
# Date 1426764143 -3600
# Node ID 4393f982d18f711ea89526a0070717c5f63b9742
# Parent 41f122255d149b7db2a8c0bee8f84f8251535db1
reorganized sources

diff -r 41f122255d14 -r 4393f982d18f GCMS/__init__.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/GCMS/__init__.py Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,6 @@
+'''
+Module containing Galaxy tools for the GC/MS pipeline
+Created on Mar 6, 2012
+
+@author: marcelk
+'''
diff -r 41f122255d14 -r 4393f982d18f GCMS/combine_output.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/GCMS/combine_output.py Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,253 @@
+#!/usr/bin/env python
+# encoding: utf-8
+'''
+Module to combine output from two GCMS Galaxy tools (RankFilter and CasLookup)
+'''
+
+import csv
+import re
+import sys
+import math
+import pprint
+
+__author__ = "Marcel Kempenaar"
+__contact__ = "brs@nbic.nl"
+__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
+__license__ = "MIT"
+
+def _process_data(in_csv):
+    '''
+    Generic method to parse a tab-separated file returning a dictionary with named columns
+    @param in_csv: input filename to be parsed
+    '''
+    data = list(csv.reader(open(in_csv, 'rU'), delimiter='\t'))
+    header = data.pop(0)
+    # Create dictionary with column name as key
+    output = {}
+    for index in xrange(len(header)):
+        output[header[index]] = [row[index] for row in data]
+    return output
+
+
+def _merge_data(rankfilter, caslookup):
+    '''
+    Merges data from both input dictionaries based on the Centrotype field. This method will
+    build up a new list containing the merged hits as the items.
+    @param rankfilter: dictionary holding RankFilter output in the form of N lists (one list per attribute name)
+    @param caslookup: dictionary holding CasLookup output in the form of N lists (one list per attribute name)
+    '''
+    # TODO: test for correct input files -> rankfilter and caslookup internal lists should have the same lengths:
+    if (len(rankfilter['ID']) != len(caslookup['Centrotype'])):
+        raise Exception('rankfilter and caslookup files should have the same nr of rows/records ')
+
+    merged = []
+    processed = {}
+    for compound_id_idx in xrange(len(rankfilter['ID'])):
+        compound_id = rankfilter['ID'][compound_id_idx]
+        if compound_id not in processed:
+            # keep track of processed items to not repeat them
+            processed[compound_id] = compound_id
+            # get centrotype nr
+            centrotype = compound_id.split('-')[0]
+            # Get the indices for current compound ID in both data-structures for proper matching
+            rindex = [index for index, value in enumerate(rankfilter['ID']) if value == compound_id]
+            cindex = [index for index, value in enumerate(caslookup['Centrotype']) if value == centrotype]
+
+            merged_hits = []
+            # Combine hits
+            for hit in xrange(len(rindex)):
+                # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do
+                # is create a new "dict" item with the same "keys"/attributes, with each attribute filled with its
+                # corresponding value in the rankfilter or caslookup tables; i.e.
+                # rankfilter[key] => returns the list/array of size nr_rows with the values for the attribute
+                #                    represented by "key".
+                # rindex[hit]     => points to the row nr=hit (hit is a row nr/index)
+                rf_record = dict(zip(rankfilter.keys(), [rankfilter[key][rindex[hit]] for key in rankfilter.keys()]))
+                cl_record = dict(zip(caslookup.keys(), [caslookup[key][cindex[hit]] for key in caslookup.keys()]))
+
+                merged_hit = _add_hit(rf_record, cl_record)
+                merged_hits.append(merged_hit)
+
+            merged.append(merged_hits)
+
+    return merged, len(rindex)
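_process_data returns a column-oriented dictionary: each header name maps to the full list of that column's values. A minimal usage sketch (file contents hypothetical, Python 2 as used throughout this changeset)::

    # rankfilter.txt (tab-separated):
    # ID      R.T.
    # 12-1    5.3
    # 12-2    5.4
    rankfilter = _process_data('rankfilter.txt')
    print rankfilter['ID']    # ['12-1', '12-2']
    print rankfilter['R.T.']  # ['5.3', '5.4']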
+
+
+def _add_hit(rankfilter, caslookup):
+    '''
+    Combines single records from both the RankFilter- and CasLookup-tools
+    @param rankfilter: record (dictionary) of one compound in the RankFilter output
+    @param caslookup: matching record (dictionary) of one compound in the CasLookup output
+    '''
+    # The ID in the RankFilter output contains the following 5 fields:
+    rf_id = rankfilter['ID'].split('-')
+    try:
+        name, formula = _remove_formula(rankfilter['Name'])
+        hit = [rf_id[0], # Centrotype
+               rf_id[1], # cent.Factor
+               rf_id[2], # scan nr
+               rf_id[3], # R.T. (umin)
+               rf_id[4], # nr. Peaks
+               # Appending other fields
+               rankfilter['R.T.'],
+               name,
+               caslookup['FORMULA'] if not formula else formula,
+               rankfilter['Library'].strip(),
+               rankfilter['CAS'].strip(),
+               rankfilter['Forward'],
+               rankfilter['Reverse'],
+               ((float(rankfilter['Forward']) + float(rankfilter['Reverse'])) / 2),
+               rankfilter['RIexp'],
+               caslookup['RI'],
+               rankfilter['RIsvr'],
+               # Calculate absolute differences
+               math.fabs(float(rankfilter['RIexp']) - float(rankfilter['RIsvr'])),
+               math.fabs(float(caslookup['RI']) - float(rankfilter['RIexp'])),
+               caslookup['Regression.Column.Name'],
+               caslookup['min'],
+               caslookup['max'],
+               caslookup['nr.duplicates'],
+               caslookup['Column.phase.type'],
+               caslookup['Column.name'],
+               rankfilter['Rank'],
+               rankfilter['%rel.err'],
+               rankfilter['Synonyms']]
+    except KeyError as error:
+        print "Problem reading in data from input file(s):\n",
+        print "Respective CasLookup entry: \n", pprint.pprint(caslookup), "\n"
+        print "Respective RankFilter entry: \n", pprint.pprint(rankfilter), "\n"
+        raise error
+
+    return hit
+
+
+def _remove_formula(name):
+    '''
+    The RankFilter Name field often contains the Formula as well; this function removes it from the Name
+    @param name: complete name of the compound from the RankFilter output
+    '''
+    name = name.split()
+    poss_formula = name[-1]
+    match = re.match("^(([A-Z][a-z]{0,2})(\d*))+$", poss_formula)
+    if match:
+        return ' '.join(name[:-1]), poss_formula
+    else:
+        return ' '.join(name), False
+
+
+def _get_default_caslookup():
+    '''
+    The CasLookup tool might not have found all compounds in the library searched;
+    this default dict will be used to combine with the RankFilter output
+    '''
+    return {'FORMULA': 'N/A',
+            'RI': '0.0',
+            'Regression.Column.Name': 'None',
+            'min': '0.0',
+            'max': '0.0',
+            'nr.duplicates': '0',
+            'Column.phase.type': 'N/A',
+            'Column.name': 'N/A'}
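_remove_formula is purely pattern-based: the last whitespace-separated token of the Name is treated as a formula when it matches runs of capitalized element-like symbols with optional counts. An illustration (compound names hypothetical)::

    print _remove_formula('Hexanoic acid C6H12O2')  # ('Hexanoic acid', 'C6H12O2')
    print _remove_formula('Hexanoic acid')          # ('Hexanoic acid', False)
    # caveat: an all-caps word such as 'UNKNOWN' also matches the pattern
    # and would be stripped as if it were a formula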
+
+
+def _save_data(data, nhits, out_csv_single, out_csv_multi):
+    '''
+    Writes tab-separated data to file
+    @param data: dictionary containing merged dataset
+    @param out_csv_single: output csv file with one hit per line
+    @param out_csv_multi: output csv file with all hits for a centrotype on one line
+    '''
+    # Columns we don't repeat:
+    header_part1 = ['Centrotype',
+                    'cent.Factor',
+                    'scan nr.',
+                    'R.T. (umin)',
+                    'nr. Peaks',
+                    'R.T.']
+    # These are the headers/columns we repeat in case of
+    # combining hits in one line (see alternative_headers method below):
+    header_part2 = ['Name',
+                    'FORMULA',
+                    'Library',
+                    'CAS',
+                    'Forward',
+                    'Reverse',
+                    'Avg. (Forward, Reverse)',
+                    'RIexp',
+                    'RI',
+                    'RIsvr',
+                    'RIexp - RIsvr',
+                    'RI - RIexp',
+                    'Regression.Column.Name',
+                    'min',
+                    'max',
+                    'nr.duplicates',
+                    'Column.phase.type',
+                    'Column.name',
+                    'Rank',
+                    '%rel.err',
+                    'Synonyms']
+
+    # Open output files for writing
+    outfile_single_handle = open(out_csv_single, 'wb')
+    outfile_multi_handle = open(out_csv_multi, 'wb')
+    output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
+    output_multi_handle = csv.writer(outfile_multi_handle, delimiter="\t")
+
+    # Write headers
+    output_single_handle.writerow(header_part1 + header_part2)
+    output_multi_handle.writerow(header_part1 + header_part2 + alternative_headers(header_part2, nhits - 1))
+    # Combine all hits for each centrotype into one line
+    line = []
+    for centrotype_idx in xrange(len(data)):
+        i = 0
+        for hit in data[centrotype_idx]:
+            if i == 0:
+                line.extend(hit)
+            else:
+                line.extend(hit[6:])
+            i = i + 1
+        # small validation (if error, it is a programming error):
+        if i > nhits:
+            raise Exception('Error: more hits than expected for centrotype_idx ' + str(centrotype_idx))
+        output_multi_handle.writerow(line)
+        line = []
+
+    # Write one line for each centrotype
+    for centrotype_idx in xrange(len(data)):
+        for hit in data[centrotype_idx]:
+            output_single_handle.writerow(hit)
+
+def alternative_headers(header_part2, nr_alternative_hits):
+    '''
+    This method will iterate over the header names and prefix each with the string 'ALT#_',
+    where # is the number of the alternative, according to the number of alternative hits
+    we want to add to the final csv/tsv
+    '''
+    result = []
+    for i in xrange(nr_alternative_hits):
+        for header_name in header_part2:
+            result.append("ALT" + str(i + 1) + "_" + header_name)
+    return result
+
+def main():
+    '''
+    Combine Output main function
+    It will merge the result files from "RankFilter" and "Lookup RI for CAS numbers"
+    NB: the caslookup_result_file will typically have fewer lines than
+    rankfilter_result_file, so the merge has to consider this as well. The final file
+    should have the same nr of lines as rankfilter_result_file.
+    '''
+    rankfilter_result_file = sys.argv[1]
+    caslookup_result_file = sys.argv[2]
+    output_single_csv = sys.argv[3]
+    output_multi_csv = sys.argv[4]
+
+    # Read RankFilter and CasLookup output files
+    rankfilter = _process_data(rankfilter_result_file)
+    caslookup = _process_data(caslookup_result_file)
+    merged, nhits = _merge_data(rankfilter, caslookup)
+    _save_data(merged, nhits, output_single_csv, output_multi_csv)
+
+
+if __name__ == '__main__':
+    main()
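To make the Single/Multi layouts concrete: with nhits = 2, the multi-file header is header_part1 + header_part2 + alternative_headers(header_part2, 1), so the second hit's columns reappear with an 'ALT1_' prefix. A small sketch::

    print alternative_headers(['Name', 'CAS'], 2)
    # ['ALT1_Name', 'ALT1_CAS', 'ALT2_Name', 'ALT2_CAS']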
diff -r 41f122255d14 -r 4393f982d18f GCMS/combine_output.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/GCMS/combine_output.xml Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,36 @@
+
+  Perform a combination of output data from the RankFilter and CasLookup tools
+
+  combine_output.py $rankfilter_in $caslookup_in $out_single $out_multi
+
+
+
+
+
+
+
+
+
+
+
+
+Performs a combination of output files from the 'RankFilter' and 'Lookup RI for CAS' tools into two tab-separated files.
+
+The files produced contain either all hits for a compound on a single line (Single) or on separate lines
+(Multi).
+
+.. class:: infomark
+
+**Notes**
+
+The input data should be produced by the RankFilter and 'Lookup RI for CAS' tools provided on this Galaxy server, with the
+original headers kept intact. Processing steps include:
+
+ - Columns are added showing the average Forward/Reverse values, and the RIexp - RIsvr and RI - RIexp values
+ - The ID column of the RankFilter tool output is split into 'Centrotype', 'cent.Factor', 'scan nr.', 'R.T. (umin)'
+   and 'nr. Peaks' fields.
+ - The formula is split off the 'Name' field in the RankFilter output
+
+
diff -r 41f122255d14 -r 4393f982d18f GCMS/create_model.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/GCMS/create_model.xml Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,78 @@
+
+  Generate coefficients to enable the regression from one GC-column
+  to another GC-column
+  Rscripts/ridb-regression.R
+  $ridb
+  $out_model
+  $out_log
+  $min_residuals
+  $range_mod
+  $pvalue
+  $rsquared
+  $method
+  $plot
+  #if $plot
+  $model_graphics
+  #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ (plot)
+
+
+
+
+
+Calculates regression models for a permutation of all GC columns contained in the selected
+RI database file. The method used for creating the model is either a 3rd-degree
+polynomial or a standard linear model.
+
+The *Minimum number of residuals* option will only allow regression if the columns it is based
+on have at least that number of data points for the same compound.
+
+Filtering is possible by setting an upper limit for the *p-value* and/or a lower limit for
+the *R squared* value. The produced logfile will state how many models have been discarded due
+to this filtering. The output model file also includes the p-value and R squared value for
+each created model.
+
+Graphical output of the models is available by selecting the plot option, which shows the
+data points used for the model as well as the fit itself and the range of data that will
+be usable.
+
+.. class:: infomark
+
+**Notes**
+
+The output file produced by this tool is required as input for the CasLookup tool when
+selecting to apply regression when finding hits in the RIDB.
+
+
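The actual model fitting lives in Rscripts/ridb-regression.R, which is not part of this changeset; as a reviewer aid, the gist of a 3rd-degree polynomial fit between two columns' RI values can be sketched in a few lines of Python with numpy (a listed dependency). All values and variable names below are illustrative only::

    import numpy as np

    # hypothetical paired RI observations of the same compounds on two GC columns
    ri_col_a = np.array([1100.0, 1250.0, 1400.0, 1600.0, 1850.0])
    ri_col_b = np.array([1080.0, 1240.0, 1405.0, 1620.0, 1870.0])

    # 3rd-degree polynomial fit; polyfit returns [c3, c2, c1, c0]
    coeffs = np.polyfit(ri_col_a, ri_col_b, 3)
    predicted = np.polyval(coeffs, ri_col_a)
    residuals = ri_col_b - predicted
    # R-squared, the kind of figure this tool uses for p-value/R-squared filtering
    r_squared = 1 - (residuals ** 2).sum() / ((ri_col_b - ri_col_b.mean()) ** 2).sum()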
diff -r 41f122255d14 -r 4393f982d18f GCMS/library_lookup.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/GCMS/library_lookup.py Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,327 @@
+'''
+Logic for searching a Retention Index database file given output from NIST
+'''
+import match_library
+import re
+import sys
+import csv
+
+__author__ = "Marcel Kempenaar"
+__contact__ = "brs@nbic.nl"
+__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
+__license__ = "MIT"
+
+def create_lookup_table(library_file, column_type_name, statphase):
+    '''
+    Creates a dictionary holding the contents of the library to be searched
+    @param library_file: library to read
+    @param column_type_name: the column's type name
+    @param statphase: the column's stationary phase
+    '''
+    (data, header) = match_library.read_library(library_file)
+    # Test for presence of required columns
+    if ('columntype' not in header or
+        'columnphasetype' not in header or
+        'cas' not in header):
+        raise IOError('Missing columns in ', library_file)
+
+    column_type_column = header.index("columntype")
+    statphase_column = header.index("columnphasetype")
+    cas_column = header.index("cas")
+
+    filtered_library = [line for line in data if line[column_type_column] == column_type_name
+                        and line[statphase_column] == statphase]
+    lookup_dict = {}
+    for element in filtered_library:
+        # Here the cas_number is set to the numeric part of the cas_column value, so if the
+        # cas_column value is 'C1433' then cas_number will be '1433'
+        cas_number = str(re.findall(r'\d+', (element[cas_column]).strip())[0])
+        try:
+            lookup_dict[cas_number].append(element)
+        except KeyError:
+            lookup_dict[cas_number] = [element]
+    return lookup_dict
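The CAS normalization above keeps only the first run of digits, so library keys and NIST CAS values can be compared in one canonical form. An illustration (input values hypothetical)::

    import re
    print re.findall(r'\d+', 'C1433')[0]      # '1433'
    print re.findall(r'\d+', ' 64-17-5 ')[0]  # '64' -- only the FIRST digit run

Note the second case: a hyphenated CAS string would be truncated to its first segment, which is presumably why format_result() below strips hyphens ('64-17-5' -> '64175') before doing the lookup.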
+
+
+def _preferred(hits, pref, ctype, polar, model, method):
+    '''
+    Returns all entries in the lookup_dict that have the same column name, type and polarity
+    as given by the user; uses regression if selected, given the model and method to use. The
+    regression is applied on the column with the best R-squared value in the model
+    @param hits: all entries in the lookup_dict for the given CAS number
+    @param pref: preferred GC-column, can be one or more names
+    @param ctype: column type (capillary etc.)
+    @param polar: polarity (polar / non-polar etc.)
+    @param model: data loaded from file containing regression models
+    @param method: supported regression method (i.e. poly(nomial) or linear)
+    '''
+    match = []
+    for column in pref:
+        for hit in hits:
+            if hit[4] == ctype and hit[5] == polar and hit[6] == column:
+                # Create copy of found hit since it will be altered downstream
+                match.extend(hit)
+                return match, False
+
+    # No hit found for current CAS number, return if not performing regression
+    if not model:
+        return False, False
+
+    # Perform regression
+    for column in pref:
+        if column not in model:
+            break
+        # Order regression candidates by R-squared value (last element)
+        order = sorted(model[column].items(), key=lambda col: col[1][-1])
+        # Create list of regression candidate column names
+        regress_columns = list(reversed([column for (column, _) in order]))
+        # Names of available columns
+        available = [hit[6] for hit in hits]
+
+        # TODO: combine R-squared and number of data points to get the best regression match
+        '''
+        # Iterate regression columns (in order) and retrieve their models
+        models = {}
+        for col in regress_columns:
+            if col in available:
+                hit = list(hits[available.index(col)])
+                if hit[4] == ctype:
+                    # models contains all model data including residuals [-2] and rsquared [-1]
+                    models[pref[0]] = model[pref[0]][hit[6]]
+        # Get the combined maximum for residuals and rsquared
+        best_match = models[]
+        # Apply regression
+        if method == 'poly':
+            regressed = _apply_poly_regression(best_match, hit[6], float(hit[3]), model)
+            if regressed:
+                hit[3] = regressed
+            else:
+                return False, False
+        else:
+            hit[3] = _apply_linear_regression(best_match, hit[6], float(hit[3]), model)
+        match.extend(hit)
+        return match, hit[6]
+        '''
+
+        for col in regress_columns:
+            if col in available:
+                hit = list(hits[available.index(col)])
+                if hit[4] == ctype:
+                    # Perform regression using a column for which regression is possible
+                    if method == 'poly':
+                        # Polynomial is only valid within a set border; if the RI falls outside
+                        # of this border, skip this lookup
+                        regressed = _apply_poly_regression(pref[0], hit[6], float(hit[3]), model)
+                        if regressed:
+                            hit[3] = regressed
+                        else:
+                            return False, False
+                    else:
+                        hit[3] = _apply_linear_regression(pref[0], hit[6], float(hit[3]), model)
+                    match.extend(hit)
+                    return match, hit[6]
+
+    return False, False
+
+
+
+def default_hit(row, cas_nr, compound_id):
+    '''
+    This method will return a "default"/empty hit for cases where the
+    method _preferred() returns False (i.e. an RI could not be found
+    for the given CAS nr, not even via regression).
+    '''
+    return [
+            #'CAS',
+            'C' + cas_nr,
+            #'NAME',
+            '',
+            #'FORMULA',
+            '',
+            #'RI',
+            '0.0',
+            #'Column.type',
+            '',
+            #'Column.phase.type',
+            '',
+            #'Column.name',
+            '',
+            #'phase.coding',
+            ' ',
+            #'CAS_column.Name',
+            '',
+            # 'Centrotype' -> NOTE THAT compound_id is not ALWAYS the centrotype... it depends on the
+            # MsClust algorithm used... for now only one MsClust algorithm is used, so it is not an
+            # issue, but this should be updated/corrected once that changes
+            compound_id,
+            #'Regression.Column.Name',
+            '',
+            #'min',
+            '',
+            #'max',
+            '',
+            #'nr.duplicates',
+            '']
+
+
+def format_result(lookup_dict, nist_tabular_filename, pref, ctype, polar, model, method):
+    '''
+    Looks up the compounds in the library lookup table and formats the results
+    @param lookup_dict: dictionary containing the library to be searched
+    @param nist_tabular_filename: NIST output file to be matched
+    @param pref: (list of) column-name(s) to look for
+    @param ctype: column type of interest
+    @param polar: polarity of the used column
+    @param model: data loaded from file containing regression models
+    @param method: supported regression method (i.e. poly(nomial) or linear)
+    '''
+    (nist_tabular_list, header_clean) = match_library.read_library(nist_tabular_filename)
+    # Retrieve indices of the CAS and compound_id columns (exit if not present)
+    try:
+        casi = header_clean.index("cas")
+        idi = header_clean.index("id")
+    except ValueError:
+        raise IOError("'CAS' or 'compound_id' not found in header of library file")
+
+    data = []
+    for row in nist_tabular_list:
+        casf = str(row[casi].replace('-', '').strip())
+        compound_id = str(row[idi].split('-')[0])
+        if casf in lookup_dict:
+            found_hit, regress = _preferred(lookup_dict[casf], pref, ctype, polar, model, method)
+            if found_hit:
+                # Keep CAS nr as 'C' + numeric part:
+                found_hit[0] = 'C' + casf
+                # Add compound id
+                found_hit.insert(9, compound_id)
+                # Add information on regression process
+                found_hit.insert(10, regress if regress else 'None')
+                # Replace column index references with actual number of duplicates
+                dups = len(found_hit[-1].split(','))
+                if dups > 1:
+                    found_hit[-1] = str(dups + 1)
+                else:
+                    found_hit[-1] = '0'
+                data.append(found_hit)
+                found_hit = ''
+            else:
+                data.append(default_hit(row, casf, compound_id))
+        else:
+            data.append(default_hit(row, casf, compound_id))
+
+        casf = ''
+        compound_id = ''
+        found_hit = []
+        dups = []
+    return data
+
+
+def _save_data(content, outfile):
+    '''
+    Write to output file
+    @param content: content to write
+    @param outfile: file to write to
+    '''
+    # header
+    header = ['CAS',
+              'NAME',
+              'FORMULA',
+              'RI',
+              'Column.type',
+              'Column.phase.type',
+              'Column.name',
+              'phase.coding',
+              'CAS_column.Name',
+              'Centrotype',
+              'Regression.Column.Name',
+              'min',
+              'max',
+              'nr.duplicates']
+    output_handle = csv.writer(open(outfile, 'wb'), delimiter="\t")
+    output_handle.writerow(header)
+    for entry in content:
+        output_handle.writerow(entry)
+
+
+def _read_model(model_file):
+    '''
+    Creates an easy-to-search dictionary for getting the regression parameters
+    for each valid combination of GC-columns
+    @param model_file: filename containing the regression models
+    '''
+    regress = list(csv.reader(open(model_file, 'rU'), delimiter='\t'))
+    if len(regress.pop(0)) > 9:
+        method = 'poly'
+    else:
+        method = 'linear'
+
+    model = {}
+    # Create new dictionary for each GC-column
+    for line in regress:
+        model[line[0]] = {}
+
+    # Add data
+    for line in regress:
+        if method == 'poly':
+            model[line[0]][line[1]] = [float(col) for col in line[2:11]]
+        else:  # linear
+            model[line[0]][line[1]] = [float(col) for col in line[2:9]]
+
+    return model, method
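For review purposes, the shape of _read_model's output can be sketched as follows. The slot meanings are inferred from their use elsewhere in this file (coeff[0..3] and the coeff[4]/coeff[5] range in _apply_poly_regression, and the residuals/R-squared "last elements" ordering in _preferred); the GC-column names and numbers are made up::

    # method == 'poly' because the model file had more than 9 columns
    model = {'HP-5': {'DB-1': [12.0, 0.9, 1e-5, -1e-9,      # c0, c1, c2, c3
                               1000.0, 2000.0,              # usable RI range (min, max)
                               40.0, 3.2, 0.998]}}          # inferred: n, residuals, rsquared
    method = 'poly'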
+
+
+def _apply_poly_regression(column1, column2, retention_index, model):
+    '''
+    Calculates a new retention index (RI) value using a given 3rd-degree polynomial
+    model based on data from GC columns 1 and 2
+    @param column1: name of the selected GC-column
+    @param column2: name of the GC-column to use for regression
+    @param retention_index: RI to convert
+    @param model: dictionary containing model information for all GC-columns
+    '''
+    coeff = model[column1][column2]
+    # If the retention index to convert is within range of the data the model is based on, perform regression
+    if coeff[4] < retention_index < coeff[5]:
+        return (coeff[3] * (retention_index ** 3) + coeff[2] * (retention_index ** 2) +
+                (retention_index * coeff[1]) + coeff[0])
+    else:
+        return False
+
+
+def _apply_linear_regression(column1, column2, retention_index, model):
+    '''
+    Calculates a new retention index (RI) value using a given linear model based on data
+    from GC columns 1 and 2
+    @param column1: name of the selected GC-column
+    @param column2: name of the GC-column to use for regression
+    @param retention_index: RI to convert
+    @param model: dictionary containing model information for all GC-columns
+    '''
+    # TODO: No use of limits
+    coeff = model[column1][column2]
+    return coeff[1] * retention_index + coeff[0]
+
+
+def main():
+    '''
+    Library Lookup main function
+    '''
+    library_file = sys.argv[1]
+    nist_tabular_filename = sys.argv[2]
+    ctype = sys.argv[3]
+    polar = sys.argv[4]
+    outfile = sys.argv[5]
+    pref = sys.argv[6:-1]
+    regress = sys.argv[-1]
+
+    if regress != 'False':
+        model, method = _read_model(regress)
+    else:
+        model, method = False, None
+
+    lookup_dict = create_lookup_table(library_file, ctype, polar)
+    data = format_result(lookup_dict, nist_tabular_filename, pref, ctype, polar, model, method)
+
+    _save_data(data, outfile)
+
+
+if __name__ == "__main__":
+    main()
diff -r 41f122255d14 -r 4393f982d18f GCMS/library_lookup.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/GCMS/library_lookup.xml Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,75 @@
+
+  Lookup or estimate the RI using a "known RI values" CAS numbers library
+
+  library_lookup.py
+  $library_file
+  $input
+  "$col_type"
+  "$polarity"
+  $output
+  #for $ctype in $pref
+    ${ctype.columntype}
+  #end for
+  $regression.model
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Performs a lookup of the RI values by matching CAS numbers from the given NIST identifications file to a library.
+If a direct match is NOT found for the preferred column name, a regression can be done to find
+the theoretical RI value based on known RI values for the CAS number on other column types (see step 4).
+If there is no match for the CAS number on any column type, then the record is not given an RI.
+
+
+
+
+
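Continuing the hypothetical coefficients sketched earlier, _apply_poly_regression('HP-5', 'DB-1', 1500.0, model) would evaluate as follows, since 1500 lies inside the (1000, 2000) validity window::

    #    c3 * ri**3           + c2 * ri**2          + c1 * ri     + c0
    ri = -1e-9 * 1500.0 ** 3  + 1e-5 * 1500.0 ** 2  + 0.9 * 1500.0 + 12.0
    #  = -3.375               + 22.5                + 1350.0       + 12.0  = 1381.125

Outside that window the function deliberately returns False, which _preferred() treats as "skip this lookup".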
diff -r 41f122255d14 -r 4393f982d18f GCMS/readme.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/GCMS/readme.txt Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,25 @@
+==Description==
+Galaxy tools for analyzing a PDF result file generated by NIST-MSSearch, by calculating ranks for
+each putative metabolite and matching with a Retention Index Database by CAS number
+
+
+==Dependencies==
+- Galaxy
+- pdftotext (part of the 'poppler-utils' package (Ubuntu and Debian))
+- Python (>= 2.6) with the following package(s):
+   - numpy
+- R
+
+
+==Installation==
+
+- Optionally run the included Python unittests to verify the dependencies:
+    nosetests galaxy-dist/tools/GCMS
+
+
+
+==License==
+
+Licensing info can be found in the /LICENSE_AND_README folder located
+in the root of this (PRIMS-metabolomics) project.
+
diff -r 41f122255d14 -r 4393f982d18f METEXPtools/__init__.py
diff -r 41f122255d14 -r 4393f982d18f METEXPtools/export_to_metexp_tabular.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/METEXPtools/export_to_metexp_tabular.py Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,247 @@
+#!/usr/bin/env python
+# encoding: utf-8
+'''
+Module to combine output from the GCMS Galaxy tools RankFilter, CasLookup and MsClust
+into a tabular file that can be uploaded to the MetExp database.
+
+RankFilter and CasLookup output are already combined by combine_output.py, so here we use
+that result. Furthermore, one of the MsClust
+quantification files, containing the respective spectra details, is combined in as well.
+
+Extra calculations performed:
+- The column MW is also added here and is derived from the column FORMULA found
+  in the combined RankFilter/CasLookup result.
+
+So in total we merge 2 files and calculate one new column.
+'''
+from pkg_resources import resource_filename  # @UnresolvedImport # pylint: disable=E0611
+import csv
+import re
+import sys
+from collections import OrderedDict
+
+__author__ = "Pieter Lukasse"
+__contact__ = "pieter.lukasse@wur.nl"
+__copyright__ = "Copyright, 2013, Plant Research International, WUR"
+__license__ = "Apache v2"
+
+def _process_data(in_csv, delim='\t'):
+    '''
+    Generic method to parse a tab-separated file returning a dictionary with named columns
+    @param in_csv: input filename to be parsed
+    '''
+    data = list(csv.reader(open(in_csv, 'rU'), delimiter=delim))
+    header = data.pop(0)
+    # Create dictionary with column name as key
+    output = OrderedDict()
+    for index in xrange(len(header)):
+        output[header[index]] = [row[index] for row in data]
+    return output
+
+ONE_TO_ONE = 'one_to_one'
+N_TO_ONE = 'n_to_one'
+
+def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, metadata, relation_type=ONE_TO_ONE):
+    '''
+    Merges data from both input dictionaries based on the link fields. This method will
+    build up a new list containing the merged hits as the items.
+    @param set1: dictionary holding set1 in the form of N lists (one list per attribute name)
+    @param set2: dictionary holding set2 in the form of N lists (one list per attribute name)
+    '''
+    # TODO: test for correct input files -> the same link_field values should be there
+    # (test at least the number of unique link_field values):
+    #
+    # if (len(set1[link_field_set1]) != len(set2[link_field_set2])):
+    #    raise Exception('input files should have the same nr of key values ')
+
+    merged = []
+    processed = {}
+    for link_field_set1_idx in xrange(len(set1[link_field_set1])):
+        link_field_set1_value = set1[link_field_set1][link_field_set1_idx]
+        if link_field_set1_value not in processed:
+            # keep track of processed items to not repeat them
+            processed[link_field_set1_value] = link_field_set1_value
+
+            # Get the indices for current link_field_set1_value in both data-structures for proper matching
+            set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value]
+            set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value) == True]
+            # Validation:
+            if len(set2index) == 0:
+                # means that corresponding data could not be found in set2, so throw an error
+                raise Exception("Datasets not compatible, merge not possible. " + link_field_set1 + "=" +
+                                link_field_set1_value + " only found in first dataset. ")
+
+            merged_hits = []
+            # Combine hits
+            for hit in xrange(len(set1index)):
+                # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do
+                # is create a new "dict" item with the same "keys"/attributes, with each attribute filled with its
+                # corresponding value in the sets; i.e.
+                # set1[key]      => returns the list/array of size nr_rows with the values for the attribute
+                #                   represented by "key".
+                # set1index[hit] => points to the row nr=hit (hit is a row nr/index)
+                # So set1[x][set1index[n]] = set1.attributeX.instanceN
+                #
+                # It just ensures the entry is made available as a plain named array for easy access.
+                rf_record = OrderedDict(zip(set1.keys(), [set1[key][set1index[hit]] for key in set1.keys()]))
+                if relation_type == ONE_TO_ONE:
+                    cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[hit]] for key in set2.keys()]))
+                else:
+                    # is N to 1:
+                    cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[0]] for key in set2.keys()]))
+
+                merged_hit = merge_function(rf_record, cl_record, metadata)
+                merged_hits.append(merged_hit)
+
+            merged.append(merged_hits)
+
+    return merged
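Because this merge is parameterized by a compare and a merge callback, it can be reused for other pairings; the call actually made in main() below is essentially::

    # N rankfilter/caslookup rows may share one MsClust centrotype row (N_TO_ONE)
    merged = _merge_data(rankfilter_and_caslookup_combined, 'Centrotype',
                         msclust_quantification_and_spectra, 'centrotype',
                         _compare_records, _merge_records, metadata, N_TO_ONE)

with _compare_records and _merge_records as defined directly below.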
+
+
+def _compare_records(key1, key2):
+    '''
+    In this case the compare method is really simple, as both keys are expected to contain
+    the same value when records are the same
+    '''
+    if key1 == key2:
+        return True
+    else:
+        return False
+
+
+
+def _merge_records(rank_caslookup_combi, msclust_quant_record, metadata):
+    '''
+    Combines single records from both the RankFilter+CasLookup combi file and from the MsClust file
+
+    @param rank_caslookup_combi: rankfilter and caslookup combined record (see combine_output.py)
+    @param msclust_quant_record: msclust quantification + spectrum record
+    @param metadata: global metadata (organism, tissue, etc.) appended to each record
+    '''
+    record = []
+    for column in rank_caslookup_combi:
+        record.append(rank_caslookup_combi[column])
+
+    for column in msclust_quant_record:
+        record.append(msclust_quant_record[column])
+
+    for column in metadata:
+        record.append(metadata[column])
+
+    # add MOLECULAR MASS (MM)
+    molecular_mass = get_molecular_mass(rank_caslookup_combi['FORMULA'])
+    # limit to two decimals:
+    record.append("{0:.2f}".format(molecular_mass))
+
+    # add MOLECULAR WEIGHT (MW) - TODO - calculate this
+    record.append('0.0')
+
+    # level of identification and Location of reference standard
+    record.append('0')
+    record.append('')
+
+    return record
+
+
+def get_molecular_mass(formula):
+    '''
+    Calculates the molecular mass (MM).
+    E.g. MM of H2O = (relative) atomic mass of H x 2 + (relative) atomic mass of O
+    '''
+
+    # Each element is represented by a capital letter, followed optionally by
+    # lower case, with one or more digits for how many atoms of that element:
+    element_pattern = re.compile("([A-Z][a-z]?)(\d*)")
+
+    total_mass = 0
+    for (element_name, count) in element_pattern.findall(formula):
+        if count == "":
+            count = 1
+        else:
+            count = int(count)
+        # note: Python's built-in float type has double precision, which is ample here
+        element_mass = float(elements_and_masses_map[element_name])
+        total_mass += element_mass * count
+
+    return total_mass
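A worked example using the masses from static_resources/elements_and_masses.tab shipped in this changeset: for 'H2O' the pattern yields [('H', '2'), ('O', '')], so the total is 1.01 x 2 + 16 = 18.02::

    init_elements_and_masses_map()
    print "{0:.2f}".format(get_molecular_mass('H2O'))      # 18.02
    print "{0:.2f}".format(get_molecular_mass('C6H12O6'))  # 180.18, with this table's 12.01/1.01/16 values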
+
+
+
+def _save_data(data, headers, out_csv):
+    '''
+    Writes tab-separated data to file
+    @param data: dictionary containing merged dataset
+    @param out_csv: output csv file
+    '''
+
+    # Open output file for writing
+    outfile_single_handle = open(out_csv, 'wb')
+    output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
+
+    # Write headers
+    output_single_handle.writerow(headers)
+
+    # Write
+    for item_idx in xrange(len(data)):
+        for hit in data[item_idx]:
+            output_single_handle.writerow(hit)
+
+
+def _get_map_for_elements_and_masses(elements_and_masses):
+    '''
+    This method will read out the column 'Chemical symbol' and make a map
+    of it, storing the column 'Relative atomic mass' as its value
+    '''
+    result_map = {}
+    index = 0
+    for entry in elements_and_masses['Chemical symbol']:
+        result_map[entry] = elements_and_masses['Relative atomic mass'][index]
+        index += 1
+
+    return result_map
+
+
+def init_elements_and_masses_map():
+    '''
+    Initializes the lookup map containing the elements and their respective masses
+    '''
+    elements_and_masses = _process_data(resource_filename(__name__, "static_resources/elements_and_masses.tab"))
+    global elements_and_masses_map
+    elements_and_masses_map = _get_map_for_elements_and_masses(elements_and_masses)
+
+
+def main():
+    '''
+    Combine Output main function
+
+    RankFilter and CasLookup output are already combined by combine_output.py, so here we use
+    that result. Furthermore, the MsClust spectra file (.MSP) and one of the MsClust
+    quantification files are combined with the combine_output.py result as well.
+    '''
+    rankfilter_and_caslookup_combined_file = sys.argv[1]
+    msclust_quantification_and_spectra_file = sys.argv[2]
+    output_csv = sys.argv[3]
+    # metadata
+    metadata = OrderedDict()
+    metadata['organism'] = sys.argv[4]
+    metadata['tissue'] = sys.argv[5]
+    metadata['experiment_name'] = sys.argv[6]
+    metadata['user_name'] = sys.argv[7]
+    metadata['column_type'] = sys.argv[8]
+
+    # Read RankFilter and CasLookup output files
+    rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file)
+    msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, ',')
+
+    # Read elements and masses to use for the MW/MM calculation:
+    init_elements_and_masses_map()
+
+    merged = _merge_data(rankfilter_and_caslookup_combined, 'Centrotype',
+                         msclust_quantification_and_spectra, 'centrotype',
+                         _compare_records, _merge_records, metadata,
+                         N_TO_ONE)
+    headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys() + metadata.keys() + \
+              ['MM', 'MW', 'Level of identification', 'Location of reference standard']
+    _save_data(merged, headers, output_csv)
+
+
+if __name__ == '__main__':
+    main()
diff -r 41f122255d14 -r 4393f982d18f METEXPtools/export_to_metexp_tabular.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/METEXPtools/export_to_metexp_tabular.xml Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,75 @@
+
+  Create tabular file for loading into METabolomics EXPlorer database
+
+  export_to_metexp_tabular.py
+  $rankfilter_and_caslookup_combi
+  $msclust_quant_file
+  $output_result
+  "$organism"
+  "$tissue"
+  "$experiment_name"
+  "$user_name"
+  "$column_type"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+Tool to combine output from the tools RankFilter, CasLookup and MsClust
+into a tabular file that can be uploaded to the METabolomics EXPlorer (MetExp) database.
+
+RankFilter and CasLookup output are already combined by the 'RIQC-Combine RankFilter and CasLookup' tool, so here we use
+that result.
+
+**Notes**
+
+Extra calculations performed:
+
+- The columns MM and MW are also added here and are derived from the column FORMULA found in the combined RankFilter/CasLookup result.
+
+So in total we merge 2 files and calculate one new column.
+
+
+
+
diff -r 41f122255d14 -r 4393f982d18f METEXPtools/query_metexp.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/METEXPtools/query_metexp.py Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,282 @@
+#!/usr/bin/env python
+# encoding: utf-8
+'''
+Module to query a set of identifications against the METabolomics EXPlorer database.
+
+It will take the input file and for each record it will query the
+molecular mass in the selected MetExp DB. If one or more compounds are found in the
+MetExp DB then extra information regarding these compounds is added to the output file.
+
+The output file is thus the input file enriched with information about
+related items found in the selected MetExp DB.
+'''
+import csv
+import sys
+import fileinput
+import urllib2
+import time
+from collections import OrderedDict
+
+__author__ = "Pieter Lukasse"
+__contact__ = "pieter.lukasse@wur.nl"
+__copyright__ = "Copyright, 2014, Plant Research International, WUR"
+__license__ = "Apache v2"
+
+def _process_file(in_xsv, delim='\t'):
+    '''
+    Generic method to parse a tab-separated file returning a dictionary with named columns
+    @param in_xsv: input filename to be parsed
+    '''
+    data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim))
+    return _process_data(data)
+
+def _process_data(data):
+
+    header = data.pop(0)
+    # Create dictionary with column name as key
+    output = OrderedDict()
+    for index in xrange(len(header)):
+        output[header[index]] = [row[index] for row in data]
+    return output
+
+
+def _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method):
+    '''
+    This method will iterate over the records in the input_data and
+    will enrich them with the related information found (if any) in the
+    MetExp Database.
+
+    # TODO: could optimize this with multi-threading; see also the nice example at
+    # http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies
+    '''
+    merged = []
+
+    for i in xrange(len(input_data[input_data.keys()[0]])):
+        # Get the record in the same dictionary format as input_data, but containing
+        # a value at each column instead of a list of all values of all records:
+        input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))
+
+        # read the molecular mass and formula:
+        cas_id = input_data_record[casid_col]
+        formula = input_data_record[formula_col]
+        molecular_mass = input_data_record[molecular_mass_col]
+
+        # search for related records in MetExp:
+        data_found = None
+        if cas_id != "undef":
+            # 1- search for other experiments where this CAS id has been found:
+            query_link = metexp_dblink + "/find_entries/query?cas_nr=" + cas_id + "&method=" + separation_method
+            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
+            data_type_found = "CAS"
+        if data_found == None:
+            # 2- search for other experiments where this FORMULA has been found:
+            query_link = metexp_dblink + "/find_entries/query?molecule_formula=" + formula + "&method=" + separation_method
+            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
+            data_type_found = "FORMULA"
+        if data_found == None:
+            # 3- search for other experiments where this MM has been found:
+            query_link = metexp_dblink + "/find_entries/query?molecule_mass=" + molecular_mass + "&method=" + separation_method
+            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
+            data_type_found = "MM"
+
+        if data_found == None:
+            # If still nothing found, just add empty columns
+            extra_cols = ['', '', '', '', '', '', '', '']
+        else:
+            # Add info found:
+            extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)
+
+        # Take all data and merge it into a "flat"/simple array of values:
+        field_values_list = _merge_data(input_data_record, extra_cols)
+
+        merged.append(field_values_list)
+
+    # return the merged/enriched records:
+    return merged
+
+
+def _get_extra_info_and_link_cols(data_found, data_type_found, query_link):
+    '''
+    This method will go over the data found and will return a
+    list with the following items:
+    - Experiment details where hits have been found:
+      'organism', 'tissue', 'experiment_name', 'user_name', 'column_type'
+    - Link that executes the same query
+    '''
+    # set() makes a unique list:
+    organism_set = []
+    tissue_set = []
+    experiment_name_set = []
+    user_name_set = []
+    column_type_set = []
+    cas_nr_set = []
+
+    if 'organism' in data_found:
+        organism_set = set(data_found['organism'])
+    if 'tissue' in data_found:
+        tissue_set = set(data_found['tissue'])
+    if 'experiment_name' in data_found:
+        experiment_name_set = set(data_found['experiment_name'])
+    if 'user_name' in data_found:
+        user_name_set = set(data_found['user_name'])
+    if 'column_type' in data_found:
+        column_type_set = set(data_found['column_type'])
+    if 'CAS' in data_found:
+        cas_nr_set = set(data_found['CAS'])
+
+
+    result = [data_type_found,
+              _to_xsv(organism_set),
+              _to_xsv(tissue_set),
+              _to_xsv(experiment_name_set),
+              _to_xsv(user_name_set),
+              _to_xsv(column_type_set),
+              _to_xsv(cas_nr_set),
+              # To let Excel interpret it as a link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"):
+              "=HYPERLINK(\"" + query_link + "\", \"Link to entries found in DB \")"]
+    return result
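As an illustration of the extra columns produced for a CAS-level hit (all values invented; the trailing '|' separators come from _to_xsv, defined below)::

    ['CAS', 'Arabidopsis thaliana|', 'leaf|', 'exp01|exp07|', 'jdoe|', 'GC|', '64175|',
     '=HYPERLINK("http://metexp.example.org/find_entries/query?cas_nr=64175&method=GC", "Link to entries found in DB ")']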
+
+
+def _to_xsv(data_set):
+    result = ""
+    for item in data_set:
+        result = result + str(item) + "|"
+    return result
+
+
+def _fire_query_and_return_dict(url):
+    '''
+    This method will fire the query as a web-service call and
+    return the results as a list of dictionary objects
+    '''
+    try:
+        data = urllib2.urlopen(url).read()
+
+        # transform to dictionary:
+        result = []
+        data_rows = data.split("\n")
+
+        # check if there is any data in the response:
+        if len(data_rows) <= 1 or data_rows[1].strip() == '':
+            # means there is only the header row... so no hits:
+            return None
+
+        for data_row in data_rows:
+            if not data_row.strip() == '':
+                row_as_list = _str_to_list(data_row, delimiter='\t')
+                result.append(row_as_list)
+
+        # return result processed into a dict:
+        return _process_data(result)
+
+    except urllib2.HTTPError, e:
+        raise Exception("HTTP error for URL: " + url + " : %s - " % e.code + e.reason)
+    except urllib2.URLError, e:
+        raise Exception("Network error: %s" % e.reason.args[1] + ". Administrator: please check if MetExp service [" + url + "] is accessible from your Galaxy server. ")
+
+def _str_to_list(data_row, delimiter='\t'):
+    result = []
+    for column in data_row.split(delimiter):
+        result.append(column)
+    return result
+
+
+# alternative: ?
+# s = requests.Session()
+# s.verify = False
+# #s.auth = (token01, token02)
+# resp = s.get(url, params={'name': 'anonymous'}, stream=True)
+# content = resp.content
+# # transform to dictionary:
+
+
+
+
+def _merge_data(input_data_record, extra_cols):
+    '''
+    Adds the extra information to the existing data record and returns
+    the combined new record.
+    '''
+    record = []
+    for column in input_data_record:
+        record.append(input_data_record[column])
+
+
+    # add extra columns
+    for column in extra_cols:
+        record.append(column)
+
+    return record
+
+
+def _save_data(data_rows, headers, out_csv):
+    '''
+    Writes tab-separated data to file
+    @param data_rows: list containing the merged/enriched dataset
+    @param out_csv: output csv file
+    '''
+
+    # Open output file for writing
+    outfile_single_handle = open(out_csv, 'wb')
+    output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
+
+    # Write headers
+    output_single_handle.writerow(headers)
+
+    # Write one line for each row
+    for data_row in data_rows:
+        output_single_handle.writerow(data_row)
+
+def _get_metexp_URL(metexp_dblink_file):
+    '''
+    Read out and return the URL stored in the given file.
+    '''
+    file_input = fileinput.input(metexp_dblink_file)
+    try:
+        for line in file_input:
+            if line[0] != '#':
+                # just return the first line that is not a comment line:
+                return line
+    finally:
+        file_input.close()
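_get_metexp_URL implies the dblink file is a plain-text file whose first non-comment line is the service base URL; a plausible layout (contents invented) would be::

    # MetExp service endpoint for this Galaxy instance
    # (lines starting with '#' are ignored)
    http://metexp.example.org/metexp

Note that the returned line keeps any trailing newline, while the URL concatenations above assume there is none; a .strip() on the returned line might be a sensible hardening (not applied here).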
+
+
+def main():
+    '''
+    MetExp Query main function
+
+    The input file can be any tabular file, as long as it contains a column for the molecular mass
+    and one for the formula of the respective identification. These two columns are then
+    used to query against the MetExp Database.
+    '''
+    seconds_start = int(round(time.time()))
+
+    input_file = sys.argv[1]
+    casid_col = sys.argv[2]
+    formula_col = sys.argv[3]
+    molecular_mass_col = sys.argv[4]
+    metexp_dblink_file = sys.argv[5]
+    separation_method = sys.argv[6]
+    output_result = sys.argv[7]
+
+    # Parse metexp_dblink_file to find the URL to the MetExp service:
+    metexp_dblink = _get_metexp_URL(metexp_dblink_file)
+
+    # Parse tabular input file into dictionary/array:
+    input_data = _process_file(input_file)
+
+    # Query data against MetExp DB:
+    enriched_data = _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method)
+    headers = input_data.keys() + ['METEXP hits for ', 'METEXP hits: organisms', 'METEXP hits: tissues',
+                                   'METEXP hits: experiments', 'METEXP hits: user names', 'METEXP hits: column types', 'METEXP hits: CAS nrs', 'Link to METEXP hits']
+
+    _save_data(enriched_data, headers, output_result)
+
+    seconds_end = int(round(time.time()))
+    print "Took " + str(seconds_end - seconds_start) + " seconds"
+
+
+
+if __name__ == '__main__':
+    main()
diff -r 41f122255d14 -r 4393f982d18f METEXPtools/query_metexp.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/METEXPtools/query_metexp.xml Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,69 @@
+
+  Query a set of identifications against the METabolomics EXPeriments database
+
+  query_metexp.py
+  $input_file
+  "$casid_col"
+  "$formula_col"
+  "$molecular_mass_col"
+  "$metexp_dblink_file"
+  $separation_method
+  $output_result
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+This tool will query a set of identifications against the METabolomics EXPlorer database.
+
+It will take the input file and for each record it will query the
+molecular mass in the selected MetExp DB. If one or more compounds are found in the
+MetExp DB then extra information regarding these compounds is added to the output file.
+
+The output file is thus the input file enriched with information about
+related items found in the selected MetExp DB.
+
+**Notes**
+
+The input file can be any tabular file, as long as it contains a column for the molecular mass
+and one for the formula of the respective identification.
+
+
+
+
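For reference, the positional interface of query_metexp.py wired up by the XML above boils down to (argument values invented)::

    python query_metexp.py identifications.tab CAS FORMULA MM \
        metexp_dblink.txt GC output_enriched.tab

where CAS, FORMULA and MM are the names of the columns holding the CAS id, formula and molecular mass in identifications.tab.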
diff -r 41f122255d14 -r 4393f982d18f METEXPtools/static_resources/README.txt
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/METEXPtools/static_resources/README.txt Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,3 @@
+This folder and respective files should be deployed together with the following tools:
+
+ - ../export_to_metexp_tabular.py
\ No newline at end of file
diff -r 41f122255d14 -r 4393f982d18f METEXPtools/static_resources/elements_and_masses.tab
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/METEXPtools/static_resources/elements_and_masses.tab Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,104 @@
+Name	Atomic number	Chemical symbol	Relative atomic mass
+Hydrogen	1	H	1.01
+Helium	2	He	4
+Lithium	3	Li	6.94
+Beryllium	4	Be	9.01
+Boron	5	B	10.81
+Carbon	6	C	12.01
+Nitrogen	7	N	14.01
+Oxygen	8	O	16
+Fluorine	9	F	19
+Neon	10	Ne	20.18
+Sodium	11	Na	22.99
+Magnesium	12	Mg	24.31
+Aluminum	13	Al	26.98
+Silicon	14	Si	28.09
+Phosphorus	15	P	30.98
+Sulfur	16	S	32.06
+Chlorine	17	Cl	35.45
+Argon	18	Ar	39.95
+Potassium	19	K	39.1
+Calcium	20	Ca	40.08
+Scandium	21	Sc	44.96
+Titanium	22	Ti	47.9
+Vanadium	23	V	50.94
+Chromium	24	Cr	52
+Manganese	25	Mn	54.94
+Iron	26	Fe	55.85
+Cobalt	27	Co	58.93
+Nickel	28	Ni	58.71
+Copper	29	Cu	63.54
+Zinc	30	Zn	65.37
+Gallium	31	Ga	69.72
+Germanium	32	Ge	72.59
+Arsenic	33	As	74.99
+Selenium	34	Se	78.96
+Bromine	35	Br	79.91
+Krypton	36	Kr	83.8
+Rubidium	37	Rb	85.47
+Strontium	38	Sr	87.62
+Yttrium	39	Y	88.91
+Zirconium	40	Zr	91.22
+Niobium	41	Nb	92.91
+Molybdenum	42	Mo	95.94
+Technetium	43	Tc	96.91
+Ruthenium	44	Ru	101.07
+Rhodium	45	Rh	102.9
+Palladium	46	Pd	106.4
+Silver	47	Ag	107.87
+Cadmium	48	Cd	112.4
+Indium	49	In	114.82
+Tin	50	Sn	118.69
+Antimony	51	Sb	121.75
+Tellurium	52	Te	127.6
+Iodine	53	I	126.9
+Xenon	54	Xe	131.3
+Cesium	55	Cs	132.9
+Barium	56	Ba	137.34
+Lanthanum	57	La	138.91
+Cerium	58	Ce	140.12
+Praseodymium	59	Pr	140.91
+Neodymium	60	Nd	144.24
+Promethium	61	Pm	144.91
+Samarium	62	Sm	150.35
+Europium	63	Eu	151.96
+Gadolinium	64	Gd	157.25
+Terbium	65	Tb	158.92
+Dysprosium	66	Dy	162.5
+Holmium	67	Ho	164.93
+Erbium	68	Er	167.26
+Thulium	69	Tm	168.93
+Ytterbium	70	Yb	173.04
+Lutetium	71	Lu	174.97
+Hafnium	72	Hf	178.49
+Tantalum	73	Ta	180.95
+Wolfram	74	W	183.85
+Rhenium	75	Re	186.2
+Osmium	76	Os	190.2
+Iridium	77	Ir	192.22
+Platinum	78	Pt	195.09
+Gold	79	Au	196.97
+Mercury	80	Hg	200.59
+Thallium	81	Tl	204.37
+Lead	82	Pb	207.19
+Bismuth	83	Bi	208.98
+Polonium	84	Po	208.98
+Astatine	85	At	209.99
+Radon	86	Rn	222.02
+Francium	87	Fr	223.02
+Radium	88	Ra	226
+Actinium	89	Ac	227.03
+Thorium	90	Th	232.04
+Protactinium	91	Pa	231.04
+Uranium	92	U	238.03
+Neptunium	93	Np	237
+Plutonium	94	Pu	242
+Americium	95	Am	243.06
+Curium	96	Cm	247.07
+Berkelium	97	Bk	247.07
+Californium	98	Cf	251.08
+Einsteinium	99	Es	254.09
+Fermium	100	Fm	257.1
+Mendelevium	101	Md	257.1
+Nobelium	102	No	255.09
+Lawrencium	103	Lr	256.1
diff -r 41f122255d14 -r 4393f982d18f MS/__init__.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/MS/__init__.py Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,6 @@
+'''
+Module containing Galaxy tools for the LC or GC/MS pipeline
+Created on Mar , 2014
+
+@author: pieter lukasse
+'''
\ No newline at end of file
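The module added next, MS/query_mass_repos.py, expects each repository to expose the /mass endpoint documented in its header docstring; for the public Kazusa MFSearcher instance the documented pattern would look like (target mass and margin values are example inputs)::

    http://webs2.kazusa.or.jp/mfsearcher/exmassdb/mass?targetMs=500&margin=1&marginUnit=ppm&output=txth

with a tab-separated response whose columns are, in order: db-name, molecular-formula, dbe, formula-weight, id, description — and, per the 'txth' output mode, a guaranteed header line before the data.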
diff -r 41f122255d14 -r 4393f982d18f MS/query_mass_repos.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/MS/query_mass_repos.py Thu Mar 19 12:22:23 2015 +0100
@@ -0,0 +1,289 @@
+#!/usr/bin/env python
+# encoding: utf-8
+'''
+Module to query a set of accurate mass values detected by high-resolution mass spectrometers
+against various repositories/services, such as the METabolomics EXPlorer database or the
+MFSearcher service (http://webs2.kazusa.or.jp/mfsearcher/).
+
+It will take the input file and for each record it will query the
+molecular mass in the selected repository/service. If one or more compounds are found
+then extra information regarding these compounds is added to the output file.
+
+The output file is thus the input file enriched with information about
+related items found in the selected repository/service.
+
+The service should implement the following interface:
+
+http://service_url/mass?targetMs=500&margin=1&marginUnit=ppm&output=txth   (txth means there is guaranteed to be a header line before the data)
+
+The output should be tab-separated and should contain the following columns (in this order):
+db-name  molecular-formula  dbe  formula-weight  id  description
+
+'''
+import csv
+import sys
+import fileinput
+import urllib2
+import time
+from collections import OrderedDict
+
+__author__ = "Pieter Lukasse"
+__contact__ = "pieter.lukasse@wur.nl"
+__copyright__ = "Copyright, 2014, Plant Research International, WUR"
+__license__ = "Apache v2"
+
+def _process_file(in_xsv, delim='\t'):
+    '''
+    Generic method to parse a tab-separated file returning a dictionary with named columns
+    @param in_xsv: input filename to be parsed
+    '''
+    data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim))
+    return _process_data(data)
+
+def _process_data(data):
+
+    header = data.pop(0)
+    # Create dictionary with column name as key
+    output = OrderedDict()
+    for index in xrange(len(header)):
+        output[header[index]] = [row[index] for row in data]
+    return output
+
+
+def _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit):
+    '''
+    This method will iterate over the records in the input_data and
+    will enrich them with the related information found (if any) in the
+    chosen repository/service.
+
+    # TODO: could optimize this with multi-threading; see also the nice example at
+    # http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies
+    '''
+    merged = []
+
+    for i in xrange(len(input_data[input_data.keys()[0]])):
+        # Get the record in the same dictionary format as input_data, but containing
+        # a value at each column instead of a list of all values of all records:
+        input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))
+
+        # read the molecular mass:
+        molecular_mass = input_data_record[molecular_mass_col]
+
+
+        # search for related records in repository/service:
+        data_found = None
+        if molecular_mass != "":
+            molecular_mass = float(molecular_mass)
+
+            # 1- search for data around this MM:
+            query_link = repository_dblink + "/mass?targetMs=" + str(molecular_mass) + "&margin=" + str(error_margin) + "&marginUnit=" + margin_unit + "&output=txth"
+
+            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
+            data_type_found = "MM"
+
+
+        if data_found == None:
+            # If still nothing found, just add empty columns
+            extra_cols = ['', '', '', '', '', '']
+        else:
+            # Add info found:
+            extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)
+
+        # Take all data and merge it into a "flat"/simple array of values:
+        field_values_list = _merge_data(input_data_record, extra_cols)
+
+        merged.append(field_values_list)
+
+    # return the merged/enriched records:
+    return merged
+
+
+def _get_extra_info_and_link_cols(data_found, data_type_found, query_link):
+    '''
+    This method will go over the data found and will return a
+    list with the following items:
+    - details of hits found:
+      db-name, molecular-formula, dbe, formula-weight, id, description
+    - Link that executes the same query
+    '''
+
+    # set() makes a unique list:
+    db_name_set = []
+    molecular_formula_set = []
+    id_set = []
+    description_set = []
+
+
+    if 'db-name' in data_found:
+        db_name_set = set(data_found['db-name'])
+    elif '# db-name' in data_found:
+        db_name_set = set(data_found['# db-name'])
+    if 'molecular-formula' in data_found:
+        molecular_formula_set = set(data_found['molecular-formula'])
+    if 'id' in data_found:
+        id_set = set(data_found['id'])
+    if 'description' in data_found:
+        description_set = set(data_found['description'])
+
+    result = [data_type_found,
+              _to_xsv(db_name_set),
+              _to_xsv(molecular_formula_set),
+              _to_xsv(id_set),
+              _to_xsv(description_set),
+              # To let Excel interpret it as a link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"):
+              "=HYPERLINK(\"" + query_link + "\", \"Link to entries found in DB \")"]
+    return result
+
+
+def _to_xsv(data_set):
+    result = ""
+    for item in data_set:
+        result = result + str(item) + "|"
+    return result
+
+
+def _fire_query_and_return_dict(url):
+    '''
+    This method will fire the query as a web-service call and
+    return the results as a list of dictionary objects
+    '''
+    try:
+        data = urllib2.urlopen(url).read()
+
+        # transform to dictionary:
+        result = []
+        data_rows = data.split("\n")
+
+        # remove comment lines, if any (only leave the one that has the word "molecular-formula"
+        # in it... compatible with the Kazusa service):
+        data_rows_to_remove = []
+        for data_row in data_rows:
+            if data_row == "" or (data_row[0] == '#' and "molecular-formula" not in data_row):
+                data_rows_to_remove.append(data_row)
+
+        for data_row in data_rows_to_remove:
+            data_rows.remove(data_row)
+
+        # check if there is any data in the response:
+        if len(data_rows) <= 1 or data_rows[1].strip() == '':
+            # means there is only the header row... so no hits:
+            return None
+
+        for data_row in data_rows:
+            if not data_row.strip() == '':
+                row_as_list = _str_to_list(data_row, delimiter='\t')
+                result.append(row_as_list)
+
+        # return result processed into a dict:
+        return _process_data(result)
+
+    except urllib2.HTTPError, e:
+        raise Exception("HTTP error for URL: " + url + " : %s - " % e.code + e.reason)
+    except urllib2.URLError, e:
+        raise Exception("Network error: %s" % e.reason.args[1] + ". Administrator: please check if service [" + url + "] is accessible from your Galaxy server. ")

+def _str_to_list(data_row, delimiter='\t'):
+    result = []
+    for column in data_row.split(delimiter):
+        result.append(column)
+    return result
+
+
+# alternative: ?
+# s = requests.Session()
+# s.verify = False
+# #s.auth = (token01, token02)
+# resp = s.get(url, params={'name': 'anonymous'}, stream=True)
+# content = resp.content
+# # transform to dictionary:
+
+
+
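To see why the '# db-name' fallback above exists: a raw txth response from the Kazusa service starts with comment lines, of which only the header row mentioning 'molecular-formula' survives the filtering (response contents invented)::

    # some banner comment                                             <- removed
    # db-name	molecular-formula	dbe	formula-weight	id	description  <- kept as header
    KEGG	C6H12O6	1.0	180.063	C00031	D-Glucose

After filtering, _process_data() turns the kept header row into dict keys, hence the data_found['# db-name'] lookup.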
+ ''' + record = [] + for column in input_data_record: + record.append(input_data_record[column]) + + + # add extra columns + for column in extra_cols: + record.append(column) + + return record + + +def _save_data(data_rows, headers, out_csv): + ''' + Writes tab-separated data to file + @param data_rows: dictionary containing merged/enriched dataset + @param out_csv: output csv file + ''' + + # Open output file for writing + outfile_single_handle = open(out_csv, 'wb') + output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") + + # Write headers + output_single_handle.writerow(headers) + + # Write one line for each row + for data_row in data_rows: + output_single_handle.writerow(data_row) + +def _get_repository_URL(repository_file): + ''' + Read out and return the URL stored in the given file. + ''' + file_input = fileinput.input(repository_file) + try: + for line in file_input: + if line[0] != '#': + # just return the first line that is not a comment line: + return line + finally: + file_input.close() + + +def main(): + ''' + Query main function + + The input file can be any tabular file, as long as it contains a column for the molecular mass. + This column is then used to query against the chosen repository/service Database. + ''' + seconds_start = int(round(time.time())) + + input_file = sys.argv[1] + molecular_mass_col = sys.argv[2] + repository_file = sys.argv[3] + error_margin = float(sys.argv[4]) + margin_unit = sys.argv[5] + output_result = sys.argv[6] + + # Parse repository_file to find the URL to the service: + repository_dblink = _get_repository_URL(repository_file) + + # Parse tabular input file into dictionary/array: + input_data = _process_file(input_file) + + # Query data against repository : + enriched_data = _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit) + headers = input_data.keys() + ['SEARCH hits for ','SEARCH hits: db-names', 'SEARCH hits: molecular-formulas ', + 'SEARCH hits: ids','SEARCH hits: descriptions', 'Link to SEARCH hits'] #TODO - add min and max formula weigth columns + + _save_data(enriched_data, headers, output_result) + + seconds_end = int(round(time.time())) + print "Took " + str(seconds_end - seconds_start) + " seconds" + + + +if __name__ == '__main__': + main() diff -r 41f122255d14 -r 4393f982d18f MS/query_mass_repos.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MS/query_mass_repos.xml Thu Mar 19 12:22:23 2015 +0100 @@ -0,0 +1,106 @@ + + Query multiple public repositories for elemental compositions from accurate mass values detected by high-resolution mass spectrometers + + query_mass_repos.py + $input_file + "$molecular_mass_col" + "$repository_file" + $error_margin + $margin_unit + $output_result + + + + + + + + + + + + + + + + + + + + + + + +.. class:: infomark + +This tool will query multiple public repositories such as PRI-MetExp or http://webs2.kazusa.or.jp/mfsearcher +for elemental compositions from accurate mass values detected by high-resolution mass spectrometers. + +It will take the input file and for each record it will query the +molecular mass in the selected repository. If one or more compounds are found in the +repository then extra information regarding (mass based)matching elemental composition formulas is added to the output file. + +The output file is thus the input file enriched with information about +related items found in the selected repository. 
+
+**Notes**
+
+The input file can be any tabular file, as long as it contains a column for the molecular mass.
+
+**Services that can be queried**
+
+================= =========================================================================
+Database          Description
+----------------- -------------------------------------------------------------------------
+PRI-MetExp        LC-MS and GC-MS data from experiments from the metabolomics group at
+                  Plant Research International. NB: restricted access to employees with
+                  access key.
+ExactMassDB       A database of possible elemental compositions composed of at most
+                  C: 100, H: 200, O: 50, N: 10, P: 10, and S: 10, that satisfy the
+                  Senior and the Lewis valence rules.
+                  (via /mfsearcher/exmassdb/)
+ExactMassDB-HR2   HR2, which is one of the fastest tools for calculation of elemental
+                  compositions, filters some elemental compositions according to
+                  the Seven Golden Rules (Kind and Fiehn, 2007). The ExactMassDB-HR2
+                  database returns the same result as does HR2 with the same atom kind
+                  and number condition as that used in construction of the ExactMassDB.
+                  (via /mfsearcher/exmassdb-hr2/)
+Pep1000           A database of possible linear polypeptides that are
+                  constructed from 20 kinds of amino acids and have molecular
+                  weights smaller than 1000.
+                  (via /mfsearcher/pep1000/)
+KEGG              Re-calculated compound data from KEGG. Updated weekly.
+                  (via /mfsearcher/kegg/)
+KNApSAcK          Re-calculated compound data from KNApSAcK.
+                  (via /mfsearcher/knapsack/)
+Flavonoid Viewer  Re-calculated compound data from Flavonoid Viewer.
+                  (via /mfsearcher/flavonoidviewer/)
+LipidMAPS         Re-calculated compound data from LIPID MAPS.
+                  (via /mfsearcher/lipidmaps/)
+HMDB              Re-calculated compound data from Human Metabolome Database (HMDB)
+                  Version 3.5.
+                  (via /mfsearcher/hmdb/)
+PubChem           Re-calculated compound data from PubChem. Updated monthly.
+                  (via /mfsearcher/pubchem/)
+================= =========================================================================
+
+Sources for table above: PRI-MetExp and http://webs2.kazusa.or.jp/mfsearcher
+
+
+
diff -r 41f122255d14 -r 4393f982d18f __init__.py
diff -r 41f122255d14 -r 4393f982d18f combine_output.py
--- a/combine_output.py Thu Mar 19 12:13:13 2015 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,253 +0,0 @@
-#!/usr/bin/env python
-# encoding: utf-8
-'''
-Module to combine output from two GCMS Galaxy tools (RankFilter and CasLookup)
-'''
-
-import csv
-import re
-import sys
-import math
-import pprint
-
-__author__ = "Marcel Kempenaar"
-__contact__ = "brs@nbic.nl"
-__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
-__license__ = "MIT"
-
-def _process_data(in_csv):
-    '''
-    Generic method to parse a tab-separated file returning a dictionary with named columns
-    @param in_csv: input filename to be parsed
-    '''
-    data = list(csv.reader(open(in_csv, 'rU'), delimiter='\t'))
-    header = data.pop(0)
-    # Create dictionary with column name as key
-    output = {}
-    for index in xrange(len(header)):
-        output[header[index]] = [row[index] for row in data]
-    return output
-
-
-def _merge_data(rankfilter, caslookup):
-    '''
-    Merges data from both input dictionaries based on the Centrotype field. This method will
-    build up a new list containing the merged hits as the items.
- @param rankfilter: dictionary holding RankFilter output in the form of N lists (one list per attribute name) - @param caslookup: dictionary holding CasLookup output in the form of N lists (one list per attribute name) - ''' - # TODO: test for correct input files -> rankfilter and caslookup internal lists should have the same lenghts: - if (len(rankfilter['ID']) != len(caslookup['Centrotype'])): - raise Exception('rankfilter and caslookup files should have the same nr of rows/records ') - - merged = [] - processed = {} - for compound_id_idx in xrange(len(rankfilter['ID'])): - compound_id = rankfilter['ID'][compound_id_idx] - if not compound_id in processed : - # keep track of processed items to not repeat them - processed[compound_id] = compound_id - # get centrotype nr - centrotype = compound_id.split('-')[0] - # Get the indices for current compound ID in both data-structures for proper matching - rindex = [index for index, value in enumerate(rankfilter['ID']) if value == compound_id] - cindex = [index for index, value in enumerate(caslookup['Centrotype']) if value == centrotype] - - merged_hits = [] - # Combine hits - for hit in xrange(len(rindex)): - # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do - # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its - # corresponding value in the rankfilter or caslookup tables; i.e. - # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute - # represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index) - rf_record = dict(zip(rankfilter.keys(), [rankfilter[key][rindex[hit]] for key in rankfilter.keys()])) - cl_record = dict(zip(caslookup.keys(), [caslookup[key][cindex[hit]] for key in caslookup.keys()])) - - merged_hit = _add_hit(rf_record, cl_record) - merged_hits.append(merged_hit) - - merged.append(merged_hits) - - return merged, len(rindex) - - -def _add_hit(rankfilter, caslookup): - ''' - Combines single records from both the RankFilter- and CasLookup-tools - @param rankfilter: record (dictionary) of one compound in the RankFilter output - @param caslookup: matching record (dictionary) of one compound in the CasLookup output - ''' - # The ID in the RankFilter output contains the following 5 fields: - rf_id = rankfilter['ID'].split('-') - try: - name, formula = _remove_formula(rankfilter['Name']) - hit = [rf_id[0], # Centrotype - rf_id[1], # cent.Factor - rf_id[2], # scan nr - rf_id[3], # R.T. (umin) - rf_id[4], # nr. 
Peaks - # Appending other fields - rankfilter['R.T.'], - name, - caslookup['FORMULA'] if not formula else formula, - rankfilter['Library'].strip(), - rankfilter['CAS'].strip(), - rankfilter['Forward'], - rankfilter['Reverse'], - ((float(rankfilter['Forward']) + float(rankfilter['Reverse'])) / 2), - rankfilter['RIexp'], - caslookup['RI'], - rankfilter['RIsvr'], - # Calculate absolute differences - math.fabs(float(rankfilter['RIexp']) - float(rankfilter['RIsvr'])), - math.fabs(float(caslookup['RI']) - float(rankfilter['RIexp'])), - caslookup['Regression.Column.Name'], - caslookup['min'], - caslookup['max'], - caslookup['nr.duplicates'], - caslookup['Column.phase.type'], - caslookup['Column.name'], - rankfilter['Rank'], - rankfilter['%rel.err'], - rankfilter['Synonyms']] - except KeyError as error: - print "Problem reading in data from input file(s):\n", - print "Respective CasLookup entry: \n", pprint.pprint(caslookup), "\n" - print "Respective RankFilter entry: \n", pprint.pprint(rankfilter), "\n" - raise error - - return hit - - -def _remove_formula(name): - ''' - The RankFilter Name field often contains the Formula as well, this function removes it from the Name - @param name: complete name of the compound from the RankFilter output - ''' - name = name.split() - poss_formula = name[-1] - match = re.match("^(([A-Z][a-z]{0,2})(\d*))+$", poss_formula) - if match: - return ' '.join(name[:-1]), poss_formula - else: - return ' '.join(name), False - - -def _get_default_caslookup(): - ''' - The Cas Lookup tool might not have found all compounds in the library searched, - this default dict will be used to combine with the Rank Filter output - ''' - return {'FORMULA': 'N/A', - 'RI': '0.0', - 'Regression.Column.Name': 'None', - 'min': '0.0', - 'max': '0.0', - 'nr.duplicates': '0', - 'Column.phase.type': 'N/A', - 'Column.name': 'N/A'} - - -def _save_data(data, nhits, out_csv_single, out_csv_multi): - ''' - Writes tab-separated data to file - @param data: dictionary containing merged dataset - @param out_csv: output csv file - ''' - # Columns we don't repeat: - header_part1 = ['Centrotype', - 'cent.Factor', - 'scan nr.', - 'R.T. (umin)', - 'nr. Peaks', - 'R.T.'] - # These are the headers/columns we repeat in case of - # combining hits in one line (see alternative_headers method below): - header_part2 = [ - 'Name', - 'FORMULA', - 'Library', - 'CAS', - 'Forward', - 'Reverse', - 'Avg. 
(Forward, Reverse)', - 'RIexp', - 'RI', - 'RIsvr', - 'RIexp - RIsvr', - 'RI - RIexp', - 'Regression.Column.Name', - 'min', - 'max', - 'nr.duplicates', - 'Column.phase.type', - 'Column.name', - 'Rank', - '%rel.err', - 'Synonyms'] - - # Open output file for writing - outfile_single_handle = open(out_csv_single, 'wb') - outfile_multi_handle = open(out_csv_multi, 'wb') - output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") - output_multi_handle = csv.writer(outfile_multi_handle, delimiter="\t") - - # Write headers - output_single_handle.writerow(header_part1 + header_part2) - output_multi_handle.writerow(header_part1 + header_part2 + alternative_headers(header_part2, nhits-1)) - # Combine all hits for each centrotype into one line - line = [] - for centrotype_idx in xrange(len(data)): - i = 0 - for hit in data[centrotype_idx]: - if i==0: - line.extend(hit) - else: - line.extend(hit[6:]) - i = i+1 - # small validation (if error, it is a programming error): - if i > nhits: - raise Exception('Error: more hits that expected for centrotype_idx ' + centrotype_idx) - output_multi_handle.writerow(line) - line = [] - - # Write one line for each centrotype - for centrotype_idx in xrange(len(data)): - for hit in data[centrotype_idx]: - output_single_handle.writerow(hit) - -def alternative_headers(header_part2, nr_alternative_hits): - ''' - This method will iterate over the header names and add the string 'ALT#_' before each, - where # is the number of the alternative, according to number of alternative hits we want to add - to final csv/tsv - ''' - result = [] - for i in xrange(nr_alternative_hits): - for header_name in header_part2: - result.append("ALT" + str(i+1) + "_" + header_name) - return result - -def main(): - ''' - Combine Output main function - It will merge the result files from "RankFilter" and "Lookup RI for CAS numbers" - NB: the caslookup_result_file will typically have fewer lines than - rankfilter_result_file, so the merge has to consider this as well. The final file - should have the same nr of lines as rankfilter_result_file. - ''' - rankfilter_result_file = sys.argv[1] - caslookup_result_file = sys.argv[2] - output_single_csv = sys.argv[3] - output_multi_csv = sys.argv[4] - - # Read RankFilter and CasLookup output files - rankfilter = _process_data(rankfilter_result_file) - caslookup = _process_data(caslookup_result_file) - merged, nhits = _merge_data(rankfilter, caslookup) - _save_data(merged, nhits, output_single_csv, output_multi_csv) - - -if __name__ == '__main__': - main() diff -r 41f122255d14 -r 4393f982d18f combine_output.xml --- a/combine_output.xml Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,36 +0,0 @@ - - Perform a combination of output data from the RankFilter and CasLookup tools - - combine_output.py $rankfilter_in $caslookup_in $out_single $out_multi - - - - - - - - - - - -Performs a combination of output files from the 'RankFilter' and 'Lookup RI for CAS' tools into two tab-separated files. - -The files produced are contain either all hits for a compound on a single line (Single) or on separate lines -(Multi). - -.. class:: infomark - -**Notes** - -The input data should be produced by the RankFilter and 'Lookup RI for CAS' tools provided on this Galaxy server with the -original headers kept intact. 
Processing steps include: - - - Added columns showing the average Forward/Reverse values, RIexp - RIsvr and RI - RIexp values - - The ID column of the RankFilter tool output is split into 'Centrotype', 'cent.Factor', 'scan nr.', 'R.T. (umin)' - and 'nr. Peaks' fields. - - The formula is split off the 'Name' field in the RankFilter output - - - diff -r 41f122255d14 -r 4393f982d18f create_model.xml --- a/create_model.xml Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,78 +0,0 @@ - - Generate coefficients to enable the regression from one GC-column - to another GC-column - Rscripts/ridb-regression.R - $ridb - $out_model - $out_log - $min_residuals - $range_mod - $pvalue - $rsquared - $method - $plot - #if $plot - $model_graphics - #end if - - - - - - - - - - - - - - - - - - (plot) - - - - - -Calculates regression models for a permutation of all GC columns contained in the selected -RI database file. The method used for creating the model is either based on a 3rd degree -polynomial or a standard linear model. - -The *Minimum number of residuals* option will only allow regression if the columns it is based -on has at least that number of datapoints on the same compound. - -Filtering is possible by setting an upper limit for the *p-value* and / or a lower limit for -the *R squared* value. The produced logfile will state how many models have been discarded due -to this filtering. The output model file also includes the p-value and R squared value for -each created model. - -Graphical output of the models is available by selecting the plot option which shows the -data points used for the model as well as the fit itself and the range of data that will -be usable. - -.. class:: infomark - -**Notes** - -The output file produced by this tool is required as input for the CasLookup tool when -selecting to apply regression when finding hits in the RIDB. - - diff -r 41f122255d14 -r 4393f982d18f export_to_metexp_tabular.py --- a/export_to_metexp_tabular.py Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,247 +0,0 @@ -#!/usr/bin/env python -# encoding: utf-8 -''' -Module to combine output from the GCMS Galaxy tools RankFilter, CasLookup and MsClust -into a tabular file that can be uploaded to the MetExp database. - -RankFilter, CasLookup are already combined by combine_output.py so here we will use -this result. Furthermore here one of the MsClust -quantification files containing the respective spectra details are to be combined as well. - -Extra calculations performed: -- The column MW is also added here and is derived from the column FORMULA found - in RankFilter, CasLookup combined result. - -So in total here we merge 2 files and calculate one new column. 
-''' -from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 -import csv -import re -import sys -from collections import OrderedDict - -__author__ = "Pieter Lukasse" -__contact__ = "pieter.lukasse@wur.nl" -__copyright__ = "Copyright, 2013, Plant Research International, WUR" -__license__ = "Apache v2" - -def _process_data(in_csv, delim='\t'): - ''' - Generic method to parse a tab-separated file returning a dictionary with named columns - @param in_csv: input filename to be parsed - ''' - data = list(csv.reader(open(in_csv, 'rU'), delimiter=delim)) - header = data.pop(0) - # Create dictionary with column name as key - output = OrderedDict() - for index in xrange(len(header)): - output[header[index]] = [row[index] for row in data] - return output - -ONE_TO_ONE = 'one_to_one' -N_TO_ONE = 'n_to_one' - -def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, metadata, relation_type=ONE_TO_ONE): - ''' - Merges data from both input dictionaries based on the link fields. This method will - build up a new list containing the merged hits as the items. - @param set1: dictionary holding set1 in the form of N lists (one list per attribute name) - @param set2: dictionary holding set2 in the form of N lists (one list per attribute name) - ''' - # TODO test for correct input files -> same link_field values should be there - # (test at least number of unique link_field values): - # - # if (len(set1[link_field_set1]) != len(set2[link_field_set2])): - # raise Exception('input files should have the same nr of key values ') - - - merged = [] - processed = {} - for link_field_set1_idx in xrange(len(set1[link_field_set1])): - link_field_set1_value = set1[link_field_set1][link_field_set1_idx] - if not link_field_set1_value in processed : - # keep track of processed items to not repeat them - processed[link_field_set1_value] = link_field_set1_value - - # Get the indices for current link_field_set1_value in both data-structures for proper matching - set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value] - set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value)==True ] - # Validation : - if len(set2index) == 0: - # means that corresponding data could not be found in set2, then throw error - raise Exception("Datasets not compatible, merge not possible. " + link_field_set1 + "=" + - link_field_set1_value + " only found in first dataset. ") - - merged_hits = [] - # Combine hits - for hit in xrange(len(set1index)): - # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do - # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its - # corresponding value in the sets; i.e. - # set1[key] => returns the list/array with size = nrrows, with the values for the attribute - # represented by "key". - # set1index[hit] => points to the row nr=hit (hit is a rownr/index) - # So set1[x][set1index[n]] = set1.attributeX.instanceN - # - # It just ensures the entry is made available as a plain named array for easy access. 
- rf_record = OrderedDict(zip(set1.keys(), [set1[key][set1index[hit]] for key in set1.keys()])) - if relation_type == ONE_TO_ONE : - cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[hit]] for key in set2.keys()])) - else: - # is N to 1: - cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[0]] for key in set2.keys()])) - - merged_hit = merge_function(rf_record, cl_record, metadata) - merged_hits.append(merged_hit) - - merged.append(merged_hits) - - return merged - - -def _compare_records(key1, key2): - ''' - in this case the compare method is really simple as both keys are expected to contain - same value when records are the same - ''' - if key1 == key2: - return True - else: - return False - - - -def _merge_records(rank_caslookup_combi, msclust_quant_record, metadata): - ''' - Combines single records from both the RankFilter+CasLookup combi file and from MsClust file - - @param rank_caslookup_combi: rankfilter and caslookup combined record (see combine_output.py) - @param msclust_quant_record: msclust quantification + spectrum record - ''' - record = [] - for column in rank_caslookup_combi: - record.append(rank_caslookup_combi[column]) - - for column in msclust_quant_record: - record.append(msclust_quant_record[column]) - - for column in metadata: - record.append(metadata[column]) - - # add MOLECULAR MASS (MM) - molecular_mass = get_molecular_mass(rank_caslookup_combi['FORMULA']) - # limit to two decimals: - record.append("{0:.2f}".format(molecular_mass)) - - # add MOLECULAR WEIGHT (MW) - TODO - calculate this - record.append('0.0') - - # level of identification and Location of reference standard - record.append('0') - record.append('') - - return record - - -def get_molecular_mass(formula): - ''' - Calculates the molecular mass (MM). - E.g. MM of H2O = (relative)atomic mass of H x2 + (relative)atomic mass of O - ''' - - # Each element is represented by a capital letter, followed optionally by - # lower case, with one or more digits as for how many elements: - element_pattern = re.compile("([A-Z][a-z]?)(\d*)") - - total_mass = 0 - for (element_name, count) in element_pattern.findall(formula): - if count == "": - count = 1 - else: - count = int(count) - element_mass = float(elements_and_masses_map[element_name]) # "found: Python's built-in float type has double precision " (? check if really correct ?) 
- total_mass += element_mass * count - - return total_mass - - - -def _save_data(data, headers, out_csv): - ''' - Writes tab-separated data to file - @param data: dictionary containing merged dataset - @param out_csv: output csv file - ''' - - # Open output file for writing - outfile_single_handle = open(out_csv, 'wb') - output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") - - # Write headers - output_single_handle.writerow(headers) - - # Write - for item_idx in xrange(len(data)): - for hit in data[item_idx]: - output_single_handle.writerow(hit) - - -def _get_map_for_elements_and_masses(elements_and_masses): - ''' - This method will read out the column 'Chemical symbol' and make a map - of this, storing the column 'Relative atomic mass' as its value - ''' - resultMap = {} - index = 0 - for entry in elements_and_masses['Chemical symbol']: - resultMap[entry] = elements_and_masses['Relative atomic mass'][index] - index += 1 - - return resultMap - - -def init_elements_and_masses_map(): - ''' - Initializes the lookup map containing the elements and their respective masses - ''' - elements_and_masses = _process_data(resource_filename(__name__, "static_resources/elements_and_masses.tab")) - global elements_and_masses_map - elements_and_masses_map = _get_map_for_elements_and_masses(elements_and_masses) - - -def main(): - ''' - Combine Output main function - - RankFilter, CasLookup are already combined by combine_output.py so here we will use - this result. Furthermore here the MsClust spectra file (.MSP) and one of the MsClust - quantification files are to be combined with combine_output.py result as well. - ''' - rankfilter_and_caslookup_combined_file = sys.argv[1] - msclust_quantification_and_spectra_file = sys.argv[2] - output_csv = sys.argv[3] - # metadata - metadata = OrderedDict() - metadata['organism'] = sys.argv[4] - metadata['tissue'] = sys.argv[5] - metadata['experiment_name'] = sys.argv[6] - metadata['user_name'] = sys.argv[7] - metadata['column_type'] = sys.argv[8] - - # Read RankFilter and CasLookup output files - rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file) - msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, ',') - - # Read elements and masses to use for the MW/MM calculation : - init_elements_and_masses_map() - - merged = _merge_data(rankfilter_and_caslookup_combined, 'Centrotype', - msclust_quantification_and_spectra, 'centrotype', - _compare_records, _merge_records, metadata, - N_TO_ONE) - headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys() + metadata.keys() + ['MM','MW', 'Level of identification', 'Location of reference standard'] - _save_data(merged, headers, output_csv) - - -if __name__ == '__main__': - main() diff -r 41f122255d14 -r 4393f982d18f export_to_metexp_tabular.xml --- a/export_to_metexp_tabular.xml Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,75 +0,0 @@ - - Create tabular file for loading into METabolomics EXPlorer database - - export_to_metexp_tabular.py - $rankfilter_and_caslookup_combi - $msclust_quant_file - $output_result - "$organism" - "$tissue" - "$experiment_name" - "$user_name" - "$column_type" - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -Tool to combine output from the tools RankFilter, CasLookup and MsClust -into a tabular file that can be uploaded to the METabolomics EXPlorer (MetExp) database. 
- -RankFilter, CasLookup are already combined by 'RIQC-Combine RankFilter and CasLookup' tool so here we will use -this result. - -**Notes** - -Extra calculations performed: -- The columns MM and MW are also added here and are derived from the column FORMULA found in RankFilter, CasLookup combined result. - -So in total here we merge 2 files and calculate one new column. - - - - diff -r 41f122255d14 -r 4393f982d18f library_lookup.py --- a/library_lookup.py Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,327 +0,0 @@ -''' -Logic for searching a Retention Index database file given output from NIST -''' -import match_library -import re -import sys -import csv - -__author__ = "Marcel Kempenaar" -__contact__ = "brs@nbic.nl" -__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre" -__license__ = "MIT" - -def create_lookup_table(library_file, column_type_name, statphase): - ''' - Creates a dictionary holding the contents of the library to be searched - @param library_file: library to read - @param column_type_name: the columns type name - @param statphase: the columns stationary phase - ''' - (data, header) = match_library.read_library(library_file) - # Test for presence of required columns - if ('columntype' not in header or - 'columnphasetype' not in header or - 'cas' not in header): - raise IOError('Missing columns in ', library_file) - - column_type_column = header.index("columntype") - statphase_column = header.index("columnphasetype") - cas_column = header.index("cas") - - filtered_library = [line for line in data if line[column_type_column] == column_type_name - and line[statphase_column] == statphase] - lookup_dict = {} - for element in filtered_library: - # Here the cas_number is set to the numeric part of the cas_column value, so if the - # cas_column value is 'C1433' then cas_number will be '1433' - cas_number = str(re.findall(r'\d+', (element[cas_column]).strip())[0]) - try: - lookup_dict[cas_number].append(element) - except KeyError: - lookup_dict[cas_number] = [element] - return lookup_dict - - -def _preferred(hits, pref, ctype, polar, model, method): - ''' - Returns all entries in the lookup_dict that have the same column name, type and polarity - as given by the user, uses regression if selected given the model and method to use. The - regression is applied on the column with the best R-squared value in the model - @param hits: all entries in the lookup_dict for the given CAS number - @param pref: preferred GC-column, can be one or more names - @param ctype: column type (capillary etc.) - @param polar: polarity (polar / non-polar etc.) - @param model: data loaded from file containing regression models - @param method: supported regression method (i.e. 
poly(nomial) or linear) - ''' - match = [] - for column in pref: - for hit in hits: - if hit[4] == ctype and hit[5] == polar and hit[6] == column: - # Create copy of found hit since it will be altered downstream - match.extend(hit) - return match, False - - # No hit found for current CAS number, return if not performing regression - if not model: - return False, False - - # Perform regression - for column in pref: - if column not in model: - break - # Order regression candidates by R-squared value (last element) - order = sorted(model[column].items(), key=lambda col: col[1][-1]) - # Create list of regression candidate column names - regress_columns = list(reversed([column for (column, _) in order])) - # Names of available columns - available = [hit[6] for hit in hits] - - # TODO: combine Rsquared and number of datapoints to get the best regression match - ''' - # Iterate regression columns (in order) and retrieve their models - models = {} - for col in regress_columns: - if col in available: - hit = list(hits[available.index(col)]) - if hit[4] == ctype: - # models contains all model data including residuals [-2] and rsquared [-1] - models[pref[0]] = model[pref[0]][hit[6]] - # Get the combined maximum for residuals and rsquared - best_match = models[] - # Apply regression - if method == 'poly': - regressed = _apply_poly_regression(best_match, hit[6], float(hit[3]), model) - if regressed: - hit[3] = regressed - else: - return False, False - else: - hit[3] = _apply_linear_regression(best_match, hit[6], float(hit[3]), model) - match.extend(hit) - return match, hit[6] - ''' - - for col in regress_columns: - if col in available: - hit = list(hits[available.index(col)]) - if hit[4] == ctype: - # Perform regression using a column for which regression is possible - if method == 'poly': - # Polynomial is only possible within a set border, if the RI falls outside - # of this border, skip this lookup - regressed = _apply_poly_regression(pref[0], hit[6], float(hit[3]), model) - if regressed: - hit[3] = regressed - else: - return False, False - else: - hit[3] = _apply_linear_regression(pref[0], hit[6], float(hit[3]), model) - match.extend(hit) - return match, hit[6] - - return False, False - - - -def default_hit(row, cas_nr, compound_id): - ''' - This method will return a "default"/empty hit for cases where the - method _preferred() returns False (i.e. a RI could not be found - for the given cas nr, also not via regression. 
- ''' - return [ - #'CAS', - 'C' + cas_nr, - #'NAME', - '', - #'FORMULA', - '', - #'RI', - '0.0', - #'Column.type', - '', - #'Column.phase.type', - '', - #'Column.name', - '', - #'phase.coding', - ' ', - #'CAS_column.Name', - '', - #'Centrotype', -> NOTE THAT compound_id is not ALWAYS centrotype...depends on MsClust algorithm used...for now only one MsClust algorithm is used so it is not an issue, but this should be updated/corrected once that changes - compound_id, - #'Regression.Column.Name', - '', - #'min', - '', - #'max', - '', - #'nr.duplicates', - ''] - - -def format_result(lookup_dict, nist_tabular_filename, pref, ctype, polar, model, method): - ''' - Looks up the compounds in the library lookup table and formats the results - @param lookup_dict: dictionary containing the library to be searched - @param nist_tabular_filename: NIST output file to be matched - @param pref: (list of) column-name(s) to look for - @param ctype: column type of interest - @param polar: polarity of the used column - @param model: data loaded from file containing regression models - @param method: supported regression method (i.e. poly(nomial) or linear) - ''' - (nist_tabular_list, header_clean) = match_library.read_library(nist_tabular_filename) - # Retrieve indices of the CAS and compound_id columns (exit if not present) - try: - casi = header_clean.index("cas") - idi = header_clean.index("id") - except: - raise IOError("'CAS' or 'compound_id' not found in header of library file") - - data = [] - for row in nist_tabular_list: - casf = str(row[casi].replace('-', '').strip()) - compound_id = str(row[idi].split('-')[0]) - if casf in lookup_dict: - found_hit, regress = _preferred(lookup_dict[casf], pref, ctype, polar, model, method) - if found_hit: - # Keep cas nr as 'C'+ numeric part: - found_hit[0] = 'C' + casf - # Add compound id - found_hit.insert(9, compound_id) - # Add information on regression process - found_hit.insert(10, regress if regress else 'None') - # Replace column index references with actual number of duplicates - dups = len(found_hit[-1].split(',')) - if dups > 1: - found_hit[-1] = str(dups + 1) - else: - found_hit[-1] = '0' - data.append(found_hit) - found_hit = '' - else: - data.append(default_hit(row, casf, compound_id)) - else: - data.append(default_hit(row, casf, compound_id)) - - casf = '' - compound_id = '' - found_hit = [] - dups = [] - return data - - -def _save_data(content, outfile): - ''' - Write to output file - @param content: content to write - @param outfile: file to write to - ''' - # header - header = ['CAS', - 'NAME', - 'FORMULA', - 'RI', - 'Column.type', - 'Column.phase.type', - 'Column.name', - 'phase.coding', - 'CAS_column.Name', - 'Centrotype', - 'Regression.Column.Name', - 'min', - 'max', - 'nr.duplicates'] - output_handle = csv.writer(open(outfile, 'wb'), delimiter="\t") - output_handle.writerow(header) - for entry in content: - output_handle.writerow(entry) - - -def _read_model(model_file): - ''' - Creates an easy to search dictionary for getting the regression parameters - for each valid combination of GC-columns - @param model_file: filename containing the regression models - ''' - regress = list(csv.reader(open(model_file, 'rU'), delimiter='\t')) - if len(regress.pop(0)) > 9: - method = 'poly' - else: - method = 'linear' - - model = {} - # Create new dictionary for each GC-column - for line in regress: - model[line[0]] = {} - - # Add data - for line in regress: - if method == 'poly': - model[line[0]][line[1]] = [float(col) for col in line[2:11]] - else: # linear 
- model[line[0]][line[1]] = [float(col) for col in line[2:9]] - - return model, method - - -def _apply_poly_regression(column1, column2, retention_index, model): - ''' - Calculates a new retention index (RI) value using a given 3rd-degree polynomial - model based on data from GC columns 1 and 2 - @param column1: name of the selected GC-column - @param column2: name of the GC-column to use for regression - @param retention_index: RI to convert - @param model: dictionary containing model information for all GC-columns - ''' - coeff = model[column1][column2] - # If the retention index to convert is within range of the data the model is based on, perform regression - if coeff[4] < retention_index < coeff[5]: - return (coeff[3] * (retention_index ** 3) + coeff[2] * (retention_index ** 2) + - (retention_index * coeff[1]) + coeff[0]) - else: - return False - - -def _apply_linear_regression(column1, column2, retention_index, model): - ''' - Calculates a new retention index (RI) value using a given linear model based on data - from GC columns 1 and 2 - @param column1: name of the selected GC-column - @param column2: name of the GC-column to use for regression - @param retention_index: RI to convert - @param model: dictionary containing model information for all GC-columns - ''' - # TODO: No use of limits - coeff = model[column1][column2] - return coeff[1] * retention_index + coeff[0] - - -def main(): - ''' - Library Lookup main function - ''' - library_file = sys.argv[1] - nist_tabular_filename = sys.argv[2] - ctype = sys.argv[3] - polar = sys.argv[4] - outfile = sys.argv[5] - pref = sys.argv[6:-1] - regress = sys.argv[-1] - - if regress != 'False': - model, method = _read_model(regress) - else: - model, method = False, None - - lookup_dict = create_lookup_table(library_file, ctype, polar) - data = format_result(lookup_dict, nist_tabular_filename, pref, ctype, polar, model, method) - - _save_data(data, outfile) - - -if __name__ == "__main__": - main() diff -r 41f122255d14 -r 4393f982d18f library_lookup.xml --- a/library_lookup.xml Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,75 +0,0 @@ - - Lookup or estimate the RI using a "known RI values" CAS numbers library - - library_lookup.py - $library_file - $input - "$col_type" - "$polarity" - $output - #for $ctype in $pref - ${ctype.columntype} - #end for - $regression.model - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -Performs a lookup of the RI values by matching CAS numbers from the given NIST identifications file to a library. -If a direct match is NOT found for the preferred column name, a regression can be done to find -the theoretical RI value based on known RI values for the CAS number on other column types (see step 4). -If there is no match for the CAS number on any column type, then the record is not given a RI. - - - - - - diff -r 41f122255d14 -r 4393f982d18f match_library.py --- a/match_library.py Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,133 +0,0 @@ -''' -Containing functions are called from Galaxy to populate lists/checkboxes with selectable items -''' -import csv -import glob -import os - - -__author__ = "Marcel Kempenaar" -__contact__ = "brs@nbic.nl" -__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre" -__license__ = "MIT" - -def get_column_type(library_file): - ''' - Returns a Galaxy formatted list of tuples containing all possibilities for the - GC-column types. 
Used by the library_lookup.xml tool - @param library_file: given library file from which the list of GC-column types is extracted - ''' - if library_file == "": - galaxy_output = [("", "", False)] - else: - (data, header) = read_library(library_file) - - if 'columntype' not in header: - raise IOError('Missing columns in ', library_file) - - # Filter data on column type - column_type = header.index("columntype") - amounts_in_list_dict = count_occurrence([row[column_type] for row in data]) - galaxy_output = [(str(a) + "(" + str(b) + ")", a, False) for a, b in amounts_in_list_dict.items()] - - return(galaxy_output) - - -def filter_column(library_file, column_type_name): - ''' - Filters the Retention Index database on column type - @param library_file: file containing the database - @param column_type_name: column type to filter on - ''' - if library_file == "": - galaxy_output = [("", "", False)] - else: - (data, header) = read_library(library_file) - - if ('columntype' not in header or - 'columnphasetype' not in header): - raise IOError('Missing columns in ', library_file) - - column_type = header.index("columntype") - statphase = header.index("columnphasetype") - - # Filter data on colunn type name - statphase_list = [line[statphase] for line in data if line[column_type] == column_type_name] - amounts_in_list_dict = count_occurrence(statphase_list) - galaxy_output = [(str(a) + "(" + str(b) + ")", a, False)for a, b in amounts_in_list_dict.items()] - - return(sorted(galaxy_output)) - - -def filter_column2(library_file, column_type_name, statphase): - ''' - Filters the Retention Index database on column type - @param library_file: file containing the database - @param column_type_name: column type to filter on - @param statphase: stationary phase of the column to filter on - ''' - if library_file == "": - galaxy_output = [("", "", False)] - else: - (data, header) = read_library(library_file) - - if ('columntype' not in header or - 'columnphasetype' not in header or - 'columnname' not in header): - raise IOError('Missing columns in ', library_file) - - column_type_column = header.index("columntype") - statphase_column = header.index("columnphasetype") - column_name_column = header.index("columnname") - - # Filter data on given column type name and stationary phase - statphase_list = [line[column_name_column] for line in data if line[column_type_column] == column_type_name and - line[statphase_column] == statphase] - amounts_in_list_dict = count_occurrence(statphase_list) - galaxy_output = [(str(a) + "(" + str(b) + ")", a, False)for a, b in amounts_in_list_dict.items()] - - return(sorted(galaxy_output)) - - -def read_library(filename): - ''' - Reads a CSV file and returns its contents and a normalized header - @param filename: file to read - ''' - data = list(csv.reader(open(filename, 'rU'), delimiter='\t')) - header_clean = [i.lower().strip().replace(".", "").replace("%", "") for i in data.pop(0)] - return(data, header_clean) - - - -def get_directory_files(dir_name): - ''' - Reads the directory and - returns the list of .txt files found as a dictionary - with file name and full path so that it can - fill a Galaxy drop-down combo box. 
- - ''' - files = glob.glob(dir_name + "/*.*") - if len(files) == 0: - # Configuration error: no library files found in /" + dir_name : - galaxy_output = [("Configuration error: expected file not found in /" + dir_name, "", False)] - else: - galaxy_output = [(str(get_file_name_no_ext(file_name)), str(os.path.abspath(file_name)), False) for file_name in files] - return(galaxy_output) - -def get_file_name_no_ext(full_name): - ''' - returns just the last part of the name - ''' - simple_name = os.path.basename(full_name) - base, ext = os.path.splitext(simple_name) - return base - - -def count_occurrence(data_list): - ''' - Counts occurrences in a list and returns a dict with item:occurrence - @param data_list: list to count items from - ''' - return dict((key, data_list.count(key)) for key in set(data_list)) diff -r 41f122255d14 -r 4393f982d18f metaMS/INFO_structure_of_LC.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS/INFO_structure_of_LC.txt Thu Mar 19 12:22:23 2015 +0100 @@ -0,0 +1,532 @@ +str(LC) +List of 4 + $ PeakTable :'data.frame': 1828 obs. of 12 variables: + ..$ ChemSpiderID : chr [1:1828] "" "" "" "" ... + ..$ compound : chr [1:1828] "" "" "" "" ... + ..$ pcgroup : num [1:1828] 187 226 226 226 226 226 226 306 154 154 ... + ..$ adduct : chr [1:1828] "" "" "" "" ... + ..$ isotopes : chr [1:1828] "" "" "" "" ... + ..$ mz : num [1:1828] 253 233 214 215 298 ... + ..$ rt : num [1:1828] 1.27 1.57 1.59 1.59 1.59 ... + ..$ CLASS2 : num [1:1828] 2 2 2 2 2 2 2 2 2 2 ... + ..$ STDmix_GC_01 : num [1:1828] 0 0 0 0 0 0 0 0 0 0 ... + ..$ STDmix_GC_02 : num [1:1828] 0 0 0 0 0 0 0 0 0 0 ... + ..$ STDmix_RP_pos01: num [1:1828] 144.5 32.6 58.5 38.4 152.3 ... + ..$ STDmix_RP_pos02: num [1:1828] 343.4 54.5 79.1 64.9 210.8 ... + $ xset :Formal class 'xsAnnotate' [package "CAMERA"] with 15 slots + .. ..@ groupInfo : num [1:1828, 1:13] 30.2 30.5 31.4 32.4 33.4 ... + .. .. ..- attr(*, "dimnames")=List of 2 + .. .. .. ..$ : NULL + .. .. .. ..$ : chr [1:13] "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "npeaks", "CLASS1", "CLASS2", "STDmix_GC_01", "STDmix_GC_02", "STDmix_RP_pos01", "STDmix_RP_pos02" + + and groups() function: + "mzmed", "mzmin", "mzmax", "rtmed", "rtmin", "rtmax", "npeaks", "CLASS1", "CLASS2" + + .. ..@ pspectra :List of 340 + .. .. ..$ : num [1:12] 2 4 5 10 11 13 14 15 16 38 ... + .. .. ..$ : num [1:11] 3 12 45 76 99 ... + .. .. ..$ : num [1:15] 7 8 9 113 162 ... + .. .. ..$ : Named num 1 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : num [1:5] 6 150 473 478 802 + .. .. ..$ : num [1:5] 18 28 40 64 85 + .. .. ..$ : num [1:5] 89 528 533 596 872 + .. .. ..$ : num [1:2] 57 197 + .. .. ..$ : num [1:3] 174 708 841 + .. .. ..$ : Named num 23 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : num [1:4] 21 34 496 1497 + .. .. ..$ : num [1:3] 29 30 44 + .. .. ..$ : num [1:5] 283 423 452 486 527 + .. .. ..$ : num [1:6] 93 126 251 296 349 406 + .. .. ..$ : num [1:3] 301 479 664 + .. .. ..$ : num [1:5] 47 105 111 191 237 + .. .. ..$ : num [1:5] 185 562 683 923 1548 + .. .. ..$ : num [1:2] 469 809 + .. .. ..$ : num [1:9] 24 146 548 550 1009 ... + .. .. ..$ : num [1:33] 65 72 134 139 196 200 235 299 355 361 ... + .. .. ..$ : num [1:2] 135 234 + .. .. ..$ : num [1:4] 95 326 330 465 + .. .. ..$ : num [1:2] 384 699 + .. .. ..$ : Named num 388 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : Named num 249 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : num [1:4] 425 485 636 698 + .. .. ..$ : Named num 482 + .. .. .. ..- attr(*, "names")= chr "" + .. .. 
..$ : num [1:2] 310 429 + .. .. ..$ : num [1:97] 33 51 67 96 103 106 114 129 136 140 ... + .. .. ..$ : Named num 436 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : num [1:3] 583 790 1029 + .. .. ..$ : Named num 593 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : num [1:2] 558 745 + .. .. ..$ : Named num 543 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : num [1:44] 43 60 101 121 124 179 211 214 257 263 ... + .. .. ..$ : num [1:2] 243 941 + .. .. ..$ : num [1:4] 657 996 1286 1344 + .. .. ..$ : num [1:21] 145 195 199 217 230 268 271 279 665 743 ... + .. .. ..$ : Named num 836 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : Named num 694 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : Named num 578 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : Named num 612 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : Named num 684 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : num [1:18] 31 163 194 395 441 ... + .. .. ..$ : num [1:2] 79 282 + .. .. ..$ : Named num 297 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : num [1:10] 167 169 281 284 750 ... + .. .. ..$ : num [1:30] 165 796 812 821 829 ... + .. .. ..$ : num [1:4] 112 566 733 1552 + .. .. ..$ : num [1:33] 42 63 68 90 92 102 125 130 133 137 ... + .. .. ..$ : num [1:14] 183 188 435 960 1036 ... + .. .. ..$ : Named num 763 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : Named num 704 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : num [1:2] 693 797 + .. .. ..$ : Named num 643 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : num [1:45] 62 100 115 128 132 178 327 366 430 492 ... + .. .. ..$ : Named num 838 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : Named num 653 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : num [1:9] 182 186 1035 1040 1045 ... + .. .. ..$ : Named num 250 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : num [1:2] 144 935 + .. .. ..$ : num [1:3] 32 1259 1394 + .. .. ..$ : Named num 77 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : num [1:27] 19 46 49 107 110 157 159 233 265 314 ... + .. .. ..$ : num [1:4] 253 410 777 991 + .. .. ..$ : Named num 1031 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : Named num 978 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : num [1:22] 887 894 1023 1028 1037 ... + .. .. ..$ : num [1:5] 258 312 1669 1670 1694 + .. .. ..$ : num [1:19] 319 730 735 888 895 ... + .. .. ..$ : num [1:22] 55 164 541 617 649 656 661 668 753 766 ... + .. .. ..$ : num [1:7] 513 545 770 773 1080 ... + .. .. ..$ : num [1:4] 70 295 634 1665 + .. .. ..$ : num [1:3] 625 1447 1453 + .. .. ..$ : Named num 287 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : num [1:2] 964 970 + .. .. ..$ : num [1:153] 20 22 36 37 39 53 56 58 59 61 ... + .. .. ..$ : num [1:9] 317 323 420 886 1022 ... + .. .. ..$ : Named num 149 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : num [1:6] 487 549 609 623 728 ... + .. .. ..$ : num [1:6] 315 476 1265 1272 1634 ... + .. .. ..$ : num [1:5] 378 383 495 734 1063 + .. .. ..$ : num [1:10] 645 818 823 914 919 ... + .. .. ..$ : num [1:7] 815 1388 1457 1539 1555 ... + .. .. ..$ : Named num 1019 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : num [1:6] 391 398 491 742 982 ... + .. .. ..$ : Named num 546 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : num [1:12] 794 804 811 820 828 ... + .. .. ..$ : num [1:24] 1089 1475 1481 1484 1491 ... + .. .. ..$ : num [1:2] 148 151 + .. .. ..$ : num [1:33] 74 104 108 147 181 189 224 232 260 298 ... + .. .. 
..$ : num [1:17] 293 332 394 447 449 552 619 622 680 805 ... + .. .. ..$ : Named num 309 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : Named num 207 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : Named num 584 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : num [1:2] 637 865 + .. .. ..$ : Named num 143 + .. .. .. ..- attr(*, "names")= chr "" + .. .. ..$ : num [1:18] 1737 1738 1741 1757 1758 ... + .. .. ..$ : num [1:29] 261 352 360 363 402 408 416 419 456 507 ... + .. .. .. [list output truncated] + .. ..@ psSamples : int [1:340] 1 1 1 2 2 2 2 1 2 1 ... + .. ..@ isotopes :List of 1828 + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ :List of 4 + .. .. .. ..$ y : num 1 + .. .. .. ..$ iso : num 0 + .. .. .. ..$ charge: Named num 1 + .. .. .. .. ..- attr(*, "names")= chr "charge" + .. .. .. ..$ val : Named num 0 + .. .. .. .. ..- attr(*, "names")= chr "intrinsic" + .. .. ..$ :List of 4 + .. .. .. ..$ y : num 1 + .. .. .. ..$ iso : Named num 1 + .. .. .. .. ..- attr(*, "names")= chr "iso" + .. .. .. ..$ charge: Named num 1 + .. .. .. .. ..- attr(*, "names")= chr "charge" + .. .. .. ..$ val : Named num 0 + .. .. .. .. ..- attr(*, "names")= chr "intrinsic" + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ :List of 4 + .. .. .. ..$ y : num 2 + .. .. .. ..$ iso : num 0 + .. .. .. ..$ charge: Named num 1 + .. .. .. .. ..- attr(*, "names")= chr "charge" + .. .. .. ..$ val : Named num 0 + .. .. .. .. ..- attr(*, "names")= chr "intrinsic" + .. .. ..$ : NULL + .. .. ..$ :List of 4 + .. .. .. ..$ y : num 2 + .. .. .. ..$ iso : Named num 1 + .. .. .. .. ..- attr(*, "names")= chr "iso" + .. .. .. ..$ charge: Named num 1 + .. .. .. .. ..- attr(*, "names")= chr "charge" + .. .. .. ..$ val : Named num 0 + .. .. .. .. ..- attr(*, "names")= chr "intrinsic" + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. ..$ : NULL + .. .. .. [list output truncated] + .. 
..@ derivativeIons : list() + .. ..@ formula : list() + .. ..@ sample : num NA + .. ..@ xcmsSet :Formal class 'xcmsSet' [package "xcms"] with 12 slots + .. .. .. ..@ peaks : num [1:9826, 1:13] 30.2 30.5 31.4 32.4 33.3 ... + .. .. .. .. ..- attr(*, "dimnames")=List of 2 + .. .. .. .. .. ..$ : NULL + .. .. .. .. .. ..$ : chr [1:13] "mz", "mzmin", "mzmax", "rt", "rtmin", "rtmax", "into", "intf", "maxo", "maxf", "i", "sn", "sample" (found with dput) + From http://www.bioconductor.org/packages/release/bioc/manuals/xcms/man/xcms.pdf: + + mz weighted (by intensity) mean of peak m/z across scans +mzmin m/z of minimum step +mzmax m/z of maximum step +rt retention time of peak midpoint +rtmin leading edge of peak retention time +rtmax trailing edge of peak retention time +into integrated area of original (raw) peak +intf integrated area of filtered peak +maxo maximum intensity of original (raw) peak +maxf maximum intensity of filtered peak +i rank of peak identified in merged EIC (<= max) +sn signal to noise ratio of the peak + + .. .. .. ..@ groups : num [1:1828, 1:9] 30.2 30.5 31.4 32.4 33.4 ... + .. .. .. .. ..- attr(*, "dimnames")=List of 2 + .. .. .. .. .. ..$ : NULL + .. .. .. .. .. ..$ : chr [1:9] "mzmed" "mzmin" "mzmax" "rtmed" ... + .. .. .. ..@ groupidx :List of 1828 + .. .. .. .. ..$ : int [1:4] 1 674 9671 9749 + .. .. .. .. ..$ : int [1:4] 2 675 9672 9750 + .. .. .. .. ..$ : int [1:4] 3 676 9673 9751 + .. .. .. .. ..$ : int [1:4] 4 677 9674 9752 + .. .. .. .. ..$ : int [1:4] 6 678 9675 9753 + .. .. .. .. ..$ : int [1:4] 9 683 9676 9754 + .. .. .. .. ..$ : int [1:4] 11 684 9677 9755 + .. .. .. .. ..$ : int [1:4] 13 686 9678 9756 + .. .. .. .. ..$ : int [1:4] 17 688 9679 9757 + .. .. .. .. ..$ : int [1:4] 18 689 9680 9758 + .. .. .. .. ..$ : int [1:4] 19 691 9681 9759 + .. .. .. .. ..$ : int [1:4] 21 693 9682 9760 + .. .. .. .. ..$ : int [1:4] 23 695 9683 9761 + .. .. .. .. ..$ : int [1:4] 24 699 9684 9762 + .. .. .. .. ..$ : int [1:4] 26 700 9685 9763 + .. .. .. .. ..$ : int [1:4] 28 702 9686 9764 + .. .. .. .. ..$ : int [1:4] 1353 3930 6183 7926 + .. .. .. .. ..$ : int [1:4] 36 707 9687 9765 + .. .. .. .. ..$ : int [1:4] 1354 3931 6184 7927 + .. .. .. .. ..$ : int [1:4] 1355 3932 6185 7928 + .. .. .. .. ..$ : int [1:4] 39 712 9688 9766 + .. .. .. .. ..$ : int [1:4] 1356 3934 6186 7929 + .. .. .. .. ..$ : int [1:4] 40 714 9689 9767 + .. .. .. .. ..$ : int [1:4] 1358 3935 6187 7930 + .. .. .. .. ..$ : int [1:4] 1359 3936 6188 7931 + .. .. .. .. ..$ : int [1:4] 1361 3938 6189 7932 + .. .. .. .. ..$ : int [1:4] 1360 3937 6190 7933 + .. .. .. .. ..$ : int [1:4] 45 722 9690 9768 + .. .. .. .. ..$ : int [1:4] 54 735 9691 9769 + .. .. .. .. ..$ : int [1:4] 57 737 9692 9770 + .. .. .. .. ..$ : int [1:4] 1365 3939 6191 7934 + .. .. .. .. ..$ : int [1:4] 1364 3940 6192 7935 + .. .. .. .. ..$ : int [1:4] 1369 3941 6193 7936 + .. .. .. .. ..$ : int [1:4] 58 739 9693 9771 + .. .. .. .. ..$ : int [1:4] 1370 3942 6194 7937 + .. .. .. .. ..$ : int [1:4] 1371 3943 6195 7938 + .. .. .. .. ..$ : int [1:4] 1374 3944 6196 7939 + .. .. .. .. ..$ : int [1:4] 65 748 9694 9772 + .. .. .. .. ..$ : int [1:4] 1375 3946 6197 7940 + .. .. .. .. ..$ : int [1:4] 71 755 9695 9773 + .. .. .. .. ..$ : int [1:5] 72 756 1376 1377 3947 + .. .. .. .. ..$ : int [1:4] 1378 3948 6198 7941 + .. .. .. .. ..$ : int [1:4] 1379 3950 6199 7942 + .. .. .. .. ..$ : int [1:4] 74 758 9696 9774 + .. .. .. .. ..$ : int [1:4] 1381 3951 6200 7943 + .. .. .. .. ..$ : int [1:4] 1382 3952 6201 7944 + .. .. .. .. 
..$ : int [1:4] 78 761 9697 9775 + .. .. .. .. ..$ : int [1:4] 1383 3955 6202 7945 + .. .. .. .. ..$ : int [1:4] 1384 3956 6203 7946 + .. .. .. .. ..$ : int [1:4] 1385 3957 6204 7947 + .. .. .. .. ..$ : int [1:4] 1387 3958 6205 7948 + .. .. .. .. ..$ : int [1:4] 1388 3959 6206 7949 + .. .. .. .. ..$ : int [1:4] 1389 3960 6207 7950 + .. .. .. .. ..$ : int [1:4] 1390 3962 6208 7951 + .. .. .. .. ..$ : int [1:4] 1391 3963 6209 7952 + .. .. .. .. ..$ : int [1:4] 1392 3965 6210 7953 + .. .. .. .. ..$ : int [1:4] 90 776 9698 9776 + .. .. .. .. ..$ : int [1:4] 1393 3966 6211 7954 + .. .. .. .. ..$ : int [1:4] 1394 3967 6212 7955 + .. .. .. .. ..$ : int [1:4] 1395 3968 6213 7956 + .. .. .. .. ..$ : int [1:4] 1397 3970 6214 7957 + .. .. .. .. ..$ : int [1:4] 1399 3972 6215 7958 + .. .. .. .. ..$ : int [1:4] 1398 3971 6216 7959 + .. .. .. .. ..$ : int [1:4] 101 788 9699 9777 + .. .. .. .. ..$ : int [1:4] 100 790 9700 9778 + .. .. .. .. ..$ : int [1:4] 1401 3973 6217 7960 + .. .. .. .. ..$ : int [1:4] 1403 3975 6218 7961 + .. .. .. .. ..$ : int [1:4] 1402 3974 6219 7962 + .. .. .. .. ..$ : int [1:4] 1404 3976 6220 7963 + .. .. .. .. ..$ : int [1:4] 1405 3977 6221 7964 + .. .. .. .. ..$ : int [1:4] 1406 3978 6222 7965 + .. .. .. .. ..$ : int [1:4] 1407 3979 6223 7966 + .. .. .. .. ..$ : int [1:4] 1408 3980 6224 7967 + .. .. .. .. ..$ : int [1:4] 1409 3981 6225 7968 + .. .. .. .. ..$ : int [1:4] 1412 3982 6226 7969 + .. .. .. .. ..$ : int [1:4] 1413 3983 6227 7970 + .. .. .. .. ..$ : int [1:4] 1414 3984 6228 7971 + .. .. .. .. ..$ : int [1:4] 1415 3985 6229 7972 + .. .. .. .. ..$ : int [1:4] 1417 3987 6230 7973 + .. .. .. .. ..$ : int [1:4] 1416 3988 6231 7974 + .. .. .. .. ..$ : int [1:4] 1419 3989 6232 7975 + .. .. .. .. ..$ : int [1:4] 1421 3991 6233 7976 + .. .. .. .. ..$ : int [1:4] 1420 3990 6234 7977 + .. .. .. .. ..$ : int [1:4] 1422 3992 6235 7978 + .. .. .. .. ..$ : int [1:4] 115 802 9701 9779 + .. .. .. .. ..$ : int [1:4] 1423 3993 6236 7979 + .. .. .. .. ..$ : int [1:4] 1425 3995 6237 7980 + .. .. .. .. ..$ : int [1:4] 1426 3996 6238 7981 + .. .. .. .. ..$ : int [1:4] 120 807 9702 9780 + .. .. .. .. ..$ : int [1:4] 1427 3997 6239 7982 + .. .. .. .. ..$ : int [1:4] 1430 3998 6240 7983 + .. .. .. .. ..$ : int [1:4] 1431 3999 6241 7984 + .. .. .. .. ..$ : int [1:4] 123 812 9703 9781 + .. .. .. .. ..$ : int [1:4] 1432 4000 6242 7985 + .. .. .. .. ..$ : int [1:4] 1433 4001 6243 7986 + .. .. .. .. ..$ : int [1:4] 1434 4002 6244 7987 + .. .. .. .. ..$ : int [1:4] 1436 4005 6245 7988 + .. .. .. .. ..$ : int [1:4] 1435 4004 6246 7989 + .. .. .. .. ..$ : int [1:4] 1437 4006 6247 7990 + .. .. .. .. .. [list output truncated] + .. .. .. ..@ filled : int [1:3644] 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 ... + .. .. .. ..@ phenoData :'data.frame': 4 obs. of 1 variable: + .. .. .. .. ..$ class: Factor w/ 2 levels "CLASS1","CLASS2": 1 1 2 2 + .. .. .. ..@ rt :List of 2 + .. .. .. .. ..$ raw :List of 4 + .. .. .. .. .. ..$ : num [1:10136] 194 194 194 194 195 ... + .. .. .. .. .. ..$ : num [1:10138] 194 194 194 194 195 ... + .. .. .. .. .. ..$ : num [1:2614] 3.46 4.75 6.04 7.32 8.61 ... + .. .. .. .. .. ..$ : num [1:2613] 3.47 4.77 6.06 7.37 8.66 ... + .. .. .. .. ..$ corrected:List of 4 + .. .. .. .. .. ..$ : num [1:10136] 168 169 169 169 169 ... + .. .. .. .. .. ..$ : num [1:10138] 157 158 158 158 158 ... + .. .. .. .. .. ..$ : num [1:2614] 40.5 41.8 43.1 44.4 45.7 ... + .. .. .. .. .. ..$ : num [1:2613] 40.7 42 43.3 44.6 45.9 ... + .. .. .. 
..@ filepaths : chr [1:4] "E:/workspace/PRIMS-metabolomics/python-tools/tools/metaMS/test/zipOut/CLASS1/STDmix_GC_01.CDF" "E:/workspace/PRIMS-metabolomics/python-tools/tools/metaMS/test/zipOut/CLASS1/STDmix_GC_02.CDF" "E:/workspace/PRIMS-metabolomics/python-tools/tools/metaMS/test/zipOut/CLASS2/STDmix_RP_pos01.CDF" "E:/workspace/PRIMS-metabolomics/python-tools/tools/metaMS/test/zipOut/CLASS2/STDmix_RP_pos02.CDF" + .. .. .. ..@ profinfo :List of 2 + .. .. .. .. ..$ method: chr "bin" + .. .. .. .. ..$ step : num 0.05 + .. .. .. ..@ dataCorrection : int(0) + .. .. .. ..@ polarity : chr(0) + .. .. .. ..@ progressInfo :List of 12 + .. .. .. .. ..$ group.density : num 0 + .. .. .. .. ..$ group.mzClust : num 0 + .. .. .. .. ..$ group.nearest : num 0 + .. .. .. .. ..$ findPeaks.centWave : num 0 + .. .. .. .. ..$ findPeaks.massifquant : num 0 + .. .. .. .. ..$ findPeaks.matchedFilter: num 0 + .. .. .. .. ..$ findPeaks.MS1 : num 0 + .. .. .. .. ..$ findPeaks.MSW : num 0 + .. .. .. .. ..$ retcor.obiwarp : num 0 + .. .. .. .. ..$ retcor.peakgroups : num 0 + .. .. .. .. ..$ fillPeaks.chrom : num 0 + .. .. .. .. ..$ fillPeaks.MSW : num 0 + .. .. .. ..@ progressCallback:function (progress) + .. ..@ ruleset : NULL + .. ..@ annoID : logi[0 , 1:4] + .. .. ..- attr(*, "dimnames")=List of 2 + .. .. .. ..$ : NULL + .. .. .. ..$ : chr [1:4] "id" "grpID" "ruleID" "parentID" + .. ..@ annoGrp : logi[0 , 1:4] + .. .. ..- attr(*, "dimnames")=List of 2 + .. .. .. ..$ : NULL + .. .. .. ..$ : chr [1:4] "id" "mass" "ips" "psgrp" + .. ..@ isoID : num [1:315, 1:4] 14 90 107 121 134 137 148 152 157 170 ... + .. .. ..- attr(*, "dimnames")=List of 2 + .. .. .. ..$ : NULL + .. .. .. ..$ : chr [1:4] "mpeak" "isopeak" "iso" "charge" + .. ..@ polarity : chr "" + .. ..@ runParallel :List of 1 + .. .. ..$ enable: num 0 + .. ..@ .__classVersion__:Formal class 'Versions' [package "Biobase"] with 1 slots + .. .. .. ..@ .Data:List of 1 + .. .. .. .. ..$ : int [1:3] 1 13 333 + $ Annotation:List of 5 + ..$ annotation.table :'data.frame': 27 obs. of 13 variables: + .. ..$ feature : int [1:27] 190 193 195 206 208 209 212 367 368 371 ... + .. ..$ db_position : int [1:27] 21 22 23 21 23 24 22 1 2 4 ... + .. ..$ ChemSpiderID: int [1:27] 10463792 10463792 10463792 10463792 10463792 10463792 10463792 8711 8711 8711 ... + .. ..$ mz : num [1:27] 137 348 696 137 696 ... + .. ..$ rt : num [1:27] 10.8 10.8 10.8 11.2 11.2 ... + .. ..$ I : num [1:27] 10660 3754.5 32.4 5127 58.2 ... + .. ..$ compound : Factor w/ 4 levels "D-(+)-Catechin",..: 4 4 4 4 4 4 4 1 1 1 ... + .. ..$ db_mz : num [1:27] 137 348 696 137 696 ... + .. ..$ db_rt : num [1:27] 10.8 10.8 10.8 10.8 10.8 ... + .. ..$ db_I : num [1:27] 33.4 4478.7 58.2 33.4 58.2 ... + .. ..$ db_ann : Factor w/ 1 level "automatic": 1 1 1 1 1 1 1 1 1 1 ... + .. ..$ mz.err : num [1:27] 0.005 0.005 0.005 0.005 0.005 0.005 0.005 0.005 0.005 0.005 ... + .. ..$ clid : int [1:27] 1 1 1 2 2 2 2 2 2 2 ... + ..$ compounds : chr [1:4] "adenosine 2'-monophosphate" "D-(+)-Catechin" "malvidin-3-glucoside" "rutin " + ..$ ChemSpiderIDs : int [1:4] 10463792 8711 391785 4444362 + ..$ multiple.annotations: num(0) + ..$ ann.features : int [1:27] 190 193 195 206 208 209 212 367 368 371 ... + $ Settings :Formal class 'metaMSsettings' [package "metaMS"] with 30 slots + .. ..@ protocolName : chr "Synapt.QTOF.RP" + .. ..@ chrom : chr "LC" + .. ..@ PeakPicking :List of 5 + .. .. ..$ method : chr "matchedFilter" + .. .. ..$ step : num 0.05 + .. .. ..$ fwhm : num 20 + .. .. ..$ snthresh: num 4 + .. .. ..$ max : num 50 + .. 
..@ Alignment :List of 9 + .. .. ..$ min.class.fraction: num 0.3 + .. .. ..$ min.class.size : num 3 + .. .. ..$ mzwid : num 0.1 + .. .. ..$ bws : num [1:2] 30 10 + .. .. ..$ missingratio : num 0.2 + .. .. ..$ extraratio : num 0.1 + .. .. ..$ retcormethod : chr "linear" + .. .. ..$ retcorfamily : chr "symmetric" + .. .. ..$ fillPeaks : logi TRUE + .. ..@ CAMERA :List of 3 + .. .. ..$ perfwhm : num 0.6 + .. .. ..$ cor_eic_th: num 0.7 + .. .. ..$ ppm : num 5 + .. ..@ match2DB.rtdiff : num 1.5 + .. ..@ match2DB.minfeat : num 2 + .. ..@ match2DB.rtval : num 0.1 + .. ..@ match2DB.mzdiff : num 0.005 + .. ..@ match2DB.ppm : num 5 + .. ..@ match2DB.simthresh : num(0) + .. ..@ match2DB.timeComparison : chr(0) + .. ..@ match2DB.RIdiff : num(0) + .. ..@ DBconstruction.minfeat : num(0) + .. ..@ DBconstruction.rttol : num(0) + .. ..@ DBconstruction.mztol : num(0) + .. ..@ DBconstruction.minintens : num(0) + .. ..@ DBconstruction.intensityMeasure : chr(0) + .. ..@ DBconstruction.DBthreshold : num(0) + .. ..@ matchIrrelevants.irrelevantClasses: chr(0) + .. ..@ matchIrrelevants.simthresh : num(0) + .. ..@ matchIrrelevants.timeComparison : chr(0) + .. ..@ matchIrrelevants.RIdiff : num(0) + .. ..@ matchIrrelevants.rtdiff : num(0) + .. ..@ betweenSamples.min.class.fraction : num(0) + .. ..@ betweenSamples.min.class.size : num(0) + .. ..@ betweenSamples.timeComparison : chr(0) + .. ..@ betweenSamples.rtdiff : num(0) + .. ..@ betweenSamples.RIdiff : num(0) + .. ..@ betweenSamples.simthresh : num(0) diff -r 41f122255d14 -r 4393f982d18f metaMS/README.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS/README.txt Thu Mar 19 12:22:23 2015 +0100 @@ -0,0 +1,70 @@ +Wrappers for: +- the metaMS R package by Ron Wehrens. +- the xcms package. + +Wrappers written by Pieter Lukasse. + + +Installation (when updating: close all vignettes [i.e. pdfs/manuals] !) 
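+ +A quick way to verify the installation (a minimal sketch, not part of the original README): after completing the R steps below, start R and check that the packages load. library() will fail at this point if e.g. the NetCDF headers were missing during compilation (see Troubleshooting further below). + +In R: +library(metaMS) +library(multtest) +library(snow) +library(Cairo) +packageVersion("metaMS")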
+ +In R: +source("http://bioconductor.org/biocLite.R") +biocLite("metaMS") +biocLite("multtest") +#biocLite("R2HTML") +# for "multi-threading" (actually starts multiple R processes for parallel processing): +install.packages("snow") +install.packages("Cairo") + +======Run metaMS_cmd_pick_and_group.r with:================= + +E:\workspace\PRIMS-metabolomics\python-tools\tools\metaMS>Rscript metaMS_cmd_pick_and_group.r test/extdata.zip test/example_settings.txt test/out/peakTable.txt test/out/xsAnnotatePrep.rdata positive test/out/html_peaks.html test/out + + +======Run metaMS_cmd_annotate.r with:================= + +E:\workspace\PRIMS-metabolomics\python-tools\tools\metaMS>Rscript metaMS_cmd_annotate.r test/LCDBtest.RData test/out/xsAnnotatePrep.rdata test/example_settings.txt test/out/annotationTable.txt "0" test/out/html_annot.html test/out + + +======Run xcms_differential_analysis.r with:================= + +E:\workspace\PRIMS-metabolomics\python-tools\tools\metaMS>Rscript xcms_differential_analysis.r test/out/xsAnnotatePrep.rdata "CLASS1" "CLASS2" 10 test/out2/outtable.tsv test/out2/html/html.html test/out2/html + + +======Run xcms_get_alignment_eic.r or xcms_get_mass_eic.r with:================= + + +E:\workspace\PRIMS-metabolomics\python-tools\tools\metaMS>Rscript xcms_get_alignment_eic.r test/out/xsAnnotatePrep.rdata 10 300 3 STDmix_GC_01,STDmix_GC_02 test/out3/html/html.html test/out3/html + +OR + +E:\workspace\PRIMS-metabolomics\python-tools\tools\metaMS>Rscript xcms_get_mass_eic.r test/out/xsAnnotatePrep.rdata 10 3000 -1 -1 "77.98,231.96" 5 STDmix_GC_01,STDmix_GC_02 Yes raw test/out4/html/html.html test/out4/html + + + +!!!!!!!!!!!!!!!!Troubleshooting:!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + +NetCDF is required. If the following is found in the installation log + +In file included from rnetCDF.c:2:0: +rnetCDF.h:1:20: fatal error: netcdf.h: No such file or directory +compilation terminated. + + +then metaMS will not have been installed, and running the tool will fail with an error. + + +Possible solution: +> Install the -dev versions of those packages to get the headers that are + required to compile the package. In this case, you need libnetcdf-dev, udunits-bin and libudunits2-dev + (from http://stackoverflow.com/questions/11319698/how-to-install-r-packages-rnetcdf-and-ncdf-on-ubuntu) +So +>>sudo apt-get install libnetcdf-dev udunits-bin libudunits2-dev + +Cairo is also required (for plotting without X11). If the installation log reports a Cairo/X11 problem: + +Possible solution: +> install of cairo (http://www.cairographics.org/) and/or set CAIRO_CFLAGS/LIBS correspondingly. +So +>>sudo apt-get install libcairo2-dev diff -r 41f122255d14 -r 4393f982d18f metaMS/__init__.py diff -r 41f122255d14 -r 4393f982d18f metaMS/metaMS_cmd_annotate.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS/metaMS_cmd_annotate.r Thu Mar 19 12:22:23 2015 +0100 @@ -0,0 +1,86 @@ +## read args: +args <- commandArgs(TRUE) +## the constructed DB, e.g. "E:/Rworkspace/metaMS/data/LCDBtest.RData" +args.constructedDB <- args[1] +## data file in xset format: +args.xsetData <- args[2] +## settings file, e.g. "E:/Rworkspace/metaMS/data/settings.r", should contain assignment to an object named "customMetaMSsettings" +args.settings <- args[3] + +## output file names, e.g. 
"E:/Rworkspace/metaMS/data/out.txt" +args.outAnnotationTable <- args[4] + +args.mass_error_function <- args[5] +if (args.mass_error_function == "0") + args.mass_error_function <- NULL +## report files +args.htmlReportFile <- args[6] +args.htmlReportFile.files_path <- args[7] + +if (length(args) == 8) +{ + args.outLogFile <- args[8] + # suppress messages: + # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888 + msg <- file(args.outLogFile, open="wt") + sink(msg, type="message") + sink(msg, type="output") +} + +cat("\nSettings used===============:\n") +cat(readChar(args.settings, 1e5)) + + +tryCatch( + { + library(metaMS) + + ## load the constructed DB : + tempEnv <- new.env() + testDB <- load(args.constructedDB, envir=tempEnv) + xsetData <- readRDS(args.xsetData) + + ## load settings "script" into "customMetaMSsettings" + source(args.settings, local=tempEnv) + message(paste(" loaded : ", args.settings)) + + # Just to highlight: if you want to use more than one + # trigger runLC: + LC <- runLC(xset=xsetData, settings = tempEnv[["customMetaMSsettings"]], DB = tempEnv[[testDB[1]]]$DB, errf=args.mass_error_function, nSlaves=20, returnXset = TRUE) + + # write out runLC annotation results: + write.table(LC$PeakTable, args.outAnnotationTable, sep="\t", row.names=FALSE) + + # the used constructed DB (write to log): + cat("\nConstructed DB info===============:\n") + str(tempEnv[[testDB[1]]]$Info) + cat("\nConstructed DB table===============:\n") + if (length(args) == 8) + { + write.table(tempEnv[[testDB[1]]]$DB, args.outLogFile, append=TRUE, row.names=FALSE) + write.table(tempEnv[[testDB[1]]]$Reftable, args.outLogFile, sep="\t", append=TRUE, row.names=FALSE) + } + + message("\nGenerating report.........") + # report + dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE) + html <- "

<html><body><h1>Summary of annotation results:</h1>" + nrTotalFeatures <- nrow(LC$PeakTable) + nrAnnotatedFeatures <- nrow(LC$Annotation$annotation.table) + html <- paste(html,"<p>Total nr of features: ", nrTotalFeatures,"</p>", sep="") + html <- paste(html,"<p>Total nr of annotated features: ", nrAnnotatedFeatures,"</p>
", sep="") + + html <- paste(html,"") + message("finished generating report") + write(html,file=args.htmlReportFile) + # unlink(args.htmlReportFile) + cat("\nWarnings================:\n") + str( warnings() ) + }, + error=function(cond) { + sink(NULL, type="message") # default setting + sink(stderr(), type="output") + message("\nERROR: ===========\n") + print(cond) + } + ) diff -r 41f122255d14 -r 4393f982d18f metaMS/metaMS_cmd_pick_and_group.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS/metaMS_cmd_pick_and_group.r Thu Mar 19 12:22:23 2015 +0100 @@ -0,0 +1,92 @@ +## read args: +args <- commandArgs(TRUE) +## data files, e.g. "E:/Rworkspace/metaMS/data/data.zip" (with e.g. .CDF files) and unzip output dir, e.g. "E:/" +args.dataZip <- args[1] +args.zipExtrDir <- sub("\\.","_",paste(args[1],"dir", sep="")) +dir.create(file.path(args.zipExtrDir), showWarnings = FALSE, recursive = TRUE) +## settings file, e.g. "E:/Rworkspace/metaMS/data/settings.r", should contain assignment to an object named "customMetaMSsettings" +args.settings <- args[2] + +## output file names, e.g. "E:/Rworkspace/metaMS/data/out.txt" +args.outPeakTable <- args[3] +args.xsetOut <- args[4] + +# polarity as explicit parameter: +args.runLC_polarity <- args[5] + +## report files +args.htmlReportFile <- args[6] +args.htmlReportFile.files_path <- args[7] + + +if (length(args) == 8) +{ + args.outLogFile <- args[8] + # suppress messages: + # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888 + msg <- file(args.outLogFile, open="wt") + sink(msg, type="message") + sink(msg, type="output") +} + +cat("\nSettings used===============:\n") +cat(readChar(args.settings, 1e5)) + + +tryCatch( + { + library(metaMS) + + ## load the data files from a zip file + files <- unzip(args.dataZip, exdir=args.zipExtrDir) + + ## load settings "script" into "customMetaMSsettings" + tempEnv <- new.env() + source(args.settings, local=tempEnv) + message(paste(" loaded : ", args.settings)) + allSettings <- tempEnv[["customMetaMSsettings"]] + + # trigger runLC: + LC <- runLC(files, settings = allSettings, polarity=args.runLC_polarity, nSlaves=20, returnXset = TRUE) + + # write out runLC annotation results: + write.table(LC$PeakTable, args.outPeakTable, sep="\t", row.names=FALSE) + + # save xset as rdata: + xsAnnotatePreparedData <- LC$xset + saveRDS(xsAnnotatePreparedData, file=args.xsetOut) + + message("\nGenerating report.........") + # report + dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE) + html <- "

<html><body><h1>Info on alignment quality</h1>
" + # TODO add (nr and mass error) and group size + + message("\nPlotting figures... ") + figureName <- paste(args.htmlReportFile.files_path, "/figure_retcor.png", sep="") + html <- paste(html,"
", sep="") + png( figureName, type="cairo", width=1100,height=600 ) + retcor(LC$xset@xcmsSet, method="peakgroups", plottype = "mdevden") + html <- paste(html,"*NB: retention time correction plot based on 'peakgroups' option with default settings. This is not the plot matching the exact settings used in the run, + but just intended to give a rough estimate of the retention time shifts present in the data. A more accurate plot will be available once + this option is added in metaMS API.
", sep="") + devname = dev.off() + + + gt <- groups(LC$xset@xcmsSet) + groupidx1 <- which(gt[,"rtmed"] > 0 & gt[,"rtmed"] < 3000 & gt[,"npeaks"] > 3) + + html <- paste(html,"") + message("finished generating report") + write(html,file=args.htmlReportFile) + # unlink(args.htmlReportFile) + cat("\nWarnings================:\n") + str( warnings() ) + }, + error=function(cond) { + sink(NULL, type="message") # default setting + sink(stderr(), type="output") + message("\nERROR: ===========\n") + print(cond) + } + ) diff -r 41f122255d14 -r 4393f982d18f metaMS/metams_lcms_annotate.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS/metams_lcms_annotate.xml Thu Mar 19 12:22:23 2015 +0100 @@ -0,0 +1,121 @@ + + Runs metaMS process for LC/MS feature annotation + + R_bioc_metams + + + metaMS_cmd_annotate.r + $constructed_db + $xsetData + $customMetaMSsettings + $outputFile + #if $mzTol.mzTolType == "fixed" + 0 + #else + "$mzTol.mass_error_function" + #end if + $htmlReportFile + $htmlReportFile.files_path + $outputLog + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +## start comment + ## metaMS process settings + customMetaMSsettings <- metaMSsettings(protocolName = "${protocolName}", + chrom = "LC") +metaSetting(customMetaMSsettings, "match2DB") <- list( + rtdiff = ${rtdiff}, + rtval = ${rtval}, + #if $mzTol.mzTolType == "fixed" + mzdiff = ${mzTol.mzdiff}, + #else + ppm = ${mzTol.ppm}, + #end if + minfeat = ${minfeat}) + + + + + + + + + + + + + +.. class:: infomark + +Runs metaMS process for LC/MS feature annotation based on matching to an existing 'standards' DB. +The figure below shows the main parts of this metaMS process. + +.. image:: $PATH_TO_IMAGES/metaMS_annotate.png + + +.. class:: infomark + +The implemented annotation strategy can be broken down in the following steps: + +1. *Feature wise Annotation:* Each feature detected by runLC is matched against the database. If +the mass error function is provided, the appropriate m/z tolerance is calculated, otherwise a fixed +tolerance is used (mzdiff). The retention time tolerance is fixed and should be selected on the +bases of the characteristics of each chromatographic method (rtdiff). Multiple annotations - i.e. +features which are associated to more than one compound - are possible. This outcome does not +indicate a problem per se, but is an inherent drawback of co-elution. + +2. *Annotation Validation:* The annotated features are organized in 'pseudospectra' collecting all +the experimental features which are assigned to a specific compound. A specific annotation is +confirmed only if more than minfeat features which differ in retention time less than rtval are +present in a pseudospectrum. As a general rule rtval should be narrower than rtdiff. The +latter, indeed, accounts for shifts in retention time between the injection of the standards and the +metabolomics experiment under investigation. This time can be rather long, considering that the +standards are not commonly re-analyzed each time. On the other hand, rtval represents the shift +between the ions of the same compound within the same batch of injections and therefore it has +only to account for the smaller shifts occurring during peak picking and alignment. 
+ + + + + 10.1016/j.jchromb.2014.02.051 + + \ No newline at end of file diff -r 41f122255d14 -r 4393f982d18f metaMS/metams_lcms_pick_and_group.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS/metams_lcms_pick_and_group.xml Thu Mar 19 12:22:23 2015 +0100 @@ -0,0 +1,301 @@ + + Runs metaMS process for LC/MS feature picking, aligning and grouping + + R_bioc_metams + + + metaMS_cmd_pick_and_group.r + $data_files + $customMetaMSsettings + $outputFile + $xsetOut + $polarity + $htmlReportFile + $htmlReportFile.files_path + $outputLog + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +## ==================================== + ## metaMS process settings + customMetaMSsettings <- metaMSsettings(protocolName = "${protocolName}", + chrom = "LC", + PeakPicking = list( + method = "${peakPicking.method}", + #if $peakPicking.method == "matchedFilter" + fwhm = ${peakPicking.fwhm}, + sigma = ${peakPicking.fwhm}/${peakPicking.sigma_denom}, + max = ${peakPicking.max}, + snthresh = ${peakPicking.snthresh}, + step = ${peakPicking.step}, + steps = ${peakPicking.steps}, + mzdiff = ${peakPicking.mzdiff}), + #else + ppm = ${peakPicking.ppm}, + peakwidth = c(${peakPicking.peakwidth}), + snthresh = ${peakPicking.snthresh}, + prefilter = c(${peakPicking.prefilter}), + mzCenterFun = "${peakPicking.mzCenterFun}", + integrate = ${peakPicking.integrate}, + mzdiff = ${peakPicking.mzdiff}, + fitgauss = ${peakPicking.fitgauss}, + noise = ${peakPicking.noise}), + #end if + Alignment = list( + min.class.fraction = ${min_class_fraction}, + min.class.size = ${min_class_size}, + mzwid = ${mzwid}, + bws = c(${bws}), + retcormethod = "${retcor.retcormethod}", + #if $retcor.retcormethod == "peakgroups" + smooth = "${retcor.smooth}", + missingratio = ${retcor.missingratio}, + extraratio = ${retcor.extraratio}, + retcorfamily = "${retcor.retcorfamily}", + #else + ##repeating the method as workaround/ backwards compatibility (can remove this one after fix from metaMS): + method = "${retcor.retcormethod}", + profStep = ${retcor.profStep}, + #end if + fillPeaks = ${fillPeaks}), + CAMERA = list( + perfwhm = ${perfwhm}, + cor_eic_th = ${cor_eic_th}, + ppm= ${ppm}, + graphMethod= "${groupCorr_graphMethod}", + pval= ${groupCorr_pval}, + calcCiS= ${groupCorr_calcCiS}, + calcIso= ${groupCorr_calcIso}, + calcCaS= ${groupCorr_calcCaS}, + maxcharge= ${findIsotopes_maxcharge}, + maxiso= ${findIsotopes_maxiso}, + minfrac= ${findIsotopes_minfrac}, + multiplier= ${findAdducts_multiplier} + )) + + + + + + + + + + + + +.. class:: infomark + +Runs metaMS process for LC/MS feature feature picking, aligning and grouping. +This part of the metaMS process makes use of the XCMS and CAMERA tools and algorithms. +CAMERA is used for automatic deconvolution/annotation of LC/ESI-MS data. +The figure below shows the main parts of the metaMS process wrapped by this tool. + +.. image:: $PATH_TO_IMAGES/metaMS_pick_align_camera.png + + +From CAMERA documentation: + +.. image:: $PATH_TO_IMAGES/CAMERA_results.png + +**References** + +If you use this Galaxy tool in work leading to a scientific publication please +cite the following papers: + +Wehrens, R.; Weingart, G.; Mattivi, F. (2014). +metaMS: an open-source pipeline for GC-MS-based untargeted metabolomics. +Journal of chromatography B: biomedical sciences and applications, 996 (1): 109-116. 
+doi: 10.1016/j.jchromb.2014.02.051 +handle: http://hdl.handle.net/10449/24012 + +Wrapper by Pieter Lukasse. + + + + + 10.1016/j.jchromb.2014.02.051 + + \ No newline at end of file diff -r 41f122255d14 -r 4393f982d18f metaMS/next.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS/next.r Thu Mar 19 12:22:23 2015 +0100 @@ -0,0 +1,9 @@ +# next script, useful for plotting the CAMERA groups pseudo-spectra: +plotPsSpectrum( LC$xset,320,maxlabel=10) +320 is group id +plotEICs( LC$xset,35,maxlabel=5) + + +could use this for the items identified by "annotate" step. +could also integrate this to getEIC tool (add option to give group list instead of m/z list)...but then rt and other options + are also not so relevant...maybe a separate tool is more clear \ No newline at end of file diff -r 41f122255d14 -r 4393f982d18f metaMS/tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS/tool_dependencies.xml Thu Mar 19 12:22:23 2015 +0100 @@ -0,0 +1,13 @@ + + + + + + + + This dependency: + Ensures R 3.1.1 installation is triggered (via dependency). + Ensures Bioconductor 3.0 and package metaMS, multtest and snow are installed. + + \ No newline at end of file diff -r 41f122255d14 -r 4393f982d18f metaMS/xcms_differential_analysis.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS/xcms_differential_analysis.r Thu Mar 19 12:22:23 2015 +0100 @@ -0,0 +1,85 @@ +## read args: +args <- commandArgs(TRUE) +#cat("args <- \"\"\n") +## a xcms xset saved as .RData +args.xsetData <- args[1] +#cat(paste("args.xsetData <- \"", args[1], "\"\n", sep="")) + +args.class1 <- args[2] +args.class2 <- args[3] +#cat(paste("args.class1 <- \"", args[2], "\"\n", sep="")) +#cat(paste("args.class2 <- \"", args[3], "\"\n", sep="")) + +args.topcount <- strtoi(args[4]) +#cat(paste("args.topcount <- ", args[4], "\n", sep="")) + +args.outTable <- args[5] + +## report files +args.htmlReportFile <- args[6] +args.htmlReportFile.files_path <- args[7] +#cat(paste("args.htmlReportFile <- \"", args[6], "\"\n", sep="")) +#cat(paste("args.htmlReportFile.files_path <- \"", args[7], "\"\n", sep="")) + + +if (length(args) == 8) +{ + args.outLogFile <- args[8] + # suppress messages: + # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888 + msg <- file(args.outLogFile, open="wt") + sink(msg, type="message") + sink(msg, type="output") +} + +tryCatch( + { + library(metaMS) + library(xcms) + #library("R2HTML") + + # load the xset data : + xsetData <- readRDS(args.xsetData) + # if here to support both scenarios: + if ("xcmsSet" %in% slotNames(xsetData) ) + { + xsetData <- xsetData@xcmsSet + } + + + # info: levels(xcmsSet@phenoData$class) also gives access to the class names + dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE) + # set cairo as default for png plots: + png = function (...) grDevices::png(...,type='cairo') + # run diffreport + reporttab <- diffreport(xsetData, args.class1, args.class2, paste(args.htmlReportFile.files_path,"/fig", sep=""), args.topcount, metlin = 0.15, h=480, w=640) + + # write out tsv table: + write.table(reporttab, args.outTable, sep="\t", row.names=FALSE) + + message("\nGenerating report.........") + + cat("

<html><body><h1>Differential analysis report</h1>
", file= args.htmlReportFile) + #HTML(reporttab[1:args.topcount,], file= args.htmlReportFile) + figuresPath <- paste(args.htmlReportFile.files_path, "/fig_eic", sep="") + message(figuresPath) + listOfFiles <- list.files(path = figuresPath) + for (i in 1:length(listOfFiles)) + { + figureName <- listOfFiles[i] + # maybe we still need to copy the figures to the args.htmlReportFile.files_path + cat(paste("", sep=""), file= args.htmlReportFile, append=TRUE) + cat(paste("", sep=""), file= args.htmlReportFile, append=TRUE) + } + + message("finished generating report") + cat("\nWarnings================:\n") + str( warnings() ) + }, + error=function(cond) { + sink(NULL, type="message") # default setting + sink(stderr(), type="output") + message("\nERROR: ===========\n") + print(cond) + } + ) \ No newline at end of file diff -r 41f122255d14 -r 4393f982d18f metaMS/xcms_differential_analysis.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS/xcms_differential_analysis.xml Thu Mar 19 12:22:23 2015 +0100 @@ -0,0 +1,55 @@ + + Runs xcms diffreport function for differential Analsysis + + R_bioc_metams + + + xcms_differential_analysis.r + $xsetData + "$class1" + "$class2" + $topcount + $outTable + $htmlReportFile + $htmlReportFile.files_path + $outLogFile + + + + + + + + + + + + + + + + + + + + + +.. class:: infomark + +Runs xcms diffreport for showing the most significant differences between two sets/classes of samples. This tool also creates extracted ion chromatograms (EICs) for +the most significant differences. The figure below shows an example of such an EIC. + +.. image:: $PATH_TO_IMAGES/diffreport.png + + + + + + + 10.1021/ac051437y + + \ No newline at end of file diff -r 41f122255d14 -r 4393f982d18f metaMS/xcms_get_alignment_eic.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS/xcms_get_alignment_eic.r Thu Mar 19 12:22:23 2015 +0100 @@ -0,0 +1,153 @@ +# shows all alignment results in a rt region +## read args: +args <- commandArgs(TRUE) +# xset data: +args.xsetData <- args[1] + +args.rtStart <- strtoi(args[2]) +args.rtEnd <- strtoi(args[3]) + +# limit max diff to 600 and minNrSamples to at least 2 : +if (args.rtEnd - args.rtStart > 600) + stop("The RT window should be <= 600") + +args.minNrSamples <- strtoi(args[4]) #min nr samples +if (args.minNrSamples < 2) + stop("Set 'Minimum number of samples' to 2 or higher") + + +args.sampleNames <- strsplit(args[5], ",")[[1]] +# trim leading and trailing spaces: +args.sampleNames <- gsub("^\\s+|\\s+$", "", args.sampleNames) + +## report files +args.htmlReportFile <- args[6] +args.htmlReportFile.files_path <- args[7] + + +if (length(args) == 8) +{ + args.outLogFile <- args[8] + # suppress messages: + # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888 + msg <- file(args.outLogFile, open="wt") + sink(msg, type="message") + sink(msg, type="output") +} + + + +tryCatch( + { + library(metaMS) + + # load the xset data : + xsetData <- readRDS(args.xsetData) + # if here to support both scenarios: + if ("xcmsSet" %in% slotNames(xsetData) ) + { + xsetData <- xsetData@xcmsSet + } + + # report + dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE) + message(paste("\nGenerating report.........in ", args.htmlReportFile.files_path)) + + write(paste("

<html><body><h1>Extracted Ion Chromatograms (EIC) of alignments with peaks in ",args.minNrSamples, " or more samples</h1>
"), + file=args.htmlReportFile) + + gt <- groups(xsetData) + message("\nParsed groups... ") + groupidx1 <- which(gt[,"rtmed"] > args.rtStart & gt[,"rtmed"] < args.rtEnd & gt[,"npeaks"] >= args.minNrSamples) # this should be only on samples selected + + if (length(groupidx1) > 0) + { + message("\nGetting EIC... ") + eiccor <- getEIC(xsetData, groupidx = c(groupidx1), sampleidx=args.sampleNames) + #eicraw <- getEIC(xsetData, groupidx = c(groupidx1), sampleidx=args.sampleNames, rt = "raw") + + #sampleNamesIdx <- which(sampnames(LC$xset@xcmsSet) %in% args.sampleNames, arr.ind = TRUE) + #or (from bioconductor code for getEIC): NB: this is assuming sample indexes used in data are ordered after the order of sample names!! + sampNames <- sampnames(xsetData) + sampleNamesIdx <- match( args.sampleNames, sampNames) + message(paste("Samples: ", sampleNamesIdx)) + + #TODO html <- paste(html, "") + message(paste("\nPlotting figures... ")) + + #get the mz list (interestingly, this [,"mz"] is relatively slow): + peakMzList <- xsetData@peaks[,"mz"] + peakSampleList <- xsetData@peaks[,"sample"] + #signal to noise list: + peakSnList <- xsetData@peaks[,"sn"] + + message(paste("Total nr of peaks: ", length(peakMzList))) + + for (i in 1:length(groupidx1)) + { + groupMembers <- xsetData@groupidx[[groupidx1[i]]] + + groupMzList <- "" + groupSampleList <- "" + finalGroupSize <- 0 + + for (j in 1:length(groupMembers)) + { + #get only the peaks from the selected samples: + memberSample <- peakSampleList[groupMembers[j]] + memberSn <- peakSnList[groupMembers[j]] + if (!is.na(memberSn) && memberSample %in% sampleNamesIdx) + { + message(paste("Group: ", groupidx1[i], " / Member sample: ", memberSample)) + memberMz <- peakMzList[groupMembers[j]] + + + if (finalGroupSize == 0) + { + groupMzList <- memberMz + groupSampleList <- sampNames[memberSample] + } else { + groupMzList <- paste(groupMzList,",",memberMz, sep="") + groupSampleList <- paste(groupSampleList,",",sampNames[memberSample], sep="") + } + + finalGroupSize <- finalGroupSize +1 + } + } + # here we do the real check on group size and only the groups that have enough + # members with signal to noise > 0 will be plotted here: + if (finalGroupSize >= args.minNrSamples) + { + message(paste("Plotting figure ",i, " of max ", length(groupidx1)," figures... ")) + + figureName <- paste(args.htmlReportFile.files_path, "/figure", i,".png", sep="") + write(paste("", sep="") , + file=args.htmlReportFile, append=TRUE) + + png( figureName, type="cairo", width=800 ) + plot(eiccor, xsetData, groupidx = i) + devname = dev.off() + + write(paste("

Alignment id: [", groupidx1[i], "]. m/z list of peaks in alignment with signal/noise <> NA:
", groupMzList,"

", sep="") , + file=args.htmlReportFile, append=TRUE) + write(paste("

<p>m/z values found in the following samples respectively:<br/>", groupSampleList,"</p>
", sep="") , + file=args.htmlReportFile, append=TRUE) + } + } + + } + + write("", + file=args.htmlReportFile, append=TRUE) + message("finished generating report") + # unlink(args.htmlReportFile) + cat("\nWarnings================:\n") + str( warnings() ) + }, + error=function(cond) { + sink(NULL, type="message") # default setting + sink(stderr(), type="output") + message("\nERROR: ===========\n") + print(cond) + } + ) diff -r 41f122255d14 -r 4393f982d18f metaMS/xcms_get_alignment_eic.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS/xcms_get_alignment_eic.xml Thu Mar 19 12:22:23 2015 +0100 @@ -0,0 +1,73 @@ + + Extracts alignment EICs from feature alignment data + + R_bioc_metams + + + xcms_get_alignment_eic.r + $xsetData + $rtStart + $rtEnd + $minNrSamples + "$sampleNames" + $htmlReportFile + $htmlReportFile.files_path + $outLogFile + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.. class:: infomark + +This tool finds the alignments in the specified RT window and extracts alignment EICs from feature alignment data using XCMS's getEIC method. +It produces a HTML report showing the EIC plots and the mass list of each alignment. The figure below shows an example of such an EIC plot, showing also the difference between +two classes, with extra alignment information beneath it. + +.. image:: $PATH_TO_IMAGES/diffreport.png + +Alignment id: 1709. m/z list of peaks in alignment: +614.002922098482,613.998019830021,614.000382307519,613.998229980469,613.998229980469 + + + + + 10.1021/ac051437y + + \ No newline at end of file diff -r 41f122255d14 -r 4393f982d18f metaMS/xcms_get_mass_eic.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS/xcms_get_mass_eic.r Thu Mar 19 12:22:23 2015 +0100 @@ -0,0 +1,162 @@ +## read args: +args <- commandArgs(TRUE) +# xset data: +args.xsetData <- args[1] + +args.rtStart <- strtoi(args[2]) +args.rtEnd <- strtoi(args[3]) + +args.mzStart <- as.double(args[4]) +args.mzEnd <- as.double(args[5]) +# there are 2 options: specify a mz range or a mz list: +if (args.mzStart < 0) +{ + args.mzList <- as.double(strsplit(args[6], ",")[[1]]) + cat(typeof(as.double(strsplit(args[6], ",")[[1]]))) + args.mzTolPpm <- as.double(args[7]) + # calculate mzends based on ppm tol: + mzListEnd <- c() + mzListStart <- c() + for (i in 1:length(args.mzList)) + { + mzEnd <- args.mzList[i] + args.mzList[i]*args.mzTolPpm/1000000.0 + mzStart <- args.mzList[i] - args.mzList[i]*args.mzTolPpm/1000000.0 + mzListEnd <- c(mzListEnd, mzEnd) + mzListStart <- c(mzListStart, mzStart) + } + str(mzListStart) + str(mzListEnd) +} else { + mzListEnd <- c(args.mzEnd) + mzListStart <- c(args.mzStart) +} + +args.sampleNames <- strsplit(args[8], ",")[[1]] +# trim leading and trailing spaces: +args.sampleNames <- gsub("^\\s+|\\s+$", "", args.sampleNames) + +args.combineSamples <- args[9] +args.rtPlotMode <- args[10] + +## report files +args.htmlReportFile <- args[11] +args.htmlReportFile.files_path <- args[12] + + +if (length(args) == 13) +{ + args.outLogFile <- args[13] + # suppress messages: + # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888 + msg <- file(args.outLogFile, open="wt") + sink(msg, type="message") + sink(msg, type="output") +} + +# TODO - add option to do masses in same plot (if given in same line oid) or in separate plots +# TODO2 - let it run in parallel + +tryCatch( + { + library(metaMS) + + # load the xset data : + xsetData <- readRDS(args.xsetData) + # if here to support both scenarios: + if ("xcmsSet" %in% 
slotNames(xsetData) ) + { + xsetData <- xsetData@xcmsSet + } + + # report + dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE) + message(paste("\nGenerating report.........in ", args.htmlReportFile.files_path)) + + html <- "

<html><body><h1>Extracted Ion Chromatograms (EIC) matching criteria</h1>
" + + if (args.combineSamples == "No") + { + if (length(args.sampleNames) > 1 && length(mzListStart) > 1 && length(args.sampleNames) != length(mzListStart)) + stop(paste("The number of sample names should match the number of m/z values in the list. Found ", length(mzListStart), + " masses while ", length(args.sampleNames), " sample names were given.")) + + iterSize <- length(args.sampleNames) + # these can be set to 1 or 0 just as a trick to iterate OR not over the items. If the respective list is of length 1, only the first item should be used + fixSampleIdx <- 1 + fixMzListIdx <- 1 + if (length(args.sampleNames) == 1) + { + fixSampleIdx <- 0 + iterSize <- length(mzListStart) + } + if (length(mzListStart) == 1) + { + fixMzListIdx <- 0 + } + lineColors <- rainbow(iterSize) + for (i in 0:(iterSize-1)) + { + message("\nGetting EIC... ") + eiccor <- getEIC(xsetData, + mzrange=matrix(c(mzListStart[i*fixMzListIdx+1],mzListEnd[i*fixMzListIdx+1]),nrow=1,ncol=2,byrow=TRUE), + rtrange=matrix(c(args.rtStart,args.rtEnd),nrow=1,ncol=2,byrow=TRUE), + sampleidx=c(args.sampleNames[i*fixSampleIdx+1]), rt=args.rtPlotMode) + + message("\nPlotting figures... ") + figureName <- paste(args.htmlReportFile.files_path, "/figure", i,".png", sep="") + html <- paste(html,"
", sep="") + png( figureName, type="cairo", width=1100,height=250 ) + #plot(eiccor, col=lineColors[i+1]) + # black is better in this case: + plot(eiccor) + legend('topright', # places a legend at the appropriate place + legend=c(args.sampleNames[i*fixSampleIdx+1]), # puts text in the legend + lty=c(1,1), # gives the legend appropriate symbols (lines) + lwd=c(2.5,2.5)) + + devname = dev.off() + } + + } else { + for (i in 1:length(mzListStart)) + { + message("\nGetting EIC... ") + eiccor <- getEIC(xsetData, + mzrange=matrix(c(mzListStart[i],mzListEnd[i]),nrow=1,ncol=2,byrow=TRUE), + rtrange=matrix(c(args.rtStart,args.rtEnd),nrow=1,ncol=2,byrow=TRUE), + sampleidx=args.sampleNames, rt = args.rtPlotMode) + + #set size, set option (plot per sample, plot per mass) + + message("\nPlotting figures... ") + figureName <- paste(args.htmlReportFile.files_path, "/figure", i,".png", sep="") + html <- paste(html,"", sep="") + png( figureName, type="cairo", width=1100,height=450 ) + lineColors <- rainbow(length(args.sampleNames)) + plot(eiccor, col=lineColors) + legend('topright', # places a legend at the appropriate place + legend=args.sampleNames, # puts text in the legend + lty=c(1,1), # gives the legend appropriate symbols (lines) + lwd=c(2.5,2.5), + col=lineColors) + devname = dev.off() + } + } + if (args.rtPlotMode == "corrected") + { + html <- paste(html,"

<p>*rt values are corrected ones</p>
") + } + html <- paste(html,"") + message("finished generating report") + write(html,file=args.htmlReportFile) + # unlink(args.htmlReportFile) + cat("\nWarnings================:\n") + str( warnings() ) + }, + error=function(cond) { + sink(NULL, type="message") # default setting + sink(stderr(), type="output") + message("\nERROR: ===========\n") + print(cond) + } + ) diff -r 41f122255d14 -r 4393f982d18f metaMS/xcms_get_mass_eic.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS/xcms_get_mass_eic.xml Thu Mar 19 12:22:23 2015 +0100 @@ -0,0 +1,117 @@ + + Extracts EICs for a given list of masses + + R_bioc_metams + + + xcms_get_mass_eic.r + $xsetData + $rtStart + $rtEnd + #if $massParameters.massParametersType == "window" + $massParameters.mzStart + $massParameters.mzEnd + -1 + "." + #else + -1 + -1 + "$massParameters.mzList" + $massParameters.mzTolPpm + #end if + "$sampleNames" + $combineSamples + $rtPlotMode + $htmlReportFile + $htmlReportFile.files_path + $outLogFile + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.. class:: infomark + +This tool will plot EICs for a given list of masses (or a mass window) using XCMS's getEIC method. +It produces a HTML report showing the EIC plots, one for each given mass. The figure below shows an example of such an EIC plot. + +.. image:: $PATH_TO_IMAGES/massEIC.png + + + + + 10.1021/ac051437y + + \ No newline at end of file diff -r 41f122255d14 -r 4393f982d18f metaMS_cmd_annotate.r --- a/metaMS_cmd_annotate.r Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,86 +0,0 @@ -## read args: -args <- commandArgs(TRUE) -## the constructed DB, e.g. "E:/Rworkspace/metaMS/data/LCDBtest.RData" -args.constructedDB <- args[1] -## data file in xset format: -args.xsetData <- args[2] -## settings file, e.g. "E:/Rworkspace/metaMS/data/settings.r", should contain assignment to an object named "customMetaMSsettings" -args.settings <- args[3] - -## output file names, e.g. 
"E:/Rworkspace/metaMS/data/out.txt" -args.outAnnotationTable <- args[4] - -args.mass_error_function <- args[5] -if (args.mass_error_function == "0") - args.mass_error_function <- NULL -## report files -args.htmlReportFile <- args[6] -args.htmlReportFile.files_path <- args[7] - -if (length(args) == 8) -{ - args.outLogFile <- args[8] - # suppress messages: - # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888 - msg <- file(args.outLogFile, open="wt") - sink(msg, type="message") - sink(msg, type="output") -} - -cat("\nSettings used===============:\n") -cat(readChar(args.settings, 1e5)) - - -tryCatch( - { - library(metaMS) - - ## load the constructed DB : - tempEnv <- new.env() - testDB <- load(args.constructedDB, envir=tempEnv) - xsetData <- readRDS(args.xsetData) - - ## load settings "script" into "customMetaMSsettings" - source(args.settings, local=tempEnv) - message(paste(" loaded : ", args.settings)) - - # Just to highlight: if you want to use more than one - # trigger runLC: - LC <- runLC(xset=xsetData, settings = tempEnv[["customMetaMSsettings"]], DB = tempEnv[[testDB[1]]]$DB, errf=args.mass_error_function, nSlaves=20, returnXset = TRUE) - - # write out runLC annotation results: - write.table(LC$PeakTable, args.outAnnotationTable, sep="\t", row.names=FALSE) - - # the used constructed DB (write to log): - cat("\nConstructed DB info===============:\n") - str(tempEnv[[testDB[1]]]$Info) - cat("\nConstructed DB table===============:\n") - if (length(args) == 8) - { - write.table(tempEnv[[testDB[1]]]$DB, args.outLogFile, append=TRUE, row.names=FALSE) - write.table(tempEnv[[testDB[1]]]$Reftable, args.outLogFile, sep="\t", append=TRUE, row.names=FALSE) - } - - message("\nGenerating report.........") - # report - dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE) - html <- "

<html><body><h1>Summary of annotation results:</h1>" - nrTotalFeatures <- nrow(LC$PeakTable) - nrAnnotatedFeatures <- nrow(LC$Annotation$annotation.table) - html <- paste(html,"<p>Total nr of features: ", nrTotalFeatures,"</p>", sep="") - html <- paste(html,"<p>Total nr of annotated features: ", nrAnnotatedFeatures,"</p>
", sep="") - - html <- paste(html,"") - message("finished generating report") - write(html,file=args.htmlReportFile) - # unlink(args.htmlReportFile) - cat("\nWarnings================:\n") - str( warnings() ) - }, - error=function(cond) { - sink(NULL, type="message") # default setting - sink(stderr(), type="output") - message("\nERROR: ===========\n") - print(cond) - } - ) diff -r 41f122255d14 -r 4393f982d18f metaMS_cmd_pick_and_group.r --- a/metaMS_cmd_pick_and_group.r Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,92 +0,0 @@ -## read args: -args <- commandArgs(TRUE) -## data files, e.g. "E:/Rworkspace/metaMS/data/data.zip" (with e.g. .CDF files) and unzip output dir, e.g. "E:/" -args.dataZip <- args[1] -args.zipExtrDir <- sub("\\.","_",paste(args[1],"dir", sep="")) -dir.create(file.path(args.zipExtrDir), showWarnings = FALSE, recursive = TRUE) -## settings file, e.g. "E:/Rworkspace/metaMS/data/settings.r", should contain assignment to an object named "customMetaMSsettings" -args.settings <- args[2] - -## output file names, e.g. "E:/Rworkspace/metaMS/data/out.txt" -args.outPeakTable <- args[3] -args.xsetOut <- args[4] - -# polarity as explicit parameter: -args.runLC_polarity <- args[5] - -## report files -args.htmlReportFile <- args[6] -args.htmlReportFile.files_path <- args[7] - - -if (length(args) == 8) -{ - args.outLogFile <- args[8] - # suppress messages: - # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888 - msg <- file(args.outLogFile, open="wt") - sink(msg, type="message") - sink(msg, type="output") -} - -cat("\nSettings used===============:\n") -cat(readChar(args.settings, 1e5)) - - -tryCatch( - { - library(metaMS) - - ## load the data files from a zip file - files <- unzip(args.dataZip, exdir=args.zipExtrDir) - - ## load settings "script" into "customMetaMSsettings" - tempEnv <- new.env() - source(args.settings, local=tempEnv) - message(paste(" loaded : ", args.settings)) - allSettings <- tempEnv[["customMetaMSsettings"]] - - # trigger runLC: - LC <- runLC(files, settings = allSettings, polarity=args.runLC_polarity, nSlaves=20, returnXset = TRUE) - - # write out runLC annotation results: - write.table(LC$PeakTable, args.outPeakTable, sep="\t", row.names=FALSE) - - # save xset as rdata: - xsAnnotatePreparedData <- LC$xset - saveRDS(xsAnnotatePreparedData, file=args.xsetOut) - - message("\nGenerating report.........") - # report - dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE) - html <- "

<html><body><h1>Info on alignment quality</h1>
" - # TODO add (nr and mass error) and group size - - message("\nPlotting figures... ") - figureName <- paste(args.htmlReportFile.files_path, "/figure_retcor.png", sep="") - html <- paste(html,"
", sep="") - png( figureName, type="cairo", width=1100,height=600 ) - retcor(LC$xset@xcmsSet, method="peakgroups", plottype = "mdevden") - html <- paste(html,"*NB: retention time correction plot based on 'peakgroups' option with default settings. This is not the plot matching the exact settings used in the run, - but just intended to give a rough estimate of the retention time shifts present in the data. A more accurate plot will be available once - this option is added in metaMS API.
", sep="") - devname = dev.off() - - - gt <- groups(LC$xset@xcmsSet) - groupidx1 <- which(gt[,"rtmed"] > 0 & gt[,"rtmed"] < 3000 & gt[,"npeaks"] > 3) - - html <- paste(html,"") - message("finished generating report") - write(html,file=args.htmlReportFile) - # unlink(args.htmlReportFile) - cat("\nWarnings================:\n") - str( warnings() ) - }, - error=function(cond) { - sink(NULL, type="message") # default setting - sink(stderr(), type="output") - message("\nERROR: ===========\n") - print(cond) - } - ) diff -r 41f122255d14 -r 4393f982d18f metams_lcms_annotate.xml --- a/metams_lcms_annotate.xml Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,121 +0,0 @@ - - Runs metaMS process for LC/MS feature annotation - - R_bioc_metams - - - metaMS_cmd_annotate.r - $constructed_db - $xsetData - $customMetaMSsettings - $outputFile - #if $mzTol.mzTolType == "fixed" - 0 - #else - "$mzTol.mass_error_function" - #end if - $htmlReportFile - $htmlReportFile.files_path - $outputLog - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -## start comment - ## metaMS process settings - customMetaMSsettings <- metaMSsettings(protocolName = "${protocolName}", - chrom = "LC") -metaSetting(customMetaMSsettings, "match2DB") <- list( - rtdiff = ${rtdiff}, - rtval = ${rtval}, - #if $mzTol.mzTolType == "fixed" - mzdiff = ${mzTol.mzdiff}, - #else - ppm = ${mzTol.ppm}, - #end if - minfeat = ${minfeat}) - - - - - - - - - - - - - -.. class:: infomark - -Runs metaMS process for LC/MS feature annotation based on matching to an existing 'standards' DB. -The figure below shows the main parts of this metaMS process. - -.. image:: $PATH_TO_IMAGES/metaMS_annotate.png - - -.. class:: infomark - -The implemented annotation strategy can be broken down in the following steps: - -1. *Feature wise Annotation:* Each feature detected by runLC is matched against the database. If -the mass error function is provided, the appropriate m/z tolerance is calculated, otherwise a fixed -tolerance is used (mzdiff). The retention time tolerance is fixed and should be selected on the -bases of the characteristics of each chromatographic method (rtdiff). Multiple annotations - i.e. -features which are associated to more than one compound - are possible. This outcome does not -indicate a problem per se, but is an inherent drawback of co-elution. - -2. *Annotation Validation:* The annotated features are organized in 'pseudospectra' collecting all -the experimental features which are assigned to a specific compound. A specific annotation is -confirmed only if more than minfeat features which differ in retention time less than rtval are -present in a pseudospectrum. As a general rule rtval should be narrower than rtdiff. The -latter, indeed, accounts for shifts in retention time between the injection of the standards and the -metabolomics experiment under investigation. This time can be rather long, considering that the -standards are not commonly re-analyzed each time. On the other hand, rtval represents the shift -between the ions of the same compound within the same batch of injections and therefore it has -only to account for the smaller shifts occurring during peak picking and alignment. 
- - - - - 10.1016/j.jchromb.2014.02.051 - - \ No newline at end of file diff -r 41f122255d14 -r 4393f982d18f metams_lcms_pick_and_group.xml --- a/metams_lcms_pick_and_group.xml Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,301 +0,0 @@ - - Runs metaMS process for LC/MS feature picking, aligning and grouping - - R_bioc_metams - - - metaMS_cmd_pick_and_group.r - $data_files - $customMetaMSsettings - $outputFile - $xsetOut - $polarity - $htmlReportFile - $htmlReportFile.files_path - $outputLog - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -## ==================================== - ## metaMS process settings - customMetaMSsettings <- metaMSsettings(protocolName = "${protocolName}", - chrom = "LC", - PeakPicking = list( - method = "${peakPicking.method}", - #if $peakPicking.method == "matchedFilter" - fwhm = ${peakPicking.fwhm}, - sigma = ${peakPicking.fwhm}/${peakPicking.sigma_denom}, - max = ${peakPicking.max}, - snthresh = ${peakPicking.snthresh}, - step = ${peakPicking.step}, - steps = ${peakPicking.steps}, - mzdiff = ${peakPicking.mzdiff}), - #else - ppm = ${peakPicking.ppm}, - peakwidth = c(${peakPicking.peakwidth}), - snthresh = ${peakPicking.snthresh}, - prefilter = c(${peakPicking.prefilter}), - mzCenterFun = "${peakPicking.mzCenterFun}", - integrate = ${peakPicking.integrate}, - mzdiff = ${peakPicking.mzdiff}, - fitgauss = ${peakPicking.fitgauss}, - noise = ${peakPicking.noise}), - #end if - Alignment = list( - min.class.fraction = ${min_class_fraction}, - min.class.size = ${min_class_size}, - mzwid = ${mzwid}, - bws = c(${bws}), - retcormethod = "${retcor.retcormethod}", - #if $retcor.retcormethod == "peakgroups" - smooth = "${retcor.smooth}", - missingratio = ${retcor.missingratio}, - extraratio = ${retcor.extraratio}, - retcorfamily = "${retcor.retcorfamily}", - #else - ##repeating the method as workaround/ backwards compatibility (can remove this one after fix from metaMS): - method = "${retcor.retcormethod}", - profStep = ${retcor.profStep}, - #end if - fillPeaks = ${fillPeaks}), - CAMERA = list( - perfwhm = ${perfwhm}, - cor_eic_th = ${cor_eic_th}, - ppm= ${ppm}, - graphMethod= "${groupCorr_graphMethod}", - pval= ${groupCorr_pval}, - calcCiS= ${groupCorr_calcCiS}, - calcIso= ${groupCorr_calcIso}, - calcCaS= ${groupCorr_calcCaS}, - maxcharge= ${findIsotopes_maxcharge}, - maxiso= ${findIsotopes_maxiso}, - minfrac= ${findIsotopes_minfrac}, - multiplier= ${findAdducts_multiplier} - )) - - - - - - - - - - - - -.. class:: infomark - -Runs metaMS process for LC/MS feature feature picking, aligning and grouping. -This part of the metaMS process makes use of the XCMS and CAMERA tools and algorithms. -CAMERA is used for automatic deconvolution/annotation of LC/ESI-MS data. -The figure below shows the main parts of the metaMS process wrapped by this tool. - -.. image:: $PATH_TO_IMAGES/metaMS_pick_align_camera.png - - -From CAMERA documentation: - -.. image:: $PATH_TO_IMAGES/CAMERA_results.png - -**References** - -If you use this Galaxy tool in work leading to a scientific publication please -cite the following papers: - -Wehrens, R.; Weingart, G.; Mattivi, F. (2014). -metaMS: an open-source pipeline for GC-MS-based untargeted metabolomics. -Journal of chromatography B: biomedical sciences and applications, 996 (1): 109-116. 
-doi: 10.1016/j.jchromb.2014.02.051 -handle: http://hdl.handle.net/10449/24012 - -Wrapper by Pieter Lukasse. - - - - - 10.1016/j.jchromb.2014.02.051 - - \ No newline at end of file diff -r 41f122255d14 -r 4393f982d18f next.r --- a/next.r Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,9 +0,0 @@ -# next script, useful for plotting the CAMERA groups pseudo-spectra: -plotPsSpectrum( LC$xset,320,maxlabel=10) -320 is group id -plotEICs( LC$xset,35,maxlabel=5) - - -could use this for the items identified by "annotate" step. -could also integrate this to getEIC tool (add option to give group list instead of m/z list)...but then rt and other options - are also not so relevant...maybe a separate tool is more clear \ No newline at end of file diff -r 41f122255d14 -r 4393f982d18f query_mass_repos.py --- a/query_mass_repos.py Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,289 +0,0 @@ -#!/usr/bin/env python -# encoding: utf-8 -''' -Module to query a set of accurate mass values detected by high-resolution mass spectrometers -against various repositories/services such as METabolomics EXPlorer database or the -MFSearcher service (http://webs2.kazusa.or.jp/mfsearcher/). - -It will take the input file and for each record it will query the -molecular mass in the selected repository/service. If one or more compounds are found -then extra information regarding these compounds is added to the output file. - -The output file is thus the input file enriched with information about -related items found in the selected repository/service. - -The service should implement the following interface: - -http://service_url/mass?targetMs=500&margin=1&marginUnit=ppm&output=txth (txth means there is guaranteed to be a header line before the data) - -The output should be tab separated and should contain the following columns (in this order) -db-name molecular-formula dbe formula-weight id description - - -''' -import csv -import sys -import fileinput -import urllib2 -import time -from collections import OrderedDict - -__author__ = "Pieter Lukasse" -__contact__ = "pieter.lukasse@wur.nl" -__copyright__ = "Copyright, 2014, Plant Research International, WUR" -__license__ = "Apache v2" - -def _process_file(in_xsv, delim='\t'): - ''' - Generic method to parse a tab-separated file returning a dictionary with named columns - @param in_csv: input filename to be parsed - ''' - data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim)) - return _process_data(data) - -def _process_data(data): - - header = data.pop(0) - # Create dictionary with column name as key - output = OrderedDict() - for index in xrange(len(header)): - output[header[index]] = [row[index] for row in data] - return output - - -def _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit): - - ''' - This method will iterate over the record in the input_data and - will enrich them with the related information found (if any) in the - chosen repository/service - - # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies - ''' - merged = [] - - for i in xrange(len(input_data[input_data.keys()[0]])): - # Get the record in same dictionary format as input_data, but containing - # a value at each column instead of a list of all values of all records: - input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()])) - - # read the 
molecular mass : - molecular_mass = input_data_record[molecular_mass_col] - - - # search for related records in repository/service: - data_found = None - if molecular_mass != "": - molecular_mass = float(molecular_mass) - - # 1- search for data around this MM: - query_link = repository_dblink + "/mass?targetMs=" + str(molecular_mass) + "&margin=" + str(error_margin) + "&marginUnit=" + margin_unit + "&output=txth" - - data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") - data_type_found = "MM" - - - if data_found == None: - # If still nothing found, just add empty columns - extra_cols = ['', '','','','',''] - else: - # Add info found: - extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link) - - # Take all data and merge it into a "flat"/simple array of values: - field_values_list = _merge_data(input_data_record, extra_cols) - - merged.append(field_values_list) - - # return the merged/enriched records: - return merged - - -def _get_extra_info_and_link_cols(data_found, data_type_found, query_link): - ''' - This method will go over the data found and will return a - list with the following items: - - details of hits found : - db-name molecular-formula dbe formula-weight id description - - Link that executes same query - - ''' - - # set() makes a unique list: - db_name_set = [] - molecular_formula_set = [] - id_set = [] - description_set = [] - - - if 'db-name' in data_found: - db_name_set = set(data_found['db-name']) - elif '# db-name' in data_found: - db_name_set = set(data_found['# db-name']) - if 'molecular-formula' in data_found: - molecular_formula_set = set(data_found['molecular-formula']) - if 'id' in data_found: - id_set = set(data_found['id']) - if 'description' in data_found: - description_set = set(data_found['description']) - - result = [data_type_found, - _to_xsv(db_name_set), - _to_xsv(molecular_formula_set), - _to_xsv(id_set), - _to_xsv(description_set), - #To let Excel interpret as link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"): - "=HYPERLINK(\""+ query_link + "\", \"Link to entries found in DB \")"] - return result - - -def _to_xsv(data_set): - result = "" - for item in data_set: - result = result + str(item) + "|" - return result - - -def _fire_query_and_return_dict(url): - ''' - This method will fire the query as a web-service call and - return the results as a list of dictionary objects - ''' - - try: - data = urllib2.urlopen(url).read() - - # transform to dictionary: - result = [] - data_rows = data.split("\n") - - # remove comment lines if any (only leave the one that has "molecular-formula" word in it...compatible with kazusa service): - data_rows_to_remove = [] - for data_row in data_rows: - if data_row == "" or (data_row[0] == '#' and "molecular-formula" not in data_row): - data_rows_to_remove.append(data_row) - - for data_row in data_rows_to_remove: - data_rows.remove(data_row) - - # check if there is any data in the response: - if len(data_rows) <= 1 or data_rows[1].strip() == '': - # means there is only the header row...so no hits: - return None - - for data_row in data_rows: - if not data_row.strip() == '': - row_as_list = _str_to_list(data_row, delimiter='\t') - result.append(row_as_list) - - # return result processed into a dict: - return _process_data(result) - - except urllib2.HTTPError, e: - raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason) - except urllib2.URLError, e: - raise Exception( "Network error: %s" % e.reason.args[1] + ". 
Administrator: please check if service [" + url + "] is accessible from your Galaxy server. ") - -def _str_to_list(data_row, delimiter='\t'): - result = [] - for column in data_row.split(delimiter): - result.append(column) - return result - - -# alternative: ? -# s = requests.Session() -# s.verify = False -# #s.auth = (token01, token02) -# resp = s.get(url, params={'name': 'anonymous'}, stream=True) -# content = resp.content -# # transform to dictionary: - - - - -def _merge_data(input_data_record, extra_cols): - ''' - Adds the extra information to the existing data record and returns - the combined new record. - ''' - record = [] - for column in input_data_record: - record.append(input_data_record[column]) - - - # add extra columns - for column in extra_cols: - record.append(column) - - return record - - -def _save_data(data_rows, headers, out_csv): - ''' - Writes tab-separated data to file - @param data_rows: dictionary containing merged/enriched dataset - @param out_csv: output csv file - ''' - - # Open output file for writing - outfile_single_handle = open(out_csv, 'wb') - output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") - - # Write headers - output_single_handle.writerow(headers) - - # Write one line for each row - for data_row in data_rows: - output_single_handle.writerow(data_row) - -def _get_repository_URL(repository_file): - ''' - Read out and return the URL stored in the given file. - ''' - file_input = fileinput.input(repository_file) - try: - for line in file_input: - if line[0] != '#': - # just return the first line that is not a comment line: - return line - finally: - file_input.close() - - -def main(): - ''' - Query main function - - The input file can be any tabular file, as long as it contains a column for the molecular mass. - This column is then used to query against the chosen repository/service Database. - ''' - seconds_start = int(round(time.time())) - - input_file = sys.argv[1] - molecular_mass_col = sys.argv[2] - repository_file = sys.argv[3] - error_margin = float(sys.argv[4]) - margin_unit = sys.argv[5] - output_result = sys.argv[6] - - # Parse repository_file to find the URL to the service: - repository_dblink = _get_repository_URL(repository_file) - - # Parse tabular input file into dictionary/array: - input_data = _process_file(input_file) - - # Query data against repository : - enriched_data = _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit) - headers = input_data.keys() + ['SEARCH hits for ','SEARCH hits: db-names', 'SEARCH hits: molecular-formulas ', - 'SEARCH hits: ids','SEARCH hits: descriptions', 'Link to SEARCH hits'] #TODO - add min and max formula weigth columns - - _save_data(enriched_data, headers, output_result) - - seconds_end = int(round(time.time())) - print "Took " + str(seconds_end - seconds_start) + " seconds" - - - -if __name__ == '__main__': - main() diff -r 41f122255d14 -r 4393f982d18f query_mass_repos.xml --- a/query_mass_repos.xml Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,106 +0,0 @@ - - Query multiple public repositories for elemental compositions from accurate mass values detected by high-resolution mass spectrometers - - query_mass_repos.py - $input_file - "$molecular_mass_col" - "$repository_file" - $error_margin - $margin_unit - $output_result - - - - - - - - - - - - - - - - - - - - - - - -.. 
class:: infomark - -This tool will query multiple public repositories such as PRI-MetExp or http://webs2.kazusa.or.jp/mfsearcher -for elemental compositions from accurate mass values detected by high-resolution mass spectrometers. - -It will take the input file and for each record it will query the -molecular mass in the selected repository. If one or more compounds are found in the -repository then extra information regarding (mass-based) matching elemental composition formulas is added to the output file. - -The output file is thus the input file enriched with information about -related items found in the selected repository. - -**Notes** - -The input file can be any tabular file, as long as it contains a column for the molecular mass. - -**Services that can be queried** - -================= ========================================================================= -Database Description ------------------ ------------------------------------------------------------------------- -PRI-MetExp LC-MS and GC-MS data from experiments from the metabolomics group at - Plant Research International. NB: restricted access to employees with - access key. -ExactMassDB A database of possible elemental compositions consisting of C: 100, - H: 200, O: 50, N: 10, P: 10, and S: 10, that satisfy the Senior and - the Lewis valence rules. - (via /mfsearcher/exmassdb/) -ExactMassDB-HR2 HR2, which is one of the fastest tools for calculation of elemental - compositions, filters some elemental compositions according to - the Seven Golden Rules (Kind and Fiehn, 2007). The ExactMassDB-HR2 - database returns the same result as does HR2 with the same atom kind - and number condition as that used in construction of the ExactMassDB. - (via /mfsearcher/exmassdb-hr2/) -Pep1000 A database of possible linear polypeptides that are - constructed with 20 kinds of amino acids and having molecular - weights smaller than 1000. - (via /mfsearcher/pep1000/) -KEGG Re-calculated compound data from KEGG. Weekly updated. - (via /mfsearcher/kegg/) -KNApSAcK Re-calculated compound data from KNApSAcK. - (via /mfsearcher/knapsack/) -Flavonoid Viewer Re-calculated compound data from Flavonoid Viewer. - (via /mfsearcher/flavonoidviewer/) -LipidMAPS Re-calculated compound data from LIPID MAPS. - (via /mfsearcher/lipidmaps/) -HMDB Re-calculated compound data from Human Metabolome Database (HMDB) - Version 3.5. - (via /mfsearcher/hmdb/) -PubChem Re-calculated compound data from PubChem. Monthly updated. - (via /mfsearcher/pubchem/) -================= ========================================================================= - -Sources for table above: PRI-MetExp and http://webs2.kazusa.or.jp/mfsearcher - - - diff -r 41f122255d14 -r 4393f982d18f query_metexp.py --- a/query_metexp.py Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,282 +0,0 @@ -#!/usr/bin/env python -# encoding: utf-8 -''' -Module to query a set of identifications against the METabolomics EXPlorer database. - -It will take the input file and for each record it will query the -molecular mass in the selected MetExp DB. If one or more compounds are found in the -MetExp DB then extra information regarding these compounds is added to the output file. - -The output file is thus the input file enriched with information about -related items found in the selected MetExp DB.
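As a concrete illustration of the query pattern these two modules share, here is a minimal sketch (not part of the changeset) that fires one mass query against the Kazusa mfsearcher service listed above and parses the tab-separated reply into per-column dictionaries. It assumes the `requests` library is available (the tools themselves use `urllib2`; a `requests` variant is only hinted at in a comment further below); the helper name, the `exmassdb` base URL default and the example call are illustrative::

    import requests  # assumption: requests is installed; the tools themselves use urllib2

    def query_mass_kazusa(target_ms, margin, margin_unit="ppm",
                          base_url="http://webs2.kazusa.or.jp/mfsearcher/exmassdb"):
        '''Sketch: query one molecular mass, return a list of {column: value} dicts, or None.'''
        params = {"targetMs": target_ms, "margin": margin, "marginUnit": margin_unit,
                  "output": "txth", "_format_result": "tsv"}
        response = requests.get(base_url + "/mass", params=params, timeout=60)
        response.raise_for_status()
        rows = [line.split("\t") for line in response.text.splitlines() if line.strip()]
        if len(rows) <= 1:
            return None  # only a header row came back, so no hits for this mass
        header = rows.pop(0)
        return [dict(zip(header, row)) for row in rows]

    # illustrative call: hits = query_mass_kazusa(614.002, 5)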
-''' -import csv -import sys -import fileinput -import urllib2 -import time -from collections import OrderedDict - -__author__ = "Pieter Lukasse" -__contact__ = "pieter.lukasse@wur.nl" -__copyright__ = "Copyright, 2014, Plant Research International, WUR" -__license__ = "Apache v2" - -def _process_file(in_xsv, delim='\t'): - ''' - Generic method to parse a tab-separated file returning a dictionary with named columns - @param in_csv: input filename to be parsed - ''' - data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim)) - return _process_data(data) - -def _process_data(data): - - header = data.pop(0) - # Create dictionary with column name as key - output = OrderedDict() - for index in xrange(len(header)): - output[header[index]] = [row[index] for row in data] - return output - - -def _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method): - ''' - This method will iterate over the record in the input_data and - will enrich them with the related information found (if any) in the - MetExp Database. - - # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies - ''' - merged = [] - - for i in xrange(len(input_data[input_data.keys()[0]])): - # Get the record in same dictionary format as input_data, but containing - # a value at each column instead of a list of all values of all records: - input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()])) - - # read the molecular mass and formula: - cas_id = input_data_record[casid_col] - formula = input_data_record[formula_col] - molecular_mass = input_data_record[molecular_mass_col] - - # search for related records in MetExp: - data_found = None - if cas_id != "undef": - # 1- search for other experiments where this CAS id has been found: - query_link = metexp_dblink + "/find_entries/query?cas_nr="+ cas_id + "&method=" + separation_method - data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") - data_type_found = "CAS" - if data_found == None: - # 2- search for other experiments where this FORMULA has been found: - query_link = metexp_dblink + "/find_entries/query?molecule_formula="+ formula + "&method=" + separation_method - data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") - data_type_found = "FORMULA" - if data_found == None: - # 3- search for other experiments where this MM has been found: - query_link = metexp_dblink + "/find_entries/query?molecule_mass="+ molecular_mass + "&method=" + separation_method - data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") - data_type_found = "MM" - - if data_found == None: - # If still nothing found, just add empty columns - extra_cols = ['', '','','','','','',''] - else: - # Add info found: - extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link) - - # Take all data and merge it into a "flat"/simple array of values: - field_values_list = _merge_data(input_data_record, extra_cols) - - merged.append(field_values_list) - - # return the merged/enriched records: - return merged - - -def _get_extra_info_and_link_cols(data_found, data_type_found, query_link): - ''' - This method will go over the data found and will return a - list with the following items: - - Experiment details where hits have been found : - 'organism', 'tissue','experiment_name','user_name','column_type' - - Link that executes same query - - ''' - 
# set() makes a unique list: - organism_set = [] - tissue_set = [] - experiment_name_set = [] - user_name_set = [] - column_type_set = [] - cas_nr_set = [] - - if 'organism' in data_found: - organism_set = set(data_found['organism']) - if 'tissue' in data_found: - tissue_set = set(data_found['tissue']) - if 'experiment_name' in data_found: - experiment_name_set = set(data_found['experiment_name']) - if 'user_name' in data_found: - user_name_set = set(data_found['user_name']) - if 'column_type' in data_found: - column_type_set = set(data_found['column_type']) - if 'CAS' in data_found: - cas_nr_set = set(data_found['CAS']) - - - result = [data_type_found, - _to_xsv(organism_set), - _to_xsv(tissue_set), - _to_xsv(experiment_name_set), - _to_xsv(user_name_set), - _to_xsv(column_type_set), - _to_xsv(cas_nr_set), - #To let Excel interpret as link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"): - "=HYPERLINK(\""+ query_link + "\", \"Link to entries found in DB \")"] - return result - - -def _to_xsv(data_set): - result = "" - for item in data_set: - result = result + str(item) + "|" - return result - - -def _fire_query_and_return_dict(url): - ''' - This method will fire the query as a web-service call and - return the results as a list of dictionary objects - ''' - - try: - data = urllib2.urlopen(url).read() - - # transform to dictionary: - result = [] - data_rows = data.split("\n") - - # check if there is any data in the response: - if len(data_rows) <= 1 or data_rows[1].strip() == '': - # means there is only the header row...so no hits: - return None - - for data_row in data_rows: - if not data_row.strip() == '': - row_as_list = _str_to_list(data_row, delimiter='\t') - result.append(row_as_list) - - # return result processed into a dict: - return _process_data(result) - - except urllib2.HTTPError, e: - raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason) - except urllib2.URLError, e: - raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if MetExp service [" + url + "] is accessible from your Galaxy server. ") - -def _str_to_list(data_row, delimiter='\t'): - result = [] - for column in data_row.split(delimiter): - result.append(column) - return result - - -# alternative: ? -# s = requests.Session() -# s.verify = False -# #s.auth = (token01, token02) -# resp = s.get(url, params={'name': 'anonymous'}, stream=True) -# content = resp.content -# # transform to dictionary: - - - - -def _merge_data(input_data_record, extra_cols): - ''' - Adds the extra information to the existing data record and returns - the combined new record. - ''' - record = [] - for column in input_data_record: - record.append(input_data_record[column]) - - - # add extra columns - for column in extra_cols: - record.append(column) - - return record - - -def _save_data(data_rows, headers, out_csv): - ''' - Writes tab-separated data to file - @param data_rows: dictionary containing merged/enriched dataset - @param out_csv: output csv file - ''' - - # Open output file for writing - outfile_single_handle = open(out_csv, 'wb') - output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") - - # Write headers - output_single_handle.writerow(headers) - - # Write one line for each row - for data_row in data_rows: - output_single_handle.writerow(data_row) - -def _get_metexp_URL(metexp_dblink_file): - ''' - Read out and return the URL stored in the given file. 
- ''' - file_input = fileinput.input(metexp_dblink_file) - try: - for line in file_input: - if line[0] != '#': - # just return the first line that is not a comment line: - return line - finally: - file_input.close() - - -def main(): - ''' - MetExp Query main function - - The input file can be any tabular file, as long as it contains a column for the molecular mass - and one for the formula of the respective identification. These two columns are then - used to query against MetExp Database. - ''' - seconds_start = int(round(time.time())) - - input_file = sys.argv[1] - casid_col = sys.argv[2] - formula_col = sys.argv[3] - molecular_mass_col = sys.argv[4] - metexp_dblink_file = sys.argv[5] - separation_method = sys.argv[6] - output_result = sys.argv[7] - - # Parse metexp_dblink_file to find the URL to the MetExp service: - metexp_dblink = _get_metexp_URL(metexp_dblink_file) - - # Parse tabular input file into dictionary/array: - input_data = _process_file(input_file) - - # Query data against MetExp DB : - enriched_data = _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method) - headers = input_data.keys() + ['METEXP hits for ','METEXP hits: organisms', 'METEXP hits: tissues', - 'METEXP hits: experiments','METEXP hits: user names','METEXP hits: column types', 'METEXP hits: CAS nrs', 'Link to METEXP hits'] - - _save_data(enriched_data, headers, output_result) - - seconds_end = int(round(time.time())) - print "Took " + str(seconds_end - seconds_start) + " seconds" - - - -if __name__ == '__main__': - main() diff -r 41f122255d14 -r 4393f982d18f query_metexp.xml --- a/query_metexp.xml Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,69 +0,0 @@ - - Query a set of identifications against the METabolomics EXPeriments database - - query_metexp.py - $input_file - "$casid_col" - "$formula_col" - "$molecular_mass_col" - "$metexp_dblink_file" - $separation_method - $output_result - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -This tool will Query a set of identifications against the METabolomics EXPlorer database. - -It will take the input file and for each record it will query the -molecular mass in the selected MetExp DB. If one or more compounds are found in the -MetExp DB then extra information regarding these compounds is added to the output file. - -The output file is thus the input file enriched with information about -related items found in the selected MetExp DB. - -**Notes** - -The input file can be any tabular file, as long as it contains a column for the molecular mass -and one for the formula of the respective identification. - - - - diff -r 41f122255d14 -r 4393f982d18f rankfilterGCMS_tabular.xml --- a/rankfilterGCMS_tabular.xml Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,80 +0,0 @@ - - Convert Retention Time to Retention Index - rankfilter_GCMS/rankfilter.py $input_file - - - - - - - - - - - - - - - - - - - - - - - - - - sample = ${sample} - calibration = ${calibration} - lib_data = ${lib_data} - window = ${window} - analysis_type = ${analysis_type} - tabular = True - onefile = ${onefile} - model = ${model} - - - -Basically estimates the experimental RI (RIexp) by building a RI(RT) function based on the -given calibration file. 
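The fitting itself happens in rankfilter.py, which is not part of this changeset. As a rough sketch of the idea, assuming the 3rd-degree polynomial model mentioned in the notes below and calibration points taken from the 'R.T.' and 'RI' columns of the calibration file, the RI(RT) function could be built like this (function names and data points are illustrative)::

    import numpy as np

    def build_ri_of_rt(calibration_rt, calibration_ri, degree=3):
        '''Sketch: fit RI as a polynomial function of RT from calibration points.'''
        coefficients = np.polyfit(calibration_rt, calibration_ri, degree)
        return np.poly1d(coefficients)

    # made-up calibration points (RT in minutes, RI in Kovats units):
    ri_of_rt = build_ri_of_rt([5.2, 7.8, 10.1, 13.4, 16.0],
                              [1000, 1200, 1400, 1600, 1800])
    riexp = ri_of_rt(9.0)  # estimated experimental RI for a peak eluting at RT 9.0
    # the %rel.err column defined below is then 100 * (RIsvr - RIexp) / RIexp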
- -It also determines the estimated RI (RIsvr) by looking up, for each entry of the given input file (Sample File), -based on its CAS number, its respective RIsvr value in the given global lookup library -(this step is also called the "RankFilter analysis" -see reference below; the Sample File may be either from NIST or AMDIS). -This generates a prediction of the RI for -a compound according to the "RankFilter procedure" (RIsvr). - -Output is a tab-separated file in which four columns are added: - - - **Rank** Calculated rank - - **RIexp** Experimental Retention Index (RI) - - **RIsvr** Calculated RI based on support vector regression (SVR) - - **%rel.err** Relative RI error (%rel.error = 100 * (RISVR − RIexp) / RIexp) - -.. class:: infomark - -**Notes** - - - The layout of the Calibration file should include the following columns: 'MW', 'R.T.' and 'RI'. - - Selecting 'Polynomial' in the model parameter will calculate a 3rd degree polynomial model that will - be used to convert from XXXX to YYYY. - ------ - -**References** - - - **RankFilter**: Mihaleva et al. (2009) *Automated procedure for candidate compound selection in GC-MS - metabolomics based on prediction of Kovats retention index*. Bioinformatics, 25 (2009), pp. 787–794 - - diff -r 41f122255d14 -r 4393f982d18f rankfilter_GCMS/nistpdf_to_tabular.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rankfilter_GCMS/nistpdf_to_tabular.py Thu Mar 19 12:22:23 2015 +0100 @@ -0,0 +1,45 @@ +""" +Copyright (C) 2013, Pieter Lukasse, Plant Research International, Wageningen + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this software except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +""" + +import sys +import pdfread +from subprocess import call + +# NB: THIS TOOL SHOULD BE DEPRECATED ONCE NIST-Wrapper tool is stable and released. NIST-Wrapper tool already produces the necessary tabular file. + +def convert_pdftotext(filename, output_file): + ''' + Converts PDF file to text + @param filename: PDF file to parse + @param output_file: output text file for the hits + ''' + + # "-layout" option in pdftotext call below: Maintain (as best as possible) the original physical layout of the text. The + # default is to 'undo' physical layout (columns, hyphenation, etc.) and output + # the text in reading order. + try: + call(["pdftotext", "-layout", filename, output_file]) + except: + raise Exception("Error while trying to convert PDF to text") + + + + +if __name__ == '__main__': + pdf_as_text = sys.argv[1]+".txt" + convert_pdftotext(sys.argv[1], pdf_as_text) + pdfread.convert_pdftotext2tabular(pdf_as_text, sys.argv[2], sys.argv[3], False) diff -r 41f122255d14 -r 4393f982d18f rankfilter_GCMS/nistpdf_to_tabular.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rankfilter_GCMS/nistpdf_to_tabular.xml Thu Mar 19 12:22:23 2015 +0100 @@ -0,0 +1,14 @@ + + Convert NIST text to tabular format + nistpdf_to_tabular.py $input $output $output_err + + + + + + + + + This tool converts NIST identification report output (PDF) to a tabular format needed for further use with the RIQC tools.
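One caveat in `convert_pdftotext` above: `subprocess.call` returns the exit status rather than raising on a non-zero exit, so in practice the bare `except` only catches a missing `pdftotext` binary, not a failed conversion. A sketch of the same call with explicit failure modes (illustrative, not part of the changeset)::

    import subprocess

    def convert_pdftotext_checked(filename, output_file):
        '''Sketch: same pdftotext -layout call, with explicit error reporting.'''
        try:
            # check_call raises CalledProcessError on a non-zero exit status
            subprocess.check_call(["pdftotext", "-layout", filename, output_file])
        except OSError:
            raise Exception("pdftotext does not seem to be installed or on the PATH")
        except subprocess.CalledProcessError as err:
            raise Exception("pdftotext failed with exit status %d" % err.returncode)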
+ + diff -r 41f122255d14 -r 4393f982d18f rankfilter_GCMS/pdftotabular.py --- a/rankfilter_GCMS/pdftotabular.py Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,44 +0,0 @@ -""" -Copyright (C) 2013, Pieter Lukasse, Plant Research International, Wageningen - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this software except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - -""" - -import sys -import pdfread -from subprocess import call - - -def convert_pdftotext(filename, output_file): - ''' - Converts PDF file to text - @param filename: PDF file to parse - @param output_file: output text file for the hits - ''' - - # "-layout" option in pdftotext call below: Maintain (as best as possible) the original physical layout of the text. The - # default is to 'undo' physical layout (columns, hyphenation, etc.) and output - # the text in reading order. - try: - call(["pdftotext", "-layout", filename, output_file]) - except: - raise Exception("Error while trying to convert PDF to text") - - - - -if __name__ == '__main__': - pdf_as_text = sys.argv[1]+".txt" - convert_pdftotext(sys.argv[1], pdf_as_text) - pdfread.convert_pdftotext2tabular(pdf_as_text, sys.argv[2], sys.argv[3], False) diff -r 41f122255d14 -r 4393f982d18f rankfilter_GCMS/rankfilterGCMS_tabular.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rankfilter_GCMS/rankfilterGCMS_tabular.xml Thu Mar 19 12:22:23 2015 +0100 @@ -0,0 +1,80 @@ + + Convert Retention Time to Retention Index + rankfilter.py $input_file + + + + + + + + + + + + + + + + + + + + + + + + + + sample = ${sample} + calibration = ${calibration} + lib_data = ${lib_data} + window = ${window} + analysis_type = ${analysis_type} + tabular = True + onefile = ${onefile} + model = ${model} + + + +Basically estimates the experimental RI (RIexp) by building a RI(RT) function based on the +given calibration file. + +It also determines the estimated RI (RIsvr) by looking up for each entry of the given input file (Sample File), +based on its CAS number, its respective RIsvr value in the given global lookup library +(this step is also called the "RankFilter analysis" -see reference below; Sample File may be either from NIST or AMDIS). +This generates an prediction of the RI for +a compound according to the "RankFilter procedure" (RIsvr). + +Output is a tab separated file in which four columns are added: + + - **Rank** Calculated rank + - **RIexp** Experimental Retention Index (RI) + - **RIsvr** Calculated RI based on support vector regression (SVR) + - **%rel.err** Relative RI error (%rel.error = 100 * (RISVR − RIexp) / RIexp) + +.. class:: infomark + +**Notes** + + - The layout of the Calibration file should include the following columns: 'MW', 'R.T.' and 'RI'. + - Selecting 'Polynomial' in the model parameter will calculate a 3rd degree polynomial model that will + be used to convert from XXXX to YYYY. + +----- + +**References** + + - **RankFilter**: Mihaleva et. al. (2009) *Automated procedure for candidate compound selection in GC-MS + metabolomics based on prediction of Kovats retention index*. Bioinformatics, 25 (2009), pp. 
787–794 + + diff -r 41f122255d14 -r 4393f982d18f rankfilter_GCMS/select_on_rank.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rankfilter_GCMS/select_on_rank.py Thu Mar 19 12:22:23 2015 +0100 @@ -0,0 +1,21 @@ +import csv +import sys + +__author__ = "Marcel Kempenaar" +__contact__ = "brs@nbic.nl" +__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre" +__license__ = "MIT" + +in_file = sys.argv[1] +out_file = sys.argv[2] +to_select_list = [str(select.strip()) for select in sys.argv[3].split(',') if (len(select) > 0)] + +data = list(csv.reader(open(in_file, 'rb'), delimiter='\t')) +header = data.pop(0) +header_clean = [i.lower().strip().replace(".", "").replace("%", "") for i in header] +rank = header_clean.index("rank") + +writer = csv.writer(open(out_file, 'wb'), delimiter='\t') +writer.writerow(header) +for select in to_select_list: + writer.writerows([i for i in data if i[rank] == select]) diff -r 41f122255d14 -r 4393f982d18f rankfilter_GCMS/select_on_rank.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rankfilter_GCMS/select_on_rank.xml Thu Mar 19 12:22:23 2015 +0100 @@ -0,0 +1,15 @@ + + Filter on the Rank field in the RankFilter output file + select_on_rank.py $input $output "$rank" + + + + + + + + +This tool removes all entries with non selected rank values from the input file (supported input file is a RankFilter output file). + + diff -r 41f122255d14 -r 4393f982d18f rankfilter_text2tabular.xml --- a/rankfilter_text2tabular.xml Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,14 +0,0 @@ - - Convert NIST text to tabular format - rankfilter_GCMS/pdftotabular.py $input $output $output_err - - - - - - - - - This tool converts NIST identification report output (PDF) to a tabular format needed for further use with the RIQC tools. - - diff -r 41f122255d14 -r 4393f982d18f select_on_rank.py --- a/select_on_rank.py Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,21 +0,0 @@ -import csv -import sys - -__author__ = "Marcel Kempenaar" -__contact__ = "brs@nbic.nl" -__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre" -__license__ = "MIT" - -in_file = sys.argv[1] -out_file = sys.argv[2] -to_select_list = [str(select.strip()) for select in sys.argv[3].split(',') if (len(select) > 0)] - -data = list(csv.reader(open(in_file, 'rb'), delimiter='\t')) -header = data.pop(0) -header_clean = [i.lower().strip().replace(".", "").replace("%", "") for i in header] -rank = header_clean.index("rank") - -writer = csv.writer(open(out_file, 'wb'), delimiter='\t') -writer.writerow(header) -for select in to_select_list: - writer.writerows([i for i in data if i[rank] == select]) diff -r 41f122255d14 -r 4393f982d18f select_on_rank.xml --- a/select_on_rank.xml Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,15 +0,0 @@ - - Filter on the Rank field in the RankFilter output file - select_on_rank.py $input $output "$rank" - - - - - - - - -This tool removes all entries with non selected rank values from the input file (supported input file is a RankFilter output file). 
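Outside Galaxy, the script can also be run directly. Judging from the command template in the XML above (input file, output file, then a comma-separated list of ranks to keep), keeping only the hits ranked 1 or 2 would look like this (file names invented)::

    python select_on_rank.py RankFilter_output.tab RankFilter_rank1or2.tab "1,2"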
- - diff -r 41f122255d14 -r 4393f982d18f static_resources/README.txt --- a/static_resources/README.txt Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,3 +0,0 @@ -This folder and respective files should be deployed together with the following tools: - - - ../export_to_metexp_tabular.py \ No newline at end of file diff -r 41f122255d14 -r 4393f982d18f static_resources/elements_and_masses.tab --- a/static_resources/elements_and_masses.tab Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,104 +0,0 @@ -Name Atomic number Chemical symbol Relative atomic mass -Hydrogen 1 H 1.01 -Helium 2 He 4 -Lithium 3 Li 6.94 -Beryllium 4 Be 9.01 -Boron 5 B 10.81 -Carbon 6 C 12.01 -Nitrogen 7 N 14.01 -Oxygen 8 O 16 -Fluorine 9 F 19 -Neon 10 Ne 20.18 -Sodium 11 Na 22.99 -Magnesium 12 Mg 24.31 -Aluminum 13 Al 26.98 -Silicon 14 Si 28.09 -Phosphorus 15 P 30.98 -Sulfur 16 S 32.06 -Chlorine 17 Cl 35.45 -Argon 18 Ar 39.95 -Potassium 19 K 39.1 -Calcium 20 Ca 40.08 -Scandium 21 Sc 44.96 -Titanium 22 Ti 47.9 -Vanadium 23 V 50.94 -Chromium 24 Cr 52 -Manganese 25 Mn 54.94 -Iron 26 Fe 55.85 -Cobalt 27 Co 58.93 -Nickel 28 Ni 58.71 -Copper 29 Cu 63.54 -Zinc 30 Zn 65.37 -Gallium 31 Ga 69.72 -Germanium 32 Ge 72.59 -Arsenic 33 As 74.99 -Selenium 34 Se 78.96 -Bromine 35 Br 79.91 -Krypton 36 Kr 83.8 -Rubidium 37 Rb 85.47 -Strontium 38 Sr 87.62 -Yttrium 39 Y 88.91 -Zirconium 40 Zr 91.22 -Niobium 41 Nb 92.91 -Molybdenum 42 Mo 95.94 -Technetium 43 Tc 96.91 -Ruthenium 44 Ru 101.07 -Rhodium 45 Rh 102.9 -Palladium 46 Pd 106.4 -Silver 47 Ag 107.87 -Cadmium 48 Cd 112.4 -Indium 49 In 114.82 -Tin 50 Sn 118.69 -Antimony 51 Sb 121.75 -Tellurium 52 Te 127.6 -Iodine 53 I 126.9 -Xenon 54 Xe 131.3 -Cesium 55 Cs 132.9 -Barium 56 Ba 137.34 -Lanthanum 57 La 138.91 -Cerium 58 Ce 140.12 -Praseodymium 59 Pr 140.91 -Neodymium 60 Nd 144.24 -Promethium 61 Pm 144.91 -Samarium 62 Sm 150.35 -Europium 63 Eu 151.96 -Gadolinium 64 Gd 157.25 -Terbium 65 Tb 158.92 -Dysprosium 66 Dy 162.5 -Holmium 67 Ho 164.93 -Erbium 68 Er 167.26 -Thulium 69 Tm 168.93 -Ytterbium 70 Yb 173.04 -Lutetium 71 Lu 174.97 -Hafnium 72 Hf 178.49 -Tantalum 73 Ta 180.95 -Wolfram 74 W 183.85 -Rhenium 75 Re 186.2 -Osmium 76 Os 190.2 -Iridium 77 Ir 192.22 -Platinum 78 Pt 195.09 -Gold 79 Au 196.97 -Mercury 80 Hg 200.59 -Thallium 81 Tl 204.37 -Lead 82 Pb 207.19 -Bismuth 83 Bi 208.98 -Polonium 84 Po 208.98 -Astatine 85 At 209.99 -Radon 86 Rn 222.02 -Francium 87 Fr 223.02 -Radium 88 Ra 226 -Actinium 89 Ac 227.03 -Thorium 90 Th 232.04 -Protactinium 91 Pa 231.04 -Uranium 92 U 238.03 -Neptunium 93 Np 237 -Plutonium 94 Pu 242 -Americium 95 Am 243.06 -Curium 96 Cm 247.07 -Berkelium 97 Bk 247.07 -Californium 98 Cf 251.08 -Einsteinium 99 Es 254.09 -Fermium 100 Fm 257.1 -Mendelevium 101 Md 257.1 -Nobelium 102 No 255.09 -Lawrencium 103 Lr 256.1 diff -r 41f122255d14 -r 4393f982d18f tool_dependencies.xml --- a/tool_dependencies.xml Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,13 +0,0 @@ - - - - - - - - This dependency: - Ensures R 3.1.1 installation is triggered (via dependency). - Ensures Bioconductor 3.0 and package metaMS, multtest and snow are installed. 
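Looking back at the static_resources/elements_and_masses.tab table removed above: it maps element symbols to relative atomic masses and was shipped alongside export_to_metexp_tabular.py. A minimal sketch of how such a table can be put to work, for example to estimate a formula weight (the helper names and parsing regex are illustrative, not part of the tools)::

    import csv
    import re

    def load_element_masses(tab_file):
        '''Sketch: read elements_and_masses.tab into a {symbol: mass} dict.'''
        with open(tab_file) as handle:
            reader = csv.DictReader(handle, delimiter='\t')
            return {row['Chemical symbol']: float(row['Relative atomic mass'])
                    for row in reader}

    def formula_weight(formula, masses):
        '''Sketch: sum relative atomic masses for a plain formula such as C6H12O6.'''
        return sum(masses[symbol] * (int(count) if count else 1)
                   for symbol, count in re.findall(r'([A-Z][a-z]{0,2})(\d*)', formula))

    # with the table above, formula_weight('C6H12O6', masses) gives roughly 180.2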
- - \ No newline at end of file diff -r 41f122255d14 -r 4393f982d18f xcms_differential_analysis.r --- a/xcms_differential_analysis.r Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,85 +0,0 @@ -## read args: -args <- commandArgs(TRUE) -#cat("args <- \"\"\n") -## a xcms xset saved as .RData -args.xsetData <- args[1] -#cat(paste("args.xsetData <- \"", args[1], "\"\n", sep="")) - -args.class1 <- args[2] -args.class2 <- args[3] -#cat(paste("args.class1 <- \"", args[2], "\"\n", sep="")) -#cat(paste("args.class2 <- \"", args[3], "\"\n", sep="")) - -args.topcount <- strtoi(args[4]) -#cat(paste("args.topcount <- ", args[4], "\n", sep="")) - -args.outTable <- args[5] - -## report files -args.htmlReportFile <- args[6] -args.htmlReportFile.files_path <- args[7] -#cat(paste("args.htmlReportFile <- \"", args[6], "\"\n", sep="")) -#cat(paste("args.htmlReportFile.files_path <- \"", args[7], "\"\n", sep="")) - - -if (length(args) == 8) -{ - args.outLogFile <- args[8] - # suppress messages: - # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888 - msg <- file(args.outLogFile, open="wt") - sink(msg, type="message") - sink(msg, type="output") -} - -tryCatch( - { - library(metaMS) - library(xcms) - #library("R2HTML") - - # load the xset data : - xsetData <- readRDS(args.xsetData) - # if here to support both scenarios: - if ("xcmsSet" %in% slotNames(xsetData) ) - { - xsetData <- xsetData@xcmsSet - } - - - # info: levels(xcmsSet@phenoData$class) also gives access to the class names - dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE) - # set cairo as default for png plots: - png = function (...) grDevices::png(...,type='cairo') - # run diffreport - reporttab <- diffreport(xsetData, args.class1, args.class2, paste(args.htmlReportFile.files_path,"/fig", sep=""), args.topcount, metlin = 0.15, h=480, w=640) - - # write out tsv table: - write.table(reporttab, args.outTable, sep="\t", row.names=FALSE) - - message("\nGenerating report.........") - - cat("
<html><body><h1>Differential analysis report</h1>
", file= args.htmlReportFile) - #HTML(reporttab[1:args.topcount,], file= args.htmlReportFile) - figuresPath <- paste(args.htmlReportFile.files_path, "/fig_eic", sep="") - message(figuresPath) - listOfFiles <- list.files(path = figuresPath) - for (i in 1:length(listOfFiles)) - { - figureName <- listOfFiles[i] - # maybe we still need to copy the figures to the args.htmlReportFile.files_path - cat(paste("", sep=""), file= args.htmlReportFile, append=TRUE) - cat(paste("", sep=""), file= args.htmlReportFile, append=TRUE) - } - - message("finished generating report") - cat("\nWarnings================:\n") - str( warnings() ) - }, - error=function(cond) { - sink(NULL, type="message") # default setting - sink(stderr(), type="output") - message("\nERROR: ===========\n") - print(cond) - } - ) \ No newline at end of file diff -r 41f122255d14 -r 4393f982d18f xcms_differential_analysis.xml --- a/xcms_differential_analysis.xml Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,55 +0,0 @@ - - Runs xcms diffreport function for differential Analsysis - - R_bioc_metams - - - xcms_differential_analysis.r - $xsetData - "$class1" - "$class2" - $topcount - $outTable - $htmlReportFile - $htmlReportFile.files_path - $outLogFile - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -Runs xcms diffreport for showing the most significant differences between two sets/classes of samples. This tool also creates extracted ion chromatograms (EICs) for -the most significant differences. The figure below shows an example of such an EIC. - -.. image:: $PATH_TO_IMAGES/diffreport.png - - - - - - - 10.1021/ac051437y - - \ No newline at end of file diff -r 41f122255d14 -r 4393f982d18f xcms_get_alignment_eic.r --- a/xcms_get_alignment_eic.r Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,153 +0,0 @@ -# shows all alignment results in a rt region -## read args: -args <- commandArgs(TRUE) -# xset data: -args.xsetData <- args[1] - -args.rtStart <- strtoi(args[2]) -args.rtEnd <- strtoi(args[3]) - -# limit max diff to 600 and minNrSamples to at least 2 : -if (args.rtEnd - args.rtStart > 600) - stop("The RT window should be <= 600") - -args.minNrSamples <- strtoi(args[4]) #min nr samples -if (args.minNrSamples < 2) - stop("Set 'Minimum number of samples' to 2 or higher") - - -args.sampleNames <- strsplit(args[5], ",")[[1]] -# trim leading and trailing spaces: -args.sampleNames <- gsub("^\\s+|\\s+$", "", args.sampleNames) - -## report files -args.htmlReportFile <- args[6] -args.htmlReportFile.files_path <- args[7] - - -if (length(args) == 8) -{ - args.outLogFile <- args[8] - # suppress messages: - # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888 - msg <- file(args.outLogFile, open="wt") - sink(msg, type="message") - sink(msg, type="output") -} - - - -tryCatch( - { - library(metaMS) - - # load the xset data : - xsetData <- readRDS(args.xsetData) - # if here to support both scenarios: - if ("xcmsSet" %in% slotNames(xsetData) ) - { - xsetData <- xsetData@xcmsSet - } - - # report - dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE) - message(paste("\nGenerating report.........in ", args.htmlReportFile.files_path)) - - write(paste("
<html><body><h1>Extracted Ion Chromatograms (EIC) of alignments with peaks in ",args.minNrSamples, " or more samples</h1>
"), - file=args.htmlReportFile) - - gt <- groups(xsetData) - message("\nParsed groups... ") - groupidx1 <- which(gt[,"rtmed"] > args.rtStart & gt[,"rtmed"] < args.rtEnd & gt[,"npeaks"] >= args.minNrSamples) # this should be only on samples selected - - if (length(groupidx1) > 0) - { - message("\nGetting EIC... ") - eiccor <- getEIC(xsetData, groupidx = c(groupidx1), sampleidx=args.sampleNames) - #eicraw <- getEIC(xsetData, groupidx = c(groupidx1), sampleidx=args.sampleNames, rt = "raw") - - #sampleNamesIdx <- which(sampnames(LC$xset@xcmsSet) %in% args.sampleNames, arr.ind = TRUE) - #or (from bioconductor code for getEIC): NB: this is assuming sample indexes used in data are ordered after the order of sample names!! - sampNames <- sampnames(xsetData) - sampleNamesIdx <- match( args.sampleNames, sampNames) - message(paste("Samples: ", sampleNamesIdx)) - - #TODO html <- paste(html, "
") - message(paste("\nPlotting figures... ")) - - #get the mz list (interestingly, this [,"mz"] is relatively slow): - peakMzList <- xsetData@peaks[,"mz"] - peakSampleList <- xsetData@peaks[,"sample"] - #signal to noise list: - peakSnList <- xsetData@peaks[,"sn"] - - message(paste("Total nr of peaks: ", length(peakMzList))) - - for (i in 1:length(groupidx1)) - { - groupMembers <- xsetData@groupidx[[groupidx1[i]]] - - groupMzList <- "" - groupSampleList <- "" - finalGroupSize <- 0 - - for (j in 1:length(groupMembers)) - { - #get only the peaks from the selected samples: - memberSample <- peakSampleList[groupMembers[j]] - memberSn <- peakSnList[groupMembers[j]] - if (!is.na(memberSn) && memberSample %in% sampleNamesIdx) - { - message(paste("Group: ", groupidx1[i], " / Member sample: ", memberSample)) - memberMz <- peakMzList[groupMembers[j]] - - - if (finalGroupSize == 0) - { - groupMzList <- memberMz - groupSampleList <- sampNames[memberSample] - } else { - groupMzList <- paste(groupMzList,",",memberMz, sep="") - groupSampleList <- paste(groupSampleList,",",sampNames[memberSample], sep="") - } - - finalGroupSize <- finalGroupSize +1 - } - } - # here we do the real check on group size and only the groups that have enough - # members with signal to noise > 0 will be plotted here: - if (finalGroupSize >= args.minNrSamples) - { - message(paste("Plotting figure ",i, " of max ", length(groupidx1)," figures... ")) - - figureName <- paste(args.htmlReportFile.files_path, "/figure", i,".png", sep="") - write(paste("", sep="") , - file=args.htmlReportFile, append=TRUE) - - png( figureName, type="cairo", width=800 ) - plot(eiccor, xsetData, groupidx = i) - devname = dev.off() - - write(paste("
<p>Alignment id: [", groupidx1[i], "]. m/z list of peaks in alignment with signal/noise <> NA:<br/>", groupMzList,"</p>
", sep="") , - file=args.htmlReportFile, append=TRUE) - write(paste("
<p>m/z values found in the following samples respectively:<br/>", groupSampleList,"</p>
", sep="") , - file=args.htmlReportFile, append=TRUE) - } - } - - } - - write("", - file=args.htmlReportFile, append=TRUE) - message("finished generating report") - # unlink(args.htmlReportFile) - cat("\nWarnings================:\n") - str( warnings() ) - }, - error=function(cond) { - sink(NULL, type="message") # default setting - sink(stderr(), type="output") - message("\nERROR: ===========\n") - print(cond) - } - ) diff -r 41f122255d14 -r 4393f982d18f xcms_get_alignment_eic.xml --- a/xcms_get_alignment_eic.xml Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,73 +0,0 @@ - - Extracts alignment EICs from feature alignment data - - R_bioc_metams - - - xcms_get_alignment_eic.r - $xsetData - $rtStart - $rtEnd - $minNrSamples - "$sampleNames" - $htmlReportFile - $htmlReportFile.files_path - $outLogFile - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -This tool finds the alignments in the specified RT window and extracts alignment EICs from feature alignment data using XCMS's getEIC method. -It produces a HTML report showing the EIC plots and the mass list of each alignment. The figure below shows an example of such an EIC plot, showing also the difference between -two classes, with extra alignment information beneath it. - -.. image:: $PATH_TO_IMAGES/diffreport.png - -Alignment id: 1709. m/z list of peaks in alignment: -614.002922098482,613.998019830021,614.000382307519,613.998229980469,613.998229980469 - - - - - 10.1021/ac051437y - - \ No newline at end of file diff -r 41f122255d14 -r 4393f982d18f xcms_get_mass_eic.r --- a/xcms_get_mass_eic.r Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,162 +0,0 @@ -## read args: -args <- commandArgs(TRUE) -# xset data: -args.xsetData <- args[1] - -args.rtStart <- strtoi(args[2]) -args.rtEnd <- strtoi(args[3]) - -args.mzStart <- as.double(args[4]) -args.mzEnd <- as.double(args[5]) -# there are 2 options: specify a mz range or a mz list: -if (args.mzStart < 0) -{ - args.mzList <- as.double(strsplit(args[6], ",")[[1]]) - cat(typeof(as.double(strsplit(args[6], ",")[[1]]))) - args.mzTolPpm <- as.double(args[7]) - # calculate mzends based on ppm tol: - mzListEnd <- c() - mzListStart <- c() - for (i in 1:length(args.mzList)) - { - mzEnd <- args.mzList[i] + args.mzList[i]*args.mzTolPpm/1000000.0 - mzStart <- args.mzList[i] - args.mzList[i]*args.mzTolPpm/1000000.0 - mzListEnd <- c(mzListEnd, mzEnd) - mzListStart <- c(mzListStart, mzStart) - } - str(mzListStart) - str(mzListEnd) -} else { - mzListEnd <- c(args.mzEnd) - mzListStart <- c(args.mzStart) -} - -args.sampleNames <- strsplit(args[8], ",")[[1]] -# trim leading and trailing spaces: -args.sampleNames <- gsub("^\\s+|\\s+$", "", args.sampleNames) - -args.combineSamples <- args[9] -args.rtPlotMode <- args[10] - -## report files -args.htmlReportFile <- args[11] -args.htmlReportFile.files_path <- args[12] - - -if (length(args) == 13) -{ - args.outLogFile <- args[13] - # suppress messages: - # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888 - msg <- file(args.outLogFile, open="wt") - sink(msg, type="message") - sink(msg, type="output") -} - -# TODO - add option to do masses in same plot (if given in same line oid) or in separate plots -# TODO2 - let it run in parallel - -tryCatch( - { - library(metaMS) - - # load the xset data : - xsetData <- readRDS(args.xsetData) - # if here to support both scenarios: - if ("xcmsSet" %in% slotNames(xsetData) ) - { - xsetData <- 
xsetData@xcmsSet - } - - # report - dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE) - message(paste("\nGenerating report.........in ", args.htmlReportFile.files_path)) - - html <- "
<html><body><h1>Extracted Ion Chromatograms (EIC) matching criteria</h1>
" - - if (args.combineSamples == "No") - { - if (length(args.sampleNames) > 1 && length(mzListStart) > 1 && length(args.sampleNames) != length(mzListStart)) - stop(paste("The number of sample names should match the number of m/z values in the list. Found ", length(mzListStart), - " masses while ", length(args.sampleNames), " sample names were given.")) - - iterSize <- length(args.sampleNames) - # these can be set to 1 or 0 just as a trick to iterate OR not over the items. If the respective list is of length 1, only the first item should be used - fixSampleIdx <- 1 - fixMzListIdx <- 1 - if (length(args.sampleNames) == 1) - { - fixSampleIdx <- 0 - iterSize <- length(mzListStart) - } - if (length(mzListStart) == 1) - { - fixMzListIdx <- 0 - } - lineColors <- rainbow(iterSize) - for (i in 0:(iterSize-1)) - { - message("\nGetting EIC... ") - eiccor <- getEIC(xsetData, - mzrange=matrix(c(mzListStart[i*fixMzListIdx+1],mzListEnd[i*fixMzListIdx+1]),nrow=1,ncol=2,byrow=TRUE), - rtrange=matrix(c(args.rtStart,args.rtEnd),nrow=1,ncol=2,byrow=TRUE), - sampleidx=c(args.sampleNames[i*fixSampleIdx+1]), rt=args.rtPlotMode) - - message("\nPlotting figures... ") - figureName <- paste(args.htmlReportFile.files_path, "/figure", i,".png", sep="") - html <- paste(html,"
", sep="") - png( figureName, type="cairo", width=1100,height=250 ) - #plot(eiccor, col=lineColors[i+1]) - # black is better in this case: - plot(eiccor) - legend('topright', # places a legend at the appropriate place - legend=c(args.sampleNames[i*fixSampleIdx+1]), # puts text in the legend - lty=c(1,1), # gives the legend appropriate symbols (lines) - lwd=c(2.5,2.5)) - - devname = dev.off() - } - - } else { - for (i in 1:length(mzListStart)) - { - message("\nGetting EIC... ") - eiccor <- getEIC(xsetData, - mzrange=matrix(c(mzListStart[i],mzListEnd[i]),nrow=1,ncol=2,byrow=TRUE), - rtrange=matrix(c(args.rtStart,args.rtEnd),nrow=1,ncol=2,byrow=TRUE), - sampleidx=args.sampleNames, rt = args.rtPlotMode) - - #set size, set option (plot per sample, plot per mass) - - message("\nPlotting figures... ") - figureName <- paste(args.htmlReportFile.files_path, "/figure", i,".png", sep="") - html <- paste(html,"", sep="") - png( figureName, type="cairo", width=1100,height=450 ) - lineColors <- rainbow(length(args.sampleNames)) - plot(eiccor, col=lineColors) - legend('topright', # places a legend at the appropriate place - legend=args.sampleNames, # puts text in the legend - lty=c(1,1), # gives the legend appropriate symbols (lines) - lwd=c(2.5,2.5), - col=lineColors) - devname = dev.off() - } - } - if (args.rtPlotMode == "corrected") - { - html <- paste(html,"
<p>*rt values are corrected ones</p>
") - } - html <- paste(html,"") - message("finished generating report") - write(html,file=args.htmlReportFile) - # unlink(args.htmlReportFile) - cat("\nWarnings================:\n") - str( warnings() ) - }, - error=function(cond) { - sink(NULL, type="message") # default setting - sink(stderr(), type="output") - message("\nERROR: ===========\n") - print(cond) - } - ) diff -r 41f122255d14 -r 4393f982d18f xcms_get_mass_eic.xml --- a/xcms_get_mass_eic.xml Thu Mar 19 12:13:13 2015 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,117 +0,0 @@ - - Extracts EICs for a given list of masses - - R_bioc_metams - - - xcms_get_mass_eic.r - $xsetData - $rtStart - $rtEnd - #if $massParameters.massParametersType == "window" - $massParameters.mzStart - $massParameters.mzEnd - -1 - "." - #else - -1 - -1 - "$massParameters.mzList" - $massParameters.mzTolPpm - #end if - "$sampleNames" - $combineSamples - $rtPlotMode - $htmlReportFile - $htmlReportFile.files_path - $outLogFile - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.. class:: infomark - -This tool will plot EICs for a given list of masses (or a mass window) using XCMS's getEIC method. -It produces a HTML report showing the EIC plots, one for each given mass. The figure below shows an example of such an EIC plot. - -.. image:: $PATH_TO_IMAGES/massEIC.png - - - - - 10.1021/ac051437y - - \ No newline at end of file