# HG changeset patch # User pieter.lukasse@wur.nl # Date 1423342920 -3600 # Node ID dffc3872749667a5de6bd0767664ea7aa0afa841 initial commit diff -r 000000000000 -r dffc38727496 LICENSE --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/LICENSE Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff -r 000000000000 -r dffc38727496 NOTICE --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/NOTICE Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,14 @@ +PRIMS-metabolomics toolset & Galaxy wrappers +============================================ + +Metabolomics module of Plant Research International's Mass Spectrometry (PRIMS) toolsuite. +This toolset consists of custom tools to enable metabolite identifications and +Retention Index (RI) based Quality Control (RIQC) for Mass Spectrometry metabolomics data. + +Copyright: +* 2012: NIST_UTIL and RIQC tools: Copyright (c) 2012 Maarten Kooyman and Marcel Kempenaar, NBIC BRS +* 2013: all tools: Copyright (c) 2013 by Pieter Lukasse, Plant Research International (PRI), + Wageningen, The Netherlands. All rights reserved. See the license text below. + +Galaxy wrappers and installation are available from the Galaxy Tool Shed at: +http://toolshed.g2.bx.psu.edu/view/pieterlukasse/prims_metabolomics \ No newline at end of file diff -r 000000000000 -r dffc38727496 README.rst --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.rst Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,103 @@ +PRIMS-metabolomics toolset & Galaxy wrappers +============================================ + +Metabolomics module of Plant Research International's Mass Spectrometry (PRIMS) toolsuite. +This toolset consists of custom tools to enable metabolite identifications and +Retention Index (RI) based Quality Control (RIQC) for Mass Spectrometry metabolomics data. + +Copyright: +* 2012: NIST_UTIL and RIQC tools: Copyright (c) 2012 Maarten Kooyman and Marcel Kempenaar, NBIC BRS +* 2013: all tools: Copyright (c) 2013 by Pieter Lukasse, Plant Research International (PRI), +Wageningen, The Netherlands. All rights reserved. See the license text below. + +Galaxy wrappers and installation are available from the Galaxy Tool Shed at: +http://toolshed.g2.bx.psu.edu/view/pieterlukasse/prims_metabolomics + +History +======= + +============== ====================================================================== +Date Changes +-------------- ---------------------------------------------------------------------- +December 2014 * Added MsClust support for parsing XCMS alignment results. + * Improved output reports for XCMS wrappers. +November 2014 * Added XCMS related tool wrappers (for metaMS and diffreport) +September 2014 * Added new membership cutoff option for final clusters in MsClust + * Improved MsClust memory usage for large datasets + * Simplified MsClust HTML report + * Added option for microminutes based clustering instead of scannr + based in MsClust +April 2014 * Added interface to ExactMassDB, Pep1000, KEGG, KNApSAcK, Flavonoid + Viewer, LipidMAPS, HMDB, PubChem, by using the service MFSearcher. + This enables users to query multiple public repositories for + elemental compositions from accurate mass values detected by + high-resolution mass spectrometers. NB: see also added + licensing info below. +March 2014 * Added interface to METEXP data store, including tool to fire + queries in batch mode + * Improved quantification output files of MsClust, a.o. sorting + mass list based on intensity (last two columns of quantification + files) +January 2014 * first release via Tool Shed, combining the RIQC and MsClust in a + single package (this package) + * integration with METEXP software (data store for metabolomics + experiments with respective metadata and identifications) +2013 * hand-over of the NIST_UTIL and RIQC tools from the NBIC team to + Plant Research International +2012 * development of MsClust 2.0, making it also suitable for Galaxy +<2011 * development and publication of MsClust 1.0 +============== ====================================================================== + +Tool Versioning +=============== + +PRIMS tools will have versions of the form X.Y.Z. Versions +differing only after the second decimal should be completely +compatible with each other. Breaking changes should result in an +increment of the number before and/or after the first decimal. All +tools of version less than 1.0.0 should be considered beta. + + +Bug Reports & other questions +============================= + +For the time being issues can be reported via the contact form at: +http://www.wageningenur.nl/en/Persons/PNJ-Pieter-Lukasse.htm + +Developers, Contributions & Collaborations +========================================== + +If you wish to join forces and collaborate on some of the +tools do not hesitate to contact Pieter Lukasse via the contact form above. + + +License (Apache, Version 2.0) +============================= + +Copyright 2013 Pieter Lukasse, Plant Research International (PRI). + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this software except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + + +License for third party services +================================ +MFSearcher service : http://webs2.kazusa.or.jp/mfsearcher/#090 +In the MFSearcher system, the compound data provided by KEGG, Flavonoid Viewer, LIPID MAPS, HMDB and PubChem +were downloaded for academic purposes. The compound data of KNApSAcK is provided by Prof. Kanaya in +Nara Institute of Science and Technology (NAIST). The part of these data are utilized to construct the +specified databases for rapid mass searching in the MFSearcher system after re-calculating the molecular weights. +Please preserve the contracts of each original databases when utilizing the search results against these +databases by MFSearcher. + +The searching system of MFSearcher, the ExactMassDB database, and the Pep1000 database by Kazusa DNA +Research Institute is licensed under a Creative Commons Attribution-NonCommercial 3.0 Unported License. \ No newline at end of file diff -r 000000000000 -r dffc38727496 Rscripts/filter-RIDB.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Rscripts/filter-RIDB.R Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,56 @@ +## +# +# Removes duplicates from a RI-database +# +# Usage: +# Rscript filter-RIDB.R /path/to/retention_db.txt output_RIDB_file.txt +# +## + +# Commandline arguments +args <- commandArgs(TRUE) +ridb <- args[1] +out_file <- args[2] + +# Function to check duplicates +duplicates <- function(dat) { + s <- do.call("order", as.data.frame(dat)) + non.dup <- !duplicated(dat[s, ]) + orig.ind <- s[non.dup] + first.occ <- orig.ind[cumsum(non.dup)] + first.occ[non.dup] <- NA + first.occ[order(s)] +} + +# Load CSV file +ridb <- read.csv(ridb,header=TRUE, sep="\t") +## Filters on: CAS FORMULA Column type Column phase type Column name +filter_cols <- c(1, 3, 5, 6, 7) +cat("RIDB dimensions: ") +print(dim(ridb)) +deleted <- NULL +cat("Checking for duplicates...") +dups <- duplicates(ridb[,filter_cols]) +cat("\t[DONE]\nRemoving duplicates...") +newridb <- ridb +newridb["min"] <- NA +newridb["max"] <- NA +newridb["orig.columns"] <- NA +for (i in unique(dups)) { + if (!is.na(i)) { + rows <- which(dups == i) + duprows <- ridb[c(i, rows),] + # Replace duplicate rows with one row containing the median value + new_RI <- median(duprows$RI) + newridb$RI[i] <- median(duprows$RI) + newridb$min[i] <- min(duprows$RI) + newridb$max[i] <- max(duprows$RI) + newridb$orig.columns[i] <- paste(rows, collapse=",") + deleted <- c(deleted, rows) + } +} +cat("\t\t[DONE]\nCreating new dataset...") +out_ridb <- newridb[-deleted,] +cat("\t\t[DONE]\nWriting new dataset...") +write.table(out_ridb, na='', file=out_file, quote=T, sep="\t", row.names=F) +cat("\t\t[DONE]\n") diff -r 000000000000 -r dffc38727496 Rscripts/ridb-regression.R --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Rscripts/ridb-regression.R Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,208 @@ +## +# +# Performs regression analysis using either 3rd degree polynomial- or linear-method +# +## + +# Commandline arguments +args <- commandArgs(TRUE) +if (length(args) < 7) + stop(cat("Missing arguments, usage:\n\tRscript ridb-regression.R RI-database ", + "ouput_file logfile min_residuals range_mod pvalue rsquared method ", + "plot(yes/no) plot_archive")) + +ridb <- args[1] +out_file <- args[2] +logfile <- args[3] +min_residuals <- as.integer(args[4]) +range_mod <- as.integer(args[5]) +pvalue <- as.double(args[6]) +rsquared <- as.double(args[7]) +method <- args[8] +plot <- tolower(args[9]) +if (plot == 'true') + plot_archive = args[10] + +# Do not show warnings etc. +sink(file='/dev/null') + +progress <- c() +logger <- function(logdata) { + ## Logs progress, adds a timestamp for each event + #cat(paste(Sys.time(), "\t", logdata, "\n", sep="")) ## DEBUG + progress <<- c(progress, paste(Sys.time(), "\t", logdata, sep="")) +} + +logger("Reading Retention Index Database..") + +# Read Retention Index Database +ridb <- read.csv(ridb, header=TRUE, sep="\t") +logger(paste("\t", nrow(ridb), "records read..")) +# Get a unique list +gc_columns <- unique(as.vector(as.matrix(ridb['Column.name'])[,1])) +cas_numbers <- unique(as.vector(as.matrix(ridb['CAS'])[,1])) + +add_poly_fit <- function(fit, gc1_index, gc2_index, range) { + pval = anova.lm(fit)$Pr + r.squared = summary(fit)$r.squared + + data = rep(NA, 11) + # Append results to matrix + data[1] = gc_columns[gc1_index] # Column 1 + data[2] = gc_columns[gc2_index] # Column 2 + data[3] = coefficients(fit)[1] # The 4 coefficients + data[4] = coefficients(fit)[2] + data[5] = coefficients(fit)[3] + data[6] = coefficients(fit)[4] + data[7] = range[1] # Left limit + data[8] = range[2] # Right limit + data[9] = length(fit$residuals) # Number of datapoints analysed + data[10] = pval[1] # p-value for resulting fitting + data[11] = r.squared # R-squared + return(data) +} + + +add_linear_fit <- function(fit, gc1_index, gc2_index, range) { + pval = anova.lm(fit)$Pr + r.squared = summary(fit)$r.squared + + data = rep(NA, 7) + # Append results to matrix + data[1] = gc_columns[gc1_index] # Column 1 + data[2] = gc_columns[gc2_index] # Column 2 + data[3] = coefficients(fit)[1] # The 4 coefficients + data[4] = coefficients(fit)[2] + data[7] = length(fit$residuals) # Number of datapoints analysed + data[8] = pval[1] # p-value for resulting fitting + data[9] = r.squared # R-squared + return(data) +} + + +add_fit <- function(fit, gc1_index, gc2_index, range, method) { + if (method == 'poly') + return(add_poly_fit(fit, gc1_index, gc2_index, range)) + else + return(add_linear_fit(fit, gc1_index, gc2_index, range)) +} + + +plot_fit <- function(ri1, ri2, gc1_index, gc2_index, coeff, range, method) { + if (method == 'poly') + pol <- function(x) coeff[4]*x^3 + coeff[3]*x^2 + coeff[2]*x + coeff[1] + else + pol <- function(x) coeff[2]*x + coeff[1] + pdf(paste('regression_model_', + make.names(gc_columns[gc1_index]), '_vs_', + make.names(gc_columns[gc2_index]), '.pdf', sep='')) + curve(pol, 250:3750, col="red", lwd=2.5, main='Regression Model', xlab=gc_columns[gc1_index], + ylab=gc_columns[gc2_index], xlim=c(250, 3750), ylim=c(250, 3750)) + points(ri1, ri2, lwd=0.4) + # Add vertical lines showing left- and right limits when using poly method + if (method == 'poly') + abline(v=range, col="grey", lwd=1.5) + dev.off() +} + +# Initialize output dataframe +if (method == 'poly') { + m <- data.frame(matrix(ncol = 11, nrow = 10)) +} else { + m <- data.frame(matrix(ncol = 9, nrow = 10)) +} + + +get_fit <- function(gc1, gc2, method) { + if (method == 'poly') + return(lm(gc1 ~ poly(gc2, 3, raw=TRUE))) + else + return(lm(gc1 ~ gc2)) +} + +# Permutate +k <- 1 +logger(paste("Permutating (with ", length(gc_columns), " GC-columns)..", sep="")) + +for (i in 1:(length(gc_columns)-1)) { + logger(paste("\tCalculating model for ", gc_columns[i], "..", sep="")) + breaks <- 0 + for (j in (i+1):length(gc_columns)) { + col1 = ridb[which(ridb['Column.name'][,1] == gc_columns[i]),] + col2 = ridb[which(ridb['Column.name'][,1] == gc_columns[j]),] + + # Find CAS numbers for which both columns have data (intersect) + cas_intersect = intersect(col1[['CAS']], col2[['CAS']]) + + # Skip if number of shared CAS entries is < cutoff + if (length(cas_intersect) < min_residuals) { + breaks = breaks + 1 + next + } + # Gather Retention Indices + col1_data = col1[['RI']][match(cas_intersect, col1[['CAS']])] + col2_data = col2[['RI']][match(cas_intersect, col2[['CAS']])] + + # Calculate the range within which regression is possible (and move if 'range_mod' != 0) + range = c(min(c(min(col1_data), min(col2_data))), max(c(max(col1_data), max(col2_data)))) + if (range_mod != 0) { + # Calculate percentage and add/subtract from range + perc = diff(range) / 100 + perc_cutoff = range_mod * perc + range = as.integer(range + c(perc_cutoff, -perc_cutoff)) + } + + # Calculate model for column1 vs column2 and plot if requested + fit = get_fit(col1_data, col2_data, method) + m[k,] = add_fit(fit, i, j, range, method) + + if (plot == 'true') + plot_fit(col1_data, col2_data, i, j, coefficients(fit), range, method) + + # Calculate model for column2 vs column1 and plot if requested + fit = get_fit(col2_data, col1_data, method) + m[k + 1,] = add_fit(fit, j, i, range, method) + + if (plot == 'true') + plot_fit(col2_data, col1_data, j, i, coefficients(fit), range, method) + + k = k + 2 + } + logger(paste("\t\t", breaks, " comparisons have been skipped due to nr. of datapoints < cutoff", sep="")) +} + +# Filter on pvalue and R-squared +logger("Filtering on pvalue and R-squared..") +if (method == 'poly') { + pval_index <- which(m[,10] < pvalue) + rsquared_index <- which(m[,11] > rsquared) +} else { + pval_index <- which(m[,8] < pvalue) + rsquared_index <- which(m[,9] > rsquared) +} +logger(paste(nrow(m) - length(pval_index), " models discarded due to pvalue > ", pvalue, sep="")) + +logger(paste(nrow(m) - length(rsquared_index), " models discarded due to R-squared < ", rsquared, sep="")) + +# Remaining rows +index = unique(c(pval_index, rsquared_index)) + +# Reduce dataset +m = m[index,] +sink() + +# Place plots in the history as a ZIP file +if (plot == 'true') { + logger("Creating archive with model graphics..") + system(paste("zip -9 -r models.zip *.pdf > /dev/null", sep="")) + system(paste("cp models.zip ", plot_archive, sep="")) +} + +# Save dataframe as tab separated file +logger("All done, saving data..") +header = c("Column1", "Column2", "Coefficient1", "Coefficient2", "Coefficient3", "Coefficient4", + "LeftLimit", "RightLimit", "Residuals", "pvalue", "Rsquared") +if (method != 'poly') + header = header[c(1:4, 7:11)] +write(progress, logfile) +write.table(m, file=out_file, sep="\t", quote=FALSE, col.names=header, row.names=FALSE) diff -r 000000000000 -r dffc38727496 __init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/__init__.py Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,6 @@ +''' +Module containing Galaxy tools for the LC or GC/MS pipeline +Created on Mar , 2014 + +@author: pieter lukasse +''' \ No newline at end of file diff -r 000000000000 -r dffc38727496 combine_output.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/combine_output.py Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,253 @@ +#!/usr/bin/env python +# encoding: utf-8 +''' +Module to combine output from two GCMS Galaxy tools (RankFilter and CasLookup) +''' + +import csv +import re +import sys +import math +import pprint + +__author__ = "Marcel Kempenaar" +__contact__ = "brs@nbic.nl" +__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre" +__license__ = "MIT" + +def _process_data(in_csv): + ''' + Generic method to parse a tab-separated file returning a dictionary with named columns + @param in_csv: input filename to be parsed + ''' + data = list(csv.reader(open(in_csv, 'rU'), delimiter='\t')) + header = data.pop(0) + # Create dictionary with column name as key + output = {} + for index in xrange(len(header)): + output[header[index]] = [row[index] for row in data] + return output + + +def _merge_data(rankfilter, caslookup): + ''' + Merges data from both input dictionaries based on the Centrotype field. This method will + build up a new list containing the merged hits as the items. + @param rankfilter: dictionary holding RankFilter output in the form of N lists (one list per attribute name) + @param caslookup: dictionary holding CasLookup output in the form of N lists (one list per attribute name) + ''' + # TODO: test for correct input files -> rankfilter and caslookup internal lists should have the same lenghts: + if (len(rankfilter['ID']) != len(caslookup['Centrotype'])): + raise Exception('rankfilter and caslookup files should have the same nr of rows/records ') + + merged = [] + processed = {} + for compound_id_idx in xrange(len(rankfilter['ID'])): + compound_id = rankfilter['ID'][compound_id_idx] + if not compound_id in processed : + # keep track of processed items to not repeat them + processed[compound_id] = compound_id + # get centrotype nr + centrotype = compound_id.split('-')[0] + # Get the indices for current compound ID in both data-structures for proper matching + rindex = [index for index, value in enumerate(rankfilter['ID']) if value == compound_id] + cindex = [index for index, value in enumerate(caslookup['Centrotype']) if value == centrotype] + + merged_hits = [] + # Combine hits + for hit in xrange(len(rindex)): + # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do + # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its + # corresponding value in the rankfilter or caslookup tables; i.e. + # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute + # represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index) + rf_record = dict(zip(rankfilter.keys(), [rankfilter[key][rindex[hit]] for key in rankfilter.keys()])) + cl_record = dict(zip(caslookup.keys(), [caslookup[key][cindex[hit]] for key in caslookup.keys()])) + + merged_hit = _add_hit(rf_record, cl_record) + merged_hits.append(merged_hit) + + merged.append(merged_hits) + + return merged, len(rindex) + + +def _add_hit(rankfilter, caslookup): + ''' + Combines single records from both the RankFilter- and CasLookup-tools + @param rankfilter: record (dictionary) of one compound in the RankFilter output + @param caslookup: matching record (dictionary) of one compound in the CasLookup output + ''' + # The ID in the RankFilter output contains the following 5 fields: + rf_id = rankfilter['ID'].split('-') + try: + name, formula = _remove_formula(rankfilter['Name']) + hit = [rf_id[0], # Centrotype + rf_id[1], # cent.Factor + rf_id[2], # scan nr + rf_id[3], # R.T. (umin) + rf_id[4], # nr. Peaks + # Appending other fields + rankfilter['R.T.'], + name, + caslookup['FORMULA'] if not formula else formula, + rankfilter['Library'].strip(), + rankfilter['CAS'].strip(), + rankfilter['Forward'], + rankfilter['Reverse'], + ((float(rankfilter['Forward']) + float(rankfilter['Reverse'])) / 2), + rankfilter['RIexp'], + caslookup['RI'], + rankfilter['RIsvr'], + # Calculate absolute differences + math.fabs(float(rankfilter['RIexp']) - float(rankfilter['RIsvr'])), + math.fabs(float(caslookup['RI']) - float(rankfilter['RIexp'])), + caslookup['Regression.Column.Name'], + caslookup['min'], + caslookup['max'], + caslookup['nr.duplicates'], + caslookup['Column.phase.type'], + caslookup['Column.name'], + rankfilter['Rank'], + rankfilter['%rel.err'], + rankfilter['Synonyms']] + except KeyError as error: + print "Problem reading in data from input file(s):\n", + print "Respective CasLookup entry: \n", pprint.pprint(caslookup), "\n" + print "Respective RankFilter entry: \n", pprint.pprint(rankfilter), "\n" + raise error + + return hit + + +def _remove_formula(name): + ''' + The RankFilter Name field often contains the Formula as well, this function removes it from the Name + @param name: complete name of the compound from the RankFilter output + ''' + name = name.split() + poss_formula = name[-1] + match = re.match("^(([A-Z][a-z]{0,2})(\d*))+$", poss_formula) + if match: + return ' '.join(name[:-1]), poss_formula + else: + return ' '.join(name), False + + +def _get_default_caslookup(): + ''' + The Cas Lookup tool might not have found all compounds in the library searched, + this default dict will be used to combine with the Rank Filter output + ''' + return {'FORMULA': 'N/A', + 'RI': '0.0', + 'Regression.Column.Name': 'None', + 'min': '0.0', + 'max': '0.0', + 'nr.duplicates': '0', + 'Column.phase.type': 'N/A', + 'Column.name': 'N/A'} + + +def _save_data(data, nhits, out_csv_single, out_csv_multi): + ''' + Writes tab-separated data to file + @param data: dictionary containing merged dataset + @param out_csv: output csv file + ''' + # Columns we don't repeat: + header_part1 = ['Centrotype', + 'cent.Factor', + 'scan nr.', + 'R.T. (umin)', + 'nr. Peaks', + 'R.T.'] + # These are the headers/columns we repeat in case of + # combining hits in one line (see alternative_headers method below): + header_part2 = [ + 'Name', + 'FORMULA', + 'Library', + 'CAS', + 'Forward', + 'Reverse', + 'Avg. (Forward, Reverse)', + 'RIexp', + 'RI', + 'RIsvr', + 'RIexp - RIsvr', + 'RI - RIexp', + 'Regression.Column.Name', + 'min', + 'max', + 'nr.duplicates', + 'Column.phase.type', + 'Column.name', + 'Rank', + '%rel.err', + 'Synonyms'] + + # Open output file for writing + outfile_single_handle = open(out_csv_single, 'wb') + outfile_multi_handle = open(out_csv_multi, 'wb') + output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") + output_multi_handle = csv.writer(outfile_multi_handle, delimiter="\t") + + # Write headers + output_single_handle.writerow(header_part1 + header_part2) + output_multi_handle.writerow(header_part1 + header_part2 + alternative_headers(header_part2, nhits-1)) + # Combine all hits for each centrotype into one line + line = [] + for centrotype_idx in xrange(len(data)): + i = 0 + for hit in data[centrotype_idx]: + if i==0: + line.extend(hit) + else: + line.extend(hit[6:]) + i = i+1 + # small validation (if error, it is a programming error): + if i > nhits: + raise Exception('Error: more hits that expected for centrotype_idx ' + centrotype_idx) + output_multi_handle.writerow(line) + line = [] + + # Write one line for each centrotype + for centrotype_idx in xrange(len(data)): + for hit in data[centrotype_idx]: + output_single_handle.writerow(hit) + +def alternative_headers(header_part2, nr_alternative_hits): + ''' + This method will iterate over the header names and add the string 'ALT#_' before each, + where # is the number of the alternative, according to number of alternative hits we want to add + to final csv/tsv + ''' + result = [] + for i in xrange(nr_alternative_hits): + for header_name in header_part2: + result.append("ALT" + str(i+1) + "_" + header_name) + return result + +def main(): + ''' + Combine Output main function + It will merge the result files from "RankFilter" and "Lookup RI for CAS numbers" + NB: the caslookup_result_file will typically have fewer lines than + rankfilter_result_file, so the merge has to consider this as well. The final file + should have the same nr of lines as rankfilter_result_file. + ''' + rankfilter_result_file = sys.argv[1] + caslookup_result_file = sys.argv[2] + output_single_csv = sys.argv[3] + output_multi_csv = sys.argv[4] + + # Read RankFilter and CasLookup output files + rankfilter = _process_data(rankfilter_result_file) + caslookup = _process_data(caslookup_result_file) + merged, nhits = _merge_data(rankfilter, caslookup) + _save_data(merged, nhits, output_single_csv, output_multi_csv) + + +if __name__ == '__main__': + main() diff -r 000000000000 -r dffc38727496 combine_output.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/combine_output.xml Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,36 @@ + + Perform a combination of output data from the RankFilter and CasLookup tools + + combine_output.py $rankfilter_in $caslookup_in $out_single $out_multi + + + + + + + + + + + +Performs a combination of output files from the 'RankFilter' and 'Lookup RI for CAS' tools into two tab-separated files. + +The files produced are contain either all hits for a compound on a single line (Single) or on separate lines +(Multi). + +.. class:: infomark + +**Notes** + +The input data should be produced by the RankFilter and 'Lookup RI for CAS' tools provided on this Galaxy server with the +original headers kept intact. Processing steps include: + + - Added columns showing the average Forward/Reverse values, RIexp - RIsvr and RI - RIexp values + - The ID column of the RankFilter tool output is split into 'Centrotype', 'cent.Factor', 'scan nr.', 'R.T. (umin)' + and 'nr. Peaks' fields. + - The formula is split off the 'Name' field in the RankFilter output + + + diff -r 000000000000 -r dffc38727496 create_model.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/create_model.xml Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,78 @@ + + Generate coefficients to enable the regression from one GC-column + to another GC-column + Rscripts/ridb-regression.R + $ridb + $out_model + $out_log + $min_residuals + $range_mod + $pvalue + $rsquared + $method + $plot + #if $plot + $model_graphics + #end if + + + + + + + + + + + + + + + + + + (plot) + + + + + +Calculates regression models for a permutation of all GC columns contained in the selected +RI database file. The method used for creating the model is either based on a 3rd degree +polynomial or a standard linear model. + +The *Minimum number of residuals* option will only allow regression if the columns it is based +on has at least that number of datapoints on the same compound. + +Filtering is possible by setting an upper limit for the *p-value* and / or a lower limit for +the *R squared* value. The produced logfile will state how many models have been discarded due +to this filtering. The output model file also includes the p-value and R squared value for +each created model. + +Graphical output of the models is available by selecting the plot option which shows the +data points used for the model as well as the fit itself and the range of data that will +be usable. + +.. class:: infomark + +**Notes** + +The output file produced by this tool is required as input for the CasLookup tool when +selecting to apply regression when finding hits in the RIDB. + + diff -r 000000000000 -r dffc38727496 datatypes_conf.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/datatypes_conf.xml Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff -r 000000000000 -r dffc38727496 export_to_metexp_tabular.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/export_to_metexp_tabular.py Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,247 @@ +#!/usr/bin/env python +# encoding: utf-8 +''' +Module to combine output from the GCMS Galaxy tools RankFilter, CasLookup and MsClust +into a tabular file that can be uploaded to the MetExp database. + +RankFilter, CasLookup are already combined by combine_output.py so here we will use +this result. Furthermore here one of the MsClust +quantification files containing the respective spectra details are to be combined as well. + +Extra calculations performed: +- The column MW is also added here and is derived from the column FORMULA found + in RankFilter, CasLookup combined result. + +So in total here we merge 2 files and calculate one new column. +''' +from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 +import csv +import re +import sys +from collections import OrderedDict + +__author__ = "Pieter Lukasse" +__contact__ = "pieter.lukasse@wur.nl" +__copyright__ = "Copyright, 2013, Plant Research International, WUR" +__license__ = "Apache v2" + +def _process_data(in_csv, delim='\t'): + ''' + Generic method to parse a tab-separated file returning a dictionary with named columns + @param in_csv: input filename to be parsed + ''' + data = list(csv.reader(open(in_csv, 'rU'), delimiter=delim)) + header = data.pop(0) + # Create dictionary with column name as key + output = OrderedDict() + for index in xrange(len(header)): + output[header[index]] = [row[index] for row in data] + return output + +ONE_TO_ONE = 'one_to_one' +N_TO_ONE = 'n_to_one' + +def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, metadata, relation_type=ONE_TO_ONE): + ''' + Merges data from both input dictionaries based on the link fields. This method will + build up a new list containing the merged hits as the items. + @param set1: dictionary holding set1 in the form of N lists (one list per attribute name) + @param set2: dictionary holding set2 in the form of N lists (one list per attribute name) + ''' + # TODO test for correct input files -> same link_field values should be there + # (test at least number of unique link_field values): + # + # if (len(set1[link_field_set1]) != len(set2[link_field_set2])): + # raise Exception('input files should have the same nr of key values ') + + + merged = [] + processed = {} + for link_field_set1_idx in xrange(len(set1[link_field_set1])): + link_field_set1_value = set1[link_field_set1][link_field_set1_idx] + if not link_field_set1_value in processed : + # keep track of processed items to not repeat them + processed[link_field_set1_value] = link_field_set1_value + + # Get the indices for current link_field_set1_value in both data-structures for proper matching + set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value] + set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value)==True ] + # Validation : + if len(set2index) == 0: + # means that corresponding data could not be found in set2, then throw error + raise Exception("Datasets not compatible, merge not possible. " + link_field_set1 + "=" + + link_field_set1_value + " only found in first dataset. ") + + merged_hits = [] + # Combine hits + for hit in xrange(len(set1index)): + # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do + # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its + # corresponding value in the sets; i.e. + # set1[key] => returns the list/array with size = nrrows, with the values for the attribute + # represented by "key". + # set1index[hit] => points to the row nr=hit (hit is a rownr/index) + # So set1[x][set1index[n]] = set1.attributeX.instanceN + # + # It just ensures the entry is made available as a plain named array for easy access. + rf_record = OrderedDict(zip(set1.keys(), [set1[key][set1index[hit]] for key in set1.keys()])) + if relation_type == ONE_TO_ONE : + cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[hit]] for key in set2.keys()])) + else: + # is N to 1: + cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[0]] for key in set2.keys()])) + + merged_hit = merge_function(rf_record, cl_record, metadata) + merged_hits.append(merged_hit) + + merged.append(merged_hits) + + return merged, len(set1index) + + +def _compare_records(key1, key2): + ''' + in this case the compare method is really simple as both keys are expected to contain + same value when records are the same + ''' + if key1 == key2: + return True + else: + return False + + + +def _merge_records(rank_caslookup_combi, msclust_quant_record, metadata): + ''' + Combines single records from both the RankFilter+CasLookup combi file and from MsClust file + + @param rank_caslookup_combi: rankfilter and caslookup combined record (see combine_output.py) + @param msclust_quant_record: msclust quantification + spectrum record + ''' + record = [] + for column in rank_caslookup_combi: + record.append(rank_caslookup_combi[column]) + + for column in msclust_quant_record: + record.append(msclust_quant_record[column]) + + for column in metadata: + record.append(metadata[column]) + + # add MOLECULAR MASS (MM) + molecular_mass = get_molecular_mass(rank_caslookup_combi['FORMULA']) + # limit to two decimals: + record.append("{0:.2f}".format(molecular_mass)) + + # add MOLECULAR WEIGHT (MW) - TODO - calculate this + record.append('0.0') + + # level of identification and Location of reference standard + record.append('0') + record.append('') + + return record + + +def get_molecular_mass(formula): + ''' + Calculates the molecular mass (MM). + E.g. MM of H2O = (relative)atomic mass of H x2 + (relative)atomic mass of O + ''' + + # Each element is represented by a capital letter, followed optionally by + # lower case, with one or more digits as for how many elements: + element_pattern = re.compile("([A-Z][a-z]?)(\d*)") + + total_mass = 0 + for (element_name, count) in element_pattern.findall(formula): + if count == "": + count = 1 + else: + count = int(count) + element_mass = float(elements_and_masses_map[element_name]) # "found: Python's built-in float type has double precision " (? check if really correct ?) + total_mass += element_mass * count + + return total_mass + + + +def _save_data(data, headers, out_csv): + ''' + Writes tab-separated data to file + @param data: dictionary containing merged dataset + @param out_csv: output csv file + ''' + + # Open output file for writing + outfile_single_handle = open(out_csv, 'wb') + output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") + + # Write headers + output_single_handle.writerow(headers) + + # Write + for item_idx in xrange(len(data)): + for hit in data[item_idx]: + output_single_handle.writerow(hit) + + +def _get_map_for_elements_and_masses(elements_and_masses): + ''' + This method will read out the column 'Chemical symbol' and make a map + of this, storing the column 'Relative atomic mass' as its value + ''' + resultMap = {} + index = 0 + for entry in elements_and_masses['Chemical symbol']: + resultMap[entry] = elements_and_masses['Relative atomic mass'][index] + index += 1 + + return resultMap + + +def init_elements_and_masses_map(): + ''' + Initializes the lookup map containing the elements and their respective masses + ''' + elements_and_masses = _process_data(resource_filename(__name__, "static_resources/elements_and_masses.tab")) + global elements_and_masses_map + elements_and_masses_map = _get_map_for_elements_and_masses(elements_and_masses) + + +def main(): + ''' + Combine Output main function + + RankFilter, CasLookup are already combined by combine_output.py so here we will use + this result. Furthermore here the MsClust spectra file (.MSP) and one of the MsClust + quantification files are to be combined with combine_output.py result as well. + ''' + rankfilter_and_caslookup_combined_file = sys.argv[1] + msclust_quantification_and_spectra_file = sys.argv[2] + output_csv = sys.argv[3] + # metadata + metadata = OrderedDict() + metadata['organism'] = sys.argv[4] + metadata['tissue'] = sys.argv[5] + metadata['experiment_name'] = sys.argv[6] + metadata['user_name'] = sys.argv[7] + metadata['column_type'] = sys.argv[8] + + # Read RankFilter and CasLookup output files + rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file) + msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, ',') + + # Read elements and masses to use for the MW/MM calculation : + init_elements_and_masses_map() + + merged, nhits = _merge_data(rankfilter_and_caslookup_combined, 'Centrotype', + msclust_quantification_and_spectra, 'centrotype', + _compare_records, _merge_records, metadata, + N_TO_ONE) + headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys() + metadata.keys() + ['MM','MW', 'Level of identification', 'Location of reference standard'] + _save_data(merged, headers, output_csv) + + +if __name__ == '__main__': + main() diff -r 000000000000 -r dffc38727496 export_to_metexp_tabular.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/export_to_metexp_tabular.xml Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,75 @@ + + Create tabular file for loading into METabolomics EXPlorer database + + export_to_metexp_tabular.py + $rankfilter_and_caslookup_combi + $msclust_quant_file + $output_result + "$organism" + "$tissue" + "$experiment_name" + "$user_name" + "$column_type" + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.. class:: infomark + +Tool to combine output from the tools RankFilter, CasLookup and MsClust +into a tabular file that can be uploaded to the METabolomics EXPlorer (MetExp) database. + +RankFilter, CasLookup are already combined by 'RIQC-Combine RankFilter and CasLookup' tool so here we will use +this result. + +**Notes** + +Extra calculations performed: +- The columns MM and MW are also added here and are derived from the column FORMULA found in RankFilter, CasLookup combined result. + +So in total here we merge 2 files and calculate one new column. + + + + diff -r 000000000000 -r dffc38727496 library_lookup.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/library_lookup.py Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,327 @@ +''' +Logic for searching a Retention Index database file given output from NIST +''' +import match_library +import re +import sys +import csv + +__author__ = "Marcel Kempenaar" +__contact__ = "brs@nbic.nl" +__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre" +__license__ = "MIT" + +def create_lookup_table(library_file, column_type_name, statphase): + ''' + Creates a dictionary holding the contents of the library to be searched + @param library_file: library to read + @param column_type_name: the columns type name + @param statphase: the columns stationary phase + ''' + (data, header) = match_library.read_library(library_file) + # Test for presence of required columns + if ('columntype' not in header or + 'columnphasetype' not in header or + 'cas' not in header): + raise IOError('Missing columns in ', library_file) + + column_type_column = header.index("columntype") + statphase_column = header.index("columnphasetype") + cas_column = header.index("cas") + + filtered_library = [line for line in data if line[column_type_column] == column_type_name + and line[statphase_column] == statphase] + lookup_dict = {} + for element in filtered_library: + # Here the cas_number is set to the numeric part of the cas_column value, so if the + # cas_column value is 'C1433' then cas_number will be '1433' + cas_number = str(re.findall(r'\d+', (element[cas_column]).strip())[0]) + try: + lookup_dict[cas_number].append(element) + except KeyError: + lookup_dict[cas_number] = [element] + return lookup_dict + + +def _preferred(hits, pref, ctype, polar, model, method): + ''' + Returns all entries in the lookup_dict that have the same column name, type and polarity + as given by the user, uses regression if selected given the model and method to use. The + regression is applied on the column with the best R-squared value in the model + @param hits: all entries in the lookup_dict for the given CAS number + @param pref: preferred GC-column, can be one or more names + @param ctype: column type (capillary etc.) + @param polar: polarity (polar / non-polar etc.) + @param model: data loaded from file containing regression models + @param method: supported regression method (i.e. poly(nomial) or linear) + ''' + match = [] + for column in pref: + for hit in hits: + if hit[4] == ctype and hit[5] == polar and hit[6] == column: + # Create copy of found hit since it will be altered downstream + match.extend(hit) + return match, False + + # No hit found for current CAS number, return if not performing regression + if not model: + return False, False + + # Perform regression + for column in pref: + if column not in model: + break + # Order regression candidates by R-squared value (last element) + order = sorted(model[column].items(), key=lambda col: col[1][-1]) + # Create list of regression candidate column names + regress_columns = list(reversed([column for (column, _) in order])) + # Names of available columns + available = [hit[6] for hit in hits] + + # TODO: combine Rsquared and number of datapoints to get the best regression match + ''' + # Iterate regression columns (in order) and retrieve their models + models = {} + for col in regress_columns: + if col in available: + hit = list(hits[available.index(col)]) + if hit[4] == ctype: + # models contains all model data including residuals [-2] and rsquared [-1] + models[pref[0]] = model[pref[0]][hit[6]] + # Get the combined maximum for residuals and rsquared + best_match = models[] + # Apply regression + if method == 'poly': + regressed = _apply_poly_regression(best_match, hit[6], float(hit[3]), model) + if regressed: + hit[3] = regressed + else: + return False, False + else: + hit[3] = _apply_linear_regression(best_match, hit[6], float(hit[3]), model) + match.extend(hit) + return match, hit[6] + ''' + + for col in regress_columns: + if col in available: + hit = list(hits[available.index(col)]) + if hit[4] == ctype: + # Perform regression using a column for which regression is possible + if method == 'poly': + # Polynomial is only possible within a set border, if the RI falls outside + # of this border, skip this lookup + regressed = _apply_poly_regression(pref[0], hit[6], float(hit[3]), model) + if regressed: + hit[3] = regressed + else: + return False, False + else: + hit[3] = _apply_linear_regression(pref[0], hit[6], float(hit[3]), model) + match.extend(hit) + return match, hit[6] + + return False, False + + + +def default_hit(row, cas_nr, compound_id): + ''' + This method will return a "default"/empty hit for cases where the + method _preferred() returns False (i.e. a RI could not be found + for the given cas nr, also not via regression. + ''' + return [ + #'CAS', + 'C' + cas_nr, + #'NAME', + '', + #'FORMULA', + '', + #'RI', + '0.0', + #'Column.type', + '', + #'Column.phase.type', + '', + #'Column.name', + '', + #'phase.coding', + ' ', + #'CAS_column.Name', + '', + #'Centrotype', -> NOTE THAT compound_id is not ALWAYS centrotype...depends on MsClust algorithm used...for now only one MsClust algorithm is used so it is not an issue, but this should be updated/corrected once that changes + compound_id, + #'Regression.Column.Name', + '', + #'min', + '', + #'max', + '', + #'nr.duplicates', + ''] + + +def format_result(lookup_dict, nist_tabular_filename, pref, ctype, polar, model, method): + ''' + Looks up the compounds in the library lookup table and formats the results + @param lookup_dict: dictionary containing the library to be searched + @param nist_tabular_filename: NIST output file to be matched + @param pref: (list of) column-name(s) to look for + @param ctype: column type of interest + @param polar: polarity of the used column + @param model: data loaded from file containing regression models + @param method: supported regression method (i.e. poly(nomial) or linear) + ''' + (nist_tabular_list, header_clean) = match_library.read_library(nist_tabular_filename) + # Retrieve indices of the CAS and compound_id columns (exit if not present) + try: + casi = header_clean.index("cas") + idi = header_clean.index("id") + except: + raise IOError("'CAS' or 'compound_id' not found in header of library file") + + data = [] + for row in nist_tabular_list: + casf = str(row[casi].replace('-', '').strip()) + compound_id = str(row[idi].split('-')[0]) + if casf in lookup_dict: + found_hit, regress = _preferred(lookup_dict[casf], pref, ctype, polar, model, method) + if found_hit: + # Keep cas nr as 'C'+ numeric part: + found_hit[0] = 'C' + casf + # Add compound id + found_hit.insert(9, compound_id) + # Add information on regression process + found_hit.insert(10, regress if regress else 'None') + # Replace column index references with actual number of duplicates + dups = len(found_hit[-1].split(',')) + if dups > 1: + found_hit[-1] = str(dups + 1) + else: + found_hit[-1] = '0' + data.append(found_hit) + found_hit = '' + else: + data.append(default_hit(row, casf, compound_id)) + else: + data.append(default_hit(row, casf, compound_id)) + + casf = '' + compound_id = '' + found_hit = [] + dups = [] + return data + + +def _save_data(content, outfile): + ''' + Write to output file + @param content: content to write + @param outfile: file to write to + ''' + # header + header = ['CAS', + 'NAME', + 'FORMULA', + 'RI', + 'Column.type', + 'Column.phase.type', + 'Column.name', + 'phase.coding', + 'CAS_column.Name', + 'Centrotype', + 'Regression.Column.Name', + 'min', + 'max', + 'nr.duplicates'] + output_handle = csv.writer(open(outfile, 'wb'), delimiter="\t") + output_handle.writerow(header) + for entry in content: + output_handle.writerow(entry) + + +def _read_model(model_file): + ''' + Creates an easy to search dictionary for getting the regression parameters + for each valid combination of GC-columns + @param model_file: filename containing the regression models + ''' + regress = list(csv.reader(open(model_file, 'rU'), delimiter='\t')) + if len(regress.pop(0)) > 9: + method = 'poly' + else: + method = 'linear' + + model = {} + # Create new dictionary for each GC-column + for line in regress: + model[line[0]] = {} + + # Add data + for line in regress: + if method == 'poly': + model[line[0]][line[1]] = [float(col) for col in line[2:11]] + else: # linear + model[line[0]][line[1]] = [float(col) for col in line[2:9]] + + return model, method + + +def _apply_poly_regression(column1, column2, retention_index, model): + ''' + Calculates a new retention index (RI) value using a given 3rd-degree polynomial + model based on data from GC columns 1 and 2 + @param column1: name of the selected GC-column + @param column2: name of the GC-column to use for regression + @param retention_index: RI to convert + @param model: dictionary containing model information for all GC-columns + ''' + coeff = model[column1][column2] + # If the retention index to convert is within range of the data the model is based on, perform regression + if coeff[4] < retention_index < coeff[5]: + return (coeff[3] * (retention_index ** 3) + coeff[2] * (retention_index ** 2) + + (retention_index * coeff[1]) + coeff[0]) + else: + return False + + +def _apply_linear_regression(column1, column2, retention_index, model): + ''' + Calculates a new retention index (RI) value using a given linear model based on data + from GC columns 1 and 2 + @param column1: name of the selected GC-column + @param column2: name of the GC-column to use for regression + @param retention_index: RI to convert + @param model: dictionary containing model information for all GC-columns + ''' + # TODO: No use of limits + coeff = model[column1][column2] + return coeff[1] * retention_index + coeff[0] + + +def main(): + ''' + Library Lookup main function + ''' + library_file = sys.argv[1] + nist_tabular_filename = sys.argv[2] + ctype = sys.argv[3] + polar = sys.argv[4] + outfile = sys.argv[5] + pref = sys.argv[6:-1] + regress = sys.argv[-1] + + if regress != 'False': + model, method = _read_model(regress) + else: + model, method = False, None + + lookup_dict = create_lookup_table(library_file, ctype, polar) + data = format_result(lookup_dict, nist_tabular_filename, pref, ctype, polar, model, method) + + _save_data(data, outfile) + + +if __name__ == "__main__": + main() diff -r 000000000000 -r dffc38727496 library_lookup.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/library_lookup.xml Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,75 @@ + + Lookup or estimate the RI using a "known RI values" CAS numbers library + + library_lookup.py + $library_file + $input + "$col_type" + "$polarity" + $output + #for $ctype in $pref + ${ctype.columntype} + #end for + $regression.model + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Performs a lookup of the RI values by matching CAS numbers from the given NIST identifications file to a library. +If a direct match is NOT found for the preferred column name, a regression can be done to find +the theoretical RI value based on known RI values for the CAS number on other column types (see step 4). +If there is no match for the CAS number on any column type, then the record is not given a RI. + + + + + + diff -r 000000000000 -r dffc38727496 match_library.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/match_library.py Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,133 @@ +''' +Containing functions are called from Galaxy to populate lists/checkboxes with selectable items +''' +import csv +import glob +import os + + +__author__ = "Marcel Kempenaar" +__contact__ = "brs@nbic.nl" +__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre" +__license__ = "MIT" + +def get_column_type(library_file): + ''' + Returns a Galaxy formatted list of tuples containing all possibilities for the + GC-column types. Used by the library_lookup.xml tool + @param library_file: given library file from which the list of GC-column types is extracted + ''' + if library_file == "": + galaxy_output = [("", "", False)] + else: + (data, header) = read_library(library_file) + + if 'columntype' not in header: + raise IOError('Missing columns in ', library_file) + + # Filter data on column type + column_type = header.index("columntype") + amounts_in_list_dict = count_occurrence([row[column_type] for row in data]) + galaxy_output = [(str(a) + "(" + str(b) + ")", a, False) for a, b in amounts_in_list_dict.items()] + + return(galaxy_output) + + +def filter_column(library_file, column_type_name): + ''' + Filters the Retention Index database on column type + @param library_file: file containing the database + @param column_type_name: column type to filter on + ''' + if library_file == "": + galaxy_output = [("", "", False)] + else: + (data, header) = read_library(library_file) + + if ('columntype' not in header or + 'columnphasetype' not in header): + raise IOError('Missing columns in ', library_file) + + column_type = header.index("columntype") + statphase = header.index("columnphasetype") + + # Filter data on colunn type name + statphase_list = [line[statphase] for line in data if line[column_type] == column_type_name] + amounts_in_list_dict = count_occurrence(statphase_list) + galaxy_output = [(str(a) + "(" + str(b) + ")", a, False)for a, b in amounts_in_list_dict.items()] + + return(sorted(galaxy_output)) + + +def filter_column2(library_file, column_type_name, statphase): + ''' + Filters the Retention Index database on column type + @param library_file: file containing the database + @param column_type_name: column type to filter on + @param statphase: stationary phase of the column to filter on + ''' + if library_file == "": + galaxy_output = [("", "", False)] + else: + (data, header) = read_library(library_file) + + if ('columntype' not in header or + 'columnphasetype' not in header or + 'columnname' not in header): + raise IOError('Missing columns in ', library_file) + + column_type_column = header.index("columntype") + statphase_column = header.index("columnphasetype") + column_name_column = header.index("columnname") + + # Filter data on given column type name and stationary phase + statphase_list = [line[column_name_column] for line in data if line[column_type_column] == column_type_name and + line[statphase_column] == statphase] + amounts_in_list_dict = count_occurrence(statphase_list) + galaxy_output = [(str(a) + "(" + str(b) + ")", a, False)for a, b in amounts_in_list_dict.items()] + + return(sorted(galaxy_output)) + + +def read_library(filename): + ''' + Reads a CSV file and returns its contents and a normalized header + @param filename: file to read + ''' + data = list(csv.reader(open(filename, 'rU'), delimiter='\t')) + header_clean = [i.lower().strip().replace(".", "").replace("%", "") for i in data.pop(0)] + return(data, header_clean) + + + +def get_directory_files(dir_name): + ''' + Reads the directory and + returns the list of .txt files found as a dictionary + with file name and full path so that it can + fill a Galaxy drop-down combo box. + + ''' + files = glob.glob(dir_name + "/*.*") + if len(files) == 0: + # Configuration error: no library files found in /" + dir_name : + galaxy_output = [("Configuration error: expected file not found in /" + dir_name, "", False)] + else: + galaxy_output = [(str(get_file_name_no_ext(file_name)), str(os.path.abspath(file_name)), False) for file_name in files] + return(galaxy_output) + +def get_file_name_no_ext(full_name): + ''' + returns just the last part of the name + ''' + simple_name = os.path.basename(full_name) + base, ext = os.path.splitext(simple_name) + return base + + +def count_occurrence(data_list): + ''' + Counts occurrences in a list and returns a dict with item:occurrence + @param data_list: list to count items from + ''' + return dict((key, data_list.count(key)) for key in set(data_list)) diff -r 000000000000 -r dffc38727496 metaMS_cmd_annotate.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS_cmd_annotate.r Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,86 @@ +## read args: +args <- commandArgs(TRUE) +## the constructed DB, e.g. "E:/Rworkspace/metaMS/data/LCDBtest.RData" +args.constructedDB <- args[1] +## data file in xset format: +args.xsetData <- args[2] +## settings file, e.g. "E:/Rworkspace/metaMS/data/settings.r", should contain assignment to an object named "customMetaMSsettings" +args.settings <- args[3] + +## output file names, e.g. "E:/Rworkspace/metaMS/data/out.txt" +args.outAnnotationTable <- args[4] + +args.mass_error_function <- args[5] +if (args.mass_error_function == "0") + args.mass_error_function <- NULL +## report files +args.htmlReportFile <- args[6] +args.htmlReportFile.files_path <- args[7] + +if (length(args) == 8) +{ + args.outLogFile <- args[8] + # suppress messages: + # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888 + msg <- file(args.outLogFile, open="wt") + sink(msg, type="message") + sink(msg, type="output") +} + +cat("\nSettings used===============:\n") +cat(readChar(args.settings, 1e5)) + + +tryCatch( + { + library(metaMS) + + ## load the constructed DB : + tempEnv <- new.env() + testDB <- load(args.constructedDB, envir=tempEnv) + xsetData <- readRDS(args.xsetData) + + ## load settings "script" into "customMetaMSsettings" + source(args.settings, local=tempEnv) + message(paste(" loaded : ", args.settings)) + + # Just to highlight: if you want to use more than one + # trigger runLC: + LC <- runLC(xset=xsetData, settings = tempEnv[["customMetaMSsettings"]], DB = tempEnv[[testDB[1]]]$DB, errf=args.mass_error_function, nSlaves=20, returnXset = TRUE) + + # write out runLC annotation results: + write.table(LC$PeakTable, args.outAnnotationTable, sep="\t", row.names=FALSE) + + # the used constructed DB (write to log): + cat("\nConstructed DB info===============:\n") + str(tempEnv[[testDB[1]]]$Info) + cat("\nConstructed DB table===============:\n") + if (length(args) == 8) + { + write.table(tempEnv[[testDB[1]]]$DB, args.outLogFile, append=TRUE, row.names=FALSE) + write.table(tempEnv[[testDB[1]]]$Reftable, args.outLogFile, sep="\t", append=TRUE, row.names=FALSE) + } + + message("\nGenerating report.........") + # report + dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE) + html <- "

Summary of annotation results:

" + nrTotalFeatures <- nrow(LC$PeakTable) + nrAnnotatedFeatures <- nrow(LC$Annotation$annotation.table) + html <- paste(html,"

Total nr of features: ", nrTotalFeatures,"

", sep="") + html <- paste(html,"

Total nr of annotated features: ", nrAnnotatedFeatures,"

", sep="") + + html <- paste(html,"") + message("finished generating report") + write(html,file=args.htmlReportFile) + # unlink(args.htmlReportFile) + cat("\nWarnings================:\n") + str( warnings() ) + }, + error=function(cond) { + sink(NULL, type="message") # default setting + sink(stderr(), type="output") + message("\nERROR: ===========\n") + print(cond) + } + ) diff -r 000000000000 -r dffc38727496 metaMS_cmd_pick_and_group.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metaMS_cmd_pick_and_group.r Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,92 @@ +## read args: +args <- commandArgs(TRUE) +## data files, e.g. "E:/Rworkspace/metaMS/data/data.zip" (with e.g. .CDF files) and unzip output dir, e.g. "E:/" +args.dataZip <- args[1] +args.zipExtrDir <- sub("\\.","_",paste(args[1],"dir", sep="")) +dir.create(file.path(args.zipExtrDir), showWarnings = FALSE, recursive = TRUE) +## settings file, e.g. "E:/Rworkspace/metaMS/data/settings.r", should contain assignment to an object named "customMetaMSsettings" +args.settings <- args[2] + +## output file names, e.g. "E:/Rworkspace/metaMS/data/out.txt" +args.outPeakTable <- args[3] +args.xsetOut <- args[4] + +# polarity as explicit parameter: +args.runLC_polarity <- args[5] + +## report files +args.htmlReportFile <- args[6] +args.htmlReportFile.files_path <- args[7] + + +if (length(args) == 8) +{ + args.outLogFile <- args[8] + # suppress messages: + # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888 + msg <- file(args.outLogFile, open="wt") + sink(msg, type="message") + sink(msg, type="output") +} + +cat("\nSettings used===============:\n") +cat(readChar(args.settings, 1e5)) + + +tryCatch( + { + library(metaMS) + + ## load the data files from a zip file + files <- unzip(args.dataZip, exdir=args.zipExtrDir) + + ## load settings "script" into "customMetaMSsettings" + tempEnv <- new.env() + source(args.settings, local=tempEnv) + message(paste(" loaded : ", args.settings)) + allSettings <- tempEnv[["customMetaMSsettings"]] + + # trigger runLC: + LC <- runLC(files, settings = allSettings, polarity=args.runLC_polarity, nSlaves=20, returnXset = TRUE) + + # write out runLC annotation results: + write.table(LC$PeakTable, args.outPeakTable, sep="\t", row.names=FALSE) + + # save xset as rdata: + xsAnnotatePreparedData <- LC$xset + saveRDS(xsAnnotatePreparedData, file=args.xsetOut) + + message("\nGenerating report.........") + # report + dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE) + html <- "

Info on alignment quality

" + # TODO add (nr and mass error) and group size + + message("\nPlotting figures... ") + figureName <- paste(args.htmlReportFile.files_path, "/figure_retcor.png", sep="") + html <- paste(html,"
", sep="") + png( figureName, type="cairo", width=1100,height=600 ) + retcor(LC$xset@xcmsSet, method="peakgroups", plottype = "mdevden") + html <- paste(html,"*NB: retention time correction plot based on 'peakgroups' option with default settings. This is not the plot matching the exact settings used in the run, + but just intended to give a rough estimate of the retention time shifts present in the data. A more accurate plot will be available once + this option is added in metaMS API.
", sep="") + devname = dev.off() + + + gt <- groups(LC$xset@xcmsSet) + groupidx1 <- which(gt[,"rtmed"] > 0 & gt[,"rtmed"] < 3000 & gt[,"npeaks"] > 3) + + html <- paste(html,"") + message("finished generating report") + write(html,file=args.htmlReportFile) + # unlink(args.htmlReportFile) + cat("\nWarnings================:\n") + str( warnings() ) + }, + error=function(cond) { + sink(NULL, type="message") # default setting + sink(stderr(), type="output") + message("\nERROR: ===========\n") + print(cond) + } + ) diff -r 000000000000 -r dffc38727496 metams_lcms_annotate.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metams_lcms_annotate.xml Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,121 @@ + + Runs metaMS process for LC/MS feature annotation + + R_bioc_metams + + + metaMS_cmd_annotate.r + $constructed_db + $xsetData + $customMetaMSsettings + $outputFile + #if $mzTol.mzTolType == "fixed" + 0 + #else + "$mzTol.mass_error_function" + #end if + $htmlReportFile + $htmlReportFile.files_path + $outputLog + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +## start comment + ## metaMS process settings + customMetaMSsettings <- metaMSsettings(protocolName = "${protocolName}", + chrom = "LC") +metaSetting(customMetaMSsettings, "match2DB") <- list( + rtdiff = ${rtdiff}, + rtval = ${rtval}, + #if $mzTol.mzTolType == "fixed" + mzdiff = ${mzTol.mzdiff}, + #else + ppm = ${mzTol.ppm}, + #end if + minfeat = ${minfeat}) + + + + + + + + + + + + + +.. class:: infomark + +Runs metaMS process for LC/MS feature annotation based on matching to an existing 'standards' DB. +The figure below shows the main parts of this metaMS process. + +.. image:: $PATH_TO_IMAGES/metaMS_annotate.png + + +.. class:: infomark + +The implemented annotation strategy can be broken down in the following steps: + +1. *Feature wise Annotation:* Each feature detected by runLC is matched against the database. If +the mass error function is provided, the appropriate m/z tolerance is calculated, otherwise a fixed +tolerance is used (mzdiff). The retention time tolerance is fixed and should be selected on the +bases of the characteristics of each chromatographic method (rtdiff). Multiple annotations - i.e. +features which are associated to more than one compound - are possible. This outcome does not +indicate a problem per se, but is an inherent drawback of co-elution. + +2. *Annotation Validation:* The annotated features are organized in 'pseudospectra' collecting all +the experimental features which are assigned to a specific compound. A specific annotation is +confirmed only if more than minfeat features which differ in retention time less than rtval are +present in a pseudospectrum. As a general rule rtval should be narrower than rtdiff. The +latter, indeed, accounts for shifts in retention time between the injection of the standards and the +metabolomics experiment under investigation. This time can be rather long, considering that the +standards are not commonly re-analyzed each time. On the other hand, rtval represents the shift +between the ions of the same compound within the same batch of injections and therefore it has +only to account for the smaller shifts occurring during peak picking and alignment. + + + + + 10.1016/j.jchromb.2014.02.051 + + \ No newline at end of file diff -r 000000000000 -r dffc38727496 metams_lcms_pick_and_group.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/metams_lcms_pick_and_group.xml Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,301 @@ + + Runs metaMS process for LC/MS feature picking, aligning and grouping + + R_bioc_metams + + + metaMS_cmd_pick_and_group.r + $data_files + $customMetaMSsettings + $outputFile + $xsetOut + $polarity + $htmlReportFile + $htmlReportFile.files_path + $outputLog + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +## ==================================== + ## metaMS process settings + customMetaMSsettings <- metaMSsettings(protocolName = "${protocolName}", + chrom = "LC", + PeakPicking = list( + method = "${peakPicking.method}", + #if $peakPicking.method == "matchedFilter" + fwhm = ${peakPicking.fwhm}, + sigma = ${peakPicking.fwhm}/${peakPicking.sigma_denom}, + max = ${peakPicking.max}, + snthresh = ${peakPicking.snthresh}, + step = ${peakPicking.step}, + steps = ${peakPicking.steps}, + mzdiff = ${peakPicking.mzdiff}), + #else + ppm = ${peakPicking.ppm}, + peakwidth = c(${peakPicking.peakwidth}), + snthresh = ${peakPicking.snthresh}, + prefilter = c(${peakPicking.prefilter}), + mzCenterFun = "${peakPicking.mzCenterFun}", + integrate = ${peakPicking.integrate}, + mzdiff = ${peakPicking.mzdiff}, + fitgauss = ${peakPicking.fitgauss}, + noise = ${peakPicking.noise}), + #end if + Alignment = list( + min.class.fraction = ${min_class_fraction}, + min.class.size = ${min_class_size}, + mzwid = ${mzwid}, + bws = c(${bws}), + retcormethod = "${retcor.retcormethod}", + #if $retcor.retcormethod == "peakgroups" + smooth = "${retcor.smooth}", + missingratio = ${retcor.missingratio}, + extraratio = ${retcor.extraratio}, + retcorfamily = "${retcor.retcorfamily}", + #else + ##repeating the method as workaround/ backwards compatibility (can remove this one after fix from metaMS): + method = "${retcor.retcormethod}", + profStep = ${retcor.profStep}, + #end if + fillPeaks = ${fillPeaks}), + CAMERA = list( + perfwhm = ${perfwhm}, + cor_eic_th = ${cor_eic_th}, + ppm= ${ppm}, + graphMethod= "${groupCorr_graphMethod}", + pval= ${groupCorr_pval}, + calcCiS= ${groupCorr_calcCiS}, + calcIso= ${groupCorr_calcIso}, + calcCaS= ${groupCorr_calcCaS}, + maxcharge= ${findIsotopes_maxcharge}, + maxiso= ${findIsotopes_maxiso}, + minfrac= ${findIsotopes_minfrac}, + multiplier= ${findAdducts_multiplier} + )) + + + + + + + + + + + + +.. class:: infomark + +Runs metaMS process for LC/MS feature feature picking, aligning and grouping. +This part of the metaMS process makes use of the XCMS and CAMERA tools and algorithms. +CAMERA is used for automatic deconvolution/annotation of LC/ESI-MS data. +The figure below shows the main parts of the metaMS process wrapped by this tool. + +.. image:: $PATH_TO_IMAGES/metaMS_pick_align_camera.png + + +From CAMERA documentation: + +.. image:: $PATH_TO_IMAGES/CAMERA_results.png + +**References** + +If you use this Galaxy tool in work leading to a scientific publication please +cite the following papers: + +Wehrens, R.; Weingart, G.; Mattivi, F. (2014). +metaMS: an open-source pipeline for GC-MS-based untargeted metabolomics. +Journal of chromatography B: biomedical sciences and applications, 996 (1): 109-116. +doi: 10.1016/j.jchromb.2014.02.051 +handle: http://hdl.handle.net/10449/24012 + +Wrapper by Pieter Lukasse. + + + + + 10.1016/j.jchromb.2014.02.051 + + \ No newline at end of file diff -r 000000000000 -r dffc38727496 primsfilters.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/primsfilters.py Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,44 @@ +import logging +log = logging.getLogger( __name__ ) + + +def restrict_prims_metabolomics( context, tool ): + """ + This tool filter will hide prims_metabolomics tools for non-metabolomics users. + This can be enabled by adding the following to the + ``app:main`` section of ``universe_wsgi.ini``:: + + tool_filters = primsfilters:restrict_prims_metabolomics + + and by adding this file to the folder: + + /lib/galaxy/tools/filters + + This is optional and can be used in case some control is desired on whom + gets to see the prims_metabolomics tools. When not using this file and the + settings mentioned above, all prims_metabolomics tools will be visible to + all users. + """ + # for debugging: import pydevd;pydevd.settrace("L0136815.wurnet.nl") + user = context.trans.user + metabolomics_tools = [ "msclust2", "combine_output", "create_poly_model", "lookup_library", + "NDIStext2tabular", "rankfilterGCMS_tabular", "filter_on_rank", + "export_to_metexp_tabular", "query_metexp" ] + found_match = False + # iterate over the tool (partial)ids and look for a match (this is compatible with tool shed given ids): + for partial_id in metabolomics_tools: + if tool.id.find("/"+ partial_id + "/") >= 0: + found_match = True + break + # the second part of this if is compatible with the ids when NOT using tool shed: + if found_match or tool.id in metabolomics_tools: + # logging.warn( 'FILTER MATCHED: %s' %(tool.name)) + + for user_role in user.roles: + if user_role.role.name == "PRIMS_METABOLOMICS": + return True + # not found to have the role, return false: + return False + else: + # return true for any other tool + return True \ No newline at end of file diff -r 000000000000 -r dffc38727496 query_mass_repos.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query_mass_repos.py Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,289 @@ +#!/usr/bin/env python +# encoding: utf-8 +''' +Module to query a set of accurate mass values detected by high-resolution mass spectrometers +against various repositories/services such as METabolomics EXPlorer database or the +MFSearcher service (http://webs2.kazusa.or.jp/mfsearcher/). + +It will take the input file and for each record it will query the +molecular mass in the selected repository/service. If one or more compounds are found +then extra information regarding these compounds is added to the output file. + +The output file is thus the input file enriched with information about +related items found in the selected repository/service. + +The service should implement the following interface: + +http://service_url/mass?targetMs=500&margin=1&marginUnit=ppm&output=txth (txth means there is guaranteed to be a header line before the data) + +The output should be tab separated and should contain the following columns (in this order) +db-name molecular-formula dbe formula-weight id description + + +''' +import csv +import sys +import fileinput +import urllib2 +import time +from collections import OrderedDict + +__author__ = "Pieter Lukasse" +__contact__ = "pieter.lukasse@wur.nl" +__copyright__ = "Copyright, 2014, Plant Research International, WUR" +__license__ = "Apache v2" + +def _process_file(in_xsv, delim='\t'): + ''' + Generic method to parse a tab-separated file returning a dictionary with named columns + @param in_csv: input filename to be parsed + ''' + data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim)) + return _process_data(data) + +def _process_data(data): + + header = data.pop(0) + # Create dictionary with column name as key + output = OrderedDict() + for index in xrange(len(header)): + output[header[index]] = [row[index] for row in data] + return output + + +def _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit): + + ''' + This method will iterate over the record in the input_data and + will enrich them with the related information found (if any) in the + chosen repository/service + + # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies + ''' + merged = [] + + for i in xrange(len(input_data[input_data.keys()[0]])): + # Get the record in same dictionary format as input_data, but containing + # a value at each column instead of a list of all values of all records: + input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()])) + + # read the molecular mass : + molecular_mass = input_data_record[molecular_mass_col] + + + # search for related records in repository/service: + data_found = None + if molecular_mass != "": + molecular_mass = float(molecular_mass) + + # 1- search for data around this MM: + query_link = repository_dblink + "/mass?targetMs=" + str(molecular_mass) + "&margin=" + str(error_margin) + "&marginUnit=" + margin_unit + "&output=txth" + + data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") + data_type_found = "MM" + + + if data_found == None: + # If still nothing found, just add empty columns + extra_cols = ['', '','','','',''] + else: + # Add info found: + extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link) + + # Take all data and merge it into a "flat"/simple array of values: + field_values_list = _merge_data(input_data_record, extra_cols) + + merged.append(field_values_list) + + # return the merged/enriched records: + return merged + + +def _get_extra_info_and_link_cols(data_found, data_type_found, query_link): + ''' + This method will go over the data found and will return a + list with the following items: + - details of hits found : + db-name molecular-formula dbe formula-weight id description + - Link that executes same query + + ''' + + # set() makes a unique list: + db_name_set = [] + molecular_formula_set = [] + id_set = [] + description_set = [] + + + if 'db-name' in data_found: + db_name_set = set(data_found['db-name']) + elif '# db-name' in data_found: + db_name_set = set(data_found['# db-name']) + if 'molecular-formula' in data_found: + molecular_formula_set = set(data_found['molecular-formula']) + if 'id' in data_found: + id_set = set(data_found['id']) + if 'description' in data_found: + description_set = set(data_found['description']) + + result = [data_type_found, + _to_xsv(db_name_set), + _to_xsv(molecular_formula_set), + _to_xsv(id_set), + _to_xsv(description_set), + #To let Excel interpret as link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"): + "=HYPERLINK(\""+ query_link + "\", \"Link to entries found in DB \")"] + return result + + +def _to_xsv(data_set): + result = "" + for item in data_set: + result = result + str(item) + "|" + return result + + +def _fire_query_and_return_dict(url): + ''' + This method will fire the query as a web-service call and + return the results as a list of dictionary objects + ''' + + try: + data = urllib2.urlopen(url).read() + + # transform to dictionary: + result = [] + data_rows = data.split("\n") + + # remove comment lines if any (only leave the one that has "molecular-formula" word in it...compatible with kazusa service): + data_rows_to_remove = [] + for data_row in data_rows: + if data_row == "" or (data_row[0] == '#' and "molecular-formula" not in data_row): + data_rows_to_remove.append(data_row) + + for data_row in data_rows_to_remove: + data_rows.remove(data_row) + + # check if there is any data in the response: + if len(data_rows) <= 1 or data_rows[1].strip() == '': + # means there is only the header row...so no hits: + return None + + for data_row in data_rows: + if not data_row.strip() == '': + row_as_list = _str_to_list(data_row, delimiter='\t') + result.append(row_as_list) + + # return result processed into a dict: + return _process_data(result) + + except urllib2.HTTPError, e: + raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason) + except urllib2.URLError, e: + raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if service [" + url + "] is accessible from your Galaxy server. ") + +def _str_to_list(data_row, delimiter='\t'): + result = [] + for column in data_row.split(delimiter): + result.append(column) + return result + + +# alternative: ? +# s = requests.Session() +# s.verify = False +# #s.auth = (token01, token02) +# resp = s.get(url, params={'name': 'anonymous'}, stream=True) +# content = resp.content +# # transform to dictionary: + + + + +def _merge_data(input_data_record, extra_cols): + ''' + Adds the extra information to the existing data record and returns + the combined new record. + ''' + record = [] + for column in input_data_record: + record.append(input_data_record[column]) + + + # add extra columns + for column in extra_cols: + record.append(column) + + return record + + +def _save_data(data_rows, headers, out_csv): + ''' + Writes tab-separated data to file + @param data_rows: dictionary containing merged/enriched dataset + @param out_csv: output csv file + ''' + + # Open output file for writing + outfile_single_handle = open(out_csv, 'wb') + output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") + + # Write headers + output_single_handle.writerow(headers) + + # Write one line for each row + for data_row in data_rows: + output_single_handle.writerow(data_row) + +def _get_repository_URL(repository_file): + ''' + Read out and return the URL stored in the given file. + ''' + file_input = fileinput.input(repository_file) + try: + for line in file_input: + if line[0] != '#': + # just return the first line that is not a comment line: + return line + finally: + file_input.close() + + +def main(): + ''' + Query main function + + The input file can be any tabular file, as long as it contains a column for the molecular mass. + This column is then used to query against the chosen repository/service Database. + ''' + seconds_start = int(round(time.time())) + + input_file = sys.argv[1] + molecular_mass_col = sys.argv[2] + repository_file = sys.argv[3] + error_margin = float(sys.argv[4]) + margin_unit = sys.argv[5] + output_result = sys.argv[6] + + # Parse repository_file to find the URL to the service: + repository_dblink = _get_repository_URL(repository_file) + + # Parse tabular input file into dictionary/array: + input_data = _process_file(input_file) + + # Query data against repository : + enriched_data = _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit) + headers = input_data.keys() + ['SEARCH hits for ','SEARCH hits: db-names', 'SEARCH hits: molecular-formulas ', + 'SEARCH hits: ids','SEARCH hits: descriptions', 'Link to SEARCH hits'] #TODO - add min and max formula weigth columns + + _save_data(enriched_data, headers, output_result) + + seconds_end = int(round(time.time())) + print "Took " + str(seconds_end - seconds_start) + " seconds" + + + +if __name__ == '__main__': + main() diff -r 000000000000 -r dffc38727496 query_mass_repos.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query_mass_repos.xml Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,106 @@ + + Query multiple public repositories for elemental compositions from accurate mass values detected by high-resolution mass spectrometers + + query_mass_repos.py + $input_file + "$molecular_mass_col" + "$repository_file" + $error_margin + $margin_unit + $output_result + + + + + + + + + + + + + + + + + + + + + + + +.. class:: infomark + +This tool will query multiple public repositories such as PRI-MetExp or http://webs2.kazusa.or.jp/mfsearcher +for elemental compositions from accurate mass values detected by high-resolution mass spectrometers. + +It will take the input file and for each record it will query the +molecular mass in the selected repository. If one or more compounds are found in the +repository then extra information regarding (mass based)matching elemental composition formulas is added to the output file. + +The output file is thus the input file enriched with information about +related items found in the selected repository. + +**Notes** + +The input file can be any tabular file, as long as it contains a column for the molecular mass. + +**Services that can be queried** + +================= ========================================================================= +Database Description +----------------- ------------------------------------------------------------------------- +PRI-MetExp LC-MS and GC-MS data from experiments from the metabolomics group at + Plant Research International. NB: restricted access to employees with + access key. +ExactMassDB A database of possible elemental compositions consits of C: 100, + H: 200, O: 50, N: 10, P: 10, and S: 10, that satisfy the Senior and + the Lewis valence rules. + (via /mfsearcher/exmassdb/) +ExactMassDB-HR2 HR2, which is one of the fastest tools for calculation of elemental + compositions, filters some elemental compositions according to + the Seven Golden Rules (Kind and Fiehn, 2007). The ExactMassDB-HR2 + database returns the same result as does HR2 with the same atom kind + and number condition as that used in construction of the ExactMassDB. + (via /mfsearcher/exmassdb-hr2/) +Pep1000 A database of possible linear polypeptides that are + constructed with 20 kinds of amino acids and having molecular + weights smaller than 1000. + (via /mfsearcher/pep1000/) +KEGG Re-calculated compound data from KEGG. Weekly updated. + (via /mfsearcher/kegg/) +KNApSAcK Re-calculated compound data from KNApSAcK. + (via /mfsearcher/knapsack/) +Flavonoid Viewer Re-calculated compound data from Flavonoid Viewer . + (via /mfsearcher/flavonoidviewer/ +LipidMAPS Re-calculated compound data from LIPID MAPS. + (via /mfsearcher/lipidmaps/) +HMDB Re-calculated compound data from Human Metabolome Database (HMDB) + Version 3.5. + (via /mfsearcher/hmdb/) +PubChem Re-calculated compound data from PubChem. Monthly updated. + (via /mfsearcher/pubchem/) +================= ========================================================================= + +Sources for table above: PRI-MetExp and http://webs2.kazusa.or.jp/mfsearcher + + + diff -r 000000000000 -r dffc38727496 query_metexp.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query_metexp.py Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,282 @@ +#!/usr/bin/env python +# encoding: utf-8 +''' +Module to query a set of identifications against the METabolomics EXPlorer database. + +It will take the input file and for each record it will query the +molecular mass in the selected MetExp DB. If one or more compounds are found in the +MetExp DB then extra information regarding these compounds is added to the output file. + +The output file is thus the input file enriched with information about +related items found in the selected MetExp DB. +''' +import csv +import sys +import fileinput +import urllib2 +import time +from collections import OrderedDict + +__author__ = "Pieter Lukasse" +__contact__ = "pieter.lukasse@wur.nl" +__copyright__ = "Copyright, 2014, Plant Research International, WUR" +__license__ = "Apache v2" + +def _process_file(in_xsv, delim='\t'): + ''' + Generic method to parse a tab-separated file returning a dictionary with named columns + @param in_csv: input filename to be parsed + ''' + data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim)) + return _process_data(data) + +def _process_data(data): + + header = data.pop(0) + # Create dictionary with column name as key + output = OrderedDict() + for index in xrange(len(header)): + output[header[index]] = [row[index] for row in data] + return output + + +def _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method): + ''' + This method will iterate over the record in the input_data and + will enrich them with the related information found (if any) in the + MetExp Database. + + # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies + ''' + merged = [] + + for i in xrange(len(input_data[input_data.keys()[0]])): + # Get the record in same dictionary format as input_data, but containing + # a value at each column instead of a list of all values of all records: + input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()])) + + # read the molecular mass and formula: + cas_id = input_data_record[casid_col] + formula = input_data_record[formula_col] + molecular_mass = input_data_record[molecular_mass_col] + + # search for related records in MetExp: + data_found = None + if cas_id != "undef": + # 1- search for other experiments where this CAS id has been found: + query_link = metexp_dblink + "/find_entries/query?cas_nr="+ cas_id + "&method=" + separation_method + data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") + data_type_found = "CAS" + if data_found == None: + # 2- search for other experiments where this FORMULA has been found: + query_link = metexp_dblink + "/find_entries/query?molecule_formula="+ formula + "&method=" + separation_method + data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") + data_type_found = "FORMULA" + if data_found == None: + # 3- search for other experiments where this MM has been found: + query_link = metexp_dblink + "/find_entries/query?molecule_mass="+ molecular_mass + "&method=" + separation_method + data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") + data_type_found = "MM" + + if data_found == None: + # If still nothing found, just add empty columns + extra_cols = ['', '','','','','','',''] + else: + # Add info found: + extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link) + + # Take all data and merge it into a "flat"/simple array of values: + field_values_list = _merge_data(input_data_record, extra_cols) + + merged.append(field_values_list) + + # return the merged/enriched records: + return merged + + +def _get_extra_info_and_link_cols(data_found, data_type_found, query_link): + ''' + This method will go over the data found and will return a + list with the following items: + - Experiment details where hits have been found : + 'organism', 'tissue','experiment_name','user_name','column_type' + - Link that executes same query + + ''' + # set() makes a unique list: + organism_set = [] + tissue_set = [] + experiment_name_set = [] + user_name_set = [] + column_type_set = [] + cas_nr_set = [] + + if 'organism' in data_found: + organism_set = set(data_found['organism']) + if 'tissue' in data_found: + tissue_set = set(data_found['tissue']) + if 'experiment_name' in data_found: + experiment_name_set = set(data_found['experiment_name']) + if 'user_name' in data_found: + user_name_set = set(data_found['user_name']) + if 'column_type' in data_found: + column_type_set = set(data_found['column_type']) + if 'CAS' in data_found: + cas_nr_set = set(data_found['CAS']) + + + result = [data_type_found, + _to_xsv(organism_set), + _to_xsv(tissue_set), + _to_xsv(experiment_name_set), + _to_xsv(user_name_set), + _to_xsv(column_type_set), + _to_xsv(cas_nr_set), + #To let Excel interpret as link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"): + "=HYPERLINK(\""+ query_link + "\", \"Link to entries found in DB \")"] + return result + + +def _to_xsv(data_set): + result = "" + for item in data_set: + result = result + str(item) + "|" + return result + + +def _fire_query_and_return_dict(url): + ''' + This method will fire the query as a web-service call and + return the results as a list of dictionary objects + ''' + + try: + data = urllib2.urlopen(url).read() + + # transform to dictionary: + result = [] + data_rows = data.split("\n") + + # check if there is any data in the response: + if len(data_rows) <= 1 or data_rows[1].strip() == '': + # means there is only the header row...so no hits: + return None + + for data_row in data_rows: + if not data_row.strip() == '': + row_as_list = _str_to_list(data_row, delimiter='\t') + result.append(row_as_list) + + # return result processed into a dict: + return _process_data(result) + + except urllib2.HTTPError, e: + raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason) + except urllib2.URLError, e: + raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if MetExp service [" + url + "] is accessible from your Galaxy server. ") + +def _str_to_list(data_row, delimiter='\t'): + result = [] + for column in data_row.split(delimiter): + result.append(column) + return result + + +# alternative: ? +# s = requests.Session() +# s.verify = False +# #s.auth = (token01, token02) +# resp = s.get(url, params={'name': 'anonymous'}, stream=True) +# content = resp.content +# # transform to dictionary: + + + + +def _merge_data(input_data_record, extra_cols): + ''' + Adds the extra information to the existing data record and returns + the combined new record. + ''' + record = [] + for column in input_data_record: + record.append(input_data_record[column]) + + + # add extra columns + for column in extra_cols: + record.append(column) + + return record + + +def _save_data(data_rows, headers, out_csv): + ''' + Writes tab-separated data to file + @param data_rows: dictionary containing merged/enriched dataset + @param out_csv: output csv file + ''' + + # Open output file for writing + outfile_single_handle = open(out_csv, 'wb') + output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") + + # Write headers + output_single_handle.writerow(headers) + + # Write one line for each row + for data_row in data_rows: + output_single_handle.writerow(data_row) + +def _get_metexp_URL(metexp_dblink_file): + ''' + Read out and return the URL stored in the given file. + ''' + file_input = fileinput.input(metexp_dblink_file) + try: + for line in file_input: + if line[0] != '#': + # just return the first line that is not a comment line: + return line + finally: + file_input.close() + + +def main(): + ''' + MetExp Query main function + + The input file can be any tabular file, as long as it contains a column for the molecular mass + and one for the formula of the respective identification. These two columns are then + used to query against MetExp Database. + ''' + seconds_start = int(round(time.time())) + + input_file = sys.argv[1] + casid_col = sys.argv[2] + formula_col = sys.argv[3] + molecular_mass_col = sys.argv[4] + metexp_dblink_file = sys.argv[5] + separation_method = sys.argv[6] + output_result = sys.argv[7] + + # Parse metexp_dblink_file to find the URL to the MetExp service: + metexp_dblink = _get_metexp_URL(metexp_dblink_file) + + # Parse tabular input file into dictionary/array: + input_data = _process_file(input_file) + + # Query data against MetExp DB : + enriched_data = _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method) + headers = input_data.keys() + ['METEXP hits for ','METEXP hits: organisms', 'METEXP hits: tissues', + 'METEXP hits: experiments','METEXP hits: user names','METEXP hits: column types', 'METEXP hits: CAS nrs', 'Link to METEXP hits'] + + _save_data(enriched_data, headers, output_result) + + seconds_end = int(round(time.time())) + print "Took " + str(seconds_end - seconds_start) + " seconds" + + + +if __name__ == '__main__': + main() diff -r 000000000000 -r dffc38727496 query_metexp.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/query_metexp.xml Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,69 @@ + + Query a set of identifications against the METabolomics EXPeriments database + + query_metexp.py + $input_file + "$casid_col" + "$formula_col" + "$molecular_mass_col" + "$metexp_dblink_file" + $separation_method + $output_result + + + + + + + + + + + + + + + + + + + + + + + + +.. class:: infomark + +This tool will Query a set of identifications against the METabolomics EXPlorer database. + +It will take the input file and for each record it will query the +molecular mass in the selected MetExp DB. If one or more compounds are found in the +MetExp DB then extra information regarding these compounds is added to the output file. + +The output file is thus the input file enriched with information about +related items found in the selected MetExp DB. + +**Notes** + +The input file can be any tabular file, as long as it contains a column for the molecular mass +and one for the formula of the respective identification. + + + + diff -r 000000000000 -r dffc38727496 rankfilterGCMS_tabular.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rankfilterGCMS_tabular.xml Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,80 @@ + + Convert Retention Time to Retention Index + rankfilter_GCMS/rankfilter.py $input_file + + + + + + + + + + + + + + + + + + + + + + + + + + sample = ${sample} + calibration = ${calibration} + lib_data = ${lib_data} + window = ${window} + analysis_type = ${analysis_type} + tabular = True + onefile = ${onefile} + model = ${model} + + + +Basically estimates the experimental RI (RIexp) by building a RI(RT) function based on the +given calibration file. + +It also determines the estimated RI (RIsvr) by looking up for each entry of the given input file (Sample File), +based on its CAS number, its respective RIsvr value in the given global lookup library +(this step is also called the "RankFilter analysis" -see reference below; Sample File may be either from NIST or AMDIS). +This generates an prediction of the RI for +a compound according to the "RankFilter procedure" (RIsvr). + +Output is a tab separated file in which four columns are added: + + - **Rank** Calculated rank + - **RIexp** Experimental Retention Index (RI) + - **RIsvr** Calculated RI based on support vector regression (SVR) + - **%rel.err** Relative RI error (%rel.error = 100 * (RISVR − RIexp) / RIexp) + +.. class:: infomark + +**Notes** + + - The layout of the Calibration file should include the following columns: 'MW', 'R.T.' and 'RI'. + - Selecting 'Polynomial' in the model parameter will calculate a 3rd degree polynomial model that will + be used to convert from XXXX to YYYY. + +----- + +**References** + + - **RankFilter**: Mihaleva et. al. (2009) *Automated procedure for candidate compound selection in GC-MS + metabolomics based on prediction of Kovats retention index*. Bioinformatics, 25 (2009), pp. 787–794 + + diff -r 000000000000 -r dffc38727496 rankfilter_GCMS/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rankfilter_GCMS/__init__.py Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,5 @@ +''' +Created on Mar 14, 2012 + +@author: marcelk +''' diff -r 000000000000 -r dffc38727496 rankfilter_GCMS/pdfread.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rankfilter_GCMS/pdfread.py Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,214 @@ +""" +Copyright (C) 2011 by Velitchka Mihaleva, Wageningen University + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import sys +import csv + +def getPDF(filename, print_progress): + ''' + Parses NIST PDF file + @param filename: PDF file to parse + ''' + NistInput = {} + NistInput_missed = {} + nist_input = open(filename, 'r').read() + + hitid = [] + rt = [] + name = [] + forward = [] + cas = [] + reverse = [] + prob = [] + lib_id = [] + nist_id = [] + missed_compounds = [] + id_missed_compounds = [] + formula = [] + + hit_list = nist_input.split('** Search Report Page 1 of 1 **') + hit_list.pop(0) + #number_hits = range(10) + line_id = 0 + for line in hit_list: + line = line.strip().translate(None, '\r') + if line != '': + hits = line.replace('\n', ' ').replace('\x0c', '').replace('^L', '').split('Hit') #solution? : if we wouldn't replace the \n by ' ' but by some special sign, then reading formula would be simpler! + #strange....code seems fine actually...debug! See test/data/download.pdf + # strange thing is that it looks like the new line does not end up in the text file, eventhough it looks like there is a new line in the pdf...perhaps a bug in the pdf2text command in linux? + spec_id = hits.pop(0).split(' ')[1] + j = 0 + for hh in hits: + cell = hh.split(';') + if print_progress == True: + print 'Processing line: ', line_id, ' with length: ', len(cell), ':\n\t', cell + line_id += 1 + if len(cell) == 7: # the compound has CAS number + if len(cell[1].split(':')) == 2: + forward.append((cell[1].split(':')[1]).strip()) + # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end + if len(cell[0].split(':')) > 2: + name_tmp = ':'.join(cell[0].split(':')[1:]) + else: + name_tmp = cell[0].split(':')[1] + + name.append(name_tmp.replace(" ", " ").strip()) + name_tmp = name_tmp.strip().split(' ') + if name_tmp: + # if the name ends with a word that starts with C, F or H, then assume this last word is a formula: + if name_tmp[-1][0] == 'C' or name_tmp[-1][0] == 'F' or name_tmp[-1][0] == 'H': + formule = (name_tmp[-1]) + else: + formule = ('not_def') + else: + formule = ('not_def') + formula.append(formule.replace(" ", " ")) + reverse.append((cell[2].split(':')[1]).strip()) + prob.append(cell[3].split(' ')[2].replace('%', '')) + cas.append((cell[4].split(':')[1]).strip()) + lib_id.append((cell[5].split(':')[1]).strip()) + nist_id.append(cell[6].split(':')[1].replace('.', '').strip()) + j = j + 1 + else: + missed_compounds.append(hh) + id_missed_compounds.append(spec_id) + + elif len(cell) == 6: # the compound has no CAS number + if len(cell[1].split(':')) == 2: + + forward.append((cell[1].split(':')[1]).strip()) + # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end + if len(cell[0].split(':')) > 2: + name_tmp = ':'.join(cell[0].split(':')[1:]) + else: + name_tmp = cell[0].split(':')[1] + + name.append(name_tmp.replace(" ", " ").strip()) + name_tmp = name_tmp.strip().split(' ') + if name_tmp: + # if the name ends with a word that starts with C, F or H, then assume this last word is a formula: + if name_tmp[-1][0] == 'C' or name_tmp[-1][0] == 'F' or name_tmp[-1][0] == 'H': + formule = (name_tmp[-1]) + else: + formule = ('not_def') + else: + formule = ('not_def') + formula.append(formule.replace(" ", " ")) + reverse.append((cell[2].split(':')[1]).strip()) + prob.append(cell[3].split(' ')[2].replace('%', '')) + cas.append('undef') + lib_id.append((cell[4].split(':')[1]).strip()) + nist_id.append(cell[5].split(':')[1].replace('.', '').strip()) + j = j + 1 + + else: + missed_compounds.append(hh) + id_missed_compounds.append(spec_id) + + else: # Missing columns, report and quit + missed_compounds.append(hh) + id_missed_compounds.append(spec_id) + + for _ in range(j): + hitid.append(str(spec_id.replace(" ", " "))) + #NB: this is the RT as found in the "id" generated by e.g. msclust, so NOT the RT of the library hit: + rt.append(str(float(spec_id.split('-')[3]) / 1e+06)) + + NistInput['ID'] = hitid + NistInput['R.T.'] = rt + NistInput['Name'] = name + NistInput['CAS'] = cas + NistInput['Formula'] = formula + NistInput['Forward'] = forward + NistInput['Reverse'] = reverse + NistInput['Probability'] = prob + NistInput['Library'] = lib_id + NistInput['Library ID'] = nist_id + NistInput_missed['Missed Compounds'] = missed_compounds + NistInput_missed['ID missed Compounds'] = id_missed_compounds + + return NistInput, NistInput_missed + + +def convert_pdftotext2tabular(filename, output_file, error_file, print_progress): + ''' + Converts NIST PDF file to tabular format + @param filename: PDF file to parse + @param output_file: output file for the hits + @param error_file: output file for failed hits + ''' + [HitList, HitList_missed] = getPDF(filename, print_progress) + # save Hitlist as tab seperate file + Hitlist_as_text = "\t".join(HitList.keys()) + "\n" + Hitlist_array_of_array = ([HitList[row] for row in HitList.keys()]) + Hitlist_as_text += str("\n".join(["\t".join(e) for e in zip(*Hitlist_array_of_array)])) + output_fh = open(output_file, 'wb') + output_fh.write(Hitlist_as_text) + output_fh.close() + + out_missed_pdf = open(error_file, 'wb') + for x, y in zip(HitList_missed['Missed Compounds'], HitList_missed['ID missed Compounds']): + out_missed_pdf.write("Line with incorrect format or unexpected number of fields:\n") + out_missed_pdf.write('%s\n' % '\t'.join([y, x])) + out_missed_pdf.close() + + +def read_tabular(in_csv): + ''' + Parses a tab-separated file returning a dictionary with named columns + @param in_csv: input filename to be parsed + ''' + data = list(csv.reader(open(in_csv, 'rU'), delimiter='\t')) + header = data.pop(0) + # Create dictionary with column name as key + output = {} + for index in xrange(len(header)): + output[header[index]] = [row[index] for row in data] + return output + + +def read_tabular_old(filename): + ''' + Function to read tabular format (created by convert_pdftotext2tabular) + and output a dict with header of columns as key and value is columns of tabular as list + @param filename: tabular file to read + ''' + input_fh = None + try: + input_fh = open(filename, 'r') + except IOError, error: + raise error + colnames = input_fh.readline().strip().split('\t') + cells = [] + for line in input_fh.readlines(): + cells.append(line.strip().split('\t')) + #transform from row oriented structure to column oriented structure + cells = zip(*cells) + #store the list of list in form of final output + RankFilterGC_format = {} + for colnumber in range(len(colnames)): + RankFilterGC_format[colnames[colnumber]] = cells[colnumber] + return RankFilterGC_format + + +if __name__ == '__main__': + convert_pdftotext2tabular(sys.argv[1], sys.argv[2], sys.argv[3], True) diff -r 000000000000 -r dffc38727496 rankfilter_GCMS/pdftotabular.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rankfilter_GCMS/pdftotabular.py Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,44 @@ +""" +Copyright (C) 2013, Pieter Lukasse, Plant Research International, Wageningen + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this software except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +""" + +import sys +import pdfread +from subprocess import call + + +def convert_pdftotext(filename, output_file): + ''' + Converts PDF file to text + @param filename: PDF file to parse + @param output_file: output text file for the hits + ''' + + # "-layout" option in pdftotext call below: Maintain (as best as possible) the original physical layout of the text. The + # default is to 'undo' physical layout (columns, hyphenation, etc.) and output + # the text in reading order. + try: + call(["pdftotext", "-layout", filename, output_file]) + except: + raise Exception("Error while trying to convert PDF to text") + + + + +if __name__ == '__main__': + pdf_as_text = sys.argv[1]+".txt" + convert_pdftotext(sys.argv[1], pdf_as_text) + pdfread.convert_pdftotext2tabular(pdf_as_text, sys.argv[2], sys.argv[3], False) diff -r 000000000000 -r dffc38727496 rankfilter_GCMS/rankfilter.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rankfilter_GCMS/rankfilter.py Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,432 @@ +""" +Copyright (C) 2011 by Velitchka Mihaleva, Wageningen University + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +#Library functions definition +#----------Begin------------- +from numpy import array, linalg, ones +from numpy.polynomial import polynomial +import math +import pdfread +import sys + + +def calibrate(standards): + ''' + Calculates the RT to RI conversion: RI = a + b*RT + @param standards: variable containing RI and RT data + ''' + A = ones((len(standards['R.T.']), 2), dtype=float) + A[:, 0] = array(map(float, standards['R.T.'])) + [coeff, res, r, s] = linalg.lstsq(A, array(map(float, standards['RI']))) + + return coeff + + +def calibrate_poly(standards): + ''' + Calculates the RT to RI conversion using a polynomial model + @param standards: variable containing RI and RT data + ''' + retention_time = array(map(float, standards['R.T.'])) + retention_index = array(map(float, standards['RI'])) + + # Fit a 3rd degree polynomial + fit = polynomial.polyfit(retention_time, retention_index, 3)[::-1] + return [fit[0], fit[1], fit[2], fit[3]] + + +def calculate_poly(retention_time, poly_cal): + ''' + Converts a given retention time to retention index using the calculated polynomial model + @param retention_time: retention_time to convert to retention index + @param poly_cal: result from calculating regression + ''' + # Calculates RI based on given retention_time using polynomial function + retention_time = array(map(float, retention_time)) + if len(retention_time) > 1: + ri_exp = [] + for i in retention_time: + ri_exp.append(poly_cal[0] * (i ** 3) + poly_cal[1] * (i ** 2) + (i * poly_cal[2]) + poly_cal[3]) + return ri_exp + else: + return poly_cal[0] * (retention_time ** 3) + poly_cal[1] * (retention_time ** 2) + (retention_time * poly_cal[2]) + poly_cal[3] + + +def convert_rt(hit_list, coeff): + ''' + Converts a given retention time to retention index using the linear model + @param hit_list: list holding the retention time + @param coeff: calculated coefficient (slope and intercept) using the linear model + ''' + #Convert RT to RI + hit_list['RIexp'] = array(map(float, hit_list['R.T.'])) * coeff[0] + coeff[1] + return hit_list + + +def convert_rt_poly(hit_list, poly_cal): + ''' + Calls the actual RT to RI converter and returns the updated list with added RIexp value + @param hit_list: result list containing the retention time + ''' + hit_list['RIexp'] = array(map(float, calculate_poly(hit_list['R.T.'], poly_cal))) + return hit_list + + +def get_data(libdata, LabelCol): + ''' + Retrieves datacolumns indicated by LabelCol from libdata (generic function) + Returns a dict with the requested column names as keys + @param libdata: file from which data is loaded + @param LabelCol: columns to retrieve + ''' + from numpy import take + LibData = open(libdata, 'r').read().split('\n') + + #Get the labels of the columns in the file + FirstLine = LibData.pop(0).split('\t') + + FirstLine[-1] = FirstLine[-1].replace('\r', '') + + # Create a temporate variable containing the all data + tmp_data = [] + for ll in LibData: + if ll != '': + tmp_data.append(ll.split('\t')) + + # Find the indices of the desired data + ind = [] + try: + for key in LabelCol: + ind.append(FirstLine.index(key)) + except: + print str(key) + " not found in first line of library (" + str(libdata) + ")" + print"the folowing items are found in the first line of the library:\n" + str(FirstLine) + sys.exit(1) + # Extract the desired data + data = [] + for x in tmp_data: + data.append(take(array(x), ind)) + + # library_data = dict(zip(LabelCol,transpose(data))) + library_data = dict(zip(LabelCol, map(lambda *x: list(x), *data))) + return library_data + + +def rank_hit(hit_list, library_data, window): + ''' + Computes the Rank and % relative error + @param hit_list: input data + @param library_data: library used for reading the RIsvr data + @param window: error window + ''' + hit_match_ripred = [] + hit_match_syn = [] + # Convert 'Name' data to list in order to be indexed + # library_data['Name']=list(library_data['Name']) + + for hit_cas, hit_name in zip(hit_list['CAS'], hit_list['Name']): + index = 0 + if hit_cas != 'undef': + try: + index = library_data['CAS'].index(hit_cas.replace(' ', '').replace('-', '')) + except: + try: + index = library_data['Name'].index(hit_name.replace(' ', '')) + except: + # If for any reason the hit is not present + # in the mainlib library indicate this with -999 + index = 0 + else: + try: + index = library_data['Name'].index(hit_name.replace(' ', '')) + except: + # If for any reason the hit is not present + # in the mainlib library indicate this with -999 + index = 0 + if index != 0: + hit_match_ripred.append(float(library_data['RIsvr'][index])) + hit_match_syn.append(library_data['Synonyms'][index]) + else: + hit_match_ripred.append(-999) + hit_match_syn.append('None') + hit_list['RIsvr'] = hit_match_ripred + hit_list['Synonyms'] = hit_match_syn + + # Determine the relative difference between the experimental + # and the predicted RI + ri_err = [] + + for ri_exp, ri_qsar in zip(hit_list['RIexp'], hit_list['RIsvr']): + if int(ri_qsar) != -999: + ri_err.append(float(int(float(ri_qsar)) - int(float(ri_exp))) / int(float(ri_exp)) * 100) + else: + ri_err.append(-999) + + # Define the rank of the hits + hit_rank = [] + + for tt in ri_err: + if tt == -999: + # The name of the hit generated with AMDIS did not match a name + # in the mainlib library + hit_rank.append(4) + else: + # Rank 1 - ri_err is within +/- window/2 + if abs(tt) <= float(window) / 2: + hit_rank.append(1) + # Rank 2 - window/2 < ri_err <= window + if abs(tt) > float(window) / 2 and abs(tt) <= float(window): + hit_rank.append(2) + # Rank 3 - ri_err > window + if abs(tt) > float(window): + hit_rank.append(3) + hit_list['Rank'] = hit_rank + hit_list['%rel.err'] = ri_err + return hit_list + +def print_to_file(hit_list, filename, keys_to_print, print_subsets=True): + ''' + Writes output data to files (four output files are generated): + filename_ranked - the hits are ranked + filename_filter_in - only hits with rank 1 and 2 are retained + filename_filter_out - hits with rank 3 are filtered out + filename_filter_missed - hits with rank 4 - there was no match with the + library data + @param hit_list: a dictionary with the ranked hits + @param filename: the core of the output file names + @param keys_to_print: determines the order in which the data are printed + @param print_subsets: + ''' + from numpy import take + + out_ranked = open(filename["ranked"], 'w') + if (print_subsets): + out_in = open(filename["filter_in"], 'w') + out_out = open(filename["filter_out"], 'w') + out_missed = open(filename["missed"], 'w') + + #Convert RIexp and RIsvr into integer for printing + hit_list['RIexp'] = map(int, map(math.ceil, hit_list['RIexp'])) + hit_list['RIsvr'] = map(int, map(math.ceil, hit_list['RIsvr'])) + + #Establish the right order of the data to be printed + tmp_items = hit_list.items() + tmp_keys = hit_list.keys() + ind = [] + + for key in keys_to_print: + ind.append(tmp_keys.index(key)) + + #Print the labels of the columns + line = '\t'.join(take(array(tmp_keys), ind)) + out_ranked.write('%s\n' % line) + if (print_subsets): + out_in.write('%s\n' % line) + out_out.write('%s\n' % line) + out_missed.write('%s\n' % line) + + #Collect the data for each hit in the right order and print them + #in the output file + i = 0 + for name in hit_list['Name']: + tt = [] + for x in iter(tmp_items): + # trim the value if it is a string: + if isinstance(x[1][i], basestring): + tt.append(x[1][i].strip()) + else: + tt.append(x[1][i]) + tmp1 = take(array(tt), ind) + line = '\t'.join(tmp1.tolist()) + + out_ranked.write('%s\n' % line) + if(print_subsets): + if hit_list['Rank'][i] == 4: + out_missed.write('%s\n' % line) + if hit_list['Rank'][i] == 3: + out_out.write('%s\n' % line) + if hit_list['Rank'][i] == 1 or hit_list['Rank'][i] == 2: + out_in.write('%s\n' % line) + + i = i + 1 + +#---------End-------------- +def main(): + #Ranking and filtering procedure + if len(sys.argv) < 2: + print "Usage:" + print "python RankFilter_GC-MS.py input \n" + print "input is a text file that specifies the names and the location" + print "of the files with the sample, compounds for calibration, library" + print "data, the core of the name ot the outputs, and the value of the" + print "window used for the filtering \n" + + sys.exit(1) + + #------Read the input file------ + try: + ifile = open(sys.argv[1], 'r').read().split('\n') + except: + print sys.argv[1], " file can not be found" + sys.exit() + + #Get the file names for the data + #labels - the type of input files + #filenames - the names of the input files + labels = [] + filenames = [] + + for l in ifile: + l = l.strip() + if l != '': + labels.append(l.split('=')[0].replace(' ', '').replace('\r', '')) + filenames.append(l.split('=')[1].replace(' ', '').replace('\r', '')) + + InputData = dict(zip(labels, filenames)) + + #this part checkes if the ouput option is set. The output files are saved as the output variable as prefix for the output files + #if the output is not found , each output file has to be selected by forehand. This comes in handy for pipeline tools as galaxy + print_subsets = True + NDIS_is_tabular = False + + if 'output' in InputData: + output_files = {"ranked":InputData['output'] + "_ranked", \ + "filter_in":InputData['output'] + "_filter_in", \ + "filter_out":InputData['output'] + "filter_out", \ + "missed":InputData['output'] + "_missed", \ + "missed_parse_pdf":InputData['output'] + "_missed_parse_pdf"} + elif 'tabular' in InputData: + NDIS_is_tabular = True + if(not "onefile" in InputData): + output_files = {"ranked":InputData['ranked'], \ + "filter_in":InputData['filter_in'], \ + "filter_out":InputData['filter_out'], \ + "missed":InputData['missed']} + else: + print_subsets = False + output_files = {"ranked":InputData['onefile']} + else: + output_files = {"ranked":InputData['ranked'], \ + "filter_in":InputData['filter_in'], \ + "filter_out":InputData['filter_out'], \ + "missed":InputData['missed'], \ + "missed_parse_pdf":InputData['missed_parse_pdf']} + + #------Start with calibration------ + #Check whether a file with data for the calibration is specified + #Specify which data to be read from the file with standard compounds + LabelColStand = ['Name', 'R.T.', 'RI'] + + if InputData['calibration'] != 'none': + #get the coeffiecients for the RT to RI convertion + + try: + ifile = open(InputData['calibration'], 'r') + except: + print "file", InputData['calibration'], "can not be found" + sys.exit(1) + + standards = get_data(InputData['calibration'], LabelColStand) + if InputData['model'] == 'linear': + coeff = calibrate(standards) + elif InputData['model'] == 'poly': + poly_cal = calibrate_poly(standards) + else: + print "error: model ", InputData['model'], " can not be found. Use 'linear' or 'poly' " + sys.exit(1) + else: + #No file has been specified for the calibration + #Use the default coefficients + print 'No file has been specified for the calibration' + print 'WARNING: the default coefficients will be used' + coeff = array([29.4327, 454.5260]) + + if InputData['analysis_type'] == 'AMDIS': + try: + AmdisOut = open(InputData['sample'], 'r') + print("open ok") + #Specify which data to be extracted from the AMDIS output table + #Weighted and Reverse are measure of matching between the experimental + #and the library spectra. The experimental spectrum is used as template + #for the calculation of Weighted, whereas for Reverse the template is the + #library spectrum + LabelCol = ['CAS', 'Name', 'R.T.', 'Weighted', 'Reverse', 'Purity'] + + #Get the data from the AMDIS output file + HitList = get_data(InputData['sample'], LabelCol) + #Remove '>' from the names + HitList['Name'] = [s.replace('>', '') for s in HitList['Name']] + except: + print "the file", InputData['sample'], "can not be found" + sys.exit(1) + if InputData['analysis_type'] == 'NIST': + #HitList_missed - a variable of type dictionary containing the hits with the symbol ";" + #in the name + if not NDIS_is_tabular: + print "Warning; NDIS is not tabular format, reading PDF..\n" + [HitList, HitList_missed] = pdfread.getPDF(InputData['sample']) + else: + HitList = pdfread.read_tabular(InputData['sample']) + + #Convert RT to RI + if InputData['model'] == 'linear': + HitList = convert_rt(HitList, coeff) + if InputData['model'] == 'poly': + print "Executing convert_rt_poly().." + HitList = convert_rt_poly(HitList, poly_cal) + + #------Read the library data with the predicted RI------ + try: + LibData = open(InputData['lib_data'], 'r') + except: + print "the file", InputData['lib_data'], "can not be found" + sys.exit(1) + + #Specify which data to be extracted from the library data file + LabelColLib = ['CAS', 'Name', 'RIsvr', 'Synonyms'] + LibraryData = get_data(InputData['lib_data'], LabelColLib) + + #------Match the hits with the library data and rank them------ + if InputData['window'] != '': + HitList = rank_hit(HitList, LibraryData, InputData['window']) + else: + print "No value for the window used for the filtering is specified \n" + sys.exit(1) + + #------Print the ranked and filtered hits------ + #Specify which data to be printed + if InputData['analysis_type'] == 'AMDIS': + keys_to_print = ['R.T.', 'CAS', 'Name', 'Rank', 'RIexp', 'RIsvr', '%rel.err', 'Weighted', 'Reverse', 'Synonyms'] + else: + keys_to_print = ['ID', 'R.T.', 'Name', 'CAS', 'Rank', 'RIexp', 'RIsvr', '%rel.err', 'Forward', 'Reverse', 'Synonyms', 'Library'] + + #skip this error output from reading a pdftotext file when file is tabular + if InputData['analysis_type'] == 'NIST' and not NDIS_is_tabular: + out_missed_pdf = open(output_files['missed_parse_pdf'], 'w') + for x, y in zip(HitList_missed['Missed Compounds'], HitList_missed['RT missed Compounds']): + out_missed_pdf.write('%s\n' % '\t'.join([y, x])) + out_missed_pdf.close() + + print_to_file(HitList, output_files, keys_to_print, print_subsets) + +if __name__ == '__main__': + main() diff -r 000000000000 -r dffc38727496 rankfilter_text2tabular.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rankfilter_text2tabular.xml Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,14 @@ + + Convert NIST text to tabular format + rankfilter_GCMS/pdftotabular.py $input $output $output_err + + + + + + + + + This tool converts NIST identification report output (PDF) to a tabular format needed for further use with the RIQC tools. + + diff -r 000000000000 -r dffc38727496 select_on_rank.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/select_on_rank.py Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,21 @@ +import csv +import sys + +__author__ = "Marcel Kempenaar" +__contact__ = "brs@nbic.nl" +__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre" +__license__ = "MIT" + +in_file = sys.argv[1] +out_file = sys.argv[2] +to_select_list = [str(select.strip()) for select in sys.argv[3].split(',') if (len(select) > 0)] + +data = list(csv.reader(open(in_file, 'rb'), delimiter='\t')) +header = data.pop(0) +header_clean = [i.lower().strip().replace(".", "").replace("%", "") for i in header] +rank = header_clean.index("rank") + +writer = csv.writer(open(out_file, 'wb'), delimiter='\t') +writer.writerow(header) +for select in to_select_list: + writer.writerows([i for i in data if i[rank] == select]) diff -r 000000000000 -r dffc38727496 select_on_rank.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/select_on_rank.xml Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,15 @@ + + Filter on the Rank field in the RankFilter output file + select_on_rank.py $input $output "$rank" + + + + + + + + +This tool removes all entries with non selected rank values from the input file (supported input file is a RankFilter output file). + + diff -r 000000000000 -r dffc38727496 static/images/CAMERA_results.png Binary file static/images/CAMERA_results.png has changed diff -r 000000000000 -r dffc38727496 static/images/confidence_and_slope_params_explain.png Binary file static/images/confidence_and_slope_params_explain.png has changed diff -r 000000000000 -r dffc38727496 static/images/diffreport.png Binary file static/images/diffreport.png has changed diff -r 000000000000 -r dffc38727496 static/images/massEIC.png Binary file static/images/massEIC.png has changed diff -r 000000000000 -r dffc38727496 static/images/metaMS_annotate.png Binary file static/images/metaMS_annotate.png has changed diff -r 000000000000 -r dffc38727496 static/images/metaMS_pick_align_camera.png Binary file static/images/metaMS_pick_align_camera.png has changed diff -r 000000000000 -r dffc38727496 static/images/msclust_summary.png Binary file static/images/msclust_summary.png has changed diff -r 000000000000 -r dffc38727496 static/images/sample_SIM.png Binary file static/images/sample_SIM.png has changed diff -r 000000000000 -r dffc38727496 static/images/sample_sel_and_peak_height_correction.png Binary file static/images/sample_sel_and_peak_height_correction.png has changed diff -r 000000000000 -r dffc38727496 static_resources/elements_and_masses.tab --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/static_resources/elements_and_masses.tab Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,104 @@ +Name Atomic number Chemical symbol Relative atomic mass +Hydrogen 1 H 1.01 +Helium 2 He 4 +Lithium 3 Li 6.94 +Beryllium 4 Be 9.01 +Boron 5 B 10.81 +Carbon 6 C 12.01 +Nitrogen 7 N 14.01 +Oxygen 8 O 16 +Fluorine 9 F 19 +Neon 10 Ne 20.18 +Sodium 11 Na 22.99 +Magnesium 12 Mg 24.31 +Aluminum 13 Al 26.98 +Silicon 14 Si 28.09 +Phosphorus 15 P 30.98 +Sulfur 16 S 32.06 +Chlorine 17 Cl 35.45 +Argon 18 Ar 39.95 +Potassium 19 K 39.1 +Calcium 20 Ca 40.08 +Scandium 21 Sc 44.96 +Titanium 22 Ti 47.9 +Vanadium 23 V 50.94 +Chromium 24 Cr 52 +Manganese 25 Mn 54.94 +Iron 26 Fe 55.85 +Cobalt 27 Co 58.93 +Nickel 28 Ni 58.71 +Copper 29 Cu 63.54 +Zinc 30 Zn 65.37 +Gallium 31 Ga 69.72 +Germanium 32 Ge 72.59 +Arsenic 33 As 74.99 +Selenium 34 Se 78.96 +Bromine 35 Br 79.91 +Krypton 36 Kr 83.8 +Rubidium 37 Rb 85.47 +Strontium 38 Sr 87.62 +Yttrium 39 Y 88.91 +Zirconium 40 Zr 91.22 +Niobium 41 Nb 92.91 +Molybdenum 42 Mo 95.94 +Technetium 43 Tc 96.91 +Ruthenium 44 Ru 101.07 +Rhodium 45 Rh 102.9 +Palladium 46 Pd 106.4 +Silver 47 Ag 107.87 +Cadmium 48 Cd 112.4 +Indium 49 In 114.82 +Tin 50 Sn 118.69 +Antimony 51 Sb 121.75 +Tellurium 52 Te 127.6 +Iodine 53 I 126.9 +Xenon 54 Xe 131.3 +Cesium 55 Cs 132.9 +Barium 56 Ba 137.34 +Lanthanum 57 La 138.91 +Cerium 58 Ce 140.12 +Praseodymium 59 Pr 140.91 +Neodymium 60 Nd 144.24 +Promethium 61 Pm 144.91 +Samarium 62 Sm 150.35 +Europium 63 Eu 151.96 +Gadolinium 64 Gd 157.25 +Terbium 65 Tb 158.92 +Dysprosium 66 Dy 162.5 +Holmium 67 Ho 164.93 +Erbium 68 Er 167.26 +Thulium 69 Tm 168.93 +Ytterbium 70 Yb 173.04 +Lutetium 71 Lu 174.97 +Hafnium 72 Hf 178.49 +Tantalum 73 Ta 180.95 +Wolfram 74 W 183.85 +Rhenium 75 Re 186.2 +Osmium 76 Os 190.2 +Iridium 77 Ir 192.22 +Platinum 78 Pt 195.09 +Gold 79 Au 196.97 +Mercury 80 Hg 200.59 +Thallium 81 Tl 204.37 +Lead 82 Pb 207.19 +Bismuth 83 Bi 208.98 +Polonium 84 Po 208.98 +Astatine 85 At 209.99 +Radon 86 Rn 222.02 +Francium 87 Fr 223.02 +Radium 88 Ra 226 +Actinium 89 Ac 227.03 +Thorium 90 Th 232.04 +Protactinium 91 Pa 231.04 +Uranium 92 U 238.03 +Neptunium 93 Np 237 +Plutonium 94 Pu 242 +Americium 95 Am 243.06 +Curium 96 Cm 247.07 +Berkelium 97 Bk 247.07 +Californium 98 Cf 251.08 +Einsteinium 99 Es 254.09 +Fermium 100 Fm 257.1 +Mendelevium 101 Md 257.1 +Nobelium 102 No 255.09 +Lawrencium 103 Lr 256.1 diff -r 000000000000 -r dffc38727496 test/__init__.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/__init__.py Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,1 @@ +''' unit tests ''' diff -r 000000000000 -r dffc38727496 test/integration_tests.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/integration_tests.py Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,268 @@ +'''Integration tests for the GCMS project''' + +from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 +from GCMS import library_lookup, combine_output +from GCMS.rankfilter_GCMS import rankfilter +import os.path +import sys +import unittest +import re + + +class IntegrationTest(unittest.TestCase): + def test_library_lookup(self): + ''' + Run main for data/NIST_tabular and compare produced files with references determined earlier. + ''' + # Create out folder + outdir = "output/" #tempfile.mkdtemp(prefix='test_library_lookup') + if not os.path.exists(outdir): + os.makedirs(outdir) + outfile_base = os.path.join(outdir, 'produced_library_lookup') + outfile_txt = outfile_base + '.txt' + + #Build up arguments and run + input_txt = resource_filename(__name__, "data/NIST_tabular.txt") + library = resource_filename(__name__, "data/RIDB_subset.txt") + regress_model = resource_filename(__name__, "data/ridb_poly_regression.txt") + sys.argv = ['test', + library, + input_txt, + 'Capillary', + 'Semi-standard non-polar', + outfile_txt, + 'HP-5', + regress_model] + # Execute main function with arguments provided through sys.argv + library_lookup.main() + #Compare with reference files + reference_txt = resource_filename(__name__, 'reference/produced_library_lookup.txt') + + #read both the reference file and actual output files + expected = _read_file(reference_txt) + actual = _read_file(outfile_txt) + + #convert the read in files to lists we can compare + expected = expected.split() + actual = actual.split() + + for exp, act in zip(expected, actual): + if re.match('\\d+\\.\\d+', exp): + exp = float(exp) + act = float(act) + self.assertAlmostEqual(exp, act, places=5) + else: + # compare values + self.failUnlessEqual(expected, actual) + + + def test_combine_output_simple(self): + ''' + Run main for data/NIST_tabular and compare produced files with references determined earlier. + ''' + # Create out folder + outdir = "output/" #tempfile.mkdtemp(prefix='test_library_lookup') + if not os.path.exists(outdir): + os.makedirs(outdir) + outfile_base = os.path.join(outdir, 'produced_combine_output') + outfile_single_txt = outfile_base + '_single.txt' + outfile_multi_txt = outfile_base + '_multi.txt' + + #Build up arguments and run + input_rankfilter = resource_filename(__name__, "data/Rankfilter.txt") + input_caslookup = resource_filename(__name__, "data/Caslookup.txt") + sys.argv = ['test', + input_rankfilter, + input_caslookup, + outfile_single_txt, + outfile_multi_txt] + # Execute main function with arguments provided through sys.argv + combine_output.main() + #Compare with reference files + # reference_single_txt = resource_filename(__name__, 'reference/produced_combine_output_single.txt') + # reference_multi_txt = resource_filename(__name__, 'reference/produced_combine_output_multi.txt') + # self.failUnlessEqual(_read_file(reference_single_txt), _read_file(outfile_single_txt)) + # self.failUnlessEqual(_read_file(reference_multi_txt), _read_file(outfile_multi_txt)) + + #Clean up + #shutil.rmtree(tempdir) + + + + def def_test_rank_filter_advanced(self): + ''' + Run main of RankFilter + ''' + # Create out folder + outdir = "output/integration/" + if not os.path.exists(outdir): + os.makedirs(outdir) + + #Build up arguments and run + input_txt = resource_filename(__name__, "data/integration/RankFilterInput_conf.txt") + sys.argv = ['test', + input_txt] + # Execute main function with arguments provided through sys.argv + rankfilter.main() + #Compare with reference files + + def def_test_library_lookup_advanced(self): + ''' + Run main for data/NIST_tabular and compare produced files with references determined earlier. + ''' + # Create out folder + outdir = "output/integration/" + if not os.path.exists(outdir): + os.makedirs(outdir) + outfile_base = os.path.join(outdir, 'produced_library_lookup_ADVANCED') + outfile_txt = outfile_base + '.txt' + + #Build up arguments and run + input_txt = resource_filename(__name__, "data/integration/NIST_identification_results_tabular.txt") + library = resource_filename(__name__, "../repositories/PRIMS-metabolomics/RI_DB_libraries/Library_RI_DB_capillary_columns-noDuplicates.txt") + regress_model = resource_filename(__name__, "data/integration/regression_MODEL_for_columns.txt") + sys.argv = ['test', + library, + input_txt, + 'Capillary', + 'Semi-standard non-polar', + outfile_txt, + 'DB-5', + regress_model] + # Execute main function with arguments provided through sys.argv + library_lookup.main() + + + + def test_combine_output_advanced(self): + ''' + Variant on test case above, but a bit more complex as some of the centrotypes have + different NIST hits which should give them different RI values. This test also + runs not only the combine output, but the other two preceding steps as well, + so it ensures the integration also works on the current code of all three tools. + ''' + + # Run RankFilter + self.def_test_rank_filter_advanced() + + # Run library CAS RI lookup + self.def_test_library_lookup_advanced() + + outdir = "output/integration/" + outfile_base = os.path.join(outdir, 'produced_combine_output') + outfile_single_txt = outfile_base + '_single.txt' + outfile_multi_txt = outfile_base + '_multi.txt' + + #Build up arguments and run + input_rankfilter = resource_filename(__name__, "output/integration/produced_rank_filter_out.txt") + input_caslookup = resource_filename(__name__, "output/integration/produced_library_lookup_ADVANCED.txt") + sys.argv = ['test', + input_rankfilter, + input_caslookup, + outfile_single_txt, + outfile_multi_txt] + # Execute main function with arguments provided through sys.argv + combine_output.main() + #Compare with reference files +# reference_single_txt = resource_filename(__name__, 'reference/produced_combine_output_single.txt') +# reference_multi_txt = resource_filename(__name__, 'reference/produced_combine_output_multi.txt') +# self.failUnlessEqual(_read_file(reference_single_txt), _read_file(outfile_single_txt)) +# self.failUnlessEqual(_read_file(reference_multi_txt), _read_file(outfile_multi_txt)) + + # Check 1: output single should have one record per centrotype: + + + # Check 2: output single has more records than output single: + combine_result_single_items = combine_output._process_data(outfile_single_txt) + combine_result_multi_items = combine_output._process_data(outfile_multi_txt) + self.assertGreater(len(combine_result_single_items['Centrotype']), + len(combine_result_multi_items['Centrotype'])) + + + # Check 3: library_lookup RI column, centrotype column, ri_svr column are correct: + caslookup_items = combine_output._process_data(input_caslookup) + rankfilter_items = combine_output._process_data(input_rankfilter) + + # check that the caslookup RI column is correctly maintained in its original order in + # the combined file: + ri_caslookup = caslookup_items['RI'] + ri_combine_single = combine_result_single_items['RI'] + self.assertListEqual(ri_caslookup, ri_combine_single) + + # check the centrotype column's integrity: + centrotype_caslookup = caslookup_items['Centrotype'] + centrotype_combine_single = combine_result_single_items['Centrotype'] + centrotype_rankfilter = _get_centrotype_rankfilter(rankfilter_items['ID']) + self.assertListEqual(centrotype_caslookup, centrotype_combine_single) + self.assertListEqual(centrotype_caslookup, centrotype_rankfilter) + + # integration and integrity checks: + file_NIST = resource_filename(__name__, "data/integration/NIST_identification_results_tabular.txt") + file_NIST_items = combine_output._process_data(file_NIST) + # check that rank filter output has exactly the same ID items as the original NIST input file: + self.assertListEqual(file_NIST_items['ID'], rankfilter_items['ID']) + # check the same for the CAS column: + self.assertListEqual(_get_strippedcas(file_NIST_items['CAS']), rankfilter_items['CAS']) + # now check the NIST CAS column against the cas lookup results: + cas_NIST = _get_processedcas(file_NIST_items['CAS']) + self.assertListEqual(cas_NIST, caslookup_items['CAS']) + # now check the CAS of the combined result. If all checks are OK, it means the CAS column's order + # and values remained stable throughout all steps: + self.assertListEqual(rankfilter_items['CAS'], combine_result_single_items['CAS']) + + # check that the rankfilter RIsvr column is correctly maintained in its original order in + # the combined file: + risvr_rankfilter = rankfilter_items['RIsvr'] + risvr_combine_single = combine_result_single_items['RIsvr'] + self.assertListEqual(risvr_rankfilter, risvr_combine_single) + + + + +def _get_centrotype_rankfilter(id_list): + ''' + returns the list of centrotype ids given a list of ID in the + form e.g. 74-1.0-564-1905200-7, where the numbers before the + first "-" are the centrotype id + ''' + result = [] + for compound_id_idx in xrange(len(id_list)): + compound_id = id_list[compound_id_idx] + centrotype = compound_id.split('-')[0] + result.append(centrotype) + + return result + + +def _get_processedcas(cas_list): + ''' + returns the list cas numbers in the form C64175 instead of 64-17-5 + ''' + result = [] + for cas_id_idx in xrange(len(cas_list)): + cas = cas_list[cas_id_idx] + processed_cas = 'C' + str(cas.replace('-', '').strip()) + result.append(processed_cas) + + return result + +def _get_strippedcas(cas_list): + ''' + removes the leading white space from e.g. " 64-17-5" + ''' + result = [] + for cas_id_idx in xrange(len(cas_list)): + cas = cas_list[cas_id_idx] + processed_cas = cas.strip() + result.append(processed_cas) + + return result + + +def _read_file(filename): + ''' + Helper method to quickly read a file + @param filename: + ''' + with open(filename) as handle: + return handle.read() diff -r 000000000000 -r dffc38727496 test/test_combine_output.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/test_combine_output.py Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,106 @@ +''' +Created on Mar 27, 2012 + +@author: marcelk +''' +from GCMS import combine_output +from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 +import os +import shutil +import tempfile +import unittest + + +class Test(unittest.TestCase): + ''' + Tests for the 'combine_output' Galaxy tool + ''' + + def setUp(self): + self.rf_output = resource_filename(__name__, "data/RankFilter.txt") + self.cl_output = resource_filename(__name__, "data/CasLookup.txt") + + def test_process_data(self): + ''' + Tests the processing of the RankFilter and CasLookup files into dictionaries + ''' + rfdata = combine_output._process_data(self.rf_output) + cldata = combine_output._process_data(self.cl_output) + self.assertEqual(set([' 18457-04-0', ' 55133-95-4', ' 58-08-2', ' 112-34-5']), set(rfdata['CAS'])) + self.assertEqual(set(['C58082', 'C18457040', 'C55133954', 'C112345']), set(cldata['CAS'])) + + def test_add_hit(self): + ''' + Tests the combination of two records from both the RankFilter- and CasLookup-tools + ''' + rfdata = combine_output._process_data(self.rf_output) + cldata = combine_output._process_data(self.cl_output) + index = 0 + rf_record = dict(zip(rfdata.keys(), [rfdata[key][index] for key in rfdata.keys()])) + cl_record = dict(zip(cldata.keys(), [cldata[key][index] for key in cldata.keys()])) + + hit = combine_output._add_hit(rf_record, cl_record) + self.assertEqual(len(hit), 27) + + # Pass empty record, should fail combination + self.assertRaises(KeyError, combine_output._add_hit, rf_record, {}) + + def test_merge_data(self): + ''' + Tests the merging of the RankFilter and CasLookup data + ''' + rfdata = combine_output._process_data(self.rf_output) + cldata = combine_output._process_data(self.cl_output) + merged, _ = combine_output._merge_data(rfdata, cldata) + centrotypes = _get_centrotypes(merged) + self.failUnless(all(centrotype in centrotypes for centrotype in ('2716','12723', '3403', '12710'))) + +def _get_centrotypes(merged): + ''' + returns centrotype codes found in merged set + ''' + result = [] + for item_idx in xrange(len(merged)): + item = merged[item_idx] + centrotype = item[0][0] + result.append(centrotype) + + return result + + def test_remove_formula(self): + ''' + Tests the removal of the Formula from the 'Name' field (RankFilter output) + ''' + name = "Caffeine C8H10N4O2" + compound_name, compound_formula = combine_output._remove_formula(name) + self.assertEqual(compound_name, 'Caffeine') + self.assertEqual(compound_formula, 'C8H10N4O2') + name = "Ethanol C2H6O" + compound_name, compound_formula = combine_output._remove_formula(name) + self.assertEqual(compound_name, 'Ethanol') + self.assertEqual(compound_formula, 'C2H6O') + # No formula to remove + name = "Butanoic acid, 4-[(trimethylsilyl)oxy]-, trimethylsilyl ester" + compound_name, compound_formula = combine_output._remove_formula(name) + self.assertEqual(compound_name, name) + self.assertEqual(compound_formula, False) + + def test_save_data(self): + ''' + Tests the creation of the output tabular files (no content testing) + ''' + temp_folder = tempfile.mkdtemp(prefix='gcms_combine_output_') + saved_single_data = '{0}/{1}'.format(temp_folder, 'output_single.tsv') + saved_multi_data = '{0}/{1}'.format(temp_folder, 'output_multi.tsv') + rfdata = combine_output._process_data(self.rf_output) + cldata = combine_output._process_data(self.cl_output) + merged, nhits = combine_output._merge_data(rfdata, cldata) + combine_output._save_data(merged, nhits, saved_single_data, saved_multi_data) + self.failUnless(os.path.exists(saved_single_data)) + self.failUnless(os.path.exists(saved_multi_data)) + shutil.rmtree(temp_folder) + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() diff -r 000000000000 -r dffc38727496 test/test_export_to_metexp_tabular.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/test_export_to_metexp_tabular.py Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,112 @@ +'''Integration tests for the GCMS project''' + +from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 +from GCMS import export_to_metexp_tabular +import os.path +import sys +import unittest + + +class IntegrationTest(unittest.TestCase): + + + def test_MM_calculations(self): + ''' + test the implemented method for MM calculations for + given chemical formulas + ''' + export_to_metexp_tabular.init_elements_and_masses_map() + + formula = "C8H18O3" + # should be = 12.01*8 + 1.01*18 + 16*3 = 162.26 + result = export_to_metexp_tabular.get_molecular_mass(formula) + self.assertEqual(162.26, result) + + formula = "CH2O3Fe2Ni" + # should be = 12.01*1 + 1.01*2 + 16*3 + 55.85*2 + 58.71 = 232.44 + result = export_to_metexp_tabular.get_molecular_mass(formula) + self.assertAlmostEqual(232.44, result, 2) + + + + + + def test_combine_output_simple(self): + ''' + comment me + ''' + # Create out folder + outdir = "output/metexp/" + if not os.path.exists(outdir): + os.makedirs(outdir) + + #Build up arguments and run + + rankfilter_and_caslookup_combined_file = resource_filename(__name__, "data/dummy1_produced_combine_output_single.txt") + msclust_quantification_and_spectra_file = resource_filename(__name__, "data/dummy1_sim.txt") + output_csv = resource_filename(__name__, outdir + "metexp_tabular.txt") + + sys.argv = ['test', + rankfilter_and_caslookup_combined_file, + msclust_quantification_and_spectra_file, + output_csv, + 'tomato', + 'leafs', + 'test experiment', + 'pieter', + 'DB5 column'] + + # Execute main function with arguments provided through sys.argv + export_to_metexp_tabular.main() + + ''' + # Asserts are based on reading in with process_data and comparing values of + # certain columns + + # Check 3: library_lookup RI column, centrotype column, ri_svr column are correct: + caslookup_items = combine_output._process_data(input_caslookup) + rankfilter_items = combine_output._process_data(input_rankfilter) + + # check that the caslookup RI column is correctly maintained in its original order in + # the combined file: + ri_caslookup = caslookup_items['RI'] + ri_combine_single = combine_result_single_items['RI'] + self.assertListEqual(ri_caslookup, ri_combine_single) + + # check the centrotype column's integrity: + centrotype_caslookup = caslookup_items['Centrotype'] + centrotype_combine_single = combine_result_single_items['Centrotype'] + centrotype_rankfilter = _get_centrotype_rankfilter(rankfilter_items['ID']) + self.assertListEqual(centrotype_caslookup, centrotype_combine_single) + self.assertListEqual(centrotype_caslookup, centrotype_rankfilter) + + # integration and integrity checks: + file_NIST = resource_filename(__name__, "data/integration/NIST_identification_results_tabular.txt") + file_NIST_items = combine_output._process_data(file_NIST) + # check that rank filter output has exactly the same ID items as the original NIST input file: + self.assertListEqual(file_NIST_items['ID'], rankfilter_items['ID']) + # check the same for the CAS column: + self.assertListEqual(_get_strippedcas(file_NIST_items['CAS']), rankfilter_items['CAS']) + # now check the NIST CAS column against the cas lookup results: + cas_NIST = _get_processedcas(file_NIST_items['CAS']) + self.assertListEqual(cas_NIST, caslookup_items['CAS']) + # now check the CAS of the combined result. If all checks are OK, it means the CAS column's order + # and values remained stable throughout all steps: + self.assertListEqual(rankfilter_items['CAS'], combine_result_single_items['CAS']) + + # check that the rankfilter RIsvr column is correctly maintained in its original order in + # the combined file: + risvr_rankfilter = rankfilter_items['RIsvr'] + risvr_combine_single = combine_result_single_items['RIsvr'] + self.assertListEqual(risvr_rankfilter, risvr_combine_single) + ''' + + + +def _read_file(filename): + ''' + Helper method to quickly read a file + @param filename: + ''' + with open(filename) as handle: + return handle.read() diff -r 000000000000 -r dffc38727496 test/test_library_lookup.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/test_library_lookup.py Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,180 @@ +''' +Created on Mar 6, 2012 + +@author: marcelk +''' +from GCMS import library_lookup, match_library +from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 +import os +import shutil +import tempfile +import unittest + + +class Test(unittest.TestCase): + ''' + Tests the 'library_lookup' Galaxy tool + ''' + + def setUp(self): + self.ri_database = resource_filename(__name__, "data/RIDB_subset.txt") + self.nist_output = resource_filename(__name__, "data/NIST_tabular.txt") + self.ridb_poly_regress = resource_filename(__name__, "data/ridb_poly_regression.txt") + self.ridb_linear_regress = resource_filename(__name__, "data/ridb_linear_regression.txt") + + def test_create_lookup_table(self): + ''' + Tests the 'create_lookup_table' function + ''' + column_type = 'Capillary' + polarity = 'Semi-standard non-polar' + lookup_dict = library_lookup.create_lookup_table(self.ri_database, column_type, polarity) + self.assertFalse(False in [res[4] == 'Capillary' for res in lookup_dict['4177166']]) + self.assertEqual(['C51276336', '2,6-Dimethyl-octa-1,7-dien-3,6-diol', 'C10H18O2', + '1277', 'Capillary', 'Semi-standard non-polar', 'DB-5MS', '1', + 'C51276336_DB-5MS', '', '', ''], lookup_dict['51276336'][1]) + + def test_read_model(self): + ''' + Tests reading the regression model data containing the parameters required for converting + retention indices between GC-columns + ''' + model, _ = library_lookup._read_model(self.ridb_poly_regress) + # Order of values: coefficient 1 through 4, left limit, right limit + # Polynomial model + self.assertEqual([20.6155874639486, 0.945187096379008, 3.96480787567566e-05, -9.04377237159287e-09, + 628.0, 2944.0, 405.0, 0, 0.998685262365514], model['HP-5']['SE-54']) + self.assertEqual([-92.3963391356951, 1.26116176393346, -0.000191991657547972, 4.15387371263164e-08, + 494.0, 2198.0, 407.0, 0, 0.996665023122993], model['Apiezon L']['Squalane']) + # Linear model + model, _ = library_lookup._read_model(self.ridb_linear_regress) + self.assertEqual([2.81208738561543, 0.99482475526584, 628.0, 2944.0, 405.0, 0, 0.998643883946458], + model['HP-5']['SE-54']) + self.assertEqual([19.979922768462, 0.993741869298272, 494.0, 2198.0, 407.0, 0, 0.99636062891041], + model['Apiezon L']['Squalane']) + + def test_apply_regression(self): + ''' + Tests the regression model on some arbitrary retention indices + ''' + poly_model, _ = library_lookup._read_model(self.ridb_poly_regress) + linear_model, _ = library_lookup._read_model(self.ridb_linear_regress) + retention_indices = [1000, 1010, 1020, 1030, 1040, 1050] + converted_poly = [] + converted_linear = [] + for ri in retention_indices: + converted_poly.append(library_lookup._apply_poly_regression('HP-5', 'DB-5', ri, poly_model)) + converted_linear.append(library_lookup._apply_linear_regression('HP-5', 'DB-5', ri, linear_model)) + + self.assertEqual([1003.0566541860778, 1013.0979459524663, 1023.1358645806529, 1033.170466241159, + 1043.2018071045052, 1053.2299433412131], converted_poly) + self.assertEqual([1001.8127584915925, 1011.830140783027, 1021.8475230744615, 1031.864905365896, + 1041.8822876573306, 1051.899669948765], converted_linear) + + # Test polynomial limit detection, the following RI falls outside of the possible limits + ri = 3400 + converted_poly = library_lookup._apply_poly_regression('HP-5', 'DB-5', ri, poly_model) + self.assertEqual(False, converted_poly) + + def test_preferred_hit(self): + ''' Tests the matching of the hits with the preferred column, including regression ''' + model, method = library_lookup._read_model(self.ridb_poly_regress) + column_type = 'Capillary' + polarity = 'Semi-standard non-polar' + lookup_dict = library_lookup.create_lookup_table(self.ri_database, column_type, polarity) + hits = lookup_dict['150867'] + # No regression, should however consider order of given preference + match = library_lookup._preferred(hits, ['SE-52', 'DB-5', 'HP-5'], column_type, polarity, model, method) + expected = (['C150867', '(E)-phytol', 'C20H40O', '2110', 'Capillary', + 'Semi-standard non-polar', 'SE-52', '', 'C150867_SE-52', '', '', ''], False) + self.assertEqual(expected, match) + + # Perform regression by looking for 'OV-101' which isn't there. 'SE-52' has the best regression model + # of the available columns + match = library_lookup._preferred(hits, ['OV-101'], column_type, polarity, model, method) + expected = (['C150867', '(E)-phytol', 'C20H40O', 2158.5769891569125, 'Capillary', + 'Semi-standard non-polar', 'SE-52', '', 'C150867_SE-52', '', '', ''], 'SE-52') + self.assertEqual(expected, match) + + def test_format_result(self): + ''' + Tests the 'format_result' function + ''' + column_type = 'Capillary' + polarity = 'Semi-standard non-polar' + + # Look for DB-5 + pref_column = ['DB-5'] + model, method = library_lookup._read_model(self.ridb_poly_regress) + lookup_dict = library_lookup.create_lookup_table(self.ri_database, column_type, polarity) + data = library_lookup.format_result(lookup_dict, self.nist_output, pref_column, column_type, + polarity, model, method)#False, None) + + # remove non-hits from set: + data = _get_hits_only(data) + self.assertEqual(['C544354', 'Ethyl linoleate', 'C20H36O2', '2155', 'Capillary', 'Semi-standard non-polar', + 'DB-5', '1', 'C544354_DB-5', '1810', 'None', '', '', '0'], data[20]) + self.assertEqual(111, len(data)) + + # Look for both DB-5 and HP-5 + pref_column = ['DB-5', 'HP-5'] + data = library_lookup.format_result(lookup_dict, self.nist_output, pref_column, column_type, + polarity, False, None) + # remove non-hits from set: + data = _get_hits_only(data) + self.assertEqual(['C502614', '.beta.-(E)-Farnesene', 'C15H24', '1508', 'Capillary', 'Semi-standard non-polar', + 'DB-5', '1', 'C502614_DB-5', '942', 'None', '1482', '1522', '22'], data[50]) + self.assertEqual(106, len(data)) + + + def test_save_data(self): + ''' + Tests the creation of the output tabular file + ''' + temp_folder = tempfile.mkdtemp(prefix='gcms_combine_output_') + saved_data = '{0}/{1}'.format(temp_folder, 'output.tsv') + column_type = 'Capillary' + polarity = 'Semi-standard non-polar' + pref_column = ['DB-5'] + lookup_dict = library_lookup.create_lookup_table(self.ri_database, column_type, polarity) + data = library_lookup.format_result(lookup_dict, self.nist_output, pref_column, column_type, polarity, False, None) + library_lookup._save_data(data, saved_data) + self.failUnless(os.path.exists(saved_data)) + shutil.rmtree(temp_folder) + + + def test_match_library_get_lib_files(self): + ''' + Tests the match_library.py functionality + ''' + riqc_libs_dir = resource_filename(__name__, "../repositories/PRIMS-metabolomics/RI_DB_libraries") + get_library_files_output = match_library.get_directory_files(riqc_libs_dir) + self.assertEqual(2, len(get_library_files_output)) + self.assertEqual("Library_RI_DB_capillary_columns-noDuplicates", get_library_files_output[0][0]) + #TODO change assert below to assert that the result is a file, so the test can run on other dirs as well: + #self.assertEqual("E:\\workspace\\PRIMS-metabolomics\\python-tools\\tools\\GCMS\\test\\data\\riqc_libs\\RI DB library (capillary columns) Dec.2012.txt", get_library_files_output[0][1]) + #self.assertEqual("RI DB library (capillary columns) Jan.2013", get_library_files_output[1][0]) + try: + get_library_files_output = match_library.get_directory_files("/blah") + # should not come here + self.assertTrue(False) + except: + # should come here + self.assertTrue(True) + +def _get_hits_only(data): + ''' + removes items that have RI == 0.0 and Name == '' (these are dummy lines just for the output + ''' + result = [] + for item_idx in xrange(len(data)): + item = data[item_idx] + if item[1] != '' and item[3] > 0.0 : + result.append(item) + + return result + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() diff -r 000000000000 -r dffc38727496 test/test_match_library.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/test_match_library.py Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,51 @@ +''' +Created on Mar 6, 2012 + +@author: marcelk +''' +from GCMS import match_library +import unittest +from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 + + +class Test(unittest.TestCase): + ''' + Tests the 'match_library' Galaxy tool + ''' + nist_db = resource_filename(__name__, "data/RIDB_subset.txt") + + def test_get_column_type(self): + ''' + Tests the 'get_column_type' function that returns tuples of unique elements + for column types in the RI database + ''' + galaxy_output = match_library.get_column_type(self.nist_db) + self.assertEqual([('Capillary(9999)', 'Capillary', False)], galaxy_output) + + def test_filter_column(self): + ''' + Tests the 'filter_column' function showing the column phase for all 'Capillary' columns + ''' + galaxy_output = match_library.filter_column(self.nist_db, 'Capillary') + self.assertEqual([('Semi-standard non-polar(9999)', 'Semi-standard non-polar', False)], galaxy_output) + + def test_filter_column2(self): + ''' + Tests the 'filter_column' function showing all possibilities for columns having both the + 'Capillary' and 'Semi-standard non-polar' properties + ''' + galaxy_output = match_library.filter_column2(self.nist_db, 'Capillary', 'Semi-standard non-polar') + self.failUnless(('Apiezon M(6)', 'Apiezon M', False) in galaxy_output) + + def test_count_occurrence(self): + ''' + Tests the 'count_occurrence' function + ''' + test_list = [2, 0, 0, 2, 1, 3, 4, 5, 3, 2, 3, 4, 5, 5, 4, 2, 5, 3, 4, 3, 5, 4, 2, 0, 4] + counts = match_library.count_occurrence(test_list) + self.assertEqual({0: 3, 1: 1, 2: 5, 3: 5, 4: 6, 5: 5}, counts) + + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testName'] + unittest.main() diff -r 000000000000 -r dffc38727496 test/test_query_mass_repos.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/test_query_mass_repos.py Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,62 @@ +'''Integration tests for the GCMS project''' + +from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 +from MS import query_mass_repos +import os.path +import sys +import unittest + + +class IntegrationTest(unittest.TestCase): + + + + + def test_simple(self): + ''' + Simple initial test + ''' + # Create out folder + outdir = "output/query_mass_repos/" + if not os.path.exists(outdir): + os.makedirs(outdir) + + #Build up arguments and run + + # input_file = sys.argv[1] + # molecular_mass_col = sys.argv[2] + # repository_file = sys.argv[3] + # mass_tolerance = float(sys.argv[4]) + # output_result = sys.argv[5] + + input_file = resource_filename(__name__, "data/service_query_tabular.txt") + + molecular_mass_col = "mass (Da)" + dblink_file = resource_filename(__name__, "data/MFSearcher ExactMassDB service.txt") + output_result = resource_filename(__name__, outdir + "metexp_query_results_added.txt") + + + + + sys.argv = ['test', + input_file, + molecular_mass_col, + dblink_file, + '0.001', + 'ms', + output_result] + + # Execute main function with arguments provided through sys.argv + query_mass_repos.main() + + + + + +def _read_file(filename): + ''' + Helper method to quickly read a file + @param filename: + ''' + with open(filename) as handle: + return handle.read() diff -r 000000000000 -r dffc38727496 test/test_query_metexp.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/test_query_metexp.py Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,83 @@ +'''Integration tests for the GCMS project''' + +from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 +from GCMS import query_metexp +import os.path +import sys +import unittest + + +class IntegrationTest(unittest.TestCase): + + +# copied from test_export_to_metexp_tabular.py +# def test_MM_calculations(self): +# ''' +# test the implemented method for MM calculations for +# given chemical formulas +# ''' +# export_to_metexp_tabular.init_elements_and_masses_map() +# +# formula = "C8H18O3" +# # should be = 12.01*8 + 1.01*18 + 16*3 = 162.26 +# result = export_to_metexp_tabular.get_molecular_mass(formula) +# self.assertEqual(162.26, result) +# +# formula = "CH2O3Fe2Ni" +# # should be = 12.01*1 + 1.01*2 + 16*3 + 55.85*2 + 58.71 = 232.44 +# result = export_to_metexp_tabular.get_molecular_mass(formula) +# self.assertAlmostEqual(232.44, result, 2) +# +# +# + + + def test_simple(self): + ''' + Simple initial test + ''' + # Create out folder + outdir = "output/metexp_query/" + if not os.path.exists(outdir): + os.makedirs(outdir) + + #Build up arguments and run + + # input_file = sys.argv[1] + # molecular_mass_col = sys.argv[2] + # formula_col = sys.argv[3] + # metexp_dblink_file = sys.argv[4] + # output_result = sys.argv[5] + + input_file = resource_filename(__name__, "data/metexp_query_tabular.txt") + casid_col = "CAS" + formula_col = "FORMULA" + molecular_mass_col = "MM" + metexp_dblink_file = resource_filename(__name__, "data/METEXP Test DB.txt") + output_result = resource_filename(__name__, outdir + "metexp_query_results_added.txt") + + sys.argv = ['test', + input_file, + casid_col, + formula_col, + molecular_mass_col, + metexp_dblink_file, + 'GC', + output_result] + + # Execute main function with arguments provided through sys.argv + query_metexp.main() + + # TODO - asserts (base them on DB being filled with test data form metexp unit test for upload method) + # PA + + + + +def _read_file(filename): + ''' + Helper method to quickly read a file + @param filename: + ''' + with open(filename) as handle: + return handle.read() diff -r 000000000000 -r dffc38727496 test/test_query_metexp_LARGE.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test/test_query_metexp_LARGE.py Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,79 @@ +'''Integration tests for the GCMS project''' + +from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611 +from GCMS import query_metexp +import os.path +import sys +import unittest + + +class IntegrationTest(unittest.TestCase): + + +# def test_MM_calculations(self): +# ''' +# test the implemented method for MM calculations for +# given chemical formulas +# ''' +# export_to_metexp_tabular.init_elements_and_masses_map() +# +# formula = "C8H18O3" +# # should be = 12.01*8 + 1.01*18 + 16*3 = 162.26 +# result = export_to_metexp_tabular.get_molecular_mass(formula) +# self.assertEqual(162.26, result) +# +# formula = "CH2O3Fe2Ni" +# # should be = 12.01*1 + 1.01*2 + 16*3 + 55.85*2 + 58.71 = 232.44 +# result = export_to_metexp_tabular.get_molecular_mass(formula) +# self.assertAlmostEqual(232.44, result, 2) +# +# +# + + + def test_large(self): + ''' + Simple test, but on larger set, last test executed in 28s + ''' + # Create out folder + outdir = "output/metexp_query/" + if not os.path.exists(outdir): + os.makedirs(outdir) + + #Build up arguments and run + + # input_file = sys.argv[1] + # molecular_mass_col = sys.argv[2] + # formula_col = sys.argv[3] + # metexp_dblink_file = sys.argv[4] + # output_result = sys.argv[5] + + input_file = resource_filename(__name__, "data/metexp_query_tabular_large.txt") + casid_col = "CAS" + formula_col = "FORMULA" + molecular_mass_col = "MM" + metexp_dblink_file = resource_filename(__name__, "data/METEXP Test DB.txt") + output_result = resource_filename(__name__, outdir + "metexp_query_results_added_LARGE.txt") + + sys.argv = ['test', + input_file, + casid_col, + formula_col, + molecular_mass_col, + metexp_dblink_file, + 'GC', + output_result] + + # Execute main function with arguments provided through sys.argv + query_metexp.main() + + + + +def _read_file(filename): + ''' + Helper method to quickly read a file + @param filename: + ''' + with open(filename) as handle: + return handle.read() diff -r 000000000000 -r dffc38727496 tool_dependencies.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_dependencies.xml Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,13 @@ + + + + + + + + This dependency: + Ensures R 3.1.1 installation is triggered (via dependency). + Ensures Bioconductor 3.0 and package metaMS, multtest and snow are installed. + + \ No newline at end of file diff -r 000000000000 -r dffc38727496 xcms_differential_analysis.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xcms_differential_analysis.r Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,85 @@ +## read args: +args <- commandArgs(TRUE) +#cat("args <- \"\"\n") +## a xcms xset saved as .RData +args.xsetData <- args[1] +#cat(paste("args.xsetData <- \"", args[1], "\"\n", sep="")) + +args.class1 <- args[2] +args.class2 <- args[3] +#cat(paste("args.class1 <- \"", args[2], "\"\n", sep="")) +#cat(paste("args.class2 <- \"", args[3], "\"\n", sep="")) + +args.topcount <- strtoi(args[4]) +#cat(paste("args.topcount <- ", args[4], "\n", sep="")) + +args.outTable <- args[5] + +## report files +args.htmlReportFile <- args[6] +args.htmlReportFile.files_path <- args[7] +#cat(paste("args.htmlReportFile <- \"", args[6], "\"\n", sep="")) +#cat(paste("args.htmlReportFile.files_path <- \"", args[7], "\"\n", sep="")) + + +if (length(args) == 8) +{ + args.outLogFile <- args[8] + # suppress messages: + # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888 + msg <- file(args.outLogFile, open="wt") + sink(msg, type="message") + sink(msg, type="output") +} + +tryCatch( + { + library(metaMS) + library(xcms) + #library("R2HTML") + + # load the xset data : + xsetData <- readRDS(args.xsetData) + # if here to support both scenarios: + if ("xcmsSet" %in% slotNames(xsetData) ) + { + xsetData <- xsetData@xcmsSet + } + + + # info: levels(xcmsSet@phenoData$class) also gives access to the class names + dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE) + # set cairo as default for png plots: + png = function (...) grDevices::png(...,type='cairo') + # run diffreport + reporttab <- diffreport(xsetData, args.class1, args.class2, paste(args.htmlReportFile.files_path,"/fig", sep=""), args.topcount, metlin = 0.15, h=480, w=640) + + # write out tsv table: + write.table(reporttab, args.outTable, sep="\t", row.names=FALSE) + + message("\nGenerating report.........") + + cat("

Differential analysis report

", file= args.htmlReportFile) + #HTML(reporttab[1:args.topcount,], file= args.htmlReportFile) + figuresPath <- paste(args.htmlReportFile.files_path, "/fig_eic", sep="") + message(figuresPath) + listOfFiles <- list.files(path = figuresPath) + for (i in 1:length(listOfFiles)) + { + figureName <- listOfFiles[i] + # maybe we still need to copy the figures to the args.htmlReportFile.files_path + cat(paste("", sep=""), file= args.htmlReportFile, append=TRUE) + cat(paste("", sep=""), file= args.htmlReportFile, append=TRUE) + } + + message("finished generating report") + cat("\nWarnings================:\n") + str( warnings() ) + }, + error=function(cond) { + sink(NULL, type="message") # default setting + sink(stderr(), type="output") + message("\nERROR: ===========\n") + print(cond) + } + ) \ No newline at end of file diff -r 000000000000 -r dffc38727496 xcms_differential_analysis.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xcms_differential_analysis.xml Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,55 @@ + + Runs xcms diffreport function for differential Analsysis + + R_bioc_metams + + + xcms_differential_analysis.r + $xsetData + "$class1" + "$class2" + $topcount + $outTable + $htmlReportFile + $htmlReportFile.files_path + $outLogFile + + + + + + + + + + + + + + + + + + + + + +.. class:: infomark + +Runs xcms diffreport for showing the most significant differences between two sets/classes of samples. This tool also creates extracted ion chromatograms (EICs) for +the most significant differences. The figure below shows an example of such an EIC. + +.. image:: $PATH_TO_IMAGES/diffreport.png + + + + + + + 10.1021/ac051437y + + \ No newline at end of file diff -r 000000000000 -r dffc38727496 xcms_get_alignment_eic.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xcms_get_alignment_eic.r Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,153 @@ +# shows all alignment results in a rt region +## read args: +args <- commandArgs(TRUE) +# xset data: +args.xsetData <- args[1] + +args.rtStart <- strtoi(args[2]) +args.rtEnd <- strtoi(args[3]) + +# limit max diff to 600 and minNrSamples to at least 2 : +if (args.rtEnd - args.rtStart > 600) + stop("The RT window should be <= 600") + +args.minNrSamples <- strtoi(args[4]) #min nr samples +if (args.minNrSamples < 2) + stop("Set 'Minimum number of samples' to 2 or higher") + + +args.sampleNames <- strsplit(args[5], ",")[[1]] +# trim leading and trailing spaces: +args.sampleNames <- gsub("^\\s+|\\s+$", "", args.sampleNames) + +## report files +args.htmlReportFile <- args[6] +args.htmlReportFile.files_path <- args[7] + + +if (length(args) == 8) +{ + args.outLogFile <- args[8] + # suppress messages: + # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888 + msg <- file(args.outLogFile, open="wt") + sink(msg, type="message") + sink(msg, type="output") +} + + + +tryCatch( + { + library(metaMS) + + # load the xset data : + xsetData <- readRDS(args.xsetData) + # if here to support both scenarios: + if ("xcmsSet" %in% slotNames(xsetData) ) + { + xsetData <- xsetData@xcmsSet + } + + # report + dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE) + message(paste("\nGenerating report.........in ", args.htmlReportFile.files_path)) + + write(paste("

Extracted Ion Chromatograms (EIC) of alignments with peaks in ",args.minNrSamples, " or more samples

"), + file=args.htmlReportFile) + + gt <- groups(xsetData) + message("\nParsed groups... ") + groupidx1 <- which(gt[,"rtmed"] > args.rtStart & gt[,"rtmed"] < args.rtEnd & gt[,"npeaks"] >= args.minNrSamples) # this should be only on samples selected + + if (length(groupidx1) > 0) + { + message("\nGetting EIC... ") + eiccor <- getEIC(xsetData, groupidx = c(groupidx1), sampleidx=args.sampleNames) + #eicraw <- getEIC(xsetData, groupidx = c(groupidx1), sampleidx=args.sampleNames, rt = "raw") + + #sampleNamesIdx <- which(sampnames(LC$xset@xcmsSet) %in% args.sampleNames, arr.ind = TRUE) + #or (from bioconductor code for getEIC): NB: this is assuming sample indexes used in data are ordered after the order of sample names!! + sampNames <- sampnames(xsetData) + sampleNamesIdx <- match( args.sampleNames, sampNames) + message(paste("Samples: ", sampleNamesIdx)) + + #TODO html <- paste(html, "") + message(paste("\nPlotting figures... ")) + + #get the mz list (interestingly, this [,"mz"] is relatively slow): + peakMzList <- xsetData@peaks[,"mz"] + peakSampleList <- xsetData@peaks[,"sample"] + #signal to noise list: + peakSnList <- xsetData@peaks[,"sn"] + + message(paste("Total nr of peaks: ", length(peakMzList))) + + for (i in 1:length(groupidx1)) + { + groupMembers <- xsetData@groupidx[[groupidx1[i]]] + + groupMzList <- "" + groupSampleList <- "" + finalGroupSize <- 0 + + for (j in 1:length(groupMembers)) + { + #get only the peaks from the selected samples: + memberSample <- peakSampleList[groupMembers[j]] + memberSn <- peakSnList[groupMembers[j]] + if (!is.na(memberSn) && memberSample %in% sampleNamesIdx) + { + message(paste("Group: ", groupidx1[i], " / Member sample: ", memberSample)) + memberMz <- peakMzList[groupMembers[j]] + + + if (finalGroupSize == 0) + { + groupMzList <- memberMz + groupSampleList <- sampNames[memberSample] + } else { + groupMzList <- paste(groupMzList,",",memberMz, sep="") + groupSampleList <- paste(groupSampleList,",",sampNames[memberSample], sep="") + } + + finalGroupSize <- finalGroupSize +1 + } + } + # here we do the real check on group size and only the groups that have enough + # members with signal to noise > 0 will be plotted here: + if (finalGroupSize >= args.minNrSamples) + { + message(paste("Plotting figure ",i, " of max ", length(groupidx1)," figures... ")) + + figureName <- paste(args.htmlReportFile.files_path, "/figure", i,".png", sep="") + write(paste("", sep="") , + file=args.htmlReportFile, append=TRUE) + + png( figureName, type="cairo", width=800 ) + plot(eiccor, xsetData, groupidx = i) + devname = dev.off() + + write(paste("

Alignment id: [", groupidx1[i], "]. m/z list of peaks in alignment with signal/noise <> NA:
", groupMzList,"

", sep="") , + file=args.htmlReportFile, append=TRUE) + write(paste("

m/z values found in the following samples respectively:
", groupSampleList,"

", sep="") , + file=args.htmlReportFile, append=TRUE) + } + } + + } + + write("", + file=args.htmlReportFile, append=TRUE) + message("finished generating report") + # unlink(args.htmlReportFile) + cat("\nWarnings================:\n") + str( warnings() ) + }, + error=function(cond) { + sink(NULL, type="message") # default setting + sink(stderr(), type="output") + message("\nERROR: ===========\n") + print(cond) + } + ) diff -r 000000000000 -r dffc38727496 xcms_get_alignment_eic.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xcms_get_alignment_eic.xml Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,73 @@ + + Extracts alignment EICs from feature alignment data + + R_bioc_metams + + + xcms_get_alignment_eic.r + $xsetData + $rtStart + $rtEnd + $minNrSamples + "$sampleNames" + $htmlReportFile + $htmlReportFile.files_path + $outLogFile + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.. class:: infomark + +This tool finds the alignments in the specified RT window and extracts alignment EICs from feature alignment data using XCMS's getEIC method. +It produces a HTML report showing the EIC plots and the mass list of each alignment. The figure below shows an example of such an EIC plot, showing also the difference between +two classes, with extra alignment information beneath it. + +.. image:: $PATH_TO_IMAGES/diffreport.png + +Alignment id: 1709. m/z list of peaks in alignment: +614.002922098482,613.998019830021,614.000382307519,613.998229980469,613.998229980469 + + + + + 10.1021/ac051437y + + \ No newline at end of file diff -r 000000000000 -r dffc38727496 xcms_get_mass_eic.r --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xcms_get_mass_eic.r Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,162 @@ +## read args: +args <- commandArgs(TRUE) +# xset data: +args.xsetData <- args[1] + +args.rtStart <- strtoi(args[2]) +args.rtEnd <- strtoi(args[3]) + +args.mzStart <- as.double(args[4]) +args.mzEnd <- as.double(args[5]) +# there are 2 options: specify a mz range or a mz list: +if (args.mzStart < 0) +{ + args.mzList <- as.double(strsplit(args[6], ",")[[1]]) + cat(typeof(as.double(strsplit(args[6], ",")[[1]]))) + args.mzTolPpm <- as.double(args[7]) + # calculate mzends based on ppm tol: + mzListEnd <- c() + mzListStart <- c() + for (i in 1:length(args.mzList)) + { + mzEnd <- args.mzList[i] + args.mzList[i]*args.mzTolPpm/1000000.0 + mzStart <- args.mzList[i] - args.mzList[i]*args.mzTolPpm/1000000.0 + mzListEnd <- c(mzListEnd, mzEnd) + mzListStart <- c(mzListStart, mzStart) + } + str(mzListStart) + str(mzListEnd) +} else { + mzListEnd <- c(args.mzEnd) + mzListStart <- c(args.mzStart) +} + +args.sampleNames <- strsplit(args[8], ",")[[1]] +# trim leading and trailing spaces: +args.sampleNames <- gsub("^\\s+|\\s+$", "", args.sampleNames) + +args.combineSamples <- args[9] +args.rtPlotMode <- args[10] + +## report files +args.htmlReportFile <- args[11] +args.htmlReportFile.files_path <- args[12] + + +if (length(args) == 13) +{ + args.outLogFile <- args[13] + # suppress messages: + # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888 + msg <- file(args.outLogFile, open="wt") + sink(msg, type="message") + sink(msg, type="output") +} + +# TODO - add option to do masses in same plot (if given in same line oid) or in separate plots +# TODO2 - let it run in parallel + +tryCatch( + { + library(metaMS) + + # load the xset data : + xsetData <- readRDS(args.xsetData) + # if here to support both scenarios: + if ("xcmsSet" %in% slotNames(xsetData) ) + { + xsetData <- xsetData@xcmsSet + } + + # report + dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE) + message(paste("\nGenerating report.........in ", args.htmlReportFile.files_path)) + + html <- "

Extracted Ion Chromatograms (EIC) matching criteria

" + + if (args.combineSamples == "No") + { + if (length(args.sampleNames) > 1 && length(mzListStart) > 1 && length(args.sampleNames) != length(mzListStart)) + stop(paste("The number of sample names should match the number of m/z values in the list. Found ", length(mzListStart), + " masses while ", length(args.sampleNames), " sample names were given.")) + + iterSize <- length(args.sampleNames) + # these can be set to 1 or 0 just as a trick to iterate OR not over the items. If the respective list is of length 1, only the first item should be used + fixSampleIdx <- 1 + fixMzListIdx <- 1 + if (length(args.sampleNames) == 1) + { + fixSampleIdx <- 0 + iterSize <- length(mzListStart) + } + if (length(mzListStart) == 1) + { + fixMzListIdx <- 0 + } + lineColors <- rainbow(iterSize) + for (i in 0:(iterSize-1)) + { + message("\nGetting EIC... ") + eiccor <- getEIC(xsetData, + mzrange=matrix(c(mzListStart[i*fixMzListIdx+1],mzListEnd[i*fixMzListIdx+1]),nrow=1,ncol=2,byrow=TRUE), + rtrange=matrix(c(args.rtStart,args.rtEnd),nrow=1,ncol=2,byrow=TRUE), + sampleidx=c(args.sampleNames[i*fixSampleIdx+1]), rt=args.rtPlotMode) + + message("\nPlotting figures... ") + figureName <- paste(args.htmlReportFile.files_path, "/figure", i,".png", sep="") + html <- paste(html,"
", sep="") + png( figureName, type="cairo", width=1100,height=250 ) + #plot(eiccor, col=lineColors[i+1]) + # black is better in this case: + plot(eiccor) + legend('topright', # places a legend at the appropriate place + legend=c(args.sampleNames[i*fixSampleIdx+1]), # puts text in the legend + lty=c(1,1), # gives the legend appropriate symbols (lines) + lwd=c(2.5,2.5)) + + devname = dev.off() + } + + } else { + for (i in 1:length(mzListStart)) + { + message("\nGetting EIC... ") + eiccor <- getEIC(xsetData, + mzrange=matrix(c(mzListStart[i],mzListEnd[i]),nrow=1,ncol=2,byrow=TRUE), + rtrange=matrix(c(args.rtStart,args.rtEnd),nrow=1,ncol=2,byrow=TRUE), + sampleidx=args.sampleNames, rt = args.rtPlotMode) + + #set size, set option (plot per sample, plot per mass) + + message("\nPlotting figures... ") + figureName <- paste(args.htmlReportFile.files_path, "/figure", i,".png", sep="") + html <- paste(html,"", sep="") + png( figureName, type="cairo", width=1100,height=450 ) + lineColors <- rainbow(length(args.sampleNames)) + plot(eiccor, col=lineColors) + legend('topright', # places a legend at the appropriate place + legend=args.sampleNames, # puts text in the legend + lty=c(1,1), # gives the legend appropriate symbols (lines) + lwd=c(2.5,2.5), + col=lineColors) + devname = dev.off() + } + } + if (args.rtPlotMode == "corrected") + { + html <- paste(html,"

*rt values are corrected ones

") + } + html <- paste(html,"") + message("finished generating report") + write(html,file=args.htmlReportFile) + # unlink(args.htmlReportFile) + cat("\nWarnings================:\n") + str( warnings() ) + }, + error=function(cond) { + sink(NULL, type="message") # default setting + sink(stderr(), type="output") + message("\nERROR: ===========\n") + print(cond) + } + ) diff -r 000000000000 -r dffc38727496 xcms_get_mass_eic.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/xcms_get_mass_eic.xml Sat Feb 07 22:02:00 2015 +0100 @@ -0,0 +1,117 @@ + + Extracts EICs for a given list of masses + + R_bioc_metams + + + xcms_get_mass_eic.r + $xsetData + $rtStart + $rtEnd + #if $massParameters.massParametersType == "window" + $massParameters.mzStart + $massParameters.mzEnd + -1 + "." + #else + -1 + -1 + "$massParameters.mzList" + $massParameters.mzTolPpm + #end if + "$sampleNames" + $combineSamples + $rtPlotMode + $htmlReportFile + $htmlReportFile.files_path + $outLogFile + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.. class:: infomark + +This tool will plot EICs for a given list of masses (or a mass window) using XCMS's getEIC method. +It produces a HTML report showing the EIC plots, one for each given mass. The figure below shows an example of such an EIC plot. + +.. image:: $PATH_TO_IMAGES/massEIC.png + + + + + 10.1021/ac051437y + + \ No newline at end of file