# HG changeset patch
# User pieter.lukasse@wur.nl
# Date 1423342920 -3600
# Node ID dffc3872749667a5de6bd0767664ea7aa0afa841
initial commit
diff -r 000000000000 -r dffc38727496 LICENSE
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/LICENSE Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,202 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright [yyyy] [name of copyright owner]
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff -r 000000000000 -r dffc38727496 NOTICE
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/NOTICE Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,14 @@
+PRIMS-metabolomics toolset & Galaxy wrappers
+============================================
+
+Metabolomics module of Plant Research International's Mass Spectrometry (PRIMS) toolsuite.
+This toolset consists of custom tools to enable metabolite identifications and
+Retention Index (RI) based Quality Control (RIQC) for Mass Spectrometry metabolomics data.
+
+Copyright:
+* 2012: NIST_UTIL and RIQC tools: Copyright (c) 2012 Maarten Kooyman and Marcel Kempenaar, NBIC BRS
+* 2013: all tools: Copyright (c) 2013 by Pieter Lukasse, Plant Research International (PRI),
+ Wageningen, The Netherlands. All rights reserved. See the license text below.
+
+Galaxy wrappers and installation are available from the Galaxy Tool Shed at:
+http://toolshed.g2.bx.psu.edu/view/pieterlukasse/prims_metabolomics
\ No newline at end of file
diff -r 000000000000 -r dffc38727496 README.rst
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/README.rst Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,103 @@
+PRIMS-metabolomics toolset & Galaxy wrappers
+============================================
+
+Metabolomics module of Plant Research International's Mass Spectrometry (PRIMS) toolsuite.
+This toolset consists of custom tools to enable metabolite identifications and
+Retention Index (RI) based Quality Control (RIQC) for Mass Spectrometry metabolomics data.
+
+Copyright:
+* 2012: NIST_UTIL and RIQC tools: Copyright (c) 2012 Maarten Kooyman and Marcel Kempenaar, NBIC BRS
+* 2013: all tools: Copyright (c) 2013 by Pieter Lukasse, Plant Research International (PRI),
+Wageningen, The Netherlands. All rights reserved. See the license text below.
+
+Galaxy wrappers and installation are available from the Galaxy Tool Shed at:
+http://toolshed.g2.bx.psu.edu/view/pieterlukasse/prims_metabolomics
+
+History
+=======
+
+============== ======================================================================
+Date Changes
+-------------- ----------------------------------------------------------------------
+December 2014 * Added MsClust support for parsing XCMS alignment results.
+ * Improved output reports for XCMS wrappers.
+November 2014 * Added XCMS related tool wrappers (for metaMS and diffreport)
+September 2014 * Added new membership cutoff option for final clusters in MsClust
+ * Improved MsClust memory usage for large datasets
+ * Simplified MsClust HTML report
+ * Added option for microminutes based clustering instead of scannr
+ based in MsClust
+April 2014 * Added interface to ExactMassDB, Pep1000, KEGG, KNApSAcK, Flavonoid
+ Viewer, LipidMAPS, HMDB, PubChem, by using the service MFSearcher.
+ This enables users to query multiple public repositories for
+ elemental compositions from accurate mass values detected by
+ high-resolution mass spectrometers. NB: see also added
+ licensing info below.
+March 2014 * Added interface to METEXP data store, including tool to fire
+ queries in batch mode
+ * Improved quantification output files of MsClust, a.o. sorting
+ mass list based on intensity (last two columns of quantification
+ files)
+January 2014 * first release via Tool Shed, combining the RIQC and MsClust in a
+ single package (this package)
+ * integration with METEXP software (data store for metabolomics
+ experiments with respective metadata and identifications)
+2013 * hand-over of the NIST_UTIL and RIQC tools from the NBIC team to
+ Plant Research International
+2012 * development of MsClust 2.0, making it also suitable for Galaxy
+<2011 * development and publication of MsClust 1.0
+============== ======================================================================
+
+Tool Versioning
+===============
+
+PRIMS tools have versions of the form X.Y.Z. Versions differing
+only in the last number (Z) should be completely compatible with
+each other; for example, 1.2.3 and 1.2.5 are expected to be
+interchangeable. Breaking changes should result in an increment of
+X and/or Y. All tools with a version below 1.0.0 should be
+considered beta.
+
+
+Bug Reports & other questions
+=============================
+
+For the time being issues can be reported via the contact form at:
+http://www.wageningenur.nl/en/Persons/PNJ-Pieter-Lukasse.htm
+
+Developers, Contributions & Collaborations
+==========================================
+
+If you wish to join forces and collaborate on some of the
+tools do not hesitate to contact Pieter Lukasse via the contact form above.
+
+
+License (Apache, Version 2.0)
+=============================
+
+Copyright 2013 Pieter Lukasse, Plant Research International (PRI).
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this software except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+
+License for third party services
+================================
+MFSearcher service : http://webs2.kazusa.or.jp/mfsearcher/#090
+In the MFSearcher system, the compound data provided by KEGG, Flavonoid Viewer, LIPID MAPS, HMDB and PubChem
+were downloaded for academic purposes. The compound data of KNApSAcK is provided by Prof. Kanaya in
+Nara Institute of Science and Technology (NAIST). Part of these data is used to construct the
+specialized databases for rapid mass searching in the MFSearcher system after re-calculating the molecular weights.
+Please observe the terms of each original database when using the search results obtained from these
+databases via MFSearcher.
+
+The searching system of MFSearcher, the ExactMassDB database, and the Pep1000 database by Kazusa DNA
+Research Institute is licensed under a Creative Commons Attribution-NonCommercial 3.0 Unported License.
\ No newline at end of file
diff -r 000000000000 -r dffc38727496 Rscripts/filter-RIDB.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Rscripts/filter-RIDB.R Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,56 @@
+##
+#
+# Removes duplicates from a RI-database
+#
+# Usage:
+# Rscript filter-RIDB.R /path/to/retention_db.txt output_RIDB_file.txt
+#
+##
+
+# Commandline arguments
+args <- commandArgs(TRUE)
+ridb <- args[1]
+out_file <- args[2]
+
+# Function to check duplicates
+duplicates <- function(dat) {
+ s <- do.call("order", as.data.frame(dat))
+ non.dup <- !duplicated(dat[s, ])
+ orig.ind <- s[non.dup]
+ first.occ <- orig.ind[cumsum(non.dup)]
+ first.occ[non.dup] <- NA
+ first.occ[order(s)]
+}
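+
+# Illustration (hypothetical input): for a data frame whose rows are A, B, A, C the result
+# is NA, NA, 1, NA -- each duplicate row points to the index of its first occurrence,
+# while first occurrences themselves are marked NA.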
+
+# Load CSV file
+ridb <- read.csv(ridb,header=TRUE, sep="\t")
+## Filters on: CAS FORMULA Column type Column phase type Column name
+filter_cols <- c(1, 3, 5, 6, 7)
+cat("RIDB dimensions: ")
+print(dim(ridb))
+deleted <- NULL
+cat("Checking for duplicates...")
+dups <- duplicates(ridb[,filter_cols])
+cat("\t[DONE]\nRemoving duplicates...")
+newridb <- ridb
+newridb["min"] <- NA
+newridb["max"] <- NA
+newridb["orig.columns"] <- NA
+for (i in unique(dups)) {
+ if (!is.na(i)) {
+ rows <- which(dups == i)
+ duprows <- ridb[c(i, rows),]
+ # Replace duplicate rows with one row containing the median value
+ new_RI <- median(duprows$RI)
+ newridb$RI[i] <- median(duprows$RI)
+ newridb$min[i] <- min(duprows$RI)
+ newridb$max[i] <- max(duprows$RI)
+ newridb$orig.columns[i] <- paste(rows, collapse=",")
+ deleted <- c(deleted, rows)
+ }
+}
+cat("\t\t[DONE]\nCreating new dataset...")
+# Guard against the case where no duplicates were found (deleted is still NULL)
+out_ridb <- if (is.null(deleted)) newridb else newridb[-deleted,]
+cat("\t\t[DONE]\nWriting new dataset...")
+write.table(out_ridb, na='', file=out_file, quote=TRUE, sep="\t", row.names=FALSE)
+cat("\t\t[DONE]\n")
diff -r 000000000000 -r dffc38727496 Rscripts/ridb-regression.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/Rscripts/ridb-regression.R Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,208 @@
+##
+#
+# Performs regression analysis using either 3rd degree polynomial- or linear-method
+#
+##
+
+# Commandline arguments
+args <- commandArgs(TRUE)
+if (length(args) < 9)
+    stop(paste("Missing arguments, usage:\n\tRscript ridb-regression.R RI-database ",
+               "output_file logfile min_residuals range_mod pvalue rsquared method ",
+               "plot(yes/no) plot_archive"))
+
+ridb <- args[1]
+out_file <- args[2]
+logfile <- args[3]
+min_residuals <- as.integer(args[4])
+range_mod <- as.integer(args[5])
+pvalue <- as.double(args[6])
+rsquared <- as.double(args[7])
+method <- args[8]
+plot <- tolower(args[9])
+if (plot == 'true')
+ plot_archive = args[10]
+
+# Redirect standard output to /dev/null to keep the tool output clean
+sink(file='/dev/null')
+
+progress <- c()
+logger <- function(logdata) {
+ ## Logs progress, adds a timestamp for each event
+ #cat(paste(Sys.time(), "\t", logdata, "\n", sep="")) ## DEBUG
+ progress <<- c(progress, paste(Sys.time(), "\t", logdata, sep=""))
+}
+
+logger("Reading Retention Index Database..")
+
+# Read Retention Index Database
+ridb <- read.csv(ridb, header=TRUE, sep="\t")
+logger(paste("\t", nrow(ridb), "records read.."))
+# Get a unique list
+gc_columns <- unique(as.vector(as.matrix(ridb['Column.name'])[,1]))
+cas_numbers <- unique(as.vector(as.matrix(ridb['CAS'])[,1]))
+
+add_poly_fit <- function(fit, gc1_index, gc2_index, range) {
+ pval = anova.lm(fit)$Pr
+ r.squared = summary(fit)$r.squared
+
+ data = rep(NA, 11)
+ # Append results to matrix
+ data[1] = gc_columns[gc1_index] # Column 1
+ data[2] = gc_columns[gc2_index] # Column 2
+ data[3] = coefficients(fit)[1] # The 4 coefficients
+ data[4] = coefficients(fit)[2]
+ data[5] = coefficients(fit)[3]
+ data[6] = coefficients(fit)[4]
+ data[7] = range[1] # Left limit
+ data[8] = range[2] # Right limit
+ data[9] = length(fit$residuals) # Number of datapoints analysed
+ data[10] = pval[1] # p-value for resulting fitting
+ data[11] = r.squared # R-squared
+ return(data)
+}
+
+
+add_linear_fit <- function(fit, gc1_index, gc2_index, range) {
+ pval = anova.lm(fit)$Pr
+ r.squared = summary(fit)$r.squared
+
+    data = rep(NA, 9)
+ # Append results to matrix
+ data[1] = gc_columns[gc1_index] # Column 1
+ data[2] = gc_columns[gc2_index] # Column 2
+    data[3] = coefficients(fit)[1]             # The 2 coefficients (intercept, slope)
+ data[4] = coefficients(fit)[2]
+ data[7] = length(fit$residuals) # Number of datapoints analysed
+ data[8] = pval[1] # p-value for resulting fitting
+ data[9] = r.squared # R-squared
+ return(data)
+}
+
+
+add_fit <- function(fit, gc1_index, gc2_index, range, method) {
+ if (method == 'poly')
+ return(add_poly_fit(fit, gc1_index, gc2_index, range))
+ else
+ return(add_linear_fit(fit, gc1_index, gc2_index, range))
+}
+
+
+plot_fit <- function(ri1, ri2, gc1_index, gc2_index, coeff, range, method) {
+ if (method == 'poly')
+ pol <- function(x) coeff[4]*x^3 + coeff[3]*x^2 + coeff[2]*x + coeff[1]
+ else
+ pol <- function(x) coeff[2]*x + coeff[1]
+ pdf(paste('regression_model_',
+ make.names(gc_columns[gc1_index]), '_vs_',
+ make.names(gc_columns[gc2_index]), '.pdf', sep=''))
+    curve(pol, from=250, to=3750, col="red", lwd=2.5, main='Regression Model', xlab=gc_columns[gc1_index],
+ ylab=gc_columns[gc2_index], xlim=c(250, 3750), ylim=c(250, 3750))
+ points(ri1, ri2, lwd=0.4)
+ # Add vertical lines showing left- and right limits when using poly method
+ if (method == 'poly')
+ abline(v=range, col="grey", lwd=1.5)
+ dev.off()
+}
+
+# Initialize output dataframe
+if (method == 'poly') {
+ m <- data.frame(matrix(ncol = 11, nrow = 10))
+} else {
+ m <- data.frame(matrix(ncol = 9, nrow = 10))
+}
+
+
+get_fit <- function(gc1, gc2, method) {
+ if (method == 'poly')
+ return(lm(gc1 ~ poly(gc2, 3, raw=TRUE)))
+ else
+ return(lm(gc1 ~ gc2))
+}
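+
+# Illustration (hypothetical RI vectors): get_fit(c(1200, 1350, 1500), c(1180, 1340, 1490), 'linear')
+# fits ri1 ~ ri2 with a straight line; method 'poly' fits a raw 3rd-degree polynomial instead.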
+
+# Permutate
+k <- 1
+logger(paste("Permutating (with ", length(gc_columns), " GC-columns)..", sep=""))
+
+for (i in 1:(length(gc_columns)-1)) {
+ logger(paste("\tCalculating model for ", gc_columns[i], "..", sep=""))
+ breaks <- 0
+ for (j in (i+1):length(gc_columns)) {
+ col1 = ridb[which(ridb['Column.name'][,1] == gc_columns[i]),]
+ col2 = ridb[which(ridb['Column.name'][,1] == gc_columns[j]),]
+
+ # Find CAS numbers for which both columns have data (intersect)
+ cas_intersect = intersect(col1[['CAS']], col2[['CAS']])
+
+ # Skip if number of shared CAS entries is < cutoff
+ if (length(cas_intersect) < min_residuals) {
+ breaks = breaks + 1
+ next
+ }
+ # Gather Retention Indices
+ col1_data = col1[['RI']][match(cas_intersect, col1[['CAS']])]
+ col2_data = col2[['RI']][match(cas_intersect, col2[['CAS']])]
+
+ # Calculate the range within which regression is possible (and move if 'range_mod' != 0)
+ range = c(min(c(min(col1_data), min(col2_data))), max(c(max(col1_data), max(col2_data))))
+ if (range_mod != 0) {
+ # Calculate percentage and add/subtract from range
+ perc = diff(range) / 100
+ perc_cutoff = range_mod * perc
+ range = as.integer(range + c(perc_cutoff, -perc_cutoff))
+ }
+
+ # Calculate model for column1 vs column2 and plot if requested
+ fit = get_fit(col1_data, col2_data, method)
+ m[k,] = add_fit(fit, i, j, range, method)
+
+ if (plot == 'true')
+ plot_fit(col1_data, col2_data, i, j, coefficients(fit), range, method)
+
+ # Calculate model for column2 vs column1 and plot if requested
+ fit = get_fit(col2_data, col1_data, method)
+ m[k + 1,] = add_fit(fit, j, i, range, method)
+
+ if (plot == 'true')
+ plot_fit(col2_data, col1_data, j, i, coefficients(fit), range, method)
+
+ k = k + 2
+ }
+ logger(paste("\t\t", breaks, " comparisons have been skipped due to nr. of datapoints < cutoff", sep=""))
+}
+
+# Filter on pvalue and R-squared
+logger("Filtering on pvalue and R-squared..")
+if (method == 'poly') {
+ pval_index <- which(m[,10] < pvalue)
+ rsquared_index <- which(m[,11] > rsquared)
+} else {
+ pval_index <- which(m[,8] < pvalue)
+ rsquared_index <- which(m[,9] > rsquared)
+}
+logger(paste(nrow(m) - length(pval_index), " models discarded due to pvalue > ", pvalue, sep=""))
+
+logger(paste(nrow(m) - length(rsquared_index), " models discarded due to R-squared < ", rsquared, sep=""))
+
+# Remaining rows
+index = unique(c(pval_index, rsquared_index))
+
+# Reduce dataset
+m = m[index,]
+sink()
+
+# Place plots in the history as a ZIP file
+if (plot == 'true') {
+ logger("Creating archive with model graphics..")
+ system(paste("zip -9 -r models.zip *.pdf > /dev/null", sep=""))
+ system(paste("cp models.zip ", plot_archive, sep=""))
+}
+
+# Save dataframe as tab separated file
+logger("All done, saving data..")
+header = c("Column1", "Column2", "Coefficient1", "Coefficient2", "Coefficient3", "Coefficient4",
+ "LeftLimit", "RightLimit", "Residuals", "pvalue", "Rsquared")
+if (method != 'poly')
+ header = header[c(1:4, 7:11)]
+write(progress, logfile)
+write.table(m, file=out_file, sep="\t", quote=FALSE, col.names=header, row.names=FALSE)
diff -r 000000000000 -r dffc38727496 __init__.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/__init__.py Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,6 @@
+'''
+Module containing Galaxy tools for the LC or GC/MS pipeline
+Created on Mar , 2014
+
+@author: pieter lukasse
+'''
\ No newline at end of file
diff -r 000000000000 -r dffc38727496 combine_output.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/combine_output.py Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,253 @@
+#!/usr/bin/env python
+# encoding: utf-8
+'''
+Module to combine output from two GCMS Galaxy tools (RankFilter and CasLookup)
+'''
+
+import csv
+import re
+import sys
+import math
+import pprint
+
+__author__ = "Marcel Kempenaar"
+__contact__ = "brs@nbic.nl"
+__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
+__license__ = "MIT"
+
+def _process_data(in_csv):
+ '''
+ Generic method to parse a tab-separated file returning a dictionary with named columns
+ @param in_csv: input filename to be parsed
+ '''
+ data = list(csv.reader(open(in_csv, 'rU'), delimiter='\t'))
+ header = data.pop(0)
+ # Create dictionary with column name as key
+ output = {}
+ for index in xrange(len(header)):
+ output[header[index]] = [row[index] for row in data]
+ return output
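+
+# Illustration of _process_data output (hypothetical file): a file with header "ID\tR.T."
+# and two data rows yields {'ID': ['1-1-10-0.5-5', '2-1-12-0.6-4'], 'R.T.': ['5.3', '6.1']},
+# i.e. one list of (string) values per column name.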
+
+
+def _merge_data(rankfilter, caslookup):
+ '''
+ Merges data from both input dictionaries based on the Centrotype field. This method will
+ build up a new list containing the merged hits as the items.
+ @param rankfilter: dictionary holding RankFilter output in the form of N lists (one list per attribute name)
+ @param caslookup: dictionary holding CasLookup output in the form of N lists (one list per attribute name)
+ '''
+    # TODO: test for correct input files -> rankfilter and caslookup internal lists should have the same lengths:
+ if (len(rankfilter['ID']) != len(caslookup['Centrotype'])):
+ raise Exception('rankfilter and caslookup files should have the same nr of rows/records ')
+
+ merged = []
+ processed = {}
+ for compound_id_idx in xrange(len(rankfilter['ID'])):
+ compound_id = rankfilter['ID'][compound_id_idx]
+ if not compound_id in processed :
+ # keep track of processed items to not repeat them
+ processed[compound_id] = compound_id
+ # get centrotype nr
+ centrotype = compound_id.split('-')[0]
+ # Get the indices for current compound ID in both data-structures for proper matching
+ rindex = [index for index, value in enumerate(rankfilter['ID']) if value == compound_id]
+ cindex = [index for index, value in enumerate(caslookup['Centrotype']) if value == centrotype]
+
+ merged_hits = []
+ # Combine hits
+ for hit in xrange(len(rindex)):
+ # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do
+ # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its
+ # corresponding value in the rankfilter or caslookup tables; i.e.
+ # rankfilter[key] => returns the list/array with size = nrrows, with the values for the attribute
+ # represented by "key". rindex[hit] => points to the row nr=hit (hit is a rownr/index)
+ rf_record = dict(zip(rankfilter.keys(), [rankfilter[key][rindex[hit]] for key in rankfilter.keys()]))
+ cl_record = dict(zip(caslookup.keys(), [caslookup[key][cindex[hit]] for key in caslookup.keys()]))
+
+ merged_hit = _add_hit(rf_record, cl_record)
+ merged_hits.append(merged_hit)
+
+ merged.append(merged_hits)
+
+ return merged, len(rindex)
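+
+# Note on the return value: 'merged' holds one entry per unique compound ID, each entry being
+# the list of merged hits for that ID; the second value (nhits) is taken from the last processed
+# ID, so the code implicitly assumes every compound ID has the same number of RankFilter hits.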
+
+
+def _add_hit(rankfilter, caslookup):
+ '''
+ Combines single records from both the RankFilter- and CasLookup-tools
+ @param rankfilter: record (dictionary) of one compound in the RankFilter output
+ @param caslookup: matching record (dictionary) of one compound in the CasLookup output
+ '''
+ # The ID in the RankFilter output contains the following 5 fields:
+ rf_id = rankfilter['ID'].split('-')
+ try:
+ name, formula = _remove_formula(rankfilter['Name'])
+ hit = [rf_id[0], # Centrotype
+ rf_id[1], # cent.Factor
+ rf_id[2], # scan nr
+ rf_id[3], # R.T. (umin)
+ rf_id[4], # nr. Peaks
+ # Appending other fields
+ rankfilter['R.T.'],
+ name,
+ caslookup['FORMULA'] if not formula else formula,
+ rankfilter['Library'].strip(),
+ rankfilter['CAS'].strip(),
+ rankfilter['Forward'],
+ rankfilter['Reverse'],
+ ((float(rankfilter['Forward']) + float(rankfilter['Reverse'])) / 2),
+ rankfilter['RIexp'],
+ caslookup['RI'],
+ rankfilter['RIsvr'],
+ # Calculate absolute differences
+ math.fabs(float(rankfilter['RIexp']) - float(rankfilter['RIsvr'])),
+ math.fabs(float(caslookup['RI']) - float(rankfilter['RIexp'])),
+ caslookup['Regression.Column.Name'],
+ caslookup['min'],
+ caslookup['max'],
+ caslookup['nr.duplicates'],
+ caslookup['Column.phase.type'],
+ caslookup['Column.name'],
+ rankfilter['Rank'],
+ rankfilter['%rel.err'],
+ rankfilter['Synonyms']]
+ except KeyError as error:
+ print "Problem reading in data from input file(s):\n",
+ print "Respective CasLookup entry: \n", pprint.pprint(caslookup), "\n"
+ print "Respective RankFilter entry: \n", pprint.pprint(rankfilter), "\n"
+ raise error
+
+ return hit
+
+
+def _remove_formula(name):
+ '''
+ The RankFilter Name field often contains the Formula as well, this function removes it from the Name
+ @param name: complete name of the compound from the RankFilter output
+ '''
+ name = name.split()
+ poss_formula = name[-1]
+    match = re.match(r"^(([A-Z][a-z]{0,2})(\d*))+$", poss_formula)
+ if match:
+ return ' '.join(name[:-1]), poss_formula
+ else:
+ return ' '.join(name), False
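+
+# Illustration (hypothetical names): "Alanine C3H7NO2" -> ('Alanine', 'C3H7NO2'), while a
+# name without a trailing formula such as "Alanine" -> ('Alanine', False).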
+
+
+def _get_default_caslookup():
+ '''
+ The Cas Lookup tool might not have found all compounds in the library searched,
+ this default dict will be used to combine with the Rank Filter output
+ '''
+ return {'FORMULA': 'N/A',
+ 'RI': '0.0',
+ 'Regression.Column.Name': 'None',
+ 'min': '0.0',
+ 'max': '0.0',
+ 'nr.duplicates': '0',
+ 'Column.phase.type': 'N/A',
+ 'Column.name': 'N/A'}
+
+
+def _save_data(data, nhits, out_csv_single, out_csv_multi):
+ '''
+ Writes tab-separated data to file
+ @param data: dictionary containing merged dataset
+ @param out_csv: output csv file
+ '''
+ # Columns we don't repeat:
+ header_part1 = ['Centrotype',
+ 'cent.Factor',
+ 'scan nr.',
+ 'R.T. (umin)',
+ 'nr. Peaks',
+ 'R.T.']
+ # These are the headers/columns we repeat in case of
+ # combining hits in one line (see alternative_headers method below):
+ header_part2 = [
+ 'Name',
+ 'FORMULA',
+ 'Library',
+ 'CAS',
+ 'Forward',
+ 'Reverse',
+ 'Avg. (Forward, Reverse)',
+ 'RIexp',
+ 'RI',
+ 'RIsvr',
+ 'RIexp - RIsvr',
+ 'RI - RIexp',
+ 'Regression.Column.Name',
+ 'min',
+ 'max',
+ 'nr.duplicates',
+ 'Column.phase.type',
+ 'Column.name',
+ 'Rank',
+ '%rel.err',
+ 'Synonyms']
+
+ # Open output file for writing
+ outfile_single_handle = open(out_csv_single, 'wb')
+ outfile_multi_handle = open(out_csv_multi, 'wb')
+ output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
+ output_multi_handle = csv.writer(outfile_multi_handle, delimiter="\t")
+
+ # Write headers
+ output_single_handle.writerow(header_part1 + header_part2)
+ output_multi_handle.writerow(header_part1 + header_part2 + alternative_headers(header_part2, nhits-1))
+ # Combine all hits for each centrotype into one line
+ line = []
+ for centrotype_idx in xrange(len(data)):
+ i = 0
+ for hit in data[centrotype_idx]:
+ if i==0:
+ line.extend(hit)
+ else:
+ line.extend(hit[6:])
+ i = i+1
+ # small validation (if error, it is a programming error):
+ if i > nhits:
+            raise Exception('Error: more hits than expected for centrotype_idx ' + str(centrotype_idx))
+ output_multi_handle.writerow(line)
+ line = []
+
+ # Write one line for each centrotype
+ for centrotype_idx in xrange(len(data)):
+ for hit in data[centrotype_idx]:
+ output_single_handle.writerow(hit)
+
+def alternative_headers(header_part2, nr_alternative_hits):
+ '''
+ This method will iterate over the header names and add the string 'ALT#_' before each,
+ where # is the number of the alternative, according to number of alternative hits we want to add
+ to final csv/tsv
+ '''
+ result = []
+ for i in xrange(nr_alternative_hits):
+ for header_name in header_part2:
+ result.append("ALT" + str(i+1) + "_" + header_name)
+ return result
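+
+# Illustration: alternative_headers(['Name', 'CAS'], 2) returns
+# ['ALT1_Name', 'ALT1_CAS', 'ALT2_Name', 'ALT2_CAS'].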
+
+def main():
+ '''
+ Combine Output main function
+ It will merge the result files from "RankFilter" and "Lookup RI for CAS numbers"
+ NB: the caslookup_result_file will typically have fewer lines than
+ rankfilter_result_file, so the merge has to consider this as well. The final file
+ should have the same nr of lines as rankfilter_result_file.
+ '''
+ rankfilter_result_file = sys.argv[1]
+ caslookup_result_file = sys.argv[2]
+ output_single_csv = sys.argv[3]
+ output_multi_csv = sys.argv[4]
+
+ # Read RankFilter and CasLookup output files
+ rankfilter = _process_data(rankfilter_result_file)
+ caslookup = _process_data(caslookup_result_file)
+ merged, nhits = _merge_data(rankfilter, caslookup)
+ _save_data(merged, nhits, output_single_csv, output_multi_csv)
+
+
+if __name__ == '__main__':
+ main()
diff -r 000000000000 -r dffc38727496 combine_output.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/combine_output.xml Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,36 @@
+
+ Perform a combination of output data from the RankFilter and CasLookup tools
+
+ combine_output.py $rankfilter_in $caslookup_in $out_single $out_multi
+
+
+
+
+
+
+
+
+
+
+
+Performs a combination of output files from the 'RankFilter' and 'Lookup RI for CAS' tools into two tab-separated files.
+
+The files produced contain either all hits for a compound on a single line (Single) or each hit on a separate
+line (Multi).
+
+.. class:: infomark
+
+**Notes**
+
+The input data should be produced by the RankFilter and 'Lookup RI for CAS' tools provided on this Galaxy server with the
+original headers kept intact. Processing steps include:
+
+ - Added columns showing the average Forward/Reverse values, RIexp - RIsvr and RI - RIexp values
+ - The ID column of the RankFilter tool output is split into 'Centrotype', 'cent.Factor', 'scan nr.', 'R.T. (umin)'
+ and 'nr. Peaks' fields.
+ - The formula is split off the 'Name' field in the RankFilter output
+
+
+
diff -r 000000000000 -r dffc38727496 create_model.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/create_model.xml Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,78 @@
+
+ Generate coefficients to enable the regression from one GC-column
+ to another GC-column
+ Rscripts/ridb-regression.R
+ $ridb
+ $out_model
+ $out_log
+ $min_residuals
+ $range_mod
+ $pvalue
+ $rsquared
+ $method
+ $plot
+ #if $plot
+ $model_graphics
+ #end if
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ (plot)
+
+
+
+
+
+Calculates regression models for a permutation of all GC columns contained in the selected
+RI database file. The method used for creating the model is either based on a 3rd degree
+polynomial or a standard linear model.
+
+The *Minimum number of residuals* option only allows regression if the two columns it is based
+on share at least that number of datapoints for the same compounds.
+
+Filtering is possible by setting an upper limit for the *p-value* and / or a lower limit for
+the *R squared* value. The produced logfile will state how many models have been discarded due
+to this filtering. The output model file also includes the p-value and R squared value for
+each created model.
+
+Graphical output of the models is available by selecting the plot option which shows the
+data points used for the model as well as the fit itself and the range of data that will
+be usable.
+
+.. class:: infomark
+
+**Notes**
+
+The output file produced by this tool is required as input for the CasLookup tool when
+selecting to apply regression when finding hits in the RIDB.
+
+
diff -r 000000000000 -r dffc38727496 datatypes_conf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/datatypes_conf.xml Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,13 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff -r 000000000000 -r dffc38727496 export_to_metexp_tabular.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/export_to_metexp_tabular.py Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,247 @@
+#!/usr/bin/env python
+# encoding: utf-8
+'''
+Module to combine output from the GCMS Galaxy tools RankFilter, CasLookup and MsClust
+into a tabular file that can be uploaded to the MetExp database.
+
+RankFilter and CasLookup output are already combined by combine_output.py, so here we use
+that result. In addition, one of the MsClust
+quantification files, containing the respective spectra details, is combined as well.
+
+Extra calculations performed:
+- The column MW is also added here and is derived from the column FORMULA found
+ in RankFilter, CasLookup combined result.
+
+So in total here we merge 2 files and calculate one new column.
+'''
+from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611
+import csv
+import re
+import sys
+from collections import OrderedDict
+
+__author__ = "Pieter Lukasse"
+__contact__ = "pieter.lukasse@wur.nl"
+__copyright__ = "Copyright, 2013, Plant Research International, WUR"
+__license__ = "Apache v2"
+
+def _process_data(in_csv, delim='\t'):
+ '''
+ Generic method to parse a tab-separated file returning a dictionary with named columns
+ @param in_csv: input filename to be parsed
+ '''
+ data = list(csv.reader(open(in_csv, 'rU'), delimiter=delim))
+ header = data.pop(0)
+ # Create dictionary with column name as key
+ output = OrderedDict()
+ for index in xrange(len(header)):
+ output[header[index]] = [row[index] for row in data]
+ return output
+
+ONE_TO_ONE = 'one_to_one'
+N_TO_ONE = 'n_to_one'
+
+def _merge_data(set1, link_field_set1, set2, link_field_set2, compare_function, merge_function, metadata, relation_type=ONE_TO_ONE):
+ '''
+ Merges data from both input dictionaries based on the link fields. This method will
+ build up a new list containing the merged hits as the items.
+ @param set1: dictionary holding set1 in the form of N lists (one list per attribute name)
+ @param set2: dictionary holding set2 in the form of N lists (one list per attribute name)
+ '''
+ # TODO test for correct input files -> same link_field values should be there
+ # (test at least number of unique link_field values):
+ #
+ # if (len(set1[link_field_set1]) != len(set2[link_field_set2])):
+ # raise Exception('input files should have the same nr of key values ')
+
+
+ merged = []
+ processed = {}
+ for link_field_set1_idx in xrange(len(set1[link_field_set1])):
+ link_field_set1_value = set1[link_field_set1][link_field_set1_idx]
+ if not link_field_set1_value in processed :
+ # keep track of processed items to not repeat them
+ processed[link_field_set1_value] = link_field_set1_value
+
+ # Get the indices for current link_field_set1_value in both data-structures for proper matching
+ set1index = [index for index, value in enumerate(set1[link_field_set1]) if value == link_field_set1_value]
+ set2index = [index for index, value in enumerate(set2[link_field_set2]) if compare_function(value, link_field_set1_value)==True ]
+ # Validation :
+ if len(set2index) == 0:
+ # means that corresponding data could not be found in set2, then throw error
+ raise Exception("Datasets not compatible, merge not possible. " + link_field_set1 + "=" +
+ link_field_set1_value + " only found in first dataset. ")
+
+ merged_hits = []
+ # Combine hits
+ for hit in xrange(len(set1index)):
+ # Create records of hits to be merged ("keys" are the attribute names, so what the lines below do
+ # is create a new "dict" item with same "keys"/attributes, with each attribute filled with its
+ # corresponding value in the sets; i.e.
+ # set1[key] => returns the list/array with size = nrrows, with the values for the attribute
+ # represented by "key".
+ # set1index[hit] => points to the row nr=hit (hit is a rownr/index)
+ # So set1[x][set1index[n]] = set1.attributeX.instanceN
+ #
+ # It just ensures the entry is made available as a plain named array for easy access.
+ rf_record = OrderedDict(zip(set1.keys(), [set1[key][set1index[hit]] for key in set1.keys()]))
+ if relation_type == ONE_TO_ONE :
+ cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[hit]] for key in set2.keys()]))
+ else:
+ # is N to 1:
+ cl_record = OrderedDict(zip(set2.keys(), [set2[key][set2index[0]] for key in set2.keys()]))
+
+ merged_hit = merge_function(rf_record, cl_record, metadata)
+ merged_hits.append(merged_hit)
+
+ merged.append(merged_hits)
+
+ return merged, len(set1index)
+
+
+def _compare_records(key1, key2):
+ '''
+    In this case the compare method is simple: both keys are expected to contain the
+    same value when the records are the same.
+ '''
+ if key1 == key2:
+ return True
+ else:
+ return False
+
+
+
+def _merge_records(rank_caslookup_combi, msclust_quant_record, metadata):
+ '''
+ Combines single records from both the RankFilter+CasLookup combi file and from MsClust file
+
+ @param rank_caslookup_combi: rankfilter and caslookup combined record (see combine_output.py)
+ @param msclust_quant_record: msclust quantification + spectrum record
+ '''
+ record = []
+ for column in rank_caslookup_combi:
+ record.append(rank_caslookup_combi[column])
+
+ for column in msclust_quant_record:
+ record.append(msclust_quant_record[column])
+
+ for column in metadata:
+ record.append(metadata[column])
+
+ # add MOLECULAR MASS (MM)
+ molecular_mass = get_molecular_mass(rank_caslookup_combi['FORMULA'])
+ # limit to two decimals:
+ record.append("{0:.2f}".format(molecular_mass))
+
+ # add MOLECULAR WEIGHT (MW) - TODO - calculate this
+ record.append('0.0')
+
+ # level of identification and Location of reference standard
+ record.append('0')
+ record.append('')
+
+ return record
+
+
+def get_molecular_mass(formula):
+ '''
+ Calculates the molecular mass (MM).
+ E.g. MM of H2O = (relative)atomic mass of H x2 + (relative)atomic mass of O
+ '''
+
+ # Each element is represented by a capital letter, followed optionally by
+ # lower case, with one or more digits as for how many elements:
+    element_pattern = re.compile(r"([A-Z][a-z]?)(\d*)")
+
+ total_mass = 0
+ for (element_name, count) in element_pattern.findall(formula):
+ if count == "":
+ count = 1
+ else:
+ count = int(count)
+        element_mass = float(elements_and_masses_map[element_name])  # Python's built-in float has double precision, which is sufficient here
+ total_mass += element_mass * count
+
+ return total_mass
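+
+# Illustration (exact value depends on the elements_and_masses table shipped with this tool):
+# once init_elements_and_masses_map() has been called, get_molecular_mass('H2O') is roughly
+# 2 * 1.008 + 15.999 ~= 18.02.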
+
+
+
+def _save_data(data, headers, out_csv):
+ '''
+ Writes tab-separated data to file
+ @param data: dictionary containing merged dataset
+ @param out_csv: output csv file
+ '''
+
+ # Open output file for writing
+ outfile_single_handle = open(out_csv, 'wb')
+ output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
+
+ # Write headers
+ output_single_handle.writerow(headers)
+
+ # Write
+ for item_idx in xrange(len(data)):
+ for hit in data[item_idx]:
+ output_single_handle.writerow(hit)
+
+
+def _get_map_for_elements_and_masses(elements_and_masses):
+ '''
+ This method will read out the column 'Chemical symbol' and make a map
+ of this, storing the column 'Relative atomic mass' as its value
+ '''
+ resultMap = {}
+ index = 0
+ for entry in elements_and_masses['Chemical symbol']:
+ resultMap[entry] = elements_and_masses['Relative atomic mass'][index]
+ index += 1
+
+ return resultMap
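+
+# Illustration (hypothetical table): {'Chemical symbol': ['H', 'O'],
+# 'Relative atomic mass': ['1.008', '15.999']} -> {'H': '1.008', 'O': '15.999'}.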
+
+
+def init_elements_and_masses_map():
+ '''
+ Initializes the lookup map containing the elements and their respective masses
+ '''
+ elements_and_masses = _process_data(resource_filename(__name__, "static_resources/elements_and_masses.tab"))
+ global elements_and_masses_map
+ elements_and_masses_map = _get_map_for_elements_and_masses(elements_and_masses)
+
+
+def main():
+ '''
+ Combine Output main function
+
+ RankFilter, CasLookup are already combined by combine_output.py so here we will use
+ this result. Furthermore here the MsClust spectra file (.MSP) and one of the MsClust
+ quantification files are to be combined with combine_output.py result as well.
+ '''
+ rankfilter_and_caslookup_combined_file = sys.argv[1]
+ msclust_quantification_and_spectra_file = sys.argv[2]
+ output_csv = sys.argv[3]
+ # metadata
+ metadata = OrderedDict()
+ metadata['organism'] = sys.argv[4]
+ metadata['tissue'] = sys.argv[5]
+ metadata['experiment_name'] = sys.argv[6]
+ metadata['user_name'] = sys.argv[7]
+ metadata['column_type'] = sys.argv[8]
+
+ # Read RankFilter and CasLookup output files
+ rankfilter_and_caslookup_combined = _process_data(rankfilter_and_caslookup_combined_file)
+ msclust_quantification_and_spectra = _process_data(msclust_quantification_and_spectra_file, ',')
+
+ # Read elements and masses to use for the MW/MM calculation :
+ init_elements_and_masses_map()
+
+ merged, nhits = _merge_data(rankfilter_and_caslookup_combined, 'Centrotype',
+ msclust_quantification_and_spectra, 'centrotype',
+ _compare_records, _merge_records, metadata,
+ N_TO_ONE)
+ headers = rankfilter_and_caslookup_combined.keys() + msclust_quantification_and_spectra.keys() + metadata.keys() + ['MM','MW', 'Level of identification', 'Location of reference standard']
+ _save_data(merged, headers, output_csv)
+
+
+if __name__ == '__main__':
+ main()
diff -r 000000000000 -r dffc38727496 export_to_metexp_tabular.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/export_to_metexp_tabular.xml Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,75 @@
+
+ Create tabular file for loading into METabolomics EXPlorer database
+
+ export_to_metexp_tabular.py
+ $rankfilter_and_caslookup_combi
+ $msclust_quant_file
+ $output_result
+ "$organism"
+ "$tissue"
+ "$experiment_name"
+ "$user_name"
+ "$column_type"
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+Tool to combine output from the tools RankFilter, CasLookup and MsClust
+into a tabular file that can be uploaded to the METabolomics EXPlorer (MetExp) database.
+
+RankFilter and CasLookup output are already combined by the 'RIQC-Combine RankFilter and CasLookup' tool, so this
+tool uses that result.
+
+**Notes**
+
+Extra calculations performed:
+- The columns MM and MW are added here, derived from the FORMULA column found in the combined RankFilter/CasLookup result.
+
+So in total here we merge 2 files and calculate one new column.
+
+
+
+
diff -r 000000000000 -r dffc38727496 library_lookup.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/library_lookup.py Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,327 @@
+'''
+Logic for searching a Retention Index database file given output from NIST
+'''
+import match_library
+import re
+import sys
+import csv
+
+__author__ = "Marcel Kempenaar"
+__contact__ = "brs@nbic.nl"
+__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
+__license__ = "MIT"
+
+def create_lookup_table(library_file, column_type_name, statphase):
+ '''
+ Creates a dictionary holding the contents of the library to be searched
+ @param library_file: library to read
+ @param column_type_name: the columns type name
+ @param statphase: the columns stationary phase
+ '''
+ (data, header) = match_library.read_library(library_file)
+ # Test for presence of required columns
+ if ('columntype' not in header or
+ 'columnphasetype' not in header or
+ 'cas' not in header):
+        raise IOError('Missing columns in ' + library_file)
+
+ column_type_column = header.index("columntype")
+ statphase_column = header.index("columnphasetype")
+ cas_column = header.index("cas")
+
+ filtered_library = [line for line in data if line[column_type_column] == column_type_name
+ and line[statphase_column] == statphase]
+ lookup_dict = {}
+ for element in filtered_library:
+ # Here the cas_number is set to the numeric part of the cas_column value, so if the
+ # cas_column value is 'C1433' then cas_number will be '1433'
+ cas_number = str(re.findall(r'\d+', (element[cas_column]).strip())[0])
+ try:
+ lookup_dict[cas_number].append(element)
+ except KeyError:
+ lookup_dict[cas_number] = [element]
+ return lookup_dict
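+
+# Illustration (hypothetical library rows): two library entries sharing CAS value 'C64175'
+# (and matching the requested column type and stationary phase) end up as
+# lookup_dict['64175'] = [row1, row2]; the numeric part of the CAS value is used as the key.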
+
+
+def _preferred(hits, pref, ctype, polar, model, method):
+ '''
+ Returns all entries in the lookup_dict that have the same column name, type and polarity
+ as given by the user, uses regression if selected given the model and method to use. The
+ regression is applied on the column with the best R-squared value in the model
+ @param hits: all entries in the lookup_dict for the given CAS number
+ @param pref: preferred GC-column, can be one or more names
+ @param ctype: column type (capillary etc.)
+ @param polar: polarity (polar / non-polar etc.)
+ @param model: data loaded from file containing regression models
+ @param method: supported regression method (i.e. poly(nomial) or linear)
+ '''
+ match = []
+ for column in pref:
+ for hit in hits:
+ if hit[4] == ctype and hit[5] == polar and hit[6] == column:
+ # Create copy of found hit since it will be altered downstream
+ match.extend(hit)
+ return match, False
+
+ # No hit found for current CAS number, return if not performing regression
+ if not model:
+ return False, False
+
+ # Perform regression
+ for column in pref:
+ if column not in model:
+ break
+ # Order regression candidates by R-squared value (last element)
+ order = sorted(model[column].items(), key=lambda col: col[1][-1])
+ # Create list of regression candidate column names
+ regress_columns = list(reversed([column for (column, _) in order]))
+ # Names of available columns
+ available = [hit[6] for hit in hits]
+
+ # TODO: combine Rsquared and number of datapoints to get the best regression match
+ '''
+ # Iterate regression columns (in order) and retrieve their models
+ models = {}
+ for col in regress_columns:
+ if col in available:
+ hit = list(hits[available.index(col)])
+ if hit[4] == ctype:
+ # models contains all model data including residuals [-2] and rsquared [-1]
+ models[pref[0]] = model[pref[0]][hit[6]]
+ # Get the combined maximum for residuals and rsquared
+ best_match = models[]
+ # Apply regression
+ if method == 'poly':
+ regressed = _apply_poly_regression(best_match, hit[6], float(hit[3]), model)
+ if regressed:
+ hit[3] = regressed
+ else:
+ return False, False
+ else:
+ hit[3] = _apply_linear_regression(best_match, hit[6], float(hit[3]), model)
+ match.extend(hit)
+ return match, hit[6]
+ '''
+
+ for col in regress_columns:
+ if col in available:
+ hit = list(hits[available.index(col)])
+ if hit[4] == ctype:
+ # Perform regression using a column for which regression is possible
+ if method == 'poly':
+ # Polynomial is only possible within a set border, if the RI falls outside
+ # of this border, skip this lookup
+ regressed = _apply_poly_regression(pref[0], hit[6], float(hit[3]), model)
+ if regressed:
+ hit[3] = regressed
+ else:
+ return False, False
+ else:
+ hit[3] = _apply_linear_regression(pref[0], hit[6], float(hit[3]), model)
+ match.extend(hit)
+ return match, hit[6]
+
+ return False, False
+
+
+
+def default_hit(row, cas_nr, compound_id):
+ '''
+ This method will return a "default"/empty hit for cases where the
+    method _preferred() returns False (i.e. an RI could not be found
+    for the given cas nr, not even via regression).
+ '''
+ return [
+ #'CAS',
+ 'C' + cas_nr,
+ #'NAME',
+ '',
+ #'FORMULA',
+ '',
+ #'RI',
+ '0.0',
+ #'Column.type',
+ '',
+ #'Column.phase.type',
+ '',
+ #'Column.name',
+ '',
+ #'phase.coding',
+ ' ',
+ #'CAS_column.Name',
+ '',
+ #'Centrotype', -> NOTE THAT compound_id is not ALWAYS centrotype...depends on MsClust algorithm used...for now only one MsClust algorithm is used so it is not an issue, but this should be updated/corrected once that changes
+ compound_id,
+ #'Regression.Column.Name',
+ '',
+ #'min',
+ '',
+ #'max',
+ '',
+ #'nr.duplicates',
+ '']
+
+
+def format_result(lookup_dict, nist_tabular_filename, pref, ctype, polar, model, method):
+ '''
+ Looks up the compounds in the library lookup table and formats the results
+ @param lookup_dict: dictionary containing the library to be searched
+ @param nist_tabular_filename: NIST output file to be matched
+ @param pref: (list of) column-name(s) to look for
+ @param ctype: column type of interest
+ @param polar: polarity of the used column
+ @param model: data loaded from file containing regression models
+ @param method: supported regression method (i.e. poly(nomial) or linear)
+ '''
+ (nist_tabular_list, header_clean) = match_library.read_library(nist_tabular_filename)
+ # Retrieve indices of the CAS and compound_id columns (exit if not present)
+ try:
+ casi = header_clean.index("cas")
+ idi = header_clean.index("id")
+    except ValueError:
+        raise IOError("'cas' or 'id' column not found in header of the NIST tabular file")
+
+ data = []
+ for row in nist_tabular_list:
+ casf = str(row[casi].replace('-', '').strip())
+ compound_id = str(row[idi].split('-')[0])
+ if casf in lookup_dict:
+ found_hit, regress = _preferred(lookup_dict[casf], pref, ctype, polar, model, method)
+ if found_hit:
+ # Keep cas nr as 'C'+ numeric part:
+ found_hit[0] = 'C' + casf
+ # Add compound id
+ found_hit.insert(9, compound_id)
+ # Add information on regression process
+ found_hit.insert(10, regress if regress else 'None')
+ # Replace column index references with actual number of duplicates
+ dups = len(found_hit[-1].split(','))
+ if dups > 1:
+ found_hit[-1] = str(dups + 1)
+ else:
+ found_hit[-1] = '0'
+ data.append(found_hit)
+ found_hit = ''
+ else:
+ data.append(default_hit(row, casf, compound_id))
+ else:
+ data.append(default_hit(row, casf, compound_id))
+
+ casf = ''
+ compound_id = ''
+ found_hit = []
+ dups = []
+ return data
+
+
+def _save_data(content, outfile):
+ '''
+ Write to output file
+ @param content: content to write
+ @param outfile: file to write to
+ '''
+ # header
+ header = ['CAS',
+ 'NAME',
+ 'FORMULA',
+ 'RI',
+ 'Column.type',
+ 'Column.phase.type',
+ 'Column.name',
+ 'phase.coding',
+ 'CAS_column.Name',
+ 'Centrotype',
+ 'Regression.Column.Name',
+ 'min',
+ 'max',
+ 'nr.duplicates']
+ output_handle = csv.writer(open(outfile, 'wb'), delimiter="\t")
+ output_handle.writerow(header)
+ for entry in content:
+ output_handle.writerow(entry)
+
+
+def _read_model(model_file):
+ '''
+ Creates an easy to search dictionary for getting the regression parameters
+ for each valid combination of GC-columns
+ @param model_file: filename containing the regression models
+ '''
+ regress = list(csv.reader(open(model_file, 'rU'), delimiter='\t'))
+ if len(regress.pop(0)) > 9:
+ method = 'poly'
+ else:
+ method = 'linear'
+
+ model = {}
+ # Create new dictionary for each GC-column
+ for line in regress:
+ model[line[0]] = {}
+
+ # Add data
+ for line in regress:
+ if method == 'poly':
+ model[line[0]][line[1]] = [float(col) for col in line[2:11]]
+ else: # linear
+ model[line[0]][line[1]] = [float(col) for col in line[2:9]]
+
+ return model, method
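+
+# Illustration (hypothetical GC-column names): model['DB-5']['HP-5'] holds the regression
+# parameters for converting an RI measured on 'HP-5' into the equivalent RI on 'DB-5':
+# 9 values (4 coefficients, left/right limit, nr. of residuals, p-value, R-squared) for
+# 'poly' models, 7 values for linear ones.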
+
+
+def _apply_poly_regression(column1, column2, retention_index, model):
+ '''
+ Calculates a new retention index (RI) value using a given 3rd-degree polynomial
+ model based on data from GC columns 1 and 2
+ @param column1: name of the selected GC-column
+ @param column2: name of the GC-column to use for regression
+ @param retention_index: RI to convert
+ @param model: dictionary containing model information for all GC-columns
+ '''
+ coeff = model[column1][column2]
+ # If the retention index to convert is within range of the data the model is based on, perform regression
+ if coeff[4] < retention_index < coeff[5]:
+ return (coeff[3] * (retention_index ** 3) + coeff[2] * (retention_index ** 2) +
+ (retention_index * coeff[1]) + coeff[0])
+ else:
+ return False
+
+
+def _apply_linear_regression(column1, column2, retention_index, model):
+ '''
+ Calculates a new retention index (RI) value using a given linear model based on data
+ from GC columns 1 and 2
+ @param column1: name of the selected GC-column
+ @param column2: name of the GC-column to use for regression
+ @param retention_index: RI to convert
+ @param model: dictionary containing model information for all GC-columns
+ '''
+    # TODO: the min/max limits stored in the model are not applied here (unlike the polynomial case)
+ coeff = model[column1][column2]
+ return coeff[1] * retention_index + coeff[0]
+
+
+def main():
+ '''
+ Library Lookup main function
+ '''
+ library_file = sys.argv[1]
+ nist_tabular_filename = sys.argv[2]
+ ctype = sys.argv[3]
+ polar = sys.argv[4]
+ outfile = sys.argv[5]
+ pref = sys.argv[6:-1]
+ regress = sys.argv[-1]
+
+ if regress != 'False':
+ model, method = _read_model(regress)
+ else:
+ model, method = False, None
+
+ lookup_dict = create_lookup_table(library_file, ctype, polar)
+ data = format_result(lookup_dict, nist_tabular_filename, pref, ctype, polar, model, method)
+
+ _save_data(data, outfile)
+
+
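+# Illustrative usage sketch (all names below are placeholders), matching the
+# positional arguments parsed in main() above:
+#
+#   python library_lookup.py <library_file> <nist_tabular_file> <column_type> \
+#       <polarity> <output_file> <preferred_column_name ...> <regression_model_file|False>
+#
+# The last argument is either a file with regression models or the literal
+# string 'False' when no regression should be applied.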
+if __name__ == "__main__":
+ main()
diff -r 000000000000 -r dffc38727496 library_lookup.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/library_lookup.xml Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,75 @@
+
+ Look up or estimate the RI using a library of known RI values per CAS number
+
+ library_lookup.py
+ $library_file
+ $input
+ "$col_type"
+ "$polarity"
+ $output
+ #for $ctype in $pref
+ ${ctype.columntype}
+ #end for
+ $regression.model
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Performs a lookup of the RI values by matching CAS numbers from the given NIST identifications file to a library.
+If a direct match is NOT found for the preferred column name, a regression can be done to estimate
+the theoretical RI value based on known RI values for the CAS number on other column types (see step 4).
+If there is no match for the CAS number on any column type, the record is not given an RI.
+
+
+
+
+
+
diff -r 000000000000 -r dffc38727496 match_library.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/match_library.py Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,133 @@
+'''
+Functions in this module are called from Galaxy to populate lists/checkboxes with selectable items
+'''
+import csv
+import glob
+import os
+
+
+__author__ = "Marcel Kempenaar"
+__contact__ = "brs@nbic.nl"
+__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
+__license__ = "MIT"
+
+def get_column_type(library_file):
+ '''
+ Returns a Galaxy formatted list of tuples containing all possibilities for the
+ GC-column types. Used by the library_lookup.xml tool
+ @param library_file: given library file from which the list of GC-column types is extracted
+ '''
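+    # Note added for clarity: each returned tuple is assumed to follow the Galaxy
+    # dynamic-options convention (display label, value, selected), e.g. a
+    # hypothetical ("capillary(12)", "capillary", False) shows "capillary(12)"
+    # in the drop-down list and passes "capillary" as the selected value.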
+ if library_file == "":
+ galaxy_output = [("", "", False)]
+ else:
+ (data, header) = read_library(library_file)
+
+ if 'columntype' not in header:
+            raise IOError('Missing columns in ' + library_file)
+
+ # Filter data on column type
+ column_type = header.index("columntype")
+ amounts_in_list_dict = count_occurrence([row[column_type] for row in data])
+ galaxy_output = [(str(a) + "(" + str(b) + ")", a, False) for a, b in amounts_in_list_dict.items()]
+
+ return(galaxy_output)
+
+
+def filter_column(library_file, column_type_name):
+ '''
+ Filters the Retention Index database on column type
+ @param library_file: file containing the database
+ @param column_type_name: column type to filter on
+ '''
+ if library_file == "":
+ galaxy_output = [("", "", False)]
+ else:
+ (data, header) = read_library(library_file)
+
+ if ('columntype' not in header or
+ 'columnphasetype' not in header):
+            raise IOError('Missing columns in ' + library_file)
+
+ column_type = header.index("columntype")
+ statphase = header.index("columnphasetype")
+
+        # Filter data on column type name
+ statphase_list = [line[statphase] for line in data if line[column_type] == column_type_name]
+ amounts_in_list_dict = count_occurrence(statphase_list)
+ galaxy_output = [(str(a) + "(" + str(b) + ")", a, False)for a, b in amounts_in_list_dict.items()]
+
+ return(sorted(galaxy_output))
+
+
+def filter_column2(library_file, column_type_name, statphase):
+ '''
+ Filters the Retention Index database on column type
+ @param library_file: file containing the database
+ @param column_type_name: column type to filter on
+ @param statphase: stationary phase of the column to filter on
+ '''
+ if library_file == "":
+ galaxy_output = [("", "", False)]
+ else:
+ (data, header) = read_library(library_file)
+
+ if ('columntype' not in header or
+ 'columnphasetype' not in header or
+ 'columnname' not in header):
+            raise IOError('Missing columns in ' + library_file)
+
+ column_type_column = header.index("columntype")
+ statphase_column = header.index("columnphasetype")
+ column_name_column = header.index("columnname")
+
+ # Filter data on given column type name and stationary phase
+ statphase_list = [line[column_name_column] for line in data if line[column_type_column] == column_type_name and
+ line[statphase_column] == statphase]
+ amounts_in_list_dict = count_occurrence(statphase_list)
+ galaxy_output = [(str(a) + "(" + str(b) + ")", a, False)for a, b in amounts_in_list_dict.items()]
+
+ return(sorted(galaxy_output))
+
+
+def read_library(filename):
+ '''
+ Reads a CSV file and returns its contents and a normalized header
+ @param filename: file to read
+ '''
+ data = list(csv.reader(open(filename, 'rU'), delimiter='\t'))
+ header_clean = [i.lower().strip().replace(".", "").replace("%", "") for i in data.pop(0)]
+ return(data, header_clean)
+
+
+
+def get_directory_files(dir_name):
+ '''
+    Reads the given directory and returns the files found as a list of
+    (file name, full path, selected) tuples, so that the result can be used
+    to fill a Galaxy drop-down combo box.
+
+ '''
+ files = glob.glob(dir_name + "/*.*")
+ if len(files) == 0:
+        # Configuration error: no library files found in the given directory:
+ galaxy_output = [("Configuration error: expected file not found in /" + dir_name, "", False)]
+ else:
+ galaxy_output = [(str(get_file_name_no_ext(file_name)), str(os.path.abspath(file_name)), False) for file_name in files]
+ return(galaxy_output)
+
+def get_file_name_no_ext(full_name):
+ '''
+    Returns the file name without its directory path and extension
+ '''
+ simple_name = os.path.basename(full_name)
+ base, ext = os.path.splitext(simple_name)
+ return base
+
+
+def count_occurrence(data_list):
+ '''
+ Counts occurrences in a list and returns a dict with item:occurrence
+ @param data_list: list to count items from
+ '''
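+    # Illustrative example (not in the original code):
+    #   count_occurrence(['a', 'b', 'a']) returns {'a': 2, 'b': 1}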
+ return dict((key, data_list.count(key)) for key in set(data_list))
diff -r 000000000000 -r dffc38727496 metaMS_cmd_annotate.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metaMS_cmd_annotate.r Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,86 @@
+## read args:
+args <- commandArgs(TRUE)
+## the constructed DB, e.g. "E:/Rworkspace/metaMS/data/LCDBtest.RData"
+args.constructedDB <- args[1]
+## data file in xset format:
+args.xsetData <- args[2]
+## settings file, e.g. "E:/Rworkspace/metaMS/data/settings.r", should contain assignment to an object named "customMetaMSsettings"
+args.settings <- args[3]
+
+## output file names, e.g. "E:/Rworkspace/metaMS/data/out.txt"
+args.outAnnotationTable <- args[4]
+
+args.mass_error_function <- args[5]
+if (args.mass_error_function == "0")
+ args.mass_error_function <- NULL
+## report files
+args.htmlReportFile <- args[6]
+args.htmlReportFile.files_path <- args[7]
+
+if (length(args) == 8)
+{
+ args.outLogFile <- args[8]
+ # suppress messages:
+ # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888
+ msg <- file(args.outLogFile, open="wt")
+ sink(msg, type="message")
+ sink(msg, type="output")
+}
+
+cat("\nSettings used===============:\n")
+cat(readChar(args.settings, 1e5))
+
+
+tryCatch(
+ {
+ library(metaMS)
+
+ ## load the constructed DB :
+ tempEnv <- new.env()
+ testDB <- load(args.constructedDB, envir=tempEnv)
+ xsetData <- readRDS(args.xsetData)
+
+ ## load settings "script" into "customMetaMSsettings"
+ source(args.settings, local=tempEnv)
+ message(paste(" loaded : ", args.settings))
+
+ # Just to highlight: if you want to use more than one
+ # trigger runLC:
+ LC <- runLC(xset=xsetData, settings = tempEnv[["customMetaMSsettings"]], DB = tempEnv[[testDB[1]]]$DB, errf=args.mass_error_function, nSlaves=20, returnXset = TRUE)
+
+ # write out runLC annotation results:
+ write.table(LC$PeakTable, args.outAnnotationTable, sep="\t", row.names=FALSE)
+
+ # the used constructed DB (write to log):
+ cat("\nConstructed DB info===============:\n")
+ str(tempEnv[[testDB[1]]]$Info)
+ cat("\nConstructed DB table===============:\n")
+ if (length(args) == 8)
+ {
+ write.table(tempEnv[[testDB[1]]]$DB, args.outLogFile, append=TRUE, row.names=FALSE)
+ write.table(tempEnv[[testDB[1]]]$Reftable, args.outLogFile, sep="\t", append=TRUE, row.names=FALSE)
+ }
+
+ message("\nGenerating report.........")
+ # report
+ dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE)
+    html <- "\n"
+ # TODO add (nr and mass error) and group size
+
+ message("\nPlotting figures... ")
+ figureName <- paste(args.htmlReportFile.files_path, "/figure_retcor.png", sep="")
+ html <- paste(html," ", sep="")
+ png( figureName, type="cairo", width=1100,height=600 )
+ retcor(LC$xset@xcmsSet, method="peakgroups", plottype = "mdevden")
+ html <- paste(html,"*NB: retention time correction plot based on 'peakgroups' option with default settings. This is not the plot matching the exact settings used in the run,
+ but just intended to give a rough estimate of the retention time shifts present in the data. A more accurate plot will be available once
+ this option is added in metaMS API. ", sep="")
+ devname = dev.off()
+
+
+ gt <- groups(LC$xset@xcmsSet)
+ groupidx1 <- which(gt[,"rtmed"] > 0 & gt[,"rtmed"] < 3000 & gt[,"npeaks"] > 3)
+
+ html <- paste(html,"")
+ message("finished generating report")
+ write(html,file=args.htmlReportFile)
+ # unlink(args.htmlReportFile)
+ cat("\nWarnings================:\n")
+ str( warnings() )
+ },
+ error=function(cond) {
+ sink(NULL, type="message") # default setting
+ sink(stderr(), type="output")
+ message("\nERROR: ===========\n")
+ print(cond)
+ }
+ )
diff -r 000000000000 -r dffc38727496 metams_lcms_annotate.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metams_lcms_annotate.xml Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,121 @@
+
+ Runs metaMS process for LC/MS feature annotation
+
+ R_bioc_metams
+
+
+ metaMS_cmd_annotate.r
+ $constructed_db
+ $xsetData
+ $customMetaMSsettings
+ $outputFile
+ #if $mzTol.mzTolType == "fixed"
+ 0
+ #else
+ "$mzTol.mass_error_function"
+ #end if
+ $htmlReportFile
+ $htmlReportFile.files_path
+ $outputLog
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+## start comment
+ ## metaMS process settings
+ customMetaMSsettings <- metaMSsettings(protocolName = "${protocolName}",
+ chrom = "LC")
+metaSetting(customMetaMSsettings, "match2DB") <- list(
+ rtdiff = ${rtdiff},
+ rtval = ${rtval},
+ #if $mzTol.mzTolType == "fixed"
+ mzdiff = ${mzTol.mzdiff},
+ #else
+ ppm = ${mzTol.ppm},
+ #end if
+ minfeat = ${minfeat})
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+Runs metaMS process for LC/MS feature annotation based on matching to an existing 'standards' DB.
+The figure below shows the main parts of this metaMS process.
+
+.. image:: $PATH_TO_IMAGES/metaMS_annotate.png
+
+
+.. class:: infomark
+
+The implemented annotation strategy can be broken down in the following steps:
+
+1. *Feature-wise Annotation:* Each feature detected by runLC is matched against the database. If
+the mass error function is provided, the appropriate m/z tolerance is calculated; otherwise a fixed
+tolerance is used (mzdiff). The retention time tolerance is fixed and should be selected on the
+basis of the characteristics of each chromatographic method (rtdiff). Multiple annotations - i.e.
+features which are associated with more than one compound - are possible. This outcome does not
+indicate a problem per se, but is an inherent drawback of co-elution.
+
+2. *Annotation Validation:* The annotated features are organized in 'pseudospectra' collecting all
+the experimental features which are assigned to a specific compound. A specific annotation is
+confirmed only if more than minfeat features which differ in retention time by less than rtval are
+present in a pseudospectrum. As a general rule rtval should be narrower than rtdiff. The
+latter, indeed, accounts for shifts in retention time between the injection of the standards and the
+metabolomics experiment under investigation. This time can be rather long, considering that the
+standards are not commonly re-analyzed each time. On the other hand, rtval represents the shift
+between the ions of the same compound within the same batch of injections and therefore it only
+has to account for the smaller shifts occurring during peak picking and alignment.
+
+
+
+
+ 10.1016/j.jchromb.2014.02.051
+
+
\ No newline at end of file
diff -r 000000000000 -r dffc38727496 metams_lcms_pick_and_group.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/metams_lcms_pick_and_group.xml Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,301 @@
+
+ Runs metaMS process for LC/MS feature picking, aligning and grouping
+
+ R_bioc_metams
+
+
+ metaMS_cmd_pick_and_group.r
+ $data_files
+ $customMetaMSsettings
+ $outputFile
+ $xsetOut
+ $polarity
+ $htmlReportFile
+ $htmlReportFile.files_path
+ $outputLog
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+## ====================================
+ ## metaMS process settings
+ customMetaMSsettings <- metaMSsettings(protocolName = "${protocolName}",
+ chrom = "LC",
+ PeakPicking = list(
+ method = "${peakPicking.method}",
+ #if $peakPicking.method == "matchedFilter"
+ fwhm = ${peakPicking.fwhm},
+ sigma = ${peakPicking.fwhm}/${peakPicking.sigma_denom},
+ max = ${peakPicking.max},
+ snthresh = ${peakPicking.snthresh},
+ step = ${peakPicking.step},
+ steps = ${peakPicking.steps},
+ mzdiff = ${peakPicking.mzdiff}),
+ #else
+ ppm = ${peakPicking.ppm},
+ peakwidth = c(${peakPicking.peakwidth}),
+ snthresh = ${peakPicking.snthresh},
+ prefilter = c(${peakPicking.prefilter}),
+ mzCenterFun = "${peakPicking.mzCenterFun}",
+ integrate = ${peakPicking.integrate},
+ mzdiff = ${peakPicking.mzdiff},
+ fitgauss = ${peakPicking.fitgauss},
+ noise = ${peakPicking.noise}),
+ #end if
+ Alignment = list(
+ min.class.fraction = ${min_class_fraction},
+ min.class.size = ${min_class_size},
+ mzwid = ${mzwid},
+ bws = c(${bws}),
+ retcormethod = "${retcor.retcormethod}",
+ #if $retcor.retcormethod == "peakgroups"
+ smooth = "${retcor.smooth}",
+ missingratio = ${retcor.missingratio},
+ extraratio = ${retcor.extraratio},
+ retcorfamily = "${retcor.retcorfamily}",
+ #else
+ ##repeating the method as workaround/ backwards compatibility (can remove this one after fix from metaMS):
+ method = "${retcor.retcormethod}",
+ profStep = ${retcor.profStep},
+ #end if
+ fillPeaks = ${fillPeaks}),
+ CAMERA = list(
+ perfwhm = ${perfwhm},
+ cor_eic_th = ${cor_eic_th},
+ ppm= ${ppm},
+ graphMethod= "${groupCorr_graphMethod}",
+ pval= ${groupCorr_pval},
+ calcCiS= ${groupCorr_calcCiS},
+ calcIso= ${groupCorr_calcIso},
+ calcCaS= ${groupCorr_calcCaS},
+ maxcharge= ${findIsotopes_maxcharge},
+ maxiso= ${findIsotopes_maxiso},
+ minfrac= ${findIsotopes_minfrac},
+ multiplier= ${findAdducts_multiplier}
+ ))
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+Runs metaMS process for LC/MS feature picking, aligning and grouping.
+This part of the metaMS process makes use of the XCMS and CAMERA tools and algorithms.
+CAMERA is used for automatic deconvolution/annotation of LC/ESI-MS data.
+The figure below shows the main parts of the metaMS process wrapped by this tool.
+
+.. image:: $PATH_TO_IMAGES/metaMS_pick_align_camera.png
+
+
+From CAMERA documentation:
+
+.. image:: $PATH_TO_IMAGES/CAMERA_results.png
+
+**References**
+
+If you use this Galaxy tool in work leading to a scientific publication please
+cite the following papers:
+
+Wehrens, R.; Weingart, G.; Mattivi, F. (2014).
+metaMS: an open-source pipeline for GC-MS-based untargeted metabolomics.
+Journal of chromatography B: biomedical sciences and applications, 996 (1): 109-116.
+doi: 10.1016/j.jchromb.2014.02.051
+handle: http://hdl.handle.net/10449/24012
+
+Wrapper by Pieter Lukasse.
+
+
+
+
+ 10.1016/j.jchromb.2014.02.051
+
+
\ No newline at end of file
diff -r 000000000000 -r dffc38727496 primsfilters.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/primsfilters.py Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,44 @@
+import logging
+log = logging.getLogger( __name__ )
+
+
+def restrict_prims_metabolomics( context, tool ):
+ """
+ This tool filter will hide prims_metabolomics tools for non-metabolomics users.
+ This can be enabled by adding the following to the
+ ``app:main`` section of ``universe_wsgi.ini``::
+
+ tool_filters = primsfilters:restrict_prims_metabolomics
+
+ and by adding this file to the folder:
+
+ /lib/galaxy/tools/filters
+
+    This is optional and can be used in case some control is desired over who
+    gets to see the prims_metabolomics tools. When not using this file and the
+ settings mentioned above, all prims_metabolomics tools will be visible to
+ all users.
+ """
+ # for debugging: import pydevd;pydevd.settrace("L0136815.wurnet.nl")
+ user = context.trans.user
+ metabolomics_tools = [ "msclust2", "combine_output", "create_poly_model", "lookup_library",
+ "NDIStext2tabular", "rankfilterGCMS_tabular", "filter_on_rank",
+ "export_to_metexp_tabular", "query_metexp" ]
+ found_match = False
+    # iterate over the (partial) tool ids and look for a match (this is compatible with tool shed ids):
+ for partial_id in metabolomics_tools:
+ if tool.id.find("/"+ partial_id + "/") >= 0:
+ found_match = True
+ break
+ # the second part of this if is compatible with the ids when NOT using tool shed:
+ if found_match or tool.id in metabolomics_tools:
+ # logging.warn( 'FILTER MATCHED: %s' %(tool.name))
+
+ for user_role in user.roles:
+ if user_role.role.name == "PRIMS_METABOLOMICS":
+ return True
+ # not found to have the role, return false:
+ return False
+ else:
+ # return true for any other tool
+ return True
\ No newline at end of file
diff -r 000000000000 -r dffc38727496 query_mass_repos.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/query_mass_repos.py Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,289 @@
+#!/usr/bin/env python
+# encoding: utf-8
+'''
+Module to query a set of accurate mass values detected by high-resolution mass spectrometers
+against various repositories/services such as the METabolomics EXPlorer database or the
+MFSearcher service (http://webs2.kazusa.or.jp/mfsearcher/).
+
+It will take the input file and for each record it will query the
+molecular mass in the selected repository/service. If one or more compounds are found
+then extra information regarding these compounds is added to the output file.
+
+The output file is thus the input file enriched with information about
+related items found in the selected repository/service.
+
+The service should implement the following interface:
+
+http://service_url/mass?targetMs=500&margin=1&marginUnit=ppm&output=txth (txth means there is guaranteed to be a header line before the data)
+
+The output should be tab separated and should contain the following columns (in this order)
+db-name molecular-formula dbe formula-weight id description
+
+
+'''
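+
+# Illustrative sketch (the hit shown is made up): following the interface described
+# above, a query could look like
+#
+#   http://service_url/mass?targetMs=500&margin=1&marginUnit=ppm&output=txth
+#
+# and its tab-separated response is expected to consist of a header line followed
+# by one hit per line, roughly like:
+#
+#   # db-name   molecular-formula   dbe   formula-weight   id       description
+#   KEGG        C6H12O6             1.0   180.06339        C00031   D-Glucose
+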
+import csv
+import sys
+import fileinput
+import urllib2
+import time
+from collections import OrderedDict
+
+__author__ = "Pieter Lukasse"
+__contact__ = "pieter.lukasse@wur.nl"
+__copyright__ = "Copyright, 2014, Plant Research International, WUR"
+__license__ = "Apache v2"
+
+def _process_file(in_xsv, delim='\t'):
+ '''
+ Generic method to parse a tab-separated file returning a dictionary with named columns
+    @param in_xsv: input filename to be parsed
+ '''
+ data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim))
+ return _process_data(data)
+
+def _process_data(data):
+
+ header = data.pop(0)
+ # Create dictionary with column name as key
+ output = OrderedDict()
+ for index in xrange(len(header)):
+ output[header[index]] = [row[index] for row in data]
+ return output
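+
+# Illustrative example (column names are hypothetical): for a tab-separated input
+#
+#   mass    name
+#   180.06  compound_x
+#   342.11  compound_y
+#
+# _process_file returns OrderedDict([('mass', ['180.06', '342.11']),
+#                                    ('name', ['compound_x', 'compound_y'])]).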
+
+
+def _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit):
+
+ '''
+ This method will iterate over the record in the input_data and
+ will enrich them with the related information found (if any) in the
+ chosen repository/service
+
+ # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies
+ '''
+ merged = []
+
+ for i in xrange(len(input_data[input_data.keys()[0]])):
+ # Get the record in same dictionary format as input_data, but containing
+ # a value at each column instead of a list of all values of all records:
+ input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))
+
+ # read the molecular mass :
+ molecular_mass = input_data_record[molecular_mass_col]
+
+
+ # search for related records in repository/service:
+ data_found = None
+ if molecular_mass != "":
+ molecular_mass = float(molecular_mass)
+
+ # 1- search for data around this MM:
+ query_link = repository_dblink + "/mass?targetMs=" + str(molecular_mass) + "&margin=" + str(error_margin) + "&marginUnit=" + margin_unit + "&output=txth"
+
+ data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
+ data_type_found = "MM"
+
+
+ if data_found == None:
+ # If still nothing found, just add empty columns
+ extra_cols = ['', '','','','','']
+ else:
+ # Add info found:
+ extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)
+
+ # Take all data and merge it into a "flat"/simple array of values:
+ field_values_list = _merge_data(input_data_record, extra_cols)
+
+ merged.append(field_values_list)
+
+ # return the merged/enriched records:
+ return merged
+
+
+def _get_extra_info_and_link_cols(data_found, data_type_found, query_link):
+ '''
+ This method will go over the data found and will return a
+ list with the following items:
+ - details of hits found :
+ db-name molecular-formula dbe formula-weight id description
+ - Link that executes same query
+
+ '''
+
+ # set() makes a unique list:
+ db_name_set = []
+ molecular_formula_set = []
+ id_set = []
+ description_set = []
+
+
+ if 'db-name' in data_found:
+ db_name_set = set(data_found['db-name'])
+ elif '# db-name' in data_found:
+ db_name_set = set(data_found['# db-name'])
+ if 'molecular-formula' in data_found:
+ molecular_formula_set = set(data_found['molecular-formula'])
+ if 'id' in data_found:
+ id_set = set(data_found['id'])
+ if 'description' in data_found:
+ description_set = set(data_found['description'])
+
+ result = [data_type_found,
+ _to_xsv(db_name_set),
+ _to_xsv(molecular_formula_set),
+ _to_xsv(id_set),
+ _to_xsv(description_set),
+ #To let Excel interpret as link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"):
+ "=HYPERLINK(\""+ query_link + "\", \"Link to entries found in DB \")"]
+ return result
+
+
+def _to_xsv(data_set):
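+    # Joins the items of the given set into a single string, each item followed by '|'.
+    # Illustrative example: _to_xsv(set(['a', 'b'])) could return 'a|b|' (set order is arbitrary).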
+ result = ""
+ for item in data_set:
+ result = result + str(item) + "|"
+ return result
+
+
+def _fire_query_and_return_dict(url):
+ '''
+ This method will fire the query as a web-service call and
+ return the results as a list of dictionary objects
+ '''
+
+ try:
+ data = urllib2.urlopen(url).read()
+
+ # transform to dictionary:
+ result = []
+ data_rows = data.split("\n")
+
+        # remove comment lines, if any (only keep the one that contains the word "molecular-formula"...compatible with the Kazusa service):
+ data_rows_to_remove = []
+ for data_row in data_rows:
+ if data_row == "" or (data_row[0] == '#' and "molecular-formula" not in data_row):
+ data_rows_to_remove.append(data_row)
+
+ for data_row in data_rows_to_remove:
+ data_rows.remove(data_row)
+
+ # check if there is any data in the response:
+ if len(data_rows) <= 1 or data_rows[1].strip() == '':
+ # means there is only the header row...so no hits:
+ return None
+
+ for data_row in data_rows:
+ if not data_row.strip() == '':
+ row_as_list = _str_to_list(data_row, delimiter='\t')
+ result.append(row_as_list)
+
+ # return result processed into a dict:
+ return _process_data(result)
+
+ except urllib2.HTTPError, e:
+ raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason)
+ except urllib2.URLError, e:
+ raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if service [" + url + "] is accessible from your Galaxy server. ")
+
+def _str_to_list(data_row, delimiter='\t'):
+ result = []
+ for column in data_row.split(delimiter):
+ result.append(column)
+ return result
+
+
+# alternative: ?
+# s = requests.Session()
+# s.verify = False
+# #s.auth = (token01, token02)
+# resp = s.get(url, params={'name': 'anonymous'}, stream=True)
+# content = resp.content
+# # transform to dictionary:
+
+
+
+
+def _merge_data(input_data_record, extra_cols):
+ '''
+ Adds the extra information to the existing data record and returns
+ the combined new record.
+ '''
+ record = []
+ for column in input_data_record:
+ record.append(input_data_record[column])
+
+
+ # add extra columns
+ for column in extra_cols:
+ record.append(column)
+
+ return record
+
+
+def _save_data(data_rows, headers, out_csv):
+ '''
+ Writes tab-separated data to file
+ @param data_rows: dictionary containing merged/enriched dataset
+ @param out_csv: output csv file
+ '''
+
+ # Open output file for writing
+ outfile_single_handle = open(out_csv, 'wb')
+ output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
+
+ # Write headers
+ output_single_handle.writerow(headers)
+
+ # Write one line for each row
+ for data_row in data_rows:
+ output_single_handle.writerow(data_row)
+
+def _get_repository_URL(repository_file):
+ '''
+ Read out and return the URL stored in the given file.
+ '''
+ file_input = fileinput.input(repository_file)
+ try:
+ for line in file_input:
+ if line[0] != '#':
+ # just return the first line that is not a comment line:
+ return line
+ finally:
+ file_input.close()
+
+
+def main():
+ '''
+ Query main function
+
+ The input file can be any tabular file, as long as it contains a column for the molecular mass.
+ This column is then used to query against the chosen repository/service Database.
+ '''
+ seconds_start = int(round(time.time()))
+
+ input_file = sys.argv[1]
+ molecular_mass_col = sys.argv[2]
+ repository_file = sys.argv[3]
+ error_margin = float(sys.argv[4])
+ margin_unit = sys.argv[5]
+ output_result = sys.argv[6]
+
+ # Parse repository_file to find the URL to the service:
+ repository_dblink = _get_repository_URL(repository_file)
+
+ # Parse tabular input file into dictionary/array:
+ input_data = _process_file(input_file)
+
+ # Query data against repository :
+ enriched_data = _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit)
+ headers = input_data.keys() + ['SEARCH hits for ','SEARCH hits: db-names', 'SEARCH hits: molecular-formulas ',
+               'SEARCH hits: ids', 'SEARCH hits: descriptions', 'Link to SEARCH hits']  # TODO - add min and max formula weight columns
+
+ _save_data(enriched_data, headers, output_result)
+
+ seconds_end = int(round(time.time()))
+ print "Took " + str(seconds_end - seconds_start) + " seconds"
+
+
+
+if __name__ == '__main__':
+ main()
diff -r 000000000000 -r dffc38727496 query_mass_repos.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/query_mass_repos.xml Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,106 @@
+
+ Query multiple public repositories for elemental compositions from accurate mass values detected by high-resolution mass spectrometers
+
+ query_mass_repos.py
+ $input_file
+ "$molecular_mass_col"
+ "$repository_file"
+ $error_margin
+ $margin_unit
+ $output_result
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+This tool will query multiple public repositories such as PRI-MetExp or http://webs2.kazusa.or.jp/mfsearcher
+for elemental compositions from accurate mass values detected by high-resolution mass spectrometers.
+
+It will take the input file and for each record it will query the
+molecular mass in the selected repository. If one or more compounds are found in the
+repository then extra information regarding (mass-based) matching elemental composition formulas is added to the output file.
+
+The output file is thus the input file enriched with information about
+related items found in the selected repository.
+
+**Notes**
+
+The input file can be any tabular file, as long as it contains a column for the molecular mass.
+
+**Services that can be queried**
+
+================= =========================================================================
+Database          Description
+----------------- -------------------------------------------------------------------------
+PRI-MetExp        LC-MS and GC-MS data from experiments from the metabolomics group at
+                  Plant Research International. NB: restricted access to employees with
+                  access key.
+ExactMassDB       A database of possible elemental compositions consisting of C: 100,
+                  H: 200, O: 50, N: 10, P: 10, and S: 10, that satisfy the Senior and
+                  the Lewis valence rules.
+                  (via /mfsearcher/exmassdb/)
+ExactMassDB-HR2   HR2, which is one of the fastest tools for calculation of elemental
+                  compositions, filters some elemental compositions according to
+                  the Seven Golden Rules (Kind and Fiehn, 2007). The ExactMassDB-HR2
+                  database returns the same result as does HR2 with the same atom kind
+                  and number condition as that used in construction of the ExactMassDB.
+                  (via /mfsearcher/exmassdb-hr2/)
+Pep1000           A database of possible linear polypeptides that are
+                  constructed with 20 kinds of amino acids and having molecular
+                  weights smaller than 1000.
+                  (via /mfsearcher/pep1000/)
+KEGG              Re-calculated compound data from KEGG. Weekly updated.
+                  (via /mfsearcher/kegg/)
+KNApSAcK          Re-calculated compound data from KNApSAcK.
+                  (via /mfsearcher/knapsack/)
+Flavonoid Viewer  Re-calculated compound data from Flavonoid Viewer.
+                  (via /mfsearcher/flavonoidviewer/)
+LipidMAPS         Re-calculated compound data from LIPID MAPS.
+                  (via /mfsearcher/lipidmaps/)
+HMDB              Re-calculated compound data from Human Metabolome Database (HMDB)
+                  Version 3.5.
+                  (via /mfsearcher/hmdb/)
+PubChem           Re-calculated compound data from PubChem. Monthly updated.
+                  (via /mfsearcher/pubchem/)
+================= =========================================================================
+
+Sources for table above: PRI-MetExp and http://webs2.kazusa.or.jp/mfsearcher
+
+
+
diff -r 000000000000 -r dffc38727496 query_metexp.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/query_metexp.py Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,282 @@
+#!/usr/bin/env python
+# encoding: utf-8
+'''
+Module to query a set of identifications against the METabolomics EXPlorer database.
+
+It will take the input file and for each record it will query the
+molecular mass in the selected MetExp DB. If one or more compounds are found in the
+MetExp DB then extra information regarding these compounds is added to the output file.
+
+The output file is thus the input file enriched with information about
+related items found in the selected MetExp DB.
+'''
+import csv
+import sys
+import fileinput
+import urllib2
+import time
+from collections import OrderedDict
+
+__author__ = "Pieter Lukasse"
+__contact__ = "pieter.lukasse@wur.nl"
+__copyright__ = "Copyright, 2014, Plant Research International, WUR"
+__license__ = "Apache v2"
+
+def _process_file(in_xsv, delim='\t'):
+ '''
+ Generic method to parse a tab-separated file returning a dictionary with named columns
+    @param in_xsv: input filename to be parsed
+ '''
+ data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim))
+ return _process_data(data)
+
+def _process_data(data):
+
+ header = data.pop(0)
+ # Create dictionary with column name as key
+ output = OrderedDict()
+ for index in xrange(len(header)):
+ output[header[index]] = [row[index] for row in data]
+ return output
+
+
+def _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method):
+ '''
+ This method will iterate over the record in the input_data and
+ will enrich them with the related information found (if any) in the
+ MetExp Database.
+
+ # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies
+ '''
+ merged = []
+
+ for i in xrange(len(input_data[input_data.keys()[0]])):
+ # Get the record in same dictionary format as input_data, but containing
+ # a value at each column instead of a list of all values of all records:
+ input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))
+
+ # read the molecular mass and formula:
+ cas_id = input_data_record[casid_col]
+ formula = input_data_record[formula_col]
+ molecular_mass = input_data_record[molecular_mass_col]
+
+ # search for related records in MetExp:
+ data_found = None
+ if cas_id != "undef":
+ # 1- search for other experiments where this CAS id has been found:
+ query_link = metexp_dblink + "/find_entries/query?cas_nr="+ cas_id + "&method=" + separation_method
+ data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
+ data_type_found = "CAS"
+ if data_found == None:
+ # 2- search for other experiments where this FORMULA has been found:
+ query_link = metexp_dblink + "/find_entries/query?molecule_formula="+ formula + "&method=" + separation_method
+ data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
+ data_type_found = "FORMULA"
+ if data_found == None:
+ # 3- search for other experiments where this MM has been found:
+ query_link = metexp_dblink + "/find_entries/query?molecule_mass="+ molecular_mass + "&method=" + separation_method
+ data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
+ data_type_found = "MM"
+
+ if data_found == None:
+ # If still nothing found, just add empty columns
+ extra_cols = ['', '','','','','','','']
+ else:
+ # Add info found:
+ extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)
+
+ # Take all data and merge it into a "flat"/simple array of values:
+ field_values_list = _merge_data(input_data_record, extra_cols)
+
+ merged.append(field_values_list)
+
+ # return the merged/enriched records:
+ return merged
+
+
+def _get_extra_info_and_link_cols(data_found, data_type_found, query_link):
+ '''
+ This method will go over the data found and will return a
+ list with the following items:
+ - Experiment details where hits have been found :
+ 'organism', 'tissue','experiment_name','user_name','column_type'
+ - Link that executes same query
+
+ '''
+ # set() makes a unique list:
+ organism_set = []
+ tissue_set = []
+ experiment_name_set = []
+ user_name_set = []
+ column_type_set = []
+ cas_nr_set = []
+
+ if 'organism' in data_found:
+ organism_set = set(data_found['organism'])
+ if 'tissue' in data_found:
+ tissue_set = set(data_found['tissue'])
+ if 'experiment_name' in data_found:
+ experiment_name_set = set(data_found['experiment_name'])
+ if 'user_name' in data_found:
+ user_name_set = set(data_found['user_name'])
+ if 'column_type' in data_found:
+ column_type_set = set(data_found['column_type'])
+ if 'CAS' in data_found:
+ cas_nr_set = set(data_found['CAS'])
+
+
+ result = [data_type_found,
+ _to_xsv(organism_set),
+ _to_xsv(tissue_set),
+ _to_xsv(experiment_name_set),
+ _to_xsv(user_name_set),
+ _to_xsv(column_type_set),
+ _to_xsv(cas_nr_set),
+ #To let Excel interpret as link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"):
+ "=HYPERLINK(\""+ query_link + "\", \"Link to entries found in DB \")"]
+ return result
+
+
+def _to_xsv(data_set):
+ result = ""
+ for item in data_set:
+ result = result + str(item) + "|"
+ return result
+
+
+def _fire_query_and_return_dict(url):
+ '''
+ This method will fire the query as a web-service call and
+ return the results as a list of dictionary objects
+ '''
+
+ try:
+ data = urllib2.urlopen(url).read()
+
+ # transform to dictionary:
+ result = []
+ data_rows = data.split("\n")
+
+ # check if there is any data in the response:
+ if len(data_rows) <= 1 or data_rows[1].strip() == '':
+ # means there is only the header row...so no hits:
+ return None
+
+ for data_row in data_rows:
+ if not data_row.strip() == '':
+ row_as_list = _str_to_list(data_row, delimiter='\t')
+ result.append(row_as_list)
+
+ # return result processed into a dict:
+ return _process_data(result)
+
+ except urllib2.HTTPError, e:
+ raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason)
+ except urllib2.URLError, e:
+ raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if MetExp service [" + url + "] is accessible from your Galaxy server. ")
+
+def _str_to_list(data_row, delimiter='\t'):
+ result = []
+ for column in data_row.split(delimiter):
+ result.append(column)
+ return result
+
+
+# alternative: ?
+# s = requests.Session()
+# s.verify = False
+# #s.auth = (token01, token02)
+# resp = s.get(url, params={'name': 'anonymous'}, stream=True)
+# content = resp.content
+# # transform to dictionary:
+
+
+
+
+def _merge_data(input_data_record, extra_cols):
+ '''
+ Adds the extra information to the existing data record and returns
+ the combined new record.
+ '''
+ record = []
+ for column in input_data_record:
+ record.append(input_data_record[column])
+
+
+ # add extra columns
+ for column in extra_cols:
+ record.append(column)
+
+ return record
+
+
+def _save_data(data_rows, headers, out_csv):
+ '''
+ Writes tab-separated data to file
+ @param data_rows: dictionary containing merged/enriched dataset
+ @param out_csv: output csv file
+ '''
+
+ # Open output file for writing
+ outfile_single_handle = open(out_csv, 'wb')
+ output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
+
+ # Write headers
+ output_single_handle.writerow(headers)
+
+ # Write one line for each row
+ for data_row in data_rows:
+ output_single_handle.writerow(data_row)
+
+def _get_metexp_URL(metexp_dblink_file):
+ '''
+ Read out and return the URL stored in the given file.
+ '''
+ file_input = fileinput.input(metexp_dblink_file)
+ try:
+ for line in file_input:
+ if line[0] != '#':
+ # just return the first line that is not a comment line:
+ return line
+ finally:
+ file_input.close()
+
+
+def main():
+ '''
+ MetExp Query main function
+
+ The input file can be any tabular file, as long as it contains a column for the molecular mass
+ and one for the formula of the respective identification. These two columns are then
+ used to query against MetExp Database.
+ '''
+ seconds_start = int(round(time.time()))
+
+ input_file = sys.argv[1]
+ casid_col = sys.argv[2]
+ formula_col = sys.argv[3]
+ molecular_mass_col = sys.argv[4]
+ metexp_dblink_file = sys.argv[5]
+ separation_method = sys.argv[6]
+ output_result = sys.argv[7]
+
+ # Parse metexp_dblink_file to find the URL to the MetExp service:
+ metexp_dblink = _get_metexp_URL(metexp_dblink_file)
+
+ # Parse tabular input file into dictionary/array:
+ input_data = _process_file(input_file)
+
+ # Query data against MetExp DB :
+ enriched_data = _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method)
+ headers = input_data.keys() + ['METEXP hits for ','METEXP hits: organisms', 'METEXP hits: tissues',
+ 'METEXP hits: experiments','METEXP hits: user names','METEXP hits: column types', 'METEXP hits: CAS nrs', 'Link to METEXP hits']
+
+ _save_data(enriched_data, headers, output_result)
+
+ seconds_end = int(round(time.time()))
+ print "Took " + str(seconds_end - seconds_start) + " seconds"
+
+
+
+if __name__ == '__main__':
+ main()
diff -r 000000000000 -r dffc38727496 query_metexp.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/query_metexp.xml Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,69 @@
+
+ Query a set of identifications against the METabolomics EXPlorer database
+
+ query_metexp.py
+ $input_file
+ "$casid_col"
+ "$formula_col"
+ "$molecular_mass_col"
+ "$metexp_dblink_file"
+ $separation_method
+ $output_result
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+This tool will query a set of identifications against the METabolomics EXPlorer database.
+
+It will take the input file and for each record it will query the
+molecular mass in the selected MetExp DB. If one or more compounds are found in the
+MetExp DB then extra information regarding these compounds is added to the output file.
+
+The output file is thus the input file enriched with information about
+related items found in the selected MetExp DB.
+
+**Notes**
+
+The input file can be any tabular file, as long as it contains a column for the molecular mass
+and one for the formula of the respective identification.
+
+
+
+
diff -r 000000000000 -r dffc38727496 rankfilterGCMS_tabular.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilterGCMS_tabular.xml Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,80 @@
+
+ Convert Retention Time to Retention Index
+ rankfilter_GCMS/rankfilter.py $input_file
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ sample = ${sample}
+ calibration = ${calibration}
+ lib_data = ${lib_data}
+ window = ${window}
+ analysis_type = ${analysis_type}
+ tabular = True
+ onefile = ${onefile}
+ model = ${model}
+
+
+
+Estimates the experimental RI (RIexp) by building an RI(RT) function based on the
+given calibration file.
+
+It also determines the estimated RI (RIsvr) by looking up, for each entry of the given input file (Sample File),
+its respective RIsvr value in the given global lookup library, based on its CAS number
+(this step is also called the "RankFilter analysis" - see reference below; the Sample File may be either from NIST or AMDIS).
+This yields a prediction of the RI for
+a compound according to the "RankFilter procedure" (RIsvr).
+
+Output is a tab separated file in which four columns are added:
+
+ - **Rank** Calculated rank
+ - **RIexp** Experimental Retention Index (RI)
+ - **RIsvr** Calculated RI based on support vector regression (SVR)
+ - **%rel.err** Relative RI error (%rel.err = 100 * (RIsvr − RIexp) / RIexp)
+
+.. class:: infomark
+
+**Notes**
+
+ - The layout of the Calibration file should include the following columns: 'MW', 'R.T.' and 'RI'.
+ - Selecting 'Polynomial' in the model parameter will calculate a 3rd degree polynomial model that will
+   be used to convert from retention time (RT) to retention index (RI).
+
+-----
+
+**References**
+
+ - **RankFilter**: Mihaleva et al. (2009) *Automated procedure for candidate compound selection in GC-MS
+ metabolomics based on prediction of Kovats retention index*. Bioinformatics, 25 (2009), pp. 787–794
+
+
diff -r 000000000000 -r dffc38727496 rankfilter_GCMS/__init__.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilter_GCMS/__init__.py Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,5 @@
+'''
+Created on Mar 14, 2012
+
+@author: marcelk
+'''
diff -r 000000000000 -r dffc38727496 rankfilter_GCMS/pdfread.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilter_GCMS/pdfread.py Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,214 @@
+"""
+Copyright (C) 2011 by Velitchka Mihaleva, Wageningen University
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import sys
+import csv
+
+def getPDF(filename, print_progress):
+ '''
+ Parses NIST PDF file
+ @param filename: PDF file to parse
+ '''
+ NistInput = {}
+ NistInput_missed = {}
+ nist_input = open(filename, 'r').read()
+
+ hitid = []
+ rt = []
+ name = []
+ forward = []
+ cas = []
+ reverse = []
+ prob = []
+ lib_id = []
+ nist_id = []
+ missed_compounds = []
+ id_missed_compounds = []
+ formula = []
+
+ hit_list = nist_input.split('** Search Report Page 1 of 1 **')
+ hit_list.pop(0)
+ #number_hits = range(10)
+ line_id = 0
+ for line in hit_list:
+ line = line.strip().translate(None, '\r')
+ if line != '':
+ hits = line.replace('\n', ' ').replace('\x0c', '').replace('^L', '').split('Hit') #solution? : if we wouldn't replace the \n by ' ' but by some special sign, then reading formula would be simpler!
+ #strange....code seems fine actually...debug! See test/data/download.pdf
+            # strange thing is that it looks like the new line does not end up in the text file, even though it looks like there is a new line in the pdf...perhaps a bug in the pdftotext command on linux?
+ spec_id = hits.pop(0).split(' ')[1]
+ j = 0
+ for hh in hits:
+ cell = hh.split(';')
+ if print_progress == True:
+ print 'Processing line: ', line_id, ' with length: ', len(cell), ':\n\t', cell
+ line_id += 1
+ if len(cell) == 7: # the compound has CAS number
+ if len(cell[1].split(':')) == 2:
+ forward.append((cell[1].split(':')[1]).strip())
+ # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end
+ if len(cell[0].split(':')) > 2:
+ name_tmp = ':'.join(cell[0].split(':')[1:])
+ else:
+ name_tmp = cell[0].split(':')[1]
+
+ name.append(name_tmp.replace(" ", " ").strip())
+ name_tmp = name_tmp.strip().split(' ')
+ if name_tmp:
+ # if the name ends with a word that starts with C, F or H, then assume this last word is a formula:
+ if name_tmp[-1][0] == 'C' or name_tmp[-1][0] == 'F' or name_tmp[-1][0] == 'H':
+ formule = (name_tmp[-1])
+ else:
+ formule = ('not_def')
+ else:
+ formule = ('not_def')
+ formula.append(formule.replace(" ", " "))
+ reverse.append((cell[2].split(':')[1]).strip())
+ prob.append(cell[3].split(' ')[2].replace('%', ''))
+ cas.append((cell[4].split(':')[1]).strip())
+ lib_id.append((cell[5].split(':')[1]).strip())
+ nist_id.append(cell[6].split(':')[1].replace('.', '').strip())
+ j = j + 1
+ else:
+ missed_compounds.append(hh)
+ id_missed_compounds.append(spec_id)
+
+ elif len(cell) == 6: # the compound has no CAS number
+ if len(cell[1].split(':')) == 2:
+
+ forward.append((cell[1].split(':')[1]).strip())
+ # indication that the name contains the ":". Should join the cells of name_tmp from 1 till end
+ if len(cell[0].split(':')) > 2:
+ name_tmp = ':'.join(cell[0].split(':')[1:])
+ else:
+ name_tmp = cell[0].split(':')[1]
+
+ name.append(name_tmp.replace(" ", " ").strip())
+ name_tmp = name_tmp.strip().split(' ')
+ if name_tmp:
+ # if the name ends with a word that starts with C, F or H, then assume this last word is a formula:
+ if name_tmp[-1][0] == 'C' or name_tmp[-1][0] == 'F' or name_tmp[-1][0] == 'H':
+ formule = (name_tmp[-1])
+ else:
+ formule = ('not_def')
+ else:
+ formule = ('not_def')
+ formula.append(formule.replace(" ", " "))
+ reverse.append((cell[2].split(':')[1]).strip())
+ prob.append(cell[3].split(' ')[2].replace('%', ''))
+ cas.append('undef')
+ lib_id.append((cell[4].split(':')[1]).strip())
+ nist_id.append(cell[5].split(':')[1].replace('.', '').strip())
+ j = j + 1
+
+ else:
+ missed_compounds.append(hh)
+ id_missed_compounds.append(spec_id)
+
+ else: # Missing columns, report and quit
+ missed_compounds.append(hh)
+ id_missed_compounds.append(spec_id)
+
+ for _ in range(j):
+ hitid.append(str(spec_id.replace(" ", " ")))
+ #NB: this is the RT as found in the "id" generated by e.g. msclust, so NOT the RT of the library hit:
+ rt.append(str(float(spec_id.split('-')[3]) / 1e+06))
+
+ NistInput['ID'] = hitid
+ NistInput['R.T.'] = rt
+ NistInput['Name'] = name
+ NistInput['CAS'] = cas
+ NistInput['Formula'] = formula
+ NistInput['Forward'] = forward
+ NistInput['Reverse'] = reverse
+ NistInput['Probability'] = prob
+ NistInput['Library'] = lib_id
+ NistInput['Library ID'] = nist_id
+ NistInput_missed['Missed Compounds'] = missed_compounds
+ NistInput_missed['ID missed Compounds'] = id_missed_compounds
+
+ return NistInput, NistInput_missed
+
+
+def convert_pdftotext2tabular(filename, output_file, error_file, print_progress):
+ '''
+ Converts NIST PDF file to tabular format
+ @param filename: PDF file to parse
+ @param output_file: output file for the hits
+ @param error_file: output file for failed hits
+ '''
+ [HitList, HitList_missed] = getPDF(filename, print_progress)
+    # save HitList as a tab-separated file
+ Hitlist_as_text = "\t".join(HitList.keys()) + "\n"
+ Hitlist_array_of_array = ([HitList[row] for row in HitList.keys()])
+ Hitlist_as_text += str("\n".join(["\t".join(e) for e in zip(*Hitlist_array_of_array)]))
+ output_fh = open(output_file, 'wb')
+ output_fh.write(Hitlist_as_text)
+ output_fh.close()
+
+ out_missed_pdf = open(error_file, 'wb')
+ for x, y in zip(HitList_missed['Missed Compounds'], HitList_missed['ID missed Compounds']):
+ out_missed_pdf.write("Line with incorrect format or unexpected number of fields:\n")
+ out_missed_pdf.write('%s\n' % '\t'.join([y, x]))
+ out_missed_pdf.close()
+
+
+def read_tabular(in_csv):
+ '''
+ Parses a tab-separated file returning a dictionary with named columns
+ @param in_csv: input filename to be parsed
+ '''
+ data = list(csv.reader(open(in_csv, 'rU'), delimiter='\t'))
+ header = data.pop(0)
+ # Create dictionary with column name as key
+ output = {}
+ for index in xrange(len(header)):
+ output[header[index]] = [row[index] for row in data]
+ return output
+
+
+def read_tabular_old(filename):
+ '''
+    Reads the tabular format (created by convert_pdftotext2tabular) and outputs a dict
+    with the column headers as keys and the column contents as lists
+ @param filename: tabular file to read
+ '''
+ input_fh = None
+ try:
+ input_fh = open(filename, 'r')
+ except IOError, error:
+ raise error
+ colnames = input_fh.readline().strip().split('\t')
+ cells = []
+ for line in input_fh.readlines():
+ cells.append(line.strip().split('\t'))
+ #transform from row oriented structure to column oriented structure
+ cells = zip(*cells)
+ #store the list of list in form of final output
+ RankFilterGC_format = {}
+ for colnumber in range(len(colnames)):
+ RankFilterGC_format[colnames[colnumber]] = cells[colnumber]
+ return RankFilterGC_format
+
+
+if __name__ == '__main__':
+ convert_pdftotext2tabular(sys.argv[1], sys.argv[2], sys.argv[3], True)
diff -r 000000000000 -r dffc38727496 rankfilter_GCMS/pdftotabular.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilter_GCMS/pdftotabular.py Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,44 @@
+"""
+Copyright (C) 2013, Pieter Lukasse, Plant Research International, Wageningen
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this software except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+"""
+
+import sys
+import pdfread
+from subprocess import call
+
+
+def convert_pdftotext(filename, output_file):
+ '''
+ Converts PDF file to text
+ @param filename: PDF file to parse
+ @param output_file: output text file for the hits
+ '''
+
+ # "-layout" option in pdftotext call below: Maintain (as best as possible) the original physical layout of the text. The
+ # default is to 'undo' physical layout (columns, hyphenation, etc.) and output
+ # the text in reading order.
+ try:
+ call(["pdftotext", "-layout", filename, output_file])
+ except:
+ raise Exception("Error while trying to convert PDF to text")
+
+
+
+
+if __name__ == '__main__':
+ pdf_as_text = sys.argv[1]+".txt"
+ convert_pdftotext(sys.argv[1], pdf_as_text)
+ pdfread.convert_pdftotext2tabular(pdf_as_text, sys.argv[2], sys.argv[3], False)
diff -r 000000000000 -r dffc38727496 rankfilter_GCMS/rankfilter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilter_GCMS/rankfilter.py Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,432 @@
+"""
+Copyright (C) 2011 by Velitchka Mihaleva, Wageningen University
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+#Library functions definition
+#----------Begin-------------
+from numpy import array, linalg, ones
+from numpy.polynomial import polynomial
+import math
+import pdfread
+import sys
+
+
+def calibrate(standards):
+ '''
+ Calculates the RT to RI conversion: RI = a + b*RT
+ @param standards: variable containing RI and RT data
+ '''
+ A = ones((len(standards['R.T.']), 2), dtype=float)
+ A[:, 0] = array(map(float, standards['R.T.']))
+ [coeff, res, r, s] = linalg.lstsq(A, array(map(float, standards['RI'])))
+
+ return coeff
+
+
+def calibrate_poly(standards):
+ '''
+ Calculates the RT to RI conversion using a polynomial model
+ @param standards: variable containing RI and RT data
+ '''
+ retention_time = array(map(float, standards['R.T.']))
+ retention_index = array(map(float, standards['RI']))
+
+ # Fit a 3rd degree polynomial
+ fit = polynomial.polyfit(retention_time, retention_index, 3)[::-1]
+ return [fit[0], fit[1], fit[2], fit[3]]
+
+
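+# Note added for clarity: numpy's polynomial.polyfit returns coefficients from the
+# lowest to the highest degree; the [::-1] above reverses them, so calibrate_poly
+# returns [c3, c2, c1, c0] and calculate_poly below evaluates
+# RI = c3*RT**3 + c2*RT**2 + c1*RT + c0 accordingly.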
+def calculate_poly(retention_time, poly_cal):
+ '''
+ Converts a given retention time to retention index using the calculated polynomial model
+ @param retention_time: retention_time to convert to retention index
+ @param poly_cal: result from calculating regression
+ '''
+ # Calculates RI based on given retention_time using polynomial function
+ retention_time = array(map(float, retention_time))
+ if len(retention_time) > 1:
+ ri_exp = []
+ for i in retention_time:
+ ri_exp.append(poly_cal[0] * (i ** 3) + poly_cal[1] * (i ** 2) + (i * poly_cal[2]) + poly_cal[3])
+ return ri_exp
+ else:
+ return poly_cal[0] * (retention_time ** 3) + poly_cal[1] * (retention_time ** 2) + (retention_time * poly_cal[2]) + poly_cal[3]
+
+
+def convert_rt(hit_list, coeff):
+ '''
+ Converts a given retention time to retention index using the linear model
+ @param hit_list: list holding the retention time
+ @param coeff: calculated coefficient (slope and intercept) using the linear model
+ '''
+ #Convert RT to RI
+ hit_list['RIexp'] = array(map(float, hit_list['R.T.'])) * coeff[0] + coeff[1]
+ return hit_list
+
+
+def convert_rt_poly(hit_list, poly_cal):
+ '''
+ Calls the actual RT to RI converter and returns the updated list with added RIexp value
+ @param hit_list: result list containing the retention time
+ '''
+ hit_list['RIexp'] = array(map(float, calculate_poly(hit_list['R.T.'], poly_cal)))
+ return hit_list
+
+
+def get_data(libdata, LabelCol):
+ '''
+    Retrieves the data columns indicated by LabelCol from libdata (generic function)
+ Returns a dict with the requested column names as keys
+ @param libdata: file from which data is loaded
+ @param LabelCol: columns to retrieve
+ '''
+ from numpy import take
+ LibData = open(libdata, 'r').read().split('\n')
+
+ #Get the labels of the columns in the file
+ FirstLine = LibData.pop(0).split('\t')
+
+ FirstLine[-1] = FirstLine[-1].replace('\r', '')
+
+    # Create a temporary variable containing all the data
+ tmp_data = []
+ for ll in LibData:
+ if ll != '':
+ tmp_data.append(ll.split('\t'))
+
+ # Find the indices of the desired data
+ ind = []
+ try:
+ for key in LabelCol:
+ ind.append(FirstLine.index(key))
+ except:
+        print str(key) + " not found in first line of library (" + str(libdata) + ")"
+        print "the following items are found in the first line of the library:\n" + str(FirstLine)
+ sys.exit(1)
+ # Extract the desired data
+ data = []
+ for x in tmp_data:
+ data.append(take(array(x), ind))
+
+ # library_data = dict(zip(LabelCol,transpose(data)))
+ library_data = dict(zip(LabelCol, map(lambda *x: list(x), *data)))
+ return library_data
+
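+# Illustrative sketch (hypothetical file name and values): for a tab-separated file whose
+# first line is "Name<TAB>R.T.<TAB>RI", get_data('standards.txt', ['R.T.', 'RI']) returns
+# something like {'R.T.': ['5.0', '10.0'], 'RI': ['600', '900']}, with all values kept as strings.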
+
+def rank_hit(hit_list, library_data, window):
+ '''
+ Computes the Rank and % relative error
+ @param hit_list: input data
+ @param library_data: library used for reading the RIsvr data
+ @param window: error window
+ '''
+ hit_match_ripred = []
+ hit_match_syn = []
+ # Convert 'Name' data to list in order to be indexed
+ # library_data['Name']=list(library_data['Name'])
+
+ for hit_cas, hit_name in zip(hit_list['CAS'], hit_list['Name']):
+ index = 0
+ if hit_cas != 'undef':
+ try:
+ index = library_data['CAS'].index(hit_cas.replace(' ', '').replace('-', ''))
+ except:
+ try:
+ index = library_data['Name'].index(hit_name.replace(' ', ''))
+ except:
+ # If for any reason the hit is not present
+ # in the mainlib library indicate this with -999
+ index = 0
+ else:
+ try:
+ index = library_data['Name'].index(hit_name.replace(' ', ''))
+ except:
+ # If for any reason the hit is not present
+ # in the mainlib library indicate this with -999
+ index = 0
+ if index != 0:
+ hit_match_ripred.append(float(library_data['RIsvr'][index]))
+ hit_match_syn.append(library_data['Synonyms'][index])
+ else:
+ hit_match_ripred.append(-999)
+ hit_match_syn.append('None')
+ hit_list['RIsvr'] = hit_match_ripred
+ hit_list['Synonyms'] = hit_match_syn
+
+ # Determine the relative difference between the experimental
+ # and the predicted RI
+ ri_err = []
+
+ for ri_exp, ri_qsar in zip(hit_list['RIexp'], hit_list['RIsvr']):
+ if int(ri_qsar) != -999:
+ ri_err.append(float(int(float(ri_qsar)) - int(float(ri_exp))) / int(float(ri_exp)) * 100)
+ else:
+ ri_err.append(-999)
+
+ # Define the rank of the hits
+ hit_rank = []
+
+ for tt in ri_err:
+ if tt == -999:
+ # The name of the hit generated with AMDIS did not match a name
+ # in the mainlib library
+ hit_rank.append(4)
+ else:
+ # Rank 1 - ri_err is within +/- window/2
+ if abs(tt) <= float(window) / 2:
+ hit_rank.append(1)
+ # Rank 2 - window/2 < ri_err <= window
+ if abs(tt) > float(window) / 2 and abs(tt) <= float(window):
+ hit_rank.append(2)
+ # Rank 3 - ri_err > window
+ if abs(tt) > float(window):
+ hit_rank.append(3)
+ hit_list['Rank'] = hit_rank
+ hit_list['%rel.err'] = ri_err
+ return hit_list
+
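+# Illustrative sketch (hypothetical numbers): with window = 10, a hit with a relative error
+# of 4% gets Rank 1 (|err| <= 5), 7% gets Rank 2 (5 < |err| <= 10), 12% gets Rank 3 (> 10),
+# and a hit without a library match (RIsvr == -999) gets Rank 4.
+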
+def print_to_file(hit_list, filename, keys_to_print, print_subsets=True):
+ '''
+    Writes the output data to files (up to four output files are generated):
+ filename_ranked - the hits are ranked
+ filename_filter_in - only hits with rank 1 and 2 are retained
+ filename_filter_out - hits with rank 3 are filtered out
+ filename_filter_missed - hits with rank 4 - there was no match with the
+ library data
+ @param hit_list: a dictionary with the ranked hits
+ @param filename: the core of the output file names
+ @param keys_to_print: determines the order in which the data are printed
+    @param print_subsets: if True, also write the filter_in, filter_out and missed subset files
+ '''
+ from numpy import take
+
+ out_ranked = open(filename["ranked"], 'w')
+ if (print_subsets):
+ out_in = open(filename["filter_in"], 'w')
+ out_out = open(filename["filter_out"], 'w')
+ out_missed = open(filename["missed"], 'w')
+
+ #Convert RIexp and RIsvr into integer for printing
+ hit_list['RIexp'] = map(int, map(math.ceil, hit_list['RIexp']))
+ hit_list['RIsvr'] = map(int, map(math.ceil, hit_list['RIsvr']))
+
+ #Establish the right order of the data to be printed
+ tmp_items = hit_list.items()
+ tmp_keys = hit_list.keys()
+ ind = []
+
+ for key in keys_to_print:
+ ind.append(tmp_keys.index(key))
+
+ #Print the labels of the columns
+ line = '\t'.join(take(array(tmp_keys), ind))
+ out_ranked.write('%s\n' % line)
+ if (print_subsets):
+ out_in.write('%s\n' % line)
+ out_out.write('%s\n' % line)
+ out_missed.write('%s\n' % line)
+
+ #Collect the data for each hit in the right order and print them
+ #in the output file
+ i = 0
+ for name in hit_list['Name']:
+ tt = []
+ for x in iter(tmp_items):
+ # trim the value if it is a string:
+ if isinstance(x[1][i], basestring):
+ tt.append(x[1][i].strip())
+ else:
+ tt.append(x[1][i])
+ tmp1 = take(array(tt), ind)
+ line = '\t'.join(tmp1.tolist())
+
+ out_ranked.write('%s\n' % line)
+ if(print_subsets):
+ if hit_list['Rank'][i] == 4:
+ out_missed.write('%s\n' % line)
+ if hit_list['Rank'][i] == 3:
+ out_out.write('%s\n' % line)
+ if hit_list['Rank'][i] == 1 or hit_list['Rank'][i] == 2:
+ out_in.write('%s\n' % line)
+
+ i = i + 1
+
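+# Illustrative sketch (hypothetical names): with print_subsets=True and
+# filename = {'ranked': 'run_ranked', 'filter_in': 'run_filter_in',
+#             'filter_out': 'run_filter_out', 'missed': 'run_missed'},
+# four tab-separated files are written, all sharing the column order given by keys_to_print.
+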
+#---------End--------------
+def main():
+ #Ranking and filtering procedure
+ if len(sys.argv) < 2:
+ print "Usage:"
+ print "python RankFilter_GC-MS.py input \n"
+ print "input is a text file that specifies the names and the location"
+ print "of the files with the sample, compounds for calibration, library"
+ print "data, the core of the name ot the outputs, and the value of the"
+ print "window used for the filtering \n"
+
+ sys.exit(1)
+
+ #------Read the input file------
+ try:
+ ifile = open(sys.argv[1], 'r').read().split('\n')
+ except:
+ print sys.argv[1], " file can not be found"
+ sys.exit()
+
+ #Get the file names for the data
+ #labels - the type of input files
+ #filenames - the names of the input files
+ labels = []
+ filenames = []
+
+ for l in ifile:
+ l = l.strip()
+ if l != '':
+ labels.append(l.split('=')[0].replace(' ', '').replace('\r', ''))
+ filenames.append(l.split('=')[1].replace(' ', '').replace('\r', ''))
+
+ InputData = dict(zip(labels, filenames))
+
+    #This part checks whether the 'output' option is set. If it is, its value is used as the
+    #prefix for the output file names; if it is not, each output file has to be specified beforehand.
+    #This comes in handy for pipeline tools such as Galaxy.
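+    # A hypothetical example of such an input file (all names and values are illustrative only):
+    #   analysis_type = NIST
+    #   sample = nist_report.txt
+    #   calibration = alkane_standards.txt
+    #   model = linear
+    #   lib_data = ri_library.txt
+    #   window = 10
+    #   output = my_run
+    # Instead of 'output', the individual output files ('ranked', 'filter_in', 'filter_out',
+    # 'missed' and 'missed_parse_pdf') can be listed, or 'tabular' together with 'onefile'.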
+ print_subsets = True
+ NDIS_is_tabular = False
+
+ if 'output' in InputData:
+ output_files = {"ranked":InputData['output'] + "_ranked", \
+ "filter_in":InputData['output'] + "_filter_in", \
+                        "filter_out":InputData['output'] + "_filter_out", \
+ "missed":InputData['output'] + "_missed", \
+ "missed_parse_pdf":InputData['output'] + "_missed_parse_pdf"}
+ elif 'tabular' in InputData:
+ NDIS_is_tabular = True
+ if(not "onefile" in InputData):
+ output_files = {"ranked":InputData['ranked'], \
+ "filter_in":InputData['filter_in'], \
+ "filter_out":InputData['filter_out'], \
+ "missed":InputData['missed']}
+ else:
+ print_subsets = False
+ output_files = {"ranked":InputData['onefile']}
+ else:
+ output_files = {"ranked":InputData['ranked'], \
+ "filter_in":InputData['filter_in'], \
+ "filter_out":InputData['filter_out'], \
+ "missed":InputData['missed'], \
+ "missed_parse_pdf":InputData['missed_parse_pdf']}
+
+ #------Start with calibration------
+ #Check whether a file with data for the calibration is specified
+    #Specify which data to read from the file with standard compounds
+ LabelColStand = ['Name', 'R.T.', 'RI']
+
+ if InputData['calibration'] != 'none':
+        #get the coefficients for the RT to RI conversion
+
+ try:
+ ifile = open(InputData['calibration'], 'r')
+ except:
+ print "file", InputData['calibration'], "can not be found"
+ sys.exit(1)
+
+ standards = get_data(InputData['calibration'], LabelColStand)
+ if InputData['model'] == 'linear':
+ coeff = calibrate(standards)
+ elif InputData['model'] == 'poly':
+ poly_cal = calibrate_poly(standards)
+ else:
+ print "error: model ", InputData['model'], " can not be found. Use 'linear' or 'poly' "
+ sys.exit(1)
+ else:
+ #No file has been specified for the calibration
+ #Use the default coefficients
+ print 'No file has been specified for the calibration'
+ print 'WARNING: the default coefficients will be used'
+ coeff = array([29.4327, 454.5260])
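+        # With these defaults, a retention time of e.g. 10 minutes gives
+        # RI = 29.4327 * 10 + 454.526 = 748.853 (applied via convert_rt below); illustration only.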
+
+ if InputData['analysis_type'] == 'AMDIS':
+ try:
+ AmdisOut = open(InputData['sample'], 'r')
+ print("open ok")
+            #Specify which data to extract from the AMDIS output table
+ #Weighted and Reverse are measure of matching between the experimental
+ #and the library spectra. The experimental spectrum is used as template
+ #for the calculation of Weighted, whereas for Reverse the template is the
+ #library spectrum
+ LabelCol = ['CAS', 'Name', 'R.T.', 'Weighted', 'Reverse', 'Purity']
+
+ #Get the data from the AMDIS output file
+ HitList = get_data(InputData['sample'], LabelCol)
+ #Remove '>' from the names
+ HitList['Name'] = [s.replace('>', '') for s in HitList['Name']]
+ except:
+ print "the file", InputData['sample'], "can not be found"
+ sys.exit(1)
+ if InputData['analysis_type'] == 'NIST':
+ #HitList_missed - a variable of type dictionary containing the hits with the symbol ";"
+ #in the name
+ if not NDIS_is_tabular:
+            print "Warning: NIST output is not in tabular format, reading PDF...\n"
+ [HitList, HitList_missed] = pdfread.getPDF(InputData['sample'])
+ else:
+ HitList = pdfread.read_tabular(InputData['sample'])
+
+ #Convert RT to RI
+ if InputData['model'] == 'linear':
+ HitList = convert_rt(HitList, coeff)
+ if InputData['model'] == 'poly':
+ print "Executing convert_rt_poly().."
+ HitList = convert_rt_poly(HitList, poly_cal)
+
+ #------Read the library data with the predicted RI------
+ try:
+ LibData = open(InputData['lib_data'], 'r')
+ except:
+ print "the file", InputData['lib_data'], "can not be found"
+ sys.exit(1)
+
+    #Specify which data to extract from the library data file
+ LabelColLib = ['CAS', 'Name', 'RIsvr', 'Synonyms']
+ LibraryData = get_data(InputData['lib_data'], LabelColLib)
+
+ #------Match the hits with the library data and rank them------
+ if InputData['window'] != '':
+ HitList = rank_hit(HitList, LibraryData, InputData['window'])
+ else:
+ print "No value for the window used for the filtering is specified \n"
+ sys.exit(1)
+
+ #------Print the ranked and filtered hits------
+    #Specify which data to print
+ if InputData['analysis_type'] == 'AMDIS':
+ keys_to_print = ['R.T.', 'CAS', 'Name', 'Rank', 'RIexp', 'RIsvr', '%rel.err', 'Weighted', 'Reverse', 'Synonyms']
+ else:
+ keys_to_print = ['ID', 'R.T.', 'Name', 'CAS', 'Rank', 'RIexp', 'RIsvr', '%rel.err', 'Forward', 'Reverse', 'Synonyms', 'Library']
+
+    #Write the compounds that could not be parsed from the pdftotext output; skipped when the input is tabular
+ if InputData['analysis_type'] == 'NIST' and not NDIS_is_tabular:
+ out_missed_pdf = open(output_files['missed_parse_pdf'], 'w')
+ for x, y in zip(HitList_missed['Missed Compounds'], HitList_missed['RT missed Compounds']):
+ out_missed_pdf.write('%s\n' % '\t'.join([y, x]))
+ out_missed_pdf.close()
+
+ print_to_file(HitList, output_files, keys_to_print, print_subsets)
+
+if __name__ == '__main__':
+ main()
diff -r 000000000000 -r dffc38727496 rankfilter_text2tabular.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/rankfilter_text2tabular.xml Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,14 @@
+
+ Convert NIST text to tabular format
+ rankfilter_GCMS/pdftotabular.py $input $output $output_err
+
+
+
+
+
+
+
+
+ This tool converts NIST identification report output (PDF) to a tabular format needed for further use with the RIQC tools.
+
+
diff -r 000000000000 -r dffc38727496 select_on_rank.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/select_on_rank.py Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,21 @@
+import csv
+import sys
+
+__author__ = "Marcel Kempenaar"
+__contact__ = "brs@nbic.nl"
+__copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
+__license__ = "MIT"
+
+in_file = sys.argv[1]
+out_file = sys.argv[2]
+to_select_list = [str(select.strip()) for select in sys.argv[3].split(',') if (len(select) > 0)]
+
+data = list(csv.reader(open(in_file, 'rb'), delimiter='\t'))
+header = data.pop(0)
+header_clean = [i.lower().strip().replace(".", "").replace("%", "") for i in header]
+rank = header_clean.index("rank")
+
+writer = csv.writer(open(out_file, 'wb'), delimiter='\t')
+writer.writerow(header)
+for select in to_select_list:
+ writer.writerows([i for i in data if i[rank] == select])
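+
+# Illustrative usage (matching the command line in select_on_rank.xml; file names are hypothetical):
+#   python select_on_rank.py RankFilter_output.tsv selected.tsv "1,2"
+# keeps only the rows whose Rank column equals 1 or 2.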
diff -r 000000000000 -r dffc38727496 select_on_rank.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/select_on_rank.xml Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,15 @@
+
+ Filter on the Rank field in the RankFilter output file
+ select_on_rank.py $input $output "$rank"
+
+
+
+
+
+
+
+
+This tool removes all entries with non-selected rank values from the input file (the supported input file is a RankFilter output file).
+
+
diff -r 000000000000 -r dffc38727496 static/images/CAMERA_results.png
Binary file static/images/CAMERA_results.png has changed
diff -r 000000000000 -r dffc38727496 static/images/confidence_and_slope_params_explain.png
Binary file static/images/confidence_and_slope_params_explain.png has changed
diff -r 000000000000 -r dffc38727496 static/images/diffreport.png
Binary file static/images/diffreport.png has changed
diff -r 000000000000 -r dffc38727496 static/images/massEIC.png
Binary file static/images/massEIC.png has changed
diff -r 000000000000 -r dffc38727496 static/images/metaMS_annotate.png
Binary file static/images/metaMS_annotate.png has changed
diff -r 000000000000 -r dffc38727496 static/images/metaMS_pick_align_camera.png
Binary file static/images/metaMS_pick_align_camera.png has changed
diff -r 000000000000 -r dffc38727496 static/images/msclust_summary.png
Binary file static/images/msclust_summary.png has changed
diff -r 000000000000 -r dffc38727496 static/images/sample_SIM.png
Binary file static/images/sample_SIM.png has changed
diff -r 000000000000 -r dffc38727496 static/images/sample_sel_and_peak_height_correction.png
Binary file static/images/sample_sel_and_peak_height_correction.png has changed
diff -r 000000000000 -r dffc38727496 static_resources/elements_and_masses.tab
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/static_resources/elements_and_masses.tab Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,104 @@
+Name Atomic number Chemical symbol Relative atomic mass
+Hydrogen 1 H 1.01
+Helium 2 He 4
+Lithium 3 Li 6.94
+Beryllium 4 Be 9.01
+Boron 5 B 10.81
+Carbon 6 C 12.01
+Nitrogen 7 N 14.01
+Oxygen 8 O 16
+Fluorine 9 F 19
+Neon 10 Ne 20.18
+Sodium 11 Na 22.99
+Magnesium 12 Mg 24.31
+Aluminum 13 Al 26.98
+Silicon 14 Si 28.09
+Phosphorus 15 P 30.98
+Sulfur 16 S 32.06
+Chlorine 17 Cl 35.45
+Argon 18 Ar 39.95
+Potassium 19 K 39.1
+Calcium 20 Ca 40.08
+Scandium 21 Sc 44.96
+Titanium 22 Ti 47.9
+Vanadium 23 V 50.94
+Chromium 24 Cr 52
+Manganese 25 Mn 54.94
+Iron 26 Fe 55.85
+Cobalt 27 Co 58.93
+Nickel 28 Ni 58.71
+Copper 29 Cu 63.54
+Zinc 30 Zn 65.37
+Gallium 31 Ga 69.72
+Germanium 32 Ge 72.59
+Arsenic 33 As 74.99
+Selenium 34 Se 78.96
+Bromine 35 Br 79.91
+Krypton 36 Kr 83.8
+Rubidium 37 Rb 85.47
+Strontium 38 Sr 87.62
+Yttrium 39 Y 88.91
+Zirconium 40 Zr 91.22
+Niobium 41 Nb 92.91
+Molybdenum 42 Mo 95.94
+Technetium 43 Tc 96.91
+Ruthenium 44 Ru 101.07
+Rhodium 45 Rh 102.9
+Palladium 46 Pd 106.4
+Silver 47 Ag 107.87
+Cadmium 48 Cd 112.4
+Indium 49 In 114.82
+Tin 50 Sn 118.69
+Antimony 51 Sb 121.75
+Tellurium 52 Te 127.6
+Iodine 53 I 126.9
+Xenon 54 Xe 131.3
+Cesium 55 Cs 132.9
+Barium 56 Ba 137.34
+Lanthanum 57 La 138.91
+Cerium 58 Ce 140.12
+Praseodymium 59 Pr 140.91
+Neodymium 60 Nd 144.24
+Promethium 61 Pm 144.91
+Samarium 62 Sm 150.35
+Europium 63 Eu 151.96
+Gadolinium 64 Gd 157.25
+Terbium 65 Tb 158.92
+Dysprosium 66 Dy 162.5
+Holmium 67 Ho 164.93
+Erbium 68 Er 167.26
+Thulium 69 Tm 168.93
+Ytterbium 70 Yb 173.04
+Lutetium 71 Lu 174.97
+Hafnium 72 Hf 178.49
+Tantalum 73 Ta 180.95
+Wolfram 74 W 183.85
+Rhenium 75 Re 186.2
+Osmium 76 Os 190.2
+Iridium 77 Ir 192.22
+Platinum 78 Pt 195.09
+Gold 79 Au 196.97
+Mercury 80 Hg 200.59
+Thallium 81 Tl 204.37
+Lead 82 Pb 207.19
+Bismuth 83 Bi 208.98
+Polonium 84 Po 208.98
+Astatine 85 At 209.99
+Radon 86 Rn 222.02
+Francium 87 Fr 223.02
+Radium 88 Ra 226
+Actinium 89 Ac 227.03
+Thorium 90 Th 232.04
+Protactinium 91 Pa 231.04
+Uranium 92 U 238.03
+Neptunium 93 Np 237
+Plutonium 94 Pu 242
+Americium 95 Am 243.06
+Curium 96 Cm 247.07
+Berkelium 97 Bk 247.07
+Californium 98 Cf 251.08
+Einsteinium 99 Es 254.09
+Fermium 100 Fm 257.1
+Mendelevium 101 Md 257.1
+Nobelium 102 No 255.09
+Lawrencium 103 Lr 256.1
diff -r 000000000000 -r dffc38727496 test/__init__.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/__init__.py Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,1 @@
+''' unit tests '''
diff -r 000000000000 -r dffc38727496 test/integration_tests.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/integration_tests.py Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,268 @@
+'''Integration tests for the GCMS project'''
+
+from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611
+from GCMS import library_lookup, combine_output
+from GCMS.rankfilter_GCMS import rankfilter
+import os.path
+import sys
+import unittest
+import re
+
+
+class IntegrationTest(unittest.TestCase):
+ def test_library_lookup(self):
+ '''
+ Run main for data/NIST_tabular and compare produced files with references determined earlier.
+ '''
+ # Create out folder
+ outdir = "output/" #tempfile.mkdtemp(prefix='test_library_lookup')
+ if not os.path.exists(outdir):
+ os.makedirs(outdir)
+ outfile_base = os.path.join(outdir, 'produced_library_lookup')
+ outfile_txt = outfile_base + '.txt'
+
+ #Build up arguments and run
+ input_txt = resource_filename(__name__, "data/NIST_tabular.txt")
+ library = resource_filename(__name__, "data/RIDB_subset.txt")
+ regress_model = resource_filename(__name__, "data/ridb_poly_regression.txt")
+ sys.argv = ['test',
+ library,
+ input_txt,
+ 'Capillary',
+ 'Semi-standard non-polar',
+ outfile_txt,
+ 'HP-5',
+ regress_model]
+ # Execute main function with arguments provided through sys.argv
+ library_lookup.main()
+ #Compare with reference files
+ reference_txt = resource_filename(__name__, 'reference/produced_library_lookup.txt')
+
+ #read both the reference file and actual output files
+ expected = _read_file(reference_txt)
+ actual = _read_file(outfile_txt)
+
+ #convert the read in files to lists we can compare
+ expected = expected.split()
+ actual = actual.split()
+
+ for exp, act in zip(expected, actual):
+ if re.match('\\d+\\.\\d+', exp):
+ exp = float(exp)
+ act = float(act)
+ self.assertAlmostEqual(exp, act, places=5)
+            else:
+                # compare the values directly
+                self.failUnlessEqual(exp, act)
+
+
+ def test_combine_output_simple(self):
+ '''
+ Run main for data/NIST_tabular and compare produced files with references determined earlier.
+ '''
+ # Create out folder
+ outdir = "output/" #tempfile.mkdtemp(prefix='test_library_lookup')
+ if not os.path.exists(outdir):
+ os.makedirs(outdir)
+ outfile_base = os.path.join(outdir, 'produced_combine_output')
+ outfile_single_txt = outfile_base + '_single.txt'
+ outfile_multi_txt = outfile_base + '_multi.txt'
+
+ #Build up arguments and run
+ input_rankfilter = resource_filename(__name__, "data/Rankfilter.txt")
+ input_caslookup = resource_filename(__name__, "data/Caslookup.txt")
+ sys.argv = ['test',
+ input_rankfilter,
+ input_caslookup,
+ outfile_single_txt,
+ outfile_multi_txt]
+ # Execute main function with arguments provided through sys.argv
+ combine_output.main()
+ #Compare with reference files
+ # reference_single_txt = resource_filename(__name__, 'reference/produced_combine_output_single.txt')
+ # reference_multi_txt = resource_filename(__name__, 'reference/produced_combine_output_multi.txt')
+ # self.failUnlessEqual(_read_file(reference_single_txt), _read_file(outfile_single_txt))
+ # self.failUnlessEqual(_read_file(reference_multi_txt), _read_file(outfile_multi_txt))
+
+ #Clean up
+ #shutil.rmtree(tempdir)
+
+
+
+ def def_test_rank_filter_advanced(self):
+ '''
+ Run main of RankFilter
+ '''
+ # Create out folder
+ outdir = "output/integration/"
+ if not os.path.exists(outdir):
+ os.makedirs(outdir)
+
+ #Build up arguments and run
+ input_txt = resource_filename(__name__, "data/integration/RankFilterInput_conf.txt")
+ sys.argv = ['test',
+ input_txt]
+ # Execute main function with arguments provided through sys.argv
+ rankfilter.main()
+ #Compare with reference files
+
+ def def_test_library_lookup_advanced(self):
+ '''
+ Run main for data/NIST_tabular and compare produced files with references determined earlier.
+ '''
+ # Create out folder
+ outdir = "output/integration/"
+ if not os.path.exists(outdir):
+ os.makedirs(outdir)
+ outfile_base = os.path.join(outdir, 'produced_library_lookup_ADVANCED')
+ outfile_txt = outfile_base + '.txt'
+
+ #Build up arguments and run
+ input_txt = resource_filename(__name__, "data/integration/NIST_identification_results_tabular.txt")
+ library = resource_filename(__name__, "../repositories/PRIMS-metabolomics/RI_DB_libraries/Library_RI_DB_capillary_columns-noDuplicates.txt")
+ regress_model = resource_filename(__name__, "data/integration/regression_MODEL_for_columns.txt")
+ sys.argv = ['test',
+ library,
+ input_txt,
+ 'Capillary',
+ 'Semi-standard non-polar',
+ outfile_txt,
+ 'DB-5',
+ regress_model]
+ # Execute main function with arguments provided through sys.argv
+ library_lookup.main()
+
+
+
+ def test_combine_output_advanced(self):
+ '''
+ Variant on test case above, but a bit more complex as some of the centrotypes have
+ different NIST hits which should give them different RI values. This test also
+ runs not only the combine output, but the other two preceding steps as well,
+ so it ensures the integration also works on the current code of all three tools.
+ '''
+
+ # Run RankFilter
+ self.def_test_rank_filter_advanced()
+
+ # Run library CAS RI lookup
+ self.def_test_library_lookup_advanced()
+
+ outdir = "output/integration/"
+ outfile_base = os.path.join(outdir, 'produced_combine_output')
+ outfile_single_txt = outfile_base + '_single.txt'
+ outfile_multi_txt = outfile_base + '_multi.txt'
+
+ #Build up arguments and run
+ input_rankfilter = resource_filename(__name__, "output/integration/produced_rank_filter_out.txt")
+ input_caslookup = resource_filename(__name__, "output/integration/produced_library_lookup_ADVANCED.txt")
+ sys.argv = ['test',
+ input_rankfilter,
+ input_caslookup,
+ outfile_single_txt,
+ outfile_multi_txt]
+ # Execute main function with arguments provided through sys.argv
+ combine_output.main()
+ #Compare with reference files
+# reference_single_txt = resource_filename(__name__, 'reference/produced_combine_output_single.txt')
+# reference_multi_txt = resource_filename(__name__, 'reference/produced_combine_output_multi.txt')
+# self.failUnlessEqual(_read_file(reference_single_txt), _read_file(outfile_single_txt))
+# self.failUnlessEqual(_read_file(reference_multi_txt), _read_file(outfile_multi_txt))
+
+ # Check 1: output single should have one record per centrotype:
+
+
+        # Check 2: output single has more records than output multi:
+ combine_result_single_items = combine_output._process_data(outfile_single_txt)
+ combine_result_multi_items = combine_output._process_data(outfile_multi_txt)
+ self.assertGreater(len(combine_result_single_items['Centrotype']),
+ len(combine_result_multi_items['Centrotype']))
+
+
+ # Check 3: library_lookup RI column, centrotype column, ri_svr column are correct:
+ caslookup_items = combine_output._process_data(input_caslookup)
+ rankfilter_items = combine_output._process_data(input_rankfilter)
+
+ # check that the caslookup RI column is correctly maintained in its original order in
+ # the combined file:
+ ri_caslookup = caslookup_items['RI']
+ ri_combine_single = combine_result_single_items['RI']
+ self.assertListEqual(ri_caslookup, ri_combine_single)
+
+ # check the centrotype column's integrity:
+ centrotype_caslookup = caslookup_items['Centrotype']
+ centrotype_combine_single = combine_result_single_items['Centrotype']
+ centrotype_rankfilter = _get_centrotype_rankfilter(rankfilter_items['ID'])
+ self.assertListEqual(centrotype_caslookup, centrotype_combine_single)
+ self.assertListEqual(centrotype_caslookup, centrotype_rankfilter)
+
+ # integration and integrity checks:
+ file_NIST = resource_filename(__name__, "data/integration/NIST_identification_results_tabular.txt")
+ file_NIST_items = combine_output._process_data(file_NIST)
+ # check that rank filter output has exactly the same ID items as the original NIST input file:
+ self.assertListEqual(file_NIST_items['ID'], rankfilter_items['ID'])
+ # check the same for the CAS column:
+ self.assertListEqual(_get_strippedcas(file_NIST_items['CAS']), rankfilter_items['CAS'])
+ # now check the NIST CAS column against the cas lookup results:
+ cas_NIST = _get_processedcas(file_NIST_items['CAS'])
+ self.assertListEqual(cas_NIST, caslookup_items['CAS'])
+ # now check the CAS of the combined result. If all checks are OK, it means the CAS column's order
+ # and values remained stable throughout all steps:
+ self.assertListEqual(rankfilter_items['CAS'], combine_result_single_items['CAS'])
+
+ # check that the rankfilter RIsvr column is correctly maintained in its original order in
+ # the combined file:
+ risvr_rankfilter = rankfilter_items['RIsvr']
+ risvr_combine_single = combine_result_single_items['RIsvr']
+ self.assertListEqual(risvr_rankfilter, risvr_combine_single)
+
+
+
+
+def _get_centrotype_rankfilter(id_list):
+ '''
+ returns the list of centrotype ids given a list of ID in the
+ form e.g. 74-1.0-564-1905200-7, where the numbers before the
+ first "-" are the centrotype id
+ '''
+ result = []
+ for compound_id_idx in xrange(len(id_list)):
+ compound_id = id_list[compound_id_idx]
+ centrotype = compound_id.split('-')[0]
+ result.append(centrotype)
+
+ return result
+
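+# For example, _get_centrotype_rankfilter(['74-1.0-564-1905200-7']) would return ['74']
+# (the ID format is taken from the docstring above; the value itself is illustrative).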
+
+def _get_processedcas(cas_list):
+ '''
+    returns the list of cas numbers in the form C64175 instead of 64-17-5
+ '''
+ result = []
+ for cas_id_idx in xrange(len(cas_list)):
+ cas = cas_list[cas_id_idx]
+ processed_cas = 'C' + str(cas.replace('-', '').strip())
+ result.append(processed_cas)
+
+ return result
+
+def _get_strippedcas(cas_list):
+ '''
+ removes the leading white space from e.g. " 64-17-5"
+ '''
+ result = []
+ for cas_id_idx in xrange(len(cas_list)):
+ cas = cas_list[cas_id_idx]
+ processed_cas = cas.strip()
+ result.append(processed_cas)
+
+ return result
+
+
+def _read_file(filename):
+ '''
+ Helper method to quickly read a file
+ @param filename:
+ '''
+ with open(filename) as handle:
+ return handle.read()
diff -r 000000000000 -r dffc38727496 test/test_combine_output.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_combine_output.py Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,106 @@
+'''
+Created on Mar 27, 2012
+
+@author: marcelk
+'''
+from GCMS import combine_output
+from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611
+import os
+import shutil
+import tempfile
+import unittest
+
+
+class Test(unittest.TestCase):
+ '''
+ Tests for the 'combine_output' Galaxy tool
+ '''
+
+ def setUp(self):
+ self.rf_output = resource_filename(__name__, "data/RankFilter.txt")
+ self.cl_output = resource_filename(__name__, "data/CasLookup.txt")
+
+ def test_process_data(self):
+ '''
+ Tests the processing of the RankFilter and CasLookup files into dictionaries
+ '''
+ rfdata = combine_output._process_data(self.rf_output)
+ cldata = combine_output._process_data(self.cl_output)
+ self.assertEqual(set([' 18457-04-0', ' 55133-95-4', ' 58-08-2', ' 112-34-5']), set(rfdata['CAS']))
+ self.assertEqual(set(['C58082', 'C18457040', 'C55133954', 'C112345']), set(cldata['CAS']))
+
+ def test_add_hit(self):
+ '''
+ Tests the combination of two records from both the RankFilter- and CasLookup-tools
+ '''
+ rfdata = combine_output._process_data(self.rf_output)
+ cldata = combine_output._process_data(self.cl_output)
+ index = 0
+ rf_record = dict(zip(rfdata.keys(), [rfdata[key][index] for key in rfdata.keys()]))
+ cl_record = dict(zip(cldata.keys(), [cldata[key][index] for key in cldata.keys()]))
+
+ hit = combine_output._add_hit(rf_record, cl_record)
+ self.assertEqual(len(hit), 27)
+
+ # Pass empty record, should fail combination
+ self.assertRaises(KeyError, combine_output._add_hit, rf_record, {})
+
+ def test_merge_data(self):
+ '''
+ Tests the merging of the RankFilter and CasLookup data
+ '''
+ rfdata = combine_output._process_data(self.rf_output)
+ cldata = combine_output._process_data(self.cl_output)
+ merged, _ = combine_output._merge_data(rfdata, cldata)
+        centrotypes = self._get_centrotypes(merged)
+        self.failUnless(all(centrotype in centrotypes for centrotype in ('2716', '12723', '3403', '12710')))
+
+    def _get_centrotypes(self, merged):
+        '''
+        returns centrotype codes found in merged set
+        '''
+        result = []
+        for item_idx in xrange(len(merged)):
+            item = merged[item_idx]
+            centrotype = item[0][0]
+            result.append(centrotype)
+
+        return result
+
+ def test_remove_formula(self):
+ '''
+ Tests the removal of the Formula from the 'Name' field (RankFilter output)
+ '''
+ name = "Caffeine C8H10N4O2"
+ compound_name, compound_formula = combine_output._remove_formula(name)
+ self.assertEqual(compound_name, 'Caffeine')
+ self.assertEqual(compound_formula, 'C8H10N4O2')
+ name = "Ethanol C2H6O"
+ compound_name, compound_formula = combine_output._remove_formula(name)
+ self.assertEqual(compound_name, 'Ethanol')
+ self.assertEqual(compound_formula, 'C2H6O')
+ # No formula to remove
+ name = "Butanoic acid, 4-[(trimethylsilyl)oxy]-, trimethylsilyl ester"
+ compound_name, compound_formula = combine_output._remove_formula(name)
+ self.assertEqual(compound_name, name)
+ self.assertEqual(compound_formula, False)
+
+ def test_save_data(self):
+ '''
+ Tests the creation of the output tabular files (no content testing)
+ '''
+ temp_folder = tempfile.mkdtemp(prefix='gcms_combine_output_')
+ saved_single_data = '{0}/{1}'.format(temp_folder, 'output_single.tsv')
+ saved_multi_data = '{0}/{1}'.format(temp_folder, 'output_multi.tsv')
+ rfdata = combine_output._process_data(self.rf_output)
+ cldata = combine_output._process_data(self.cl_output)
+ merged, nhits = combine_output._merge_data(rfdata, cldata)
+ combine_output._save_data(merged, nhits, saved_single_data, saved_multi_data)
+ self.failUnless(os.path.exists(saved_single_data))
+ self.failUnless(os.path.exists(saved_multi_data))
+ shutil.rmtree(temp_folder)
+
+
+if __name__ == "__main__":
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
diff -r 000000000000 -r dffc38727496 test/test_export_to_metexp_tabular.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_export_to_metexp_tabular.py Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,112 @@
+'''Integration tests for the GCMS project'''
+
+from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611
+from GCMS import export_to_metexp_tabular
+import os.path
+import sys
+import unittest
+
+
+class IntegrationTest(unittest.TestCase):
+
+
+ def test_MM_calculations(self):
+ '''
+ test the implemented method for MM calculations for
+ given chemical formulas
+ '''
+ export_to_metexp_tabular.init_elements_and_masses_map()
+
+ formula = "C8H18O3"
+ # should be = 12.01*8 + 1.01*18 + 16*3 = 162.26
+ result = export_to_metexp_tabular.get_molecular_mass(formula)
+ self.assertEqual(162.26, result)
+
+ formula = "CH2O3Fe2Ni"
+ # should be = 12.01*1 + 1.01*2 + 16*3 + 55.85*2 + 58.71 = 232.44
+ result = export_to_metexp_tabular.get_molecular_mass(formula)
+ self.assertAlmostEqual(232.44, result, 2)
+
+
+
+
+
+ def test_combine_output_simple(self):
+ '''
+        Runs export_to_metexp_tabular.main() on example input files and checks that it completes without errors
+ '''
+ # Create out folder
+ outdir = "output/metexp/"
+ if not os.path.exists(outdir):
+ os.makedirs(outdir)
+
+ #Build up arguments and run
+
+ rankfilter_and_caslookup_combined_file = resource_filename(__name__, "data/dummy1_produced_combine_output_single.txt")
+ msclust_quantification_and_spectra_file = resource_filename(__name__, "data/dummy1_sim.txt")
+ output_csv = resource_filename(__name__, outdir + "metexp_tabular.txt")
+
+ sys.argv = ['test',
+ rankfilter_and_caslookup_combined_file,
+ msclust_quantification_and_spectra_file,
+ output_csv,
+ 'tomato',
+ 'leafs',
+ 'test experiment',
+ 'pieter',
+ 'DB5 column']
+
+ # Execute main function with arguments provided through sys.argv
+ export_to_metexp_tabular.main()
+
+ '''
+ # Asserts are based on reading in with process_data and comparing values of
+ # certain columns
+
+ # Check 3: library_lookup RI column, centrotype column, ri_svr column are correct:
+ caslookup_items = combine_output._process_data(input_caslookup)
+ rankfilter_items = combine_output._process_data(input_rankfilter)
+
+ # check that the caslookup RI column is correctly maintained in its original order in
+ # the combined file:
+ ri_caslookup = caslookup_items['RI']
+ ri_combine_single = combine_result_single_items['RI']
+ self.assertListEqual(ri_caslookup, ri_combine_single)
+
+ # check the centrotype column's integrity:
+ centrotype_caslookup = caslookup_items['Centrotype']
+ centrotype_combine_single = combine_result_single_items['Centrotype']
+ centrotype_rankfilter = _get_centrotype_rankfilter(rankfilter_items['ID'])
+ self.assertListEqual(centrotype_caslookup, centrotype_combine_single)
+ self.assertListEqual(centrotype_caslookup, centrotype_rankfilter)
+
+ # integration and integrity checks:
+ file_NIST = resource_filename(__name__, "data/integration/NIST_identification_results_tabular.txt")
+ file_NIST_items = combine_output._process_data(file_NIST)
+ # check that rank filter output has exactly the same ID items as the original NIST input file:
+ self.assertListEqual(file_NIST_items['ID'], rankfilter_items['ID'])
+ # check the same for the CAS column:
+ self.assertListEqual(_get_strippedcas(file_NIST_items['CAS']), rankfilter_items['CAS'])
+ # now check the NIST CAS column against the cas lookup results:
+ cas_NIST = _get_processedcas(file_NIST_items['CAS'])
+ self.assertListEqual(cas_NIST, caslookup_items['CAS'])
+ # now check the CAS of the combined result. If all checks are OK, it means the CAS column's order
+ # and values remained stable throughout all steps:
+ self.assertListEqual(rankfilter_items['CAS'], combine_result_single_items['CAS'])
+
+ # check that the rankfilter RIsvr column is correctly maintained in its original order in
+ # the combined file:
+ risvr_rankfilter = rankfilter_items['RIsvr']
+ risvr_combine_single = combine_result_single_items['RIsvr']
+ self.assertListEqual(risvr_rankfilter, risvr_combine_single)
+ '''
+
+
+
+def _read_file(filename):
+ '''
+ Helper method to quickly read a file
+ @param filename:
+ '''
+ with open(filename) as handle:
+ return handle.read()
diff -r 000000000000 -r dffc38727496 test/test_library_lookup.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_library_lookup.py Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,180 @@
+'''
+Created on Mar 6, 2012
+
+@author: marcelk
+'''
+from GCMS import library_lookup, match_library
+from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611
+import os
+import shutil
+import tempfile
+import unittest
+
+
+class Test(unittest.TestCase):
+ '''
+ Tests the 'library_lookup' Galaxy tool
+ '''
+
+ def setUp(self):
+ self.ri_database = resource_filename(__name__, "data/RIDB_subset.txt")
+ self.nist_output = resource_filename(__name__, "data/NIST_tabular.txt")
+ self.ridb_poly_regress = resource_filename(__name__, "data/ridb_poly_regression.txt")
+ self.ridb_linear_regress = resource_filename(__name__, "data/ridb_linear_regression.txt")
+
+ def test_create_lookup_table(self):
+ '''
+ Tests the 'create_lookup_table' function
+ '''
+ column_type = 'Capillary'
+ polarity = 'Semi-standard non-polar'
+ lookup_dict = library_lookup.create_lookup_table(self.ri_database, column_type, polarity)
+ self.assertFalse(False in [res[4] == 'Capillary' for res in lookup_dict['4177166']])
+ self.assertEqual(['C51276336', '2,6-Dimethyl-octa-1,7-dien-3,6-diol', 'C10H18O2',
+ '1277', 'Capillary', 'Semi-standard non-polar', 'DB-5MS', '1',
+ 'C51276336_DB-5MS', '', '', ''], lookup_dict['51276336'][1])
+
+ def test_read_model(self):
+ '''
+ Tests reading the regression model data containing the parameters required for converting
+ retention indices between GC-columns
+ '''
+ model, _ = library_lookup._read_model(self.ridb_poly_regress)
+ # Order of values: coefficient 1 through 4, left limit, right limit
+ # Polynomial model
+ self.assertEqual([20.6155874639486, 0.945187096379008, 3.96480787567566e-05, -9.04377237159287e-09,
+ 628.0, 2944.0, 405.0, 0, 0.998685262365514], model['HP-5']['SE-54'])
+ self.assertEqual([-92.3963391356951, 1.26116176393346, -0.000191991657547972, 4.15387371263164e-08,
+ 494.0, 2198.0, 407.0, 0, 0.996665023122993], model['Apiezon L']['Squalane'])
+ # Linear model
+ model, _ = library_lookup._read_model(self.ridb_linear_regress)
+ self.assertEqual([2.81208738561543, 0.99482475526584, 628.0, 2944.0, 405.0, 0, 0.998643883946458],
+ model['HP-5']['SE-54'])
+ self.assertEqual([19.979922768462, 0.993741869298272, 494.0, 2198.0, 407.0, 0, 0.99636062891041],
+ model['Apiezon L']['Squalane'])
+
+ def test_apply_regression(self):
+ '''
+ Tests the regression model on some arbitrary retention indices
+ '''
+ poly_model, _ = library_lookup._read_model(self.ridb_poly_regress)
+ linear_model, _ = library_lookup._read_model(self.ridb_linear_regress)
+ retention_indices = [1000, 1010, 1020, 1030, 1040, 1050]
+ converted_poly = []
+ converted_linear = []
+ for ri in retention_indices:
+ converted_poly.append(library_lookup._apply_poly_regression('HP-5', 'DB-5', ri, poly_model))
+ converted_linear.append(library_lookup._apply_linear_regression('HP-5', 'DB-5', ri, linear_model))
+
+ self.assertEqual([1003.0566541860778, 1013.0979459524663, 1023.1358645806529, 1033.170466241159,
+ 1043.2018071045052, 1053.2299433412131], converted_poly)
+ self.assertEqual([1001.8127584915925, 1011.830140783027, 1021.8475230744615, 1031.864905365896,
+ 1041.8822876573306, 1051.899669948765], converted_linear)
+
+ # Test polynomial limit detection, the following RI falls outside of the possible limits
+ ri = 3400
+ converted_poly = library_lookup._apply_poly_regression('HP-5', 'DB-5', ri, poly_model)
+ self.assertEqual(False, converted_poly)
+
+ def test_preferred_hit(self):
+ ''' Tests the matching of the hits with the preferred column, including regression '''
+ model, method = library_lookup._read_model(self.ridb_poly_regress)
+ column_type = 'Capillary'
+ polarity = 'Semi-standard non-polar'
+ lookup_dict = library_lookup.create_lookup_table(self.ri_database, column_type, polarity)
+ hits = lookup_dict['150867']
+ # No regression, should however consider order of given preference
+ match = library_lookup._preferred(hits, ['SE-52', 'DB-5', 'HP-5'], column_type, polarity, model, method)
+ expected = (['C150867', '(E)-phytol', 'C20H40O', '2110', 'Capillary',
+ 'Semi-standard non-polar', 'SE-52', '', 'C150867_SE-52', '', '', ''], False)
+ self.assertEqual(expected, match)
+
+ # Perform regression by looking for 'OV-101' which isn't there. 'SE-52' has the best regression model
+ # of the available columns
+ match = library_lookup._preferred(hits, ['OV-101'], column_type, polarity, model, method)
+ expected = (['C150867', '(E)-phytol', 'C20H40O', 2158.5769891569125, 'Capillary',
+ 'Semi-standard non-polar', 'SE-52', '', 'C150867_SE-52', '', '', ''], 'SE-52')
+ self.assertEqual(expected, match)
+
+ def test_format_result(self):
+ '''
+ Tests the 'format_result' function
+ '''
+ column_type = 'Capillary'
+ polarity = 'Semi-standard non-polar'
+
+ # Look for DB-5
+ pref_column = ['DB-5']
+ model, method = library_lookup._read_model(self.ridb_poly_regress)
+ lookup_dict = library_lookup.create_lookup_table(self.ri_database, column_type, polarity)
+ data = library_lookup.format_result(lookup_dict, self.nist_output, pref_column, column_type,
+ polarity, model, method)#False, None)
+
+ # remove non-hits from set:
+ data = _get_hits_only(data)
+ self.assertEqual(['C544354', 'Ethyl linoleate', 'C20H36O2', '2155', 'Capillary', 'Semi-standard non-polar',
+ 'DB-5', '1', 'C544354_DB-5', '1810', 'None', '', '', '0'], data[20])
+ self.assertEqual(111, len(data))
+
+ # Look for both DB-5 and HP-5
+ pref_column = ['DB-5', 'HP-5']
+ data = library_lookup.format_result(lookup_dict, self.nist_output, pref_column, column_type,
+ polarity, False, None)
+ # remove non-hits from set:
+ data = _get_hits_only(data)
+ self.assertEqual(['C502614', '.beta.-(E)-Farnesene', 'C15H24', '1508', 'Capillary', 'Semi-standard non-polar',
+ 'DB-5', '1', 'C502614_DB-5', '942', 'None', '1482', '1522', '22'], data[50])
+ self.assertEqual(106, len(data))
+
+
+ def test_save_data(self):
+ '''
+ Tests the creation of the output tabular file
+ '''
+ temp_folder = tempfile.mkdtemp(prefix='gcms_combine_output_')
+ saved_data = '{0}/{1}'.format(temp_folder, 'output.tsv')
+ column_type = 'Capillary'
+ polarity = 'Semi-standard non-polar'
+ pref_column = ['DB-5']
+ lookup_dict = library_lookup.create_lookup_table(self.ri_database, column_type, polarity)
+ data = library_lookup.format_result(lookup_dict, self.nist_output, pref_column, column_type, polarity, False, None)
+ library_lookup._save_data(data, saved_data)
+ self.failUnless(os.path.exists(saved_data))
+ shutil.rmtree(temp_folder)
+
+
+ def test_match_library_get_lib_files(self):
+ '''
+ Tests the match_library.py functionality
+ '''
+ riqc_libs_dir = resource_filename(__name__, "../repositories/PRIMS-metabolomics/RI_DB_libraries")
+ get_library_files_output = match_library.get_directory_files(riqc_libs_dir)
+ self.assertEqual(2, len(get_library_files_output))
+ self.assertEqual("Library_RI_DB_capillary_columns-noDuplicates", get_library_files_output[0][0])
+ #TODO change assert below to assert that the result is a file, so the test can run on other dirs as well:
+ #self.assertEqual("E:\\workspace\\PRIMS-metabolomics\\python-tools\\tools\\GCMS\\test\\data\\riqc_libs\\RI DB library (capillary columns) Dec.2012.txt", get_library_files_output[0][1])
+ #self.assertEqual("RI DB library (capillary columns) Jan.2013", get_library_files_output[1][0])
+ try:
+ get_library_files_output = match_library.get_directory_files("/blah")
+ # should not come here
+ self.assertTrue(False)
+ except:
+ # should come here
+ self.assertTrue(True)
+
+def _get_hits_only(data):
+ '''
+    removes items that have RI == 0.0 and Name == '' (these are dummy lines just for the output)
+ '''
+ result = []
+ for item_idx in xrange(len(data)):
+ item = data[item_idx]
+ if item[1] != '' and item[3] > 0.0 :
+ result.append(item)
+
+ return result
+
+
+if __name__ == "__main__":
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
diff -r 000000000000 -r dffc38727496 test/test_match_library.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_match_library.py Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,51 @@
+'''
+Created on Mar 6, 2012
+
+@author: marcelk
+'''
+from GCMS import match_library
+import unittest
+from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611
+
+
+class Test(unittest.TestCase):
+ '''
+ Tests the 'match_library' Galaxy tool
+ '''
+ nist_db = resource_filename(__name__, "data/RIDB_subset.txt")
+
+ def test_get_column_type(self):
+ '''
+ Tests the 'get_column_type' function that returns tuples of unique elements
+ for column types in the RI database
+ '''
+ galaxy_output = match_library.get_column_type(self.nist_db)
+ self.assertEqual([('Capillary(9999)', 'Capillary', False)], galaxy_output)
+
+ def test_filter_column(self):
+ '''
+ Tests the 'filter_column' function showing the column phase for all 'Capillary' columns
+ '''
+ galaxy_output = match_library.filter_column(self.nist_db, 'Capillary')
+ self.assertEqual([('Semi-standard non-polar(9999)', 'Semi-standard non-polar', False)], galaxy_output)
+
+ def test_filter_column2(self):
+ '''
+ Tests the 'filter_column' function showing all possibilities for columns having both the
+ 'Capillary' and 'Semi-standard non-polar' properties
+ '''
+ galaxy_output = match_library.filter_column2(self.nist_db, 'Capillary', 'Semi-standard non-polar')
+ self.failUnless(('Apiezon M(6)', 'Apiezon M', False) in galaxy_output)
+
+ def test_count_occurrence(self):
+ '''
+ Tests the 'count_occurrence' function
+ '''
+ test_list = [2, 0, 0, 2, 1, 3, 4, 5, 3, 2, 3, 4, 5, 5, 4, 2, 5, 3, 4, 3, 5, 4, 2, 0, 4]
+ counts = match_library.count_occurrence(test_list)
+ self.assertEqual({0: 3, 1: 1, 2: 5, 3: 5, 4: 6, 5: 5}, counts)
+
+
+if __name__ == "__main__":
+ #import sys;sys.argv = ['', 'Test.testName']
+ unittest.main()
diff -r 000000000000 -r dffc38727496 test/test_query_mass_repos.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_query_mass_repos.py Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,62 @@
+'''Integration tests for the GCMS project'''
+
+from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611
+from MS import query_mass_repos
+import os.path
+import sys
+import unittest
+
+
+class IntegrationTest(unittest.TestCase):
+
+
+
+
+ def test_simple(self):
+ '''
+ Simple initial test
+ '''
+ # Create out folder
+ outdir = "output/query_mass_repos/"
+ if not os.path.exists(outdir):
+ os.makedirs(outdir)
+
+ #Build up arguments and run
+
+ # input_file = sys.argv[1]
+ # molecular_mass_col = sys.argv[2]
+ # repository_file = sys.argv[3]
+ # mass_tolerance = float(sys.argv[4])
+ # output_result = sys.argv[5]
+
+ input_file = resource_filename(__name__, "data/service_query_tabular.txt")
+
+ molecular_mass_col = "mass (Da)"
+ dblink_file = resource_filename(__name__, "data/MFSearcher ExactMassDB service.txt")
+ output_result = resource_filename(__name__, outdir + "metexp_query_results_added.txt")
+
+
+
+
+ sys.argv = ['test',
+ input_file,
+ molecular_mass_col,
+ dblink_file,
+ '0.001',
+ 'ms',
+ output_result]
+
+ # Execute main function with arguments provided through sys.argv
+ query_mass_repos.main()
+
+
+
+
+
+def _read_file(filename):
+ '''
+ Helper method to quickly read a file
+ @param filename:
+ '''
+ with open(filename) as handle:
+ return handle.read()
diff -r 000000000000 -r dffc38727496 test/test_query_metexp.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_query_metexp.py Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,83 @@
+'''Integration tests for the GCMS project'''
+
+from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611
+from GCMS import query_metexp
+import os.path
+import sys
+import unittest
+
+
+class IntegrationTest(unittest.TestCase):
+
+
+# copied from test_export_to_metexp_tabular.py
+# def test_MM_calculations(self):
+# '''
+# test the implemented method for MM calculations for
+# given chemical formulas
+# '''
+# export_to_metexp_tabular.init_elements_and_masses_map()
+#
+# formula = "C8H18O3"
+# # should be = 12.01*8 + 1.01*18 + 16*3 = 162.26
+# result = export_to_metexp_tabular.get_molecular_mass(formula)
+# self.assertEqual(162.26, result)
+#
+# formula = "CH2O3Fe2Ni"
+# # should be = 12.01*1 + 1.01*2 + 16*3 + 55.85*2 + 58.71 = 232.44
+# result = export_to_metexp_tabular.get_molecular_mass(formula)
+# self.assertAlmostEqual(232.44, result, 2)
+#
+#
+#
+
+
+ def test_simple(self):
+ '''
+ Simple initial test
+ '''
+ # Create out folder
+ outdir = "output/metexp_query/"
+ if not os.path.exists(outdir):
+ os.makedirs(outdir)
+
+ #Build up arguments and run
+
+ # input_file = sys.argv[1]
+ # molecular_mass_col = sys.argv[2]
+ # formula_col = sys.argv[3]
+ # metexp_dblink_file = sys.argv[4]
+ # output_result = sys.argv[5]
+
+ input_file = resource_filename(__name__, "data/metexp_query_tabular.txt")
+ casid_col = "CAS"
+ formula_col = "FORMULA"
+ molecular_mass_col = "MM"
+ metexp_dblink_file = resource_filename(__name__, "data/METEXP Test DB.txt")
+ output_result = resource_filename(__name__, outdir + "metexp_query_results_added.txt")
+
+ sys.argv = ['test',
+ input_file,
+ casid_col,
+ formula_col,
+ molecular_mass_col,
+ metexp_dblink_file,
+ 'GC',
+ output_result]
+
+ # Execute main function with arguments provided through sys.argv
+ query_metexp.main()
+
+        # TODO - asserts (base them on the DB being filled with test data from the metexp unit test for the upload method)
+ # PA
+
+
+
+
+def _read_file(filename):
+ '''
+ Helper method to quickly read a file
+ @param filename:
+ '''
+ with open(filename) as handle:
+ return handle.read()
diff -r 000000000000 -r dffc38727496 test/test_query_metexp_LARGE.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test/test_query_metexp_LARGE.py Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,79 @@
+'''Integration tests for the GCMS project'''
+
+from pkg_resources import resource_filename # @UnresolvedImport # pylint: disable=E0611
+from GCMS import query_metexp
+import os.path
+import sys
+import unittest
+
+
+class IntegrationTest(unittest.TestCase):
+
+
+# def test_MM_calculations(self):
+# '''
+# test the implemented method for MM calculations for
+# given chemical formulas
+# '''
+# export_to_metexp_tabular.init_elements_and_masses_map()
+#
+# formula = "C8H18O3"
+# # should be = 12.01*8 + 1.01*18 + 16*3 = 162.26
+# result = export_to_metexp_tabular.get_molecular_mass(formula)
+# self.assertEqual(162.26, result)
+#
+# formula = "CH2O3Fe2Ni"
+# # should be = 12.01*1 + 1.01*2 + 16*3 + 55.85*2 + 58.71 = 232.44
+# result = export_to_metexp_tabular.get_molecular_mass(formula)
+# self.assertAlmostEqual(232.44, result, 2)
+#
+#
+#
+
+
+ def test_large(self):
+ '''
+        Simple test, but on a larger set; the last test executed in 28s
+ '''
+ # Create out folder
+ outdir = "output/metexp_query/"
+ if not os.path.exists(outdir):
+ os.makedirs(outdir)
+
+ #Build up arguments and run
+
+ # input_file = sys.argv[1]
+ # molecular_mass_col = sys.argv[2]
+ # formula_col = sys.argv[3]
+ # metexp_dblink_file = sys.argv[4]
+ # output_result = sys.argv[5]
+
+ input_file = resource_filename(__name__, "data/metexp_query_tabular_large.txt")
+ casid_col = "CAS"
+ formula_col = "FORMULA"
+ molecular_mass_col = "MM"
+ metexp_dblink_file = resource_filename(__name__, "data/METEXP Test DB.txt")
+ output_result = resource_filename(__name__, outdir + "metexp_query_results_added_LARGE.txt")
+
+ sys.argv = ['test',
+ input_file,
+ casid_col,
+ formula_col,
+ molecular_mass_col,
+ metexp_dblink_file,
+ 'GC',
+ output_result]
+
+ # Execute main function with arguments provided through sys.argv
+ query_metexp.main()
+
+
+
+
+def _read_file(filename):
+ '''
+ Helper method to quickly read a file
+ @param filename:
+ '''
+ with open(filename) as handle:
+ return handle.read()
diff -r 000000000000 -r dffc38727496 tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_dependencies.xml Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,13 @@
+
+
+
+
+
+
+
+ This dependency:
+ Ensures R 3.1.1 installation is triggered (via dependency).
+ Ensures Bioconductor 3.0 and the packages metaMS, multtest and snow are installed.
+
+
\ No newline at end of file
diff -r 000000000000 -r dffc38727496 xcms_differential_analysis.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xcms_differential_analysis.r Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,85 @@
+## read args:
+args <- commandArgs(TRUE)
+#cat("args <- \"\"\n")
+## a xcms xset saved as .RData
+args.xsetData <- args[1]
+#cat(paste("args.xsetData <- \"", args[1], "\"\n", sep=""))
+
+args.class1 <- args[2]
+args.class2 <- args[3]
+#cat(paste("args.class1 <- \"", args[2], "\"\n", sep=""))
+#cat(paste("args.class2 <- \"", args[3], "\"\n", sep=""))
+
+args.topcount <- strtoi(args[4])
+#cat(paste("args.topcount <- ", args[4], "\n", sep=""))
+
+args.outTable <- args[5]
+
+## report files
+args.htmlReportFile <- args[6]
+args.htmlReportFile.files_path <- args[7]
+#cat(paste("args.htmlReportFile <- \"", args[6], "\"\n", sep=""))
+#cat(paste("args.htmlReportFile.files_path <- \"", args[7], "\"\n", sep=""))
+
+
+if (length(args) == 8)
+{
+ args.outLogFile <- args[8]
+ # suppress messages:
+ # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888
+ msg <- file(args.outLogFile, open="wt")
+ sink(msg, type="message")
+ sink(msg, type="output")
+}
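+
+# Illustrative invocation (hypothetical file names; argument order as parsed above):
+#   Rscript xcms_differential_analysis.r xset.RData classA classB 10 out_table.tsv report.html report_files out.log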
+
+tryCatch(
+ {
+ library(metaMS)
+ library(xcms)
+ #library("R2HTML")
+
+ # load the xset data :
+ xsetData <- readRDS(args.xsetData)
+ # if here to support both scenarios:
+ if ("xcmsSet" %in% slotNames(xsetData) )
+ {
+ xsetData <- xsetData@xcmsSet
+ }
+
+
+ # info: levels(xcmsSet@phenoData$class) also gives access to the class names
+ dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE)
+ # set cairo as default for png plots:
+ png = function (...) grDevices::png(...,type='cairo')
+ # run diffreport
+ reporttab <- diffreport(xsetData, args.class1, args.class2, paste(args.htmlReportFile.files_path,"/fig", sep=""), args.topcount, metlin = 0.15, h=480, w=640)
+
+ # write out tsv table:
+ write.table(reporttab, args.outTable, sep="\t", row.names=FALSE)
+
+ message("\nGenerating report.........")
+
+ cat("
Differential analysis report
", file= args.htmlReportFile)
+ #HTML(reporttab[1:args.topcount,], file= args.htmlReportFile)
+ figuresPath <- paste(args.htmlReportFile.files_path, "/fig_eic", sep="")
+ message(figuresPath)
+ listOfFiles <- list.files(path = figuresPath)
+ for (i in 1:length(listOfFiles))
+ {
+ figureName <- listOfFiles[i]
+ # maybe we still need to copy the figures to the args.htmlReportFile.files_path
+ cat(paste("", sep=""), file= args.htmlReportFile, append=TRUE)
+ cat(paste("", sep=""), file= args.htmlReportFile, append=TRUE)
+ }
+
+ message("finished generating report")
+ cat("\nWarnings================:\n")
+ str( warnings() )
+ },
+ error=function(cond) {
+ sink(NULL, type="message") # default setting
+ sink(stderr(), type="output")
+ message("\nERROR: ===========\n")
+ print(cond)
+ }
+ )
\ No newline at end of file
diff -r 000000000000 -r dffc38727496 xcms_differential_analysis.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xcms_differential_analysis.xml Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,55 @@
+
+ Runs the xcms diffreport function for differential analysis
+
+ R_bioc_metams
+
+
+ xcms_differential_analysis.r
+ $xsetData
+ "$class1"
+ "$class2"
+ $topcount
+ $outTable
+ $htmlReportFile
+ $htmlReportFile.files_path
+ $outLogFile
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+Runs xcms diffreport for showing the most significant differences between two sets/classes of samples. This tool also creates extracted ion chromatograms (EICs) for
+the most significant differences. The figure below shows an example of such an EIC.
+
+.. image:: $PATH_TO_IMAGES/diffreport.png
+
+
+
+
+
+
+ 10.1021/ac051437y
+
+
\ No newline at end of file
diff -r 000000000000 -r dffc38727496 xcms_get_alignment_eic.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xcms_get_alignment_eic.r Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,153 @@
+# shows all alignment results in a rt region
+## read args:
+args <- commandArgs(TRUE)
+# xset data:
+args.xsetData <- args[1]
+
+args.rtStart <- strtoi(args[2])
+args.rtEnd <- strtoi(args[3])
+
+# limit max diff to 600 and minNrSamples to at least 2 :
+if (args.rtEnd - args.rtStart > 600)
+ stop("The RT window should be <= 600")
+
+args.minNrSamples <- strtoi(args[4]) #min nr samples
+if (args.minNrSamples < 2)
+ stop("Set 'Minimum number of samples' to 2 or higher")
+
+
+args.sampleNames <- strsplit(args[5], ",")[[1]]
+# trim leading and trailing spaces:
+args.sampleNames <- gsub("^\\s+|\\s+$", "", args.sampleNames)
+
+## report files
+args.htmlReportFile <- args[6]
+args.htmlReportFile.files_path <- args[7]
+
+
+if (length(args) == 8)
+{
+ args.outLogFile <- args[8]
+ # suppress messages:
+ # Send all STDERR to STDOUT using sink() see http://mazamascience.com/WorkingWithData/?p=888
+ msg <- file(args.outLogFile, open="wt")
+ sink(msg, type="message")
+ sink(msg, type="output")
+}
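+
+# Illustrative invocation (hypothetical file names; argument order as parsed above):
+#   Rscript xcms_get_alignment_eic.r xset.RData 1200 1500 2 "sampleA,sampleB" report.html report_files out.log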
+
+
+
+tryCatch(
+ {
+ library(metaMS)
+
+ # load the xset data :
+ xsetData <- readRDS(args.xsetData)
+ # if here to support both scenarios:
+ if ("xcmsSet" %in% slotNames(xsetData) )
+ {
+ xsetData <- xsetData@xcmsSet
+ }
+
+ # report
+ dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE)
+ message(paste("\nGenerating report.........in ", args.htmlReportFile.files_path))
+
+ # report header (HTML markup assumed; original tags were stripped in this patch):
+ write(paste("<html><body><h1>Extracted Ion Chromatograms (EIC) of alignments with peaks in ", args.minNrSamples, " or more samples</h1>"),
+ file=args.htmlReportFile)
+
+ gt <- groups(xsetData)
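+ # groups() returns the feature-group summary matrix, with columns such as "mzmed", "rtmed",
+ # "npeaks" and the per-class peak counts; the filter below operates on these columns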
+ message("\nParsed groups... ")
+ groupidx1 <- which(gt[,"rtmed"] > args.rtStart & gt[,"rtmed"] < args.rtEnd & gt[,"npeaks"] >= args.minNrSamples) # note: ideally this filter would count only peaks from the selected samples
+
+ if (length(groupidx1) > 0)
+ {
+ message("\nGetting EIC... ")
+ eiccor <- getEIC(xsetData, groupidx = c(groupidx1), sampleidx=args.sampleNames)
+ #eicraw <- getEIC(xsetData, groupidx = c(groupidx1), sampleidx=args.sampleNames, rt = "raw")
+
+ #sampleNamesIdx <- which(sampnames(LC$xset@xcmsSet) %in% args.sampleNames, arr.ind = TRUE)
+ #or (from bioconductor code for getEIC): NB: this assumes the sample indexes used in the data follow the order of the sample names!
+ sampNames <- sampnames(xsetData)
+ sampleNamesIdx <- match( args.sampleNames, sampNames)
+ message(paste("Samples: ", sampleNamesIdx))
+
+ #TODO html <- paste(html, "")
+ message(paste("\nPlotting figures... "))
+
+ #get the mz list (interestingly, this [,"mz"] is relatively slow):
+ peakMzList <- xsetData@peaks[,"mz"]
+ peakSampleList <- xsetData@peaks[,"sample"]
+ #signal to noise list:
+ peakSnList <- xsetData@peaks[,"sn"]
+
+ message(paste("Total nr of peaks: ", length(peakMzList)))
+
+ for (i in 1:length(groupidx1))
+ {
+ groupMembers <- xsetData@groupidx[[groupidx1[i]]]
+
+ groupMzList <- ""
+ groupSampleList <- ""
+ finalGroupSize <- 0
+
+ for (j in 1:length(groupMembers))
+ {
+ #get only the peaks from the selected samples:
+ memberSample <- peakSampleList[groupMembers[j]]
+ memberSn <- peakSnList[groupMembers[j]]
+ if (!is.na(memberSn) && memberSample %in% sampleNamesIdx)
+ {
+ message(paste("Group: ", groupidx1[i], " / Member sample: ", memberSample))
+ memberMz <- peakMzList[groupMembers[j]]
+
+
+ if (finalGroupSize == 0)
+ {
+ groupMzList <- memberMz
+ groupSampleList <- sampNames[memberSample]
+ } else {
+ groupMzList <- paste(groupMzList,",",memberMz, sep="")
+ groupSampleList <- paste(groupSampleList,",",sampNames[memberSample], sep="")
+ }
+
+ finalGroupSize <- finalGroupSize +1
+ }
+ }
+ # here we do the real check on group size and only the groups that have enough
+ # members with signal to noise > 0 will be plotted here:
+ if (finalGroupSize >= args.minNrSamples)
+ {
+ message(paste("Plotting figure ",i, " of max ", length(groupidx1)," figures... "))
+
+ figureName <- paste(args.htmlReportFile.files_path, "/figure", i,".png", sep="")
+ # figure and caption markup below assumed (original HTML tags were stripped in this patch):
+ write(paste("<img src='figure", i, ".png' />", sep="") ,
+ file=args.htmlReportFile, append=TRUE)
+
+ png( figureName, type="cairo", width=800 )
+ plot(eiccor, xsetData, groupidx = i)
+ devname = dev.off()
+
+ write(paste("<p>Alignment id: [", groupidx1[i], "]. m/z list of peaks in alignment with signal/noise <> NA: ", groupMzList,
+ "<br/>m/z values found in the following samples respectively: ", groupSampleList, "</p>", sep="") ,
+ file=args.htmlReportFile, append=TRUE)
+ }
+ }
+
+ }
+
+ write("</body></html>", # closing tags assumed (original content stripped)
+ file=args.htmlReportFile, append=TRUE)
+ message("finished generating report")
+ # unlink(args.htmlReportFile)
+ cat("\nWarnings================:\n")
+ str( warnings() )
+ },
+ error=function(cond) {
+ sink(NULL, type="message") # default setting
+ sink(stderr(), type="output")
+ message("\nERROR: ===========\n")
+ print(cond)
+ }
+ )
diff -r 000000000000 -r dffc38727496 xcms_get_alignment_eic.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xcms_get_alignment_eic.xml Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,73 @@
+
+ Extracts alignment EICs from feature alignment data
+
+ R_bioc_metams
+
+
+ xcms_get_alignment_eic.r
+ $xsetData
+ $rtStart
+ $rtEnd
+ $minNrSamples
+ "$sampleNames"
+ $htmlReportFile
+ $htmlReportFile.files_path
+ $outLogFile
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+This tool finds the alignments in the specified RT window and extracts their EICs from the feature alignment data using XCMS's getEIC method.
+It produces an HTML report showing the EIC plots and the mass list of each alignment. The figure below shows an example of such an EIC plot, also illustrating the difference between
+two classes, with extra alignment information beneath it.
+
+.. image:: $PATH_TO_IMAGES/diffreport.png
+
+Alignment id: 1709. m/z list of peaks in alignment:
+614.002922098482,613.998019830021,614.000382307519,613.998229980469,613.998229980469
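+
+The selection and extraction step is essentially the following R fragment (a simplified sketch of what the wrapper script does; 'xset', 'rtStart', 'rtEnd', 'minNrSamples' and 'sampleNames' stand for the tool parameters above)::
+
+  library(xcms)
+  gt  <- groups(xset)
+  idx <- which(gt[,"rtmed"] > rtStart & gt[,"rtmed"] < rtEnd & gt[,"npeaks"] >= minNrSamples)
+  eic <- getEIC(xset, groupidx = idx, sampleidx = sampleNames)
+  plot(eic, xset, groupidx = 1)   # one plot per selected alignment group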
+
+
+
+
+ 10.1021/ac051437y
+
+
\ No newline at end of file
diff -r 000000000000 -r dffc38727496 xcms_get_mass_eic.r
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xcms_get_mass_eic.r Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,162 @@
+## read args:
+args <- commandArgs(TRUE)
+# xset data:
+args.xsetData <- args[1]
+
+args.rtStart <- strtoi(args[2])
+args.rtEnd <- strtoi(args[3])
+
+args.mzStart <- as.double(args[4])
+args.mzEnd <- as.double(args[5])
+# there are 2 options: specify a mz range or a mz list:
+if (args.mzStart < 0)
+{
+ args.mzList <- as.double(strsplit(args[6], ",")[[1]])
+ cat(typeof(args.mzList)) # debug output: confirm the m/z list was parsed as doubles
+ args.mzTolPpm <- as.double(args[7])
+ # calculate mzends based on ppm tol:
+ mzListEnd <- c()
+ mzListStart <- c()
+ for (i in 1:length(args.mzList))
+ {
+ mzEnd <- args.mzList[i] + args.mzList[i]*args.mzTolPpm/1000000.0
+ mzStart <- args.mzList[i] - args.mzList[i]*args.mzTolPpm/1000000.0
+ mzListEnd <- c(mzListEnd, mzEnd)
+ mzListStart <- c(mzListStart, mzStart)
+ }
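+ # (equivalently, vectorised:
+ #    mzListStart <- args.mzList * (1 - args.mzTolPpm/1e6)
+ #    mzListEnd   <- args.mzList * (1 + args.mzTolPpm/1e6) )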
+ str(mzListStart)
+ str(mzListEnd)
+} else {
+ mzListEnd <- c(args.mzEnd)
+ mzListStart <- c(args.mzStart)
+}
+
+args.sampleNames <- strsplit(args[8], ",")[[1]]
+# trim leading and trailing spaces:
+args.sampleNames <- gsub("^\\s+|\\s+$", "", args.sampleNames)
+
+args.combineSamples <- args[9]
+args.rtPlotMode <- args[10]
+
+## report files
+args.htmlReportFile <- args[11]
+args.htmlReportFile.files_path <- args[12]
+
+
+if (length(args) == 13)
+{
+ args.outLogFile <- args[13]
+ # suppress console output: redirect all R messages and output to the log file using sink();
+ # see http://mazamascience.com/WorkingWithData/?p=888
+ msg <- file(args.outLogFile, open="wt")
+ sink(msg, type="message")
+ sink(msg, type="output")
+}
+
+ # TODO - add option to plot masses in the same plot (e.g. when given on the same line) or in separate plots
+# TODO2 - let it run in parallel
+
+tryCatch(
+ {
+ library(metaMS)
+
+ # load the xset data :
+ xsetData <- readRDS(args.xsetData)
+ # this 'if' supports both input types: a plain xcmsSet, or an object wrapping one in an 'xcmsSet' slot:
+ if ("xcmsSet" %in% slotNames(xsetData) )
+ {
+ xsetData <- xsetData@xcmsSet
+ }
+
+ # report
+ dir.create(file.path(args.htmlReportFile.files_path), showWarnings = FALSE, recursive = TRUE)
+ message(paste("\nGenerating report.........in ", args.htmlReportFile.files_path))
+
+ # report header (HTML markup assumed; original tags were stripped in this patch):
+ html <- "<html><body><h1>Extracted Ion Chromatograms (EIC) matching criteria</h1>"
+
+ if (args.combineSamples == "No")
+ {
+ if (length(args.sampleNames) > 1 && length(mzListStart) > 1 && length(args.sampleNames) != length(mzListStart))
+ stop(paste("The number of sample names should match the number of m/z values in the list. Found ", length(mzListStart),
+ " masses while ", length(args.sampleNames), " sample names were given."))
+
+ iterSize <- length(args.sampleNames)
+ # index multipliers: set to 0 when the respective list has only one item (so that item is reused
+ # on every iteration), and left at 1 when the loop should step through the list
+ fixSampleIdx <- 1
+ fixMzListIdx <- 1
+ if (length(args.sampleNames) == 1)
+ {
+ fixSampleIdx <- 0
+ iterSize <- length(mzListStart)
+ }
+ if (length(mzListStart) == 1)
+ {
+ fixMzListIdx <- 0
+ }
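+ # example: with one sample and three masses, fixSampleIdx is 0 and fixMzListIdx is 1, so
+ # args.sampleNames[i*fixSampleIdx+1] always picks the single sample while
+ # mzListStart[i*fixMzListIdx+1] walks through masses 1, 2 and 3 as i runs from 0 to 2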
+ lineColors <- rainbow(iterSize)
+ for (i in 0:(iterSize-1))
+ {
+ message("\nGetting EIC... ")
+ eiccor <- getEIC(xsetData,
+ mzrange=matrix(c(mzListStart[i*fixMzListIdx+1],mzListEnd[i*fixMzListIdx+1]),nrow=1,ncol=2,byrow=TRUE),
+ rtrange=matrix(c(args.rtStart,args.rtEnd),nrow=1,ncol=2,byrow=TRUE),
+ sampleidx=c(args.sampleNames[i*fixSampleIdx+1]), rt=args.rtPlotMode)
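+ # getEIC takes the m/z and RT windows as 1-row (min, max) matrices; rt=args.rtPlotMode
+ # selects between raw and corrected retention times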
+
+ message("\nPlotting figures... ")
+ figureName <- paste(args.htmlReportFile.files_path, "/figure", i,".png", sep="")
+ # embed the figure in the report (img markup assumed; original tags stripped):
+ html <- paste(html, "<img src='figure", i, ".png' />", sep="")
+ png( figureName, type="cairo", width=1100,height=250 )
+ #plot(eiccor, col=lineColors[i+1])
+ # black is better in this case:
+ plot(eiccor)
+ legend('topright', # places a legend at the appropriate place
+ legend=c(args.sampleNames[i*fixSampleIdx+1]), # puts text in the legend
+ lty=c(1,1), # gives the legend appropriate symbols (lines)
+ lwd=c(2.5,2.5))
+
+ devname = dev.off()
+ }
+
+ } else {
+ for (i in 1:length(mzListStart))
+ {
+ message("\nGetting EIC... ")
+ eiccor <- getEIC(xsetData,
+ mzrange=matrix(c(mzListStart[i],mzListEnd[i]),nrow=1,ncol=2,byrow=TRUE),
+ rtrange=matrix(c(args.rtStart,args.rtEnd),nrow=1,ncol=2,byrow=TRUE),
+ sampleidx=args.sampleNames, rt = args.rtPlotMode)
+
+ #set size, set option (plot per sample, plot per mass)
+
+ message("\nPlotting figures... ")
+ figureName <- paste(args.htmlReportFile.files_path, "/figure", i,".png", sep="")
+ html <- paste(html, "<img src='figure", i, ".png' />", sep="") # img markup assumed (original tags stripped)
+ png( figureName, type="cairo", width=1100,height=450 )
+ lineColors <- rainbow(length(args.sampleNames))
+ plot(eiccor, col=lineColors)
+ legend('topright', # places a legend at the appropriate place
+ legend=args.sampleNames, # puts text in the legend
+ lty=c(1,1), # gives the legend appropriate symbols (lines)
+ lwd=c(2.5,2.5),
+ col=lineColors)
+ devname = dev.off()
+ }
+ }
+ if (args.rtPlotMode == "corrected")
+ {
+ html <- paste(html,"<p>*rt values are corrected ones</p>")
+ }
+ html <- paste(html,"</body></html>") # closing tags assumed (original content stripped)
+ message("finished generating report")
+ write(html,file=args.htmlReportFile)
+ # unlink(args.htmlReportFile)
+ cat("\nWarnings================:\n")
+ str( warnings() )
+ },
+ error=function(cond) {
+ sink(NULL, type="message") # default setting
+ sink(stderr(), type="output")
+ message("\nERROR: ===========\n")
+ print(cond)
+ }
+ )
diff -r 000000000000 -r dffc38727496 xcms_get_mass_eic.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xcms_get_mass_eic.xml Sat Feb 07 22:02:00 2015 +0100
@@ -0,0 +1,117 @@
+
+ Extracts EICs for a given list of masses
+
+ R_bioc_metams
+
+
+ xcms_get_mass_eic.r
+ $xsetData
+ $rtStart
+ $rtEnd
+ #if $massParameters.massParametersType == "window"
+ $massParameters.mzStart
+ $massParameters.mzEnd
+ -1
+ "."
+ #else
+ -1
+ -1
+ "$massParameters.mzList"
+ $massParameters.mzTolPpm
+ #end if
+ "$sampleNames"
+ $combineSamples
+ $rtPlotMode
+ $htmlReportFile
+ $htmlReportFile.files_path
+ $outLogFile
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+.. class:: infomark
+
+This tool plots EICs for a given list of masses (or a mass window) using XCMS's getEIC method.
+It produces an HTML report showing the EIC plots, one for each given mass. The figure below shows an example of such an EIC plot.
+
+.. image:: $PATH_TO_IMAGES/massEIC.png
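+
+Under the hood this amounts to a getEIC call per mass, roughly as sketched below ('xset', 'mz', 'ppm', 'rtStart', 'rtEnd' and 'sampleNames' stand for the tool parameters above)::
+
+  library(xcms)
+  mzrange <- matrix(c(mz * (1 - ppm/1e6), mz * (1 + ppm/1e6)), nrow=1)
+  rtrange <- matrix(c(rtStart, rtEnd), nrow=1)
+  eic <- getEIC(xset, mzrange = mzrange, rtrange = rtrange, sampleidx = sampleNames)
+  plot(eic)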
+
+
+
+
+ 10.1021/ac051437y
+
+
\ No newline at end of file