2
|
1 #' wl-02-11-2017, Thu: commence
|
|
2 #' wl-07-11-2017, Tue: debug using manual test
|
|
3 #' wl-10-11-2017, Fri: change 'steps' as float
|
|
4 #' wl-24-11-2017, Fri: Major changes
|
|
5 #' wl-25-11-2017, Sat: error handling
|
|
6 #' wl-07-11-2017, Thu: add debug codes
|
|
7 #' wl-25-01-2018, Thu: remove user's input for 'offset'
|
|
8 #' wl-30-01-2018, Tue: fix bugs in 'standards'
|
|
9 #' wl-12-02-2018, Mon: change output file as tabular (.tab) for galaxy only
|
|
10 #' wl-14-02-2018, Wed: save cluster intensity data
|
|
11 #' wl-28-03-2019, Thu: apply style_file() to reformat this script and use
|
|
12 #' vim's folding as outline view. Without reformatng, the folding
|
|
13 #' is messy.
|
|
14 #' Usages:
|
|
15 #' 1.) For command line and galaxy, change `com_f` to TRUE.
|
|
16 #' 2.) For command line, change `home_dir` as appropriate
|
|
17 #' For Windows, run: massPix.bat
|
|
18 #' For Linux, run: ./massPix.sh
|
|
19 #' 3.) For interactive environment, change `com_f` to FALSE
|
|
20
|
|
21 ## ==== General settings ====
|
|
22
|
|
23 rm(list = ls(all = T))
|
|
24 set.seed(123)
|
|
25
|
|
26 #' flag for command-line use or not. If false, only for debug interactively.
|
|
27 com_f <- T
|
|
28
|
|
29 #' galaxy will stop even if R has warning message
|
|
30 options(warn = -1) #' disable R warning. Turn back: options(warn=0)
|
|
31
|
|
32 #' Setup R error handling to go to stderr
|
|
33 #' options(show.error.messages = F, error = function() {
|
|
34 #' cat(geterrmessage(), file = stderr())
|
|
35 #' q("no", 1, F)
|
|
36 #' })
|
|
37
|
|
38 #' we need that to not crash galaxy with an UTF8 error on German LC settings.
|
|
39 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
|
|
40
|
|
41 suppressPackageStartupMessages({
|
|
42 library(optparse)
|
|
43 library(calibrate)
|
|
44 library(rJava)
|
|
45 library(WriteXLS)
|
|
46 })
|
|
47
|
|
48 ## ==== Command line or interactive setting ====
|
|
49
|
|
50 if (com_f) {
|
|
51
|
|
52 #' Setup home directory
|
|
53 #' wl-24-11-2017, Fri: A dummy function for the base directory. The reason
|
|
54 #' to write such a function is to keep the returned values by
|
|
55 #' 'commandArgs' with 'trailingOnly = FALSE' in a local environment
|
|
56 #' otherwise 'parse_args' will use the results of
|
|
57 #' 'commandArgs(trailingOnly = FALSE)' even with 'args =
|
|
58 #' commandArgs(trailingOnly = TRUE)' in its argument area.
|
|
59 func <- function() {
|
|
60 argv <- commandArgs(trailingOnly = FALSE)
|
|
61 path <- sub("--file=", "", argv[grep("--file=", argv)])
|
|
62 }
|
|
63 #' prog_name <- basename(func())
|
|
64 home_dir <- paste0(dirname(func()), "/")
|
|
65
|
|
66 #' Specify our desired options in a list by default OptionParser will add
|
|
67 #' an help option equivalent to make_option(c("-h", "--help"),
|
|
68 #' action="store_true", default=FALSE, help="Show this help message and
|
|
69 #' exit")
|
|
70 option_list <- list(
|
|
71 make_option(c("-v", "--verbose"),
|
|
72 action = "store_true", default = TRUE,
|
|
73 help = "Print extra output [default]"
|
|
74 ),
|
|
75 make_option(c("-q", "--quietly"),
|
|
76 action = "store_false",
|
|
77 dest = "verbose", help = "Print little output"
|
|
78 ),
|
|
79
|
|
80 #' input files
|
|
81 make_option("--imzML_file",
|
|
82 type = "character",
|
|
83 help = "Mass spectrometry imaging data to be processed.
|
|
84 Currently imzML format is supported."
|
|
85 ),
|
|
86 make_option("--image_file",
|
|
87 type = "character",
|
|
88 help = "Processed imaging data to be further analysis."
|
|
89 ),
|
|
90
|
|
91 #' image processing
|
|
92 make_option("--process", type = "logical", default = TRUE),
|
|
93
|
|
94 #' make library
|
|
95 make_option("--ionisation_mode", type = "character", default = "positive"),
|
|
96 make_option("--fixed", type = "logical", default = FALSE),
|
|
97 make_option("--fixed_FA", type = "double", default = 16),
|
|
98
|
|
99 #' mz_extractor
|
|
100 make_option("--thres_int", type = "integer", default = 100000),
|
|
101 make_option("--thres_low", type = "integer", default = 200),
|
|
102 make_option("--thres_high", type = "integer", default = 1000),
|
|
103
|
|
104 #' peak_bin
|
|
105 make_option("--bin_ppm", type = "integer", default = 10),
|
|
106
|
|
107 #' subset_image
|
|
108 make_option("--percentage_deiso", type = "integer", default = 3),
|
|
109
|
|
110 #' filter
|
|
111 make_option("--steps", type = "double", default = 0.05),
|
|
112 make_option("--thres_filter", type = "integer", default = 11),
|
|
113
|
|
114 #' deisotope
|
|
115 make_option("--ppm", type = "integer", default = 3),
|
|
116 make_option("--no_isotopes", type = "integer", default = 2),
|
|
117 make_option("--prop_1", type = "double", default = 0.9),
|
|
118 make_option("--prop_2", type = "double", default = 0.5),
|
|
119 make_option("--search_mod", type = "logical", default = TRUE),
|
|
120 make_option("--mod",
|
|
121 type = "character",
|
|
122 default = "c(NL = T, label = F, oxidised = F, desat = F)"
|
|
123 ),
|
|
124
|
|
125 #' annotate
|
|
126 make_option("--ppm_annotate", type = "integer", default = 10),
|
|
127
|
|
128 #' normalise
|
|
129 make_option("--norm_type", type = "character", default = "TIC"),
|
|
130 make_option("--standards", type = "character", default = "NULL"),
|
|
131
|
|
132 #' output parameters and files
|
|
133 make_option("--image_out",
|
|
134 type = "character", default = "image.tsv",
|
|
135 help = "Processed imaging data visualisation"
|
|
136 ),
|
|
137
|
|
138 make_option("--rdata", type = "logical", default = TRUE),
|
|
139 make_option("--rdata_out",
|
|
140 type = "character", default = "r_running.rdata",
|
|
141 help = "All the running results in RData for inspection."
|
|
142 ),
|
|
143
|
|
144 #' plot parameters
|
|
145 make_option("--scale", type = "integer", default = 100),
|
|
146 make_option("--nlevels", type = "integer", default = 50),
|
|
147 make_option("--res_spatial", type = "integer", default = 50),
|
|
148 make_option("--rem_outliers", type = "logical", default = TRUE),
|
|
149 make_option("--summary", type = "logical", default = FALSE),
|
|
150 make_option("--title", type = "logical", default = TRUE),
|
|
151
|
|
152 #' pca plot
|
|
153 make_option("--pca", type = "logical", default = TRUE),
|
|
154 make_option("--pca_out", type = "character", default = "pca.pdf"),
|
|
155 make_option("--scale_type", type = "character", default = "cs"),
|
|
156 make_option("--transform", type = "logical", default = FALSE),
|
|
157 make_option("--PCnum", type = "integer", default = 5),
|
|
158 make_option("--loading", type = "logical", default = TRUE),
|
|
159 make_option("--loading_out", type = "character", default = "loading.xlsx"),
|
|
160
|
|
161 #' slice plot
|
|
162 make_option("--slice", type = "logical", default = TRUE),
|
|
163 make_option("--row", type = "integer", default = 12),
|
|
164 make_option("--slice_out", type = "character", default = "slice.pdf"),
|
|
165
|
|
166 #' cluster plot
|
|
167 make_option("--clus", type = "logical", default = TRUE),
|
|
168 make_option("--clus_out", type = "character", default = "clus.pdf"),
|
|
169 make_option("--cluster_type", type = "character", default = "kmeans"),
|
|
170 make_option("--clusters", type = "integer", default = 5),
|
|
171 make_option("--intensity", type = "logical", default = TRUE),
|
|
172 make_option("--intensity_out", type = "character", default = "intensity.tsv")
|
|
173 )
|
|
174
|
|
175 opt <- parse_args(
|
|
176 object = OptionParser(option_list = option_list),
|
|
177 args = commandArgs(trailingOnly = TRUE)
|
|
178 )
|
|
179 } else {
|
|
180 #' home_dir <- "C:/R_lwc/massPix/" #' for windows
|
|
181 home_dir <- "/home/wl/my_galaxy/massPix/" #' for linux. must be case-sensitive
|
|
182 opt <- list(
|
|
183 #' -------------------------------------------------------------------
|
|
184 #' input files. Note that using full path here.
|
|
185 imzML_file = paste0(home_dir, "test-data/test_pos.imzML"),
|
|
186 image_file = paste0(home_dir, "test-data/image_norm.tsv"),
|
|
187
|
|
188 #' image data processing parameters
|
|
189 process = T,
|
|
190
|
|
191 #' make library
|
|
192 ionisation_mode = "positive",
|
|
193 fixed = FALSE,
|
|
194 fixed_FA = 16,
|
|
195 #' mz_extractor
|
|
196 thres_int = 100000,
|
|
197 thres_low = 200,
|
|
198 thres_high = 1000,
|
|
199 #' peak_bin
|
|
200 bin_ppm = 10,
|
|
201 #' subset_image
|
|
202 percentage_deiso = 3,
|
|
203 #' filter
|
|
204 steps = 0.05,
|
|
205 thres_filter = 11,
|
|
206 #' deisotope
|
|
207 ppm = 3,
|
|
208 no_isotopes = 2,
|
|
209 prop_1 = 0.9,
|
|
210 prop_2 = 0.5,
|
|
211 search_mod = TRUE,
|
|
212 mod = "c(NL = T, label = F, oxidised = F, desat = F)",
|
|
213 #' annotate
|
|
214 ppm_annotate = 10,
|
|
215 #' normalise
|
|
216 norm_type = "TIC",
|
|
217 standards = "NULL",
|
|
218
|
|
219 #' output parameters and files
|
|
220 image_out = paste0(home_dir, "test-data/res/image.tsv"),
|
|
221
|
|
222 rdata = TRUE,
|
|
223 rdata_out = paste0(home_dir, "test-data/res/r_running.rdata"),
|
|
224
|
|
225 #' plot parameters
|
|
226 scale = 100,
|
|
227 nlevels = 50,
|
|
228 res_spatial = 50,
|
|
229 rem_outliers = TRUE,
|
|
230 summary = FALSE,
|
|
231 title = TRUE,
|
|
232
|
|
233 #' pca plot
|
|
234 pca = TRUE,
|
|
235 pca_out = paste0(home_dir, "test-data/res/pca.pdf"),
|
|
236 scale_type = "cs",
|
|
237 transform = FALSE,
|
|
238 PCnum = 5,
|
|
239 loading = TRUE,
|
|
240 loading_out = paste0(home_dir, "test-data/res/loading.xlsx"),
|
|
241
|
|
242 #' slice plot
|
|
243 slice = TRUE,
|
|
244 slice_out = paste0(home_dir, "test-data/res/slice.pdf"),
|
|
245 row = 12,
|
|
246
|
|
247 #' cluster plot
|
|
248 clus = TRUE,
|
|
249 clus_out = paste0(home_dir, "test-data/res/clus.pdf"),
|
|
250 cluster_type = "kmeans",
|
|
251 clusters = 5,
|
|
252 intensity = TRUE,
|
|
253 intensity_out = paste0(home_dir, "test-data/res/intensity.tsv")
|
|
254 )
|
|
255 }
|
|
256 #' opt
|
|
257
|
|
258 suppressPackageStartupMessages({
|
|
259 source(paste0(home_dir, "all_massPix.R"))
|
|
260 })
|
|
261
|
|
262 ## ==== Pre-processing ====
|
|
263
|
|
264 #' imzML converter
|
|
265 lib_dir <- paste0(home_dir, "libraries/")
|
|
266 imzMLparse <- paste0(home_dir, "imzMLConverter/imzMLConverter.jar")
|
|
267
|
|
268 options(java.parameters = "Xmx4g")
|
|
269
|
|
270 #' enforce the following required arguments
|
|
271 if (is.null(opt$imzML_file)) {
|
|
272 cat("'imzML_file' is required\n")
|
|
273 q(status = 1)
|
|
274 }
|
|
275 #' wl-07-02-2018, Wed: 'imzML_file' must be provided no matter what
|
|
276 #' 'process' is. For 'process' is FALSE, it gives 'x.cood' and 'y.cood' for
|
|
277 #' visualisation.
|
|
278
|
|
279 if (!opt$process) {
|
|
280 if (is.null(opt$image_file)) {
|
|
281 cat("'image_file' is required\n")
|
|
282 q(status = 1)
|
|
283 }
|
|
284 }
|
|
285
|
|
286 #' read in library files
|
|
287 read <- read.csv(paste(lib_dir, "lib_FA.csv", sep = "/"), sep = ",", header = T)
|
|
288 lookup_FA <- read[, 2:4]
|
|
289 row.names(lookup_FA) <- read[, 1]
|
|
290
|
|
291 read <- read.csv(paste(lib_dir, "lib_class.csv", sep = "/"), sep = ",", header = T)
|
|
292 lookup_lipid_class <- read[, 2:3]
|
|
293 row.names(lookup_lipid_class) <- read[, 1]
|
|
294
|
|
295 read <- read.csv(paste(lib_dir, "lib_element.csv", sep = "/"), sep = ",", header = T)
|
|
296 lookup_element <- read[, 2:3]
|
|
297 row.names(lookup_element) <- read[, 1]
|
|
298
|
|
299 read <- read.csv(paste(lib_dir, "lib_modification.csv", sep = "/"), sep = ",", header = T)
|
|
300 lookup_mod <- read[, 2:ncol(read)]
|
|
301 row.names(lookup_mod) <- read[, 1]
|
|
302
|
|
303 #' parsing the data and getting x and y dimensions
|
|
304 .jinit()
|
|
305 .jaddClassPath(path = imzMLparse)
|
|
306
|
|
307 imzML <- J("imzMLConverter.ImzMLHandler")$parseimzML(opt$imzML_file)
|
|
308 #' wl-07-11-2017, Tue: Location opt$ibd_file is also written into imzML.
|
|
309 #' Note that opt$imzML_file and opt$ibd_file must have the same file name
|
|
310 #' and extention names imzML and ibd, respectively. You can see this from
|
|
311 #' CPP file: (https://goo.gl/WTkFkn)
|
|
312 #'
|
|
313 #' // Remove ".imzML" from the end of the file
|
|
314 #' this->ibdLocation = imzMLFilename.substr(0, imzMLFilename.size()-6) + ".ibd";
|
|
315 #'
|
|
316 #' Otherwise this function: J(spectrum, 'getIntensityArray') does not work.
|
|
317 #' Three functions mzextractor, subsetImage and contructImage call this
|
|
318 #' function.
|
|
319 #' wl-25-11-2017, Sat: imzML and ibd file must be uploaded and located in
|
|
320 #' the same directory. If so, no need to pass ibd file into R code since
|
|
321 #' imzMLConverter will get ibd file implicitely based on the directory and
|
|
322 #' name of imzML file.
|
|
323
|
|
324 x.cood <- J(imzML, "getWidth")
|
|
325 y.cood <- J(imzML, "getHeight")
|
|
326
|
|
327 ## ==== Main Process ====
|
|
328
|
|
329 if (opt$process) {
|
|
330 #' make library
|
|
331 dbase <- makelibrary(
|
|
332 ionisation_mode = opt$ionisation_mode,
|
|
333 sel.class = NULL, fixed = opt$fixed,
|
|
334 fixed_FA = opt$fixed_FA,
|
|
335 lookup_lipid_class = lookup_lipid_class,
|
|
336 lookup_FA = lookup_FA,
|
|
337 lookup_element = lookup_element
|
|
338 )
|
|
339
|
|
340 #' Extract m/z and pick peaks
|
|
341 extracted <- mzextractor(
|
|
342 files = opt$imzML,
|
|
343 imzMLparse = imzMLparse,
|
|
344 thres.int = opt$thres_int,
|
|
345 thres.low = opt$thres_low,
|
|
346 thres.high = opt$thres_high
|
|
347 )
|
|
348
|
|
349 #' Bin all m/zs by ppm bucket
|
|
350 peaks <- peakpicker.bin(extracted = extracted, bin.ppm = opt$bin_ppm)
|
|
351
|
|
352 #' Generate subset of first image file to improve speed of deisotoping
|
|
353 temp.image <- subsetImage(
|
|
354 extracted = extracted, peaks = peaks,
|
|
355 percentage.deiso = opt$percentage_deiso,
|
|
356 thres.int = opt$thres_int,
|
|
357 thres.low = opt$thres_low,
|
|
358 thres.high = opt$thres_high,
|
|
359 files = opt$imzML,
|
|
360 imzMLparse = imzMLparse
|
|
361 )
|
|
362
|
|
363 #' Filter to a matrix subset that includes variables above a threshold of
|
|
364 #' missing values
|
|
365 temp.image.filtered <- filter(
|
|
366 imagedata.in = temp.image,
|
|
367 steps = seq(0, 1, opt$steps),
|
|
368 thres.filter = opt$thres_filter,
|
|
369 offset = 1
|
|
370 )
|
|
371
|
|
372 #' Perform deisotoping on a subset of the image
|
|
373 deisotoped <- deisotope(
|
|
374 ppm = opt$ppm, no_isotopes = opt$no_isotopes,
|
|
375 prop.1 = opt$prop_1, prop.2 = opt$prop_2,
|
|
376 peaks = list("", temp.image.filtered[, 1]),
|
|
377 image.sub = temp.image.filtered,
|
|
378 search.mod = opt$search_mod,
|
|
379 mod = eval(parse(text = opt$mod)),
|
|
380 lookup_mod = lookup_mod
|
|
381 )
|
|
382
|
|
383 #' Perform annotation of lipids using library
|
|
384 annotated <- annotate(
|
|
385 ionisation_mode = opt$ionisation_mode,
|
|
386 deisotoped = deisotoped,
|
|
387 ppm.annotate = opt$ppm_annotate,
|
|
388 dbase = dbase
|
|
389 )
|
|
390
|
|
391 #' make full image and add lipid ids
|
|
392 #' wl-23-08-2017: it takes **LONG TIME**.
|
|
393 final.image <- contructImage(
|
|
394 extracted = extracted,
|
|
395 deisotoped = deisotoped,
|
|
396 peaks = peaks, imzMLparse = imzMLparse,
|
|
397 thres.int = opt$thres_int,
|
|
398 thres.low = opt$thres_low,
|
|
399 thres.high = opt$thres_high,
|
|
400 files = opt$imzML
|
|
401 )
|
|
402
|
|
403 ids <- cbind(deisotoped[[2]][, 1], annotated, deisotoped[[2]][, 3:4])
|
|
404
|
|
405 #' Create annotated image
|
|
406 image.ann <- cbind(ids, final.image[, 2:ncol(final.image)])
|
|
407
|
|
408 #' Normalise image
|
|
409 image.norm <- normalise(
|
|
410 imagedata.in = image.ann,
|
|
411 norm.type = opt$norm_type,
|
|
412 standards = eval(parse(text = opt$standards)),
|
|
413 offset = 4
|
|
414 )
|
|
415
|
|
416 #' wl-12-02-2018, Mon: change the first column name
|
|
417 colnames(image.norm)[1] <- "peak"
|
|
418
|
|
419 #' save processed results
|
|
420 #' write.csv(image.norm, file=opt$image_out, row.names = FALSE)
|
|
421 write.table(image.norm, file = opt$image_out, sep = "\t", row.names = FALSE)
|
|
422
|
|
423 #' save to rda for debug
|
|
424 if (opt$rdata) {
|
|
425 save(image.norm, image.ann, final.image, annotated, deisotoped,
|
|
426 temp.image.filtered, temp.image, peaks, extracted, dbase,
|
|
427 file = opt$rdata_out
|
|
428 )
|
|
429 }
|
|
430 } else {
|
|
431 image.norm <- read.table(opt$image_file,
|
|
432 sep = "\t", header = TRUE,
|
|
433 na.strings = "", stringsAsFactors = T
|
|
434 )
|
|
435 }
|
|
436
|
|
437
|
|
438 ## ==== Perform PCA if requested ====
|
|
439
|
|
440 if (opt$pca) {
|
|
441 image.scale <- centreScale(
|
|
442 imagedata.in = image.norm,
|
|
443 scale.type = opt$scale_type,
|
|
444 transform = opt$transform,
|
|
445 offset = 4
|
|
446 )
|
|
447
|
|
448 pdf(file = opt$pca_out, onefile = T)
|
|
449 imagePca(
|
|
450 imagedata.in = image.scale, offset = 4,
|
|
451 PCnum = opt$PCnum, scale = opt$scale,
|
|
452 x.cood = x.cood, y.cood = y.cood,
|
|
453 nlevels = opt$nlevels, res.spatial = opt$res_spatial,
|
|
454 rem.outliers = opt$rem_outliers,
|
|
455 summary = opt$summary, title = opt$title
|
|
456 )
|
|
457 dev.off()
|
|
458
|
|
459 if (opt$loading) {
|
|
460 pca <- princomp(t(image.scale[, (4 + 1):ncol(image.scale)]),
|
|
461 cor = FALSE,
|
|
462 scores = TRUE, covmat = NULL
|
|
463 )
|
|
464 labs.all <- as.numeric(as.vector(image.scale[, 1]))
|
|
465
|
|
466 #' for (i in 1:opt$PCnum){
|
|
467 #' loadings <- pca$loadings[,i]
|
|
468 #' loadings <- cbind(loadings, labs.all)
|
|
469 #' write.csv(loadings, file=paste0(home_dir,"/res/", "loadings_PC",i,".csv"))
|
|
470 #' }
|
|
471
|
|
472 #' wl-05-02-2018, Mon: save as one excel file
|
|
473 ld <- lapply(1:opt$PCnum, function(i) {
|
|
474 loadings <- pca$loadings[, i]
|
|
475 loadings <- cbind(loadings, labs.all)
|
|
476 loadings <- as.data.frame(loadings)
|
|
477 })
|
|
478 names(ld) <- paste0("PC", 1:opt$PCnum)
|
|
479
|
|
480 WriteXLS(ld,
|
|
481 ExcelFileName = opt$loading_out,
|
|
482 row.names = F, FreezeRow = 1
|
|
483 )
|
|
484 }
|
|
485 }
|
|
486
|
|
487 ## ==== Make ion slice if requested ====
|
|
488
|
|
489 if (opt$slice) {
|
|
490 pdf(file = opt$slice_out, onefile = T)
|
|
491 imageSlice(
|
|
492 row = opt$row, imagedata.in = image.norm, scale = opt$scale,
|
|
493 x.cood = x.cood, y.cood = y.cood,
|
|
494 nlevels = opt$nlevels,
|
|
495 name = image.norm[opt$row, 1],
|
|
496 subname = image.norm[opt$row, 2],
|
|
497 offset = 4, res.spatial = opt$res_spatial,
|
|
498 rem.outliers = opt$rem_outliers, summary = opt$summary,
|
|
499 title = opt$title
|
|
500 )
|
|
501 dev.off()
|
|
502 }
|
|
503
|
|
504 ## ==== Perform clustering if requested ====
|
|
505
|
|
506 if (opt$clus) {
|
|
507 pdf(file = opt$clus_out, onefile = T)
|
|
508 intensity <- cluster(
|
|
509 cluster.type = opt$cluster_type,
|
|
510 imagedata.in = image.norm,
|
|
511 offset = 4, res.spatial = opt$res_spatial,
|
|
512 width = x.cood, height = y.cood,
|
|
513 clusters = opt$clusters
|
|
514 )
|
|
515 dev.off()
|
|
516
|
|
517 if (opt$intensity) {
|
|
518 #' write.table(intensity,file=opt$intensity_out,sep="\t")
|
|
519 #' wl-14-02-2018, Wed: more need to be done for "\t"
|
|
520 tmp <- cbind(Clusters = rownames(intensity), intensity)
|
|
521 write.table(tmp, file = opt$intensity_out, sep = "\t", row.name = FALSE)
|
|
522 }
|
|
523 }
|