comparison qualitymetrics_script.R @ 0:b4f5b5bc01dd draft

planemo upload for repository https://github.com/workflow4metabolomics/qualitymetrics.git commit 73366dd3473c509341ab9ba1df8ba748d08a50a1
author ethevenot
date Sat, 06 Aug 2016 12:01:17 -0400
parents
children 6d3b7b6573d8
comparison
equal deleted inserted replaced
-1:000000000000 0:b4f5b5bc01dd
1 ################################################################################################
2 # ANALYSES FOR QUALITY CONTROL #
3 # #
4 # Author: Melanie PETERA #
5 # User: Galaxy #
6 # Starting date: 04-09-2014 #
7 # V-1.0: Restriction of old filter script to CV filter #
8 # V-1.1: Addition of data check #
9 # V-1.2: Substitution of deletion by addition of indicator variable #
10 # V-1.3: Handling special characters #
11 # #
12 # #
13 # Input files: dataMatrix ; sampleMetadata ; variableMetadata #
14 # Output files: dataMatrix ; sampleMetadata ; variableMetadata #
15 # #
16 ################################################################################################
17
18 # Parameters (for dev)
19 if(FALSE){
20
21 ion.file.in <- "test/ressources/inputs/ex_data_IONS.txt" #tab file
22 meta.samp.file.in <- "test/ressources/inputs/ex_data_PROTOCOLE1.txt" #tab file
23 meta.ion.file.in <- "test/ressources/inputs/ex_data_METAION.txt" #tab file
24
25 ## ion.file.out <- "test/ressources/outputs/QCtest_ex_data_IONS.txt" #tab file
26 meta.samp.file.out <- "test/ressources/outputs/QCtest_ex_data_PROTOCOLE1.txt" #tab file
27 meta.ion.file.out <- "test/ressources/outputs/QCtest_ex_data_METAION.txt" #tab file
28
29 CV <- TRUE ; if(CV){Compa<-TRUE;seuil<-1.25}else{Compa<-NULL;seuil<-NULL}
30
31 poolAsPool1L <- FALSE
32
33 if(FALSE) { ## Sacuri dataset
34
35 ## 'example' input dir
36 exaDirInpC <- "example/input"
37
38 ion.file.in <- file.path(exaDirInpC, "dataMatrix.tsv")
39 meta.samp.file.in <- file.path(exaDirInpC, "sampleMetadata.tsv")
40 meta.ion.file.in <- file.path(exaDirInpC, "variableMetadata.tsv")
41
42 poolAsPool1L <- FALSE
43
44 ## 'example' output dir
45 exaDirOutC <- gsub("input", "output", exaDirInpC)
46
47 mata.samp.file.out <- file.path(exaDirOutC, "sampleMetadata.tsv")
48 meta.ion.file_out <- file.path(exaDirOutC, "variableMetadata.tsv")
49 fig.out <- file.path(exaDirOutC, "figure.pdf")
50 log.out <- file.path(exaDirOutC, "information.txt")
51
52 stopifnot(file.exists(exaDirOutC))
53
54 }
55
56 }
57
58 QualityControl <- function(ion.file.in, meta.samp.file.in, meta.ion.file.in,
59 CV, Compa, seuil, poolAsPool1L,
60 ion.file.out, meta.samp.file.out, meta.ion.file.out, fig.out, log.out){
61 # This function allows to analyse data to check its quality
62 # It needs 3 datasets: the data matrix, the variables' metadata, the samples' metadata.
63 # It generates 3 new datasets corresponding to the 3 inputs with additional columns.
64 #
65 # Parameters:
66 # - xxx.in: input files' names
67 # - xxx.out: output files' names
68 # - CV: CV calculation yes/no
69 # | > Compa: comparing pool and sample CVs (TRUE) or simple pool CV calculation (FALSE)
70 # | > seuil: maximum ratio tolerated between pool and sample CVs or maximum pool CV
71
72
73 # Input -----------------------------------------------------------------------------------
74
75 ion.data <- read.table(ion.file.in,sep="\t",header=TRUE,check.names=FALSE, stringsAsFactors = FALSE)
76 meta.samp.data <- read.table(meta.samp.file.in,sep="\t",header=TRUE,check.names=FALSE, stringsAsFactors = FALSE)
77 meta.ion.data <- read.table(meta.ion.file.in,sep="\t",header=TRUE,check.names=FALSE, stringsAsFactors = FALSE)
78
79 # Error vector
80 err.stock <- "\n"
81
82 # Table match check
83 table.check <- match3(ion.data,meta.samp.data,meta.ion.data)
84 check.err(table.check)
85
86 # StockID
87 samp.id <- stockID(ion.data,meta.samp.data,"sample")
88 ion.data <- samp.id$dataMatrix
89 meta.samp.data <- samp.id$Metadata
90 samp.id <- samp.id$id.match
91
92
93 # Function 1: CV calculation --------------------------------------------------------------
94 # Allows to class ions according to the Coefficient of Variation (CV):
95 # Compa=TRUE:
96 # CV of pools and CV of samples are compared (ration between pools' one and samples' one)
97 # and confronted to a given ration.
98 # Compa=FALSE:
99 # only CV of pools are considered ; compared to a given threshold
100
101 if(CV){
102
103 # Checking the sampleType variable
104 if(is.null(meta.samp.data$sampleType)){
105 err.stock <- c(err.stock,"\n-------",
106 "\nWarning : no 'sampleType' variable detected in sample meta-data !",
107 "\nCV can not be calculated.\n-------\n")
108 }else{
109 if(!("pool"%in%levels(factor(meta.samp.data$sampleType)))){
110 err.stock <- c(err.stock,"\n-------",
111 "\nWarning : no 'pool' detected in 'sampleType' variable (sample meta-data) !",
112 "\nCV can not be calculated.\n-------\n")
113 }else{
114 if((!("sample"%in%levels(factor(meta.samp.data$sampleType))))&(Compa)){
115 err.stock <- c(err.stock,"\n-------",
116 "\nWarning : no 'sample' detected in 'sampleType' variable (sample meta-data) !",
117 "\nCV can not be calculated.\n-------\n")
118 }else{
119
120 # Statement
121 tmp.ion <- data.frame(CV.ind=rep(NA,nrow(ion.data)),CV.samp=rep(NA,nrow(ion.data)),
122 CV.pool=rep(NA,nrow(ion.data)),ion.data,stringsAsFactors=FALSE)
123 # CV samples
124 tmp.samp <- which(colnames(tmp.ion)%in%meta.samp.data[which(meta.samp.data$sampleType=="sample"),1])
125 tmp.ion$CV.samp <- apply(tmp.ion[,tmp.samp],1,function(x)sd(x, na.rm = TRUE)) / rowMeans(tmp.ion[,tmp.samp], na.rm = TRUE)
126 tmp.ion$CV.samp[which(apply(tmp.ion[,tmp.samp],1,function(x)sd(x, na.rm = TRUE))==0)] <- 0
127 # CV pools
128 tmp.samp <- which(colnames(tmp.ion)%in%meta.samp.data[which(meta.samp.data$sampleType=="pool"),1])
129 tmp.ion$CV.pool <- apply(tmp.ion[,tmp.samp],1,function(x)sd(x, na.rm = TRUE)) / rowMeans(tmp.ion[,tmp.samp], na.rm = TRUE)
130 tmp.ion$CV.pool[which(apply(tmp.ion[,tmp.samp],1,function(x)sd(x, na.rm = TRUE))==0)] <- 0
131 # CV indicator
132 if(Compa){tmp.ion$CV.ind <- ifelse((tmp.ion$CV.pool)/(tmp.ion$CV.samp)>seuil,0,1)
133 }else{tmp.ion$CV.ind <- ifelse((tmp.ion$CV.pool)>seuil,0,1)}
134 # Addition of new columns in meta.ion.data
135 if(Compa){tmp.ion<-tmp.ion[,c(4,2,3,1,1)]}else{tmp.ion<-tmp.ion[,c(4,3,1,1)]}
136 tmp.ion[,ncol(tmp.ion)] <- 1:nrow(tmp.ion)
137 meta.ion.data <- merge(x=meta.ion.data,y=tmp.ion,by.x=1,by.y=1)
138 meta.ion.data <- meta.ion.data[order(meta.ion.data[,ncol(meta.ion.data)]),][,-ncol(meta.ion.data)]
139 rownames(meta.ion.data) <- NULL
140
141 rm(tmp.ion,tmp.samp)
142
143 }}}
144
145 } # end if(CV)
146
147 ## complementary metrics (ET)
148
149 datMN <- t(as.matrix(ion.data[, -1]))
150 colnames(datMN) <- ion.data[, 1]
151 datMN <- datMN[, meta.ion.data[, 1]] ## in case meta.ion.data has been re-ordered during the CV = TRUE computations
152 quaLs <- qualityMetricsF(datMN,
153 meta.samp.data,
154 meta.ion.data,
155 poolAsPool1L,
156 fig.out,
157 log.out)
158 meta.samp.data <- quaLs[["samDF"]]
159 meta.ion.data <- quaLs[["varDF"]]
160
161
162 # Output ----------------------------------------------------------------------------------
163
164 # Getting back original identifiers
165 id.ori <- reproduceID(ion.data,meta.samp.data,"sample",samp.id)
166 ion.data <- id.ori$dataMatrix
167 meta.samp.data <- id.ori$Metadata
168
169
170 # Error checking
171 if(length(err.stock)>1){
172 stop(err.stock)
173 }else{
174
175 ## write.table(ion.data, ion.file.out, sep="\t", row.names=FALSE, quote=FALSE)
176 write.table(meta.samp.data, meta.samp.file.out, sep="\t", row.names=FALSE, quote=FALSE)
177 write.table(meta.ion.data, meta.ion.file.out, sep="\t", row.names=FALSE, quote=FALSE)
178
179 }
180
181
182 } # end of QualityControl function
183
184
185 # Typical function call
186 # QualityControl(ion.file.in, meta.samp.file.in, meta.ion.file.in,
187 # CV, Compa, seuil,
188 # ion.file.out, meta.samp.file.out, meta.ion.file.out)
189
190
191 qualityMetricsF <- function(datMN,
192 samDF,
193 varDF,
194 pooAsPo1L = TRUE,
195 fig.pdfC = NULL,
196 log.txtC = NULL) {
197
198 optWrnN <- options()$warn
199 options(warn = -1)
200
201
202 ##------------------------------
203 ## Functions
204 ##------------------------------
205
206
207 allDigF <- function (string) { ## from the Hmisc package (all.digits)
208 k <- length(string)
209 result <- logical(k)
210 for (i in 1:k) {
211 st <- string[i]
212 ls <- nchar(st)
213 ex <- substring(st, 1:ls, 1:ls)
214 result[i] <- all(match(ex, c("0", "1", "2", "3", "4",
215 "5", "6", "7", "8", "9"), nomatch = 0) > 0)
216 }
217 result
218 }
219
220 datPloF <- function() { ## ploting data matrix
221
222 thrVn <- c(pvalue=0.001,
223 poolCv=0.3)
224
225 ## Constants
226
227 marLs <- list(dri = c(2.1, 2.6, 1.1, 1.1),
228 ima = c(1.1, 2.6, 4.1, 1.1),
229 msd = c(2.1, 2.6, 1.1, 0.6),
230 sam = c(3.1, 3.6, 1.1, 0.6),
231 pca = c(2.6, 3.6, 1.1, 0.6),
232 sca = c(1.1, 4.1, 4.1, 0.6),
233 tit = c(0.1, 0.6, 1.1, 0.6))
234 palHeaVc <- rev(rainbow(ceiling(256 * 1.5))[1:256])
235
236 ## Functions
237
238 axiPreF <- function(valVn,
239 lenN) {
240
241 if(NA %in% valVn) {
242 warning("NA in valVn")
243 valVn <- as.vector(na.omit(valVn))
244 }
245
246 if(lenN < length(valVn))
247 stop("The length of in vector must be inferior to the length of the length parameter.")
248
249 if(length(valVn) < lenN)
250 valVn <- seq(from = min(valVn), to = max(valVn), length.out = lenN)
251
252 preValVn <- pretty(valVn)
253
254 preLabVn <- preAtVn <- c()
255
256 for(n in 1:length(preValVn))
257 if(min(valVn) < preValVn[n] && preValVn[n] < max(valVn)) {
258 preLabVn <- c(preLabVn, preValVn[n])
259 preAtVn <- c(preAtVn, which(abs(valVn - preValVn[n]) == min(abs(valVn - preValVn[n])))[1])
260 }
261
262 return(list(atVn = preAtVn,
263 labVn = preLabVn))
264
265 }
266
267 colF <- function(vecVn)
268 sapply(vecVn,
269 function(outN) {
270 if(outN < ploRgeVn[1])
271 return(palHeaVc[1])
272 else if(outN > ploRgeVn[2])
273 return(palHeaVc[256])
274 else return(palHeaVc[round((outN - ploRgeVn[1]) / diff(ploRgeVn) * 256 + 1)])})
275
276 obsColF <- function(typVc) {
277
278 ## available color palette
279 palVc <- palette()
280
281 ## colors for common types are set aside
282 palVc <- palVc[!(palVc %in% c("black", "red", "green3"))]
283
284 ## filling in the types with dedicated colors
285 samTypVc <- sort(unique(samDF[, "sampleType"]))
286 samColVc <- character(length(samTypVc))
287 if("blank" %in% samTypVc)
288 samColVc[grepl("blank", samTypVc)] <- "black"
289 if("pool" %in% samTypVc)
290 samColVc[grepl("pool", samTypVc)] <- "red"
291 if("sample" %in% samTypVc)
292 samColVc[grepl("sample", samTypVc)] <- "green4"
293
294 ## filling in the other types
295 palColI <- 1
296 palColMaxI <- length(palVc)
297
298 while(any(samColVc == "")) {
299 typToColI <- which(samColVc == "")[1]
300 if(palColI <= palColMaxI)
301 samColVc[typToColI] <- palVc[palColI]
302 else
303 samColVc[typToColI] <- "gray"
304 palColI <- palColI + 1
305 }
306
307 names(samColVc) <- samTypVc
308
309 samColVc[typVc]
310
311 }
312
313 par(font = 2,
314 font.axis = 2,
315 font.lab = 2,
316 pch=18)
317
318 layout(matrix(c(1, 3, 4, 5, 5,
319 1, 7, 7, 7, 6,
320 2, 7, 7, 7, 6),
321 byrow = TRUE,
322 nrow = 3),
323 heights = c(1.8, 1.2, 2.5),
324 widths = c(3.5, 1.8, 2.8, 1, 0.8))
325
326 ## Colors
327 ##-------
328
329 if("sampleType" %in% colnames(samDF)) {
330 obsColVc <- obsColF(samDF[, "sampleType"])
331 } else
332 obsColVc <- rep("black", nrow(samDF))
333
334 ## PCA and Hotelling ellipse
335 ##--------------------------
336
337 vVn <- getPcaVarVn(ropLs)
338 vRelVn <- vVn / ncol(datMN)
339
340 par(mar = marLs[["pca"]])
341
342 plot(ropScoreMN,
343 type = "n",
344 xlab = "",
345 ylab = "",
346 xlim = range(ropScoreMN[, 1]) * 1.1)
347 mtext(paste("t1 (", round(vRelVn[1] * 100), "%)", sep = ""),
348 cex = 0.7,
349 line = 2,
350 side = 1)
351 mtext(paste("t2 (", round(vRelVn[2] * 100), "%)", sep = ""),
352 cex = 0.7,
353 las = 0,
354 line = 2,
355 side = 2)
356 abline(h = 0, lty = "dashed")
357 abline(v = 0, lty = "dashed")
358 radVn <- seq(0, 2 * pi, length.out = 100)
359
360 hotFisN <- hotN * qf(1 - thrVn["pvalue"], 2, n - 2)
361 lines(sqrt(var(ropScoreMN[, 1]) * hotFisN) * cos(radVn),
362 sqrt(var(ropScoreMN[, 2]) * hotFisN) * sin(radVn))
363
364 text(ropScoreMN[, 1],
365 ropScoreMN[, 2],
366 cex = 0.7,
367 col = obsColVc,
368 labels = rownames(datMN))
369
370 if("sampleType" %in% colnames(samDF)) {
371 obsColVuc <- obsColVc[sort(unique(names(obsColVc)))]
372 legOrdVc <- c("blank", paste0("pool", 8:1), "pool", "other", "sample")
373 obsColVuc <- obsColVuc[legOrdVc[legOrdVc %in% names(obsColVuc)]]
374
375 text(rep(par("usr")[1], times = length(obsColVuc)),
376 par("usr")[3] + (0.97 - length(obsColVuc) * 0.03 + 1:length(obsColVuc) * 0.03) * diff(par("usr")[3:4]),
377 col = obsColVuc,
378 font = 2,
379 labels = names(obsColVuc),
380 pos = 4)
381 }
382
383 ## Missing/low intensities and decile values
384 ##------------------------------------------
385
386 par(mar = marLs[["sam"]])
387
388 plot(missZscoVn,
389 deciZscoMaxVn,
390 type = "n",
391 xlab = "",
392 ylab = "",
393 xlim = c(min(missZscoVn),
394 max(missZscoVn) + 0.5))
395 mtext("amount of missing values (z-score)",
396 cex = 0.7,
397 line = 2,
398 side = 1)
399 mtext("deciles (zscore)",
400 cex = 0.7,
401 las = 0,
402 line = 2,
403 side = 2)
404 abline(h = qnorm(1 - thrVn["pvalue"] / 2) * c(-1, 1), lty = "dashed")
405 abline(v = qnorm(1 - thrVn["pvalue"] / 2) * c(-1, 1), lty = "dashed")
406 text(missZscoVn,
407 deciZscoMaxVn,
408 cex = 0.7,
409 col = obsColVc,
410 labels = rownames(datMN))
411
412 ## tit: Title
413 ##-----------
414
415 par(mar = marLs[["tit"]])
416 plot(0:1, bty = "n", type = "n", xaxt = "n", yaxt = "n", xlab = "", ylab = "")
417 text(1.5, 1, cex = 1.3, labels = "Quality Metrics")
418 text(1, 0.85, adj=0, cex = 1.1, labels = paste0("NAs: ",
419 round(length(which(is.na(c(datMN)))) / cumprod(dim(datMN))[2] * 100), "%"))
420 text(1, 0.75, adj=0, cex = 1.1, labels = paste0("0 values: ",
421 round(sum(abs(datMN) < epsN, na.rm=TRUE) / cumprod(dim(datMN))[2] * 100, 2), "%"))
422 text(1, 0.65, adj=0, cex = 1.1, labels = paste0("min: ", signif(min(datMN, na.rm=TRUE), 2)))
423 text(1, 0.55, adj=0, cex = 1.1, labels = paste0("median: ", signif(median(datMN, na.rm=TRUE), 2)))
424 text(1, 0.45, adj=0, cex = 1.1, labels = paste0("mean: ", signif(mean(datMN, na.rm=TRUE), 2)))
425 text(1, 0.35, adj=0, cex = 1.1, labels = paste0("max: ", signif(max(datMN, na.rm=TRUE), 2)))
426 if("sampleType" %in% colnames(samDF) &&
427 "pool" %in% samDF[, "sampleType"])
428 text(1,
429 0.25,
430 adj=0, cex = 1.1,
431 labels = paste0("pool CV < ",
432 round(thrVn["poolCv"] * 100), "%: ",
433 round(sum(varDF[, "pool_CV"] < thrVn["poolCv"]) / nrow(varDF) * 100),
434 "%"))
435
436 text(1, 0.1, adj=0, labels = paste0("Thresholds used in plots:"))
437 text(1, 0, adj=0, labels = paste0(" p-value = ", thrVn["pvalue"]))
438
439 ## dri: Analytical drift
440 ##----------------------
441
442 par(mar = marLs[["dri"]])
443
444 ## ordering
445
446 driDatMN <- datMN
447 driSamDF <- samDF
448
449 driSamDF[, "ordIniVi"] <- 1:nrow(driDatMN)
450
451 if("injectionOrder" %in% colnames(driSamDF)) {
452 if("batch" %in% colnames(driSamDF))
453 ordVi <- order(driSamDF[, "batch"],
454 driSamDF[, "injectionOrder"])
455 else
456 ordVi <- order(driSamDF[, "injectionOrder"])
457 } else
458 ordVi <- 1:nrow(driDatMN)
459
460 driDatMN <- driDatMN[ordVi, ]
461 driSamDF <- driSamDF[ordVi, ]
462
463 driColVc <- rep("black", nrow(driDatMN))
464 if("sampleType" %in% colnames(driSamDF))
465 driColVc <- obsColF(driSamDF[, "sampleType"])
466
467 plot(rowSums(driDatMN, na.rm=TRUE),
468 col = driColVc,
469 pch = 18,
470 xlab = "",
471 ylab = "")
472
473 mtext("injection order",
474 cex = 0.7,
475 line = 2,
476 side = 1)
477
478 mtext("Sum of intens. for all variables",
479 cex = 0.7,
480 line = 2,
481 side = 2)
482
483 ## msd: Sd vs Mean plot
484 ##---------------------
485
486 par(mar = marLs[["msd"]])
487 plot(apply(datMN, 2, function(y) mean(y, na.rm = TRUE)),
488 apply(datMN, 2, function(y) sd(y, na.rm = TRUE)),
489 col=obsColVc,
490 pch = 18,
491 xlab = "",
492 ylab = "")
493 mtext("mean",
494 cex = 0.7,
495 line = 2,
496 side = 1)
497 mtext("sd",
498 cex = 0.7,
499 line = 2,
500 side = 2)
501
502 ## sca-6: Color scale
503 ##-------------------
504
505 par(mar = marLs[["sca"]])
506
507 ylimVn <- c(0, 256)
508 ybottomVn <- 0:255
509 ytopVn <- 1:256
510
511 plot(x = 0,
512 y = 0,
513 font.axis = 2,
514 font.lab = 2,
515 type = "n",
516 xlim = c(0, 1),
517 ylim = ylimVn,
518 xlab = "",
519 ylab = "",
520 xaxs = "i",
521 yaxs = "i",
522 xaxt = "n",
523 yaxt = "n")
524
525 rect(xleft = 0,
526 ybottom = ybottomVn,
527 xright = 1,
528 ytop = ytopVn,
529 col = palHeaVc,
530 border = NA)
531
532 eval(parse(text = paste("axis(at = axiPreF(c(ifelse(min(datMN, na.rm = TRUE) == -Inf, yes = 0, no = min(datMN, na.rm = TRUE)) , max(datMN, na.rm = TRUE)), 256)$atVn,
533 font = 2,
534 font.axis = 2,
535 labels = axiPreF(c(ifelse(min(datMN, na.rm = TRUE) == -Inf, yes = 0, no = min(datMN, na.rm = TRUE)), max(datMN, na.rm = TRUE)), 256)$labVn,
536 las = 1,
537 lwd = 2,
538 lwd.ticks = 2,
539 side = 2,
540 xpd = TRUE)", sep = "")))
541
542 arrows(par("usr")[1],
543 par("usr")[4],
544 par("usr")[1],
545 par("usr")[3],
546 code = 0,
547 lwd = 2,
548 xpd = TRUE)
549
550 ## ima: Image
551 ##-----------
552
553 par(mar = marLs[["ima"]])
554
555 ploRgeVn <- range(datMN, na.rm = TRUE)
556
557 imaMN <- t(datMN)[, rev(1:nrow(datMN)), drop = FALSE]
558
559 image(x = 1:nrow(imaMN),
560 y = 1:ncol(imaMN),
561 z = imaMN,
562 col = palHeaVc,
563 font.axis = 2,
564 font.lab = 2,
565 xaxt = "n",
566 yaxt = "n",
567 xlab = "",
568 ylab = "")
569
570 if(length(rownames(datMN)) == 0) {
571 rowNamVc <- rep("", times = nrow(datMN))
572 } else
573 rowNamVc <- rownames(datMN)
574
575 if(length(colnames(datMN)) == 0) {
576 colNamVc <- rep("", times = ncol(datMN))
577 } else
578 colNamVc <- colnames(datMN)
579
580 xlaVc <- paste(paste(rep("[", 2),
581 c(1, nrow(imaMN)),
582 rep("] ", 2),
583 sep = ""),
584 rep("\n", times = 2),
585 c(colNamVc[1], tail(colNamVc, 1)),
586 sep = "")
587
588 for(k in 1:2)
589 axis(side = 3,
590 hadj = c(0, 1)[k],
591 at = c(1, nrow(imaMN))[k],
592 cex = 0.8,
593 font = 2,
594 labels = xlaVc[k],
595 line = -0.5,
596 tick = FALSE)
597
598
599 ylaVc <- paste(paste(rep("[", times = 2),
600 c(ncol(imaMN), 1),
601 rep("]", times = 2),
602 sep = ""),
603 rep("\n", times = 2),
604 c(tail(rowNamVc, 1), rowNamVc[1]),
605 sep = "")
606
607 for(k in 1:2)
608 axis(side = 2,
609 at = c(1, ncol(imaMN))[k],
610 cex = 0.8,
611 font = 2,
612 hadj = c(0, 1)[k],
613 labels = ylaVc[k],
614 las = 0,
615 line = -0.5,
616 lty = "blank",
617 tick = FALSE)
618
619 box(lwd = 2)
620
621
622 }
623
624
625 zScoreF <- function(x) {
626 sdxN <- sd(x, na.rm = TRUE)
627 if(sdxN < epsN)
628 return(rep(0, length(x)))
629 else
630 return((x - mean(x, na.rm = TRUE)) / sdxN)
631 }
632
633
634 ## Option
635 ##-------
636
637 strAsFacL <- options()$stringsAsFactors
638 options(stingsAsFactors = FALSE)
639
640 ## Constants
641 ##----------
642
643 epsN <- .Machine[["double.eps"]] ## [1] 2.22e-16
644
645
646 ##------------------------------
647 ## Start
648 ##------------------------------
649
650 if(!is.null(log.txtC))
651 sink(log.txtC)
652
653 ## Description
654 ##------------
655
656 cat("\n\nData description:\n\n", sep = "")
657 cat("observations:", nrow(datMN), "\n")
658 cat("variables:", ncol(datMN), "\n")
659 cat("missing:", sum(is.na(datMN)), "\n")
660 cat("0 values (%):",
661 sum(abs(datMN) < epsN, na.rm=TRUE) / cumprod(dim(datMN))[2] * 100, "\n")
662 cat("min:", min(datMN, na.rm=TRUE), "\n")
663 cat("mean:", signif(mean(datMN, na.rm=TRUE), 2), "\n")
664 cat("median:", signif(median(datMN, na.rm=TRUE), 2), "\n")
665 cat("max:", signif(max(datMN, na.rm=TRUE), 2), "\n")
666
667 if("sampleType" %in% colnames(samDF)) {
668 cat("\nSample types:\n", sep = "")
669 print(table(samDF[, "sampleType"]))
670 cat("\n", sep="")
671 }
672
673
674 ##------------------------------
675 ## Variable metrics
676 ##------------------------------
677
678
679 ## 'blank' observations
680
681 if("sampleType" %in% colnames(samDF) && "blank" %in% samDF[, "sampleType"]) {
682
683 cat("\nVariables: Blank mean, sd, and CV\n", sep="")
684
685 blkVl <- samDF[, "sampleType"] == "blank"
686
687 if(sum(blkVl) == 1)
688 varDF[, "blank_mean"] <- datMN[blkVl, ]
689 else
690 varDF[, "blank_mean"] <- apply(datMN[blkVl, , drop=FALSE], 2, function(varVn) mean(varVn, na.rm=TRUE))
691
692 if(sum(blkVl) == 1)
693 varDF[, "blank_sd"] <- rep(0, nrow(varDF))
694 else
695 varDF[, "blank_sd"] <- apply(datMN[blkVl, , drop=FALSE], 2, function(varVn) sd(varVn, na.rm=TRUE))
696
697 varDF[, "blank_CV"] <- varDF[, "blank_sd"] / varDF[, "blank_mean"]
698
699 }
700
701
702 ## 'sample' observations
703
704 if("sampleType" %in% colnames(samDF) && "sample" %in% samDF[, "sampleType"]) {
705
706 cat("\nVariables: Sample mean, sd, and CV\n", sep="")
707
708 samVl <- samDF[, "sampleType"] == "sample"
709
710 if(sum(samVl) == 1)
711 varDF[, "sample_mean"] <- datMN[samVl, ]
712 else
713 varDF[, "sample_mean"] <- apply(datMN[samVl, , drop=FALSE], 2, function(varVn) mean(varVn, na.rm=TRUE))
714
715 if(sum(samVl) == 1)
716 varDF[, "sample_sd"] <- rep(0, nrow(varDF))
717 else
718 varDF[, "sample_sd"] <- apply(datMN[samVl, , drop=FALSE], 2, function(varVn) sd(varVn, na.rm=TRUE))
719
720 varDF[, "sample_CV"] <- varDF[, "sample_sd"] / varDF[, "sample_mean"]
721
722 }
723
724 ## 'blank' mean / 'sample' mean ratio
725
726 if(all(c("blank_mean", "sample_mean") %in% colnames(varDF))) {
727
728 cat("\nVariables: Blank mean over sample mean\n", sep="")
729
730 varDF[, "blankMean_over_sampleMean"] <- varDF[, "blank_mean"] / varDF[, "sample_mean"]
731
732 }
733
734 ## 'pool' observations
735
736 if("sampleType" %in% colnames(samDF) && "pool" %in% samDF[, "sampleType"]) {
737
738 cat("\nVariables: Pool mean, sd, and CV\n", sep="")
739
740 pooVl <- samDF[, "sampleType"] == "pool"
741
742 if(sum(pooVl) == 1)
743 varDF[, "pool_mean"] <- datMN[pooVl, ]
744 else
745 varDF[, "pool_mean"] <- apply(datMN[pooVl, , drop=FALSE], 2, function(varVn) mean(varVn, na.rm=TRUE))
746
747 if(sum(pooVl) == 1)
748 varDF[, "pool_sd"] <- rep(0, nrow(varDF))
749 else
750 varDF[, "pool_sd"] <- apply(datMN[pooVl, , drop=FALSE], 2, function(varVn) sd(varVn, na.rm=TRUE))
751
752 varDF[, "pool_CV"] <- varDF[, "pool_sd"] / varDF[, "pool_mean"]
753
754 }
755
756 ## 'pool' CV / 'sample' CV ratio
757
758 if(all(c("pool_CV", "sample_CV") %in% colnames(varDF))) {
759
760 cat("\nVariables: Pool CV over sample CV\n", sep="")
761
762 varDF[, "poolCV_over_sampleCV"] <- varDF[, "pool_CV"] / varDF[, "sample_CV"]
763
764 }
765
766
767 ## 'pool' dilutions
768
769 if("sampleType" %in% colnames(samDF) && any(grepl("pool.+", samDF[, "sampleType"]))) {
770
771 pooVi <- grep("pool.*", samDF[, "sampleType"]) ## pool, pool2, pool4, poolInter, ...
772
773 pooNamVc <- samDF[pooVi, "sampleType"]
774
775 if(pooAsPo1L) {
776
777 pooNamVc[pooNamVc == "pool"] <- "pool1" ## 'pool' -> 'pool1'
778
779 } else {
780
781 pooVl <- pooNamVc == "pool"
782 pooVi <- pooVi[!pooVl]
783 pooNamVc <- pooNamVc[!pooVl]
784
785 }
786
787 pooDilVc <- gsub("pool", "", pooNamVc)
788
789 pooDilVl <- sapply(pooDilVc, allDigF)
790
791 if(sum(pooDilVl)) {
792
793 cat("\nVariables: Pool dilutions\n", sep="")
794
795 pooNamVc <- pooNamVc[pooDilVl] ## for the plot
796
797 pooVi <- pooVi[pooDilVl]
798
799 dilVn <- 1 / as.numeric(pooDilVc[pooDilVl])
800
801 varDF[, "poolDil_correl"] <- apply(datMN[pooVi, , drop=FALSE], 2,
802 function(varVn) cor(dilVn, varVn))
803
804 varDF[, "poolDil_pval"] <- apply(datMN[pooVi, , drop=FALSE], 2,
805 function(varVn) cor.test(dilVn, varVn)[["p.value"]])
806
807 }
808
809 }
810
811
812 ##------------------------------
813 ## Sample metrics
814 ##------------------------------
815
816
817 ## Hotelling: p-value associated to the distance from the center in the first PCA score plane
818
819 cat("\nObservations: Hotelling ellipse\n", sep="")
820
821 ropLs <- opls(datMN, predI = 2, plotL = FALSE, printL = FALSE)
822
823 ropScoreMN <- getScoreMN(ropLs)
824
825 invCovScoMN <- solve(cov(ropScoreMN))
826
827 n <- nrow(datMN)
828 hotN <- 2 * (n - 1) * (n^2 - 1) / (n^2 * (n - 2))
829
830 hotPvaVn <- apply(ropScoreMN,
831 1,
832 function(x)
833 1 - pf(1 / hotN * t(as.matrix(x)) %*% invCovScoMN %*% as.matrix(x), 2, n - 2))
834
835 samDF[, "hotelling_pval"] <- hotPvaVn
836
837 ## p-value associated to number of missing values
838
839 cat("\nObservations: Missing values\n", sep="")
840
841 missZscoVn <- zScoreF(apply(datMN,
842 1,
843 function(rowVn) {
844 sum(is.na(rowVn))
845 }))
846
847 samDF[, "missing_pval"] <- sapply(missZscoVn, function(zscoN) 2 * (1 - pnorm(abs(zscoN))))
848
849 ## p-value associated to the deciles of the profiles
850
851 cat("\nObservations: Profile deciles\n", sep="")
852
853 deciMN <- t(as.matrix(apply(datMN,
854 1,
855 function(x) quantile(x, 0.1 * 1:9, na.rm = TRUE))))
856
857 deciZscoMN <- apply(deciMN, 2, zScoreF)
858
859 deciZscoMaxVn <- apply(deciZscoMN, 1, function(rowVn) rowVn[which.max(abs(rowVn))])
860
861 samDF[, "decile_pval"] <- sapply(deciZscoMaxVn, function(zscoN) 2 * (1 - pnorm(abs(zscoN))))
862
863
864 ##------------------------------
865 ## Figure
866 ##------------------------------
867
868 cat("\nPlotting\n")
869
870 if(!is.null(fig.pdfC)) {
871 pdf(fig.pdfC, width=11, height=7)
872 } else
873 dev.new(width=11, height=7)
874
875 datPloF()
876
877 if(!is.null(fig.pdfC))
878 dev.off()
879
880
881 ##------------------------------
882 ## End
883 ##------------------------------
884
885
886 if(!is.null(log.txtC))
887 sink()
888
889 options(stingsAsFactors = strAsFacL)
890 options(warn = optWrnN)
891
892 return(list(samDF=samDF,
893 varDF=varDF))
894
895
896 } ## qualityMetricsF