Mercurial > repos > lecorguille > xcms_group
comparison lib.r @ 4:2db1d1d0f131 draft
planemo upload for repository https://github.com/workflow4metabolomics/xcms commit 83b80dcd96b379518c2e4ace992affc889d32ca6
author | lecorguille |
---|---|
date | Fri, 08 Apr 2016 10:39:09 -0400 |
parents | |
children | 2c1d7df89cf6 |
comparison
equal
deleted
inserted
replaced
3:98e33cdf0eb1 | 4:2db1d1d0f131 |
---|---|
1 # lib.r version="2.0.1" | |
2 #Authors ABiMS TEAM | |
3 #Lib.r for Galaxy Workflow4Metabo | |
4 #version 2.2 | |
5 #Based on lib.r 2.1 | |
6 #Modifications made by Guitton Yann | |
7 #correct bug in Base Peak Chromatogram (BPC) option, not only TIC when scanrange used in xcmsSet | |
8 #Note if scanrange is used a warning is prompted in R console but do not stop PDF generation | |
9 | |
10 | |
11 | |
12 | |
13 #@author Y. Guitton | |
14 getBPC <- function(file,rtcor=NULL, ...) { | |
15 object <- xcmsRaw(file) | |
16 sel <- profRange(object, ...) | |
17 cbind(if (is.null(rtcor)) object@scantime[sel$scanidx] else rtcor ,xcms:::colMax(object@env$profile[sel$massidx,sel$scanidx,drop=FALSE])) | |
18 #plotChrom(xcmsRaw(file), base=T) | |
19 } | |
20 | |
21 #@author Y. Guitton | |
22 getBPCs <- function (xcmsSet=NULL, pdfname="BPCs.pdf",rt=c("raw","corrected"), scanrange=NULL) { | |
23 cat("Creating BIC pdf...\n") | |
24 | |
25 if (is.null(xcmsSet)) { | |
26 cat("Enter an xcmsSet \n") | |
27 stop() | |
28 } else { | |
29 files <- filepaths(xcmsSet) | |
30 } | |
31 | |
32 class<-as.vector(levels(xcmsSet@phenoData[,1])) #sometime phenoData have more than 1 column use first as class | |
33 | |
34 classnames<-vector("list",length(class)) | |
35 for (i in 1:length(class)){ | |
36 classnames[[i]]<-which( xcmsSet@phenoData[,1]==class[i]) | |
37 } | |
38 | |
39 N <- dim(phenoData(xcmsSet))[1] | |
40 | |
41 TIC <- vector("list",N) | |
42 | |
43 | |
44 for (j in 1:N) { | |
45 | |
46 TIC[[j]] <- getBPC(files[j]) | |
47 #good for raw | |
48 # seems strange for corrected | |
49 #errors if scanrange used in xcmsSetgeneration | |
50 if (!is.null(xcmsSet) && rt == "corrected") | |
51 rtcor <- xcmsSet@rt$corrected[[j]] else | |
52 rtcor <- NULL | |
53 | |
54 TIC[[j]] <- getBPC(files[j],rtcor=rtcor) | |
55 # TIC[[j]][,1]<-rtcor | |
56 } | |
57 | |
58 | |
59 | |
60 pdf(pdfname,w=16,h=10) | |
61 cols <- rainbow(N) | |
62 lty = 1:N | |
63 pch = 1:N | |
64 #search for max x and max y in BPCs | |
65 xlim = range(sapply(TIC, function(x) range(x[,1]))) | |
66 ylim = range(sapply(TIC, function(x) range(x[,2]))) | |
67 ylim = c(-ylim[2], ylim[2]) | |
68 | |
69 | |
70 ##plot start | |
71 | |
72 if (length(class)>2){ | |
73 for (k in 1:(length(class)-1)){ | |
74 for (l in (k+1):length(class)){ | |
75 #print(paste(class[k],"vs",class[l],sep=" ")) | |
76 plot(0, 0, type="n", xlim = xlim/60, ylim = ylim, main = paste("Base Peak Chromatograms \n","BPCs_",class[k]," vs ",class[l], sep=""), xlab = "Retention Time (min)", ylab = "BPC") | |
77 colvect<-NULL | |
78 for (j in 1:length(classnames[[k]])) { | |
79 tic <- TIC[[classnames[[k]][j]]] | |
80 # points(tic[,1]/60, tic[,2], col = cols[i], pch = pch[i], type="l") | |
81 points(tic[,1]/60, tic[,2], col = cols[classnames[[k]][j]], pch = pch[classnames[[k]][j]], type="l") | |
82 colvect<-append(colvect,cols[classnames[[k]][j]]) | |
83 } | |
84 for (j in 1:length(classnames[[l]])) { | |
85 # i=class2names[j] | |
86 tic <- TIC[[classnames[[l]][j]]] | |
87 points(tic[,1]/60, -tic[,2], col = cols[classnames[[l]][j]], pch = pch[classnames[[l]][j]], type="l") | |
88 colvect<-append(colvect,cols[classnames[[l]][j]]) | |
89 } | |
90 legend("topright",paste(basename(files[c(classnames[[k]],classnames[[l]])])), col = colvect, lty = lty, pch = pch) | |
91 } | |
92 } | |
93 }#end if length >2 | |
94 | |
95 if (length(class)==2){ | |
96 k=1 | |
97 l=2 | |
98 colvect<-NULL | |
99 plot(0, 0, type="n", xlim = xlim/60, ylim = ylim, main = paste("Base Peak Chromatograms \n","BPCs_",class[k],"vs",class[l], sep=""), xlab = "Retention Time (min)", ylab = "BPC") | |
100 | |
101 for (j in 1:length(classnames[[k]])) { | |
102 | |
103 tic <- TIC[[classnames[[k]][j]]] | |
104 # points(tic[,1]/60, tic[,2], col = cols[i], pch = pch[i], type="l") | |
105 points(tic[,1]/60, tic[,2], col = cols[classnames[[k]][j]], pch = pch[classnames[[k]][j]], type="l") | |
106 colvect<-append(colvect,cols[classnames[[k]][j]]) | |
107 } | |
108 for (j in 1:length(classnames[[l]])) { | |
109 # i=class2names[j] | |
110 tic <- TIC[[classnames[[l]][j]]] | |
111 points(tic[,1]/60, -tic[,2], col = cols[classnames[[l]][j]], pch = pch[classnames[[l]][j]], type="l") | |
112 colvect<-append(colvect,cols[classnames[[l]][j]]) | |
113 } | |
114 legend("topright",paste(basename(files[c(classnames[[k]],classnames[[l]])])), col = colvect, lty = lty, pch = pch) | |
115 | |
116 }#end length ==2 | |
117 | |
118 dev.off() #pdf(pdfname,w=16,h=10) | |
119 | |
120 invisible(TIC) | |
121 } | |
122 | |
123 | |
124 | |
125 #@author Y. Guitton | |
126 getTIC <- function(file,rtcor=NULL) { | |
127 object <- xcmsRaw(file) | |
128 cbind(if (is.null(rtcor)) object@scantime else rtcor, rawEIC(object,mzrange=range(object@env$mz))$intensity) | |
129 } | |
130 | |
131 ## | |
132 ## overlay TIC from all files in current folder or from xcmsSet, create pdf | |
133 ## | |
134 #@author Y. Guitton | |
135 getTICs <- function(xcmsSet=NULL,files=NULL, pdfname="TICs.pdf",rt=c("raw","corrected")) { | |
136 cat("Creating TIC pdf...\n") | |
137 | |
138 if (is.null(xcmsSet)) { | |
139 filepattern <- c("[Cc][Dd][Ff]", "[Nn][Cc]", "([Mm][Zz])?[Xx][Mm][Ll]", "[Mm][Zz][Dd][Aa][Tt][Aa]", "[Mm][Zz][Mm][Ll]") | |
140 filepattern <- paste(paste("\\.", filepattern, "$", sep = ""), collapse = "|") | |
141 if (is.null(files)) | |
142 files <- getwd() | |
143 info <- file.info(files) | |
144 listed <- list.files(files[info$isdir], pattern = filepattern, recursive = TRUE, full.names = TRUE) | |
145 files <- c(files[!info$isdir], listed) | |
146 } else { | |
147 files <- filepaths(xcmsSet) | |
148 } | |
149 | |
150 class<-as.vector(levels(xcmsSet@phenoData[,1])) #sometime phenoData have more than 1 column use first as class | |
151 | |
152 classnames<-vector("list",length(class)) | |
153 for (i in 1:length(class)){ | |
154 classnames[[i]]<-which( xcmsSet@phenoData[,1]==class[i]) | |
155 } | |
156 | |
157 N <- length(files) | |
158 TIC <- vector("list",N) | |
159 | |
160 for (i in 1:N) { | |
161 if (!is.null(xcmsSet) && rt == "corrected") | |
162 rtcor <- xcmsSet@rt$corrected[[i]] else | |
163 rtcor <- NULL | |
164 TIC[[i]] <- getTIC(files[i],rtcor=rtcor) | |
165 } | |
166 | |
167 pdf(pdfname,w=16,h=10) | |
168 cols <- rainbow(N) | |
169 lty = 1:N | |
170 pch = 1:N | |
171 #search for max x and max y in TICs | |
172 xlim = range(sapply(TIC, function(x) range(x[,1]))) | |
173 ylim = range(sapply(TIC, function(x) range(x[,2]))) | |
174 ylim = c(-ylim[2], ylim[2]) | |
175 | |
176 | |
177 ##plot start | |
178 if (length(class)>2){ | |
179 for (k in 1:(length(class)-1)){ | |
180 for (l in (k+1):length(class)){ | |
181 #print(paste(class[k],"vs",class[l],sep=" ")) | |
182 plot(0, 0, type="n", xlim = xlim/60, ylim = ylim, main = paste("Total Ion Chromatograms \n","TICs_",class[k]," vs ",class[l], sep=""), xlab = "Retention Time (min)", ylab = "TIC") | |
183 colvect<-NULL | |
184 for (j in 1:length(classnames[[k]])) { | |
185 | |
186 tic <- TIC[[classnames[[k]][j]]] | |
187 # points(tic[,1]/60, tic[,2], col = cols[i], pch = pch[i], type="l") | |
188 points(tic[,1]/60, tic[,2], col = cols[classnames[[k]][j]], pch = pch[classnames[[k]][j]], type="l") | |
189 colvect<-append(colvect,cols[classnames[[k]][j]]) | |
190 } | |
191 for (j in 1:length(classnames[[l]])) { | |
192 # i=class2names[j] | |
193 tic <- TIC[[classnames[[l]][j]]] | |
194 points(tic[,1]/60, -tic[,2], col = cols[classnames[[l]][j]], pch = pch[classnames[[l]][j]], type="l") | |
195 colvect<-append(colvect,cols[classnames[[l]][j]]) | |
196 } | |
197 legend("topright",paste(basename(files[c(classnames[[k]],classnames[[l]])])), col = colvect, lty = lty, pch = pch) | |
198 } | |
199 } | |
200 }#end if length >2 | |
201 if (length(class)==2){ | |
202 k=1 | |
203 l=2 | |
204 | |
205 plot(0, 0, type="n", xlim = xlim/60, ylim = ylim, main = paste("Total Ion Chromatograms \n","TICs_",class[k],"vs",class[l], sep=""), xlab = "Retention Time (min)", ylab = "TIC") | |
206 colvect<-NULL | |
207 for (j in 1:length(classnames[[k]])) { | |
208 tic <- TIC[[classnames[[k]][j]]] | |
209 # points(tic[,1]/60, tic[,2], col = cols[i], pch = pch[i], type="l") | |
210 points(tic[,1]/60, tic[,2], col = cols[classnames[[k]][j]], pch = pch[classnames[[k]][j]], type="l") | |
211 colvect<-append(colvect,cols[classnames[[k]][j]]) | |
212 } | |
213 for (j in 1:length(classnames[[l]])) { | |
214 # i=class2names[j] | |
215 tic <- TIC[[classnames[[l]][j]]] | |
216 points(tic[,1]/60, -tic[,2], col = cols[classnames[[l]][j]], pch = pch[classnames[[l]][j]], type="l") | |
217 colvect<-append(colvect,cols[classnames[[l]][j]]) | |
218 } | |
219 legend("topright",paste(basename(files[c(classnames[[k]],classnames[[l]])])), col = colvect, lty = lty, pch = pch) | |
220 | |
221 }#end length ==2 | |
222 dev.off() #pdf(pdfname,w=16,h=10) | |
223 | |
224 invisible(TIC) | |
225 } | |
226 | |
227 | |
228 | |
229 ## | |
230 ## Get the polarities from all the samples of a condition | |
231 #@author Misharl Monsoor misharl.monsoor@sb-roscoff.fr ABiMS TEAM | |
232 #@author Gildas Le Corguille lecorguille@sb-roscoff.fr ABiMS TEAM | |
233 getSampleMetadata <- function(xcmsSet=NULL, sampleMetadataOutput="sampleMetadata.tsv") { | |
234 cat("Creating the sampleMetadata file...\n") | |
235 | |
236 #Create the sampleMetada dataframe | |
237 sampleMetadata=xset@phenoData | |
238 sampleNamesOrigin=rownames(sampleMetadata) | |
239 sampleNamesMakeNames=make.names(sampleNamesOrigin) | |
240 | |
241 if (any(duplicated(sampleNamesMakeNames))) { | |
242 write("\n\nERROR: Usually, R has trouble to deal with special characters in its column names, so it rename them using make.names().\nIn your case, at least two columns after the renaming obtain the same name, thus XCMS will collapse those columns per name.", stderr()) | |
243 for (sampleName in sampleNamesOrigin) { | |
244 write(paste(sampleName,"\t->\t",make.names(sampleName)),stderr()) | |
245 } | |
246 stop("\n\nERROR: One or more of your files will not be import by xcmsSet. It may due to bad characters in their filenames.") | |
247 } | |
248 | |
249 if (!all(sampleNamesOrigin == sampleNamesMakeNames)) { | |
250 cat("\n\nWARNING: Usually, R has trouble to deal with special characters in its column names, so it rename them using make.names()\nIn your case, one or more sample names will be renamed in the sampleMetadata and dataMatrix files:\n") | |
251 for (sampleName in sampleNamesOrigin) { | |
252 cat(paste(sampleName,"\t->\t",make.names(sampleName),"\n")) | |
253 } | |
254 } | |
255 | |
256 sampleMetadata$sampleMetadata=sampleNamesMakeNames | |
257 sampleMetadata=cbind(sampleMetadata["sampleMetadata"],sampleMetadata["class"]) #Reorder columns | |
258 rownames(sampleMetadata)=NULL | |
259 | |
260 #Create a list of files name in the current directory | |
261 list_files=xset@filepaths | |
262 #For each sample file, the following actions are done | |
263 for (file in list_files){ | |
264 #Check if the file is in the CDF format | |
265 if (!mzR:::netCDFIsFile(file)){ | |
266 | |
267 # If the column isn't exist, with add one filled with NA | |
268 if (is.null(sampleMetadata$polarity)) sampleMetadata$polarity=NA | |
269 | |
270 #Create a simple xcmsRaw object for each sample | |
271 xcmsRaw=xcmsRaw(file) | |
272 #Extract the polarity (a list of polarities) | |
273 polarity=xcmsRaw@polarity | |
274 #Verify if all the scans have the same polarity | |
275 uniq_list=unique(polarity) | |
276 if (length(uniq_list)>1){ | |
277 polarity="mixed" | |
278 } else { | |
279 polarity=as.character(uniq_list) | |
280 } | |
281 #Transforms the character to obtain only the sample name | |
282 filename=basename(file) | |
283 library(tools) | |
284 samplename=file_path_sans_ext(filename) | |
285 | |
286 #Set the polarity attribute | |
287 sampleMetadata$polarity[sampleMetadata$sampleMetadata==samplename]=polarity | |
288 | |
289 #Delete xcmsRaw object because it creates a bug for the fillpeaks step | |
290 rm(xcmsRaw) | |
291 } | |
292 | |
293 } | |
294 | |
295 write.table(sampleMetadata, sep="\t", quote=FALSE, row.names=FALSE, file=sampleMetadataOutput) | |
296 | |
297 return(list("sampleNamesOrigin"=sampleNamesOrigin,"sampleNamesMakeNames"=sampleNamesMakeNames)) | |
298 | |
299 } | |
300 | |
301 | |
302 ## | |
303 ## This function check if xcms will found all the files | |
304 ## | |
305 #@author Gildas Le Corguille lecorguille@sb-roscoff.fr ABiMS TEAM | |
306 checkFilesCompatibilityWithXcms <- function(directory) { | |
307 cat("Checking files filenames compatibilities with xmcs...\n") | |
308 # WHAT XCMS WILL FIND | |
309 filepattern <- c("[Cc][Dd][Ff]", "[Nn][Cc]", "([Mm][Zz])?[Xx][Mm][Ll]","[Mm][Zz][Dd][Aa][Tt][Aa]", "[Mm][Zz][Mm][Ll]") | |
310 filepattern <- paste(paste("\\.", filepattern, "$", sep = ""),collapse = "|") | |
311 info <- file.info(directory) | |
312 listed <- list.files(directory[info$isdir], pattern = filepattern,recursive = TRUE, full.names = TRUE) | |
313 files <- c(directory[!info$isdir], listed) | |
314 files_abs <- file.path(getwd(), files) | |
315 exists <- file.exists(files_abs) | |
316 files[exists] <- files_abs[exists] | |
317 files[exists] <- sub("//","/",files[exists]) | |
318 | |
319 # WHAT IS ON THE FILESYSTEM | |
320 filesystem_filepaths=system(paste("find $PWD/",directory," -not -name '\\.*' -not -path '*conda-env*' -type f -name \"*\"", sep=""), intern=T) | |
321 filesystem_filepaths=filesystem_filepaths[grep(filepattern, filesystem_filepaths, perl=T)] | |
322 | |
323 # COMPARISON | |
324 if (!is.na(table(filesystem_filepaths %in% files)["FALSE"])) { | |
325 write("\n\nERROR: List of the files which will not be imported by xcmsSet",stderr()) | |
326 write(filesystem_filepaths[!(filesystem_filepaths %in% files)],stderr()) | |
327 stop("\n\nERROR: One or more of your files will not be import by xcmsSet. It may due to bad characters in their filenames.") | |
328 | |
329 } | |
330 } | |
331 | |
332 | |
333 | |
334 ## | |
335 ## This function check if XML contains special caracters. It also checks integrity and completness. | |
336 ## | |
337 #@author Misharl Monsoor misharl.monsoor@sb-roscoff.fr ABiMS TEAM | |
338 checkXmlStructure <- function (directory) { | |
339 cat("Checking XML structure...\n") | |
340 | |
341 cmd=paste("IFS=$'\n'; for xml in $(find",directory,"-not -name '\\.*' -not -path '*conda-env*' -type f -iname '*.*ml*'); do if [ $(xmllint --nonet --noout \"$xml\" 2> /dev/null; echo $?) -gt 0 ]; then echo $xml;fi; done;") | |
342 capture=system(cmd,intern=TRUE) | |
343 | |
344 if (length(capture)>0){ | |
345 #message=paste("The following mzXML or mzML file is incorrect, please check these files first:",capture) | |
346 write("\n\nERROR: The following mzXML or mzML file(s) are incorrect, please check these files first:", stderr()) | |
347 write(capture, stderr()) | |
348 stop("ERROR: xcmsSet cannot continue with incorrect mzXML or mzML files") | |
349 } | |
350 | |
351 } | |
352 | |
353 | |
354 ## | |
355 ## This function check if XML contain special characters | |
356 ## | |
357 #@author Misharl Monsoor misharl.monsoor@sb-roscoff.fr ABiMS TEAM | |
358 deleteXmlBadCharacters<- function (directory) { | |
359 cat("Checking Non ASCII characters in the XML...\n") | |
360 | |
361 processed=F | |
362 l=system( paste("find",directory, "-not -name '\\.*' -not -path '*conda-env*' -type f -iname '*.*ml*'"),intern=TRUE) | |
363 for (i in l){ | |
364 cmd=paste("LC_ALL=C grep '[^ -~]' \"",i,"\"",sep="") | |
365 capture=suppressWarnings(system(cmd,intern=TRUE)) | |
366 if (length(capture)>0){ | |
367 cmd=paste("perl -i -pe 's/[^[:ascii:]]//g;'",i) | |
368 print( paste("WARNING: Non ASCII characters have been removed from the ",i,"file") ) | |
369 c=system(cmd,intern=TRUE) | |
370 capture="" | |
371 processed=T | |
372 } | |
373 } | |
374 if (processed) cat("\n\n") | |
375 return(processed) | |
376 } | |
377 | |
378 | |
379 ## | |
380 ## This function will compute MD5 checksum to check the data integrity | |
381 ## | |
382 #@author Gildas Le Corguille lecorguille@sb-roscoff.fr | |
383 getMd5sum <- function (directory) { | |
384 cat("Compute md5 checksum...\n") | |
385 # WHAT XCMS WILL FIND | |
386 filepattern <- c("[Cc][Dd][Ff]", "[Nn][Cc]", "([Mm][Zz])?[Xx][Mm][Ll]","[Mm][Zz][Dd][Aa][Tt][Aa]", "[Mm][Zz][Mm][Ll]") | |
387 filepattern <- paste(paste("\\.", filepattern, "$", sep = ""),collapse = "|") | |
388 info <- file.info(directory) | |
389 listed <- list.files(directory[info$isdir], pattern = filepattern,recursive = TRUE, full.names = TRUE) | |
390 files <- c(directory[!info$isdir], listed) | |
391 exists <- file.exists(files) | |
392 files <- files[exists] | |
393 | |
394 library(tools) | |
395 | |
396 #cat("\n\n") | |
397 | |
398 return(as.matrix(md5sum(files))) | |
399 } | |
400 |