comparison limma_voom.R @ 1:76d01fe0ec36 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/limma_voom commit 58c05c0ce9334f8b9c800283cfd1f40573546edd
author iuc
date Wed, 05 Jul 2017 04:39:42 -0400
parents bdebdea5f6a7
children a330ddf43861
comparison
equal deleted inserted replaced
0:bdebdea5f6a7 1:76d01fe0ec36
1 # This tool takes in a matrix of feature counts as well as gene annotations and 1 # This tool takes in a matrix of feature counts as well as gene annotations and
2 # outputs a table of top expressions as well as various plots for differential 2 # outputs a table of top expressions as well as various plots for differential
3 # expression analysis 3 # expression analysis
4 # 4 #
5 # ARGS: 1.countPath -Path to RData input containing counts 5 # ARGS: 1.countPath -Path to RData input containing counts
6 # 2.annoPath -Path to RData input containing gene annotations 6 # 2.annoPath -Path to input containing gene annotations
7 # 3.htmlPath -Path to html file linking to other outputs 7 # 3.htmlPath -Path to html file linking to other outputs
8 # 4.outPath -Path to folder to write all output to 8 # 4.outPath -Path to folder to write all output to
9 # 5.rdaOpt -String specifying if RData should be saved 9 # 5.rdaOpt -String specifying if RData should be saved
10 # 6.normOpt -String specifying type of normalisation used 10 # 6.normOpt -String specifying type of normalisation used
11 # 7.weightOpt -String specifying usage of weights 11 # 7.weightOpt -String specifying usage of weights
13 # 9.cpmReq -Float specifying cpm requirement 13 # 9.cpmReq -Float specifying cpm requirement
14 # 10.sampleReq -Integer specifying cpm requirement 14 # 10.sampleReq -Integer specifying cpm requirement
15 # 11.pAdjOpt -String specifying the p-value adjustment method 15 # 11.pAdjOpt -String specifying the p-value adjustment method
16 # 12.pValReq -Float specifying the p-value requirement 16 # 12.pValReq -Float specifying the p-value requirement
17 # 13.lfcReq -Float specifying the log-fold-change requirement 17 # 13.lfcReq -Float specifying the log-fold-change requirement
18 # 14.factorData -String containing factor names and values 18 # 14.normCounts -String specifying if normalised counts should be output
19 # 15.factPath -Path to factor information file
20 # 16.factorData -Strings containing factor names and values if manually input
19 # 21 #
20 # OUT: Voom Plot 22 # OUT: Voom Plot
21 # BCV Plot 23 # BCV Plot
22 # MA Plot 24 # MA Plot
23 # Top Expression Table 25 # Expression Table
24 # HTML file linking to the ouputs 26 # HTML file linking to the ouputs
25 # 27 #
26 # Author: Shian Su - registertonysu@gmail.com - Jan 2014 28 # Author: Shian Su - registertonysu@gmail.com - Jan 2014
29 # Modified by: Maria Doyle - Jun 2017
27 30
28 # Record starting time 31 # Record starting time
29 timeStart <- as.character(Sys.time()) 32 timeStart <- as.character(Sys.time())
30 33
31 # Load all required libraries 34 # Load all required libraries
146 cpmReq <- as.numeric(argv[9]) 149 cpmReq <- as.numeric(argv[9])
147 sampleReq <- as.numeric(argv[10]) 150 sampleReq <- as.numeric(argv[10])
148 pAdjOpt <- as.character(argv[11]) 151 pAdjOpt <- as.character(argv[11])
149 pValReq <- as.numeric(argv[12]) 152 pValReq <- as.numeric(argv[12])
150 lfcReq <- as.numeric(argv[13]) 153 lfcReq <- as.numeric(argv[13])
151 factorData <- list() 154 normCounts <- as.character(argv[14])
152 for (i in 14:length(argv)) { 155 factPath <- as.character(argv[15])
153 newFact <- unlist(strsplit(as.character(argv[i]), split="::")) 156 # Process factors
154 factorData <- rbind(factorData, newFact) 157 if (as.character(argv[16])=="None") {
155 } # Factors have the form: FACT_NAME::LEVEL,LEVEL,LEVEL,LEVEL,... 158 factorData <- read.table(factPath, header=TRUE, sep="\t")
156 159 factors <- factorData[,-1]
157 # Process arguments 160 } else {
161 factorData <- list()
162 for (i in 16:length(argv)) {
163 newFact <- unlist(strsplit(as.character(argv[i]), split="::"))
164 factorData <- rbind(factorData, newFact)
165 } # Factors have the form: FACT_NAME::LEVEL,LEVEL,LEVEL,LEVEL,... The first factor is the Primary Factor.
166
167 # Set the row names to be the name of the factor and delete first row
168 row.names(factorData) <- factorData[, 1]
169 factorData <- factorData[, -1]
170 factorData <- sapply(factorData, sanitiseGroups)
171 factorData <- sapply(factorData, strsplit, split=",")
172 factorData <- sapply(factorData, make.names)
173 # Transform factor data into data frame of R factor objects
174 factors <- data.frame(factorData)
175
176 }
177
178 # Process other arguments
158 if (weightOpt=="yes") { 179 if (weightOpt=="yes") {
159 wantWeight <- TRUE 180 wantWeight <- TRUE
160 } else { 181 } else {
161 wantWeight <- FALSE 182 wantWeight <- FALSE
162 } 183 }
171 haveAnno <- FALSE 192 haveAnno <- FALSE
172 } else { 193 } else {
173 haveAnno <- TRUE 194 haveAnno <- TRUE
174 } 195 }
175 196
176 # Set the row names to be the name of the factor and delete first row 197 if (normCounts=="yes") {
177 row.names(factorData) <- factorData[, 1] 198 wantNorm <- TRUE
178 factorData <- factorData[, -1] 199 } else {
179 factorData <- sapply(factorData, sanitiseGroups) 200 wantNorm <- FALSE
180 factorData <- sapply(factorData, strsplit, split=",") 201 }
181 factorData <- sapply(factorData, make.names) 202
182
183 # Transform factor data into data frame of R factor objects
184 factors <- data.frame(factorData)
185 203
186 #Create output directory 204 #Create output directory
187 dir.create(outPath, showWarnings=FALSE) 205 dir.create(outPath, showWarnings=FALSE)
188 206
189 # Split up contrasts seperated by comma into a vector then sanitise 207 # Split up contrasts seperated by comma into a vector then sanitise
203 for (i in 1:length(contrastData)) { 221 for (i in 1:length(contrastData)) {
204 maOutPdf[i] <- makeOut(paste0("maplot_", contrastData[i], ".pdf")) 222 maOutPdf[i] <- makeOut(paste0("maplot_", contrastData[i], ".pdf"))
205 maOutPng[i] <- makeOut(paste0("maplot_", contrastData[i], ".png")) 223 maOutPng[i] <- makeOut(paste0("maplot_", contrastData[i], ".png"))
206 topOut[i] <- makeOut(paste0("limma-voom_", contrastData[i], ".tsv")) 224 topOut[i] <- makeOut(paste0("limma-voom_", contrastData[i], ".tsv"))
207 } # Save output paths for each contrast as vectors 225 } # Save output paths for each contrast as vectors
226 normOut <- makeOut("limma-voom_normcounts.tsv")
208 rdaOut <- makeOut("RData.rda") 227 rdaOut <- makeOut("RData.rda")
209 sessionOut <- makeOut("session_info.txt") 228 sessionOut <- makeOut("session_info.txt")
210 229
211 # Initialise data for html links and images, data frame with columns Label and 230 # Initialise data for html links and images, data frame with columns Label and
212 # Link 231 # Link
219 upCount <- numeric() 238 upCount <- numeric()
220 downCount <- numeric() 239 downCount <- numeric()
221 flatCount <- numeric() 240 flatCount <- numeric()
222 241
223 # Read in counts and geneanno data 242 # Read in counts and geneanno data
224 counts <- read.table(countPath, header=TRUE, sep="\t") 243 counts <- read.table(countPath, header=TRUE, sep="\t", stringsAsFactors=FALSE)
225 row.names(counts) <- counts[, 1] 244 row.names(counts) <- counts[, 1]
226 counts <- counts[ , -1] 245 counts <- counts[ , -1]
227 countsRows <- nrow(counts) 246 countsRows <- nrow(counts)
228 if (haveAnno) { 247 if (haveAnno) {
229 geneanno <- read.table(annoPath, header=TRUE, sep="\t") 248 geneanno <- read.table(annoPath, header=TRUE, sep="\t", stringsAsFactors=FALSE)
230 } 249 }
231 250
232 ################################################################################ 251 ################################################################################
233 ### Data Processing 252 ### Data Processing
234 ################################################################################ 253 ################################################################################
245 # Filter out genes that do not have a required cpm in a required number of 264 # Filter out genes that do not have a required cpm in a required number of
246 # samples 265 # samples
247 preFilterCount <- nrow(data$counts) 266 preFilterCount <- nrow(data$counts)
248 sel <- rowSums(cpm(data$counts) > cpmReq) >= sampleReq 267 sel <- rowSums(cpm(data$counts) > cpmReq) >= sampleReq
249 data$counts <- data$counts[sel, ] 268 data$counts <- data$counts[sel, ]
250 data$genes <- data$genes[sel, ] 269 data$genes <- data$genes[sel, ,drop = FALSE]
251 postFilterCount <- nrow(data$counts) 270 postFilterCount <- nrow(data$counts)
252 filteredCount <- preFilterCount-postFilterCount 271 filteredCount <- preFilterCount-postFilterCount
253 272
254 # Creating naming data 273 # Creating naming data
255 samplenames <- colnames(data$counts) 274 samplenames <- colnames(data$counts)
256 sampleanno <- data.frame("sampleID"=samplenames, factors) 275 sampleanno <- data.frame("sampleID"=samplenames, factors)
276
257 277
258 # Generating the DGEList object "data" 278 # Generating the DGEList object "data"
259 data$samples <- sampleanno 279 data$samples <- sampleanno
260 data$samples$lib.size <- colSums(data$counts) 280 data$samples$lib.size <- colSums(data$counts)
261 data$samples$norm.factors <- 1 281 data$samples$norm.factors <- 1
262 row.names(data$samples) <- colnames(data$counts) 282 row.names(data$samples) <- colnames(data$counts)
263 data <- new("DGEList", data) 283 data <- new("DGEList", data)
264 284
265 factorList <- sapply(names(factors), pasteListName) 285 factorList <- sapply(names(factors), pasteListName)
266 formula <- "~0" 286
287 formula <- "~0"
267 for (i in 1:length(factorList)) { 288 for (i in 1:length(factorList)) {
268 formula <- paste(formula, factorList[i], sep="+") 289 formula <- paste(formula,factorList[i], sep="+")
269 } 290 }
291
270 formula <- formula(formula) 292 formula <- formula(formula)
271 design <- model.matrix(formula) 293 design <- model.matrix(formula)
294
272 for (i in 1:length(factorList)) { 295 for (i in 1:length(factorList)) {
273 colnames(design) <- gsub(factorList[i], "", colnames(design), fixed=TRUE) 296 colnames(design) <- gsub(factorList[i], "", colnames(design), fixed=TRUE)
274 } 297 }
275 298
276 # Calculating normalising factor, estimating dispersion 299 # Calculating normalising factor, estimating dispersion
277 data <- calcNormFactors(data, method=normOpt) 300 data <- calcNormFactors(data, method=normOpt)
278 #data <- estimateDisp(data, design=design, robust=TRUE) 301 #data <- estimateDisp(data, design=design, robust=TRUE)
326 invisible(dev.off()) 349 invisible(dev.off())
327 350
328 # Generate voom fit 351 # Generate voom fit
329 voomFit <- lmFit(vData, design) 352 voomFit <- lmFit(vData, design)
330 353
354 }
355
356 # Save normalised counts (log2cpm)
357 if (wantNorm) {
358 norm_counts <- data.frame(vData$genes, vData$E)
359 write.table (norm_counts, file=normOut, row.names=FALSE, sep="\t")
360 linkData <- rbind(linkData, c("limma-voom_normcounts.tsv", "limma-voom_normcounts.tsv"))
331 } 361 }
332 362
333 # Fit linear model and estimate dispersion with eBayes 363 # Fit linear model and estimate dispersion with eBayes
334 voomFit <- contrasts.fit(voomFit, contrasts) 364 voomFit <- contrasts.fit(voomFit, contrasts)
335 voomFit <- eBayes(voomFit) 365 voomFit <- eBayes(voomFit)
366 396
367 # Write top expressions table 397 # Write top expressions table
368 top <- topTable(voomFit, coef=i, number=Inf, sort.by="P") 398 top <- topTable(voomFit, coef=i, number=Inf, sort.by="P")
369 write.table(top, file=topOut[i], row.names=FALSE, sep="\t") 399 write.table(top, file=topOut[i], row.names=FALSE, sep="\t")
370 400
371 linkName <- paste0("limma-voom_", contrastData[i], 401 linkName <- paste0("limma-voom_", contrastData[i], ".tsv")
372 ".tsv")
373 linkAddr <- paste0("limma-voom_", contrastData[i], ".tsv") 402 linkAddr <- paste0("limma-voom_", contrastData[i], ".tsv")
374 linkData <- rbind(linkData, c(linkName, linkAddr)) 403 linkData <- rbind(linkData, c(linkName, linkAddr))
375 404
376 # Plot MA (log ratios vs mean average) using limma package on weighted data 405 # Plot MA (log ratios vs mean average) using limma package on weighted
377 pdf(maOutPdf[i]) 406 pdf(maOutPdf[i])
378 limma::plotMA(voomFit, status=status, coef=i, 407 limma::plotMA(voomFit, status=status, coef=i,
379 main=paste("MA Plot:", unmake.names(contrastData[i])), 408 main=paste("MA Plot:", unmake.names(contrastData[i])),
380 col=alpha(c("firebrick", "blue"), 0.4), values=c("1", "-1"), 409 col=alpha(c("firebrick", "blue"), 0.4), values=c("1", "-1"),
381 xlab="Average Expression", ylab="logFC") 410 xlab="Average Expression", ylab="logFC")
532 } 561 }
533 cata("</ul>\n") 562 cata("</ul>\n")
534 563
535 cata("<h4>Summary of experimental data:</h4>\n") 564 cata("<h4>Summary of experimental data:</h4>\n")
536 565
537 cata("<p>*CHECK THAT SAMPLES ARE ASSOCIATED WITH CORRECT GROUP*</p>\n") 566 cata("<p>*CHECK THAT SAMPLES ARE ASSOCIATED WITH CORRECT GROUP(S)*</p>\n")
538 567
539 cata("<table border=\"1\" cellpadding=\"3\">\n") 568 cata("<table border=\"1\" cellpadding=\"3\">\n")
540 cata("<tr>\n") 569 cata("<tr>\n")
541 TableItem() 570 TableHeadItem("SampleID")
542 for (i in names(factors)) { 571 TableHeadItem(names(factors)[1]," (Primary Factor)")
572
573 for (i in names(factors)[2:length(names(factors))]) {
543 TableHeadItem(i) 574 TableHeadItem(i)
544 } 575 }
545 cata("</tr>\n") 576 cata("</tr>\n")
546 577
547 for (i in 1:nrow(factors)) { 578 for (i in 1:nrow(factors)) {
548 cata("<tr>\n") 579 cata("<tr>\n")
549 TableHeadItem(row.names(factors)[i]) 580 TableHeadItem(row.names(factors)[i])
550 for (j in ncol(factors)) { 581 for (j in 1:ncol(factors)) {
551 TableItem(as.character(unmake.names(factors[i, j]))) 582 TableItem(as.character(unmake.names(factors[i, j])))
552 } 583 }
553 cata("</tr>\n") 584 cata("</tr>\n")
554 } 585 }
555 cata("</table>") 586 cata("</table>")