Mercurial > repos > iuc > limma_voom
comparison limma_voom.R @ 1:76d01fe0ec36 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/limma_voom commit 58c05c0ce9334f8b9c800283cfd1f40573546edd
author | iuc |
---|---|
date | Wed, 05 Jul 2017 04:39:42 -0400 |
parents | bdebdea5f6a7 |
children | a330ddf43861 |
comparison
equal
deleted
inserted
replaced
0:bdebdea5f6a7 | 1:76d01fe0ec36 |
---|---|
1 # This tool takes in a matrix of feature counts as well as gene annotations and | 1 # This tool takes in a matrix of feature counts as well as gene annotations and |
2 # outputs a table of top expressions as well as various plots for differential | 2 # outputs a table of top expressions as well as various plots for differential |
3 # expression analysis | 3 # expression analysis |
4 # | 4 # |
5 # ARGS: 1.countPath -Path to RData input containing counts | 5 # ARGS: 1.countPath -Path to RData input containing counts |
6 # 2.annoPath -Path to RData input containing gene annotations | 6 # 2.annoPath -Path to input containing gene annotations |
7 # 3.htmlPath -Path to html file linking to other outputs | 7 # 3.htmlPath -Path to html file linking to other outputs |
8 # 4.outPath -Path to folder to write all output to | 8 # 4.outPath -Path to folder to write all output to |
9 # 5.rdaOpt -String specifying if RData should be saved | 9 # 5.rdaOpt -String specifying if RData should be saved |
10 # 6.normOpt -String specifying type of normalisation used | 10 # 6.normOpt -String specifying type of normalisation used |
11 # 7.weightOpt -String specifying usage of weights | 11 # 7.weightOpt -String specifying usage of weights |
13 # 9.cpmReq -Float specifying cpm requirement | 13 # 9.cpmReq -Float specifying cpm requirement |
14 # 10.sampleReq -Integer specifying cpm requirement | 14 # 10.sampleReq -Integer specifying cpm requirement |
15 # 11.pAdjOpt -String specifying the p-value adjustment method | 15 # 11.pAdjOpt -String specifying the p-value adjustment method |
16 # 12.pValReq -Float specifying the p-value requirement | 16 # 12.pValReq -Float specifying the p-value requirement |
17 # 13.lfcReq -Float specifying the log-fold-change requirement | 17 # 13.lfcReq -Float specifying the log-fold-change requirement |
18 # 14.factorData -String containing factor names and values | 18 # 14.normCounts -String specifying if normalised counts should be output |
19 # 15.factPath -Path to factor information file | |
20 # 16.factorData -Strings containing factor names and values if manually input | |
19 # | 21 # |
20 # OUT: Voom Plot | 22 # OUT: Voom Plot |
21 # BCV Plot | 23 # BCV Plot |
22 # MA Plot | 24 # MA Plot |
23 # Top Expression Table | 25 # Expression Table |
24 # HTML file linking to the ouputs | 26 # HTML file linking to the ouputs |
25 # | 27 # |
26 # Author: Shian Su - registertonysu@gmail.com - Jan 2014 | 28 # Author: Shian Su - registertonysu@gmail.com - Jan 2014 |
29 # Modified by: Maria Doyle - Jun 2017 | |
27 | 30 |
28 # Record starting time | 31 # Record starting time |
29 timeStart <- as.character(Sys.time()) | 32 timeStart <- as.character(Sys.time()) |
30 | 33 |
31 # Load all required libraries | 34 # Load all required libraries |
146 cpmReq <- as.numeric(argv[9]) | 149 cpmReq <- as.numeric(argv[9]) |
147 sampleReq <- as.numeric(argv[10]) | 150 sampleReq <- as.numeric(argv[10]) |
148 pAdjOpt <- as.character(argv[11]) | 151 pAdjOpt <- as.character(argv[11]) |
149 pValReq <- as.numeric(argv[12]) | 152 pValReq <- as.numeric(argv[12]) |
150 lfcReq <- as.numeric(argv[13]) | 153 lfcReq <- as.numeric(argv[13]) |
151 factorData <- list() | 154 normCounts <- as.character(argv[14]) |
152 for (i in 14:length(argv)) { | 155 factPath <- as.character(argv[15]) |
153 newFact <- unlist(strsplit(as.character(argv[i]), split="::")) | 156 # Process factors |
154 factorData <- rbind(factorData, newFact) | 157 if (as.character(argv[16])=="None") { |
155 } # Factors have the form: FACT_NAME::LEVEL,LEVEL,LEVEL,LEVEL,... | 158 factorData <- read.table(factPath, header=TRUE, sep="\t") |
156 | 159 factors <- factorData[,-1] |
157 # Process arguments | 160 } else { |
161 factorData <- list() | |
162 for (i in 16:length(argv)) { | |
163 newFact <- unlist(strsplit(as.character(argv[i]), split="::")) | |
164 factorData <- rbind(factorData, newFact) | |
165 } # Factors have the form: FACT_NAME::LEVEL,LEVEL,LEVEL,LEVEL,... The first factor is the Primary Factor. | |
166 | |
167 # Set the row names to be the name of the factor and delete first row | |
168 row.names(factorData) <- factorData[, 1] | |
169 factorData <- factorData[, -1] | |
170 factorData <- sapply(factorData, sanitiseGroups) | |
171 factorData <- sapply(factorData, strsplit, split=",") | |
172 factorData <- sapply(factorData, make.names) | |
173 # Transform factor data into data frame of R factor objects | |
174 factors <- data.frame(factorData) | |
175 | |
176 } | |
177 | |
178 # Process other arguments | |
158 if (weightOpt=="yes") { | 179 if (weightOpt=="yes") { |
159 wantWeight <- TRUE | 180 wantWeight <- TRUE |
160 } else { | 181 } else { |
161 wantWeight <- FALSE | 182 wantWeight <- FALSE |
162 } | 183 } |
171 haveAnno <- FALSE | 192 haveAnno <- FALSE |
172 } else { | 193 } else { |
173 haveAnno <- TRUE | 194 haveAnno <- TRUE |
174 } | 195 } |
175 | 196 |
176 # Set the row names to be the name of the factor and delete first row | 197 if (normCounts=="yes") { |
177 row.names(factorData) <- factorData[, 1] | 198 wantNorm <- TRUE |
178 factorData <- factorData[, -1] | 199 } else { |
179 factorData <- sapply(factorData, sanitiseGroups) | 200 wantNorm <- FALSE |
180 factorData <- sapply(factorData, strsplit, split=",") | 201 } |
181 factorData <- sapply(factorData, make.names) | 202 |
182 | |
183 # Transform factor data into data frame of R factor objects | |
184 factors <- data.frame(factorData) | |
185 | 203 |
186 #Create output directory | 204 #Create output directory |
187 dir.create(outPath, showWarnings=FALSE) | 205 dir.create(outPath, showWarnings=FALSE) |
188 | 206 |
189 # Split up contrasts seperated by comma into a vector then sanitise | 207 # Split up contrasts seperated by comma into a vector then sanitise |
203 for (i in 1:length(contrastData)) { | 221 for (i in 1:length(contrastData)) { |
204 maOutPdf[i] <- makeOut(paste0("maplot_", contrastData[i], ".pdf")) | 222 maOutPdf[i] <- makeOut(paste0("maplot_", contrastData[i], ".pdf")) |
205 maOutPng[i] <- makeOut(paste0("maplot_", contrastData[i], ".png")) | 223 maOutPng[i] <- makeOut(paste0("maplot_", contrastData[i], ".png")) |
206 topOut[i] <- makeOut(paste0("limma-voom_", contrastData[i], ".tsv")) | 224 topOut[i] <- makeOut(paste0("limma-voom_", contrastData[i], ".tsv")) |
207 } # Save output paths for each contrast as vectors | 225 } # Save output paths for each contrast as vectors |
226 normOut <- makeOut("limma-voom_normcounts.tsv") | |
208 rdaOut <- makeOut("RData.rda") | 227 rdaOut <- makeOut("RData.rda") |
209 sessionOut <- makeOut("session_info.txt") | 228 sessionOut <- makeOut("session_info.txt") |
210 | 229 |
211 # Initialise data for html links and images, data frame with columns Label and | 230 # Initialise data for html links and images, data frame with columns Label and |
212 # Link | 231 # Link |
219 upCount <- numeric() | 238 upCount <- numeric() |
220 downCount <- numeric() | 239 downCount <- numeric() |
221 flatCount <- numeric() | 240 flatCount <- numeric() |
222 | 241 |
223 # Read in counts and geneanno data | 242 # Read in counts and geneanno data |
224 counts <- read.table(countPath, header=TRUE, sep="\t") | 243 counts <- read.table(countPath, header=TRUE, sep="\t", stringsAsFactors=FALSE) |
225 row.names(counts) <- counts[, 1] | 244 row.names(counts) <- counts[, 1] |
226 counts <- counts[ , -1] | 245 counts <- counts[ , -1] |
227 countsRows <- nrow(counts) | 246 countsRows <- nrow(counts) |
228 if (haveAnno) { | 247 if (haveAnno) { |
229 geneanno <- read.table(annoPath, header=TRUE, sep="\t") | 248 geneanno <- read.table(annoPath, header=TRUE, sep="\t", stringsAsFactors=FALSE) |
230 } | 249 } |
231 | 250 |
232 ################################################################################ | 251 ################################################################################ |
233 ### Data Processing | 252 ### Data Processing |
234 ################################################################################ | 253 ################################################################################ |
245 # Filter out genes that do not have a required cpm in a required number of | 264 # Filter out genes that do not have a required cpm in a required number of |
246 # samples | 265 # samples |
247 preFilterCount <- nrow(data$counts) | 266 preFilterCount <- nrow(data$counts) |
248 sel <- rowSums(cpm(data$counts) > cpmReq) >= sampleReq | 267 sel <- rowSums(cpm(data$counts) > cpmReq) >= sampleReq |
249 data$counts <- data$counts[sel, ] | 268 data$counts <- data$counts[sel, ] |
250 data$genes <- data$genes[sel, ] | 269 data$genes <- data$genes[sel, ,drop = FALSE] |
251 postFilterCount <- nrow(data$counts) | 270 postFilterCount <- nrow(data$counts) |
252 filteredCount <- preFilterCount-postFilterCount | 271 filteredCount <- preFilterCount-postFilterCount |
253 | 272 |
254 # Creating naming data | 273 # Creating naming data |
255 samplenames <- colnames(data$counts) | 274 samplenames <- colnames(data$counts) |
256 sampleanno <- data.frame("sampleID"=samplenames, factors) | 275 sampleanno <- data.frame("sampleID"=samplenames, factors) |
276 | |
257 | 277 |
258 # Generating the DGEList object "data" | 278 # Generating the DGEList object "data" |
259 data$samples <- sampleanno | 279 data$samples <- sampleanno |
260 data$samples$lib.size <- colSums(data$counts) | 280 data$samples$lib.size <- colSums(data$counts) |
261 data$samples$norm.factors <- 1 | 281 data$samples$norm.factors <- 1 |
262 row.names(data$samples) <- colnames(data$counts) | 282 row.names(data$samples) <- colnames(data$counts) |
263 data <- new("DGEList", data) | 283 data <- new("DGEList", data) |
264 | 284 |
265 factorList <- sapply(names(factors), pasteListName) | 285 factorList <- sapply(names(factors), pasteListName) |
266 formula <- "~0" | 286 |
287 formula <- "~0" | |
267 for (i in 1:length(factorList)) { | 288 for (i in 1:length(factorList)) { |
268 formula <- paste(formula, factorList[i], sep="+") | 289 formula <- paste(formula,factorList[i], sep="+") |
269 } | 290 } |
291 | |
270 formula <- formula(formula) | 292 formula <- formula(formula) |
271 design <- model.matrix(formula) | 293 design <- model.matrix(formula) |
294 | |
272 for (i in 1:length(factorList)) { | 295 for (i in 1:length(factorList)) { |
273 colnames(design) <- gsub(factorList[i], "", colnames(design), fixed=TRUE) | 296 colnames(design) <- gsub(factorList[i], "", colnames(design), fixed=TRUE) |
274 } | 297 } |
275 | 298 |
276 # Calculating normalising factor, estimating dispersion | 299 # Calculating normalising factor, estimating dispersion |
277 data <- calcNormFactors(data, method=normOpt) | 300 data <- calcNormFactors(data, method=normOpt) |
278 #data <- estimateDisp(data, design=design, robust=TRUE) | 301 #data <- estimateDisp(data, design=design, robust=TRUE) |
326 invisible(dev.off()) | 349 invisible(dev.off()) |
327 | 350 |
328 # Generate voom fit | 351 # Generate voom fit |
329 voomFit <- lmFit(vData, design) | 352 voomFit <- lmFit(vData, design) |
330 | 353 |
354 } | |
355 | |
356 # Save normalised counts (log2cpm) | |
357 if (wantNorm) { | |
358 norm_counts <- data.frame(vData$genes, vData$E) | |
359 write.table (norm_counts, file=normOut, row.names=FALSE, sep="\t") | |
360 linkData <- rbind(linkData, c("limma-voom_normcounts.tsv", "limma-voom_normcounts.tsv")) | |
331 } | 361 } |
332 | 362 |
333 # Fit linear model and estimate dispersion with eBayes | 363 # Fit linear model and estimate dispersion with eBayes |
334 voomFit <- contrasts.fit(voomFit, contrasts) | 364 voomFit <- contrasts.fit(voomFit, contrasts) |
335 voomFit <- eBayes(voomFit) | 365 voomFit <- eBayes(voomFit) |
366 | 396 |
367 # Write top expressions table | 397 # Write top expressions table |
368 top <- topTable(voomFit, coef=i, number=Inf, sort.by="P") | 398 top <- topTable(voomFit, coef=i, number=Inf, sort.by="P") |
369 write.table(top, file=topOut[i], row.names=FALSE, sep="\t") | 399 write.table(top, file=topOut[i], row.names=FALSE, sep="\t") |
370 | 400 |
371 linkName <- paste0("limma-voom_", contrastData[i], | 401 linkName <- paste0("limma-voom_", contrastData[i], ".tsv") |
372 ".tsv") | |
373 linkAddr <- paste0("limma-voom_", contrastData[i], ".tsv") | 402 linkAddr <- paste0("limma-voom_", contrastData[i], ".tsv") |
374 linkData <- rbind(linkData, c(linkName, linkAddr)) | 403 linkData <- rbind(linkData, c(linkName, linkAddr)) |
375 | 404 |
376 # Plot MA (log ratios vs mean average) using limma package on weighted data | 405 # Plot MA (log ratios vs mean average) using limma package on weighted |
377 pdf(maOutPdf[i]) | 406 pdf(maOutPdf[i]) |
378 limma::plotMA(voomFit, status=status, coef=i, | 407 limma::plotMA(voomFit, status=status, coef=i, |
379 main=paste("MA Plot:", unmake.names(contrastData[i])), | 408 main=paste("MA Plot:", unmake.names(contrastData[i])), |
380 col=alpha(c("firebrick", "blue"), 0.4), values=c("1", "-1"), | 409 col=alpha(c("firebrick", "blue"), 0.4), values=c("1", "-1"), |
381 xlab="Average Expression", ylab="logFC") | 410 xlab="Average Expression", ylab="logFC") |
532 } | 561 } |
533 cata("</ul>\n") | 562 cata("</ul>\n") |
534 | 563 |
535 cata("<h4>Summary of experimental data:</h4>\n") | 564 cata("<h4>Summary of experimental data:</h4>\n") |
536 | 565 |
537 cata("<p>*CHECK THAT SAMPLES ARE ASSOCIATED WITH CORRECT GROUP*</p>\n") | 566 cata("<p>*CHECK THAT SAMPLES ARE ASSOCIATED WITH CORRECT GROUP(S)*</p>\n") |
538 | 567 |
539 cata("<table border=\"1\" cellpadding=\"3\">\n") | 568 cata("<table border=\"1\" cellpadding=\"3\">\n") |
540 cata("<tr>\n") | 569 cata("<tr>\n") |
541 TableItem() | 570 TableHeadItem("SampleID") |
542 for (i in names(factors)) { | 571 TableHeadItem(names(factors)[1]," (Primary Factor)") |
572 | |
573 for (i in names(factors)[2:length(names(factors))]) { | |
543 TableHeadItem(i) | 574 TableHeadItem(i) |
544 } | 575 } |
545 cata("</tr>\n") | 576 cata("</tr>\n") |
546 | 577 |
547 for (i in 1:nrow(factors)) { | 578 for (i in 1:nrow(factors)) { |
548 cata("<tr>\n") | 579 cata("<tr>\n") |
549 TableHeadItem(row.names(factors)[i]) | 580 TableHeadItem(row.names(factors)[i]) |
550 for (j in ncol(factors)) { | 581 for (j in 1:ncol(factors)) { |
551 TableItem(as.character(unmake.names(factors[i, j]))) | 582 TableItem(as.character(unmake.names(factors[i, j]))) |
552 } | 583 } |
553 cata("</tr>\n") | 584 cata("</tr>\n") |
554 } | 585 } |
555 cata("</table>") | 586 cata("</table>") |