comparison hairpinTool.R @ 13:7aaa9bc23e3c

Added support for paired end reads - Changed terminology to generalise to sgRNA CRISPR experiments. - Added option to include second factor for statistical power - Added option to filter out samples with low counts - Added support for paired end reads - Added option to highlight only positive or negative fold change in smear plot - Fixed bug that caused tool to stop if more than enough sample annotations were supplied
author shian_su <registertonysu@gmail.com>
date Tue, 14 Oct 2014 17:05:07 +1100
parents ebb4cb1e8e35
children 5a917ea5bed2
comparison
equal deleted inserted replaced
12:ebb4cb1e8e35 13:7aaa9bc23e3c
1 # ARGS: 1.inputType -String specifying format of input (fastq or table) 1 # ARGS: 1.inputType -String specifying format of input (fastq or table)
2 # IF inputType is "fastQ": 2 # IF inputType is "fastq" or "pairedFastq:
3 # 2*.fastqPath -One or more strings specifying path to fastq files 3 # 2*.fastqPath -One or more strings specifying path to fastq files
4 # 2.annoPath -String specifying path to hairpin annotation table 4 # 2.annoPath -String specifying path to hairpin annotation table
5 # 3.samplePath -String specifying path to sample annotation table 5 # 3.samplePath -String specifying path to sample annotation table
6 # 4.barStart -Integer specifying starting position of barcode 6 # 4.barStart -Integer specifying starting position of barcode
7 # 5.barEnd -Integer specifying ending position of barcode 7 # 5.barEnd -Integer specifying ending position of barcode
8 # 6.hpStart -Integer specifying startins position of hairpin 8 # ###
9 # IF inputType is "pairedFastq":
10 # 6.barStartRev -Integer specifying starting position of barcode
11 # on reverse end
12 # 7.barEndRev -Integer specifying ending position of barcode
13 # on reverse end
14 # ###
15 # 8.hpStart -Integer specifying startins position of hairpin
9 # unique region 16 # unique region
10 # 7.hpEnd -Integer specifying ending position of hairpin 17 # 9.hpEnd -Integer specifying ending position of hairpin
11 # unique region 18 # unique region
12 # ###
13 # IF inputType is "counts": 19 # IF inputType is "counts":
14 # 2.countPath -String specifying path to count table 20 # 2.countPath -String specifying path to count table
15 # 3.annoPath -String specifying path to hairpin annotation table 21 # 3.annoPath -String specifying path to hairpin annotation table
16 # 4.samplePath -String specifying path to sample annotation table 22 # 4.samplePath -String specifying path to sample annotation table
17 # ### 23 # ###
18 # 8.cpmReq -Float specifying cpm requirement 24 # 10.secFactName -String specifying name of secondary factor
19 # 9.sampleReq -Integer specifying cpm requirement 25 # 11.cpmReq -Float specifying cpm requirement
20 # 10.fdrThresh -Float specifying the FDR requirement 26 # 12.sampleReq -Integer specifying cpm requirement
21 # 11.lfcThresh -Float specifying the log-fold-change requirement 27 # 13.readReq -Integer specifying read requirement
22 # 12.workMode -String specifying exact test or GLM usage 28 # 14.fdrThresh -Float specifying the FDR requirement
23 # 13.htmlPath -String specifying path to HTML file 29 # 15.lfcThresh -Float specifying the log-fold-change requirement
24 # 14.folderPath -STring specifying path to folder for output 30 # 16.workMode -String specifying exact test or GLM usage
31 # 17.htmlPath -String specifying path to HTML file
32 # 18.folderPath -String specifying path to folder for output
25 # IF workMode is "classic" (exact test) 33 # IF workMode is "classic" (exact test)
26 # 15.pairData[2] -String specifying first group for exact test 34 # 19.pairData[2] -String specifying first group for exact test
27 # 16.pairData[1] -String specifying second group for exact test 35 # 20.pairData[1] -String specifying second group for exact test
28 # ### 36 # ###
29 # IF workMode is "glm" 37 # IF workMode is "glm"
30 # 15.contrastData -String specifying contrasts to be made 38 # 19.contrastData -String specifying contrasts to be made
31 # 16.roastOpt -String specifying usage of gene-wise tests 39 # 20.roastOpt -String specifying usage of gene-wise tests
32 # 17.hairpinReq -String specifying hairpin requirement for gene- 40 # 21.hairpinReq -String specifying hairpin requirement for gene-
33 # wise test 41 # wise test
34 # 18.selectOpt -String specifying type of selection for barcode 42 # 22.selectOpt -String specifying type of selection for barcode
35 # plots 43 # plots
36 # 19.selectVals -String specifying members selected for barcode 44 # 23.selectVals -String specifying members selected for barcode
37 # plots 45 # plots
38 # ### 46 # ###
39 # 47 #
40 # OUT: Bar Plot of Counts Per Index 48 # OUT: Bar Plot of Counts Per Index
41 # Bar Plot of Counts Per Hairpin 49 # Bar Plot of Counts Per Hairpin
42 # MDS Plot 50 # MDS Plot
51 # BCV Plot
43 # Smear Plot 52 # Smear Plot
44 # Barcode Plots (If Genewise testing was selected) 53 # Barcode Plots (If Genewise testing was selected)
45 # Top Expression Table 54 # Top Expression Table
46 # Feature Counts Table 55 # Feature Counts Table
47 # HTML file linking to the ouputs 56 # HTML file linking to the ouputs
56 library(statmod, quietly=TRUE, warn.conflicts=FALSE) 65 library(statmod, quietly=TRUE, warn.conflicts=FALSE)
57 library(splines, quietly=TRUE, warn.conflicts=FALSE) 66 library(splines, quietly=TRUE, warn.conflicts=FALSE)
58 library(edgeR, quietly=TRUE, warn.conflicts=FALSE) 67 library(edgeR, quietly=TRUE, warn.conflicts=FALSE)
59 library(limma, quietly=TRUE, warn.conflicts=FALSE) 68 library(limma, quietly=TRUE, warn.conflicts=FALSE)
60 69
61 if (packageVersion("edgeR") < "3.5.27") { 70 if (packageVersion("edgeR") < "3.7.17") {
62 stop("Please update 'edgeR' to version >= 3.5.23 to run this tool") 71 stop("Please update 'edgeR' to version >= 3.7.17 to run this tool")
63 } 72 }
64 73
65 if (packageVersion("limma")<"3.19.19") { 74 if (packageVersion("limma")<"3.21.16") {
66 message("Update 'limma' to version >= 3.19.19 to see updated barcode graphs") 75 message("Update 'limma' to version >= 3.21.16 to see updated barcode graphs")
67 } 76 }
68 77
69 ################################################################################ 78 ################################################################################
70 ### Function declarations 79 ### Function declarations
71 ################################################################################ 80 ################################################################################
169 ################################################################################ 178 ################################################################################
170 179
171 # Grabbing arguments from command line 180 # Grabbing arguments from command line
172 argv <- commandArgs(TRUE) 181 argv <- commandArgs(TRUE)
173 182
174 # Remove fastq file paths after collecting from argument vector
175 inputType <- as.character(argv[1]) 183 inputType <- as.character(argv[1])
176 if (inputType=="fastq") { 184 if (inputType == "fastq") {
185
177 fastqPath <- as.character(gsub("fastq::", "", argv[grepl("fastq::", argv)], 186 fastqPath <- as.character(gsub("fastq::", "", argv[grepl("fastq::", argv)],
178 fixed=TRUE)) 187 fixed=TRUE))
179 argv <- argv[!grepl("fastq::", argv, fixed=TRUE)] 188
189 # Remove fastq paths
190 argv <- argv[!grepl("fastq::", argv, fixed=TRUE)]
191
192 fastqPathRev <- NULL
180 annoPath <- as.character(argv[2]) 193 annoPath <- as.character(argv[2])
181 samplePath <- as.character(argv[3]) 194 samplePath <- as.character(argv[3])
182 barStart <- as.numeric(argv[4]) 195 barStart <- as.numeric(argv[4])
183 barEnd <- as.numeric(argv[5]) 196 barEnd <- as.numeric(argv[5])
184 hpStart <- as.numeric(argv[6]) 197 barStartRev <- NULL
185 hpEnd <- as.numeric(argv[7]) 198 barStartRev <- NULL
186 } else if (inputType=="counts") { 199 hpStart <- as.numeric(argv[8])
200 hpEnd <- as.numeric(argv[9])
201 } else if (inputType=="pairedFastq") {
202
203 fastqPath <- as.character(gsub("fastq::", "", argv[grepl("fastq::", argv)],
204 fixed=TRUE))
205
206 fastqPathRev <- as.character(gsub("fastqRev::", "",
207 argv[grepl("fastqRev::", argv)], fixed=TRUE))
208
209 # Remove fastq paths
210 argv <- argv[!grepl("fastq::", argv, fixed=TRUE)]
211 argv <- argv[!grepl("fastqRev::", argv, fixed=TRUE)]
212
213 annoPath <- as.character(argv[2])
214 samplePath <- as.character(argv[3])
215 barStart <- as.numeric(argv[4])
216 barEnd <- as.numeric(argv[5])
217 barStartRev <- as.numeric(argv[6])
218 barEndRev <- as.numeric(argv[7])
219 hpStart <- as.numeric(argv[8])
220 hpEnd <- as.numeric(argv[9])
221 } else if (inputType == "counts") {
187 countPath <- as.character(argv[2]) 222 countPath <- as.character(argv[2])
188 annoPath <- as.character(argv[3]) 223 annoPath <- as.character(argv[3])
189 samplePath <- as.character(argv[4]) 224 samplePath <- as.character(argv[4])
190 } 225 }
191 226
192 cpmReq <- as.numeric(argv[8]) 227 secFactName <- as.character(argv[10])
193 sampleReq <- as.numeric(argv[9]) 228 cpmReq <- as.numeric(argv[11])
194 fdrThresh <- as.numeric(argv[10]) 229 sampleReq <- as.numeric(argv[12])
195 lfcThresh <- as.numeric(argv[11]) 230 readReq <- as.numeric(argv[13])
196 workMode <- as.character(argv[12]) 231 fdrThresh <- as.numeric(argv[14])
197 htmlPath <- as.character(argv[13]) 232 lfcThresh <- as.numeric(argv[15])
198 folderPath <- as.character(argv[14]) 233 selectDirection <- as.character(argv[16])
199 234 workMode <- as.character(argv[17])
200 if (workMode=="classic") { 235 htmlPath <- as.character(argv[18])
236 folderPath <- as.character(argv[19])
237
238 if (workMode == "classic") {
201 pairData <- character() 239 pairData <- character()
202 pairData[2] <- as.character(argv[15]) 240 pairData[2] <- as.character(argv[20])
203 pairData[1] <- as.character(argv[16]) 241 pairData[1] <- as.character(argv[21])
204 } else if (workMode=="glm") { 242 } else if (workMode == "glm") {
205 contrastData <- as.character(argv[15]) 243 contrastData <- as.character(argv[20])
206 roastOpt <- as.character(argv[16]) 244 roastOpt <- as.character(argv[21])
207 hairpinReq <- as.numeric(argv[17]) 245 hairpinReq <- as.numeric(argv[22])
208 selectOpt <- as.character(argv[18]) 246 selectOpt <- as.character(argv[23])
209 selectVals <- as.character(argv[19]) 247 selectVals <- as.character(argv[24])
210 } 248 }
211 249
212 # Read in inputs 250 # Read in inputs
213 251
214 samples <- read.table(samplePath, header=TRUE, sep="\t") 252 samples <- read.table(samplePath, header=TRUE, sep="\t")
253
215 anno <- read.table(annoPath, header=TRUE, sep="\t") 254 anno <- read.table(annoPath, header=TRUE, sep="\t")
216 if (inputType=="counts") { 255
256 if (inputType == "counts") {
217 counts <- read.table(countPath, header=TRUE, sep="\t") 257 counts <- read.table(countPath, header=TRUE, sep="\t")
218 } 258 }
219 259
220 ###################### Check inputs for correctness ############################ 260 ###################### Check inputs for correctness ############################
221 samples$ID <- make.names(samples$ID) 261 samples$ID <- make.names(samples$ID)
222 262
223 if (!any(grepl("group", names(samples)))) { 263 if ( !any(grepl("group", names(samples))) ) {
224 stop("'group' column not specified in sample annotation file") 264 stop("'group' column not specified in sample annotation file")
225 } # Check if grouping variable has been specified 265 } # Check if grouping variable has been specified
226 266
227 if (any(table(samples$ID)>1)){ 267 if (secFactName != "none") {
268 if ( !any(grepl(secFactName, names(samples))) ) {
269 tempStr <- paste0("Second factor specified as \"", secFactName, "\" but ",
270 "no such column name found in sample annotation file")
271 stop(tempStr)
272 } # Check if specified secondary factor is present
273 }
274
275
276 if ( any(table(samples$ID) > 1) ){
228 tab <- table(samples$ID) 277 tab <- table(samples$ID)
229 offenders <- paste(names(tab[tab>1]), collapse=", ") 278 offenders <- paste(names(tab[tab > 1]), collapse=", ")
230 offenders <- unmake.names(offenders) 279 offenders <- unmake.names(offenders)
231 stop("'ID' column of sample annotation must have unique values, values ", 280 stop("'ID' column of sample annotation must have unique values, values ",
232 offenders, " are repeated") 281 offenders, " are repeated")
233 } # Check that IDs in sample annotation are unique 282 } # Check that IDs in sample annotation are unique
234 283
235 if (inputType=="fastq") { 284 if (inputType == "fastq" || inputType == "pairedFastq") {
236 285
237 if (any(table(anno$ID)>1)){ 286 if ( any(table(anno$ID) > 1) ){
238 tab <- table(anno$ID) 287 tab <- table(anno$ID)
239 offenders <- paste(names(tab[tab>1]), collapse=", ") 288 offenders <- paste(names(tab[tab>1]), collapse=", ")
240 stop("'ID' column of hairpin annotation must have unique values, values ", 289 stop("'ID' column of hairpin annotation must have unique values, values ",
241 offenders, " are repeated") 290 offenders, " are repeated")
242 } # Check that IDs in hairpin annotation are unique 291 } # Check that IDs in hairpin annotation are unique
243 292
244 } else if (inputType=="counts") { 293 } else if (inputType == "counts") {
245 if (any(is.na(match(samples$ID, colnames(counts))))) { 294 # The first element of the colnames will be 'ID' and should not match
295 idFromSample <- samples$ID
296 idFromTable <- colnames(counts)[-1]
297 if (any(is.na(match(idFromTable, idFromSample)))) {
246 stop("not all samples have groups specified") 298 stop("not all samples have groups specified")
247 } # Check that a group has be specifed for each sample 299 } # Check that a group has be specifed for each sample
248 300
249 if (any(table(counts$ID)>1)){ 301 if ( any(table(counts$ID) > 1) ){
250 tab <- table(counts$ID) 302 tab <- table(counts$ID)
251 offenders <- paste(names(tab[tab>1]), collapse=", ") 303 offenders <- paste(names(tab[tab>1]), collapse=", ")
252 stop("'ID' column of count table must have unique values, values ", 304 stop("'ID' column of count table must have unique values, values ",
253 offenders, " are repeated") 305 offenders, " are repeated")
254 } # Check that IDs in count table are unique 306 } # Check that IDs in count table are unique
255 } 307 }
256 if (workMode=="glm") { 308 if (workMode == "glm") {
257 if (roastOpt == "yes") { 309 if (roastOpt == "yes") {
258 if (is.na(match("Gene", colnames(anno)))) { 310 if (is.na(match("Gene", colnames(anno)))) {
259 tempStr <- paste("Gene-wise tests selected but'Gene' column not", 311 tempStr <- paste("Gene-wise tests selected but'Gene' column not",
260 "specified in hairpin annotation file") 312 "specified in hairpin annotation file")
261 stop(tempStr) 313 stop(tempStr)
262 } 314 }
263 } 315 }
264 } 316 }
265 317
318 if (secFactName != "none") {
319 if (workMode != "glm") {
320 tempStr <- paste("only glm analysis type possible when secondary factor",
321 "used, please change appropriate option.")
322 }
323 }
324
266 ################################################################################ 325 ################################################################################
267 326
268 # Process arguments 327 # Process arguments
269 if (workMode=="glm") { 328 if (workMode == "glm") {
270 if (roastOpt=="yes") { 329 if (roastOpt == "yes") {
271 wantRoast <- TRUE 330 wantRoast <- TRUE
272 } else { 331 } else {
273 wantRoast <- FALSE 332 wantRoast <- FALSE
274 } 333 }
275 } 334 }
297 imgOut("bcv") 356 imgOut("bcv")
298 if (workMode == "classic") { 357 if (workMode == "classic") {
299 smearPng <- makeOut(paste0("smear(", pairData[2], "-", pairData[1],").png")) 358 smearPng <- makeOut(paste0("smear(", pairData[2], "-", pairData[1],").png"))
300 smearPdf <- makeOut(paste0("smear(", pairData[2], "-", pairData[1],").pdf")) 359 smearPdf <- makeOut(paste0("smear(", pairData[2], "-", pairData[1],").pdf"))
301 topOut <- makeOut(paste0("toptag(", pairData[2], "-", pairData[1],").tsv")) 360 topOut <- makeOut(paste0("toptag(", pairData[2], "-", pairData[1],").tsv"))
302 } else if (workMode=="glm") { 361 } else if (workMode == "glm") {
303 smearPng <- character() 362 smearPng <- character()
304 smearPdf <- character() 363 smearPdf <- character()
305 topOut <- character() 364 topOut <- character()
306 roastOut <- character() 365 roastOut <- character()
307 barcodePng <- character() 366 barcodePng <- character()
333 ################################################################################ 392 ################################################################################
334 ### Data Processing 393 ### Data Processing
335 ################################################################################ 394 ################################################################################
336 395
337 # Transform gene selection from string into index values for mroast 396 # Transform gene selection from string into index values for mroast
338 if (workMode=="glm") { 397 if (workMode == "glm") {
339 if (selectOpt=="rank") { 398 if (selectOpt == "rank") {
340 selectVals <- gsub(" ", "", selectVals, fixed=TRUE) 399 selectVals <- gsub(" ", "", selectVals, fixed=TRUE)
341 selectVals <- unlist(strsplit(selectVals, ",")) 400 selectVals <- unlist(strsplit(selectVals, ","))
342 401
343 for (i in 1:length(selectVals)) { 402 for (i in 1:length(selectVals)) {
344 if (grepl(":", selectVals[i], fixed=TRUE)) { 403 if (grepl(":", selectVals[i], fixed=TRUE)) {
354 selectVals <- gsub(" ", "", selectVals, fixed=TRUE) 413 selectVals <- gsub(" ", "", selectVals, fixed=TRUE)
355 selectVals <- unlist(strsplit(selectVals, ",")) 414 selectVals <- unlist(strsplit(selectVals, ","))
356 } 415 }
357 } 416 }
358 417
359 if (inputType=="fastq") { 418 if (inputType == "fastq" || inputType == "pairedFastq") {
360 # Use EdgeR hairpin process and capture outputs 419 # Use EdgeR hairpin process and capture outputs
361 hpReadout <- capture.output( 420 hpReadout <- capture.output(
362 data <- processHairpinReads(readfile=fastqPath, 421 data <- processAmplicons(readfile=fastqPath, readfile2=fastqPathRev,
363 barcodefile=samplePath, 422 barcodefile=samplePath,
364 hairpinfile=annoPath, 423 hairpinfile=annoPath,
365 barcodeStart=barStart, barcodeEnd=barEnd, 424 barcodeStart=barStart, barcodeEnd=barEnd,
366 hairpinStart=hpStart, hairpinEnd=hpEnd, 425 barcodeStartRev=barStartRev,
367 verbose=TRUE) 426 barcodeEndRev=barEndRev,
427 hairpinStart=hpStart, hairpinEnd=hpEnd,
428 verbose=TRUE)
368 ) 429 )
369 430
370 # Remove function output entries that show processing data or is empty 431 # Remove function output entries that show processing data or is empty
371 hpReadout <- hpReadout[hpReadout!=""] 432 hpReadout <- hpReadout[hpReadout!=""]
372 hpReadout <- hpReadout[!grepl("Processing", hpReadout)] 433 hpReadout <- hpReadout[!grepl("Processing", hpReadout)]
373 hpReadout <- hpReadout[!grepl("in file", hpReadout)] 434 hpReadout <- hpReadout[!grepl("in file", hpReadout)]
374 hpReadout <- gsub(" -- ", "", hpReadout, fixed=TRUE) 435 hpReadout <- gsub(" -- ", "", hpReadout, fixed=TRUE)
375 436
376 # Make the names of groups syntactically valid (replace spaces with periods) 437 # Make the names of groups syntactically valid (replace spaces with periods)
377 data$samples$group <- make.names(data$samples$group) 438 data$samples$group <- make.names(data$samples$group)
378 } else if (inputType=="counts") { 439 if (secFactName != "none") {
440 data$samples[[secFactName]] <- make.names(data$samples[[secFactName]])
441 }
442 } else if (inputType == "counts") {
379 # Process counts information, set ID column to be row names 443 # Process counts information, set ID column to be row names
380 rownames(counts) <- counts$ID 444 rownames(counts) <- counts$ID
381 counts <- counts[ , !(colnames(counts)=="ID")] 445 counts <- counts[ , !(colnames(counts) == "ID")]
382 countsRows <- nrow(counts) 446 countsRows <- nrow(counts)
383 447
384 # Process group information 448 # Process group information
385 factors <- samples$group[match(samples$ID, colnames(counts))] 449 sampleNames <- colnames(counts)
450 matchedIndex <- match(sampleNames, samples$ID)
451 factors <- samples$group[matchedIndex]
452
453 if (secFactName != "none") {
454 secFactors <- samples[[secFactName]][matchedIndex]
455 }
456
386 annoRows <- nrow(anno) 457 annoRows <- nrow(anno)
387 anno <- anno[match(rownames(counts), anno$ID), ] 458 anno <- anno[match(rownames(counts), anno$ID), ]
388 annoMatched <- sum(!is.na(anno$ID)) 459 annoMatched <- sum(!is.na(anno$ID))
389 460
390 if (any(is.na(anno$ID))) { 461 if (any(is.na(anno$ID))) {
414 data$samples <- data$samples[sampleSel, ] 485 data$samples <- data$samples[sampleSel, ]
415 } 486 }
416 487
417 # Filter hairpins with low counts 488 # Filter hairpins with low counts
418 preFilterCount <- nrow(data) 489 preFilterCount <- nrow(data)
419 sel <- rowSums(cpm(data$counts) > cpmReq) >= sampleReq 490 selRow <- rowSums(cpm(data$counts) > cpmReq) >= sampleReq
420 data <- data[sel, ] 491 selCol <- colSums(data$counts) > readReq
492 data <- data[selRow, selCol]
493
494 # Check if any data survived filtering
495 if (length(data$counts) == 0) {
496 stop("no data remains after filtering, consider relaxing filters")
497 }
498
499 # Count number of filtered tags and samples
421 postFilterCount <- nrow(data) 500 postFilterCount <- nrow(data)
422 filteredCount <- preFilterCount-postFilterCount 501 filteredCount <- preFilterCount - postFilterCount
423 502 sampleFilterCount <- sum(!selCol)
424 # Estimate dispersions 503
425 data <- estimateDisp(data) 504 if (secFactName == "none") {
426 commonBCV <- sqrt(data$common.dispersion) 505 # Estimate dispersions
506 data <- estimateDisp(data)
507 commonBCV <- round(sqrt(data$common.dispersion), 4)
508 } else {
509 # Construct design
510 if (inputType == "counts") {
511
512 sampleNames <- colnames(counts)
513 matchedIndex <- match(sampleNames, samples$ID)
514 factors <- factor(make.names(samples$group[matchedIndex]))
515
516 secFactors <- factor(make.names(samples[[secFactName]][matchedIndex]))
517
518 } else if (inputType == "fastq" || inputType == "pairedFastq") {
519
520 factors <- factor(data$sample$group)
521 secFactors <- factor(data$sample[[secFactName]])
522
523 }
524
525 design <- model.matrix(~0 + factors + secFactors)
526
527 # Estimate dispersions
528 data <- estimateDisp(data, design=design)
529 commonBCV <- round(sqrt(data$common.dispersion), 4)
530 }
531
427 532
428 ################################################################################ 533 ################################################################################
429 ### Output Processing 534 ### Output Processing
430 ################################################################################ 535 ################################################################################
431 536
492 plotBCV(data, main="BCV Plot") 597 plotBCV(data, main="BCV Plot")
493 newEntry <- c("BCV Plot (.pdf)", "bcv.pdf") 598 newEntry <- c("BCV Plot (.pdf)", "bcv.pdf")
494 linkData <- rbind(linkData, newEntry) 599 linkData <- rbind(linkData, newEntry)
495 invisible(dev.off()) 600 invisible(dev.off())
496 601
497 if (workMode=="classic") { 602 if (workMode == "classic") {
498 # Assess differential representation using classic exact testing methodology 603 # Assess differential representation using classic exact testing methodology
499 # in edgeR 604 # in edgeR
500 testData <- exactTest(data, pair=pairData) 605 testData <- exactTest(data, pair=pairData)
501 606
502 top <- topTags(testData, n=Inf) 607 top <- topTags(testData, n=Inf)
608
609 if (selectDirection == "all") {
610 topIDs <- top$table[(top$table$FDR < fdrThresh) &
611 (abs(top$table$logFC) > lfcThresh), 1]
612 } else if (selectDirection == "up") {
613 topIDs <- top$table[(top$table$FDR < fdrThresh) &
614 (top$table$logFC > lfcThresh), 1]
615 } else if (selectDirection == "down") {
503 topIDs <- top$table[(top$table$FDR < fdrThresh) & 616 topIDs <- top$table[(top$table$FDR < fdrThresh) &
504 (abs(top$table$logFC) > lfcThresh), 1] 617 (top$table$logFC < -lfcThresh), 1]
618 }
505 619
506 write.table(top, file=topOut, row.names=FALSE, sep="\t") 620 write.table(top, file=topOut, row.names=FALSE, sep="\t")
507 621
508 linkName <- paste0("Top Tags Table(", pairData[2], "-", pairData[1], 622 linkName <- paste0("Top Tags Table(", pairData[2], "-", pairData[1],
509 ") (.tsv)") 623 ") (.tsv)")
510 linkAddr <- paste0("toptag(", pairData[2], "-", pairData[1], ").tsv") 624 linkAddr <- paste0("toptag(", pairData[2], "-", pairData[1], ").tsv")
511 linkData <- rbind(linkData, c(linkName, linkAddr)) 625 linkData <- rbind(linkData, c(linkName, linkAddr))
512 626
513 upCount[1] <- sum(top$table$FDR < fdrThresh & top$table$logFC > lfcThresh) 627 upCount[1] <- sum(top$table$FDR < fdrThresh & top$table$logFC > lfcThresh)
628
514 downCount[1] <- sum(top$table$FDR < fdrThresh & 629 downCount[1] <- sum(top$table$FDR < fdrThresh &
515 top$table$logFC < -lfcThresh) 630 top$table$logFC < -lfcThresh)
631
516 flatCount[1] <- sum(top$table$FDR > fdrThresh | 632 flatCount[1] <- sum(top$table$FDR > fdrThresh |
517 abs(top$table$logFC) < lfcThresh) 633 abs(top$table$logFC) < lfcThresh)
518 634
519 635
520 636
541 imgName <- paste0("Smear Plot(", pairData[2], "-", pairData[1], ") (.pdf)") 657 imgName <- paste0("Smear Plot(", pairData[2], "-", pairData[1], ") (.pdf)")
542 imgAddr <- paste0("smear(", pairData[2], "-", pairData[1], ").pdf") 658 imgAddr <- paste0("smear(", pairData[2], "-", pairData[1], ").pdf")
543 linkData <- rbind(linkData, c(imgName, imgAddr)) 659 linkData <- rbind(linkData, c(imgName, imgAddr))
544 invisible(dev.off()) 660 invisible(dev.off())
545 661
546 } else if (workMode=="glm") { 662 } else if (workMode == "glm") {
547 # Generating design information 663 # Generating design information
548 factors <- factor(data$sample$group) 664 if (secFactName == "none") {
549 design <- model.matrix(~0+factors) 665
550 666 factors <- factor(data$sample$group)
551 colnames(design) <- gsub("factors", "", colnames(design), fixed=TRUE) 667 design <- model.matrix(~0 + factors)
668
669 colnames(design) <- gsub("factors", "", colnames(design), fixed=TRUE)
670
671 } else {
672
673 factors <- factor(data$sample$group)
674
675 if (inputType == "counts") {
676
677 sampleNames <- colnames(counts)
678 matchedIndex <- match(sampleNames, samples$ID)
679 factors <- factor(samples$group[matchedIndex])
680
681 secFactors <- factor(samples[[secFactName]][matchedIndex])
682
683 } else if (inputType == "fastq" || inputType == "pairedFastq") {
684
685 secFactors <- factor(data$sample[[secFactName]])
686
687 }
688
689 design <- model.matrix(~0 + factors + secFactors)
690
691 colnames(design) <- gsub("factors", "", colnames(design), fixed=TRUE)
692 colnames(design) <- gsub("secFactors", secFactName, colnames(design),
693 fixed=TRUE)
694 }
695
552 696
553 # Split up contrasts seperated by comma into a vector 697 # Split up contrasts seperated by comma into a vector
554 contrastData <- unlist(strsplit(contrastData, split=",")) 698 contrastData <- unlist(strsplit(contrastData, split=","))
555 699
556 for (i in 1:length(contrastData)) { 700 for (i in 1:length(contrastData)) {
562 # Carry out Likelihood ratio test 706 # Carry out Likelihood ratio test
563 testData <- glmLRT(fit, contrast=contrasts) 707 testData <- glmLRT(fit, contrast=contrasts)
564 708
565 # Select hairpins with FDR < 0.05 to highlight on plot 709 # Select hairpins with FDR < 0.05 to highlight on plot
566 top <- topTags(testData, n=Inf) 710 top <- topTags(testData, n=Inf)
567 topIDs <- top$table[(top$table$FDR < fdrThresh) & 711
712 if (selectDirection == "all") {
713 topIDs <- top$table[(top$table$FDR < fdrThresh) &
568 (abs(top$table$logFC) > lfcThresh), 1] 714 (abs(top$table$logFC) > lfcThresh), 1]
715 } else if (selectDirection == "up") {
716 topIDs <- top$table[(top$table$FDR < fdrThresh) &
717 (top$table$logFC > lfcThresh), 1]
718 } else if (selectDirection == "down") {
719 topIDs <- top$table[(top$table$FDR < fdrThresh) &
720 (top$table$logFC < -lfcThresh), 1]
721 }
722
569 write.table(top, file=topOut[i], row.names=FALSE, sep="\t") 723 write.table(top, file=topOut[i], row.names=FALSE, sep="\t")
570 724
571 linkName <- paste0("Top Tags Table(", contrastData[i], ") (.tsv)") 725 linkName <- paste0("Top Tags Table(", contrastData[i], ") (.tsv)")
572 linkAddr <- paste0("toptag(", contrastData[i], ").tsv") 726 linkAddr <- paste0("toptag(", contrastData[i], ").tsv")
573 linkData <- rbind(linkData, c(linkName, linkAddr)) 727 linkData <- rbind(linkData, c(linkName, linkAddr))
605 genes <- as.character(data$genes$Gene) 759 genes <- as.character(data$genes$Gene)
606 unq <- unique(genes) 760 unq <- unique(genes)
607 unq <- unq[!is.na(unq)] 761 unq <- unq[!is.na(unq)]
608 geneList <- list() 762 geneList <- list()
609 for (gene in unq) { 763 for (gene in unq) {
610 if (length(which(genes==gene)) >= hairpinReq) { 764 if (length(which(genes == gene)) >= hairpinReq) {
611 geneList[[gene]] <- which(genes==gene) 765 geneList[[gene]] <- which(genes == gene)
612 } 766 }
613 } 767 }
614 768
615 if (wantRoast) { 769 if (wantRoast) {
616 # Input preparaton for roast 770 # Input preparaton for roast
622 write.table(roastData, file=roastOut[i], row.names=FALSE, sep="\t") 776 write.table(roastData, file=roastOut[i], row.names=FALSE, sep="\t")
623 linkName <- paste0("Gene Level Analysis Table(", contrastData[i], 777 linkName <- paste0("Gene Level Analysis Table(", contrastData[i],
624 ") (.tsv)") 778 ") (.tsv)")
625 linkAddr <- paste0("gene_level(", contrastData[i], ").tsv") 779 linkAddr <- paste0("gene_level(", contrastData[i], ").tsv")
626 linkData <- rbind(linkData, c(linkName, linkAddr)) 780 linkData <- rbind(linkData, c(linkName, linkAddr))
627 if (selectOpt=="rank") { 781 if (selectOpt == "rank") {
628 selectedGenes <- rownames(roastData)[selectVals] 782 selectedGenes <- rownames(roastData)[selectVals]
629 } else { 783 } else {
630 selectedGenes <- selectVals 784 selectedGenes <- selectVals
631 } 785 }
632 786
666 } 820 }
667 821
668 # Generate data frame of the significant differences 822 # Generate data frame of the significant differences
669 sigDiff <- data.frame(Up=upCount, Flat=flatCount, Down=downCount) 823 sigDiff <- data.frame(Up=upCount, Flat=flatCount, Down=downCount)
670 if (workMode == "glm") { 824 if (workMode == "glm") {
825
671 row.names(sigDiff) <- contrastData 826 row.names(sigDiff) <- contrastData
827
672 } else if (workMode == "classic") { 828 } else if (workMode == "classic") {
829
673 row.names(sigDiff) <- paste0(pairData[2], "-", pairData[1]) 830 row.names(sigDiff) <- paste0(pairData[2], "-", pairData[1])
831
674 } 832 }
675 833
676 # Output table of summarised counts 834 # Output table of summarised counts
677 ID <- rownames(data$counts) 835 ID <- rownames(data$counts)
678 outputCounts <- cbind(ID, data$counts) 836 outputCounts <- cbind(ID, data$counts)
700 HtmlHead("EdgeR Output") 858 HtmlHead("EdgeR Output")
701 859
702 cata("<body>\n") 860 cata("<body>\n")
703 cata("<h3>EdgeR Analysis Output:</h3>\n") 861 cata("<h3>EdgeR Analysis Output:</h3>\n")
704 cata("<h4>Input Summary:</h4>\n") 862 cata("<h4>Input Summary:</h4>\n")
705 if (inputType=="fastq") { 863 if (inputType == "fastq" || inputType == "pairedFastq") {
864
706 cata("<ul>\n") 865 cata("<ul>\n")
707 ListItem(hpReadout[1]) 866 ListItem(hpReadout[1])
708 ListItem(hpReadout[2]) 867 ListItem(hpReadout[2])
709 cata("</ul>\n") 868 cata("</ul>\n")
710 cata(hpReadout[3], "<br />\n") 869 cata(hpReadout[3], "<br />\n")
714 cata("</ul>\n") 873 cata("</ul>\n")
715 cata(hpReadout[8:11], sep="<br />\n") 874 cata(hpReadout[8:11], sep="<br />\n")
716 cata("<br />\n") 875 cata("<br />\n")
717 cata("<b>Please check that read percentages are consistent with ") 876 cata("<b>Please check that read percentages are consistent with ")
718 cata("expectations.</b><br >\n") 877 cata("expectations.</b><br >\n")
719 } else if (inputType=="counts") { 878
879 } else if (inputType == "counts") {
880
720 cata("<ul>\n") 881 cata("<ul>\n")
721 ListItem("Number of Samples: ", ncol(data$counts)) 882 ListItem("Number of Samples: ", ncol(data$counts))
722 ListItem("Number of Hairpins: ", countsRows) 883 ListItem("Number of Hairpins: ", countsRows)
723 ListItem("Number of annotations provided: ", annoRows) 884 ListItem("Number of annotations provided: ", annoRows)
724 ListItem("Number of annotations matched to hairpin: ", annoMatched) 885 ListItem("Number of annotations matched to hairpin: ", annoMatched)
725 cata("</ul>\n") 886 cata("</ul>\n")
887
726 } 888 }
727 889
728 cata("The estimated common biological coefficient of variation (BCV) is: ", 890 cata("The estimated common biological coefficient of variation (BCV) is: ",
729 commonBCV, "<br />\n") 891 commonBCV, "<br />\n")
892
893 if (secFactName == "none") {
894
895 cata("No secondary factor specified.<br />\n")
896
897 } else {
898
899 cata("Secondary factor specified as: ", secFactName, "<br />\n")
900
901 }
730 902
731 cata("<h4>Output:</h4>\n") 903 cata("<h4>Output:</h4>\n")
732 cata("PDF copies of JPEGS available in 'Plots' section.<br />\n") 904 cata("PDF copies of JPEGS available in 'Plots' section.<br />\n")
733 for (i in 1:nrow(imageData)) { 905 for (i in 1:nrow(imageData)) {
734 if (grepl("barcode", imageData$Link[i])) { 906 if (grepl("barcode", imageData$Link[i])) {
907
735 if (packageVersion("limma")<"3.19.19") { 908 if (packageVersion("limma")<"3.19.19") {
909
736 HtmlImage(imageData$Link[i], imageData$Label[i], 910 HtmlImage(imageData$Link[i], imageData$Label[i],
737 height=length(selectedGenes)*150) 911 height=length(selectedGenes)*150)
912
738 } else { 913 } else {
914
739 HtmlImage(imageData$Link[i], imageData$Label[i], 915 HtmlImage(imageData$Link[i], imageData$Label[i],
740 height=length(selectedGenes)*300) 916 height=length(selectedGenes)*300)
917
741 } 918 }
742 } else { 919 } else {
920
743 HtmlImage(imageData$Link[i], imageData$Label[i]) 921 HtmlImage(imageData$Link[i], imageData$Label[i])
922
744 } 923 }
745 } 924 }
746 cata("<br />\n") 925 cata("<br />\n")
747 926
748 cata("<h4>Differential Representation Counts:</h4>\n") 927 cata("<h4>Differential Representation Counts:</h4>\n")
777 HtmlLink(linkData$Link[i], linkData$Label[i]) 956 HtmlLink(linkData$Link[i], linkData$Label[i])
778 } 957 }
779 } 958 }
780 959
781 cata("<p>Alt-click links to download file.</p>\n") 960 cata("<p>Alt-click links to download file.</p>\n")
782 cata("<p>Click floppy disc icon associated history item to download ") 961 cata("<p>Click floppy disc icon on associated history item to download ")
783 cata("all files.</p>\n") 962 cata("all files.</p>\n")
784 cata("<p>.tsv files can be viewed in Excel or any spreadsheet program.</p>\n") 963 cata("<p>.tsv files can be viewed in Excel or any spreadsheet program.</p>\n")
785 964
786 cata("<h4>Additional Information:</h4>\n") 965 cata("<h4>Additional Information:</h4>\n")
787 966
788 if (inputType == "fastq") { 967 if (inputType == "fastq") {
968
789 ListItem("Data was gathered from fastq raw read file(s).") 969 ListItem("Data was gathered from fastq raw read file(s).")
970
790 } else if (inputType == "counts") { 971 } else if (inputType == "counts") {
972
791 ListItem("Data was gathered from a table of counts.") 973 ListItem("Data was gathered from a table of counts.")
792 } 974
793 975 }
794 if (cpmReq!=0 && sampleReq!=0) { 976
795 tempStr <- paste("Hairpins without more than", cpmReq, 977 if (cpmReq != 0 && sampleReq != 0) {
978 tempStr <- paste("Target sequences without more than", cpmReq,
796 "CPM in at least", sampleReq, "samples are insignificant", 979 "CPM in at least", sampleReq, "samples are insignificant",
797 "and filtered out.") 980 "and filtered out.")
798 ListItem(tempStr) 981 ListItem(tempStr)
982
799 filterProp <- round(filteredCount/preFilterCount*100, digits=2) 983 filterProp <- round(filteredCount/preFilterCount*100, digits=2)
800 tempStr <- paste0(filteredCount, " of ", preFilterCount," (", filterProp, 984 tempStr <- paste0(filteredCount, " of ", preFilterCount," (", filterProp,
801 "%) hairpins were filtered out for low count-per-million.") 985 "%) target sequences were filtered out for low ",
986 "count-per-million.")
987 ListItem(tempStr)
988 }
989
990 if (sampleReq != 0) {
991 tempStr <- paste("Samples that did not produce more than", sampleReq,
992 "counts were filtered out.")
993 ListItem(tempStr)
994
995 tempStr <- paste0(sampleFilterCount, " samples were filtered out for low ",
996 "counts.")
802 ListItem(tempStr) 997 ListItem(tempStr)
803 } 998 }
804 999
805 if (exists("filteredSamples")) { 1000 if (exists("filteredSamples")) {
806 tempStr <- paste("The following samples were filtered out for having zero", 1001 tempStr <- paste("The following samples were filtered out for having zero",
807 "library size: ", filteredSamples) 1002 "library size: ", filteredSamples)
808 ListItem(tempStr) 1003 ListItem(tempStr)
809 } 1004 }
810 1005
811 if (workMode == "classic") { 1006 if (workMode == "classic") {
812 ListItem("An exact test was performed on each hairpin.") 1007 ListItem("An exact test was performed on each target sequence.")
813 } else if (workMode == "glm") { 1008 } else if (workMode == "glm") {
814 ListItem("A generalised linear model was fitted to each hairpin.") 1009 ListItem("A generalised linear model was fitted to each target sequence.")
815 } 1010 }
816 1011
817 cit <- character() 1012 cit <- character()
818 link <-character() 1013 link <-character()
819 link[1] <- paste0("<a href=\"", 1014 link[1] <- paste0("<a href=\"",