Mercurial > repos > shians > shrnaseq
comparison hairpinTool.R @ 13:7aaa9bc23e3c
Added support for paired end reads
- Changed terminology to generalise to sgRNA CRISPR experiments.
- Added option to include second factor for statistical power
- Added option to filter out samples with low counts
- Added support for paired end reads
- Added option to highlight only positive or negative fold change in smear plot
- Fixed bug that caused tool to stop if more than enough sample annotations
were supplied
author | shian_su <registertonysu@gmail.com> |
---|---|
date | Tue, 14 Oct 2014 17:05:07 +1100 |
parents | ebb4cb1e8e35 |
children | 5a917ea5bed2 |
comparison
equal
deleted
inserted
replaced
12:ebb4cb1e8e35 | 13:7aaa9bc23e3c |
---|---|
1 # ARGS: 1.inputType -String specifying format of input (fastq or table) | 1 # ARGS: 1.inputType -String specifying format of input (fastq or table) |
2 # IF inputType is "fastQ": | 2 # IF inputType is "fastq" or "pairedFastq: |
3 # 2*.fastqPath -One or more strings specifying path to fastq files | 3 # 2*.fastqPath -One or more strings specifying path to fastq files |
4 # 2.annoPath -String specifying path to hairpin annotation table | 4 # 2.annoPath -String specifying path to hairpin annotation table |
5 # 3.samplePath -String specifying path to sample annotation table | 5 # 3.samplePath -String specifying path to sample annotation table |
6 # 4.barStart -Integer specifying starting position of barcode | 6 # 4.barStart -Integer specifying starting position of barcode |
7 # 5.barEnd -Integer specifying ending position of barcode | 7 # 5.barEnd -Integer specifying ending position of barcode |
8 # 6.hpStart -Integer specifying startins position of hairpin | 8 # ### |
9 # IF inputType is "pairedFastq": | |
10 # 6.barStartRev -Integer specifying starting position of barcode | |
11 # on reverse end | |
12 # 7.barEndRev -Integer specifying ending position of barcode | |
13 # on reverse end | |
14 # ### | |
15 # 8.hpStart -Integer specifying startins position of hairpin | |
9 # unique region | 16 # unique region |
10 # 7.hpEnd -Integer specifying ending position of hairpin | 17 # 9.hpEnd -Integer specifying ending position of hairpin |
11 # unique region | 18 # unique region |
12 # ### | |
13 # IF inputType is "counts": | 19 # IF inputType is "counts": |
14 # 2.countPath -String specifying path to count table | 20 # 2.countPath -String specifying path to count table |
15 # 3.annoPath -String specifying path to hairpin annotation table | 21 # 3.annoPath -String specifying path to hairpin annotation table |
16 # 4.samplePath -String specifying path to sample annotation table | 22 # 4.samplePath -String specifying path to sample annotation table |
17 # ### | 23 # ### |
18 # 8.cpmReq -Float specifying cpm requirement | 24 # 10.secFactName -String specifying name of secondary factor |
19 # 9.sampleReq -Integer specifying cpm requirement | 25 # 11.cpmReq -Float specifying cpm requirement |
20 # 10.fdrThresh -Float specifying the FDR requirement | 26 # 12.sampleReq -Integer specifying cpm requirement |
21 # 11.lfcThresh -Float specifying the log-fold-change requirement | 27 # 13.readReq -Integer specifying read requirement |
22 # 12.workMode -String specifying exact test or GLM usage | 28 # 14.fdrThresh -Float specifying the FDR requirement |
23 # 13.htmlPath -String specifying path to HTML file | 29 # 15.lfcThresh -Float specifying the log-fold-change requirement |
24 # 14.folderPath -STring specifying path to folder for output | 30 # 16.workMode -String specifying exact test or GLM usage |
31 # 17.htmlPath -String specifying path to HTML file | |
32 # 18.folderPath -String specifying path to folder for output | |
25 # IF workMode is "classic" (exact test) | 33 # IF workMode is "classic" (exact test) |
26 # 15.pairData[2] -String specifying first group for exact test | 34 # 19.pairData[2] -String specifying first group for exact test |
27 # 16.pairData[1] -String specifying second group for exact test | 35 # 20.pairData[1] -String specifying second group for exact test |
28 # ### | 36 # ### |
29 # IF workMode is "glm" | 37 # IF workMode is "glm" |
30 # 15.contrastData -String specifying contrasts to be made | 38 # 19.contrastData -String specifying contrasts to be made |
31 # 16.roastOpt -String specifying usage of gene-wise tests | 39 # 20.roastOpt -String specifying usage of gene-wise tests |
32 # 17.hairpinReq -String specifying hairpin requirement for gene- | 40 # 21.hairpinReq -String specifying hairpin requirement for gene- |
33 # wise test | 41 # wise test |
34 # 18.selectOpt -String specifying type of selection for barcode | 42 # 22.selectOpt -String specifying type of selection for barcode |
35 # plots | 43 # plots |
36 # 19.selectVals -String specifying members selected for barcode | 44 # 23.selectVals -String specifying members selected for barcode |
37 # plots | 45 # plots |
38 # ### | 46 # ### |
39 # | 47 # |
40 # OUT: Bar Plot of Counts Per Index | 48 # OUT: Bar Plot of Counts Per Index |
41 # Bar Plot of Counts Per Hairpin | 49 # Bar Plot of Counts Per Hairpin |
42 # MDS Plot | 50 # MDS Plot |
51 # BCV Plot | |
43 # Smear Plot | 52 # Smear Plot |
44 # Barcode Plots (If Genewise testing was selected) | 53 # Barcode Plots (If Genewise testing was selected) |
45 # Top Expression Table | 54 # Top Expression Table |
46 # Feature Counts Table | 55 # Feature Counts Table |
47 # HTML file linking to the ouputs | 56 # HTML file linking to the ouputs |
56 library(statmod, quietly=TRUE, warn.conflicts=FALSE) | 65 library(statmod, quietly=TRUE, warn.conflicts=FALSE) |
57 library(splines, quietly=TRUE, warn.conflicts=FALSE) | 66 library(splines, quietly=TRUE, warn.conflicts=FALSE) |
58 library(edgeR, quietly=TRUE, warn.conflicts=FALSE) | 67 library(edgeR, quietly=TRUE, warn.conflicts=FALSE) |
59 library(limma, quietly=TRUE, warn.conflicts=FALSE) | 68 library(limma, quietly=TRUE, warn.conflicts=FALSE) |
60 | 69 |
61 if (packageVersion("edgeR") < "3.5.27") { | 70 if (packageVersion("edgeR") < "3.7.17") { |
62 stop("Please update 'edgeR' to version >= 3.5.23 to run this tool") | 71 stop("Please update 'edgeR' to version >= 3.7.17 to run this tool") |
63 } | 72 } |
64 | 73 |
65 if (packageVersion("limma")<"3.19.19") { | 74 if (packageVersion("limma")<"3.21.16") { |
66 message("Update 'limma' to version >= 3.19.19 to see updated barcode graphs") | 75 message("Update 'limma' to version >= 3.21.16 to see updated barcode graphs") |
67 } | 76 } |
68 | 77 |
69 ################################################################################ | 78 ################################################################################ |
70 ### Function declarations | 79 ### Function declarations |
71 ################################################################################ | 80 ################################################################################ |
169 ################################################################################ | 178 ################################################################################ |
170 | 179 |
171 # Grabbing arguments from command line | 180 # Grabbing arguments from command line |
172 argv <- commandArgs(TRUE) | 181 argv <- commandArgs(TRUE) |
173 | 182 |
174 # Remove fastq file paths after collecting from argument vector | |
175 inputType <- as.character(argv[1]) | 183 inputType <- as.character(argv[1]) |
176 if (inputType=="fastq") { | 184 if (inputType == "fastq") { |
185 | |
177 fastqPath <- as.character(gsub("fastq::", "", argv[grepl("fastq::", argv)], | 186 fastqPath <- as.character(gsub("fastq::", "", argv[grepl("fastq::", argv)], |
178 fixed=TRUE)) | 187 fixed=TRUE)) |
179 argv <- argv[!grepl("fastq::", argv, fixed=TRUE)] | 188 |
189 # Remove fastq paths | |
190 argv <- argv[!grepl("fastq::", argv, fixed=TRUE)] | |
191 | |
192 fastqPathRev <- NULL | |
180 annoPath <- as.character(argv[2]) | 193 annoPath <- as.character(argv[2]) |
181 samplePath <- as.character(argv[3]) | 194 samplePath <- as.character(argv[3]) |
182 barStart <- as.numeric(argv[4]) | 195 barStart <- as.numeric(argv[4]) |
183 barEnd <- as.numeric(argv[5]) | 196 barEnd <- as.numeric(argv[5]) |
184 hpStart <- as.numeric(argv[6]) | 197 barStartRev <- NULL |
185 hpEnd <- as.numeric(argv[7]) | 198 barStartRev <- NULL |
186 } else if (inputType=="counts") { | 199 hpStart <- as.numeric(argv[8]) |
200 hpEnd <- as.numeric(argv[9]) | |
201 } else if (inputType=="pairedFastq") { | |
202 | |
203 fastqPath <- as.character(gsub("fastq::", "", argv[grepl("fastq::", argv)], | |
204 fixed=TRUE)) | |
205 | |
206 fastqPathRev <- as.character(gsub("fastqRev::", "", | |
207 argv[grepl("fastqRev::", argv)], fixed=TRUE)) | |
208 | |
209 # Remove fastq paths | |
210 argv <- argv[!grepl("fastq::", argv, fixed=TRUE)] | |
211 argv <- argv[!grepl("fastqRev::", argv, fixed=TRUE)] | |
212 | |
213 annoPath <- as.character(argv[2]) | |
214 samplePath <- as.character(argv[3]) | |
215 barStart <- as.numeric(argv[4]) | |
216 barEnd <- as.numeric(argv[5]) | |
217 barStartRev <- as.numeric(argv[6]) | |
218 barEndRev <- as.numeric(argv[7]) | |
219 hpStart <- as.numeric(argv[8]) | |
220 hpEnd <- as.numeric(argv[9]) | |
221 } else if (inputType == "counts") { | |
187 countPath <- as.character(argv[2]) | 222 countPath <- as.character(argv[2]) |
188 annoPath <- as.character(argv[3]) | 223 annoPath <- as.character(argv[3]) |
189 samplePath <- as.character(argv[4]) | 224 samplePath <- as.character(argv[4]) |
190 } | 225 } |
191 | 226 |
192 cpmReq <- as.numeric(argv[8]) | 227 secFactName <- as.character(argv[10]) |
193 sampleReq <- as.numeric(argv[9]) | 228 cpmReq <- as.numeric(argv[11]) |
194 fdrThresh <- as.numeric(argv[10]) | 229 sampleReq <- as.numeric(argv[12]) |
195 lfcThresh <- as.numeric(argv[11]) | 230 readReq <- as.numeric(argv[13]) |
196 workMode <- as.character(argv[12]) | 231 fdrThresh <- as.numeric(argv[14]) |
197 htmlPath <- as.character(argv[13]) | 232 lfcThresh <- as.numeric(argv[15]) |
198 folderPath <- as.character(argv[14]) | 233 selectDirection <- as.character(argv[16]) |
199 | 234 workMode <- as.character(argv[17]) |
200 if (workMode=="classic") { | 235 htmlPath <- as.character(argv[18]) |
236 folderPath <- as.character(argv[19]) | |
237 | |
238 if (workMode == "classic") { | |
201 pairData <- character() | 239 pairData <- character() |
202 pairData[2] <- as.character(argv[15]) | 240 pairData[2] <- as.character(argv[20]) |
203 pairData[1] <- as.character(argv[16]) | 241 pairData[1] <- as.character(argv[21]) |
204 } else if (workMode=="glm") { | 242 } else if (workMode == "glm") { |
205 contrastData <- as.character(argv[15]) | 243 contrastData <- as.character(argv[20]) |
206 roastOpt <- as.character(argv[16]) | 244 roastOpt <- as.character(argv[21]) |
207 hairpinReq <- as.numeric(argv[17]) | 245 hairpinReq <- as.numeric(argv[22]) |
208 selectOpt <- as.character(argv[18]) | 246 selectOpt <- as.character(argv[23]) |
209 selectVals <- as.character(argv[19]) | 247 selectVals <- as.character(argv[24]) |
210 } | 248 } |
211 | 249 |
212 # Read in inputs | 250 # Read in inputs |
213 | 251 |
214 samples <- read.table(samplePath, header=TRUE, sep="\t") | 252 samples <- read.table(samplePath, header=TRUE, sep="\t") |
253 | |
215 anno <- read.table(annoPath, header=TRUE, sep="\t") | 254 anno <- read.table(annoPath, header=TRUE, sep="\t") |
216 if (inputType=="counts") { | 255 |
256 if (inputType == "counts") { | |
217 counts <- read.table(countPath, header=TRUE, sep="\t") | 257 counts <- read.table(countPath, header=TRUE, sep="\t") |
218 } | 258 } |
219 | 259 |
220 ###################### Check inputs for correctness ############################ | 260 ###################### Check inputs for correctness ############################ |
221 samples$ID <- make.names(samples$ID) | 261 samples$ID <- make.names(samples$ID) |
222 | 262 |
223 if (!any(grepl("group", names(samples)))) { | 263 if ( !any(grepl("group", names(samples))) ) { |
224 stop("'group' column not specified in sample annotation file") | 264 stop("'group' column not specified in sample annotation file") |
225 } # Check if grouping variable has been specified | 265 } # Check if grouping variable has been specified |
226 | 266 |
227 if (any(table(samples$ID)>1)){ | 267 if (secFactName != "none") { |
268 if ( !any(grepl(secFactName, names(samples))) ) { | |
269 tempStr <- paste0("Second factor specified as \"", secFactName, "\" but ", | |
270 "no such column name found in sample annotation file") | |
271 stop(tempStr) | |
272 } # Check if specified secondary factor is present | |
273 } | |
274 | |
275 | |
276 if ( any(table(samples$ID) > 1) ){ | |
228 tab <- table(samples$ID) | 277 tab <- table(samples$ID) |
229 offenders <- paste(names(tab[tab>1]), collapse=", ") | 278 offenders <- paste(names(tab[tab > 1]), collapse=", ") |
230 offenders <- unmake.names(offenders) | 279 offenders <- unmake.names(offenders) |
231 stop("'ID' column of sample annotation must have unique values, values ", | 280 stop("'ID' column of sample annotation must have unique values, values ", |
232 offenders, " are repeated") | 281 offenders, " are repeated") |
233 } # Check that IDs in sample annotation are unique | 282 } # Check that IDs in sample annotation are unique |
234 | 283 |
235 if (inputType=="fastq") { | 284 if (inputType == "fastq" || inputType == "pairedFastq") { |
236 | 285 |
237 if (any(table(anno$ID)>1)){ | 286 if ( any(table(anno$ID) > 1) ){ |
238 tab <- table(anno$ID) | 287 tab <- table(anno$ID) |
239 offenders <- paste(names(tab[tab>1]), collapse=", ") | 288 offenders <- paste(names(tab[tab>1]), collapse=", ") |
240 stop("'ID' column of hairpin annotation must have unique values, values ", | 289 stop("'ID' column of hairpin annotation must have unique values, values ", |
241 offenders, " are repeated") | 290 offenders, " are repeated") |
242 } # Check that IDs in hairpin annotation are unique | 291 } # Check that IDs in hairpin annotation are unique |
243 | 292 |
244 } else if (inputType=="counts") { | 293 } else if (inputType == "counts") { |
245 if (any(is.na(match(samples$ID, colnames(counts))))) { | 294 # The first element of the colnames will be 'ID' and should not match |
295 idFromSample <- samples$ID | |
296 idFromTable <- colnames(counts)[-1] | |
297 if (any(is.na(match(idFromTable, idFromSample)))) { | |
246 stop("not all samples have groups specified") | 298 stop("not all samples have groups specified") |
247 } # Check that a group has be specifed for each sample | 299 } # Check that a group has be specifed for each sample |
248 | 300 |
249 if (any(table(counts$ID)>1)){ | 301 if ( any(table(counts$ID) > 1) ){ |
250 tab <- table(counts$ID) | 302 tab <- table(counts$ID) |
251 offenders <- paste(names(tab[tab>1]), collapse=", ") | 303 offenders <- paste(names(tab[tab>1]), collapse=", ") |
252 stop("'ID' column of count table must have unique values, values ", | 304 stop("'ID' column of count table must have unique values, values ", |
253 offenders, " are repeated") | 305 offenders, " are repeated") |
254 } # Check that IDs in count table are unique | 306 } # Check that IDs in count table are unique |
255 } | 307 } |
256 if (workMode=="glm") { | 308 if (workMode == "glm") { |
257 if (roastOpt == "yes") { | 309 if (roastOpt == "yes") { |
258 if (is.na(match("Gene", colnames(anno)))) { | 310 if (is.na(match("Gene", colnames(anno)))) { |
259 tempStr <- paste("Gene-wise tests selected but'Gene' column not", | 311 tempStr <- paste("Gene-wise tests selected but'Gene' column not", |
260 "specified in hairpin annotation file") | 312 "specified in hairpin annotation file") |
261 stop(tempStr) | 313 stop(tempStr) |
262 } | 314 } |
263 } | 315 } |
264 } | 316 } |
265 | 317 |
318 if (secFactName != "none") { | |
319 if (workMode != "glm") { | |
320 tempStr <- paste("only glm analysis type possible when secondary factor", | |
321 "used, please change appropriate option.") | |
322 } | |
323 } | |
324 | |
266 ################################################################################ | 325 ################################################################################ |
267 | 326 |
268 # Process arguments | 327 # Process arguments |
269 if (workMode=="glm") { | 328 if (workMode == "glm") { |
270 if (roastOpt=="yes") { | 329 if (roastOpt == "yes") { |
271 wantRoast <- TRUE | 330 wantRoast <- TRUE |
272 } else { | 331 } else { |
273 wantRoast <- FALSE | 332 wantRoast <- FALSE |
274 } | 333 } |
275 } | 334 } |
297 imgOut("bcv") | 356 imgOut("bcv") |
298 if (workMode == "classic") { | 357 if (workMode == "classic") { |
299 smearPng <- makeOut(paste0("smear(", pairData[2], "-", pairData[1],").png")) | 358 smearPng <- makeOut(paste0("smear(", pairData[2], "-", pairData[1],").png")) |
300 smearPdf <- makeOut(paste0("smear(", pairData[2], "-", pairData[1],").pdf")) | 359 smearPdf <- makeOut(paste0("smear(", pairData[2], "-", pairData[1],").pdf")) |
301 topOut <- makeOut(paste0("toptag(", pairData[2], "-", pairData[1],").tsv")) | 360 topOut <- makeOut(paste0("toptag(", pairData[2], "-", pairData[1],").tsv")) |
302 } else if (workMode=="glm") { | 361 } else if (workMode == "glm") { |
303 smearPng <- character() | 362 smearPng <- character() |
304 smearPdf <- character() | 363 smearPdf <- character() |
305 topOut <- character() | 364 topOut <- character() |
306 roastOut <- character() | 365 roastOut <- character() |
307 barcodePng <- character() | 366 barcodePng <- character() |
333 ################################################################################ | 392 ################################################################################ |
334 ### Data Processing | 393 ### Data Processing |
335 ################################################################################ | 394 ################################################################################ |
336 | 395 |
337 # Transform gene selection from string into index values for mroast | 396 # Transform gene selection from string into index values for mroast |
338 if (workMode=="glm") { | 397 if (workMode == "glm") { |
339 if (selectOpt=="rank") { | 398 if (selectOpt == "rank") { |
340 selectVals <- gsub(" ", "", selectVals, fixed=TRUE) | 399 selectVals <- gsub(" ", "", selectVals, fixed=TRUE) |
341 selectVals <- unlist(strsplit(selectVals, ",")) | 400 selectVals <- unlist(strsplit(selectVals, ",")) |
342 | 401 |
343 for (i in 1:length(selectVals)) { | 402 for (i in 1:length(selectVals)) { |
344 if (grepl(":", selectVals[i], fixed=TRUE)) { | 403 if (grepl(":", selectVals[i], fixed=TRUE)) { |
354 selectVals <- gsub(" ", "", selectVals, fixed=TRUE) | 413 selectVals <- gsub(" ", "", selectVals, fixed=TRUE) |
355 selectVals <- unlist(strsplit(selectVals, ",")) | 414 selectVals <- unlist(strsplit(selectVals, ",")) |
356 } | 415 } |
357 } | 416 } |
358 | 417 |
359 if (inputType=="fastq") { | 418 if (inputType == "fastq" || inputType == "pairedFastq") { |
360 # Use EdgeR hairpin process and capture outputs | 419 # Use EdgeR hairpin process and capture outputs |
361 hpReadout <- capture.output( | 420 hpReadout <- capture.output( |
362 data <- processHairpinReads(readfile=fastqPath, | 421 data <- processAmplicons(readfile=fastqPath, readfile2=fastqPathRev, |
363 barcodefile=samplePath, | 422 barcodefile=samplePath, |
364 hairpinfile=annoPath, | 423 hairpinfile=annoPath, |
365 barcodeStart=barStart, barcodeEnd=barEnd, | 424 barcodeStart=barStart, barcodeEnd=barEnd, |
366 hairpinStart=hpStart, hairpinEnd=hpEnd, | 425 barcodeStartRev=barStartRev, |
367 verbose=TRUE) | 426 barcodeEndRev=barEndRev, |
427 hairpinStart=hpStart, hairpinEnd=hpEnd, | |
428 verbose=TRUE) | |
368 ) | 429 ) |
369 | 430 |
370 # Remove function output entries that show processing data or is empty | 431 # Remove function output entries that show processing data or is empty |
371 hpReadout <- hpReadout[hpReadout!=""] | 432 hpReadout <- hpReadout[hpReadout!=""] |
372 hpReadout <- hpReadout[!grepl("Processing", hpReadout)] | 433 hpReadout <- hpReadout[!grepl("Processing", hpReadout)] |
373 hpReadout <- hpReadout[!grepl("in file", hpReadout)] | 434 hpReadout <- hpReadout[!grepl("in file", hpReadout)] |
374 hpReadout <- gsub(" -- ", "", hpReadout, fixed=TRUE) | 435 hpReadout <- gsub(" -- ", "", hpReadout, fixed=TRUE) |
375 | 436 |
376 # Make the names of groups syntactically valid (replace spaces with periods) | 437 # Make the names of groups syntactically valid (replace spaces with periods) |
377 data$samples$group <- make.names(data$samples$group) | 438 data$samples$group <- make.names(data$samples$group) |
378 } else if (inputType=="counts") { | 439 if (secFactName != "none") { |
440 data$samples[[secFactName]] <- make.names(data$samples[[secFactName]]) | |
441 } | |
442 } else if (inputType == "counts") { | |
379 # Process counts information, set ID column to be row names | 443 # Process counts information, set ID column to be row names |
380 rownames(counts) <- counts$ID | 444 rownames(counts) <- counts$ID |
381 counts <- counts[ , !(colnames(counts)=="ID")] | 445 counts <- counts[ , !(colnames(counts) == "ID")] |
382 countsRows <- nrow(counts) | 446 countsRows <- nrow(counts) |
383 | 447 |
384 # Process group information | 448 # Process group information |
385 factors <- samples$group[match(samples$ID, colnames(counts))] | 449 sampleNames <- colnames(counts) |
450 matchedIndex <- match(sampleNames, samples$ID) | |
451 factors <- samples$group[matchedIndex] | |
452 | |
453 if (secFactName != "none") { | |
454 secFactors <- samples[[secFactName]][matchedIndex] | |
455 } | |
456 | |
386 annoRows <- nrow(anno) | 457 annoRows <- nrow(anno) |
387 anno <- anno[match(rownames(counts), anno$ID), ] | 458 anno <- anno[match(rownames(counts), anno$ID), ] |
388 annoMatched <- sum(!is.na(anno$ID)) | 459 annoMatched <- sum(!is.na(anno$ID)) |
389 | 460 |
390 if (any(is.na(anno$ID))) { | 461 if (any(is.na(anno$ID))) { |
414 data$samples <- data$samples[sampleSel, ] | 485 data$samples <- data$samples[sampleSel, ] |
415 } | 486 } |
416 | 487 |
417 # Filter hairpins with low counts | 488 # Filter hairpins with low counts |
418 preFilterCount <- nrow(data) | 489 preFilterCount <- nrow(data) |
419 sel <- rowSums(cpm(data$counts) > cpmReq) >= sampleReq | 490 selRow <- rowSums(cpm(data$counts) > cpmReq) >= sampleReq |
420 data <- data[sel, ] | 491 selCol <- colSums(data$counts) > readReq |
492 data <- data[selRow, selCol] | |
493 | |
494 # Check if any data survived filtering | |
495 if (length(data$counts) == 0) { | |
496 stop("no data remains after filtering, consider relaxing filters") | |
497 } | |
498 | |
499 # Count number of filtered tags and samples | |
421 postFilterCount <- nrow(data) | 500 postFilterCount <- nrow(data) |
422 filteredCount <- preFilterCount-postFilterCount | 501 filteredCount <- preFilterCount - postFilterCount |
423 | 502 sampleFilterCount <- sum(!selCol) |
424 # Estimate dispersions | 503 |
425 data <- estimateDisp(data) | 504 if (secFactName == "none") { |
426 commonBCV <- sqrt(data$common.dispersion) | 505 # Estimate dispersions |
506 data <- estimateDisp(data) | |
507 commonBCV <- round(sqrt(data$common.dispersion), 4) | |
508 } else { | |
509 # Construct design | |
510 if (inputType == "counts") { | |
511 | |
512 sampleNames <- colnames(counts) | |
513 matchedIndex <- match(sampleNames, samples$ID) | |
514 factors <- factor(make.names(samples$group[matchedIndex])) | |
515 | |
516 secFactors <- factor(make.names(samples[[secFactName]][matchedIndex])) | |
517 | |
518 } else if (inputType == "fastq" || inputType == "pairedFastq") { | |
519 | |
520 factors <- factor(data$sample$group) | |
521 secFactors <- factor(data$sample[[secFactName]]) | |
522 | |
523 } | |
524 | |
525 design <- model.matrix(~0 + factors + secFactors) | |
526 | |
527 # Estimate dispersions | |
528 data <- estimateDisp(data, design=design) | |
529 commonBCV <- round(sqrt(data$common.dispersion), 4) | |
530 } | |
531 | |
427 | 532 |
428 ################################################################################ | 533 ################################################################################ |
429 ### Output Processing | 534 ### Output Processing |
430 ################################################################################ | 535 ################################################################################ |
431 | 536 |
492 plotBCV(data, main="BCV Plot") | 597 plotBCV(data, main="BCV Plot") |
493 newEntry <- c("BCV Plot (.pdf)", "bcv.pdf") | 598 newEntry <- c("BCV Plot (.pdf)", "bcv.pdf") |
494 linkData <- rbind(linkData, newEntry) | 599 linkData <- rbind(linkData, newEntry) |
495 invisible(dev.off()) | 600 invisible(dev.off()) |
496 | 601 |
497 if (workMode=="classic") { | 602 if (workMode == "classic") { |
498 # Assess differential representation using classic exact testing methodology | 603 # Assess differential representation using classic exact testing methodology |
499 # in edgeR | 604 # in edgeR |
500 testData <- exactTest(data, pair=pairData) | 605 testData <- exactTest(data, pair=pairData) |
501 | 606 |
502 top <- topTags(testData, n=Inf) | 607 top <- topTags(testData, n=Inf) |
608 | |
609 if (selectDirection == "all") { | |
610 topIDs <- top$table[(top$table$FDR < fdrThresh) & | |
611 (abs(top$table$logFC) > lfcThresh), 1] | |
612 } else if (selectDirection == "up") { | |
613 topIDs <- top$table[(top$table$FDR < fdrThresh) & | |
614 (top$table$logFC > lfcThresh), 1] | |
615 } else if (selectDirection == "down") { | |
503 topIDs <- top$table[(top$table$FDR < fdrThresh) & | 616 topIDs <- top$table[(top$table$FDR < fdrThresh) & |
504 (abs(top$table$logFC) > lfcThresh), 1] | 617 (top$table$logFC < -lfcThresh), 1] |
618 } | |
505 | 619 |
506 write.table(top, file=topOut, row.names=FALSE, sep="\t") | 620 write.table(top, file=topOut, row.names=FALSE, sep="\t") |
507 | 621 |
508 linkName <- paste0("Top Tags Table(", pairData[2], "-", pairData[1], | 622 linkName <- paste0("Top Tags Table(", pairData[2], "-", pairData[1], |
509 ") (.tsv)") | 623 ") (.tsv)") |
510 linkAddr <- paste0("toptag(", pairData[2], "-", pairData[1], ").tsv") | 624 linkAddr <- paste0("toptag(", pairData[2], "-", pairData[1], ").tsv") |
511 linkData <- rbind(linkData, c(linkName, linkAddr)) | 625 linkData <- rbind(linkData, c(linkName, linkAddr)) |
512 | 626 |
513 upCount[1] <- sum(top$table$FDR < fdrThresh & top$table$logFC > lfcThresh) | 627 upCount[1] <- sum(top$table$FDR < fdrThresh & top$table$logFC > lfcThresh) |
628 | |
514 downCount[1] <- sum(top$table$FDR < fdrThresh & | 629 downCount[1] <- sum(top$table$FDR < fdrThresh & |
515 top$table$logFC < -lfcThresh) | 630 top$table$logFC < -lfcThresh) |
631 | |
516 flatCount[1] <- sum(top$table$FDR > fdrThresh | | 632 flatCount[1] <- sum(top$table$FDR > fdrThresh | |
517 abs(top$table$logFC) < lfcThresh) | 633 abs(top$table$logFC) < lfcThresh) |
518 | 634 |
519 | 635 |
520 | 636 |
541 imgName <- paste0("Smear Plot(", pairData[2], "-", pairData[1], ") (.pdf)") | 657 imgName <- paste0("Smear Plot(", pairData[2], "-", pairData[1], ") (.pdf)") |
542 imgAddr <- paste0("smear(", pairData[2], "-", pairData[1], ").pdf") | 658 imgAddr <- paste0("smear(", pairData[2], "-", pairData[1], ").pdf") |
543 linkData <- rbind(linkData, c(imgName, imgAddr)) | 659 linkData <- rbind(linkData, c(imgName, imgAddr)) |
544 invisible(dev.off()) | 660 invisible(dev.off()) |
545 | 661 |
546 } else if (workMode=="glm") { | 662 } else if (workMode == "glm") { |
547 # Generating design information | 663 # Generating design information |
548 factors <- factor(data$sample$group) | 664 if (secFactName == "none") { |
549 design <- model.matrix(~0+factors) | 665 |
550 | 666 factors <- factor(data$sample$group) |
551 colnames(design) <- gsub("factors", "", colnames(design), fixed=TRUE) | 667 design <- model.matrix(~0 + factors) |
668 | |
669 colnames(design) <- gsub("factors", "", colnames(design), fixed=TRUE) | |
670 | |
671 } else { | |
672 | |
673 factors <- factor(data$sample$group) | |
674 | |
675 if (inputType == "counts") { | |
676 | |
677 sampleNames <- colnames(counts) | |
678 matchedIndex <- match(sampleNames, samples$ID) | |
679 factors <- factor(samples$group[matchedIndex]) | |
680 | |
681 secFactors <- factor(samples[[secFactName]][matchedIndex]) | |
682 | |
683 } else if (inputType == "fastq" || inputType == "pairedFastq") { | |
684 | |
685 secFactors <- factor(data$sample[[secFactName]]) | |
686 | |
687 } | |
688 | |
689 design <- model.matrix(~0 + factors + secFactors) | |
690 | |
691 colnames(design) <- gsub("factors", "", colnames(design), fixed=TRUE) | |
692 colnames(design) <- gsub("secFactors", secFactName, colnames(design), | |
693 fixed=TRUE) | |
694 } | |
695 | |
552 | 696 |
553 # Split up contrasts seperated by comma into a vector | 697 # Split up contrasts seperated by comma into a vector |
554 contrastData <- unlist(strsplit(contrastData, split=",")) | 698 contrastData <- unlist(strsplit(contrastData, split=",")) |
555 | 699 |
556 for (i in 1:length(contrastData)) { | 700 for (i in 1:length(contrastData)) { |
562 # Carry out Likelihood ratio test | 706 # Carry out Likelihood ratio test |
563 testData <- glmLRT(fit, contrast=contrasts) | 707 testData <- glmLRT(fit, contrast=contrasts) |
564 | 708 |
565 # Select hairpins with FDR < 0.05 to highlight on plot | 709 # Select hairpins with FDR < 0.05 to highlight on plot |
566 top <- topTags(testData, n=Inf) | 710 top <- topTags(testData, n=Inf) |
567 topIDs <- top$table[(top$table$FDR < fdrThresh) & | 711 |
712 if (selectDirection == "all") { | |
713 topIDs <- top$table[(top$table$FDR < fdrThresh) & | |
568 (abs(top$table$logFC) > lfcThresh), 1] | 714 (abs(top$table$logFC) > lfcThresh), 1] |
715 } else if (selectDirection == "up") { | |
716 topIDs <- top$table[(top$table$FDR < fdrThresh) & | |
717 (top$table$logFC > lfcThresh), 1] | |
718 } else if (selectDirection == "down") { | |
719 topIDs <- top$table[(top$table$FDR < fdrThresh) & | |
720 (top$table$logFC < -lfcThresh), 1] | |
721 } | |
722 | |
569 write.table(top, file=topOut[i], row.names=FALSE, sep="\t") | 723 write.table(top, file=topOut[i], row.names=FALSE, sep="\t") |
570 | 724 |
571 linkName <- paste0("Top Tags Table(", contrastData[i], ") (.tsv)") | 725 linkName <- paste0("Top Tags Table(", contrastData[i], ") (.tsv)") |
572 linkAddr <- paste0("toptag(", contrastData[i], ").tsv") | 726 linkAddr <- paste0("toptag(", contrastData[i], ").tsv") |
573 linkData <- rbind(linkData, c(linkName, linkAddr)) | 727 linkData <- rbind(linkData, c(linkName, linkAddr)) |
605 genes <- as.character(data$genes$Gene) | 759 genes <- as.character(data$genes$Gene) |
606 unq <- unique(genes) | 760 unq <- unique(genes) |
607 unq <- unq[!is.na(unq)] | 761 unq <- unq[!is.na(unq)] |
608 geneList <- list() | 762 geneList <- list() |
609 for (gene in unq) { | 763 for (gene in unq) { |
610 if (length(which(genes==gene)) >= hairpinReq) { | 764 if (length(which(genes == gene)) >= hairpinReq) { |
611 geneList[[gene]] <- which(genes==gene) | 765 geneList[[gene]] <- which(genes == gene) |
612 } | 766 } |
613 } | 767 } |
614 | 768 |
615 if (wantRoast) { | 769 if (wantRoast) { |
616 # Input preparaton for roast | 770 # Input preparaton for roast |
622 write.table(roastData, file=roastOut[i], row.names=FALSE, sep="\t") | 776 write.table(roastData, file=roastOut[i], row.names=FALSE, sep="\t") |
623 linkName <- paste0("Gene Level Analysis Table(", contrastData[i], | 777 linkName <- paste0("Gene Level Analysis Table(", contrastData[i], |
624 ") (.tsv)") | 778 ") (.tsv)") |
625 linkAddr <- paste0("gene_level(", contrastData[i], ").tsv") | 779 linkAddr <- paste0("gene_level(", contrastData[i], ").tsv") |
626 linkData <- rbind(linkData, c(linkName, linkAddr)) | 780 linkData <- rbind(linkData, c(linkName, linkAddr)) |
627 if (selectOpt=="rank") { | 781 if (selectOpt == "rank") { |
628 selectedGenes <- rownames(roastData)[selectVals] | 782 selectedGenes <- rownames(roastData)[selectVals] |
629 } else { | 783 } else { |
630 selectedGenes <- selectVals | 784 selectedGenes <- selectVals |
631 } | 785 } |
632 | 786 |
666 } | 820 } |
667 | 821 |
668 # Generate data frame of the significant differences | 822 # Generate data frame of the significant differences |
669 sigDiff <- data.frame(Up=upCount, Flat=flatCount, Down=downCount) | 823 sigDiff <- data.frame(Up=upCount, Flat=flatCount, Down=downCount) |
670 if (workMode == "glm") { | 824 if (workMode == "glm") { |
825 | |
671 row.names(sigDiff) <- contrastData | 826 row.names(sigDiff) <- contrastData |
827 | |
672 } else if (workMode == "classic") { | 828 } else if (workMode == "classic") { |
829 | |
673 row.names(sigDiff) <- paste0(pairData[2], "-", pairData[1]) | 830 row.names(sigDiff) <- paste0(pairData[2], "-", pairData[1]) |
831 | |
674 } | 832 } |
675 | 833 |
676 # Output table of summarised counts | 834 # Output table of summarised counts |
677 ID <- rownames(data$counts) | 835 ID <- rownames(data$counts) |
678 outputCounts <- cbind(ID, data$counts) | 836 outputCounts <- cbind(ID, data$counts) |
700 HtmlHead("EdgeR Output") | 858 HtmlHead("EdgeR Output") |
701 | 859 |
702 cata("<body>\n") | 860 cata("<body>\n") |
703 cata("<h3>EdgeR Analysis Output:</h3>\n") | 861 cata("<h3>EdgeR Analysis Output:</h3>\n") |
704 cata("<h4>Input Summary:</h4>\n") | 862 cata("<h4>Input Summary:</h4>\n") |
705 if (inputType=="fastq") { | 863 if (inputType == "fastq" || inputType == "pairedFastq") { |
864 | |
706 cata("<ul>\n") | 865 cata("<ul>\n") |
707 ListItem(hpReadout[1]) | 866 ListItem(hpReadout[1]) |
708 ListItem(hpReadout[2]) | 867 ListItem(hpReadout[2]) |
709 cata("</ul>\n") | 868 cata("</ul>\n") |
710 cata(hpReadout[3], "<br />\n") | 869 cata(hpReadout[3], "<br />\n") |
714 cata("</ul>\n") | 873 cata("</ul>\n") |
715 cata(hpReadout[8:11], sep="<br />\n") | 874 cata(hpReadout[8:11], sep="<br />\n") |
716 cata("<br />\n") | 875 cata("<br />\n") |
717 cata("<b>Please check that read percentages are consistent with ") | 876 cata("<b>Please check that read percentages are consistent with ") |
718 cata("expectations.</b><br >\n") | 877 cata("expectations.</b><br >\n") |
719 } else if (inputType=="counts") { | 878 |
879 } else if (inputType == "counts") { | |
880 | |
720 cata("<ul>\n") | 881 cata("<ul>\n") |
721 ListItem("Number of Samples: ", ncol(data$counts)) | 882 ListItem("Number of Samples: ", ncol(data$counts)) |
722 ListItem("Number of Hairpins: ", countsRows) | 883 ListItem("Number of Hairpins: ", countsRows) |
723 ListItem("Number of annotations provided: ", annoRows) | 884 ListItem("Number of annotations provided: ", annoRows) |
724 ListItem("Number of annotations matched to hairpin: ", annoMatched) | 885 ListItem("Number of annotations matched to hairpin: ", annoMatched) |
725 cata("</ul>\n") | 886 cata("</ul>\n") |
887 | |
726 } | 888 } |
727 | 889 |
728 cata("The estimated common biological coefficient of variation (BCV) is: ", | 890 cata("The estimated common biological coefficient of variation (BCV) is: ", |
729 commonBCV, "<br />\n") | 891 commonBCV, "<br />\n") |
892 | |
893 if (secFactName == "none") { | |
894 | |
895 cata("No secondary factor specified.<br />\n") | |
896 | |
897 } else { | |
898 | |
899 cata("Secondary factor specified as: ", secFactName, "<br />\n") | |
900 | |
901 } | |
730 | 902 |
731 cata("<h4>Output:</h4>\n") | 903 cata("<h4>Output:</h4>\n") |
732 cata("PDF copies of JPEGS available in 'Plots' section.<br />\n") | 904 cata("PDF copies of JPEGS available in 'Plots' section.<br />\n") |
733 for (i in 1:nrow(imageData)) { | 905 for (i in 1:nrow(imageData)) { |
734 if (grepl("barcode", imageData$Link[i])) { | 906 if (grepl("barcode", imageData$Link[i])) { |
907 | |
735 if (packageVersion("limma")<"3.19.19") { | 908 if (packageVersion("limma")<"3.19.19") { |
909 | |
736 HtmlImage(imageData$Link[i], imageData$Label[i], | 910 HtmlImage(imageData$Link[i], imageData$Label[i], |
737 height=length(selectedGenes)*150) | 911 height=length(selectedGenes)*150) |
912 | |
738 } else { | 913 } else { |
914 | |
739 HtmlImage(imageData$Link[i], imageData$Label[i], | 915 HtmlImage(imageData$Link[i], imageData$Label[i], |
740 height=length(selectedGenes)*300) | 916 height=length(selectedGenes)*300) |
917 | |
741 } | 918 } |
742 } else { | 919 } else { |
920 | |
743 HtmlImage(imageData$Link[i], imageData$Label[i]) | 921 HtmlImage(imageData$Link[i], imageData$Label[i]) |
922 | |
744 } | 923 } |
745 } | 924 } |
746 cata("<br />\n") | 925 cata("<br />\n") |
747 | 926 |
748 cata("<h4>Differential Representation Counts:</h4>\n") | 927 cata("<h4>Differential Representation Counts:</h4>\n") |
777 HtmlLink(linkData$Link[i], linkData$Label[i]) | 956 HtmlLink(linkData$Link[i], linkData$Label[i]) |
778 } | 957 } |
779 } | 958 } |
780 | 959 |
781 cata("<p>Alt-click links to download file.</p>\n") | 960 cata("<p>Alt-click links to download file.</p>\n") |
782 cata("<p>Click floppy disc icon associated history item to download ") | 961 cata("<p>Click floppy disc icon on associated history item to download ") |
783 cata("all files.</p>\n") | 962 cata("all files.</p>\n") |
784 cata("<p>.tsv files can be viewed in Excel or any spreadsheet program.</p>\n") | 963 cata("<p>.tsv files can be viewed in Excel or any spreadsheet program.</p>\n") |
785 | 964 |
786 cata("<h4>Additional Information:</h4>\n") | 965 cata("<h4>Additional Information:</h4>\n") |
787 | 966 |
788 if (inputType == "fastq") { | 967 if (inputType == "fastq") { |
968 | |
789 ListItem("Data was gathered from fastq raw read file(s).") | 969 ListItem("Data was gathered from fastq raw read file(s).") |
970 | |
790 } else if (inputType == "counts") { | 971 } else if (inputType == "counts") { |
972 | |
791 ListItem("Data was gathered from a table of counts.") | 973 ListItem("Data was gathered from a table of counts.") |
792 } | 974 |
793 | 975 } |
794 if (cpmReq!=0 && sampleReq!=0) { | 976 |
795 tempStr <- paste("Hairpins without more than", cpmReq, | 977 if (cpmReq != 0 && sampleReq != 0) { |
978 tempStr <- paste("Target sequences without more than", cpmReq, | |
796 "CPM in at least", sampleReq, "samples are insignificant", | 979 "CPM in at least", sampleReq, "samples are insignificant", |
797 "and filtered out.") | 980 "and filtered out.") |
798 ListItem(tempStr) | 981 ListItem(tempStr) |
982 | |
799 filterProp <- round(filteredCount/preFilterCount*100, digits=2) | 983 filterProp <- round(filteredCount/preFilterCount*100, digits=2) |
800 tempStr <- paste0(filteredCount, " of ", preFilterCount," (", filterProp, | 984 tempStr <- paste0(filteredCount, " of ", preFilterCount," (", filterProp, |
801 "%) hairpins were filtered out for low count-per-million.") | 985 "%) target sequences were filtered out for low ", |
986 "count-per-million.") | |
987 ListItem(tempStr) | |
988 } | |
989 | |
990 if (sampleReq != 0) { | |
991 tempStr <- paste("Samples that did not produce more than", sampleReq, | |
992 "counts were filtered out.") | |
993 ListItem(tempStr) | |
994 | |
995 tempStr <- paste0(sampleFilterCount, " samples were filtered out for low ", | |
996 "counts.") | |
802 ListItem(tempStr) | 997 ListItem(tempStr) |
803 } | 998 } |
804 | 999 |
805 if (exists("filteredSamples")) { | 1000 if (exists("filteredSamples")) { |
806 tempStr <- paste("The following samples were filtered out for having zero", | 1001 tempStr <- paste("The following samples were filtered out for having zero", |
807 "library size: ", filteredSamples) | 1002 "library size: ", filteredSamples) |
808 ListItem(tempStr) | 1003 ListItem(tempStr) |
809 } | 1004 } |
810 | 1005 |
811 if (workMode == "classic") { | 1006 if (workMode == "classic") { |
812 ListItem("An exact test was performed on each hairpin.") | 1007 ListItem("An exact test was performed on each target sequence.") |
813 } else if (workMode == "glm") { | 1008 } else if (workMode == "glm") { |
814 ListItem("A generalised linear model was fitted to each hairpin.") | 1009 ListItem("A generalised linear model was fitted to each target sequence.") |
815 } | 1010 } |
816 | 1011 |
817 cit <- character() | 1012 cit <- character() |
818 link <-character() | 1013 link <-character() |
819 link[1] <- paste0("<a href=\"", | 1014 link[1] <- paste0("<a href=\"", |