comparison hairpinTool.R @ 6:3d04308a99f9

- Added differentially expressed hairpin count output - Added running time output - Added counts table output
author shian_su <registertonysu@gmail.com>
date Fri, 11 Apr 2014 17:17:15 +1000
parents f8af57d6f60b
children 91e411fcdecc
comparison
equal deleted inserted replaced
5:17befe9f8b03 6:3d04308a99f9
1 # ARGS: 1.inputType -String specifying format of input (fastq or table)
2 # IF inputType is "fastQ":
3 # 2*.fastqPath -One or more strings specifying path to fastq files
4 # 2.annoPath -String specifying path to hairpin annotation table
5 # 3.samplePath -String specifying path to sample annotation table
6 # 4.barStart -Integer specifying starting position of barcode
7 # 5.barEnd -Integer specifying ending position of barcode
8 # 6.hpStart -Integer specifying startins position of hairpin
9 # unique region
10 # 7.hpEnd -Integer specifying ending position of hairpin
11 # unique region
12 # ###
13 # IF inputType is "counts":
14 # 2.countPath -String specifying path to count table
15 # 3.annoPath -String specifying path to hairpin annotation table
16 # 4.samplePath -String specifying path to sample annotation table
17 # ###
18 # 8.cpmReq -Float specifying cpm requirement
19 # 9.sampleReq -Integer specifying cpm requirement
20 # 10.fdrThresh -Float specifying the FDR requirement
21 # 11.lfcThresh -Float specifying the log-fold-change requirement
22 # 12.workMode -String specifying exact test or GLM usage
23 # 13.htmlPath -String specifying path to HTML file
24 # 14.folderPath -STring specifying path to folder for output
25 # IF workMode is "classic" (exact test)
26 # 15.pairData[2] -String specifying first group for exact test
27 # 16.pairData[1] -String specifying second group for exact test
28 # ###
29 # IF workMode is "glm"
30 # 15.contrastData -String specifying contrasts to be made
31 # 16.roastOpt -String specifying usage of gene-wise tests
32 # 17.hairpinReq -String specifying hairpin requirement for gene-
33 # wise test
34 # 18.selectOpt -String specifying type of selection for barcode
35 # plots
36 # 19.selectVals -String specifying members selected for barcode
37 # plots
38 #
39 # OUT: Bar Plot of Counts Per Index
40 # Bar Plot of Counts Per Hairpin
41 # MDS Plot
42 # Smear Plot
43 # Barcode Plots (If Genewise testing was selected)
44 # Top Expression Table
45 # HTML file linking to the ouputs
46 #
47 # Author: Shian Su - registertonysu@gmail.com - Jan 2014
48
1 # Record starting time 49 # Record starting time
2 timeStart <- as.character(Sys.time()) 50 timeStart <- as.character(Sys.time())
3 51
4 # Loading and checking required packages 52 # Loading and checking required packages
5 library(methods, quietly=TRUE, warn.conflicts=FALSE) 53 library(methods, quietly=TRUE, warn.conflicts=FALSE)
113 inputType <- as.character(argv[1]) 161 inputType <- as.character(argv[1])
114 if (inputType=="fastq") { 162 if (inputType=="fastq") {
115 fastqPath <- as.character(gsub("fastq::", "", argv[grepl("fastq::", argv)], 163 fastqPath <- as.character(gsub("fastq::", "", argv[grepl("fastq::", argv)],
116 fixed=TRUE)) 164 fixed=TRUE))
117 argv <- argv[!grepl("fastq::", argv, fixed=TRUE)] 165 argv <- argv[!grepl("fastq::", argv, fixed=TRUE)]
118 hairpinPath <- as.character(argv[2]) 166 annoPath <- as.character(argv[2])
119 samplePath <- as.character(argv[3]) 167 samplePath <- as.character(argv[3])
120 barStart <- as.numeric(argv[4]) 168 barStart <- as.numeric(argv[4])
121 barEnd <- as.numeric(argv[5]) 169 barEnd <- as.numeric(argv[5])
122 hpStart <- as.numeric(argv[6]) 170 hpStart <- as.numeric(argv[6])
123 hpEnd <- as.numeric(argv[7]) 171 hpEnd <- as.numeric(argv[7])
132 fdrThresh <- as.numeric(argv[10]) 180 fdrThresh <- as.numeric(argv[10])
133 lfcThresh <- as.numeric(argv[11]) 181 lfcThresh <- as.numeric(argv[11])
134 workMode <- as.character(argv[12]) 182 workMode <- as.character(argv[12])
135 htmlPath <- as.character(argv[13]) 183 htmlPath <- as.character(argv[13])
136 folderPath <- as.character(argv[14]) 184 folderPath <- as.character(argv[14])
185
137 if (workMode=="classic") { 186 if (workMode=="classic") {
138 pairData <- character() 187 pairData <- character()
139 pairData[2] <- as.character(argv[15]) 188 pairData[2] <- as.character(argv[15])
140 pairData[1] <- as.character(argv[16]) 189 pairData[1] <- as.character(argv[16])
141 } else if (workMode=="glm") { 190 } else if (workMode=="glm") {
145 selectOpt <- as.character(argv[18]) 194 selectOpt <- as.character(argv[18])
146 selectVals <- as.character(argv[19]) 195 selectVals <- as.character(argv[19])
147 } 196 }
148 197
149 # Read in inputs 198 # Read in inputs
150 if (inputType=="fastq") { 199
151 samples <- read.table(samplePath, header=TRUE, sep="\t") 200 samples <- read.table(samplePath, header=TRUE, sep="\t")
152 hairpins <- read.table(hairpinPath, header=TRUE, sep="\t") 201 anno <- read.table(annoPath, header=TRUE, sep="\t")
153 } else if (inputType=="counts") { 202 if (inputType=="counts") {
154 samples <- read.table(samplePath, header=TRUE, sep="\t")
155 counts <- read.table(countPath, header=TRUE, sep="\t") 203 counts <- read.table(countPath, header=TRUE, sep="\t")
156 anno <- read.table(annoPath, header=TRUE, sep="\t") 204 }
157 } 205
158 ###################### Check inputs for correctness ############################ 206 ###################### Check inputs for correctness ############################
159 samples$ID <- make.names(samples$ID) 207 samples$ID <- make.names(samples$ID)
160 208
161 if (!any(grepl("group", names(samples)))) { 209 if (!any(grepl("group", names(samples)))) {
162 stop("'group' column not specified in sample annotation file") 210 stop("'group' column not specified in sample annotation file")
164 212
165 if (any(table(samples$ID)>1)){ 213 if (any(table(samples$ID)>1)){
166 tab <- table(samples$ID) 214 tab <- table(samples$ID)
167 offenders <- paste(names(tab[tab>1]), collapse=", ") 215 offenders <- paste(names(tab[tab>1]), collapse=", ")
168 offenders <- unmake.names(offenders) 216 offenders <- unmake.names(offenders)
169 stop("ID column of sample annotation must have unique values, values ", 217 stop("'ID' column of sample annotation must have unique values, values ",
170 offenders, " are repeated") 218 offenders, " are repeated")
171 } # Check that IDs in sample annotation are unique 219 } # Check that IDs in sample annotation are unique
172 220
173 if (inputType=="fastq") { 221 if (inputType=="fastq") {
174 222
175 if (any(table(hairpins$ID)>1)){ 223 if (any(table(anno$ID)>1)){
176 tab <- table(hairpins$ID) 224 tab <- table(anno$ID)
177 offenders <- paste(names(tab[tab>1]), collapse=", ") 225 offenders <- paste(names(tab[tab>1]), collapse=", ")
178 stop("ID column of hairpin annotation must have unique values, values ", 226 stop("'ID' column of hairpin annotation must have unique values, values ",
179 offenders, " are repeated") 227 offenders, " are repeated")
180 } # Check that IDs in hairpin annotation are unique 228 } # Check that IDs in hairpin annotation are unique
181 229
182 } else if (inputType=="counts") { 230 } else if (inputType=="counts") {
183 if (any(is.na(match(samples$ID, colnames(counts))))) { 231 if (any(is.na(match(samples$ID, colnames(counts))))) {
185 } # Check that a group has be specifed for each sample 233 } # Check that a group has be specifed for each sample
186 234
187 if (any(table(counts$ID)>1)){ 235 if (any(table(counts$ID)>1)){
188 tab <- table(counts$ID) 236 tab <- table(counts$ID)
189 offenders <- paste(names(tab[tab>1]), collapse=", ") 237 offenders <- paste(names(tab[tab>1]), collapse=", ")
190 stop("ID column of count table must have unique values, values ", 238 stop("'ID' column of count table must have unique values, values ",
191 offenders, " are repeated") 239 offenders, " are repeated")
192 } # Check that IDs in count table are unique 240 } # Check that IDs in count table are unique
193 } 241 }
242 if (workMode=="glm") {
243 if (roastOpt == "yes") {
244 if (is.na(match("Gene", colnames(anno)))) {
245 tempStr <- paste("Gene-wise tests selected but'Gene' column not",
246 "specified in hairpin annotation file")
247 stop(tempStr)
248 }
249 }
250 }
251
194 ################################################################################ 252 ################################################################################
195 253
196 # Process arguments 254 # Process arguments
197 if (workMode=="glm") { 255 if (workMode=="glm") {
198 if (roastOpt=="yes") { 256 if (roastOpt=="yes") {
214 if (exists("pairData")) { 272 if (exists("pairData")) {
215 pairData <- make.names(pairData) 273 pairData <- make.names(pairData)
216 } 274 }
217 275
218 # Generate output folder and paths 276 # Generate output folder and paths
219 dir.create(folderPath) 277 dir.create(folderPath, showWarnings=FALSE)
220 278
221 # Generate links for outputs 279 # Generate links for outputs
222 imgOut("barHairpin") 280 imgOut("barHairpin")
223 imgOut("barIndex") 281 imgOut("barIndex")
224 imgOut("mds") 282 imgOut("mds")
241 roastOut[i] <- makeOut(paste0("roast(", contrastData[i], ").tsv")) 299 roastOut[i] <- makeOut(paste0("roast(", contrastData[i], ").tsv"))
242 barcodePng[i] <- makeOut(paste0("barcode(", contrastData[i], ").png")) 300 barcodePng[i] <- makeOut(paste0("barcode(", contrastData[i], ").png"))
243 barcodePdf[i] <- makeOut(paste0("barcode(", contrastData[i], ").pdf")) 301 barcodePdf[i] <- makeOut(paste0("barcode(", contrastData[i], ").pdf"))
244 } 302 }
245 } 303 }
304 countsOut <- makeOut("counts.tsv")
305
246 # Initialise data for html links and images, table with the link label and 306 # Initialise data for html links and images, table with the link label and
247 # link address 307 # link address
248 linkData <- data.frame(Label=character(), Link=character(), 308 linkData <- data.frame(Label=character(), Link=character(),
249 stringsAsFactors=FALSE) 309 stringsAsFactors=FALSE)
250 imageData <- data.frame(Label=character(), Link=character(), 310 imageData <- data.frame(Label=character(), Link=character(),
274 selectVals <- unlist(strsplit(selectVals, " ")) 334 selectVals <- unlist(strsplit(selectVals, " "))
275 } 335 }
276 } 336 }
277 337
278 if (inputType=="fastq") { 338 if (inputType=="fastq") {
279 # Use EdgeR hairpin process and capture outputs 339 # Use EdgeR hairpin process and capture outputs
280 hpReadout <- capture.output( 340 hpReadout <- capture.output(
281 data <- processHairpinReads(fastqPath, samplePath, hairpinPath, 341 data <- processHairpinReads(fastqPath, samplePath, annoPath,
282 hairpinStart=hpStart, hairpinEnd=hpEnd, 342 hairpinStart=hpStart, hairpinEnd=hpEnd,
283 verbose=TRUE) 343 verbose=TRUE)
284 ) 344 )
285 345
286 # Remove function output entries that show processing data or is empty 346 # Remove function output entries that show processing data or is empty
287 hpReadout <- hpReadout[hpReadout!=""] 347 hpReadout <- hpReadout[hpReadout!=""]
288 hpReadout <- hpReadout[!grepl("Processing", hpReadout)] 348 hpReadout <- hpReadout[!grepl("Processing", hpReadout)]
289 hpReadout <- hpReadout[!grepl("in file", hpReadout)] 349 hpReadout <- hpReadout[!grepl("in file", hpReadout)]
290 hpReadout <- gsub(" -- ", "", hpReadout, fixed=TRUE) 350 hpReadout <- gsub(" -- ", "", hpReadout, fixed=TRUE)
291 351
292 352 # Make the names of groups syntactically valid (replace spaces with periods)
293 # Make the names of groups syntactically valid (replace spaces with periods) 353 data$samples$group <- make.names(data$samples$group)
294 data$samples$group <- make.names(data$samples$group)
295 } else if (inputType=="counts") { 354 } else if (inputType=="counts") {
296 # Process counts information, set ID column to be row names 355 # Process counts information, set ID column to be row names
297 rownames(counts) <- counts$ID 356 rownames(counts) <- counts$ID
298 counts <- counts[ , !(colnames(counts)=="ID")] 357 counts <- counts[ , !(colnames(counts)=="ID")]
299 countsRows <- nrow(counts) 358 countsRows <- nrow(counts)
316 anno <- anno[sel, ] 375 anno <- anno[sel, ]
317 376
318 # Create DGEList 377 # Create DGEList
319 data <- DGEList(counts=counts, lib.size=colSums(counts), 378 data <- DGEList(counts=counts, lib.size=colSums(counts),
320 norm.factors=rep(1,ncol(counts)), genes=anno, group=factors) 379 norm.factors=rep(1,ncol(counts)), genes=anno, group=factors)
321 380
322 # Make the names of groups syntactically valid (replace spaces with periods) 381 # Make the names of groups syntactically valid (replace spaces with periods)
323 data$samples$group <- make.names(data$samples$group) 382 data$samples$group <- make.names(data$samples$group)
324 } 383 }
325 384
326 # Filter hairpins with low counts 385 # Filter hairpins with low counts
386 preFilterCount <- nrow(data)
327 sel <- rowSums(cpm(data$counts) > cpmReq) >= sampleReq 387 sel <- rowSums(cpm(data$counts) > cpmReq) >= sampleReq
328 data <- data[sel, ] 388 data <- data[sel, ]
389 postFilterCount <- nrow(data)
390 filteredCount <- preFilterCount-postFilterCount
329 391
330 # Estimate dispersions 392 # Estimate dispersions
331 data <- estimateDisp(data) 393 data <- estimateDisp(data)
332 commonBCV <- sqrt(data$common.dispersion) 394 commonBCV <- sqrt(data$common.dispersion)
333 395
538 dev.off() 600 dev.off()
539 } 601 }
540 } 602 }
541 } 603 }
542 604
543 # Record ending time 605 ID <- rownames(data$counts)
606 outputCounts <- cbind(ID, data$counts)
607 write.table(outputCounts, file=countsOut, row.names=FALSE, sep="\t")
608 linkName <- "Counts table (.tsv)"
609 linkAddr <- "counts.tsv"
610 linkData <- rbind(linkData, c(linkName, linkAddr))
611
612 # Record ending time and calculate total run time
544 timeEnd <- as.character(Sys.time()) 613 timeEnd <- as.character(Sys.time())
614 timeTaken <- capture.output(round(difftime(timeEnd,timeStart), digits=3))
615 timeTaken <- gsub("Time difference of ", "", timeTaken, fixed=TRUE)
545 ################################################################################ 616 ################################################################################
546 ### HTML Generation 617 ### HTML Generation
547 ################################################################################ 618 ################################################################################
548 # Clear file 619 # Clear file
549 cat("", file=htmlPath) 620 cat("", file=htmlPath)
557 if (inputType=="fastq") { 628 if (inputType=="fastq") {
558 cata("<ul>\n") 629 cata("<ul>\n")
559 ListItem(hpReadout[1]) 630 ListItem(hpReadout[1])
560 ListItem(hpReadout[2]) 631 ListItem(hpReadout[2])
561 cata("</ul>\n") 632 cata("</ul>\n")
562 cata(hpReadout[3], "<br/>\n") 633 cata(hpReadout[3], "<br />\n")
563 cata("<ul>\n") 634 cata("<ul>\n")
564 ListItem(hpReadout[4]) 635 ListItem(hpReadout[4])
565 ListItem(hpReadout[7]) 636 ListItem(hpReadout[7])
566 cata("</ul>\n") 637 cata("</ul>\n")
567 cata(hpReadout[8:11], sep="<br/>\n") 638 cata(hpReadout[8:11], sep="<br />\n")
568 cata("<br />\n") 639 cata("<br />\n")
569 cata("<b>Please check that read percentages are consistent with ") 640 cata("<b>Please check that read percentages are consistent with ")
570 cata("expectations.</b><br >\n") 641 cata("expectations.</b><br >\n")
571 } else if (inputType=="counts") { 642 } else if (inputType=="counts") {
572 cata("<ul>\n") 643 cata("<ul>\n")
580 cata("The estimated common biological coefficient of variation (BCV) is: ", 651 cata("The estimated common biological coefficient of variation (BCV) is: ",
581 commonBCV, "<br />\n") 652 commonBCV, "<br />\n")
582 653
583 cata("<h4>Output:</h4>\n") 654 cata("<h4>Output:</h4>\n")
584 cata("All images displayed have PDF copy at the bottom of the page, these can ") 655 cata("All images displayed have PDF copy at the bottom of the page, these can ")
585 cata("exported in a pdf viewer to high resolution image format. <br/>\n") 656 cata("exported in a pdf viewer to high resolution image format. <br />\n")
586 for (i in 1:nrow(imageData)) { 657 for (i in 1:nrow(imageData)) {
587 if (grepl("barcode", imageData$Link[i])) { 658 if (grepl("barcode", imageData$Link[i])) {
588 if (packageVersion("limma")<"3.19.19") { 659 if (packageVersion("limma")<"3.19.19") {
589 HtmlImage(imageData$Link[i], imageData$Label[i], 660 HtmlImage(imageData$Link[i], imageData$Label[i],
590 height=length(selectedGenes)*150) 661 height=length(selectedGenes)*150)
594 } 665 }
595 } else { 666 } else {
596 HtmlImage(imageData$Link[i], imageData$Label[i]) 667 HtmlImage(imageData$Link[i], imageData$Label[i])
597 } 668 }
598 } 669 }
599 cata("<br/>\n") 670 cata("<br />\n")
600 671
601 cata("<h4>Plots:</h4>\n") 672 cata("<h4>Plots:</h4>\n")
602 for (i in 1:nrow(linkData)) { 673 for (i in 1:nrow(linkData)) {
603 if (!grepl(".tsv", linkData$Link[i])) { 674 if (!grepl(".tsv", linkData$Link[i])) {
604 HtmlLink(linkData$Link[i], linkData$Label[i]) 675 HtmlLink(linkData$Link[i], linkData$Label[i])
615 cata("<p>alt-click any of the links to download the file, or click the name ") 686 cata("<p>alt-click any of the links to download the file, or click the name ")
616 cata("of this task in the galaxy history panel and click on the floppy ") 687 cata("of this task in the galaxy history panel and click on the floppy ")
617 cata("disk icon to download all files in a zip archive.</p>\n") 688 cata("disk icon to download all files in a zip archive.</p>\n")
618 cata("<p>.tsv files are tab seperated files that can be viewed using Excel ") 689 cata("<p>.tsv files are tab seperated files that can be viewed using Excel ")
619 cata("or other spreadsheet programs</p>\n") 690 cata("or other spreadsheet programs</p>\n")
691
692 cata("<h4>Additional Information:</h4>\n")
693
694 if (inputType == "fastq") {
695 ListItem("Data was gathered from fastq raw read file(s).")
696 } else if (inputType == "counts") {
697 ListItem("Data was gathered from a table of counts.")
698 }
699
700 if (cpmReq!=0 && sampleReq!=0) {
701 tempStr <- paste("Hairpins that do not have more than", cpmReq,
702 "CPM in at least", sampleReq, "samples are considered",
703 "insignificant and filtered out.")
704 ListItem(tempStr)
705 filterProp <- round(filteredCount/preFilterCount*100, digits=2)
706 tempStr <- paste0(filteredCount, " of ", preFilterCount," (", filterProp,
707 "%) hairpins were filtered out for low count-per-million.")
708 ListItem(tempStr)
709 }
710
711 if (workMode == "classic") {
712 ListItem("An exact test was performed on each hairpin.")
713 } else if (workMode == "glm") {
714 ListItem("A generalised linear model was fitted to each hairpin.")
715 }
716
717
718
719 cit <- character()
720 link <-character()
721 link[1] <- paste0("<a href=\"",
722 "http://www.bioconductor.org/packages/release/bioc/",
723 "vignettes/limma/inst/doc/usersguide.pdf",
724 "\">", "limma User's Guide", "</a>.")
725 link[2] <- paste0("<a href=\"",
726 "http://www.bioconductor.org/packages/release/bioc/",
727 "vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf",
728 "\">", "edgeR User's Guide", "</a>")
729
730 cit[1] <- paste("Robinson MD, McCarthy DJ and Smyth GK (2010).",
731 "edgeR: a Bioconductor package for differential",
732 "expression analysis of digital gene expression",
733 "data. Bioinformatics 26, 139-140")
734 cit[2] <- paste("Robinson MD and Smyth GK (2007). Moderated statistical tests",
735 "for assessing differences in tag abundance. Bioinformatics",
736 "23, 2881-2887")
737 cit[3] <- paste("Robinson MD and Smyth GK (2008). Small-sample estimation of",
738 "negative binomial dispersion, with applications to SAGE data.",
739 "Biostatistics, 9, 321-332")
740
741 cit[4] <- paste("McCarthy DJ, Chen Y and Smyth GK (2012). Differential",
742 "expression analysis of multifactor RNA-Seq experiments with",
743 "respect to biological variation. Nucleic Acids Research 40,",
744 "4288-4297")
745
746 cata("<h4>Citations</h4>")
747 cata("<ol>\n")
748 ListItem(cit[1])
749 ListItem(cit[2])
750 ListItem(cit[3])
751 ListItem(cit[4])
752 cata("</ol>\n")
753
620 cata("<table border=\"0\">\n") 754 cata("<table border=\"0\">\n")
621
622 cata("<tr>\n") 755 cata("<tr>\n")
623 TableItem("Task started at:"); TableItem(timeStart) 756 TableItem("Task started at:"); TableItem(timeStart)
624 cata("</tr>\n") 757 cata("</tr>\n")
625 cata("<tr>\n") 758 cata("<tr>\n")
626 TableItem("Task ended at:"); TableItem(timeEnd) 759 TableItem("Task ended at:"); TableItem(timeEnd)
627 cata("</tr>\n") 760 cata("</tr>\n")
761 cata("<tr>\n")
762 TableItem("Task run time:"); TableItem(timeTaken)
763 cata("<tr>\n")
764 cata("</table>\n")
628 765
629 cata("</body>\n") 766 cata("</body>\n")
630 cata("</html>") 767 cata("</html>")