Mercurial > repos > shians > shrnaseq
diff hairpinTool.R @ 6:3d04308a99f9
- Added differentially expressed hairpin count output
- Added running time output
- Added counts table output
author | shian_su <registertonysu@gmail.com> |
---|---|
date | Fri, 11 Apr 2014 17:17:15 +1000 |
parents | f8af57d6f60b |
children | 91e411fcdecc |
line wrap: on
line diff
--- a/hairpinTool.R Mon Feb 24 14:50:08 2014 +1100 +++ b/hairpinTool.R Fri Apr 11 17:17:15 2014 +1000 @@ -1,3 +1,51 @@ +# ARGS: 1.inputType -String specifying format of input (fastq or table) +# IF inputType is "fastQ": +# 2*.fastqPath -One or more strings specifying path to fastq files +# 2.annoPath -String specifying path to hairpin annotation table +# 3.samplePath -String specifying path to sample annotation table +# 4.barStart -Integer specifying starting position of barcode +# 5.barEnd -Integer specifying ending position of barcode +# 6.hpStart -Integer specifying startins position of hairpin +# unique region +# 7.hpEnd -Integer specifying ending position of hairpin +# unique region +# ### +# IF inputType is "counts": +# 2.countPath -String specifying path to count table +# 3.annoPath -String specifying path to hairpin annotation table +# 4.samplePath -String specifying path to sample annotation table +# ### +# 8.cpmReq -Float specifying cpm requirement +# 9.sampleReq -Integer specifying cpm requirement +# 10.fdrThresh -Float specifying the FDR requirement +# 11.lfcThresh -Float specifying the log-fold-change requirement +# 12.workMode -String specifying exact test or GLM usage +# 13.htmlPath -String specifying path to HTML file +# 14.folderPath -STring specifying path to folder for output +# IF workMode is "classic" (exact test) +# 15.pairData[2] -String specifying first group for exact test +# 16.pairData[1] -String specifying second group for exact test +# ### +# IF workMode is "glm" +# 15.contrastData -String specifying contrasts to be made +# 16.roastOpt -String specifying usage of gene-wise tests +# 17.hairpinReq -String specifying hairpin requirement for gene- +# wise test +# 18.selectOpt -String specifying type of selection for barcode +# plots +# 19.selectVals -String specifying members selected for barcode +# plots +# +# OUT: Bar Plot of Counts Per Index +# Bar Plot of Counts Per Hairpin +# MDS Plot +# Smear Plot +# Barcode Plots (If Genewise testing was selected) +# Top Expression Table +# HTML file linking to the ouputs +# +# Author: Shian Su - registertonysu@gmail.com - Jan 2014 + # Record starting time timeStart <- as.character(Sys.time()) @@ -115,7 +163,7 @@ fastqPath <- as.character(gsub("fastq::", "", argv[grepl("fastq::", argv)], fixed=TRUE)) argv <- argv[!grepl("fastq::", argv, fixed=TRUE)] - hairpinPath <- as.character(argv[2]) + annoPath <- as.character(argv[2]) samplePath <- as.character(argv[3]) barStart <- as.numeric(argv[4]) barEnd <- as.numeric(argv[5]) @@ -134,6 +182,7 @@ workMode <- as.character(argv[12]) htmlPath <- as.character(argv[13]) folderPath <- as.character(argv[14]) + if (workMode=="classic") { pairData <- character() pairData[2] <- as.character(argv[15]) @@ -147,14 +196,13 @@ } # Read in inputs -if (inputType=="fastq") { - samples <- read.table(samplePath, header=TRUE, sep="\t") - hairpins <- read.table(hairpinPath, header=TRUE, sep="\t") -} else if (inputType=="counts") { - samples <- read.table(samplePath, header=TRUE, sep="\t") + +samples <- read.table(samplePath, header=TRUE, sep="\t") +anno <- read.table(annoPath, header=TRUE, sep="\t") +if (inputType=="counts") { counts <- read.table(countPath, header=TRUE, sep="\t") - anno <- read.table(annoPath, header=TRUE, sep="\t") } + ###################### Check inputs for correctness ############################ samples$ID <- make.names(samples$ID) @@ -166,16 +214,16 @@ tab <- table(samples$ID) offenders <- paste(names(tab[tab>1]), collapse=", ") offenders <- unmake.names(offenders) - stop("ID column of sample annotation must have unique values, values ", + stop("'ID' column of sample annotation must have unique values, values ", offenders, " are repeated") } # Check that IDs in sample annotation are unique if (inputType=="fastq") { - - if (any(table(hairpins$ID)>1)){ - tab <- table(hairpins$ID) + + if (any(table(anno$ID)>1)){ + tab <- table(anno$ID) offenders <- paste(names(tab[tab>1]), collapse=", ") - stop("ID column of hairpin annotation must have unique values, values ", + stop("'ID' column of hairpin annotation must have unique values, values ", offenders, " are repeated") } # Check that IDs in hairpin annotation are unique @@ -187,10 +235,20 @@ if (any(table(counts$ID)>1)){ tab <- table(counts$ID) offenders <- paste(names(tab[tab>1]), collapse=", ") - stop("ID column of count table must have unique values, values ", + stop("'ID' column of count table must have unique values, values ", offenders, " are repeated") } # Check that IDs in count table are unique } +if (workMode=="glm") { + if (roastOpt == "yes") { + if (is.na(match("Gene", colnames(anno)))) { + tempStr <- paste("Gene-wise tests selected but'Gene' column not", + "specified in hairpin annotation file") + stop(tempStr) + } + } +} + ################################################################################ # Process arguments @@ -216,7 +274,7 @@ } # Generate output folder and paths -dir.create(folderPath) +dir.create(folderPath, showWarnings=FALSE) # Generate links for outputs imgOut("barHairpin") @@ -243,6 +301,8 @@ barcodePdf[i] <- makeOut(paste0("barcode(", contrastData[i], ").pdf")) } } +countsOut <- makeOut("counts.tsv") + # Initialise data for html links and images, table with the link label and # link address linkData <- data.frame(Label=character(), Link=character(), @@ -276,22 +336,21 @@ } if (inputType=="fastq") { -# Use EdgeR hairpin process and capture outputs -hpReadout <- capture.output( - data <- processHairpinReads(fastqPath, samplePath, hairpinPath, + # Use EdgeR hairpin process and capture outputs + hpReadout <- capture.output( + data <- processHairpinReads(fastqPath, samplePath, annoPath, hairpinStart=hpStart, hairpinEnd=hpEnd, verbose=TRUE) -) - -# Remove function output entries that show processing data or is empty -hpReadout <- hpReadout[hpReadout!=""] -hpReadout <- hpReadout[!grepl("Processing", hpReadout)] -hpReadout <- hpReadout[!grepl("in file", hpReadout)] -hpReadout <- gsub(" -- ", "", hpReadout, fixed=TRUE) - - -# Make the names of groups syntactically valid (replace spaces with periods) -data$samples$group <- make.names(data$samples$group) + ) + + # Remove function output entries that show processing data or is empty + hpReadout <- hpReadout[hpReadout!=""] + hpReadout <- hpReadout[!grepl("Processing", hpReadout)] + hpReadout <- hpReadout[!grepl("in file", hpReadout)] + hpReadout <- gsub(" -- ", "", hpReadout, fixed=TRUE) + + # Make the names of groups syntactically valid (replace spaces with periods) + data$samples$group <- make.names(data$samples$group) } else if (inputType=="counts") { # Process counts information, set ID column to be row names rownames(counts) <- counts$ID @@ -318,14 +377,17 @@ # Create DGEList data <- DGEList(counts=counts, lib.size=colSums(counts), norm.factors=rep(1,ncol(counts)), genes=anno, group=factors) - + # Make the names of groups syntactically valid (replace spaces with periods) data$samples$group <- make.names(data$samples$group) } # Filter hairpins with low counts +preFilterCount <- nrow(data) sel <- rowSums(cpm(data$counts) > cpmReq) >= sampleReq data <- data[sel, ] +postFilterCount <- nrow(data) +filteredCount <- preFilterCount-postFilterCount # Estimate dispersions data <- estimateDisp(data) @@ -540,8 +602,17 @@ } } -# Record ending time +ID <- rownames(data$counts) +outputCounts <- cbind(ID, data$counts) +write.table(outputCounts, file=countsOut, row.names=FALSE, sep="\t") +linkName <- "Counts table (.tsv)" +linkAddr <- "counts.tsv" +linkData <- rbind(linkData, c(linkName, linkAddr)) + +# Record ending time and calculate total run time timeEnd <- as.character(Sys.time()) +timeTaken <- capture.output(round(difftime(timeEnd,timeStart), digits=3)) +timeTaken <- gsub("Time difference of ", "", timeTaken, fixed=TRUE) ################################################################################ ### HTML Generation ################################################################################ @@ -559,12 +630,12 @@ ListItem(hpReadout[1]) ListItem(hpReadout[2]) cata("</ul>\n") - cata(hpReadout[3], "<br/>\n") + cata(hpReadout[3], "<br />\n") cata("<ul>\n") ListItem(hpReadout[4]) ListItem(hpReadout[7]) cata("</ul>\n") - cata(hpReadout[8:11], sep="<br/>\n") + cata(hpReadout[8:11], sep="<br />\n") cata("<br />\n") cata("<b>Please check that read percentages are consistent with ") cata("expectations.</b><br >\n") @@ -582,7 +653,7 @@ cata("<h4>Output:</h4>\n") cata("All images displayed have PDF copy at the bottom of the page, these can ") -cata("exported in a pdf viewer to high resolution image format. <br/>\n") +cata("exported in a pdf viewer to high resolution image format. <br />\n") for (i in 1:nrow(imageData)) { if (grepl("barcode", imageData$Link[i])) { if (packageVersion("limma")<"3.19.19") { @@ -596,7 +667,7 @@ HtmlImage(imageData$Link[i], imageData$Label[i]) } } -cata("<br/>\n") +cata("<br />\n") cata("<h4>Plots:</h4>\n") for (i in 1:nrow(linkData)) { @@ -617,14 +688,80 @@ cata("disk icon to download all files in a zip archive.</p>\n") cata("<p>.tsv files are tab seperated files that can be viewed using Excel ") cata("or other spreadsheet programs</p>\n") + +cata("<h4>Additional Information:</h4>\n") + +if (inputType == "fastq") { + ListItem("Data was gathered from fastq raw read file(s).") +} else if (inputType == "counts") { + ListItem("Data was gathered from a table of counts.") +} + +if (cpmReq!=0 && sampleReq!=0) { + tempStr <- paste("Hairpins that do not have more than", cpmReq, + "CPM in at least", sampleReq, "samples are considered", + "insignificant and filtered out.") + ListItem(tempStr) + filterProp <- round(filteredCount/preFilterCount*100, digits=2) + tempStr <- paste0(filteredCount, " of ", preFilterCount," (", filterProp, + "%) hairpins were filtered out for low count-per-million.") + ListItem(tempStr) +} + +if (workMode == "classic") { + ListItem("An exact test was performed on each hairpin.") +} else if (workMode == "glm") { + ListItem("A generalised linear model was fitted to each hairpin.") +} + + + +cit <- character() +link <-character() +link[1] <- paste0("<a href=\"", + "http://www.bioconductor.org/packages/release/bioc/", + "vignettes/limma/inst/doc/usersguide.pdf", + "\">", "limma User's Guide", "</a>.") +link[2] <- paste0("<a href=\"", + "http://www.bioconductor.org/packages/release/bioc/", + "vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf", + "\">", "edgeR User's Guide", "</a>") + +cit[1] <- paste("Robinson MD, McCarthy DJ and Smyth GK (2010).", + "edgeR: a Bioconductor package for differential", + "expression analysis of digital gene expression", + "data. Bioinformatics 26, 139-140") +cit[2] <- paste("Robinson MD and Smyth GK (2007). Moderated statistical tests", + "for assessing differences in tag abundance. Bioinformatics", + "23, 2881-2887") +cit[3] <- paste("Robinson MD and Smyth GK (2008). Small-sample estimation of", + "negative binomial dispersion, with applications to SAGE data.", + "Biostatistics, 9, 321-332") + +cit[4] <- paste("McCarthy DJ, Chen Y and Smyth GK (2012). Differential", + "expression analysis of multifactor RNA-Seq experiments with", + "respect to biological variation. Nucleic Acids Research 40,", + "4288-4297") + +cata("<h4>Citations</h4>") +cata("<ol>\n") +ListItem(cit[1]) +ListItem(cit[2]) +ListItem(cit[3]) +ListItem(cit[4]) +cata("</ol>\n") + cata("<table border=\"0\">\n") - cata("<tr>\n") TableItem("Task started at:"); TableItem(timeStart) cata("</tr>\n") cata("<tr>\n") TableItem("Task ended at:"); TableItem(timeEnd) cata("</tr>\n") +cata("<tr>\n") +TableItem("Task run time:"); TableItem(timeTaken) +cata("<tr>\n") +cata("</table>\n") cata("</body>\n") cata("</html>")