# HG changeset patch # User shian_su # Date 1397200635 -36000 # Node ID 3d04308a99f9d09a6df68e559a9d81862605d586 # Parent 17befe9f8b033af5026bff9223547ee236e788e1 - Added differentially expressed hairpin count output - Added running time output - Added counts table output diff -r 17befe9f8b03 -r 3d04308a99f9 hairpinTool.R --- a/hairpinTool.R Mon Feb 24 14:50:08 2014 +1100 +++ b/hairpinTool.R Fri Apr 11 17:17:15 2014 +1000 @@ -1,3 +1,51 @@ +# ARGS: 1.inputType -String specifying format of input (fastq or table) +# IF inputType is "fastQ": +# 2*.fastqPath -One or more strings specifying path to fastq files +# 2.annoPath -String specifying path to hairpin annotation table +# 3.samplePath -String specifying path to sample annotation table +# 4.barStart -Integer specifying starting position of barcode +# 5.barEnd -Integer specifying ending position of barcode +# 6.hpStart -Integer specifying startins position of hairpin +# unique region +# 7.hpEnd -Integer specifying ending position of hairpin +# unique region +# ### +# IF inputType is "counts": +# 2.countPath -String specifying path to count table +# 3.annoPath -String specifying path to hairpin annotation table +# 4.samplePath -String specifying path to sample annotation table +# ### +# 8.cpmReq -Float specifying cpm requirement +# 9.sampleReq -Integer specifying cpm requirement +# 10.fdrThresh -Float specifying the FDR requirement +# 11.lfcThresh -Float specifying the log-fold-change requirement +# 12.workMode -String specifying exact test or GLM usage +# 13.htmlPath -String specifying path to HTML file +# 14.folderPath -STring specifying path to folder for output +# IF workMode is "classic" (exact test) +# 15.pairData[2] -String specifying first group for exact test +# 16.pairData[1] -String specifying second group for exact test +# ### +# IF workMode is "glm" +# 15.contrastData -String specifying contrasts to be made +# 16.roastOpt -String specifying usage of gene-wise tests +# 17.hairpinReq -String specifying hairpin requirement for gene- +# wise test +# 18.selectOpt -String specifying type of selection for barcode +# plots +# 19.selectVals -String specifying members selected for barcode +# plots +# +# OUT: Bar Plot of Counts Per Index +# Bar Plot of Counts Per Hairpin +# MDS Plot +# Smear Plot +# Barcode Plots (If Genewise testing was selected) +# Top Expression Table +# HTML file linking to the ouputs +# +# Author: Shian Su - registertonysu@gmail.com - Jan 2014 + # Record starting time timeStart <- as.character(Sys.time()) @@ -115,7 +163,7 @@ fastqPath <- as.character(gsub("fastq::", "", argv[grepl("fastq::", argv)], fixed=TRUE)) argv <- argv[!grepl("fastq::", argv, fixed=TRUE)] - hairpinPath <- as.character(argv[2]) + annoPath <- as.character(argv[2]) samplePath <- as.character(argv[3]) barStart <- as.numeric(argv[4]) barEnd <- as.numeric(argv[5]) @@ -134,6 +182,7 @@ workMode <- as.character(argv[12]) htmlPath <- as.character(argv[13]) folderPath <- as.character(argv[14]) + if (workMode=="classic") { pairData <- character() pairData[2] <- as.character(argv[15]) @@ -147,14 +196,13 @@ } # Read in inputs -if (inputType=="fastq") { - samples <- read.table(samplePath, header=TRUE, sep="\t") - hairpins <- read.table(hairpinPath, header=TRUE, sep="\t") -} else if (inputType=="counts") { - samples <- read.table(samplePath, header=TRUE, sep="\t") + +samples <- read.table(samplePath, header=TRUE, sep="\t") +anno <- read.table(annoPath, header=TRUE, sep="\t") +if (inputType=="counts") { counts <- read.table(countPath, header=TRUE, sep="\t") - anno <- read.table(annoPath, header=TRUE, sep="\t") } + ###################### Check inputs for correctness ############################ samples$ID <- make.names(samples$ID) @@ -166,16 +214,16 @@ tab <- table(samples$ID) offenders <- paste(names(tab[tab>1]), collapse=", ") offenders <- unmake.names(offenders) - stop("ID column of sample annotation must have unique values, values ", + stop("'ID' column of sample annotation must have unique values, values ", offenders, " are repeated") } # Check that IDs in sample annotation are unique if (inputType=="fastq") { - - if (any(table(hairpins$ID)>1)){ - tab <- table(hairpins$ID) + + if (any(table(anno$ID)>1)){ + tab <- table(anno$ID) offenders <- paste(names(tab[tab>1]), collapse=", ") - stop("ID column of hairpin annotation must have unique values, values ", + stop("'ID' column of hairpin annotation must have unique values, values ", offenders, " are repeated") } # Check that IDs in hairpin annotation are unique @@ -187,10 +235,20 @@ if (any(table(counts$ID)>1)){ tab <- table(counts$ID) offenders <- paste(names(tab[tab>1]), collapse=", ") - stop("ID column of count table must have unique values, values ", + stop("'ID' column of count table must have unique values, values ", offenders, " are repeated") } # Check that IDs in count table are unique } +if (workMode=="glm") { + if (roastOpt == "yes") { + if (is.na(match("Gene", colnames(anno)))) { + tempStr <- paste("Gene-wise tests selected but'Gene' column not", + "specified in hairpin annotation file") + stop(tempStr) + } + } +} + ################################################################################ # Process arguments @@ -216,7 +274,7 @@ } # Generate output folder and paths -dir.create(folderPath) +dir.create(folderPath, showWarnings=FALSE) # Generate links for outputs imgOut("barHairpin") @@ -243,6 +301,8 @@ barcodePdf[i] <- makeOut(paste0("barcode(", contrastData[i], ").pdf")) } } +countsOut <- makeOut("counts.tsv") + # Initialise data for html links and images, table with the link label and # link address linkData <- data.frame(Label=character(), Link=character(), @@ -276,22 +336,21 @@ } if (inputType=="fastq") { -# Use EdgeR hairpin process and capture outputs -hpReadout <- capture.output( - data <- processHairpinReads(fastqPath, samplePath, hairpinPath, + # Use EdgeR hairpin process and capture outputs + hpReadout <- capture.output( + data <- processHairpinReads(fastqPath, samplePath, annoPath, hairpinStart=hpStart, hairpinEnd=hpEnd, verbose=TRUE) -) - -# Remove function output entries that show processing data or is empty -hpReadout <- hpReadout[hpReadout!=""] -hpReadout <- hpReadout[!grepl("Processing", hpReadout)] -hpReadout <- hpReadout[!grepl("in file", hpReadout)] -hpReadout <- gsub(" -- ", "", hpReadout, fixed=TRUE) - - -# Make the names of groups syntactically valid (replace spaces with periods) -data$samples$group <- make.names(data$samples$group) + ) + + # Remove function output entries that show processing data or is empty + hpReadout <- hpReadout[hpReadout!=""] + hpReadout <- hpReadout[!grepl("Processing", hpReadout)] + hpReadout <- hpReadout[!grepl("in file", hpReadout)] + hpReadout <- gsub(" -- ", "", hpReadout, fixed=TRUE) + + # Make the names of groups syntactically valid (replace spaces with periods) + data$samples$group <- make.names(data$samples$group) } else if (inputType=="counts") { # Process counts information, set ID column to be row names rownames(counts) <- counts$ID @@ -318,14 +377,17 @@ # Create DGEList data <- DGEList(counts=counts, lib.size=colSums(counts), norm.factors=rep(1,ncol(counts)), genes=anno, group=factors) - + # Make the names of groups syntactically valid (replace spaces with periods) data$samples$group <- make.names(data$samples$group) } # Filter hairpins with low counts +preFilterCount <- nrow(data) sel <- rowSums(cpm(data$counts) > cpmReq) >= sampleReq data <- data[sel, ] +postFilterCount <- nrow(data) +filteredCount <- preFilterCount-postFilterCount # Estimate dispersions data <- estimateDisp(data) @@ -540,8 +602,17 @@ } } -# Record ending time +ID <- rownames(data$counts) +outputCounts <- cbind(ID, data$counts) +write.table(outputCounts, file=countsOut, row.names=FALSE, sep="\t") +linkName <- "Counts table (.tsv)" +linkAddr <- "counts.tsv" +linkData <- rbind(linkData, c(linkName, linkAddr)) + +# Record ending time and calculate total run time timeEnd <- as.character(Sys.time()) +timeTaken <- capture.output(round(difftime(timeEnd,timeStart), digits=3)) +timeTaken <- gsub("Time difference of ", "", timeTaken, fixed=TRUE) ################################################################################ ### HTML Generation ################################################################################ @@ -559,12 +630,12 @@ ListItem(hpReadout[1]) ListItem(hpReadout[2]) cata("\n") - cata(hpReadout[3], "
\n") + cata(hpReadout[3], "
\n") cata("\n") - cata(hpReadout[8:11], sep="
\n") + cata(hpReadout[8:11], sep="
\n") cata("
\n") cata("Please check that read percentages are consistent with ") cata("expectations.
\n") @@ -582,7 +653,7 @@ cata("

Output:

\n") cata("All images displayed have PDF copy at the bottom of the page, these can ") -cata("exported in a pdf viewer to high resolution image format.
\n") +cata("exported in a pdf viewer to high resolution image format.
\n") for (i in 1:nrow(imageData)) { if (grepl("barcode", imageData$Link[i])) { if (packageVersion("limma")<"3.19.19") { @@ -596,7 +667,7 @@ HtmlImage(imageData$Link[i], imageData$Label[i]) } } -cata("
\n") +cata("
\n") cata("

Plots:

\n") for (i in 1:nrow(linkData)) { @@ -617,14 +688,80 @@ cata("disk icon to download all files in a zip archive.

\n") cata("

.tsv files are tab seperated files that can be viewed using Excel ") cata("or other spreadsheet programs

\n") + +cata("

Additional Information:

\n") + +if (inputType == "fastq") { + ListItem("Data was gathered from fastq raw read file(s).") +} else if (inputType == "counts") { + ListItem("Data was gathered from a table of counts.") +} + +if (cpmReq!=0 && sampleReq!=0) { + tempStr <- paste("Hairpins that do not have more than", cpmReq, + "CPM in at least", sampleReq, "samples are considered", + "insignificant and filtered out.") + ListItem(tempStr) + filterProp <- round(filteredCount/preFilterCount*100, digits=2) + tempStr <- paste0(filteredCount, " of ", preFilterCount," (", filterProp, + "%) hairpins were filtered out for low count-per-million.") + ListItem(tempStr) +} + +if (workMode == "classic") { + ListItem("An exact test was performed on each hairpin.") +} else if (workMode == "glm") { + ListItem("A generalised linear model was fitted to each hairpin.") +} + + + +cit <- character() +link <-character() +link[1] <- paste0("", "limma User's Guide", ".") +link[2] <- paste0("", "edgeR User's Guide", "") + +cit[1] <- paste("Robinson MD, McCarthy DJ and Smyth GK (2010).", + "edgeR: a Bioconductor package for differential", + "expression analysis of digital gene expression", + "data. Bioinformatics 26, 139-140") +cit[2] <- paste("Robinson MD and Smyth GK (2007). Moderated statistical tests", + "for assessing differences in tag abundance. Bioinformatics", + "23, 2881-2887") +cit[3] <- paste("Robinson MD and Smyth GK (2008). Small-sample estimation of", + "negative binomial dispersion, with applications to SAGE data.", + "Biostatistics, 9, 321-332") + +cit[4] <- paste("McCarthy DJ, Chen Y and Smyth GK (2012). Differential", + "expression analysis of multifactor RNA-Seq experiments with", + "respect to biological variation. Nucleic Acids Research 40,", + "4288-4297") + +cata("

Citations

") +cata("
    \n") +ListItem(cit[1]) +ListItem(cit[2]) +ListItem(cit[3]) +ListItem(cit[4]) +cata("
\n") + cata("\n") - cata("\n") TableItem("Task started at:"); TableItem(timeStart) cata("\n") cata("\n") TableItem("Task ended at:"); TableItem(timeEnd) cata("\n") +cata("\n") +TableItem("Task run time:"); TableItem(timeTaken) +cata("\n") +cata("
\n") cata("\n") cata("") diff -r 17befe9f8b03 -r 3d04308a99f9 hairpinTool.xml --- a/hairpinTool.xml Mon Feb 24 14:50:08 2014 +1100 +++ b/hairpinTool.xml Fri Apr 11 17:17:15 2014 +1000 @@ -1,11 +1,11 @@ - + Analyse hairpin differential representation using edgeR - edgeR - limma + edgeR + limma @@ -13,8 +13,8 @@ - hairpinTool.R $inputOpt.type - #if $inputOpt.type=="fastq": + hairpinTool.R $inputOpt.inputType + #if $inputOpt.inputType=="fastq": #for $i, $fas in enumerate($inputOpt.fastq): fastq::$fas.file #end for @@ -22,7 +22,7 @@ $inputOpt.hairpin $inputOpt.samples - #if $inputOpt.positions.option=="yes": + #if $inputOpt.positions.posOption=="yes": $inputOpt.positions.barstart $inputOpt.positions.barend $inputOpt.positions.hpstart @@ -35,12 +35,12 @@ #end if #else: $inputOpt.counts - $inputOpt.anno - "$inputOpt.factors" + $inputOpt.hairpin + $inputOpt.samples 0 0 0 #end if - #if $filterCPM.option=="yes": + #if $filterCPM.filtOption=="yes": $filterCPM.cpmReq $filterCPM.sampleReq #else: @@ -57,12 +57,12 @@ #if $workMode.mode=="classic": "$workMode.pair1" "$workMode.pair2" - #else: + #elif $workMode.mode=="glm": "$workMode.contrast" - $workMode.roast.option - #if $workMode.roast.option=="yes": + $workMode.roast.roastOption + #if $workMode.roast.roastOption=="yes": $workMode.roast.hairpinReq - $workMode.roast.select.option + $workMode.roast.select.selOption "$workMode.roast.select.selection" #else: 0 @@ -74,7 +74,7 @@ - + @@ -92,7 +92,7 @@ - @@ -118,15 +118,15 @@ - - - @@ -172,7 +172,7 @@ expression."/> - @@ -187,7 +187,7 @@ be analysed."/> - @@ -232,7 +232,6 @@ - .. class:: infomark