# HG changeset patch # User shian_su # Date 1413266707 -39600 # Node ID 7aaa9bc23e3c485645eea62f10d4908bfea22daa # Parent ebb4cb1e8e358e4610252f01d651a930f0b13b44 Added support for paired end reads - Changed terminology to generalise to sgRNA CRISPR experiments. - Added option to include second factor for statistical power - Added option to filter out samples with low counts - Added support for paired end reads - Added option to highlight only positive or negative fold change in smear plot - Fixed bug that caused tool to stop if more than enough sample annotations were supplied diff -r ebb4cb1e8e35 -r 7aaa9bc23e3c hairpinTool.R --- a/hairpinTool.R Wed Oct 01 16:00:43 2014 +1000 +++ b/hairpinTool.R Tue Oct 14 17:05:07 2014 +1100 @@ -1,45 +1,54 @@ # ARGS: 1.inputType -String specifying format of input (fastq or table) -# IF inputType is "fastQ": +# IF inputType is "fastq" or "pairedFastq: # 2*.fastqPath -One or more strings specifying path to fastq files -# 2.annoPath -String specifying path to hairpin annotation table +# 2.annoPath -String specifying path to hairpin annotation table # 3.samplePath -String specifying path to sample annotation table # 4.barStart -Integer specifying starting position of barcode # 5.barEnd -Integer specifying ending position of barcode -# 6.hpStart -Integer specifying startins position of hairpin +# ### +# IF inputType is "pairedFastq": +# 6.barStartRev -Integer specifying starting position of barcode +# on reverse end +# 7.barEndRev -Integer specifying ending position of barcode +# on reverse end +# ### +# 8.hpStart -Integer specifying startins position of hairpin # unique region -# 7.hpEnd -Integer specifying ending position of hairpin +# 9.hpEnd -Integer specifying ending position of hairpin # unique region -# ### # IF inputType is "counts": # 2.countPath -String specifying path to count table # 3.annoPath -String specifying path to hairpin annotation table # 4.samplePath -String specifying path to sample annotation table # ### -# 8.cpmReq -Float specifying cpm requirement -# 9.sampleReq -Integer specifying cpm requirement -# 10.fdrThresh -Float specifying the FDR requirement -# 11.lfcThresh -Float specifying the log-fold-change requirement -# 12.workMode -String specifying exact test or GLM usage -# 13.htmlPath -String specifying path to HTML file -# 14.folderPath -STring specifying path to folder for output +# 10.secFactName -String specifying name of secondary factor +# 11.cpmReq -Float specifying cpm requirement +# 12.sampleReq -Integer specifying cpm requirement +# 13.readReq -Integer specifying read requirement +# 14.fdrThresh -Float specifying the FDR requirement +# 15.lfcThresh -Float specifying the log-fold-change requirement +# 16.workMode -String specifying exact test or GLM usage +# 17.htmlPath -String specifying path to HTML file +# 18.folderPath -String specifying path to folder for output # IF workMode is "classic" (exact test) -# 15.pairData[2] -String specifying first group for exact test -# 16.pairData[1] -String specifying second group for exact test +# 19.pairData[2] -String specifying first group for exact test +# 20.pairData[1] -String specifying second group for exact test # ### # IF workMode is "glm" -# 15.contrastData -String specifying contrasts to be made -# 16.roastOpt -String specifying usage of gene-wise tests -# 17.hairpinReq -String specifying hairpin requirement for gene- +# 19.contrastData -String specifying contrasts to be made +# 20.roastOpt -String specifying usage of gene-wise tests +# 21.hairpinReq -String specifying hairpin requirement for gene- # wise test -# 18.selectOpt -String specifying type of selection for barcode +# 22.selectOpt -String specifying type of selection for barcode # plots -# 19.selectVals -String specifying members selected for barcode +# 23.selectVals -String specifying members selected for barcode # plots # ### # # OUT: Bar Plot of Counts Per Index # Bar Plot of Counts Per Hairpin # MDS Plot +# BCV Plot # Smear Plot # Barcode Plots (If Genewise testing was selected) # Top Expression Table @@ -58,12 +67,12 @@ library(edgeR, quietly=TRUE, warn.conflicts=FALSE) library(limma, quietly=TRUE, warn.conflicts=FALSE) -if (packageVersion("edgeR") < "3.5.27") { - stop("Please update 'edgeR' to version >= 3.5.23 to run this tool") +if (packageVersion("edgeR") < "3.7.17") { + stop("Please update 'edgeR' to version >= 3.7.17 to run this tool") } -if (packageVersion("limma")<"3.19.19") { - message("Update 'limma' to version >= 3.19.19 to see updated barcode graphs") +if (packageVersion("limma")<"3.21.16") { + message("Update 'limma' to version >= 3.21.16 to see updated barcode graphs") } ################################################################################ @@ -171,103 +180,153 @@ # Grabbing arguments from command line argv <- commandArgs(TRUE) -# Remove fastq file paths after collecting from argument vector inputType <- as.character(argv[1]) -if (inputType=="fastq") { +if (inputType == "fastq") { + fastqPath <- as.character(gsub("fastq::", "", argv[grepl("fastq::", argv)], fixed=TRUE)) - argv <- argv[!grepl("fastq::", argv, fixed=TRUE)] + + # Remove fastq paths + argv <- argv[!grepl("fastq::", argv, fixed=TRUE)] + + fastqPathRev <- NULL annoPath <- as.character(argv[2]) samplePath <- as.character(argv[3]) barStart <- as.numeric(argv[4]) barEnd <- as.numeric(argv[5]) - hpStart <- as.numeric(argv[6]) - hpEnd <- as.numeric(argv[7]) -} else if (inputType=="counts") { + barStartRev <- NULL + barStartRev <- NULL + hpStart <- as.numeric(argv[8]) + hpEnd <- as.numeric(argv[9]) +} else if (inputType=="pairedFastq") { + + fastqPath <- as.character(gsub("fastq::", "", argv[grepl("fastq::", argv)], + fixed=TRUE)) + + fastqPathRev <- as.character(gsub("fastqRev::", "", + argv[grepl("fastqRev::", argv)], fixed=TRUE)) + + # Remove fastq paths + argv <- argv[!grepl("fastq::", argv, fixed=TRUE)] + argv <- argv[!grepl("fastqRev::", argv, fixed=TRUE)] + + annoPath <- as.character(argv[2]) + samplePath <- as.character(argv[3]) + barStart <- as.numeric(argv[4]) + barEnd <- as.numeric(argv[5]) + barStartRev <- as.numeric(argv[6]) + barEndRev <- as.numeric(argv[7]) + hpStart <- as.numeric(argv[8]) + hpEnd <- as.numeric(argv[9]) +} else if (inputType == "counts") { countPath <- as.character(argv[2]) annoPath <- as.character(argv[3]) samplePath <- as.character(argv[4]) } - -cpmReq <- as.numeric(argv[8]) -sampleReq <- as.numeric(argv[9]) -fdrThresh <- as.numeric(argv[10]) -lfcThresh <- as.numeric(argv[11]) -workMode <- as.character(argv[12]) -htmlPath <- as.character(argv[13]) -folderPath <- as.character(argv[14]) -if (workMode=="classic") { +secFactName <- as.character(argv[10]) +cpmReq <- as.numeric(argv[11]) +sampleReq <- as.numeric(argv[12]) +readReq <- as.numeric(argv[13]) +fdrThresh <- as.numeric(argv[14]) +lfcThresh <- as.numeric(argv[15]) +selectDirection <- as.character(argv[16]) +workMode <- as.character(argv[17]) +htmlPath <- as.character(argv[18]) +folderPath <- as.character(argv[19]) + +if (workMode == "classic") { pairData <- character() - pairData[2] <- as.character(argv[15]) - pairData[1] <- as.character(argv[16]) -} else if (workMode=="glm") { - contrastData <- as.character(argv[15]) - roastOpt <- as.character(argv[16]) - hairpinReq <- as.numeric(argv[17]) - selectOpt <- as.character(argv[18]) - selectVals <- as.character(argv[19]) + pairData[2] <- as.character(argv[20]) + pairData[1] <- as.character(argv[21]) +} else if (workMode == "glm") { + contrastData <- as.character(argv[20]) + roastOpt <- as.character(argv[21]) + hairpinReq <- as.numeric(argv[22]) + selectOpt <- as.character(argv[23]) + selectVals <- as.character(argv[24]) } # Read in inputs samples <- read.table(samplePath, header=TRUE, sep="\t") + anno <- read.table(annoPath, header=TRUE, sep="\t") -if (inputType=="counts") { + +if (inputType == "counts") { counts <- read.table(countPath, header=TRUE, sep="\t") } ###################### Check inputs for correctness ############################ samples$ID <- make.names(samples$ID) -if (!any(grepl("group", names(samples)))) { +if ( !any(grepl("group", names(samples))) ) { stop("'group' column not specified in sample annotation file") } # Check if grouping variable has been specified -if (any(table(samples$ID)>1)){ +if (secFactName != "none") { + if ( !any(grepl(secFactName, names(samples))) ) { + tempStr <- paste0("Second factor specified as \"", secFactName, "\" but ", + "no such column name found in sample annotation file") + stop(tempStr) + } # Check if specified secondary factor is present +} + + +if ( any(table(samples$ID) > 1) ){ tab <- table(samples$ID) - offenders <- paste(names(tab[tab>1]), collapse=", ") + offenders <- paste(names(tab[tab > 1]), collapse=", ") offenders <- unmake.names(offenders) stop("'ID' column of sample annotation must have unique values, values ", offenders, " are repeated") } # Check that IDs in sample annotation are unique -if (inputType=="fastq") { +if (inputType == "fastq" || inputType == "pairedFastq") { - if (any(table(anno$ID)>1)){ + if ( any(table(anno$ID) > 1) ){ tab <- table(anno$ID) offenders <- paste(names(tab[tab>1]), collapse=", ") stop("'ID' column of hairpin annotation must have unique values, values ", offenders, " are repeated") } # Check that IDs in hairpin annotation are unique -} else if (inputType=="counts") { - if (any(is.na(match(samples$ID, colnames(counts))))) { +} else if (inputType == "counts") { + # The first element of the colnames will be 'ID' and should not match + idFromSample <- samples$ID + idFromTable <- colnames(counts)[-1] + if (any(is.na(match(idFromTable, idFromSample)))) { stop("not all samples have groups specified") } # Check that a group has be specifed for each sample - if (any(table(counts$ID)>1)){ + if ( any(table(counts$ID) > 1) ){ tab <- table(counts$ID) offenders <- paste(names(tab[tab>1]), collapse=", ") stop("'ID' column of count table must have unique values, values ", offenders, " are repeated") } # Check that IDs in count table are unique } -if (workMode=="glm") { +if (workMode == "glm") { if (roastOpt == "yes") { if (is.na(match("Gene", colnames(anno)))) { tempStr <- paste("Gene-wise tests selected but'Gene' column not", - "specified in hairpin annotation file") + "specified in hairpin annotation file") stop(tempStr) } } } +if (secFactName != "none") { + if (workMode != "glm") { + tempStr <- paste("only glm analysis type possible when secondary factor", + "used, please change appropriate option.") + } +} + ################################################################################ # Process arguments -if (workMode=="glm") { - if (roastOpt=="yes") { +if (workMode == "glm") { + if (roastOpt == "yes") { wantRoast <- TRUE } else { wantRoast <- FALSE @@ -299,7 +358,7 @@ smearPng <- makeOut(paste0("smear(", pairData[2], "-", pairData[1],").png")) smearPdf <- makeOut(paste0("smear(", pairData[2], "-", pairData[1],").pdf")) topOut <- makeOut(paste0("toptag(", pairData[2], "-", pairData[1],").tsv")) -} else if (workMode=="glm") { +} else if (workMode == "glm") { smearPng <- character() smearPdf <- character() topOut <- character() @@ -335,8 +394,8 @@ ################################################################################ # Transform gene selection from string into index values for mroast -if (workMode=="glm") { - if (selectOpt=="rank") { +if (workMode == "glm") { + if (selectOpt == "rank") { selectVals <- gsub(" ", "", selectVals, fixed=TRUE) selectVals <- unlist(strsplit(selectVals, ",")) @@ -356,33 +415,45 @@ } } -if (inputType=="fastq") { +if (inputType == "fastq" || inputType == "pairedFastq") { # Use EdgeR hairpin process and capture outputs hpReadout <- capture.output( - data <- processHairpinReads(readfile=fastqPath, - barcodefile=samplePath, - hairpinfile=annoPath, - barcodeStart=barStart, barcodeEnd=barEnd, - hairpinStart=hpStart, hairpinEnd=hpEnd, - verbose=TRUE) + data <- processAmplicons(readfile=fastqPath, readfile2=fastqPathRev, + barcodefile=samplePath, + hairpinfile=annoPath, + barcodeStart=barStart, barcodeEnd=barEnd, + barcodeStartRev=barStartRev, + barcodeEndRev=barEndRev, + hairpinStart=hpStart, hairpinEnd=hpEnd, + verbose=TRUE) ) - + # Remove function output entries that show processing data or is empty hpReadout <- hpReadout[hpReadout!=""] hpReadout <- hpReadout[!grepl("Processing", hpReadout)] hpReadout <- hpReadout[!grepl("in file", hpReadout)] hpReadout <- gsub(" -- ", "", hpReadout, fixed=TRUE) - + # Make the names of groups syntactically valid (replace spaces with periods) data$samples$group <- make.names(data$samples$group) -} else if (inputType=="counts") { + if (secFactName != "none") { + data$samples[[secFactName]] <- make.names(data$samples[[secFactName]]) + } +} else if (inputType == "counts") { # Process counts information, set ID column to be row names rownames(counts) <- counts$ID - counts <- counts[ , !(colnames(counts)=="ID")] + counts <- counts[ , !(colnames(counts) == "ID")] countsRows <- nrow(counts) # Process group information - factors <- samples$group[match(samples$ID, colnames(counts))] + sampleNames <- colnames(counts) + matchedIndex <- match(sampleNames, samples$ID) + factors <- samples$group[matchedIndex] + + if (secFactName != "none") { + secFactors <- samples[[secFactName]][matchedIndex] + } + annoRows <- nrow(anno) anno <- anno[match(rownames(counts), anno$ID), ] annoMatched <- sum(!is.na(anno$ID)) @@ -416,14 +487,48 @@ # Filter hairpins with low counts preFilterCount <- nrow(data) -sel <- rowSums(cpm(data$counts) > cpmReq) >= sampleReq -data <- data[sel, ] +selRow <- rowSums(cpm(data$counts) > cpmReq) >= sampleReq +selCol <- colSums(data$counts) > readReq +data <- data[selRow, selCol] + +# Check if any data survived filtering +if (length(data$counts) == 0) { + stop("no data remains after filtering, consider relaxing filters") +} + +# Count number of filtered tags and samples postFilterCount <- nrow(data) -filteredCount <- preFilterCount-postFilterCount +filteredCount <- preFilterCount - postFilterCount +sampleFilterCount <- sum(!selCol) + +if (secFactName == "none") { + # Estimate dispersions + data <- estimateDisp(data) + commonBCV <- round(sqrt(data$common.dispersion), 4) +} else { + # Construct design + if (inputType == "counts") { + + sampleNames <- colnames(counts) + matchedIndex <- match(sampleNames, samples$ID) + factors <- factor(make.names(samples$group[matchedIndex])) -# Estimate dispersions -data <- estimateDisp(data) -commonBCV <- sqrt(data$common.dispersion) + secFactors <- factor(make.names(samples[[secFactName]][matchedIndex])) + + } else if (inputType == "fastq" || inputType == "pairedFastq") { + + factors <- factor(data$sample$group) + secFactors <- factor(data$sample[[secFactName]]) + + } + + design <- model.matrix(~0 + factors + secFactors) + + # Estimate dispersions + data <- estimateDisp(data, design=design) + commonBCV <- round(sqrt(data$common.dispersion), 4) +} + ################################################################################ ### Output Processing @@ -494,14 +599,23 @@ linkData <- rbind(linkData, newEntry) invisible(dev.off()) -if (workMode=="classic") { +if (workMode == "classic") { # Assess differential representation using classic exact testing methodology # in edgeR testData <- exactTest(data, pair=pairData) top <- topTags(testData, n=Inf) + + if (selectDirection == "all") { + topIDs <- top$table[(top$table$FDR < fdrThresh) & + (abs(top$table$logFC) > lfcThresh), 1] + } else if (selectDirection == "up") { + topIDs <- top$table[(top$table$FDR < fdrThresh) & + (top$table$logFC > lfcThresh), 1] + } else if (selectDirection == "down") { topIDs <- top$table[(top$table$FDR < fdrThresh) & - (abs(top$table$logFC) > lfcThresh), 1] + (top$table$logFC < -lfcThresh), 1] +} write.table(top, file=topOut, row.names=FALSE, sep="\t") @@ -511,8 +625,10 @@ linkData <- rbind(linkData, c(linkName, linkAddr)) upCount[1] <- sum(top$table$FDR < fdrThresh & top$table$logFC > lfcThresh) + downCount[1] <- sum(top$table$FDR < fdrThresh & top$table$logFC < -lfcThresh) + flatCount[1] <- sum(top$table$FDR > fdrThresh | abs(top$table$logFC) < lfcThresh) @@ -543,12 +659,40 @@ linkData <- rbind(linkData, c(imgName, imgAddr)) invisible(dev.off()) -} else if (workMode=="glm") { +} else if (workMode == "glm") { # Generating design information - factors <- factor(data$sample$group) - design <- model.matrix(~0+factors) + if (secFactName == "none") { + + factors <- factor(data$sample$group) + design <- model.matrix(~0 + factors) + + colnames(design) <- gsub("factors", "", colnames(design), fixed=TRUE) + + } else { + + factors <- factor(data$sample$group) + + if (inputType == "counts") { + + sampleNames <- colnames(counts) + matchedIndex <- match(sampleNames, samples$ID) + factors <- factor(samples$group[matchedIndex]) + + secFactors <- factor(samples[[secFactName]][matchedIndex]) + + } else if (inputType == "fastq" || inputType == "pairedFastq") { + + secFactors <- factor(data$sample[[secFactName]]) + + } + + design <- model.matrix(~0 + factors + secFactors) + + colnames(design) <- gsub("factors", "", colnames(design), fixed=TRUE) + colnames(design) <- gsub("secFactors", secFactName, colnames(design), + fixed=TRUE) + } - colnames(design) <- gsub("factors", "", colnames(design), fixed=TRUE) # Split up contrasts seperated by comma into a vector contrastData <- unlist(strsplit(contrastData, split=",")) @@ -564,8 +708,18 @@ # Select hairpins with FDR < 0.05 to highlight on plot top <- topTags(testData, n=Inf) - topIDs <- top$table[(top$table$FDR < fdrThresh) & + + if (selectDirection == "all") { + topIDs <- top$table[(top$table$FDR < fdrThresh) & (abs(top$table$logFC) > lfcThresh), 1] + } else if (selectDirection == "up") { + topIDs <- top$table[(top$table$FDR < fdrThresh) & + (top$table$logFC > lfcThresh), 1] + } else if (selectDirection == "down") { + topIDs <- top$table[(top$table$FDR < fdrThresh) & + (top$table$logFC < -lfcThresh), 1] + } + write.table(top, file=topOut[i], row.names=FALSE, sep="\t") linkName <- paste0("Top Tags Table(", contrastData[i], ") (.tsv)") @@ -607,8 +761,8 @@ unq <- unq[!is.na(unq)] geneList <- list() for (gene in unq) { - if (length(which(genes==gene)) >= hairpinReq) { - geneList[[gene]] <- which(genes==gene) + if (length(which(genes == gene)) >= hairpinReq) { + geneList[[gene]] <- which(genes == gene) } } @@ -624,7 +778,7 @@ ") (.tsv)") linkAddr <- paste0("gene_level(", contrastData[i], ").tsv") linkData <- rbind(linkData, c(linkName, linkAddr)) - if (selectOpt=="rank") { + if (selectOpt == "rank") { selectedGenes <- rownames(roastData)[selectVals] } else { selectedGenes <- selectVals @@ -668,9 +822,13 @@ # Generate data frame of the significant differences sigDiff <- data.frame(Up=upCount, Flat=flatCount, Down=downCount) if (workMode == "glm") { + row.names(sigDiff) <- contrastData + } else if (workMode == "classic") { + row.names(sigDiff) <- paste0(pairData[2], "-", pairData[1]) + } # Output table of summarised counts @@ -702,7 +860,8 @@ cata("\n") cata("

EdgeR Analysis Output:

\n") cata("

Input Summary:

\n") -if (inputType=="fastq") { +if (inputType == "fastq" || inputType == "pairedFastq") { + cata("