Mercurial > repos > shians > shrnaseq
changeset 13:7aaa9bc23e3c
Added support for paired end reads
- Changed terminology to generalise to sgRNA CRISPR experiments.
- Added option to include second factor for statistical power
- Added option to filter out samples with low counts
- Added support for paired end reads
- Added option to highlight only positive or negative fold change in smear plot
- Fixed bug that caused tool to stop if more than enough sample annotations
were supplied
author | shian_su <registertonysu@gmail.com> |
---|---|
date | Tue, 14 Oct 2014 17:05:07 +1100 |
parents | ebb4cb1e8e35 |
children | 44130e484a97 |
files | hairpinTool.R hairpinTool.xml |
diffstat | 2 files changed, 606 insertions(+), 203 deletions(-) [+] |
line wrap: on
line diff
--- a/hairpinTool.R Wed Oct 01 16:00:43 2014 +1000 +++ b/hairpinTool.R Tue Oct 14 17:05:07 2014 +1100 @@ -1,45 +1,54 @@ # ARGS: 1.inputType -String specifying format of input (fastq or table) -# IF inputType is "fastQ": +# IF inputType is "fastq" or "pairedFastq: # 2*.fastqPath -One or more strings specifying path to fastq files -# 2.annoPath -String specifying path to hairpin annotation table +# 2.annoPath -String specifying path to hairpin annotation table # 3.samplePath -String specifying path to sample annotation table # 4.barStart -Integer specifying starting position of barcode # 5.barEnd -Integer specifying ending position of barcode -# 6.hpStart -Integer specifying startins position of hairpin +# ### +# IF inputType is "pairedFastq": +# 6.barStartRev -Integer specifying starting position of barcode +# on reverse end +# 7.barEndRev -Integer specifying ending position of barcode +# on reverse end +# ### +# 8.hpStart -Integer specifying startins position of hairpin # unique region -# 7.hpEnd -Integer specifying ending position of hairpin +# 9.hpEnd -Integer specifying ending position of hairpin # unique region -# ### # IF inputType is "counts": # 2.countPath -String specifying path to count table # 3.annoPath -String specifying path to hairpin annotation table # 4.samplePath -String specifying path to sample annotation table # ### -# 8.cpmReq -Float specifying cpm requirement -# 9.sampleReq -Integer specifying cpm requirement -# 10.fdrThresh -Float specifying the FDR requirement -# 11.lfcThresh -Float specifying the log-fold-change requirement -# 12.workMode -String specifying exact test or GLM usage -# 13.htmlPath -String specifying path to HTML file -# 14.folderPath -STring specifying path to folder for output +# 10.secFactName -String specifying name of secondary factor +# 11.cpmReq -Float specifying cpm requirement +# 12.sampleReq -Integer specifying cpm requirement +# 13.readReq -Integer specifying read requirement +# 14.fdrThresh -Float specifying the FDR requirement +# 15.lfcThresh -Float specifying the log-fold-change requirement +# 16.workMode -String specifying exact test or GLM usage +# 17.htmlPath -String specifying path to HTML file +# 18.folderPath -String specifying path to folder for output # IF workMode is "classic" (exact test) -# 15.pairData[2] -String specifying first group for exact test -# 16.pairData[1] -String specifying second group for exact test +# 19.pairData[2] -String specifying first group for exact test +# 20.pairData[1] -String specifying second group for exact test # ### # IF workMode is "glm" -# 15.contrastData -String specifying contrasts to be made -# 16.roastOpt -String specifying usage of gene-wise tests -# 17.hairpinReq -String specifying hairpin requirement for gene- +# 19.contrastData -String specifying contrasts to be made +# 20.roastOpt -String specifying usage of gene-wise tests +# 21.hairpinReq -String specifying hairpin requirement for gene- # wise test -# 18.selectOpt -String specifying type of selection for barcode +# 22.selectOpt -String specifying type of selection for barcode # plots -# 19.selectVals -String specifying members selected for barcode +# 23.selectVals -String specifying members selected for barcode # plots # ### # # OUT: Bar Plot of Counts Per Index # Bar Plot of Counts Per Hairpin # MDS Plot +# BCV Plot # Smear Plot # Barcode Plots (If Genewise testing was selected) # Top Expression Table @@ -58,12 +67,12 @@ library(edgeR, quietly=TRUE, warn.conflicts=FALSE) library(limma, quietly=TRUE, warn.conflicts=FALSE) -if (packageVersion("edgeR") < "3.5.27") { - stop("Please update 'edgeR' to version >= 3.5.23 to run this tool") +if (packageVersion("edgeR") < "3.7.17") { + stop("Please update 'edgeR' to version >= 3.7.17 to run this tool") } -if (packageVersion("limma")<"3.19.19") { - message("Update 'limma' to version >= 3.19.19 to see updated barcode graphs") +if (packageVersion("limma")<"3.21.16") { + message("Update 'limma' to version >= 3.21.16 to see updated barcode graphs") } ################################################################################ @@ -171,103 +180,153 @@ # Grabbing arguments from command line argv <- commandArgs(TRUE) -# Remove fastq file paths after collecting from argument vector inputType <- as.character(argv[1]) -if (inputType=="fastq") { +if (inputType == "fastq") { + fastqPath <- as.character(gsub("fastq::", "", argv[grepl("fastq::", argv)], fixed=TRUE)) - argv <- argv[!grepl("fastq::", argv, fixed=TRUE)] + + # Remove fastq paths + argv <- argv[!grepl("fastq::", argv, fixed=TRUE)] + + fastqPathRev <- NULL annoPath <- as.character(argv[2]) samplePath <- as.character(argv[3]) barStart <- as.numeric(argv[4]) barEnd <- as.numeric(argv[5]) - hpStart <- as.numeric(argv[6]) - hpEnd <- as.numeric(argv[7]) -} else if (inputType=="counts") { + barStartRev <- NULL + barStartRev <- NULL + hpStart <- as.numeric(argv[8]) + hpEnd <- as.numeric(argv[9]) +} else if (inputType=="pairedFastq") { + + fastqPath <- as.character(gsub("fastq::", "", argv[grepl("fastq::", argv)], + fixed=TRUE)) + + fastqPathRev <- as.character(gsub("fastqRev::", "", + argv[grepl("fastqRev::", argv)], fixed=TRUE)) + + # Remove fastq paths + argv <- argv[!grepl("fastq::", argv, fixed=TRUE)] + argv <- argv[!grepl("fastqRev::", argv, fixed=TRUE)] + + annoPath <- as.character(argv[2]) + samplePath <- as.character(argv[3]) + barStart <- as.numeric(argv[4]) + barEnd <- as.numeric(argv[5]) + barStartRev <- as.numeric(argv[6]) + barEndRev <- as.numeric(argv[7]) + hpStart <- as.numeric(argv[8]) + hpEnd <- as.numeric(argv[9]) +} else if (inputType == "counts") { countPath <- as.character(argv[2]) annoPath <- as.character(argv[3]) samplePath <- as.character(argv[4]) } - -cpmReq <- as.numeric(argv[8]) -sampleReq <- as.numeric(argv[9]) -fdrThresh <- as.numeric(argv[10]) -lfcThresh <- as.numeric(argv[11]) -workMode <- as.character(argv[12]) -htmlPath <- as.character(argv[13]) -folderPath <- as.character(argv[14]) -if (workMode=="classic") { +secFactName <- as.character(argv[10]) +cpmReq <- as.numeric(argv[11]) +sampleReq <- as.numeric(argv[12]) +readReq <- as.numeric(argv[13]) +fdrThresh <- as.numeric(argv[14]) +lfcThresh <- as.numeric(argv[15]) +selectDirection <- as.character(argv[16]) +workMode <- as.character(argv[17]) +htmlPath <- as.character(argv[18]) +folderPath <- as.character(argv[19]) + +if (workMode == "classic") { pairData <- character() - pairData[2] <- as.character(argv[15]) - pairData[1] <- as.character(argv[16]) -} else if (workMode=="glm") { - contrastData <- as.character(argv[15]) - roastOpt <- as.character(argv[16]) - hairpinReq <- as.numeric(argv[17]) - selectOpt <- as.character(argv[18]) - selectVals <- as.character(argv[19]) + pairData[2] <- as.character(argv[20]) + pairData[1] <- as.character(argv[21]) +} else if (workMode == "glm") { + contrastData <- as.character(argv[20]) + roastOpt <- as.character(argv[21]) + hairpinReq <- as.numeric(argv[22]) + selectOpt <- as.character(argv[23]) + selectVals <- as.character(argv[24]) } # Read in inputs samples <- read.table(samplePath, header=TRUE, sep="\t") + anno <- read.table(annoPath, header=TRUE, sep="\t") -if (inputType=="counts") { + +if (inputType == "counts") { counts <- read.table(countPath, header=TRUE, sep="\t") } ###################### Check inputs for correctness ############################ samples$ID <- make.names(samples$ID) -if (!any(grepl("group", names(samples)))) { +if ( !any(grepl("group", names(samples))) ) { stop("'group' column not specified in sample annotation file") } # Check if grouping variable has been specified -if (any(table(samples$ID)>1)){ +if (secFactName != "none") { + if ( !any(grepl(secFactName, names(samples))) ) { + tempStr <- paste0("Second factor specified as \"", secFactName, "\" but ", + "no such column name found in sample annotation file") + stop(tempStr) + } # Check if specified secondary factor is present +} + + +if ( any(table(samples$ID) > 1) ){ tab <- table(samples$ID) - offenders <- paste(names(tab[tab>1]), collapse=", ") + offenders <- paste(names(tab[tab > 1]), collapse=", ") offenders <- unmake.names(offenders) stop("'ID' column of sample annotation must have unique values, values ", offenders, " are repeated") } # Check that IDs in sample annotation are unique -if (inputType=="fastq") { +if (inputType == "fastq" || inputType == "pairedFastq") { - if (any(table(anno$ID)>1)){ + if ( any(table(anno$ID) > 1) ){ tab <- table(anno$ID) offenders <- paste(names(tab[tab>1]), collapse=", ") stop("'ID' column of hairpin annotation must have unique values, values ", offenders, " are repeated") } # Check that IDs in hairpin annotation are unique -} else if (inputType=="counts") { - if (any(is.na(match(samples$ID, colnames(counts))))) { +} else if (inputType == "counts") { + # The first element of the colnames will be 'ID' and should not match + idFromSample <- samples$ID + idFromTable <- colnames(counts)[-1] + if (any(is.na(match(idFromTable, idFromSample)))) { stop("not all samples have groups specified") } # Check that a group has be specifed for each sample - if (any(table(counts$ID)>1)){ + if ( any(table(counts$ID) > 1) ){ tab <- table(counts$ID) offenders <- paste(names(tab[tab>1]), collapse=", ") stop("'ID' column of count table must have unique values, values ", offenders, " are repeated") } # Check that IDs in count table are unique } -if (workMode=="glm") { +if (workMode == "glm") { if (roastOpt == "yes") { if (is.na(match("Gene", colnames(anno)))) { tempStr <- paste("Gene-wise tests selected but'Gene' column not", - "specified in hairpin annotation file") + "specified in hairpin annotation file") stop(tempStr) } } } +if (secFactName != "none") { + if (workMode != "glm") { + tempStr <- paste("only glm analysis type possible when secondary factor", + "used, please change appropriate option.") + } +} + ################################################################################ # Process arguments -if (workMode=="glm") { - if (roastOpt=="yes") { +if (workMode == "glm") { + if (roastOpt == "yes") { wantRoast <- TRUE } else { wantRoast <- FALSE @@ -299,7 +358,7 @@ smearPng <- makeOut(paste0("smear(", pairData[2], "-", pairData[1],").png")) smearPdf <- makeOut(paste0("smear(", pairData[2], "-", pairData[1],").pdf")) topOut <- makeOut(paste0("toptag(", pairData[2], "-", pairData[1],").tsv")) -} else if (workMode=="glm") { +} else if (workMode == "glm") { smearPng <- character() smearPdf <- character() topOut <- character() @@ -335,8 +394,8 @@ ################################################################################ # Transform gene selection from string into index values for mroast -if (workMode=="glm") { - if (selectOpt=="rank") { +if (workMode == "glm") { + if (selectOpt == "rank") { selectVals <- gsub(" ", "", selectVals, fixed=TRUE) selectVals <- unlist(strsplit(selectVals, ",")) @@ -356,33 +415,45 @@ } } -if (inputType=="fastq") { +if (inputType == "fastq" || inputType == "pairedFastq") { # Use EdgeR hairpin process and capture outputs hpReadout <- capture.output( - data <- processHairpinReads(readfile=fastqPath, - barcodefile=samplePath, - hairpinfile=annoPath, - barcodeStart=barStart, barcodeEnd=barEnd, - hairpinStart=hpStart, hairpinEnd=hpEnd, - verbose=TRUE) + data <- processAmplicons(readfile=fastqPath, readfile2=fastqPathRev, + barcodefile=samplePath, + hairpinfile=annoPath, + barcodeStart=barStart, barcodeEnd=barEnd, + barcodeStartRev=barStartRev, + barcodeEndRev=barEndRev, + hairpinStart=hpStart, hairpinEnd=hpEnd, + verbose=TRUE) ) - + # Remove function output entries that show processing data or is empty hpReadout <- hpReadout[hpReadout!=""] hpReadout <- hpReadout[!grepl("Processing", hpReadout)] hpReadout <- hpReadout[!grepl("in file", hpReadout)] hpReadout <- gsub(" -- ", "", hpReadout, fixed=TRUE) - + # Make the names of groups syntactically valid (replace spaces with periods) data$samples$group <- make.names(data$samples$group) -} else if (inputType=="counts") { + if (secFactName != "none") { + data$samples[[secFactName]] <- make.names(data$samples[[secFactName]]) + } +} else if (inputType == "counts") { # Process counts information, set ID column to be row names rownames(counts) <- counts$ID - counts <- counts[ , !(colnames(counts)=="ID")] + counts <- counts[ , !(colnames(counts) == "ID")] countsRows <- nrow(counts) # Process group information - factors <- samples$group[match(samples$ID, colnames(counts))] + sampleNames <- colnames(counts) + matchedIndex <- match(sampleNames, samples$ID) + factors <- samples$group[matchedIndex] + + if (secFactName != "none") { + secFactors <- samples[[secFactName]][matchedIndex] + } + annoRows <- nrow(anno) anno <- anno[match(rownames(counts), anno$ID), ] annoMatched <- sum(!is.na(anno$ID)) @@ -416,14 +487,48 @@ # Filter hairpins with low counts preFilterCount <- nrow(data) -sel <- rowSums(cpm(data$counts) > cpmReq) >= sampleReq -data <- data[sel, ] +selRow <- rowSums(cpm(data$counts) > cpmReq) >= sampleReq +selCol <- colSums(data$counts) > readReq +data <- data[selRow, selCol] + +# Check if any data survived filtering +if (length(data$counts) == 0) { + stop("no data remains after filtering, consider relaxing filters") +} + +# Count number of filtered tags and samples postFilterCount <- nrow(data) -filteredCount <- preFilterCount-postFilterCount +filteredCount <- preFilterCount - postFilterCount +sampleFilterCount <- sum(!selCol) + +if (secFactName == "none") { + # Estimate dispersions + data <- estimateDisp(data) + commonBCV <- round(sqrt(data$common.dispersion), 4) +} else { + # Construct design + if (inputType == "counts") { + + sampleNames <- colnames(counts) + matchedIndex <- match(sampleNames, samples$ID) + factors <- factor(make.names(samples$group[matchedIndex])) -# Estimate dispersions -data <- estimateDisp(data) -commonBCV <- sqrt(data$common.dispersion) + secFactors <- factor(make.names(samples[[secFactName]][matchedIndex])) + + } else if (inputType == "fastq" || inputType == "pairedFastq") { + + factors <- factor(data$sample$group) + secFactors <- factor(data$sample[[secFactName]]) + + } + + design <- model.matrix(~0 + factors + secFactors) + + # Estimate dispersions + data <- estimateDisp(data, design=design) + commonBCV <- round(sqrt(data$common.dispersion), 4) +} + ################################################################################ ### Output Processing @@ -494,14 +599,23 @@ linkData <- rbind(linkData, newEntry) invisible(dev.off()) -if (workMode=="classic") { +if (workMode == "classic") { # Assess differential representation using classic exact testing methodology # in edgeR testData <- exactTest(data, pair=pairData) top <- topTags(testData, n=Inf) + + if (selectDirection == "all") { + topIDs <- top$table[(top$table$FDR < fdrThresh) & + (abs(top$table$logFC) > lfcThresh), 1] + } else if (selectDirection == "up") { + topIDs <- top$table[(top$table$FDR < fdrThresh) & + (top$table$logFC > lfcThresh), 1] + } else if (selectDirection == "down") { topIDs <- top$table[(top$table$FDR < fdrThresh) & - (abs(top$table$logFC) > lfcThresh), 1] + (top$table$logFC < -lfcThresh), 1] +} write.table(top, file=topOut, row.names=FALSE, sep="\t") @@ -511,8 +625,10 @@ linkData <- rbind(linkData, c(linkName, linkAddr)) upCount[1] <- sum(top$table$FDR < fdrThresh & top$table$logFC > lfcThresh) + downCount[1] <- sum(top$table$FDR < fdrThresh & top$table$logFC < -lfcThresh) + flatCount[1] <- sum(top$table$FDR > fdrThresh | abs(top$table$logFC) < lfcThresh) @@ -543,12 +659,40 @@ linkData <- rbind(linkData, c(imgName, imgAddr)) invisible(dev.off()) -} else if (workMode=="glm") { +} else if (workMode == "glm") { # Generating design information - factors <- factor(data$sample$group) - design <- model.matrix(~0+factors) + if (secFactName == "none") { + + factors <- factor(data$sample$group) + design <- model.matrix(~0 + factors) + + colnames(design) <- gsub("factors", "", colnames(design), fixed=TRUE) + + } else { + + factors <- factor(data$sample$group) + + if (inputType == "counts") { + + sampleNames <- colnames(counts) + matchedIndex <- match(sampleNames, samples$ID) + factors <- factor(samples$group[matchedIndex]) + + secFactors <- factor(samples[[secFactName]][matchedIndex]) + + } else if (inputType == "fastq" || inputType == "pairedFastq") { + + secFactors <- factor(data$sample[[secFactName]]) + + } + + design <- model.matrix(~0 + factors + secFactors) + + colnames(design) <- gsub("factors", "", colnames(design), fixed=TRUE) + colnames(design) <- gsub("secFactors", secFactName, colnames(design), + fixed=TRUE) + } - colnames(design) <- gsub("factors", "", colnames(design), fixed=TRUE) # Split up contrasts seperated by comma into a vector contrastData <- unlist(strsplit(contrastData, split=",")) @@ -564,8 +708,18 @@ # Select hairpins with FDR < 0.05 to highlight on plot top <- topTags(testData, n=Inf) - topIDs <- top$table[(top$table$FDR < fdrThresh) & + + if (selectDirection == "all") { + topIDs <- top$table[(top$table$FDR < fdrThresh) & (abs(top$table$logFC) > lfcThresh), 1] + } else if (selectDirection == "up") { + topIDs <- top$table[(top$table$FDR < fdrThresh) & + (top$table$logFC > lfcThresh), 1] + } else if (selectDirection == "down") { + topIDs <- top$table[(top$table$FDR < fdrThresh) & + (top$table$logFC < -lfcThresh), 1] + } + write.table(top, file=topOut[i], row.names=FALSE, sep="\t") linkName <- paste0("Top Tags Table(", contrastData[i], ") (.tsv)") @@ -607,8 +761,8 @@ unq <- unq[!is.na(unq)] geneList <- list() for (gene in unq) { - if (length(which(genes==gene)) >= hairpinReq) { - geneList[[gene]] <- which(genes==gene) + if (length(which(genes == gene)) >= hairpinReq) { + geneList[[gene]] <- which(genes == gene) } } @@ -624,7 +778,7 @@ ") (.tsv)") linkAddr <- paste0("gene_level(", contrastData[i], ").tsv") linkData <- rbind(linkData, c(linkName, linkAddr)) - if (selectOpt=="rank") { + if (selectOpt == "rank") { selectedGenes <- rownames(roastData)[selectVals] } else { selectedGenes <- selectVals @@ -668,9 +822,13 @@ # Generate data frame of the significant differences sigDiff <- data.frame(Up=upCount, Flat=flatCount, Down=downCount) if (workMode == "glm") { + row.names(sigDiff) <- contrastData + } else if (workMode == "classic") { + row.names(sigDiff) <- paste0(pairData[2], "-", pairData[1]) + } # Output table of summarised counts @@ -702,7 +860,8 @@ cata("<body>\n") cata("<h3>EdgeR Analysis Output:</h3>\n") cata("<h4>Input Summary:</h4>\n") -if (inputType=="fastq") { +if (inputType == "fastq" || inputType == "pairedFastq") { + cata("<ul>\n") ListItem(hpReadout[1]) ListItem(hpReadout[2]) @@ -716,31 +875,51 @@ cata("<br />\n") cata("<b>Please check that read percentages are consistent with ") cata("expectations.</b><br >\n") -} else if (inputType=="counts") { + +} else if (inputType == "counts") { + cata("<ul>\n") ListItem("Number of Samples: ", ncol(data$counts)) ListItem("Number of Hairpins: ", countsRows) ListItem("Number of annotations provided: ", annoRows) ListItem("Number of annotations matched to hairpin: ", annoMatched) cata("</ul>\n") + } cata("The estimated common biological coefficient of variation (BCV) is: ", commonBCV, "<br />\n") +if (secFactName == "none") { + + cata("No secondary factor specified.<br />\n") + +} else { + + cata("Secondary factor specified as: ", secFactName, "<br />\n") + +} + cata("<h4>Output:</h4>\n") cata("PDF copies of JPEGS available in 'Plots' section.<br />\n") for (i in 1:nrow(imageData)) { if (grepl("barcode", imageData$Link[i])) { + if (packageVersion("limma")<"3.19.19") { + HtmlImage(imageData$Link[i], imageData$Label[i], height=length(selectedGenes)*150) + } else { + HtmlImage(imageData$Link[i], imageData$Label[i], height=length(selectedGenes)*300) + } } else { + HtmlImage(imageData$Link[i], imageData$Label[i]) + } } cata("<br />\n") @@ -779,26 +958,42 @@ } cata("<p>Alt-click links to download file.</p>\n") -cata("<p>Click floppy disc icon associated history item to download ") +cata("<p>Click floppy disc icon on associated history item to download ") cata("all files.</p>\n") cata("<p>.tsv files can be viewed in Excel or any spreadsheet program.</p>\n") cata("<h4>Additional Information:</h4>\n") if (inputType == "fastq") { + ListItem("Data was gathered from fastq raw read file(s).") + } else if (inputType == "counts") { + ListItem("Data was gathered from a table of counts.") + } -if (cpmReq!=0 && sampleReq!=0) { - tempStr <- paste("Hairpins without more than", cpmReq, +if (cpmReq != 0 && sampleReq != 0) { + tempStr <- paste("Target sequences without more than", cpmReq, "CPM in at least", sampleReq, "samples are insignificant", "and filtered out.") ListItem(tempStr) + filterProp <- round(filteredCount/preFilterCount*100, digits=2) tempStr <- paste0(filteredCount, " of ", preFilterCount," (", filterProp, - "%) hairpins were filtered out for low count-per-million.") + "%) target sequences were filtered out for low ", + "count-per-million.") + ListItem(tempStr) +} + +if (sampleReq != 0) { + tempStr <- paste("Samples that did not produce more than", sampleReq, + "counts were filtered out.") + ListItem(tempStr) + + tempStr <- paste0(sampleFilterCount, " samples were filtered out for low ", + "counts.") ListItem(tempStr) } @@ -809,9 +1004,9 @@ } if (workMode == "classic") { - ListItem("An exact test was performed on each hairpin.") + ListItem("An exact test was performed on each target sequence.") } else if (workMode == "glm") { - ListItem("A generalised linear model was fitted to each hairpin.") + ListItem("A generalised linear model was fitted to each target sequence.") } cit <- character()
--- a/hairpinTool.xml Wed Oct 01 16:00:43 2014 +1000 +++ b/hairpinTool.xml Tue Oct 14 17:05:07 2014 +1100 @@ -1,12 +1,13 @@ -<tool id="shRNAseq" name="shRNAseq Tool" version="1.0.13"> +<tool id="shRNAseq" name="shRNAseq Tool" version="1.2.0"> <description> - Analyse hairpin differential representation using edgeR + Analyse differential representation for shRNAseq and sgRNA based procedures + using edgeR package from Bioconductor. </description> <requirements> - <requirement type="R-module" version="3.6.2">edgeR</requirement> - <requirement type="R-module" version="3.20.7">limma</requirement> - <requirement type="package" version="3.0.3">R_3_0_3</requirement> + <requirement type="R-module" version="3.7.17">edgeR</requirement> + <requirement type="R-module" version="3.21.16">limma</requirement> + <requirement type="package" version="3.1.1">R_3_0_3</requirement> </requirements> <stdio> @@ -14,43 +15,90 @@ </stdio> <command interpreter="Rscript"> - hairpinTool.R $inputOpt.inputType + ampliconTool.R $inputOpt.inputType #if $inputOpt.inputType=="fastq": + #for $i, $fas in enumerate($inputOpt.fastq): fastq::$fas.file #end for $inputOpt.hairpin $inputOpt.samples + + #if $inputOpt.positions.posOption=="yes": + $inputOpt.positions.barstart + $inputOpt.positions.barend + 0 + 0 + $inputOpt.positions.hpstart + $inputOpt.positions.hpend + #else: + 1 + 5 + 0 + 0 + 37 + 57 + #end if + #elif $inputOpt.inputType=="pairedFastq": + + #for $i, $fas in enumerate($inputOpt.fastq): + fastq::$fas.file + #end for + + #for $i, $fas in enumerate($inputOpt.fastq): + fastqRev::$fas.fileRev + #end for + + $inputOpt.hairpin + $inputOpt.samples #if $inputOpt.positions.posOption=="yes": $inputOpt.positions.barstart $inputOpt.positions.barend + $inputOpt.positions.barstartRev + $inputOpt.positions.barendRev $inputOpt.positions.hpstart $inputOpt.positions.hpend #else: 1 5 + 0 + 0 37 57 #end if - #else: + + #elif $inputOpt.inputType=="counts": $inputOpt.counts $inputOpt.hairpin $inputOpt.samples - 0 0 0 + 0 + 0 + 0 + 0 + 0 #end if - + + #if $inputOpt.secondaryFactor.secFactorOpt=="yes": + $inputOpt.secondaryFactor.secFactName + #else: + "none" + #end if + #if $filterCPM.filtOption=="yes": $filterCPM.cpmReq $filterCPM.sampleReq + $filterCPM.readReq #else: -Inf -Inf + -Inf #end if $fdr $lfc + $direction $workMode.mode $outFile $outFile.files_path @@ -61,6 +109,7 @@ #elif $workMode.mode=="glm": "$workMode.contrast" $workMode.roast.roastOption + #if $workMode.roast.roastOption=="yes": $workMode.roast.hairpinReq $workMode.roast.select.selOption @@ -70,20 +119,22 @@ 0 0 #end if + #end if </command> <inputs> <conditional name="inputOpt"> + <param name="inputType" type="select" label="Input File Type"> <option value="fastq">FastQ File</option> + <option value="pairedFastq">Paired FastQ File</option> <option value="counts">Table of Counts</option> </param> - + <when value="fastq"> <param name="hairpin" type="data" format="tabular" - label="Hairpin Annotation"/> - + label="Target Annotation"/> <param name="samples" type="data" format="tabular" label="Sample Annotation"/> @@ -91,47 +142,171 @@ <repeat name="fastq" title="FastQ Files"> <param name="file" type="data" format="fastq"/> </repeat> + + <conditional name="secondaryFactor"> + <param name="secFactorOpt" type="select" + label="Include Secondary Factor"> + + <option value="no" selected="True">No</option> + + <option value="yes">Yes</option> + + </param> + + <when value="yes"> + + <param name="secFactName" type="text" label="Secondary Factor Name" + size="80"/> + + </when> + + <when value="no"> + </when> + </conditional> + <conditional name="positions"> <param name="posOption" type="select" - label="Specify Barcode and Hairpin Locations?" - help="Default Positions: Barcode: 1 to 5, Hairpin: 37 to 57."> + label="Specify Sample Index and Target Sequence Locations?" + help="Default Positions: Index: 1 to 5, Target: 37 to 57."> <option value="no" selected="True">No</option> <option value="yes">Yes</option> </param> <when value="yes"> <param name="barstart" type="integer" value="1" - label="Barcode Starting Position"/> + label="Index Starting Position"/> <param name="barend" type="integer" value="5" - label="Barcode Ending Position"/> + label="Index Ending Position"/> <param name="hpstart" type="integer" value="37" - label="Hairpin Starting Position"/> + label="Target Starting Position"/> <param name="hpend" type="integer" value="57" - label="Hairpin Ending Position"/> + label="Target Ending Position"/> </when> <when value="no"/> </conditional> </when> - + + <when value="pairedFastq"> + <param name="hairpin" type="data" format="tabular" + label="Target Sequence Annotation"/> + + <param name="samples" type="data" format="tabular" + label="Sample Annotation"/> + + <repeat name="fastq" title="FastQ Files"> + <param name="file" type="data" format="fastq"/> + <param name="fileRev" type="data" format="fastq"/> + </repeat> + + <conditional name="secondaryFactor"> + + <param name="secFactorOpt" type="select" + label="Include Secondary Factor"> + + <option value="no" selected="True">No</option> + + <option value="yes">Yes</option> + + </param> + + <when value="yes"> + + <param name="secFactName" type="text" label="Secondary Factor Name" + size="80"/> + + </when> + + <when value="no"> + </when> + </conditional> + + <conditional name="positions"> + + <param name="posOption" type="select" + label="Specify Sample Index and Target Sequence Locations?" + help="Default Positions: Index: 1 to 5, Input required for + reverse end, Target: 37 to 57."> + + <option value="no" selected="True">No</option> + + <option value="yes">Yes</option> + + </param> + + <when value="yes"> + <param name="barstart" type="integer" value="1" + label="Index Starting Position"/> + + <param name="barend" type="integer" value="5" + label="Index Ending Position"/> + + <param name="barstartRev" type="integer" value="0" + label="Reverse Index Starting Position"/> + + <param name="barendRev" type="integer" value="0" + label="Reverse Index Ending Position"/> + + <param name="hpstart" type="integer" value="37" + label="Target Starting Position"/> + + <param name="hpend" type="integer" value="57" + label="Target Ending Position"/> + </when> + + <when value="no"> + </when> + + </conditional> + + </when> + <when value="counts"> + <param name="counts" type="data" format="tabular" label="Counts Table"/> + <param name="hairpin" type="data" format="tabular" - label="Hairpin Annotation"/> + label="Target Sequence Annotation"/> + <param name="samples" type="data" format="tabular" label="Sample Annotation"/> + + <conditional name="secondaryFactor"> + + <param name="secFactorOpt" type="select" + label="Include Secondary Factor"> + + <option value="no" selected="True">No</option> + + <option value="yes">Yes</option> + + </param> + + <when value="yes"> + + <param name="secFactName" type="text" label="Secondary Factor Name" + size="80"/> + + </when> + + <when value="no"> + </when> + + </conditional> + </when> + </conditional> <conditional name="filterCPM"> <param name="filtOption" type="select" label="Filter Low CPM?" - help="Ignore hairpins with very low representation when performing - analysis."> + help="Ignore target sequences with very low representation when + performing analysis."> <option value="yes">Yes</option> - <option value="no">No</option> + <option value="no">No</option> </param> <when value="yes"> @@ -142,6 +317,12 @@ label="Minimum Samples" help="Filter out all the genes that do not meet the minimum CPM in at least this many samples."/> + + <param name="readReq" type="integer" value="1000" min="0" + label="Minimum Reads" + help="Filter out all samples that do not have the minimum + number of reads."/> + </when> <when value="no"/> @@ -175,17 +356,18 @@ <conditional name="roast"> <param name="roastOption" type="select" label="Perform Gene Level Analysis?" - help="Analyse LogFC tendencies for hairpins belonging - to the same gene."> + help="Analyse LogFC tendencies for target sequences belonging + to the same gene. NOTE: this is a slow procedure that + scales badly with the number of genes analysed."> <option value="no">No</option> <option value="yes">Yes</option> </param> <when value="yes"> <param name="hairpinReq" type="integer" value="2" min="2" - label="Minimum Hairpins" - help="Only genes with at least this many hairpins will - be analysed."/> + label="Minimum Targets Found" + help="Only genes with at least this many target sequences + found will be analysed."/> <conditional name="select"> <param name="selOption" type="select" @@ -223,25 +405,33 @@ label="FDR Threshold" help="All observations below this threshold will be highlighted in the smear plot."/> + <param name="lfc" type="float" value="0" min="0" label="Absolute LogFC Threshold" help="In additional to meeting the FDR requirement, the absolute value of the log-fold-change of the observation must be above this threshold to be highlighted."/> + + <param name="direction" type="select" label="Highlight Option" + help="Only hightlight positive or negative fold changes in smear plot?"> + <option value="all">Default</option> + <option value="up">Positive Only</option> + <option value="down">Negative Only</option> + </param> </inputs> <outputs> - <data format="html" name="outFile" label="shRNAseq Analysis"/> + <data format="html" name="outFile" label="TagSeq Analysis"/> </outputs> <help> .. class:: infomark **What it does** -Given tables containing information about the hairpins and their associated -barcodes, information about the samples and fastq file containing the hairpin -reads. This tool will generate plots and tables for the analysis of differential -representation. +Given tables containing information about the hairpins/sgRNA and their +associated sample indices, information about the samples and fastq file +containing the sequencing reads. This tool will generate plots and tables for +the analysis of differential representation. .. class:: infomark @@ -257,14 +447,15 @@ **Input File Type:** This tool is able to either generate counts from a raw FastQ file given the -information regarding the samples and hairpins. Alternatively if a table of -counts has already been generated it can also be used. +information regarding the samples and hairpins/sgRNA. Alternatively if a table +of counts has already been generated it can also be used. **Counts Table (Counts Input):** -A tab delimited text table of information regarding the counts of hairpins. -Should have a column 'ID' to denote the hairpins that counts correspond to. Each -additional column should have titles corresponding to the label for the sample. +A tab delimited text table of information regarding the counts of +hairpins/sgRNA. Should have a column 'ID' to denote the hairpins/sgRNA that +counts correspond to. Each additional column should have titles corresponding to +the label for the sample. Example:: @@ -281,62 +472,80 @@ Hairpin7 49501 49076 47611 ... -**Hairpin Annotation:** +**Target Sequence Annotation:** -A tab delimited text table of information regarding the hairpins. Should have -columns 'ID', 'Sequences' and 'Gene' to uniquely identify the hairpin, align it -with the reads to produce counts and identify which gene the hairpin acts on. +A tab delimited text table of information regarding the targetted +hairpins/sgRNA sequence. Should have columns 'ID', 'Sequences' and 'Gene' to +uniquely identify the target, align it with the reads to produce counts and +identify which gene the target acts on. NOTE: the column names are case sensitive and should be input exactly as they are shown here. Example:: - ID Sequences Gene - Control1 TCTCGCTTGGGCGAGAGTAAG 2 - Control2 CCGCCTGAAGTCTCTGATTAA 2 - Control3 AGGAATTATAATGCTTATCTA 2 - Hairpin1 AAGGCAGAGACTGACCACCTA 4 - Hairpin2 GAGCGACCTGGTGTTACTCTA 4 - Hairpin3 ATGGTGTAAATAGAGCTGTTA 4 - Hairpin4 CAGCTCATCTTCTGTGAAGAA 4 - Hairpin5 CAGCTCTGTGGGTCAGAAGAA 4 - Hairpin6 CCAGGCACAGATCTCAAGATA 4 - Hairpin7 ATGACAAGAAAGACATCTCAA 7 + ID Sequences Gene + Control1 TCTCGCTTGGGCGAGAGTAAG 2 + Control2 CCGCCTGAAGTCTCTGATTAA 2 + Control3 AGGAATTATAATGCTTATCTA 2 + Hairpin1 AAGGCAGAGACTGACCACCTA 4 + Hairpin2 GAGCGACCTGGTGTTACTCTA 4 + Hairpin3 ATGGTGTAAATAGAGCTGTTA 4 + Hairpin4 CAGCTCATCTTCTGTGAAGAA 4 + Hairpin5 CAGCTCTGTGGGTCAGAAGAA 4 + Hairpin6 CCAGGCACAGATCTCAAGATA 4 + Hairpin7 ATGACAAGAAAGACATCTCAA 7 ... **Sample Annotation (FastQ Input):** A tab delimited text table of information regarding the samples. Should have columns 'ID', 'Sequences' and 'group' to uniquely identify each sample, identify -the sample in the reads by its barcode sequence and correctly group replicates -for analysis. Additional columns may inserted for annotation purposes and will -not interfere with analysis as long as the necessary columns are present. +the sample in the reads by its sample index sequence and correctly group +replicates for analysis. Additional columns may inserted for annotation purposes +and will not interfere with analysis as long as the necessary columns are +present. -NOTE: the column names are case sensitive and should be input exactly as they -are shown here. +NOTE: With the exception of other_group, column names are case sensitive and +should be input exactly as they are shown here. The other_group column can be +named by the user and specified in the "Include Secondary Factor" option of the +tool. Example:: - ID Sequences group Replicate - 3 GAAAG Day 2 1 - 6 GAACC Day 10 1 - 9 GAAGA Day 5 GFP neg 1 - 16 GAATT Day 5 GFP pos 1 - 18 GACAC Day 2 2 - 21 GACCA Day 10 2 - 28 GACGT Day 5 GFP neg 2 - 31 GACTG Day 5 GFP pos 2 - 33 GAGAA Day 2 3 - 40 GAGCT Day 10 3 + ID Sequences group other_group Replicate + 3 GAAAG Day 2 male 1 + 6 GAACC Day 10 female 1 + 9 GAAGA Day 5 GFP neg male 1 + 16 GAATT Day 5 GFP pos male 1 + 18 GACAC Day 2 female 2 + 21 GACCA Day 10 male 2 + 28 GACGT Day 5 GFP neg male 2 + 31 GACTG Day 5 GFP pos female 2 + 33 GAGAA Day 2 male 3 + 40 GAGCT Day 10 female 3 ... -**Specify Barcode and Hairpin Locations (FastQ Input):** +**Include Secondary Factor** +If there are two factors involved in the experiment (i.e. Age and Gender) then +then secondary factor should be included to improve the statistical analysis. +The secondary factor should be specified as a column in the sample annotation +file and the corresponding column name should be input exactly as it is into +the provided field in the tool. + +NOTE: Currently the secondary factor is used only to improve statistical +analysis, comparisons can only be made in the primary factor specified as +"group" in the sample annotation. + +**Specify Sample Index and Target Sequence Locations (FastQ Input):** It is assumed that in the sequencing reads that the first 5 bases are the -barcodes and that bases 37-57 are the hairpins. If this is not the case then the -values of the positions can be changed, however it still requires the barcodes -and hairpins to be in a consistent location an in a continuous sequence. +sample index sequence and that bases 37-57 are the hairpins/sgRNA. If this is +not the case then the values of the positions can be changed, however it still +requires the sample indices and hairpins/sgRNA to be in a consistent location an +in a continuous sequence. + +NOTE: position values start at 1 for the first base. **Filter Low CPM?:** @@ -346,21 +555,21 @@ **Analysis Type:** - * **Classic Exact Test:** This allows two experimental groups to be compared and - p-values for differential representation derivec for each hairpin. Simple and - fast for straightforward comparisons. In this option you will have the option of - "*Compare* x *To* y" which implicitly subtracts the data from y from that of x - to produce the comparison. + * **Classic Exact Test:** This allows two experimental groups to be compared + and p-values for differential representation derivec for each target + sequence. Simple and fast for straightforward comparisons. In this option you + will have the option of "*Compare* x *To* y" which implicitly subtracts the + data from y from that of x to produce the comparison. - * **Generalised Linear Model:** This allow for complex contrasts to be specified - and also gene level analysis to be performed. If this option is chosen then - contrasts must be explicitly stated in equations and multiple contrasts can be - made. In addition there will be the option to analyse hairpins on a per-gene - basis to see if hairpins belonging to a particular gene have any overall - tendencies for the direction of their log-fold-change. + * **Generalised Linear Model:** This allow for complex contrasts to be specified + and also gene level analysis to be performed. If this option is chosen then + contrasts must be explicitly stated in equations and multiple contrasts can + be made. In addition there will be the option to analyse hairpins/sgRNA on a + per-gene basis to see if hairpins/sgRNA belonging to a particular gene have + any overall tendencies for the direction of their log-fold-change. **FDR Threshold:** -The smear plot in the output will have hairpins highlighted to signify +The smear plot in the output will have hairpins/sgRNA highlighted to signify significant differential representation. The significance is determined by contorlling the false discovery rate, only those with a FDR lower than the threshold will be highlighted in the plot. @@ -379,10 +588,10 @@ using. The methodology articles are listed in Section 2.1 of the limma User's Guide. - * Smyth, GK (2005). Limma: linear models for microarray data. In: - 'Bioinformatics and Computational Biology Solutions using R and - Bioconductor'. R. Gentleman, V. Carey, S. Dudoit, R. Irizarry, - W. Huber (eds), Springer, New York, pages 397-420. + * Smyth, GK (2005). Limma: linear models for microarray data. In: + 'Bioinformatics and Computational Biology Solutions using R and + Bioconductor'. R. Gentleman, V. Carey, S. Dudoit, R. Irizarry, + W. Huber (eds), Springer, New York, pages 397-420. .. class:: infomark @@ -392,25 +601,24 @@ the various original statistical methods implemented in edgeR. See Section 1.2 in the User's Guide for more detail. - * Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor - package for differential expression analysis of digital gene expression - data. Bioinformatics 26, 139-140 - - * Robinson MD and Smyth GK (2007). Moderated statistical tests for assessing - differences in tag abundance. Bioinformatics 23, 2881-2887 - - * Robinson MD and Smyth GK (2008). Small-sample estimation of negative - binomial dispersion, with applications to SAGE data. - Biostatistics, 9, 321-332 - - * McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis - of multifactor RNA-Seq experiments with respect to biological variation. - Nucleic Acids Research 40, 4288-4297 - + * Robinson MD, McCarthy DJ and Smyth GK (2010). edgeR: a Bioconductor + package for differential expression analysis of digital gene expression + data. Bioinformatics 26, 139-140 + + * Robinson MD and Smyth GK (2007). Moderated statistical tests for assessing + differences in tag abundance. Bioinformatics 23, 2881-2887 + + * Robinson MD and Smyth GK (2008). Small-sample estimation of negative + binomial dispersion, with applications to SAGE data. + Biostatistics, 9, 321-332 + + * McCarthy DJ, Chen Y and Smyth GK (2012). Differential expression analysis + of multifactor RNA-Seq experiments with respect to biological variation. + Nucleic Acids Research 40, 4288-4297 + Report problems to: su.s@wehi.edu.au .. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html .. _limma: http://www.bioconductor.org/packages/release/bioc/html/limma.html </help> </tool> -