Mercurial > repos > iuc > deseq2
changeset 17:d9e5cadc7f0b draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/deseq2 commit b95582cea8320d5488056a9576474f79cec53be8
author | iuc |
---|---|
date | Wed, 05 Sep 2018 15:54:03 -0400 |
parents | a416957ee305 |
children | 3bf1b3ec1ddf |
files | deseq2.R deseq2.xml get_deseq_dataset.R test-data/batch_factors.tab test-data/out_deseq2_sailfish.tab |
diffstat | 5 files changed, 139 insertions(+), 62 deletions(-) [+] |
line wrap: on
line diff
--- a/deseq2.R Fri Aug 03 17:23:46 2018 -0400 +++ b/deseq2.R Wed Sep 05 15:54:03 2018 -0400 @@ -46,6 +46,7 @@ spec <- matrix(c( "quiet", "q", 0, "logical", "help", "h", 0, "logical", + "batch_factors", "", 1, "character", "outfile", "o", 1, "character", "countsfile", "n", 1, "character", "header", "H", 0, "logical", @@ -87,24 +88,13 @@ FALSE } -if (!is.null(opt$header)) { - hasHeader <- TRUE -} else { - hasHeader <- FALSE +source_local <- function(fname){ + argv <- commandArgs(trailingOnly = FALSE) + base_dir <- dirname(substring(argv[grep("--file=", argv)], 8)) + source(paste(base_dir, fname, sep="/")) } -if (!is.null(opt$tximport)) { - if (is.null(opt$tx2gene)) stop("A transcript-to-gene map or a GTF file is required for tximport") - if (tolower(file_ext(opt$tx2gene)) == "gtf") { - gtfFile <- opt$tx2gene - } else { - gtfFile <- NULL - tx2gene <- read.table(opt$tx2gene, header=FALSE) - } - useTXI <- TRUE -} else { - useTXI <- FALSE -} +source_local('get_deseq_dataset.R') suppressPackageStartupMessages({ library("DESeq2") @@ -197,53 +187,34 @@ cat("\n---------------------\n") } -# For JSON input from Galaxy, path is absolute -dir <- "" - -if (!useTXI & hasHeader) { - countfiles <- lapply(as.character(sampleTable$filename), function(x){read.delim(x, row.names=1)}) - tbl <- do.call("cbind", countfiles) - colnames(tbl) <- rownames(sampleTable) # take sample ids from header - - # check for htseq report lines (from DESeqDataSetFromHTSeqCount function) - oldSpecialNames <- c("no_feature", "ambiguous", "too_low_aQual", - "not_aligned", "alignment_not_unique") - specialRows <- (substr(rownames(tbl), 1, 1) == "_") | rownames(tbl) %in% oldSpecialNames - tbl <- tbl[!specialRows, , drop = FALSE] - - dds <- DESeqDataSetFromMatrix(countData = tbl, - colData = sampleTable[,-c(1:2), drop=FALSE], - design = designFormula) -} else if (!useTXI & !hasHeader) { +dds <- get_deseq_dataset(sampleTable, header=opt$header, designFormula=designFormula, tximport=opt$tximport, txtype=opt$txtype, tx2gene=opt$tx2gene) - # construct the object from HTSeq files - dds <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable, - directory = dir, - design = designFormula) - labs <- unname(unlist(filenames_to_labels[colnames(dds)])) - colnames(dds) <- labs +apply_batch_factors <- function (dds, batch_factors) { + rownames(batch_factors) <- batch_factors$identifier + batch_factors <- subset(batch_factors, select = -c(identifier, condition)) + dds_samples <- colnames(dds) + batch_samples <- rownames(batch_factors) + if (!setequal(batch_samples, dds_samples)) { + stop("Batch factor names don't correspond to input sample names, check input files") + } + dds_data <- colData(dds) + # Merge dds_data with batch_factors using indexes, which are sample names + # Set sort to False, which maintains the order in dds_data + reordered_batch <- merge(dds_data, batch_factors, by.x = 0, by.y = 0, sort=F) + batch_factors <- reordered_batch[, ncol(dds_data):ncol(reordered_batch)] + for (factor in colnames(batch_factors)) { + dds[[factor]] <- batch_factors[[factor]] + } + colnames(dds) <- reordered_batch[,1] + return(dds) +} -} else { - # construct the object using tximport - # first need to make the tx2gene table - # this takes ~2-3 minutes using Bioconductor functions - if (!is.null(gtfFile)) { - suppressPackageStartupMessages({ - library("GenomicFeatures") - }) - txdb <- makeTxDbFromGFF(gtfFile, format="gtf") - k <- keys(txdb, keytype = "GENEID") - df <- select(txdb, keys = k, keytype = "GENEID", columns = "TXNAME") - tx2gene <- df[, 2:1] # tx ID, then gene ID - } - library("tximport") - txiFiles <- as.character(sampleTable[,2]) - labs <- unname(unlist(filenames_to_labels[sampleTable[,1]])) - names(txiFiles) <- labs - txi <- tximport(txiFiles, type=opt$txtype, tx2gene=tx2gene) - dds <- DESeqDataSetFromTximport(txi, - sampleTable[,3:ncol(sampleTable),drop=FALSE], - designFormula) +if (!is.null(opt$batch_factors)) { + batch_factors <- read.table(opt$batch_factors, sep="\t", header=T) + dds <- apply_batch_factors(dds = dds, batch_factors = batch_factors) + batch_design <- colnames(batch_factors)[-c(1,2)] + designFormula <- as.formula(paste("~", paste(c(batch_design, rev(factors)), collapse=" + "))) + design(dds) <- designFormula } if (verbose) {
--- a/deseq2.xml Fri Aug 03 17:23:46 2018 -0400 +++ b/deseq2.xml Wed Sep 05 15:54:03 2018 -0400 @@ -64,6 +64,9 @@ -f '#echo json.dumps(temp_factor_names)#' -l '#echo json.dumps(filename_to_element_identifiers)#' -t $fit_type + #if $batch_factors + --batch_factors '$batch_factors' + #end if #if $outlier_replace_off: -a #end if @@ -105,7 +108,7 @@ <param name="countsFile" type="data" format="tabular" multiple="true" label="Counts file(s)"/> </repeat> </repeat> - + <param name="batch_factors" type="data" format="tabular" optional="true" label="(Optional) provide a tabular file with additional batch factors to include in the model." help="You can produce this file using RUVSeq or svaseq."/> <param name="header" type="boolean" truevalue="-H" falsevalue="" checked="true" label="Files have header?" help="If this option is set to Yes, the tool will assume that the count files have column headers in the first row. Default: Yes" /> <conditional name="tximport"> @@ -206,6 +209,28 @@ </assert_contents> </output> </test> + <!--Ensure additional batch factor correction works --> + <test expect_num_outputs="2"> + <repeat name="rep_factorName"> + <param name="factorName" value="Treatment"/> + <repeat name="rep_factorLevel"> + <param name="factorLevel" value="Treated"/> + <param name="countsFile" value="GSM461179_treat_single.counts,GSM461180_treat_paired.counts,GSM461181_treat_paired.counts"/> + </repeat> + <repeat name="rep_factorLevel"> + <param name="factorLevel" value="Untreated"/> + <param name="countsFile" value="GSM461176_untreat_single.counts,GSM461177_untreat_paired.counts,GSM461178_untreat_paired.counts,GSM461182_untreat_single.counts"/> + </repeat> + </repeat> + <param name="batch_factors" value="batch_factors.tab"/> + <param name="pdf" value="False"/> + <param name="normCounts" value="True"/> + <output name="deseq_out"> + <assert_contents> + <has_text_matching expression="FBgn0003360\t1933.*\t-2.9.*\t0.1.*\t-26.*\t1.*-152\t4.*-149" /> + </assert_contents> + </output> + </test> <!--Ensure counts files without header works --> <test expect_num_outputs="2"> <repeat name="rep_factorName">
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/get_deseq_dataset.R Wed Sep 05 15:54:03 2018 -0400 @@ -0,0 +1,69 @@ +get_deseq_dataset <- function(sampleTable, header, designFormula, tximport, txtype, tx2gene) { + + dir <- "" + + if (!is.null(header)) { + hasHeader <- TRUE + } else { + hasHeader <- FALSE + } + + if (!is.null(tximport)) { + if (is.null(tx2gene)) stop("A transcript-to-gene map or a GTF file is required for tximport") + if (tolower(file_ext(opt$tx2gene)) == "gtf") { + gtfFile <-tx2gene + } else { + gtfFile <- NULL + tx2gene <- read.table(tx2gene, header=FALSE) + } + useTXI <- TRUE + } else { + useTXI <- FALSE + } + + if (!useTXI & hasHeader) { + countfiles <- lapply(as.character(sampleTable$filename), function(x){read.delim(x, row.names=1)}) + tbl <- do.call("cbind", countfiles) + colnames(tbl) <- rownames(sampleTable) # take sample ids from header + + # check for htseq report lines (from DESeqDataSetFromHTSeqCount function) + oldSpecialNames <- c("no_feature", "ambiguous", "too_low_aQual", + "not_aligned", "alignment_not_unique") + specialRows <- (substr(rownames(tbl), 1, 1) == "_") | rownames(tbl) %in% oldSpecialNames + tbl <- tbl[!specialRows, , drop = FALSE] + + dds <- DESeqDataSetFromMatrix(countData = tbl, + colData = subset(sampleTable, select=-(filename)), + design = designFormula) + } else if (!useTXI & !hasHeader) { + + # construct the object from HTSeq files + dds <- DESeqDataSetFromHTSeqCount(sampleTable = sampleTable, + directory = dir, + design = designFormula) + colnames(dds) <- row.names(sampleTable) + + } else { + # construct the object using tximport + # first need to make the tx2gene table + # this takes ~2-3 minutes using Bioconductor functions + if (!is.null(gtfFile)) { + suppressPackageStartupMessages({ + library("GenomicFeatures") + }) + txdb <- makeTxDbFromGFF(gtfFile, format="gtf") + k <- keys(txdb, keytype = "GENEID") + df <- select(txdb, keys = k, keytype = "GENEID", columns = "TXNAME") + tx2gene <- df[, 2:1] # tx ID, then gene ID + } + library("tximport") + txiFiles <- as.character(sampleTable$filename) + labs <- row.names(sampleTable) + names(txiFiles) <- labs + txi <- tximport(txiFiles, type=txtype, tx2gene=tx2gene) + dds <- DESeqDataSetFromTximport(txi, + subset(sampleTable, select=-c(filename)), + designFormula) + } + return(dds) +}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/batch_factors.tab Wed Sep 05 15:54:03 2018 -0400 @@ -0,0 +1,8 @@ +identifier condition W_1 +GSM461179_treat_single.counts Treatment -0.607164236699804 +GSM461180_treat_paired.counts Treatment 0.261662524312529 +GSM461181_treat_paired.counts Treatment 0.341268033254441 +GSM461176_untreat_single.counts Control -0.297792561552867 +GSM461177_untreat_paired.counts Control 0.322555604859173 +GSM461178_untreat_paired.counts Control 0.345745982909988 +GSM461182_untreat_single.counts Control -0.366275347083457
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/out_deseq2_sailfish.tab Wed Sep 05 15:54:03 2018 -0400 @@ -0,0 +1,4 @@ +MIR6859-2 1.1858265336986 -1.58325519934585 1.29566733255222 -1.22196119294536 0.221722302676964 0.886889210707857 +DDX11L1 1.91972630671259 -0.204757212641591 1.31524142129485 -0.155680325548148 0.876285006005361 0.999999999763967 +MIR6859-1 0.294068009390125 -0.712896459907293 1.28824575367762 -0.55338545294805 0.579999497913718 0.999999999763967 +WASH7P 114.545909292233 -3.75034693001422e-10 1.2677635335979 -2.95823852841924e-10 0.999999999763967 0.999999999763967