# HG changeset patch # User amawla # Date 1421201546 18000 # Node ID 91ca33096034d791533026473b337f43b5a9967a Uploaded diff -r 000000000000 -r 91ca33096034 edgeR.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/edgeR.pl Tue Jan 13 21:12:26 2015 -0500 @@ -0,0 +1,389 @@ +#!/bin/perl + +#EdgeR.pl Version 0.0.3 +#Contributors: Monica Britton, Blythe Durbin-Johnson, Joseph Fass, Nikhil Joshi, Alex Mawla + +use strict; +use warnings; +use Getopt::Std; +use File::Basename; +use File::Path qw(make_path remove_tree); + +$| = 1; + +my %OPTIONS = (a => "glm", d => "tag", f => "BH", r => 5, u => "movingave"); + +getopts('a:d:e:f:h:lmn:o:r:tu:', \%OPTIONS); + + +die qq( +Usage: edgeR.pl [OPTIONS] factor::factor1::levels [factor::factor2::levels ...] cp::cont_pred1::values [cp::cont_pred2::values ...] cnt::contrast1 [cnt::contrast2] matrix + +OPTIONS: -a STR Type Of Analysis [glm, pw, limma] (default: $OPTIONS{a}) + -d STR The dispersion estimate to use for GLM analysis [tag] (default: $OPTIONS{d}) + -e STR Path to place additional output files + -f STR False discovery rate adjustment method [BH] (default: $OPTIONS{f}) + -h STR Name of html file for additional files + -l Output the normalised digital gene expression matrix in log2 format (only applicable when using limma and -n is also specified) + -m Perform all pairwise comparisons + -n STR File name to output the normalised digital gene expression matrix (only applicable when usinf glm or limma model) + -o STR File name to output csv file with results + -r INT Common Dispersion Rowsum Filter, ony applicable when 1 factor analysis selected (default: $OPTIONS{r}) + -t Estimate Tagwise Disp when performing 1 factor analysis + -u STR Method for allowing the prior distribution for the dispersion to be abundance- dependent ["movingave"] (default: $OPTIONS{u}) + +) if(!@ARGV); + +my $matrix = pop @ARGV; + +make_path($OPTIONS{e}); +open(Rcmd,">$OPTIONS{e}/r_script.R") or die "Cannot open $OPTIONS{e}/r_script.R\n\n"; +print Rcmd " +zz <- file(\"$OPTIONS{e}/r_script.err\", open=\"wt\") +sink(zz) +sink(zz, type=\"message\") + +library(edgeR) +library(limma) + +toc <- read.table(\"$matrix\", sep=\"\\t\", comment=\"\", as.is=T) +groups <- sapply(toc[1, -1], strsplit, \":\") +for(i in 1:length(groups)) { g <- make.names(groups[[i]][2]); names(groups)[i] <- g; groups[[i]] <- groups[[i]][-2] } +colnames(toc) <- make.names(toc[2,]) +toc[,1] <- gsub(\",\", \".\", toc[,1]) +tagnames <- toc[-(1:2), 1] +rownames(toc) <- toc[,1] +toc <- toc[-(1:2), -1] +for(i in colnames(toc)) toc[, i] <- as.numeric(toc[,i]) +norm_factors <- calcNormFactors(as.matrix(toc)) + +pw_tests <- list() +uniq_groups <- unique(names(groups)) +for(i in 1:(length(uniq_groups)-1)) for(j in (i+1):length(uniq_groups)) pw_tests[[length(pw_tests)+1]] <- c(uniq_groups[i], uniq_groups[j]) +DGE <- DGEList(toc, lib.size=norm_factors*colSums(toc), group=names(groups)) +pdf(\"$OPTIONS{e}/MA_plots_normalisation.pdf\", width=14) +for(i in 1:length(pw_tests)) { + j <- c(which(names(groups) == pw_tests[[i]][1])[1], which(names(groups) == pw_tests[[i]][2])[1]) + par(mfrow = c(1, 2)) + maPlot(toc[, j[1]], toc[, j[2]], normalize = TRUE, pch = 19, cex = 0.2, ylim = c(-10, 10), main=paste(\"MA Plot\", colnames(toc)[j[1]], \"vs\", colnames(toc)[j[2]])) + grid(col = \"blue\") + abline(h = log2(norm_factors[j[2]]), col = \"red\", lwd = 4) + maPlot(DGE\$counts[, j[1]]/DGE\$samples\$lib.size[j[1]], DGE\$counts[, j[2]]/DGE\$samples\$lib.size[j[2]], normalize = FALSE, pch = 19, cex = 0.2, ylim = c(-8, 8), main=paste(\"MA Plot\", colnames(toc)[j[1]], \"vs\", colnames(toc)[j[2]], \"Normalised\")) + grid(col = \"blue\") +} +dev.off() +pdf(file=\"$OPTIONS{e}/MDSplot.pdf\") +plotMDS(DGE, main=\"MDS Plot\", col=as.numeric(factor(names(groups)))+1, xlim=c(-3,3)) +dev.off() +tested <- list() +"; + +my $all_cont; +my @add_cont; +my @fact; +my @fact_names; +my @cp; +my @cp_names; +if(@ARGV) { + foreach my $input (@ARGV) { + my @tmp = split "::", $input; + if($tmp[0] eq "factor") { + $tmp[1] =~ s/[ \?\(\)\[\]\/\\=+<>:;\"\',\*\^\|\&-]/./g; + push @fact_names, $tmp[1]; + $tmp[2] =~ s/:/\", \"/g; + $tmp[2] = "\"".$tmp[2]."\""; + push @fact, $tmp[2]; + } elsif($tmp[0] eq "cp") { + $tmp[1] =~ s/[ \?\(\)\[\]\/\\=+<>:;\"\',\*\^\|\&-]/./g; + push @cp_names, $tmp[1]; + $tmp[2] =~ s/:/, /g; + push @cp, $tmp[2]; + } elsif($tmp[0] eq "cnt") { + push @add_cont, $tmp[1]; + } else { + die("Unknown Input: $input\n"); + } + } +} + +if($OPTIONS{a} eq "pw") { + print Rcmd " +disp <- estimateCommonDisp(DGE, rowsum.filter=$OPTIONS{r}) +"; + if(defined $OPTIONS{t}) { + print Rcmd " +disp <- estimateTrendedDisp (disp) +disp <- estimateTagwiseDisp(disp, trend=\"$OPTIONS{u}\") +pdf(file=\"$OPTIONS{e}/Tagwise_Dispersion_vs_Abundance.pdf\") +plotBCV(disp, cex=0.4) +abline(h=disp\$common.dispersion, col=\"firebrick\", lwd=3) +dev.off() +"; + } + print Rcmd " +for(i in 1:length(pw_tests)) { + tested[[i]] <- exactTest(disp, pair=pw_tests[[i]]) + names(tested)[i] <- paste(pw_tests[[i]][2], \"-\", pw_tests[[i]][1], sep=\"\") +} +pdf(file=\"$OPTIONS{e}/Smear_Plots.pdf\") +for(i in 1:length(pw_tests)) { + dt <- decideTestsDGE(tested[[i]], p.value=0.05, adjust.method=\"$OPTIONS{f}\") + if(sum(dt) > 0) { + de_tags <- rownames(disp)[which(dt != 0)] + ttl <- \"Diff. Exp. Genes With adj. Pvalue < 0.05\" + } else { + de_tags <- rownames(topTags(tested[[i]], n=100)\$table) + ttl <- \"Top 100 tags\" + } + + if(length(dt) < 5000) { + pointcex = 0.5 + } else { + pointcex = 0.2 + } + plotSmear(disp, pair=pw_tests[[i]], de.tags = de_tags, main = paste(\"Smear Plot\", names(tested)[i]), cex=0.5) + abline(h = c(-1, 1), col = \"blue\") + legend(\"topright\", c(\"2 Fold Change\", ttl) , lty=c(1, NA), pch=c(NA, 19), pt.cex=0.5, col=c(\"blue\", \"red\"), bty=\"n\") +} +dev.off() +"; +} +elsif($OPTIONS{a} eq "glm") { + for(my $fct = 0; $fct <= $#fact_names; $fct++) { + print Rcmd " + $fact_names[$fct] <- c($fact[$fct]) + "; + } + for(my $fct = 0; $fct <= $#cp_names; $fct++) { + print Rcmd " + $cp_names[$fct] <- c($cp[$fct]) + "; + } + my $all_fact = ""; + if(@fact_names) { + foreach (@fact_names) { + $all_fact .= " + factor($_)"; + } + } + my $all_cp = ""; + if(@cp_names) { + $all_cp = " + ".join(" + ", @cp_names); + } + print Rcmd " + group_fact <- factor(names(groups)) + design <- model.matrix(~ -1 + group_fact${all_fact}${all_cp}) + colnames(design) <- sub(\"group_fact\", \"\", colnames(design)) + "; + foreach my $fct (@fact_names) { + print Rcmd " + colnames(design) <- make.names(sub(\"factor.$fct.\", \"\", colnames(design))) + "; + } + if($OPTIONS{d} eq "tag") { + print Rcmd " + disp <- estimateGLMCommonDisp(DGE, design) + disp <- estimateGLMTrendedDisp(disp, design) + disp <- estimateGLMTagwiseDisp(disp, design) + fit <- glmFit(disp, design) + pdf(file=\"$OPTIONS{e}/Tagwise_Dispersion_vs_Abundance.pdf\") + plotBCV(disp, cex=0.4) + dev.off() + "; + } + if(@add_cont) { + $all_cont = "\"".join("\", \"", @add_cont)."\""; + print Rcmd " + cont <- c(${all_cont}) + for(i in uniq_groups) cont <- gsub(paste(groups[[i]], \"([^0-9])\", sep=\"\"), paste(i, \"\\\\1\", sep=\"\"), cont) + for(i in uniq_groups) cont <- gsub(paste(groups[[i]], \"\$\", sep=\"\"), i, cont) +"; + } else { + print Rcmd " +cont <- NULL +"; + } + if(defined $OPTIONS{m}) { + print Rcmd " +for(i in 1:length(pw_tests)) cont <- c(cont, paste(pw_tests[[i]][2], \"-\", pw_tests[[i]][1], sep=\"\")) +"; + } + if(!defined $OPTIONS{m} && !@add_cont){ + die("No Contrasts have been specified, you must at least either select multiple pairwise comparisons or specify a custom contrast\n"); + } + print Rcmd " +fit <- glmFit(disp, design) +cont <- makeContrasts(contrasts=cont, levels=design) +for(i in colnames(cont)) tested[[i]] <- glmLRT(fit, contrast=cont[,i]) +pdf(file=\"$OPTIONS{e}/Smear_Plots.pdf\") +for(i in colnames(cont)) { + dt <- decideTestsDGE(tested[[i]], p.value=0.05, adjust.method=\"$OPTIONS{f}\") + if(sum(dt) > 0) { + de_tags <- rownames(disp)[which(dt != 0)] + ttl <- \"Diff. Exp. Genes With adj. Pvalue < 0.05\" + } else { + de_tags <- rownames(topTags(tested[[i]], n=100)\$table) + ttl <- \"Top 100 tags\" + } + + if(length(dt) < 5000) { + pointcex = 0.5 + } else { + pointcex = 0.2 + } + plotSmear(disp, de.tags = de_tags, main = paste(\"Smear Plot\", i), cex=pointcex) + abline(h = c(-1, 1), col = \"blue\") + legend(\"topright\", c(\"2 Fold Change\", ttl) , lty=c(1, NA), pch=c(NA, 19), pt.cex=0.5, col=c(\"blue\", \"red\"), bty=\"n\") +} +dev.off() + + "; + if(defined $OPTIONS{n}) { + print Rcmd " + tab <- data.frame(ID=rownames(fit\$fitted.values), fit\$fitted.values, stringsAsFactors=F) + write.table(tab, \"$OPTIONS{n}\", quote=F, sep=\"\\t\", row.names=F) + "; + } +} elsif($OPTIONS{a} eq "limma") { + for(my $fct = 0; $fct <= $#fact_names; $fct++) { + print Rcmd " +$fact_names[$fct] <- c($fact[$fct]) +"; + } + for(my $fct = 0; $fct <= $#cp_names; $fct++) { + print Rcmd " +$cp_names[$fct] <- c($cp[$fct]) +"; + } + my $all_fact = ""; + if(@fact_names) { + foreach (@fact_names) { + $all_fact .= " + factor($_)"; + } + } + my $all_cp = ""; + if(@cp_names) { + $all_cp = " + ".join(" + ", @cp_names); + } + print Rcmd " + +group_fact <- factor(names(groups)) +design <- model.matrix(~ -1 + group_fact${all_fact}${all_cp}) +colnames(design) <- sub(\"group_fact\", \"\", colnames(design)) +"; + foreach my $fct (@fact_names) { + print Rcmd " +colnames(design) <- make.names(sub(\"factor.$fct.\", \"\", colnames(design))) +"; + } + print Rcmd " +isexpr <- rowSums(cpm(toc)>1) >= 1 +toc <- toc[isexpr, ] +pdf(file=\"$OPTIONS{e}/LIMMA_voom.pdf\") +y <- voom(toc, design, plot=TRUE, lib.size=colSums(toc)*norm_factors) +dev.off() + +pdf(file=\"$OPTIONS{e}/LIMMA_MDS_plot.pdf\") +plotMDS(y, labels=colnames(toc), col=as.numeric(factor(names(groups)))+1, gene.selection=\"common\") +dev.off() +fit <- lmFit(y, design) +"; + if(defined $OPTIONS{n}) { + if(defined $OPTIONS{l}) { + print Rcmd " +tab <- data.frame(ID=rownames(y\$E), y\$E, stringsAsFactors=F) +"; + } else { + print Rcmd " +tab <- data.frame(ID=rownames(y\$E), 2^y\$E, stringsAsFactors=F) +"; + } + print Rcmd " +write.table(tab, \"$OPTIONS{n}\", quote=F, sep=\"\\t\", row.names=F) +"; + } + if(@add_cont) { + $all_cont = "\"".join("\", \"", @add_cont)."\""; + print Rcmd " +cont <- c(${all_cont}) +for(i in uniq_groups) cont <- gsub(paste(groups[[i]], \"([^0-9])\", sep=\"\"), paste(i, \"\\\\1\", sep=\"\"), cont) +for(i in uniq_groups) cont <- gsub(paste(groups[[i]], \"\$\", sep=\"\"), i, cont) +"; + } else { + print Rcmd " +cont <- NULL +"; + } + if(defined $OPTIONS{m}) { + print Rcmd " +for(i in 1:length(pw_tests)) cont <- c(cont, paste(pw_tests[[i]][2], \"-\", pw_tests[[i]][1], sep=\"\")) +"; + } + if(!defined $OPTIONS{m} && !@add_cont){ + die("No Contrasts have been specified, you must at least either select multiple pairwise comparisons or specify a custom contrast\n"); + } + print Rcmd " +cont <- makeContrasts(contrasts=cont, levels=design) +fit2 <- contrasts.fit(fit, cont) +fit2 <- eBayes(fit2) +"; +} else { + die("Anaysis type $OPTIONS{a} not found\n"); + +} +if($OPTIONS{a} ne "limma") { + print Rcmd " +options(digits = 6) +tab <- NULL +for(i in names(tested)) { + tab_tmp <- topTags(tested[[i]], n=Inf, adjust.method=\"$OPTIONS{f}\")[[1]] + colnames(tab_tmp) <- paste(i, colnames(tab_tmp), sep=\":\") + tab_tmp <- tab_tmp[tagnames,] + if(is.null(tab)) { + tab <- tab_tmp + } else tab <- cbind(tab, tab_tmp) +} +tab <- cbind(Feature=rownames(tab), tab) +"; +} else { + print Rcmd " +tab <- NULL +options(digits = 6) +for(i in colnames(fit2)) { + tab_tmp <- topTable(fit2, coef=i, n=Inf, sort.by=\"none\", adjust.method=\"$OPTIONS{f}\") + colnames(tab_tmp)[-1] <- paste(i, colnames(tab_tmp)[-1], sep=\":\") + if(is.null(tab)) { + tab <- tab_tmp + } else tab <- cbind(tab, tab_tmp[,-1]) +} +"; +} +print Rcmd " +write.table(tab, \"$OPTIONS{o}\", quote=F, sep=\"\\t\", row.names=F) +sink(type=\"message\") +sink() +"; +close(Rcmd); +system("R --no-restore --no-save --no-readline < $OPTIONS{e}/r_script.R > $OPTIONS{e}/r_script.out"); + +open(HTML, ">$OPTIONS{h}"); +print HTML "EdgeR: Empirical analysis of digital gene expression data

EdgeR Additional Files:

\n"; +close(HTML); + diff -r 000000000000 -r 91ca33096034 edgeR.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/edgeR.xml Tue Jan 13 21:12:26 2015 -0500 @@ -0,0 +1,178 @@ + + - Estimates differential gene expression for short read sequence count using methods appropriate for count data + + edgeR + limma + + + edgeR.pl -a $analysis_type.analysis -e $html_file.files_path -f BH -h $html_file -o $output + + + #if $analysis_type.analysis == "pw": + -r $analysis_type.rowsumfilter + #if $analysis_type.tagwise_disp.twd == "TRUE": + -u movingave + -t + #end if + + #else if $analysis_type.analysis == "glm": + #if $analysis_type.exp.export_norm == "true": + -n $norm_exp + #end if + -d tag + $analysis_type.cont_pw + #for $cnt in $analysis_type.contrasts: + "cnt::${cnt.add_cont}" + #end for + + + #else + #if $analysis_type.exp.export_norm == "true": + -n $norm_exp $analysis_type.exp.log + #end if + $analysis_type.cont_pw + #for $cnt in $analysis_type.contrasts: + "cnt::${cnt.add_cont}" + #end for + #end if + $matrix + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + analysis_type[ "analysis" ] != "pw" and analysis_type[ "exp" ][ "export_norm" ] == "true" + + + + + +.. class:: infomark + +**What it does** + +Estimates differential gene expression for short read sequence count using methods appropriate for count data. +If you have paired data you may also want to consider Tophat/Cufflinks. +Input must be raw count data for each sequence arranged in a rectangular matrix as a tabular file. +Note - no scaling - please make sure you have untransformed raw counts of reads for each sequence. + +Performs digital differential gene expression analysis between groups (eg a treatment and control). +Biological replicates provide information about experimental variability required for reliable inference. + +**What it does not do** +edgeR_ requires biological replicates. +Without replicates you can't account for known important experimental sources of variability that the approach implemented here requires. + + +**Input** +A count matrix containing sequence names as rows and sample specific counts of reads from this sequence as columns. +The matrix must have 2 header rows, the first indicating the group assignment and the second uniquely identifiying the samples. It must also contain a unique set of (eg Feature) names in the first column. + +Example:: + + # G1:Mut G1:Mut G1:Mut G2:WT G2:WT G2:WT + #Feature Spl1 Spl2 Spl3 Spl4 Spl5 Spl6 + NM_001001130 97 43 61 34 73 26 + NM_001001144 25 8 9 3 5 5 + NM_001001152 72 45 29 20 31 13 + NM_001001160 0 1 1 1 0 0 + NM_001001177 0 1 0 4 3 3 + NM_001001178 0 2 1 0 4 0 + NM_001001179 0 0 0 0 0 2 + NM_001001180 0 0 0 0 0 2 + NM_001001181 415 319 462 185 391 155 + NM_001001182 1293 945 987 297 938 496 + NM_001001183 5 4 11 7 11 2 + NM_001001184 135 198 178 110 205 64 + NM_001001185 186 1 0 1 1 0 + NM_001001186 75 90 91 34 63 54 + NM_001001187 267 236 170 165 202 51 + NM_001001295 5 2 6 1 7 0 + NM_001001309 1 0 0 1 2 1 + ... + + +Please use the "Count reads in features with htseq-count" tool to generate the count matrix. + +**Output** + +A tabular file containing relative expression levels, statistical estimates of differential expression probability, R scripts, log, and some helpful diagnostic plots. + +**Fixed Parameters** + +Method for allowing the prior distribution for the dispersion to be abundance-dependent used: movingave + +False discovery rate adjustment method used: Benjamini and Hochberg (1995) + +GLM dispersion estimate used: Tagwise Dispersion + +Gene filter used: less than 1 count per million reads + +.. class:: infomark + +**Attribution** +This tool wraps the edgeR_ Bioconductor package so all calculations and plots are controlled by that code. See edgeR_ for all documentation and appropriate attribution. +Recommended reference is Mark D. Robinson, Davis J. McCarthy, Gordon K. Smyth, PMCID: PMC2796818 + +.. class:: infomark + +**Attribution** +When applying the LIMMA (Linear models for RNA-Seq) anlysis the tool also makes use of the limma_ Bioconductor package. +Recommended reference is Smyth, G. K. (2005). Limma: linear models for microarray data. In: 'Bioinformatics and Computational Biology Solutions using R and Bioconductor'. R. Gentleman, V. Carey, S. Dudoit, R. Irizarry, W. Huber (eds), Springer, New York, pages 397--420. + + .. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html + .. _limma: http://www.bioconductor.org/packages/release/bioc/html/limma.html + + + + +