changeset 0:91ca33096034 draft

Uploaded
author amawla
date Tue, 13 Jan 2015 21:12:26 -0500
parents
children aab4a565c0e8
files edgeR.pl edgeR.xml
diffstat 2 files changed, 567 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/edgeR.pl	Tue Jan 13 21:12:26 2015 -0500
@@ -0,0 +1,389 @@
+#!/bin/perl
+
+#EdgeR.pl Version 0.0.3
+#Contributors: Monica Britton, Blythe Durbin-Johnson, Joseph Fass, Nikhil Joshi, Alex Mawla
+
+use strict;
+use warnings;
+use Getopt::Std;
+use File::Basename;
+use File::Path qw(make_path remove_tree);
+
+$| = 1;
+
+my %OPTIONS = (a => "glm",  d => "tag", f => "BH", r => 5, u => "movingave");
+
+getopts('a:d:e:f:h:lmn:o:r:tu:', \%OPTIONS);
+
+ 
+die qq(
+Usage:   edgeR.pl [OPTIONS] factor::factor1::levels [factor::factor2::levels ...] cp::cont_pred1::values [cp::cont_pred2::values ...] cnt::contrast1 [cnt::contrast2] matrix
+
+OPTIONS:	-a	STR	Type Of Analysis [glm, pw, limma] (default: $OPTIONS{a})
+			-d	STR	The dispersion estimate to use for GLM analysis [tag] (default: $OPTIONS{d})
+			-e	STR	Path to place additional output files
+			-f	STR	False discovery rate adjustment method [BH] (default: $OPTIONS{f})
+			-h	STR	Name of html file for additional files
+			-l		Output the normalised digital gene expression matrix in log2 format (only applicable when using limma and -n is also specified)
+			-m		Perform all pairwise comparisons
+			-n	STR	File name to output the normalised digital gene expression matrix (only applicable when usinf glm or limma model)
+			-o	STR	File name to output csv file with results
+			-r	INT	Common Dispersion Rowsum Filter, ony applicable when 1 factor analysis selected (default: $OPTIONS{r})
+			-t		Estimate Tagwise Disp when performing 1 factor analysis
+			-u	STR	Method for allowing the prior distribution for the dispersion to be abundance- dependent ["movingave"] (default: $OPTIONS{u})
+ 
+) if(!@ARGV);
+
+my $matrix = pop @ARGV;
+
+make_path($OPTIONS{e});
+open(Rcmd,">$OPTIONS{e}/r_script.R") or die "Cannot open $OPTIONS{e}/r_script.R\n\n";
+print Rcmd "
+zz <- file(\"$OPTIONS{e}/r_script.err\", open=\"wt\")
+sink(zz)
+sink(zz, type=\"message\")
+
+library(edgeR)
+library(limma)
+
+toc <- read.table(\"$matrix\", sep=\"\\t\", comment=\"\", as.is=T)
+groups <- sapply(toc[1, -1], strsplit, \":\")
+for(i in 1:length(groups)) { g <- make.names(groups[[i]][2]); names(groups)[i] <- g; groups[[i]] <- groups[[i]][-2] }
+colnames(toc) <- make.names(toc[2,])
+toc[,1] <- gsub(\",\", \".\", toc[,1])
+tagnames <- toc[-(1:2), 1]
+rownames(toc) <- toc[,1]
+toc <- toc[-(1:2), -1]
+for(i in colnames(toc)) toc[, i] <- as.numeric(toc[,i])
+norm_factors <- calcNormFactors(as.matrix(toc))
+
+pw_tests <- list()
+uniq_groups <- unique(names(groups))
+for(i in 1:(length(uniq_groups)-1)) for(j in (i+1):length(uniq_groups)) pw_tests[[length(pw_tests)+1]] <- c(uniq_groups[i], uniq_groups[j])
+DGE <- DGEList(toc, lib.size=norm_factors*colSums(toc), group=names(groups))
+pdf(\"$OPTIONS{e}/MA_plots_normalisation.pdf\", width=14)
+for(i in 1:length(pw_tests)) {
+	j <- c(which(names(groups) == pw_tests[[i]][1])[1], which(names(groups) == pw_tests[[i]][2])[1])
+	par(mfrow = c(1, 2))
+	maPlot(toc[, j[1]], toc[, j[2]], normalize = TRUE, pch = 19, cex = 0.2, ylim = c(-10, 10), main=paste(\"MA Plot\", colnames(toc)[j[1]], \"vs\", colnames(toc)[j[2]]))
+	grid(col = \"blue\")
+	abline(h = log2(norm_factors[j[2]]), col = \"red\", lwd = 4)
+	maPlot(DGE\$counts[, j[1]]/DGE\$samples\$lib.size[j[1]], DGE\$counts[, j[2]]/DGE\$samples\$lib.size[j[2]], normalize = FALSE, pch = 19, cex = 0.2, ylim = c(-8, 8), main=paste(\"MA Plot\", colnames(toc)[j[1]], \"vs\", colnames(toc)[j[2]], \"Normalised\"))
+	grid(col = \"blue\")
+}
+dev.off()
+pdf(file=\"$OPTIONS{e}/MDSplot.pdf\")
+plotMDS(DGE, main=\"MDS Plot\", col=as.numeric(factor(names(groups)))+1, xlim=c(-3,3))
+dev.off()
+tested <- list()
+";
+
+my $all_cont;
+my @add_cont;
+my @fact;
+my @fact_names;
+my @cp;
+my @cp_names;
+if(@ARGV) {
+	foreach my $input (@ARGV) {
+		my @tmp = split "::", $input;
+		if($tmp[0] eq "factor") {
+			$tmp[1] =~ s/[ \?\(\)\[\]\/\\=+<>:;\"\',\*\^\|\&-]/./g;
+			push @fact_names, $tmp[1];
+			$tmp[2] =~ s/:/\", \"/g;
+			$tmp[2] = "\"".$tmp[2]."\"";
+			push @fact, $tmp[2];
+		} elsif($tmp[0] eq "cp") {
+			$tmp[1] =~ s/[ \?\(\)\[\]\/\\=+<>:;\"\',\*\^\|\&-]/./g;
+			push @cp_names, $tmp[1];
+			$tmp[2] =~ s/:/, /g;
+			push @cp, $tmp[2];
+		} elsif($tmp[0] eq "cnt") {
+			push @add_cont, $tmp[1];
+		} else {
+			die("Unknown Input: $input\n");
+		}
+	}
+}
+
+if($OPTIONS{a} eq "pw") {
+	print Rcmd "
+disp <- estimateCommonDisp(DGE, rowsum.filter=$OPTIONS{r})
+";
+	if(defined $OPTIONS{t}) {
+		print Rcmd "
+disp <- estimateTrendedDisp (disp)
+disp <- estimateTagwiseDisp(disp, trend=\"$OPTIONS{u}\")
+pdf(file=\"$OPTIONS{e}/Tagwise_Dispersion_vs_Abundance.pdf\")
+plotBCV(disp, cex=0.4)
+abline(h=disp\$common.dispersion, col=\"firebrick\", lwd=3)
+dev.off()
+";
+	}
+	print Rcmd "
+for(i in 1:length(pw_tests)) {
+	tested[[i]] <- exactTest(disp, pair=pw_tests[[i]])
+	names(tested)[i] <- paste(pw_tests[[i]][2], \"-\", pw_tests[[i]][1], sep=\"\")
+}
+pdf(file=\"$OPTIONS{e}/Smear_Plots.pdf\")
+for(i in 1:length(pw_tests)) {
+	dt <- decideTestsDGE(tested[[i]], p.value=0.05, adjust.method=\"$OPTIONS{f}\")
+	if(sum(dt) > 0) {
+		de_tags <- rownames(disp)[which(dt != 0)]
+		ttl <- \"Diff. Exp. Genes With adj. Pvalue < 0.05\"
+	} else {
+		de_tags <- rownames(topTags(tested[[i]], n=100)\$table)
+		ttl <- \"Top 100 tags\"
+	}
+
+	if(length(dt) < 5000) {
+		pointcex = 0.5
+	} else {
+		pointcex = 0.2
+	}
+	plotSmear(disp, pair=pw_tests[[i]], de.tags = de_tags, main = paste(\"Smear Plot\", names(tested)[i]), cex=0.5)
+	abline(h = c(-1, 1), col = \"blue\")
+	legend(\"topright\", c(\"2 Fold Change\", ttl) , lty=c(1, NA), pch=c(NA, 19), pt.cex=0.5, col=c(\"blue\", \"red\"), bty=\"n\")
+}
+dev.off()
+"; 
+} 
+elsif($OPTIONS{a} eq "glm") {
+	for(my $fct = 0; $fct <= $#fact_names; $fct++) {
+		print Rcmd "
+			$fact_names[$fct] <- c($fact[$fct])
+		";
+	}
+	for(my $fct = 0; $fct <= $#cp_names; $fct++) {
+		print Rcmd "
+			$cp_names[$fct] <- c($cp[$fct])
+		";
+	}
+	my $all_fact = "";
+	if(@fact_names) {
+		foreach (@fact_names) {
+			$all_fact .= " + factor($_)";
+		}
+    	}
+	my $all_cp = "";
+	if(@cp_names) {
+		$all_cp = " + ".join(" + ", @cp_names);
+	}
+	print Rcmd "
+		group_fact <- factor(names(groups))
+		design <- model.matrix(~ -1 + group_fact${all_fact}${all_cp})
+		colnames(design) <- sub(\"group_fact\", \"\", colnames(design))
+	";
+	foreach my $fct (@fact_names) {
+		print Rcmd "
+			colnames(design) <- make.names(sub(\"factor.$fct.\", \"\", colnames(design)))
+		";
+	}
+	if($OPTIONS{d} eq "tag") {
+		print Rcmd "
+			disp <- estimateGLMCommonDisp(DGE, design)
+			disp <- estimateGLMTrendedDisp(disp, design)
+			disp <- estimateGLMTagwiseDisp(disp, design)
+			fit <- glmFit(disp, design)
+			pdf(file=\"$OPTIONS{e}/Tagwise_Dispersion_vs_Abundance.pdf\")
+			plotBCV(disp, cex=0.4)
+			dev.off()
+		";
+	}
+	if(@add_cont) {
+		$all_cont = "\"".join("\", \"", @add_cont)."\"";
+		print Rcmd "
+			cont <- c(${all_cont})
+			for(i in uniq_groups)  cont <- gsub(paste(groups[[i]], \"([^0-9])\", sep=\"\"), paste(i, \"\\\\1\", sep=\"\"), cont)
+			for(i in uniq_groups)  cont <- gsub(paste(groups[[i]], \"\$\", sep=\"\"), i, cont)
+";
+	} else {
+		print Rcmd "
+cont <- NULL
+";
+	}
+	if(defined $OPTIONS{m}) {
+		print Rcmd "
+for(i in 1:length(pw_tests)) cont <- c(cont, paste(pw_tests[[i]][2], \"-\", pw_tests[[i]][1], sep=\"\"))
+";
+	}
+	if(!defined $OPTIONS{m} && !@add_cont){
+		die("No Contrasts have been specified, you must at least either select multiple pairwise comparisons or specify a custom contrast\n");
+	}
+	print Rcmd "
+fit <- glmFit(disp, design)
+cont <- makeContrasts(contrasts=cont, levels=design)
+for(i in colnames(cont)) tested[[i]] <- glmLRT(fit, contrast=cont[,i])
+pdf(file=\"$OPTIONS{e}/Smear_Plots.pdf\")
+for(i in colnames(cont)) {
+        dt <- decideTestsDGE(tested[[i]], p.value=0.05, adjust.method=\"$OPTIONS{f}\")
+        if(sum(dt) > 0) {
+                de_tags <- rownames(disp)[which(dt != 0)]
+                ttl <- \"Diff. Exp. Genes With adj. Pvalue < 0.05\"
+        } else {
+                de_tags <- rownames(topTags(tested[[i]], n=100)\$table)
+                ttl <- \"Top 100 tags\"
+        }
+
+        if(length(dt) < 5000) {
+                pointcex = 0.5
+        } else {
+                pointcex = 0.2
+        }
+        plotSmear(disp, de.tags = de_tags, main = paste(\"Smear Plot\", i), cex=pointcex)
+        abline(h = c(-1, 1), col = \"blue\")
+        legend(\"topright\", c(\"2 Fold Change\", ttl) , lty=c(1, NA), pch=c(NA, 19), pt.cex=0.5, col=c(\"blue\", \"red\"), bty=\"n\")
+}
+dev.off()
+
+	";
+	if(defined $OPTIONS{n}) {
+		print Rcmd "
+			tab <- data.frame(ID=rownames(fit\$fitted.values), fit\$fitted.values, stringsAsFactors=F)
+			write.table(tab, \"$OPTIONS{n}\", quote=F, sep=\"\\t\", row.names=F)
+		";
+	}
+} elsif($OPTIONS{a} eq "limma") {
+	for(my $fct = 0; $fct <= $#fact_names; $fct++) {
+		print Rcmd "
+$fact_names[$fct] <- c($fact[$fct])
+";
+	}
+	for(my $fct = 0; $fct <= $#cp_names; $fct++) {
+		print Rcmd "
+$cp_names[$fct] <- c($cp[$fct])
+";
+	}
+	my $all_fact = "";
+	if(@fact_names) {
+		foreach (@fact_names) {
+			$all_fact .= " + factor($_)";
+		}
+	}
+	my $all_cp = "";
+	if(@cp_names) {
+		$all_cp = " + ".join(" + ", @cp_names);
+	}
+	print Rcmd "
+
+group_fact <- factor(names(groups))
+design <- model.matrix(~ -1 + group_fact${all_fact}${all_cp})
+colnames(design) <- sub(\"group_fact\", \"\", colnames(design))
+";
+	foreach my $fct (@fact_names) {
+		print Rcmd "
+colnames(design) <- make.names(sub(\"factor.$fct.\", \"\", colnames(design)))
+";
+	}
+	print Rcmd "
+isexpr <- rowSums(cpm(toc)>1) >= 1
+toc <- toc[isexpr, ]
+pdf(file=\"$OPTIONS{e}/LIMMA_voom.pdf\")
+y <- voom(toc, design, plot=TRUE, lib.size=colSums(toc)*norm_factors)
+dev.off()
+
+pdf(file=\"$OPTIONS{e}/LIMMA_MDS_plot.pdf\")
+plotMDS(y, labels=colnames(toc), col=as.numeric(factor(names(groups)))+1, gene.selection=\"common\")
+dev.off()
+fit <- lmFit(y, design)
+";
+	if(defined $OPTIONS{n}) {
+		if(defined $OPTIONS{l}) {
+			print Rcmd "
+tab <- data.frame(ID=rownames(y\$E), y\$E, stringsAsFactors=F)
+";
+		} else {
+			print Rcmd "
+tab <- data.frame(ID=rownames(y\$E), 2^y\$E, stringsAsFactors=F)
+";
+		}
+		print Rcmd "
+write.table(tab, \"$OPTIONS{n}\", quote=F, sep=\"\\t\", row.names=F)
+";
+	}
+	if(@add_cont) {
+		$all_cont = "\"".join("\", \"", @add_cont)."\"";
+		print Rcmd "
+cont <- c(${all_cont})
+for(i in uniq_groups)  cont <- gsub(paste(groups[[i]], \"([^0-9])\", sep=\"\"), paste(i, \"\\\\1\", sep=\"\"), cont)
+for(i in uniq_groups)  cont <- gsub(paste(groups[[i]], \"\$\", sep=\"\"), i, cont)
+";
+	} else {
+		print Rcmd "
+cont <- NULL
+";
+	}
+	if(defined $OPTIONS{m}) {
+		print Rcmd "
+for(i in 1:length(pw_tests)) cont <- c(cont, paste(pw_tests[[i]][2], \"-\", pw_tests[[i]][1], sep=\"\"))
+";
+	}
+	if(!defined $OPTIONS{m} && !@add_cont){
+		die("No Contrasts have been specified, you must at least either select multiple pairwise comparisons or specify a custom contrast\n");
+	}
+	print Rcmd "
+cont <- makeContrasts(contrasts=cont, levels=design)
+fit2 <- contrasts.fit(fit, cont)
+fit2 <- eBayes(fit2)
+";
+} else {
+	die("Anaysis type $OPTIONS{a} not found\n");
+
+}
+if($OPTIONS{a} ne "limma") {
+	print Rcmd "
+options(digits = 6)
+tab <- NULL
+for(i in names(tested)) {
+	tab_tmp <- topTags(tested[[i]], n=Inf, adjust.method=\"$OPTIONS{f}\")[[1]]
+	colnames(tab_tmp) <- paste(i, colnames(tab_tmp), sep=\":\")
+	tab_tmp <- tab_tmp[tagnames,]
+	if(is.null(tab)) {
+		tab <- tab_tmp
+	} else tab <- cbind(tab, tab_tmp)
+}
+tab <- cbind(Feature=rownames(tab), tab)
+";
+} else {
+	print Rcmd "
+tab <- NULL
+options(digits = 6)
+for(i in colnames(fit2)) {
+	tab_tmp <- topTable(fit2, coef=i, n=Inf, sort.by=\"none\", adjust.method=\"$OPTIONS{f}\")
+	colnames(tab_tmp)[-1] <- paste(i, colnames(tab_tmp)[-1], sep=\":\")
+	if(is.null(tab)) {
+		tab <- tab_tmp
+	} else tab <- cbind(tab, tab_tmp[,-1])
+}
+";
+}
+print Rcmd "
+write.table(tab, \"$OPTIONS{o}\", quote=F, sep=\"\\t\", row.names=F)
+sink(type=\"message\")
+sink()
+";
+close(Rcmd);
+system("R --no-restore --no-save --no-readline < $OPTIONS{e}/r_script.R > $OPTIONS{e}/r_script.out");
+
+open(HTML, ">$OPTIONS{h}");
+print HTML "<html><head><title>EdgeR: Empirical analysis of digital gene expression data</title></head><body><h3>EdgeR Additional Files:</h3><p><ul>\n";
+print HTML "<li><a href=MA_plots_normalisation.pdf>MA_plots_normalisation.pdf</a></li>\n";
+print HTML "<li><a href=MDSplot.pdf>MDSplot.pdf</a></li>\n";
+if($OPTIONS{a} eq "pw") {
+	if(defined $OPTIONS{t}) {
+		print HTML "<li><a href=Tagwise_Dispersion_vs_Abundance.pdf>Tagwise_Dispersion_vs_Abundance.pdf</a></li>\n";
+	}
+	print HTML "<li><a href=Smear_Plots.pdf>Smear_Plots.pdf</a></li>\n";
+} elsif($OPTIONS{a} eq "glm" && $OPTIONS{d} eq "tag") {
+	print HTML "<li><a href=Tagwise_Dispersion_vs_Abundance.pdf>Tagwise_Dispersion_vs_Abundance.pdf</a></li>\n";
+	print HTML "<li><a href=Smear_Plots.pdf>Smear_Plots.pdf</a></li>\n";
+} elsif($OPTIONS{a} eq "limma") {
+	print HTML "<li><a href=LIMMA_MDS_plot.pdf>LIMMA_MDS_plot.pdf</a></li>\n";
+	print HTML "<li><a href=LIMMA_voom.pdf>LIMMA_voom.pdf</a></li>\n";
+}
+print HTML "<li><a href=r_script.R>r_script.R</a></li>\n";
+print HTML "<li><a href=r_script.out>r_script.out</a></li>\n";
+print HTML "<li><a href=r_script.err>r_script.err</a></li>\n";
+print HTML "</ul></p>\n";
+close(HTML);
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/edgeR.xml	Tue Jan 13 21:12:26 2015 -0500
@@ -0,0 +1,178 @@
+<tool id="edgeR" name="edgeR" version="0.0.3">
+  <description> - Estimates differential gene expression for short read sequence count using methods appropriate for count data</description>
+  <requirements>
+        <requirement type="R-module">edgeR</requirement>
+        <requirement type="R-module">limma</requirement>
+  </requirements>
+  <command interpreter="perl">
+  	edgeR.pl -a $analysis_type.analysis -e $html_file.files_path -f BH -h $html_file -o $output
+
+  	<!--Pairwise comparisons 1 Factor Analysis-->
+  	#if $analysis_type.analysis == "pw":
+   		-r $analysis_type.rowsumfilter
+		#if $analysis_type.tagwise_disp.twd == "TRUE":
+      			-u movingave
+      			-t 
+     		#end if
+ 	<!--GLM Generalized Linear Models (Multiple Factors)-->
+  	#else if $analysis_type.analysis == "glm":
+		#if $analysis_type.exp.export_norm == "true":
+			-n $norm_exp
+		#end if
+ 		-d tag
+		$analysis_type.cont_pw
+		#for $cnt in $analysis_type.contrasts:
+			"cnt::${cnt.add_cont}"
+		#end for
+
+	<!--LIMMA Linearized Models (Multiple Factors)-->
+	#else
+		#if $analysis_type.exp.export_norm == "true":
+			-n $norm_exp $analysis_type.exp.log
+		#end if
+		$analysis_type.cont_pw
+		#for $cnt in $analysis_type.contrasts:
+			"cnt::${cnt.add_cont}"
+		#end for
+	#end if
+	$matrix
+				
+  </command>
+
+  <inputs>
+  	<param name="matrix" type="data" format="tabular" label="Digital Expression Matrix"/>
+  	<conditional name="analysis_type">
+		<param name="analysis" type="select" label="Type Of Analysis">
+			<option value="pw">Pairwise comparisons (1 Factor Analysis)</option>
+			<option value="glm" selected="true">Generalized Linear Models (Multiple Factor Analysis using GLM)</option>
+			<option value="limma">Linear Models for RNA-Seq (Multiple Factor Analysis using LIMMA)</option>
+		</param>
+		<when value="pw">
+			<param name="rowsumfilter" type="integer" value="5" label="Common Dispersion Rowsum Filter" help="Numeric scalar giving a value for the filtering out of low abundance tags in the estimation of the common dispersion. Only tags with total sum of counts above this value are used in the estimation of the common dispersion. Low abundance tags can adversely affect the estimation of the common dispersion, so this argument allows the user to select an appropriate filter threshold for the tag abundance."/>
+ 		<conditional name="tagwise_disp">
+    				<param name="twd" type="select" label="Maximize the Negative Binomial Weighted Conditional Likelihood" help="Calculate and use an estimate of the dispersion parameter for each tag">
+    					<option value="TRUE" selected="true">True</option>
+    					<option value="FALSE">False</option>
+   				</param>
+    			</conditional>         
+		</when>
+		<when value="glm">
+			<param name="cont_pw" type="boolean" truevalue="-m" falsevalue="" checked="True" label="Perform all pairwise comparisons" help="Include all pairwise comparisons in the contrast matrix."/>
+			<repeat name="contrasts" title="Contrast">
+				<param name="add_cont" title="Contrast" type="text" label="Enter the contrast of interest, e.g. (G1+G2)/2-G3 (no spaces or commas)"/>
+			</repeat>
+			<conditional name="exp">	
+				<param name="export_norm" type="select" label="Save Normalised DGE Matrix">
+					<option value="true">Yes</option>
+					<option value="false">No</option>
+				</param>
+			</conditional>
+		</when>
+		<when value="limma">
+			<param name="cont_pw" type="boolean" truevalue="-m" falsevalue="" checked="True" label="Perform all pairwise comparisons" help="Include all pairwise comparisons in the contrast matrix."/>
+			<repeat name="contrasts" title="Contrast">
+				<param name="add_cont" title="Contrast" type="text" label="Enter the contrast of interest, e.g. (G1+G2)/2-G3 (no spaces or commas)"/>
+			</repeat>
+			<conditional name="exp">	
+				<param name="export_norm" type="select" label="Save Normalised DGE Matrix">
+					<option value="true">Yes</option>
+					<option value="false">No</option>
+				</param>
+				<when value="true">
+					<param name="log" type="boolean" truevalue="-l" falsevalue="" checked="True" label="Export Normalised DGE Matrix in Log2" help="Selecting this will log base 2 transform the Normalised Digital Gene Expression Matrix."/>
+				</when>
+			</conditional>
+		</when>
+	</conditional>
+  </inputs>
+  
+  <outputs>
+    	<data format="tabular" name="output" label="EdgeR analysis on ${matrix.name}"/>
+    	<data name="html_file" format="html" label="EdgeR analysis plots for ${matrix.name}"/>
+    	<data name="norm_exp" format="tabular" label="EdgeR Norm Expr Matrix for ${matrix.name}">
+    		<filter>analysis_type[ "analysis" ] != "pw" and analysis_type[ "exp" ][ "export_norm" ] == "true"</filter>
+    	</data>
+  </outputs>
+  	
+	<help>
+
+.. class:: infomark
+    
+**What it does**
+
+Estimates differential gene expression for short read sequence count using methods appropriate for count data.
+If you have paired data you may also want to consider Tophat/Cufflinks. 
+Input must be raw count data for each sequence arranged in a rectangular matrix as a tabular file.
+Note - no scaling - please make sure you have untransformed raw counts of reads for each sequence.
+ 
+Performs digital differential gene expression analysis between groups (eg a treatment and control).
+Biological replicates provide information about experimental variability required for reliable inference.
+
+**What it does not do**
+edgeR_ requires biological replicates. 
+Without replicates you can't account for known important experimental sources of variability that the approach implemented here requires.
+
+
+**Input**
+A count matrix containing sequence names as rows and sample specific counts of reads from this sequence as columns.
+The matrix must have 2 header rows, the first indicating the group assignment and the second uniquely identifiying the samples. It must also contain a unique set of (eg Feature) names in the first column. 
+
+Example::
+
+	#	G1:Mut	G1:Mut	G1:Mut	G2:WT	G2:WT	G2:WT
+	#Feature	Spl1	Spl2	Spl3	Spl4	Spl5	Spl6
+	NM_001001130	97	43	61	34	73	26
+	NM_001001144	25	8	9	3	5	5
+	NM_001001152	72	45	29	20	31	13
+	NM_001001160	0	1	1	1	0	0
+	NM_001001177	0	1	0	4	3	3
+	NM_001001178	0	2	1	0	4	0
+	NM_001001179	0	0	0	0	0	2
+	NM_001001180	0	0	0	0	0	2
+	NM_001001181	415	319	462	185	391	155
+	NM_001001182	1293	945	987	297	938	496
+	NM_001001183	5	4	11	7	11	2
+	NM_001001184	135	198	178	110	205	64
+	NM_001001185	186	1	0	1	1	0
+	NM_001001186	75	90	91	34	63	54
+	NM_001001187	267	236	170	165	202	51
+	NM_001001295	5	2	6	1	7	0
+	NM_001001309	1	0	0	1	2	1
+	...
+	
+
+Please use the "Count reads in features with htseq-count" tool to generate the count matrix.
+
+**Output**
+
+A tabular file containing relative expression levels, statistical estimates of differential expression probability, R scripts, log, and some helpful diagnostic plots.
+
+**Fixed Parameters**
+ 
+Method for allowing the prior distribution for the dispersion to be abundance-dependent used: movingave
+
+False discovery rate adjustment method used: Benjamini and Hochberg (1995)
+
+GLM dispersion estimate used: Tagwise Dispersion
+
+Gene filter used: less than 1 count per million reads
+ 
+.. class:: infomark
+
+**Attribution**
+This tool wraps the edgeR_ Bioconductor package so all calculations and plots are controlled by that code. See edgeR_ for all documentation and appropriate attribution. 
+Recommended reference is Mark D. Robinson, Davis J. McCarthy, Gordon K. Smyth, PMCID: PMC2796818
+
+.. class:: infomark
+
+**Attribution**
+When applying the LIMMA (Linear models for RNA-Seq) anlysis the tool also makes use of the limma_ Bioconductor package.
+Recommended reference is Smyth, G. K. (2005). Limma: linear models for microarray data. In: 'Bioinformatics and Computational Biology Solutions using R and Bioconductor'. R. Gentleman, V. Carey, S. Dudoit, R. Irizarry, W. Huber (eds), Springer, New York, pages 397--420.
+
+ .. _edgeR: http://www.bioconductor.org/packages/release/bioc/html/edgeR.html
+ .. _limma: http://www.bioconductor.org/packages/release/bioc/html/limma.html
+
+
+	</help>
+  
+</tool>