2
|
1 #/bin/perl
|
|
2
|
|
3 use strict;
|
|
4 use warnings;
|
|
5 use Getopt::Std;
|
|
6 use File::Basename;
|
|
7 use File::Path qw(make_path remove_tree);
|
|
8 $| = 1;
|
|
9
|
|
10 # Grab and set all options
|
|
11 my %OPTIONS = (a => "glm", d => "tag", f => "BH", p => 0.3, r => 5, u => "movingave");
|
|
12
|
|
13 getopts('a:d:e:f:h:lmn:o:p:r:tu:', \%OPTIONS);
|
|
14
|
|
15 die qq(
|
|
16 Usage: edgeR.pl [OPTIONS] factor::factor1::levels [factor::factor2::levels ...] cp::cont_pred1::values [cp::cont_pred2::values ...] cnt::contrast1 [cnt::contrast2] matrix
|
|
17
|
|
18 OPTIONS: -a STR Type Of Analysis [glm, pw, limma] (default: $OPTIONS{a})
|
|
19 -d STR The dispersion estimate to use for GLM analysis [tag, trend, common] (default: $OPTIONS{d})
|
|
20 -e STR Path to place additional output files
|
|
21 -f STR False discovery rate adjustment method [BH, holm, hochberg, hommel, BY, none] (default: $OPTIONS{f})
|
|
22 -h STR Name of html file for additional files
|
|
23 -l Output the normalised digital gene expression matrix in log2 format (only applicable when using limma and -n is also specified)
|
|
24 -m Perform all pairwise comparisons
|
|
25 -n STR File name to output the normalised digital gene expression matrix (only applicable when usinf glm or limma model)
|
|
26 -o STR File name to output csv file with results
|
|
27 -p FLT The proportion of all tags/genes to be used for the locally weighted estimation of the tagwise dispersion, ony applicable when 1 factor analysis selected (default: $OPTIONS{p})
|
|
28 -r INT Common Dispersion Rowsum Filter, ony applicable when 1 factor analysis selected (default: $OPTIONS{r})
|
|
29 -t Estimate Tagwise Disp when performing 1 factor analysis
|
|
30 -u STR Method for allowing the prior distribution for the dispersion to be abundance- dependent ["movingave", "tricube", "none"] (default: $OPTIONS{u})
|
|
31
|
|
32 ) if(!@ARGV);
|
|
33
|
|
34 my $matrix = pop @ARGV;
|
|
35
|
|
36 make_path($OPTIONS{e});
|
|
37 open(Rcmd,">$OPTIONS{e}/r_script.R") or die "Cannot open $OPTIONS{e}/r_script.R\n\n";
|
|
38 print Rcmd "
|
|
39 zz <- file(\"$OPTIONS{e}/r_script.err\", open=\"wt\")
|
|
40 sink(zz)
|
|
41 sink(zz, type=\"message\")
|
|
42
|
|
43 library(edgeR)
|
|
44 library(limma)
|
|
45
|
|
46 # read in matrix and groups
|
|
47 toc <- read.table(\"$matrix\", sep=\"\\t\", comment=\"\", as.is=T)
|
|
48 groups <- sapply(toc[1, -1], strsplit, \":\")
|
|
49 for(i in 1:length(groups)) { g <- make.names(groups[[i]][2]); names(groups)[i] <- g; groups[[i]] <- groups[[i]][-2] }
|
|
50 colnames(toc) <- make.names(toc[2,])
|
|
51 toc[,1] <- gsub(\",\", \".\", toc[,1])
|
|
52 tagnames <- toc[-(1:2), 1]
|
|
53 rownames(toc) <- toc[,1]
|
|
54 toc <- toc[-(1:2), -1]
|
|
55 for(i in colnames(toc)) toc[, i] <- as.numeric(toc[,i])
|
|
56 norm_factors <- calcNormFactors(as.matrix(toc))
|
|
57
|
|
58 pw_tests <- list()
|
|
59 uniq_groups <- unique(names(groups))
|
|
60 for(i in 1:(length(uniq_groups)-1)) for(j in (i+1):length(uniq_groups)) pw_tests[[length(pw_tests)+1]] <- c(uniq_groups[i], uniq_groups[j])
|
|
61 DGE <- DGEList(toc, lib.size=norm_factors*colSums(toc), group=names(groups))
|
|
62 pdf(\"$OPTIONS{e}/MA_plots_normalisation.pdf\", width=14)
|
|
63 for(i in 1:length(pw_tests)) {
|
|
64 j <- c(which(names(groups) == pw_tests[[i]][1])[1], which(names(groups) == pw_tests[[i]][2])[1])
|
|
65 par(mfrow = c(1, 2))
|
|
66 maPlot(toc[, j[1]], toc[, j[2]], normalize = TRUE, pch = 19, cex = 0.2, ylim = c(-10, 10), main=paste(\"MA Plot\", colnames(toc)[j[1]], \"vs\", colnames(toc)[j[2]]))
|
|
67 grid(col = \"blue\")
|
|
68 abline(h = log2(norm_factors[j[2]]), col = \"red\", lwd = 4)
|
|
69 maPlot(DGE\$counts[, j[1]]/DGE\$samples\$lib.size[j[1]], DGE\$counts[, j[2]]/DGE\$samples\$lib.size[j[2]], normalize = FALSE, pch = 19, cex = 0.2, ylim = c(-8, 8), main=paste(\"MA Plot\", colnames(toc)[j[1]], \"vs\", colnames(toc)[j[2]], \"Normalised\"))
|
|
70 grid(col = \"blue\")
|
|
71 }
|
|
72 dev.off()
|
|
73 pdf(file=\"$OPTIONS{e}/MDSplot.pdf\")
|
|
74 plotMDS(DGE, main=\"MDS Plot\", col=as.numeric(factor(names(groups)))+1, xlim=c(-3,3))
|
|
75 dev.off()
|
|
76 tested <- list()
|
|
77 ";
|
|
78
|
|
79 my $all_cont;
|
|
80 my @add_cont;
|
|
81 my @fact;
|
|
82 my @fact_names;
|
|
83 my @cp;
|
|
84 my @cp_names;
|
|
85 if(@ARGV) {
|
|
86 foreach my $input (@ARGV) {
|
|
87 my @tmp = split "::", $input;
|
|
88 if($tmp[0] eq "factor") {
|
|
89 $tmp[1] =~ s/[ \?\(\)\[\]\/\\=+<>:;\"\',\*\^\|\&-]/./g;
|
|
90 push @fact_names, $tmp[1];
|
|
91 $tmp[2] =~ s/:/\", \"/g;
|
|
92 $tmp[2] = "\"".$tmp[2]."\"";
|
|
93 push @fact, $tmp[2];
|
|
94 } elsif($tmp[0] eq "cp") {
|
|
95 $tmp[1] =~ s/[ \?\(\)\[\]\/\\=+<>:;\"\',\*\^\|\&-]/./g;
|
|
96 push @cp_names, $tmp[1];
|
|
97 $tmp[2] =~ s/:/, /g;
|
|
98 push @cp, $tmp[2];
|
|
99 } elsif($tmp[0] eq "cnt") {
|
|
100 push @add_cont, $tmp[1];
|
|
101 } else {
|
|
102 die("Unknown Input: $input\n");
|
|
103 }
|
|
104 }
|
|
105 }
|
|
106
|
|
107 if($OPTIONS{a} eq "pw") {
|
|
108 print Rcmd "
|
|
109 disp <- estimateCommonDisp(DGE, rowsum.filter=$OPTIONS{r})
|
|
110 ";
|
|
111 if(defined $OPTIONS{t}) {
|
|
112 print Rcmd "
|
|
113 disp <- estimateTagwiseDisp(disp, trend=\"$OPTIONS{u}\", prop.used=$OPTIONS{p})
|
|
114 pdf(file=\"$OPTIONS{e}/Tagwise_Dispersion_vs_Abundance.pdf\")
|
|
115 plot(log2(1e06*disp\$conc\$conc.common), disp\$tagwise.dispersion, xlab=\"Counts per million (log2 scale)\", ylab=\"Tagwise dispersion\")
|
|
116 abline(h=disp\$common.dispersion, col=\"firebrick\", lwd=3)
|
|
117 dev.off()
|
|
118 "
|
|
119 }
|
|
120 print Rcmd "
|
|
121 for(i in 1:length(pw_tests)) {
|
|
122 tested[[i]] <- exactTest(disp, pair=pw_tests[[i]])
|
|
123 names(tested)[i] <- paste(pw_tests[[i]][2], \"-\", pw_tests[[i]][1], sep=\"\")
|
|
124 }
|
|
125 pdf(file=\"$OPTIONS{e}/Smear_Plots.pdf\")
|
|
126 for(i in 1:length(pw_tests)) {
|
|
127 if(nrow(decideTestsDGE(tested[[i]] , p.value=0.05)) > 0) {
|
|
128 de_tags <- rownames(decideTestsDGE(tested[[i]] , p.value=0.05, adjust.method=\"$OPTIONS{f}\"))
|
|
129 ttl <- \"(Diff. Exp. Genes With adj. Pvalue < 0.05 highlighted)\"
|
|
130 } else {
|
|
131 de_tags <- rownames(topTags(tested[[i]], n=100)\$table)
|
|
132 ttl <- \"(Top 100 tags highlighted)\"
|
|
133 }
|
|
134
|
|
135 plotSmear(disp, pair=pw_tests[[i]], de.tags = de_tags, main = paste(\"FC plot\", ttl))
|
|
136 abline(h = c(-2, 2), col = \"dodgerblue\")
|
|
137 }
|
|
138 dev.off()
|
|
139 ";
|
|
140 } elsif($OPTIONS{a} eq "glm") {
|
|
141 for(my $fct = 0; $fct <= $#fact_names; $fct++) {
|
|
142 print Rcmd "
|
|
143 $fact_names[$fct] <- c($fact[$fct])
|
|
144 ";
|
|
145 }
|
|
146 for(my $fct = 0; $fct <= $#cp_names; $fct++) {
|
|
147 print Rcmd "
|
|
148 $cp_names[$fct] <- c($cp[$fct])
|
|
149 ";
|
|
150 }
|
|
151 my $all_fact = "";
|
|
152 if(@fact_names) {
|
|
153 foreach (@fact_names) {
|
|
154 $all_fact .= " + factor($_)";
|
|
155 }
|
|
156 }
|
|
157 my $all_cp = "";
|
|
158 if(@cp_names) {
|
|
159 $all_cp = " + ".join(" + ", @cp_names);
|
|
160 }
|
|
161 print Rcmd "
|
|
162 group_fact <- factor(names(groups))
|
|
163 design <- model.matrix(~ -1 + group_fact${all_fact}${all_cp})
|
|
164 colnames(design) <- sub(\"group_fact\", \"\", colnames(design))
|
|
165 ";
|
|
166 foreach my $fct (@fact_names) {
|
|
167 print Rcmd "
|
|
168 colnames(design) <- make.names(sub(\"factor.$fct.\", \"\", colnames(design)))
|
|
169 ";
|
|
170 }
|
|
171 print Rcmd "
|
|
172 disp <- estimateGLMCommonDisp(DGE, design)
|
|
173 ";
|
|
174 if($OPTIONS{d} eq "tag" || $OPTIONS{d} eq "trend") {
|
|
175 print Rcmd "
|
|
176 disp <- estimateGLMTrendedDisp(disp, design)
|
|
177 ";
|
|
178 }
|
|
179 if($OPTIONS{d} eq "tag") {
|
|
180 print Rcmd "
|
|
181 disp <- estimateGLMTagwiseDisp(disp, design)
|
|
182 fit <- glmFit(disp, design)
|
|
183 pdf(file=\"$OPTIONS{e}/Tagwise_Dispersion_vs_Abundance.pdf\")
|
|
184 plot(fit\$abund+log(1e06), sqrt(disp\$tagwise.dispersion), xlab=\"Counts per million (log2 scale)\", ylab=\"Tagwise dispersion\")
|
|
185 oo <- order(disp\$abundance)
|
|
186 lines(fit\$abundance[oo]+log(1e06), sqrt(disp\$trended.dispersion[oo]), col=\"dodgerblue\", lwd=3)
|
|
187 abline(h=sqrt(disp\$common.dispersion), col=\"firebrick\", lwd=3)
|
|
188 dev.off()
|
|
189 ";
|
|
190 }
|
|
191 if(@add_cont) {
|
|
192 $all_cont = "\"".join("\", \"", @add_cont)."\"";
|
|
193 print Rcmd "
|
|
194 cont <- c(${all_cont})
|
|
195 for(i in uniq_groups) cont <- gsub(paste(groups[[i]], \"([^0-9])\", sep=\"\"), paste(i, \"\\\\1\", sep=\"\"), cont)
|
|
196 for(i in uniq_groups) cont <- gsub(paste(groups[[i]], \"\$\", sep=\"\"), i, cont)
|
|
197 ";
|
|
198 } else {
|
|
199 print Rcmd "
|
|
200 cont <- NULL
|
|
201 ";
|
|
202 }
|
|
203 if(defined $OPTIONS{m}) {
|
|
204 print Rcmd "
|
|
205 for(i in 1:length(pw_tests)) cont <- c(cont, paste(pw_tests[[i]][2], \"-\", pw_tests[[i]][1], sep=\"\"))
|
|
206 ";
|
|
207 }
|
|
208 if(!defined $OPTIONS{m} && !@add_cont){
|
|
209 die("No Contrasts have been specified, you must at least either select multiple pairwise comparisons or specify a custom contrast\n");
|
|
210 }
|
|
211 print Rcmd "
|
|
212 fit <- glmFit(disp, design)
|
|
213 cont <- makeContrasts(contrasts=cont, levels=design)
|
|
214 for(i in colnames(cont)) tested[[i]] <- glmLRT(disp, fit, contrast=cont[,i])
|
|
215 ";
|
|
216 if(defined $OPTIONS{n}) {
|
|
217 print Rcmd "
|
|
218 tab <- data.frame(ID=rownames(fit\$fitted.values), fit\$fitted.values, stringsAsFactors=F)
|
|
219 write.table(tab, \"$OPTIONS{n}\", quote=F, sep=\"\\t\", row.names=F)
|
|
220 ";
|
|
221 }
|
|
222 } elsif($OPTIONS{a} eq "limma") {
|
|
223 for(my $fct = 0; $fct <= $#fact_names; $fct++) {
|
|
224 print Rcmd "
|
|
225 $fact_names[$fct] <- c($fact[$fct])
|
|
226 ";
|
|
227 }
|
|
228 for(my $fct = 0; $fct <= $#cp_names; $fct++) {
|
|
229 print Rcmd "
|
|
230 $cp_names[$fct] <- c($cp[$fct])
|
|
231 ";
|
|
232 }
|
|
233 my $all_fact = "";
|
|
234 if(@fact_names) {
|
|
235 foreach (@fact_names) {
|
|
236 $all_fact .= " + factor($_)";
|
|
237 }
|
|
238 }
|
|
239 my $all_cp = "";
|
|
240 if(@cp_names) {
|
|
241 $all_cp = " + ".join(" + ", @cp_names);
|
|
242 }
|
|
243 print Rcmd "
|
|
244 group_fact <- factor(names(groups))
|
|
245 design <- model.matrix(~ -1 + group_fact${all_fact}${all_cp})
|
|
246 colnames(design) <- sub(\"group_fact\", \"\", colnames(design))
|
|
247 ";
|
|
248 foreach my $fct (@fact_names) {
|
|
249 print Rcmd "
|
|
250 colnames(design) <- make.names(sub(\"factor.$fct.\", \"\", colnames(design)))
|
|
251 ";
|
|
252 }
|
|
253 print Rcmd "
|
|
254 isexpr <- rowSums(cpm(toc)>1) >= 2
|
|
255 toc <- toc[isexpr, ]
|
|
256 pdf(file=\"$OPTIONS{e}/LIMMA_voom.pdf\")
|
|
257 y <- voom(toc, design, plot=TRUE, lib.size=colSums(toc)*norm_factors)
|
|
258 dev.off()
|
|
259
|
|
260 pdf(file=\"$OPTIONS{e}/LIMMA_MDS_plot.pdf\")
|
|
261 plotMDS(y, labels=colnames(toc), col=as.numeric(factor(names(groups)))+1, gene.selection=\"common\")
|
|
262 dev.off()
|
|
263 fit <- lmFit(y, design)
|
|
264 ";
|
|
265 if(defined $OPTIONS{n}) {
|
|
266 if(defined $OPTIONS{l}) {
|
|
267 print Rcmd "
|
|
268 tab <- data.frame(ID=rownames(y\$E), y\$E, stringsAsFactors=F)
|
|
269 ";
|
|
270 } else {
|
|
271 print Rcmd "
|
|
272 tab <- data.frame(ID=rownames(y\$E), 2^y\$E, stringsAsFactors=F)
|
|
273 ";
|
|
274 }
|
|
275 print Rcmd "
|
|
276 write.table(tab, \"$OPTIONS{n}\", quote=F, sep=\"\\t\", row.names=F)
|
|
277 ";
|
|
278 }
|
|
279 if(@add_cont) {
|
|
280 $all_cont = "\"".join("\", \"", @add_cont)."\"";
|
|
281 print Rcmd "
|
|
282 cont <- c(${all_cont})
|
|
283 for(i in uniq_groups) cont <- gsub(paste(groups[[i]], \"([^0-9])\", sep=\"\"), paste(i, \"\\\\1\", sep=\"\"), cont)
|
|
284 for(i in uniq_groups) cont <- gsub(paste(groups[[i]], \"\$\", sep=\"\"), i, cont)
|
|
285 ";
|
|
286 } else {
|
|
287 print Rcmd "
|
|
288 cont <- NULL
|
|
289 ";
|
|
290 }
|
|
291 if(defined $OPTIONS{m}) {
|
|
292 print Rcmd "
|
|
293 for(i in 1:length(pw_tests)) cont <- c(cont, paste(pw_tests[[i]][2], \"-\", pw_tests[[i]][1], sep=\"\"))
|
|
294 ";
|
|
295 }
|
|
296 if(!defined $OPTIONS{m} && !@add_cont){
|
|
297 die("No Contrasts have been specified, you must at least either select multiple pairwise comparisons or specify a custom contrast\n");
|
|
298 }
|
|
299 print Rcmd "
|
|
300 cont <- makeContrasts(contrasts=cont, levels=design)
|
|
301 fit2 <- contrasts.fit(fit, cont)
|
|
302 fit2 <- eBayes(fit2)
|
|
303 ";
|
|
304 } else {
|
|
305 die("Anaysis type $OPTIONS{a} not found\n");
|
|
306
|
|
307 }
|
|
308
|
|
309 if($OPTIONS{a} ne "limma") {
|
|
310 print Rcmd "
|
|
311 options(digits = 6)
|
|
312 tab <- NULL
|
|
313 for(i in names(tested)) {
|
|
314 tab_tmp <- topTags(tested[[i]], n=Inf, adjust.method=\"$OPTIONS{f}\")[[1]]
|
|
315 colnames(tab_tmp) <- paste(i, colnames(tab_tmp), sep=\":\")
|
|
316 tab_tmp <- tab_tmp[tagnames,]
|
|
317 if(is.null(tab)) {
|
|
318 tab <- tab_tmp
|
|
319 } else tab <- cbind(tab, tab_tmp)
|
|
320 }
|
|
321 tab <- cbind(Feature=rownames(tab), tab)
|
|
322 ";
|
|
323 } else {
|
|
324 print Rcmd "
|
|
325 tab <- NULL
|
|
326 options(digits = 6)
|
|
327 for(i in colnames(fit2)) {
|
|
328 tab_tmp <- topTable(fit2, coef=i, n=Inf, sort.by=\"none\", adjust.method=\"$OPTIONS{f}\")
|
|
329 colnames(tab_tmp)[-1] <- paste(i, colnames(tab_tmp)[-1], sep=\":\")
|
|
330 if(is.null(tab)) {
|
|
331 tab <- tab_tmp
|
|
332 } else tab <- cbind(tab, tab_tmp[,-1])
|
|
333 }
|
|
334 ";
|
|
335 }
|
|
336 print Rcmd "
|
|
337 write.table(tab, \"$OPTIONS{o}\", quote=F, sep=\"\\t\", row.names=F)
|
|
338 sink(type=\"message\")
|
|
339 sink()
|
|
340 ";
|
|
341 close(Rcmd);
|
|
342 system("R --no-restore --no-save --no-readline < $OPTIONS{e}/r_script.R > $OPTIONS{e}/r_script.out");
|
|
343
|
|
344 open(HTML, ">$OPTIONS{h}");
|
|
345 print HTML "<html><head><title>EdgeR: Empirical analysis of digital gene expression data</title></head><body><h3>EdgeR Additional Files:</h3><p><ul>\n";
|
|
346 print HTML "<li><a href=MA_plots_normalisation.pdf>MA_plots_normalisation.pdf</a></li>\n";
|
|
347 print HTML "<li><a href=MDSplot.pdf>MDSplot.pdf</a></li>\n";
|
|
348 if($OPTIONS{a} eq "pw") {
|
|
349 if(defined $OPTIONS{t}) {
|
|
350 print HTML "<li><a href=Tagwise_Dispersion_vs_Abundance.pdf>Tagwise_Dispersion_vs_Abundance.pdf</a></li>\n";
|
|
351 }
|
|
352 print HTML "<li><a href=Smear_Plots.pdf>Smear_Plots.pdf</a></li>\n";
|
|
353 } elsif($OPTIONS{a} eq "glm" && $OPTIONS{d} eq "tag") {
|
|
354 print HTML "<li><a href=Tagwise_Dispersion_vs_Abundance.pdf>Tagwise_Dispersion_vs_Abundance.pdf</a></li>\n";
|
|
355 } elsif($OPTIONS{a} eq "limma") {
|
|
356 print HTML "<li><a href=LIMMA_MDS_plot.pdf>LIMMA_MDS_plot.pdf</a></li>\n";
|
|
357 print HTML "<li><a href=LIMMA_voom.pdf>LIMMA_voom.pdf</a></li>\n";
|
|
358 }
|
|
359 print HTML "<li><a href=r_script.R>r_script.R</a></li>\n";
|
|
360 print HTML "<li><a href=r_script.out>r_script.out</a></li>\n";
|
|
361 print HTML "<li><a href=r_script.err>r_script.err</a></li>\n";
|
|
362 print HTML "</ul></p>\n";
|
|
363 close(HTML);
|
|
364
|