comparison edgeR.pl @ 3:0f51cd8ddfb0 draft

Uploaded
author rouan
date Thu, 26 Dec 2013 05:35:42 -0500
parents
children
comparison
equal deleted inserted replaced
2:22c941d89a89 3:0f51cd8ddfb0
1 #/bin/perl
2
3 use strict;
4 use warnings;
5 use Getopt::Std;
6 use File::Basename;
7 use File::Path qw(make_path remove_tree);
8 $| = 1;
9
10 # Grab and set all options
11 my %OPTIONS = (a => "glm", d => "tag", f => "BH", r => 5, u => "movingave");
12
13 getopts('a:d:e:f:h:lmn:o:r:tu:', \%OPTIONS);
14
15 die qq(
16 Usage: edgeR.pl [OPTIONS] factor::factor1::levels [factor::factor2::levels ...] cp::cont_pred1::values [cp::cont_pred2::values ...] cnt::contrast1 [cnt::contrast2] matrix
17
18 OPTIONS: -a STR Type Of Analysis [glm, pw, limma] (default: $OPTIONS{a})
19 -d STR The dispersion estimate to use for GLM analysis [tag, trend, common] (default: $OPTIONS{d})
20 -e STR Path to place additional output files
21 -f STR False discovery rate adjustment method [BH, holm, hochberg, hommel, BY, none] (default: $OPTIONS{f})
22 -h STR Name of html file for additional files
23 -l Output the normalised digital gene expression matrix in log2 format (only applicable when using limma and -n is also specified)
24 -m Perform all pairwise comparisons
25 -n STR File name to output the normalised digital gene expression matrix (only applicable when usinf glm or limma model)
26 -o STR File name to output csv file with results
27 -r INT Common Dispersion Rowsum Filter, ony applicable when 1 factor analysis selected (default: $OPTIONS{r})
28 -t Estimate Tagwise Disp when performing 1 factor analysis
29 -u STR Method for allowing the prior distribution for the dispersion to be abundance- dependent ["movingave", "tricube", "none"] (default: $OPTIONS{u})
30
31 ) if(!@ARGV);
32
33 my $matrix = pop @ARGV;
34
35 make_path($OPTIONS{e});
36 open(Rcmd,">$OPTIONS{e}/r_script.R") or die "Cannot open $OPTIONS{e}/r_script.R\n\n";
37 print Rcmd "
38 zz <- file(\"$OPTIONS{e}/r_script.err\", open=\"wt\")
39 sink(zz)
40 sink(zz, type=\"message\")
41
42 library(edgeR)
43 library(limma)
44
45 # read in matrix and groups
46 toc <- read.table(\"$matrix\", sep=\"\\t\", comment=\"\", as.is=T)
47 groups <- sapply(toc[1, -1], strsplit, \":\")
48 for(i in 1:length(groups)) { g <- make.names(groups[[i]][2]); names(groups)[i] <- g; groups[[i]] <- groups[[i]][-2] }
49 colnames(toc) <- make.names(toc[2,])
50 toc[,1] <- gsub(\",\", \".\", toc[,1])
51 tagnames <- toc[-(1:2), 1]
52 rownames(toc) <- toc[,1]
53 toc <- toc[-(1:2), -1]
54 for(i in colnames(toc)) toc[, i] <- as.numeric(toc[,i])
55 norm_factors <- calcNormFactors(as.matrix(toc))
56
57 pw_tests <- list()
58 uniq_groups <- unique(names(groups))
59 for(i in 1:(length(uniq_groups)-1)) for(j in (i+1):length(uniq_groups)) pw_tests[[length(pw_tests)+1]] <- c(uniq_groups[i], uniq_groups[j])
60 DGE <- DGEList(toc, lib.size=norm_factors*colSums(toc), group=names(groups))
61 pdf(\"$OPTIONS{e}/MA_plots_normalisation.pdf\", width=14)
62 for(i in 1:length(pw_tests)) {
63 j <- c(which(names(groups) == pw_tests[[i]][1])[1], which(names(groups) == pw_tests[[i]][2])[1])
64 par(mfrow = c(1, 2))
65 maPlot(toc[, j[1]], toc[, j[2]], normalize = TRUE, pch = 19, cex = 0.2, ylim = c(-10, 10), main=paste(\"MA Plot\", colnames(toc)[j[1]], \"vs\", colnames(toc)[j[2]]))
66 grid(col = \"blue\")
67 abline(h = log2(norm_factors[j[2]]), col = \"red\", lwd = 4)
68 maPlot(DGE\$counts[, j[1]]/DGE\$samples\$lib.size[j[1]], DGE\$counts[, j[2]]/DGE\$samples\$lib.size[j[2]], normalize = FALSE, pch = 19, cex = 0.2, ylim = c(-8, 8), main=paste(\"MA Plot\", colnames(toc)[j[1]], \"vs\", colnames(toc)[j[2]], \"Normalised\"))
69 grid(col = \"blue\")
70 }
71 dev.off()
72 pdf(file=\"$OPTIONS{e}/MDSplot.pdf\")
73 plotMDS(DGE, main=\"MDS Plot\", col=as.numeric(factor(names(groups)))+1, xlim=c(-3,3))
74 dev.off()
75 tested <- list()
76 ";
77
78 my $all_cont;
79 my @add_cont;
80 my @fact;
81 my @fact_names;
82 my @cp;
83 my @cp_names;
84 if(@ARGV) {
85 foreach my $input (@ARGV) {
86 my @tmp = split "::", $input;
87 if($tmp[0] eq "factor") {
88 $tmp[1] =~ s/[ \?\(\)\[\]\/\\=+<>:;\"\',\*\^\|\&-]/./g;
89 push @fact_names, $tmp[1];
90 $tmp[2] =~ s/:/\", \"/g;
91 $tmp[2] = "\"".$tmp[2]."\"";
92 push @fact, $tmp[2];
93 } elsif($tmp[0] eq "cp") {
94 $tmp[1] =~ s/[ \?\(\)\[\]\/\\=+<>:;\"\',\*\^\|\&-]/./g;
95 push @cp_names, $tmp[1];
96 $tmp[2] =~ s/:/, /g;
97 push @cp, $tmp[2];
98 } elsif($tmp[0] eq "cnt") {
99 push @add_cont, $tmp[1];
100 } else {
101 die("Unknown Input: $input\n");
102 }
103 }
104 }
105
106 if($OPTIONS{a} eq "pw") {
107 print Rcmd "
108 disp <- estimateCommonDisp(DGE, rowsum.filter=$OPTIONS{r})
109 ";
110 if(defined $OPTIONS{t}) {
111 print Rcmd "
112 disp <- estimateTagwiseDisp(disp, trend=\"$OPTIONS{u}\")
113 pdf(file=\"$OPTIONS{e}/Tagwise_Dispersion_vs_Abundance.pdf\")
114 plotBCV(disp, cex=0.4)
115 abline(h=disp\$common.dispersion, col=\"firebrick\", lwd=3)
116 dev.off()
117 ";
118 }
119 print Rcmd "
120 for(i in 1:length(pw_tests)) {
121 tested[[i]] <- exactTest(disp, pair=pw_tests[[i]])
122 names(tested)[i] <- paste(pw_tests[[i]][2], \"-\", pw_tests[[i]][1], sep=\"\")
123 }
124 pdf(file=\"$OPTIONS{e}/Smear_Plots.pdf\")
125 for(i in 1:length(pw_tests)) {
126 dt <- decideTestsDGE(tested[[i]], p.value=0.05, adjust.method=\"$OPTIONS{f}\")
127 if(sum(dt) > 0) {
128 de_tags <- rownames(disp)[which(dt != 0)]
129 ttl <- \"Diff. Exp. Genes With adj. Pvalue < 0.05\"
130 } else {
131 de_tags <- rownames(topTags(tested[[i]], n=100)\$table)
132 ttl <- \"Top 100 tags\"
133 }
134
135 if(length(dt) < 5000) {
136 pointcex = 0.5
137 } else {
138 pointcex = 0.2
139 }
140 plotSmear(disp, pair=pw_tests[[i]], de.tags = de_tags, main = paste(\"Smear Plot\", names(tested)[i]), cex=0.5)
141 abline(h = c(-1, 1), col = \"blue\")
142 legend(\"topright\", c(\"2 Fold Change\", ttl) , lty=c(1, NA), pch=c(NA, 19), pt.cex=0.5, col=c(\"blue\", \"red\"), bty=\"n\")
143 }
144 dev.off()
145 ";
146 } elsif($OPTIONS{a} eq "glm") {
147 for(my $fct = 0; $fct <= $#fact_names; $fct++) {
148 print Rcmd "
149 $fact_names[$fct] <- c($fact[$fct])
150 ";
151 }
152 for(my $fct = 0; $fct <= $#cp_names; $fct++) {
153 print Rcmd "
154 $cp_names[$fct] <- c($cp[$fct])
155 ";
156 }
157 my $all_fact = "";
158 if(@fact_names) {
159 foreach (@fact_names) {
160 $all_fact .= " + factor($_)";
161 }
162 }
163 my $all_cp = "";
164 if(@cp_names) {
165 $all_cp = " + ".join(" + ", @cp_names);
166 }
167 print Rcmd "
168 group_fact <- factor(names(groups))
169 design <- model.matrix(~ -1 + group_fact${all_fact}${all_cp})
170 colnames(design) <- sub(\"group_fact\", \"\", colnames(design))
171 ";
172 foreach my $fct (@fact_names) {
173 print Rcmd "
174 colnames(design) <- make.names(sub(\"factor.$fct.\", \"\", colnames(design)))
175 ";
176 }
177 print Rcmd "
178 disp <- estimateGLMCommonDisp(DGE, design)
179 ";
180 if($OPTIONS{d} eq "tag" || $OPTIONS{d} eq "trend") {
181 print Rcmd "
182 disp <- estimateGLMTrendedDisp(disp, design)
183 ";
184 }
185 if($OPTIONS{d} eq "tag") {
186 print Rcmd "
187 disp <- estimateGLMTagwiseDisp(disp, design)
188 fit <- glmFit(disp, design)
189 pdf(file=\"$OPTIONS{e}/Tagwise_Dispersion_vs_Abundance.pdf\")
190 plotBCV(disp, cex=0.4)
191 dev.off()
192 ";
193 }
194 if(@add_cont) {
195 $all_cont = "\"".join("\", \"", @add_cont)."\"";
196 print Rcmd "
197 cont <- c(${all_cont})
198 for(i in uniq_groups) cont <- gsub(paste(groups[[i]], \"([^0-9])\", sep=\"\"), paste(i, \"\\\\1\", sep=\"\"), cont)
199 for(i in uniq_groups) cont <- gsub(paste(groups[[i]], \"\$\", sep=\"\"), i, cont)
200 ";
201 } else {
202 print Rcmd "
203 cont <- NULL
204 ";
205 }
206 if(defined $OPTIONS{m}) {
207 print Rcmd "
208 for(i in 1:length(pw_tests)) cont <- c(cont, paste(pw_tests[[i]][2], \"-\", pw_tests[[i]][1], sep=\"\"))
209 ";
210 }
211 if(!defined $OPTIONS{m} && !@add_cont){
212 die("No Contrasts have been specified, you must at least either select multiple pairwise comparisons or specify a custom contrast\n");
213 }
214 print Rcmd "
215 fit <- glmFit(disp, design)
216 cont <- makeContrasts(contrasts=cont, levels=design)
217 for(i in colnames(cont)) tested[[i]] <- glmLRT(fit, contrast=cont[,i])
218 pdf(file=\"$OPTIONS{e}/Smear_Plots.pdf\")
219 for(i in colnames(cont)) {
220 dt <- decideTestsDGE(tested[[i]], p.value=0.05, adjust.method=\"$OPTIONS{f}\")
221 if(sum(dt) > 0) {
222 de_tags <- rownames(disp)[which(dt != 0)]
223 ttl <- \"Diff. Exp. Genes With adj. Pvalue < 0.05\"
224 } else {
225 de_tags <- rownames(topTags(tested[[i]], n=100)\$table)
226 ttl <- \"Top 100 tags\"
227 }
228
229 if(length(dt) < 5000) {
230 pointcex = 0.5
231 } else {
232 pointcex = 0.2
233 }
234 plotSmear(disp, de.tags = de_tags, main = paste(\"Smear Plot\", i), cex=pointcex)
235 abline(h = c(-1, 1), col = \"blue\")
236 legend(\"topright\", c(\"2 Fold Change\", ttl) , lty=c(1, NA), pch=c(NA, 19), pt.cex=0.5, col=c(\"blue\", \"red\"), bty=\"n\")
237 }
238 dev.off()
239 ";
240 if(defined $OPTIONS{n}) {
241 print Rcmd "
242 tab <- data.frame(ID=rownames(fit\$fitted.values), fit\$fitted.values, stringsAsFactors=F)
243 write.table(tab, \"$OPTIONS{n}\", quote=F, sep=\"\\t\", row.names=F)
244 ";
245 }
246 } elsif($OPTIONS{a} eq "limma") {
247 for(my $fct = 0; $fct <= $#fact_names; $fct++) {
248 print Rcmd "
249 $fact_names[$fct] <- c($fact[$fct])
250 ";
251 }
252 for(my $fct = 0; $fct <= $#cp_names; $fct++) {
253 print Rcmd "
254 $cp_names[$fct] <- c($cp[$fct])
255 ";
256 }
257 my $all_fact = "";
258 if(@fact_names) {
259 foreach (@fact_names) {
260 $all_fact .= " + factor($_)";
261 }
262 }
263 my $all_cp = "";
264 if(@cp_names) {
265 $all_cp = " + ".join(" + ", @cp_names);
266 }
267 print Rcmd "
268 group_fact <- factor(names(groups))
269 design <- model.matrix(~ -1 + group_fact${all_fact}${all_cp})
270 colnames(design) <- sub(\"group_fact\", \"\", colnames(design))
271 ";
272 foreach my $fct (@fact_names) {
273 print Rcmd "
274 colnames(design) <- make.names(sub(\"factor.$fct.\", \"\", colnames(design)))
275 ";
276 }
277 print Rcmd "
278 isexpr <- rowSums(cpm(toc)>1) >= 2
279 toc <- toc[isexpr, ]
280 pdf(file=\"$OPTIONS{e}/LIMMA_voom.pdf\")
281 y <- voom(toc, design, plot=TRUE, lib.size=colSums(toc)*norm_factors)
282 dev.off()
283
284 pdf(file=\"$OPTIONS{e}/LIMMA_MDS_plot.pdf\")
285 plotMDS(y, labels=colnames(toc), col=as.numeric(factor(names(groups)))+1, gene.selection=\"common\")
286 dev.off()
287 fit <- lmFit(y, design)
288 ";
289 if(defined $OPTIONS{n}) {
290 if(defined $OPTIONS{l}) {
291 print Rcmd "
292 tab <- data.frame(ID=rownames(y\$E), y\$E, stringsAsFactors=F)
293 ";
294 } else {
295 print Rcmd "
296 tab <- data.frame(ID=rownames(y\$E), 2^y\$E, stringsAsFactors=F)
297 ";
298 }
299 print Rcmd "
300 write.table(tab, \"$OPTIONS{n}\", quote=F, sep=\"\\t\", row.names=F)
301 ";
302 }
303 if(@add_cont) {
304 $all_cont = "\"".join("\", \"", @add_cont)."\"";
305 print Rcmd "
306 cont <- c(${all_cont})
307 for(i in uniq_groups) cont <- gsub(paste(groups[[i]], \"([^0-9])\", sep=\"\"), paste(i, \"\\\\1\", sep=\"\"), cont)
308 for(i in uniq_groups) cont <- gsub(paste(groups[[i]], \"\$\", sep=\"\"), i, cont)
309 ";
310 } else {
311 print Rcmd "
312 cont <- NULL
313 ";
314 }
315 if(defined $OPTIONS{m}) {
316 print Rcmd "
317 for(i in 1:length(pw_tests)) cont <- c(cont, paste(pw_tests[[i]][2], \"-\", pw_tests[[i]][1], sep=\"\"))
318 ";
319 }
320 if(!defined $OPTIONS{m} && !@add_cont){
321 die("No Contrasts have been specified, you must at least either select multiple pairwise comparisons or specify a custom contrast\n");
322 }
323 print Rcmd "
324 cont <- makeContrasts(contrasts=cont, levels=design)
325 fit2 <- contrasts.fit(fit, cont)
326 fit2 <- eBayes(fit2)
327 ";
328 } else {
329 die("Anaysis type $OPTIONS{a} not found\n");
330
331 }
332
333 if($OPTIONS{a} ne "limma") {
334 print Rcmd "
335 options(digits = 6)
336 tab <- NULL
337 for(i in names(tested)) {
338 tab_tmp <- topTags(tested[[i]], n=Inf, adjust.method=\"$OPTIONS{f}\")[[1]]
339 colnames(tab_tmp) <- paste(i, colnames(tab_tmp), sep=\":\")
340 tab_tmp <- tab_tmp[tagnames,]
341 if(is.null(tab)) {
342 tab <- tab_tmp
343 } else tab <- cbind(tab, tab_tmp)
344 }
345 tab <- cbind(Feature=rownames(tab), tab)
346 ";
347 } else {
348 print Rcmd "
349 tab <- NULL
350 options(digits = 6)
351 for(i in colnames(fit2)) {
352 tab_tmp <- topTable(fit2, coef=i, n=Inf, sort.by=\"none\", adjust.method=\"$OPTIONS{f}\")
353 colnames(tab_tmp)[-1] <- paste(i, colnames(tab_tmp)[-1], sep=\":\")
354 if(is.null(tab)) {
355 tab <- tab_tmp
356 } else tab <- cbind(tab, tab_tmp[,-1])
357 }
358 ";
359 }
360 print Rcmd "
361 write.table(tab, \"$OPTIONS{o}\", quote=F, sep=\"\\t\", row.names=F)
362 sink(type=\"message\")
363 sink()
364 ";
365 close(Rcmd);
366 system("R --no-restore --no-save --no-readline < $OPTIONS{e}/r_script.R > $OPTIONS{e}/r_script.out");
367
368 open(HTML, ">$OPTIONS{h}");
369 print HTML "<html><head><title>EdgeR: Empirical analysis of digital gene expression data</title></head><body><h3>EdgeR Additional Files:</h3><p><ul>\n";
370 print HTML "<li><a href=MA_plots_normalisation.pdf>MA_plots_normalisation.pdf</a></li>\n";
371 print HTML "<li><a href=MDSplot.pdf>MDSplot.pdf</a></li>\n";
372 if($OPTIONS{a} eq "pw") {
373 if(defined $OPTIONS{t}) {
374 print HTML "<li><a href=Tagwise_Dispersion_vs_Abundance.pdf>Tagwise_Dispersion_vs_Abundance.pdf</a></li>\n";
375 }
376 print HTML "<li><a href=Smear_Plots.pdf>Smear_Plots.pdf</a></li>\n";
377 } elsif($OPTIONS{a} eq "glm" && $OPTIONS{d} eq "tag") {
378 print HTML "<li><a href=Tagwise_Dispersion_vs_Abundance.pdf>Tagwise_Dispersion_vs_Abundance.pdf</a></li>\n";
379 print HTML "<li><a href=Smear_Plots.pdf>Smear_Plots.pdf</a></li>\n";
380 } elsif($OPTIONS{a} eq "glm" && ($OPTIONS{d} eq "trend" || $OPTIONS{d} eq "common")) {
381 print HTML "<li><a href=Smear_Plots.pdf>Smear_Plots.pdf</a></li>\n";
382 } elsif($OPTIONS{a} eq "limma") {
383 print HTML "<li><a href=LIMMA_MDS_plot.pdf>LIMMA_MDS_plot.pdf</a></li>\n";
384 print HTML "<li><a href=LIMMA_voom.pdf>LIMMA_voom.pdf</a></li>\n";
385 }
386 print HTML "<li><a href=r_script.R>r_script.R</a></li>\n";
387 print HTML "<li><a href=r_script.out>r_script.out</a></li>\n";
388 print HTML "<li><a href=r_script.err>r_script.err</a></li>\n";
389 print HTML "</ul></p>\n";
390 close(HTML);
391