Mercurial > repos > rouan > casoarnv1
comparison edgeR.pl @ 3:0f51cd8ddfb0 draft
Uploaded
author | rouan |
---|---|
date | Thu, 26 Dec 2013 05:35:42 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:22c941d89a89 | 3:0f51cd8ddfb0 |
---|---|
1 #/bin/perl | |
2 | |
3 use strict; | |
4 use warnings; | |
5 use Getopt::Std; | |
6 use File::Basename; | |
7 use File::Path qw(make_path remove_tree); | |
8 $| = 1; | |
9 | |
10 # Grab and set all options | |
11 my %OPTIONS = (a => "glm", d => "tag", f => "BH", r => 5, u => "movingave"); | |
12 | |
13 getopts('a:d:e:f:h:lmn:o:r:tu:', \%OPTIONS); | |
14 | |
15 die qq( | |
16 Usage: edgeR.pl [OPTIONS] factor::factor1::levels [factor::factor2::levels ...] cp::cont_pred1::values [cp::cont_pred2::values ...] cnt::contrast1 [cnt::contrast2] matrix | |
17 | |
18 OPTIONS: -a STR Type Of Analysis [glm, pw, limma] (default: $OPTIONS{a}) | |
19 -d STR The dispersion estimate to use for GLM analysis [tag, trend, common] (default: $OPTIONS{d}) | |
20 -e STR Path to place additional output files | |
21 -f STR False discovery rate adjustment method [BH, holm, hochberg, hommel, BY, none] (default: $OPTIONS{f}) | |
22 -h STR Name of html file for additional files | |
23 -l Output the normalised digital gene expression matrix in log2 format (only applicable when using limma and -n is also specified) | |
24 -m Perform all pairwise comparisons | |
25 -n STR File name to output the normalised digital gene expression matrix (only applicable when usinf glm or limma model) | |
26 -o STR File name to output csv file with results | |
27 -r INT Common Dispersion Rowsum Filter, ony applicable when 1 factor analysis selected (default: $OPTIONS{r}) | |
28 -t Estimate Tagwise Disp when performing 1 factor analysis | |
29 -u STR Method for allowing the prior distribution for the dispersion to be abundance- dependent ["movingave", "tricube", "none"] (default: $OPTIONS{u}) | |
30 | |
31 ) if(!@ARGV); | |
32 | |
33 my $matrix = pop @ARGV; | |
34 | |
35 make_path($OPTIONS{e}); | |
36 open(Rcmd,">$OPTIONS{e}/r_script.R") or die "Cannot open $OPTIONS{e}/r_script.R\n\n"; | |
37 print Rcmd " | |
38 zz <- file(\"$OPTIONS{e}/r_script.err\", open=\"wt\") | |
39 sink(zz) | |
40 sink(zz, type=\"message\") | |
41 | |
42 library(edgeR) | |
43 library(limma) | |
44 | |
45 # read in matrix and groups | |
46 toc <- read.table(\"$matrix\", sep=\"\\t\", comment=\"\", as.is=T) | |
47 groups <- sapply(toc[1, -1], strsplit, \":\") | |
48 for(i in 1:length(groups)) { g <- make.names(groups[[i]][2]); names(groups)[i] <- g; groups[[i]] <- groups[[i]][-2] } | |
49 colnames(toc) <- make.names(toc[2,]) | |
50 toc[,1] <- gsub(\",\", \".\", toc[,1]) | |
51 tagnames <- toc[-(1:2), 1] | |
52 rownames(toc) <- toc[,1] | |
53 toc <- toc[-(1:2), -1] | |
54 for(i in colnames(toc)) toc[, i] <- as.numeric(toc[,i]) | |
55 norm_factors <- calcNormFactors(as.matrix(toc)) | |
56 | |
57 pw_tests <- list() | |
58 uniq_groups <- unique(names(groups)) | |
59 for(i in 1:(length(uniq_groups)-1)) for(j in (i+1):length(uniq_groups)) pw_tests[[length(pw_tests)+1]] <- c(uniq_groups[i], uniq_groups[j]) | |
60 DGE <- DGEList(toc, lib.size=norm_factors*colSums(toc), group=names(groups)) | |
61 pdf(\"$OPTIONS{e}/MA_plots_normalisation.pdf\", width=14) | |
62 for(i in 1:length(pw_tests)) { | |
63 j <- c(which(names(groups) == pw_tests[[i]][1])[1], which(names(groups) == pw_tests[[i]][2])[1]) | |
64 par(mfrow = c(1, 2)) | |
65 maPlot(toc[, j[1]], toc[, j[2]], normalize = TRUE, pch = 19, cex = 0.2, ylim = c(-10, 10), main=paste(\"MA Plot\", colnames(toc)[j[1]], \"vs\", colnames(toc)[j[2]])) | |
66 grid(col = \"blue\") | |
67 abline(h = log2(norm_factors[j[2]]), col = \"red\", lwd = 4) | |
68 maPlot(DGE\$counts[, j[1]]/DGE\$samples\$lib.size[j[1]], DGE\$counts[, j[2]]/DGE\$samples\$lib.size[j[2]], normalize = FALSE, pch = 19, cex = 0.2, ylim = c(-8, 8), main=paste(\"MA Plot\", colnames(toc)[j[1]], \"vs\", colnames(toc)[j[2]], \"Normalised\")) | |
69 grid(col = \"blue\") | |
70 } | |
71 dev.off() | |
72 pdf(file=\"$OPTIONS{e}/MDSplot.pdf\") | |
73 plotMDS(DGE, main=\"MDS Plot\", col=as.numeric(factor(names(groups)))+1, xlim=c(-3,3)) | |
74 dev.off() | |
75 tested <- list() | |
76 "; | |
77 | |
78 my $all_cont; | |
79 my @add_cont; | |
80 my @fact; | |
81 my @fact_names; | |
82 my @cp; | |
83 my @cp_names; | |
84 if(@ARGV) { | |
85 foreach my $input (@ARGV) { | |
86 my @tmp = split "::", $input; | |
87 if($tmp[0] eq "factor") { | |
88 $tmp[1] =~ s/[ \?\(\)\[\]\/\\=+<>:;\"\',\*\^\|\&-]/./g; | |
89 push @fact_names, $tmp[1]; | |
90 $tmp[2] =~ s/:/\", \"/g; | |
91 $tmp[2] = "\"".$tmp[2]."\""; | |
92 push @fact, $tmp[2]; | |
93 } elsif($tmp[0] eq "cp") { | |
94 $tmp[1] =~ s/[ \?\(\)\[\]\/\\=+<>:;\"\',\*\^\|\&-]/./g; | |
95 push @cp_names, $tmp[1]; | |
96 $tmp[2] =~ s/:/, /g; | |
97 push @cp, $tmp[2]; | |
98 } elsif($tmp[0] eq "cnt") { | |
99 push @add_cont, $tmp[1]; | |
100 } else { | |
101 die("Unknown Input: $input\n"); | |
102 } | |
103 } | |
104 } | |
105 | |
106 if($OPTIONS{a} eq "pw") { | |
107 print Rcmd " | |
108 disp <- estimateCommonDisp(DGE, rowsum.filter=$OPTIONS{r}) | |
109 "; | |
110 if(defined $OPTIONS{t}) { | |
111 print Rcmd " | |
112 disp <- estimateTagwiseDisp(disp, trend=\"$OPTIONS{u}\") | |
113 pdf(file=\"$OPTIONS{e}/Tagwise_Dispersion_vs_Abundance.pdf\") | |
114 plotBCV(disp, cex=0.4) | |
115 abline(h=disp\$common.dispersion, col=\"firebrick\", lwd=3) | |
116 dev.off() | |
117 "; | |
118 } | |
119 print Rcmd " | |
120 for(i in 1:length(pw_tests)) { | |
121 tested[[i]] <- exactTest(disp, pair=pw_tests[[i]]) | |
122 names(tested)[i] <- paste(pw_tests[[i]][2], \"-\", pw_tests[[i]][1], sep=\"\") | |
123 } | |
124 pdf(file=\"$OPTIONS{e}/Smear_Plots.pdf\") | |
125 for(i in 1:length(pw_tests)) { | |
126 dt <- decideTestsDGE(tested[[i]], p.value=0.05, adjust.method=\"$OPTIONS{f}\") | |
127 if(sum(dt) > 0) { | |
128 de_tags <- rownames(disp)[which(dt != 0)] | |
129 ttl <- \"Diff. Exp. Genes With adj. Pvalue < 0.05\" | |
130 } else { | |
131 de_tags <- rownames(topTags(tested[[i]], n=100)\$table) | |
132 ttl <- \"Top 100 tags\" | |
133 } | |
134 | |
135 if(length(dt) < 5000) { | |
136 pointcex = 0.5 | |
137 } else { | |
138 pointcex = 0.2 | |
139 } | |
140 plotSmear(disp, pair=pw_tests[[i]], de.tags = de_tags, main = paste(\"Smear Plot\", names(tested)[i]), cex=0.5) | |
141 abline(h = c(-1, 1), col = \"blue\") | |
142 legend(\"topright\", c(\"2 Fold Change\", ttl) , lty=c(1, NA), pch=c(NA, 19), pt.cex=0.5, col=c(\"blue\", \"red\"), bty=\"n\") | |
143 } | |
144 dev.off() | |
145 "; | |
146 } elsif($OPTIONS{a} eq "glm") { | |
147 for(my $fct = 0; $fct <= $#fact_names; $fct++) { | |
148 print Rcmd " | |
149 $fact_names[$fct] <- c($fact[$fct]) | |
150 "; | |
151 } | |
152 for(my $fct = 0; $fct <= $#cp_names; $fct++) { | |
153 print Rcmd " | |
154 $cp_names[$fct] <- c($cp[$fct]) | |
155 "; | |
156 } | |
157 my $all_fact = ""; | |
158 if(@fact_names) { | |
159 foreach (@fact_names) { | |
160 $all_fact .= " + factor($_)"; | |
161 } | |
162 } | |
163 my $all_cp = ""; | |
164 if(@cp_names) { | |
165 $all_cp = " + ".join(" + ", @cp_names); | |
166 } | |
167 print Rcmd " | |
168 group_fact <- factor(names(groups)) | |
169 design <- model.matrix(~ -1 + group_fact${all_fact}${all_cp}) | |
170 colnames(design) <- sub(\"group_fact\", \"\", colnames(design)) | |
171 "; | |
172 foreach my $fct (@fact_names) { | |
173 print Rcmd " | |
174 colnames(design) <- make.names(sub(\"factor.$fct.\", \"\", colnames(design))) | |
175 "; | |
176 } | |
177 print Rcmd " | |
178 disp <- estimateGLMCommonDisp(DGE, design) | |
179 "; | |
180 if($OPTIONS{d} eq "tag" || $OPTIONS{d} eq "trend") { | |
181 print Rcmd " | |
182 disp <- estimateGLMTrendedDisp(disp, design) | |
183 "; | |
184 } | |
185 if($OPTIONS{d} eq "tag") { | |
186 print Rcmd " | |
187 disp <- estimateGLMTagwiseDisp(disp, design) | |
188 fit <- glmFit(disp, design) | |
189 pdf(file=\"$OPTIONS{e}/Tagwise_Dispersion_vs_Abundance.pdf\") | |
190 plotBCV(disp, cex=0.4) | |
191 dev.off() | |
192 "; | |
193 } | |
194 if(@add_cont) { | |
195 $all_cont = "\"".join("\", \"", @add_cont)."\""; | |
196 print Rcmd " | |
197 cont <- c(${all_cont}) | |
198 for(i in uniq_groups) cont <- gsub(paste(groups[[i]], \"([^0-9])\", sep=\"\"), paste(i, \"\\\\1\", sep=\"\"), cont) | |
199 for(i in uniq_groups) cont <- gsub(paste(groups[[i]], \"\$\", sep=\"\"), i, cont) | |
200 "; | |
201 } else { | |
202 print Rcmd " | |
203 cont <- NULL | |
204 "; | |
205 } | |
206 if(defined $OPTIONS{m}) { | |
207 print Rcmd " | |
208 for(i in 1:length(pw_tests)) cont <- c(cont, paste(pw_tests[[i]][2], \"-\", pw_tests[[i]][1], sep=\"\")) | |
209 "; | |
210 } | |
211 if(!defined $OPTIONS{m} && !@add_cont){ | |
212 die("No Contrasts have been specified, you must at least either select multiple pairwise comparisons or specify a custom contrast\n"); | |
213 } | |
214 print Rcmd " | |
215 fit <- glmFit(disp, design) | |
216 cont <- makeContrasts(contrasts=cont, levels=design) | |
217 for(i in colnames(cont)) tested[[i]] <- glmLRT(fit, contrast=cont[,i]) | |
218 pdf(file=\"$OPTIONS{e}/Smear_Plots.pdf\") | |
219 for(i in colnames(cont)) { | |
220 dt <- decideTestsDGE(tested[[i]], p.value=0.05, adjust.method=\"$OPTIONS{f}\") | |
221 if(sum(dt) > 0) { | |
222 de_tags <- rownames(disp)[which(dt != 0)] | |
223 ttl <- \"Diff. Exp. Genes With adj. Pvalue < 0.05\" | |
224 } else { | |
225 de_tags <- rownames(topTags(tested[[i]], n=100)\$table) | |
226 ttl <- \"Top 100 tags\" | |
227 } | |
228 | |
229 if(length(dt) < 5000) { | |
230 pointcex = 0.5 | |
231 } else { | |
232 pointcex = 0.2 | |
233 } | |
234 plotSmear(disp, de.tags = de_tags, main = paste(\"Smear Plot\", i), cex=pointcex) | |
235 abline(h = c(-1, 1), col = \"blue\") | |
236 legend(\"topright\", c(\"2 Fold Change\", ttl) , lty=c(1, NA), pch=c(NA, 19), pt.cex=0.5, col=c(\"blue\", \"red\"), bty=\"n\") | |
237 } | |
238 dev.off() | |
239 "; | |
240 if(defined $OPTIONS{n}) { | |
241 print Rcmd " | |
242 tab <- data.frame(ID=rownames(fit\$fitted.values), fit\$fitted.values, stringsAsFactors=F) | |
243 write.table(tab, \"$OPTIONS{n}\", quote=F, sep=\"\\t\", row.names=F) | |
244 "; | |
245 } | |
246 } elsif($OPTIONS{a} eq "limma") { | |
247 for(my $fct = 0; $fct <= $#fact_names; $fct++) { | |
248 print Rcmd " | |
249 $fact_names[$fct] <- c($fact[$fct]) | |
250 "; | |
251 } | |
252 for(my $fct = 0; $fct <= $#cp_names; $fct++) { | |
253 print Rcmd " | |
254 $cp_names[$fct] <- c($cp[$fct]) | |
255 "; | |
256 } | |
257 my $all_fact = ""; | |
258 if(@fact_names) { | |
259 foreach (@fact_names) { | |
260 $all_fact .= " + factor($_)"; | |
261 } | |
262 } | |
263 my $all_cp = ""; | |
264 if(@cp_names) { | |
265 $all_cp = " + ".join(" + ", @cp_names); | |
266 } | |
267 print Rcmd " | |
268 group_fact <- factor(names(groups)) | |
269 design <- model.matrix(~ -1 + group_fact${all_fact}${all_cp}) | |
270 colnames(design) <- sub(\"group_fact\", \"\", colnames(design)) | |
271 "; | |
272 foreach my $fct (@fact_names) { | |
273 print Rcmd " | |
274 colnames(design) <- make.names(sub(\"factor.$fct.\", \"\", colnames(design))) | |
275 "; | |
276 } | |
277 print Rcmd " | |
278 isexpr <- rowSums(cpm(toc)>1) >= 2 | |
279 toc <- toc[isexpr, ] | |
280 pdf(file=\"$OPTIONS{e}/LIMMA_voom.pdf\") | |
281 y <- voom(toc, design, plot=TRUE, lib.size=colSums(toc)*norm_factors) | |
282 dev.off() | |
283 | |
284 pdf(file=\"$OPTIONS{e}/LIMMA_MDS_plot.pdf\") | |
285 plotMDS(y, labels=colnames(toc), col=as.numeric(factor(names(groups)))+1, gene.selection=\"common\") | |
286 dev.off() | |
287 fit <- lmFit(y, design) | |
288 "; | |
289 if(defined $OPTIONS{n}) { | |
290 if(defined $OPTIONS{l}) { | |
291 print Rcmd " | |
292 tab <- data.frame(ID=rownames(y\$E), y\$E, stringsAsFactors=F) | |
293 "; | |
294 } else { | |
295 print Rcmd " | |
296 tab <- data.frame(ID=rownames(y\$E), 2^y\$E, stringsAsFactors=F) | |
297 "; | |
298 } | |
299 print Rcmd " | |
300 write.table(tab, \"$OPTIONS{n}\", quote=F, sep=\"\\t\", row.names=F) | |
301 "; | |
302 } | |
303 if(@add_cont) { | |
304 $all_cont = "\"".join("\", \"", @add_cont)."\""; | |
305 print Rcmd " | |
306 cont <- c(${all_cont}) | |
307 for(i in uniq_groups) cont <- gsub(paste(groups[[i]], \"([^0-9])\", sep=\"\"), paste(i, \"\\\\1\", sep=\"\"), cont) | |
308 for(i in uniq_groups) cont <- gsub(paste(groups[[i]], \"\$\", sep=\"\"), i, cont) | |
309 "; | |
310 } else { | |
311 print Rcmd " | |
312 cont <- NULL | |
313 "; | |
314 } | |
315 if(defined $OPTIONS{m}) { | |
316 print Rcmd " | |
317 for(i in 1:length(pw_tests)) cont <- c(cont, paste(pw_tests[[i]][2], \"-\", pw_tests[[i]][1], sep=\"\")) | |
318 "; | |
319 } | |
320 if(!defined $OPTIONS{m} && !@add_cont){ | |
321 die("No Contrasts have been specified, you must at least either select multiple pairwise comparisons or specify a custom contrast\n"); | |
322 } | |
323 print Rcmd " | |
324 cont <- makeContrasts(contrasts=cont, levels=design) | |
325 fit2 <- contrasts.fit(fit, cont) | |
326 fit2 <- eBayes(fit2) | |
327 "; | |
328 } else { | |
329 die("Anaysis type $OPTIONS{a} not found\n"); | |
330 | |
331 } | |
332 | |
333 if($OPTIONS{a} ne "limma") { | |
334 print Rcmd " | |
335 options(digits = 6) | |
336 tab <- NULL | |
337 for(i in names(tested)) { | |
338 tab_tmp <- topTags(tested[[i]], n=Inf, adjust.method=\"$OPTIONS{f}\")[[1]] | |
339 colnames(tab_tmp) <- paste(i, colnames(tab_tmp), sep=\":\") | |
340 tab_tmp <- tab_tmp[tagnames,] | |
341 if(is.null(tab)) { | |
342 tab <- tab_tmp | |
343 } else tab <- cbind(tab, tab_tmp) | |
344 } | |
345 tab <- cbind(Feature=rownames(tab), tab) | |
346 "; | |
347 } else { | |
348 print Rcmd " | |
349 tab <- NULL | |
350 options(digits = 6) | |
351 for(i in colnames(fit2)) { | |
352 tab_tmp <- topTable(fit2, coef=i, n=Inf, sort.by=\"none\", adjust.method=\"$OPTIONS{f}\") | |
353 colnames(tab_tmp)[-1] <- paste(i, colnames(tab_tmp)[-1], sep=\":\") | |
354 if(is.null(tab)) { | |
355 tab <- tab_tmp | |
356 } else tab <- cbind(tab, tab_tmp[,-1]) | |
357 } | |
358 "; | |
359 } | |
360 print Rcmd " | |
361 write.table(tab, \"$OPTIONS{o}\", quote=F, sep=\"\\t\", row.names=F) | |
362 sink(type=\"message\") | |
363 sink() | |
364 "; | |
365 close(Rcmd); | |
366 system("R --no-restore --no-save --no-readline < $OPTIONS{e}/r_script.R > $OPTIONS{e}/r_script.out"); | |
367 | |
368 open(HTML, ">$OPTIONS{h}"); | |
369 print HTML "<html><head><title>EdgeR: Empirical analysis of digital gene expression data</title></head><body><h3>EdgeR Additional Files:</h3><p><ul>\n"; | |
370 print HTML "<li><a href=MA_plots_normalisation.pdf>MA_plots_normalisation.pdf</a></li>\n"; | |
371 print HTML "<li><a href=MDSplot.pdf>MDSplot.pdf</a></li>\n"; | |
372 if($OPTIONS{a} eq "pw") { | |
373 if(defined $OPTIONS{t}) { | |
374 print HTML "<li><a href=Tagwise_Dispersion_vs_Abundance.pdf>Tagwise_Dispersion_vs_Abundance.pdf</a></li>\n"; | |
375 } | |
376 print HTML "<li><a href=Smear_Plots.pdf>Smear_Plots.pdf</a></li>\n"; | |
377 } elsif($OPTIONS{a} eq "glm" && $OPTIONS{d} eq "tag") { | |
378 print HTML "<li><a href=Tagwise_Dispersion_vs_Abundance.pdf>Tagwise_Dispersion_vs_Abundance.pdf</a></li>\n"; | |
379 print HTML "<li><a href=Smear_Plots.pdf>Smear_Plots.pdf</a></li>\n"; | |
380 } elsif($OPTIONS{a} eq "glm" && ($OPTIONS{d} eq "trend" || $OPTIONS{d} eq "common")) { | |
381 print HTML "<li><a href=Smear_Plots.pdf>Smear_Plots.pdf</a></li>\n"; | |
382 } elsif($OPTIONS{a} eq "limma") { | |
383 print HTML "<li><a href=LIMMA_MDS_plot.pdf>LIMMA_MDS_plot.pdf</a></li>\n"; | |
384 print HTML "<li><a href=LIMMA_voom.pdf>LIMMA_voom.pdf</a></li>\n"; | |
385 } | |
386 print HTML "<li><a href=r_script.R>r_script.R</a></li>\n"; | |
387 print HTML "<li><a href=r_script.out>r_script.out</a></li>\n"; | |
388 print HTML "<li><a href=r_script.err>r_script.err</a></li>\n"; | |
389 print HTML "</ul></p>\n"; | |
390 close(HTML); | |
391 |