Mercurial > repos > artbio > repenrich
comparison edgeR_repenrich.R @ 11:6bba3e33c2e7 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/repenrich commit 11df9f0b68d35d3a9424a17e4cefee6cfb9d4c19
author | artbio |
---|---|
date | Sat, 09 Mar 2024 22:32:46 +0000 |
parents | 5dd4791c7b70 |
children | 530626b0757c |
comparison
equal
deleted
inserted
replaced
10:6f4143893463 | 11:6bba3e33c2e7 |
---|---|
1 #!/usr/bin/env Rscript | |
2 | |
3 # A command-line interface to edgeR for use with Galaxy edger-repenrich | |
4 # written by Christophe Antoniewski drosofff@gmail.com 2017.05.30 | |
5 | |
6 | |
7 # setup R error handling to go to stderr | 1 # setup R error handling to go to stderr |
8 options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) | 2 options(show.error.messages = FALSE, error = function() { |
3 cat(geterrmessage(), file = stderr()) | |
4 q("no", 1, FALSE) | |
5 }) | |
9 | 6 |
10 # To not crash galaxy with an UTF8 error with not-US LC settings. | 7 # To not crash galaxy with an UTF8 error with not-US LC settings. |
11 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") | 8 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") |
12 | 9 |
10 # load libraries | |
13 library("getopt") | 11 library("getopt") |
14 library("tools") | 12 library("tools") |
13 library("rjson") | |
14 suppressPackageStartupMessages({ | |
15 library("edgeR") | |
16 library("limma") | |
17 }) | |
18 | |
15 options(stringAsFactors = FALSE, useFancyQuotes = FALSE) | 19 options(stringAsFactors = FALSE, useFancyQuotes = FALSE) |
16 args <- commandArgs(trailingOnly = TRUE) | |
17 | 20 |
18 # get options, using the spec as defined by the enclosed list. | 21 # get options, using the spec as defined by the enclosed list. |
19 # we read the options from the default: commandArgs(TRUE). | 22 # we read the options from the default: commandArgs(TRUE). |
20 spec <- matrix(c( | 23 spec <- matrix( |
21 "quiet", "q", 0, "logical", | 24 c( |
22 "help", "h", 0, "logical", | 25 "quiet", "q", 0, "logical", |
23 "outfile", "o", 1, "character", | 26 "outfile", "o", 1, "character", |
24 "countsfile", "n", 1, "character", | 27 "countsfile", "n", 1, "character", |
25 "factorName", "N", 1, "character", | 28 "factorName", "N", 1, "character", |
26 "levelNameA", "A", 1, "character", | 29 "levelNameA", "A", 1, "character", |
27 "levelNameB", "B", 1, "character", | 30 "levelNameB", "B", 1, "character", |
28 "levelAfiles", "a", 1, "character", | 31 "levelAfiles", "a", 1, "character", |
29 "levelBfiles", "b", 1, "character", | 32 "levelBfiles", "b", 1, "character", |
30 "alignmentA", "i", 1, "character", | 33 "alignmentA", "i", 1, "character", |
31 "alignmentB", "j", 1, "character", | 34 "alignmentB", "j", 1, "character", |
32 "plots" , "p", 1, "character"), | 35 "plots", "p", 1, "character" |
33 byrow=TRUE, ncol=4) | 36 ), |
37 byrow = TRUE, ncol = 4 | |
38 ) | |
34 opt <- getopt(spec) | 39 opt <- getopt(spec) |
35 | 40 |
36 # if help was asked for print a friendly message | |
37 # and exit with a non-zero error code | |
38 if (!is.null(opt$help)) { | |
39 cat(getopt(spec, usage=TRUE)) | |
40 q(status=1) | |
41 } | |
42 | |
43 # enforce the following required arguments | |
44 if (is.null(opt$outfile)) { | |
45 cat("'outfile' is required\n") | |
46 q(status=1) | |
47 } | |
48 if (is.null(opt$levelAfiles) | is.null(opt$levelBfiles)) { | |
49 cat("input count files are required for both levels\n") | |
50 q(status=1) | |
51 } | |
52 if (is.null(opt$alignmentA) | is.null(opt$alignmentB)) { | |
53 cat("total aligned read files are required for both levels\n") | |
54 q(status=1) | |
55 } | |
56 | |
57 verbose <- if (is.null(opt$quiet)) { | |
58 TRUE | |
59 } else { | |
60 FALSE | |
61 } | |
62 | |
63 suppressPackageStartupMessages({ | |
64 library("edgeR") | |
65 library("limma") | |
66 }) | |
67 | |
68 # build levels A and B file lists | 41 # build levels A and B file lists |
69 | |
70 library("rjson") | |
71 filesA <- fromJSON(opt$levelAfiles, method = "C", unexpected.escape = "error") | 42 filesA <- fromJSON(opt$levelAfiles, method = "C", unexpected.escape = "error") |
72 filesB <- fromJSON(opt$levelBfiles, method = "C", unexpected.escape = "error") | 43 filesB <- fromJSON(opt$levelBfiles, method = "C", unexpected.escape = "error") |
73 listA <- list() | 44 listA <- list() |
74 indice = 0 | 45 indice <- 0 |
75 listA[["level"]] <- opt$levelNameA | 46 listA[["level"]] <- opt$levelNameA |
76 for (file in filesA) { | 47 for (file in filesA) { |
77 indice = indice +1 | 48 indice <- indice + 1 |
78 listA[[paste0(opt$levelNameA,"_",indice)]] <- read.delim(file, header=FALSE) | 49 listA[[paste0(opt$levelNameA, "_", indice)]] <- read.delim(file, header = FALSE) |
79 } | 50 } |
80 listB <- list() | 51 listB <- list() |
81 indice = 0 | 52 indice <- 0 |
82 listB[["level"]] <- opt$levelNameB | 53 listB[["level"]] <- opt$levelNameB |
83 for (file in filesB) { | 54 for (file in filesB) { |
84 indice = indice +1 | 55 indice <- indice + 1 |
85 listB[[paste0(opt$levelNameB,"_",indice)]] <- read.delim(file, header=FALSE) | 56 listB[[paste0(opt$levelNameB, "_", indice)]] <- read.delim(file, header = FALSE) |
86 } | 57 } |
87 | 58 |
88 # build a counts table | 59 # build a counts table |
89 counts <- data.frame(row.names=listA[[2]][,1]) | 60 counts <- data.frame(row.names = listA[[2]][, 1]) |
90 for (element in names(listA[-1])) { | 61 for (element in names(listA[-1])) { |
91 counts<-cbind(counts, listA[[element]][,4]) | 62 counts <- cbind(counts, listA[[element]][, 4]) |
92 } | 63 } |
93 for (element in names(listB[-1])) { | 64 for (element in names(listB[-1])) { |
94 counts<-cbind(counts, listB[[element]][,4]) | 65 counts <- cbind(counts, listB[[element]][, 4]) |
95 } | 66 } |
96 colnames(counts)=c(names(listA[-1]), names(listB[-1])) | 67 colnames(counts) <- c(names(listA[-1]), names(listB[-1])) |
97 | 68 |
98 # build aligned counts vector | 69 # build aligned counts vector |
99 | |
100 filesi <- fromJSON(opt$alignmentA, method = "C", unexpected.escape = "error") | 70 filesi <- fromJSON(opt$alignmentA, method = "C", unexpected.escape = "error") |
101 filesj <- fromJSON(opt$alignmentB, method = "C", unexpected.escape = "error") | 71 filesj <- fromJSON(opt$alignmentB, method = "C", unexpected.escape = "error") |
102 sizes <- c() | 72 sizes <- c() |
103 for (file in filesi) { | 73 for (file in filesi) { |
104 sizes <- c(sizes, read.delim(file, header=TRUE)[1,1]) | 74 sizes <- c(sizes, read.delim(file, header = TRUE)[1, 1]) |
105 } | 75 } |
106 for (file in filesj) { | 76 for (file in filesj) { |
107 sizes <- c(sizes, read.delim(file, header=TRUE)[1,1]) | 77 sizes <- c(sizes, read.delim(file, header = TRUE)[1, 1]) |
108 } | 78 } |
109 | 79 |
110 # build a meta data object | 80 # build a meta data object |
111 | |
112 meta <- data.frame( | 81 meta <- data.frame( |
113 row.names=colnames(counts), | 82 row.names = colnames(counts), |
114 condition=c(rep(opt$levelNameA,length(filesA)), rep(opt$levelNameB,length(filesB)) ), | 83 condition = c(rep(opt$levelNameA, length(filesA)), rep(opt$levelNameB, length(filesB))), |
115 libsize=sizes | 84 libsize = sizes |
116 ) | 85 ) |
117 | 86 |
118 | 87 |
119 # Define the library size and conditions for the GLM | 88 # Define the library size and conditions for the GLM |
120 libsize <- meta$libsize | 89 libsize <- meta$libsize |
121 condition <- factor(meta$condition) | 90 condition <- factor(meta$condition) |
122 design <- model.matrix(~0+condition) | 91 design <- model.matrix(~ 0 + condition) |
123 colnames(design) <- levels(meta$condition) | 92 colnames(design) <- levels(condition) |
124 | |
125 | 93 |
126 # Build a DGE object for the GLM | 94 # Build a DGE object for the GLM |
127 y <- DGEList(counts=counts, lib.size=libsize) | 95 y <- DGEList(counts = counts, lib.size = libsize) |
128 | 96 |
129 # Normalize the data | 97 # Normalize the data |
130 y <- calcNormFactors(y) | 98 y <- calcNormFactors(y) |
131 y$samples | |
132 # plotMDS(y) latter | |
133 | 99 |
134 # Estimate the variance | 100 # Estimate the variance |
135 y <- estimateGLMCommonDisp(y, design) | 101 y <- estimateGLMCommonDisp(y, design) |
136 y <- estimateGLMTrendedDisp(y, design) | 102 y <- estimateGLMTrendedDisp(y, design) |
137 y <- estimateGLMTagwiseDisp(y, design) | 103 y <- estimateGLMTagwiseDisp(y, design) |
138 # plotBCV(y) latter | |
139 | 104 |
140 # Builds and outputs an object to contain the normalized read abundance in counts per million of reads | 105 # Builds and outputs an object to contain the normalized read abundance in counts per million of reads |
141 cpm <- cpm(y, log=FALSE, lib.size=libsize) | 106 cpm <- cpm(y, log = FALSE, lib.size = libsize) |
142 cpm <- as.data.frame(cpm) | 107 cpm <- as.data.frame(cpm) |
143 colnames(cpm) <- colnames(counts) | 108 colnames(cpm) <- colnames(counts) |
144 if (!is.null(opt$countsfile)) { | 109 if (!is.null(opt$countsfile)) { |
145 normalizedAbundance <- data.frame(Tag=rownames(cpm)) | 110 normalizedAbundance <- data.frame(Tag = rownames(cpm)) |
146 normalizedAbundance <- cbind(normalizedAbundance, cpm) | 111 normalizedAbundance <- cbind(normalizedAbundance, cpm) |
147 write.table(normalizedAbundance, file=opt$countsfile, sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE) | 112 write.table(normalizedAbundance, file = opt$countsfile, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE) |
148 } | 113 } |
149 | 114 |
150 # Conduct fitting of the GLM | 115 # Conduct fitting of the GLM |
151 yfit <- glmFit(y, design) | 116 yfit <- glmFit(y, design) |
152 | 117 |
153 # Initialize result matrices to contain the results of the GLM | 118 # Initialize result matrices to contain the results of the GLM |
154 results <- matrix(nrow=dim(counts)[1],ncol=0) | 119 results <- matrix(nrow = dim(counts)[1], ncol = 0) |
155 logfc <- matrix(nrow=dim(counts)[1],ncol=0) | 120 logfc <- matrix(nrow = dim(counts)[1], ncol = 0) |
156 | 121 |
157 # Make the comparisons for the GLM | 122 # Make the comparisons for the GLM |
158 my.contrasts <- makeContrasts( | 123 my.contrasts <- makeContrasts( |
159 paste0(opt$levelNameA,"_",opt$levelNameB," = ", opt$levelNameA, " - ", opt$levelNameB), | 124 paste0(opt$levelNameA, "_", opt$levelNameB, " = ", opt$levelNameA, " - ", opt$levelNameB), |
160 levels = design | 125 levels = design |
161 ) | 126 ) |
162 | 127 |
163 # Define the contrasts used in the comparisons | 128 # Define the contrasts used in the comparisons |
164 allcontrasts = paste0(opt$levelNameA," vs ",opt$levelNameB) | 129 allcontrasts <- paste0(opt$levelNameA, " vs ", opt$levelNameB) |
165 | 130 |
166 # Conduct a for loop that will do the fitting of the GLM for each comparison | 131 # Conduct a for loop that will do the fitting of the GLM for each comparison |
167 # Put the results into the results objects | 132 # Put the results into the results objects |
168 lrt <- glmLRT(yfit, contrast=my.contrasts[,1]) | 133 lrt <- glmLRT(yfit, contrast = my.contrasts[, 1]) |
169 plotSmear(lrt, de.tags=rownames(y)) | 134 res <- topTags(lrt, n = dim(c)[1], sort.by = "none")$table |
170 title(allcontrasts) | 135 results <- cbind(results, res[, c(1, 5)]) |
171 res <- topTags(lrt,n=dim(c)[1],sort.by="none")$table | 136 logfc <- cbind(logfc, res[c(1)]) |
172 results <- cbind(results,res[,c(1,5)]) | |
173 logfc <- cbind(logfc,res[c(1)]) | |
174 | 137 |
175 # Add the repeat types back into the results. | 138 # Add the repeat types back into the results. |
176 # We should still have the same order as the input data | 139 # We should still have the same order as the input data |
177 results$class <- listA[[2]][,2] | 140 results$class <- listA[[2]][, 2] |
178 results$type <- listA[[2]][,3] | 141 results$type <- listA[[2]][, 3] |
179 | |
180 # Sort the results table by the FDR | 142 # Sort the results table by the FDR |
181 results <- results[with(results, order(FDR)), ] | 143 results <- results[with(results, order(FDR)), ] |
182 | |
183 # Save the results | |
184 write.table(results, opt$outfile, quote=FALSE, sep="\t", col.names=FALSE) | |
185 | 144 |
186 # Plot Fold Changes for repeat classes and types | 145 # Plot Fold Changes for repeat classes and types |
187 | 146 |
188 # open the device and plots | 147 # open the device and plots |
189 if (!is.null(opt$plots)) { | 148 if (!is.null(opt$plots)) { |
190 if (verbose) cat("creating plots\n") | |
191 pdf(opt$plots) | 149 pdf(opt$plots) |
192 plotMDS(y, main="Multidimensional Scaling Plot Of Distances Between Samples") | 150 plotMDS(y, main = "Multidimensional Scaling Plot Of Distances Between Samples") |
193 plotBCV(y, xlab="Gene abundance (Average log CPM)", main="Biological Coefficient of Variation Plot") | 151 plotBCV(y, xlab = "Gene abundance (Average log CPM)", main = "Biological Coefficient of Variation Plot") |
194 logFC <- results[, "logFC"] | 152 logFC <- results[, "logFC"] |
195 # Plot the repeat classes | 153 # Plot the repeat classes |
196 classes <- with(results, reorder(class, -logFC, median)) | 154 classes <- with(results, reorder(class, -logFC, median)) |
197 classes | 155 classes |
198 par(mar=c(6,10,4,1)) | 156 par(mar = c(6, 10, 4, 1)) |
199 boxplot(logFC ~ classes, data=results, outline=FALSE, horizontal=TRUE, | 157 boxplot(logFC ~ classes, |
200 las=2, xlab="log2(Fold Change)", main=paste0(allcontrasts, ", by Class")) | 158 data = results, outline = FALSE, horizontal = TRUE, |
201 abline(v=0) | 159 las = 2, xlab = "log2(Fold Change)", ylab = "", cex.axis = 0.7, main = paste0(allcontrasts, ", by Class") |
160 ) | |
161 abline(v = 0) | |
202 # Plot the repeat types | 162 # Plot the repeat types |
203 types <- with(results, reorder(type, -logFC, median)) | 163 types <- with(results, reorder(type, -logFC, median)) |
204 boxplot(logFC ~ types, data=results, outline=FALSE, horizontal=TRUE, | 164 boxplot(logFC ~ types, |
205 las=2, xlab="log2(Fold Change)", main=paste0(allcontrasts, ", by Type"), yaxt="n") | 165 data = results, outline = FALSE, horizontal = TRUE, |
206 axis(2, cex.axis=(1*28)/(length(levels(types))), | 166 las = 2, xlab = "log2(Fold Change)", ylab = "", cex.axis = 0.7, main = paste0(allcontrasts, ", by Type") |
207 at=seq(from=1, to=length(levels(types))), | 167 ) |
208 labels=levels(types), las=2) | 168 abline(v = 0) |
209 abline(v=0) | |
210 # volcano plot | 169 # volcano plot |
211 TEdata = cbind(rownames(results), as.data.frame(results), score=-log(results$FDR, 10)) | 170 TEdata <- cbind(rownames(results), as.data.frame(results), score = -log(results$FDR, 10)) |
212 colnames(TEdata) = c("Tag","log2FC", "FDR", "Class", "Type", "score") | 171 colnames(TEdata) <- c("Tag", "log2FC", "FDR", "Class", "Type", "score") |
213 color = ifelse(TEdata$FDR<0.05, "red", "black") | 172 color <- ifelse(TEdata$FDR < 0.05, "red", "black") |
214 s <- subset(TEdata, FDR<0.01) | 173 s <- subset(TEdata, FDR < 0.01) |
215 with(TEdata, plot(log2FC, score, pch=20, col=color, main="Volcano plot (all tag types)", ylab="-log10(FDR)")) | 174 with(TEdata, plot(log2FC, score, pch = 20, col = color, main = "Volcano plot (all tag types)", ylab = "-log10(FDR)")) |
216 text(s[,2], s[,6], labels = s[,5], pos = seq(from=1, to=3), cex=0.5) | 175 text(s[, 2], s[, 6], labels = s[, 5], pos = seq(from = 1, to = 3), cex = 0.5) |
217 } | 176 } |
218 | 177 |
219 # close the plot device | 178 # close the plot device |
220 if (!is.null(opt$plots)) { | 179 if (!is.null(opt$plots)) { |
221 cat("closing plot device\n") | 180 cat("closing plot device\n") |
222 dev.off() | 181 dev.off() |
223 } | 182 } |
224 | 183 |
184 # Save the results | |
185 results <- cbind(TE_item = rownames(results), results) | |
186 colnames(results) <- c("TE_item", "log2FC", "FDR", "Class", "Type") | |
187 results$log2FC <- format(results$log2FC, digits = 5) | |
188 results$FDR <- format(results$FDR, digits = 5) | |
189 write.table(results, opt$outfile, quote = FALSE, sep = "\t", col.names = TRUE, row.names = FALSE) | |
190 | |
225 cat("Session information:\n\n") | 191 cat("Session information:\n\n") |
226 | |
227 sessionInfo() | 192 sessionInfo() |
228 |