comparison edgeR_repenrich.R @ 1:54a3f3a195d6 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/repenrich commit 114b47cc624e39b4f485c8623458fc98494c564d
author drosofff
date Mon, 29 May 2017 13:11:57 -0400
parents
children 1805b262c12d
comparison
equal deleted inserted replaced
0:1435d142041b 1:54a3f3a195d6
1 #!/usr/bin/env Rscript
2
3 # A command-line interface to edgeR for use with Galaxy edger-repenrich
4 # written by Christophe Antoniewski drosofff@gmail.com 2017.05.30
5
6
7 # setup R error handling to go to stderr
8 options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
9
10 # To not crash galaxy with an UTF8 error with not-US LC settings.
11 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
12
13 library("getopt")
14 library("tools")
15 options(stringAsFactors = FALSE, useFancyQuotes = FALSE)
16 args <- commandArgs(trailingOnly = TRUE)
17
18 # get options, using the spec as defined by the enclosed list.
19 # we read the options from the default: commandArgs(TRUE).
20 spec <- matrix(c(
21 "quiet", "q", 0, "logical",
22 "help", "h", 0, "logical",
23 "outfile", "o", 1, "character",
24 "countsfile", "n", 1, "character",
25 "factorName", "N", 1, "character",
26 "levelNameA", "A", 1, "character",
27 "levelNameB", "B", 1, "character",
28 "levelAfiles", "a", 1, "character",
29 "levelBfiles", "b", 1, "character",
30 "alignmentA", "i", 1, "character",
31 "alignmentB", "j", 1, "character",
32 "plots" , "p", 1, "character"),
33 byrow=TRUE, ncol=4)
34 opt <- getopt(spec)
35
36 # if help was asked for print a friendly message
37 # and exit with a non-zero error code
38 if (!is.null(opt$help)) {
39 cat(getopt(spec, usage=TRUE))
40 q(status=1)
41 }
42
43 # enforce the following required arguments
44 if (is.null(opt$outfile)) {
45 cat("'outfile' is required\n")
46 q(status=1)
47 }
48 if (is.null(opt$levelAfiles) | is.null(opt$levelBfiles)) {
49 cat("input count files are required for both levels\n")
50 q(status=1)
51 }
52 if (is.null(opt$alignmentA) | is.null(opt$alignmentB)) {
53 cat("total aligned read files are required for both levels\n")
54 q(status=1)
55 }
56
57 verbose <- if (is.null(opt$quiet)) {
58 TRUE
59 } else {
60 FALSE
61 }
62
63 suppressPackageStartupMessages({
64 library("edgeR")
65 library("limma")
66 })
67
68 # build levels A and B file lists
69
70 library("rjson")
71 filesA <- fromJSON(opt$levelAfiles, method = "C", unexpected.escape = "error")
72 filesB <- fromJSON(opt$levelBfiles, method = "C", unexpected.escape = "error")
73 listA <- list()
74 indice = 0
75 listA[["level"]] <- opt$levelNameA
76 for (file in filesA) {
77 indice = indice +1
78 listA[[paste0(opt$levelNameA,"_",indice)]] <- read.delim(file, header=FALSE)
79 }
80 listB <- list()
81 indice = 0
82 listB[["level"]] <- opt$levelNameB
83 for (file in filesB) {
84 indice = indice +1
85 listB[[paste0(opt$levelNameB,"_",indice)]] <- read.delim(file, header=FALSE)
86 }
87
88 # build a counts table
89 counts <- data.frame(row.names=listA[[2]][,1])
90 for (element in names(listA[-1])) {
91 counts<-cbind(counts, listA[[element]][,4])
92 }
93 for (element in names(listB[-1])) {
94 counts<-cbind(counts, listB[[element]][,4])
95 }
96 colnames(counts)=c(names(listA[-1]), names(listB[-1]))
97
98 # build aligned counts vector
99
100 filesi <- fromJSON(opt$alignmentA, method = "C", unexpected.escape = "error")
101 filesj <- fromJSON(opt$alignmentB, method = "C", unexpected.escape = "error")
102 sizes <- c()
103 for (file in filesi) {
104 sizes <- c(sizes, read.delim(file, header=FALSE)[1,1])
105 }
106 for (file in filesj) {
107 sizes <- c(sizes, read.delim(file, header=FALSE)[1,1])
108 }
109
110 # build a meta data object
111
112 meta <- data.frame(
113 row.names=colnames(counts),
114 condition=c(rep(opt$levelNameA,length(filesA)), rep(opt$levelNameB,length(filesB)) ),
115 libsize=sizes
116 )
117
118
119 # Define the library size and conditions for the GLM
120 libsize <- meta$libsize
121 condition <- factor(meta$condition)
122 design <- model.matrix(~0+condition)
123 colnames(design) <- levels(meta$condition)
124
125
126 # Build a DGE object for the GLM
127 y <- DGEList(counts=counts, lib.size=libsize)
128
129 # Normalize the data
130 y <- calcNormFactors(y)
131 y$samples
132 # plotMDS(y) latter
133
134 # Estimate the variance
135 y <- estimateGLMCommonDisp(y, design)
136 y <- estimateGLMTrendedDisp(y, design)
137 y <- estimateGLMTagwiseDisp(y, design)
138 # plotBCV(y) latter
139
140 # Builds and outputs an object to contain the normalized read abundance in counts per million of reads
141 cpm <- cpm(y, log=FALSE, lib.size=libsize)
142 cpm <- as.data.frame(cpm)
143 colnames(cpm) <- colnames(counts)
144 if (!is.null(opt$countsfile)) {
145 normalizedAbundance <- data.frame(Tag=rownames(cpm))
146 normalizedAbundance <- cbind(normalizedAbundance, cpm)
147 write.table(normalizedAbundance, file=opt$countsfile, sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE)
148 }
149
150 # test
151 print(counts)
152 print(cpm)
153
154 # Conduct fitting of the GLM
155 yfit <- glmFit(y, design)
156
157 # Initialize result matrices to contain the results of the GLM
158 results <- matrix(nrow=dim(counts)[1],ncol=0)
159 logfc <- matrix(nrow=dim(counts)[1],ncol=0)
160
161 # Make the comparisons for the GLM
162 my.contrasts <- makeContrasts(
163 paste0(opt$levelNameB,"_",opt$levelNameA," = ", opt$levelNameB, " - ", opt$levelNameA),
164 levels = design
165 )
166
167 # Define the contrasts used in the comparisons
168 allcontrasts = paste0(opt$levelNameB," vs ",opt$levelNameA)
169
170 # Conduct a for loop that will do the fitting of the GLM for each comparison
171 # Put the results into the results objects
172 lrt <- glmLRT(yfit, contrast=my.contrasts[,1])
173 plotSmear(lrt, de.tags=rownames(y))
174 title(allcontrasts)
175 res <- topTags(lrt,n=dim(c)[1],sort.by="none")$table
176 results <- cbind(results,res[,c(1,5)])
177 logfc <- cbind(logfc,res[c(1)])
178
179 # Add the repeat types back into the results.
180 # We should still have the same order as the input data
181 results$class <- listA[[2]][,2]
182 results$type <- listA[[2]][,3]
183
184 # Sort the results table by the FDR
185 results <- results[with(results, order(FDR)), ]
186
187 # Save the results
188 write.table(results, opt$outfile, quote=FALSE, sep="\t", col.names=FALSE)
189
190 # Plot Fold Changes for repeat classes and types
191
192 # open the device and plots
193 if (!is.null(opt$plots)) {
194 if (verbose) cat("creating plots\n")
195 pdf(opt$plots)
196 plotMDS(y, main="Multidimensional Scaling Plot Of Distances Between Samples")
197 plotBCV(y, xlab="Gene abundance (Average log CPM)", main="Biological Coefficient of Variation Plot")
198 logFC <- results[, "logFC"]
199 # Plot the repeat classes
200 classes <- with(results, reorder(class, -logFC, median))
201 par(mar=c(6,10,4,1))
202 boxplot(logFC ~ classes, data=results, outline=FALSE, horizontal=TRUE,
203 las=2, xlab="log(Fold Change)", main=paste0(allcontrasts, ", by Class"))
204 abline(v=0)
205 # Plot the repeat types
206 types <- with(results, reorder(type, -logFC, median))
207 boxplot(logFC ~ types, data=results, outline=FALSE, horizontal=TRUE,
208 las=2, xlab="log(Fold Change)", main=paste0(allcontrasts, ", by Type"))
209 abline(v=0)
210 }
211
212 # close the plot device
213 if (!is.null(opt$plots)) {
214 cat("closing plot device\n")
215 dev.off()
216 }
217
218 cat("Session information:\n\n")
219
220 sessionInfo()
221