Mercurial > repos > proteore > proteore_topgo
comparison enrichment_v3.R @ 0:472ad7da3d92 draft
planemo upload commit 9f9e8b72a239e58b5087b2b3737262c25cc2671e-dirty
author | proteore |
---|---|
date | Mon, 04 Dec 2017 09:39:21 -0500 |
parents | |
children | 829cbdb71efa |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:472ad7da3d92 |
---|---|
1 # enrichment_v3.R | |
2 # Usage : Rscript --vanilla enrichment_v3.R --inputtype tabfile (or | |
3 # copypaste) --input file.txt --ontology "BP/CC/MF" --option option (e.g | |
4 # : classic/elim...) --threshold threshold --correction correction --textoutput | |
5 # text --barplotoutput barplot | |
6 # --dotplotoutput dotplot --column column --geneuniver human | |
7 # e.g : Rscript --vanilla enrichment_v3.R --inputtype tabfile --input file.txt | |
8 # --ontology BP --option classic --threshold 1e-15 --correction holm | |
9 # --textoutput TRUE | |
10 # --barplotoutput TRUE --dotplotoutput TRUE --column c1 --geneuniverse | |
11 # org.Hs.eg.db | |
12 # INPUT : | |
13 # - type of input. Can be ids separated by a blank space (copypast), or a text | |
14 # file (tabfile) | |
15 # - file with at least one column of ensembl ids | |
16 # - gene ontology category : Biological Process (BP), Cellular Component (CC), Molecular Function (MF) | |
17 # - test option (relative to topGO algorithms) : elim, weight01, parentchild, or no option (classic) | |
18 # - threshold for enriched GO term pvalues (e.g : 1e-15) | |
19 # - correction for multiple testing (see p.adjust options : holm, hochberg, hommel, bonferroni, BH, BY,fdr,none | |
20 # - outputs wanted in this order text, barplot, dotplot with boolean value (e.g | |
21 # : TRUE TRUE TRUE ). | |
22 # Declare the output not wanted as none | |
23 # - column containing the ensembl ids if the input file is a tabfile | |
24 # - gene universe reference for the user chosen specie | |
25 # - header : if the input is a text file, does this text file have a header | |
26 # (TRUE/FALSE) | |
27 # | |
28 # OUTPUT : | |
29 # - outputs commanded by the user named respectively result.tsv for the text | |
30 # results file, barplot.png for the barplot image file and dotplot.png for the | |
31 # dotplot image file | |
32 | |
33 | |
34 # loading topGO library | |
35 library("topGO") | |
36 | |
37 '%!in%' <- function(x,y)!('%in%'(x,y)) | |
38 | |
39 | |
40 # Parse command line arguments | |
41 | |
42 args = commandArgs(trailingOnly = TRUE) | |
43 | |
44 # create a list of the arguments from the command line, separated by a blank space | |
45 hh <- paste(unlist(args),collapse=' ') | |
46 | |
47 # delete the first element of the list which is always a blank space | |
48 listoptions <- unlist(strsplit(hh,'--'))[-1] | |
49 | |
50 # for each input, split the arguments with blank space as separator, unlist, | |
51 # and delete the first element which is the input name (e.g --inputtype) | |
52 options.args <- sapply(listoptions,function(x){ | |
53 unlist(strsplit(x, ' '))[-1] | |
54 }) | |
55 # same as the step above, except that only the names are kept | |
56 options.names <- sapply(listoptions,function(x){ | |
57 option <- unlist(strsplit(x, ' '))[1] | |
58 }) | |
59 names(options.args) <- unlist(options.names) | |
60 | |
61 | |
62 if (length(options.args) != 12) { | |
63 stop("Not enough/Too many arguments", call. = FALSE) | |
64 } | |
65 | |
66 typeinput = options.args[1] | |
67 listfile = options.args[2] | |
68 onto = as.character(options.args[3]) | |
69 option = as.character(options.args[4]) | |
70 correction = as.character(options.args[6]) | |
71 threshold = as.numeric(options.args[5]) | |
72 text = as.character(options.args[7]) | |
73 barplot = as.character(options.args[8]) | |
74 dotplot = as.character(options.args[9]) | |
75 column = as.numeric(gsub("c","",options.args[10])) | |
76 geneuniverse = as.character(options.args[11]) | |
77 header = as.character(options.args[12]) | |
78 | |
79 if (typeinput=="copypaste"){ | |
80 sample = as.data.frame(unlist(listfile)) | |
81 sample = sample[,column] | |
82 } | |
83 if (typeinput=="tabfile"){ | |
84 | |
85 if (header=="TRUE"){ | |
86 sample = read.table(listfile,header=TRUE,sep="\t",na.strings="NA",fill=TRUE) | |
87 }else{ | |
88 sample = read.table(listfile,header=FALSE,sep="\t",na.strings="NA",fill=TRUE) | |
89 } | |
90 sample = sample[,column] | |
91 | |
92 } | |
93 # Launch enrichment analysis and return result data from the analysis or the null | |
94 # object if the enrichment could not be done. | |
95 goEnrichment = function(geneuniverse,sample,onto){ | |
96 | |
97 # get all the GO terms of the corresponding ontology (BP/CC/MF) and all their | |
98 # associated ensembl ids according to the org package | |
99 xx = annFUN.org(onto,mapping=geneuniverse,ID="ensembl") | |
100 allGenes = unique(unlist(xx)) | |
101 # check if the genes given by the user can be found in the org package (gene | |
102 # universe), that is in | |
103 # allGenes | |
104 if (length(intersect(sample,allGenes))==0){ | |
105 | |
106 print("None of the input ids can be found in the org package data, enrichment analysis cannot be realized. \n The inputs ids probably have no associated GO terms.") | |
107 return(c(NULL,NULL)) | |
108 | |
109 } | |
110 | |
111 geneList = factor(as.integer(allGenes %in% sample)) | |
112 names(geneList) <- allGenes | |
113 | |
114 | |
115 #topGO enrichment | |
116 | |
117 | |
118 # Creation of a topGOdata object | |
119 # It will contain : the list of genes of interest, the GO annotations and the GO hierarchy | |
120 # Parameters : | |
121 # ontology : character string specifying the ontology of interest (BP, CC, MF) | |
122 # allGenes : named vector of type numeric or factor | |
123 # annot : tells topGO how to map genes to GO annotations. | |
124 # argument not used here : nodeSize : at which minimal number of GO annotations | |
125 # do we consider a gene | |
126 | |
127 myGOdata = new("topGOdata", description="SEA with TopGO", ontology=onto, allGenes=geneList, annot = annFUN.org, mapping=geneuniverse,ID="ensembl") | |
128 | |
129 | |
130 # Performing enrichment tests | |
131 result <- runTest(myGOdata, algorithm=option, statistic="fisher") | |
132 return(c(result,myGOdata)) | |
133 } | |
134 | |
135 # Some libraries such as GOsummaries won't be able to treat the values such as | |
136 # "< 1e-30" produced by topGO. As such it is important to delete the < char | |
137 # with the deleteInfChar function. Nevertheless the user will have access to the original results in the text output. | |
138 deleteInfChar = function(values){ | |
139 | |
140 lines = grep("<",values) | |
141 if (length(lines)!=0){ | |
142 for (line in lines){ | |
143 values[line]=gsub("<","",values[line]) | |
144 } | |
145 } | |
146 return(values) | |
147 } | |
148 | |
149 corrMultipleTesting = function(result, myGOdata,correction,threshold){ | |
150 | |
151 # adjust for multiple testing | |
152 if (correction!="none"){ | |
153 # GenTable : transforms the result object into a list. Filters can be applied | |
154 # (e.g : with the topNodes argument, to get for instance only the n first | |
155 # GO terms with the lowest pvalues), but as we want to apply a correction we | |
156 # take all the GO terms, no matter their pvalues | |
157 allRes <- GenTable(myGOdata, test = result, orderBy = "result", ranksOf = "result",topNodes=length(attributes(result)$score)) | |
158 # Some pvalues given by topGO are not numeric (e.g : "<1e-30). As such, these | |
159 # values are converted to 1e-30 to be able to correct the pvalues | |
160 pvaluestmp = deleteInfChar(allRes$test) | |
161 | |
162 # the correction is done from the modified pvalues | |
163 allRes$qvalues = p.adjust(pvaluestmp, method = as.character(correction), n = length(pvaluestmp)) | |
164 allRes = as.data.frame(allRes) | |
165 | |
166 # Rename the test column by pvalues, so that is more explicit | |
167 nb = which(names(allRes) %in% c("test")) | |
168 | |
169 names(allRes)[nb] = "pvalues" | |
170 | |
171 allRes = allRes[which(as.numeric(allRes$pvalues) <= threshold),] | |
172 if (length(allRes$pvalues)==0){ | |
173 print("Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value") | |
174 return(NULL) | |
175 } | |
176 allRes = allRes[order(allRes$qvalues),] | |
177 } | |
178 | |
179 if (correction=="none"){ | |
180 # get all the go terms under user threshold | |
181 mysummary <- summary(attributes(result)$score <= threshold) | |
182 numsignif <- as.integer(mysummary[[3]]) | |
183 # get all significant nodes | |
184 allRes <- GenTable(myGOdata, test = result, orderBy = "result", ranksOf = "result",topNodes=numsignif) | |
185 | |
186 | |
187 allRes = as.data.frame(allRes) | |
188 # Rename the test column by pvalues, so that is more explicit | |
189 nb = which(names(allRes) %in% c("test")) | |
190 names(allRes)[nb] = "pvalues" | |
191 if (numsignif==0){ | |
192 | |
193 print("Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value") | |
194 return(NULL) | |
195 } | |
196 | |
197 allRes = allRes[order(allRes$pvalues),] | |
198 } | |
199 | |
200 return(allRes) | |
201 } | |
202 | |
203 # roundValues will simplify the results by rounding down the values. For instance 1.1e-17 becomes 1e-17 | |
204 roundValues = function(values){ | |
205 for (line in 1:length(values)){ | |
206 values[line]=as.numeric(gsub(".*e","1e",as.character(values[line]))) | |
207 } | |
208 return(values) | |
209 } | |
210 | |
211 createDotPlot = function(data, onto){ | |
212 | |
213 values = deleteInfChar(data$pvalues) | |
214 values = roundValues(values) | |
215 values = as.numeric(values) | |
216 | |
217 geneRatio = data$Significant/data$Annotated | |
218 goTerms = data$Term | |
219 count = data$Significant | |
220 | |
221 labely = paste("GO terms",onto,sep=" ") | |
222 png(filename="dotplot.png",res=300, width = 3200, height = 3200, units = "px") | |
223 sp1 = ggplot(data,aes(x=geneRatio,y=goTerms, color=values,size=count)) +geom_point() + scale_colour_gradientn(colours=c("red","violet","blue")) + xlab("Gene Ratio") + ylab(labely) + labs(color="p-values\n") | |
224 | |
225 plot(sp1) | |
226 dev.off() | |
227 } | |
228 | |
229 createBarPlot = function(data, onto){ | |
230 | |
231 | |
232 values = deleteInfChar(data$pvalues) | |
233 values = roundValues(values) | |
234 | |
235 values = as.numeric(values) | |
236 goTerms = data$Term | |
237 count = data$Significant | |
238 png(filename="barplot.png",res=300, width = 3200, height = 3200, units = "px") | |
239 | |
240 labely = paste("GO terms",onto,sep=" ") | |
241 p<-ggplot(data, aes(x=goTerms, y=count,fill=values)) + ylab("Gene count") + xlab(labely) +geom_bar(stat="identity") + scale_fill_gradientn(colours=c("red","violet","blue")) + coord_flip() + labs(fill="p-values\n") | |
242 plot(p) | |
243 dev.off() | |
244 } | |
245 | |
246 | |
247 # Produce the different outputs | |
248 createOutputs = function(result, cut_result,text, barplot, dotplot, onto){ | |
249 | |
250 | |
251 if (is.null(result)){ | |
252 | |
253 if (text=="TRUE"){ | |
254 | |
255 err_msg = "None of the input ids can be found in the org package data, enrichment analysis cannot be realized. \n The inputs ids probably either have no associated GO terms or are not ENSG identifiers (e.g : ENSG00000012048)." | |
256 write.table(err_msg, file='result.csv', quote=FALSE, sep='\t', col.names = T, row.names = F) | |
257 | |
258 } | |
259 | |
260 if (barplot=="TRUE"){ | |
261 | |
262 png(filename="barplot.png") | |
263 plot.new() | |
264 #text(0,0,err_msg) | |
265 dev.off() | |
266 } | |
267 | |
268 if (dotplot=="TRUE"){ | |
269 | |
270 png(filename="dotplot.png") | |
271 plot.new() | |
272 #text(0,0,err_msg) | |
273 dev.off() | |
274 | |
275 } | |
276 return(TRUE) | |
277 } | |
278 | |
279 | |
280 if (is.null(cut_result)){ | |
281 | |
282 | |
283 if (text=="TRUE"){ | |
284 | |
285 err_msg = "Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value." | |
286 write.table(err_msg, file='result.csv', quote=FALSE, sep='\t', col.names = T, row.names = F) | |
287 | |
288 } | |
289 | |
290 if (barplot=="TRUE"){ | |
291 | |
292 png(filename="barplot.png") | |
293 plot.new() | |
294 text(0,0,err_msg) | |
295 dev.off() | |
296 } | |
297 | |
298 if (dotplot=="TRUE"){ | |
299 | |
300 png(filename="dotplot.png") | |
301 plot.new() | |
302 text(0,0,err_msg) | |
303 dev.off() | |
304 | |
305 } | |
306 return(TRUE) | |
307 | |
308 | |
309 | |
310 } | |
311 | |
312 if (text=="TRUE"){ | |
313 write.table(cut_result, file='result.csv', quote=FALSE, sep='\t', col.names = T, row.names = F) | |
314 } | |
315 | |
316 if (barplot=="TRUE"){ | |
317 | |
318 createBarPlot(cut_result, onto) | |
319 } | |
320 | |
321 if (dotplot=="TRUE"){ | |
322 | |
323 createDotPlot(cut_result, onto) | |
324 } | |
325 return(TRUE) | |
326 } | |
327 | |
328 | |
329 | |
330 # Load R library ggplot2 to plot graphs | |
331 library(ggplot2) | |
332 | |
333 # Launch enrichment analysis | |
334 allresult = goEnrichment(geneuniverse,sample,onto) | |
335 result = allresult[1][[1]] | |
336 myGOdata = allresult[2][[1]] | |
337 if (!is.null(result)){ | |
338 | |
339 # Adjust the result with a multiple testing correction or not and with the user | |
340 # p-value cutoff | |
341 cut_result = corrMultipleTesting(result,myGOdata, correction,threshold) | |
342 }else{ | |
343 | |
344 cut_result=NULL | |
345 | |
346 } | |
347 | |
348 | |
349 createOutputs(result, cut_result,text, barplot, dotplot, onto) | |
350 |