Mercurial > repos > proteore > proteore_topgo
comparison topGO_enrichment.R @ 10:e3430084c996 draft
planemo upload commit ad5f1c5a1a71d7fa2bc8bac408856aa80b0fc2a3
author | proteore |
---|---|
date | Tue, 18 Dec 2018 10:06:00 -0500 |
parents | |
children | fa2e27165d5d |
comparison
equal
deleted
inserted
replaced
9:70c0c8757f5f | 10:e3430084c996 |
---|---|
1 options(warn=-1) #TURN OFF WARNINGS !!!!!! | |
2 | |
3 suppressMessages(library(ggplot2)) | |
4 suppressMessages(library(topGO)) | |
5 | |
6 get_args <- function(){ | |
7 | |
8 ## Collect arguments | |
9 args <- commandArgs(TRUE) | |
10 | |
11 ## Default setting when no arguments passed | |
12 if(length(args) < 1) { | |
13 args <- c("--help") | |
14 } | |
15 | |
16 ## Help section | |
17 if("--help" %in% args) { | |
18 cat("Pathview R script | |
19 Arguments: | |
20 --help Print this test | |
21 --input_type | |
22 --onto | |
23 --option | |
24 --correction | |
25 --threshold | |
26 --text | |
27 --plot | |
28 --column | |
29 --geneuniverse | |
30 --header | |
31 | |
32 Example: | |
33 Rscript --vanilla enrichment_v3.R --inputtype=tabfile (or copypaste) --input=file.txt --ontology='BP/CC/MF' --option=option | |
34 (e.g : classic/elim...) --threshold=threshold --correction=correction --textoutput=text --barplotoutput=barplot --dotplotoutput=dotplot | |
35 --column=column --geneuniver=human \n\n") | |
36 | |
37 q(save="no") | |
38 } | |
39 | |
40 parseArgs <- function(x) strsplit(sub("^--", "", x), "=") | |
41 argsDF <- as.data.frame(do.call("rbind", parseArgs(args))) | |
42 args <- as.list(as.character(argsDF$V2)) | |
43 names(args) <- argsDF$V1 | |
44 | |
45 return(args) | |
46 } | |
47 | |
48 read_file <- function(path,header){ | |
49 file <- try(read.csv(path,header=header, sep="\t",stringsAsFactors = FALSE, quote="\"", check.names = F),silent=TRUE) | |
50 if (inherits(file,"try-error")){ | |
51 stop("File not found !") | |
52 }else{ | |
53 return(file) | |
54 } | |
55 } | |
56 | |
57 get_list_from_cp <-function(list){ | |
58 list = gsub(";"," ",list) | |
59 list = strsplit(list, "[ \t\n]+")[[1]] | |
60 list = list[list != ""] #remove empty entry | |
61 list = gsub("-.+", "", list) #Remove isoform accession number (e.g. "-2") | |
62 return(list) | |
63 } | |
64 | |
65 check_ens_ids <- function(vector) { | |
66 ens_pattern = "^(ENS[A-Z]+[0-9]{11}|[A-Z]{3}[0-9]{3}[A-Za-z](-[A-Za-z])?|CG[0-9]+|[A-Z0-9]+\\.[0-9]+|YM[A-Z][0-9]{3}[a-z][0-9])$" | |
67 return(grepl(ens_pattern,vector)) | |
68 } | |
69 | |
70 str2bool <- function(x){ | |
71 if (any(is.element(c("t","true"),tolower(x)))){ | |
72 return (TRUE) | |
73 }else if (any(is.element(c("f","false"),tolower(x)))){ | |
74 return (FALSE) | |
75 }else{ | |
76 return(NULL) | |
77 } | |
78 } | |
79 | |
80 # Some libraries such as GOsummaries won't be able to treat the values such as | |
81 # "< 1e-30" produced by topGO. As such it is important to delete the < char | |
82 # with the deleteInfChar function. Nevertheless the user will have access to the original results in the text output. | |
83 deleteInfChar = function(values){ | |
84 | |
85 lines = grep("<",values) | |
86 if (length(lines)!=0){ | |
87 for (line in lines){ | |
88 values[line]=gsub("<","",values[line]) | |
89 } | |
90 } | |
91 return(values) | |
92 } | |
93 | |
94 corrMultipleTesting = function(result, myGOdata,correction,threshold){ | |
95 | |
96 # adjust for multiple testing | |
97 if (correction!="none"){ | |
98 # GenTable : transforms the result object into a list. Filters can be applied | |
99 # (e.g : with the topNodes argument, to get for instance only the n first | |
100 # GO terms with the lowest pvalues), but as we want to apply a correction we | |
101 # take all the GO terms, no matter their pvalues | |
102 allRes <- GenTable(myGOdata, test = result, orderBy = "result", ranksOf = "result",topNodes=length(attributes(result)$score)) | |
103 # Some pvalues given by topGO are not numeric (e.g : "<1e-30). As such, these | |
104 # values are converted to 1e-30 to be able to correct the pvalues | |
105 pvaluestmp = deleteInfChar(allRes$test) | |
106 | |
107 # the correction is done from the modified pvalues | |
108 allRes$qvalues = p.adjust(pvaluestmp, method = as.character(correction), n = length(pvaluestmp)) | |
109 allRes = as.data.frame(allRes) | |
110 | |
111 # Rename the test column by pvalues, so that is more explicit | |
112 nb = which(names(allRes) %in% c("test")) | |
113 | |
114 names(allRes)[nb] = "pvalues" | |
115 | |
116 allRes = allRes[which(as.numeric(allRes$pvalues) <= threshold),] | |
117 if (length(allRes$pvalues)==0){ | |
118 print("Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value") | |
119 return(NULL) | |
120 } | |
121 allRes = allRes[order(allRes$qvalues),] | |
122 } | |
123 | |
124 if (correction=="none"){ | |
125 # get all the go terms under user threshold | |
126 mysummary <- summary(attributes(result)$score <= threshold) | |
127 numsignif <- as.integer(mysummary[[3]]) | |
128 # get all significant nodes | |
129 allRes <- GenTable(myGOdata, test = result, orderBy = "result", ranksOf = "result",topNodes=numsignif) | |
130 | |
131 | |
132 allRes = as.data.frame(allRes) | |
133 # Rename the test column by pvalues, so that is more explicit | |
134 nb = which(names(allRes) %in% c("test")) | |
135 names(allRes)[nb] = "pvalues" | |
136 if (numsignif==0){ | |
137 | |
138 print("Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value") | |
139 return(NULL) | |
140 } | |
141 | |
142 allRes = allRes[order(allRes$pvalues),] | |
143 } | |
144 | |
145 return(allRes) | |
146 } | |
147 | |
148 # roundValues will simplify the results by rounding down the values. For instance 1.1e-17 becomes 1e-17 | |
149 roundValues = function(values){ | |
150 for (line in 1:length(values)){ | |
151 values[line]=as.numeric(gsub(".*e","1e",as.character(values[line]))) | |
152 } | |
153 return(values) | |
154 } | |
155 | |
156 createDotPlot = function(data, onto){ | |
157 | |
158 values = deleteInfChar(data$pvalues) | |
159 values = roundValues(values) | |
160 values = as.numeric(values) | |
161 | |
162 geneRatio = data$Significant/data$Annotated | |
163 goTerms = data$Term | |
164 count = data$Significant | |
165 | |
166 labely = paste("GO terms",onto,sep=" ") | |
167 ggplot(data,aes(x=geneRatio,y=goTerms, color=values,size=count)) +geom_point( ) + scale_colour_gradientn(colours=c("red","violet","blue")) + xlab("Gene Ratio") + ylab(labely) + labs(color="p-values\n" ) | |
168 ggsave("dotplot.png", device = "png", dpi = 320, limitsize = TRUE, width = 15, height = 15, units="cm") | |
169 } | |
170 | |
171 createBarPlot = function(data, onto){ | |
172 | |
173 | |
174 values = deleteInfChar(data$pvalues) | |
175 values = roundValues(values) | |
176 values = as.numeric(values) | |
177 | |
178 goTerms = data$Term | |
179 count = data$Significant | |
180 | |
181 labely = paste("GO terms",onto,sep=" ") | |
182 ggplot(data, aes(x=goTerms, y=count,fill=values,scale(scale = 0.5))) + ylab("Gene count") + xlab(labely) +geom_bar(stat="identity") + scale_fill_gradientn(colours=c("red","violet","blue")) + coord_flip() + labs(fill="p-values\n") | |
183 ggsave("barplot.png", device = "png", dpi = 320, limitsize = TRUE, width = 15, height = 15, units="cm") | |
184 } | |
185 | |
186 # Produce the different outputs | |
187 createOutputs = function(result, cut_result,text, barplot, dotplot, onto){ | |
188 | |
189 if (is.null(result)){ | |
190 if (text){ | |
191 err_msg = "None of the input ids can be found in the org package data, enrichment analysis cannot be realized. \n The inputs ids probably either have no associated GO terms or are not ENSG identifiers (e.g : ENSG00000012048)." | |
192 write.table(err_msg, file='result', quote=FALSE, sep='\t', col.names = T, row.names = F) | |
193 } | |
194 if (barplot){ | |
195 png(filename="barplot.png") | |
196 plot.new() | |
197 #text(0,0,err_msg) | |
198 dev.off() | |
199 } | |
200 if (dotplot){ | |
201 png(filename="dotplot.png") | |
202 plot.new() | |
203 #text(0,0,err_msg) | |
204 dev.off() | |
205 } | |
206 opt <- options(show.error.messages=FALSE) | |
207 on.exit(options(opt)) | |
208 stop("null result") | |
209 } | |
210 | |
211 if (is.null(cut_result)){ | |
212 if (text){ | |
213 err_msg = "Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value." | |
214 write.table(err_msg, file='result', quote=FALSE, sep='\t', col.names = T, row.names = F) | |
215 } | |
216 if (barplot){ | |
217 png(filename="barplot.png") | |
218 plot.new() | |
219 text(0,0,err_msg) | |
220 dev.off() | |
221 } | |
222 if (dotplot){ | |
223 png(filename="dotplot.png") | |
224 plot.new() | |
225 text(0,0,err_msg) | |
226 dev.off() | |
227 } | |
228 opt <- options(show.error.messages=FALSE) | |
229 on.exit(options(opt)) | |
230 stop("null cut_result") | |
231 } | |
232 | |
233 if (text){ | |
234 write.table(cut_result, file='result', quote=FALSE, sep='\t', col.names = T, row.names = F) | |
235 } | |
236 | |
237 if (barplot){ | |
238 createBarPlot(cut_result, onto) | |
239 } | |
240 | |
241 if (dotplot){ | |
242 createDotPlot(cut_result, onto) | |
243 } | |
244 } | |
245 | |
246 # Launch enrichment analysis and return result data from the analysis or the null | |
247 # object if the enrichment could not be done. | |
248 goEnrichment = function(geneuniverse,sample,background_sample,onto){ | |
249 | |
250 if (is.null(background_sample)){ | |
251 xx = annFUN.org(onto,mapping=geneuniverse,ID="ensembl") # get all the GO terms of the corresponding ontology (BP/CC/MF) and all their associated ensembl ids according to the org package | |
252 allGenes = unique(unlist(xx)) # check if the genes given by the user can be found in the org package (gene universe), that is in allGenes | |
253 } else { | |
254 allGenes = background_sample | |
255 } | |
256 | |
257 if (length(intersect(sample,allGenes))==0){ | |
258 print("None of the input ids can be found in the org package data, enrichment analysis cannot be realized. \n The inputs ids probably have no associated GO terms.") | |
259 return(c(NULL,NULL)) | |
260 } | |
261 | |
262 geneList = factor(as.integer(allGenes %in% sample)) | |
263 if (length(levels(geneList)) == 1 ){ | |
264 stop("All or none of the background genes are found in tested genes dataset, enrichment analysis can't be done") | |
265 } | |
266 names(geneList) <- allGenes | |
267 | |
268 #topGO enrichment | |
269 | |
270 # Creation of a topGOdata object | |
271 # It will contain : the list of genes of interest, the GO annotations and the GO hierarchy | |
272 # Parameters : | |
273 # ontology : character string specifying the ontology of interest (BP, CC, MF) | |
274 # allGenes : named vector of type numeric or factor | |
275 # annot : tells topGO how to map genes to GO annotations. | |
276 # argument not used here : nodeSize : at which minimal number of GO annotations | |
277 # do we consider a gene | |
278 | |
279 myGOdata = new("topGOdata", description="SEA with TopGO", ontology=onto, allGenes=geneList, annot = annFUN.org, mapping=geneuniverse,ID="ensembl") | |
280 | |
281 # Performing enrichment tests | |
282 result <- runTest(myGOdata, algorithm=option, statistic="fisher") | |
283 return(c(result,myGOdata)) | |
284 } | |
285 | |
286 args <- get_args() | |
287 | |
288 #save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/topGO/args.rda") | |
289 #load("/home/dchristiany/proteore_project/ProteoRE/tools/topGO/args.rda") | |
290 | |
291 input_type = args$inputtype | |
292 input = args$input | |
293 onto = args$ontology | |
294 option = args$option | |
295 correction = args$correction | |
296 threshold = as.numeric(args$threshold) | |
297 text = str2bool(args$textoutput) | |
298 barplot = "barplot" %in% unlist(strsplit(args$plot,",")) | |
299 dotplot = "dotplot" %in% unlist(strsplit(args$plot,",")) | |
300 column = as.numeric(gsub("c","",args$column)) | |
301 geneuniverse = args$geneuniverse | |
302 header = str2bool(args$header) | |
303 background = str2bool(args$background) | |
304 if (background){ | |
305 background_genes = args$background_genes | |
306 background_input_type = args$background_input_type | |
307 background_header = str2bool(args$background_header) | |
308 background_column = as.numeric(gsub("c","",args$background_column)) | |
309 } | |
310 | |
311 #get input | |
312 if (input_type=="copy_paste"){ | |
313 sample <- get_list_from_cp(input) | |
314 } else if (input_type=="file"){ | |
315 tab=read_file(input,header) | |
316 sample = trimws(unlist(strsplit(tab[,column],";"))) | |
317 } | |
318 | |
319 #check of ENS ids | |
320 if (! any(check_ens_ids(sample))){ | |
321 print("no ensembl gene ids found in your ids list, please check your IDs in input or the selected column of your input file") | |
322 stop() | |
323 } | |
324 | |
325 #get input if background genes | |
326 if (background){ | |
327 if (background_input_type=="copy_paste"){ | |
328 background_sample <- get_list_from_cp(background_genes) | |
329 } else if (background_input_type=="file"){ | |
330 background_tab=read_file(background_genes,background_header) | |
331 background_sample = unique(trimws(unlist(strsplit(background_tab[,background_column],";")))) | |
332 } | |
333 #check of ENS ids | |
334 if (! any(check_ens_ids(background_sample))){ | |
335 print("no ensembl gene ids found in your background ids list, please check your IDs in input or the selected column of your input file") | |
336 stop() | |
337 } | |
338 } else { | |
339 background_sample=NULL | |
340 } | |
341 | |
342 # Launch enrichment analysis | |
343 allresult = suppressMessages(goEnrichment(geneuniverse,sample,background_sample,onto)) | |
344 result = allresult[1][[1]] | |
345 myGOdata = allresult[2][[1]] | |
346 if (!is.null(result)){ | |
347 cut_result = corrMultipleTesting(result,myGOdata, correction,threshold) # Adjust the result with a multiple testing correction or not and with the user, p-value cutoff | |
348 }else{ | |
349 cut_result=NULL | |
350 } | |
351 | |
352 createOutputs(result, cut_result,text, barplot, dotplot, onto) |