comparison topGO_enrichment.R @ 10:e3430084c996 draft

planemo upload commit ad5f1c5a1a71d7fa2bc8bac408856aa80b0fc2a3
author proteore
date Tue, 18 Dec 2018 10:06:00 -0500
parents
children fa2e27165d5d
comparison
equal deleted inserted replaced
9:70c0c8757f5f 10:e3430084c996
1 options(warn=-1) #TURN OFF WARNINGS !!!!!!
2
3 suppressMessages(library(ggplot2))
4 suppressMessages(library(topGO))
5
6 get_args <- function(){
7
8 ## Collect arguments
9 args <- commandArgs(TRUE)
10
11 ## Default setting when no arguments passed
12 if(length(args) < 1) {
13 args <- c("--help")
14 }
15
16 ## Help section
17 if("--help" %in% args) {
18 cat("Pathview R script
19 Arguments:
20 --help Print this test
21 --input_type
22 --onto
23 --option
24 --correction
25 --threshold
26 --text
27 --plot
28 --column
29 --geneuniverse
30 --header
31
32 Example:
33 Rscript --vanilla enrichment_v3.R --inputtype=tabfile (or copypaste) --input=file.txt --ontology='BP/CC/MF' --option=option
34 (e.g : classic/elim...) --threshold=threshold --correction=correction --textoutput=text --barplotoutput=barplot --dotplotoutput=dotplot
35 --column=column --geneuniver=human \n\n")
36
37 q(save="no")
38 }
39
40 parseArgs <- function(x) strsplit(sub("^--", "", x), "=")
41 argsDF <- as.data.frame(do.call("rbind", parseArgs(args)))
42 args <- as.list(as.character(argsDF$V2))
43 names(args) <- argsDF$V1
44
45 return(args)
46 }
47
48 read_file <- function(path,header){
49 file <- try(read.csv(path,header=header, sep="\t",stringsAsFactors = FALSE, quote="\"", check.names = F),silent=TRUE)
50 if (inherits(file,"try-error")){
51 stop("File not found !")
52 }else{
53 return(file)
54 }
55 }
56
57 get_list_from_cp <-function(list){
58 list = gsub(";"," ",list)
59 list = strsplit(list, "[ \t\n]+")[[1]]
60 list = list[list != ""] #remove empty entry
61 list = gsub("-.+", "", list) #Remove isoform accession number (e.g. "-2")
62 return(list)
63 }
64
65 check_ens_ids <- function(vector) {
66 ens_pattern = "^(ENS[A-Z]+[0-9]{11}|[A-Z]{3}[0-9]{3}[A-Za-z](-[A-Za-z])?|CG[0-9]+|[A-Z0-9]+\\.[0-9]+|YM[A-Z][0-9]{3}[a-z][0-9])$"
67 return(grepl(ens_pattern,vector))
68 }
69
70 str2bool <- function(x){
71 if (any(is.element(c("t","true"),tolower(x)))){
72 return (TRUE)
73 }else if (any(is.element(c("f","false"),tolower(x)))){
74 return (FALSE)
75 }else{
76 return(NULL)
77 }
78 }
79
80 # Some libraries such as GOsummaries won't be able to treat the values such as
81 # "< 1e-30" produced by topGO. As such it is important to delete the < char
82 # with the deleteInfChar function. Nevertheless the user will have access to the original results in the text output.
83 deleteInfChar = function(values){
84
85 lines = grep("<",values)
86 if (length(lines)!=0){
87 for (line in lines){
88 values[line]=gsub("<","",values[line])
89 }
90 }
91 return(values)
92 }
93
94 corrMultipleTesting = function(result, myGOdata,correction,threshold){
95
96 # adjust for multiple testing
97 if (correction!="none"){
98 # GenTable : transforms the result object into a list. Filters can be applied
99 # (e.g : with the topNodes argument, to get for instance only the n first
100 # GO terms with the lowest pvalues), but as we want to apply a correction we
101 # take all the GO terms, no matter their pvalues
102 allRes <- GenTable(myGOdata, test = result, orderBy = "result", ranksOf = "result",topNodes=length(attributes(result)$score))
103 # Some pvalues given by topGO are not numeric (e.g : "<1e-30). As such, these
104 # values are converted to 1e-30 to be able to correct the pvalues
105 pvaluestmp = deleteInfChar(allRes$test)
106
107 # the correction is done from the modified pvalues
108 allRes$qvalues = p.adjust(pvaluestmp, method = as.character(correction), n = length(pvaluestmp))
109 allRes = as.data.frame(allRes)
110
111 # Rename the test column by pvalues, so that is more explicit
112 nb = which(names(allRes) %in% c("test"))
113
114 names(allRes)[nb] = "pvalues"
115
116 allRes = allRes[which(as.numeric(allRes$pvalues) <= threshold),]
117 if (length(allRes$pvalues)==0){
118 print("Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value")
119 return(NULL)
120 }
121 allRes = allRes[order(allRes$qvalues),]
122 }
123
124 if (correction=="none"){
125 # get all the go terms under user threshold
126 mysummary <- summary(attributes(result)$score <= threshold)
127 numsignif <- as.integer(mysummary[[3]])
128 # get all significant nodes
129 allRes <- GenTable(myGOdata, test = result, orderBy = "result", ranksOf = "result",topNodes=numsignif)
130
131
132 allRes = as.data.frame(allRes)
133 # Rename the test column by pvalues, so that is more explicit
134 nb = which(names(allRes) %in% c("test"))
135 names(allRes)[nb] = "pvalues"
136 if (numsignif==0){
137
138 print("Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value")
139 return(NULL)
140 }
141
142 allRes = allRes[order(allRes$pvalues),]
143 }
144
145 return(allRes)
146 }
147
148 # roundValues will simplify the results by rounding down the values. For instance 1.1e-17 becomes 1e-17
149 roundValues = function(values){
150 for (line in 1:length(values)){
151 values[line]=as.numeric(gsub(".*e","1e",as.character(values[line])))
152 }
153 return(values)
154 }
155
156 createDotPlot = function(data, onto){
157
158 values = deleteInfChar(data$pvalues)
159 values = roundValues(values)
160 values = as.numeric(values)
161
162 geneRatio = data$Significant/data$Annotated
163 goTerms = data$Term
164 count = data$Significant
165
166 labely = paste("GO terms",onto,sep=" ")
167 ggplot(data,aes(x=geneRatio,y=goTerms, color=values,size=count)) +geom_point( ) + scale_colour_gradientn(colours=c("red","violet","blue")) + xlab("Gene Ratio") + ylab(labely) + labs(color="p-values\n" )
168 ggsave("dotplot.png", device = "png", dpi = 320, limitsize = TRUE, width = 15, height = 15, units="cm")
169 }
170
171 createBarPlot = function(data, onto){
172
173
174 values = deleteInfChar(data$pvalues)
175 values = roundValues(values)
176 values = as.numeric(values)
177
178 goTerms = data$Term
179 count = data$Significant
180
181 labely = paste("GO terms",onto,sep=" ")
182 ggplot(data, aes(x=goTerms, y=count,fill=values,scale(scale = 0.5))) + ylab("Gene count") + xlab(labely) +geom_bar(stat="identity") + scale_fill_gradientn(colours=c("red","violet","blue")) + coord_flip() + labs(fill="p-values\n")
183 ggsave("barplot.png", device = "png", dpi = 320, limitsize = TRUE, width = 15, height = 15, units="cm")
184 }
185
186 # Produce the different outputs
187 createOutputs = function(result, cut_result,text, barplot, dotplot, onto){
188
189 if (is.null(result)){
190 if (text){
191 err_msg = "None of the input ids can be found in the org package data, enrichment analysis cannot be realized. \n The inputs ids probably either have no associated GO terms or are not ENSG identifiers (e.g : ENSG00000012048)."
192 write.table(err_msg, file='result', quote=FALSE, sep='\t', col.names = T, row.names = F)
193 }
194 if (barplot){
195 png(filename="barplot.png")
196 plot.new()
197 #text(0,0,err_msg)
198 dev.off()
199 }
200 if (dotplot){
201 png(filename="dotplot.png")
202 plot.new()
203 #text(0,0,err_msg)
204 dev.off()
205 }
206 opt <- options(show.error.messages=FALSE)
207 on.exit(options(opt))
208 stop("null result")
209 }
210
211 if (is.null(cut_result)){
212 if (text){
213 err_msg = "Threshold was too stringent, no GO term found with pvalue equal or lesser than the threshold value."
214 write.table(err_msg, file='result', quote=FALSE, sep='\t', col.names = T, row.names = F)
215 }
216 if (barplot){
217 png(filename="barplot.png")
218 plot.new()
219 text(0,0,err_msg)
220 dev.off()
221 }
222 if (dotplot){
223 png(filename="dotplot.png")
224 plot.new()
225 text(0,0,err_msg)
226 dev.off()
227 }
228 opt <- options(show.error.messages=FALSE)
229 on.exit(options(opt))
230 stop("null cut_result")
231 }
232
233 if (text){
234 write.table(cut_result, file='result', quote=FALSE, sep='\t', col.names = T, row.names = F)
235 }
236
237 if (barplot){
238 createBarPlot(cut_result, onto)
239 }
240
241 if (dotplot){
242 createDotPlot(cut_result, onto)
243 }
244 }
245
246 # Launch enrichment analysis and return result data from the analysis or the null
247 # object if the enrichment could not be done.
248 goEnrichment = function(geneuniverse,sample,background_sample,onto){
249
250 if (is.null(background_sample)){
251 xx = annFUN.org(onto,mapping=geneuniverse,ID="ensembl") # get all the GO terms of the corresponding ontology (BP/CC/MF) and all their associated ensembl ids according to the org package
252 allGenes = unique(unlist(xx)) # check if the genes given by the user can be found in the org package (gene universe), that is in allGenes
253 } else {
254 allGenes = background_sample
255 }
256
257 if (length(intersect(sample,allGenes))==0){
258 print("None of the input ids can be found in the org package data, enrichment analysis cannot be realized. \n The inputs ids probably have no associated GO terms.")
259 return(c(NULL,NULL))
260 }
261
262 geneList = factor(as.integer(allGenes %in% sample))
263 if (length(levels(geneList)) == 1 ){
264 stop("All or none of the background genes are found in tested genes dataset, enrichment analysis can't be done")
265 }
266 names(geneList) <- allGenes
267
268 #topGO enrichment
269
270 # Creation of a topGOdata object
271 # It will contain : the list of genes of interest, the GO annotations and the GO hierarchy
272 # Parameters :
273 # ontology : character string specifying the ontology of interest (BP, CC, MF)
274 # allGenes : named vector of type numeric or factor
275 # annot : tells topGO how to map genes to GO annotations.
276 # argument not used here : nodeSize : at which minimal number of GO annotations
277 # do we consider a gene
278
279 myGOdata = new("topGOdata", description="SEA with TopGO", ontology=onto, allGenes=geneList, annot = annFUN.org, mapping=geneuniverse,ID="ensembl")
280
281 # Performing enrichment tests
282 result <- runTest(myGOdata, algorithm=option, statistic="fisher")
283 return(c(result,myGOdata))
284 }
285
286 args <- get_args()
287
288 #save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/topGO/args.rda")
289 #load("/home/dchristiany/proteore_project/ProteoRE/tools/topGO/args.rda")
290
291 input_type = args$inputtype
292 input = args$input
293 onto = args$ontology
294 option = args$option
295 correction = args$correction
296 threshold = as.numeric(args$threshold)
297 text = str2bool(args$textoutput)
298 barplot = "barplot" %in% unlist(strsplit(args$plot,","))
299 dotplot = "dotplot" %in% unlist(strsplit(args$plot,","))
300 column = as.numeric(gsub("c","",args$column))
301 geneuniverse = args$geneuniverse
302 header = str2bool(args$header)
303 background = str2bool(args$background)
304 if (background){
305 background_genes = args$background_genes
306 background_input_type = args$background_input_type
307 background_header = str2bool(args$background_header)
308 background_column = as.numeric(gsub("c","",args$background_column))
309 }
310
311 #get input
312 if (input_type=="copy_paste"){
313 sample <- get_list_from_cp(input)
314 } else if (input_type=="file"){
315 tab=read_file(input,header)
316 sample = trimws(unlist(strsplit(tab[,column],";")))
317 }
318
319 #check of ENS ids
320 if (! any(check_ens_ids(sample))){
321 print("no ensembl gene ids found in your ids list, please check your IDs in input or the selected column of your input file")
322 stop()
323 }
324
325 #get input if background genes
326 if (background){
327 if (background_input_type=="copy_paste"){
328 background_sample <- get_list_from_cp(background_genes)
329 } else if (background_input_type=="file"){
330 background_tab=read_file(background_genes,background_header)
331 background_sample = unique(trimws(unlist(strsplit(background_tab[,background_column],";"))))
332 }
333 #check of ENS ids
334 if (! any(check_ens_ids(background_sample))){
335 print("no ensembl gene ids found in your background ids list, please check your IDs in input or the selected column of your input file")
336 stop()
337 }
338 } else {
339 background_sample=NULL
340 }
341
342 # Launch enrichment analysis
343 allresult = suppressMessages(goEnrichment(geneuniverse,sample,background_sample,onto))
344 result = allresult[1][[1]]
345 myGOdata = allresult[2][[1]]
346 if (!is.null(result)){
347 cut_result = corrMultipleTesting(result,myGOdata, correction,threshold) # Adjust the result with a multiple testing correction or not and with the user, p-value cutoff
348 }else{
349 cut_result=NULL
350 }
351
352 createOutputs(result, cut_result,text, barplot, dotplot, onto)