comparison GO-enrich.R @ 13:f6107b8ae8f8 draft default tip

"planemo upload commit be9070db4a3b178ab45ecd9c9c3ab369240f7617-dirty"
author proteore
date Fri, 09 Apr 2021 14:39:05 +0000
parents d951677a50d4
children
comparison
equal deleted inserted replaced
12:85f039f53414 13:f6107b8ae8f8
1 options(warn=-1) #TURN OFF WARNINGS !!!!!! 1 options(warn = -1) #TURN OFF WARNINGS !!!!!!
2 suppressMessages(library(clusterProfiler,quietly = TRUE)) 2 suppressMessages(library(clusterProfiler, quietly = TRUE))
3 3
4 # Read file and return file content as data.frame 4 # Read file and return file content as data.frame
5 read_file <- function(path,header){ 5 read_file <- function(path, header) {
6 file <- try(read.csv(path,header=header, sep="\t",stringsAsFactors = FALSE, quote="",check.names = F),silent=TRUE) 6 file <- try(read.csv(path, header = header, sep = "\t",
7 if (inherits(file,"try-error")){ 7 stringsAsFactors = FALSE, quote = "", check.names = F),
8 silent = TRUE)
9 if (inherits(file, "try-error")) {
8 stop("File not found !") 10 stop("File not found !")
9 }else{ 11 }else{
10 file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE] 12 file <- file[!apply(is.na(file) | file == "", 1, all), , drop = FALSE]
11 return(file) 13 return(file)
12 } 14 }
13 } 15 }
14 16
15 #return the number of character from the longest description found (from the 10 first) 17
16 max_str_length_10_first <- function(vector){ 18 #return the number of character from the longest description found
19 #(from the 10 first)
20 max_str_length_10_first <- function(vector) {
17 vector <- as.vector(vector) 21 vector <- as.vector(vector)
18 nb_description = length(vector) 22 nb_description <- length(vector)
19 if (nb_description >= 10){nb_description=10} 23 if (nb_description >= 10) {
24 nb_description <- 10
25 }
26
20 return(max(nchar(vector[1:nb_description]))) 27 return(max(nchar(vector[1:nb_description])))
21 } 28 }
22 29
23 str2bool <- function(x){ 30 str2bool <- function(x) {
24 if (any(is.element(c("t","true"),tolower(x)))){ 31 if (any(is.element(c("t", "true"), tolower(x)))) {
25 return (TRUE) 32 return(TRUE)
26 }else if (any(is.element(c("f","false"),tolower(x)))){ 33 }else if (any(is.element(c("f", "false"), tolower(x)))) {
27 return (FALSE) 34 return(FALSE)
35
28 }else{ 36 }else{
29 return(NULL) 37 return(NULL)
30 } 38 }
31 } 39 }
32 40
33 #used before the limit was set to 50 characters 41 #used before the limit was set to 50 characters
34 width_by_max_char <- function (nb_max_char) { 42 width_by_max_char <- function(nb_max_char) {
35 if (nb_max_char < 50 ){ 43 if (nb_max_char < 50) {
36 width=600 44 width <- 600
37 } else if (nb_max_char < 75) { 45 } else if (nb_max_char < 75) {
38 width=800 46 width <- 800
39 } else if (nb_max_char < 100) { 47 } else if (nb_max_char < 100) {
40 width=900 48 width <- 900
41 } else { 49 } else {
42 width=1000 50 width <- 1000
43 } 51 }
44 return (width) 52 return(width)
45 } 53 }
46 54
47 repartition_GO <- function(geneid, orgdb, ontology, level=3, readable=TRUE) { 55
48 ggo<-groupGO(gene=geneid, 56 repartition_go <- function(geneid, orgdb, ontology,
49 OrgDb = orgdb, 57 level = 3, readable = TRUE) {
50 ont=ontology, 58 ggo <- groupGO(gene = geneid,
51 level=level, 59 OrgDb = orgdb,
52 readable=TRUE) 60 ont = ontology,
53 61 level = level,
54 if (length(ggo@result$ID) > 0 ) { 62 readable = TRUE)
55 ggo@result$Description <- sapply(as.vector(ggo@result$Description), function(x) {ifelse(nchar(x)>50, substr(x,1,50),x)},USE.NAMES = FALSE) 63
56 #nb_max_char = max_str_length_10_first(ggo$Description) 64
57 #width = width_by_max_char(nb_max_char) 65 if (length(ggo@result$ID) > 0) {
66 ggo@result$Description <- sapply(as.vector(ggo@result$Description), #nolint
67 function(x) {
68 ifelse(nchar(x) > 50,
69 substr(x, 1, 50), x)}, USE.NAMES = FALSE)
70
71
58 name <- paste("GGO_", ontology, "_bar-plot", sep = "") 72 name <- paste("GGO_", ontology, "_bar-plot", sep = "")
59 png(name,height = 720, width = 600) 73 png(name, height = 720, width = 600)
60 p <- barplot(ggo, showCategory=10) 74 p <- barplot(ggo, showCategory = 10)
61 print(p) 75 print(p)
62 dev.off() 76 dev.off()
63 ggo <- as.data.frame(ggo) 77 ggo <- as.data.frame(ggo)
64 return(ggo) 78 return(ggo)
65 } 79 }
66 } 80 }
67 81
82 # nolint end
83
68 # GO over-representation test 84 # GO over-representation test
69 enrich_GO <- function(geneid, universe, orgdb, ontology, pval_cutoff, qval_cutoff,plot) { 85 enrich_go <- function(geneid, universe, orgdb, ontology, pval_cutoff,
70 ego<-enrichGO(gene=geneid, 86 qval_cutoff, plot) {
71 universe=universe, 87 ego <- enrichGO(gene = geneid,
72 OrgDb=orgdb, 88
73 ont=ontology, 89 universe = universe,
74 pAdjustMethod="BH", 90 OrgDb = orgdb,
75 pvalueCutoff=pval_cutoff, 91 ont = ontology,
76 qvalueCutoff=qval_cutoff, 92 pAdjustMethod = "BH",
77 readable=TRUE) 93 pvalueCutoff = pval_cutoff,
78 94 qvalueCutoff = qval_cutoff,
95 readable = TRUE)
96
79 # Plot bar & dot plots 97 # Plot bar & dot plots
80 #if there are enriched GopTerms 98 #if there are enriched GoTerms
81 if (length(ego$ID)>0){ 99
82 100
83 ego@result$Description <- sapply(ego@result$Description, function(x) {ifelse(nchar(x)>50, substr(x,1,50),x)},USE.NAMES = FALSE) 101
84 #nb_max_char = max_str_length_10_first(ego$Description) 102 if (length(ego$ID) > 0) {
85 #width = width_by_max_char(nb_max_char) 103
86 104 ego@result$Description <- sapply(ego@result$Description, function(x) { #nolint
87 if ("dotplot" %in% plot ){ 105 ifelse(nchar(x) > 50, substr(x, 1, 50), x)
106 }, USE.NAMES = FALSE)
107
108
109 if ("dotplot" %in% plot) {
88 dot_name <- paste("EGO_", ontology, "_dot-plot", sep = "") 110 dot_name <- paste("EGO_", ontology, "_dot-plot", sep = "")
89 png(dot_name,height = 720, width = 600) 111 png(dot_name, height = 720, width = 600)
90 p <- dotplot(ego, showCategory=10) 112 p <- dotplot(ego, showCategory = 10) #nolint
91 print(p) 113 print(p)
92 dev.off() 114 dev.off()
93 } 115 }
94 116
95 if ("barplot" %in% plot ){ 117 if ("barplot" %in% plot) {
96 bar_name <- paste("EGO_", ontology, "_bar-plot", sep = "") 118 bar_name <- paste("EGO_", ontology, "_bar-plot", sep = "")
97 png(bar_name,height = 720, width = 600) 119 png(bar_name, height = 720, width = 600)
98 p <- barplot(ego, showCategory=10) 120 p <- barplot(ego, showCategory = 10)
99 print(p) 121 print(p)
100 dev.off() 122 dev.off()
101 123 }
102 } 124
103 ego <- as.data.frame(ego) 125 ego <- as.data.frame(ego)
104 return(ego) 126 return(ego)
105 } else { 127 } else {
106 warning(paste("No Go terms enriched (EGO) found for ",ontology,"ontology"),immediate. = TRUE,noBreaks. = TRUE,call. = FALSE) 128 warning(paste("No Go terms enriched (EGO) found for ",
107 } 129 ontology, "ontology"), immediate. = TRUE, noBreaks. = TRUE, call. = FALSE)
108 } 130 }
109 131 }
110 clean_ids <- function(ids){ 132
111 ids = gsub(" ","",ids) 133
112 ids = ids[which(ids!="")] 134 clean_ids <- function(ids) {
113 ids = ids[which(ids!="NA")] 135 ids <- gsub(" ", "", ids)
114 ids = ids[!is.na(ids)] 136 ids <- ids[which(ids != "")]
115 137 ids <- ids[which(ids != "NA")]
116 return(ids) 138 ids <- ids[!is.na(ids)]
117 } 139
118 140 return(ids)
119 check_ids <- function(vector,type) { 141 }
120 uniprot_pattern = "^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$" 142
121 entrez_id = "^([0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+)$" 143 check_ids <- function(vector, type) {
122 if (type == "entrez") 144 uniprot_pattern <- "^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$" #nolint
123 return(grepl(entrez_id,vector)) 145 entrez_id <- "^([0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+)$" #nolint
124 else if (type == "uniprot") { 146 if (type == "entrez") {
125 return(grepl(uniprot_pattern,vector)) 147 return(grepl(entrez_id, vector))
126 } 148 }else if (type == "uniprot") {
127 } 149 return(grepl(uniprot_pattern, vector))
128 150 }
129 get_args <- function(){ 151 }
152
153 get_args <- function() {
130 args <- commandArgs(TRUE) 154 args <- commandArgs(TRUE)
131 if(length(args)<1) { 155 if (length(args) < 1) {
132 args <- c("--help") 156 args <- c("--help")
133 } 157 }
134 158
135 # Help section 159 # Help section
136 if("--help" %in% args) { 160 if ("--help" %in% args) {
137 cat("clusterProfiler Enrichment Analysis 161 cat("clusterProfiler Enrichment Analysis
138 Arguments: 162 Arguments:
139 --input_type: type of input (list of id or filename) 163 --input_type: type of input (list of id or filename)
140 --input: input 164 --input: input
141 --ncol: the column number which contains list of input IDs 165 --ncol: the column number which contains list of input IDs
150 --onto_opt: ontology options 174 --onto_opt: ontology options
151 --go_function: groupGO/enrichGO 175 --go_function: groupGO/enrichGO
152 --level: 1-3 176 --level: 1-3
153 --pval_cutoff 177 --pval_cutoff
154 --qval_cutoff 178 --qval_cutoff
155 --text_output: text output filename 179 --text_output: text output filename
156 --plot : type of visualization, dotplot or/and barplot \n") 180 --plot : type of visualization, dotplot or/and barplot \n")
157 q(save="no") 181 q(save = "no")
158 } 182 }
159 # Parse arguments 183 # Parse arguments
160 parseArgs <- function(x) strsplit(sub("^--", "", x), "=") 184
161 argsDF <- as.data.frame(do.call("rbind", parseArgs(args))) 185 parseargs <- function(x) strsplit(sub("^--", "", x), "=")
162 args <- as.list(as.character(argsDF$V2)) 186 argsdf <- as.data.frame(do.call("rbind", parseargs(args)))
163 names(args) <- argsDF$V1 187 args <- as.list(as.character(argsdf$V2))
164 188 names(args) <- argsdf$V1
165 return(args) 189 return(args)
166 } 190 }
167 191
168 192 main <- function() { #nolint
169 main <- function() {
170
171 #get args from command 193 #get args from command
172 args <- get_args() 194 args <- get_args()
173 195
174 #save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/cluster_profiler/args.Rda") 196 go_represent <- str2bool(args$go_represent)
175 #load("/home/dchristiany/proteore_project/ProteoRE/tools/cluster_profiler/args.Rda") 197 go_enrich <- str2bool(args$go_enrich)
176 198 if (go_enrich) {
177 go_represent=str2bool(args$go_represent) 199 plot <- unlist(strsplit(args$plot, ","))
178 go_enrich=str2bool(args$go_enrich) 200 }
179 if (go_enrich){ 201
180 plot = unlist(strsplit(args$plot,","))
181 }
182
183 suppressMessages(library(args$species, character.only = TRUE, quietly = TRUE)) 202 suppressMessages(library(args$species, character.only = TRUE, quietly = TRUE))
184 203
185 # Extract OrgDb 204 # Extract OrgDb
186 if (args$species=="org.Hs.eg.db") { 205 if (args$species == "org.Hs.eg.db") {
187 orgdb<-org.Hs.eg.db 206 orgdb <- org.Hs.eg.db #nolint
188 } else if (args$species=="org.Mm.eg.db") { 207 } else if (args$species == "org.Mm.eg.db") {
189 orgdb<-org.Mm.eg.db 208 orgdb <- org.Mm.eg.db #nolint
190 } else if (args$species=="org.Rn.eg.db") { 209 } else if (args$species == "org.Rn.eg.db") {
191 orgdb<-org.Rn.eg.db 210 orgdb <- org.Rn.eg.db #nolint
192 } 211 }
193 212
194 # Extract input IDs 213 # Extract input IDs
195 input_type = args$input_type 214 input_type <- args$input_type
196 id_type = args$id_type 215 id_type <- args$id_type
197 216
198 if (input_type == "text") { 217 if (input_type == "text") {
199 input = unlist(strsplit(strsplit(args$input, "[ \t\n]+")[[1]],";")) 218 input <- unlist(strsplit(strsplit(args$input, "[ \t\n]+")[[1]], ";"))
200 } else if (input_type == "file") { 219 } else if (input_type == "file") {
201 filename = args$input 220 filename <- args$input
202 ncol = args$ncol 221 ncol <- args$ncol
203 # Check ncol 222 # Check ncol
204 if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) { 223 if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) {
205 stop("Please enter the right format for column number: c[number]") 224 stop("Please enter the right format for column number: c[number]")
206 } else { 225 } else {
207 ncol = as.numeric(gsub("c", "", ncol)) 226 ncol <- as.numeric(gsub("c", "", ncol))
208 } 227 }
209 header = str2bool(args$header) # Get file content 228
210 file = read_file(filename, header) # Extract Protein IDs list 229 header <- str2bool(args$header) # Get file content
211 input = unlist(sapply(as.character(file[,ncol]),function(x) rapply(strsplit(x,";"),c),USE.NAMES = FALSE)) 230 file <- read_file(filename, header) # Extract Protein IDs list
212 } 231 input <- unlist(sapply(as.character(file[, ncol]),
213 input = clean_ids(input) 232 function(x) rapply(strsplit(x, ";"), c), USE.NAMES = FALSE))
214 233 }
234 input <- clean_ids(input)
235
215 ## Get input gene list from input IDs 236 ## Get input gene list from input IDs
216 #ID format Conversion 237 #ID format Conversion
217 #This case : from UNIPROT (protein id) to ENTREZ (gene id) 238 #This case : from UNIPROT (protein id) to ENTREZ (gene id)
218 #bitr = conversion function from clusterProfiler 239 #bitr = conversion function from clusterProfiler
219 if (id_type=="Uniprot" & any(check_ids(input,"uniprot"))) { 240 if (id_type == "Uniprot" & any(check_ids(input, "uniprot"))) {
220 any(check_ids(input,"uniprot")) 241 any(check_ids(input, "uniprot"))
221 idFrom<-"UNIPROT" 242
222 idTo<-"ENTREZID" 243 idfrom <- "UNIPROT"
223 suppressMessages(gene<-bitr(input, fromType=idFrom, toType=idTo, OrgDb=orgdb)) 244 idto <- "ENTREZID"
224 gene<-unique(gene$ENTREZID) 245 suppressMessages(gene <- bitr(input, fromType = idfrom, toType = idto,
225 } else if (id_type=="Entrez" & any(check_ids(input,"entrez"))) { 246 OrgDb = orgdb))
226 gene<-unique(input) 247
248 gene <- unique(gene$ENTREZID)
249 } else if (id_type == "Entrez" & any(check_ids(input, "entrez"))) {
250 gene <- unique(input)
227 } else { 251 } else {
228 stop(paste(id_type,"not found in your ids list, please check your IDs in input or the selected column of your input file")) 252
253 stop(paste(id_type, "not found in your ids list,
254 please check your IDs in input or the selected column of your input file"))
255
229 } 256 }
230 257
231 ontology <- strsplit(args$onto_opt, ",")[[1]] 258 ontology <- strsplit(args$onto_opt, ",")[[1]]
232 259
233 ## Extract GGO/EGO arguments 260 ## Extract GGO/EGO arguments
234 if (go_represent) {level <- as.numeric(args$level)} 261 if (go_represent) {
262 level <- as.numeric(args$level)
263
264 }
265
235 if (go_enrich) { 266 if (go_enrich) {
236 pval_cutoff <- as.numeric(args$pval_cutoff) 267 pval_cutoff <- as.numeric(args$pval_cutoff)
237 qval_cutoff <- as.numeric(args$qval_cutoff) 268 qval_cutoff <- as.numeric(args$qval_cutoff)
238 # Extract universe background genes (same as input file) 269 # Extract universe background genes (same as input file)
239 if (!is.null(args$universe_type)) { 270 if (!is.null(args$universe_type)) {
240 universe_type = args$universe_type 271 universe_type <- args$universe_type
241 if (universe_type == "text") { 272 if (universe_type == "text") {
242 universe = unlist(strsplit(strsplit(args$input, "[ \t\n]+")[[1]],";")) 273 universe <- unlist(strsplit(strsplit(args$input, "[ \t\n]+")[[1]], ";"))
243 } else if (universe_type == "file") { 274 } else if (universe_type == "file") {
244 universe_filename = args$universe 275 universe_filename <- args$universe
245 universe_ncol = args$uncol 276 universe_ncol <- args$uncol
246 # Check ncol 277 # Check ncol
247 if (! as.numeric(gsub("c", "", universe_ncol)) %% 1 == 0) { 278 if (! as.numeric(gsub("c", "", universe_ncol)) %% 1 == 0) {
248 stop("Please enter the right format for column number: c[number]") 279 stop("Please enter the right format for column number: c[number]")
249 } else { 280 } else {
250 universe_ncol = as.numeric(gsub("c", "", universe_ncol)) 281 universe_ncol <- as.numeric(gsub("c", "", universe_ncol))
251 } 282 }
252 universe_header = str2bool(args$uheader) 283 universe_header <- str2bool(args$uheader)
253 # Get file content 284 # Get file content
254 universe_file = read_file(universe_filename, universe_header) 285 universe_file <- read_file(universe_filename, universe_header)
255 # Extract Protein IDs list 286 # Extract Protein IDs list
256 universe <- unlist(sapply(universe_file[,universe_ncol], function(x) rapply(strsplit(x,";"),c),USE.NAMES = FALSE)) 287 universe <- unlist(sapply(universe_file[, universe_ncol],
288 function(x) rapply(strsplit(x, ";"), c), USE.NAMES = FALSE))
257 } 289 }
258 universe = clean_ids(input) 290 universe <- clean_ids(input)
259 universe_id_type = args$universe_id_type 291 universe_id_type <- args$universe_id_type
260 ##to initialize 292 ##to initialize
261 if (universe_id_type=="Uniprot" & any(check_ids(universe,"uniprot"))) { 293 if (universe_id_type == "Uniprot" & any(check_ids(universe, "uniprot"))) {
262 idFrom<-"UNIPROT" 294 idfrom <- "UNIPROT"
263 idTo<-"ENTREZID" 295 idto <- "ENTREZID"
264 suppressMessages(universe_gene<-bitr(universe, fromType=idFrom, toType=idTo, OrgDb=orgdb)) 296 suppressMessages(universe_gene <- bitr(universe, fromType = idfrom,
265 universe_gene<-unique(universe_gene$ENTREZID) 297 toType = idto, OrgDb = orgdb))
266 } else if (universe_id_type=="Entrez" & any(check_ids(universe,"entrez"))) { 298 universe_gene <- unique(universe_gene$ENTREZID)
267 universe_gene<-unique(unlist(universe)) 299 } else if (universe_id_type == "Entrez" &
300 any(check_ids(universe, "entrez"))) {
301 universe_gene <- unique(unlist(universe))
268 } else { 302 } else {
269 if (universe_type=="text"){ 303 if (universe_type == "text") {
270 print(paste(universe_id_type,"not found in your background IDs list",sep=" ")) 304 print(paste(universe_id_type, "not found in your background IDs list",
305 sep = " "))
271 } else { 306 } else {
272 print(paste(universe_id_type,"not found in the column",universe_ncol,"of your background IDs file",sep=" ")) 307 print(paste(universe_id_type, "not found in the column",
308 universe_ncol, "of your background IDs file", sep = " "))
309
273 } 310 }
274 universe_gene = NULL 311 universe_gene <- NULL
275 } 312 }
276 } else { 313 } else {
277 universe_gene = NULL 314 universe_gene <- NULL
278 } 315 }
279 } else { 316 } else {
280 universe_gene = NULL 317 universe_gene <- NULL
281 } 318 }
282 319
283 ##enrichGO : GO over-representation test 320 ##enrichGO : GO over-representation test
284 for (onto in ontology) { 321 for (onto in ontology) {
285 if (go_represent) { 322 if (go_represent) {
286 ggo<-repartition_GO(gene, orgdb, onto, level, readable=TRUE) 323 ggo <- repartition_go(gene, orgdb, onto, level, readable = TRUE)
287 if (is.list(ggo)){ggo <- as.data.frame(apply(ggo, c(1,2), function(x) gsub("^$|^ $", NA, x)))} #convert "" and " " to NA 324 if (is.list(ggo)) {
288 output_path = paste("cluster_profiler_GGO_",onto,".tsv",sep="") 325 ggo <- as.data.frame(apply(ggo, c(1, 2),
289 write.table(ggo, output_path, sep="\t", row.names = FALSE, quote = FALSE ) 326 function(x) gsub("^$|^ $", NA, x)))
327 } #convert "" and " " to NA
328 output_path <- paste("cluster_profiler_GGO_", onto, ".tsv", sep = "")
329 write.table(ggo, output_path, sep = "\t",
330 row.names = FALSE, quote = FALSE)
331
290 } 332 }
291 333
292 if (go_enrich) { 334 if (go_enrich) {
293 ego<-enrich_GO(gene, universe_gene, orgdb, onto, pval_cutoff, qval_cutoff,plot) 335 ego <- enrich_go(gene, universe_gene, orgdb, onto, pval_cutoff,
294 if (is.list(ego)){ego <- as.data.frame(apply(ego, c(1,2), function(x) gsub("^$|^ $", NA, x)))} #convert "" and " " to NA 336 qval_cutoff, plot)
295 output_path = paste("cluster_profiler_EGO_",onto,".tsv",sep="") 337 if (is.list(ego)) {
296 write.table(ego, output_path, sep="\t", row.names = FALSE, quote = FALSE ) 338 ego <- as.data.frame(apply(ego, c(1, 2),
297 } 339 function(x) gsub("^$|^ $", NA, x)))
298 } 340 } #convert "" and " " to NA
299 } 341 output_path <- paste("cluster_profiler_EGO_", onto, ".tsv", sep = "")
300 342 write.table(ego, output_path, sep = "\t",
301 if(!interactive()) { 343 row.names = FALSE, quote = FALSE)
344 }
345 }
346 }
347
348 if (!interactive()) {
302 main() 349 main()
303 } 350 }