Mercurial > repos > proteore > proteore_clusterprofiler
comparison GO-enrich.R @ 7:4609346d8108 draft
planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
author | proteore |
---|---|
date | Tue, 18 Dec 2018 09:21:32 -0500 |
parents | 5e16cec55146 |
children | b29255864039 |
comparison
equal
deleted
inserted
replaced
6:5e16cec55146 | 7:4609346d8108 |
---|---|
1 library(clusterProfiler) | 1 options(warn=-1) #TURN OFF WARNINGS !!!!!! |
2 | 2 suppressMessages(library(clusterProfiler,quietly = TRUE)) |
3 #library(org.Sc.sgd.db) | |
4 library(org.Hs.eg.db) | |
5 library(org.Mm.eg.db) | |
6 | 3 |
7 # Read file and return file content as data.frame | 4 # Read file and return file content as data.frame |
8 readfile = function(filename, header) { | 5 read_file <- function(path,header){ |
9 if (header == "true") { | 6 file <- try(read.csv(path,header=header, sep="\t",stringsAsFactors = FALSE, quote="",check.names = F),silent=TRUE) |
10 # Read only first line of the file as header: | 7 if (inherits(file,"try-error")){ |
11 headers <- read.table(filename, nrows = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "") | 8 stop("File not found !") |
12 #Read the data of the files (skipping the first row) | 9 }else{ |
13 file <- read.table(filename, skip = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "") | |
14 # Remove empty rows | |
15 file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE] | 10 file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE] |
16 #And assign the header to the data | 11 return(file) |
17 names(file) <- headers | 12 } |
18 } | 13 } |
19 else { | 14 |
20 file <- read.table(filename, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "") | 15 #return the number of character from the longest description found (from the 10 first) |
21 # Remove empty rows | 16 max_str_length_10_first <- function(vector){ |
22 file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE] | 17 vector <- as.vector(vector) |
23 } | 18 nb_description = length(vector) |
24 return(file) | 19 if (nb_description >= 10){nb_description=10} |
20 return(max(nchar(vector[1:nb_description]))) | |
21 } | |
22 | |
23 str2bool <- function(x){ | |
24 if (any(is.element(c("t","true"),tolower(x)))){ | |
25 return (TRUE) | |
26 }else if (any(is.element(c("f","false"),tolower(x)))){ | |
27 return (FALSE) | |
28 }else{ | |
29 return(NULL) | |
30 } | |
31 } | |
32 | |
33 #used before the limit was set to 50 characters | |
34 width_by_max_char <- function (nb_max_char) { | |
35 if (nb_max_char < 50 ){ | |
36 width=600 | |
37 } else if (nb_max_char < 75) { | |
38 width=800 | |
39 } else if (nb_max_char < 100) { | |
40 width=900 | |
41 } else { | |
42 width=1000 | |
43 } | |
44 return (width) | |
25 } | 45 } |
26 | 46 |
27 repartition.GO <- function(geneid, orgdb, ontology, level=3, readable=TRUE) { | 47 repartition.GO <- function(geneid, orgdb, ontology, level=3, readable=TRUE) { |
28 ggo<-groupGO(gene=geneid, | 48 ggo<-groupGO(gene=geneid, |
29 OrgDb = orgdb, | 49 OrgDb = orgdb, |
30 ont=ontology, | 50 ont=ontology, |
31 level=level, | 51 level=level, |
32 readable=TRUE) | 52 readable=TRUE) |
33 name <- paste("GGO.", ontology, ".png", sep = "") | 53 |
34 png(name) | 54 if (length(ggo@result$ID) > 0 ) { |
35 p <- barplot(ggo, showCategory=10) | 55 ggo@result$Description <- sapply(as.vector(ggo@result$Description), function(x) {ifelse(nchar(x)>50, substr(x,1,50),x)},USE.NAMES = FALSE) |
36 print(p) | 56 #nb_max_char = max_str_length_10_first(ggo$Description) |
37 dev.off() | 57 #width = width_by_max_char(nb_max_char) |
38 return(ggo) | 58 name <- paste("GGO_", ontology, "_bar-plot", sep = "") |
59 png(name,height = 720, width = 600) | |
60 p <- barplot(ggo, showCategory=10) | |
61 print(p) | |
62 dev.off() | |
63 ggo <- as.data.frame(ggo) | |
64 return(ggo) | |
65 } | |
39 } | 66 } |
40 | 67 |
41 # GO over-representation test | 68 # GO over-representation test |
42 enrich.GO <- function(geneid, universe, orgdb, ontology, pval_cutoff, qval_cutoff) { | 69 enrich.GO <- function(geneid, universe, orgdb, ontology, pval_cutoff, qval_cutoff,plot) { |
43 ego<-enrichGO(gene=geneid, | 70 ego<-enrichGO(gene=geneid, |
44 universe=universe, | 71 universe=universe, |
45 OrgDb=orgdb, | 72 OrgDb=orgdb, |
46 keytype="ENTREZID", | |
47 ont=ontology, | 73 ont=ontology, |
48 pAdjustMethod="BH", | 74 pAdjustMethod="BH", |
49 pvalueCutoff=pval_cutoff, | 75 pvalueCutoff=pval_cutoff, |
50 qvalueCutoff=qval_cutoff, | 76 qvalueCutoff=qval_cutoff, |
51 readable=TRUE) | 77 readable=TRUE) |
78 | |
52 # Plot bar & dot plots | 79 # Plot bar & dot plots |
53 bar_name <- paste("EGO.", ontology, ".bar.png", sep = "") | 80 #if there are enriched GopTerms |
54 png(bar_name) | 81 if (length(ego$ID)>0){ |
55 p <- barplot(ego) | 82 |
56 print(p) | 83 ego@result$Description <- sapply(ego@result$Description, function(x) {ifelse(nchar(x)>50, substr(x,1,50),x)},USE.NAMES = FALSE) |
57 dev.off() | 84 #nb_max_char = max_str_length_10_first(ego$Description) |
58 dot_name <- paste("EGO.", ontology, ".dot.png", sep = "") | 85 #width = width_by_max_char(nb_max_char) |
59 png(dot_name) | 86 |
60 p <- dotplot(ego, showCategory=10) | 87 if ("dotplot" %in% plot ){ |
61 print(p) | 88 dot_name <- paste("EGO_", ontology, "_dot-plot", sep = "") |
62 dev.off() | 89 png(dot_name,height = 720, width = 600) |
63 return(ego) | 90 p <- dotplot(ego, showCategory=10) |
91 print(p) | |
92 dev.off() | |
93 } | |
94 | |
95 if ("barplot" %in% plot ){ | |
96 bar_name <- paste("EGO_", ontology, "_bar-plot", sep = "") | |
97 png(bar_name,height = 720, width = 600) | |
98 p <- barplot(ego, showCategory=10) | |
99 print(p) | |
100 dev.off() | |
101 | |
102 } | |
103 ego <- as.data.frame(ego) | |
104 return(ego) | |
105 } else { | |
106 warning(paste("No Go terms enriched (EGO) found for ",ontology,"ontology"),immediate. = TRUE,noBreaks. = TRUE,call. = FALSE) | |
107 } | |
108 } | |
109 | |
110 check_ids <- function(vector,type) { | |
111 uniprot_pattern = "^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$" | |
112 entrez_id = "^([0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+)$" | |
113 if (type == "entrez") | |
114 return(grepl(entrez_id,vector)) | |
115 else if (type == "uniprot") { | |
116 return(grepl(uniprot_pattern,vector)) | |
117 } | |
64 } | 118 } |
65 | 119 |
66 clusterProfiler = function() { | 120 clusterProfiler = function() { |
67 args <- commandArgs(TRUE) | 121 args <- commandArgs(TRUE) |
68 if(length(args)<1) { | 122 if(length(args)<1) { |
87 --onto_opt: ontology options | 141 --onto_opt: ontology options |
88 --go_function: groupGO/enrichGO | 142 --go_function: groupGO/enrichGO |
89 --level: 1-3 | 143 --level: 1-3 |
90 --pval_cutoff | 144 --pval_cutoff |
91 --qval_cutoff | 145 --qval_cutoff |
92 --text_output: text output filename \n") | 146 --text_output: text output filename |
147 --plot : type of visualization, dotplot or/and barplot \n") | |
93 q(save="no") | 148 q(save="no") |
94 } | 149 } |
95 # Parse arguments | 150 # Parse arguments |
96 parseArgs <- function(x) strsplit(sub("^--", "", x), "=") | 151 parseArgs <- function(x) strsplit(sub("^--", "", x), "=") |
97 argsDF <- as.data.frame(do.call("rbind", parseArgs(args))) | 152 argsDF <- as.data.frame(do.call("rbind", parseArgs(args))) |
98 args <- as.list(as.character(argsDF$V2)) | 153 args <- as.list(as.character(argsDF$V2)) |
99 names(args) <- argsDF$V1 | 154 names(args) <- argsDF$V1 |
100 #print(args) | 155 plot = unlist(strsplit(args$plot,",")) |
101 | 156 go_represent=str2bool(args$go_represent) |
157 go_enrich=str2bool(args$go_enrich) | |
158 | |
159 #save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/cluster_profiler/args.Rda") | |
160 #load("/home/dchristiany/proteore_project/ProteoRE/tools/cluster_profiler/args.Rda") | |
161 | |
162 suppressMessages(library(args$species, character.only = TRUE, quietly = TRUE)) | |
163 | |
102 # Extract OrgDb | 164 # Extract OrgDb |
103 if (args$species=="human") { | 165 if (args$species=="org.Hs.eg.db") { |
104 orgdb<-org.Hs.eg.db | 166 orgdb<-org.Hs.eg.db |
105 } | 167 } else if (args$species=="org.Mm.eg.db") { |
106 else if (args$species=="mouse") { | |
107 orgdb<-org.Mm.eg.db | 168 orgdb<-org.Mm.eg.db |
108 } | 169 } else if (args$species=="org.Rn.eg.db") { |
109 else if (args$species=="rat") { | |
110 orgdb<-org.Rn.eg.db | 170 orgdb<-org.Rn.eg.db |
111 } | 171 } |
112 | 172 |
113 # Extract input IDs | 173 # Extract input IDs |
114 input_type = args$input_type | 174 input_type = args$input_type |
175 id_type = args$id_type | |
176 | |
115 if (input_type == "text") { | 177 if (input_type == "text") { |
116 input = strsplit(args$input, "[ \t\n]+")[[1]] | 178 input = strsplit(args$input, "[ \t\n]+")[[1]] |
117 } | 179 } else if (input_type == "file") { |
118 else if (input_type == "file") { | |
119 filename = args$input | 180 filename = args$input |
120 ncol = args$ncol | 181 ncol = args$ncol |
121 # Check ncol | 182 # Check ncol |
122 if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) { | 183 if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) { |
123 stop("Please enter the right format for column number: c[number]") | 184 stop("Please enter the right format for column number: c[number]") |
124 } | 185 } else { |
125 else { | |
126 ncol = as.numeric(gsub("c", "", ncol)) | 186 ncol = as.numeric(gsub("c", "", ncol)) |
127 } | 187 } |
128 header = args$header | 188 header = str2bool(args$header) # Get file content |
129 # Get file content | 189 file = read_file(filename, header) # Extract Protein IDs list |
130 file = readfile(filename, header) | 190 input = unlist(sapply(as.character(file[,ncol]),function(x) rapply(strsplit(x,";"),c),USE.NAMES = FALSE)) |
131 # Extract Protein IDs list | 191 } |
132 input = c() | 192 |
133 for (row in as.character(file[,ncol])) { | 193 |
134 input = c(input, strsplit(row, ";")[[1]][1]) | |
135 } | |
136 } | |
137 id_type = args$id_type | |
138 ## Get input gene list from input IDs | 194 ## Get input gene list from input IDs |
139 #ID format Conversion | 195 #ID format Conversion |
140 #This case : from UNIPROT (protein id) to ENTREZ (gene id) | 196 #This case : from UNIPROT (protein id) to ENTREZ (gene id) |
141 #bitr = conversion function from clusterProfiler | 197 #bitr = conversion function from clusterProfiler |
142 if (id_type=="Uniprot") { | 198 if (id_type=="Uniprot" & any(check_ids(input,"uniprot"))) { |
199 any(check_ids(input,"uniprot")) | |
143 idFrom<-"UNIPROT" | 200 idFrom<-"UNIPROT" |
144 idTo<-"ENTREZID" | 201 idTo<-"ENTREZID" |
145 gene<-bitr(input, fromType=idFrom, toType=idTo, OrgDb=orgdb) | 202 suppressMessages(gene<-bitr(input, fromType=idFrom, toType=idTo, OrgDb=orgdb)) |
146 gene<-unique(gene$ENTREZID) | 203 gene<-unique(gene$ENTREZID) |
147 } | 204 } else if (id_type=="Entrez" & any(check_ids(input,"entrez"))) { |
148 else if (id_type=="Entrez") { | |
149 gene<-unique(input) | 205 gene<-unique(input) |
206 } else { | |
207 stop(paste(id_type,"not found in your ids list, please check your IDs in input or the selected column of your input file")) | |
150 } | 208 } |
151 | 209 |
152 ontology <- strsplit(args$onto_opt, ",")[[1]] | 210 ontology <- strsplit(args$onto_opt, ",")[[1]] |
211 | |
153 ## Extract GGO/EGO arguments | 212 ## Extract GGO/EGO arguments |
154 if (args$go_represent == "true") { | 213 if (go_represent) {level <- as.numeric(args$level)} |
155 go_represent <- args$go_represent | 214 if (go_enrich) { |
156 level <- as.numeric(args$level) | |
157 } | |
158 if (args$go_enrich == "true") { | |
159 go_enrich <- args$go_enrich | |
160 pval_cutoff <- as.numeric(args$pval_cutoff) | 215 pval_cutoff <- as.numeric(args$pval_cutoff) |
161 qval_cutoff <- as.numeric(args$qval_cutoff) | 216 qval_cutoff <- as.numeric(args$qval_cutoff) |
162 # Extract universe background genes (same as input file) | 217 # Extract universe background genes (same as input file) |
163 if (!is.null(args$universe_type)) { | 218 if (!is.null(args$universe_type)) { |
164 universe_type = args$universe_type | 219 universe_type = args$universe_type |
165 if (universe_type == "text") { | 220 if (universe_type == "text") { |
166 universe = strsplit(args$universe, "[ \t\n]+")[[1]] | 221 universe = strsplit(args$universe, "[ \t\n]+")[[1]] |
167 } | 222 } else if (universe_type == "file") { |
168 else if (universe_type == "file") { | |
169 universe_filename = args$universe | 223 universe_filename = args$universe |
170 universe_ncol = args$uncol | 224 universe_ncol = args$uncol |
171 # Check ncol | 225 # Check ncol |
172 if (! as.numeric(gsub("c", "", universe_ncol)) %% 1 == 0) { | 226 if (! as.numeric(gsub("c", "", universe_ncol)) %% 1 == 0) { |
173 stop("Please enter the right format for column number: c[number]") | 227 stop("Please enter the right format for column number: c[number]") |
174 } | 228 } else { |
175 else { | |
176 universe_ncol = as.numeric(gsub("c", "", universe_ncol)) | 229 universe_ncol = as.numeric(gsub("c", "", universe_ncol)) |
177 } | 230 } |
178 universe_header = args$uheader | 231 universe_header = str2bool(args$uheader) |
179 # Get file content | 232 # Get file content |
180 universe_file = readfile(universe_filename, universe_header) | 233 universe_file = read_file(universe_filename, universe_header) |
181 # Extract Protein IDs list | 234 # Extract Protein IDs list |
182 universe = c() | 235 universe <- sapply(universe_file[,universe_ncol], function(x) rapply(strsplit(x,";"),c),USE.NAMES = FALSE) |
183 for (row in as.character(universe_file[,universe_ncol])) { | |
184 universe = c(universe, strsplit(row, ";")[[1]][1]) | |
185 } | |
186 } | 236 } |
187 universe_id_type = args$universe_id_type | 237 universe_id_type = args$universe_id_type |
188 ##to initialize | 238 ##to initialize |
189 if (universe_id_type=="Uniprot") { | 239 if (universe_id_type=="Uniprot" & any(check_ids(universe,"uniprot"))) { |
190 idFrom<-"UNIPROT" | 240 idFrom<-"UNIPROT" |
191 idTo<-"ENTREZID" | 241 idTo<-"ENTREZID" |
192 universe_gene<-bitr(universe, fromType=idFrom, toType=idTo, OrgDb=orgdb) | 242 suppressMessages(universe_gene<-bitr(universe, fromType=idFrom, toType=idTo, OrgDb=orgdb)) |
193 universe_gene<-unique(universe_gene$ENTREZID) | 243 universe_gene<-unique(universe_gene$ENTREZID) |
194 } | 244 } else if (universe_id_type=="Entrez" & any(check_ids(universe,"entrez"))) { |
195 else if (universe_id_type=="Entrez") { | 245 universe_gene<-unique(unlist(universe)) |
196 universe_gene<-unique(universe) | 246 } else { |
197 } | 247 if (universe_type=="text"){ |
198 } | 248 print(paste(universe_id_type,"not found in your background IDs list",sep=" ")) |
199 else { | 249 } else { |
250 print(paste(universe_id_type,"not found in the column",universe_ncol,"of your background IDs file",sep=" ")) | |
251 } | |
252 universe_gene = NULL | |
253 } | |
254 } else { | |
200 universe_gene = NULL | 255 universe_gene = NULL |
201 } | 256 } |
257 } else { | |
258 universe_gene = NULL | |
202 } | 259 } |
203 | 260 |
204 ##enrichGO : GO over-representation test | 261 ##enrichGO : GO over-representation test |
205 for (onto in ontology) { | 262 for (onto in ontology) { |
206 if (args$go_represent == "true") { | 263 if (go_represent) { |
207 ggo<-repartition.GO(gene, orgdb, onto, level, readable=TRUE) | 264 ggo<-repartition.GO(gene, orgdb, onto, level, readable=TRUE) |
208 write.table(ggo, args$text_output, append = TRUE, sep="\t", row.names = FALSE, quote=FALSE) | 265 if (is.list(ggo)){ggo <- as.data.frame(apply(ggo, c(1,2), function(x) gsub("^$|^ $", NA, x)))} #convert "" and " " to NA |
209 } | 266 output_path = paste("cluster_profiler_GGO_",onto,".tsv",sep="") |
210 if (args$go_enrich == "true") { | 267 write.table(ggo, output_path, sep="\t", row.names = FALSE, quote = FALSE ) |
211 ego<-enrich.GO(gene, universe_gene, orgdb, onto, pval_cutoff, qval_cutoff) | 268 } |
212 write.table(ego, args$text_output, append = TRUE, sep="\t", row.names = FALSE, quote=FALSE) | 269 |
270 if (go_enrich) { | |
271 ego<-enrich.GO(gene, universe_gene, orgdb, onto, pval_cutoff, qval_cutoff,plot) | |
272 if (is.list(ego)){ego <- as.data.frame(apply(ego, c(1,2), function(x) gsub("^$|^ $", NA, x)))} #convert "" and " " to NA | |
273 output_path = paste("cluster_profiler_EGO_",onto,".tsv",sep="") | |
274 write.table(ego, output_path, sep="\t", row.names = FALSE, quote = FALSE ) | |
213 } | 275 } |
214 } | 276 } |
215 } | 277 } |
216 | 278 |
217 clusterProfiler() | 279 clusterProfiler() |