comparison GO-enrich.R @ 7:4609346d8108 draft

planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba
author proteore
date Tue, 18 Dec 2018 09:21:32 -0500
parents 5e16cec55146
children b29255864039
comparison
equal deleted inserted replaced
6:5e16cec55146 7:4609346d8108
1 library(clusterProfiler) 1 options(warn=-1) #TURN OFF WARNINGS !!!!!!
2 2 suppressMessages(library(clusterProfiler,quietly = TRUE))
3 #library(org.Sc.sgd.db)
4 library(org.Hs.eg.db)
5 library(org.Mm.eg.db)
6 3
7 # Read file and return file content as data.frame 4 # Read file and return file content as data.frame
8 readfile = function(filename, header) { 5 read_file <- function(path,header){
9 if (header == "true") { 6 file <- try(read.csv(path,header=header, sep="\t",stringsAsFactors = FALSE, quote="",check.names = F),silent=TRUE)
10 # Read only first line of the file as header: 7 if (inherits(file,"try-error")){
11 headers <- read.table(filename, nrows = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "") 8 stop("File not found !")
12 #Read the data of the files (skipping the first row) 9 }else{
13 file <- read.table(filename, skip = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "")
14 # Remove empty rows
15 file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE] 10 file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE]
16 #And assign the header to the data 11 return(file)
17 names(file) <- headers 12 }
18 } 13 }
19 else { 14
20 file <- read.table(filename, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "") 15 #return the number of character from the longest description found (from the 10 first)
21 # Remove empty rows 16 max_str_length_10_first <- function(vector){
22 file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE] 17 vector <- as.vector(vector)
23 } 18 nb_description = length(vector)
24 return(file) 19 if (nb_description >= 10){nb_description=10}
20 return(max(nchar(vector[1:nb_description])))
21 }
22
23 str2bool <- function(x){
24 if (any(is.element(c("t","true"),tolower(x)))){
25 return (TRUE)
26 }else if (any(is.element(c("f","false"),tolower(x)))){
27 return (FALSE)
28 }else{
29 return(NULL)
30 }
31 }
32
33 #used before the limit was set to 50 characters
34 width_by_max_char <- function (nb_max_char) {
35 if (nb_max_char < 50 ){
36 width=600
37 } else if (nb_max_char < 75) {
38 width=800
39 } else if (nb_max_char < 100) {
40 width=900
41 } else {
42 width=1000
43 }
44 return (width)
25 } 45 }
26 46
27 repartition.GO <- function(geneid, orgdb, ontology, level=3, readable=TRUE) { 47 repartition.GO <- function(geneid, orgdb, ontology, level=3, readable=TRUE) {
28 ggo<-groupGO(gene=geneid, 48 ggo<-groupGO(gene=geneid,
29 OrgDb = orgdb, 49 OrgDb = orgdb,
30 ont=ontology, 50 ont=ontology,
31 level=level, 51 level=level,
32 readable=TRUE) 52 readable=TRUE)
33 name <- paste("GGO.", ontology, ".png", sep = "") 53
34 png(name) 54 if (length(ggo@result$ID) > 0 ) {
35 p <- barplot(ggo, showCategory=10) 55 ggo@result$Description <- sapply(as.vector(ggo@result$Description), function(x) {ifelse(nchar(x)>50, substr(x,1,50),x)},USE.NAMES = FALSE)
36 print(p) 56 #nb_max_char = max_str_length_10_first(ggo$Description)
37 dev.off() 57 #width = width_by_max_char(nb_max_char)
38 return(ggo) 58 name <- paste("GGO_", ontology, "_bar-plot", sep = "")
59 png(name,height = 720, width = 600)
60 p <- barplot(ggo, showCategory=10)
61 print(p)
62 dev.off()
63 ggo <- as.data.frame(ggo)
64 return(ggo)
65 }
39 } 66 }
40 67
41 # GO over-representation test 68 # GO over-representation test
42 enrich.GO <- function(geneid, universe, orgdb, ontology, pval_cutoff, qval_cutoff) { 69 enrich.GO <- function(geneid, universe, orgdb, ontology, pval_cutoff, qval_cutoff,plot) {
43 ego<-enrichGO(gene=geneid, 70 ego<-enrichGO(gene=geneid,
44 universe=universe, 71 universe=universe,
45 OrgDb=orgdb, 72 OrgDb=orgdb,
46 keytype="ENTREZID",
47 ont=ontology, 73 ont=ontology,
48 pAdjustMethod="BH", 74 pAdjustMethod="BH",
49 pvalueCutoff=pval_cutoff, 75 pvalueCutoff=pval_cutoff,
50 qvalueCutoff=qval_cutoff, 76 qvalueCutoff=qval_cutoff,
51 readable=TRUE) 77 readable=TRUE)
78
52 # Plot bar & dot plots 79 # Plot bar & dot plots
53 bar_name <- paste("EGO.", ontology, ".bar.png", sep = "") 80 #if there are enriched GopTerms
54 png(bar_name) 81 if (length(ego$ID)>0){
55 p <- barplot(ego) 82
56 print(p) 83 ego@result$Description <- sapply(ego@result$Description, function(x) {ifelse(nchar(x)>50, substr(x,1,50),x)},USE.NAMES = FALSE)
57 dev.off() 84 #nb_max_char = max_str_length_10_first(ego$Description)
58 dot_name <- paste("EGO.", ontology, ".dot.png", sep = "") 85 #width = width_by_max_char(nb_max_char)
59 png(dot_name) 86
60 p <- dotplot(ego, showCategory=10) 87 if ("dotplot" %in% plot ){
61 print(p) 88 dot_name <- paste("EGO_", ontology, "_dot-plot", sep = "")
62 dev.off() 89 png(dot_name,height = 720, width = 600)
63 return(ego) 90 p <- dotplot(ego, showCategory=10)
91 print(p)
92 dev.off()
93 }
94
95 if ("barplot" %in% plot ){
96 bar_name <- paste("EGO_", ontology, "_bar-plot", sep = "")
97 png(bar_name,height = 720, width = 600)
98 p <- barplot(ego, showCategory=10)
99 print(p)
100 dev.off()
101
102 }
103 ego <- as.data.frame(ego)
104 return(ego)
105 } else {
106 warning(paste("No Go terms enriched (EGO) found for ",ontology,"ontology"),immediate. = TRUE,noBreaks. = TRUE,call. = FALSE)
107 }
108 }
109
110 check_ids <- function(vector,type) {
111 uniprot_pattern = "^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$"
112 entrez_id = "^([0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+)$"
113 if (type == "entrez")
114 return(grepl(entrez_id,vector))
115 else if (type == "uniprot") {
116 return(grepl(uniprot_pattern,vector))
117 }
64 } 118 }
65 119
66 clusterProfiler = function() { 120 clusterProfiler = function() {
67 args <- commandArgs(TRUE) 121 args <- commandArgs(TRUE)
68 if(length(args)<1) { 122 if(length(args)<1) {
87 --onto_opt: ontology options 141 --onto_opt: ontology options
88 --go_function: groupGO/enrichGO 142 --go_function: groupGO/enrichGO
89 --level: 1-3 143 --level: 1-3
90 --pval_cutoff 144 --pval_cutoff
91 --qval_cutoff 145 --qval_cutoff
92 --text_output: text output filename \n") 146 --text_output: text output filename
147 --plot : type of visualization, dotplot or/and barplot \n")
93 q(save="no") 148 q(save="no")
94 } 149 }
95 # Parse arguments 150 # Parse arguments
96 parseArgs <- function(x) strsplit(sub("^--", "", x), "=") 151 parseArgs <- function(x) strsplit(sub("^--", "", x), "=")
97 argsDF <- as.data.frame(do.call("rbind", parseArgs(args))) 152 argsDF <- as.data.frame(do.call("rbind", parseArgs(args)))
98 args <- as.list(as.character(argsDF$V2)) 153 args <- as.list(as.character(argsDF$V2))
99 names(args) <- argsDF$V1 154 names(args) <- argsDF$V1
100 #print(args) 155 plot = unlist(strsplit(args$plot,","))
101 156 go_represent=str2bool(args$go_represent)
157 go_enrich=str2bool(args$go_enrich)
158
159 #save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/cluster_profiler/args.Rda")
160 #load("/home/dchristiany/proteore_project/ProteoRE/tools/cluster_profiler/args.Rda")
161
162 suppressMessages(library(args$species, character.only = TRUE, quietly = TRUE))
163
102 # Extract OrgDb 164 # Extract OrgDb
103 if (args$species=="human") { 165 if (args$species=="org.Hs.eg.db") {
104 orgdb<-org.Hs.eg.db 166 orgdb<-org.Hs.eg.db
105 } 167 } else if (args$species=="org.Mm.eg.db") {
106 else if (args$species=="mouse") {
107 orgdb<-org.Mm.eg.db 168 orgdb<-org.Mm.eg.db
108 } 169 } else if (args$species=="org.Rn.eg.db") {
109 else if (args$species=="rat") {
110 orgdb<-org.Rn.eg.db 170 orgdb<-org.Rn.eg.db
111 } 171 }
112 172
113 # Extract input IDs 173 # Extract input IDs
114 input_type = args$input_type 174 input_type = args$input_type
175 id_type = args$id_type
176
115 if (input_type == "text") { 177 if (input_type == "text") {
116 input = strsplit(args$input, "[ \t\n]+")[[1]] 178 input = strsplit(args$input, "[ \t\n]+")[[1]]
117 } 179 } else if (input_type == "file") {
118 else if (input_type == "file") {
119 filename = args$input 180 filename = args$input
120 ncol = args$ncol 181 ncol = args$ncol
121 # Check ncol 182 # Check ncol
122 if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) { 183 if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) {
123 stop("Please enter the right format for column number: c[number]") 184 stop("Please enter the right format for column number: c[number]")
124 } 185 } else {
125 else {
126 ncol = as.numeric(gsub("c", "", ncol)) 186 ncol = as.numeric(gsub("c", "", ncol))
127 } 187 }
128 header = args$header 188 header = str2bool(args$header) # Get file content
129 # Get file content 189 file = read_file(filename, header) # Extract Protein IDs list
130 file = readfile(filename, header) 190 input = unlist(sapply(as.character(file[,ncol]),function(x) rapply(strsplit(x,";"),c),USE.NAMES = FALSE))
131 # Extract Protein IDs list 191 }
132 input = c() 192
133 for (row in as.character(file[,ncol])) { 193
134 input = c(input, strsplit(row, ";")[[1]][1])
135 }
136 }
137 id_type = args$id_type
138 ## Get input gene list from input IDs 194 ## Get input gene list from input IDs
139 #ID format Conversion 195 #ID format Conversion
140 #This case : from UNIPROT (protein id) to ENTREZ (gene id) 196 #This case : from UNIPROT (protein id) to ENTREZ (gene id)
141 #bitr = conversion function from clusterProfiler 197 #bitr = conversion function from clusterProfiler
142 if (id_type=="Uniprot") { 198 if (id_type=="Uniprot" & any(check_ids(input,"uniprot"))) {
199 any(check_ids(input,"uniprot"))
143 idFrom<-"UNIPROT" 200 idFrom<-"UNIPROT"
144 idTo<-"ENTREZID" 201 idTo<-"ENTREZID"
145 gene<-bitr(input, fromType=idFrom, toType=idTo, OrgDb=orgdb) 202 suppressMessages(gene<-bitr(input, fromType=idFrom, toType=idTo, OrgDb=orgdb))
146 gene<-unique(gene$ENTREZID) 203 gene<-unique(gene$ENTREZID)
147 } 204 } else if (id_type=="Entrez" & any(check_ids(input,"entrez"))) {
148 else if (id_type=="Entrez") {
149 gene<-unique(input) 205 gene<-unique(input)
206 } else {
207 stop(paste(id_type,"not found in your ids list, please check your IDs in input or the selected column of your input file"))
150 } 208 }
151 209
152 ontology <- strsplit(args$onto_opt, ",")[[1]] 210 ontology <- strsplit(args$onto_opt, ",")[[1]]
211
153 ## Extract GGO/EGO arguments 212 ## Extract GGO/EGO arguments
154 if (args$go_represent == "true") { 213 if (go_represent) {level <- as.numeric(args$level)}
155 go_represent <- args$go_represent 214 if (go_enrich) {
156 level <- as.numeric(args$level)
157 }
158 if (args$go_enrich == "true") {
159 go_enrich <- args$go_enrich
160 pval_cutoff <- as.numeric(args$pval_cutoff) 215 pval_cutoff <- as.numeric(args$pval_cutoff)
161 qval_cutoff <- as.numeric(args$qval_cutoff) 216 qval_cutoff <- as.numeric(args$qval_cutoff)
162 # Extract universe background genes (same as input file) 217 # Extract universe background genes (same as input file)
163 if (!is.null(args$universe_type)) { 218 if (!is.null(args$universe_type)) {
164 universe_type = args$universe_type 219 universe_type = args$universe_type
165 if (universe_type == "text") { 220 if (universe_type == "text") {
166 universe = strsplit(args$universe, "[ \t\n]+")[[1]] 221 universe = strsplit(args$universe, "[ \t\n]+")[[1]]
167 } 222 } else if (universe_type == "file") {
168 else if (universe_type == "file") {
169 universe_filename = args$universe 223 universe_filename = args$universe
170 universe_ncol = args$uncol 224 universe_ncol = args$uncol
171 # Check ncol 225 # Check ncol
172 if (! as.numeric(gsub("c", "", universe_ncol)) %% 1 == 0) { 226 if (! as.numeric(gsub("c", "", universe_ncol)) %% 1 == 0) {
173 stop("Please enter the right format for column number: c[number]") 227 stop("Please enter the right format for column number: c[number]")
174 } 228 } else {
175 else {
176 universe_ncol = as.numeric(gsub("c", "", universe_ncol)) 229 universe_ncol = as.numeric(gsub("c", "", universe_ncol))
177 } 230 }
178 universe_header = args$uheader 231 universe_header = str2bool(args$uheader)
179 # Get file content 232 # Get file content
180 universe_file = readfile(universe_filename, universe_header) 233 universe_file = read_file(universe_filename, universe_header)
181 # Extract Protein IDs list 234 # Extract Protein IDs list
182 universe = c() 235 universe <- sapply(universe_file[,universe_ncol], function(x) rapply(strsplit(x,";"),c),USE.NAMES = FALSE)
183 for (row in as.character(universe_file[,universe_ncol])) {
184 universe = c(universe, strsplit(row, ";")[[1]][1])
185 }
186 } 236 }
187 universe_id_type = args$universe_id_type 237 universe_id_type = args$universe_id_type
188 ##to initialize 238 ##to initialize
189 if (universe_id_type=="Uniprot") { 239 if (universe_id_type=="Uniprot" & any(check_ids(universe,"uniprot"))) {
190 idFrom<-"UNIPROT" 240 idFrom<-"UNIPROT"
191 idTo<-"ENTREZID" 241 idTo<-"ENTREZID"
192 universe_gene<-bitr(universe, fromType=idFrom, toType=idTo, OrgDb=orgdb) 242 suppressMessages(universe_gene<-bitr(universe, fromType=idFrom, toType=idTo, OrgDb=orgdb))
193 universe_gene<-unique(universe_gene$ENTREZID) 243 universe_gene<-unique(universe_gene$ENTREZID)
194 } 244 } else if (universe_id_type=="Entrez" & any(check_ids(universe,"entrez"))) {
195 else if (universe_id_type=="Entrez") { 245 universe_gene<-unique(unlist(universe))
196 universe_gene<-unique(universe) 246 } else {
197 } 247 if (universe_type=="text"){
198 } 248 print(paste(universe_id_type,"not found in your background IDs list",sep=" "))
199 else { 249 } else {
250 print(paste(universe_id_type,"not found in the column",universe_ncol,"of your background IDs file",sep=" "))
251 }
252 universe_gene = NULL
253 }
254 } else {
200 universe_gene = NULL 255 universe_gene = NULL
201 } 256 }
257 } else {
258 universe_gene = NULL
202 } 259 }
203 260
204 ##enrichGO : GO over-representation test 261 ##enrichGO : GO over-representation test
205 for (onto in ontology) { 262 for (onto in ontology) {
206 if (args$go_represent == "true") { 263 if (go_represent) {
207 ggo<-repartition.GO(gene, orgdb, onto, level, readable=TRUE) 264 ggo<-repartition.GO(gene, orgdb, onto, level, readable=TRUE)
208 write.table(ggo, args$text_output, append = TRUE, sep="\t", row.names = FALSE, quote=FALSE) 265 if (is.list(ggo)){ggo <- as.data.frame(apply(ggo, c(1,2), function(x) gsub("^$|^ $", NA, x)))} #convert "" and " " to NA
209 } 266 output_path = paste("cluster_profiler_GGO_",onto,".tsv",sep="")
210 if (args$go_enrich == "true") { 267 write.table(ggo, output_path, sep="\t", row.names = FALSE, quote = FALSE )
211 ego<-enrich.GO(gene, universe_gene, orgdb, onto, pval_cutoff, qval_cutoff) 268 }
212 write.table(ego, args$text_output, append = TRUE, sep="\t", row.names = FALSE, quote=FALSE) 269
270 if (go_enrich) {
271 ego<-enrich.GO(gene, universe_gene, orgdb, onto, pval_cutoff, qval_cutoff,plot)
272 if (is.list(ego)){ego <- as.data.frame(apply(ego, c(1,2), function(x) gsub("^$|^ $", NA, x)))} #convert "" and " " to NA
273 output_path = paste("cluster_profiler_EGO_",onto,".tsv",sep="")
274 write.table(ego, output_path, sep="\t", row.names = FALSE, quote = FALSE )
213 } 275 }
214 } 276 }
215 } 277 }
216 278
217 clusterProfiler() 279 clusterProfiler()