proteore_clusterprofiler: GO-enrich.R comparison

comparison GO-enrich.R @ 7:4609346d8108 draft

planemo upload commit 9af2cf12c26c94e7206751ccf101a3368f92d0ba

author	proteore
date	Tue, 18 Dec 2018 09:21:32 -0500
parents	5e16cec55146
children	b29255864039

comparison

equal deleted inserted replaced

-:5e16cec55146
+:4609346d8108
-library(clusterProfiler)
+options(warn=-1)  #TURN OFF WARNINGS !!!!!!
+suppressMessages(library(clusterProfiler,quietly = TRUE))
-#library(org.Sc.sgd.db)
-library(org.Hs.eg.db)
-library(org.Mm.eg.db)
 # Read file and return file content as data.frame
-readfile = function(filename, header) {
+read_file <- function(path,header){
-if (header == "true") {
+file <- try(read.csv(path,header=header, sep="\t",stringsAsFactors = FALSE, quote="",check.names = F),silent=TRUE)
-# Read only first line of the file as header:
+if (inherits(file,"try-error")){
-headers <- read.table(filename, nrows = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "")
+stop("File not found !")
-#Read the data of the files (skipping the first row)
+}else{
-file <- read.table(filename, skip = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "")
-# Remove empty rows
 file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE]
-#And assign the header to the data
+return(file)
-names(file) <- headers
+}
 }
-else {
-file <- read.table(filename, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE, na.strings=c("", "NA"), blank.lines.skip = TRUE, quote = "")
+#return the number of character from the longest description found (from the 10 first)
-# Remove empty rows
+max_str_length_10_first <- function(vector){
-file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE]
+vector <- as.vector(vector)
-}
+nb_description = length(vector)
-return(file)
+if (nb_description >= 10){nb_description=10}
+return(max(nchar(vector[1:nb_description])))
+}
+str2bool <- function(x){
+if (any(is.element(c("t","true"),tolower(x)))){
+return (TRUE)
+}else if (any(is.element(c("f","false"),tolower(x)))){
+return (FALSE)
+}else{
+return(NULL)
+}
+}
+#used before the limit was set to 50 characters
+width_by_max_char <- function (nb_max_char) {
+if (nb_max_char < 50 ){
+width=600
+} else if (nb_max_char < 75) {
+width=800
+} else if (nb_max_char < 100) {
+width=900
+} else {
+width=1000
+}
+return (width)
 }
 repartition.GO <- function(geneid, orgdb, ontology, level=3, readable=TRUE) {
 ggo<-groupGO(gene=geneid,
 OrgDb = orgdb,
 ont=ontology,
 level=level,
 readable=TRUE)
-name <- paste("GGO.", ontology, ".png", sep = "")
-png(name)
+if (length(ggo@result$ID) > 0 ) {
-p <- barplot(ggo, showCategory=10)
+ggo@result$Description <- sapply(as.vector(ggo@result$Description), function(x) {ifelse(nchar(x)>50, substr(x,1,50),x)},USE.NAMES = FALSE)
-print(p)
+#nb_max_char = max_str_length_10_first(ggo$Description)
-dev.off()
+#width = width_by_max_char(nb_max_char)
-return(ggo)
+name <- paste("GGO_", ontology, "_bar-plot", sep = "")
+png(name,height = 720, width = 600)
+p <- barplot(ggo, showCategory=10)
+print(p)
+dev.off()
+ggo <- as.data.frame(ggo)
+return(ggo)
+}
 }
 # GO over-representation test
-enrich.GO <- function(geneid, universe, orgdb, ontology, pval_cutoff, qval_cutoff) {
+enrich.GO <- function(geneid, universe, orgdb, ontology, pval_cutoff, qval_cutoff,plot) {
 ego<-enrichGO(gene=geneid,
 universe=universe,
 OrgDb=orgdb,
-keytype="ENTREZID",
 ont=ontology,
 pAdjustMethod="BH",
 pvalueCutoff=pval_cutoff,
 qvalueCutoff=qval_cutoff,
 readable=TRUE)
 # Plot bar & dot plots
-bar_name <- paste("EGO.", ontology, ".bar.png", sep = "")
+#if there are enriched GopTerms
-png(bar_name)
+if (length(ego$ID)>0){
-p <- barplot(ego)
-print(p)
+ego@result$Description <- sapply(ego@result$Description, function(x) {ifelse(nchar(x)>50, substr(x,1,50),x)},USE.NAMES = FALSE)
-dev.off()
+#nb_max_char = max_str_length_10_first(ego$Description)
-dot_name <- paste("EGO.", ontology, ".dot.png", sep = "")
+#width = width_by_max_char(nb_max_char)
-png(dot_name)
-p <- dotplot(ego, showCategory=10)
+if ("dotplot" %in% plot ){
-print(p)
+dot_name <- paste("EGO_", ontology, "_dot-plot", sep = "")
-dev.off()
+png(dot_name,height = 720, width = 600)
-return(ego)
+p <- dotplot(ego, showCategory=10)
+print(p)
+dev.off()
+}
+if ("barplot" %in% plot ){
+bar_name <- paste("EGO_", ontology, "_bar-plot", sep = "")
+png(bar_name,height = 720, width = 600)
+p <- barplot(ego, showCategory=10)
+print(p)
+dev.off()
+}
+ego <- as.data.frame(ego)
+return(ego)
+} else {
+warning(paste("No Go terms enriched (EGO) found for ",ontology,"ontology"),immediate. = TRUE,noBreaks. = TRUE,call. = FALSE)
+}
+}
+check_ids <- function(vector,type) {
+uniprot_pattern = "^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$"
+entrez_id = "^([0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+)$"
+if (type == "entrez")
+return(grepl(entrez_id,vector))
+else if (type == "uniprot") {
+return(grepl(uniprot_pattern,vector))
+}
 }
 clusterProfiler = function() {
 args <- commandArgs(TRUE)
 if(length(args)<1) {
 --onto_opt: ontology options
 --go_function: groupGO/enrichGO
 --level: 1-3
 --pval_cutoff
 --qval_cutoff
---text_output: text output filename \n")
+--text_output: text output filename
+--plot : type of visualization, dotplot or/and barplot \n")
 q(save="no")
 }
 # Parse arguments
 parseArgs <- function(x) strsplit(sub("^--", "", x), "=")
 argsDF <- as.data.frame(do.call("rbind", parseArgs(args)))
 args <- as.list(as.character(argsDF$V2))
 names(args) <- argsDF$V1
-#print(args)
+plot = unlist(strsplit(args$plot,","))
+go_represent=str2bool(args$go_represent)
+go_enrich=str2bool(args$go_enrich)
+#save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/cluster_profiler/args.Rda")
+#load("/home/dchristiany/proteore_project/ProteoRE/tools/cluster_profiler/args.Rda")
+suppressMessages(library(args$species, character.only = TRUE, quietly = TRUE))
 # Extract OrgDb
-if (args$species=="human") {
+if (args$species=="org.Hs.eg.db") {
 orgdb<-org.Hs.eg.db
-}
+} else if (args$species=="org.Mm.eg.db") {
-else if (args$species=="mouse") {
 orgdb<-org.Mm.eg.db
-}
+} else if (args$species=="org.Rn.eg.db") {
-else if (args$species=="rat") {
 orgdb<-org.Rn.eg.db
 }
 # Extract input IDs
 input_type = args$input_type
+id_type = args$id_type
 if (input_type == "text") {
 input = strsplit(args$input, "[ \t\n]+")[[1]]
-}
+} else if (input_type == "file") {
-else if (input_type == "file") {
 filename = args$input
 ncol = args$ncol
 # Check ncol
 if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) {
 stop("Please enter the right format for column number: c[number]")
-}
+} else {
-else {
 ncol = as.numeric(gsub("c", "", ncol))
 }
-header = args$header
+header = str2bool(args$header)                  # Get file content
-# Get file content
+file = read_file(filename, header)              # Extract Protein IDs list
-file = readfile(filename, header)
+input =  unlist(sapply(as.character(file[,ncol]),function(x) rapply(strsplit(x,";"),c),USE.NAMES = FALSE))
-# Extract Protein IDs list
+}
-input = c()
-for (row in as.character(file[,ncol])) {
-input = c(input, strsplit(row, ";")[[1]][1])
-}
-}
-id_type = args$id_type
 ## Get input gene list from input IDs
 #ID format Conversion
 #This case : from UNIPROT (protein id) to ENTREZ (gene id)
 #bitr = conversion function from clusterProfiler
-if (id_type=="Uniprot") {
+if (id_type=="Uniprot" & any(check_ids(input,"uniprot"))) {
+any(check_ids(input,"uniprot"))
 idFrom<-"UNIPROT"
 idTo<-"ENTREZID"
-gene<-bitr(input, fromType=idFrom, toType=idTo, OrgDb=orgdb)
+suppressMessages(gene<-bitr(input, fromType=idFrom, toType=idTo, OrgDb=orgdb))
 gene<-unique(gene$ENTREZID)
-}
+} else if (id_type=="Entrez" & any(check_ids(input,"entrez"))) {
-else if (id_type=="Entrez") {
 gene<-unique(input)
+} else {
+stop(paste(id_type,"not found in your ids list, please check your IDs in input or the selected column of your input file"))
 }
 ontology <- strsplit(args$onto_opt, ",")[[1]]
 ## Extract GGO/EGO arguments
-if (args$go_represent == "true") {
+if (go_represent) {level <- as.numeric(args$level)}
-go_represent <- args$go_represent
+if (go_enrich) {
-level <- as.numeric(args$level)
-}
-if (args$go_enrich == "true") {
-go_enrich <- args$go_enrich
 pval_cutoff <- as.numeric(args$pval_cutoff)
 qval_cutoff <- as.numeric(args$qval_cutoff)
 # Extract universe background genes (same as input file)
 if (!is.null(args$universe_type)) {
 universe_type = args$universe_type
 if (universe_type == "text") {
 universe = strsplit(args$universe, "[ \t\n]+")[[1]]
-}
+} else if (universe_type == "file") {
-else if (universe_type == "file") {
 universe_filename = args$universe
 universe_ncol = args$uncol
 # Check ncol
 if (! as.numeric(gsub("c", "", universe_ncol)) %% 1 == 0) {
 stop("Please enter the right format for column number: c[number]")
-}
+} else {
-else {
 universe_ncol = as.numeric(gsub("c", "", universe_ncol))
 }
-universe_header = args$uheader
+universe_header = str2bool(args$uheader)
 # Get file content
-universe_file = readfile(universe_filename, universe_header)
+universe_file = read_file(universe_filename, universe_header)
 # Extract Protein IDs list
-universe = c()
+universe <- sapply(universe_file[,universe_ncol], function(x) rapply(strsplit(x,";"),c),USE.NAMES = FALSE)
-for (row in as.character(universe_file[,universe_ncol])) {
-universe = c(universe, strsplit(row, ";")[[1]][1])
-}
 }
 universe_id_type = args$universe_id_type
 ##to initialize
-if (universe_id_type=="Uniprot") {
+if (universe_id_type=="Uniprot" & any(check_ids(universe,"uniprot"))) {
 idFrom<-"UNIPROT"
 idTo<-"ENTREZID"
-universe_gene<-bitr(universe, fromType=idFrom, toType=idTo, OrgDb=orgdb)
+suppressMessages(universe_gene<-bitr(universe, fromType=idFrom, toType=idTo, OrgDb=orgdb))
 universe_gene<-unique(universe_gene$ENTREZID)
-}
+} else if (universe_id_type=="Entrez" & any(check_ids(universe,"entrez"))) {
-else if (universe_id_type=="Entrez") {
+universe_gene<-unique(unlist(universe))
-universe_gene<-unique(universe)
+} else {
-}
+if (universe_type=="text"){
-}
+print(paste(universe_id_type,"not found in your background IDs list",sep=" "))
-else {
+} else {
+print(paste(universe_id_type,"not found in the column",universe_ncol,"of your background IDs file",sep=" "))
+}
+universe_gene = NULL
+}
+} else {
 universe_gene = NULL
 }
+} else {
+universe_gene = NULL
 }
 ##enrichGO : GO over-representation test
 for (onto in ontology) {
-if (args$go_represent == "true") {
+if (go_represent) {
 ggo<-repartition.GO(gene, orgdb, onto, level, readable=TRUE)
-write.table(ggo, args$text_output, append = TRUE, sep="\t", row.names = FALSE, quote=FALSE)
+if (is.list(ggo)){ggo <- as.data.frame(apply(ggo, c(1,2), function(x) gsub("^$|^ $", NA, x)))}  #convert "" and " " to NA
-}
+output_path = paste("cluster_profiler_GGO_",onto,".tsv",sep="")
-if (args$go_enrich == "true") {
+write.table(ggo, output_path, sep="\t", row.names = FALSE, quote = FALSE )
-ego<-enrich.GO(gene, universe_gene, orgdb, onto, pval_cutoff, qval_cutoff)
+}
-write.table(ego, args$text_output, append = TRUE, sep="\t", row.names = FALSE, quote=FALSE)
+if (go_enrich) {
+ego<-enrich.GO(gene, universe_gene, orgdb, onto, pval_cutoff, qval_cutoff,plot)
+if (is.list(ego)){ego <- as.data.frame(apply(ego, c(1,2), function(x) gsub("^$|^ $", NA, x)))}  #convert "" and " " to NA
+output_path = paste("cluster_profiler_EGO_",onto,".tsv",sep="")
+write.table(ego, output_path, sep="\t", row.names = FALSE, quote = FALSE )
 }
 }
 }
 clusterProfiler()

Mercurial > repos > proteore > proteore_clusterprofiler

comparison GO-enrich.R @ 7:4609346d8108 draft