diff GO-enrich.R @ 13:f6107b8ae8f8 draft default tip

"planemo upload commit be9070db4a3b178ab45ecd9c9c3ab369240f7617-dirty"
author proteore
date Fri, 09 Apr 2021 14:39:05 +0000
parents d951677a50d4
children
line wrap: on
line diff
--- a/GO-enrich.R	Fri Jan 24 05:12:09 2020 -0500
+++ b/GO-enrich.R	Fri Apr 09 14:39:05 2021 +0000
@@ -1,63 +1,77 @@
-options(warn=-1)  #TURN OFF WARNINGS !!!!!!
-suppressMessages(library(clusterProfiler,quietly = TRUE))
+options(warn = -1)  #TURN OFF WARNINGS !!!!!!
+suppressMessages(library(clusterProfiler, quietly = TRUE))
 
 # Read file and return file content as data.frame
-read_file <- function(path,header){
-  file <- try(read.csv(path,header=header, sep="\t",stringsAsFactors = FALSE, quote="",check.names = F),silent=TRUE)
-  if (inherits(file,"try-error")){
+read_file <- function(path, header) {
+  file <- try(read.csv(path, header = header, sep = "\t",
+                       stringsAsFactors = FALSE, quote = "", check.names = F),
+                      silent = TRUE)
+  if (inherits(file, "try-error")) {
     stop("File not found !")
   }else{
-    file <- file[!apply(is.na(file) | file == "", 1, all), , drop=FALSE]
+    file <- file[!apply(is.na(file) | file == "", 1, all), , drop = FALSE]
     return(file)
   }
 }
 
-#return the number of character from the longest description found (from the 10 first)
-max_str_length_10_first <- function(vector){
+
+#return the number of character from the longest description found
+#(from the 10 first)
+max_str_length_10_first <- function(vector) {
   vector <- as.vector(vector)
-  nb_description = length(vector)
-  if (nb_description >= 10){nb_description=10}
+  nb_description <- length(vector)
+  if (nb_description >= 10) {
+    nb_description <- 10
+    }
+
   return(max(nchar(vector[1:nb_description])))
 }
 
-str2bool <- function(x){
-  if (any(is.element(c("t","true"),tolower(x)))){
-    return (TRUE)
-  }else if (any(is.element(c("f","false"),tolower(x)))){
-    return (FALSE)
+str2bool <- function(x) {
+  if (any(is.element(c("t", "true"), tolower(x)))) {
+    return(TRUE)
+  }else if (any(is.element(c("f", "false"), tolower(x)))) {
+    return(FALSE)
+
   }else{
     return(NULL)
   }
 }
 
 #used before the limit was set to 50 characters
-width_by_max_char <- function (nb_max_char) {
-  if (nb_max_char < 50 ){
-    width=600
+width_by_max_char <- function(nb_max_char) {
+  if (nb_max_char < 50) {
+    width <- 600
   } else if (nb_max_char < 75) {
-    width=800
+    width <- 800
   } else if (nb_max_char < 100) {
-    width=900
+    width <- 900
   } else {
-    width=1000
+    width <- 1000
   }
-  return (width)
+  return(width)
 }
 
-repartition_GO <- function(geneid, orgdb, ontology, level=3, readable=TRUE) {
-  ggo<-groupGO(gene=geneid, 
-               OrgDb = orgdb, 
-               ont=ontology, 
-               level=level, 
-               readable=TRUE)
+
+repartition_go <- function(geneid, orgdb, ontology,
+                           level = 3, readable = TRUE) {
+  ggo <- groupGO(gene = geneid,
+                OrgDb = orgdb,
+                ont = ontology,
+                level = level,
+                readable = TRUE)
+
 
-  if (length(ggo@result$ID) > 0 ) {
-    ggo@result$Description <- sapply(as.vector(ggo@result$Description), function(x) {ifelse(nchar(x)>50, substr(x,1,50),x)},USE.NAMES = FALSE)
-    #nb_max_char = max_str_length_10_first(ggo$Description)
-    #width = width_by_max_char(nb_max_char)
+  if (length(ggo@result$ID) > 0) {
+    ggo@result$Description <- sapply(as.vector(ggo@result$Description), #nolint
+                              function(x) {
+                                ifelse(nchar(x) > 50,
+                                substr(x, 1, 50), x)}, USE.NAMES = FALSE)
+
+
     name <- paste("GGO_", ontology, "_bar-plot", sep = "")
-    png(name,height = 720, width = 600)
-    p <- barplot(ggo, showCategory=10)
+    png(name, height = 720, width = 600)
+    p <- barplot(ggo, showCategory = 10)
     print(p)
     dev.off()
     ggo <- as.data.frame(ggo)
@@ -65,75 +79,85 @@
   }
 }
 
+# nolint end
+
 # GO over-representation test
-enrich_GO <- function(geneid, universe, orgdb, ontology, pval_cutoff, qval_cutoff,plot) {
-  ego<-enrichGO(gene=geneid,
-                universe=universe,
-                OrgDb=orgdb,
-                ont=ontology,
-                pAdjustMethod="BH",
-                pvalueCutoff=pval_cutoff,
-                qvalueCutoff=qval_cutoff,
-                readable=TRUE)
-  
+enrich_go <- function(geneid, universe, orgdb, ontology, pval_cutoff,
+                      qval_cutoff, plot) {
+  ego <- enrichGO(gene = geneid,
+
+                universe = universe,
+                OrgDb = orgdb,
+                ont = ontology,
+                pAdjustMethod = "BH",
+                pvalueCutoff = pval_cutoff,
+                qvalueCutoff = qval_cutoff,
+                readable = TRUE)
+
   # Plot bar & dot plots
-  #if there are enriched GopTerms
-  if (length(ego$ID)>0){
-    
-    ego@result$Description <- sapply(ego@result$Description, function(x) {ifelse(nchar(x)>50, substr(x,1,50),x)},USE.NAMES = FALSE)
-    #nb_max_char = max_str_length_10_first(ego$Description)
-    #width = width_by_max_char(nb_max_char)
-    
-    if ("dotplot" %in% plot ){
+  #if there are enriched GoTerms
+
+
+
+  if (length(ego$ID) > 0) {
+
+    ego@result$Description <- sapply(ego@result$Description, function(x) {  #nolint
+      ifelse(nchar(x) > 50, substr(x, 1, 50), x)
+      }, USE.NAMES = FALSE)
+
+
+    if ("dotplot" %in% plot) {
     dot_name <- paste("EGO_", ontology, "_dot-plot", sep = "")
-    png(dot_name,height = 720, width = 600)
-    p <- dotplot(ego, showCategory=10)
+    png(dot_name, height = 720, width = 600)
+    p <- dotplot(ego, showCategory = 10)  #nolint
     print(p)
     dev.off()
     }
 
-    if ("barplot" %in% plot ){
+    if ("barplot" %in% plot) {
     bar_name <- paste("EGO_", ontology, "_bar-plot", sep = "")
-    png(bar_name,height = 720, width = 600)
-    p <- barplot(ego, showCategory=10)
+    png(bar_name, height = 720, width = 600)
+    p <- barplot(ego, showCategory = 10)
     print(p)
     dev.off()
-    
     }
+
     ego <- as.data.frame(ego)
     return(ego)
   } else {
-    warning(paste("No Go terms enriched (EGO) found for ",ontology,"ontology"),immediate. = TRUE,noBreaks. = TRUE,call. = FALSE)
+    warning(paste("No Go terms enriched (EGO) found for ",
+    ontology, "ontology"), immediate. = TRUE, noBreaks. = TRUE, call. = FALSE)
   }
 }
 
-clean_ids <- function(ids){
-  ids = gsub(" ","",ids)
-  ids = ids[which(ids!="")]
-  ids = ids[which(ids!="NA")]
-  ids = ids[!is.na(ids)]
- 
-  return(ids) 
+
+clean_ids <- function(ids) {
+  ids <- gsub(" ", "", ids)
+  ids <- ids[which(ids != "")]
+  ids <- ids[which(ids != "NA")]
+  ids <- ids[!is.na(ids)]
+
+  return(ids)
 }
 
-check_ids <- function(vector,type) {
-  uniprot_pattern = "^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$"
-  entrez_id = "^([0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+)$"
-  if (type == "entrez")
-    return(grepl(entrez_id,vector))
-  else if (type == "uniprot") {
-    return(grepl(uniprot_pattern,vector))
+check_ids <- function(vector, type) {
+  uniprot_pattern <- "^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$" #nolint
+  entrez_id <- "^([0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+)$" #nolint
+  if (type == "entrez") {
+    return(grepl(entrez_id, vector))
+  }else if (type == "uniprot") {
+    return(grepl(uniprot_pattern, vector))
   }
 }
 
-get_args <- function(){
+get_args <- function() {
   args <- commandArgs(TRUE)
-  if(length(args)<1) {
+  if (length(args) < 1) {
     args <- c("--help")
   }
-  
+
   # Help section
-  if("--help" %in% args) {
+  if ("--help" %in% args) {
     cat("clusterProfiler Enrichment Analysis
     Arguments:
         --input_type: type of input (list of id or filename)
@@ -152,152 +176,175 @@
         --level: 1-3
         --pval_cutoff
         --qval_cutoff
-        --text_output: text output filename 
+        --text_output: text output filename
         --plot : type of visualization, dotplot or/and barplot \n")
-    q(save="no")
+    q(save = "no")
   }
   # Parse arguments
-  parseArgs <- function(x) strsplit(sub("^--", "", x), "=")
-  argsDF <- as.data.frame(do.call("rbind", parseArgs(args)))
-  args <- as.list(as.character(argsDF$V2))
-  names(args) <- argsDF$V1
-  
+
+  parseargs <- function(x) strsplit(sub("^--", "", x), "=")
+  argsdf <- as.data.frame(do.call("rbind", parseargs(args)))
+  args <- as.list(as.character(argsdf$V2))
+  names(args) <- argsdf$V1
   return(args)
 }
 
-
-main <- function() {
-  
+main <- function() {  #nolint
   #get args from command
   args <- get_args()
 
-  #save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/cluster_profiler/args.Rda")
-  #load("/home/dchristiany/proteore_project/ProteoRE/tools/cluster_profiler/args.Rda")
-  
-  go_represent=str2bool(args$go_represent)
-  go_enrich=str2bool(args$go_enrich)
-  if (go_enrich){
-    plot = unlist(strsplit(args$plot,","))
+  go_represent <- str2bool(args$go_represent)
+  go_enrich <- str2bool(args$go_enrich)
+  if (go_enrich) {
+    plot <- unlist(strsplit(args$plot, ","))
   }
-  
+
   suppressMessages(library(args$species, character.only = TRUE, quietly = TRUE))
-  
+
   # Extract OrgDb
-  if (args$species=="org.Hs.eg.db") {
-    orgdb<-org.Hs.eg.db
-  } else if (args$species=="org.Mm.eg.db") {
-    orgdb<-org.Mm.eg.db
-  } else if (args$species=="org.Rn.eg.db") {
-    orgdb<-org.Rn.eg.db
+  if (args$species == "org.Hs.eg.db") {
+    orgdb <- org.Hs.eg.db #nolint
+  } else if (args$species == "org.Mm.eg.db") {
+    orgdb <- org.Mm.eg.db #nolint
+  } else if (args$species == "org.Rn.eg.db") {
+    orgdb <- org.Rn.eg.db #nolint
   }
 
   # Extract input IDs
-  input_type = args$input_type
-  id_type = args$id_type
-  
+  input_type <- args$input_type
+  id_type <- args$id_type
+
   if (input_type == "text") {
-    input = unlist(strsplit(strsplit(args$input, "[ \t\n]+")[[1]],";"))
+    input <- unlist(strsplit(strsplit(args$input, "[ \t\n]+")[[1]], ";"))
   } else if (input_type == "file") {
-    filename = args$input
-    ncol = args$ncol
+    filename <- args$input
+    ncol <- args$ncol
     # Check ncol
     if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) {
       stop("Please enter the right format for column number: c[number]")
     } else {
-      ncol = as.numeric(gsub("c", "", ncol))
+      ncol <- as.numeric(gsub("c", "", ncol))
     }
-    header = str2bool(args$header)                  # Get file content
-    file = read_file(filename, header)              # Extract Protein IDs list
-    input =  unlist(sapply(as.character(file[,ncol]),function(x) rapply(strsplit(x,";"),c),USE.NAMES = FALSE))
+
+    header <- str2bool(args$header)       # Get file content
+    file <- read_file(filename, header)   # Extract Protein IDs list
+    input <-  unlist(sapply(as.character(file[, ncol]),
+                    function(x) rapply(strsplit(x, ";"), c), USE.NAMES = FALSE))
   }
-  input = clean_ids(input)
-  
+  input <- clean_ids(input)
+
   ## Get input gene list from input IDs
-  #ID format Conversion 
+  #ID format Conversion
   #This case : from UNIPROT (protein id) to ENTREZ (gene id)
   #bitr = conversion function from clusterProfiler
-  if (id_type=="Uniprot" & any(check_ids(input,"uniprot"))) {
-    any(check_ids(input,"uniprot"))
-    idFrom<-"UNIPROT"
-    idTo<-"ENTREZID"
-    suppressMessages(gene<-bitr(input, fromType=idFrom, toType=idTo, OrgDb=orgdb))
-    gene<-unique(gene$ENTREZID)
-  } else if (id_type=="Entrez" & any(check_ids(input,"entrez"))) {
-    gene<-unique(input)
+  if (id_type == "Uniprot" & any(check_ids(input, "uniprot"))) {
+    any(check_ids(input, "uniprot"))
+
+    idfrom <- "UNIPROT"
+    idto <- "ENTREZID"
+    suppressMessages(gene <- bitr(input, fromType = idfrom, toType = idto,
+                                  OrgDb = orgdb))
+
+    gene <- unique(gene$ENTREZID)
+  } else if (id_type == "Entrez" & any(check_ids(input, "entrez"))) {
+    gene <- unique(input)
   } else {
-    stop(paste(id_type,"not found in your ids list, please check your IDs in input or the selected column of your input file"))
+
+    stop(paste(id_type, "not found in your ids list,
+     please check your IDs in input or the selected column of your input file"))
+
   }
 
   ontology <- strsplit(args$onto_opt, ",")[[1]]
-  
+
   ## Extract GGO/EGO arguments
-  if (go_represent) {level <- as.numeric(args$level)}
+  if (go_represent) {
+    level <- as.numeric(args$level)
+
+    }
+
   if (go_enrich) {
     pval_cutoff <- as.numeric(args$pval_cutoff)
     qval_cutoff <- as.numeric(args$qval_cutoff)
     # Extract universe background genes (same as input file)
     if (!is.null(args$universe_type)) {
-      universe_type = args$universe_type
+      universe_type <- args$universe_type
       if (universe_type == "text") {
-        universe = unlist(strsplit(strsplit(args$input, "[ \t\n]+")[[1]],";"))
+        universe <- unlist(strsplit(strsplit(args$input, "[ \t\n]+")[[1]], ";"))
       } else if (universe_type == "file") {
-        universe_filename = args$universe
-        universe_ncol = args$uncol
+        universe_filename <- args$universe
+        universe_ncol <- args$uncol
         # Check ncol
         if (! as.numeric(gsub("c", "", universe_ncol)) %% 1 == 0) {
           stop("Please enter the right format for column number: c[number]")
         } else {
-          universe_ncol = as.numeric(gsub("c", "", universe_ncol))
+          universe_ncol <- as.numeric(gsub("c", "", universe_ncol))
         }
-        universe_header = str2bool(args$uheader)
+        universe_header <- str2bool(args$uheader)
         # Get file content
-        universe_file = read_file(universe_filename, universe_header)
+        universe_file <- read_file(universe_filename, universe_header)
         # Extract Protein IDs list
-        universe <- unlist(sapply(universe_file[,universe_ncol], function(x) rapply(strsplit(x,";"),c),USE.NAMES = FALSE))
+        universe <- unlist(sapply(universe_file[, universe_ncol],
+                function(x) rapply(strsplit(x, ";"), c), USE.NAMES = FALSE))
       }
-      universe = clean_ids(input)
-      universe_id_type = args$universe_id_type
+      universe <- clean_ids(input)
+      universe_id_type <- args$universe_id_type
       ##to initialize
-      if (universe_id_type=="Uniprot" & any(check_ids(universe,"uniprot"))) {
-        idFrom<-"UNIPROT"
-        idTo<-"ENTREZID"
-        suppressMessages(universe_gene<-bitr(universe, fromType=idFrom, toType=idTo, OrgDb=orgdb))
-        universe_gene<-unique(universe_gene$ENTREZID)
-      } else if (universe_id_type=="Entrez" & any(check_ids(universe,"entrez"))) {
-        universe_gene<-unique(unlist(universe))
+      if (universe_id_type == "Uniprot" & any(check_ids(universe, "uniprot"))) {
+        idfrom <- "UNIPROT"
+        idto <- "ENTREZID"
+        suppressMessages(universe_gene <- bitr(universe, fromType = idfrom,
+                                               toType = idto, OrgDb = orgdb))
+        universe_gene <- unique(universe_gene$ENTREZID)
+      } else if (universe_id_type == "Entrez" &
+                 any(check_ids(universe, "entrez"))) {
+        universe_gene <- unique(unlist(universe))
       } else {
-        if (universe_type=="text"){
-          print(paste(universe_id_type,"not found in your background IDs list",sep=" "))
+        if (universe_type == "text") {
+          print(paste(universe_id_type, "not found in your background IDs list",
+                      sep = " "))
         } else {
-          print(paste(universe_id_type,"not found in the column",universe_ncol,"of your background IDs file",sep=" "))
+          print(paste(universe_id_type, "not found in the column",
+                      universe_ncol, "of your background IDs file", sep = " "))
+
         }
-        universe_gene = NULL
-      } 
+        universe_gene <- NULL
+      }
     } else {
-      universe_gene = NULL
+      universe_gene <- NULL
     }
   } else {
-    universe_gene = NULL
+    universe_gene <- NULL
   }
 
   ##enrichGO : GO over-representation test
   for (onto in ontology) {
     if (go_represent) {
-      ggo<-repartition_GO(gene, orgdb, onto, level, readable=TRUE)
-      if (is.list(ggo)){ggo <- as.data.frame(apply(ggo, c(1,2), function(x) gsub("^$|^ $", NA, x)))}  #convert "" and " " to NA
-      output_path = paste("cluster_profiler_GGO_",onto,".tsv",sep="")
-      write.table(ggo, output_path, sep="\t", row.names = FALSE, quote = FALSE )
+      ggo <- repartition_go(gene, orgdb, onto, level, readable = TRUE)
+      if (is.list(ggo)) {
+        ggo <- as.data.frame(apply(ggo, c(1, 2),
+              function(x) gsub("^$|^ $", NA, x)))
+      }  #convert "" and " " to NA
+      output_path <- paste("cluster_profiler_GGO_", onto, ".tsv", sep = "")
+      write.table(ggo, output_path, sep = "\t",
+                  row.names = FALSE, quote = FALSE)
+
     }
 
     if (go_enrich) {
-      ego<-enrich_GO(gene, universe_gene, orgdb, onto, pval_cutoff, qval_cutoff,plot)
-      if (is.list(ego)){ego <- as.data.frame(apply(ego, c(1,2), function(x) gsub("^$|^ $", NA, x)))}  #convert "" and " " to NA
-      output_path = paste("cluster_profiler_EGO_",onto,".tsv",sep="")
-      write.table(ego, output_path, sep="\t", row.names = FALSE, quote = FALSE )
+      ego <- enrich_go(gene, universe_gene, orgdb, onto, pval_cutoff,
+                       qval_cutoff, plot)
+      if (is.list(ego)) {
+        ego <- as.data.frame(apply(ego, c(1, 2),
+                function(x) gsub("^$|^ $", NA, x)))
+        }  #convert "" and " " to NA
+      output_path <- paste("cluster_profiler_EGO_", onto, ".tsv", sep = "")
+      write.table(ego, output_path, sep = "\t",
+                  row.names = FALSE, quote = FALSE)
     }
   }
 }
 
-if(!interactive()) {
+if (!interactive()) {
   main()
 }