diff goprofiles.R @ 0:d89c09253c8d draft

planemo upload commit abb24d36c776520e73220d11386252d848173697-dirty
author proteore
date Sun, 26 Nov 2017 19:19:39 -0500
parents
children 1236ee08ccb8
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/goprofiles.R	Sun Nov 26 19:19:39 2017 -0500
@@ -0,0 +1,274 @@
+# Load necessary libraries
+library("org.Hs.eg.db", quietly=TRUE)
+library("goProfiles", quietly=TRUE)
+
+# Read file and return file content as data.frame?
+readfile = function(filename, header) {
+  if (header == "true") {
+    # Read only the first two lines of the files as data (without headers):
+    headers <- read.table(filename, nrows = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE)
+    #print("header")
+    #print(headers)
+    # Create the headers names with the two (or more) first rows, sappy allows to make operations over the columns (in this case paste) - read more about sapply here :
+    #headers_names <- sapply(headers, paste, collapse = "_")
+    #print(headers_names)
+    #Read the data of the files (skipping the first 2 rows):
+    file <- read.table(filename, skip = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE)
+    #print(file[1,])
+    #And assign the headers of step two to the data:
+    names(file) <- headers
+  }
+  else {
+    file <- read.table(filename, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE)
+  }
+  return(file)
+}
+
+#filename = "/Users/LinCun/Documents/ProteoRE/usecase1/Check/HPA.Selection.134.txt"
+#test = readfile(filename)
+#str(test)
+#str(test$Gene.names)
+getprofile = function(ids, id_type, level, duplicate) {
+  ####################################################################
+  # Arguments
+  #   - ids: list of input IDs
+  #   - id_type: type of input IDs (UniProt/ENTREZID)
+  #   - level
+  #   - duplicate: if the duplicated IDs should be removed or not (TRUE/FALSE)
+  ####################################################################
+  
+  # Check if level is number
+  if (! as.numeric(level) %% 1 == 0) {
+    stop("Please enter an integer for level")
+  }
+  else {
+    level = as.numeric(level)
+  }
+  #genes = as.vector(file[,ncol])
+  
+  # Extract Gene Entrez ID
+  if (id_type == "Entrez") {
+    id = select(org.Hs.eg.db, ids, "ENTREZID", multiVals = "first")
+    genes_ids = id$ENTREZID[which( ! is.na(id$ENTREZID))]
+  }
+  else {
+    genes_ids = c()
+    id = select(org.Hs.eg.db, ids, "ENTREZID", "UNIPROT", multiVals = "first")
+    if (duplicate == "TRUE") {
+      id = unique(id)
+    }
+    print(id[[1]])
+    genes_ids = id$ENTREZID[which( ! is.na(id$ENTREZID))]
+    # IDs that have NA ENTREZID
+    NAs = id$UNIPROT[which(is.na(id$ENTREZID))]
+    print("IDs unable to convert to ENTREZID: ")
+    print(NAs)
+  }
+  #print(genes_ids)
+  # Convert Protein IDs into entrez ids
+  
+  # for (i in 1:length(id$UNIPROT)) {
+  #   print(i)
+  #   if (is.na(id[[2]][i])) {
+  #     print(id[[2]][i])
+  #   }
+  # }
+  # a = id[which(id$ENTREZID == "NA"),]
+  # print(a)
+  # print(a$UNIPROT)
+  #print(id[[1]][which(is.na(id$ENTREZID))])
+  #print(genes_ids)
+  # for (gene in genes) {
+  #   #id = as.character(mget(gene, org.Hs.egALIAS2EG, ifnotfound = NA))
+  #   id = select(org.Hs.eg.db, genes, "ENTREZID", "UNIPROT")
+  #   print(id)
+  #   genes_ids = append(genes_ids, id$ENTREZID)
+  # }
+  #print(genes_ids)
+  
+  # Create basic profiles
+  profile.CC = basicProfile(genes_ids, onto='CC', level=level, orgPackage="org.Hs.eg.db", empty.cats=F, ord=T, na.rm=T)
+  profile.BP = basicProfile(genes_ids, onto='BP', level=level, orgPackage="org.Hs.eg.db", empty.cats=F, ord=T, na.rm=T)
+  profile.MF = basicProfile(genes_ids, onto='MF', level=level, orgPackage="org.Hs.eg.db", empty.cats=F, ord=T, na.rm=T)
+  profile.ALL = basicProfile(genes_ids, onto='ANY', level=level, orgPackage="org.Hs.eg.db", empty.cats=F, ord=T, na.rm=T)
+  
+  # Print profile
+  # printProfiles(profile)
+  
+  return(c(profile.CC, profile.MF, profile.BP, profile.ALL))
+}
+
+# Plot profiles to PNG
+plotPNG = function(profile.CC = NULL, profile.BP = NULL, profile.MF = NULL, profile.ALL = NULL, per = TRUE, title = TRUE) {
+  if (!is.null(profile.CC)) {
+    png("profile.CC.png")
+    plotProfiles(profile.CC, percentage=per, multiplePlots=FALSE, aTitle=title)
+    dev.off()
+  }
+  if (!is.null(profile.BP)) {
+    png("profile.BP.png")
+    plotProfiles(profile.BP, percentage=per, multiplePlots=FALSE, aTitle=title)
+    dev.off()
+  }
+  if (!is.null(profile.MF)) {
+    png("profile.MF.png")
+    plotProfiles(profile.MF, percentage=per, multiplePlots=FALSE, aTitle=title)
+    dev.off()
+  }
+  if (!is.null(profile.ALL)) {
+    png("profile.ALL.png")
+    plotProfiles(profile.ALL, percentage=per, multiplePlots=T, aTitle=title)
+    dev.off()
+  }
+}
+
+# Plot profiles to JPEG
+plotJPEG = function(profile.CC = NULL, profile.BP = NULL, profile.MF = NULL, profile.ALL = NULL, per = TRUE, title = TRUE) {
+  if (!is.null(profile.CC)) {
+    jpeg("profile.CC.jpeg")
+    plotProfiles(profile.CC, percentage=per, multiplePlots=FALSE, aTitle=title)
+    dev.off()
+  }
+  if (!is.null(profile.BP)) {
+    jpeg("profile.BP.jpeg")
+    plotProfiles(profile.BP, percentage=per, multiplePlots=FALSE, aTitle=title)
+    dev.off()
+  }
+  if (!is.null(profile.MF)) {
+    jpeg("profile.MF.jpeg")
+    plotProfiles(profile.MF, percentage=per, multiplePlots=FALSE, aTitle=title)
+    dev.off()
+  }
+  if (!is.null(profile.ALL)) {
+    jpeg("profile.ALL.jpeg")
+    plotProfiles(profile.ALL, percentage=per, multiplePlots=FALSE, aTitle=title)
+    dev.off()
+  }
+}
+
+# Plot profiles to PDF
+plotPDF = function(profile.CC = NULL, profile.BP = NULL, profile.MF = NULL, profile.ALL = NULL, per = TRUE, title = TRUE) {
+  if (!is.null(profile.CC)) {
+    pdf("profile.CC.pdf")
+    plotProfiles(profile.CC, percentage=per, multiplePlots=FALSE, aTitle=title)
+    dev.off()
+  }
+  if (!is.null(profile.BP)) {
+    pdf("profile.BP.pdf")
+    plotProfiles(profile.BP, percentage=per, multiplePlots=FALSE, aTitle=title)
+    dev.off()
+  }
+  if (!is.null(profile.MF)) {
+    pdf("profile.MF.pdf")
+    plotProfiles(profile.MF, percentage=per, multiplePlots=FALSE, aTitle=title)
+    dev.off()
+  }
+  if (!is.null(profile.ALL)) {
+    #print("all")
+    pdf("profile.ALL.pdf")
+    plotProfiles(profile.ALL, percentage=per, multiplePlots=FALSE, aTitle=title)
+    dev.off()
+  }
+}
+
+goprofiles = function() {
+  args = commandArgs(trailingOnly = TRUE)
+  #print(args)
+  # arguments: filename.R inputfile ncol "CC,MF,BP,ALL" "PNG,JPEG,PDF" level "TRUE"(percentage) "Title"
+  if (length(args) != 9) {
+    stop("Not enough/Too many arguments", call. = FALSE)
+  }
+  else {
+    input_type = args[2]
+    if (input_type == "text") {
+      input = strsplit(args[1], "\\s+")[[1]]
+    }
+    else if (input_type == "file") {
+      filename = strsplit(args[1], ",")[[1]][1]
+      ncol = strsplit(args[1], ",")[[1]][2]
+      # Check ncol
+      if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) {
+        stop("Please enter an integer for level")
+      }
+      else {
+        ncol = as.numeric(gsub("c", "", ncol))
+      }
+      header = strsplit(args[1], ",")[[1]][3]
+      # Get file content
+      file = readfile(filename, header)
+      # Extract Protein IDs list
+      input = c()
+      for (row in as.character(file[,ncol])) {
+        input = c(input, strsplit(row, ";")[[1]][1])
+      }
+    }
+    id_type = args[3]
+    ontoopt = strsplit(args[4], ",")[[1]]
+    #print(ontoopt)
+    #plotopt = strsplit(args[3], ",")
+    plotopt = args[5]
+    level = args[6]
+    per = as.logical(args[7])
+    title = args[8]
+    duplicate = args[9]
+
+    profiles = getprofile(input, id_type, level, duplicate)
+    profile.CC = profiles[1]
+    #print(profile.CC)
+    profile.MF = profiles[2]
+    #print(profile.MF)
+    profile.BP = profiles[3]
+    #print(profile.BP)
+    profile.ALL = profiles[-3:-1]
+    #print(profile.ALL)
+    #c(profile.ALL, profile.CC, profile.MF, profile.BP)
+    if ("CC" %in% ontoopt) {
+      if (grepl("PNG", plotopt)) {
+        plotPNG(profile.CC=profile.CC, per=per, title=title)
+      }
+      if (grepl("JPEG", plotopt)) {
+        plotJPEG(profile.CC = profile.CC, per=per, title=title)
+      }
+      if (grepl("PDF", plotopt)) {
+        plotPDF(profile.CC = profile.CC, per=per, title=title)
+      }
+    }
+    if ("MF" %in% ontoopt) {
+      if (grepl("PNG", plotopt)) {
+        plotPNG(profile.MF = profile.MF, per=per, title=title)
+      }
+      if (grepl("JPEG", plotopt)) {
+        plotJPEG(profile.MF = profile.MF, per=per, title=title)
+      }
+      if (grepl("PDF", plotopt)) {
+        plotPDF(profile.MF = profile.MF, per=per, title=title)
+      }
+    }
+    if ("BP" %in% ontoopt) {
+      if (grepl("PNG", plotopt)) {
+        plotPNG(profile.BP = profile.BP, per=per, title=title)
+      }
+      if (grepl("JPEG", plotopt)) {
+        plotJPEG(profile.BP = profile.BP, per=per, title=title)
+      }
+      if (grepl("PDF", plotopt)) {
+        plotPDF(profile.BP = profile.BP, per=per, title=title)
+      }
+    }
+    
+    #if (grepl("PNG", plotopt)) {
+    # plotPNG(profile.ALL = profile.ALL, per=per, title=title)
+    #}
+    #if (grepl("JPEG", plotopt)) {
+    # plotJPEG(profile.ALL = profile.ALL, per=per, title=title)
+    #}
+    #if (grepl("PDF", plotopt)) {
+    # plotPDF(profile.ALL = profile.ALL, per=per, title=title)
+    #}
+  }
+  
+}
+
+goprofiles()
+
+#Rscript go.R ../proteinGroups_Maud.txt "1" "CC" "PDF" 2 "TRUE" "Title"