Mercurial > repos > proteore > proteore_goprofiles

diff goprofiles.R @ 1:1236ee08ccb8 draft
planemo upload commit 5774fd6a5a746f36f6bf4671a51a39ea2b978300-dirty
author: proteore
date: Fri, 16 Feb 2018 03:40:36 -0500
parents: d89c09253c8d
children: 58a8ddd58dde
--- a/goprofiles.R	Sun Nov 26 19:19:39 2017 -0500
+++ b/goprofiles.R	Fri Feb 16 03:40:36 2018 -0500
@@ -5,16 +5,12 @@
 # Read file and return file content as data.frame?
 readfile = function(filename, header) {
   if (header == "true") {
-    # Read only the first two lines of the files as data (without headers):
+    # Read only the first line of the file as data (without headers):
     headers <- read.table(filename, nrows = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE)
-    #print("header")
-    #print(headers)
-    # Create the headers names with the two (or more) first rows, sappy allows to make operations over the columns (in this case paste) - read more about sapply here :
-    #headers_names <- sapply(headers, paste, collapse = "_")
-    #print(headers_names)
-    #Read the data of the files (skipping the first 2 rows):
+    # Read the data of the file (skipping the first row):
     file <- read.table(filename, skip = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE)
-    #print(file[1,])
+    # Remove empty rows
+    file <- file[!apply(is.na(file) | file == "", 1, all),]
     #And assign the headers of step two to the data:
     names(file) <- headers
   }
@@ -24,10 +20,6 @@
   return(file)
 }
 
-#filename = "/Users/LinCun/Documents/ProteoRE/usecase1/Check/HPA.Selection.134.txt"
-#test = readfile(filename)
-#str(test)
-#str(test$Gene.names)
 getprofile = function(ids, id_type, level, duplicate) {
   ####################################################################
   # Arguments
@@ -64,27 +56,6 @@
     print("IDs unable to convert to ENTREZID: ")
     print(NAs)
   }
-  #print(genes_ids)
-  # Convert Protein IDs into entrez ids
-  
-  # for (i in 1:length(id$UNIPROT)) {
-  #   print(i)
-  #   if (is.na(id[[2]][i])) {
-  #     print(id[[2]][i])
-  #   }
-  # }
-  # a = id[which(id$ENTREZID == "NA"),]
-  # print(a)
-  # print(a$UNIPROT)
-  #print(id[[1]][which(is.na(id$ENTREZID))])
-  #print(genes_ids)
-  # for (gene in genes) {
-  #   #id = as.character(mget(gene, org.Hs.egALIAS2EG, ifnotfound = NA))
-  #   id = select(org.Hs.eg.db, genes, "ENTREZID", "UNIPROT")
-  #   print(id)
-  #   genes_ids = append(genes_ids, id$ENTREZID)
-  # }
-  #print(genes_ids)
   
   # Create basic profiles
   profile.CC = basicProfile(genes_ids, onto='CC', level=level, orgPackage="org.Hs.eg.db", empty.cats=F, ord=T, na.rm=T)
@@ -172,103 +143,117 @@
 }
 
 goprofiles = function() {
-  args = commandArgs(trailingOnly = TRUE)
-  #print(args)
-  # arguments: filename.R inputfile ncol "CC,MF,BP,ALL" "PNG,JPEG,PDF" level "TRUE"(percentage) "Title"
-  if (length(args) != 9) {
-    stop("Not enough/Too many arguments", call. = FALSE)
+  args <- commandArgs(TRUE)
+  if(length(args)<1) {
+    args <- c("--help")
   }
-  else {
-    input_type = args[2]
-    if (input_type == "text") {
-      input = strsplit(args[1], "\\s+")[[1]]
-    }
-    else if (input_type == "file") {
-      filename = strsplit(args[1], ",")[[1]][1]
-      ncol = strsplit(args[1], ",")[[1]][2]
-      # Check ncol
-      if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) {
-        stop("Please enter an integer for level")
-      }
-      else {
-        ncol = as.numeric(gsub("c", "", ncol))
-      }
-      header = strsplit(args[1], ",")[[1]][3]
-      # Get file content
-      file = readfile(filename, header)
-      # Extract Protein IDs list
-      input = c()
-      for (row in as.character(file[,ncol])) {
-        input = c(input, strsplit(row, ";")[[1]][1])
-      }
-    }
-    id_type = args[3]
-    ontoopt = strsplit(args[4], ",")[[1]]
-    #print(ontoopt)
-    #plotopt = strsplit(args[3], ",")
-    plotopt = args[5]
-    level = args[6]
-    per = as.logical(args[7])
-    title = args[8]
-    duplicate = args[9]
-
-    profiles = getprofile(input, id_type, level, duplicate)
-    profile.CC = profiles[1]
-    #print(profile.CC)
-    profile.MF = profiles[2]
-    #print(profile.MF)
-    profile.BP = profiles[3]
-    #print(profile.BP)
-    profile.ALL = profiles[-3:-1]
-    #print(profile.ALL)
-    #c(profile.ALL, profile.CC, profile.MF, profile.BP)
-    if ("CC" %in% ontoopt) {
-      if (grepl("PNG", plotopt)) {
-        plotPNG(profile.CC=profile.CC, per=per, title=title)
-      }
-      if (grepl("JPEG", plotopt)) {
-        plotJPEG(profile.CC = profile.CC, per=per, title=title)
-      }
-      if (grepl("PDF", plotopt)) {
-        plotPDF(profile.CC = profile.CC, per=per, title=title)
-      }
-    }
-    if ("MF" %in% ontoopt) {
-      if (grepl("PNG", plotopt)) {
-        plotPNG(profile.MF = profile.MF, per=per, title=title)
-      }
-      if (grepl("JPEG", plotopt)) {
-        plotJPEG(profile.MF = profile.MF, per=per, title=title)
-      }
-      if (grepl("PDF", plotopt)) {
-        plotPDF(profile.MF = profile.MF, per=per, title=title)
-      }
-    }
-    if ("BP" %in% ontoopt) {
-      if (grepl("PNG", plotopt)) {
-        plotPNG(profile.BP = profile.BP, per=per, title=title)
-      }
-      if (grepl("JPEG", plotopt)) {
-        plotJPEG(profile.BP = profile.BP, per=per, title=title)
-      }
-      if (grepl("PDF", plotopt)) {
-        plotPDF(profile.BP = profile.BP, per=per, title=title)
-      }
-    }
-    
-    #if (grepl("PNG", plotopt)) {
-    # plotPNG(profile.ALL = profile.ALL, per=per, title=title)
-    #}
-    #if (grepl("JPEG", plotopt)) {
-    # plotJPEG(profile.ALL = profile.ALL, per=per, title=title)
-    #}
-    #if (grepl("PDF", plotopt)) {
-    # plotPDF(profile.ALL = profile.ALL, per=per, title=title)
-    #}
+  
+  # Help section
+  if("--help" %in% args) {
+    cat("Selection and Annotation HPA
+    Arguments:
+        --input_type: type of input (list of id or filename)
+        --input: input
+        --ncol: the column number which you would like to apply...
+        --header: true/false if your file contains a header
+        --id_type: the type of input IDs (UniProt/EntrezID)
+        --onto_opt: ontology options
+        --plot_opt: plot extension options (PDF/JPEG/PNG)
+        --level: 1-3
+        --per
+        --title: title of the plot
+        --duplicate: remove dupliate input IDs (true/false)
+        --text_output: text output filename \n")
+    q(save="no")
   }
   
+  # Parse arguments
+  parseArgs <- function(x) strsplit(sub("^--", "", x), "=")
+  argsDF <- as.data.frame(do.call("rbind", parseArgs(args)))
+  args <- as.list(as.character(argsDF$V2))
+  names(args) <- argsDF$V1
+
+  input_type = args$input_type
+  if (input_type == "text") {
+    input = strsplit(args$input, " ")[[1]]
+  }
+  else if (input_type == "file") {
+    filename = args$input
+    ncol = args$ncol
+    # Check ncol
+    if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) {
+      stop("Please enter an integer for level")
+    }
+    else {
+      ncol = as.numeric(gsub("c", "", ncol))
+    }
+    header = args$header
+    # Get file content
+    file = readfile(filename, header)
+    # Extract Protein IDs list
+    input = c()
+    for (row in as.character(file[,ncol])) {
+      input = c(input, strsplit(row, ";")[[1]][1])
+    }
+  }
+  id_type = args$id_type
+  ontoopt = strsplit(args$onto_opt, ",")[[1]]
+  #print(ontoopt)
+  #plotopt = strsplit(args[3], ",")
+  plotopt = args$plot_opt
+  level = args$level
+  per = as.logical(args$per)
+  title = args$title
+  duplicate = args$duplicate
+  text_output = args$text_output
+
+  profiles = getprofile(input, id_type, level, duplicate)
+  profile.CC = profiles[1]
+  #print(profile.CC)
+  profile.MF = profiles[2]
+  #print(profile.MF)
+  profile.BP = profiles[3]
+  #print(profile.BP)
+  profile.ALL = profiles[-3:-1]
+  #print(profile.ALL)
+  #c(profile.ALL, profile.CC, profile.MF, profile.BP)
+    
+  if ("CC" %in% ontoopt) {
+    write.table(profile.CC, text_output, append = TRUE, sep="\t", row.names = FALSE, quote=FALSE)
+    if (grepl("PNG", plotopt)) {
+      plotPNG(profile.CC=profile.CC, per=per, title=title)
+    }
+    if (grepl("JPEG", plotopt)) {
+      plotJPEG(profile.CC = profile.CC, per=per, title=title)
+    }
+    if (grepl("PDF", plotopt)) {
+      plotPDF(profile.CC = profile.CC, per=per, title=title)
+    }
+  }
+  if ("MF" %in% ontoopt) {
+    write.table(profile.MF, text_output, append = TRUE, sep="\t", row.names = FALSE, quote=FALSE)
+    if (grepl("PNG", plotopt)) {
+      plotPNG(profile.MF = profile.MF, per=per, title=title)
+    }
+    if (grepl("JPEG", plotopt)) {
+      plotJPEG(profile.MF = profile.MF, per=per, title=title)
+    }
+    if (grepl("PDF", plotopt)) {
+      plotPDF(profile.MF = profile.MF, per=per, title=title)
+    }
+  }
+  if ("BP" %in% ontoopt) {
+    write.table(profile.BP, text_output, append = TRUE, sep="\t", row.names = FALSE, quote=FALSE)
+    if (grepl("PNG", plotopt)) {
+      plotPNG(profile.BP = profile.BP, per=per, title=title)
+    }
+    if (grepl("JPEG", plotopt)) {
+      plotJPEG(profile.BP = profile.BP, per=per, title=title)
+    }
+    if (grepl("PDF", plotopt)) {
+      plotPDF(profile.BP = profile.BP, per=per, title=title)
+    }
+  }
 }
 
 goprofiles()
-
-#Rscript go.R ../proteinGroups_Maud.txt "1" "CC" "PDF" 2 "TRUE" "Title"
author	proteore
date	Fri, 16 Feb 2018 03:40:36 -0500
parents	d89c09253c8d
children	58a8ddd58dde