comparison goprofiles.R @ 1:1236ee08ccb8 draft

planemo upload commit 5774fd6a5a746f36f6bf4671a51a39ea2b978300-dirty
author proteore
date Fri, 16 Feb 2018 03:40:36 -0500
parents d89c09253c8d
children 58a8ddd58dde
comparison
equal deleted inserted replaced
0:d89c09253c8d 1:1236ee08ccb8
3 library("goProfiles", quietly=TRUE) 3 library("goProfiles", quietly=TRUE)
4 4
5 # Read file and return file content as data.frame? 5 # Read file and return file content as data.frame?
6 readfile = function(filename, header) { 6 readfile = function(filename, header) {
7 if (header == "true") { 7 if (header == "true") {
8 # Read only the first two lines of the files as data (without headers): 8 # Read only the first line of the files as data (without headers):
9 headers <- read.table(filename, nrows = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE) 9 headers <- read.table(filename, nrows = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE)
10 #print("header") 10 #Read the data of the files (skipping the first row):
11 #print(headers)
12 # Create the headers names with the two (or more) first rows, sappy allows to make operations over the columns (in this case paste) - read more about sapply here :
13 #headers_names <- sapply(headers, paste, collapse = "_")
14 #print(headers_names)
15 #Read the data of the files (skipping the first 2 rows):
16 file <- read.table(filename, skip = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE) 11 file <- read.table(filename, skip = 1, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE)
17 #print(file[1,]) 12 # Remove empty rows
13 file <- file[!apply(is.na(file) | file == "", 1, all),]
18 #And assign the headers of step two to the data: 14 #And assign the headers of step two to the data:
19 names(file) <- headers 15 names(file) <- headers
20 } 16 }
21 else { 17 else {
22 file <- read.table(filename, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE) 18 file <- read.table(filename, header = FALSE, sep = "\t", stringsAsFactors = FALSE, fill = TRUE)
23 } 19 }
24 return(file) 20 return(file)
25 } 21 }
26 22
27 #filename = "/Users/LinCun/Documents/ProteoRE/usecase1/Check/HPA.Selection.134.txt"
28 #test = readfile(filename)
29 #str(test)
30 #str(test$Gene.names)
31 getprofile = function(ids, id_type, level, duplicate) { 23 getprofile = function(ids, id_type, level, duplicate) {
32 #################################################################### 24 ####################################################################
33 # Arguments 25 # Arguments
34 # - ids: list of input IDs 26 # - ids: list of input IDs
35 # - id_type: type of input IDs (UniProt/ENTREZID) 27 # - id_type: type of input IDs (UniProt/ENTREZID)
62 # IDs that have NA ENTREZID 54 # IDs that have NA ENTREZID
63 NAs = id$UNIPROT[which(is.na(id$ENTREZID))] 55 NAs = id$UNIPROT[which(is.na(id$ENTREZID))]
64 print("IDs unable to convert to ENTREZID: ") 56 print("IDs unable to convert to ENTREZID: ")
65 print(NAs) 57 print(NAs)
66 } 58 }
67 #print(genes_ids)
68 # Convert Protein IDs into entrez ids
69
70 # for (i in 1:length(id$UNIPROT)) {
71 # print(i)
72 # if (is.na(id[[2]][i])) {
73 # print(id[[2]][i])
74 # }
75 # }
76 # a = id[which(id$ENTREZID == "NA"),]
77 # print(a)
78 # print(a$UNIPROT)
79 #print(id[[1]][which(is.na(id$ENTREZID))])
80 #print(genes_ids)
81 # for (gene in genes) {
82 # #id = as.character(mget(gene, org.Hs.egALIAS2EG, ifnotfound = NA))
83 # id = select(org.Hs.eg.db, genes, "ENTREZID", "UNIPROT")
84 # print(id)
85 # genes_ids = append(genes_ids, id$ENTREZID)
86 # }
87 #print(genes_ids)
88 59
89 # Create basic profiles 60 # Create basic profiles
90 profile.CC = basicProfile(genes_ids, onto='CC', level=level, orgPackage="org.Hs.eg.db", empty.cats=F, ord=T, na.rm=T) 61 profile.CC = basicProfile(genes_ids, onto='CC', level=level, orgPackage="org.Hs.eg.db", empty.cats=F, ord=T, na.rm=T)
91 profile.BP = basicProfile(genes_ids, onto='BP', level=level, orgPackage="org.Hs.eg.db", empty.cats=F, ord=T, na.rm=T) 62 profile.BP = basicProfile(genes_ids, onto='BP', level=level, orgPackage="org.Hs.eg.db", empty.cats=F, ord=T, na.rm=T)
92 profile.MF = basicProfile(genes_ids, onto='MF', level=level, orgPackage="org.Hs.eg.db", empty.cats=F, ord=T, na.rm=T) 63 profile.MF = basicProfile(genes_ids, onto='MF', level=level, orgPackage="org.Hs.eg.db", empty.cats=F, ord=T, na.rm=T)
170 dev.off() 141 dev.off()
171 } 142 }
172 } 143 }
173 144
174 goprofiles = function() { 145 goprofiles = function() {
175 args = commandArgs(trailingOnly = TRUE) 146 args <- commandArgs(TRUE)
176 #print(args) 147 if(length(args)<1) {
177 # arguments: filename.R inputfile ncol "CC,MF,BP,ALL" "PNG,JPEG,PDF" level "TRUE"(percentage) "Title" 148 args <- c("--help")
178 if (length(args) != 9) { 149 }
179 stop("Not enough/Too many arguments", call. = FALSE) 150
180 } 151 # Help section
181 else { 152 if("--help" %in% args) {
182 input_type = args[2] 153 cat("Selection and Annotation HPA
183 if (input_type == "text") { 154 Arguments:
184 input = strsplit(args[1], "\\s+")[[1]] 155 --input_type: type of input (list of id or filename)
185 } 156 --input: input
186 else if (input_type == "file") { 157 --ncol: the column number which you would like to apply...
187 filename = strsplit(args[1], ",")[[1]][1] 158 --header: true/false if your file contains a header
188 ncol = strsplit(args[1], ",")[[1]][2] 159 --id_type: the type of input IDs (UniProt/EntrezID)
189 # Check ncol 160 --onto_opt: ontology options
190 if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) { 161 --plot_opt: plot extension options (PDF/JPEG/PNG)
191 stop("Please enter an integer for level") 162 --level: 1-3
192 } 163 --per
193 else { 164 --title: title of the plot
194 ncol = as.numeric(gsub("c", "", ncol)) 165 --duplicate: remove dupliate input IDs (true/false)
195 } 166 --text_output: text output filename \n")
196 header = strsplit(args[1], ",")[[1]][3] 167 q(save="no")
197 # Get file content 168 }
198 file = readfile(filename, header) 169
199 # Extract Protein IDs list 170 # Parse arguments
200 input = c() 171 parseArgs <- function(x) strsplit(sub("^--", "", x), "=")
201 for (row in as.character(file[,ncol])) { 172 argsDF <- as.data.frame(do.call("rbind", parseArgs(args)))
202 input = c(input, strsplit(row, ";")[[1]][1]) 173 args <- as.list(as.character(argsDF$V2))
203 } 174 names(args) <- argsDF$V1
204 } 175
205 id_type = args[3] 176 input_type = args$input_type
206 ontoopt = strsplit(args[4], ",")[[1]] 177 if (input_type == "text") {
207 #print(ontoopt) 178 input = strsplit(args$input, " ")[[1]]
208 #plotopt = strsplit(args[3], ",") 179 }
209 plotopt = args[5] 180 else if (input_type == "file") {
210 level = args[6] 181 filename = args$input
211 per = as.logical(args[7]) 182 ncol = args$ncol
212 title = args[8] 183 # Check ncol
213 duplicate = args[9] 184 if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) {
214 185 stop("Please enter an integer for level")
215 profiles = getprofile(input, id_type, level, duplicate) 186 }
216 profile.CC = profiles[1] 187 else {
217 #print(profile.CC) 188 ncol = as.numeric(gsub("c", "", ncol))
218 profile.MF = profiles[2] 189 }
219 #print(profile.MF) 190 header = args$header
220 profile.BP = profiles[3] 191 # Get file content
221 #print(profile.BP) 192 file = readfile(filename, header)
222 profile.ALL = profiles[-3:-1] 193 # Extract Protein IDs list
223 #print(profile.ALL) 194 input = c()
224 #c(profile.ALL, profile.CC, profile.MF, profile.BP) 195 for (row in as.character(file[,ncol])) {
225 if ("CC" %in% ontoopt) { 196 input = c(input, strsplit(row, ";")[[1]][1])
226 if (grepl("PNG", plotopt)) { 197 }
227 plotPNG(profile.CC=profile.CC, per=per, title=title) 198 }
228 } 199 id_type = args$id_type
229 if (grepl("JPEG", plotopt)) { 200 ontoopt = strsplit(args$onto_opt, ",")[[1]]
230 plotJPEG(profile.CC = profile.CC, per=per, title=title) 201 #print(ontoopt)
231 } 202 #plotopt = strsplit(args[3], ",")
232 if (grepl("PDF", plotopt)) { 203 plotopt = args$plot_opt
233 plotPDF(profile.CC = profile.CC, per=per, title=title) 204 level = args$level
234 } 205 per = as.logical(args$per)
235 } 206 title = args$title
236 if ("MF" %in% ontoopt) { 207 duplicate = args$duplicate
237 if (grepl("PNG", plotopt)) { 208 text_output = args$text_output
238 plotPNG(profile.MF = profile.MF, per=per, title=title) 209
239 } 210 profiles = getprofile(input, id_type, level, duplicate)
240 if (grepl("JPEG", plotopt)) { 211 profile.CC = profiles[1]
241 plotJPEG(profile.MF = profile.MF, per=per, title=title) 212 #print(profile.CC)
242 } 213 profile.MF = profiles[2]
243 if (grepl("PDF", plotopt)) { 214 #print(profile.MF)
244 plotPDF(profile.MF = profile.MF, per=per, title=title) 215 profile.BP = profiles[3]
245 } 216 #print(profile.BP)
246 } 217 profile.ALL = profiles[-3:-1]
247 if ("BP" %in% ontoopt) { 218 #print(profile.ALL)
248 if (grepl("PNG", plotopt)) { 219 #c(profile.ALL, profile.CC, profile.MF, profile.BP)
249 plotPNG(profile.BP = profile.BP, per=per, title=title)
250 }
251 if (grepl("JPEG", plotopt)) {
252 plotJPEG(profile.BP = profile.BP, per=per, title=title)
253 }
254 if (grepl("PDF", plotopt)) {
255 plotPDF(profile.BP = profile.BP, per=per, title=title)
256 }
257 }
258 220
259 #if (grepl("PNG", plotopt)) { 221 if ("CC" %in% ontoopt) {
260 # plotPNG(profile.ALL = profile.ALL, per=per, title=title) 222 write.table(profile.CC, text_output, append = TRUE, sep="\t", row.names = FALSE, quote=FALSE)
261 #} 223 if (grepl("PNG", plotopt)) {
262 #if (grepl("JPEG", plotopt)) { 224 plotPNG(profile.CC=profile.CC, per=per, title=title)
263 # plotJPEG(profile.ALL = profile.ALL, per=per, title=title) 225 }
264 #} 226 if (grepl("JPEG", plotopt)) {
265 #if (grepl("PDF", plotopt)) { 227 plotJPEG(profile.CC = profile.CC, per=per, title=title)
266 # plotPDF(profile.ALL = profile.ALL, per=per, title=title) 228 }
267 #} 229 if (grepl("PDF", plotopt)) {
268 } 230 plotPDF(profile.CC = profile.CC, per=per, title=title)
269 231 }
232 }
233 if ("MF" %in% ontoopt) {
234 write.table(profile.MF, text_output, append = TRUE, sep="\t", row.names = FALSE, quote=FALSE)
235 if (grepl("PNG", plotopt)) {
236 plotPNG(profile.MF = profile.MF, per=per, title=title)
237 }
238 if (grepl("JPEG", plotopt)) {
239 plotJPEG(profile.MF = profile.MF, per=per, title=title)
240 }
241 if (grepl("PDF", plotopt)) {
242 plotPDF(profile.MF = profile.MF, per=per, title=title)
243 }
244 }
245 if ("BP" %in% ontoopt) {
246 write.table(profile.BP, text_output, append = TRUE, sep="\t", row.names = FALSE, quote=FALSE)
247 if (grepl("PNG", plotopt)) {
248 plotPNG(profile.BP = profile.BP, per=per, title=title)
249 }
250 if (grepl("JPEG", plotopt)) {
251 plotJPEG(profile.BP = profile.BP, per=per, title=title)
252 }
253 if (grepl("PDF", plotopt)) {
254 plotPDF(profile.BP = profile.BP, per=per, title=title)
255 }
256 }
270 } 257 }
271 258
272 goprofiles() 259 goprofiles()
273
274 #Rscript go.R ../proteinGroups_Maud.txt "1" "CC" "PDF" 2 "TRUE" "Title"