comparison goprofiles.R @ 15:601027649251 draft default tip

"planemo upload commit 6bc2c32177acd823d5025bc90a562e6e6b2a233a"
author proteore
date Tue, 06 Apr 2021 16:59:30 +0000
parents 3ddc1f78773d
children
comparison
equal deleted inserted replaced
14:45b4ede6940e 15:601027649251
1 options(warn=-1) #TURN OFF WARNINGS !!!!!! 1 options(warn = -1) #TURN OFF WARNINGS !!!!!!
2 2
3 # Load necessary libraries 3 # Load necessary libraries
4 suppressMessages(library(goProfiles,quietly = TRUE)) 4 suppressMessages(library(goProfiles, quietly = TRUE))
5 5
6 # Read file and return file content as data.frame 6 # Read file and return file content as data.frame
7 read_file <- function(path,header){ 7 read_file <- function(path, header) {
8 file <- try(read.csv(path,header=header, sep="\t",stringsAsFactors = FALSE, quote="\"", check.names = F),silent=TRUE) 8 file <- try(read.csv(path, header = header,
9 if (inherits(file,"try-error")){ 9 sep = "\t", stringsAsFactors = FALSE,
10 quote = "\"", check.names = F), silent = TRUE)
11 if (inherits(file, "try-error")) {
10 stop("File not found !") 12 stop("File not found !")
11 }else{ 13 }else{
12 return(file) 14 return(file)
13 } 15 }
14 } 16 }
15 17
16 #convert a string to boolean 18 #convert a string to boolean
17 str2bool <- function(x){ 19 str2bool <- function(x) {
18 if (any(is.element(c("t","true"),tolower(x)))){ 20 if (any(is.element(c("t", "true"), tolower(x)))) {
19 return (TRUE) 21 return(TRUE)
20 }else if (any(is.element(c("f","false"),tolower(x)))){ 22 }else if (any(is.element(c("f", "false"), tolower(x)))) {
21 return (FALSE) 23 return(FALSE)
22 }else{ 24 }else{
23 return(NULL) 25 return(NULL)
24 } 26 }
25 } 27 }
26 28
27 check_ids <- function(vector,type) { 29 check_ids <- function(vector, type) {
28 uniprot_pattern = "^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$" 30 # nolint start
29 entrez_id = "^([0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+)$" 31 uniprot_pattern <-
30 if (type == "Entrez"){ 32 "^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$"
31 return(grepl(entrez_id,vector)) 33 entrez_id <- "^([0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+)$"
34 # nolint end
35 if (type == "Entrez") {
36 return(grepl(entrez_id, vector))
32 } else if (type == "UniProt") { 37 } else if (type == "UniProt") {
33 return(grepl(uniprot_pattern,vector)) 38 return(grepl(uniprot_pattern, vector))
34 } 39 }
35 } 40 }
36 41
37 getprofile = function(ids, id_type, level, duplicate,species) { 42 getprofile <- function(ids, id_type, level, duplicate, species) {
38 #################################################################### 43 ####################################################################
39 # Arguments 44 # Arguments
40 # - ids: list of input IDs 45 # - ids: list of input IDs
41 # - id_type: type of input IDs (UniProt/ENTREZID) 46 # - id_type: type of input IDs (UniProt/ENTREZID)
42 # - level 47 # - level
43 # - duplicate: if the duplicated IDs should be removed or not (TRUE/FALSE) 48 # - duplicate: if the duplicated IDs should be removed or not (TRUE/FALSE)
44 # - species 49 # - species
45 #################################################################### 50 ####################################################################
46 51
47 library(species, character.only = TRUE, quietly = TRUE) 52 library(species, character.only = TRUE, quietly = TRUE)
48 53
49 if (species=="org.Hs.eg.db"){ 54 if (species == "org.Hs.eg.db") {
50 package=org.Hs.eg.db 55 package <- org.Hs.eg.db #nolint
51 } else if (species=="org.Mm.eg.db"){ 56 } else if (species == "org.Mm.eg.db") {
52 package=org.Mm.eg.db 57 package <- org.Mm.eg.db #nolint
53 } else if (species=="org.Rn.eg.db"){ 58 } else if (species == "org.Rn.eg.db") {
54 package=org.Rn.eg.db 59 package <- org.Rn.eg.db #nolint
55 } 60 }
56 61
57 # Check if level is number 62 # Check if level is number
58 if (! as.numeric(level) %% 1 == 0) { 63 if (! as.numeric(level) %% 1 == 0) {
59 stop("Please enter an integer for level") 64 stop("Please enter an integer for level")
60 } else { 65 } else {
61 level = as.numeric(level) 66 level <- as.numeric(level)
62 } 67 }
63 #genes = as.vector(file[,ncol]) 68
64 69
65 # Extract Gene Entrez ID 70 # Extract Gene Entrez ID
66 if (id_type == "Entrez") { 71 if (id_type == "Entrez") {
67 id = select(package, ids, "ENTREZID", multiVals = "first") 72 id <- select(package, ids, "ENTREZID", multiVals = "first") #nolint
68 } else { 73 } else {
69 id = select(package, ids, "ENTREZID", "UNIPROT", multiVals = "first") 74 id <- select(package, ids, "ENTREZID", "UNIPROT", multiVals = "first") #nolint
70 } 75 }
71 if (duplicate) { id = unique(id) } 76 if (duplicate) {
72 genes_ids = id$ENTREZID[which( ! is.na(id$ENTREZID))] 77 id <- unique(id)
73 NAs = id$UNIPROT[which(is.na(id$ENTREZID))] # IDs that have NA ENTREZID 78 }
74 79 genes_ids <- id$ENTREZID[which(!is.na(id$ENTREZID))]
80 nas <- id$UNIPROT[which(is.na(id$ENTREZID))] # IDs that have NA ENTREZID #nolint
81
75 # Create basic profiles 82 # Create basic profiles
76 profile.CC = basicProfile(genes_ids, onto='CC', level=level, orgPackage=species, empty.cats=F, ord=T, na.rm=T) 83 profile.CC <-
77 profile.BP = basicProfile(genes_ids, onto='BP', level=level, orgPackage=species, empty.cats=F, ord=T, na.rm=T) 84 basicProfile(genes_ids, onto = "CC", level = level,
78 profile.MF = basicProfile(genes_ids, onto='MF', level=level, orgPackage=species, empty.cats=F, ord=T, na.rm=T) 85 orgPackage = species, empty.cats = F, ord = T, na.rm = T)
79 profile.ALL = basicProfile(genes_ids, onto='ANY', level=level, orgPackage=species, empty.cats=F, ord=T, na.rm=T) 86 profile.BP <-
80 # Print profile 87 basicProfile(genes_ids, onto = "BP", level = level,
81 # printProfiles(profile) 88 orgPackage = species, empty.cats = F, ord = T, na.rm = T)
82 89 profile.MF <-
90 basicProfile(genes_ids, onto = "MF", level = level,
91 orgPackage = species, empty.cats = F, ord = T, na.rm = T)
92 profile.ALL <-
93 basicProfile(genes_ids, onto = "ANY", level = level,
94 orgPackage = species, empty.cats = F, ord = T, na.rm = T)
95
83 return(c(profile.CC, profile.MF, profile.BP, profile.ALL)) 96 return(c(profile.CC, profile.MF, profile.BP, profile.ALL))
84 } 97 }
85 98
86 #return height and width of plot in inches from profile 99 #return height and width of plot in inches from profile
87 plot_size_from_nb_onto <- function(profile){ 100 plot_size_from_nb_onto <- function(profile) {
88 width=10 101 width <- 10
89 range = seq(25, 2000, by=25) 102 range <- seq(25, 2000, by = 25)
90 names(range) = seq(5,242, by=3) 103 names(range) <- seq(5, 242, by = 3)
91 nb_onto = round(nrow(profile[[1]])/25)*25 104 nb_onto <- round(nrow(profile[[1]]) / 25) * 25
92 if (nb_onto < 25) {nb_onto = 25} 105 if (nb_onto < 25) {
93 if (nb_onto <= 2000) { 106 nb_onto <- 25
94 height= as.integer(names(which(range==nb_onto))) 107 }
95 } else { 108 if (nb_onto <= 2000) {
96 height=250 109 height <- as.integer(names(which(range == nb_onto)))
97 } 110 } else {
98 return (c(width,height)) 111 height <- 250
99 } 112 }
100 113 return(c(width, height))
101 make_plot <- function(profile,percent,title,onto,plot_opt){ 114 }
102 115
103 tmp <- plot_size_from_nb_onto (profile) 116 make_plot <- function(profile, percent, title, onto, plot_opt) {
117
118 tmp <- plot_size_from_nb_onto(profile)
104 width <- tmp[1] 119 width <- tmp[1]
105 height <- tmp[2] 120 height <- tmp[2]
106 121
107 if (plot_opt == "PDF") { 122 if (plot_opt == "PDF") {
108 file_name=paste("profile_",onto,".pdf",collapse="",sep="") 123 file_name <- paste("profile_", onto, ".pdf", collapse = "", sep = "")
109 pdf(file_name, width=width, heigh=height) 124 pdf(file_name, width = width, height = height)
110 } else if (plot_opt == "JPEG"){ 125 } else if (plot_opt == "JPEG") {
111 file_name=paste("profile_",onto,".jpeg",collapse="",sep="") 126 file_name <- paste("profile_", onto, ".jpeg", collapse = "", sep = "")
112 jpeg(file_name,width=width, height=height, units = "in", res=100) 127 jpeg(file_name, width = width, height = height, units = "in", res = 100)
113 } else if (plot_opt == "PNG"){ 128 } else if (plot_opt == "PNG") {
114 file_name=paste("profile_",onto,".png",collapse="",sep="") 129 file_name <- paste("profile_", onto, ".png", collapse = "", sep = "")
115 png(file_name,width=width, height=height, units = "in", res=100) 130 png(file_name, width = width, height = height, units = "in", res = 100)
116 } 131 }
117 plotProfiles(profile, percentage=percent, multiplePlots=FALSE, aTitle=title) 132 plotProfiles(profile, percentage = percent,
133 multiplePlots = FALSE, aTitle = title)
118 dev.off() 134 dev.off()
119 } 135 }
120 136
121 goprofiles = function() { 137 goprofiles <- function() {
122 args <- commandArgs(TRUE) 138 args <- commandArgs(TRUE)
123 if(length(args)<1) { 139 if (length(args) < 1) {
124 args <- c("--help") 140 args <- c("--help")
125 } 141 }
126 142
127 # Help section 143 # Help section
128 if("--help" %in% args) { 144 if ("--help" %in% args) {
129 cat("Selection and Annotation HPA 145 cat("Selection and Annotation HPA
130 Arguments: 146 Arguments:
131 --input_type: type of input (list of id or filename) 147 --input_type: type of input (list of id or filename)
132 --input: input 148 --input: input
133 --ncol: the column number which you would like to apply... 149 --ncol: the column number which you would like to apply...
139 --per 155 --per
140 --title: title of the plot 156 --title: title of the plot
141 --duplicate: remove dupliate input IDs (true/false) 157 --duplicate: remove dupliate input IDs (true/false)
142 --text_output: text output filename \n 158 --text_output: text output filename \n
143 --species") 159 --species")
144 q(save="no") 160 q(save = "no")
145 } 161 }
146 162
147 # Parse arguments 163 # Parse arguments
148 parseArgs <- function(x) strsplit(sub("^--", "", x), "=") 164 parse_args <- function(x) strsplit(sub("^--", "", x), "=")
149 argsDF <- as.data.frame(do.call("rbind", parseArgs(args))) 165 args_df <- as.data.frame(do.call("rbind", parse_args(args)))
150 args <- as.list(as.character(argsDF$V2)) 166 args <- as.list(as.character(args_df$V2))
151 names(args) <- argsDF$V1 167 names(args) <- args_df$V1
152 168
153 #save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/goprofiles/args.Rda") 169
154 #load("/home/dchristiany/proteore_project/ProteoRE/tools/goprofiles/args.Rda") 170 id_type <- args$id_type
155 171 input_type <- args$input_type
156 id_type = args$id_type
157 input_type = args$input_type
158 if (input_type == "text") { 172 if (input_type == "text") {
159 input = unlist(strsplit(strsplit(args$input, "[ \t\n]+")[[1]],";")) 173 input <- unlist(strsplit(strsplit(args$input, "[ \t\n]+")[[1]], ";"))
160 } else if (input_type == "file") { 174 } else if (input_type == "file") {
161 filename = args$input 175 filename <- args$input
162 ncol = args$ncol 176 ncol <- args$ncol
163 # Check ncol 177 # Check ncol
164 if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) { 178 if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) {
165 stop("Please enter an integer for level") 179 stop("Please enter an integer for level")
166 } else { 180 } else {
167 ncol = as.numeric(gsub("c", "", ncol)) 181 ncol <- as.numeric(gsub("c", "", ncol))
168 } 182 }
169 header = str2bool(args$header) 183 header <- str2bool(args$header)
170 # Get file content 184 # Get file content
171 file = read_file(filename, header) 185 file <- read_file(filename, header)
172 # Extract Protein IDs list 186 # Extract Protein IDs list
173 input = unlist(strsplit(as.character(file[,ncol]),";")) 187 input <- unlist(strsplit(as.character(file[, ncol]), ";"))
174 } 188 }
175 input = input [which(!is.na(gsub("NA",NA,input)))] 189 input <- input [which(!is.na(gsub("NA", NA, input)))]
176 190
177 if (! any(check_ids(input,id_type))){ 191 if (! any(check_ids(input, id_type))) {
178 stop(paste(id_type,"not found in your ids list, please check your IDs in input or the selected column of your input file")) 192 stop(paste(id_type,
179 } 193 "not found in your ids list, please check your IDs in input
180 194 or the selected column of your input file"))
181 ontoopt = strsplit(args$onto_opt, ",")[[1]] 195 }
182 onto_pos = as.integer(gsub("BP",3,gsub("MF",2,gsub("CC",1,ontoopt)))) 196
183 plotopt = args$plot_opt 197 ontoopt <- strsplit(args$onto_opt, ",")[[1]]
184 level = args$level 198 onto_pos <- as.integer(gsub("BP", 3, gsub("MF", 2, gsub("CC", 1, ontoopt))))
185 per = as.logical(args$per) 199 plotopt <- args$plot_opt
186 title = args$title 200 level <- args$level
187 duplicate = str2bool(args$duplicate) 201 per <- as.logical(args$per)
188 text_output = args$text_output 202 title <- args$title
189 species=args$species 203 duplicate <- str2bool(args$duplicate)
190 204 text_output <- args$text_output
191 profiles = getprofile(input, id_type, level, duplicate,species) 205 species <- args$species
192 206
207 profiles <- getprofile(input, id_type, level, duplicate, species)
208
193 for (index in onto_pos) { 209 for (index in onto_pos) {
194 onto = names(profiles[index]) 210 onto <- names(profiles[index])
195 profile=profiles[index] 211 profile <- profiles[index]
196 make_plot(profile,per,title,onto,plotopt) 212 make_plot(profile, per, title, onto, plotopt)
197 text_output=paste("goProfiles_",onto,"_",title,".tsv",sep="",collapse="") 213 text_output <- paste("goProfiles_", onto, "_",
198 profile = as.data.frame(profile) 214 title, ".tsv", sep = "", collapse = "")
199 profile <- as.data.frame(apply(profile, c(1,2), function(x) gsub("^$|^ $", NA, x))) #convert "" and " " to NA 215 profile <- as.data.frame(profile)
200 write.table(profile, text_output, sep="\t", row.names = FALSE, quote=FALSE, col.names = T) 216 profile <- as.data.frame(apply(profile, c(1, 2),
217 function(x) gsub("^$|^ $", NA, x))) #convert "" and " " to NA
218 write.table(profile, text_output, sep = "\t",
219 row.names = FALSE, quote = FALSE, col.names = T)
201 } 220 }
202 } 221 }
203 222
204 goprofiles() 223 goprofiles()