Mercurial > repos > proteore > proteore_goprofiles
comparison goprofiles.R @ 15:601027649251 draft default tip
"planemo upload commit 6bc2c32177acd823d5025bc90a562e6e6b2a233a"
author | proteore |
---|---|
date | Tue, 06 Apr 2021 16:59:30 +0000 |
parents | 3ddc1f78773d |
children |
comparison
equal
deleted
inserted
replaced
14:45b4ede6940e | 15:601027649251 |
---|---|
1 options(warn=-1) #TURN OFF WARNINGS !!!!!! | 1 options(warn = -1) #TURN OFF WARNINGS !!!!!! |
2 | 2 |
3 # Load necessary libraries | 3 # Load necessary libraries |
4 suppressMessages(library(goProfiles,quietly = TRUE)) | 4 suppressMessages(library(goProfiles, quietly = TRUE)) |
5 | 5 |
6 # Read file and return file content as data.frame | 6 # Read file and return file content as data.frame |
7 read_file <- function(path,header){ | 7 read_file <- function(path, header) { |
8 file <- try(read.csv(path,header=header, sep="\t",stringsAsFactors = FALSE, quote="\"", check.names = F),silent=TRUE) | 8 file <- try(read.csv(path, header = header, |
9 if (inherits(file,"try-error")){ | 9 sep = "\t", stringsAsFactors = FALSE, |
10 quote = "\"", check.names = F), silent = TRUE) | |
11 if (inherits(file, "try-error")) { | |
10 stop("File not found !") | 12 stop("File not found !") |
11 }else{ | 13 }else{ |
12 return(file) | 14 return(file) |
13 } | 15 } |
14 } | 16 } |
15 | 17 |
16 #convert a string to boolean | 18 #convert a string to boolean |
17 str2bool <- function(x){ | 19 str2bool <- function(x) { |
18 if (any(is.element(c("t","true"),tolower(x)))){ | 20 if (any(is.element(c("t", "true"), tolower(x)))) { |
19 return (TRUE) | 21 return(TRUE) |
20 }else if (any(is.element(c("f","false"),tolower(x)))){ | 22 }else if (any(is.element(c("f", "false"), tolower(x)))) { |
21 return (FALSE) | 23 return(FALSE) |
22 }else{ | 24 }else{ |
23 return(NULL) | 25 return(NULL) |
24 } | 26 } |
25 } | 27 } |
26 | 28 |
27 check_ids <- function(vector,type) { | 29 check_ids <- function(vector, type) { |
28 uniprot_pattern = "^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$" | 30 # nolint start |
29 entrez_id = "^([0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+)$" | 31 uniprot_pattern <- |
30 if (type == "Entrez"){ | 32 "^([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})$" |
31 return(grepl(entrez_id,vector)) | 33 entrez_id <- "^([0-9]+|[A-Z]{1,2}_[0-9]+|[A-Z]{1,2}_[A-Z]{1,4}[0-9]+)$" |
34 # nolint end | |
35 if (type == "Entrez") { | |
36 return(grepl(entrez_id, vector)) | |
32 } else if (type == "UniProt") { | 37 } else if (type == "UniProt") { |
33 return(grepl(uniprot_pattern,vector)) | 38 return(grepl(uniprot_pattern, vector)) |
34 } | 39 } |
35 } | 40 } |
36 | 41 |
37 getprofile = function(ids, id_type, level, duplicate,species) { | 42 getprofile <- function(ids, id_type, level, duplicate, species) { |
38 #################################################################### | 43 #################################################################### |
39 # Arguments | 44 # Arguments |
40 # - ids: list of input IDs | 45 # - ids: list of input IDs |
41 # - id_type: type of input IDs (UniProt/ENTREZID) | 46 # - id_type: type of input IDs (UniProt/ENTREZID) |
42 # - level | 47 # - level |
43 # - duplicate: if the duplicated IDs should be removed or not (TRUE/FALSE) | 48 # - duplicate: if the duplicated IDs should be removed or not (TRUE/FALSE) |
44 # - species | 49 # - species |
45 #################################################################### | 50 #################################################################### |
46 | 51 |
47 library(species, character.only = TRUE, quietly = TRUE) | 52 library(species, character.only = TRUE, quietly = TRUE) |
48 | 53 |
49 if (species=="org.Hs.eg.db"){ | 54 if (species == "org.Hs.eg.db") { |
50 package=org.Hs.eg.db | 55 package <- org.Hs.eg.db #nolint |
51 } else if (species=="org.Mm.eg.db"){ | 56 } else if (species == "org.Mm.eg.db") { |
52 package=org.Mm.eg.db | 57 package <- org.Mm.eg.db #nolint |
53 } else if (species=="org.Rn.eg.db"){ | 58 } else if (species == "org.Rn.eg.db") { |
54 package=org.Rn.eg.db | 59 package <- org.Rn.eg.db #nolint |
55 } | 60 } |
56 | 61 |
57 # Check if level is number | 62 # Check if level is number |
58 if (! as.numeric(level) %% 1 == 0) { | 63 if (! as.numeric(level) %% 1 == 0) { |
59 stop("Please enter an integer for level") | 64 stop("Please enter an integer for level") |
60 } else { | 65 } else { |
61 level = as.numeric(level) | 66 level <- as.numeric(level) |
62 } | 67 } |
63 #genes = as.vector(file[,ncol]) | 68 |
64 | 69 |
65 # Extract Gene Entrez ID | 70 # Extract Gene Entrez ID |
66 if (id_type == "Entrez") { | 71 if (id_type == "Entrez") { |
67 id = select(package, ids, "ENTREZID", multiVals = "first") | 72 id <- select(package, ids, "ENTREZID", multiVals = "first") #nolint |
68 } else { | 73 } else { |
69 id = select(package, ids, "ENTREZID", "UNIPROT", multiVals = "first") | 74 id <- select(package, ids, "ENTREZID", "UNIPROT", multiVals = "first") #nolint |
70 } | 75 } |
71 if (duplicate) { id = unique(id) } | 76 if (duplicate) { |
72 genes_ids = id$ENTREZID[which( ! is.na(id$ENTREZID))] | 77 id <- unique(id) |
73 NAs = id$UNIPROT[which(is.na(id$ENTREZID))] # IDs that have NA ENTREZID | 78 } |
74 | 79 genes_ids <- id$ENTREZID[which(!is.na(id$ENTREZID))] |
80 nas <- id$UNIPROT[which(is.na(id$ENTREZID))] # IDs that have NA ENTREZID #nolint | |
81 | |
75 # Create basic profiles | 82 # Create basic profiles |
76 profile.CC = basicProfile(genes_ids, onto='CC', level=level, orgPackage=species, empty.cats=F, ord=T, na.rm=T) | 83 profile.CC <- |
77 profile.BP = basicProfile(genes_ids, onto='BP', level=level, orgPackage=species, empty.cats=F, ord=T, na.rm=T) | 84 basicProfile(genes_ids, onto = "CC", level = level, |
78 profile.MF = basicProfile(genes_ids, onto='MF', level=level, orgPackage=species, empty.cats=F, ord=T, na.rm=T) | 85 orgPackage = species, empty.cats = F, ord = T, na.rm = T) |
79 profile.ALL = basicProfile(genes_ids, onto='ANY', level=level, orgPackage=species, empty.cats=F, ord=T, na.rm=T) | 86 profile.BP <- |
80 # Print profile | 87 basicProfile(genes_ids, onto = "BP", level = level, |
81 # printProfiles(profile) | 88 orgPackage = species, empty.cats = F, ord = T, na.rm = T) |
82 | 89 profile.MF <- |
90 basicProfile(genes_ids, onto = "MF", level = level, | |
91 orgPackage = species, empty.cats = F, ord = T, na.rm = T) | |
92 profile.ALL <- | |
93 basicProfile(genes_ids, onto = "ANY", level = level, | |
94 orgPackage = species, empty.cats = F, ord = T, na.rm = T) | |
95 | |
83 return(c(profile.CC, profile.MF, profile.BP, profile.ALL)) | 96 return(c(profile.CC, profile.MF, profile.BP, profile.ALL)) |
84 } | 97 } |
85 | 98 |
86 #return height and width of plot in inches from profile | 99 #return height and width of plot in inches from profile |
87 plot_size_from_nb_onto <- function(profile){ | 100 plot_size_from_nb_onto <- function(profile) { |
88 width=10 | 101 width <- 10 |
89 range = seq(25, 2000, by=25) | 102 range <- seq(25, 2000, by = 25) |
90 names(range) = seq(5,242, by=3) | 103 names(range) <- seq(5, 242, by = 3) |
91 nb_onto = round(nrow(profile[[1]])/25)*25 | 104 nb_onto <- round(nrow(profile[[1]]) / 25) * 25 |
92 if (nb_onto < 25) {nb_onto = 25} | 105 if (nb_onto < 25) { |
93 if (nb_onto <= 2000) { | 106 nb_onto <- 25 |
94 height= as.integer(names(which(range==nb_onto))) | 107 } |
95 } else { | 108 if (nb_onto <= 2000) { |
96 height=250 | 109 height <- as.integer(names(which(range == nb_onto))) |
97 } | 110 } else { |
98 return (c(width,height)) | 111 height <- 250 |
99 } | 112 } |
100 | 113 return(c(width, height)) |
101 make_plot <- function(profile,percent,title,onto,plot_opt){ | 114 } |
102 | 115 |
103 tmp <- plot_size_from_nb_onto (profile) | 116 make_plot <- function(profile, percent, title, onto, plot_opt) { |
117 | |
118 tmp <- plot_size_from_nb_onto(profile) | |
104 width <- tmp[1] | 119 width <- tmp[1] |
105 height <- tmp[2] | 120 height <- tmp[2] |
106 | 121 |
107 if (plot_opt == "PDF") { | 122 if (plot_opt == "PDF") { |
108 file_name=paste("profile_",onto,".pdf",collapse="",sep="") | 123 file_name <- paste("profile_", onto, ".pdf", collapse = "", sep = "") |
109 pdf(file_name, width=width, heigh=height) | 124 pdf(file_name, width = width, height = height) |
110 } else if (plot_opt == "JPEG"){ | 125 } else if (plot_opt == "JPEG") { |
111 file_name=paste("profile_",onto,".jpeg",collapse="",sep="") | 126 file_name <- paste("profile_", onto, ".jpeg", collapse = "", sep = "") |
112 jpeg(file_name,width=width, height=height, units = "in", res=100) | 127 jpeg(file_name, width = width, height = height, units = "in", res = 100) |
113 } else if (plot_opt == "PNG"){ | 128 } else if (plot_opt == "PNG") { |
114 file_name=paste("profile_",onto,".png",collapse="",sep="") | 129 file_name <- paste("profile_", onto, ".png", collapse = "", sep = "") |
115 png(file_name,width=width, height=height, units = "in", res=100) | 130 png(file_name, width = width, height = height, units = "in", res = 100) |
116 } | 131 } |
117 plotProfiles(profile, percentage=percent, multiplePlots=FALSE, aTitle=title) | 132 plotProfiles(profile, percentage = percent, |
133 multiplePlots = FALSE, aTitle = title) | |
118 dev.off() | 134 dev.off() |
119 } | 135 } |
120 | 136 |
121 goprofiles = function() { | 137 goprofiles <- function() { |
122 args <- commandArgs(TRUE) | 138 args <- commandArgs(TRUE) |
123 if(length(args)<1) { | 139 if (length(args) < 1) { |
124 args <- c("--help") | 140 args <- c("--help") |
125 } | 141 } |
126 | 142 |
127 # Help section | 143 # Help section |
128 if("--help" %in% args) { | 144 if ("--help" %in% args) { |
129 cat("Selection and Annotation HPA | 145 cat("Selection and Annotation HPA |
130 Arguments: | 146 Arguments: |
131 --input_type: type of input (list of id or filename) | 147 --input_type: type of input (list of id or filename) |
132 --input: input | 148 --input: input |
133 --ncol: the column number which you would like to apply... | 149 --ncol: the column number which you would like to apply... |
139 --per | 155 --per |
140 --title: title of the plot | 156 --title: title of the plot |
141 --duplicate: remove dupliate input IDs (true/false) | 157 --duplicate: remove dupliate input IDs (true/false) |
142 --text_output: text output filename \n | 158 --text_output: text output filename \n |
143 --species") | 159 --species") |
144 q(save="no") | 160 q(save = "no") |
145 } | 161 } |
146 | 162 |
147 # Parse arguments | 163 # Parse arguments |
148 parseArgs <- function(x) strsplit(sub("^--", "", x), "=") | 164 parse_args <- function(x) strsplit(sub("^--", "", x), "=") |
149 argsDF <- as.data.frame(do.call("rbind", parseArgs(args))) | 165 args_df <- as.data.frame(do.call("rbind", parse_args(args))) |
150 args <- as.list(as.character(argsDF$V2)) | 166 args <- as.list(as.character(args_df$V2)) |
151 names(args) <- argsDF$V1 | 167 names(args) <- args_df$V1 |
152 | 168 |
153 #save(args,file="/home/dchristiany/proteore_project/ProteoRE/tools/goprofiles/args.Rda") | 169 |
154 #load("/home/dchristiany/proteore_project/ProteoRE/tools/goprofiles/args.Rda") | 170 id_type <- args$id_type |
155 | 171 input_type <- args$input_type |
156 id_type = args$id_type | |
157 input_type = args$input_type | |
158 if (input_type == "text") { | 172 if (input_type == "text") { |
159 input = unlist(strsplit(strsplit(args$input, "[ \t\n]+")[[1]],";")) | 173 input <- unlist(strsplit(strsplit(args$input, "[ \t\n]+")[[1]], ";")) |
160 } else if (input_type == "file") { | 174 } else if (input_type == "file") { |
161 filename = args$input | 175 filename <- args$input |
162 ncol = args$ncol | 176 ncol <- args$ncol |
163 # Check ncol | 177 # Check ncol |
164 if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) { | 178 if (! as.numeric(gsub("c", "", ncol)) %% 1 == 0) { |
165 stop("Please enter an integer for level") | 179 stop("Please enter an integer for level") |
166 } else { | 180 } else { |
167 ncol = as.numeric(gsub("c", "", ncol)) | 181 ncol <- as.numeric(gsub("c", "", ncol)) |
168 } | 182 } |
169 header = str2bool(args$header) | 183 header <- str2bool(args$header) |
170 # Get file content | 184 # Get file content |
171 file = read_file(filename, header) | 185 file <- read_file(filename, header) |
172 # Extract Protein IDs list | 186 # Extract Protein IDs list |
173 input = unlist(strsplit(as.character(file[,ncol]),";")) | 187 input <- unlist(strsplit(as.character(file[, ncol]), ";")) |
174 } | 188 } |
175 input = input [which(!is.na(gsub("NA",NA,input)))] | 189 input <- input [which(!is.na(gsub("NA", NA, input)))] |
176 | 190 |
177 if (! any(check_ids(input,id_type))){ | 191 if (! any(check_ids(input, id_type))) { |
178 stop(paste(id_type,"not found in your ids list, please check your IDs in input or the selected column of your input file")) | 192 stop(paste(id_type, |
179 } | 193 "not found in your ids list, please check your IDs in input |
180 | 194 or the selected column of your input file")) |
181 ontoopt = strsplit(args$onto_opt, ",")[[1]] | 195 } |
182 onto_pos = as.integer(gsub("BP",3,gsub("MF",2,gsub("CC",1,ontoopt)))) | 196 |
183 plotopt = args$plot_opt | 197 ontoopt <- strsplit(args$onto_opt, ",")[[1]] |
184 level = args$level | 198 onto_pos <- as.integer(gsub("BP", 3, gsub("MF", 2, gsub("CC", 1, ontoopt)))) |
185 per = as.logical(args$per) | 199 plotopt <- args$plot_opt |
186 title = args$title | 200 level <- args$level |
187 duplicate = str2bool(args$duplicate) | 201 per <- as.logical(args$per) |
188 text_output = args$text_output | 202 title <- args$title |
189 species=args$species | 203 duplicate <- str2bool(args$duplicate) |
190 | 204 text_output <- args$text_output |
191 profiles = getprofile(input, id_type, level, duplicate,species) | 205 species <- args$species |
192 | 206 |
207 profiles <- getprofile(input, id_type, level, duplicate, species) | |
208 | |
193 for (index in onto_pos) { | 209 for (index in onto_pos) { |
194 onto = names(profiles[index]) | 210 onto <- names(profiles[index]) |
195 profile=profiles[index] | 211 profile <- profiles[index] |
196 make_plot(profile,per,title,onto,plotopt) | 212 make_plot(profile, per, title, onto, plotopt) |
197 text_output=paste("goProfiles_",onto,"_",title,".tsv",sep="",collapse="") | 213 text_output <- paste("goProfiles_", onto, "_", |
198 profile = as.data.frame(profile) | 214 title, ".tsv", sep = "", collapse = "") |
199 profile <- as.data.frame(apply(profile, c(1,2), function(x) gsub("^$|^ $", NA, x))) #convert "" and " " to NA | 215 profile <- as.data.frame(profile) |
200 write.table(profile, text_output, sep="\t", row.names = FALSE, quote=FALSE, col.names = T) | 216 profile <- as.data.frame(apply(profile, c(1, 2), |
217 function(x) gsub("^$|^ $", NA, x))) #convert "" and " " to NA | |
218 write.table(profile, text_output, sep = "\t", | |
219 row.names = FALSE, quote = FALSE, col.names = T) | |
201 } | 220 } |
202 } | 221 } |
203 | 222 |
204 goprofiles() | 223 goprofiles() |