Mercurial > repos > galaxyp > pmd_fdr
comparison PMD_FDR_package_for_Galaxy.R @ 0:5cc0c32d05a2 draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/pmd_fdr commit 00f85eca73cd8afedfefbeec94a4462455ac1a9a"
author | galaxyp |
---|---|
date | Mon, 07 Oct 2019 11:59:37 -0400 |
parents | |
children | 460edeedeb7d |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:5cc0c32d05a2 |
---|---|
1 ############################################################################### | |
2 # PMD_FDR_package_for_Galaxy.R # | |
3 # # | |
4 # Project 021 - PMD-FDR for Galaxy-P # | |
5 # # | |
6 # Description: Computes iFDR and gFDR on PSMs as a script designed for Galaxy # | |
7 # Note that plotting code has been left in that is not used # | |
8 # in this file; this is the code I used to create figures for # | |
9 # publication. I left it in for potential development of views. # | |
10 # # | |
11 # This file was created by concatenating the following files: # | |
12 # # | |
13 # A - 005 - Parser - ArgParser.R # | |
14 # B - 019 - PMD-FDR - functions.R # | |
15 # C - 021 - PMD-FDR Wrapper - functions.R # | |
16 # D - 021 - PMD-FDR Main.R # | |
17 # # | |
18 # Required packages: argparser # | |
19 # stringr # | |
20 # RUnit # | |
21 # # | |
22 # Release date: 2019-10-05 # | |
23 # Version: 1.4 # | |
24 # # | |
25 ############################################################################### | |
26 # Package currently supports the following parameters: | |
27 # | |
28 # --psm_report full name and path to the PSM report | |
29 # --psm_report_1_percent full name and path to the PSM report for 1% FDR | |
30 # --output_i_fdr full name and path to the i-FDR output file | |
31 # --output_g_fdr full name and path to the g-FDR output file | |
32 # --output_densities full name and path to the densities output file | |
33 # | |
34 ############################################################################### | |
35 # A - 005 - Parser - ArgParser.R # | |
36 # # | |
37 # Description: Wrapper for argparser package, using RefClass # | |
38 # # | |
39 ############################################################################### | |
40 | |
41 #install.packages("argparser") | |
42 library(argparser) | |
43 | |
44 # Class definition | |
45 | |
46 ArgParser <- setRefClass("ArgParser", | |
47 fields = c("parser")) | |
48 ArgParser$methods( | |
49 initialize = function(...){ | |
50 parser <<- arg_parser(...) | |
51 }, | |
52 local_add_argument = function(...){ | |
53 parser <<- add_argument(parser, ...) | |
54 }, | |
55 parse_arguments = function(...){ | |
56 result = parse_args(parser, ...) | |
57 return(result) | |
58 } | |
59 ) | |
60 | |
61 ############################################################################### | |
62 # B - 019 - PMD-FDR - functions.R # | |
63 # # | |
64 # Primary work-horse for PMD-FDR # | |
65 # # | |
66 ############################################################################### | |
67 ############################################################################### | |
68 ####### Load libraries etc. | |
69 ############################################################################### | |
70 library(stringr) | |
71 library(RUnit) | |
72 | |
73 ############################################################# | |
74 ####### Global values (should be parameters to module but aren't yet) | |
75 ############################################################# | |
76 | |
77 MIN_GOOD_PEPTIDE_LENGTH <- 11 | |
78 MIN_ACCEPTABLE_POINTS_IN_DENSITY <- 10 | |
79 | |
80 ############################################################# | |
81 ####### General purpose functions | |
82 ############################################################# | |
83 # Creates a more useful error report when file is not reasonable | |
84 safe_file_exists <- function(file_path){ # Still not particularly useful in cases where it is a valid directory | |
85 tryCatch( | |
86 return(file_test(op = "-f", x=file_path)), | |
87 error=function(e) {simpleError(sprintf("file path is not valid: '%s'", file_path))} | |
88 ) | |
89 } | |
90 # My standard way of loading data into data.frames | |
91 load_standard_df <- function(file_path=NULL){ | |
92 clean_field_names = function(field_names){ | |
93 result <- field_names | |
94 idx_blank <- which(result == "") | |
95 result[idx_blank] <- sprintf("<Field %d>", idx_blank) | |
96 return(result) | |
97 } | |
98 if (safe_file_exists(file_path)){ | |
99 field_names <- read_field_names(file_path, sep = "\t") | |
100 field_names <- clean_field_names(field_names) | |
101 | |
102 if (length(field_names) == 0){ | |
103 return(data.frame()) | |
104 } | |
105 data <- read.table(file = file_path, header = TRUE, sep = "\t", stringsAsFactors = FALSE, blank.lines.skip = TRUE)#, check.names = FALSE) | |
106 colnames(data) = field_names | |
107 } else { | |
108 stop(sprintf("File path does not exist: '%s'", file_path)) | |
109 } | |
110 return(data) | |
111 } | |
112 save_standard_df <- function(x=NULL, file_path=NULL){ | |
113 if (file_path != ""){ | |
114 write.table(x = x, file = file_path, quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE) | |
115 } | |
116 } | |
117 rename_column <- function(df=NULL, name_before=NULL, name_after=NULL, suppressWarnings=FALSE){ | |
118 if (is.null(df)){ | |
119 stop("Dataframe (df) does not exist - unable to rename column") | |
120 } | |
121 if (name_before %in% colnames(df)){ | |
122 df[,name_after] <- df[,name_before] | |
123 df[,name_before] <- NULL | |
124 } else if (!suppressWarnings){ | |
125 warning(sprintf("'%s' is not a field in the data frame and so has not been renamed", name_before)) | |
126 } | |
127 return(df) | |
128 } | |
129 rename_columns <- function(df=NULL, names_before=NULL, names_after=NULL){ | |
130 for (i in safe_iterator(length(names_before))){ | |
131 df <- rename_column(df, names_before[i], names_after[i]) | |
132 } | |
133 return(df) | |
134 } | |
135 round_to_tolerance <- function(x=NULL, tolerance=NULL, ...){ | |
136 return(function_to_tolerance(x=x, tolerance=tolerance, FUN=round, ...)) | |
137 } | |
138 function_to_tolerance <- function(x=NULL, tolerance=NULL, FUN=NULL, ...){ | |
139 return(FUN(x/tolerance, ...) * tolerance) | |
140 } | |
141 safe_median <- function(x) median(x, na.rm=TRUE) | |
142 normalize_density <- function(d){ | |
143 # Normalizes y-values in density function | |
144 # so that the integral under the curve is 1 | |
145 # (uses rectangles to approximate area) | |
146 delta_x <- diff(range(d$x)) / length(d$x) | |
147 unnormalized_integral <- delta_x * sum(d$y) | |
148 new_d <- d | |
149 new_d$y <- with(new_d, y ) | |
150 | |
151 return(new_d) | |
152 } | |
153 if_null <- function(cond=NULL, null_result=NULL, not_null_result=NULL){ | |
154 return(switch(1+is.null(cond), | |
155 not_null_result, | |
156 null_result)) | |
157 } | |
158 rainbow_with_fixed_intensity <- function(n=NULL, goal_intensity_0_1=NULL, alpha=NULL){ | |
159 goal_intensity <- 255*goal_intensity_0_1 | |
160 hex_colors <- rainbow(n) | |
161 rgb_colors <- col2rgb(hex_colors) | |
162 df_colors <- data.frame(t(rgb_colors)) | |
163 df_colors$intensity <- with(df_colors, 0.2989*red + 0.5870*green + 0.1140*blue) | |
164 | |
165 df_colors$white_black <- with(df_colors, ifelse(intensity < goal_intensity, 255, 0)) | |
166 df_colors$mix_level <- with(df_colors, (white_black - goal_intensity) / (white_black - intensity ) ) | |
167 df_colors$new_red <- with(df_colors, mix_level*red + (1-mix_level)*white_black) | |
168 df_colors$new_green <- with(df_colors, mix_level*green + (1-mix_level)*white_black) | |
169 df_colors$new_blue <- with(df_colors, mix_level*blue + (1-mix_level)*white_black) | |
170 names_pref_new <- c("new_red", "new_green", "new_blue") | |
171 names_no_pref <- c("red", "green", "blue") | |
172 df_colors <- df_colors[,names_pref_new] | |
173 df_colors <- rename_columns(df_colors, names_before = names_pref_new, names_after = names_no_pref) | |
174 rgb_colors <-as.matrix(df_colors/255 ) | |
175 | |
176 return(rgb(rgb_colors, alpha=alpha)) | |
177 } | |
178 safe_iterator <- function(n_steps = NULL){ | |
179 if (n_steps < 1){ | |
180 result = numeric(0) | |
181 } else { | |
182 result = 1:n_steps | |
183 } | |
184 return(result) | |
185 } | |
186 col2hex <- function(cols=NULL, col_alpha=255){ | |
187 if (all(col_alpha<=1)){ | |
188 col_alpha <- round(col_alpha*255) | |
189 } | |
190 col_matrix <- t(col2rgb(cols)) | |
191 results <- rgb(col_matrix, alpha=col_alpha, maxColorValue = 255) | |
192 return(results) | |
193 } | |
194 credible_interval <- function(x=NULL, N=NULL, precision=0.001, alpha=0.05){ | |
195 # Approximates "highest posterior density interval" | |
196 # Uses exact binomial but with a finite list of potential values (1/precision) | |
197 | |
198 p <- seq(from=0, to=1, by=precision) | |
199 d <- dbinom(x = x, size = N, prob = p) | |
200 d <- d / sum(d) | |
201 df <- data.frame(p=p, d=d) | |
202 df <- df[order(-df$d),] | |
203 df$cumsum <- cumsum(df$d) | |
204 max_idx <- sum(df$cumsum < (1-alpha)) + 1 | |
205 max_idx <- min(max_idx, nrow(df)) | |
206 | |
207 lower <- min(df$p[1:max_idx]) | |
208 upper <- max(df$p[1:max_idx]) | |
209 | |
210 return(c(lower,upper)) | |
211 } | |
212 verified_element_of_list <- function(parent_list=NULL, element_name=NULL, object_name=NULL){ | |
213 if (is.null(parent_list[[element_name]])){ | |
214 if (is.null(object_name)){ | |
215 object_name = "the list" | |
216 } | |
217 stop(sprintf("Element '%s' does not yet exist in %s", element_name, object_name)) | |
218 } | |
219 return(parent_list[[element_name]]) | |
220 } | |
221 read_field_names = function(file_path=NULL, sep = "\t"){ | |
222 con = file(file_path,"r") | |
223 fields = readLines(con, n=1) | |
224 close(con) | |
225 | |
226 if (length(fields) == 0){ | |
227 return(c()) | |
228 } | |
229 fields = strsplit(x = fields, split = sep)[[1]] | |
230 return(fields) | |
231 } | |
232 check_field_name = function(input_df = NULL, name_of_input_df=NULL, field_name=NULL){ | |
233 test_succeeded <- field_name %in% colnames(input_df) | |
234 current_columns <- paste0(colnames(input_df), collapse=", ") | |
235 checkTrue(test_succeeded, | |
236 msg = sprintf("Expected fieldname '%s' in %s (but did not find it among %s)", | |
237 field_name, name_of_input_df, current_columns)) | |
238 } | |
239 | |
240 ############################################################# | |
241 ####### Classes for Data | |
242 ############################################################# | |
243 | |
244 ############################################################################### | |
245 # Class: Data_Object | |
246 ############################################################################### | |
247 Data_Object <- setRefClass("Data_Object", | |
248 fields =list(m_is_dirty = "logical", | |
249 parents = "list", | |
250 children = "list", | |
251 class_name = "character")) | |
252 Data_Object$methods( | |
253 initialize = function(){ | |
254 m_is_dirty <<- TRUE | |
255 class_name <<- "Data_Object <abstract class - class_name needs to be set in subclass>" | |
256 }, | |
257 load_data = function(){ | |
258 #print(sprintf("Calling %s$load_data()", class_name)) # Useful for debugging | |
259 ensure_parents() | |
260 verify() | |
261 m_load_data() | |
262 set_dirty(new_value = FALSE) | |
263 }, | |
264 ensure = function(){ | |
265 if (m_is_dirty){ | |
266 load_data() | |
267 } | |
268 }, | |
269 set_dirty = function(new_value){ | |
270 if (new_value != m_is_dirty){ | |
271 m_is_dirty <<- new_value | |
272 set_children_dirty() | |
273 } | |
274 }, | |
275 verify = function(){ | |
276 stop(sprintf("verify() is an abstract method - define it in %s before calling load_data()", class_name)) | |
277 }, | |
278 m_load_data = function(){ | |
279 stop(sprintf("m_load_data() is an abstract method - define it in %s before calling load_data()", class_name)) | |
280 }, | |
281 append_parent = function(parent=NULL){ | |
282 parents <<- append(parents, parent) | |
283 }, | |
284 append_child = function(child=NULL){ | |
285 children <<- append(children, child) | |
286 }, | |
287 ensure_parents = function(){ | |
288 for (parent in parents){ | |
289 # print(sprintf("Calling %s$ensure()", parent$class_name)) # Useful for debugging | |
290 parent$ensure() | |
291 } | |
292 }, | |
293 set_children_dirty = function(){ | |
294 for (child in children){ | |
295 child$set_dirty(TRUE) | |
296 } | |
297 } | |
298 ) | |
299 ############################################################################### | |
300 # Class: Data_Object_Info | |
301 ############################################################################### | |
302 Data_Object_Info <- setRefClass("Data_Object_Info", | |
303 contains = "Data_Object", | |
304 fields =list( | |
305 data_file_name_1_percent_FDR = "character", | |
306 data_file_name = "character", | |
307 data_path_name = "character", | |
308 experiment_name = "character", | |
309 designation = "character", | |
310 | |
311 input_file_type = "character" | |
312 | |
313 #score_field_name = "character" | |
314 #collection_name="character", | |
315 #dir_results="character", | |
316 #dir_dataset="character", | |
317 #dataset_designation="character", | |
318 #file_name_dataset="character", | |
319 #file_name_dataset_1_percent="character", | |
320 #experiment_name="character" | |
321 ) ) | |
322 Data_Object_Info$methods( | |
323 initialize = function(){ | |
324 callSuper() | |
325 class_name <<- "Data_Object_Info - <Abstract class - class_name needs to be set in subclass>" | |
326 }, | |
327 verify = function(){ | |
328 checkFieldExists = function(field_name=NULL){ | |
329 field_value <- .self[[field_name]] | |
330 checkTrue(length(field_value) > 0, | |
331 sprintf("Field %s$%s has not been set (and should have been)", class_name, field_name)) | |
332 checkTrue(length(field_value) == 1, | |
333 sprintf("Field %s$%s has been set to multiple values (and should be a single value)", class_name, field_name)) | |
334 checkTrue(field_value != "", | |
335 sprintf("Field %s$%s has been set to an empty string (and should not have been)", class_name, field_name)) | |
336 } | |
337 checkFieldExists("data_file_name") | |
338 checkFieldExists("data_path_name") | |
339 checkFieldExists("experiment_name") | |
340 checkFieldExists("designation") | |
341 checkFieldExists("input_file_type") | |
342 #checkFieldExists("score_field_name") | |
343 }, | |
344 m_load_data = function(){ | |
345 # Nothing to do - this is really a data class | |
346 }, | |
347 file_path = function(){ | |
348 result <- file.path(data_path_name, data_file_name) | |
349 if (length(result) == 0){ | |
350 stop("Unable to validate file path - one or both of path name and file name are missing") | |
351 } | |
352 return(result) | |
353 }, | |
354 file_path_1_percent_FDR = function(){ | |
355 local_file_name <- get_data_file_name_1_percent_FDR() | |
356 if (length(local_file_name) == 0){ | |
357 result <- "" | |
358 } else { | |
359 result <- file.path(data_path_name, local_file_name) | |
360 } | |
361 | |
362 # Continue even if file name is missing - not all analyses have a 1 percent FDR file; this is managed downstream | |
363 | |
364 # if (length(result) == 0){ | |
365 # stop("Unable to validate file path - one or both of path name and file name (of 1 percent FDR file) are missing") | |
366 # } | |
367 return(result) | |
368 }, | |
369 get_data_file_name_1_percent_FDR = function(){ | |
370 return(data_file_name_1_percent_FDR) | |
371 }, | |
372 collection_name = function(){ | |
373 result <- sprintf("%s_%s", experiment_name, designation) | |
374 return(result) | |
375 } | |
376 ) | |
377 ############################################################################### | |
378 # Class: Data_Object_Info_737_two_step | |
379 ############################################################################### | |
380 Data_Object_Info_737_two_step <- setRefClass("Data_Object_Info_737_two_step", | |
381 contains = "Data_Object_Info", | |
382 fields =list()) | |
383 Data_Object_Info_737_two_step$methods( | |
384 initialize = function(){ | |
385 callSuper() | |
386 class_name <<- "Data_Object_Info_737_two_step" | |
387 #score_field_name <<- "Confidence [%]" | |
388 data_file_name_1_percent_FDR <<- "737_NS_Peptide_Shaker_PSM_Report_Multi_Stage_Two_Step.tabular" | |
389 data_file_name <<- "737_NS_Peptide_Shaker_Extended_PSM_Report_Multi_Stage_Two_Step.tabular.tabular" | |
390 data_path_name <<- file.path(".", "Data") | |
391 experiment_name <<- "Oral_737_NS" | |
392 designation <<- "two_step" | |
393 | |
394 input_file_type <<- "PSM_Report" | |
395 | |
396 #data_collection_oral_737_NS_combined$file_name_dataset_1_percent = "737_NS_Peptide_Shaker_PSM_Report_CombinedDB.tabular" | |
397 #data_collection_oral_737_NS_two_step$file_name_dataset_1_percent = "737_NS_Peptide_Shaker_PSM_Report_Multi_Stage_Two_Step.tabular" | |
398 | |
399 } | |
400 ) | |
401 | |
402 ############################################################################### | |
403 # Class: Data_Object_Info_737_combined | |
404 ############################################################################### | |
405 Data_Object_Info_737_combined <- setRefClass("Data_Object_Info_737_combined", | |
406 contains = "Data_Object_Info", | |
407 fields =list()) | |
408 Data_Object_Info_737_combined$methods( | |
409 initialize = function(){ | |
410 callSuper() | |
411 class_name <<- "Data_Object_Info_737_combined" | |
412 #score_field_name <<- "Confidence [%]" | |
413 data_file_name_1_percent_FDR <<- "737_NS_Peptide_Shaker_PSM_Report_CombinedDB.tabular" | |
414 data_file_name <<- "737_NS_Peptide_Shaker_Extended_PSM_Report_CombinedDB.tabular" | |
415 data_path_name <<- file.path(".", "Data") | |
416 experiment_name <<- "Oral_737_NS" | |
417 designation <<- "two_step" | |
418 | |
419 input_file_type <<- "PSM_Report" | |
420 | |
421 #data_collection_oral_737_NS_combined$file_name_dataset_1_percent = "737_NS_Peptide_Shaker_PSM_Report_CombinedDB.tabular" | |
422 #data_collection_oral_737_NS_two_step$file_name_dataset_1_percent = "737_NS_Peptide_Shaker_PSM_Report_Multi_Stage_Two_Step.tabular" | |
423 | |
424 } | |
425 ) | |
426 | |
427 ############################################################################### | |
428 # Class: Data_Object_Pyrococcus_tr | |
429 ############################################################################### | |
430 Data_Object_Pyrococcus_tr <- setRefClass("Data_Object_Pyrococcus_tr", | |
431 contains = "Data_Object_Info", | |
432 fields =list()) | |
433 Data_Object_Pyrococcus_tr$methods( | |
434 initialize = function(){ | |
435 callSuper() | |
436 class_name <<- "Data_Object_Pyrococcus_tr" | |
437 #score_field_name <<- "Confidence [%]" | |
438 data_file_name_1_percent_FDR <<- "" | |
439 data_file_name <<- "Pfu_traditional_Extended_PSM_Report.tabular" | |
440 data_path_name <<- file.path(".", "Data") | |
441 experiment_name <<- "Pyrococcus" | |
442 designation <<- "tr" | |
443 | |
444 input_file_type <<- "PSM_Report" | |
445 | |
446 } | |
447 ) | |
448 ############################################################################### | |
449 # Class: Data_Object_Mouse_Mutations | |
450 ############################################################################### | |
451 Data_Object_Mouse_Mutations <- setRefClass("Data_Object_Mouse_Mutations", | |
452 contains = "Data_Object_Info", | |
453 fields =list()) | |
454 Data_Object_Mouse_Mutations$methods( | |
455 initialize = function(){ | |
456 callSuper() | |
457 class_name <<- "Data_Object_Mouse_Mutations" | |
458 #score_field_name <<- "Confidence [%]" | |
459 data_file_name_1_percent_FDR <<- "" | |
460 data_file_name <<- "Combined_DB_Mouse_5PTM.tabular" | |
461 data_path_name <<- file.path(".", "Data") | |
462 experiment_name <<- "Mouse Mutations" | |
463 designation <<- "combined_05" | |
464 | |
465 input_file_type <<- "PSM_Report" | |
466 | |
467 } | |
468 ) | |
469 ############################################################################### | |
470 # Class: Data_Object_Raw_Data | |
471 ############################################################################### | |
472 Data_Object_Raw_Data <- setRefClass("Data_Object_Raw_Data", | |
473 contains = "Data_Object", | |
474 fields =list(df = "data.frame")) | |
475 Data_Object_Raw_Data$methods( | |
476 initialize = function(){ | |
477 callSuper() | |
478 class_name <<- "Data_Object_Raw_Data" | |
479 }, | |
480 verify = function(){ | |
481 # Check that file exists before using it | |
482 file_path <- get_info()$file_path() | |
483 if (! safe_file_exists(file_path)){ | |
484 stop(sprintf("Raw data file does not exist (%s)", file_path)) | |
485 } | |
486 # BUGBUG: Needs to also check the following: | |
487 # - file is tab-delimited | |
488 # - first row is a list of column names | |
489 }, | |
490 set_info = function(info){ | |
491 parents[["info"]] <<- info | |
492 }, | |
493 get_info = function(){ | |
494 return(verified_element_of_list(parents, "info", "Data_Object_Raw_Data$parents")) | |
495 }, | |
496 m_load_data = function(){ | |
497 info <- get_info() | |
498 df <<- load_standard_df(info$file_path()) | |
499 } | |
500 ) | |
501 ############################################################################### | |
502 # Class: Data_Object_Raw_1_Percent | |
503 ############################################################################### | |
504 Data_Object_Raw_1_Percent <- setRefClass("Data_Object_Raw_1_Percent", | |
505 contains = "Data_Object", | |
506 fields =list(df = "data.frame")) | |
507 Data_Object_Raw_1_Percent$methods( | |
508 initialize = function(){ | |
509 callSuper() | |
510 class_name <<- "Data_Object_Raw_1_Percent" | |
511 }, | |
512 set_info = function(info){ | |
513 parents[["info"]] <<- info | |
514 }, | |
515 verify = function(){ | |
516 # Do nothing - a missing file name is acceptable for this module and is dealt with in load() | |
517 }, | |
518 get_info = function(){ | |
519 return(verified_element_of_list(parents, "info", "Data_Object_Raw_1_Percent$parents")) | |
520 }, | |
521 m_load_data = function(){ | |
522 | |
523 info <- get_info() | |
524 file_path <- info$file_path_1_percent_FDR() | |
525 if (exists()){ | |
526 df <<- load_standard_df(info$file_path_1_percent_FDR()) | |
527 } # Note that failing to load is a valid state for this file, leading to not is_dirty. BUGBUG: this could lead to problems if a good file appears later | |
528 }, | |
529 exists = function(){ | |
530 | |
531 info <- get_info() | |
532 local_file_name <- info$get_data_file_name_1_percent_FDR() # Check file name not file path | |
533 | |
534 if (length(local_file_name) == 0 ){ # variable not set | |
535 result = FALSE | |
536 } else if (local_file_name == ""){ # variable set to empty string | |
537 result = FALSE | |
538 } else { | |
539 result = safe_file_exists(info$file_path_1_percent_FDR()) | |
540 } | |
541 | |
542 return(result) | |
543 } | |
544 ) | |
545 ############################################################################### | |
546 # Class: Data_Converter | |
547 ############################################################################### | |
548 Data_Converter <- setRefClass("Data_Converter", | |
549 fields =list(class_name = "character", | |
550 file_type = "character" | |
551 ) ) | |
552 Data_Converter$methods( | |
553 initialize = function(){ | |
554 class_name <<- "Data_Converter <abstract class - class_name needs to be set in subclass>" | |
555 file_type <<- "file_type has not been set before being used <needs to be set in initialize() of subclass>" | |
556 }, | |
557 check_raw_fields = function(info=NULL, raw_data=NULL){ | |
558 stop(sprintf("check_raw_fields() is an abstract method - define it in %s before calling Data_Object_Data_Converter$load_data()", class_name)) | |
559 }, | |
560 convert_data = function(){ | |
561 stop(sprintf("convert_data() is an abstract method - define it in %s before calling Data_Object_Data_Converter$load_data()", class_name)) | |
562 } | |
563 ) | |
564 ############################################################################### | |
565 # Class: Data_Converter_PMD_FDR_input_file | |
566 ############################################################################### | |
567 Data_Converter_PMD_FDR_input_file <- setRefClass("Data_Converter_PMD_FDR_input_file", | |
568 contains = "Data_Converter", | |
569 fields =list( | |
570 | |
571 ) ) | |
572 Data_Converter_PMD_FDR_input_file$methods( | |
573 initialize = function(){ | |
574 callSuper() | |
575 | |
576 class_name <<- "Data_Converter_PMD_FDR_input_file" | |
577 file_type <<- "PMD_FDR_file_type" | |
578 }, | |
579 check_raw_fields = function(info=NULL, raw_data=NULL){ | |
580 data_original <- raw_data$df | |
581 check_field_name(data_original, "raw_data", "PMD_FDR_input_score") | |
582 check_field_name(data_original, "raw_data", "PMD_FDR_pmd") | |
583 check_field_name(data_original, "raw_data", "PMD_FDR_spectrum_file") | |
584 check_field_name(data_original, "raw_data", "PMD_FDR_proteins") | |
585 check_field_name(data_original, "raw_data", "PMD_FDR_spectrum_title") | |
586 check_field_name(data_original, "raw_data", "PMD_FDR_sequence") | |
587 check_field_name(data_original, "raw_data", "PMD_FDR_decoy") | |
588 }, | |
589 convert_data = function(info=NULL, raw_data=NULL){ | |
590 data_new <- raw_data$df | |
591 | |
592 return(data_new) # Pass through - everything should be in order | |
593 } | |
594 ) | |
595 ############################################################################### | |
596 # Class: Data_Converter_PSM_Report | |
597 ############################################################################### | |
598 Data_Converter_PSM_Report <- setRefClass("Data_Converter_PSM_Report", | |
599 contains = "Data_Converter", | |
600 fields =list( | |
601 | |
602 ) ) | |
603 Data_Converter_PSM_Report$methods( | |
604 initialize = function(){ | |
605 callSuper() | |
606 | |
607 class_name <<- "Data_Converter_PSM_Report" | |
608 file_type <<- "PSM_Report" | |
609 }, | |
610 check_raw_fields = function(info=NULL, raw_data=NULL){ | |
611 data_original <- raw_data$df | |
612 check_field_name(data_original, "raw_data", "Confidence [%]") | |
613 check_field_name(data_original, "raw_data", "Precursor m/z Error [ppm]") | |
614 check_field_name(data_original, "raw_data", "Spectrum File") | |
615 check_field_name(data_original, "raw_data", "Protein(s)") | |
616 check_field_name(data_original, "raw_data", "Spectrum Title") | |
617 check_field_name(data_original, "raw_data", "Decoy") | |
618 check_field_name(data_original, "raw_data", "Sequence") | |
619 | |
620 }, | |
621 convert_data = function(info=NULL, raw_data=NULL){ | |
622 data_new <- raw_data$df | |
623 | |
624 data_new$PMD_FDR_input_score <- data_new[, "Confidence [%]" ] | |
625 data_new$PMD_FDR_pmd <- data_new[, "Precursor m/z Error [ppm]"] | |
626 data_new$PMD_FDR_spectrum_file <- data_new[, "Spectrum File" ] | |
627 data_new$PMD_FDR_proteins <- data_new[, "Protein(s)" ] | |
628 data_new$PMD_FDR_spectrum_title <- data_new[, "Spectrum Title" ] | |
629 data_new$PMD_FDR_sequence <- data_new[, "Sequence" ] | |
630 data_new$PMD_FDR_decoy <- data_new[, "Decoy" ] | |
631 | |
632 return(data_new) | |
633 } | |
634 ) | |
635 ############################################################################### | |
636 # Class: Data_Converter_MaxQuant_Evidence | |
637 ############################################################################### | |
638 Data_Converter_MaxQuant_Evidence <- setRefClass("Data_Converter_MaxQuant_Evidence", | |
639 contains = "Data_Converter", | |
640 fields =list( | |
641 | |
642 ) ) | |
643 Data_Converter_MaxQuant_Evidence$methods( | |
644 initialize = function(){ | |
645 callSuper() | |
646 | |
647 class_name <<- "Data_Converter_MaxQuant_Evidence" | |
648 file_type <<- "MaxQuant_Evidence" | |
649 }, | |
650 check_raw_fields = function(info=NULL, raw_data=NULL){ | |
651 data_original <- raw_data$df | |
652 | |
653 check_field_name(data_original, "raw_data", "PEP") | |
654 check_field_name(data_original, "raw_data", "Mass error [ppm]") | |
655 check_field_name(data_original, "raw_data", "Proteins") | |
656 check_field_name(data_original, "raw_data", "Retention time") | |
657 check_field_name(data_original, "raw_data", "Sequence") | |
658 check_field_name(data_original, "raw_data", "Reverse") | |
659 }, | |
660 convert_data = function(info=NULL, raw_data=NULL){ | |
661 data_new <- raw_data$df | |
662 | |
663 data_new$PMD_FDR_input_score <- 100 * (1 - data_new[, "PEP" ]) | |
664 data_new$PMD_FDR_pmd <- data_new[, "Mass error [ppm]"] | |
665 data_new$PMD_FDR_spectrum_file <- "<place_holder - assumes a single spectra file>" | |
666 data_new$PMD_FDR_proteins <- data_new[, "Proteins" ] | |
667 data_new$PMD_FDR_spectrum_title <- data_new[, "Retention time" ] # Used for ordering peptides - not important in MaxQuant since PMD has already been normalized effectively | |
668 data_new$PMD_FDR_sequence <- data_new[, "Sequence" ] | |
669 data_new$PMD_FDR_decoy <- ifelse( data_new[, "Reverse" ] == "+", 1, 0) | |
670 | |
671 return(data_new) | |
672 } | |
673 ) | |
674 | |
675 ############################################################################### | |
676 # Class: Data_Object_Data_Converter | |
677 ############################################################################### | |
678 Data_Object_Data_Converter <- setRefClass("Data_Object_Data_Converter", | |
679 contains = "Data_Object", | |
680 fields =list(df = "data.frame", | |
681 data_converter = "Data_Converter")) | |
682 Data_Object_Data_Converter$methods( | |
683 initialize = function(){ | |
684 callSuper() | |
685 class_name <<- "Data_Object_Data_Converter" | |
686 }, | |
687 currently_supported_file_types = function(){ | |
688 return(c("PSM_Report", "PMD_FDR_input_file")) | |
689 }, | |
690 verify = function(){ | |
691 info <- get_info() | |
692 raw_data <- get_raw_data() | |
693 file_type <- get_info()$input_file_type | |
694 | |
695 set_file_type(file_type) | |
696 data_converter$check_raw_fields(info=info, raw_data=raw_data) | |
697 | |
698 }, | |
699 m_load_data = function(){ | |
700 | |
701 info <- get_info() | |
702 raw_data <- get_raw_data() | |
703 file_type <- get_info()$input_file_type | |
704 | |
705 df <<- data_converter$convert_data(info=info, raw_data=raw_data) | |
706 | |
707 }, | |
708 set_file_type = function(file_type = NULL){ | |
709 if (file_type == "PSM_Report" ){ | |
710 data_converter <<- Data_Converter_PSM_Report $new() | |
711 } else if (file_type == "PMD_FDR_input_file"){ | |
712 data_converter <<- Data_Converter_PMD_FDR_input_file$new() | |
713 } else if (file_type == "MaxQuant_Evidence"){ | |
714 data_converter <<- Data_Converter_MaxQuant_Evidence $new() | |
715 } else { | |
716 stop(sprintf("File type '%s' is not currently supported by PMD-FDR module", file_type)) | |
717 } | |
718 }, | |
719 set_info = function(info){ | |
720 parents[["info"]] <<- info | |
721 }, | |
722 get_info = function(){ | |
723 return(verified_element_of_list(parents, "info", "Data_Object_Data_Converter$parents")) | |
724 }, | |
725 set_raw_data = function(raw_data){ | |
726 parents[["raw_data"]] <<- raw_data | |
727 }, | |
728 get_raw_data = function(){ | |
729 return(verified_element_of_list(parents, "raw_data", "Data_Object_Data_Converter$parents")) | |
730 } | |
731 ) | |
732 ############################################################################### | |
733 # Class: Data_Object_Groupings | |
734 ############################################################################### | |
735 Data_Object_Groupings <- setRefClass("Data_Object_Groupings", | |
736 contains = "Data_Object", | |
737 fields =list(df = "data.frame")) | |
738 Data_Object_Groupings$methods( | |
739 initialize = function(){ | |
740 callSuper() | |
741 class_name <<- "Data_Object_Groupings" | |
742 }, | |
743 simplify_field_name = function(x=NULL){ | |
744 result <- gsub(pattern = "PMD_FDR_", replacement = "", x = x) | |
745 return(result) | |
746 }, | |
747 verify = function(){ | |
748 data_original <- get_data_converter()$df | |
749 | |
750 check_field_name(data_original, "data_converter", "PMD_FDR_input_score") | |
751 check_field_name(data_original, "data_converter", "PMD_FDR_pmd") | |
752 check_field_name(data_original, "data_converter", "PMD_FDR_spectrum_file") | |
753 check_field_name(data_original, "data_converter", "PMD_FDR_proteins") | |
754 check_field_name(data_original, "data_converter", "PMD_FDR_spectrum_title") | |
755 check_field_name(data_original, "data_converter", "PMD_FDR_sequence") | |
756 check_field_name(data_original, "data_converter", "PMD_FDR_decoy") | |
757 | |
758 }, | |
759 m_load_data = function(){ | |
760 make_data_groups <- function(data_original=NULL){ | |
761 | |
762 # Functions supporting make_data_groups() | |
763 | |
764 standardize_fields <- function(data=NULL){ | |
765 data_new <- data | |
766 | |
767 info <- get_info() | |
768 info$ensure() | |
769 #field_name_of_score <- info$get_field_name_of_score() | |
770 | |
771 # #data_new <- rename_column(data_new, "Variable Modifications" , "ptm_list") | |
772 # data_new <- rename_column(data_new, field_name_of_score , "PMD_FDR_input_score") | |
773 # data_new <- rename_column(data_new, "Precursor m/z Error [ppm]", "PMD_FDR_pmd") | |
774 # #data_new <- rename_column(data_new, "Isotope Number" , "isotope_number") | |
775 # #data_new <- rename_column(data_new, "m/z" , "m_z") | |
776 # #data_new <- rename_column(data_new, "Measured Charge" , "charge") | |
777 # data_new <- rename_column(data_new, "Spectrum File" , "PMD_FDR_spectrum_file") | |
778 # data_new <- rename_column(data_new, "Protein(s)" , "PMD_FDR_proteins") | |
779 # data_new <- rename_column(data_new, "Spectrum Title" , "PMD_FDR_spectrum_title") | |
780 # data_new <- manage_decoy_column(data_new) | |
781 | |
782 # Now managed in Data_Converter | |
783 # data_new$PMD_FDR_input_score <- data_new[, field_name_of_score ] | |
784 # data_new$PMD_FDR_pmd <- data_new[, "Precursor m/z Error [ppm]"] | |
785 # data_new$PMD_FDR_spectrum_file <- data_new[, "Spectrum File" ] | |
786 # data_new$PMD_FDR_proteins <- data_new[, "Protein(s)" ] | |
787 # data_new$PMD_FDR_spectrum_title <- data_new[, "Spectrum Title" ] | |
788 | |
789 data_new$value <- data_new$PMD_FDR_pmd | |
790 data_new$PMD_FDR_peptide_length <- str_length(data_new$PMD_FDR_sequence) | |
791 #data_new$charge_value <- with(data_new, as.numeric(substr(charge, start=1, stop=str_length(charge)-1))) | |
792 #data_new$measured_mass <- with(data_new, m_z*charge_value) | |
793 data_new$PMD_FDR_spectrum_index <- NA | |
794 data_new$PMD_FDR_spectrum_index[order(data_new$PMD_FDR_spectrum_title, na.last = TRUE)] <- 1:nrow(data_new) | |
795 | |
796 return(data_new) | |
797 } | |
798 add_grouped_variable <- function(data_groups = data_groups, field_name_to_group = NULL, vec.length.out = NULL, vec.tolerance = NULL, value_format = NULL){ | |
799 | |
800 # Support functions for add_grouped_variable() | |
801 find_interval_vec <- function(x=NULL, length.out = NULL, tolerance = NULL){ | |
802 q <- quantile(x = x, probs = seq(from=0, to=1, length.out = length.out), na.rm=TRUE) | |
803 q <- round_to_tolerance(q, tolerance = tolerance) | |
804 return(q) | |
805 } | |
806 get_group_data_frame <- function(vec=NULL, value_format = NULL){ | |
807 n <- length(vec) | |
808 a <- vec[-n] | |
809 b <- vec[-1] | |
810 | |
811 lower <- ifelse(a == b , "eq", NA) | |
812 lower <- ifelse(is.na(lower ), "ge", lower) | |
813 upper <- ifelse(a == b , "eq", NA) | |
814 upper[n-1] <- ifelse(is.na(upper[n-1]), "le", "eq") | |
815 upper <- ifelse(is.na(upper ), "lt", upper) | |
816 group <- data.frame(list(idx=1:(n-1), a=a, b=b, lower=lower, upper=upper)) | |
817 | |
818 name_format <- sprintf("%%%s_%%%s_%%s_%%s", value_format, value_format) | |
819 group$new_var <- with(group, sprintf(name_format, a, b, lower, upper)) | |
820 | |
821 return(group) | |
822 } | |
823 merge_group_with_data <- function(data_groups = NULL, group = NULL, vec = NULL, field_name_to_group = NULL){ | |
824 field_name_new <- sprintf("group_%s", simplify_field_name(field_name_to_group)) | |
825 group_idx <- findInterval(x = data_groups[,field_name_to_group], | |
826 vec = vec, | |
827 all.inside=TRUE) | |
828 data_groups$new_var <- group$new_var[group_idx] | |
829 data_groups <- rename_column(data_groups, "new_var", field_name_new) | |
830 } | |
831 # Body of add_grouped_variable() | |
832 | |
833 vec <- find_interval_vec(x = data_groups[[field_name_to_group]], | |
834 length.out = vec.length.out, | |
835 tolerance = vec.tolerance ) | |
836 group <- get_group_data_frame(vec = vec, | |
837 value_format = value_format) | |
838 df_new <- merge_group_with_data(data_groups = data_groups, | |
839 group = group, | |
840 vec = vec, | |
841 field_name_to_group = field_name_to_group) | |
842 df_new <- add_group_decoy(df_new, field_name_to_group) | |
843 | |
844 return(df_new) | |
845 } | |
846 add_already_grouped_variable <- function(field_name_to_group = NULL, data_groups = NULL ){ | |
847 old_name <- field_name_to_group | |
848 new_name <- sprintf("group_%s", simplify_field_name(old_name)) | |
849 df_new <- data_groups | |
850 df_new[[new_name]] <- data_groups[[old_name]] | |
851 | |
852 df_new <- add_group_decoy(data_groups = df_new, field_name_to_group = field_name_to_group) | |
853 | |
854 return(df_new) | |
855 } | |
856 add_value_norm <- function(data_groups = NULL){ | |
857 | |
858 df_new <- data_groups | |
859 df_new$value_norm <- with(df_new, value - median_of_group_index) | |
860 | |
861 return(df_new) | |
862 } | |
863 add_protein_group <-function(data_groups = NULL){ | |
864 data_new <- data_groups | |
865 df_group_def <- data.frame(stringsAsFactors = FALSE, | |
866 list(pattern = c("" , "pfu_" , "cRAP"), | |
867 group_name = c("human", "pyrococcus", "contaminant"))) | |
868 for (i in 1:nrow(df_group_def)){ | |
869 idx <- grepl(pattern = df_group_def$pattern[i], | |
870 x = data_new$PMD_FDR_proteins) | |
871 data_new$group_proteins[idx] <- df_group_def$group_name[i] | |
872 } | |
873 | |
874 data_new <- add_group_decoy(data_groups = data_new, field_name_to_group = "PMD_FDR_proteins") | |
875 return(data_new) | |
876 } | |
877 add_group_decoy <- function(data_groups=NULL, field_name_to_group=NULL){ | |
878 simple_field_name <- simplify_field_name(field_name_to_group) | |
879 field_name_decoy <- sprintf("group_decoy_%s", simple_field_name) | |
880 field_name_group <- sprintf("group_%s", simple_field_name) | |
881 | |
882 data_groups[[field_name_decoy]] <- with(data_groups, ifelse(PMD_FDR_decoy, "decoy", data_groups[[field_name_group]])) | |
883 | |
884 return(data_groups) | |
885 } | |
886 add_group_training_class <- function(data_groups = NULL){ | |
887 df_new <- data_groups | |
888 | |
889 lowest_confidence_group <- min(data_groups$group_input_score) | |
890 | |
891 is_long_enough <- with(df_new, (PMD_FDR_peptide_length >= MIN_GOOD_PEPTIDE_LENGTH) ) | |
892 is_good <- with(df_new, (PMD_FDR_decoy == 0) & (PMD_FDR_input_score == 100) ) | |
893 is_bad <- with(df_new, (PMD_FDR_decoy == 1) ) | |
894 #is_used_to_train <- with(df_new, used_to_find_middle) # BUGBUG: circular definition | |
895 | |
896 idx_good <- which(is_good ) # & is_long_enough) | |
897 n_good <- length(idx_good) | |
898 idx_testing <- idx_good[c(TRUE,FALSE)] # Selects every other item | |
899 idx_training <- setdiff(idx_good, idx_testing) | |
900 | |
901 #is_good_short <- with(df_new, is_good & !is_long_enough ) | |
902 #is_good_long <- with(df_new, is_good & is_long_enough ) | |
903 is_bad_short <- with(df_new, is_bad & !is_long_enough ) | |
904 is_bad_long <- with(df_new, is_bad & is_long_enough ) | |
905 #is_good_training <- with(df_new, is_good_long & (used_to_find_middle == TRUE ) ) | |
906 #is_good_testing <- with(df_new, is_good_long & (used_to_find_middle == FALSE) ) | |
907 | |
908 df_new$group_training_class <- "other_short" # Default | |
909 df_new$group_training_class[is_long_enough ] <- "other_long" # Default (if long enough) | |
910 df_new$group_training_class[idx_training ] <- "good_training" # Length does not matter (anymore) | |
911 df_new$group_training_class[idx_testing ] <- "good_testing" # Ditto | |
912 #df_new$group_training_class[is_good_short ] <- "good_short" | |
913 df_new$group_training_class[is_bad_long ] <- "bad_long" # ...except for "bad" | |
914 df_new$group_training_class[is_bad_short ] <- "bad_short" | |
915 | |
916 df_new <- add_used_to_find_middle( data_groups = df_new ) # Guarantees consistency between duplicated definitions | |
917 | |
918 return(df_new) | |
919 } | |
920 add_used_to_find_middle <- function(data_groups = NULL){ | |
921 df_new <- data_groups | |
922 idx_used <- which(data_groups$group_training_class == "good_training") | |
923 | |
924 df_new$used_to_find_middle <- FALSE | |
925 df_new$used_to_find_middle[idx_used] <- TRUE | |
926 | |
927 return(df_new) | |
928 } | |
929 add_group_spectrum_index <- function(data_groups = NULL){ | |
930 | |
931 # Supporting functions for add_group_spectrum_index() | |
932 | |
933 get_breaks_all <- function(df_new){ | |
934 # Supporting function(s) for get_breaks_all() | |
935 | |
936 get_cut_points <- function(data_subset){ | |
937 | |
938 # Supporting function(s) for get_cut_points() | |
939 | |
940 cut_values <- function(data=NULL, minimum_segment_length=NULL){ | |
941 # using cpt.mean -- Appears to have a memory leak | |
942 #results_cpt <- cpt.mean(data=data, method="PELT", minimum_segment_length=minimum_segment_length) | |
943 #results <- results_cpt@cpts | |
944 | |
945 # Just look at the end | |
946 #results <- c(length(data)) | |
947 | |
948 # regularly spaced, slightly larger than minimum_segment_length | |
949 n_points <- length(data) | |
950 n_regions <- floor(n_points / minimum_segment_length) | |
951 n_regions <- ifelse(n_regions == 0, 1, n_regions) | |
952 results <- round(seq(1, n_points, length.out = n_regions + 1)) | |
953 results <- results[-1] | |
954 return(results) | |
955 } | |
956 remove_last <- function(x){ | |
957 return(x[-length(x)] ) | |
958 } | |
959 | |
960 # Main code of for get_cut_points() | |
961 max_idx = max(data_subset$PMD_FDR_spectrum_index) | |
962 data_sub_sub <- subset(data_subset, group_training_class == "good_training") #(PMD_FDR_input_score==100) & (PMD_FDR_decoy==0)) | |
963 minimum_segment_length = 50 | |
964 | |
965 values <- data_sub_sub$value | |
966 n_values <- length(values) | |
967 local_to_global_idx <- data_sub_sub$PMD_FDR_spectrum_index | |
968 if (n_values <= minimum_segment_length){ | |
969 result <- c() | |
970 } else { | |
971 local_idx <- cut_values(data=values, minimum_segment_length=minimum_segment_length) | |
972 result <- local_to_global_idx[local_idx] | |
973 result <- remove_last(result) | |
974 } | |
975 result <- c(result, max_idx) | |
976 return(result) | |
977 } | |
978 remove_last <- function(vec) { | |
979 return(vec[-length(vec)]) | |
980 } | |
981 | |
982 # Main code of get_breaks_all() | |
983 | |
984 breaks <- 1 | |
985 | |
986 files <- unique(df_new$PMD_FDR_spectrum_file) | |
987 | |
988 for (local_file in files){ | |
989 data_subset <- subset(df_new, (PMD_FDR_spectrum_file==local_file)) | |
990 if (nrow(data_subset) > 0){ | |
991 breaks <- c(breaks, get_cut_points(data_subset)) | |
992 } | |
993 } | |
994 breaks <- sort(unique(breaks)) | |
995 breaks <- remove_last(breaks) | |
996 breaks <- c(breaks, max(df_new$PMD_FDR_spectrum_index + 1)) | |
997 | |
998 return(breaks) | |
999 } | |
1000 | |
1001 # Main code of add_group_spectrum_index() | |
1002 | |
1003 field_name_to_group <- "PMD_FDR_spectrum_index" | |
1004 | |
1005 df_new <- data_groups[order(data_groups[[field_name_to_group]]),] | |
1006 breaks <- get_breaks_all(df_new) | |
1007 | |
1008 df_new$group_spectrum_index <- cut(x = df_new[[field_name_to_group]], breaks = breaks, right = FALSE, dig.lab = 6) | |
1009 df_new <- add_group_decoy(data_groups = df_new, field_name_to_group = field_name_to_group) | |
1010 | |
1011 return(df_new) | |
1012 } | |
1013 add_median_of_group_index <-function(data_groups = NULL){ | |
1014 field_median <- "median_of_group_index" | |
1015 data_good <- subset(data_groups, used_to_find_middle ) | |
1016 med <- aggregate(value~group_spectrum_index, data=data_good, FUN=safe_median) | |
1017 med <- rename_column(med, "value", field_median) | |
1018 | |
1019 data_groups[[field_median]] <- NULL | |
1020 df_new <- merge(data_groups, med) | |
1021 | |
1022 return(df_new) | |
1023 } | |
1024 add_1_percent_to_data_groups <- function(data_groups=NULL){ | |
1025 | |
1026 data_new <- data_groups | |
1027 | |
1028 if (get_raw_1_percent()$exists()){ | |
1029 # Load 1 percent file | |
1030 df_1_percent <- get_raw_1_percent()$df | |
1031 | |
1032 # Get relevant fields | |
1033 df_1_percent$is_in_1percent <- TRUE | |
1034 df_1_percent <- rename_column(df_1_percent, "Spectrum Title", "PMD_FDR_spectrum_title") | |
1035 df_1_percent <- df_1_percent[,c("PMD_FDR_spectrum_title", "is_in_1percent")] | |
1036 | |
1037 # Merge with data_groups | |
1038 data_new <- merge(data_new, df_1_percent, all.x=TRUE) | |
1039 data_new$is_in_1percent[is.na(data_new$is_in_1percent)] <- FALSE | |
1040 } | |
1041 | |
1042 # Save results | |
1043 return(data_new) | |
1044 | |
1045 } | |
1046 | |
1047 | |
1048 # Main code of make_data_groups() | |
1049 data_groups <- standardize_fields(data_original) | |
1050 | |
1051 data_groups <- add_grouped_variable(field_name_to_group = "PMD_FDR_input_score", | |
1052 data_groups = data_groups, | |
1053 vec.length.out = 14, | |
1054 vec.tolerance = 1, | |
1055 value_format = "03d") | |
1056 | |
1057 data_groups <- add_grouped_variable(field_name_to_group = "PMD_FDR_pmd", | |
1058 data_groups = data_groups, | |
1059 vec.length.out = 21, | |
1060 vec.tolerance = 0.1, | |
1061 value_format = "+05.1f") | |
1062 | |
1063 data_groups <- add_grouped_variable(field_name_to_group = "PMD_FDR_peptide_length", | |
1064 data_groups = data_groups, | |
1065 vec.length.out = 11, | |
1066 vec.tolerance = 1, | |
1067 value_format = "02d") | |
1068 | |
1069 # data_groups <- add_grouped_variable(field_name_to_group = "m_z", | |
1070 # data_groups = data_groups, | |
1071 # vec.length.out = 11, | |
1072 # vec.tolerance = 10, | |
1073 # value_format = "04.0f") | |
1074 # | |
1075 # data_groups <- add_grouped_variable(field_name_to_group = "measured_mass", | |
1076 # data_groups = data_groups, | |
1077 # vec.length.out = 11, | |
1078 # vec.tolerance = 1, | |
1079 # value_format = "04.0f") | |
1080 # | |
1081 # data_groups <- add_already_grouped_variable(field_name_to_group = "isotope_number", | |
1082 # data_groups = data_groups ) | |
1083 # | |
1084 # data_groups <- add_already_grouped_variable(field_name_to_group = "charge", | |
1085 # data_groups = data_groups ) | |
1086 # | |
1087 data_groups <- add_already_grouped_variable(field_name_to_group = "PMD_FDR_spectrum_file", | |
1088 data_groups = data_groups ) | |
1089 data_groups <- add_protein_group(data_groups = data_groups) | |
1090 data_groups <- add_group_training_class( data_groups = data_groups) | |
1091 data_groups <- add_group_spectrum_index( data_groups = data_groups) | |
1092 data_groups <- add_median_of_group_index( data_groups = data_groups) | |
1093 data_groups <- add_value_norm( data_groups = data_groups) | |
1094 | |
1095 # fields_of_interest <- c("PMD_FDR_input_score", "PMD_FDR_pmd", "m_z", "PMD_FDR_peptide_length", "isotope_number", "charge", "PMD_FDR_spectrum_file", "measured_mass", "PMD_FDR_spectrum_index", "PMD_FDR_proteins") | |
1096 # fields_of_interest <- c("value", | |
1097 # "PMD_FDR_decoy", | |
1098 # "PMD_FDR_spectrum_title", | |
1099 # "median_of_group_index", | |
1100 # "value_norm", | |
1101 # "used_to_find_middle", | |
1102 # "group_training_class", | |
1103 # fields_of_interest, | |
1104 # sprintf("group_%s" , fields_of_interest), | |
1105 # sprintf("group_decoy_%s", fields_of_interest)) | |
1106 | |
1107 fields_of_interest <- c("PMD_FDR_input_score", "PMD_FDR_pmd", "PMD_FDR_peptide_length", "PMD_FDR_spectrum_file", "PMD_FDR_spectrum_index", "PMD_FDR_proteins") | |
1108 fields_of_interest <- c("value", | |
1109 "PMD_FDR_decoy", | |
1110 "PMD_FDR_spectrum_title", | |
1111 "median_of_group_index", | |
1112 "value_norm", | |
1113 "used_to_find_middle", | |
1114 "group_training_class", | |
1115 fields_of_interest, | |
1116 sprintf("group_%s" , simplify_field_name(fields_of_interest)), | |
1117 sprintf("group_decoy_%s", simplify_field_name(fields_of_interest))) | |
1118 | |
1119 data_groups <- data_groups[,fields_of_interest] | |
1120 data_groups <- add_1_percent_to_data_groups(data_groups) | |
1121 | |
1122 return(data_groups) | |
1123 } | |
1124 | |
1125 data_original <- get_data_converter()$df #parents[[INDEX_OF_ORIGINAL_DATA]]$df | |
1126 df <<- make_data_groups(data_original) | |
1127 }, | |
1128 set_info = function(info){ | |
1129 parents[["info"]] <<- info | |
1130 }, | |
1131 get_info = function(){ | |
1132 return(verified_element_of_list(parents, "info", "Data_Object_Groupings$parents")) | |
1133 }, | |
1134 set_data_converter = function(data_converter){ | |
1135 parents[["data_converter"]] <<- data_converter | |
1136 }, | |
1137 get_data_converter = function(){ | |
1138 return(verified_element_of_list(parents, "data_converter", "Data_Object_Groupings$parents")) | |
1139 }, | |
1140 set_raw_1_percent = function(raw_1_percent){ ############## BUGBUG: the 1% file should be using the same file type format as the standard data (but isn't) | |
1141 parents[["raw_1_percent"]] <<- raw_1_percent | |
1142 }, | |
1143 get_raw_1_percent = function(){ | |
1144 return(verified_element_of_list(parents, "raw_1_percent", "Data_Object_Groupings$parents")) | |
1145 } | |
1146 ) | |
1147 ############################################################################### | |
1148 # Class: Data_Object_Individual_FDR | |
1149 ############################################################################### | |
1150 Data_Object_Individual_FDR <- setRefClass("Data_Object_Individual_FDR", | |
1151 contains = "Data_Object", | |
1152 fields =list(df = "data.frame")) | |
1153 Data_Object_Individual_FDR$methods( | |
1154 initialize = function(){ | |
1155 callSuper() | |
1156 class_name <<- "Data_Object_Individual_FDR" | |
1157 }, | |
1158 verify = function(){ | |
1159 data_groups = get_data_groups()$df | |
1160 densities = get_densities()$df | |
1161 alpha = get_alpha()$df | |
1162 | |
1163 check_field_name(data_groups, "data_groups", "value_norm") | |
1164 check_field_name(data_groups, "data_groups", "group_decoy_input_score") | |
1165 check_field_name(data_groups, "data_groups", "PMD_FDR_peptide_length") | |
1166 check_field_name(data_groups, "data_groups", "PMD_FDR_input_score") | |
1167 check_field_name(alpha, "alpha", "alpha") # BUGBUG: I'm missing a field here... | |
1168 check_field_name(densities, "densities", "x") | |
1169 check_field_name(densities, "densities", "t") | |
1170 check_field_name(densities, "densities", "f") | |
1171 | |
1172 }, | |
1173 set_data_groups = function(parent){ | |
1174 parents[["data_groups"]] <<- parent | |
1175 }, | |
1176 get_data_groups = function(){ | |
1177 return(verified_element_of_list(parents, "data_groups", "Data_Object_Individual_FDR$parents")) | |
1178 }, | |
1179 set_densities = function(parent){ | |
1180 parents[["densities"]] <<- parent | |
1181 }, | |
1182 get_densities = function(){ | |
1183 return(verified_element_of_list(parents, "densities", "Data_Object_Individual_FDR$parents")) | |
1184 }, | |
1185 set_alpha = function(parent){ | |
1186 parents[["alpha"]] <<- parent | |
1187 }, | |
1188 get_alpha = function(){ | |
1189 return(verified_element_of_list(parents, "alpha", "Data_Object_Individual_FDR$parents")) | |
1190 }, | |
1191 m_load_data = function(){ | |
1192 add_FDR_to_data_groups <- function(data_groups=NULL, densities=NULL, alpha=NULL, field_value=NULL, field_decoy_group=NULL, set_decoy_to_1=FALSE){ | |
1193 # Support functions for add_FDR_to_data_groups() | |
1194 get_group_fdr <- function(group_stats = NULL, data_groups = NULL, densities=NULL){ | |
1195 group_fdr <- apply(X = densities, MARGIN = 2, FUN = max) | |
1196 df_group_fdr <- data.frame(group_fdr) | |
1197 df_group_fdr <- rename_column(df_group_fdr, "group_fdr", "v") | |
1198 df_group_fdr$group_of_interest <- names(group_fdr) | |
1199 t <- df_group_fdr[df_group_fdr$group_of_interest == "t", "v"] | |
1200 f <- df_group_fdr[df_group_fdr$group_of_interest == "f", "v"] | |
1201 df_group_fdr <- subset(df_group_fdr, !(group_of_interest %in% c("x", "t", "f"))) | |
1202 df_group_fdr$group_fdr <-(df_group_fdr$v - t) / (f - t) | |
1203 | |
1204 return(df_group_fdr) | |
1205 } | |
1206 | |
1207 get_mode <- function(x){ | |
1208 d <- density(x) | |
1209 return(d$x[which.max(d$y)]) | |
1210 } | |
1211 | |
1212 # Main code for add_FDR_to_data_groups() | |
1213 | |
1214 # Set up analysis | |
1215 data_new <- data_groups | |
1216 data_new$value_of_interest <- data_new[,field_value] | |
1217 data_new$group_of_interest <- data_new[,field_decoy_group] | |
1218 | |
1219 data_subset <- subset(data_new, PMD_FDR_peptide_length >= 11) | |
1220 | |
1221 # Identify mean PMD_FDR_input_score per group | |
1222 | |
1223 group_input_score <- aggregate(PMD_FDR_input_score~group_of_interest, data=data_subset, FUN=mean) | |
1224 group_input_score <- rename_column(group_input_score, "PMD_FDR_input_score", "group_input_score") | |
1225 | |
1226 #group_fdr <- get_group_fdr(data_groups = data_subset, densities=densities) | |
1227 group_stats <- merge(alpha, group_input_score) | |
1228 group_stats <- subset(group_stats, group_of_interest != "PMD_FDR_decoy") | |
1229 | |
1230 x=c(0,group_stats$group_input_score) | |
1231 y=c(1,group_stats$alpha) | |
1232 FUN_interp <- approxfun(x=x,y=y) | |
1233 | |
1234 data_new$interpolated_groupwise_FDR <- FUN_interp(data_new$PMD_FDR_input_score) | |
1235 if (set_decoy_to_1){ | |
1236 data_new$interpolated_groupwise_FDR[data_new$PMD_FDR_decoy == 1] <- 1 | |
1237 } | |
1238 | |
1239 return(data_new) | |
1240 } | |
1241 | |
1242 data_groups = get_data_groups()$df | |
1243 densities = get_densities()$df | |
1244 alpha = get_alpha()$df | |
1245 | |
1246 d_true <- densities[,c("x", "t")] | |
1247 d_false <- densities[,c("x", "f")] | |
1248 | |
1249 i_fdr <- add_FDR_to_data_groups(data_groups = data_groups, | |
1250 densities = densities, | |
1251 alpha = alpha, | |
1252 field_value ="value_norm", | |
1253 field_decoy_group = "group_decoy_input_score") | |
1254 # Derive local t | |
1255 interp_t <- splinefun(x=d_true$x, y=d_true$t) #approxfun(x=d_true$x, y=d_true$y) | |
1256 | |
1257 # Derive local f | |
1258 interp_f <- splinefun(x=d_false$x, y=d_false$f) #approxfun(x=d_true$x, y=d_true$y) | |
1259 | |
1260 # Derive local FDR | |
1261 i_fdr$t <- interp_t(i_fdr$value_of_interest) | |
1262 i_fdr$f <- interp_f(i_fdr$value_of_interest) | |
1263 i_fdr$alpha <- i_fdr$interpolated_groupwise_FDR | |
1264 i_fdr$i_fdr <- with(i_fdr, (alpha*f) / (alpha*f + (1-alpha)*t)) | |
1265 | |
1266 df <<- i_fdr | |
1267 | |
1268 } | |
1269 ) | |
1270 ############################################################################### | |
1271 # Class: Data_Object_Densities | |
1272 ############################################################################### | |
1273 Data_Object_Densities <- setRefClass("Data_Object_Densities", | |
1274 contains = "Data_Object", | |
1275 fields =list(df = "data.frame")) | |
1276 Data_Object_Densities$methods( | |
1277 initialize = function(){ | |
1278 callSuper() | |
1279 class_name <<- "Data_Object_Densities" | |
1280 }, | |
1281 verify = function(){ | |
1282 df_data_groups <- get_data_groups()$df | |
1283 | |
1284 checkTrue(nrow(df_data_groups) > 0, | |
1285 msg = "data_groups data frame was empty (and should not have been)") | |
1286 | |
1287 check_field_name(df_data_groups, "data_groups", "value_norm") | |
1288 check_field_name(df_data_groups, "data_groups", "group_decoy_input_score") | |
1289 check_field_name(df_data_groups, "data_groups", "group_training_class") | |
1290 }, | |
1291 set_data_groups = function(parent=NULL){ | |
1292 parents[["data_groups"]] <<- parent | |
1293 }, | |
1294 get_data_groups = function(){ | |
1295 return(verified_element_of_list(parent_list = parents, element_name = "data_groups", object_name = "Data_Object_Densities$parents")) | |
1296 }, | |
1297 m_load_data = function(){ | |
1298 | |
1299 # Support functions for make_densities() | |
1300 set_values_of_interest <- function(df_data_groups=NULL, field_group = NULL){ | |
1301 field_value = "value_norm" | |
1302 | |
1303 new_data_groups <- get_data_groups()$df | |
1304 new_data_groups$value_of_interest <- new_data_groups[,field_value] | |
1305 new_data_groups$group_of_interest <- new_data_groups[,field_group] | |
1306 #groups <- sort(unique(new_data_groups$group_of_interest)) | |
1307 | |
1308 return(new_data_groups) | |
1309 } | |
1310 get_ylim <- function(data_groups=NULL){ | |
1311 ylim <- range(data_groups$value_of_interest, na.rm = TRUE) | |
1312 return(ylim) | |
1313 } | |
1314 make_hit_density <- function(data_subset=NULL, descr_of_df=NULL, ylim=NULL){ | |
1315 #stop("Data_Object_Densities$make_hit_density() is untested beyond here") | |
1316 verify_density = function(data_subset=NULL, value_field=NULL, descr_of_df=NULL, ylim=NULL){ | |
1317 values <- data_subset[value_field] | |
1318 values <- values[! is.na(values)] | |
1319 if (length(values) < MIN_ACCEPTABLE_POINTS_IN_DENSITY){ | |
1320 stop (sprintf("There are too few valid %s (%d < %d) in %s to be used for calculating a density function", | |
1321 value_field, | |
1322 length(values), | |
1323 MIN_ACCEPTABLE_POINTS_IN_DENSITY, | |
1324 descr_of_df)) | |
1325 } | |
1326 d <- density(values, from = ylim[1], to = ylim[2]) | |
1327 | |
1328 return(d) | |
1329 } | |
1330 uniformalize_density <- function(d){ | |
1331 # Reorganizes y-values of density function so that | |
1332 # function is monotone increasing to mode | |
1333 # and monotone decreasing afterwards | |
1334 idx_mode <- which.max(d$y) | |
1335 idx_lower <- 1:(idx_mode-1) | |
1336 idx_upper <- idx_mode:length(d$y) | |
1337 | |
1338 values_lower <- d$y[idx_lower] | |
1339 values_upper <- d$y[idx_upper] | |
1340 | |
1341 new_d <- d | |
1342 new_d$y <- c(sort(values_lower, decreasing = FALSE), | |
1343 sort(values_upper, decreasing = TRUE)) | |
1344 | |
1345 return(new_d) | |
1346 } | |
1347 | |
1348 local_df <- subset(data_subset, | |
1349 (PMD_FDR_peptide_length >= MIN_GOOD_PEPTIDE_LENGTH) & | |
1350 (used_to_find_middle == FALSE)) | |
1351 d <- verify_density (data_subset=local_df, value_field = "value_of_interest", descr_of_df = descr_of_df, ylim=ylim) | |
1352 d <- normalize_density (d) | |
1353 d <- uniformalize_density(d) | |
1354 | |
1355 return(d) | |
1356 } | |
1357 make_true_hit_density <- function(data_groups=NULL){ | |
1358 d_true <- make_hit_density(data_subset = subset(data_groups, (group_training_class == "good_testing") ), | |
1359 descr_of_df = "Good-testing dataset", | |
1360 ylim = get_ylim(data_groups)) | |
1361 return(d_true) | |
1362 } | |
1363 make_false_hit_density <- function(data_groups=NULL){ | |
1364 d_false <- make_hit_density(data_subset = subset(data_groups, (group_training_class == "bad_long") ), | |
1365 descr_of_df = "Bad-long dataset", | |
1366 ylim = get_ylim(data_groups)) | |
1367 | |
1368 return(d_false) | |
1369 } | |
1370 add_v_densities <- function(data_groups=NULL, densities=NULL, field_group = NULL){ | |
1371 groups <- sort(unique(data_groups$group_of_interest)) | |
1372 | |
1373 new_densities <- densities | |
1374 | |
1375 for (local_group in groups){ | |
1376 d_v <- make_hit_density(data_subset = subset(data_groups, (group_of_interest == local_group)), | |
1377 descr_of_df = sprintf("subset of data (where %s is '%s')", | |
1378 field_group, | |
1379 local_group), | |
1380 ylim = get_ylim(data_groups)) | |
1381 new_densities[local_group] <- d_v$y | |
1382 } | |
1383 | |
1384 return(new_densities) | |
1385 } | |
1386 | |
1387 # Main section of make_densities() | |
1388 df_data_groups <- get_data_groups()$df | |
1389 new_data_groups <- set_values_of_interest(df_data_groups, field_group = "group_decoy_input_score") | |
1390 d_true <- make_true_hit_density( new_data_groups) | |
1391 d_false <- make_false_hit_density(new_data_groups) | |
1392 | |
1393 densities <- data.frame(x=d_true$x, | |
1394 t=d_true$y, | |
1395 f=d_false$y) | |
1396 densities <- add_v_densities(data_groups=new_data_groups, densities=densities, field_group = "group_decoy_input_score") | |
1397 df <<- densities | |
1398 } | |
1399 ) | |
1400 ############################################################################### | |
1401 # Class: Data_Object_Alpha | |
1402 ############################################################################### | |
1403 Data_Object_Alpha <- setRefClass("Data_Object_Alpha", | |
1404 contains = "Data_Object", | |
1405 fields =list(df = "data.frame")) | |
1406 Data_Object_Alpha$methods( | |
1407 initialize = function(){ | |
1408 callSuper() | |
1409 class_name <<- "Data_Object_Alpha" | |
1410 }, | |
1411 verify = function(){ | |
1412 densities <- get_densities()$df | |
1413 | |
1414 checkTrue(nrow(densities) > 0, | |
1415 msg = "Densities data.frame was empty (and should not have been)") | |
1416 }, | |
1417 set_densities = function(parent=NULL){ | |
1418 parents[["densities"]] <<- parent | |
1419 }, | |
1420 get_densities = function(){ | |
1421 return(verified_element_of_list(parent_list = parents, element_name = "densities", object_name = "Data_Object_Alpha")) | |
1422 }, | |
1423 m_load_data = function(){ | |
1424 | |
1425 densities <- get_densities()$df | |
1426 | |
1427 max_of_density = apply(X = densities, MARGIN = 2, FUN = max) | |
1428 df_alpha <- data.frame(stringsAsFactors = FALSE, | |
1429 list(v = max_of_density, | |
1430 group_of_interest = names(max_of_density))) | |
1431 df_alpha <- subset(df_alpha, group_of_interest != "x") | |
1432 t <- with(subset(df_alpha, group_of_interest=="t"), v) | |
1433 f <- with(subset(df_alpha, group_of_interest=="f"), v) | |
1434 df_alpha$alpha <- with(df_alpha, (t-v)/(t-f)) | |
1435 | |
1436 alpha <- df_alpha[,c("group_of_interest", "alpha")] | |
1437 alpha <- subset(alpha, (group_of_interest != "t") & (group_of_interest != "f")) | |
1438 | |
1439 df <<- alpha | |
1440 } | |
1441 ) | |
1442 ############################################################################### | |
1443 # Class: Data_Processor | |
1444 ############################################################################### | |
1445 Data_Processor <- setRefClass("Data_Processor", | |
1446 fields =list(info = "Data_Object_Info", | |
1447 raw_data = "Data_Object_Raw_Data", | |
1448 raw_1_percent = "Data_Object_Raw_1_Percent", | |
1449 data_converter = "Data_Object_Data_Converter", | |
1450 data_groups = "Data_Object_Groupings", | |
1451 densities = "Data_Object_Densities", | |
1452 alpha = "Data_Object_Alpha", | |
1453 i_fdr = "Data_Object_Individual_FDR")) | |
1454 Data_Processor$methods( | |
1455 initialize = function(p_info=NULL){ | |
1456 if (! is.null(p_info)){ | |
1457 set_info(p_info) | |
1458 } | |
1459 }, | |
1460 set_info = function(p_info=NULL){ | |
1461 # This initialization defines all of the dependencies between the various components | |
1462 | |
1463 info <<- p_info | |
1464 | |
1465 # raw_data | |
1466 raw_data$set_info(info) | |
1467 info$append_child(raw_data) | |
1468 | |
1469 # raw_1_percent | |
1470 raw_1_percent$set_info(info) | |
1471 info$append_child(raw_1_percent) | |
1472 | |
1473 # data_converter | |
1474 data_converter$set_info (info) | |
1475 data_converter$set_raw_data(raw_data) | |
1476 info $append_child (data_converter) | |
1477 raw_data $append_child (data_converter) | |
1478 | |
1479 # data_groups | |
1480 data_groups$set_info (info) | |
1481 data_groups$set_data_converter(data_converter) | |
1482 data_groups$set_raw_1_percent (raw_1_percent) | |
1483 info $append_child (data_groups) | |
1484 data_converter$append_child (data_groups) | |
1485 raw_1_percent $append_child (data_groups) | |
1486 | |
1487 # densities | |
1488 densities $set_data_groups(data_groups) | |
1489 data_groups$append_child (densities) | |
1490 | |
1491 # alpha | |
1492 alpha $set_densities(densities) | |
1493 densities$append_child (alpha) | |
1494 | |
1495 # i_fdr | |
1496 i_fdr$set_data_groups(data_groups) | |
1497 i_fdr$set_densities (densities) | |
1498 i_fdr$set_alpha (alpha) | |
1499 data_groups $append_child(i_fdr) | |
1500 densities $append_child(i_fdr) | |
1501 alpha $append_child(i_fdr) | |
1502 } | |
1503 ) | |
1504 | |
1505 | |
1506 ############################################################# | |
1507 ####### Classes for Plotting | |
1508 ############################################################# | |
1509 | |
1510 ############################################################################### | |
1511 # Class: Plot_Image | |
1512 ############################################################################### | |
1513 Plot_Image = setRefClass("Plot_Image", | |
1514 fields = list(data_processors = "list", | |
1515 plot_title = "character", | |
1516 include_text = "logical", | |
1517 include_main = "logical", | |
1518 x.intersp = "numeric", | |
1519 y.intersp = "numeric", | |
1520 scale = "numeric", | |
1521 main = "character", | |
1522 is_image_container = "logical")) | |
1523 Plot_Image$methods( | |
1524 initialize = function(p_data_processors = list(), | |
1525 p_include_main = TRUE, | |
1526 p_include_text = TRUE, | |
1527 p_is_image_container = FALSE){ | |
1528 include_main <<- p_include_main | |
1529 include_text <<- p_include_text | |
1530 data_processors <<- p_data_processors | |
1531 is_image_container <<- p_is_image_container | |
1532 }, | |
1533 plot_image = function(){ | |
1534 plot(main="Define plot_image() for subclass") # Abstract function | |
1535 }, | |
1536 get_n = function(){ | |
1537 stop("Need to define function get_n() for subclass") #Abstract function | |
1538 }, | |
1539 create_standard_main = function(){ | |
1540 needs_main <- function(){ | |
1541 return(include_text & include_main & !is_image_container) | |
1542 } | |
1543 if (needs_main()){ | |
1544 collection_name <- data_processors[[1]]$info$collection_name() | |
1545 main <<- sprintf("%s\n(Dataset: %s; n=%s)", plot_title, collection_name, format(get_n(), big.mark = ",")) | |
1546 } | |
1547 }, | |
1548 plot_image_in_window = function(p_scale=NULL, window_height=NULL, window_width=NULL){ | |
1549 scale <<- p_scale | |
1550 SIZE_AXIS <- 2.5 * scale # in the units used by mar | |
1551 SIZE_MAIN <- 2.5 * scale | |
1552 SIZE_NO_MARGIN <- 0.1 * scale | |
1553 FONT_SIZE <- 8 * scale | |
1554 WINDOW_WIDTH <- window_width * scale | |
1555 WINDOW_HEIGHT <- window_height * scale | |
1556 X_INTERSP <- 0.5 * scale + 0.4 # manages legend text spacing | |
1557 Y_INTERSP <- 0.5 * scale + 0.4 # manages | |
1558 | |
1559 if (include_main){ | |
1560 mar = c(SIZE_AXIS, SIZE_AXIS, SIZE_MAIN , SIZE_NO_MARGIN) | |
1561 } else { | |
1562 mar = c(SIZE_AXIS, SIZE_AXIS, SIZE_NO_MARGIN, SIZE_NO_MARGIN) | |
1563 } | |
1564 mgp = c(SIZE_AXIS/2, SIZE_AXIS/4, 0) # Margin line (mex units) for axis title, axis labels, axis lines | |
1565 ps = FONT_SIZE | |
1566 x.intersp <<- X_INTERSP | |
1567 y.intersp <<- Y_INTERSP | |
1568 | |
1569 windows(width = WINDOW_WIDTH, height=WINDOW_HEIGHT) | |
1570 | |
1571 old_par <- par(mar=mar, ps=ps, mgp=mgp) | |
1572 create_standard_main() | |
1573 | |
1574 plot_image() | |
1575 if (!is_image_container){ | |
1576 axis(side=1, labels=include_text, tcl=-0.5, lwd=scale) | |
1577 axis(side=2, labels=include_text, tcl=-0.5, lwd=scale) | |
1578 box(lwd=scale) | |
1579 } | |
1580 par(old_par) | |
1581 }, | |
1582 plot_image_in_small_window = function(p_scale=1){ | |
1583 plot_image_in_window(p_scale=p_scale, window_height=2, window_width=3.25) | |
1584 }, | |
1585 plot_image_in_large_window = function(p_scale=1, window_height=NULL){ | |
1586 plot_image_in_window(p_scale=p_scale, window_height=window_height, window_width=7) | |
1587 } | |
1588 ) | |
1589 ############################################################################### | |
1590 # Class: Legend_Object | |
1591 ############################################################################### | |
1592 Legend_Object = setRefClass("Legend_Object", | |
1593 contains = "Plot_Image", | |
1594 fields = list(user_params = "list", | |
1595 scale = "numeric")) | |
1596 Legend_Object$methods( | |
1597 initialize = function(p_user_params = NULL, p_scale = NULL){ | |
1598 if (is.null(p_user_params)){ | |
1599 user_params <<- list() | |
1600 } else { | |
1601 user_params <<- p_user_params | |
1602 } | |
1603 if (is.null(p_scale)){ | |
1604 stop("Legend_Object must have a valid scale") | |
1605 } else { | |
1606 scale <<- p_scale | |
1607 } | |
1608 user_params$x <<- if_null(user_params$x , "topleft", user_params$x) | |
1609 user_params$y <<- if_null(user_params$y , NULL, user_params$y) | |
1610 user_params$bty <<- if_null(user_params$bty , "o", user_params$bty) | |
1611 user_params$lwd <<- if_null(user_params$lwd , NULL, user_params$lwd * scale) # Because we allow NULL, scale must be inside parens | |
1612 user_params$seg.len <<- if_null(user_params$seg.len , 3, user_params$seg.len ) * scale | |
1613 user_params$box.lwd <<- if_null(user_params$box.lwd , 1, user_params$box.lwd ) * scale | |
1614 user_params$x.intersp <<- if_null(user_params$x.intersp, 0.6, user_params$x.intersp) * scale | |
1615 user_params$y.intersp <<- if_null(user_params$y.intersp, 0.4, user_params$y.intersp) * scale + 0.2 | |
1616 }, | |
1617 show = function(){ | |
1618 first_legend = legend(x = user_params$x, | |
1619 y = user_params$y, | |
1620 title = "", | |
1621 legend = user_params$leg, | |
1622 col = user_params$col, | |
1623 bty = user_params$bty, | |
1624 lty = user_params$lty, | |
1625 lwd = user_params$lwd, | |
1626 seg.len = user_params$seg.len, | |
1627 box.lwd = user_params$box.lwd, | |
1628 x.intersp = user_params$x.intersp, | |
1629 y.intersp = user_params$y.intersp) | |
1630 new_x = first_legend$rect$left | |
1631 new_y = first_legend$rect$top + first_legend$rect$h * ifelse(scale==1, 0.07, 0.03 - (scale * 0.02)) #switch(scale, 0.01, -0.01, -0.03, -0.05)# (0.07 - 0.09 * ((scale-1)^2))#(0.15 - 0.08*scale)#.07 * (2 - scale) | |
1632 legend(x=new_x, y=new_y, title = user_params$title, legend = "", cex=1.15, bty="n") | |
1633 | |
1634 } | |
1635 ) | |
1636 ############################################################################### | |
1637 # Class: Plot_Multiple_Images | |
1638 ############################################################################### | |
1639 Plot_Multiple_Images = setRefClass("Plot_Multiple_Images", | |
1640 contains = "Plot_Image", | |
1641 fields = list(n_images_wide = "numeric", | |
1642 n_images_tall = "numeric", | |
1643 image_list = "list")) | |
1644 Plot_Multiple_Images$methods( | |
1645 initialize = function(p_n_images_wide=1, p_n_images_tall=2, p_image_list=NULL, ...){ | |
1646 n_images_wide <<- p_n_images_wide | |
1647 n_images_tall <<- p_n_images_tall | |
1648 image_list <<- p_image_list | |
1649 #plot_title <<- "True Hit and False Hit Distributions" | |
1650 | |
1651 callSuper(p_is_image_container=TRUE, ...) | |
1652 }, | |
1653 plot_image = function(){ | |
1654 # Support functions | |
1655 apply_mtext <- function(letter=NULL){ | |
1656 line=1.3*scale | |
1657 mtext(letter, side=1, line=line, adj=0) | |
1658 } | |
1659 # main code | |
1660 old_par <- par(mfrow=c(n_images_tall, n_images_wide)) | |
1661 i=0 | |
1662 n_images <- length(image_list) | |
1663 | |
1664 for (i in 1:n_images){ | |
1665 image <- image_list[[i]] | |
1666 image$create_standard_main() | |
1667 image$scale <- scale | |
1668 image$plot_image() | |
1669 axis(side=1, labels=include_text, tcl=-0.5, lwd=scale) | |
1670 axis(side=2, labels=include_text, tcl=-0.5, lwd=scale) | |
1671 box(lwd=scale) | |
1672 apply_mtext(letter=sprintf("(%s)", letters[i])) | |
1673 | |
1674 } | |
1675 par(old_par) | |
1676 | |
1677 } | |
1678 ) | |
1679 ############################################################################### | |
1680 # Class: Plot_Compare_PMD_and_Norm_Density | |
1681 ############################################################################### | |
1682 Plot_Compare_PMD_and_Norm_Density = setRefClass("Plot_Compare_PMD_and_Norm_Density", | |
1683 contains = "Plot_Image", | |
1684 fields = list(show_norm = "logical", | |
1685 display_n_psms = "logical")) | |
1686 Plot_Compare_PMD_and_Norm_Density$methods( | |
1687 initialize = function(p_show_norm=TRUE, p_display_n_psms=TRUE, ...){ | |
1688 show_norm <<- p_show_norm | |
1689 display_n_psms <<- p_display_n_psms | |
1690 plot_title <<- "True Hit and False Hit Distributions" | |
1691 | |
1692 callSuper(...) | |
1693 }, | |
1694 plot_image = function(){ | |
1695 # Support functions for plot_compare_PMD_and_norm_density() | |
1696 | |
1697 get_densities <- function(data_subset = NULL, var_value = NULL){ | |
1698 data_subset$value_of_interest <- data_subset[,var_value] | |
1699 from <- min(data_subset$value_of_interest, na.rm = TRUE) | |
1700 to <- max(data_subset$value_of_interest, na.rm = TRUE) | |
1701 xlim = range(data_subset$value_of_interest) | |
1702 data_true <- subset(data_subset, (PMD_FDR_decoy==0) & (PMD_FDR_input_score==100)) | |
1703 data_false <- subset(data_subset, (PMD_FDR_decoy==1)) | |
1704 | |
1705 d_true <- with(data_true , density(value_of_interest, from = from, to = to, na.rm = TRUE)) | |
1706 d_false <- with(data_false, density(value_of_interest, from = from, to = to, na.rm = TRUE)) | |
1707 d_true <- normalize_density(d_true) | |
1708 d_false <- normalize_density(d_false) | |
1709 | |
1710 densities <- list(d_true=d_true, d_false=d_false, var_value = var_value, n_true = nrow(data_true), n_false = nrow(data_false)) | |
1711 | |
1712 return(densities) | |
1713 } | |
1714 get_xlim <- function(densities_a = NULL, densities_b = NULL, show_norm=NULL){ | |
1715 xlim <- range(c( densities_a$d_true$x, densities_a$d_false$y)) | |
1716 if (show_norm){ | |
1717 xlim <- range(c(xlim, densities_b$d_true$x, densities_b$d_false$y)) | |
1718 } | |
1719 return(xlim) | |
1720 } | |
1721 get_ylim <- function(densities_a = NULL, densities_b = NULL, show_norm=NULL){ | |
1722 ylim <- range(c( densities_a$d_true$y, densities_a$d_false$y)) | |
1723 if (show_norm){ | |
1724 ylim <- range(c(ylim, densities_b$d_true$y, densities_b$d_false$y)) | |
1725 } | |
1726 return(ylim) | |
1727 } | |
1728 plot_distributions <- function(densities = NULL, var_value= NULL, dataset_name = NULL, ...){ | |
1729 leg = list() | |
1730 leg$leg = c("Good", "Bad") | |
1731 if (display_n_psms){ | |
1732 leg$leg = sprintf("%s (%d PSMs)", | |
1733 leg$leg, | |
1734 c(densities$n_true, densities$n_false)) | |
1735 | |
1736 } | |
1737 leg$col = c("black", "red") | |
1738 leg$lwd = c(3 , 3) | |
1739 leg$lty = c(1 , 2) | |
1740 leg$title = "Hit Category" | |
1741 xlab = ifelse(var_value == "value", | |
1742 "PMD (ppm)", | |
1743 "PMD - normalized (ppm)") | |
1744 ylab = "Density" | |
1745 if (!include_text){ | |
1746 xlab = "" | |
1747 ylab = "" | |
1748 } | |
1749 plot( densities$d_true , col=leg$col[1], lwd=leg$lwd[1] * scale, lty=leg$lty[1], xaxt = "n", yaxt = "n", main=main, xlab = xlab, ylab=ylab, ...) | |
1750 lines(densities$d_false, col=leg$col[2], lwd=leg$lwd[2] * scale, lty=leg$lty[2]) | |
1751 abline(v=0, h=0, col="gray", lwd=1*scale) | |
1752 if (include_text){ | |
1753 legend_object <- Legend_Object$new(leg, scale) | |
1754 legend_object$show() | |
1755 #legend("topleft", legend=leg.leg, col=leg.col, lwd=leg.lwd, lty=leg.lty, x.intersp = x.intersp, y.intersp = y.intersp) | |
1756 } | |
1757 } | |
1758 | |
1759 # Main code block for plot_compare_PMD_and_norm_density | |
1760 data_processor <- data_processors[[1]] | |
1761 data_processor$data_groups$ensure() | |
1762 data_groups <- data_processor$data_groups$df | |
1763 | |
1764 data_subset_a <- subset(data_groups , used_to_find_middle == FALSE) | |
1765 data_subset_b <- subset(data_subset_a, PMD_FDR_peptide_length > 11) | |
1766 | |
1767 densities_a <- get_densities(data_subset = data_subset_a, var_value = "value") | |
1768 densities_b <- get_densities(data_subset = data_subset_b, var_value = "value_norm") | |
1769 | |
1770 xlim=get_xlim(densities_a, densities_b, show_norm = show_norm) | |
1771 ylim=get_ylim(densities_a, densities_b, show_norm = show_norm) | |
1772 | |
1773 dataset_name <- data_processor$info$collection_name | |
1774 plot_distributions( densities=densities_a, var_value = "value" , dataset_name = dataset_name, xlim=xlim, ylim=ylim) | |
1775 if (show_norm){ | |
1776 plot_distributions(densities=densities_b, var_value = "value_norm", dataset_name = dataset_name, xlim=xlim, ylim=ylim) | |
1777 } | |
1778 }, | |
1779 get_n = function(){ | |
1780 data_processor <- data_processors[[1]] | |
1781 data_processor$data_groups$ensure() | |
1782 data_subset_a <- subset(data_processor$data_groups$df , used_to_find_middle == FALSE) | |
1783 data_subset_b <- subset(data_subset_a, PMD_FDR_peptide_length > 11) | |
1784 | |
1785 if (show_norm){ | |
1786 data_subset <- data_subset_a | |
1787 } else { | |
1788 data_subset <- data_subset_b | |
1789 } | |
1790 | |
1791 data_true <- subset(data_subset, (PMD_FDR_decoy==0) & (PMD_FDR_input_score==100)) | |
1792 data_false <- subset(data_subset, (PMD_FDR_decoy==1)) | |
1793 | |
1794 return(nrow(data_true) + nrow(data_false)) | |
1795 } | |
1796 ) | |
1797 | |
1798 ############################################################################### | |
1799 # Class: Plot_Time_Invariance_Alt | |
1800 ############################################################################### | |
1801 Plot_Time_Invariance_Alt = setRefClass("Plot_Time_Invariance_Alt", | |
1802 contains = "Plot_Image", | |
1803 fields = list(show_norm = "logical", | |
1804 display_n_psms = "logical", | |
1805 training_class = "character", | |
1806 ylim = "numeric", | |
1807 field_of_interest = "character")) | |
1808 Plot_Time_Invariance_Alt$methods( | |
1809 initialize = function(p_ylim=NULL, p_training_class=NULL, p_field_of_interest="value_norm", ...){ | |
1810 get_subset_title <- function(training_class=NULL){ | |
1811 if (training_class == "bad_long"){ | |
1812 subset_title="bad only" | |
1813 } else if (training_class == "good_testing"){ | |
1814 subset_title="good-testing only" | |
1815 } else if (training_class == "good_training"){ | |
1816 subset_title="good-training only" | |
1817 } else if (training_class == "other"){ | |
1818 subset_title="other only" | |
1819 } else { | |
1820 stop("Unexpected training_class in plot_time_invariance") | |
1821 } | |
1822 return(subset_title) | |
1823 } | |
1824 | |
1825 ylim <<- p_ylim | |
1826 training_class <<- p_training_class | |
1827 field_of_interest <<- p_field_of_interest | |
1828 subset_title <- get_subset_title(training_class=training_class) | |
1829 backup_title <- sprintf("Middle 25%% PMD for spectra sorted by index%s", | |
1830 ifelse(is.null(subset_title), | |
1831 "", | |
1832 sprintf(" - %s", subset_title))) | |
1833 #plot_title <<- get_main(main_title=main, backup_title=backup_title, data_collection = data_collection) | |
1834 plot_title <<- backup_title | |
1835 | |
1836 callSuper(...) | |
1837 }, | |
1838 plot_image = function(){ | |
1839 # Support functions for plot_time_invariance() | |
1840 | |
1841 # Main code of plot_time_invariance() | |
1842 data_subset = get_data_subset() | |
1843 plot_group_spectrum_index_from_subset_boxes(data_subset = data_subset) | |
1844 abline(h=0, col="blue", lwd=scale) | |
1845 }, | |
1846 get_data_subset = function(){ | |
1847 data_processor <- data_processors[[1]] | |
1848 data_processor$data_groups$ensure() | |
1849 return(subset(data_processor$data_groups$df, (group_training_class==training_class))) | |
1850 }, | |
1851 get_n = function(){ | |
1852 return(nrow(get_data_subset())) | |
1853 }, | |
1854 plot_group_spectrum_index_from_subset_boxes = function(data_subset = NULL){ | |
1855 n_plot_groups <- 100 | |
1856 | |
1857 field_name_text <- ifelse(field_of_interest=="value", "PMD", "Translated PMD") | |
1858 new_subset <- data_subset | |
1859 new_subset$value_of_interest <- new_subset[,field_of_interest] | |
1860 new_subset <- new_subset[order(new_subset$PMD_FDR_spectrum_index),] | |
1861 | |
1862 idxs <- round_to_tolerance(seq(from=1, to=nrow(new_subset), length.out = n_plot_groups+1), 1) | |
1863 idxs_left <- idxs[-(n_plot_groups+1)] | |
1864 idxs_right <- idxs[-1] - 1 | |
1865 idxs_right[n_plot_groups] <- idxs_right[n_plot_groups] + 1 | |
1866 | |
1867 new_subset$plot_group <- NA | |
1868 for (i in 1:n_plot_groups){ | |
1869 new_subset$plot_group[idxs_left[i]:idxs_right[i]] <- i | |
1870 } | |
1871 xleft <- aggregate(PMD_FDR_spectrum_index ~plot_group, data=new_subset, FUN=min) | |
1872 xright <- aggregate(PMD_FDR_spectrum_index ~plot_group, data=new_subset, FUN=max) | |
1873 ybottom <- aggregate(value_of_interest~plot_group, data=new_subset, FUN=function(x){quantile(x, probs = 0.5 - (0.25/2))}) | |
1874 ytop <- aggregate(value_of_interest~plot_group, data=new_subset, FUN=function(x){quantile(x, probs = 0.5 + (0.25/2))}) | |
1875 boxes <- merge( rename_column(xleft , "PMD_FDR_spectrum_index" , "xleft"), | |
1876 merge( rename_column(xright , "PMD_FDR_spectrum_index" , "xright"), | |
1877 merge(rename_column(ybottom, "value_of_interest", "ybottom"), | |
1878 rename_column(ytop , "value_of_interest", "ytop")))) | |
1879 | |
1880 xlab <- "Spectrum Index" | |
1881 ylab <- sprintf("%s (ppm)", field_name_text ) | |
1882 if (is.null(ylim)){ | |
1883 ylim <<- range(new_subset$value_of_interest) | |
1884 } | |
1885 if (!include_text){ | |
1886 xlab="" | |
1887 ylab="" | |
1888 } | |
1889 plot(value_of_interest~PMD_FDR_spectrum_index, data=new_subset, type="n", ylim=ylim, xlab = xlab, ylab=ylab, main=main, xaxt="n", yaxt="n") | |
1890 with(boxes, rect(xleft = xleft, ybottom = ybottom, xright = xright, ytop = ytop, lwd=scale)) | |
1891 #points(median_of_group_index~PMD_FDR_spectrum_index, data=data_subset, cex=.5, pch=15) | |
1892 axis(1, labels=include_text, lwd=scale) | |
1893 axis(2, labels=include_text, lwd=scale) | |
1894 box(lwd=scale) #box around plot area | |
1895 } | |
1896 | |
1897 ) | |
1898 ############################################################################### | |
1899 # Class: Plot_Time_Invariance_Alt_Before_and_After | |
1900 ############################################################################### | |
1901 Plot_Time_Invariance_Alt_Before_and_After = setRefClass("Plot_Time_Invariance_Alt_Before_and_After", | |
1902 contains = "Plot_Multiple_Images", | |
1903 fields = list()) | |
1904 Plot_Time_Invariance_Alt_Before_and_After$methods( | |
1905 initialize = function(p_data_processors = NULL, | |
1906 p_include_text=TRUE, | |
1907 p_include_main=FALSE, | |
1908 p_ylim = c(-4,4), ...){ | |
1909 plot_object1 <- Plot_Time_Invariance_Alt$new(p_data_processors = p_data_processors, | |
1910 p_include_text=p_include_text, | |
1911 p_include_main=p_include_main, | |
1912 p_training_class = "good_testing", | |
1913 p_field_of_interest = "value", | |
1914 p_ylim = p_ylim) | |
1915 | |
1916 plot_object2 <- Plot_Time_Invariance_Alt$new(p_data_processors = p_data_processors, | |
1917 p_include_text=p_include_text, | |
1918 p_include_main=p_include_main, | |
1919 p_training_class = "good_testing", | |
1920 p_field_of_interest = "value_norm", | |
1921 p_ylim = p_ylim) | |
1922 | |
1923 callSuper(p_n_images_wide=1, | |
1924 p_n_images_tall=2, | |
1925 p_include_text=p_include_text, | |
1926 p_include_main=p_include_main, | |
1927 p_image_list = list(plot_object1, plot_object2), ...) | |
1928 } | |
1929 ) | |
1930 | |
1931 ############################################################################### | |
1932 # Class: Plot_Density_PMD_and_Norm_Decoy_by_AA_Length | |
1933 ############################################################################### | |
1934 Plot_Density_PMD_and_Norm_Decoy_by_AA_Length = setRefClass("Plot_Density_PMD_and_Norm_Decoy_by_AA_Length", | |
1935 contains = "Plot_Image", | |
1936 fields = list(show_norm = "logical")) | |
1937 Plot_Density_PMD_and_Norm_Decoy_by_AA_Length$methods( | |
1938 initialize = function(p_show_norm=FALSE, ...){ | |
1939 plot_title <<- "The Decoy Bump: PMD Distribution of Decoy matches by peptide length" | |
1940 show_norm <<- p_show_norm | |
1941 callSuper(...) | |
1942 }, | |
1943 get_n = function(){ | |
1944 data_processor <- data_processors[[1]] | |
1945 data_processor$data_groups$ensure() | |
1946 data_subset <- subset(data_processor$data_groups$df, (PMD_FDR_decoy == 1)) | |
1947 return(nrow(data_subset)) | |
1948 }, | |
1949 plot_image = function(){ | |
1950 | |
1951 # Support functions for plot_density_PMD_and_norm_decoy_by_aa_length() | |
1952 | |
1953 add_group_peptide_length_special <- function(){ | |
1954 data_processor <- data_processors[[1]] | |
1955 data_processor$data_groups$ensure() | |
1956 data_groups <- data_processor$data_groups$df # the name data_groups is a data.frame instead of a Data_Object | |
1957 data_groups <- subset(data_groups, used_to_find_middle == FALSE) | |
1958 | |
1959 df_group_definition <- data.frame(stringsAsFactors = FALSE, | |
1960 list(group_peptide_length_special = c("06-08", "09-10", "11-12", "13-15", "16-20", "21-50"), | |
1961 min = c( 6 , 9 , 11 , 13 , 16 , 21 ), | |
1962 max = c( 8 , 10 , 12 , 15 , 20 , 50 ) )) | |
1963 group_peptide_length_special <- data.frame(list(PMD_FDR_peptide_length = 6:50)) | |
1964 group_peptide_length_special$min <- with(group_peptide_length_special, sapply(PMD_FDR_peptide_length, FUN = function(i) max(df_group_definition$min[df_group_definition$min <= i]))) | |
1965 group_peptide_length_special <- merge(group_peptide_length_special, df_group_definition) | |
1966 | |
1967 data_groups$group_peptide_length_special <- NULL | |
1968 new_data_groups <- (merge(data_groups, | |
1969 group_peptide_length_special[,c("PMD_FDR_peptide_length", | |
1970 "group_peptide_length_special")])) | |
1971 return(new_data_groups) | |
1972 } | |
1973 get_densities <- function(data_subset = NULL, field_value = NULL, field_group=NULL){ | |
1974 get_density_from_subset <- function(data_subset=NULL, xlim=NULL){ | |
1975 | |
1976 d_group <- with(data_subset , density(value_of_interest, from = xlim[1], to = xlim[2], na.rm=TRUE)) | |
1977 d_group <- normalize_density(d_group) | |
1978 | |
1979 return(d_group) | |
1980 } | |
1981 | |
1982 data_temp <- data_subset | |
1983 data_temp$value_of_interest <- data_temp[[field_value]] | |
1984 data_temp$group_of_interest <- data_temp[[field_group]] | |
1985 | |
1986 xlim = range(data_temp$value_of_interest, na.rm=TRUE) | |
1987 | |
1988 groups <- sort(unique(data_temp$group_of_interest)) | |
1989 n_groups <- length(groups) | |
1990 | |
1991 d_group <- get_density_from_subset( data_subset=data_temp, xlim = xlim ) | |
1992 densities <- list("All decoys" = d_group) | |
1993 for (i in 1:n_groups){ | |
1994 group <- groups[i] | |
1995 | |
1996 d_group <- get_density_from_subset( data_subset=subset(data_temp, (group_of_interest == group)), | |
1997 xlim = xlim ) | |
1998 densities[[group]] <- d_group | |
1999 } | |
2000 | |
2001 return(densities) | |
2002 } | |
2003 get_limits <- function(densities_a = NULL, densities_b = NULL){ | |
2004 xlim = c() | |
2005 ylim = c(0) | |
2006 for (single_density in densities_a){ | |
2007 xlim=range(c(xlim, single_density$x)) | |
2008 ylim=range(c(ylim, single_density$y)) | |
2009 } | |
2010 for (single_density in densities_b){ | |
2011 xlim=range(c(xlim, single_density$x)) | |
2012 ylim=range(c(ylim, single_density$y)) | |
2013 } | |
2014 | |
2015 return(list(xlim=xlim, ylim=ylim)) | |
2016 } | |
2017 plot_distributions <- function(data_groups = NULL, xlim=NULL, ylim=NULL, densities = NULL, field_group= NULL, field_value = "value", xlab_modifier = "", var_value= NULL, include_peak_dots=TRUE, dataset_name = NULL, ...){ | |
2018 data_groups$group_of_interest <- data_groups[[field_group]] | |
2019 data_groups$value_of_interest <- data_groups[[field_value]] | |
2020 | |
2021 # Main body of plot_decoy_distribution_by_field_of_interest() | |
2022 FIXED_LWD=3 | |
2023 | |
2024 groups <- sort(unique(data_groups$group_of_interest)) | |
2025 n <- length(groups) | |
2026 | |
2027 df_leg <- data.frame(stringsAsFactors = FALSE, | |
2028 list(leg = groups, | |
2029 col = rainbow_with_fixed_intensity(n = n, goal_intensity_0_1 = 0.4), | |
2030 lty = rep(1:6, length.out=n), | |
2031 lwd = rep(FIXED_LWD , n)) ) | |
2032 | |
2033 d <- densities[["All decoys"]] | |
2034 | |
2035 xlab = sprintf("Precursor Mass Discrepancy%s (ppm)", xlab_modifier) | |
2036 ylab = "Density" | |
2037 | |
2038 if (!include_text){ | |
2039 xlab="" | |
2040 ylab="" | |
2041 } | |
2042 plot(d, lwd=FIXED_LWD * scale, main=main, xlab=xlab, ylab=ylab, xlim=xlim, ylim=ylim, xaxt="n", yaxt="n") | |
2043 | |
2044 ave_peak <- max(d$y) | |
2045 max_peak <- 0 | |
2046 | |
2047 for (local_group in groups){ | |
2048 data_subset <- subset(data_groups, group_of_interest == local_group) | |
2049 data_info <- subset(df_leg , leg == local_group) | |
2050 col <- data_info$col[1] | |
2051 lty <- data_info$lty[1] | |
2052 lwd <- data_info$lwd[1] | |
2053 if (nrow(data_subset) > 100){ | |
2054 d <- densities[[local_group]] #density(data_subset[[field_value]]) | |
2055 lines(d, col=col, lty=lty, lwd=lwd * scale) | |
2056 peak <- max(d$y) | |
2057 max_peak <- max(max_peak, peak) | |
2058 } | |
2059 } | |
2060 abline(v=0, h=0, lwd=scale) | |
2061 leg <- list(title = "Peptide length (aa)", | |
2062 leg = c("All decoys" , df_leg$leg), | |
2063 col = c(col2hex("black") , df_leg$col), | |
2064 lty = c(1 , df_leg$lty), | |
2065 lwd = c(FIXED_LWD , df_leg$lwd) | |
2066 ) | |
2067 if (include_text){ | |
2068 legend_object = Legend_Object$new(leg, scale) | |
2069 legend_object$show() | |
2070 #first_legend = legend(x="topleft", title = "", legend = leg$leg, col = leg$col, lty = leg$lty, lwd = leg$lwd, seg.len=leg$seg.len, box.lwd=leg$box.lwd, x.intersp = leg$x.intersp, y.intersp = leg$y.intersp) | |
2071 #new_x = first_legend$rect$left | |
2072 #new_y = first_legend$rect$top + first_legend$rect$h * .07 * (2 - scale) | |
2073 #legend(x=new_x, y=new_y, title = leg$title, legend = "", cex=1.15, bty="n") | |
2074 } | |
2075 | |
2076 box(lwd=scale) #box around plot area | |
2077 | |
2078 } | |
2079 | |
2080 # Main body for plot_density_PMD_and_norm_decoy_by_aa_length() | |
2081 | |
2082 data_mod <- add_group_peptide_length_special() | |
2083 data_mod <- subset(data_mod, PMD_FDR_decoy==1) | |
2084 | |
2085 densities_a <- get_densities(data_subset = data_mod, field_value = "value" , field_group = "group_peptide_length_special") | |
2086 densities_b <- get_densities(data_subset = data_mod, field_value = "value_norm", field_group = "group_peptide_length_special") | |
2087 | |
2088 data_processor <- data_processors[[1]] | |
2089 dataset_name <- data_processor$info$collection_name() | |
2090 | |
2091 limits <- get_limits(densities_a, densities_b) | |
2092 xlim <- limits$xlim | |
2093 ylim <- limits$ylim | |
2094 | |
2095 if (show_norm){ | |
2096 plot_distributions(data_groups = data_mod, densities=densities_b, field_value = "value_norm", xlab_modifier = " - normalized", field_group = "group_peptide_length_special", dataset_name=dataset_name, xlim=xlim, ylim=ylim) | |
2097 } else { | |
2098 plot_distributions(data_groups = data_mod, densities=densities_a, field_value = "value" , xlab_modifier = "" , field_group = "group_peptide_length_special", dataset_name=dataset_name, xlim=xlim, ylim=ylim) | |
2099 } | |
2100 } | |
2101 | |
2102 ) | |
2103 | |
2104 ############################################################################### | |
2105 # Class: Plot_Bad_CI | |
2106 ############################################################################### | |
2107 Plot_Bad_CI = setRefClass("Plot_Bad_CI", | |
2108 contains = "Plot_Image", | |
2109 fields = list(breaks = "numeric", | |
2110 ylim = "numeric")) | |
2111 Plot_Bad_CI$methods( | |
2112 initialize = function(p_breaks=20, p_ylim=NULL, ...){ | |
2113 if (is.null(p_ylim)){ | |
2114 ylim <<- numeric(0) | |
2115 } else { | |
2116 ylim <<- p_ylim | |
2117 } | |
2118 breaks <<- p_breaks | |
2119 plot_title <<- "Credible Intervals for proportion within range - bad" | |
2120 callSuper(...) | |
2121 }, | |
2122 data_processor = function(){ | |
2123 return(data_processors[[1]]) | |
2124 }, | |
2125 get_n = function(){ | |
2126 data_processor()$data_groups$ensure() | |
2127 return(nrow(subset(data_processor()$data_groups$df, (PMD_FDR_decoy == 1)))) | |
2128 }, | |
2129 plot_image = function(){ | |
2130 data_processor()$data_groups$ensure() | |
2131 data_groups <- data_processor()$data_groups$df | |
2132 data_decoy <- subset(data_groups, data_groups$group_training_class == "bad_long") | |
2133 data_decoy$region <- cut(x = data_decoy$value, breaks = breaks) | |
2134 table(data_decoy$region) | |
2135 regions <- unique(data_decoy$region) | |
2136 | |
2137 N = nrow(data_decoy) | |
2138 find_lower_ci_bound <- function(x){ | |
2139 ci <- credible_interval(length(x), N, precision = 0.001, alpha=0.05) | |
2140 return(ci[1]) | |
2141 } | |
2142 find_upper_ci_bound <- function(x){ | |
2143 ci <- credible_interval(length(x), N, precision = 0.001, alpha=0.05) | |
2144 return(ci[2]) | |
2145 } | |
2146 xleft <- aggregate(value~region, data=data_decoy, FUN=min) | |
2147 xright <- aggregate(value~region, data=data_decoy, FUN=max) | |
2148 ytop <- aggregate(value~region, data=data_decoy, FUN=find_upper_ci_bound) | |
2149 ybottom <- aggregate(value~region, data=data_decoy, FUN=find_lower_ci_bound) | |
2150 | |
2151 xleft <- rename_column(xleft , "value", "xleft" ) | |
2152 xright <- rename_column(xright , "value", "xright" ) | |
2153 ytop <- rename_column(ytop , "value", "ytop" ) | |
2154 ybottom <- rename_column(ybottom, "value", "ybottom") | |
2155 | |
2156 boxes <- merge(merge(xleft, xright), merge(ytop, ybottom)) | |
2157 | |
2158 | |
2159 xlab <- "Precursor Mass Discrepancy (ppm)" | |
2160 ylab <- "Proportion of PSMs\nin subgroup" | |
2161 xlim=range(data_decoy$value, na.rm = TRUE) | |
2162 get_ylim(boxes=boxes) | |
2163 if (!include_text){ | |
2164 xlab="" | |
2165 ylab="" | |
2166 } | |
2167 | |
2168 plot(x=c(-10,10), y=c(0,1), type="n", ylim=ylim, xlim=xlim, xlab=xlab, ylab=ylab, main=main, xaxt="n", yaxt="n") | |
2169 | |
2170 with(boxes, rect(xleft=xleft, xright=xright, ytop=ytop, ybottom=ybottom, lwd=scale)) | |
2171 | |
2172 abline(h=1/breaks, col="blue", lwd=scale) | |
2173 }, | |
2174 get_ylim = function(boxes=NULL){ | |
2175 is_valid_range <- function(r=NULL){ | |
2176 return(length(r) == 2) | |
2177 } | |
2178 if (! is_valid_range(ylim)){ | |
2179 ylim <<- range(c(0,boxes$ytop, boxes$ybottom)) | |
2180 } | |
2181 } | |
2182 | |
2183 ) | |
2184 ############################################################################### | |
2185 # Class: Plot_Selective_Loss | |
2186 ############################################################################### | |
2187 Plot_Selective_Loss = setRefClass("Plot_Selective_Loss", | |
2188 contains = "Plot_Image", | |
2189 fields = list()) | |
2190 Plot_Selective_Loss$methods( | |
2191 initialize = function( ...){ | |
2192 plot_title <<- "PMD-FDR Selectively removes Bad Hits" | |
2193 callSuper(...) | |
2194 }, | |
2195 data_processor = function(){ | |
2196 return(data_processors[[1]]) | |
2197 }, | |
2198 get_n = function(){ | |
2199 data_processor()$i_fdr$ensure() | |
2200 data_subset <- data_processor()$i_fdr$df | |
2201 return(nrow(data_subset)) | |
2202 }, | |
2203 plot_image = function(){ | |
2204 # Support functions for plot_selective_loss() | |
2205 | |
2206 samples_lost_by_threshold <- function(updated_i_fdr=NULL, score_threshold=NULL){ | |
2207 data_subset <- subset(updated_i_fdr, PMD_FDR_input_score >= score_threshold) | |
2208 tbl <- with(updated_i_fdr, | |
2209 table(PMD_FDR_input_score >= score_threshold, | |
2210 new_confidence < score_threshold, | |
2211 group_decoy_proteins)) | |
2212 df <- data.frame(tbl) | |
2213 df_n <- aggregate(Freq~group_decoy_proteins+Var1, data=df, FUN=sum) | |
2214 df_n <- rename_column(df_n, name_before = "Freq", "n") | |
2215 df <- merge(df, df_n) | |
2216 df$rate_of_loss <- with(df, Freq/n) | |
2217 df <- subset(df, (Var1==TRUE) & (Var2==TRUE)) | |
2218 df <- df[,c("group_decoy_proteins", "rate_of_loss", "n", "Freq")] | |
2219 if (nrow(df) > 0){ | |
2220 df$score_threshold <- score_threshold | |
2221 } | |
2222 return(df) | |
2223 } | |
2224 get_loss_record <- function(updated_i_fdr=NULL, score_thresholds=NULL){ | |
2225 df=data.frame() | |
2226 for (score_threshold in score_thresholds){ | |
2227 df_new_loss <- samples_lost_by_threshold(updated_i_fdr, score_threshold) | |
2228 df <- rbind(df, df_new_loss) | |
2229 } | |
2230 return(df) | |
2231 } | |
2232 | |
2233 # Main code for plot_selective_loss() | |
2234 | |
2235 updated_i_fdr <- data_processor()$i_fdr$df | |
2236 updated_i_fdr$new_confidence <- with(updated_i_fdr, 100 * (1-i_fdr)) #ifelse((1-i_fdr) < (PMD_FDR_input_score / 100), (1-i_fdr), (PMD_FDR_input_score/100))) | |
2237 loss_record <- get_loss_record(updated_i_fdr=updated_i_fdr, score_thresholds = 1:100) | |
2238 xlim <- with(loss_record, range(score_threshold)) | |
2239 ylim <- c(0,1) | |
2240 xlab <- "Fixed Confidence threshold (PeptideShaker score)" | |
2241 ylab <- "Rate of PSM disqualification from PMD-FDR" | |
2242 lwd <- 4 | |
2243 plot(x=xlim, y=ylim, type="n", main=main, xlab=xlab, ylab=ylab) | |
2244 | |
2245 groups <- sort(unique(loss_record$group_decoy_proteins)) | |
2246 n_g <- length(groups) | |
2247 | |
2248 cols <- rainbow_with_fixed_intensity(n=n_g, goal_intensity_0_1 = 0.5, alpha = 1) | |
2249 ltys <- rep(1:6, length.out=n_g) | |
2250 | |
2251 leg <- list(leg=groups, col=cols, lty=ltys, lwd=lwd, title="Species/Category") | |
2252 | |
2253 for (i in 1:n_g){ | |
2254 lines(rate_of_loss~score_threshold, data=subset(loss_record, group_decoy_proteins==leg$leg[i]), col=leg$col[i], lwd=leg$lwd * scale, lty=leg$lty[i]) | |
2255 } | |
2256 abline(h=0, v=100, lwd=scale) | |
2257 abline(h=c(0.1, 0.8), col="gray", lwd=scale) | |
2258 | |
2259 #leg = list(leg=group, col=col, lty=lty, lwd=lwd) | |
2260 #with(leg, legend(x = "topleft", legend = group, col = col, lty = lty, lwd = lwd, seg.len = seg.len)) | |
2261 legend_object <- Legend_Object$new(leg, scale) | |
2262 legend_object$show() | |
2263 } | |
2264 | |
2265 ) | |
2266 ############################################################################### | |
2267 # Class: Plot_Selective_Loss_for_TOC | |
2268 ############################################################################### | |
2269 Plot_Selective_Loss_for_TOC = setRefClass("Plot_Selective_Loss_for_TOC", | |
2270 contains = "Plot_Image", | |
2271 fields = list(xlab="character", | |
2272 ylab="character", | |
2273 title_x="numeric", | |
2274 title_y="numeric", | |
2275 legend_border="logical", | |
2276 legend_x = "numeric", | |
2277 legend_y = "numeric", | |
2278 legend_title="character", | |
2279 legend_location = "character", | |
2280 name_contaminant = "character", | |
2281 name_decoy = "character", | |
2282 name_human = "character", | |
2283 name_pyro = "character")) | |
2284 Plot_Selective_Loss_for_TOC$methods( | |
2285 initialize = function( ...){ | |
2286 plot_title <<- "PMD-FDR selectively removes bad hits" | |
2287 callSuper(...) | |
2288 xlab <<- "Confidence threshold (PeptideShaker)" | |
2289 ylab <<- "PMD Disqualifiction Rate" | |
2290 legend_border <<- FALSE | |
2291 #legend_title <<- "Species/Category" | |
2292 title_x <<- 50 | |
2293 title_y <<- 0.9 | |
2294 legend_x <<- 10 | |
2295 legend_y <<- 0.75 | |
2296 name_contaminant <<- "signal - contaminant" | |
2297 name_decoy <<- "decoy - reversed" | |
2298 name_human <<- "decoy - human" | |
2299 name_pyro <<- "signal - pyrococcus" | |
2300 }, | |
2301 data_processor = function(){ | |
2302 return(data_processors[[1]]) | |
2303 }, | |
2304 get_n = function(){ | |
2305 data_processor()$i_fdr$ensure() | |
2306 data_subset <- data_processor()$i_fdr$df | |
2307 return(nrow(data_subset)) | |
2308 }, | |
2309 plot_image = function(){ | |
2310 # Support functions for plot_selective_loss() | |
2311 | |
2312 samples_lost_by_threshold <- function(updated_i_fdr=NULL, score_threshold=NULL){ | |
2313 data_subset <- subset(updated_i_fdr, PMD_FDR_input_score >= score_threshold) | |
2314 tbl <- with(updated_i_fdr, | |
2315 table(PMD_FDR_input_score >= score_threshold, | |
2316 new_confidence < score_threshold, | |
2317 group_decoy_proteins)) | |
2318 df <- data.frame(tbl) | |
2319 df_n <- aggregate(Freq~group_decoy_proteins+Var1, data=df, FUN=sum) | |
2320 df_n <- rename_column(df_n, name_before = "Freq", "n") | |
2321 df <- merge(df, df_n) | |
2322 df$rate_of_loss <- with(df, Freq/n) | |
2323 df <- subset(df, (Var1==TRUE) & (Var2==TRUE)) | |
2324 df <- df[,c("group_decoy_proteins", "rate_of_loss", "n", "Freq")] | |
2325 if (nrow(df) > 0){ | |
2326 df$score_threshold <- score_threshold | |
2327 } | |
2328 return(df) | |
2329 } | |
2330 get_loss_record <- function(updated_i_fdr=NULL, score_thresholds=NULL){ | |
2331 df=data.frame() | |
2332 for (score_threshold in score_thresholds){ | |
2333 df_new_loss <- samples_lost_by_threshold(updated_i_fdr, score_threshold) | |
2334 df <- rbind(df, df_new_loss) | |
2335 } | |
2336 return(df) | |
2337 } | |
2338 convert_groups <- function(groups=NULL){ | |
2339 new_groups <- groups | |
2340 new_groups <- gsub(pattern = "contaminant", replacement = name_contaminant, x = new_groups) | |
2341 new_groups <- gsub(pattern = "decoy" , replacement = name_decoy , x = new_groups) | |
2342 new_groups <- gsub(pattern = "human" , replacement = name_human , x = new_groups) | |
2343 new_groups <- gsub(pattern = "pyrococcus" , replacement = name_pyro , x = new_groups) | |
2344 | |
2345 return(new_groups) | |
2346 } | |
2347 | |
2348 # Main code for plot_selective_loss() | |
2349 | |
2350 updated_i_fdr <- data_processor()$i_fdr$df | |
2351 updated_i_fdr$new_confidence <- with(updated_i_fdr, 100 * (1-i_fdr)) #ifelse((1-i_fdr) < (PMD_FDR_input_score / 100), (1-i_fdr), (PMD_FDR_input_score/100))) | |
2352 loss_record <- get_loss_record(updated_i_fdr=updated_i_fdr, score_thresholds = 1:100) | |
2353 xlim <- with(loss_record, range(score_threshold)) | |
2354 ylim <- c(0,1) | |
2355 #xlab <- "Fixed Confidence threshold (PeptideShaker score)" | |
2356 #ylab <- "Rate of PSM disqualification from PMD-FDR" | |
2357 lwd <- 4 | |
2358 plot(x=xlim, y=ylim, type="n", main=main, xlab=xlab, ylab=ylab) | |
2359 | |
2360 groups <- sort(unique(loss_record$group_decoy_proteins)) | |
2361 n_g <- length(groups) | |
2362 | |
2363 cols <- rainbow_with_fixed_intensity(n=n_g, goal_intensity_0_1 = 0.5, alpha = 1) | |
2364 ltys <- rep(1:6, length.out=n_g) | |
2365 bty <- ifelse(legend_border, "o", "n") | |
2366 | |
2367 leg <- list(leg=convert_groups(groups), var_name=groups, col=cols, lty=ltys, lwd=lwd, bty=bty, x=legend_x, y=legend_y) | |
2368 #leg <- list(leg=groups, col=cols, lty=ltys, lwd=lwd, bty=bty, x=legend_x, y=legend_y) | |
2369 | |
2370 for (i in 1:n_g){ | |
2371 lines(rate_of_loss~score_threshold, data=subset(loss_record, group_decoy_proteins==leg$var_name[i]), col=leg$col[i], lwd=leg$lwd * scale, lty=leg$lty[i]) | |
2372 } | |
2373 abline(h=0, v=100, lwd=scale) | |
2374 abline(h=c(0.1, 0.8), col="gray", lwd=scale) | |
2375 | |
2376 #leg = list(leg=group, col=col, lty=lty, lwd=lwd) | |
2377 #with(leg, legend(x = "topleft", legend = group, col = col, lty = lty, lwd = lwd, seg.len = seg.len)) | |
2378 legend_object <- Legend_Object$new(leg, scale) | |
2379 legend_object$show() | |
2380 text(x=title_x, y=title_y, labels = plot_title) | |
2381 } | |
2382 | |
2383 ) | |
2384 ############################################################################### | |
2385 # Class: Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR | |
2386 ############################################################################### | |
2387 Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR = setRefClass("Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR", | |
2388 contains = "Plot_Image", | |
2389 fields = list()) | |
2390 Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR$methods( | |
2391 initialize = function( ...){ | |
2392 plot_title <<- "Precursor Mass Discrepance i-FDR for 1% Target-Decoy FDR PSMs" | |
2393 callSuper(...) | |
2394 }, | |
2395 data_processor = function(){ | |
2396 return(data_processors[[1]]) | |
2397 }, | |
2398 get_n = function(){ | |
2399 data_processor()$i_fdr$ensure() | |
2400 if (one_percent_calculation_exists()){ | |
2401 i_fdr <- data_processor()$i_fdr$df | |
2402 data_subset <- subset(i_fdr, is_in_1percent==TRUE) | |
2403 n <- nrow(data_subset) | |
2404 } else { | |
2405 n <- 0 | |
2406 } | |
2407 | |
2408 return (n) | |
2409 }, | |
2410 plot_image = function(){ | |
2411 if (one_percent_calculation_exists()){ | |
2412 i_fdr <- get_modified_fdr() | |
2413 report_good_discrepancies(i_fdr) | |
2414 data_TD_good <- get_data_TD_good(i_fdr) | |
2415 mean_results <- get_mean_results(data_TD_good) | |
2416 boxes <- mean_results | |
2417 boxes <- rename_columns(df = boxes, | |
2418 names_before = c("min_conf", "max_conf", "lower" , "upper"), | |
2419 names_after = c("xleft" , "xright" , "ybottom", "ytop" )) | |
2420 xlim <- range(boxes[,c("xleft", "xright")]) | |
2421 ylim <- range(boxes[,c("ybottom", "ytop")]) | |
2422 | |
2423 #head(mean_results) | |
2424 | |
2425 xlab = "Confidence Score (Peptide Shaker)" | |
2426 ylab = "Mean PMD i-FDR" | |
2427 | |
2428 if (!include_text){ | |
2429 xlab="" | |
2430 ylab="" | |
2431 } | |
2432 | |
2433 plot(mean_i_fdr~mean_conf, data=mean_results, xlim=xlim, ylim=ylim, xlab=xlab, ylab=ylab, main=main, xaxt="n", yaxt="n", cex=scale, lwd=scale) | |
2434 with(boxes, rect(xleft = xleft, ybottom = ybottom, xright = xright, ytop = ytop, lwd=scale)) | |
2435 abline(b=-1, a=100, lwd=4*scale, col="dark gray") | |
2436 abline(h=0, v=100, lwd=1*scale) | |
2437 | |
2438 } else { | |
2439 stop(sprintf("Dataset '%s' does not include 1%% FDR data", data_processor()$info$collection_name())) | |
2440 } | |
2441 }, | |
2442 get_mean_results = function(data_TD_good = NULL){ | |
2443 mean_i_fdr <- aggregate(i_fdr~conf_group, data=data_TD_good, FUN=mean) | |
2444 mean_i_fdr <- rename_column(mean_i_fdr, "i_fdr", "mean_i_fdr") | |
2445 sd_i_fdr <- aggregate(i_fdr~conf_group, data=data_TD_good, FUN=sd) | |
2446 sd_i_fdr <- rename_column(sd_i_fdr, "i_fdr", "sd_i_fdr") | |
2447 n_i_fdr <- aggregate(i_fdr~conf_group, data=data_TD_good, FUN=length) | |
2448 n_i_fdr <- rename_column(n_i_fdr, "i_fdr", "n") | |
2449 mean_conf <- aggregate(PMD_FDR_input_score~conf_group, data=data_TD_good, FUN=mean) | |
2450 mean_conf <- rename_column(mean_conf, "PMD_FDR_input_score", "mean_conf") | |
2451 min_conf <- aggregate(PMD_FDR_input_score~conf_group, data=data_TD_good, FUN=min) | |
2452 min_conf <- rename_column(min_conf, "PMD_FDR_input_score", "min_conf") | |
2453 max_conf <- aggregate(PMD_FDR_input_score~conf_group, data=data_TD_good, FUN=max) | |
2454 max_conf <- rename_column(max_conf, "PMD_FDR_input_score", "max_conf") | |
2455 | |
2456 mean_results <- mean_i_fdr | |
2457 mean_results <- merge(mean_results, sd_i_fdr) | |
2458 mean_results <- merge(mean_results, n_i_fdr) | |
2459 mean_results <- merge(mean_results, mean_conf) | |
2460 mean_results <- merge(mean_results, min_conf) | |
2461 mean_results <- merge(mean_results, max_conf) | |
2462 | |
2463 mean_results$se <- with(mean_results, sd_i_fdr / sqrt(n - 1)) | |
2464 mean_results$lower <- with(mean_results, mean_i_fdr - 2*se) | |
2465 mean_results$upper <- with(mean_results, mean_i_fdr + 2*se) | |
2466 return(mean_results) | |
2467 }, | |
2468 get_data_TD_good = function(i_fdr=NULL){ | |
2469 data_TD_good <- subset(i_fdr, TD_good==TRUE) | |
2470 data_TD_good <- data_TD_good[order(data_TD_good$PMD_FDR_input_score),] | |
2471 n <- nrow(data_TD_good) | |
2472 data_TD_good$conf_group <- cut(1:n, breaks=floor(n/100)) | |
2473 data_TD_good$i_fdr <- 100 * data_TD_good$i_fdr | |
2474 return(data_TD_good) | |
2475 }, | |
2476 get_modified_fdr = function(){ | |
2477 i_fdr <- data_processor()$i_fdr$df | |
2478 i_fdr$PMD_good <- i_fdr$i_fdr < 0.01 | |
2479 i_fdr$TD_good <- i_fdr$is_in_1percent == TRUE | |
2480 i_fdr$conf_good <- i_fdr$PMD_FDR_input_score == 100 | |
2481 return(i_fdr) | |
2482 }, | |
2483 one_percent_calculation_exists = function(){ | |
2484 data_processor()$raw_1_percent$ensure() | |
2485 return(data_processor()$raw_1_percent$exists())# "is_in_1percent" %in% colnames(data_processor()$i_fdr)) | |
2486 }, | |
2487 report_good_discrepancies = function(i_fdr=NULL){ | |
2488 with(subset(i_fdr, (PMD_FDR_decoy == 0)), print(table(TD_good, PMD_good))) | |
2489 with(subset(i_fdr, (PMD_FDR_input_score==100) & (PMD_FDR_decoy == 0)), print(table(TD_good, PMD_good))) | |
2490 with(subset(i_fdr, (PMD_FDR_input_score>= 99) & (PMD_FDR_input_score<100) & (PMD_FDR_decoy == 0)), print(table(TD_good, PMD_good))) | |
2491 with(subset(i_fdr, (PMD_FDR_input_score>= 99) & (PMD_FDR_input_score<100) & (PMD_FDR_decoy == 0)), print(table(TD_good, PMD_good))) | |
2492 with(subset(i_fdr, (PMD_FDR_input_score>= 90) & (PMD_FDR_input_score< 99) & (PMD_FDR_decoy == 0)), print(table(TD_good, PMD_good))) | |
2493 } | |
2494 | |
2495 ) | |
2496 | |
2497 ############################################################################### | |
2498 # Class: Plot_Density_PMD_by_Score | |
2499 ############################################################################### | |
2500 Plot_Density_PMD_by_Score = setRefClass("Plot_Density_PMD_by_Score", | |
2501 contains = "Plot_Image", | |
2502 fields = list(show_norm = "logical")) | |
2503 Plot_Density_PMD_by_Score$methods( | |
2504 initialize = function(p_show_norm=FALSE, ...){ | |
2505 show_norm <<- p_show_norm | |
2506 plot_title <<- "PMD distribution, by Confidence ranges" | |
2507 callSuper(...) | |
2508 | |
2509 }, | |
2510 data_processor = function(){ | |
2511 return(data_processors[[1]]) | |
2512 }, | |
2513 get_n = function(){ | |
2514 return(nrow(data_processor()$data_groups$df)) | |
2515 #data_subset <- data_collection$i_fdr | |
2516 #return(nrow(data_subset)) | |
2517 }, | |
2518 get_modified_data_groups = function(var_value = NULL){ | |
2519 # Note: Filters out used_to_find_middle | |
2520 # Note: Creates "value_of_interest" field | |
2521 # Note: Remakes "group_decoy_input_score" field | |
2522 data_new <- data_processor()$data_groups$df | |
2523 data_new <- subset(data_new, !used_to_find_middle ) | |
2524 data_new$value_of_interest <- data_new[, var_value] | |
2525 | |
2526 cutoff_points <- c(100, 100, 95, 80, 50, 0, 0) | |
2527 n <- length(cutoff_points) | |
2528 uppers <- cutoff_points[-n] | |
2529 lowers <- cutoff_points[-1] | |
2530 | |
2531 for (i in 1:(n-1)){ | |
2532 upper <- uppers[i] | |
2533 lower <- lowers[i] | |
2534 | |
2535 | |
2536 if (lower==upper){ | |
2537 idx <- with(data_new, which( (PMD_FDR_input_score == upper) & (PMD_FDR_decoy == 0))) | |
2538 cat_name <- sprintf("%d", upper) | |
2539 } else { | |
2540 idx <- with(data_new, which((PMD_FDR_input_score >= lower) & (PMD_FDR_input_score < upper) & (PMD_FDR_decoy == 0))) | |
2541 cat_name <- sprintf("%02d - %2d", lower, upper) | |
2542 } | |
2543 data_new$group_decoy_input_score[idx] <- cat_name | |
2544 } | |
2545 | |
2546 return(data_new) | |
2547 }, | |
2548 plot_image = function(){ | |
2549 | |
2550 # Support functions for plot_density_PMD_by_score() | |
2551 | |
2552 get_densities <- function(data_subset = NULL, var_value = NULL){ | |
2553 | |
2554 # Support functions for get_densities() | |
2555 | |
2556 # New version | |
2557 | |
2558 # Main body of get_densities() | |
2559 | |
2560 data_subset <- get_modified_data_groups(var_value=var_value) | |
2561 #data_subset$value_of_interest <- data_subset[,var_value] | |
2562 from <- min(data_subset$value_of_interest, na.rm=TRUE) | |
2563 to <- max(data_subset$value_of_interest, na.rm=TRUE) | |
2564 xlim = range(data_subset$value_of_interest, na.rm=TRUE) | |
2565 | |
2566 groups <- sort(unique(data_subset$group_decoy_input_score), decreasing = TRUE) | |
2567 n_groups <- length(groups) | |
2568 | |
2569 densities <- list(var_value = var_value, groups=groups) | |
2570 | |
2571 for (i in 1:n_groups){ | |
2572 group <- groups[i] | |
2573 | |
2574 data_group_single <- subset(data_subset, (group_decoy_input_score == group)) | |
2575 d_group <- with(data_group_single , density(value_of_interest, from = from, to = to, na.rm = TRUE)) | |
2576 d_group <- normalize_density(d_group) | |
2577 | |
2578 densities[[group]] <- d_group | |
2579 } | |
2580 | |
2581 return(densities) | |
2582 | |
2583 } | |
2584 get_xlim <- function(densities_a = NULL, densities_b = NULL){ | |
2585 groups <- densities_a$groups | |
2586 | |
2587 xlim <- 0 | |
2588 for (group in groups){ | |
2589 xlim <- range(xlim, densities_a[[group]]$x, densities_b[[group]]$x) | |
2590 } | |
2591 | |
2592 return(xlim) | |
2593 | |
2594 } | |
2595 get_ylim <- function(densities_a = NULL, densities_b = NULL){ | |
2596 groups <- densities_a$groups | |
2597 | |
2598 ylim <- 0 | |
2599 for (group in groups){ | |
2600 ylim <- range(ylim, densities_a[[group]]$y, densities_b[[group]]$y) | |
2601 } | |
2602 | |
2603 return(ylim) | |
2604 | |
2605 } | |
2606 plot_distributions <- function(densities = NULL, var_value= NULL,include_peak_dots=TRUE, xlab_modifier="", xlim=NULL, ylim=NULL, ...){ | |
2607 data_groups <- get_modified_data_groups(var_value=var_value) | |
2608 groups <- sort(unique(data_groups$group_decoy_input_score)) | |
2609 n_groups <- length(groups) | |
2610 | |
2611 groups_std <- setdiff(groups, c("100", "decoy", "0") ) | |
2612 groups_std <- sort(groups_std, decreasing = TRUE) | |
2613 groups_std <- c(groups_std, "0") | |
2614 n_std <- length(groups_std) | |
2615 cols <- rainbow_with_fixed_intensity(n = n_std, goal_intensity_0_1 = 0.5, alpha=0.5) | |
2616 | |
2617 leg <- list(group = c("100" , groups_std , "decoy" ), | |
2618 leg = c("100" , groups_std , "All Decoys" ), | |
2619 col = c(col2hex("black") , cols , col2hex("purple", col_alpha = 0.5)), | |
2620 lwd = c(4 , rep(2, n_std), 4 ), | |
2621 title = "Confidence Score") | |
2622 | |
2623 xlab = sprintf("Precursor Mass Discrepancy%s (ppm)", | |
2624 xlab_modifier) | |
2625 ylab = "Density" | |
2626 if (!include_text){ | |
2627 xlab="" | |
2628 ylab="" | |
2629 } | |
2630 | |
2631 | |
2632 plot( x=xlim, y=ylim, col=leg$col[1], lwd=leg$lwd[1] * scale, main=main, xlab=xlab, ylab=ylab, xaxt="n", yaxt="n", cex=scale, type="n")#, lty=leg.lty[1], ...) | |
2633 | |
2634 include_peak_dots = FALSE # BUGBUG: Disabling this for now. Need to move this to class parameter | |
2635 | |
2636 for (i in 1:length(leg$group)){ | |
2637 group <- leg$group[i] | |
2638 d <- densities[[group]] | |
2639 lines(d, col=leg$col[i], lwd=leg$lwd[i] * scale) | |
2640 if (include_peak_dots){ | |
2641 x=d$x[which.max(d$y)] | |
2642 y=max(d$y) | |
2643 points(x=c(x,x), y=c(0,y), pch=19, col=leg$col[i], cex=scale) | |
2644 } | |
2645 } | |
2646 | |
2647 abline(v=0, lwd=scale) | |
2648 | |
2649 if (include_text){ | |
2650 legend_object = Legend_Object$new(leg, scale) | |
2651 legend_object$show() | |
2652 } | |
2653 | |
2654 } | |
2655 | |
2656 # Main body for plot_density_PMD_by_score() | |
2657 | |
2658 data_groups <- data_processor()$data_groups$df | |
2659 | |
2660 data_subset_a <- subset(data_groups , used_to_find_middle == FALSE) | |
2661 data_subset_b <- subset(data_subset_a, PMD_FDR_peptide_length > 11) | |
2662 | |
2663 densities_a <- get_densities(data_subset = data_subset_a, var_value = "value") | |
2664 densities_b <- get_densities(data_subset = data_subset_b, var_value = "value_norm") | |
2665 | |
2666 xlim=get_xlim(densities_a, densities_b) | |
2667 ylim=get_ylim(densities_a, densities_b) | |
2668 | |
2669 dataset_name <- data_processor()$info$collection_name() | |
2670 if (show_norm){ | |
2671 plot_distributions(densities=densities_b, var_value = "value_norm", xlab_modifier = " - normalized", xlim=xlim, ylim=ylim) | |
2672 } else { | |
2673 plot_distributions(densities=densities_a, var_value = "value" , xlab_modifier = "" , xlim=xlim, ylim=ylim) | |
2674 } | |
2675 } | |
2676 ) | |
2677 ############################################################################### | |
2678 # Class: Plot_Dataset_Description | |
2679 ############################################################################### | |
2680 Plot_Dataset_Description = setRefClass("Plot_Dataset_Description", | |
2681 contains = "Plot_Multiple_Images", | |
2682 fields = list(ylim_time_invariance = "numeric")) | |
2683 Plot_Dataset_Description$methods( | |
2684 initialize = function(p_data_processors = NULL, | |
2685 p_include_text=TRUE, | |
2686 p_include_main=FALSE, | |
2687 p_ylim_time_invariance = c(-4,4), ...){ | |
2688 plot_object_r1_c1 <- Plot_Time_Invariance_Alt$new(p_data_processors=p_data_processors, | |
2689 p_include_text=p_include_text, | |
2690 p_include_main=p_include_main, | |
2691 p_training_class = "good_testing", | |
2692 p_field_of_interest = "value", | |
2693 p_ylim = p_ylim_time_invariance) | |
2694 | |
2695 plot_object_r1_c2 <- Plot_Time_Invariance_Alt$new(p_data_processors=p_data_processors, | |
2696 p_include_text=p_include_text, | |
2697 p_include_main=p_include_main, | |
2698 p_training_class = "good_testing", | |
2699 p_field_of_interest = "value_norm", | |
2700 p_ylim = p_ylim_time_invariance) | |
2701 plot_object_r2_c1 <- Plot_Density_PMD_by_Score$new(p_data_processors=p_data_processors, | |
2702 p_show_norm=FALSE, | |
2703 p_include_text=p_include_text, | |
2704 p_include_main=p_include_main) | |
2705 | |
2706 plot_object_r2_c2 <- Plot_Density_PMD_and_Norm_Decoy_by_AA_Length$new(p_data_processors=p_data_processors, | |
2707 p_show_norm=FALSE, | |
2708 p_include_text=p_include_text, | |
2709 p_include_main=p_include_main) | |
2710 | |
2711 plot_object_r3_c1 <- Plot_Density_PMD_by_Score$new(p_data_processors=p_data_processors, | |
2712 p_show_norm=TRUE, | |
2713 p_include_text=p_include_text, | |
2714 p_include_main=p_include_main) | |
2715 plot_object_r3_c2 <- Plot_Density_PMD_and_Norm_Decoy_by_AA_Length$new(p_data_processors=p_data_processors, | |
2716 p_show_norm=TRUE, | |
2717 p_include_text=p_include_text, | |
2718 p_include_main=p_include_main) | |
2719 callSuper(p_n_images_wide=2, | |
2720 p_n_images_tall=3, | |
2721 p_include_text=p_include_text, | |
2722 p_include_main=p_include_main, | |
2723 p_image_list = list(plot_object_r1_c1, plot_object_r1_c2, | |
2724 plot_object_r2_c1, plot_object_r2_c2, | |
2725 plot_object_r3_c1, plot_object_r3_c2), ...) | |
2726 | |
2727 } | |
2728 ) | |
2729 ############################################################################### | |
2730 # Class: Plots_for_Paper | |
2731 ############################################################################### | |
2732 Plots_for_Paper <- setRefClass("Plots_for_Paper", fields =list(data_processor_a = "Data_Processor", | |
2733 data_processor_b = "Data_Processor", | |
2734 data_processor_c = "Data_Processor", | |
2735 data_processor_d = "Data_Processor", | |
2736 include_text = "logical", | |
2737 include_main = "logical", | |
2738 mai = "numeric")) | |
2739 Plots_for_Paper$methods( | |
2740 initialize = function(){ | |
2741 data_processor_a <<- Data_Processor$new(p_info = Data_Object_Info_737_two_step$new()) | |
2742 data_processor_b <<- Data_Processor$new(p_info = Data_Object_Info_737_combined$new()) | |
2743 data_processor_c <<- Data_Processor$new(p_info = Data_Object_Pyrococcus_tr $new()) | |
2744 data_processor_d <<- Data_Processor$new(p_info = Data_Object_Mouse_Mutations $new()) | |
2745 }, | |
2746 create_plots_for_paper = function(include_main=TRUE, finalize=TRUE){ | |
2747 print_table_4_data() | |
2748 print_figure_2_data() | |
2749 plot_figure_D(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main) | |
2750 plot_figure_C(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main) | |
2751 plot_figure_B(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main) | |
2752 plot_figure_A(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main) | |
2753 plot_figure_8(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main) | |
2754 plot_figure_7(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main) | |
2755 plot_figure_6(p_scale=ifelse(finalize, 4, 1), p_include_main = include_main) | |
2756 plot_figure_5(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main) | |
2757 plot_figure_4(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main) | |
2758 plot_figure_3(p_scale=ifelse(finalize, 4, 1), p_include_main = include_main) | |
2759 }, | |
2760 print_figure_2_data = function(){ | |
2761 print(create_stats_for_grouping_figure(list(data_processor_a))) | |
2762 }, | |
2763 print_table_4_data = function(){ | |
2764 report_ranges_of_comparisons(processors = list(data_processor_a)) | |
2765 report_ranges_of_comparisons(processors = list(data_processor_c)) | |
2766 }, | |
2767 plot_figure_3 = function(p_scale=NULL, p_include_main=NULL){ | |
2768 plot_object <- Plot_Compare_PMD_and_Norm_Density$new(p_data_processor = list(data_processor_a), | |
2769 p_show_norm = FALSE, | |
2770 p_include_text = TRUE, | |
2771 p_include_main = p_include_main, | |
2772 p_display_n_psms = FALSE) | |
2773 plot_object$plot_image_in_small_window(p_scale=p_scale) | |
2774 }, | |
2775 plot_figure_4 = function(p_scale=NULL, p_include_main=NULL){ | |
2776 plot_object <- Plot_Time_Invariance_Alt_Before_and_After$new(p_data_processors = list(data_processor_a), | |
2777 p_include_text=TRUE, | |
2778 p_include_main=p_include_main, | |
2779 p_ylim = c(-4,4)) | |
2780 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale) | |
2781 | |
2782 }, | |
2783 plot_figure_5 = function(p_scale=NULL, p_include_main=NULL){ | |
2784 plot_object <- Plot_Density_PMD_and_Norm_Decoy_by_AA_Length$new(p_data_processors = list(data_processor_a), | |
2785 p_include_text=TRUE, | |
2786 p_include_main=p_include_main) | |
2787 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale) | |
2788 }, | |
2789 plot_figure_6 = function(p_scale=NULL, p_include_main=NULL){ | |
2790 plot_object <- Plot_Bad_CI$new(p_data_processors = list(data_processor_a), | |
2791 p_include_text=TRUE, | |
2792 p_include_main=p_include_main) | |
2793 plot_object$plot_image_in_small_window(p_scale=p_scale) | |
2794 }, | |
2795 plot_figure_7 = function(p_scale=NULL, p_include_main=NULL){ | |
2796 plot_object <- Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR$new(p_data_processors = list(data_processor_a), | |
2797 p_include_text=TRUE, | |
2798 p_include_main=p_include_main) | |
2799 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale) | |
2800 }, | |
2801 plot_figure_8 = function(p_scale=NULL, p_include_main=NULL){ | |
2802 plot_object <- Plot_Selective_Loss$new(p_data_processors = list(data_processor_c), | |
2803 p_include_text=TRUE, | |
2804 p_include_main=p_include_main) | |
2805 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale) | |
2806 }, | |
2807 plot_figure_A = function(p_scale=NULL, p_include_main=NULL){ | |
2808 plot_object <- Plot_Dataset_Description$new(p_data_processors=list(data_processor_a), | |
2809 p_include_text=TRUE, | |
2810 p_include_main=p_include_main, | |
2811 p_ylim_time_invariance=c(-4,4) ) | |
2812 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale) | |
2813 }, | |
2814 plot_figure_B = function(p_scale=NULL, p_include_main=NULL){ | |
2815 plot_object <- Plot_Dataset_Description$new(p_data_processors=list(data_processor_b), | |
2816 p_include_text=TRUE, | |
2817 p_include_main=p_include_main, | |
2818 p_ylim_time_invariance=c(-4,4) ) | |
2819 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale) | |
2820 }, | |
2821 plot_figure_C = function(p_scale=NULL, p_include_main=NULL){ | |
2822 plot_object <- Plot_Dataset_Description$new(p_data_processors=list(data_processor_c), | |
2823 p_include_text=TRUE, | |
2824 p_include_main=p_include_main, | |
2825 p_ylim_time_invariance=c(-4,4) ) | |
2826 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale) | |
2827 }, | |
2828 plot_figure_D = function(p_scale=NULL, p_include_main=NULL){ | |
2829 plot_object <- Plot_Dataset_Description$new(p_data_processors=list(data_processor_d), | |
2830 p_include_text=TRUE, | |
2831 p_include_main=p_include_main, | |
2832 p_ylim_time_invariance=c(-4,4) ) | |
2833 plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale) | |
2834 }, | |
2835 create_stats_for_grouping_figure = function(processors=NULL){ | |
2836 processor <- processors[[1]] | |
2837 processor$i_fdr$ensure() | |
2838 aug_i_fdr <- processor$i_fdr$df | |
2839 aug_i_fdr$group_good_bad_other <- gsub("_.*", "", aug_i_fdr$group_training_class) | |
2840 aug_i_fdr$group_null <- "all" | |
2841 table(aug_i_fdr$group_training_class) | |
2842 table(aug_i_fdr$group_good_bad_other) | |
2843 table(aug_i_fdr$group_null) | |
2844 | |
2845 create_agg_fdr_stats <- function(i_fdr=NULL, grouping_var_name = NULL){ | |
2846 formula_fdr <- as.formula(sprintf("%s~%s", "i_fdr", grouping_var_name)) | |
2847 formula_len <- as.formula(sprintf("%s~%s", "PMD_FDR_peptide_length", grouping_var_name)) | |
2848 agg_fdr <- aggregate(formula=formula_fdr, data=i_fdr, FUN=mean) | |
2849 agg_n <- aggregate(formula=formula_fdr, data=i_fdr, FUN=length) | |
2850 agg_len <- aggregate(formula=formula_len, data=i_fdr, FUN=mean) | |
2851 agg_fdr <- rename_columns(df = agg_fdr, | |
2852 names_before = c(grouping_var_name, "i_fdr"), | |
2853 names_after = c("group" , "fdr")) | |
2854 agg_n <- rename_columns(df = agg_n, | |
2855 names_before = c(grouping_var_name, "i_fdr"), | |
2856 names_after = c("group" , "n")) | |
2857 agg_len <- rename_columns(df = agg_len, | |
2858 names_before = c(grouping_var_name), | |
2859 names_after = c("group" )) | |
2860 agg <- merge(agg_fdr, agg_n) | |
2861 agg <- merge(agg , agg_len) | |
2862 | |
2863 return(agg) | |
2864 } | |
2865 | |
2866 agg_detail <- create_agg_fdr_stats(i_fdr = aug_i_fdr, grouping_var_name = "group_training_class") | |
2867 agg_grouped <- create_agg_fdr_stats(i_fdr = aug_i_fdr, grouping_var_name = "group_good_bad_other") | |
2868 agg_all <- create_agg_fdr_stats(i_fdr = aug_i_fdr, grouping_var_name = "group_null") | |
2869 | |
2870 agg <- rbind(agg_detail, agg_grouped) | |
2871 agg <- rbind(agg, agg_all) | |
2872 | |
2873 agg$fdr <- ifelse(agg$fdr < 1, agg$fdr, 1) | |
2874 | |
2875 linear_combo <- function(x=NULL, a0=NULL, a1=NULL){ | |
2876 result <- (a0 * (1-x) + a1 * x) | |
2877 return(result) | |
2878 } | |
2879 | |
2880 agg$r <- linear_combo(agg$fdr, a0=197, a1= 47) | |
2881 agg$g <- linear_combo(agg$fdr, a0= 90, a1= 85) | |
2882 agg$b <- linear_combo(agg$fdr, a0= 17, a1=151) | |
2883 | |
2884 return(agg) | |
2885 }, | |
2886 report_ranges_of_comparisons = function(processors=NULL){ | |
2887 report_comparison_of_Confidence_and_PMD = function (i_fdr = NULL, min_conf=NULL, max_conf=NULL, include_max=FALSE){ | |
2888 report_PMD_confidence_comparison_from_subset = function(data_subset=NULL, group_name=NULL){ | |
2889 print(group_name) | |
2890 print(sprintf(" Number of PSMs: %d", nrow(data_subset))) | |
2891 mean_confidence <- mean(data_subset$PMD_FDR_input_score) | |
2892 print(sprintf(" Mean Confidence Score: %3.1f", mean_confidence)) | |
2893 print(sprintf(" PeptideShaker g-FDR: %3.1f", 100-mean_confidence)) | |
2894 mean_PMD_FDR = mean(data_subset$i_fdr) | |
2895 print(sprintf(" PMD g-FDR: %3.1f", 100*mean_PMD_FDR)) | |
2896 #col <- col2hex("black", 0.2) | |
2897 #plot(data_subset$i_fdr, pch=".", cex=2, col=col) | |
2898 #abline(h=0) | |
2899 } | |
2900 | |
2901 if (is.null(max_conf)) { | |
2902 data_subset <- subset(i_fdr, PMD_FDR_input_score == min_conf) | |
2903 group_name <- sprintf("Group %d", min_conf) | |
2904 } else if (include_max){ | |
2905 data_subset <- subset(i_fdr, (PMD_FDR_input_score >= min_conf) & (PMD_FDR_input_score <= max_conf)) | |
2906 group_name <- sprintf("Group %d through %d", min_conf, max_conf) | |
2907 } else { | |
2908 data_subset <- subset(i_fdr, (PMD_FDR_input_score >= min_conf) & (PMD_FDR_input_score < max_conf)) | |
2909 group_name <- sprintf("Group %d to %d", min_conf, max_conf) | |
2910 } | |
2911 | |
2912 report_PMD_confidence_comparison_from_subset(data_subset=data_subset, group_name=group_name) | |
2913 } | |
2914 | |
2915 processor <- processors[[1]] | |
2916 processor$i_fdr$ensure() | |
2917 i_fdr <- processor$i_fdr$df | |
2918 info <- processor$info | |
2919 print(sprintf("PMD and Confidence comparison for -- %s", info$collection_name())) | |
2920 report_comparison_of_Confidence_and_PMD(i_fdr = i_fdr, min_conf=100, max_conf=NULL, include_max=TRUE) | |
2921 report_comparison_of_Confidence_and_PMD(i_fdr = i_fdr, min_conf= 99, max_conf=100 , include_max=FALSE) | |
2922 report_comparison_of_Confidence_and_PMD(i_fdr = i_fdr, min_conf= 90, max_conf= 99 , include_max=FALSE) | |
2923 report_comparison_of_Confidence_and_PMD(i_fdr = i_fdr, min_conf= 0, max_conf=100 , include_max=TRUE) | |
2924 } | |
2925 ) | |
2926 ############################################################################### | |
2927 # C - 021 - PMD-FDR Wrapper - functions.R # | |
2928 # # | |
2929 # Creates the necessary structure to convert the PMD-FDR code into one that # | |
2930 # can run as a batch file # | |
2931 # # | |
2932 ############################################################################### | |
2933 ############################################################################### | |
2934 # Class: ModuleArgParser_PMD_FDR | |
2935 ############################################################################### | |
2936 ModuleArgParser_PMD_FDR <- setRefClass("ModuleArgParser_PMD_FDR", | |
2937 contains = c("ArgParser"), | |
2938 fields =list(args = "character") ) | |
2939 ModuleArgParser_PMD_FDR$methods( | |
2940 initialize = function(description = "Computes individual and global FDR using Precursor Mass Discrepancy (PMD-FDR)", ...){ | |
2941 callSuper(description=description, ...) | |
2942 local_add_argument("--psm_report" , help="full name and path to the PSM report") | |
2943 local_add_argument("--psm_report_1_percent", default = "" , help="full name and path to the PSM report for 1% FDR") | |
2944 local_add_argument("--output_i_fdr" , default = "" , help="full name and path to the i-FDR output file ") | |
2945 local_add_argument("--output_g_fdr" , default = "" , help="full name and path to the g-FDR output file ") | |
2946 local_add_argument("--output_densities" , default = "" , help="full name and path to the densities output file ") | |
2947 #local_add_argument("--score_field_name" , default = "" , help="name of score field (in R format)") | |
2948 local_add_argument("--input_file_type" , default = "PMD_FDR_input_file", help="type of input file (currently supports: PSM_Report)") | |
2949 } | |
2950 ) | |
2951 ############################################################################### | |
2952 # Class: Data_Object_Parser | |
2953 ############################################################################### | |
2954 Data_Object_Parser <- setRefClass("Data_Object_Parser", | |
2955 contains = c("Data_Object"), | |
2956 fields =list(parser = "ModuleArgParser_PMD_FDR", | |
2957 args = "character", | |
2958 parsing_results = "list") ) | |
2959 Data_Object_Parser$methods( | |
2960 initialize = function(){ | |
2961 callSuper() | |
2962 class_name <<- "Data_Object_Parser" | |
2963 }, | |
2964 verify = function(){ | |
2965 # Nothing to do here - parser handles verification during load | |
2966 }, | |
2967 m_load_data = function(){ | |
2968 if (length(args) == 0){ | |
2969 parsing_results <<- parser$parse_arguments(NULL) | |
2970 } else { | |
2971 parsing_results <<- parser$parse_arguments(args) | |
2972 } | |
2973 | |
2974 }, | |
2975 set_args = function(p_args=NULL){ | |
2976 # This is primarily used for testing. In operation arguments will be passed automatically (through use of commandArgs) | |
2977 args <<- p_args | |
2978 set_dirty(TRUE) | |
2979 } | |
2980 ) | |
2981 ############################################################################### | |
2982 # Class: Data_Object_Info_Parser | |
2983 ############################################################################### | |
2984 Data_Object_Info_Parser <- setRefClass("Data_Object_Info_Parser", | |
2985 contains = c("Data_Object_Info"), | |
2986 fields =list( | |
2987 output_i_fdr = "character", | |
2988 output_g_fdr = "character", | |
2989 output_densities = "character" | |
2990 ) ) | |
2991 Data_Object_Info_Parser$methods( | |
2992 initialize = function(){ | |
2993 callSuper() | |
2994 class_name <<- "Data_Object_Info_Parser" | |
2995 }, | |
2996 verify = function(){ | |
2997 check_field_exists = function(field_name=NULL, check_empty = TRUE){ | |
2998 field_value <- get_parser()$parsing_results[field_name] | |
2999 checkTrue(! is.null(field_value), | |
3000 msg = sprintf("Parameter %s was not passed to PMD_FDR", field_value)) | |
3001 if (check_empty){ | |
3002 checkTrue(! is.null(field_value), | |
3003 msg = sprintf("Parameter %s was not passed to PMD_FDR", field_value)) | |
3004 } | |
3005 } | |
3006 # Check parameters passed in | |
3007 check_field_exists("junk") | |
3008 check_field_exists("psm_report") | |
3009 check_field_exists("psm_report_1_percent", check_empty = FALSE) | |
3010 check_field_exists("output_i_fdr" , check_empty = FALSE) | |
3011 check_field_exists("output_g_fdr" , check_empty = FALSE) | |
3012 check_field_exists("output_densities" , check_empty = FALSE) | |
3013 #check_field_exists("score_field_name") | |
3014 check_field_exists("input_file_type") | |
3015 }, | |
3016 m_load_data = function(){ | |
3017 parsing_results <- get_parser()$parsing_results | |
3018 | |
3019 data_file_name <<- as.character(parsing_results["psm_report"]) | |
3020 data_file_name_1_percent_FDR <<- as.character(parsing_results["psm_report_1_percent"]) | |
3021 data_path_name <<- as.character(parsing_results[""]) | |
3022 #experiment_name <<- data_file_name | |
3023 #designation <<- "" | |
3024 output_i_fdr <<- as.character(parsing_results["output_i_fdr"]) | |
3025 output_g_fdr <<- as.character(parsing_results["output_g_fdr"]) | |
3026 output_densities <<- as.character(parsing_results["output_densities"]) | |
3027 | |
3028 input_file_type <<- as.character(parsing_results["input_file_type"]) | |
3029 #score_field_name <<- as.character(parsing_results["score_field_name"]) | |
3030 }, | |
3031 set_parser = function(parser){ | |
3032 parents[["parser"]] <<- parser | |
3033 }, | |
3034 get_parser = function(){ | |
3035 return(verified_element_of_list(parents, "parser", "Data_Object_Info_Parser$parents")) | |
3036 }, | |
3037 file_path = function(){ | |
3038 result <- data_file_name # Now assumes that full path is provided | |
3039 if (length(result) == 0){ | |
3040 stop("Unable to validate file path - file name is missing") | |
3041 } | |
3042 return(result) | |
3043 }, | |
3044 file_path_1_percent_FDR = function(){ | |
3045 local_file_name <- get_data_file_name_1_percent_FDR() | |
3046 if (length(local_file_name) == 0){ | |
3047 result <- "" | |
3048 } else { | |
3049 result <- local_file_name # path name is no longer relevant | |
3050 } | |
3051 | |
3052 # Continue even if file name is missing - not all analyses have a 1 percent FDR file; this is managed downstream | |
3053 | |
3054 # if (length(result) == 0){ | |
3055 # stop("Unable to validate file path - one or both of path name and file name (of 1 percent FDR file) are missing") | |
3056 # } | |
3057 return(result) | |
3058 }, | |
3059 get_data_file_name_1_percent_FDR = function(){ | |
3060 return(data_file_name_1_percent_FDR) | |
3061 }, | |
3062 collection_name = function(){ | |
3063 result <- "" | |
3064 return(result) | |
3065 } | |
3066 | |
3067 ) | |
3068 ############################################################################### | |
3069 # Class: Processor_PMD_FDR_for_Galaxy | |
3070 # Purpose: Wrapper on tools from Project 019 to enable a Galaxy-based interface | |
3071 ############################################################################### | |
3072 Processor_PMD_FDR_for_Galaxy <- setRefClass("Processor_PMD_FDR_for_Galaxy", | |
3073 fields = list( | |
3074 parser = "Data_Object_Parser", | |
3075 info = "Data_Object_Info_Parser", | |
3076 raw_data = "Data_Object_Raw_Data", | |
3077 raw_1_percent = "Data_Object_Raw_1_Percent", | |
3078 data_converter = "Data_Object_Data_Converter", | |
3079 data_groups = "Data_Object_Groupings", | |
3080 densities = "Data_Object_Densities", | |
3081 alpha = "Data_Object_Alpha", | |
3082 i_fdr = "Data_Object_Individual_FDR" | |
3083 )) | |
3084 Processor_PMD_FDR_for_Galaxy$methods( | |
3085 initialize = function(){ | |
3086 # This initialization defines all of the dependencies between the various components | |
3087 # (Unfortunately, inheriting from Data_Processor leads to issues - I had to reimplement it here with a change to "info") | |
3088 | |
3089 # info | |
3090 info$set_parser(parser) | |
3091 parser$append_child(info) | |
3092 | |
3093 # raw_data | |
3094 raw_data$set_info(info) | |
3095 info$append_child(raw_data) | |
3096 | |
3097 # raw_1_percent | |
3098 raw_1_percent$set_info(info) | |
3099 info$append_child(raw_1_percent) | |
3100 | |
3101 # data_converter | |
3102 data_converter$set_info (info) | |
3103 data_converter$set_raw_data(raw_data) | |
3104 info $append_child (data_converter) | |
3105 raw_data $append_child (data_converter) | |
3106 | |
3107 # data_groups | |
3108 data_groups$set_info (info) | |
3109 data_groups$set_data_converter(data_converter) | |
3110 data_groups$set_raw_1_percent (raw_1_percent) | |
3111 info $append_child (data_groups) | |
3112 data_converter$append_child (data_groups) | |
3113 raw_1_percent $append_child (data_groups) | |
3114 | |
3115 # densities | |
3116 densities $set_data_groups(data_groups) | |
3117 data_groups$append_child (densities) | |
3118 | |
3119 # alpha | |
3120 alpha $set_densities(densities) | |
3121 densities$append_child (alpha) | |
3122 | |
3123 # i_fdr | |
3124 i_fdr$set_data_groups(data_groups) | |
3125 i_fdr$set_densities (densities) | |
3126 i_fdr$set_alpha (alpha) | |
3127 data_groups $append_child(i_fdr) | |
3128 densities $append_child(i_fdr) | |
3129 alpha $append_child(i_fdr) | |
3130 | |
3131 }, | |
3132 compute = function(){ | |
3133 #i_fdr is currently the lowest level object - it ultimately depends on everything else. | |
3134 i_fdr$ensure() # All pieces on which i_fdr depends are automatically verified and computed (through their verify() and ensure()) | |
3135 | |
3136 save_standard_df(x = densities$df, file_path = info$output_densities) | |
3137 save_standard_df(x = alpha$df, file_path = info$output_g_fdr) | |
3138 save_standard_df(x = i_fdr$df, file_path = info$output_i_fdr) | |
3139 } | |
3140 ) | |
3141 ############################################################################### | |
3142 # D - 021 - PMD-FDR Main.R # | |
3143 # # | |
3144 # File Description: Contains the base code that interprets the parameters # | |
3145 # and computes i-FDR and g-FDR for a mass spec project # | |
3146 # # | |
3147 ############################################################################### | |
3148 argv <- commandArgs(TRUE) # Saves the parameters (command code) | |
3149 | |
3150 processor <- Processor_PMD_FDR_for_Galaxy$new() | |
3151 processor$parser$set_args(argv) | |
3152 processor$compute() | |
3153 |