Mercurial > repos > galaxyp > pmd_fdr
view PMD_FDR_package_for_Galaxy.R @ 0:5cc0c32d05a2 draft
"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/pmd_fdr commit 00f85eca73cd8afedfefbeec94a4462455ac1a9a"
author | galaxyp |
---|---|
date | Mon, 07 Oct 2019 11:59:37 -0400 |
parents | |
children | 460edeedeb7d |
line wrap: on
line source
############################################################################### # PMD_FDR_package_for_Galaxy.R # # # # Project 021 - PMD-FDR for Galaxy-P # # # # Description: Computes iFDR and gFDR on PSMs as a script designed for Galaxy # # Note that plotting code has been left in that is not used # # in this file; this is the code I used to create figures for # # publication. I left it in for potential development of views. # # # # This file was created by concatenating the following files: # # # # A - 005 - Parser - ArgParser.R # # B - 019 - PMD-FDR - functions.R # # C - 021 - PMD-FDR Wrapper - functions.R # # D - 021 - PMD-FDR Main.R # # # # Required packages: argparser # # stringr # # RUnit # # # # Release date: 2019-10-05 # # Version: 1.4 # # # ############################################################################### # Package currently supports the following parameters: # # --psm_report full name and path to the PSM report # --psm_report_1_percent full name and path to the PSM report for 1% FDR # --output_i_fdr full name and path to the i-FDR output file # --output_g_fdr full name and path to the g-FDR output file # --output_densities full name and path to the densities output file # ############################################################################### # A - 005 - Parser - ArgParser.R # # # # Description: Wrapper for argparser package, using RefClass # # # ############################################################################### #install.packages("argparser") library(argparser) # Class definition ArgParser <- setRefClass("ArgParser", fields = c("parser")) ArgParser$methods( initialize = function(...){ parser <<- arg_parser(...) }, local_add_argument = function(...){ parser <<- add_argument(parser, ...) }, parse_arguments = function(...){ result = parse_args(parser, ...) return(result) } ) ############################################################################### # B - 019 - PMD-FDR - functions.R # # # # Primary work-horse for PMD-FDR # # # ############################################################################### ############################################################################### ####### Load libraries etc. ############################################################################### library(stringr) library(RUnit) ############################################################# ####### Global values (should be parameters to module but aren't yet) ############################################################# MIN_GOOD_PEPTIDE_LENGTH <- 11 MIN_ACCEPTABLE_POINTS_IN_DENSITY <- 10 ############################################################# ####### General purpose functions ############################################################# # Creates a more useful error report when file is not reasonable safe_file_exists <- function(file_path){ # Still not particularly useful in cases where it is a valid directory tryCatch( return(file_test(op = "-f", x=file_path)), error=function(e) {simpleError(sprintf("file path is not valid: '%s'", file_path))} ) } # My standard way of loading data into data.frames load_standard_df <- function(file_path=NULL){ clean_field_names = function(field_names){ result <- field_names idx_blank <- which(result == "") result[idx_blank] <- sprintf("<Field %d>", idx_blank) return(result) } if (safe_file_exists(file_path)){ field_names <- read_field_names(file_path, sep = "\t") field_names <- clean_field_names(field_names) if (length(field_names) == 0){ return(data.frame()) } data <- read.table(file = file_path, header = TRUE, sep = "\t", stringsAsFactors = FALSE, blank.lines.skip = TRUE)#, check.names = FALSE) colnames(data) = field_names } else { stop(sprintf("File path does not exist: '%s'", file_path)) } return(data) } save_standard_df <- function(x=NULL, file_path=NULL){ if (file_path != ""){ write.table(x = x, file = file_path, quote = FALSE, sep = "\t", row.names = FALSE, col.names = TRUE) } } rename_column <- function(df=NULL, name_before=NULL, name_after=NULL, suppressWarnings=FALSE){ if (is.null(df)){ stop("Dataframe (df) does not exist - unable to rename column") } if (name_before %in% colnames(df)){ df[,name_after] <- df[,name_before] df[,name_before] <- NULL } else if (!suppressWarnings){ warning(sprintf("'%s' is not a field in the data frame and so has not been renamed", name_before)) } return(df) } rename_columns <- function(df=NULL, names_before=NULL, names_after=NULL){ for (i in safe_iterator(length(names_before))){ df <- rename_column(df, names_before[i], names_after[i]) } return(df) } round_to_tolerance <- function(x=NULL, tolerance=NULL, ...){ return(function_to_tolerance(x=x, tolerance=tolerance, FUN=round, ...)) } function_to_tolerance <- function(x=NULL, tolerance=NULL, FUN=NULL, ...){ return(FUN(x/tolerance, ...) * tolerance) } safe_median <- function(x) median(x, na.rm=TRUE) normalize_density <- function(d){ # Normalizes y-values in density function # so that the integral under the curve is 1 # (uses rectangles to approximate area) delta_x <- diff(range(d$x)) / length(d$x) unnormalized_integral <- delta_x * sum(d$y) new_d <- d new_d$y <- with(new_d, y ) return(new_d) } if_null <- function(cond=NULL, null_result=NULL, not_null_result=NULL){ return(switch(1+is.null(cond), not_null_result, null_result)) } rainbow_with_fixed_intensity <- function(n=NULL, goal_intensity_0_1=NULL, alpha=NULL){ goal_intensity <- 255*goal_intensity_0_1 hex_colors <- rainbow(n) rgb_colors <- col2rgb(hex_colors) df_colors <- data.frame(t(rgb_colors)) df_colors$intensity <- with(df_colors, 0.2989*red + 0.5870*green + 0.1140*blue) df_colors$white_black <- with(df_colors, ifelse(intensity < goal_intensity, 255, 0)) df_colors$mix_level <- with(df_colors, (white_black - goal_intensity) / (white_black - intensity ) ) df_colors$new_red <- with(df_colors, mix_level*red + (1-mix_level)*white_black) df_colors$new_green <- with(df_colors, mix_level*green + (1-mix_level)*white_black) df_colors$new_blue <- with(df_colors, mix_level*blue + (1-mix_level)*white_black) names_pref_new <- c("new_red", "new_green", "new_blue") names_no_pref <- c("red", "green", "blue") df_colors <- df_colors[,names_pref_new] df_colors <- rename_columns(df_colors, names_before = names_pref_new, names_after = names_no_pref) rgb_colors <-as.matrix(df_colors/255 ) return(rgb(rgb_colors, alpha=alpha)) } safe_iterator <- function(n_steps = NULL){ if (n_steps < 1){ result = numeric(0) } else { result = 1:n_steps } return(result) } col2hex <- function(cols=NULL, col_alpha=255){ if (all(col_alpha<=1)){ col_alpha <- round(col_alpha*255) } col_matrix <- t(col2rgb(cols)) results <- rgb(col_matrix, alpha=col_alpha, maxColorValue = 255) return(results) } credible_interval <- function(x=NULL, N=NULL, precision=0.001, alpha=0.05){ # Approximates "highest posterior density interval" # Uses exact binomial but with a finite list of potential values (1/precision) p <- seq(from=0, to=1, by=precision) d <- dbinom(x = x, size = N, prob = p) d <- d / sum(d) df <- data.frame(p=p, d=d) df <- df[order(-df$d),] df$cumsum <- cumsum(df$d) max_idx <- sum(df$cumsum < (1-alpha)) + 1 max_idx <- min(max_idx, nrow(df)) lower <- min(df$p[1:max_idx]) upper <- max(df$p[1:max_idx]) return(c(lower,upper)) } verified_element_of_list <- function(parent_list=NULL, element_name=NULL, object_name=NULL){ if (is.null(parent_list[[element_name]])){ if (is.null(object_name)){ object_name = "the list" } stop(sprintf("Element '%s' does not yet exist in %s", element_name, object_name)) } return(parent_list[[element_name]]) } read_field_names = function(file_path=NULL, sep = "\t"){ con = file(file_path,"r") fields = readLines(con, n=1) close(con) if (length(fields) == 0){ return(c()) } fields = strsplit(x = fields, split = sep)[[1]] return(fields) } check_field_name = function(input_df = NULL, name_of_input_df=NULL, field_name=NULL){ test_succeeded <- field_name %in% colnames(input_df) current_columns <- paste0(colnames(input_df), collapse=", ") checkTrue(test_succeeded, msg = sprintf("Expected fieldname '%s' in %s (but did not find it among %s)", field_name, name_of_input_df, current_columns)) } ############################################################# ####### Classes for Data ############################################################# ############################################################################### # Class: Data_Object ############################################################################### Data_Object <- setRefClass("Data_Object", fields =list(m_is_dirty = "logical", parents = "list", children = "list", class_name = "character")) Data_Object$methods( initialize = function(){ m_is_dirty <<- TRUE class_name <<- "Data_Object <abstract class - class_name needs to be set in subclass>" }, load_data = function(){ #print(sprintf("Calling %s$load_data()", class_name)) # Useful for debugging ensure_parents() verify() m_load_data() set_dirty(new_value = FALSE) }, ensure = function(){ if (m_is_dirty){ load_data() } }, set_dirty = function(new_value){ if (new_value != m_is_dirty){ m_is_dirty <<- new_value set_children_dirty() } }, verify = function(){ stop(sprintf("verify() is an abstract method - define it in %s before calling load_data()", class_name)) }, m_load_data = function(){ stop(sprintf("m_load_data() is an abstract method - define it in %s before calling load_data()", class_name)) }, append_parent = function(parent=NULL){ parents <<- append(parents, parent) }, append_child = function(child=NULL){ children <<- append(children, child) }, ensure_parents = function(){ for (parent in parents){ # print(sprintf("Calling %s$ensure()", parent$class_name)) # Useful for debugging parent$ensure() } }, set_children_dirty = function(){ for (child in children){ child$set_dirty(TRUE) } } ) ############################################################################### # Class: Data_Object_Info ############################################################################### Data_Object_Info <- setRefClass("Data_Object_Info", contains = "Data_Object", fields =list( data_file_name_1_percent_FDR = "character", data_file_name = "character", data_path_name = "character", experiment_name = "character", designation = "character", input_file_type = "character" #score_field_name = "character" #collection_name="character", #dir_results="character", #dir_dataset="character", #dataset_designation="character", #file_name_dataset="character", #file_name_dataset_1_percent="character", #experiment_name="character" ) ) Data_Object_Info$methods( initialize = function(){ callSuper() class_name <<- "Data_Object_Info - <Abstract class - class_name needs to be set in subclass>" }, verify = function(){ checkFieldExists = function(field_name=NULL){ field_value <- .self[[field_name]] checkTrue(length(field_value) > 0, sprintf("Field %s$%s has not been set (and should have been)", class_name, field_name)) checkTrue(length(field_value) == 1, sprintf("Field %s$%s has been set to multiple values (and should be a single value)", class_name, field_name)) checkTrue(field_value != "", sprintf("Field %s$%s has been set to an empty string (and should not have been)", class_name, field_name)) } checkFieldExists("data_file_name") checkFieldExists("data_path_name") checkFieldExists("experiment_name") checkFieldExists("designation") checkFieldExists("input_file_type") #checkFieldExists("score_field_name") }, m_load_data = function(){ # Nothing to do - this is really a data class }, file_path = function(){ result <- file.path(data_path_name, data_file_name) if (length(result) == 0){ stop("Unable to validate file path - one or both of path name and file name are missing") } return(result) }, file_path_1_percent_FDR = function(){ local_file_name <- get_data_file_name_1_percent_FDR() if (length(local_file_name) == 0){ result <- "" } else { result <- file.path(data_path_name, local_file_name) } # Continue even if file name is missing - not all analyses have a 1 percent FDR file; this is managed downstream # if (length(result) == 0){ # stop("Unable to validate file path - one or both of path name and file name (of 1 percent FDR file) are missing") # } return(result) }, get_data_file_name_1_percent_FDR = function(){ return(data_file_name_1_percent_FDR) }, collection_name = function(){ result <- sprintf("%s_%s", experiment_name, designation) return(result) } ) ############################################################################### # Class: Data_Object_Info_737_two_step ############################################################################### Data_Object_Info_737_two_step <- setRefClass("Data_Object_Info_737_two_step", contains = "Data_Object_Info", fields =list()) Data_Object_Info_737_two_step$methods( initialize = function(){ callSuper() class_name <<- "Data_Object_Info_737_two_step" #score_field_name <<- "Confidence [%]" data_file_name_1_percent_FDR <<- "737_NS_Peptide_Shaker_PSM_Report_Multi_Stage_Two_Step.tabular" data_file_name <<- "737_NS_Peptide_Shaker_Extended_PSM_Report_Multi_Stage_Two_Step.tabular.tabular" data_path_name <<- file.path(".", "Data") experiment_name <<- "Oral_737_NS" designation <<- "two_step" input_file_type <<- "PSM_Report" #data_collection_oral_737_NS_combined$file_name_dataset_1_percent = "737_NS_Peptide_Shaker_PSM_Report_CombinedDB.tabular" #data_collection_oral_737_NS_two_step$file_name_dataset_1_percent = "737_NS_Peptide_Shaker_PSM_Report_Multi_Stage_Two_Step.tabular" } ) ############################################################################### # Class: Data_Object_Info_737_combined ############################################################################### Data_Object_Info_737_combined <- setRefClass("Data_Object_Info_737_combined", contains = "Data_Object_Info", fields =list()) Data_Object_Info_737_combined$methods( initialize = function(){ callSuper() class_name <<- "Data_Object_Info_737_combined" #score_field_name <<- "Confidence [%]" data_file_name_1_percent_FDR <<- "737_NS_Peptide_Shaker_PSM_Report_CombinedDB.tabular" data_file_name <<- "737_NS_Peptide_Shaker_Extended_PSM_Report_CombinedDB.tabular" data_path_name <<- file.path(".", "Data") experiment_name <<- "Oral_737_NS" designation <<- "two_step" input_file_type <<- "PSM_Report" #data_collection_oral_737_NS_combined$file_name_dataset_1_percent = "737_NS_Peptide_Shaker_PSM_Report_CombinedDB.tabular" #data_collection_oral_737_NS_two_step$file_name_dataset_1_percent = "737_NS_Peptide_Shaker_PSM_Report_Multi_Stage_Two_Step.tabular" } ) ############################################################################### # Class: Data_Object_Pyrococcus_tr ############################################################################### Data_Object_Pyrococcus_tr <- setRefClass("Data_Object_Pyrococcus_tr", contains = "Data_Object_Info", fields =list()) Data_Object_Pyrococcus_tr$methods( initialize = function(){ callSuper() class_name <<- "Data_Object_Pyrococcus_tr" #score_field_name <<- "Confidence [%]" data_file_name_1_percent_FDR <<- "" data_file_name <<- "Pfu_traditional_Extended_PSM_Report.tabular" data_path_name <<- file.path(".", "Data") experiment_name <<- "Pyrococcus" designation <<- "tr" input_file_type <<- "PSM_Report" } ) ############################################################################### # Class: Data_Object_Mouse_Mutations ############################################################################### Data_Object_Mouse_Mutations <- setRefClass("Data_Object_Mouse_Mutations", contains = "Data_Object_Info", fields =list()) Data_Object_Mouse_Mutations$methods( initialize = function(){ callSuper() class_name <<- "Data_Object_Mouse_Mutations" #score_field_name <<- "Confidence [%]" data_file_name_1_percent_FDR <<- "" data_file_name <<- "Combined_DB_Mouse_5PTM.tabular" data_path_name <<- file.path(".", "Data") experiment_name <<- "Mouse Mutations" designation <<- "combined_05" input_file_type <<- "PSM_Report" } ) ############################################################################### # Class: Data_Object_Raw_Data ############################################################################### Data_Object_Raw_Data <- setRefClass("Data_Object_Raw_Data", contains = "Data_Object", fields =list(df = "data.frame")) Data_Object_Raw_Data$methods( initialize = function(){ callSuper() class_name <<- "Data_Object_Raw_Data" }, verify = function(){ # Check that file exists before using it file_path <- get_info()$file_path() if (! safe_file_exists(file_path)){ stop(sprintf("Raw data file does not exist (%s)", file_path)) } # BUGBUG: Needs to also check the following: # - file is tab-delimited # - first row is a list of column names }, set_info = function(info){ parents[["info"]] <<- info }, get_info = function(){ return(verified_element_of_list(parents, "info", "Data_Object_Raw_Data$parents")) }, m_load_data = function(){ info <- get_info() df <<- load_standard_df(info$file_path()) } ) ############################################################################### # Class: Data_Object_Raw_1_Percent ############################################################################### Data_Object_Raw_1_Percent <- setRefClass("Data_Object_Raw_1_Percent", contains = "Data_Object", fields =list(df = "data.frame")) Data_Object_Raw_1_Percent$methods( initialize = function(){ callSuper() class_name <<- "Data_Object_Raw_1_Percent" }, set_info = function(info){ parents[["info"]] <<- info }, verify = function(){ # Do nothing - a missing file name is acceptable for this module and is dealt with in load() }, get_info = function(){ return(verified_element_of_list(parents, "info", "Data_Object_Raw_1_Percent$parents")) }, m_load_data = function(){ info <- get_info() file_path <- info$file_path_1_percent_FDR() if (exists()){ df <<- load_standard_df(info$file_path_1_percent_FDR()) } # Note that failing to load is a valid state for this file, leading to not is_dirty. BUGBUG: this could lead to problems if a good file appears later }, exists = function(){ info <- get_info() local_file_name <- info$get_data_file_name_1_percent_FDR() # Check file name not file path if (length(local_file_name) == 0 ){ # variable not set result = FALSE } else if (local_file_name == ""){ # variable set to empty string result = FALSE } else { result = safe_file_exists(info$file_path_1_percent_FDR()) } return(result) } ) ############################################################################### # Class: Data_Converter ############################################################################### Data_Converter <- setRefClass("Data_Converter", fields =list(class_name = "character", file_type = "character" ) ) Data_Converter$methods( initialize = function(){ class_name <<- "Data_Converter <abstract class - class_name needs to be set in subclass>" file_type <<- "file_type has not been set before being used <needs to be set in initialize() of subclass>" }, check_raw_fields = function(info=NULL, raw_data=NULL){ stop(sprintf("check_raw_fields() is an abstract method - define it in %s before calling Data_Object_Data_Converter$load_data()", class_name)) }, convert_data = function(){ stop(sprintf("convert_data() is an abstract method - define it in %s before calling Data_Object_Data_Converter$load_data()", class_name)) } ) ############################################################################### # Class: Data_Converter_PMD_FDR_input_file ############################################################################### Data_Converter_PMD_FDR_input_file <- setRefClass("Data_Converter_PMD_FDR_input_file", contains = "Data_Converter", fields =list( ) ) Data_Converter_PMD_FDR_input_file$methods( initialize = function(){ callSuper() class_name <<- "Data_Converter_PMD_FDR_input_file" file_type <<- "PMD_FDR_file_type" }, check_raw_fields = function(info=NULL, raw_data=NULL){ data_original <- raw_data$df check_field_name(data_original, "raw_data", "PMD_FDR_input_score") check_field_name(data_original, "raw_data", "PMD_FDR_pmd") check_field_name(data_original, "raw_data", "PMD_FDR_spectrum_file") check_field_name(data_original, "raw_data", "PMD_FDR_proteins") check_field_name(data_original, "raw_data", "PMD_FDR_spectrum_title") check_field_name(data_original, "raw_data", "PMD_FDR_sequence") check_field_name(data_original, "raw_data", "PMD_FDR_decoy") }, convert_data = function(info=NULL, raw_data=NULL){ data_new <- raw_data$df return(data_new) # Pass through - everything should be in order } ) ############################################################################### # Class: Data_Converter_PSM_Report ############################################################################### Data_Converter_PSM_Report <- setRefClass("Data_Converter_PSM_Report", contains = "Data_Converter", fields =list( ) ) Data_Converter_PSM_Report$methods( initialize = function(){ callSuper() class_name <<- "Data_Converter_PSM_Report" file_type <<- "PSM_Report" }, check_raw_fields = function(info=NULL, raw_data=NULL){ data_original <- raw_data$df check_field_name(data_original, "raw_data", "Confidence [%]") check_field_name(data_original, "raw_data", "Precursor m/z Error [ppm]") check_field_name(data_original, "raw_data", "Spectrum File") check_field_name(data_original, "raw_data", "Protein(s)") check_field_name(data_original, "raw_data", "Spectrum Title") check_field_name(data_original, "raw_data", "Decoy") check_field_name(data_original, "raw_data", "Sequence") }, convert_data = function(info=NULL, raw_data=NULL){ data_new <- raw_data$df data_new$PMD_FDR_input_score <- data_new[, "Confidence [%]" ] data_new$PMD_FDR_pmd <- data_new[, "Precursor m/z Error [ppm]"] data_new$PMD_FDR_spectrum_file <- data_new[, "Spectrum File" ] data_new$PMD_FDR_proteins <- data_new[, "Protein(s)" ] data_new$PMD_FDR_spectrum_title <- data_new[, "Spectrum Title" ] data_new$PMD_FDR_sequence <- data_new[, "Sequence" ] data_new$PMD_FDR_decoy <- data_new[, "Decoy" ] return(data_new) } ) ############################################################################### # Class: Data_Converter_MaxQuant_Evidence ############################################################################### Data_Converter_MaxQuant_Evidence <- setRefClass("Data_Converter_MaxQuant_Evidence", contains = "Data_Converter", fields =list( ) ) Data_Converter_MaxQuant_Evidence$methods( initialize = function(){ callSuper() class_name <<- "Data_Converter_MaxQuant_Evidence" file_type <<- "MaxQuant_Evidence" }, check_raw_fields = function(info=NULL, raw_data=NULL){ data_original <- raw_data$df check_field_name(data_original, "raw_data", "PEP") check_field_name(data_original, "raw_data", "Mass error [ppm]") check_field_name(data_original, "raw_data", "Proteins") check_field_name(data_original, "raw_data", "Retention time") check_field_name(data_original, "raw_data", "Sequence") check_field_name(data_original, "raw_data", "Reverse") }, convert_data = function(info=NULL, raw_data=NULL){ data_new <- raw_data$df data_new$PMD_FDR_input_score <- 100 * (1 - data_new[, "PEP" ]) data_new$PMD_FDR_pmd <- data_new[, "Mass error [ppm]"] data_new$PMD_FDR_spectrum_file <- "<place_holder - assumes a single spectra file>" data_new$PMD_FDR_proteins <- data_new[, "Proteins" ] data_new$PMD_FDR_spectrum_title <- data_new[, "Retention time" ] # Used for ordering peptides - not important in MaxQuant since PMD has already been normalized effectively data_new$PMD_FDR_sequence <- data_new[, "Sequence" ] data_new$PMD_FDR_decoy <- ifelse( data_new[, "Reverse" ] == "+", 1, 0) return(data_new) } ) ############################################################################### # Class: Data_Object_Data_Converter ############################################################################### Data_Object_Data_Converter <- setRefClass("Data_Object_Data_Converter", contains = "Data_Object", fields =list(df = "data.frame", data_converter = "Data_Converter")) Data_Object_Data_Converter$methods( initialize = function(){ callSuper() class_name <<- "Data_Object_Data_Converter" }, currently_supported_file_types = function(){ return(c("PSM_Report", "PMD_FDR_input_file")) }, verify = function(){ info <- get_info() raw_data <- get_raw_data() file_type <- get_info()$input_file_type set_file_type(file_type) data_converter$check_raw_fields(info=info, raw_data=raw_data) }, m_load_data = function(){ info <- get_info() raw_data <- get_raw_data() file_type <- get_info()$input_file_type df <<- data_converter$convert_data(info=info, raw_data=raw_data) }, set_file_type = function(file_type = NULL){ if (file_type == "PSM_Report" ){ data_converter <<- Data_Converter_PSM_Report $new() } else if (file_type == "PMD_FDR_input_file"){ data_converter <<- Data_Converter_PMD_FDR_input_file$new() } else if (file_type == "MaxQuant_Evidence"){ data_converter <<- Data_Converter_MaxQuant_Evidence $new() } else { stop(sprintf("File type '%s' is not currently supported by PMD-FDR module", file_type)) } }, set_info = function(info){ parents[["info"]] <<- info }, get_info = function(){ return(verified_element_of_list(parents, "info", "Data_Object_Data_Converter$parents")) }, set_raw_data = function(raw_data){ parents[["raw_data"]] <<- raw_data }, get_raw_data = function(){ return(verified_element_of_list(parents, "raw_data", "Data_Object_Data_Converter$parents")) } ) ############################################################################### # Class: Data_Object_Groupings ############################################################################### Data_Object_Groupings <- setRefClass("Data_Object_Groupings", contains = "Data_Object", fields =list(df = "data.frame")) Data_Object_Groupings$methods( initialize = function(){ callSuper() class_name <<- "Data_Object_Groupings" }, simplify_field_name = function(x=NULL){ result <- gsub(pattern = "PMD_FDR_", replacement = "", x = x) return(result) }, verify = function(){ data_original <- get_data_converter()$df check_field_name(data_original, "data_converter", "PMD_FDR_input_score") check_field_name(data_original, "data_converter", "PMD_FDR_pmd") check_field_name(data_original, "data_converter", "PMD_FDR_spectrum_file") check_field_name(data_original, "data_converter", "PMD_FDR_proteins") check_field_name(data_original, "data_converter", "PMD_FDR_spectrum_title") check_field_name(data_original, "data_converter", "PMD_FDR_sequence") check_field_name(data_original, "data_converter", "PMD_FDR_decoy") }, m_load_data = function(){ make_data_groups <- function(data_original=NULL){ # Functions supporting make_data_groups() standardize_fields <- function(data=NULL){ data_new <- data info <- get_info() info$ensure() #field_name_of_score <- info$get_field_name_of_score() # #data_new <- rename_column(data_new, "Variable Modifications" , "ptm_list") # data_new <- rename_column(data_new, field_name_of_score , "PMD_FDR_input_score") # data_new <- rename_column(data_new, "Precursor m/z Error [ppm]", "PMD_FDR_pmd") # #data_new <- rename_column(data_new, "Isotope Number" , "isotope_number") # #data_new <- rename_column(data_new, "m/z" , "m_z") # #data_new <- rename_column(data_new, "Measured Charge" , "charge") # data_new <- rename_column(data_new, "Spectrum File" , "PMD_FDR_spectrum_file") # data_new <- rename_column(data_new, "Protein(s)" , "PMD_FDR_proteins") # data_new <- rename_column(data_new, "Spectrum Title" , "PMD_FDR_spectrum_title") # data_new <- manage_decoy_column(data_new) # Now managed in Data_Converter # data_new$PMD_FDR_input_score <- data_new[, field_name_of_score ] # data_new$PMD_FDR_pmd <- data_new[, "Precursor m/z Error [ppm]"] # data_new$PMD_FDR_spectrum_file <- data_new[, "Spectrum File" ] # data_new$PMD_FDR_proteins <- data_new[, "Protein(s)" ] # data_new$PMD_FDR_spectrum_title <- data_new[, "Spectrum Title" ] data_new$value <- data_new$PMD_FDR_pmd data_new$PMD_FDR_peptide_length <- str_length(data_new$PMD_FDR_sequence) #data_new$charge_value <- with(data_new, as.numeric(substr(charge, start=1, stop=str_length(charge)-1))) #data_new$measured_mass <- with(data_new, m_z*charge_value) data_new$PMD_FDR_spectrum_index <- NA data_new$PMD_FDR_spectrum_index[order(data_new$PMD_FDR_spectrum_title, na.last = TRUE)] <- 1:nrow(data_new) return(data_new) } add_grouped_variable <- function(data_groups = data_groups, field_name_to_group = NULL, vec.length.out = NULL, vec.tolerance = NULL, value_format = NULL){ # Support functions for add_grouped_variable() find_interval_vec <- function(x=NULL, length.out = NULL, tolerance = NULL){ q <- quantile(x = x, probs = seq(from=0, to=1, length.out = length.out), na.rm=TRUE) q <- round_to_tolerance(q, tolerance = tolerance) return(q) } get_group_data_frame <- function(vec=NULL, value_format = NULL){ n <- length(vec) a <- vec[-n] b <- vec[-1] lower <- ifelse(a == b , "eq", NA) lower <- ifelse(is.na(lower ), "ge", lower) upper <- ifelse(a == b , "eq", NA) upper[n-1] <- ifelse(is.na(upper[n-1]), "le", "eq") upper <- ifelse(is.na(upper ), "lt", upper) group <- data.frame(list(idx=1:(n-1), a=a, b=b, lower=lower, upper=upper)) name_format <- sprintf("%%%s_%%%s_%%s_%%s", value_format, value_format) group$new_var <- with(group, sprintf(name_format, a, b, lower, upper)) return(group) } merge_group_with_data <- function(data_groups = NULL, group = NULL, vec = NULL, field_name_to_group = NULL){ field_name_new <- sprintf("group_%s", simplify_field_name(field_name_to_group)) group_idx <- findInterval(x = data_groups[,field_name_to_group], vec = vec, all.inside=TRUE) data_groups$new_var <- group$new_var[group_idx] data_groups <- rename_column(data_groups, "new_var", field_name_new) } # Body of add_grouped_variable() vec <- find_interval_vec(x = data_groups[[field_name_to_group]], length.out = vec.length.out, tolerance = vec.tolerance ) group <- get_group_data_frame(vec = vec, value_format = value_format) df_new <- merge_group_with_data(data_groups = data_groups, group = group, vec = vec, field_name_to_group = field_name_to_group) df_new <- add_group_decoy(df_new, field_name_to_group) return(df_new) } add_already_grouped_variable <- function(field_name_to_group = NULL, data_groups = NULL ){ old_name <- field_name_to_group new_name <- sprintf("group_%s", simplify_field_name(old_name)) df_new <- data_groups df_new[[new_name]] <- data_groups[[old_name]] df_new <- add_group_decoy(data_groups = df_new, field_name_to_group = field_name_to_group) return(df_new) } add_value_norm <- function(data_groups = NULL){ df_new <- data_groups df_new$value_norm <- with(df_new, value - median_of_group_index) return(df_new) } add_protein_group <-function(data_groups = NULL){ data_new <- data_groups df_group_def <- data.frame(stringsAsFactors = FALSE, list(pattern = c("" , "pfu_" , "cRAP"), group_name = c("human", "pyrococcus", "contaminant"))) for (i in 1:nrow(df_group_def)){ idx <- grepl(pattern = df_group_def$pattern[i], x = data_new$PMD_FDR_proteins) data_new$group_proteins[idx] <- df_group_def$group_name[i] } data_new <- add_group_decoy(data_groups = data_new, field_name_to_group = "PMD_FDR_proteins") return(data_new) } add_group_decoy <- function(data_groups=NULL, field_name_to_group=NULL){ simple_field_name <- simplify_field_name(field_name_to_group) field_name_decoy <- sprintf("group_decoy_%s", simple_field_name) field_name_group <- sprintf("group_%s", simple_field_name) data_groups[[field_name_decoy]] <- with(data_groups, ifelse(PMD_FDR_decoy, "decoy", data_groups[[field_name_group]])) return(data_groups) } add_group_training_class <- function(data_groups = NULL){ df_new <- data_groups lowest_confidence_group <- min(data_groups$group_input_score) is_long_enough <- with(df_new, (PMD_FDR_peptide_length >= MIN_GOOD_PEPTIDE_LENGTH) ) is_good <- with(df_new, (PMD_FDR_decoy == 0) & (PMD_FDR_input_score == 100) ) is_bad <- with(df_new, (PMD_FDR_decoy == 1) ) #is_used_to_train <- with(df_new, used_to_find_middle) # BUGBUG: circular definition idx_good <- which(is_good ) # & is_long_enough) n_good <- length(idx_good) idx_testing <- idx_good[c(TRUE,FALSE)] # Selects every other item idx_training <- setdiff(idx_good, idx_testing) #is_good_short <- with(df_new, is_good & !is_long_enough ) #is_good_long <- with(df_new, is_good & is_long_enough ) is_bad_short <- with(df_new, is_bad & !is_long_enough ) is_bad_long <- with(df_new, is_bad & is_long_enough ) #is_good_training <- with(df_new, is_good_long & (used_to_find_middle == TRUE ) ) #is_good_testing <- with(df_new, is_good_long & (used_to_find_middle == FALSE) ) df_new$group_training_class <- "other_short" # Default df_new$group_training_class[is_long_enough ] <- "other_long" # Default (if long enough) df_new$group_training_class[idx_training ] <- "good_training" # Length does not matter (anymore) df_new$group_training_class[idx_testing ] <- "good_testing" # Ditto #df_new$group_training_class[is_good_short ] <- "good_short" df_new$group_training_class[is_bad_long ] <- "bad_long" # ...except for "bad" df_new$group_training_class[is_bad_short ] <- "bad_short" df_new <- add_used_to_find_middle( data_groups = df_new ) # Guarantees consistency between duplicated definitions return(df_new) } add_used_to_find_middle <- function(data_groups = NULL){ df_new <- data_groups idx_used <- which(data_groups$group_training_class == "good_training") df_new$used_to_find_middle <- FALSE df_new$used_to_find_middle[idx_used] <- TRUE return(df_new) } add_group_spectrum_index <- function(data_groups = NULL){ # Supporting functions for add_group_spectrum_index() get_breaks_all <- function(df_new){ # Supporting function(s) for get_breaks_all() get_cut_points <- function(data_subset){ # Supporting function(s) for get_cut_points() cut_values <- function(data=NULL, minimum_segment_length=NULL){ # using cpt.mean -- Appears to have a memory leak #results_cpt <- cpt.mean(data=data, method="PELT", minimum_segment_length=minimum_segment_length) #results <- results_cpt@cpts # Just look at the end #results <- c(length(data)) # regularly spaced, slightly larger than minimum_segment_length n_points <- length(data) n_regions <- floor(n_points / minimum_segment_length) n_regions <- ifelse(n_regions == 0, 1, n_regions) results <- round(seq(1, n_points, length.out = n_regions + 1)) results <- results[-1] return(results) } remove_last <- function(x){ return(x[-length(x)] ) } # Main code of for get_cut_points() max_idx = max(data_subset$PMD_FDR_spectrum_index) data_sub_sub <- subset(data_subset, group_training_class == "good_training") #(PMD_FDR_input_score==100) & (PMD_FDR_decoy==0)) minimum_segment_length = 50 values <- data_sub_sub$value n_values <- length(values) local_to_global_idx <- data_sub_sub$PMD_FDR_spectrum_index if (n_values <= minimum_segment_length){ result <- c() } else { local_idx <- cut_values(data=values, minimum_segment_length=minimum_segment_length) result <- local_to_global_idx[local_idx] result <- remove_last(result) } result <- c(result, max_idx) return(result) } remove_last <- function(vec) { return(vec[-length(vec)]) } # Main code of get_breaks_all() breaks <- 1 files <- unique(df_new$PMD_FDR_spectrum_file) for (local_file in files){ data_subset <- subset(df_new, (PMD_FDR_spectrum_file==local_file)) if (nrow(data_subset) > 0){ breaks <- c(breaks, get_cut_points(data_subset)) } } breaks <- sort(unique(breaks)) breaks <- remove_last(breaks) breaks <- c(breaks, max(df_new$PMD_FDR_spectrum_index + 1)) return(breaks) } # Main code of add_group_spectrum_index() field_name_to_group <- "PMD_FDR_spectrum_index" df_new <- data_groups[order(data_groups[[field_name_to_group]]),] breaks <- get_breaks_all(df_new) df_new$group_spectrum_index <- cut(x = df_new[[field_name_to_group]], breaks = breaks, right = FALSE, dig.lab = 6) df_new <- add_group_decoy(data_groups = df_new, field_name_to_group = field_name_to_group) return(df_new) } add_median_of_group_index <-function(data_groups = NULL){ field_median <- "median_of_group_index" data_good <- subset(data_groups, used_to_find_middle ) med <- aggregate(value~group_spectrum_index, data=data_good, FUN=safe_median) med <- rename_column(med, "value", field_median) data_groups[[field_median]] <- NULL df_new <- merge(data_groups, med) return(df_new) } add_1_percent_to_data_groups <- function(data_groups=NULL){ data_new <- data_groups if (get_raw_1_percent()$exists()){ # Load 1 percent file df_1_percent <- get_raw_1_percent()$df # Get relevant fields df_1_percent$is_in_1percent <- TRUE df_1_percent <- rename_column(df_1_percent, "Spectrum Title", "PMD_FDR_spectrum_title") df_1_percent <- df_1_percent[,c("PMD_FDR_spectrum_title", "is_in_1percent")] # Merge with data_groups data_new <- merge(data_new, df_1_percent, all.x=TRUE) data_new$is_in_1percent[is.na(data_new$is_in_1percent)] <- FALSE } # Save results return(data_new) } # Main code of make_data_groups() data_groups <- standardize_fields(data_original) data_groups <- add_grouped_variable(field_name_to_group = "PMD_FDR_input_score", data_groups = data_groups, vec.length.out = 14, vec.tolerance = 1, value_format = "03d") data_groups <- add_grouped_variable(field_name_to_group = "PMD_FDR_pmd", data_groups = data_groups, vec.length.out = 21, vec.tolerance = 0.1, value_format = "+05.1f") data_groups <- add_grouped_variable(field_name_to_group = "PMD_FDR_peptide_length", data_groups = data_groups, vec.length.out = 11, vec.tolerance = 1, value_format = "02d") # data_groups <- add_grouped_variable(field_name_to_group = "m_z", # data_groups = data_groups, # vec.length.out = 11, # vec.tolerance = 10, # value_format = "04.0f") # # data_groups <- add_grouped_variable(field_name_to_group = "measured_mass", # data_groups = data_groups, # vec.length.out = 11, # vec.tolerance = 1, # value_format = "04.0f") # # data_groups <- add_already_grouped_variable(field_name_to_group = "isotope_number", # data_groups = data_groups ) # # data_groups <- add_already_grouped_variable(field_name_to_group = "charge", # data_groups = data_groups ) # data_groups <- add_already_grouped_variable(field_name_to_group = "PMD_FDR_spectrum_file", data_groups = data_groups ) data_groups <- add_protein_group(data_groups = data_groups) data_groups <- add_group_training_class( data_groups = data_groups) data_groups <- add_group_spectrum_index( data_groups = data_groups) data_groups <- add_median_of_group_index( data_groups = data_groups) data_groups <- add_value_norm( data_groups = data_groups) # fields_of_interest <- c("PMD_FDR_input_score", "PMD_FDR_pmd", "m_z", "PMD_FDR_peptide_length", "isotope_number", "charge", "PMD_FDR_spectrum_file", "measured_mass", "PMD_FDR_spectrum_index", "PMD_FDR_proteins") # fields_of_interest <- c("value", # "PMD_FDR_decoy", # "PMD_FDR_spectrum_title", # "median_of_group_index", # "value_norm", # "used_to_find_middle", # "group_training_class", # fields_of_interest, # sprintf("group_%s" , fields_of_interest), # sprintf("group_decoy_%s", fields_of_interest)) fields_of_interest <- c("PMD_FDR_input_score", "PMD_FDR_pmd", "PMD_FDR_peptide_length", "PMD_FDR_spectrum_file", "PMD_FDR_spectrum_index", "PMD_FDR_proteins") fields_of_interest <- c("value", "PMD_FDR_decoy", "PMD_FDR_spectrum_title", "median_of_group_index", "value_norm", "used_to_find_middle", "group_training_class", fields_of_interest, sprintf("group_%s" , simplify_field_name(fields_of_interest)), sprintf("group_decoy_%s", simplify_field_name(fields_of_interest))) data_groups <- data_groups[,fields_of_interest] data_groups <- add_1_percent_to_data_groups(data_groups) return(data_groups) } data_original <- get_data_converter()$df #parents[[INDEX_OF_ORIGINAL_DATA]]$df df <<- make_data_groups(data_original) }, set_info = function(info){ parents[["info"]] <<- info }, get_info = function(){ return(verified_element_of_list(parents, "info", "Data_Object_Groupings$parents")) }, set_data_converter = function(data_converter){ parents[["data_converter"]] <<- data_converter }, get_data_converter = function(){ return(verified_element_of_list(parents, "data_converter", "Data_Object_Groupings$parents")) }, set_raw_1_percent = function(raw_1_percent){ ############## BUGBUG: the 1% file should be using the same file type format as the standard data (but isn't) parents[["raw_1_percent"]] <<- raw_1_percent }, get_raw_1_percent = function(){ return(verified_element_of_list(parents, "raw_1_percent", "Data_Object_Groupings$parents")) } ) ############################################################################### # Class: Data_Object_Individual_FDR ############################################################################### Data_Object_Individual_FDR <- setRefClass("Data_Object_Individual_FDR", contains = "Data_Object", fields =list(df = "data.frame")) Data_Object_Individual_FDR$methods( initialize = function(){ callSuper() class_name <<- "Data_Object_Individual_FDR" }, verify = function(){ data_groups = get_data_groups()$df densities = get_densities()$df alpha = get_alpha()$df check_field_name(data_groups, "data_groups", "value_norm") check_field_name(data_groups, "data_groups", "group_decoy_input_score") check_field_name(data_groups, "data_groups", "PMD_FDR_peptide_length") check_field_name(data_groups, "data_groups", "PMD_FDR_input_score") check_field_name(alpha, "alpha", "alpha") # BUGBUG: I'm missing a field here... check_field_name(densities, "densities", "x") check_field_name(densities, "densities", "t") check_field_name(densities, "densities", "f") }, set_data_groups = function(parent){ parents[["data_groups"]] <<- parent }, get_data_groups = function(){ return(verified_element_of_list(parents, "data_groups", "Data_Object_Individual_FDR$parents")) }, set_densities = function(parent){ parents[["densities"]] <<- parent }, get_densities = function(){ return(verified_element_of_list(parents, "densities", "Data_Object_Individual_FDR$parents")) }, set_alpha = function(parent){ parents[["alpha"]] <<- parent }, get_alpha = function(){ return(verified_element_of_list(parents, "alpha", "Data_Object_Individual_FDR$parents")) }, m_load_data = function(){ add_FDR_to_data_groups <- function(data_groups=NULL, densities=NULL, alpha=NULL, field_value=NULL, field_decoy_group=NULL, set_decoy_to_1=FALSE){ # Support functions for add_FDR_to_data_groups() get_group_fdr <- function(group_stats = NULL, data_groups = NULL, densities=NULL){ group_fdr <- apply(X = densities, MARGIN = 2, FUN = max) df_group_fdr <- data.frame(group_fdr) df_group_fdr <- rename_column(df_group_fdr, "group_fdr", "v") df_group_fdr$group_of_interest <- names(group_fdr) t <- df_group_fdr[df_group_fdr$group_of_interest == "t", "v"] f <- df_group_fdr[df_group_fdr$group_of_interest == "f", "v"] df_group_fdr <- subset(df_group_fdr, !(group_of_interest %in% c("x", "t", "f"))) df_group_fdr$group_fdr <-(df_group_fdr$v - t) / (f - t) return(df_group_fdr) } get_mode <- function(x){ d <- density(x) return(d$x[which.max(d$y)]) } # Main code for add_FDR_to_data_groups() # Set up analysis data_new <- data_groups data_new$value_of_interest <- data_new[,field_value] data_new$group_of_interest <- data_new[,field_decoy_group] data_subset <- subset(data_new, PMD_FDR_peptide_length >= 11) # Identify mean PMD_FDR_input_score per group group_input_score <- aggregate(PMD_FDR_input_score~group_of_interest, data=data_subset, FUN=mean) group_input_score <- rename_column(group_input_score, "PMD_FDR_input_score", "group_input_score") #group_fdr <- get_group_fdr(data_groups = data_subset, densities=densities) group_stats <- merge(alpha, group_input_score) group_stats <- subset(group_stats, group_of_interest != "PMD_FDR_decoy") x=c(0,group_stats$group_input_score) y=c(1,group_stats$alpha) FUN_interp <- approxfun(x=x,y=y) data_new$interpolated_groupwise_FDR <- FUN_interp(data_new$PMD_FDR_input_score) if (set_decoy_to_1){ data_new$interpolated_groupwise_FDR[data_new$PMD_FDR_decoy == 1] <- 1 } return(data_new) } data_groups = get_data_groups()$df densities = get_densities()$df alpha = get_alpha()$df d_true <- densities[,c("x", "t")] d_false <- densities[,c("x", "f")] i_fdr <- add_FDR_to_data_groups(data_groups = data_groups, densities = densities, alpha = alpha, field_value ="value_norm", field_decoy_group = "group_decoy_input_score") # Derive local t interp_t <- splinefun(x=d_true$x, y=d_true$t) #approxfun(x=d_true$x, y=d_true$y) # Derive local f interp_f <- splinefun(x=d_false$x, y=d_false$f) #approxfun(x=d_true$x, y=d_true$y) # Derive local FDR i_fdr$t <- interp_t(i_fdr$value_of_interest) i_fdr$f <- interp_f(i_fdr$value_of_interest) i_fdr$alpha <- i_fdr$interpolated_groupwise_FDR i_fdr$i_fdr <- with(i_fdr, (alpha*f) / (alpha*f + (1-alpha)*t)) df <<- i_fdr } ) ############################################################################### # Class: Data_Object_Densities ############################################################################### Data_Object_Densities <- setRefClass("Data_Object_Densities", contains = "Data_Object", fields =list(df = "data.frame")) Data_Object_Densities$methods( initialize = function(){ callSuper() class_name <<- "Data_Object_Densities" }, verify = function(){ df_data_groups <- get_data_groups()$df checkTrue(nrow(df_data_groups) > 0, msg = "data_groups data frame was empty (and should not have been)") check_field_name(df_data_groups, "data_groups", "value_norm") check_field_name(df_data_groups, "data_groups", "group_decoy_input_score") check_field_name(df_data_groups, "data_groups", "group_training_class") }, set_data_groups = function(parent=NULL){ parents[["data_groups"]] <<- parent }, get_data_groups = function(){ return(verified_element_of_list(parent_list = parents, element_name = "data_groups", object_name = "Data_Object_Densities$parents")) }, m_load_data = function(){ # Support functions for make_densities() set_values_of_interest <- function(df_data_groups=NULL, field_group = NULL){ field_value = "value_norm" new_data_groups <- get_data_groups()$df new_data_groups$value_of_interest <- new_data_groups[,field_value] new_data_groups$group_of_interest <- new_data_groups[,field_group] #groups <- sort(unique(new_data_groups$group_of_interest)) return(new_data_groups) } get_ylim <- function(data_groups=NULL){ ylim <- range(data_groups$value_of_interest, na.rm = TRUE) return(ylim) } make_hit_density <- function(data_subset=NULL, descr_of_df=NULL, ylim=NULL){ #stop("Data_Object_Densities$make_hit_density() is untested beyond here") verify_density = function(data_subset=NULL, value_field=NULL, descr_of_df=NULL, ylim=NULL){ values <- data_subset[value_field] values <- values[! is.na(values)] if (length(values) < MIN_ACCEPTABLE_POINTS_IN_DENSITY){ stop (sprintf("There are too few valid %s (%d < %d) in %s to be used for calculating a density function", value_field, length(values), MIN_ACCEPTABLE_POINTS_IN_DENSITY, descr_of_df)) } d <- density(values, from = ylim[1], to = ylim[2]) return(d) } uniformalize_density <- function(d){ # Reorganizes y-values of density function so that # function is monotone increasing to mode # and monotone decreasing afterwards idx_mode <- which.max(d$y) idx_lower <- 1:(idx_mode-1) idx_upper <- idx_mode:length(d$y) values_lower <- d$y[idx_lower] values_upper <- d$y[idx_upper] new_d <- d new_d$y <- c(sort(values_lower, decreasing = FALSE), sort(values_upper, decreasing = TRUE)) return(new_d) } local_df <- subset(data_subset, (PMD_FDR_peptide_length >= MIN_GOOD_PEPTIDE_LENGTH) & (used_to_find_middle == FALSE)) d <- verify_density (data_subset=local_df, value_field = "value_of_interest", descr_of_df = descr_of_df, ylim=ylim) d <- normalize_density (d) d <- uniformalize_density(d) return(d) } make_true_hit_density <- function(data_groups=NULL){ d_true <- make_hit_density(data_subset = subset(data_groups, (group_training_class == "good_testing") ), descr_of_df = "Good-testing dataset", ylim = get_ylim(data_groups)) return(d_true) } make_false_hit_density <- function(data_groups=NULL){ d_false <- make_hit_density(data_subset = subset(data_groups, (group_training_class == "bad_long") ), descr_of_df = "Bad-long dataset", ylim = get_ylim(data_groups)) return(d_false) } add_v_densities <- function(data_groups=NULL, densities=NULL, field_group = NULL){ groups <- sort(unique(data_groups$group_of_interest)) new_densities <- densities for (local_group in groups){ d_v <- make_hit_density(data_subset = subset(data_groups, (group_of_interest == local_group)), descr_of_df = sprintf("subset of data (where %s is '%s')", field_group, local_group), ylim = get_ylim(data_groups)) new_densities[local_group] <- d_v$y } return(new_densities) } # Main section of make_densities() df_data_groups <- get_data_groups()$df new_data_groups <- set_values_of_interest(df_data_groups, field_group = "group_decoy_input_score") d_true <- make_true_hit_density( new_data_groups) d_false <- make_false_hit_density(new_data_groups) densities <- data.frame(x=d_true$x, t=d_true$y, f=d_false$y) densities <- add_v_densities(data_groups=new_data_groups, densities=densities, field_group = "group_decoy_input_score") df <<- densities } ) ############################################################################### # Class: Data_Object_Alpha ############################################################################### Data_Object_Alpha <- setRefClass("Data_Object_Alpha", contains = "Data_Object", fields =list(df = "data.frame")) Data_Object_Alpha$methods( initialize = function(){ callSuper() class_name <<- "Data_Object_Alpha" }, verify = function(){ densities <- get_densities()$df checkTrue(nrow(densities) > 0, msg = "Densities data.frame was empty (and should not have been)") }, set_densities = function(parent=NULL){ parents[["densities"]] <<- parent }, get_densities = function(){ return(verified_element_of_list(parent_list = parents, element_name = "densities", object_name = "Data_Object_Alpha")) }, m_load_data = function(){ densities <- get_densities()$df max_of_density = apply(X = densities, MARGIN = 2, FUN = max) df_alpha <- data.frame(stringsAsFactors = FALSE, list(v = max_of_density, group_of_interest = names(max_of_density))) df_alpha <- subset(df_alpha, group_of_interest != "x") t <- with(subset(df_alpha, group_of_interest=="t"), v) f <- with(subset(df_alpha, group_of_interest=="f"), v) df_alpha$alpha <- with(df_alpha, (t-v)/(t-f)) alpha <- df_alpha[,c("group_of_interest", "alpha")] alpha <- subset(alpha, (group_of_interest != "t") & (group_of_interest != "f")) df <<- alpha } ) ############################################################################### # Class: Data_Processor ############################################################################### Data_Processor <- setRefClass("Data_Processor", fields =list(info = "Data_Object_Info", raw_data = "Data_Object_Raw_Data", raw_1_percent = "Data_Object_Raw_1_Percent", data_converter = "Data_Object_Data_Converter", data_groups = "Data_Object_Groupings", densities = "Data_Object_Densities", alpha = "Data_Object_Alpha", i_fdr = "Data_Object_Individual_FDR")) Data_Processor$methods( initialize = function(p_info=NULL){ if (! is.null(p_info)){ set_info(p_info) } }, set_info = function(p_info=NULL){ # This initialization defines all of the dependencies between the various components info <<- p_info # raw_data raw_data$set_info(info) info$append_child(raw_data) # raw_1_percent raw_1_percent$set_info(info) info$append_child(raw_1_percent) # data_converter data_converter$set_info (info) data_converter$set_raw_data(raw_data) info $append_child (data_converter) raw_data $append_child (data_converter) # data_groups data_groups$set_info (info) data_groups$set_data_converter(data_converter) data_groups$set_raw_1_percent (raw_1_percent) info $append_child (data_groups) data_converter$append_child (data_groups) raw_1_percent $append_child (data_groups) # densities densities $set_data_groups(data_groups) data_groups$append_child (densities) # alpha alpha $set_densities(densities) densities$append_child (alpha) # i_fdr i_fdr$set_data_groups(data_groups) i_fdr$set_densities (densities) i_fdr$set_alpha (alpha) data_groups $append_child(i_fdr) densities $append_child(i_fdr) alpha $append_child(i_fdr) } ) ############################################################# ####### Classes for Plotting ############################################################# ############################################################################### # Class: Plot_Image ############################################################################### Plot_Image = setRefClass("Plot_Image", fields = list(data_processors = "list", plot_title = "character", include_text = "logical", include_main = "logical", x.intersp = "numeric", y.intersp = "numeric", scale = "numeric", main = "character", is_image_container = "logical")) Plot_Image$methods( initialize = function(p_data_processors = list(), p_include_main = TRUE, p_include_text = TRUE, p_is_image_container = FALSE){ include_main <<- p_include_main include_text <<- p_include_text data_processors <<- p_data_processors is_image_container <<- p_is_image_container }, plot_image = function(){ plot(main="Define plot_image() for subclass") # Abstract function }, get_n = function(){ stop("Need to define function get_n() for subclass") #Abstract function }, create_standard_main = function(){ needs_main <- function(){ return(include_text & include_main & !is_image_container) } if (needs_main()){ collection_name <- data_processors[[1]]$info$collection_name() main <<- sprintf("%s\n(Dataset: %s; n=%s)", plot_title, collection_name, format(get_n(), big.mark = ",")) } }, plot_image_in_window = function(p_scale=NULL, window_height=NULL, window_width=NULL){ scale <<- p_scale SIZE_AXIS <- 2.5 * scale # in the units used by mar SIZE_MAIN <- 2.5 * scale SIZE_NO_MARGIN <- 0.1 * scale FONT_SIZE <- 8 * scale WINDOW_WIDTH <- window_width * scale WINDOW_HEIGHT <- window_height * scale X_INTERSP <- 0.5 * scale + 0.4 # manages legend text spacing Y_INTERSP <- 0.5 * scale + 0.4 # manages if (include_main){ mar = c(SIZE_AXIS, SIZE_AXIS, SIZE_MAIN , SIZE_NO_MARGIN) } else { mar = c(SIZE_AXIS, SIZE_AXIS, SIZE_NO_MARGIN, SIZE_NO_MARGIN) } mgp = c(SIZE_AXIS/2, SIZE_AXIS/4, 0) # Margin line (mex units) for axis title, axis labels, axis lines ps = FONT_SIZE x.intersp <<- X_INTERSP y.intersp <<- Y_INTERSP windows(width = WINDOW_WIDTH, height=WINDOW_HEIGHT) old_par <- par(mar=mar, ps=ps, mgp=mgp) create_standard_main() plot_image() if (!is_image_container){ axis(side=1, labels=include_text, tcl=-0.5, lwd=scale) axis(side=2, labels=include_text, tcl=-0.5, lwd=scale) box(lwd=scale) } par(old_par) }, plot_image_in_small_window = function(p_scale=1){ plot_image_in_window(p_scale=p_scale, window_height=2, window_width=3.25) }, plot_image_in_large_window = function(p_scale=1, window_height=NULL){ plot_image_in_window(p_scale=p_scale, window_height=window_height, window_width=7) } ) ############################################################################### # Class: Legend_Object ############################################################################### Legend_Object = setRefClass("Legend_Object", contains = "Plot_Image", fields = list(user_params = "list", scale = "numeric")) Legend_Object$methods( initialize = function(p_user_params = NULL, p_scale = NULL){ if (is.null(p_user_params)){ user_params <<- list() } else { user_params <<- p_user_params } if (is.null(p_scale)){ stop("Legend_Object must have a valid scale") } else { scale <<- p_scale } user_params$x <<- if_null(user_params$x , "topleft", user_params$x) user_params$y <<- if_null(user_params$y , NULL, user_params$y) user_params$bty <<- if_null(user_params$bty , "o", user_params$bty) user_params$lwd <<- if_null(user_params$lwd , NULL, user_params$lwd * scale) # Because we allow NULL, scale must be inside parens user_params$seg.len <<- if_null(user_params$seg.len , 3, user_params$seg.len ) * scale user_params$box.lwd <<- if_null(user_params$box.lwd , 1, user_params$box.lwd ) * scale user_params$x.intersp <<- if_null(user_params$x.intersp, 0.6, user_params$x.intersp) * scale user_params$y.intersp <<- if_null(user_params$y.intersp, 0.4, user_params$y.intersp) * scale + 0.2 }, show = function(){ first_legend = legend(x = user_params$x, y = user_params$y, title = "", legend = user_params$leg, col = user_params$col, bty = user_params$bty, lty = user_params$lty, lwd = user_params$lwd, seg.len = user_params$seg.len, box.lwd = user_params$box.lwd, x.intersp = user_params$x.intersp, y.intersp = user_params$y.intersp) new_x = first_legend$rect$left new_y = first_legend$rect$top + first_legend$rect$h * ifelse(scale==1, 0.07, 0.03 - (scale * 0.02)) #switch(scale, 0.01, -0.01, -0.03, -0.05)# (0.07 - 0.09 * ((scale-1)^2))#(0.15 - 0.08*scale)#.07 * (2 - scale) legend(x=new_x, y=new_y, title = user_params$title, legend = "", cex=1.15, bty="n") } ) ############################################################################### # Class: Plot_Multiple_Images ############################################################################### Plot_Multiple_Images = setRefClass("Plot_Multiple_Images", contains = "Plot_Image", fields = list(n_images_wide = "numeric", n_images_tall = "numeric", image_list = "list")) Plot_Multiple_Images$methods( initialize = function(p_n_images_wide=1, p_n_images_tall=2, p_image_list=NULL, ...){ n_images_wide <<- p_n_images_wide n_images_tall <<- p_n_images_tall image_list <<- p_image_list #plot_title <<- "True Hit and False Hit Distributions" callSuper(p_is_image_container=TRUE, ...) }, plot_image = function(){ # Support functions apply_mtext <- function(letter=NULL){ line=1.3*scale mtext(letter, side=1, line=line, adj=0) } # main code old_par <- par(mfrow=c(n_images_tall, n_images_wide)) i=0 n_images <- length(image_list) for (i in 1:n_images){ image <- image_list[[i]] image$create_standard_main() image$scale <- scale image$plot_image() axis(side=1, labels=include_text, tcl=-0.5, lwd=scale) axis(side=2, labels=include_text, tcl=-0.5, lwd=scale) box(lwd=scale) apply_mtext(letter=sprintf("(%s)", letters[i])) } par(old_par) } ) ############################################################################### # Class: Plot_Compare_PMD_and_Norm_Density ############################################################################### Plot_Compare_PMD_and_Norm_Density = setRefClass("Plot_Compare_PMD_and_Norm_Density", contains = "Plot_Image", fields = list(show_norm = "logical", display_n_psms = "logical")) Plot_Compare_PMD_and_Norm_Density$methods( initialize = function(p_show_norm=TRUE, p_display_n_psms=TRUE, ...){ show_norm <<- p_show_norm display_n_psms <<- p_display_n_psms plot_title <<- "True Hit and False Hit Distributions" callSuper(...) }, plot_image = function(){ # Support functions for plot_compare_PMD_and_norm_density() get_densities <- function(data_subset = NULL, var_value = NULL){ data_subset$value_of_interest <- data_subset[,var_value] from <- min(data_subset$value_of_interest, na.rm = TRUE) to <- max(data_subset$value_of_interest, na.rm = TRUE) xlim = range(data_subset$value_of_interest) data_true <- subset(data_subset, (PMD_FDR_decoy==0) & (PMD_FDR_input_score==100)) data_false <- subset(data_subset, (PMD_FDR_decoy==1)) d_true <- with(data_true , density(value_of_interest, from = from, to = to, na.rm = TRUE)) d_false <- with(data_false, density(value_of_interest, from = from, to = to, na.rm = TRUE)) d_true <- normalize_density(d_true) d_false <- normalize_density(d_false) densities <- list(d_true=d_true, d_false=d_false, var_value = var_value, n_true = nrow(data_true), n_false = nrow(data_false)) return(densities) } get_xlim <- function(densities_a = NULL, densities_b = NULL, show_norm=NULL){ xlim <- range(c( densities_a$d_true$x, densities_a$d_false$y)) if (show_norm){ xlim <- range(c(xlim, densities_b$d_true$x, densities_b$d_false$y)) } return(xlim) } get_ylim <- function(densities_a = NULL, densities_b = NULL, show_norm=NULL){ ylim <- range(c( densities_a$d_true$y, densities_a$d_false$y)) if (show_norm){ ylim <- range(c(ylim, densities_b$d_true$y, densities_b$d_false$y)) } return(ylim) } plot_distributions <- function(densities = NULL, var_value= NULL, dataset_name = NULL, ...){ leg = list() leg$leg = c("Good", "Bad") if (display_n_psms){ leg$leg = sprintf("%s (%d PSMs)", leg$leg, c(densities$n_true, densities$n_false)) } leg$col = c("black", "red") leg$lwd = c(3 , 3) leg$lty = c(1 , 2) leg$title = "Hit Category" xlab = ifelse(var_value == "value", "PMD (ppm)", "PMD - normalized (ppm)") ylab = "Density" if (!include_text){ xlab = "" ylab = "" } plot( densities$d_true , col=leg$col[1], lwd=leg$lwd[1] * scale, lty=leg$lty[1], xaxt = "n", yaxt = "n", main=main, xlab = xlab, ylab=ylab, ...) lines(densities$d_false, col=leg$col[2], lwd=leg$lwd[2] * scale, lty=leg$lty[2]) abline(v=0, h=0, col="gray", lwd=1*scale) if (include_text){ legend_object <- Legend_Object$new(leg, scale) legend_object$show() #legend("topleft", legend=leg.leg, col=leg.col, lwd=leg.lwd, lty=leg.lty, x.intersp = x.intersp, y.intersp = y.intersp) } } # Main code block for plot_compare_PMD_and_norm_density data_processor <- data_processors[[1]] data_processor$data_groups$ensure() data_groups <- data_processor$data_groups$df data_subset_a <- subset(data_groups , used_to_find_middle == FALSE) data_subset_b <- subset(data_subset_a, PMD_FDR_peptide_length > 11) densities_a <- get_densities(data_subset = data_subset_a, var_value = "value") densities_b <- get_densities(data_subset = data_subset_b, var_value = "value_norm") xlim=get_xlim(densities_a, densities_b, show_norm = show_norm) ylim=get_ylim(densities_a, densities_b, show_norm = show_norm) dataset_name <- data_processor$info$collection_name plot_distributions( densities=densities_a, var_value = "value" , dataset_name = dataset_name, xlim=xlim, ylim=ylim) if (show_norm){ plot_distributions(densities=densities_b, var_value = "value_norm", dataset_name = dataset_name, xlim=xlim, ylim=ylim) } }, get_n = function(){ data_processor <- data_processors[[1]] data_processor$data_groups$ensure() data_subset_a <- subset(data_processor$data_groups$df , used_to_find_middle == FALSE) data_subset_b <- subset(data_subset_a, PMD_FDR_peptide_length > 11) if (show_norm){ data_subset <- data_subset_a } else { data_subset <- data_subset_b } data_true <- subset(data_subset, (PMD_FDR_decoy==0) & (PMD_FDR_input_score==100)) data_false <- subset(data_subset, (PMD_FDR_decoy==1)) return(nrow(data_true) + nrow(data_false)) } ) ############################################################################### # Class: Plot_Time_Invariance_Alt ############################################################################### Plot_Time_Invariance_Alt = setRefClass("Plot_Time_Invariance_Alt", contains = "Plot_Image", fields = list(show_norm = "logical", display_n_psms = "logical", training_class = "character", ylim = "numeric", field_of_interest = "character")) Plot_Time_Invariance_Alt$methods( initialize = function(p_ylim=NULL, p_training_class=NULL, p_field_of_interest="value_norm", ...){ get_subset_title <- function(training_class=NULL){ if (training_class == "bad_long"){ subset_title="bad only" } else if (training_class == "good_testing"){ subset_title="good-testing only" } else if (training_class == "good_training"){ subset_title="good-training only" } else if (training_class == "other"){ subset_title="other only" } else { stop("Unexpected training_class in plot_time_invariance") } return(subset_title) } ylim <<- p_ylim training_class <<- p_training_class field_of_interest <<- p_field_of_interest subset_title <- get_subset_title(training_class=training_class) backup_title <- sprintf("Middle 25%% PMD for spectra sorted by index%s", ifelse(is.null(subset_title), "", sprintf(" - %s", subset_title))) #plot_title <<- get_main(main_title=main, backup_title=backup_title, data_collection = data_collection) plot_title <<- backup_title callSuper(...) }, plot_image = function(){ # Support functions for plot_time_invariance() # Main code of plot_time_invariance() data_subset = get_data_subset() plot_group_spectrum_index_from_subset_boxes(data_subset = data_subset) abline(h=0, col="blue", lwd=scale) }, get_data_subset = function(){ data_processor <- data_processors[[1]] data_processor$data_groups$ensure() return(subset(data_processor$data_groups$df, (group_training_class==training_class))) }, get_n = function(){ return(nrow(get_data_subset())) }, plot_group_spectrum_index_from_subset_boxes = function(data_subset = NULL){ n_plot_groups <- 100 field_name_text <- ifelse(field_of_interest=="value", "PMD", "Translated PMD") new_subset <- data_subset new_subset$value_of_interest <- new_subset[,field_of_interest] new_subset <- new_subset[order(new_subset$PMD_FDR_spectrum_index),] idxs <- round_to_tolerance(seq(from=1, to=nrow(new_subset), length.out = n_plot_groups+1), 1) idxs_left <- idxs[-(n_plot_groups+1)] idxs_right <- idxs[-1] - 1 idxs_right[n_plot_groups] <- idxs_right[n_plot_groups] + 1 new_subset$plot_group <- NA for (i in 1:n_plot_groups){ new_subset$plot_group[idxs_left[i]:idxs_right[i]] <- i } xleft <- aggregate(PMD_FDR_spectrum_index ~plot_group, data=new_subset, FUN=min) xright <- aggregate(PMD_FDR_spectrum_index ~plot_group, data=new_subset, FUN=max) ybottom <- aggregate(value_of_interest~plot_group, data=new_subset, FUN=function(x){quantile(x, probs = 0.5 - (0.25/2))}) ytop <- aggregate(value_of_interest~plot_group, data=new_subset, FUN=function(x){quantile(x, probs = 0.5 + (0.25/2))}) boxes <- merge( rename_column(xleft , "PMD_FDR_spectrum_index" , "xleft"), merge( rename_column(xright , "PMD_FDR_spectrum_index" , "xright"), merge(rename_column(ybottom, "value_of_interest", "ybottom"), rename_column(ytop , "value_of_interest", "ytop")))) xlab <- "Spectrum Index" ylab <- sprintf("%s (ppm)", field_name_text ) if (is.null(ylim)){ ylim <<- range(new_subset$value_of_interest) } if (!include_text){ xlab="" ylab="" } plot(value_of_interest~PMD_FDR_spectrum_index, data=new_subset, type="n", ylim=ylim, xlab = xlab, ylab=ylab, main=main, xaxt="n", yaxt="n") with(boxes, rect(xleft = xleft, ybottom = ybottom, xright = xright, ytop = ytop, lwd=scale)) #points(median_of_group_index~PMD_FDR_spectrum_index, data=data_subset, cex=.5, pch=15) axis(1, labels=include_text, lwd=scale) axis(2, labels=include_text, lwd=scale) box(lwd=scale) #box around plot area } ) ############################################################################### # Class: Plot_Time_Invariance_Alt_Before_and_After ############################################################################### Plot_Time_Invariance_Alt_Before_and_After = setRefClass("Plot_Time_Invariance_Alt_Before_and_After", contains = "Plot_Multiple_Images", fields = list()) Plot_Time_Invariance_Alt_Before_and_After$methods( initialize = function(p_data_processors = NULL, p_include_text=TRUE, p_include_main=FALSE, p_ylim = c(-4,4), ...){ plot_object1 <- Plot_Time_Invariance_Alt$new(p_data_processors = p_data_processors, p_include_text=p_include_text, p_include_main=p_include_main, p_training_class = "good_testing", p_field_of_interest = "value", p_ylim = p_ylim) plot_object2 <- Plot_Time_Invariance_Alt$new(p_data_processors = p_data_processors, p_include_text=p_include_text, p_include_main=p_include_main, p_training_class = "good_testing", p_field_of_interest = "value_norm", p_ylim = p_ylim) callSuper(p_n_images_wide=1, p_n_images_tall=2, p_include_text=p_include_text, p_include_main=p_include_main, p_image_list = list(plot_object1, plot_object2), ...) } ) ############################################################################### # Class: Plot_Density_PMD_and_Norm_Decoy_by_AA_Length ############################################################################### Plot_Density_PMD_and_Norm_Decoy_by_AA_Length = setRefClass("Plot_Density_PMD_and_Norm_Decoy_by_AA_Length", contains = "Plot_Image", fields = list(show_norm = "logical")) Plot_Density_PMD_and_Norm_Decoy_by_AA_Length$methods( initialize = function(p_show_norm=FALSE, ...){ plot_title <<- "The Decoy Bump: PMD Distribution of Decoy matches by peptide length" show_norm <<- p_show_norm callSuper(...) }, get_n = function(){ data_processor <- data_processors[[1]] data_processor$data_groups$ensure() data_subset <- subset(data_processor$data_groups$df, (PMD_FDR_decoy == 1)) return(nrow(data_subset)) }, plot_image = function(){ # Support functions for plot_density_PMD_and_norm_decoy_by_aa_length() add_group_peptide_length_special <- function(){ data_processor <- data_processors[[1]] data_processor$data_groups$ensure() data_groups <- data_processor$data_groups$df # the name data_groups is a data.frame instead of a Data_Object data_groups <- subset(data_groups, used_to_find_middle == FALSE) df_group_definition <- data.frame(stringsAsFactors = FALSE, list(group_peptide_length_special = c("06-08", "09-10", "11-12", "13-15", "16-20", "21-50"), min = c( 6 , 9 , 11 , 13 , 16 , 21 ), max = c( 8 , 10 , 12 , 15 , 20 , 50 ) )) group_peptide_length_special <- data.frame(list(PMD_FDR_peptide_length = 6:50)) group_peptide_length_special$min <- with(group_peptide_length_special, sapply(PMD_FDR_peptide_length, FUN = function(i) max(df_group_definition$min[df_group_definition$min <= i]))) group_peptide_length_special <- merge(group_peptide_length_special, df_group_definition) data_groups$group_peptide_length_special <- NULL new_data_groups <- (merge(data_groups, group_peptide_length_special[,c("PMD_FDR_peptide_length", "group_peptide_length_special")])) return(new_data_groups) } get_densities <- function(data_subset = NULL, field_value = NULL, field_group=NULL){ get_density_from_subset <- function(data_subset=NULL, xlim=NULL){ d_group <- with(data_subset , density(value_of_interest, from = xlim[1], to = xlim[2], na.rm=TRUE)) d_group <- normalize_density(d_group) return(d_group) } data_temp <- data_subset data_temp$value_of_interest <- data_temp[[field_value]] data_temp$group_of_interest <- data_temp[[field_group]] xlim = range(data_temp$value_of_interest, na.rm=TRUE) groups <- sort(unique(data_temp$group_of_interest)) n_groups <- length(groups) d_group <- get_density_from_subset( data_subset=data_temp, xlim = xlim ) densities <- list("All decoys" = d_group) for (i in 1:n_groups){ group <- groups[i] d_group <- get_density_from_subset( data_subset=subset(data_temp, (group_of_interest == group)), xlim = xlim ) densities[[group]] <- d_group } return(densities) } get_limits <- function(densities_a = NULL, densities_b = NULL){ xlim = c() ylim = c(0) for (single_density in densities_a){ xlim=range(c(xlim, single_density$x)) ylim=range(c(ylim, single_density$y)) } for (single_density in densities_b){ xlim=range(c(xlim, single_density$x)) ylim=range(c(ylim, single_density$y)) } return(list(xlim=xlim, ylim=ylim)) } plot_distributions <- function(data_groups = NULL, xlim=NULL, ylim=NULL, densities = NULL, field_group= NULL, field_value = "value", xlab_modifier = "", var_value= NULL, include_peak_dots=TRUE, dataset_name = NULL, ...){ data_groups$group_of_interest <- data_groups[[field_group]] data_groups$value_of_interest <- data_groups[[field_value]] # Main body of plot_decoy_distribution_by_field_of_interest() FIXED_LWD=3 groups <- sort(unique(data_groups$group_of_interest)) n <- length(groups) df_leg <- data.frame(stringsAsFactors = FALSE, list(leg = groups, col = rainbow_with_fixed_intensity(n = n, goal_intensity_0_1 = 0.4), lty = rep(1:6, length.out=n), lwd = rep(FIXED_LWD , n)) ) d <- densities[["All decoys"]] xlab = sprintf("Precursor Mass Discrepancy%s (ppm)", xlab_modifier) ylab = "Density" if (!include_text){ xlab="" ylab="" } plot(d, lwd=FIXED_LWD * scale, main=main, xlab=xlab, ylab=ylab, xlim=xlim, ylim=ylim, xaxt="n", yaxt="n") ave_peak <- max(d$y) max_peak <- 0 for (local_group in groups){ data_subset <- subset(data_groups, group_of_interest == local_group) data_info <- subset(df_leg , leg == local_group) col <- data_info$col[1] lty <- data_info$lty[1] lwd <- data_info$lwd[1] if (nrow(data_subset) > 100){ d <- densities[[local_group]] #density(data_subset[[field_value]]) lines(d, col=col, lty=lty, lwd=lwd * scale) peak <- max(d$y) max_peak <- max(max_peak, peak) } } abline(v=0, h=0, lwd=scale) leg <- list(title = "Peptide length (aa)", leg = c("All decoys" , df_leg$leg), col = c(col2hex("black") , df_leg$col), lty = c(1 , df_leg$lty), lwd = c(FIXED_LWD , df_leg$lwd) ) if (include_text){ legend_object = Legend_Object$new(leg, scale) legend_object$show() #first_legend = legend(x="topleft", title = "", legend = leg$leg, col = leg$col, lty = leg$lty, lwd = leg$lwd, seg.len=leg$seg.len, box.lwd=leg$box.lwd, x.intersp = leg$x.intersp, y.intersp = leg$y.intersp) #new_x = first_legend$rect$left #new_y = first_legend$rect$top + first_legend$rect$h * .07 * (2 - scale) #legend(x=new_x, y=new_y, title = leg$title, legend = "", cex=1.15, bty="n") } box(lwd=scale) #box around plot area } # Main body for plot_density_PMD_and_norm_decoy_by_aa_length() data_mod <- add_group_peptide_length_special() data_mod <- subset(data_mod, PMD_FDR_decoy==1) densities_a <- get_densities(data_subset = data_mod, field_value = "value" , field_group = "group_peptide_length_special") densities_b <- get_densities(data_subset = data_mod, field_value = "value_norm", field_group = "group_peptide_length_special") data_processor <- data_processors[[1]] dataset_name <- data_processor$info$collection_name() limits <- get_limits(densities_a, densities_b) xlim <- limits$xlim ylim <- limits$ylim if (show_norm){ plot_distributions(data_groups = data_mod, densities=densities_b, field_value = "value_norm", xlab_modifier = " - normalized", field_group = "group_peptide_length_special", dataset_name=dataset_name, xlim=xlim, ylim=ylim) } else { plot_distributions(data_groups = data_mod, densities=densities_a, field_value = "value" , xlab_modifier = "" , field_group = "group_peptide_length_special", dataset_name=dataset_name, xlim=xlim, ylim=ylim) } } ) ############################################################################### # Class: Plot_Bad_CI ############################################################################### Plot_Bad_CI = setRefClass("Plot_Bad_CI", contains = "Plot_Image", fields = list(breaks = "numeric", ylim = "numeric")) Plot_Bad_CI$methods( initialize = function(p_breaks=20, p_ylim=NULL, ...){ if (is.null(p_ylim)){ ylim <<- numeric(0) } else { ylim <<- p_ylim } breaks <<- p_breaks plot_title <<- "Credible Intervals for proportion within range - bad" callSuper(...) }, data_processor = function(){ return(data_processors[[1]]) }, get_n = function(){ data_processor()$data_groups$ensure() return(nrow(subset(data_processor()$data_groups$df, (PMD_FDR_decoy == 1)))) }, plot_image = function(){ data_processor()$data_groups$ensure() data_groups <- data_processor()$data_groups$df data_decoy <- subset(data_groups, data_groups$group_training_class == "bad_long") data_decoy$region <- cut(x = data_decoy$value, breaks = breaks) table(data_decoy$region) regions <- unique(data_decoy$region) N = nrow(data_decoy) find_lower_ci_bound <- function(x){ ci <- credible_interval(length(x), N, precision = 0.001, alpha=0.05) return(ci[1]) } find_upper_ci_bound <- function(x){ ci <- credible_interval(length(x), N, precision = 0.001, alpha=0.05) return(ci[2]) } xleft <- aggregate(value~region, data=data_decoy, FUN=min) xright <- aggregate(value~region, data=data_decoy, FUN=max) ytop <- aggregate(value~region, data=data_decoy, FUN=find_upper_ci_bound) ybottom <- aggregate(value~region, data=data_decoy, FUN=find_lower_ci_bound) xleft <- rename_column(xleft , "value", "xleft" ) xright <- rename_column(xright , "value", "xright" ) ytop <- rename_column(ytop , "value", "ytop" ) ybottom <- rename_column(ybottom, "value", "ybottom") boxes <- merge(merge(xleft, xright), merge(ytop, ybottom)) xlab <- "Precursor Mass Discrepancy (ppm)" ylab <- "Proportion of PSMs\nin subgroup" xlim=range(data_decoy$value, na.rm = TRUE) get_ylim(boxes=boxes) if (!include_text){ xlab="" ylab="" } plot(x=c(-10,10), y=c(0,1), type="n", ylim=ylim, xlim=xlim, xlab=xlab, ylab=ylab, main=main, xaxt="n", yaxt="n") with(boxes, rect(xleft=xleft, xright=xright, ytop=ytop, ybottom=ybottom, lwd=scale)) abline(h=1/breaks, col="blue", lwd=scale) }, get_ylim = function(boxes=NULL){ is_valid_range <- function(r=NULL){ return(length(r) == 2) } if (! is_valid_range(ylim)){ ylim <<- range(c(0,boxes$ytop, boxes$ybottom)) } } ) ############################################################################### # Class: Plot_Selective_Loss ############################################################################### Plot_Selective_Loss = setRefClass("Plot_Selective_Loss", contains = "Plot_Image", fields = list()) Plot_Selective_Loss$methods( initialize = function( ...){ plot_title <<- "PMD-FDR Selectively removes Bad Hits" callSuper(...) }, data_processor = function(){ return(data_processors[[1]]) }, get_n = function(){ data_processor()$i_fdr$ensure() data_subset <- data_processor()$i_fdr$df return(nrow(data_subset)) }, plot_image = function(){ # Support functions for plot_selective_loss() samples_lost_by_threshold <- function(updated_i_fdr=NULL, score_threshold=NULL){ data_subset <- subset(updated_i_fdr, PMD_FDR_input_score >= score_threshold) tbl <- with(updated_i_fdr, table(PMD_FDR_input_score >= score_threshold, new_confidence < score_threshold, group_decoy_proteins)) df <- data.frame(tbl) df_n <- aggregate(Freq~group_decoy_proteins+Var1, data=df, FUN=sum) df_n <- rename_column(df_n, name_before = "Freq", "n") df <- merge(df, df_n) df$rate_of_loss <- with(df, Freq/n) df <- subset(df, (Var1==TRUE) & (Var2==TRUE)) df <- df[,c("group_decoy_proteins", "rate_of_loss", "n", "Freq")] if (nrow(df) > 0){ df$score_threshold <- score_threshold } return(df) } get_loss_record <- function(updated_i_fdr=NULL, score_thresholds=NULL){ df=data.frame() for (score_threshold in score_thresholds){ df_new_loss <- samples_lost_by_threshold(updated_i_fdr, score_threshold) df <- rbind(df, df_new_loss) } return(df) } # Main code for plot_selective_loss() updated_i_fdr <- data_processor()$i_fdr$df updated_i_fdr$new_confidence <- with(updated_i_fdr, 100 * (1-i_fdr)) #ifelse((1-i_fdr) < (PMD_FDR_input_score / 100), (1-i_fdr), (PMD_FDR_input_score/100))) loss_record <- get_loss_record(updated_i_fdr=updated_i_fdr, score_thresholds = 1:100) xlim <- with(loss_record, range(score_threshold)) ylim <- c(0,1) xlab <- "Fixed Confidence threshold (PeptideShaker score)" ylab <- "Rate of PSM disqualification from PMD-FDR" lwd <- 4 plot(x=xlim, y=ylim, type="n", main=main, xlab=xlab, ylab=ylab) groups <- sort(unique(loss_record$group_decoy_proteins)) n_g <- length(groups) cols <- rainbow_with_fixed_intensity(n=n_g, goal_intensity_0_1 = 0.5, alpha = 1) ltys <- rep(1:6, length.out=n_g) leg <- list(leg=groups, col=cols, lty=ltys, lwd=lwd, title="Species/Category") for (i in 1:n_g){ lines(rate_of_loss~score_threshold, data=subset(loss_record, group_decoy_proteins==leg$leg[i]), col=leg$col[i], lwd=leg$lwd * scale, lty=leg$lty[i]) } abline(h=0, v=100, lwd=scale) abline(h=c(0.1, 0.8), col="gray", lwd=scale) #leg = list(leg=group, col=col, lty=lty, lwd=lwd) #with(leg, legend(x = "topleft", legend = group, col = col, lty = lty, lwd = lwd, seg.len = seg.len)) legend_object <- Legend_Object$new(leg, scale) legend_object$show() } ) ############################################################################### # Class: Plot_Selective_Loss_for_TOC ############################################################################### Plot_Selective_Loss_for_TOC = setRefClass("Plot_Selective_Loss_for_TOC", contains = "Plot_Image", fields = list(xlab="character", ylab="character", title_x="numeric", title_y="numeric", legend_border="logical", legend_x = "numeric", legend_y = "numeric", legend_title="character", legend_location = "character", name_contaminant = "character", name_decoy = "character", name_human = "character", name_pyro = "character")) Plot_Selective_Loss_for_TOC$methods( initialize = function( ...){ plot_title <<- "PMD-FDR selectively removes bad hits" callSuper(...) xlab <<- "Confidence threshold (PeptideShaker)" ylab <<- "PMD Disqualifiction Rate" legend_border <<- FALSE #legend_title <<- "Species/Category" title_x <<- 50 title_y <<- 0.9 legend_x <<- 10 legend_y <<- 0.75 name_contaminant <<- "signal - contaminant" name_decoy <<- "decoy - reversed" name_human <<- "decoy - human" name_pyro <<- "signal - pyrococcus" }, data_processor = function(){ return(data_processors[[1]]) }, get_n = function(){ data_processor()$i_fdr$ensure() data_subset <- data_processor()$i_fdr$df return(nrow(data_subset)) }, plot_image = function(){ # Support functions for plot_selective_loss() samples_lost_by_threshold <- function(updated_i_fdr=NULL, score_threshold=NULL){ data_subset <- subset(updated_i_fdr, PMD_FDR_input_score >= score_threshold) tbl <- with(updated_i_fdr, table(PMD_FDR_input_score >= score_threshold, new_confidence < score_threshold, group_decoy_proteins)) df <- data.frame(tbl) df_n <- aggregate(Freq~group_decoy_proteins+Var1, data=df, FUN=sum) df_n <- rename_column(df_n, name_before = "Freq", "n") df <- merge(df, df_n) df$rate_of_loss <- with(df, Freq/n) df <- subset(df, (Var1==TRUE) & (Var2==TRUE)) df <- df[,c("group_decoy_proteins", "rate_of_loss", "n", "Freq")] if (nrow(df) > 0){ df$score_threshold <- score_threshold } return(df) } get_loss_record <- function(updated_i_fdr=NULL, score_thresholds=NULL){ df=data.frame() for (score_threshold in score_thresholds){ df_new_loss <- samples_lost_by_threshold(updated_i_fdr, score_threshold) df <- rbind(df, df_new_loss) } return(df) } convert_groups <- function(groups=NULL){ new_groups <- groups new_groups <- gsub(pattern = "contaminant", replacement = name_contaminant, x = new_groups) new_groups <- gsub(pattern = "decoy" , replacement = name_decoy , x = new_groups) new_groups <- gsub(pattern = "human" , replacement = name_human , x = new_groups) new_groups <- gsub(pattern = "pyrococcus" , replacement = name_pyro , x = new_groups) return(new_groups) } # Main code for plot_selective_loss() updated_i_fdr <- data_processor()$i_fdr$df updated_i_fdr$new_confidence <- with(updated_i_fdr, 100 * (1-i_fdr)) #ifelse((1-i_fdr) < (PMD_FDR_input_score / 100), (1-i_fdr), (PMD_FDR_input_score/100))) loss_record <- get_loss_record(updated_i_fdr=updated_i_fdr, score_thresholds = 1:100) xlim <- with(loss_record, range(score_threshold)) ylim <- c(0,1) #xlab <- "Fixed Confidence threshold (PeptideShaker score)" #ylab <- "Rate of PSM disqualification from PMD-FDR" lwd <- 4 plot(x=xlim, y=ylim, type="n", main=main, xlab=xlab, ylab=ylab) groups <- sort(unique(loss_record$group_decoy_proteins)) n_g <- length(groups) cols <- rainbow_with_fixed_intensity(n=n_g, goal_intensity_0_1 = 0.5, alpha = 1) ltys <- rep(1:6, length.out=n_g) bty <- ifelse(legend_border, "o", "n") leg <- list(leg=convert_groups(groups), var_name=groups, col=cols, lty=ltys, lwd=lwd, bty=bty, x=legend_x, y=legend_y) #leg <- list(leg=groups, col=cols, lty=ltys, lwd=lwd, bty=bty, x=legend_x, y=legend_y) for (i in 1:n_g){ lines(rate_of_loss~score_threshold, data=subset(loss_record, group_decoy_proteins==leg$var_name[i]), col=leg$col[i], lwd=leg$lwd * scale, lty=leg$lty[i]) } abline(h=0, v=100, lwd=scale) abline(h=c(0.1, 0.8), col="gray", lwd=scale) #leg = list(leg=group, col=col, lty=lty, lwd=lwd) #with(leg, legend(x = "topleft", legend = group, col = col, lty = lty, lwd = lwd, seg.len = seg.len)) legend_object <- Legend_Object$new(leg, scale) legend_object$show() text(x=title_x, y=title_y, labels = plot_title) } ) ############################################################################### # Class: Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR ############################################################################### Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR = setRefClass("Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR", contains = "Plot_Image", fields = list()) Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR$methods( initialize = function( ...){ plot_title <<- "Precursor Mass Discrepance i-FDR for 1% Target-Decoy FDR PSMs" callSuper(...) }, data_processor = function(){ return(data_processors[[1]]) }, get_n = function(){ data_processor()$i_fdr$ensure() if (one_percent_calculation_exists()){ i_fdr <- data_processor()$i_fdr$df data_subset <- subset(i_fdr, is_in_1percent==TRUE) n <- nrow(data_subset) } else { n <- 0 } return (n) }, plot_image = function(){ if (one_percent_calculation_exists()){ i_fdr <- get_modified_fdr() report_good_discrepancies(i_fdr) data_TD_good <- get_data_TD_good(i_fdr) mean_results <- get_mean_results(data_TD_good) boxes <- mean_results boxes <- rename_columns(df = boxes, names_before = c("min_conf", "max_conf", "lower" , "upper"), names_after = c("xleft" , "xright" , "ybottom", "ytop" )) xlim <- range(boxes[,c("xleft", "xright")]) ylim <- range(boxes[,c("ybottom", "ytop")]) #head(mean_results) xlab = "Confidence Score (Peptide Shaker)" ylab = "Mean PMD i-FDR" if (!include_text){ xlab="" ylab="" } plot(mean_i_fdr~mean_conf, data=mean_results, xlim=xlim, ylim=ylim, xlab=xlab, ylab=ylab, main=main, xaxt="n", yaxt="n", cex=scale, lwd=scale) with(boxes, rect(xleft = xleft, ybottom = ybottom, xright = xright, ytop = ytop, lwd=scale)) abline(b=-1, a=100, lwd=4*scale, col="dark gray") abline(h=0, v=100, lwd=1*scale) } else { stop(sprintf("Dataset '%s' does not include 1%% FDR data", data_processor()$info$collection_name())) } }, get_mean_results = function(data_TD_good = NULL){ mean_i_fdr <- aggregate(i_fdr~conf_group, data=data_TD_good, FUN=mean) mean_i_fdr <- rename_column(mean_i_fdr, "i_fdr", "mean_i_fdr") sd_i_fdr <- aggregate(i_fdr~conf_group, data=data_TD_good, FUN=sd) sd_i_fdr <- rename_column(sd_i_fdr, "i_fdr", "sd_i_fdr") n_i_fdr <- aggregate(i_fdr~conf_group, data=data_TD_good, FUN=length) n_i_fdr <- rename_column(n_i_fdr, "i_fdr", "n") mean_conf <- aggregate(PMD_FDR_input_score~conf_group, data=data_TD_good, FUN=mean) mean_conf <- rename_column(mean_conf, "PMD_FDR_input_score", "mean_conf") min_conf <- aggregate(PMD_FDR_input_score~conf_group, data=data_TD_good, FUN=min) min_conf <- rename_column(min_conf, "PMD_FDR_input_score", "min_conf") max_conf <- aggregate(PMD_FDR_input_score~conf_group, data=data_TD_good, FUN=max) max_conf <- rename_column(max_conf, "PMD_FDR_input_score", "max_conf") mean_results <- mean_i_fdr mean_results <- merge(mean_results, sd_i_fdr) mean_results <- merge(mean_results, n_i_fdr) mean_results <- merge(mean_results, mean_conf) mean_results <- merge(mean_results, min_conf) mean_results <- merge(mean_results, max_conf) mean_results$se <- with(mean_results, sd_i_fdr / sqrt(n - 1)) mean_results$lower <- with(mean_results, mean_i_fdr - 2*se) mean_results$upper <- with(mean_results, mean_i_fdr + 2*se) return(mean_results) }, get_data_TD_good = function(i_fdr=NULL){ data_TD_good <- subset(i_fdr, TD_good==TRUE) data_TD_good <- data_TD_good[order(data_TD_good$PMD_FDR_input_score),] n <- nrow(data_TD_good) data_TD_good$conf_group <- cut(1:n, breaks=floor(n/100)) data_TD_good$i_fdr <- 100 * data_TD_good$i_fdr return(data_TD_good) }, get_modified_fdr = function(){ i_fdr <- data_processor()$i_fdr$df i_fdr$PMD_good <- i_fdr$i_fdr < 0.01 i_fdr$TD_good <- i_fdr$is_in_1percent == TRUE i_fdr$conf_good <- i_fdr$PMD_FDR_input_score == 100 return(i_fdr) }, one_percent_calculation_exists = function(){ data_processor()$raw_1_percent$ensure() return(data_processor()$raw_1_percent$exists())# "is_in_1percent" %in% colnames(data_processor()$i_fdr)) }, report_good_discrepancies = function(i_fdr=NULL){ with(subset(i_fdr, (PMD_FDR_decoy == 0)), print(table(TD_good, PMD_good))) with(subset(i_fdr, (PMD_FDR_input_score==100) & (PMD_FDR_decoy == 0)), print(table(TD_good, PMD_good))) with(subset(i_fdr, (PMD_FDR_input_score>= 99) & (PMD_FDR_input_score<100) & (PMD_FDR_decoy == 0)), print(table(TD_good, PMD_good))) with(subset(i_fdr, (PMD_FDR_input_score>= 99) & (PMD_FDR_input_score<100) & (PMD_FDR_decoy == 0)), print(table(TD_good, PMD_good))) with(subset(i_fdr, (PMD_FDR_input_score>= 90) & (PMD_FDR_input_score< 99) & (PMD_FDR_decoy == 0)), print(table(TD_good, PMD_good))) } ) ############################################################################### # Class: Plot_Density_PMD_by_Score ############################################################################### Plot_Density_PMD_by_Score = setRefClass("Plot_Density_PMD_by_Score", contains = "Plot_Image", fields = list(show_norm = "logical")) Plot_Density_PMD_by_Score$methods( initialize = function(p_show_norm=FALSE, ...){ show_norm <<- p_show_norm plot_title <<- "PMD distribution, by Confidence ranges" callSuper(...) }, data_processor = function(){ return(data_processors[[1]]) }, get_n = function(){ return(nrow(data_processor()$data_groups$df)) #data_subset <- data_collection$i_fdr #return(nrow(data_subset)) }, get_modified_data_groups = function(var_value = NULL){ # Note: Filters out used_to_find_middle # Note: Creates "value_of_interest" field # Note: Remakes "group_decoy_input_score" field data_new <- data_processor()$data_groups$df data_new <- subset(data_new, !used_to_find_middle ) data_new$value_of_interest <- data_new[, var_value] cutoff_points <- c(100, 100, 95, 80, 50, 0, 0) n <- length(cutoff_points) uppers <- cutoff_points[-n] lowers <- cutoff_points[-1] for (i in 1:(n-1)){ upper <- uppers[i] lower <- lowers[i] if (lower==upper){ idx <- with(data_new, which( (PMD_FDR_input_score == upper) & (PMD_FDR_decoy == 0))) cat_name <- sprintf("%d", upper) } else { idx <- with(data_new, which((PMD_FDR_input_score >= lower) & (PMD_FDR_input_score < upper) & (PMD_FDR_decoy == 0))) cat_name <- sprintf("%02d - %2d", lower, upper) } data_new$group_decoy_input_score[idx] <- cat_name } return(data_new) }, plot_image = function(){ # Support functions for plot_density_PMD_by_score() get_densities <- function(data_subset = NULL, var_value = NULL){ # Support functions for get_densities() # New version # Main body of get_densities() data_subset <- get_modified_data_groups(var_value=var_value) #data_subset$value_of_interest <- data_subset[,var_value] from <- min(data_subset$value_of_interest, na.rm=TRUE) to <- max(data_subset$value_of_interest, na.rm=TRUE) xlim = range(data_subset$value_of_interest, na.rm=TRUE) groups <- sort(unique(data_subset$group_decoy_input_score), decreasing = TRUE) n_groups <- length(groups) densities <- list(var_value = var_value, groups=groups) for (i in 1:n_groups){ group <- groups[i] data_group_single <- subset(data_subset, (group_decoy_input_score == group)) d_group <- with(data_group_single , density(value_of_interest, from = from, to = to, na.rm = TRUE)) d_group <- normalize_density(d_group) densities[[group]] <- d_group } return(densities) } get_xlim <- function(densities_a = NULL, densities_b = NULL){ groups <- densities_a$groups xlim <- 0 for (group in groups){ xlim <- range(xlim, densities_a[[group]]$x, densities_b[[group]]$x) } return(xlim) } get_ylim <- function(densities_a = NULL, densities_b = NULL){ groups <- densities_a$groups ylim <- 0 for (group in groups){ ylim <- range(ylim, densities_a[[group]]$y, densities_b[[group]]$y) } return(ylim) } plot_distributions <- function(densities = NULL, var_value= NULL,include_peak_dots=TRUE, xlab_modifier="", xlim=NULL, ylim=NULL, ...){ data_groups <- get_modified_data_groups(var_value=var_value) groups <- sort(unique(data_groups$group_decoy_input_score)) n_groups <- length(groups) groups_std <- setdiff(groups, c("100", "decoy", "0") ) groups_std <- sort(groups_std, decreasing = TRUE) groups_std <- c(groups_std, "0") n_std <- length(groups_std) cols <- rainbow_with_fixed_intensity(n = n_std, goal_intensity_0_1 = 0.5, alpha=0.5) leg <- list(group = c("100" , groups_std , "decoy" ), leg = c("100" , groups_std , "All Decoys" ), col = c(col2hex("black") , cols , col2hex("purple", col_alpha = 0.5)), lwd = c(4 , rep(2, n_std), 4 ), title = "Confidence Score") xlab = sprintf("Precursor Mass Discrepancy%s (ppm)", xlab_modifier) ylab = "Density" if (!include_text){ xlab="" ylab="" } plot( x=xlim, y=ylim, col=leg$col[1], lwd=leg$lwd[1] * scale, main=main, xlab=xlab, ylab=ylab, xaxt="n", yaxt="n", cex=scale, type="n")#, lty=leg.lty[1], ...) include_peak_dots = FALSE # BUGBUG: Disabling this for now. Need to move this to class parameter for (i in 1:length(leg$group)){ group <- leg$group[i] d <- densities[[group]] lines(d, col=leg$col[i], lwd=leg$lwd[i] * scale) if (include_peak_dots){ x=d$x[which.max(d$y)] y=max(d$y) points(x=c(x,x), y=c(0,y), pch=19, col=leg$col[i], cex=scale) } } abline(v=0, lwd=scale) if (include_text){ legend_object = Legend_Object$new(leg, scale) legend_object$show() } } # Main body for plot_density_PMD_by_score() data_groups <- data_processor()$data_groups$df data_subset_a <- subset(data_groups , used_to_find_middle == FALSE) data_subset_b <- subset(data_subset_a, PMD_FDR_peptide_length > 11) densities_a <- get_densities(data_subset = data_subset_a, var_value = "value") densities_b <- get_densities(data_subset = data_subset_b, var_value = "value_norm") xlim=get_xlim(densities_a, densities_b) ylim=get_ylim(densities_a, densities_b) dataset_name <- data_processor()$info$collection_name() if (show_norm){ plot_distributions(densities=densities_b, var_value = "value_norm", xlab_modifier = " - normalized", xlim=xlim, ylim=ylim) } else { plot_distributions(densities=densities_a, var_value = "value" , xlab_modifier = "" , xlim=xlim, ylim=ylim) } } ) ############################################################################### # Class: Plot_Dataset_Description ############################################################################### Plot_Dataset_Description = setRefClass("Plot_Dataset_Description", contains = "Plot_Multiple_Images", fields = list(ylim_time_invariance = "numeric")) Plot_Dataset_Description$methods( initialize = function(p_data_processors = NULL, p_include_text=TRUE, p_include_main=FALSE, p_ylim_time_invariance = c(-4,4), ...){ plot_object_r1_c1 <- Plot_Time_Invariance_Alt$new(p_data_processors=p_data_processors, p_include_text=p_include_text, p_include_main=p_include_main, p_training_class = "good_testing", p_field_of_interest = "value", p_ylim = p_ylim_time_invariance) plot_object_r1_c2 <- Plot_Time_Invariance_Alt$new(p_data_processors=p_data_processors, p_include_text=p_include_text, p_include_main=p_include_main, p_training_class = "good_testing", p_field_of_interest = "value_norm", p_ylim = p_ylim_time_invariance) plot_object_r2_c1 <- Plot_Density_PMD_by_Score$new(p_data_processors=p_data_processors, p_show_norm=FALSE, p_include_text=p_include_text, p_include_main=p_include_main) plot_object_r2_c2 <- Plot_Density_PMD_and_Norm_Decoy_by_AA_Length$new(p_data_processors=p_data_processors, p_show_norm=FALSE, p_include_text=p_include_text, p_include_main=p_include_main) plot_object_r3_c1 <- Plot_Density_PMD_by_Score$new(p_data_processors=p_data_processors, p_show_norm=TRUE, p_include_text=p_include_text, p_include_main=p_include_main) plot_object_r3_c2 <- Plot_Density_PMD_and_Norm_Decoy_by_AA_Length$new(p_data_processors=p_data_processors, p_show_norm=TRUE, p_include_text=p_include_text, p_include_main=p_include_main) callSuper(p_n_images_wide=2, p_n_images_tall=3, p_include_text=p_include_text, p_include_main=p_include_main, p_image_list = list(plot_object_r1_c1, plot_object_r1_c2, plot_object_r2_c1, plot_object_r2_c2, plot_object_r3_c1, plot_object_r3_c2), ...) } ) ############################################################################### # Class: Plots_for_Paper ############################################################################### Plots_for_Paper <- setRefClass("Plots_for_Paper", fields =list(data_processor_a = "Data_Processor", data_processor_b = "Data_Processor", data_processor_c = "Data_Processor", data_processor_d = "Data_Processor", include_text = "logical", include_main = "logical", mai = "numeric")) Plots_for_Paper$methods( initialize = function(){ data_processor_a <<- Data_Processor$new(p_info = Data_Object_Info_737_two_step$new()) data_processor_b <<- Data_Processor$new(p_info = Data_Object_Info_737_combined$new()) data_processor_c <<- Data_Processor$new(p_info = Data_Object_Pyrococcus_tr $new()) data_processor_d <<- Data_Processor$new(p_info = Data_Object_Mouse_Mutations $new()) }, create_plots_for_paper = function(include_main=TRUE, finalize=TRUE){ print_table_4_data() print_figure_2_data() plot_figure_D(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main) plot_figure_C(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main) plot_figure_B(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main) plot_figure_A(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main) plot_figure_8(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main) plot_figure_7(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main) plot_figure_6(p_scale=ifelse(finalize, 4, 1), p_include_main = include_main) plot_figure_5(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main) plot_figure_4(p_scale=ifelse(finalize, 2, 1), p_include_main = include_main) plot_figure_3(p_scale=ifelse(finalize, 4, 1), p_include_main = include_main) }, print_figure_2_data = function(){ print(create_stats_for_grouping_figure(list(data_processor_a))) }, print_table_4_data = function(){ report_ranges_of_comparisons(processors = list(data_processor_a)) report_ranges_of_comparisons(processors = list(data_processor_c)) }, plot_figure_3 = function(p_scale=NULL, p_include_main=NULL){ plot_object <- Plot_Compare_PMD_and_Norm_Density$new(p_data_processor = list(data_processor_a), p_show_norm = FALSE, p_include_text = TRUE, p_include_main = p_include_main, p_display_n_psms = FALSE) plot_object$plot_image_in_small_window(p_scale=p_scale) }, plot_figure_4 = function(p_scale=NULL, p_include_main=NULL){ plot_object <- Plot_Time_Invariance_Alt_Before_and_After$new(p_data_processors = list(data_processor_a), p_include_text=TRUE, p_include_main=p_include_main, p_ylim = c(-4,4)) plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale) }, plot_figure_5 = function(p_scale=NULL, p_include_main=NULL){ plot_object <- Plot_Density_PMD_and_Norm_Decoy_by_AA_Length$new(p_data_processors = list(data_processor_a), p_include_text=TRUE, p_include_main=p_include_main) plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale) }, plot_figure_6 = function(p_scale=NULL, p_include_main=NULL){ plot_object <- Plot_Bad_CI$new(p_data_processors = list(data_processor_a), p_include_text=TRUE, p_include_main=p_include_main) plot_object$plot_image_in_small_window(p_scale=p_scale) }, plot_figure_7 = function(p_scale=NULL, p_include_main=NULL){ plot_object <- Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR$new(p_data_processors = list(data_processor_a), p_include_text=TRUE, p_include_main=p_include_main) plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale) }, plot_figure_8 = function(p_scale=NULL, p_include_main=NULL){ plot_object <- Plot_Selective_Loss$new(p_data_processors = list(data_processor_c), p_include_text=TRUE, p_include_main=p_include_main) plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale) }, plot_figure_A = function(p_scale=NULL, p_include_main=NULL){ plot_object <- Plot_Dataset_Description$new(p_data_processors=list(data_processor_a), p_include_text=TRUE, p_include_main=p_include_main, p_ylim_time_invariance=c(-4,4) ) plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale) }, plot_figure_B = function(p_scale=NULL, p_include_main=NULL){ plot_object <- Plot_Dataset_Description$new(p_data_processors=list(data_processor_b), p_include_text=TRUE, p_include_main=p_include_main, p_ylim_time_invariance=c(-4,4) ) plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale) }, plot_figure_C = function(p_scale=NULL, p_include_main=NULL){ plot_object <- Plot_Dataset_Description$new(p_data_processors=list(data_processor_c), p_include_text=TRUE, p_include_main=p_include_main, p_ylim_time_invariance=c(-4,4) ) plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale) }, plot_figure_D = function(p_scale=NULL, p_include_main=NULL){ plot_object <- Plot_Dataset_Description$new(p_data_processors=list(data_processor_d), p_include_text=TRUE, p_include_main=p_include_main, p_ylim_time_invariance=c(-4,4) ) plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale) }, create_stats_for_grouping_figure = function(processors=NULL){ processor <- processors[[1]] processor$i_fdr$ensure() aug_i_fdr <- processor$i_fdr$df aug_i_fdr$group_good_bad_other <- gsub("_.*", "", aug_i_fdr$group_training_class) aug_i_fdr$group_null <- "all" table(aug_i_fdr$group_training_class) table(aug_i_fdr$group_good_bad_other) table(aug_i_fdr$group_null) create_agg_fdr_stats <- function(i_fdr=NULL, grouping_var_name = NULL){ formula_fdr <- as.formula(sprintf("%s~%s", "i_fdr", grouping_var_name)) formula_len <- as.formula(sprintf("%s~%s", "PMD_FDR_peptide_length", grouping_var_name)) agg_fdr <- aggregate(formula=formula_fdr, data=i_fdr, FUN=mean) agg_n <- aggregate(formula=formula_fdr, data=i_fdr, FUN=length) agg_len <- aggregate(formula=formula_len, data=i_fdr, FUN=mean) agg_fdr <- rename_columns(df = agg_fdr, names_before = c(grouping_var_name, "i_fdr"), names_after = c("group" , "fdr")) agg_n <- rename_columns(df = agg_n, names_before = c(grouping_var_name, "i_fdr"), names_after = c("group" , "n")) agg_len <- rename_columns(df = agg_len, names_before = c(grouping_var_name), names_after = c("group" )) agg <- merge(agg_fdr, agg_n) agg <- merge(agg , agg_len) return(agg) } agg_detail <- create_agg_fdr_stats(i_fdr = aug_i_fdr, grouping_var_name = "group_training_class") agg_grouped <- create_agg_fdr_stats(i_fdr = aug_i_fdr, grouping_var_name = "group_good_bad_other") agg_all <- create_agg_fdr_stats(i_fdr = aug_i_fdr, grouping_var_name = "group_null") agg <- rbind(agg_detail, agg_grouped) agg <- rbind(agg, agg_all) agg$fdr <- ifelse(agg$fdr < 1, agg$fdr, 1) linear_combo <- function(x=NULL, a0=NULL, a1=NULL){ result <- (a0 * (1-x) + a1 * x) return(result) } agg$r <- linear_combo(agg$fdr, a0=197, a1= 47) agg$g <- linear_combo(agg$fdr, a0= 90, a1= 85) agg$b <- linear_combo(agg$fdr, a0= 17, a1=151) return(agg) }, report_ranges_of_comparisons = function(processors=NULL){ report_comparison_of_Confidence_and_PMD = function (i_fdr = NULL, min_conf=NULL, max_conf=NULL, include_max=FALSE){ report_PMD_confidence_comparison_from_subset = function(data_subset=NULL, group_name=NULL){ print(group_name) print(sprintf(" Number of PSMs: %d", nrow(data_subset))) mean_confidence <- mean(data_subset$PMD_FDR_input_score) print(sprintf(" Mean Confidence Score: %3.1f", mean_confidence)) print(sprintf(" PeptideShaker g-FDR: %3.1f", 100-mean_confidence)) mean_PMD_FDR = mean(data_subset$i_fdr) print(sprintf(" PMD g-FDR: %3.1f", 100*mean_PMD_FDR)) #col <- col2hex("black", 0.2) #plot(data_subset$i_fdr, pch=".", cex=2, col=col) #abline(h=0) } if (is.null(max_conf)) { data_subset <- subset(i_fdr, PMD_FDR_input_score == min_conf) group_name <- sprintf("Group %d", min_conf) } else if (include_max){ data_subset <- subset(i_fdr, (PMD_FDR_input_score >= min_conf) & (PMD_FDR_input_score <= max_conf)) group_name <- sprintf("Group %d through %d", min_conf, max_conf) } else { data_subset <- subset(i_fdr, (PMD_FDR_input_score >= min_conf) & (PMD_FDR_input_score < max_conf)) group_name <- sprintf("Group %d to %d", min_conf, max_conf) } report_PMD_confidence_comparison_from_subset(data_subset=data_subset, group_name=group_name) } processor <- processors[[1]] processor$i_fdr$ensure() i_fdr <- processor$i_fdr$df info <- processor$info print(sprintf("PMD and Confidence comparison for -- %s", info$collection_name())) report_comparison_of_Confidence_and_PMD(i_fdr = i_fdr, min_conf=100, max_conf=NULL, include_max=TRUE) report_comparison_of_Confidence_and_PMD(i_fdr = i_fdr, min_conf= 99, max_conf=100 , include_max=FALSE) report_comparison_of_Confidence_and_PMD(i_fdr = i_fdr, min_conf= 90, max_conf= 99 , include_max=FALSE) report_comparison_of_Confidence_and_PMD(i_fdr = i_fdr, min_conf= 0, max_conf=100 , include_max=TRUE) } ) ############################################################################### # C - 021 - PMD-FDR Wrapper - functions.R # # # # Creates the necessary structure to convert the PMD-FDR code into one that # # can run as a batch file # # # ############################################################################### ############################################################################### # Class: ModuleArgParser_PMD_FDR ############################################################################### ModuleArgParser_PMD_FDR <- setRefClass("ModuleArgParser_PMD_FDR", contains = c("ArgParser"), fields =list(args = "character") ) ModuleArgParser_PMD_FDR$methods( initialize = function(description = "Computes individual and global FDR using Precursor Mass Discrepancy (PMD-FDR)", ...){ callSuper(description=description, ...) local_add_argument("--psm_report" , help="full name and path to the PSM report") local_add_argument("--psm_report_1_percent", default = "" , help="full name and path to the PSM report for 1% FDR") local_add_argument("--output_i_fdr" , default = "" , help="full name and path to the i-FDR output file ") local_add_argument("--output_g_fdr" , default = "" , help="full name and path to the g-FDR output file ") local_add_argument("--output_densities" , default = "" , help="full name and path to the densities output file ") #local_add_argument("--score_field_name" , default = "" , help="name of score field (in R format)") local_add_argument("--input_file_type" , default = "PMD_FDR_input_file", help="type of input file (currently supports: PSM_Report)") } ) ############################################################################### # Class: Data_Object_Parser ############################################################################### Data_Object_Parser <- setRefClass("Data_Object_Parser", contains = c("Data_Object"), fields =list(parser = "ModuleArgParser_PMD_FDR", args = "character", parsing_results = "list") ) Data_Object_Parser$methods( initialize = function(){ callSuper() class_name <<- "Data_Object_Parser" }, verify = function(){ # Nothing to do here - parser handles verification during load }, m_load_data = function(){ if (length(args) == 0){ parsing_results <<- parser$parse_arguments(NULL) } else { parsing_results <<- parser$parse_arguments(args) } }, set_args = function(p_args=NULL){ # This is primarily used for testing. In operation arguments will be passed automatically (through use of commandArgs) args <<- p_args set_dirty(TRUE) } ) ############################################################################### # Class: Data_Object_Info_Parser ############################################################################### Data_Object_Info_Parser <- setRefClass("Data_Object_Info_Parser", contains = c("Data_Object_Info"), fields =list( output_i_fdr = "character", output_g_fdr = "character", output_densities = "character" ) ) Data_Object_Info_Parser$methods( initialize = function(){ callSuper() class_name <<- "Data_Object_Info_Parser" }, verify = function(){ check_field_exists = function(field_name=NULL, check_empty = TRUE){ field_value <- get_parser()$parsing_results[field_name] checkTrue(! is.null(field_value), msg = sprintf("Parameter %s was not passed to PMD_FDR", field_value)) if (check_empty){ checkTrue(! is.null(field_value), msg = sprintf("Parameter %s was not passed to PMD_FDR", field_value)) } } # Check parameters passed in check_field_exists("junk") check_field_exists("psm_report") check_field_exists("psm_report_1_percent", check_empty = FALSE) check_field_exists("output_i_fdr" , check_empty = FALSE) check_field_exists("output_g_fdr" , check_empty = FALSE) check_field_exists("output_densities" , check_empty = FALSE) #check_field_exists("score_field_name") check_field_exists("input_file_type") }, m_load_data = function(){ parsing_results <- get_parser()$parsing_results data_file_name <<- as.character(parsing_results["psm_report"]) data_file_name_1_percent_FDR <<- as.character(parsing_results["psm_report_1_percent"]) data_path_name <<- as.character(parsing_results[""]) #experiment_name <<- data_file_name #designation <<- "" output_i_fdr <<- as.character(parsing_results["output_i_fdr"]) output_g_fdr <<- as.character(parsing_results["output_g_fdr"]) output_densities <<- as.character(parsing_results["output_densities"]) input_file_type <<- as.character(parsing_results["input_file_type"]) #score_field_name <<- as.character(parsing_results["score_field_name"]) }, set_parser = function(parser){ parents[["parser"]] <<- parser }, get_parser = function(){ return(verified_element_of_list(parents, "parser", "Data_Object_Info_Parser$parents")) }, file_path = function(){ result <- data_file_name # Now assumes that full path is provided if (length(result) == 0){ stop("Unable to validate file path - file name is missing") } return(result) }, file_path_1_percent_FDR = function(){ local_file_name <- get_data_file_name_1_percent_FDR() if (length(local_file_name) == 0){ result <- "" } else { result <- local_file_name # path name is no longer relevant } # Continue even if file name is missing - not all analyses have a 1 percent FDR file; this is managed downstream # if (length(result) == 0){ # stop("Unable to validate file path - one or both of path name and file name (of 1 percent FDR file) are missing") # } return(result) }, get_data_file_name_1_percent_FDR = function(){ return(data_file_name_1_percent_FDR) }, collection_name = function(){ result <- "" return(result) } ) ############################################################################### # Class: Processor_PMD_FDR_for_Galaxy # Purpose: Wrapper on tools from Project 019 to enable a Galaxy-based interface ############################################################################### Processor_PMD_FDR_for_Galaxy <- setRefClass("Processor_PMD_FDR_for_Galaxy", fields = list( parser = "Data_Object_Parser", info = "Data_Object_Info_Parser", raw_data = "Data_Object_Raw_Data", raw_1_percent = "Data_Object_Raw_1_Percent", data_converter = "Data_Object_Data_Converter", data_groups = "Data_Object_Groupings", densities = "Data_Object_Densities", alpha = "Data_Object_Alpha", i_fdr = "Data_Object_Individual_FDR" )) Processor_PMD_FDR_for_Galaxy$methods( initialize = function(){ # This initialization defines all of the dependencies between the various components # (Unfortunately, inheriting from Data_Processor leads to issues - I had to reimplement it here with a change to "info") # info info$set_parser(parser) parser$append_child(info) # raw_data raw_data$set_info(info) info$append_child(raw_data) # raw_1_percent raw_1_percent$set_info(info) info$append_child(raw_1_percent) # data_converter data_converter$set_info (info) data_converter$set_raw_data(raw_data) info $append_child (data_converter) raw_data $append_child (data_converter) # data_groups data_groups$set_info (info) data_groups$set_data_converter(data_converter) data_groups$set_raw_1_percent (raw_1_percent) info $append_child (data_groups) data_converter$append_child (data_groups) raw_1_percent $append_child (data_groups) # densities densities $set_data_groups(data_groups) data_groups$append_child (densities) # alpha alpha $set_densities(densities) densities$append_child (alpha) # i_fdr i_fdr$set_data_groups(data_groups) i_fdr$set_densities (densities) i_fdr$set_alpha (alpha) data_groups $append_child(i_fdr) densities $append_child(i_fdr) alpha $append_child(i_fdr) }, compute = function(){ #i_fdr is currently the lowest level object - it ultimately depends on everything else. i_fdr$ensure() # All pieces on which i_fdr depends are automatically verified and computed (through their verify() and ensure()) save_standard_df(x = densities$df, file_path = info$output_densities) save_standard_df(x = alpha$df, file_path = info$output_g_fdr) save_standard_df(x = i_fdr$df, file_path = info$output_i_fdr) } ) ############################################################################### # D - 021 - PMD-FDR Main.R # # # # File Description: Contains the base code that interprets the parameters # # and computes i-FDR and g-FDR for a mass spec project # # # ############################################################################### argv <- commandArgs(TRUE) # Saves the parameters (command code) processor <- Processor_PMD_FDR_for_Galaxy$new() processor$parser$set_args(argv) processor$compute()