pmd_fdr: PMD_FDR_package_for_Galaxy.R comparison

comparison PMD_FDR_package_for_Galaxy.R @ 0:5cc0c32d05a2 draft

"planemo upload for repository https://github.com/galaxyproteomics/tools-galaxyp/tree/master/tools/pmd_fdr commit 00f85eca73cd8afedfefbeec94a4462455ac1a9a"

author	galaxyp
date	Mon, 07 Oct 2019 11:59:37 -0400
parents
children	460edeedeb7d

comparison

equal deleted inserted replaced

--1:000000000000
+:5cc0c32d05a2
+###############################################################################
+# PMD_FDR_package_for_Galaxy.R                                                #
+#                                                                             #
+# Project 021 - PMD-FDR for Galaxy-P                                          #
+#                                                                             #
+# Description: Computes iFDR and gFDR on PSMs as a script designed for Galaxy #
+#              Note that plotting code has been left in that is not used      #
+#              in this file; this is the code I used to create figures for    #
+#              publication. I left it in for potential development of views.  #
+#                                                                             #
+#              This file was created by concatenating the following files:    #
+#                                                                             #
+#                   A - 005 - Parser - ArgParser.R                            #
+#                   B - 019 - PMD-FDR - functions.R                           #
+#                   C - 021 - PMD-FDR Wrapper - functions.R                   #
+#                   D - 021 - PMD-FDR Main.R                                  #
+#                                                                             #
+# Required packages: argparser                                                #
+#                    stringr                                                  #
+#                    RUnit                                                    #
+#                                                                             #
+# Release date: 2019-10-05                                                    #
+#      Version: 1.4                                                           #
+#                                                                             #
+###############################################################################
+# Package currently supports the following parameters:
+#
+# --psm_report            full name and path to the PSM report
+# --psm_report_1_percent  full name and path to the PSM report for 1% FDR
+# --output_i_fdr          full name and path to the i-FDR output file
+# --output_g_fdr          full name and path to the g-FDR output file
+# --output_densities      full name and path to the densities output file
+#
+###############################################################################
+# A - 005 - Parser - ArgParser.R                                              #
+#                                                                             #
+# Description: Wrapper for argparser package, using RefClass                  #
+#                                                                             #
+###############################################################################
+#install.packages("argparser")
+library(argparser)
+# Class definition
+# ArgParser: thin RefClass wrapper around the argparser package.
+# The single field `parser` holds the argparser object; every method forwards
+# its arguments unchanged to the matching argparser function.
+ArgParser <- setRefClass(
+  "ArgParser",
+  fields = c("parser")
+)
+ArgParser$methods(
+  # Build the underlying parser; `...` is passed straight to arg_parser().
+  initialize = function(...) {
+    parser <<- arg_parser(...)
+  },
+  # Register one more argument; `...` is passed straight to add_argument().
+  local_add_argument = function(...) {
+    parser <<- add_argument(parser, ...)
+  },
+  # Parse and return the argument values; `...` is passed to parse_args().
+  parse_arguments = function(...) {
+    parsed <- parse_args(parser, ...)
+    parsed
+  }
+)
+###############################################################################
+# B - 019 - PMD-FDR - functions.R                                             #
+#                                                                             #
+# Primary work-horse for PMD-FDR                                              #
+#                                                                             #
+###############################################################################
+###############################################################################
+####### Load libraries etc.
+###############################################################################
+library(stringr)
+library(RUnit)
+#############################################################
+####### Global values (should be parameters to module but aren't yet)
+#############################################################
+MIN_GOOD_PEPTIDE_LENGTH          <- 11 # NOTE(review): presumably the shortest peptide length treated as "good"; its use is not visible in this section - confirm
+MIN_ACCEPTABLE_POINTS_IN_DENSITY <- 10 # NOTE(review): presumably the fewest points accepted when estimating a density; its use is not visible in this section - confirm
+#############################################################
+####### General purpose functions
+#############################################################
+# Creates a more useful error report when file is not reasonable
+# Tests whether `file_path` names an existing regular file.
+#   file_path: path to test.
+# Returns the logical result of file_test("-f", ...); a directory gives FALSE.
+# Raises an error naming the offending path when file_test() itself fails
+# (for example when the argument is not a valid file name).
+safe_file_exists <- function(file_path) {
+  tryCatch(
+    file_test(op = "-f", x = file_path),
+    error = function(e) {
+      # The handler has to signal the error: merely constructing a condition
+      # object would hand that object back to the caller as the "result".
+      # format()/paste() keep the message non-empty even for NULL input.
+      shown_path <- paste(format(file_path), collapse = ", ")
+      stop(sprintf("file path is not valid: '%s'", shown_path), call. = FALSE)
+    }
+  )
+}
+# My standard way of loading data into data.frames
+# Loads a tab-delimited text file whose first line holds the column names.
+#   file_path: path of the file to read.
+# Returns a data.frame whose column names are the raw header fields (blank
+# header fields become "<Field i>"); an empty file gives an empty data.frame.
+# Stops with "File path does not exist" when the path is not a regular file.
+load_standard_df <- function(file_path = NULL) {
+  # Replace blank header entries with a positional placeholder name.
+  clean_field_names <- function(field_names) {
+    result <- field_names
+    idx_blank <- which(result == "")
+    result[idx_blank] <- sprintf("<Field %d>", idx_blank)
+    return(result)
+  }
+  if (!safe_file_exists(file_path)) {
+    stop(sprintf("File path does not exist: '%s'", file_path))
+  }
+  field_names <- clean_field_names(read_field_names(file_path, sep = "\t"))
+  if (length(field_names) == 0) {
+    return(data.frame())
+  }
+  # quote = "" and comment.char = "" switch off quote and comment processing:
+  # with the read.table defaults an apostrophe, a double quote or a '#' inside
+  # a field can swallow or truncate rows. The header is split on raw tabs with
+  # no quote handling, so the data rows are now parsed the same way.
+  data <- read.table(file = file_path, header = TRUE, sep = "\t", quote = "",
+                     comment.char = "", stringsAsFactors = FALSE,
+                     blank.lines.skip = TRUE)
+  colnames(data) <- field_names
+  return(data)
+}
+# Writes `x` to `file_path` as a tab-delimited table with a header row,
+# without quoting and without row names. An empty `file_path` ("") writes
+# nothing.
+save_standard_df <- function(x = NULL, file_path = NULL) {
+  if (file_path == "") {
+    return(invisible(NULL))
+  }
+  write.table(x = x, file = file_path, quote = FALSE, sep = "\t",
+              row.names = FALSE, col.names = TRUE)
+}
+# Renames one column of a data.frame.
+#   df:               data.frame to modify (an error is raised when NULL).
+#   name_before:      current column name.
+#   name_after:       new column name.
+#   suppressWarnings: when TRUE, a missing `name_before` is silently ignored.
+# Returns the modified data.frame. The renamed column is moved to the last
+# position (or overwrites an existing column called `name_after`).
+rename_column <- function(df = NULL, name_before = NULL, name_after = NULL, suppressWarnings = FALSE) {
+  if (is.null(df)) {
+    stop("Dataframe (df) does not exist - unable to rename column")
+  }
+  if (name_before %in% colnames(df)) {
+    # Copy-then-delete would drop the column entirely when both names are the
+    # same, so renaming a column to itself is treated as a no-op.
+    if (!identical(name_before, name_after)) {
+      df[, name_after]  <- df[, name_before]
+      df[, name_before] <- NULL
+    }
+  } else if (!suppressWarnings) {
+    warning(sprintf("'%s' is not a field in the data frame and so has not been renamed", name_before))
+  }
+  return(df)
+}
+# Applies rename_column() pairwise: names_before[i] becomes names_after[i],
+# processed in order. Returns the data.frame after all renames.
+rename_columns <- function(df = NULL, names_before = NULL, names_after = NULL) {
+  renamed <- df
+  for (position in safe_iterator(length(names_before))) {
+    renamed <- rename_column(renamed, names_before[position], names_after[position])
+  }
+  renamed
+}
+# Rounds `x` to the nearest multiple of `tolerance` by delegating to
+# function_to_tolerance() with FUN = round; `...` is forwarded to round().
+round_to_tolerance <- function(x = NULL, tolerance = NULL, ...) {
+  function_to_tolerance(x = x, tolerance = tolerance, FUN = round, ...)
+}
+# Applies `FUN` on the scale of `tolerance`: x is divided by the tolerance,
+# passed to FUN (together with `...`), and the result is scaled back.
+function_to_tolerance <- function(x = NULL, tolerance = NULL, FUN = NULL, ...) {
+  in_tolerance_units <- x / tolerance
+  FUN(in_tolerance_units, ...) * tolerance
+}
+# Median of `x` with missing values removed.
+safe_median <- function(x) {
+  median(x, na.rm = TRUE)
+}
+# Normalizes the y-values of a density-like object so that the area under the
+# curve is 1, approximating the area with rectangles of equal width.
+#   d: object with numeric elements `x` and `y` (e.g. the result of density()).
+# Returns a copy of `d` with rescaled `y`. When the approximated area is zero
+# or not finite the curve cannot be rescaled and `d` is returned unchanged.
+normalize_density <- function(d) {
+  delta_x               <- diff(range(d$x)) / length(d$x)
+  unnormalized_integral <- delta_x * sum(d$y)
+  new_d <- d
+  # The division below is the actual normalization step; without it the
+  # integral computed above is never used and the function is a no-op.
+  if (is.finite(unnormalized_integral) && unnormalized_integral > 0) {
+    new_d$y <- d$y / unnormalized_integral
+  }
+  return(new_d)
+}
+# Returns `null_result` when `cond` is NULL and `not_null_result` otherwise.
+# Only the selected result is evaluated.
+if_null <- function(cond = NULL, null_result = NULL, not_null_result = NULL) {
+  if (is.null(cond)) {
+    null_result
+  } else {
+    not_null_result
+  }
+}
+# Builds an n-colour rainbow palette in which every colour is blended with
+# white or black until its weighted intensity
+# (0.2989*red + 0.5870*green + 0.1140*blue) equals the requested level.
+#   n:                  number of colours.
+#   goal_intensity_0_1: target intensity on a 0-1 scale.
+#   alpha:              passed unchanged to rgb().
+# Returns the colour strings produced by rgb().
+rainbow_with_fixed_intensity <- function(n = NULL, goal_intensity_0_1 = NULL, alpha = NULL) {
+  target <- 255 * goal_intensity_0_1
+  palette <- data.frame(t(col2rgb(rainbow(n))))
+  palette$intensity <- 0.2989 * palette$red + 0.5870 * palette$green + 0.1140 * palette$blue
+  # Blend towards white (255) when the colour is too dark, black (0) otherwise.
+  palette$white_black <- ifelse(palette$intensity < target, 255, 0)
+  palette$mix_level   <- (palette$white_black - target) / (palette$white_black - palette$intensity)
+  blend <- function(channel) {
+    palette$mix_level * channel + (1 - palette$mix_level) * palette$white_black
+  }
+  palette$new_red   <- blend(palette$red)
+  palette$new_green <- blend(palette$green)
+  palette$new_blue  <- blend(palette$blue)
+  blended_names <- c("new_red", "new_green", "new_blue")
+  plain_names   <- c("red", "green", "blue")
+  palette <- palette[, blended_names]
+  palette <- rename_columns(palette, names_before = blended_names, names_after = plain_names)
+  rgb(as.matrix(palette / 255), alpha = alpha)
+}
+# Returns 1:n_steps for use in a for loop, or an empty numeric vector when
+# n_steps is below 1 (where 1:n_steps would count downwards).
+safe_iterator <- function(n_steps = NULL) {
+  if (n_steps < 1) numeric(0) else 1:n_steps
+}
+# Converts colour specifications to "#RRGGBBAA" strings.
+#   cols:      anything accepted by col2rgb().
+#   col_alpha: alpha on a 0-255 scale; when every value is <= 1 it is read as
+#              a 0-1 fraction and rescaled to 0-255.
+col2hex <- function(cols = NULL, col_alpha = 255) {
+  alpha_255 <- if (all(col_alpha <= 1)) round(col_alpha * 255) else col_alpha
+  rgb(t(col2rgb(cols)), alpha = alpha_255, maxColorValue = 255)
+}
+# Approximates a highest-posterior-density interval for a binomial proportion.
+# The binomial likelihood of `x` successes in `N` trials is evaluated on a grid
+# of probabilities spaced `precision` apart and rescaled to sum to 1; grid
+# points are then taken in decreasing order of mass until 1 - alpha is covered.
+# Returns c(lower, upper): the smallest and largest grid point taken.
+credible_interval <- function(x = NULL, N = NULL, precision = 0.001, alpha = 0.05) {
+  grid <- seq(from = 0, to = 1, by = precision)
+  mass <- dbinom(x = x, size = N, prob = grid)
+  mass <- mass / sum(mass)
+  by_mass      <- order(-mass)
+  running_mass <- cumsum(mass[by_mass])
+  # One point beyond those strictly below the target, capped at the grid size.
+  n_kept <- min(sum(running_mass < (1 - alpha)) + 1, length(grid))
+  kept   <- grid[by_mass[1:n_kept]]
+  c(min(kept), max(kept))
+}
+# Fetches parent_list[[element_name]], stopping with a descriptive error when
+# that element is NULL. `object_name` only labels the list in the error
+# message and defaults to "the list".
+verified_element_of_list <- function(parent_list = NULL, element_name = NULL, object_name = NULL) {
+  element <- parent_list[[element_name]]
+  if (!is.null(element)) {
+    return(element)
+  }
+  described_as <- if (is.null(object_name)) "the list" else object_name
+  stop(sprintf("Element '%s' does not yet exist in %s", element_name, described_as))
+}
+# Reads the first line of a text file and splits it into field names.
+#   file_path: path of the file to read.
+#   sep:       field separator, matched literally (not as a regular
+#              expression), in line with how read.table() treats `sep`.
+# Returns a character vector of field names, or NULL when the file has no
+# lines at all.
+read_field_names <- function(file_path = NULL, sep = "\t") {
+  con <- file(file_path, "r")
+  # Close the connection even when readLines() fails.
+  on.exit(close(con), add = TRUE)
+  fields <- readLines(con, n = 1)
+  if (length(fields) == 0) {
+    return(c())
+  }
+  # fixed = TRUE keeps separators such as "|" or "." from acting as regex.
+  fields <- strsplit(x = fields, split = sep, fixed = TRUE)[[1]]
+  return(fields)
+}
+# Asserts (via RUnit's checkTrue) that `field_name` is a column of `input_df`.
+# `name_of_input_df` only labels the data frame in the failure message, which
+# also lists the columns that are present.
+check_field_name <- function(input_df = NULL, name_of_input_df = NULL, field_name = NULL) {
+  available <- colnames(input_df)
+  failure_note <- sprintf("Expected fieldname '%s' in %s (but did not find it among %s)",
+                          field_name, name_of_input_df, paste0(available, collapse = ", "))
+  checkTrue(field_name %in% available, msg = failure_note)
+}
+#############################################################
+####### Classes for Data
+#############################################################
+###############################################################################
+#            Class: Data_Object
+###############################################################################
+# Data_Object: abstract base class for lazily (re)loaded data nodes.
+# Each node tracks a dirty flag plus lists of parent and child nodes.
+# Subclasses must provide verify() and m_load_data() and set class_name.
+Data_Object <- setRefClass(
+  "Data_Object",
+  fields = list(
+    m_is_dirty = "logical",
+    parents    = "list",
+    children   = "list",
+    class_name = "character"
+  )
+)
+Data_Object$methods(
+  # A new node starts out dirty, i.e. not yet loaded.
+  initialize = function() {
+    m_is_dirty <<- TRUE
+    class_name <<- "Data_Object <abstract class - class_name needs to be set in subclass>"
+  },
+  # Full load: ensure parents, verify, load own data, then mark the node clean.
+  load_data = function() {
+    ensure_parents()
+    verify()
+    m_load_data()
+    set_dirty(new_value = FALSE)
+  },
+  # Load only when the node is dirty.
+  ensure = function() {
+    if (!m_is_dirty) {
+      return(invisible(NULL))
+    }
+    load_data()
+  },
+  # Any change of the flag, in either direction, marks all children dirty.
+  set_dirty = function(new_value) {
+    if (new_value == m_is_dirty) {
+      return(invisible(NULL))
+    }
+    m_is_dirty <<- new_value
+    set_children_dirty()
+  },
+  # Abstract: subclasses check their preconditions here.
+  verify = function() {
+    stop(sprintf("verify() is an abstract method - define it in %s before calling load_data()", class_name))
+  },
+  # Abstract: subclasses do the actual loading here.
+  m_load_data = function() {
+    stop(sprintf("m_load_data() is an abstract method - define it in %s before calling load_data()", class_name))
+  },
+  append_parent = function(parent = NULL) {
+    parents <<- append(parents, parent)
+  },
+  append_child = function(child = NULL) {
+    children <<- append(children, child)
+  },
+  # Ask every parent to load itself if needed.
+  ensure_parents = function() {
+    for (upstream in parents) {
+      upstream$ensure()
+    }
+  },
+  set_children_dirty = function() {
+    for (downstream in children) {
+      downstream$set_dirty(TRUE)
+    }
+  }
+)
+###############################################################################
+#            Class: Data_Object_Info
+###############################################################################
+# Data_Object_Info: describes one input data set (file names, directory,
+# experiment name, designation and input file type). It loads nothing itself;
+# subclasses fill in the fields.
+Data_Object_Info <- setRefClass(
+  "Data_Object_Info",
+  contains = "Data_Object",
+  fields = list(
+    data_file_name_1_percent_FDR = "character",
+    data_file_name               = "character",
+    data_path_name               = "character",
+    experiment_name              = "character",
+    designation                  = "character",
+    input_file_type              = "character"
+  )
+)
+Data_Object_Info$methods(
+  initialize = function() {
+    callSuper()
+    class_name <<- "Data_Object_Info - <Abstract class - class_name needs to be set in subclass>"
+  },
+  # Every required field must hold exactly one non-empty string.
+  # (data_file_name_1_percent_FDR is not checked here.)
+  verify = function() {
+    checkFieldExists <- function(field_name = NULL) {
+      field_value <- .self[[field_name]]
+      checkTrue(length(field_value) > 0,
+                sprintf("Field %s$%s has not been set (and should have been)", class_name, field_name))
+      checkTrue(length(field_value) == 1,
+                sprintf("Field %s$%s has been set to multiple values (and should be a single value)", class_name, field_name))
+      checkTrue(field_value != "",
+                sprintf("Field %s$%s has been set to an empty string (and should not have been)", class_name, field_name))
+    }
+    required_fields <- c("data_file_name", "data_path_name", "experiment_name",
+                         "designation", "input_file_type")
+    for (required_field in required_fields) {
+      checkFieldExists(required_field)
+    }
+    invisible(TRUE)
+  },
+  m_load_data = function() {
+    # Nothing to do - this is really a data class
+  },
+  # Full path of the main data file; fails when path or name is unset.
+  file_path = function() {
+    full_path <- file.path(data_path_name, data_file_name)
+    if (length(full_path) == 0) {
+      stop("Unable to validate file path - one or both of path name and file name are missing")
+    }
+    full_path
+  },
+  # Full path of the optional 1% FDR file, or "" when no file name is set.
+  # A missing name is not an error: not every analysis has such a file.
+  file_path_1_percent_FDR = function() {
+    local_file_name <- get_data_file_name_1_percent_FDR()
+    if (length(local_file_name) == 0) {
+      return("")
+    }
+    file.path(data_path_name, local_file_name)
+  },
+  get_data_file_name_1_percent_FDR = function() {
+    data_file_name_1_percent_FDR
+  },
+  # "<experiment_name>_<designation>"
+  collection_name = function() {
+    sprintf("%s_%s", experiment_name, designation)
+  }
+)
+###############################################################################
+#            Class: Data_Object_Info_737_two_step
+###############################################################################
+# Data_Object_Info_737_two_step: fixed description of the Oral_737_NS
+# "two_step" PeptideShaker reports stored under ./Data.
+Data_Object_Info_737_two_step <- setRefClass(
+  "Data_Object_Info_737_two_step",
+  contains = "Data_Object_Info",
+  fields = list()
+)
+Data_Object_Info_737_two_step$methods(
+  initialize = function() {
+    callSuper()
+    class_name                   <<- "Data_Object_Info_737_two_step"
+    experiment_name              <<- "Oral_737_NS"
+    designation                  <<- "two_step"
+    input_file_type              <<- "PSM_Report"
+    data_path_name               <<- file.path(".", "Data")
+    data_file_name               <<- "737_NS_Peptide_Shaker_Extended_PSM_Report_Multi_Stage_Two_Step.tabular.tabular"
+    data_file_name_1_percent_FDR <<- "737_NS_Peptide_Shaker_PSM_Report_Multi_Stage_Two_Step.tabular"
+  }
+)
+###############################################################################
+#            Class: Data_Object_Info_737_combined
+###############################################################################
+# Data_Object_Info_737_combined: fixed description of the Oral_737_NS
+# combined-database PeptideShaker reports stored under ./Data.
+Data_Object_Info_737_combined <- setRefClass(
+  "Data_Object_Info_737_combined",
+  contains = "Data_Object_Info",
+  fields = list()
+)
+Data_Object_Info_737_combined$methods(
+  initialize = function() {
+    callSuper()
+    class_name                   <<- "Data_Object_Info_737_combined"
+    data_file_name_1_percent_FDR <<- "737_NS_Peptide_Shaker_PSM_Report_CombinedDB.tabular"
+    data_file_name               <<- "737_NS_Peptide_Shaker_Extended_PSM_Report_CombinedDB.tabular"
+    data_path_name               <<- file.path(".", "Data")
+    experiment_name              <<- "Oral_737_NS"
+    # Was "two_step" (copied from the two-step class), which gave both data
+    # sets the same collection_name().
+    designation                  <<- "combined"
+    input_file_type              <<- "PSM_Report"
+  }
+)
+###############################################################################
+#            Class: Data_Object_Pyrococcus_tr
+###############################################################################
+# Data_Object_Pyrococcus_tr: fixed description of the Pyrococcus "tr" extended
+# PSM report stored under ./Data. No 1% FDR file is configured for it.
+Data_Object_Pyrococcus_tr <- setRefClass(
+  "Data_Object_Pyrococcus_tr",
+  contains = "Data_Object_Info",
+  fields = list()
+)
+Data_Object_Pyrococcus_tr$methods(
+  initialize = function() {
+    callSuper()
+    class_name                   <<- "Data_Object_Pyrococcus_tr"
+    experiment_name              <<- "Pyrococcus"
+    designation                  <<- "tr"
+    input_file_type              <<- "PSM_Report"
+    data_path_name               <<- file.path(".", "Data")
+    data_file_name               <<- "Pfu_traditional_Extended_PSM_Report.tabular"
+    data_file_name_1_percent_FDR <<- ""
+  }
+)
+###############################################################################
+#            Class: Data_Object_Mouse_Mutations
+###############################################################################
+# Data_Object_Mouse_Mutations: fixed description of the "Mouse Mutations"
+# combined_05 PSM report stored under ./Data. No 1% FDR file is configured.
+Data_Object_Mouse_Mutations <- setRefClass(
+  "Data_Object_Mouse_Mutations",
+  contains = "Data_Object_Info",
+  fields = list()
+)
+Data_Object_Mouse_Mutations$methods(
+  initialize = function() {
+    callSuper()
+    class_name                   <<- "Data_Object_Mouse_Mutations"
+    experiment_name              <<- "Mouse Mutations"
+    designation                  <<- "combined_05"
+    input_file_type              <<- "PSM_Report"
+    data_path_name               <<- file.path(".", "Data")
+    data_file_name               <<- "Combined_DB_Mouse_5PTM.tabular"
+    data_file_name_1_percent_FDR <<- ""
+  }
+)
+###############################################################################
+#            Class: Data_Object_Raw_Data
+###############################################################################
+# Data_Object_Raw_Data: loads the main input file named by its "info" parent
+# into the data.frame field `df`.
+Data_Object_Raw_Data <- setRefClass(
+  "Data_Object_Raw_Data",
+  contains = "Data_Object",
+  fields = list(df = "data.frame")
+)
+Data_Object_Raw_Data$methods(
+  initialize = function() {
+    callSuper()
+    class_name <<- "Data_Object_Raw_Data"
+  },
+  # Only the existence of the file is checked.
+  # BUGBUG: Needs to also check the following:
+  #         - file is tab-delimited
+  #         - first row is a list of column names
+  verify = function() {
+    raw_path <- get_info()$file_path()
+    if (safe_file_exists(raw_path)) {
+      return(invisible(NULL))
+    }
+    stop(sprintf("Raw data file does not exist (%s)", raw_path))
+  },
+  # The info object is stored as the parent named "info".
+  set_info = function(info) {
+    parents[["info"]] <<- info
+  },
+  get_info = function() {
+    verified_element_of_list(parents, "info", "Data_Object_Raw_Data$parents")
+  },
+  m_load_data = function() {
+    source_info <- get_info()
+    df <<- load_standard_df(source_info$file_path())
+  }
+)
+###############################################################################
+#            Class: Data_Object_Raw_1_Percent
+###############################################################################
+# Data_Object_Raw_1_Percent: loads the optional 1% FDR file named by its
+# "info" parent into `df`. A missing file name or missing file is a valid
+# state: nothing is loaded and the node still becomes clean.
+Data_Object_Raw_1_Percent <- setRefClass(
+  "Data_Object_Raw_1_Percent",
+  contains = "Data_Object",
+  fields = list(df = "data.frame")
+)
+Data_Object_Raw_1_Percent$methods(
+  initialize = function() {
+    callSuper()
+    class_name <<- "Data_Object_Raw_1_Percent"
+  },
+  # The info object is stored as the parent named "info".
+  set_info = function(info) {
+    parents[["info"]] <<- info
+  },
+  verify = function() {
+    # Do nothing - a missing file name is acceptable for this module and is dealt with in load()
+  },
+  get_info = function() {
+    verified_element_of_list(parents, "info", "Data_Object_Raw_1_Percent$parents")
+  },
+  # BUGBUG: because failing to load still leaves the node clean, a good file
+  # that appears later is not picked up.
+  m_load_data = function() {
+    source_info <- get_info()
+    if (exists()) {
+      df <<- load_standard_df(source_info$file_path_1_percent_FDR())
+    }
+  },
+  # TRUE only when a non-empty file name is configured AND the file is present.
+  # The file name (not the path) is inspected first.
+  exists = function() {
+    source_info     <- get_info()
+    local_file_name <- source_info$get_data_file_name_1_percent_FDR()
+    if (length(local_file_name) == 0) { # variable not set
+      return(FALSE)
+    }
+    if (local_file_name == "") {        # variable set to empty string
+      return(FALSE)
+    }
+    safe_file_exists(source_info$file_path_1_percent_FDR())
+  }
+)
+###############################################################################
+#            Class: Data_Converter
+###############################################################################
+# Data_Converter: abstract base class for objects that validate a raw input
+# table and convert it to the standard PMD_FDR_* columns.
+# Subclasses set class_name and file_type and override both methods.
+Data_Converter <- setRefClass(
+  "Data_Converter",
+  fields = list(
+    class_name = "character",
+    file_type  = "character"
+  )
+)
+Data_Converter$methods(
+  initialize = function() {
+    class_name <<- "Data_Converter <abstract class - class_name needs to be set in subclass>"
+    file_type  <<- "file_type has not been set before being used <needs to be set in initialize() of subclass>"
+  },
+  # Abstract: stop unless the raw data has every column the converter needs.
+  check_raw_fields = function(info = NULL, raw_data = NULL) {
+    stop(sprintf("check_raw_fields() is an abstract method - define it in %s before calling Data_Object_Data_Converter$load_data()", class_name))
+  },
+  # Abstract: return the converted data.frame. The parameters mirror the
+  # subclass overrides and the info=/raw_data= call style, so a call reports
+  # the abstract-method message rather than "unused arguments".
+  convert_data = function(info = NULL, raw_data = NULL) {
+    stop(sprintf("convert_data() is an abstract method - define it in %s before calling Data_Object_Data_Converter$load_data()", class_name))
+  }
+)
+###############################################################################
+#            Class: Data_Converter_PMD_FDR_input_file
+###############################################################################
+# Data_Converter_PMD_FDR_input_file: converter for input that already carries
+# the standard PMD_FDR_* columns; conversion is a pass-through.
+Data_Converter_PMD_FDR_input_file <- setRefClass(
+  "Data_Converter_PMD_FDR_input_file",
+  contains = "Data_Converter",
+  fields = list()
+)
+Data_Converter_PMD_FDR_input_file$methods(
+  initialize = function() {
+    callSuper()
+    class_name <<- "Data_Converter_PMD_FDR_input_file"
+    # Same identifier that Data_Object_Data_Converter uses to select this
+    # converter (previously "PMD_FDR_file_type", which matched nothing).
+    file_type  <<- "PMD_FDR_input_file"
+  },
+  # Every standard PMD_FDR_* column must already be present in raw_data$df.
+  check_raw_fields = function(info = NULL, raw_data = NULL) {
+    data_original <- raw_data$df
+    check_field_name(data_original, "raw_data", "PMD_FDR_input_score")
+    check_field_name(data_original, "raw_data", "PMD_FDR_pmd")
+    check_field_name(data_original, "raw_data", "PMD_FDR_spectrum_file")
+    check_field_name(data_original, "raw_data", "PMD_FDR_proteins")
+    check_field_name(data_original, "raw_data", "PMD_FDR_spectrum_title")
+    check_field_name(data_original, "raw_data", "PMD_FDR_sequence")
+    check_field_name(data_original, "raw_data", "PMD_FDR_decoy")
+  },
+  # Pass through - everything should already be in order.
+  convert_data = function(info = NULL, raw_data = NULL) {
+    data_new <- raw_data$df
+    return(data_new)
+  }
+)
+###############################################################################
+#            Class: Data_Converter_PSM_Report
+###############################################################################
+# Data_Converter_PSM_Report: converter for PSM reports. It copies the report
+# columns into the standard PMD_FDR_* columns and keeps the originals.
+Data_Converter_PSM_Report <- setRefClass(
+  "Data_Converter_PSM_Report",
+  contains = "Data_Converter",
+  fields = list()
+)
+Data_Converter_PSM_Report$methods(
+  initialize = function() {
+    callSuper()
+    class_name <<- "Data_Converter_PSM_Report"
+    file_type  <<- "PSM_Report"
+  },
+  # Each report column used by convert_data() must be present in raw_data$df.
+  check_raw_fields = function(info = NULL, raw_data = NULL) {
+    data_original <- raw_data$df
+    needed <- c("Confidence [%]", "Precursor m/z Error [ppm]", "Spectrum File",
+                "Protein(s)", "Spectrum Title", "Decoy", "Sequence")
+    for (column_name in needed) {
+      check_field_name(data_original, "raw_data", column_name)
+    }
+    invisible(TRUE)
+  },
+  # Standard column name -> report column it is copied from (in output order).
+  convert_data = function(info = NULL, raw_data = NULL) {
+    data_new <- raw_data$df
+    copied_from <- c(
+      PMD_FDR_input_score    = "Confidence [%]",
+      PMD_FDR_pmd            = "Precursor m/z Error [ppm]",
+      PMD_FDR_spectrum_file  = "Spectrum File",
+      PMD_FDR_proteins       = "Protein(s)",
+      PMD_FDR_spectrum_title = "Spectrum Title",
+      PMD_FDR_sequence       = "Sequence",
+      PMD_FDR_decoy          = "Decoy"
+    )
+    for (standard_name in names(copied_from)) {
+      data_new[[standard_name]] <- data_new[, copied_from[[standard_name]]]
+    }
+    data_new
+  }
+)
+###############################################################################
+#            Class: Data_Converter_MaxQuant_Evidence
+###############################################################################
+# Data_Converter_MaxQuant_Evidence: converter for MaxQuant evidence tables.
+# It derives the standard PMD_FDR_* columns and keeps the original columns.
+Data_Converter_MaxQuant_Evidence <- setRefClass(
+  "Data_Converter_MaxQuant_Evidence",
+  contains = "Data_Converter",
+  fields = list()
+)
+Data_Converter_MaxQuant_Evidence$methods(
+  initialize = function() {
+    callSuper()
+    class_name <<- "Data_Converter_MaxQuant_Evidence"
+    file_type  <<- "MaxQuant_Evidence"
+  },
+  # Each evidence column used by convert_data() must be present.
+  check_raw_fields = function(info = NULL, raw_data = NULL) {
+    data_original <- raw_data$df
+    check_field_name(data_original, "raw_data", "PEP")
+    check_field_name(data_original, "raw_data", "Mass error [ppm]")
+    check_field_name(data_original, "raw_data", "Proteins")
+    check_field_name(data_original, "raw_data", "Retention time")
+    check_field_name(data_original, "raw_data", "Sequence")
+    check_field_name(data_original, "raw_data", "Reverse")
+  },
+  convert_data = function(info = NULL, raw_data = NULL) {
+    data_new <- raw_data$df
+    # The input score is 100 * (1 - PEP).
+    data_new$PMD_FDR_input_score    <- 100 * (1 - data_new[, "PEP"])
+    data_new$PMD_FDR_pmd            <- data_new[, "Mass error [ppm]"]
+    data_new$PMD_FDR_spectrum_file  <- "<place_holder - assumes a single spectra file>"
+    data_new$PMD_FDR_proteins       <- data_new[, "Proteins"]
+    # Used for ordering peptides - not important in MaxQuant since PMD has
+    # already been normalized effectively.
+    data_new$PMD_FDR_spectrum_title <- data_new[, "Retention time"]
+    data_new$PMD_FDR_sequence       <- data_new[, "Sequence"]
+    # "+" marks a decoy (reverse) hit. %in% never yields NA, so rows whose
+    # Reverse value is NA (e.g. a column that is entirely empty in the file)
+    # become 0 instead of NA.
+    data_new$PMD_FDR_decoy          <- as.numeric(data_new[, "Reverse"] %in% "+")
+    return(data_new)
+  }
+)
+###############################################################################
+#            Class: Data_Object_Data_Converter
+###############################################################################
+# Data_Object_Data_Converter: data node that turns the raw input table into
+# the standard PMD_FDR_* layout. It has two named parents, "info" and
+# "raw_data", and picks a Data_Converter from info's input_file_type.
+Data_Object_Data_Converter <- setRefClass(
+  "Data_Object_Data_Converter",
+  contains = "Data_Object",
+  fields = list(
+    df             = "data.frame",
+    data_converter = "Data_Converter"
+  )
+)
+Data_Object_Data_Converter$methods(
+  initialize = function() {
+    callSuper()
+    class_name <<- "Data_Object_Data_Converter"
+  },
+  currently_supported_file_types = function() {
+    c("PSM_Report", "PMD_FDR_input_file")
+  },
+  # Selects the converter for the configured file type, then lets it check
+  # that the raw data has the columns it needs.
+  verify = function() {
+    source_info <- get_info()
+    source_data <- get_raw_data()
+    set_file_type(get_info()$input_file_type)
+    data_converter$check_raw_fields(info = source_info, raw_data = source_data)
+  },
+  # Uses the converter that verify() selected.
+  m_load_data = function() {
+    source_info <- get_info()
+    source_data <- get_raw_data()
+    df <<- data_converter$convert_data(info = source_info, raw_data = source_data)
+  },
+  # Instantiates the converter matching `file_type`; unknown types are an error.
+  set_file_type = function(file_type = NULL) {
+    if (file_type == "PSM_Report") {
+      data_converter <<- Data_Converter_PSM_Report$new()
+    } else if (file_type == "PMD_FDR_input_file") {
+      data_converter <<- Data_Converter_PMD_FDR_input_file$new()
+    } else if (file_type == "MaxQuant_Evidence") {
+      data_converter <<- Data_Converter_MaxQuant_Evidence$new()
+    } else {
+      stop(sprintf("File type '%s' is not currently supported by PMD-FDR module", file_type))
+    }
+  },
+  set_info = function(info) {
+    parents[["info"]] <<- info
+  },
+  get_info = function() {
+    verified_element_of_list(parents, "info", "Data_Object_Data_Converter$parents")
+  },
+  set_raw_data = function(raw_data) {
+    parents[["raw_data"]] <<- raw_data
+  },
+  get_raw_data = function() {
+    verified_element_of_list(parents, "raw_data", "Data_Object_Data_Converter$parents")
+  }
+)
+###############################################################################
+#            Class: Data_Object_Groupings
+###############################################################################
+Data_Object_Groupings <- setRefClass("Data_Object_Groupings",
+contains = "Data_Object",
+fields =list(df = "data.frame"))
+Data_Object_Groupings$methods(
+initialize = function(){
+callSuper()
+class_name <<- "Data_Object_Groupings"
+},
+simplify_field_name = function(x=NULL){
+result <- gsub(pattern = "PMD_FDR_", replacement = "", x = x)
+return(result)
+},
+verify = function(){
+data_original <- get_data_converter()$df
+check_field_name(data_original, "data_converter", "PMD_FDR_input_score")
+check_field_name(data_original, "data_converter", "PMD_FDR_pmd")
+check_field_name(data_original, "data_converter", "PMD_FDR_spectrum_file")
+check_field_name(data_original, "data_converter", "PMD_FDR_proteins")
+check_field_name(data_original, "data_converter", "PMD_FDR_spectrum_title")
+check_field_name(data_original, "data_converter", "PMD_FDR_sequence")
+check_field_name(data_original, "data_converter", "PMD_FDR_decoy")
+},
+m_load_data = function(){
+make_data_groups <- function(data_original=NULL){
+# Functions supporting make_data_groups()
+standardize_fields <- function(data=NULL){
+data_new <- data
+info <- get_info()
+info$ensure()
+#field_name_of_score <- info$get_field_name_of_score()
+# #data_new <- rename_column(data_new, "Variable Modifications"   , "ptm_list")
+# data_new <- rename_column(data_new, field_name_of_score        , "PMD_FDR_input_score")
+# data_new <- rename_column(data_new, "Precursor m/z Error [ppm]", "PMD_FDR_pmd")
+# #data_new <- rename_column(data_new, "Isotope Number"           , "isotope_number")
+# #data_new <- rename_column(data_new, "m/z"                      , "m_z")
+# #data_new <- rename_column(data_new, "Measured Charge"          , "charge")
+# data_new <- rename_column(data_new, "Spectrum File"            , "PMD_FDR_spectrum_file")
+# data_new <- rename_column(data_new, "Protein(s)"               , "PMD_FDR_proteins")
+# data_new <- rename_column(data_new, "Spectrum Title"           , "PMD_FDR_spectrum_title")
+# data_new <- manage_decoy_column(data_new)
+# Now managed in Data_Converter
+# data_new$PMD_FDR_input_score    <- data_new[,  field_name_of_score       ]
+# data_new$PMD_FDR_pmd            <- data_new[, "Precursor m/z Error [ppm]"]
+# data_new$PMD_FDR_spectrum_file  <- data_new[, "Spectrum File"            ]
+# data_new$PMD_FDR_proteins       <- data_new[, "Protein(s)"               ]
+# data_new$PMD_FDR_spectrum_title <- data_new[, "Spectrum Title"           ]
+data_new$value          <- data_new$PMD_FDR_pmd
+data_new$PMD_FDR_peptide_length <- str_length(data_new$PMD_FDR_sequence)
+#data_new$charge_value   <- with(data_new, as.numeric(substr(charge, start=1, stop=str_length(charge)-1)))
+#data_new$measured_mass  <- with(data_new, m_z*charge_value)
+data_new$PMD_FDR_spectrum_index <- NA
+data_new$PMD_FDR_spectrum_index[order(data_new$PMD_FDR_spectrum_title, na.last = TRUE)] <- 1:nrow(data_new)
+return(data_new)
+}
+# Bins a numeric column into quantile-based groups and labels every row.
+#
+# Args:
+#   data_groups:         data.frame holding the column to be binned.
+#   field_name_to_group: name of the numeric column to bin.
+#   vec.length.out:      number of quantile break points (one more than the
+#                        number of bins).
+#   vec.tolerance:       tolerance handed to round_to_tolerance() for the breaks.
+#   value_format:        sprintf() conversion, without the leading "%", used to
+#                        print the bin limits inside each label (e.g. "03d").
+# Returns:
+#   data_groups with a "group_<simplified field name>" label column plus the
+#   column added by add_group_decoy() for the same field.
+add_grouped_variable <- function(data_groups = NULL, field_name_to_group = NULL, vec.length.out = NULL, vec.tolerance = NULL, value_format = NULL){
+  # Support functions for add_grouped_variable()
+
+  # Quantile break points of x, rounded with round_to_tolerance().
+  find_interval_vec <- function(x=NULL, length.out = NULL, tolerance = NULL){
+    q <- quantile(x = x, probs = seq(from=0, to=1, length.out = length.out), na.rm=TRUE)
+    q <- round_to_tolerance(q, tolerance = tolerance)
+    return(q)
+  }
+  # One row per bin: limits a..b, how each limit is compared, and the label.
+  get_group_data_frame <- function(vec=NULL, value_format = NULL){
+    n <- length(vec)
+    a <- vec[-n]
+    b <- vec[-1]
+    # A bin whose limits coincide is an equality test ("eq"); otherwise the
+    # lower limit is inclusive ("ge") and the upper one exclusive ("lt"),
+    # except for the last bin whose upper limit is inclusive ("le").
+    lower      <- ifelse(a == b           , "eq", NA)
+    lower      <- ifelse(is.na(lower     ), "ge", lower)
+    upper      <- ifelse(a == b           , "eq", NA)
+    upper[n-1] <- ifelse(is.na(upper[n-1]), "le", "eq")
+    upper      <- ifelse(is.na(upper     ), "lt", upper)
+    group <- data.frame(list(idx=seq_len(n-1), a=a, b=b, lower=lower, upper=upper))
+    name_format <- sprintf("%%%s_%%%s_%%s_%%s", value_format, value_format)
+    group$new_var <- with(group, sprintf(name_format, a, b, lower, upper))
+    return(group)
+  }
+  # Looks up the bin of every row and stores its label in the new column.
+  merge_group_with_data <- function(data_groups = NULL, group = NULL, vec = NULL, field_name_to_group = NULL){
+    field_name_new <- sprintf("group_%s", simplify_field_name(field_name_to_group))
+    group_idx      <- findInterval(x = data_groups[,field_name_to_group],
+                                   vec = vec,
+                                   all.inside=TRUE)
+    data_groups$new_var <- group$new_var[group_idx]
+    data_groups         <- rename_column(data_groups, "new_var", field_name_new)
+    return(data_groups)
+  }
+
+  # Body of add_grouped_variable()
+  vec    <- find_interval_vec(x          = data_groups[[field_name_to_group]],
+                              length.out = vec.length.out,
+                              tolerance  = vec.tolerance )
+  group  <- get_group_data_frame(vec          = vec,
+                                 value_format = value_format)
+  df_new <- merge_group_with_data(data_groups         = data_groups,
+                                  group               = group,
+                                  vec                 = vec,
+                                  field_name_to_group = field_name_to_group)
+  df_new <- add_group_decoy(df_new, field_name_to_group)
+  return(df_new)
+}
+# Registers a column that is already categorical as a grouping variable.
+#
+# Copies `field_name_to_group` into "group_<simplified field name>" and lets
+# add_group_decoy() add its companion column for the same field.
+add_already_grouped_variable <- function(field_name_to_group = NULL, data_groups = NULL ){
+  group_column <- sprintf("group_%s", simplify_field_name(field_name_to_group))
+  data_groups[[group_column]] <- data_groups[[field_name_to_group]]
+  add_group_decoy(data_groups = data_groups, field_name_to_group = field_name_to_group)
+}
+# Adds `value_norm`: each row's `value` minus its `median_of_group_index`.
+add_value_norm <- function(data_groups = NULL){
+  centered <- with(data_groups, value - median_of_group_index)
+  data_groups$value_norm <- centered
+  data_groups
+}
+# Labels every row with a protein group derived from PMD_FDR_proteins.
+#
+# The patterns are applied in order and later matches overwrite earlier ones:
+# the empty pattern first marks every matching row "human", then rows
+# containing "pfu_" become "pyrococcus" and rows containing "cRAP" become
+# "contaminant". add_group_decoy() then adds the companion column.
+add_protein_group <-function(data_groups = NULL){
+  patterns    <- c(""     , "pfu_"      , "cRAP")
+  group_names <- c("human", "pyrococcus", "contaminant")
+  labelled <- data_groups
+  for (k in seq_along(patterns)){
+    hits <- grepl(pattern = patterns[k],
+                  x       = labelled$PMD_FDR_proteins)
+    labelled$group_proteins[hits] <- group_names[k]
+  }
+  add_group_decoy(data_groups = labelled, field_name_to_group = "PMD_FDR_proteins")
+}
+# Adds "group_decoy_<name>": the label "decoy" for decoy rows and the row's
+# "group_<name>" label otherwise, where <name> is
+# simplify_field_name(field_name_to_group).
+#
+# Args:
+#   data_groups:         data.frame with PMD_FDR_decoy and the group column.
+#   field_name_to_group: field whose group column should be combined.
+# Returns:
+#   data_groups with the extra character column.
+add_group_decoy <- function(data_groups=NULL, field_name_to_group=NULL){
+  simple_field_name <- simplify_field_name(field_name_to_group)
+  field_name_decoy  <- sprintf("group_decoy_%s", simple_field_name)
+  field_name_group  <- sprintf("group_%s",       simple_field_name)
+  group_labels <- data_groups[[field_name_group]]
+  if (is.null(group_labels)){
+    stop(sprintf("Column '%s' must exist before '%s' can be derived", field_name_group, field_name_decoy))
+  }
+  # ifelse() copies a factor's integer codes rather than its labels, and some
+  # group columns are factors (e.g. the result of cut()), so convert first.
+  group_labels <- as.character(group_labels)
+  data_groups[[field_name_decoy]] <- ifelse(data_groups[["PMD_FDR_decoy"]], "decoy", group_labels)
+  return(data_groups)
+}
+# Assigns every row to a training class, stored in `group_training_class`:
+#   good_training / good_testing - non-decoy rows with PMD_FDR_input_score of
+#                                  100, alternately assigned (1st, 3rd, ... go
+#                                  to testing, the rest to training)
+#   bad_long / bad_short         - decoy rows, split on MIN_GOOD_PEPTIDE_LENGTH
+#   other_long / other_short     - everything else, split on the same length
+# Also refreshes `used_to_find_middle` through add_used_to_find_middle().
+add_group_training_class <- function(data_groups = NULL){
+  df_new <- data_groups
+  is_long_enough <- with(df_new, (PMD_FDR_peptide_length >= MIN_GOOD_PEPTIDE_LENGTH)    )
+  is_good        <- with(df_new, (PMD_FDR_decoy == 0) & (PMD_FDR_input_score == 100) )
+  is_bad         <- with(df_new, (PMD_FDR_decoy == 1) )
+  idx_good       <- which(is_good)
+  idx_testing    <- idx_good[c(TRUE,FALSE)] # Selects every other item
+  idx_training   <- setdiff(idx_good, idx_testing)
+  is_bad_short   <- is_bad & !is_long_enough
+  is_bad_long    <- is_bad &  is_long_enough
+  df_new$group_training_class                   <- "other_short"   # Default
+  df_new$group_training_class[is_long_enough  ] <- "other_long"    # Default (if long enough)
+  df_new$group_training_class[idx_training    ] <- "good_training" # Length does not matter (anymore)
+  df_new$group_training_class[idx_testing     ] <- "good_testing"  # Ditto
+  df_new$group_training_class[is_bad_long     ] <- "bad_long"      # ...except for "bad"
+  df_new$group_training_class[is_bad_short    ] <- "bad_short"
+  # used_to_find_middle is derived from group_training_class, so recompute it
+  # here to keep the two definitions consistent.
+  df_new <- add_used_to_find_middle( data_groups = df_new )
+  return(df_new)
+}
+# Flags the rows whose group_training_class is "good_training" in the logical
+# column `used_to_find_middle`; every other row (including NA classes) is FALSE.
+add_used_to_find_middle <- function(data_groups = NULL){
+  flagged <- data_groups
+  training_rows <- which(flagged$group_training_class == "good_training")
+  flagged$used_to_find_middle <- FALSE
+  flagged$used_to_find_middle[training_rows] <- TRUE
+  flagged
+}
+# Splits the spectrum-index axis into consecutive intervals and stores the
+# interval of each row in `group_spectrum_index` (a factor produced by cut()).
+#
+# Break points are taken per spectrum file at regularly spaced "good_training"
+# rows (segments of at least 50 such rows) instead of using change-point
+# detection; the last index of every file is always a break point.
+# Returns the rows ordered by PMD_FDR_spectrum_index, with the companion
+# column from add_group_decoy() added as well.
+add_group_spectrum_index <- function(data_groups = NULL){
+  # Drops the final element of a vector.
+  drop_last <- function(v){
+    v[-length(v)]
+  }
+  # Regularly spaced positions within 1..n_points, one per region, where the
+  # number of regions is floor(n_points / segment_length) but at least 1.
+  regular_positions <- function(n_points, segment_length){
+    n_regions <- floor(n_points / segment_length)
+    if (n_regions == 0){
+      n_regions <- 1
+    }
+    positions <- round(seq(1, n_points, length.out = n_regions + 1))
+    positions[-1]
+  }
+  # Break points (as global spectrum indices) contributed by a single file.
+  file_cut_points <- function(file_rows){
+    segment_length <- 50
+    last_index     <- max(file_rows$PMD_FDR_spectrum_index)
+    training_rows  <- subset(file_rows, group_training_class == "good_training")
+    training_pmd   <- training_rows$value
+    global_index   <- training_rows$PMD_FDR_spectrum_index
+    cut_points     <- c()
+    if (length(training_pmd) > segment_length){
+      local_positions <- regular_positions(length(training_pmd), segment_length)
+      # The final position is replaced by the file's own last index below.
+      cut_points <- drop_last(global_index[local_positions])
+    }
+    c(cut_points, last_index)
+  }
+  # Sorted, de-duplicated break points over all files; the largest one is
+  # replaced by (max index + 1) so the half-open last interval includes it.
+  all_breaks <- function(sorted_rows){
+    breaks <- 1
+    for (current_file in unique(sorted_rows$PMD_FDR_spectrum_file)){
+      file_rows <- subset(sorted_rows, (PMD_FDR_spectrum_file == current_file))
+      if (nrow(file_rows) > 0){
+        breaks <- c(breaks, file_cut_points(file_rows))
+      }
+    }
+    breaks <- drop_last(sort(unique(breaks)))
+    c(breaks, max(sorted_rows$PMD_FDR_spectrum_index + 1))
+  }
+  # Main code of add_group_spectrum_index()
+  index_field <- "PMD_FDR_spectrum_index"
+  sorted_rows <- data_groups[order(data_groups[[index_field]]),]
+  breaks      <- all_breaks(sorted_rows)
+  sorted_rows$group_spectrum_index <- cut(x = sorted_rows[[index_field]], breaks = breaks, right = FALSE, dig.lab = 6)
+  add_group_decoy(data_groups = sorted_rows, field_name_to_group = index_field)
+}
+# Adds `median_of_group_index`: for each group_spectrum_index, the
+# safe_median() of `value` over the rows flagged used_to_find_middle.
+#
+# The medians are attached with merge(), which with its defaults keeps only
+# rows whose group appears in the medians table and may reorder the rows.
+add_median_of_group_index <-function(data_groups = NULL){
+  median_column <- "median_of_group_index"
+  training_rows <- subset(data_groups, used_to_find_middle )
+  group_medians <- aggregate(value~group_spectrum_index, data=training_rows, FUN=safe_median)
+  group_medians <- rename_column(group_medians, "value", median_column)
+  # Drop any stale copy so the column is not used as a merge key.
+  data_groups[[median_column]] <- NULL
+  merge(data_groups, group_medians)
+}
+# Adds the logical column `is_in_1percent` when a "1 percent" file is present.
+#
+# If get_raw_1_percent()$exists() is FALSE the input is returned untouched.
+# Otherwise rows whose PMD_FDR_spectrum_title also occurs in the 1 percent
+# data ("Spectrum Title" column) get TRUE and all other rows FALSE.
+add_1_percent_to_data_groups <- function(data_groups=NULL){
+  if (!get_raw_1_percent()$exists()){
+    return(data_groups)
+  }
+  # Reduce the 1 percent table to the merge key plus the flag
+  one_percent <- get_raw_1_percent()$df
+  one_percent$is_in_1percent <- TRUE
+  one_percent <- rename_column(one_percent, "Spectrum Title", "PMD_FDR_spectrum_title")
+  one_percent <- one_percent[,c("PMD_FDR_spectrum_title", "is_in_1percent")]
+  # Left join: keep every row of data_groups, unmatched rows become FALSE
+  flagged <- merge(data_groups, one_percent, all.x=TRUE)
+  flagged$is_in_1percent[is.na(flagged$is_in_1percent)] <- FALSE
+  flagged
+}
+# Main code of make_data_groups()
+data_groups <- standardize_fields(data_original)
+data_groups <- add_grouped_variable(field_name_to_group = "PMD_FDR_input_score",
+data_groups         = data_groups,
+vec.length.out      = 14,
+vec.tolerance       = 1,
+value_format        = "03d")
+data_groups <- add_grouped_variable(field_name_to_group = "PMD_FDR_pmd",
+data_groups         = data_groups,
+vec.length.out      = 21,
+vec.tolerance       = 0.1,
+value_format        = "+05.1f")
+data_groups <- add_grouped_variable(field_name_to_group = "PMD_FDR_peptide_length",
+data_groups         = data_groups,
+vec.length.out      = 11,
+vec.tolerance       = 1,
+value_format        = "02d")
+# data_groups <- add_grouped_variable(field_name_to_group = "m_z",
+#                                     data_groups         = data_groups,
+#                                     vec.length.out      = 11,
+#                                     vec.tolerance       = 10,
+#                                     value_format        = "04.0f")
+#
+# data_groups <- add_grouped_variable(field_name_to_group = "measured_mass",
+#                                     data_groups         = data_groups,
+#                                     vec.length.out      = 11,
+#                                     vec.tolerance       = 1,
+#                                     value_format        = "04.0f")
+#
+# data_groups <- add_already_grouped_variable(field_name_to_group = "isotope_number",
+#                                             data_groups         = data_groups )
+#
+# data_groups <- add_already_grouped_variable(field_name_to_group = "charge",
+#                                             data_groups         = data_groups )
+#
+data_groups <- add_already_grouped_variable(field_name_to_group = "PMD_FDR_spectrum_file",
+data_groups         = data_groups )
+data_groups <- add_protein_group(data_groups = data_groups)
+data_groups <- add_group_training_class(  data_groups = data_groups)
+data_groups <- add_group_spectrum_index(  data_groups = data_groups)
+data_groups <- add_median_of_group_index( data_groups = data_groups)
+data_groups <- add_value_norm(            data_groups = data_groups)
+# fields_of_interest <- c("PMD_FDR_input_score", "PMD_FDR_pmd", "m_z", "PMD_FDR_peptide_length", "isotope_number", "charge", "PMD_FDR_spectrum_file", "measured_mass", "PMD_FDR_spectrum_index", "PMD_FDR_proteins")
+# fields_of_interest <- c("value",
+#                         "PMD_FDR_decoy",
+#                         "PMD_FDR_spectrum_title",
+#                         "median_of_group_index",
+#                         "value_norm",
+#                         "used_to_find_middle",
+#                         "group_training_class",
+#                         fields_of_interest,
+#                         sprintf("group_%s"      , fields_of_interest),
+#                         sprintf("group_decoy_%s", fields_of_interest))
+fields_of_interest <- c("PMD_FDR_input_score", "PMD_FDR_pmd", "PMD_FDR_peptide_length", "PMD_FDR_spectrum_file", "PMD_FDR_spectrum_index", "PMD_FDR_proteins")
+fields_of_interest <- c("value",
+"PMD_FDR_decoy",
+"PMD_FDR_spectrum_title",
+"median_of_group_index",
+"value_norm",
+"used_to_find_middle",
+"group_training_class",
+fields_of_interest,
+sprintf("group_%s"      , simplify_field_name(fields_of_interest)),
+sprintf("group_decoy_%s", simplify_field_name(fields_of_interest)))
+data_groups <- data_groups[,fields_of_interest]
+data_groups <- add_1_percent_to_data_groups(data_groups)
+return(data_groups)
+}
+data_original <- get_data_converter()$df #parents[[INDEX_OF_ORIGINAL_DATA]]$df
+df <<- make_data_groups(data_original)
+},
+set_info = function(info){
+  # Store `info` under the "info" key of this object's parents
+  parents$info <<- info
+},
+get_info = function(){
+  # Fetch the "info" parent through verified_element_of_list()
+  info_parent <- verified_element_of_list(parent_list  = parents,
+                                          element_name = "info",
+                                          object_name  = "Data_Object_Groupings$parents")
+  return(info_parent)
+},
+set_data_converter = function(data_converter){
+  # Store the converter under the "data_converter" key of this object's parents
+  parents$data_converter <<- data_converter
+},
+get_data_converter = function(){
+  # Fetch the "data_converter" parent through verified_element_of_list()
+  converter_parent <- verified_element_of_list(parent_list  = parents,
+                                               element_name = "data_converter",
+                                               object_name  = "Data_Object_Groupings$parents")
+  return(converter_parent)
+},
+set_raw_1_percent = function(raw_1_percent){
+  # BUGBUG: the 1% file should be using the same file type format as the
+  #         standard data (but isn't)
+  # Store the 1 percent data object under the "raw_1_percent" key of parents
+  parents$raw_1_percent <<- raw_1_percent
+},
+get_raw_1_percent = function(){
+  # Fetch the "raw_1_percent" parent through verified_element_of_list()
+  one_percent_parent <- verified_element_of_list(parent_list  = parents,
+                                                 element_name = "raw_1_percent",
+                                                 object_name  = "Data_Object_Groupings$parents")
+  return(one_percent_parent)
+}
+)
+###############################################################################
+#            Class: Data_Object_Individual_FDR
+###############################################################################
+# Data object that computes the individual FDR (i_fdr) of every PSM.
+#
+# Parents (set through the set_*() methods): "data_groups", "densities" and
+# "alpha". m_load_data() stores the result in the `df` field.
+Data_Object_Individual_FDR <- setRefClass("Data_Object_Individual_FDR",
+                                          contains = "Data_Object",
+                                          fields =list(df = "data.frame"))
+Data_Object_Individual_FDR$methods(
+  initialize = function(){
+    callSuper()
+    class_name <<- "Data_Object_Individual_FDR"
+  },
+  # Checks that the parents' data frames carry every column used below.
+  verify = function(){
+    data_groups <- get_data_groups()$df
+    densities   <- get_densities()$df
+    alpha       <- get_alpha()$df
+    check_field_name(data_groups, "data_groups", "value_norm")
+    check_field_name(data_groups, "data_groups", "group_decoy_input_score")
+    check_field_name(data_groups, "data_groups", "PMD_FDR_peptide_length")
+    check_field_name(data_groups, "data_groups", "PMD_FDR_input_score")
+    check_field_name(alpha, "alpha", "alpha")
+    # m_load_data() merges alpha with the per-group scores on this column
+    check_field_name(alpha, "alpha", "group_of_interest")
+    check_field_name(densities, "densities", "x")
+    check_field_name(densities, "densities", "t")
+    check_field_name(densities, "densities", "f")
+  },
+  set_data_groups = function(parent){
+    parents[["data_groups"]] <<- parent
+  },
+  get_data_groups = function(){
+    return(verified_element_of_list(parents, "data_groups", "Data_Object_Individual_FDR$parents"))
+  },
+  set_densities = function(parent){
+    parents[["densities"]] <<- parent
+  },
+  get_densities = function(){
+    return(verified_element_of_list(parents, "densities", "Data_Object_Individual_FDR$parents"))
+  },
+  set_alpha = function(parent){
+    parents[["alpha"]] <<- parent
+  },
+  get_alpha = function(){
+    return(verified_element_of_list(parents, "alpha", "Data_Object_Individual_FDR$parents"))
+  },
+  # Computes, for every row of data_groups:
+  #   alpha - group-wise FDR interpolated from the row's input score
+  #   t, f  - true-hit / false-hit density at the row's value_norm
+  #   i_fdr - (alpha*f) / (alpha*f + (1-alpha)*t)
+  m_load_data = function(){
+    # Adds `interpolated_groupwise_FDR` by interpolating alpha over the mean
+    # input score of each group. `densities` is accepted but not used.
+    add_FDR_to_data_groups <- function(data_groups=NULL, densities=NULL, alpha=NULL, field_value=NULL, field_decoy_group=NULL, set_decoy_to_1=FALSE){
+      # Set up analysis
+      data_new <- data_groups
+      data_new$value_of_interest <- data_new[,field_value]
+      data_new$group_of_interest <- data_new[,field_decoy_group]
+      # NOTE(review): the literal 11 is used here while other blocks use
+      # MIN_GOOD_PEPTIDE_LENGTH - confirm whether they are meant to agree.
+      data_subset <- subset(data_new, PMD_FDR_peptide_length >= 11)
+
+      # Identify mean PMD_FDR_input_score per group
+      group_input_score <- aggregate(PMD_FDR_input_score~group_of_interest, data=data_subset, FUN=mean)
+      group_input_score <- rename_column(group_input_score, "PMD_FDR_input_score", "group_input_score")
+      group_stats <- merge(alpha, group_input_score)
+      # The decoy group must not take part in the interpolation. Decoy rows
+      # are labelled "decoy" by add_group_decoy(); "PMD_FDR_decoy" is kept as
+      # an accepted spelling of the same group.
+      group_stats <- subset(group_stats,
+                            !is.na(group_of_interest) &
+                              !(group_of_interest %in% c("decoy", "PMD_FDR_decoy")))
+
+      # Anchor the curve at (score 0, FDR 1) and interpolate linearly
+      x <- c(0, group_stats$group_input_score)
+      y <- c(1, group_stats$alpha)
+      FUN_interp <- approxfun(x=x, y=y)
+
+      data_new$interpolated_groupwise_FDR <- FUN_interp(data_new$PMD_FDR_input_score)
+      if (set_decoy_to_1){
+        data_new$interpolated_groupwise_FDR[data_new$PMD_FDR_decoy == 1] <- 1
+      }
+      return(data_new)
+    }
+
+    data_groups <- get_data_groups()$df
+    densities   <- get_densities()$df
+    alpha       <- get_alpha()$df
+
+    i_fdr <- add_FDR_to_data_groups(data_groups       = data_groups,
+                                    densities         = densities,
+                                    alpha             = alpha,
+                                    field_value       = "value_norm",
+                                    field_decoy_group = "group_decoy_input_score")
+    # Spline interpolators for the true-hit (t) and false-hit (f) densities
+    interp_t <- splinefun(x=densities[["x"]], y=densities[["t"]])
+    interp_f <- splinefun(x=densities[["x"]], y=densities[["f"]])
+
+    # Derive local FDR
+    i_fdr$t     <- interp_t(i_fdr$value_of_interest)
+    i_fdr$f     <- interp_f(i_fdr$value_of_interest)
+    i_fdr$alpha <- i_fdr$interpolated_groupwise_FDR
+    i_fdr$i_fdr <- with(i_fdr, (alpha*f) / (alpha*f + (1-alpha)*t))
+
+    df <<- i_fdr
+  }
+)
+###############################################################################
+#            Class: Data_Object_Densities
+###############################################################################
+# Data object holding density estimates of value_norm.
+#
+# Parent: "data_groups". After m_load_data() the `df` field has the columns
+#   x - grid of value_norm positions
+#   t - density of the "good_testing" rows (true hits)
+#   f - density of the "bad_long" rows (false hits)
+#   plus one density column per value of group_decoy_input_score.
+Data_Object_Densities <- setRefClass("Data_Object_Densities",
+                                     contains = "Data_Object",
+                                     fields =list(df = "data.frame"))
+Data_Object_Densities$methods(
+  initialize = function(){
+    callSuper()
+    class_name <<- "Data_Object_Densities"
+  },
+  # Checks that data_groups is non-empty and has the columns used below.
+  verify = function(){
+    df_data_groups <- get_data_groups()$df
+    checkTrue(nrow(df_data_groups) > 0,
+              msg = "data_groups data frame was empty (and should not have been)")
+    check_field_name(df_data_groups, "data_groups", "value_norm")
+    check_field_name(df_data_groups, "data_groups", "group_decoy_input_score")
+    check_field_name(df_data_groups, "data_groups", "group_training_class")
+  },
+  set_data_groups = function(parent=NULL){
+    parents[["data_groups"]] <<- parent
+  },
+  get_data_groups = function(){
+    return(verified_element_of_list(parent_list = parents, element_name = "data_groups", object_name = "Data_Object_Densities$parents"))
+  },
+  m_load_data = function(){
+    # Support functions for make_densities()
+
+    # Copies value_norm / the grouping column into the generic
+    # value_of_interest / group_of_interest columns.
+    set_values_of_interest <- function(df_data_groups=NULL, field_group = NULL){
+      field_value <- "value_norm"
+      if (is.null(df_data_groups)){
+        df_data_groups <- get_data_groups()$df
+      }
+      new_data_groups <- df_data_groups
+      new_data_groups$value_of_interest <- new_data_groups[,field_value]
+      new_data_groups$group_of_interest <- new_data_groups[,field_group]
+      return(new_data_groups)
+    }
+    # Range of value_of_interest; shared by all densities so their x grids agree.
+    get_ylim <- function(data_groups=NULL){
+      ylim <- range(data_groups$value_of_interest, na.rm = TRUE)
+      return(ylim)
+    }
+    # Density of value_of_interest for the long-enough rows of data_subset
+    # that were not used to find the median, normalized and made unimodal.
+    make_hit_density <- function(data_subset=NULL, descr_of_df=NULL, ylim=NULL){
+      # density() of the non-NA values; stops when there are too few of them.
+      verify_density <- function(data_subset=NULL, value_field=NULL, descr_of_df=NULL, ylim=NULL){
+        values <- data_subset[[value_field]]
+        values <- values[! is.na(values)]
+        if (length(values) < MIN_ACCEPTABLE_POINTS_IN_DENSITY){
+          stop (sprintf("There are too few valid %s (%d < %d) in %s to be used for calculating a density function",
+                        value_field,
+                        length(values),
+                        MIN_ACCEPTABLE_POINTS_IN_DENSITY,
+                        descr_of_df))
+        }
+        d <- density(values, from = ylim[1], to = ylim[2])
+        return(d)
+      }
+      uniformalize_density <- function(d){
+        # Reorganizes y-values of density function so that
+        # function is monotone increasing to mode
+        # and monotone decreasing afterwards
+        idx_mode  <- which.max(d$y)
+        # seq_len() gives an empty lower part when the mode is the first
+        # point (1:0 would select the first point a second time).
+        idx_lower <- seq_len(idx_mode - 1)
+        idx_upper <- idx_mode:length(d$y)
+
+        values_lower <- d$y[idx_lower]
+        values_upper <- d$y[idx_upper]
+
+        new_d   <- d
+        new_d$y <- c(sort(values_lower, decreasing = FALSE),
+                     sort(values_upper, decreasing = TRUE))
+        return(new_d)
+      }
+
+      local_df <- subset(data_subset,
+                         (PMD_FDR_peptide_length >= MIN_GOOD_PEPTIDE_LENGTH) &
+                           (used_to_find_middle == FALSE))
+      d <- verify_density      (data_subset=local_df, value_field = "value_of_interest", descr_of_df = descr_of_df, ylim=ylim)
+      d <- normalize_density   (d)
+      d <- uniformalize_density(d)
+      return(d)
+    }
+    make_true_hit_density  <- function(data_groups=NULL){
+      d_true  <- make_hit_density(data_subset = subset(data_groups, (group_training_class == "good_testing") ),
+                                  descr_of_df = "Good-testing dataset",
+                                  ylim        = get_ylim(data_groups))
+      return(d_true)
+    }
+    make_false_hit_density <- function(data_groups=NULL){
+      d_false <- make_hit_density(data_subset = subset(data_groups, (group_training_class == "bad_long") ),
+                                  descr_of_df = "Bad-long dataset",
+                                  ylim        = get_ylim(data_groups))
+      return(d_false)
+    }
+    # Appends one density column per distinct group_of_interest value.
+    add_v_densities <- function(data_groups=NULL, densities=NULL, field_group = NULL){
+      groups <- sort(unique(data_groups$group_of_interest))
+      new_densities <- densities
+      for (local_group in groups){
+        d_v <- make_hit_density(data_subset = subset(data_groups, (group_of_interest == local_group)),
+                                descr_of_df = sprintf("subset of data (where %s is '%s')",
+                                                      field_group,
+                                                      local_group),
+                                ylim        = get_ylim(data_groups))
+        new_densities[local_group] <- d_v$y
+      }
+      return(new_densities)
+    }
+
+    # Main section of make_densities()
+    df_data_groups  <- get_data_groups()$df
+    new_data_groups <- set_values_of_interest(df_data_groups,  field_group = "group_decoy_input_score")
+    d_true  <- make_true_hit_density( new_data_groups)
+    d_false <- make_false_hit_density(new_data_groups)
+
+    densities <- data.frame(x=d_true$x,
+                            t=d_true$y,
+                            f=d_false$y)
+    densities <- add_v_densities(data_groups=new_data_groups, densities=densities,  field_group = "group_decoy_input_score")
+    df <<- densities
+  }
+)
+###############################################################################
+#            Class: Data_Object_Alpha
+###############################################################################
+# Data object that turns the density peaks into one `alpha` per group.
+#
+# Parent: "densities". After m_load_data() the `df` field has one row per
+# density column other than x, t and f, with the columns group_of_interest
+# and alpha = (t - v) / (t - f), where t, f and v are the maxima of the
+# true-hit, false-hit and group density columns.
+Data_Object_Alpha <- setRefClass("Data_Object_Alpha",
+                                 contains = "Data_Object",
+                                 fields =list(df = "data.frame"))
+Data_Object_Alpha$methods(
+  initialize = function(){
+    callSuper()
+    class_name <<- "Data_Object_Alpha"
+  },
+  # The densities table must not be empty.
+  verify = function(){
+    density_table <- get_densities()$df
+    checkTrue(nrow(density_table) > 0,
+              msg = "Densities data.frame was empty (and should not have been)")
+  },
+  set_densities = function(parent=NULL){
+    parents$densities <<- parent
+  },
+  get_densities = function(){
+    densities_parent <- verified_element_of_list(parent_list  = parents,
+                                                 element_name = "densities",
+                                                 object_name  = "Data_Object_Alpha")
+    return(densities_parent)
+  },
+  m_load_data = function(){
+    density_table <- get_densities()$df
+    # Peak height of every column, named by column
+    peak   <- apply(X = density_table, MARGIN = 2, FUN = max)
+    peak_t <- unname(peak[names(peak) == "t"])
+    peak_f <- unname(peak[names(peak) == "f"])
+
+    peak_table <- data.frame(stringsAsFactors = FALSE,
+                             list(v = peak,
+                                  group_of_interest = names(peak)))
+    peak_table$alpha <- (peak_t - peak_table$v) / (peak_t - peak_f)
+
+    # Only the per-group columns are reported
+    is_group_row <- !(peak_table$group_of_interest %in% c("x", "t", "f"))
+    df <<- peak_table[is_group_row, c("group_of_interest", "alpha")]
+  }
+)
+###############################################################################
+#            Class: Data_Processor
+###############################################################################
+# Data_Processor owns one instance of every data object in the pipeline and
+# wires them together. Each field is typed with the class of the object it
+# holds; the dependencies between them are established in set_info().
+Data_Processor <- setRefClass("Data_Processor",
+fields =list(info           = "Data_Object_Info",
+raw_data       = "Data_Object_Raw_Data",
+raw_1_percent  = "Data_Object_Raw_1_Percent",
+data_converter = "Data_Object_Data_Converter",
+data_groups    = "Data_Object_Groupings",
+densities      = "Data_Object_Densities",
+alpha          = "Data_Object_Alpha",
+i_fdr          = "Data_Object_Individual_FDR"))
+Data_Processor$methods(
+# Constructor: wires the objects immediately when an info object is supplied;
+# with p_info = NULL nothing is wired until set_info() is called.
+initialize = function(p_info=NULL){
+if (! is.null(p_info)){
+set_info(p_info)
+}
+},
+# Stores p_info in the `info` field and registers every parent/child link.
+# For each object the set_*() calls hand it its parents, and the matching
+# append_child() calls register it with each of those parents.
+# NOTE(review): append_child() is defined outside this chunk; presumably the
+# child lists are used to propagate changes downstream - confirm before
+# reordering any of these calls.
+set_info = function(p_info=NULL){
+# This initialization defines all of the dependencies between the various components
+info <<- p_info
+# raw_data
+raw_data$set_info(info)
+info$append_child(raw_data)
+# raw_1_percent
+raw_1_percent$set_info(info)
+info$append_child(raw_1_percent)
+# data_converter
+data_converter$set_info    (info)
+data_converter$set_raw_data(raw_data)
+info         $append_child (data_converter)
+raw_data     $append_child (data_converter)
+# data_groups
+data_groups$set_info          (info)
+data_groups$set_data_converter(data_converter)
+data_groups$set_raw_1_percent (raw_1_percent)
+info          $append_child   (data_groups)
+data_converter$append_child   (data_groups)
+raw_1_percent $append_child   (data_groups)
+# densities
+densities  $set_data_groups(data_groups)
+data_groups$append_child   (densities)
+# alpha
+alpha    $set_densities(densities)
+densities$append_child (alpha)
+# i_fdr
+i_fdr$set_data_groups(data_groups)
+i_fdr$set_densities  (densities)
+i_fdr$set_alpha      (alpha)
+data_groups  $append_child(i_fdr)
+densities    $append_child(i_fdr)
+alpha        $append_child(i_fdr)
+}
+)
+#############################################################
+####### Classes for Plotting
+#############################################################
+###############################################################################
+#            Class: Plot_Image
+###############################################################################
+# Base class for all plots. Subclasses must override plot_image() and get_n().
+#
+# Fields:
+#   data_processors    - list of data processors the plot draws from
+#   plot_title, main   - title text; `main` is composed by create_standard_main()
+#   include_text       - whether axis labels (and the title) are drawn
+#   include_main       - whether room is reserved for a title
+#   x.intersp, y.intersp, scale - sizing values set by plot_image_in_window()
+#   is_image_container - TRUE for plots that only arrange other plots
+Plot_Image <- setRefClass("Plot_Image",
+                          fields = list(data_processors    = "list",
+                                        plot_title         = "character",
+                                        include_text       = "logical",
+                                        include_main       = "logical",
+                                        x.intersp          = "numeric",
+                                        y.intersp          = "numeric",
+                                        scale              = "numeric",
+                                        main               = "character",
+                                        is_image_container = "logical"))
+Plot_Image$methods(
+  initialize = function(p_data_processors = list(),
+                        p_include_main = TRUE,
+                        p_include_text = TRUE,
+                        p_is_image_container = FALSE){
+    include_main       <<- p_include_main
+    include_text       <<- p_include_text
+    data_processors    <<- p_data_processors
+    is_image_container <<- p_is_image_container
+  },
+  plot_image = function(){
+    stop("Need to define function plot_image() for subclass") # Abstract function
+  },
+  get_n = function(){
+    stop("Need to define function get_n() for subclass") #Abstract function
+  },
+  # Composes `main` from the title, the collection name of the first data
+  # processor and get_n(); does nothing for containers or when text or the
+  # title is switched off.
+  create_standard_main = function(){
+    needs_main <- function(){
+      return(include_text && include_main && !is_image_container)
+    }
+    if (needs_main()){
+      collection_name <- data_processors[[1]]$info$collection_name()
+      main <<- sprintf("%s\n(Dataset: %s; n=%s)", plot_title, collection_name,  format(get_n(), big.mark = ","))
+    }
+  },
+  # Opens a new window of window_width x window_height (both multiplied by
+  # p_scale), draws the plot in it and, unless this is a container, adds the
+  # axes and the box. Uses windows(), i.e. the Windows graphics device.
+  plot_image_in_window = function(p_scale=NULL, window_height=NULL, window_width=NULL){
+    scale <<- p_scale
+    SIZE_AXIS      <- 2.5 * scale # in the units used by mar
+    SIZE_MAIN      <- 2.5 * scale
+    SIZE_NO_MARGIN <- 0.1 * scale
+    FONT_SIZE      <- 8   * scale
+    WINDOW_WIDTH   <- window_width  * scale
+    WINDOW_HEIGHT  <- window_height * scale
+    X_INTERSP      <- 0.5 * scale + 0.4 # manages legend text spacing
+    Y_INTERSP      <- 0.5 * scale + 0.4 # manages legend line spacing
+
+    if (include_main){
+      mar <- c(SIZE_AXIS, SIZE_AXIS, SIZE_MAIN     , SIZE_NO_MARGIN)
+    } else {
+      mar <- c(SIZE_AXIS, SIZE_AXIS, SIZE_NO_MARGIN, SIZE_NO_MARGIN)
+    }
+    mgp <- c(SIZE_AXIS/2, SIZE_AXIS/4, 0) # Margin line (mex units) for axis title, axis labels, axis lines
+    ps  <- FONT_SIZE
+    x.intersp <<- X_INTERSP
+    y.intersp <<- Y_INTERSP
+
+    windows(width = WINDOW_WIDTH, height=WINDOW_HEIGHT)
+
+    old_par <- par(mar=mar, ps=ps, mgp=mgp)
+    # Restore the graphics parameters even when drawing fails
+    on.exit(par(old_par), add = TRUE)
+    create_standard_main()
+
+    plot_image()
+    if (!is_image_container){
+      axis(side=1, labels=include_text, tcl=-0.5, lwd=scale)
+      axis(side=2, labels=include_text, tcl=-0.5, lwd=scale)
+      box(lwd=scale)
+    }
+  },
+  plot_image_in_small_window = function(p_scale=1){
+    plot_image_in_window(p_scale=p_scale, window_height=2, window_width=3.25)
+  },
+  plot_image_in_large_window = function(p_scale=1, window_height=NULL){
+    plot_image_in_window(p_scale=p_scale, window_height=window_height, window_width=7)
+  }
+)
+###############################################################################
+#            Class: Legend_Object
+###############################################################################
+# Legend whose line widths and spacing follow the plot scale.
+#
+# Fields:
+#   user_params - list of legend() arguments supplied by the caller; missing
+#                 entries receive the defaults set in initialize()
+#   scale       - plot scale; required
+Legend_Object <- setRefClass("Legend_Object",
+                             contains = "Plot_Image",
+                             fields = list(user_params = "list",
+                                           scale       = "numeric"))
+Legend_Object$methods(
+  # Stores the parameters and fills in / rescales the defaults.
+  # Stops when p_scale is NULL.
+  initialize = function(p_user_params = NULL, p_scale = NULL){
+    if (is.null(p_user_params)){
+      user_params <<- list()
+    } else {
+      user_params <<- p_user_params
+    }
+    if (is.null(p_scale)){
+      stop("Legend_Object must have a valid scale")
+    } else {
+      scale <<- p_scale
+    }
+    # x and y are read with [[ ]] because `$` matches list names partially:
+    # user_params$x would pick up "x.intersp" (and $y "y.intersp") whenever
+    # the shorter name is absent.
+    user_params[["x"]]    <<- if_null(user_params[["x"]]   , "topleft", user_params[["x"]])
+    user_params[["y"]]    <<- if_null(user_params[["y"]]   ,      NULL, user_params[["y"]])
+    user_params$bty       <<- if_null(user_params$bty      ,       "o", user_params$bty)
+    user_params$lwd       <<- if_null(user_params$lwd      ,      NULL, user_params$lwd        * scale) # Because we allow NULL, scale must be inside parens
+    user_params$seg.len   <<- if_null(user_params$seg.len  ,         3, user_params$seg.len  ) * scale
+    user_params$box.lwd   <<- if_null(user_params$box.lwd  ,         1, user_params$box.lwd  ) * scale
+    user_params$x.intersp <<- if_null(user_params$x.intersp,       0.6, user_params$x.intersp) * scale
+    user_params$y.intersp <<- if_null(user_params$y.intersp,       0.4, user_params$y.intersp) * scale + 0.2
+  },
+  # Draws the legend with an empty title, then draws the real title as a
+  # second, box-less legend placed relative to the first one.
+  show = function(){
+    # NOTE(review): `$leg` relies on partial matching (e.g. of "legend");
+    # left as is because callers are not visible from here.
+    first_legend <- legend(x         = user_params[["x"]],
+                           y         = user_params[["y"]],
+                           title     = "",
+                           legend    = user_params$leg,
+                           col       = user_params$col,
+                           bty       = user_params$bty,
+                           lty       = user_params$lty,
+                           lwd       = user_params$lwd,
+                           seg.len   = user_params$seg.len,
+                           box.lwd   = user_params$box.lwd,
+                           x.intersp = user_params$x.intersp,
+                           y.intersp = user_params$y.intersp)
+    # Vertical offset of the title as a fraction of the legend height
+    title_offset <- if (scale == 1) 0.07 else 0.03 - (scale * 0.02)
+    new_x <- first_legend$rect$left
+    new_y <- first_legend$rect$top + first_legend$rect$h * title_offset
+    legend(x=new_x, y=new_y, title = user_params$title, legend = "", cex=1.15, bty="n")
+  }
+)
+###############################################################################
+#            Class: Plot_Multiple_Images
+###############################################################################
+# Container plot that arranges several Plot_Image objects on a grid and
+# letters them "(a)", "(b)", ...
+#
+# Fields:
+#   n_images_wide, n_images_tall - grid dimensions passed to par(mfrow)
+#   image_list                   - list of the plots to draw, in order
+Plot_Multiple_Images <- setRefClass("Plot_Multiple_Images",
+                                    contains = "Plot_Image",
+                                    fields = list(n_images_wide = "numeric",
+                                                  n_images_tall = "numeric",
+                                                  image_list    = "list"))
+Plot_Multiple_Images$methods(
+  initialize = function(p_n_images_wide=1, p_n_images_tall=2, p_image_list=NULL, ...){
+    n_images_wide  <<- p_n_images_wide
+    n_images_tall  <<- p_n_images_tall
+    # The field is typed "list", so a NULL default must become an empty list
+    image_list     <<- if (is.null(p_image_list)) list() else p_image_list
+    callSuper(p_is_image_container=TRUE, ...)
+  },
+  # Draws every image in its own grid cell with axes, box and letter.
+  plot_image = function(){
+    # Support functions
+    apply_mtext <- function(letter=NULL){
+      line <- 1.3*scale
+      mtext(letter, side=1, line=line, adj=0)
+    }
+    # main code
+    old_par <- par(mfrow=c(n_images_tall, n_images_wide))
+    # Restore the layout even when one of the images fails to draw
+    on.exit(par(old_par), add = TRUE)
+    for (i in seq_along(image_list)){
+      sub_image <- image_list[[i]]
+      sub_image$create_standard_main()
+      sub_image$scale <- scale
+      sub_image$plot_image()
+      axis(side=1, labels=include_text, tcl=-0.5, lwd=scale)
+      axis(side=2, labels=include_text, tcl=-0.5, lwd=scale)
+      box(lwd=scale)
+      apply_mtext(letter=sprintf("(%s)", letters[i]))
+    }
+  }
+)
+###############################################################################
+#            Class: Plot_Compare_PMD_and_Norm_Density
+###############################################################################
+# Density curves of "good" hits (non-decoy PSMs with input score 100) versus
+# "bad" hits (decoys), for the raw PMD and optionally for the normalized PMD.
+# `scale`, `include_text`, `main`, `plot_title` and `data_processors` are not
+# declared here; presumably inherited from Plot_Image -- TODO confirm.
+Plot_Compare_PMD_and_Norm_Density = setRefClass("Plot_Compare_PMD_and_Norm_Density",
+  contains = "Plot_Image",
+  fields = list(show_norm      = "logical",   # also draw the "value_norm" plot
+                display_n_psms = "logical"))  # append PSM counts to legend entries
+Plot_Compare_PMD_and_Norm_Density$methods(
+  # Records the two display switches and the title, then defers to the parent.
+  initialize = function(p_show_norm=TRUE, p_display_n_psms=TRUE, ...){
+    show_norm       <<- p_show_norm
+    display_n_psms  <<- p_display_n_psms
+    plot_title      <<- "True Hit and False Hit Distributions"
+    callSuper(...)
+  },
+  # Draws the raw-PMD comparison and, when show_norm is TRUE, the
+  # normalized-PMD comparison, both on shared x/y limits.
+  plot_image = function(){
+    # Densities of column `var_value` for good and bad hits, both evaluated
+    # over the full observed range of that column in `data_subset`.
+    get_densities <- function(data_subset = NULL, var_value = NULL){
+      data_subset$value_of_interest <- data_subset[,var_value]
+      from <- min(data_subset$value_of_interest, na.rm = TRUE)
+      to   <- max(data_subset$value_of_interest, na.rm = TRUE)
+      data_true  <- subset(data_subset, (PMD_FDR_decoy==0) & (PMD_FDR_input_score==100))
+      data_false <- subset(data_subset, (PMD_FDR_decoy==1))
+      d_true  <- with(data_true , density(value_of_interest, from = from, to = to, na.rm = TRUE))
+      d_false <- with(data_false, density(value_of_interest, from = from, to = to, na.rm = TRUE))
+      d_true  <- normalize_density(d_true)
+      d_false <- normalize_density(d_false)
+      list(d_true=d_true, d_false=d_false, var_value = var_value,
+           n_true = nrow(data_true), n_false = nrow(data_false))
+    }
+    # Shared x-range of the density grids. The earlier code mixed the *heights*
+    # (d_false$y) of the bad-hit density into this range; it must use $x.
+    get_xlim <- function(densities_a = NULL, densities_b = NULL, show_norm=NULL){
+      xlim   <- range(c(      densities_a$d_true$x, densities_a$d_false$x))
+      if (show_norm){
+        xlim <- range(c(xlim, densities_b$d_true$x, densities_b$d_false$x))
+      }
+      return(xlim)
+    }
+    # Shared y-range of the density heights
+    get_ylim <- function(densities_a = NULL, densities_b = NULL, show_norm=NULL){
+      ylim   <- range(c(      densities_a$d_true$y, densities_a$d_false$y))
+      if (show_norm){
+        ylim <- range(c(ylim, densities_b$d_true$y, densities_b$d_false$y))
+      }
+      return(ylim)
+    }
+    # One plot: good hits as a solid black line, bad hits as a dashed red line.
+    # `dataset_name` is accepted but not used by the drawing code.
+    plot_distributions <- function(densities = NULL, var_value= NULL, dataset_name = NULL, ...){
+      leg = list()
+      leg$leg = c("Good", "Bad")
+      if (display_n_psms){
+        leg$leg = sprintf("%s (%d PSMs)",
+                          leg$leg,
+                          c(densities$n_true, densities$n_false))
+      }
+      leg$col = c("black", "red")
+      leg$lwd = c(3      ,     3)
+      leg$lty = c(1      ,     2)
+      leg$title = "Hit Category"
+      xlab = if (var_value == "value") "PMD (ppm)" else "PMD - normalized (ppm)"
+      ylab = "Density"
+      if (!include_text){
+        xlab = ""
+        ylab = ""
+      }
+      plot( densities$d_true , col=leg$col[1], lwd=leg$lwd[1] * scale, lty=leg$lty[1], xaxt = "n", yaxt = "n", main=main, xlab = xlab, ylab=ylab, ...)
+      lines(densities$d_false, col=leg$col[2], lwd=leg$lwd[2] * scale, lty=leg$lty[2])
+      abline(v=0, h=0, col="gray", lwd=1*scale)
+      if (include_text){
+        legend_object <- Legend_Object$new(leg, scale)
+        legend_object$show()
+      }
+    }
+    # Main code block
+    data_processor <- data_processors[[1]]
+    data_processor$data_groups$ensure()
+    data_groups <- data_processor$data_groups$df
+    # Raw plot: rows not used to find the middle. Normalized plot: additionally
+    # restricted to peptides longer than 11.
+    data_subset_a <- subset(data_groups  , used_to_find_middle == FALSE)
+    data_subset_b <- subset(data_subset_a, PMD_FDR_peptide_length > 11)
+    densities_a <- get_densities(data_subset = data_subset_a, var_value = "value")
+    densities_b <- get_densities(data_subset = data_subset_b, var_value = "value_norm")
+    xlim=get_xlim(densities_a, densities_b, show_norm = show_norm)
+    ylim=get_ylim(densities_a, densities_b, show_norm = show_norm)
+    # Called as a method, matching the other collection_name() call sites in this file
+    dataset_name <- data_processor$info$collection_name()
+    plot_distributions(  densities=densities_a, var_value = "value"     , dataset_name = dataset_name, xlim=xlim, ylim=ylim)
+    if (show_norm){
+      plot_distributions(densities=densities_b, var_value = "value_norm", dataset_name = dataset_name, xlim=xlim, ylim=ylim)
+    }
+  },
+  # Number of good plus bad hits in the subset selected by show_norm.
+  # NOTE(review): with show_norm == FALSE this counts the length > 11 subset,
+  # although plot_image() then draws only the unrestricted subset -- the two
+  # branches look swapped; confirm the intent before changing.
+  get_n = function(){
+    data_processor <- data_processors[[1]]
+    data_processor$data_groups$ensure()
+    data_subset_a <- subset(data_processor$data_groups$df  , used_to_find_middle == FALSE)
+    data_subset_b <- subset(data_subset_a, PMD_FDR_peptide_length > 11)
+    if (show_norm){
+      data_subset <- data_subset_a
+    } else {
+      data_subset <- data_subset_b
+    }
+    data_true  <- subset(data_subset, (PMD_FDR_decoy==0) & (PMD_FDR_input_score==100))
+    data_false <- subset(data_subset, (PMD_FDR_decoy==1))
+    return(nrow(data_true) + nrow(data_false))
+  }
+)
+###############################################################################
+#            Class: Plot_Time_Invariance_Alt
+###############################################################################
+# Boxes showing the middle 25% (37.5th to 62.5th percentile) of a PMD column
+# for 100 consecutive groups of spectra ordered by PMD_FDR_spectrum_index,
+# restricted to one group_training_class.
+# `scale`, `include_text`, `main`, `plot_title` and `data_processors` are not
+# declared here; presumably inherited from Plot_Image -- TODO confirm.
+Plot_Time_Invariance_Alt = setRefClass("Plot_Time_Invariance_Alt",
+  contains = "Plot_Image",
+  fields = list(show_norm      = "logical",
+                display_n_psms = "logical",
+                training_class = "character",      # value matched against group_training_class
+                ylim           = "numeric",        # fixed y-limits; length 0 means "derive from data"
+                field_of_interest = "character"))  # column to plot ("value" or "value_norm")
+Plot_Time_Invariance_Alt$methods(
+  # Stores the plot settings and builds the title from the training class.
+  #   p_ylim              : c(low, high), or NULL to derive the range from the data
+  #   p_training_class    : one of "bad_long", "good_testing", "good_training", "other"
+  #   p_field_of_interest : name of the column to plot
+  # Stops with "Unexpected training_class in plot_time_invariance" for any
+  # other (or missing) training class.
+  initialize = function(p_ylim=NULL, p_training_class=NULL, p_field_of_interest="value_norm", ...){
+    get_subset_title <- function(training_class=NULL){
+      if (is.null(training_class) || length(training_class) != 1 || is.na(training_class)){
+        stop("Unexpected training_class in plot_time_invariance")
+      }
+      switch(training_class,
+             bad_long      = "bad only",
+             good_testing  = "good-testing only",
+             good_training = "good-training only",
+             other         = "other only",
+             stop("Unexpected training_class in plot_time_invariance"))
+    }
+    # Validate first so that a missing class gives the documented message rather
+    # than a field-type error
+    subset_title <- get_subset_title(training_class=p_training_class)
+    # The field is typed "numeric": NULL cannot be stored, so "no limits given"
+    # is represented by a zero-length vector.
+    ylim <<- if (is.null(p_ylim)) numeric(0) else p_ylim
+    training_class <<- p_training_class
+    field_of_interest <<- p_field_of_interest
+    plot_title <<- sprintf("Middle 25%% PMD for spectra sorted by index - %s", subset_title)
+    callSuper(...)
+  },
+  # Draws the boxes plus a blue reference line at zero
+  plot_image = function(){
+    data_subset = get_data_subset()
+    plot_group_spectrum_index_from_subset_boxes(data_subset = data_subset)
+    abline(h=0, col="blue", lwd=scale)
+  },
+  # Rows of the first data processor's data_groups table that belong to training_class
+  get_data_subset = function(){
+    data_processor <- data_processors[[1]]
+    data_processor$data_groups$ensure()
+    return(subset(data_processor$data_groups$df, (group_training_class==training_class)))
+  },
+  # Number of rows that enter the plot
+  get_n = function(){
+    return(nrow(get_data_subset()))
+  },
+  # Splits the rows (ordered by spectrum index) into 100 consecutive groups and
+  # draws, per group, a rectangle spanning its index range horizontally and the
+  # 37.5%..62.5% quantiles of the plotted column vertically.
+  # NOTE(review): the grouping arithmetic assumes at least n_plot_groups rows;
+  # smaller subsets yield overlapping or degenerate index ranges.
+  plot_group_spectrum_index_from_subset_boxes = function(data_subset = NULL){
+    n_plot_groups <- 100
+    field_name_text <- if (field_of_interest=="value") "PMD" else "Translated PMD"
+    new_subset                   <- data_subset
+    new_subset$value_of_interest <- new_subset[,field_of_interest]
+    new_subset                   <- new_subset[order(new_subset$PMD_FDR_spectrum_index),]
+    # n_plot_groups + 1 cut points; group i covers rows idxs[i] .. idxs[i+1]-1,
+    # except the last group, which also keeps the final row.
+    idxs <- round_to_tolerance(seq(from=1, to=nrow(new_subset), length.out = n_plot_groups+1), 1)
+    idxs_left  <- idxs[-(n_plot_groups+1)]
+    idxs_right <- idxs[-1] - 1
+    idxs_right[n_plot_groups] <- idxs_right[n_plot_groups] + 1
+    new_subset$plot_group <- NA
+    for (i in seq_len(n_plot_groups)){
+      new_subset$plot_group[idxs_left[i]:idxs_right[i]] <- i
+    }
+    xleft   <- aggregate(PMD_FDR_spectrum_index   ~plot_group, data=new_subset, FUN=min)
+    xright  <- aggregate(PMD_FDR_spectrum_index   ~plot_group, data=new_subset, FUN=max)
+    ybottom <- aggregate(value_of_interest~plot_group, data=new_subset, FUN=function(x){quantile(x, probs = 0.5 - (0.25/2))})
+    ytop    <- aggregate(value_of_interest~plot_group, data=new_subset, FUN=function(x){quantile(x, probs = 0.5 + (0.25/2))})
+    # The four tables share only the plot_group column, which merge() joins on
+    boxes <- merge(            rename_column(xleft  , "PMD_FDR_spectrum_index"   , "xleft"),
+                   merge(      rename_column(xright , "PMD_FDR_spectrum_index"   , "xright"),
+                         merge(rename_column(ybottom, "value_of_interest", "ybottom"),
+                               rename_column(ytop   , "value_of_interest", "ytop"))))
+    xlab <- "Spectrum Index"
+    ylab <- sprintf("%s (ppm)", field_name_text )
+    # A numeric field is never NULL, so "no limits supplied" is detected by
+    # length; the data range is then used for this drawing only.
+    plot_ylim <- ylim
+    if (length(plot_ylim) != 2){
+      plot_ylim <- range(new_subset$value_of_interest, na.rm = TRUE)
+    }
+    if (!include_text){
+      xlab=""
+      ylab=""
+    }
+    plot(value_of_interest~PMD_FDR_spectrum_index, data=new_subset, type="n", ylim=plot_ylim, xlab = xlab, ylab=ylab, main=main, xaxt="n", yaxt="n")
+    with(boxes, rect(xleft = xleft, ybottom = ybottom, xright = xright, ytop = ytop, lwd=scale))
+    axis(1, labels=include_text, lwd=scale)
+    axis(2, labels=include_text, lwd=scale)
+    box(lwd=scale) #box around plot area
+  }
+)
+###############################################################################
+#            Class: Plot_Time_Invariance_Alt_Before_and_After
+###############################################################################
+# Two stacked Plot_Time_Invariance_Alt panels for the "good_testing" class:
+# the raw "value" column on top and the "value_norm" column below.
+Plot_Time_Invariance_Alt_Before_and_After = setRefClass("Plot_Time_Invariance_Alt_Before_and_After",
+  contains = "Plot_Multiple_Images",
+  fields = list())
+Plot_Time_Invariance_Alt_Before_and_After$methods(
+  # Builds both panels with identical settings (only the plotted column
+  # differs) and hands them to the Plot_Multiple_Images initializer as a
+  # 1-wide by 2-tall grid.
+  #   p_data_processors : passed to each panel
+  #   p_include_text    : passed to each panel and to the container
+  #   p_include_main    : passed to each panel and to the container
+  #   p_ylim            : y-limits shared by both panels
+  #   ...               : forwarded to the container initializer only
+  initialize = function(p_data_processors = NULL,
+                        p_include_text=TRUE,
+                        p_include_main=FALSE,
+                        p_ylim = c(-4,4), ...){
+    # One panel per plotted column, created in display order
+    make_panel <- function(column_name){
+      Plot_Time_Invariance_Alt$new(p_data_processors = p_data_processors,
+                                   p_include_text=p_include_text,
+                                   p_include_main=p_include_main,
+                                   p_training_class = "good_testing",
+                                   p_field_of_interest = column_name,
+                                   p_ylim = p_ylim)
+    }
+    panel_before <- make_panel("value")
+    panel_after  <- make_panel("value_norm")
+    callSuper(p_n_images_wide=1,
+              p_n_images_tall=2,
+              p_include_text=p_include_text,
+              p_include_main=p_include_main,
+              p_image_list = list(panel_before, panel_after), ...)
+  }
+)
+###############################################################################
+#            Class: Plot_Density_PMD_and_Norm_Decoy_by_AA_Length
+###############################################################################
+# PMD density of decoy matches, overall and per peptide-length bin
+# ("06-08" ... "21-50"), for the raw or the normalized PMD.
+# `scale`, `include_text`, `main`, `plot_title` and `data_processors` are not
+# declared here; presumably inherited from Plot_Image -- TODO confirm.
+Plot_Density_PMD_and_Norm_Decoy_by_AA_Length = setRefClass("Plot_Density_PMD_and_Norm_Decoy_by_AA_Length",
+  contains = "Plot_Image",
+  fields = list(show_norm = "logical"))  # TRUE: plot "value_norm"; FALSE: plot "value"
+Plot_Density_PMD_and_Norm_Decoy_by_AA_Length$methods(
+  # Records the title and the raw/normalized switch, then defers to the parent
+  initialize = function(p_show_norm=FALSE, ...){
+    plot_title <<- "The Decoy Bump: PMD Distribution of Decoy matches by peptide length"
+    show_norm  <<- p_show_norm
+    callSuper(...)
+  },
+  # Number of decoy rows in the first data processor's data_groups table
+  get_n = function(){
+    data_processor <- data_processors[[1]]
+    data_processor$data_groups$ensure()
+    data_subset <- subset(data_processor$data_groups$df, (PMD_FDR_decoy == 1))
+    return(nrow(data_subset))
+  },
+  # Draws the overall decoy density plus one line per length bin that has more
+  # than 100 rows. Axis limits cover both the raw and the normalized densities,
+  # whichever of the two is shown.
+  plot_image = function(){
+    # Returns data_groups (rows not used to find the middle) with a
+    # group_peptide_length_special column naming the length bin of each row.
+    # merge() keeps only peptide lengths 6..50.
+    add_group_peptide_length_special <- function(){
+      data_processor <- data_processors[[1]]
+      data_processor$data_groups$ensure()
+      data_groups <- data_processor$data_groups$df # the name data_groups is a data.frame instead of a Data_Object
+      data_groups <- subset(data_groups, used_to_find_middle == FALSE)
+      df_group_definition <- data.frame(stringsAsFactors = FALSE,
+                                        list(group_peptide_length_special = c("06-08", "09-10", "11-12", "13-15", "16-20", "21-50"),
+                                             min                          = c(  6    ,   9    ,  11    ,  13    ,  16    ,  21    ),
+                                             max                          = c(     8 ,     10 ,     12 ,     15 ,     20 ,     50 ) ))
+      group_peptide_length_special     <- data.frame(list(PMD_FDR_peptide_length = 6:50))
+      # For each length, the largest bin lower bound not exceeding it
+      group_peptide_length_special$min <- vapply(group_peptide_length_special$PMD_FDR_peptide_length,
+                                                 FUN = function(i) max(df_group_definition$min[df_group_definition$min <= i]),
+                                                 FUN.VALUE = numeric(1))
+      # Joined on the shared "min" column
+      group_peptide_length_special     <- merge(group_peptide_length_special, df_group_definition)
+      # Drop any earlier copy of the column so the merge below cannot duplicate it
+      data_groups$group_peptide_length_special <- NULL
+      new_data_groups <- (merge(data_groups,
+                                group_peptide_length_special[,c("PMD_FDR_peptide_length",
+                                                                "group_peptide_length_special")]))
+      return(new_data_groups)
+    }
+    # Named list of densities of column `field_value`: "All decoys" first, then
+    # one entry per value of column `field_group`. Groups with fewer than two
+    # non-missing values are left out because density() cannot select a
+    # bandwidth for them.
+    get_densities <- function(data_subset = NULL, field_value = NULL, field_group=NULL){
+      get_density_from_subset <- function(data_subset=NULL, xlim=NULL){
+        d_group            <- with(data_subset , density(value_of_interest, from = xlim[1], to = xlim[2], na.rm=TRUE))
+        d_group            <- normalize_density(d_group)
+        return(d_group)
+      }
+      data_temp                   <- data_subset
+      data_temp$value_of_interest <- data_temp[[field_value]]
+      data_temp$group_of_interest <- data_temp[[field_group]]
+      xlim = range(data_temp$value_of_interest, na.rm=TRUE)
+      groups      <- sort(unique(data_temp$group_of_interest))
+      d_group <- get_density_from_subset( data_subset=data_temp, xlim = xlim )
+      densities <- list("All decoys" = d_group)
+      for (group in groups){
+        data_group <- subset(data_temp, (group_of_interest == group))
+        if (sum(!is.na(data_group$value_of_interest)) < 2){
+          next
+        }
+        densities[[group]] <- get_density_from_subset( data_subset=data_group, xlim = xlim )
+      }
+      return(densities)
+    }
+    # x/y limits covering every density in both lists; y always includes 0
+    get_limits <- function(densities_a = NULL, densities_b = NULL){
+      xlim = c()
+      ylim = c(0)
+      for (single_density in c(densities_a, densities_b)){
+        xlim=range(c(xlim, single_density$x))
+        ylim=range(c(ylim, single_density$y))
+      }
+      return(list(xlim=xlim, ylim=ylim))
+    }
+    # Draws the "All decoys" density and then the per-group lines.
+    # var_value, include_peak_dots and dataset_name are accepted but unused.
+    plot_distributions <- function(data_groups = NULL, xlim=NULL, ylim=NULL, densities = NULL, field_group= NULL, field_value = "value", xlab_modifier = "", var_value= NULL, include_peak_dots=TRUE, dataset_name = NULL, ...){
+      data_groups$group_of_interest <- data_groups[[field_group]]
+      data_groups$value_of_interest <- data_groups[[field_value]]
+      FIXED_LWD=3
+      groups <- sort(unique(data_groups$group_of_interest))
+      n      <- length(groups)
+      # One legend row per group: colour, line type (cycling through 1..6), width
+      df_leg <- data.frame(stringsAsFactors = FALSE,
+                           list(leg = groups,
+                                col = rainbow_with_fixed_intensity(n = n, goal_intensity_0_1 = 0.4),
+                                lty = rep(1:6, length.out=n),
+                                lwd = rep(FIXED_LWD , n)) )
+      d <- densities[["All decoys"]]
+      xlab = sprintf("Precursor Mass Discrepancy%s (ppm)", xlab_modifier)
+      ylab = "Density"
+      if (!include_text){
+        xlab=""
+        ylab=""
+      }
+      plot(d, lwd=FIXED_LWD * scale, main=main, xlab=xlab, ylab=ylab, xlim=xlim, ylim=ylim, xaxt="n", yaxt="n")
+      for (local_group in groups){
+        data_subset <- subset(data_groups, group_of_interest == local_group)
+        data_info   <- subset(df_leg     , leg               == local_group)
+        d_local     <- densities[[local_group]]
+        # Only groups with more than 100 rows (and an available density) are drawn
+        if (nrow(data_subset) > 100 && !is.null(d_local)){
+          lines(d_local, col=data_info$col[1], lty=data_info$lty[1], lwd=data_info$lwd[1] * scale)
+        }
+      }
+      abline(v=0, h=0, lwd=scale)
+      leg <- list(title = "Peptide length (aa)",
+                  leg = c("All decoys"     , df_leg$leg),
+                  col = c(col2hex("black") , df_leg$col),
+                  lty = c(1                , df_leg$lty),
+                  lwd = c(FIXED_LWD        , df_leg$lwd)
+      )
+      if (include_text){
+        legend_object = Legend_Object$new(leg, scale)
+        legend_object$show()
+      }
+      box(lwd=scale) #box around plot area
+    }
+    # Main body
+    data_mod <- add_group_peptide_length_special()
+    data_mod <- subset(data_mod, PMD_FDR_decoy==1)
+    densities_a <- get_densities(data_subset = data_mod, field_value = "value"     , field_group = "group_peptide_length_special")
+    densities_b <- get_densities(data_subset = data_mod, field_value = "value_norm", field_group = "group_peptide_length_special")
+    data_processor <- data_processors[[1]]
+    dataset_name <- data_processor$info$collection_name()
+    limits <- get_limits(densities_a, densities_b)
+    xlim   <- limits$xlim
+    ylim   <- limits$ylim
+    if (show_norm){
+      plot_distributions(data_groups = data_mod, densities=densities_b, field_value = "value_norm", xlab_modifier = " - normalized", field_group = "group_peptide_length_special", dataset_name=dataset_name, xlim=xlim, ylim=ylim)
+    } else {
+      plot_distributions(data_groups = data_mod, densities=densities_a, field_value = "value"     , xlab_modifier = ""             , field_group = "group_peptide_length_special", dataset_name=dataset_name, xlim=xlim, ylim=ylim)
+    }
+  }
+)
+###############################################################################
+#            Class: Plot_Bad_CI
+###############################################################################
+# For the "bad_long" training class: cuts the PMD ("value") range into
+# `breaks` regions and draws, per region, a box from the lower to the upper
+# bound of credible_interval(count in region, total count), with a blue
+# reference line at 1/breaks.
+# `scale`, `include_text`, `main`, `plot_title` and `data_processors` are not
+# declared here; presumably inherited from Plot_Image -- TODO confirm.
+Plot_Bad_CI = setRefClass("Plot_Bad_CI",
+  contains = "Plot_Image",
+  fields = list(breaks = "numeric",   # passed to cut(); also used for the 1/breaks line
+                ylim   = "numeric"))  # y-limits; any length other than 2 means "derive from the boxes"
+Plot_Bad_CI$methods(
+  # Stores breaks and the optional y-limits (NULL becomes a zero-length
+  # vector because the field is numeric), sets the title, defers to the parent.
+  initialize = function(p_breaks=20, p_ylim=NULL, ...){
+    ylim   <<- if (is.null(p_ylim)) numeric(0) else p_ylim
+    breaks <<- p_breaks
+    plot_title <<- "Credible Intervals for proportion within range - bad"
+    callSuper(...)
+  },
+  # The data processor this plot reads from (the first one)
+  data_processor = function(){
+    return(data_processors[[1]])
+  },
+  # Number of decoy rows.
+  # NOTE(review): plot_image() selects group_training_class == "bad_long"
+  # rather than PMD_FDR_decoy == 1 -- confirm that both describe the same rows.
+  get_n = function(){
+    data_processor()$data_groups$ensure()
+    return(nrow(subset(data_processor()$data_groups$df, (PMD_FDR_decoy == 1))))
+  },
+  # Draws one credible-interval box per region
+  plot_image = function(){
+    data_processor()$data_groups$ensure()
+    data_groups <- data_processor()$data_groups$df
+    data_decoy <- subset(data_groups, data_groups$group_training_class == "bad_long")
+    data_decoy$region <- cut(x = data_decoy$value, breaks = breaks)
+    N = nrow(data_decoy)
+    # Builds an aggregation function returning one end of the interval
+    # (1 = lower, 2 = upper) for the rows that fall into a region
+    ci_bound <- function(end_index){
+      function(x){
+        ci <- credible_interval(length(x), N, precision = 0.001, alpha=0.05)
+        return(ci[end_index])
+      }
+    }
+    xleft   <- aggregate(value~region, data=data_decoy, FUN=min)
+    xright  <- aggregate(value~region, data=data_decoy, FUN=max)
+    ytop    <- aggregate(value~region, data=data_decoy, FUN=ci_bound(2))
+    ybottom <- aggregate(value~region, data=data_decoy, FUN=ci_bound(1))
+    xleft   <- rename_column(xleft  , "value", "xleft"  )
+    xright  <- rename_column(xright , "value", "xright" )
+    ytop    <- rename_column(ytop   , "value", "ytop"   )
+    ybottom <- rename_column(ybottom, "value", "ybottom")
+    # All four tables share only the "region" column, which merge() joins on
+    boxes <- merge(merge(xleft, xright), merge(ytop, ybottom))
+    xlab <- "Precursor Mass Discrepancy (ppm)"
+    ylab <- "Proportion of PSMs\nin subgroup"
+    xlim=range(data_decoy$value, na.rm = TRUE)
+    get_ylim(boxes=boxes)   # fills the ylim field when no valid limits are stored
+    if (!include_text){
+      xlab=""
+      ylab=""
+    }
+    # Empty frame first; the x/y arguments are placeholders because xlim/ylim
+    # fix the axes
+    plot(x=c(-10,10), y=c(0,1), type="n", ylim=ylim, xlim=xlim, xlab=xlab, ylab=ylab, main=main, xaxt="n", yaxt="n")
+    with(boxes, rect(xleft=xleft, xright=xright, ytop=ytop, ybottom=ybottom, lwd=scale))
+    abline(h=1/breaks, col="blue", lwd=scale)
+  },
+  # Sets the ylim field to the range of 0 and all box bounds unless it already
+  # holds exactly two values. The result is stored, so later drawings reuse it.
+  get_ylim = function(boxes=NULL){
+    if (length(ylim) != 2){
+      ylim <<- range(c(0,boxes$ytop, boxes$ybottom))
+    }
+  }
+)
+###############################################################################
+#            Class: Plot_Selective_Loss
+###############################################################################
+# For score thresholds 1..100 and each group_decoy_proteins category: the
+# fraction of PSMs whose input score passes the threshold but whose PMD-based
+# confidence, 100 * (1 - i_fdr), falls below it.
+# `scale`, `main`, `plot_title` and `data_processors` are not declared here;
+# presumably inherited from Plot_Image -- TODO confirm.
+Plot_Selective_Loss = setRefClass("Plot_Selective_Loss",
+  contains = "Plot_Image",
+  fields = list())
+Plot_Selective_Loss$methods(
+  # Sets the title and defers to the parent initializer
+  initialize = function( ...){
+    plot_title <<- "PMD-FDR Selectively removes Bad Hits"
+    callSuper(...)
+  },
+  # The data processor this plot reads from (the first one)
+  data_processor = function(){
+    return(data_processors[[1]])
+  },
+  # Number of rows in the i_fdr table
+  get_n = function(){
+    data_processor()$i_fdr$ensure()
+    data_subset <- data_processor()$i_fdr$df
+    return(nrow(data_subset))
+  },
+  # Draws one loss-rate line per category, plus reference lines and a legend
+  plot_image = function(){
+    # For one threshold: per category, Freq = rows passing the score threshold
+    # AND lost by the new confidence, n = rows passing the score threshold,
+    # rate_of_loss = Freq / n. Returns zero rows when no such combination exists.
+    samples_lost_by_threshold <- function(updated_i_fdr=NULL, score_threshold=NULL){
+      tbl <- with(updated_i_fdr,
+                  table(PMD_FDR_input_score >= score_threshold,
+                        new_confidence < score_threshold,
+                        group_decoy_proteins))
+      df <- data.frame(tbl)   # columns Var1 (passes score), Var2 (lost), group, Freq
+      df_n <- aggregate(Freq~group_decoy_proteins+Var1, data=df, FUN=sum)
+      df_n <- rename_column(df_n, name_before = "Freq", "n")
+      df <- merge(df, df_n)
+      df$rate_of_loss <- with(df, Freq/n)
+      df <- subset(df, (Var1==TRUE) & (Var2==TRUE))
+      df <- df[,c("group_decoy_proteins", "rate_of_loss", "n", "Freq")]
+      if (nrow(df) > 0){
+        df$score_threshold <- score_threshold
+      }
+      return(df)
+    }
+    # Stacks the per-threshold tables. Empty tables are dropped first: they lack
+    # the score_threshold column and would make rbind() fail.
+    get_loss_record <- function(updated_i_fdr=NULL, score_thresholds=NULL){
+      pieces <- lapply(score_thresholds, function(score_threshold){
+        samples_lost_by_threshold(updated_i_fdr, score_threshold)
+      })
+      pieces <- Filter(function(piece) nrow(piece) > 0, pieces)
+      if (length(pieces) == 0){
+        return(data.frame())
+      }
+      return(do.call(rbind, pieces))
+    }
+    # Main code
+    # Make sure the table is computed, as get_n() does before reading it
+    data_processor()$i_fdr$ensure()
+    updated_i_fdr                <- data_processor()$i_fdr$df
+    updated_i_fdr$new_confidence <- with(updated_i_fdr, 100 * (1-i_fdr))
+    loss_record <- get_loss_record(updated_i_fdr=updated_i_fdr, score_thresholds = 1:100)
+    if (nrow(loss_record) == 0){
+      stop("No PSMs were lost at any score threshold; nothing to plot")
+    }
+    xlim <- with(loss_record, range(score_threshold))
+    ylim <- c(0,1)
+    xlab <- "Fixed Confidence threshold (PeptideShaker score)"
+    ylab <- "Rate of PSM disqualification from PMD-FDR"
+    lwd  <- 4
+    plot(x=xlim, y=ylim, type="n", main=main, xlab=xlab, ylab=ylab)
+    groups <- sort(unique(loss_record$group_decoy_proteins))
+    n_g    <- length(groups)
+    cols <- rainbow_with_fixed_intensity(n=n_g, goal_intensity_0_1 = 0.5, alpha = 1)
+    ltys <- rep(1:6, length.out=n_g)
+    leg     <- list(leg=groups, col=cols, lty=ltys, lwd=lwd, title="Species/Category")
+    for (i in seq_len(n_g)){
+      lines(rate_of_loss~score_threshold, data=subset(loss_record, group_decoy_proteins==leg$leg[i]), col=leg$col[i], lwd=leg$lwd * scale, lty=leg$lty[i])
+    }
+    abline(h=0, v=100, lwd=scale)
+    abline(h=c(0.1, 0.8), col="gray", lwd=scale)
+    legend_object <- Legend_Object$new(leg, scale)
+    legend_object$show()
+  }
+)
+###############################################################################
+#            Class: Plot_Selective_Loss_for_TOC
+###############################################################################
+# Variant of the selective-loss plot with configurable axis labels, legend
+# position/border and display names for the categories; the title is written
+# inside the plot region at (title_x, title_y).
+# `scale`, `main`, `plot_title` and `data_processors` are not declared here;
+# presumably inherited from Plot_Image -- TODO confirm.
+Plot_Selective_Loss_for_TOC = setRefClass("Plot_Selective_Loss_for_TOC",
+  contains = "Plot_Image",
+  fields = list(xlab="character",
+                ylab="character",
+                title_x="numeric",              # position of the in-plot title
+                title_y="numeric",
+                legend_border="logical",        # TRUE: bty "o"; FALSE: bty "n"
+                legend_x = "numeric",           # passed to the legend as x / y
+                legend_y = "numeric",
+                legend_title="character",       # declared but never set in this class
+                legend_location = "character",  # declared but never set in this class
+                name_contaminant = "character", # display text replacing "contaminant"
+                name_decoy = "character",       # display text replacing "decoy"
+                name_human = "character",       # display text replacing "human"
+                name_pyro = "character"))       # display text replacing "pyrococcus"
+Plot_Selective_Loss_for_TOC$methods(
+  # Sets the title, defers to the parent initializer, then fills in the
+  # defaults for labels, title/legend positions and category display names.
+  initialize = function( ...){
+    plot_title <<- "PMD-FDR selectively removes bad hits"
+    callSuper(...)
+    xlab <<- "Confidence threshold (PeptideShaker)"
+    ylab <<- "PMD Disqualification Rate"
+    legend_border    <<- FALSE
+    title_x          <<- 50
+    title_y          <<- 0.9
+    legend_x         <<- 10
+    legend_y         <<- 0.75
+    name_contaminant <<- "signal - contaminant"
+    name_decoy       <<- "decoy - reversed"
+    name_human       <<- "decoy - human"
+    name_pyro        <<- "signal - pyrococcus"
+  },
+  # The data processor this plot reads from (the first one)
+  data_processor = function(){
+    return(data_processors[[1]])
+  },
+  # Number of rows in the i_fdr table
+  get_n = function(){
+    data_processor()$i_fdr$ensure()
+    data_subset <- data_processor()$i_fdr$df
+    return(nrow(data_subset))
+  },
+  # Draws one loss-rate line per category, reference lines, the legend and the
+  # in-plot title
+  plot_image = function(){
+    # For one threshold: per category, Freq = rows passing the score threshold
+    # AND lost by the new confidence, n = rows passing the score threshold,
+    # rate_of_loss = Freq / n. Returns zero rows when no such combination exists.
+    samples_lost_by_threshold <- function(updated_i_fdr=NULL, score_threshold=NULL){
+      tbl <- with(updated_i_fdr,
+                  table(PMD_FDR_input_score >= score_threshold,
+                        new_confidence < score_threshold,
+                        group_decoy_proteins))
+      df <- data.frame(tbl)   # columns Var1 (passes score), Var2 (lost), group, Freq
+      df_n <- aggregate(Freq~group_decoy_proteins+Var1, data=df, FUN=sum)
+      df_n <- rename_column(df_n, name_before = "Freq", "n")
+      df <- merge(df, df_n)
+      df$rate_of_loss <- with(df, Freq/n)
+      df <- subset(df, (Var1==TRUE) & (Var2==TRUE))
+      df <- df[,c("group_decoy_proteins", "rate_of_loss", "n", "Freq")]
+      if (nrow(df) > 0){
+        df$score_threshold <- score_threshold
+      }
+      return(df)
+    }
+    # Stacks the per-threshold tables. Empty tables are dropped first: they lack
+    # the score_threshold column and would make rbind() fail.
+    get_loss_record <- function(updated_i_fdr=NULL, score_thresholds=NULL){
+      pieces <- lapply(score_thresholds, function(score_threshold){
+        samples_lost_by_threshold(updated_i_fdr, score_threshold)
+      })
+      pieces <- Filter(function(piece) nrow(piece) > 0, pieces)
+      if (length(pieces) == 0){
+        return(data.frame())
+      }
+      return(do.call(rbind, pieces))
+    }
+    # Category names -> legend text. The substitutions run in this fixed order;
+    # "decoy" must be replaced before "human" because name_human itself
+    # contains the word "decoy".
+    convert_groups <- function(groups=NULL){
+      new_groups <- groups
+      new_groups <- gsub(pattern = "contaminant", replacement = name_contaminant, x = new_groups)
+      new_groups <- gsub(pattern = "decoy"      , replacement = name_decoy      , x = new_groups)
+      new_groups <- gsub(pattern = "human"      , replacement = name_human      , x = new_groups)
+      new_groups <- gsub(pattern = "pyrococcus" , replacement = name_pyro       , x = new_groups)
+      return(new_groups)
+    }
+    # Main code
+    # Make sure the table is computed, as get_n() does before reading it
+    data_processor()$i_fdr$ensure()
+    updated_i_fdr                <- data_processor()$i_fdr$df
+    updated_i_fdr$new_confidence <- with(updated_i_fdr, 100 * (1-i_fdr))
+    loss_record <- get_loss_record(updated_i_fdr=updated_i_fdr, score_thresholds = 1:100)
+    if (nrow(loss_record) == 0){
+      stop("No PSMs were lost at any score threshold; nothing to plot")
+    }
+    xlim <- with(loss_record, range(score_threshold))
+    ylim <- c(0,1)
+    lwd  <- 4
+    plot(x=xlim, y=ylim, type="n", main=main, xlab=xlab, ylab=ylab)
+    groups <- sort(unique(loss_record$group_decoy_proteins))
+    n_g    <- length(groups)
+    cols <- rainbow_with_fixed_intensity(n=n_g, goal_intensity_0_1 = 0.5, alpha = 1)
+    ltys <- rep(1:6, length.out=n_g)
+    bty  <- if (legend_border) "o" else "n"
+    # leg holds the display text; var_name keeps the original category values
+    # used to select each line's data
+    leg     <- list(leg=convert_groups(groups), var_name=groups, col=cols, lty=ltys, lwd=lwd, bty=bty, x=legend_x, y=legend_y)
+    for (i in seq_len(n_g)){
+      lines(rate_of_loss~score_threshold, data=subset(loss_record, group_decoy_proteins==leg$var_name[i]), col=leg$col[i], lwd=leg$lwd * scale, lty=leg$lty[i])
+    }
+    abline(h=0, v=100, lwd=scale)
+    abline(h=c(0.1, 0.8), col="gray", lwd=scale)
+    legend_object <- Legend_Object$new(leg, scale)
+    legend_object$show()
+    text(x=title_x, y=title_y, labels = plot_title)
+  }
+)
+###############################################################################
+#            Class: Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR
+###############################################################################
+# For PSMs flagged is_in_1percent: sorts them by input score, bins them into
+# groups of roughly 100, and plots the mean i-FDR (in percent) per bin against
+# the mean score, with a box of +/- 2 standard errors spanning the bin's
+# score range.
+# `scale`, `include_text`, `main`, `plot_title` and `data_processors` are not
+# declared here; presumably inherited from Plot_Image -- TODO confirm.
+Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR = setRefClass("Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR",
+  contains = "Plot_Image",
+  fields = list())
+Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR$methods(
+  # Sets the title and defers to the parent initializer
+  initialize = function( ...){
+    plot_title <<- "Precursor Mass Discrepance i-FDR for 1% Target-Decoy FDR PSMs"
+    callSuper(...)
+  },
+  # The data processor this plot reads from (the first one)
+  data_processor = function(){
+    return(data_processors[[1]])
+  },
+  # Number of i_fdr rows with is_in_1percent == TRUE, or 0 when the 1% data
+  # is not available
+  get_n = function(){
+    data_processor()$i_fdr$ensure()
+    if (one_percent_calculation_exists()){
+      i_fdr <- data_processor()$i_fdr$df
+      data_subset <- subset(i_fdr, is_in_1percent==TRUE)
+      n <- nrow(data_subset)
+    } else {
+      n <- 0
+    }
+    return (n)
+  },
+  # Prints the diagnostic tables, then draws the per-bin means and boxes with
+  # the line y = 100 - x as reference. Stops when the 1% data is missing.
+  plot_image = function(){
+    if (one_percent_calculation_exists()){
+      i_fdr        <- get_modified_fdr()
+      report_good_discrepancies(i_fdr)
+      data_TD_good <- get_data_TD_good(i_fdr)
+      mean_results <- get_mean_results(data_TD_good)
+      boxes        <- mean_results
+      boxes        <- rename_columns(df = boxes,
+                                     names_before = c("min_conf", "max_conf", "lower"  , "upper"),
+                                     names_after  = c("xleft"   , "xright"  , "ybottom", "ytop" ))
+      xlim <- range(boxes[,c("xleft", "xright")])
+      ylim <- range(boxes[,c("ybottom", "ytop")])
+      xlab = "Confidence Score (Peptide Shaker)"
+      ylab = "Mean PMD i-FDR"
+      if (!include_text){
+        xlab=""
+        ylab=""
+      }
+      plot(mean_i_fdr~mean_conf, data=mean_results, xlim=xlim, ylim=ylim, xlab=xlab, ylab=ylab, main=main, xaxt="n", yaxt="n", cex=scale, lwd=scale)
+      with(boxes, rect(xleft = xleft, ybottom = ybottom, xright = xright, ytop = ytop, lwd=scale))
+      abline(b=-1, a=100, lwd=4*scale, col="dark gray")   # y = 100 - x
+      abline(h=0, v=100, lwd=1*scale)
+    } else {
+      stop(sprintf("Dataset '%s' does not include 1%% FDR data", data_processor()$info$collection_name()))
+    }
+  },
+  # Per conf_group: mean, sd and count of i_fdr; mean, min and max of the input
+  # score; plus se, lower = mean - 2*se and upper = mean + 2*se.
+  # NOTE(review): se divides by sqrt(n - 1) rather than sqrt(n), and is
+  # undefined for a group of size 1 -- confirm this is intended.
+  get_mean_results = function(data_TD_good = NULL){
+    # Aggregates one column per conf_group and renames the result column
+    summarise_by_group <- function(column, fun, new_name){
+      result <- aggregate(as.formula(sprintf("%s~conf_group", column)), data=data_TD_good, FUN=fun)
+      rename_column(result, column, new_name)
+    }
+    mean_results <-                     summarise_by_group("i_fdr"              , mean  , "mean_i_fdr")
+    mean_results <- merge(mean_results, summarise_by_group("i_fdr"              , sd    , "sd_i_fdr"))
+    mean_results <- merge(mean_results, summarise_by_group("i_fdr"              , length, "n"))
+    mean_results <- merge(mean_results, summarise_by_group("PMD_FDR_input_score", mean  , "mean_conf"))
+    mean_results <- merge(mean_results, summarise_by_group("PMD_FDR_input_score", min   , "min_conf"))
+    mean_results <- merge(mean_results, summarise_by_group("PMD_FDR_input_score", max   , "max_conf"))
+    mean_results$se    <- with(mean_results, sd_i_fdr / sqrt(n - 1))
+    mean_results$lower <- with(mean_results, mean_i_fdr - 2*se)
+    mean_results$upper <- with(mean_results, mean_i_fdr + 2*se)
+    return(mean_results)
+  },
+  # Rows with TD_good == TRUE, ordered by input score, with
+  #   conf_group : floor(n/100) equal-width bins over the row positions
+  #   i_fdr      : rescaled to percent
+  # With fewer than 200 rows floor(n/100) is below 2, which cut() rejects;
+  # all rows are then placed in a single group instead.
+  get_data_TD_good = function(i_fdr=NULL){
+    data_TD_good <- subset(i_fdr, TD_good==TRUE)
+    data_TD_good <- data_TD_good[order(data_TD_good$PMD_FDR_input_score),]
+    n <- nrow(data_TD_good)
+    n_bins <- floor(n/100)
+    if (n_bins >= 2){
+      data_TD_good$conf_group <- cut(seq_len(n), breaks=n_bins)
+    } else {
+      data_TD_good$conf_group <- factor(rep("all", n))
+    }
+    data_TD_good$i_fdr <- 100 * data_TD_good$i_fdr
+    return(data_TD_good)
+  },
+  # i_fdr table with three logical flags added:
+  #   PMD_good  : i_fdr below 0.01
+  #   TD_good   : is_in_1percent is TRUE
+  #   conf_good : input score equals 100
+  get_modified_fdr = function(){
+    # Make sure the table is computed, as get_n() does before reading it
+    data_processor()$i_fdr$ensure()
+    i_fdr <- data_processor()$i_fdr$df
+    i_fdr$PMD_good  <- i_fdr$i_fdr < 0.01
+    i_fdr$TD_good   <- i_fdr$is_in_1percent == TRUE
+    i_fdr$conf_good <- i_fdr$PMD_FDR_input_score == 100
+    return(i_fdr)
+  },
+  # TRUE when the raw 1% data object reports that it exists
+  one_percent_calculation_exists = function(){
+    data_processor()$raw_1_percent$ensure()
+    return(data_processor()$raw_1_percent$exists())
+  },
+  # Prints TD_good x PMD_good contingency tables for non-decoy PSMs: all
+  # scores, score == 100, 99 <= score < 100, and 90 <= score < 99.
+  # (The 99..100 table used to be printed twice by two identical statements.)
+  report_good_discrepancies = function(i_fdr=NULL){
+    with(subset(i_fdr,                                        (PMD_FDR_decoy == 0)), print(table(TD_good, PMD_good)))
+    with(subset(i_fdr, (PMD_FDR_input_score==100)                    & (PMD_FDR_decoy == 0)), print(table(TD_good, PMD_good)))
+    with(subset(i_fdr, (PMD_FDR_input_score>= 99) & (PMD_FDR_input_score<100) & (PMD_FDR_decoy == 0)), print(table(TD_good, PMD_good)))
+    with(subset(i_fdr, (PMD_FDR_input_score>= 90) & (PMD_FDR_input_score< 99) & (PMD_FDR_decoy == 0)), print(table(TD_good, PMD_good)))
+  }
+)
+###############################################################################
+#            Class: Plot_Density_PMD_by_Score
+###############################################################################
+# Plot class (extends Plot_Image) that draws density curves of a PMD value, one curve per confidence-score group.
+# Field show_norm selects what plot_image() draws: the "value_norm" densities when TRUE, the "value" densities when FALSE.
+Plot_Density_PMD_by_Score = setRefClass("Plot_Density_PMD_by_Score",
+contains = "Plot_Image",
+fields = list(show_norm = "logical"))
+Plot_Density_PMD_by_Score$methods(
+# Stores p_show_norm, sets the plot title, then forwards the remaining arguments to the parent initializer.
+initialize = function(p_show_norm=FALSE, ...){
+show_norm <<- p_show_norm
+plot_title <<- "PMD distribution, by Confidence ranges"
+callSuper(...)
+},
+# Returns the first entry of data_processors (not declared in this class - presumably an inherited field; verify).
+data_processor = function(){
+return(data_processors[[1]])
+},
+# Returns the number of rows in the processor's data_groups table.
+get_n = function(){
+return(nrow(data_processor()$data_groups$df))
+#data_subset <- data_collection$i_fdr
+#return(nrow(data_subset))
+},
+# Returns a copy of the data_groups table with relabelled score groups.
+# Param: var_value - name of the column copied into "value_of_interest"
+get_modified_data_groups = function(var_value = NULL){
+# Note: Filters out used_to_find_middle
+# Note: Creates "value_of_interest" field
+# Note: Remakes "group_decoy_input_score" field
+data_new                   <- data_processor()$data_groups$df
+data_new                   <- subset(data_new, !used_to_find_middle )
+data_new$value_of_interest <- data_new[, var_value]
+# Consecutive pairs of cutoff_points define the groups; a repeated value (100, 0) yields an exact-match group
+cutoff_points <- c(100, 100, 95, 80, 50, 0, 0)
+n <- length(cutoff_points)
+uppers <- cutoff_points[-n]
+lowers <- cutoff_points[-1]
+# Groups are assigned in order, so the final exact-0 group overwrites the label given to score 0 by the 0-50 range
+for (i in 1:(n-1)){
+upper <- uppers[i]
+lower <- lowers[i]
+if (lower==upper){
+idx <- with(data_new, which(                        (PMD_FDR_input_score == upper) & (PMD_FDR_decoy == 0)))
+cat_name <- sprintf("%d", upper)
+} else {
+idx <- with(data_new, which((PMD_FDR_input_score >= lower) & (PMD_FDR_input_score <  upper) & (PMD_FDR_decoy == 0)))
+cat_name <- sprintf("%02d - %2d", lower, upper)
+}
+# Only non-decoy rows (PMD_FDR_decoy == 0) are relabelled; other rows keep their existing group label
+data_new$group_decoy_input_score[idx] <- cat_name
+}
+return(data_new)
+},
+# Draws the density plot. Densities are computed for both "value" and "value_norm" so that the
+# axis limits cover both; show_norm then decides which set is drawn.
+# NOTE(review): include_text, scale and main are not defined in this block - presumably inherited from Plot_Image; verify.
+plot_image = function(){
+# Support functions for plot_density_PMD_by_score()
+# Returns a list holding var_value, the group names, and one normalized density per group (keyed by group name).
+# NOTE(review): the data_subset argument is overwritten on the first line of the body, so whatever the caller
+#               passes (including the peptide-length filter applied below) has no effect - confirm this is intended.
+get_densities <- function(data_subset = NULL, var_value = NULL){
+# Support functions for get_densities()
+# New version
+# Main body of get_densities()
+data_subset <- get_modified_data_groups(var_value=var_value)
+#data_subset$value_of_interest <- data_subset[,var_value]
+# All groups share the same from/to range so their densities are evaluated over the same interval
+from <- min(data_subset$value_of_interest, na.rm=TRUE)
+to   <- max(data_subset$value_of_interest, na.rm=TRUE)
+# NOTE(review): xlim is assigned here but not used within get_densities()
+xlim = range(data_subset$value_of_interest, na.rm=TRUE)
+groups   <- sort(unique(data_subset$group_decoy_input_score), decreasing = TRUE)
+n_groups <- length(groups)
+densities <- list(var_value = var_value, groups=groups)
+for (i in 1:n_groups){
+group <- groups[i]
+data_group_single  <- subset(data_subset, (group_decoy_input_score == group))
+d_group            <- with(data_group_single , density(value_of_interest, from = from, to = to, na.rm = TRUE))
+d_group            <- normalize_density(d_group)
+densities[[group]] <- d_group
+}
+return(densities)
+}
+# Returns the x range covering 0 and every group's density in both density lists.
+get_xlim <- function(densities_a = NULL, densities_b = NULL){
+groups <- densities_a$groups
+xlim <- 0
+for (group in groups){
+xlim <- range(xlim, densities_a[[group]]$x, densities_b[[group]]$x)
+}
+return(xlim)
+}
+# Returns the y range covering 0 and every group's density in both density lists.
+get_ylim <- function(densities_a = NULL, densities_b = NULL){
+groups <- densities_a$groups
+ylim <- 0
+for (group in groups){
+ylim <- range(ylim, densities_a[[group]]$y, densities_b[[group]]$y)
+}
+return(ylim)
+}
+# Draws one line per group from the supplied densities, plus a vertical line at 0 and (optionally) a legend.
+plot_distributions <- function(densities = NULL, var_value= NULL,include_peak_dots=TRUE, xlab_modifier="", xlim=NULL, ylim=NULL, ...){
+data_groups <- get_modified_data_groups(var_value=var_value)
+groups      <- sort(unique(data_groups$group_decoy_input_score))
+n_groups    <- length(groups)
+# Legend order: "100" first, then the intermediate ranges in decreasing order followed by "0", then decoys
+groups_std   <- setdiff(groups, c("100", "decoy", "0") )
+groups_std   <- sort(groups_std, decreasing = TRUE)
+groups_std   <- c(groups_std, "0")
+n_std        <- length(groups_std)
+cols <- rainbow_with_fixed_intensity(n = n_std, goal_intensity_0_1 = 0.5, alpha=0.5)
+leg <- list(group = c("100"             , groups_std   , "decoy"                           ),
+leg   = c("100"             , groups_std   , "All Decoys"                      ),
+col   = c(col2hex("black")  , cols         , col2hex("purple", col_alpha = 0.5)),
+lwd   = c(4                 , rep(2, n_std), 4                                 ),
+title = "Confidence Score")
+xlab = sprintf("Precursor Mass Discrepancy%s (ppm)",
+xlab_modifier)
+ylab = "Density"
+if (!include_text){
+xlab=""
+ylab=""
+}
+# type="n" sets up the axes without drawing; the curves are added by lines() below
+plot( x=xlim, y=ylim, col=leg$col[1], lwd=leg$lwd[1] * scale, main=main, xlab=xlab, ylab=ylab, xaxt="n", yaxt="n", cex=scale, type="n")#, lty=leg.lty[1], ...)
+include_peak_dots = FALSE # BUGBUG: Disabling this for now.  Need to move this to class parameter
+for (i in 1:length(leg$group)){
+group <- leg$group[i]
+d     <- densities[[group]]
+lines(d, col=leg$col[i], lwd=leg$lwd[i] * scale)
+if (include_peak_dots){
+x=d$x[which.max(d$y)]
+y=max(d$y)
+points(x=c(x,x), y=c(0,y), pch=19, col=leg$col[i], cex=scale)
+}
+}
+abline(v=0, lwd=scale)
+if (include_text){
+legend_object = Legend_Object$new(leg, scale)
+legend_object$show()
+}
+}
+# Main body for plot_density_PMD_by_score()
+data_groups <- data_processor()$data_groups$df
+data_subset_a <- subset(data_groups  , used_to_find_middle == FALSE)
+data_subset_b <- subset(data_subset_a, PMD_FDR_peptide_length > 11)
+densities_a <- get_densities(data_subset = data_subset_a, var_value = "value")
+densities_b <- get_densities(data_subset = data_subset_b, var_value = "value_norm")
+# Limits span both density sets
+xlim=get_xlim(densities_a, densities_b)
+ylim=get_ylim(densities_a, densities_b)
+# NOTE(review): dataset_name is assigned but not used later in this method
+dataset_name <- data_processor()$info$collection_name()
+if (show_norm){
+plot_distributions(densities=densities_b, var_value = "value_norm", xlab_modifier = " - normalized", xlim=xlim, ylim=ylim)
+} else {
+plot_distributions(densities=densities_a, var_value = "value"     , xlab_modifier = ""             , xlim=xlim, ylim=ylim)
+}
+}
+)
+###############################################################################
+#            Class: Plot_Dataset_Description
+###############################################################################
+# Composite plot (extends Plot_Multiple_Images): a grid 2 images wide and 3 images tall describing one data set.
+Plot_Dataset_Description <- setRefClass("Plot_Dataset_Description",
+  contains = "Plot_Multiple_Images",
+  fields   = list(ylim_time_invariance = "numeric"))
+Plot_Dataset_Description$methods(
+  # Builds the six sub-plots and hands them to the parent initializer.
+  #   Row 1: time invariance of "value" | time invariance of "value_norm"
+  #   Row 2: density by score (raw)     | density by AA length (p_show_norm = FALSE)
+  #   Row 3: density by score (norm)    | density by AA length (p_show_norm = TRUE)
+  # Params: p_data_processors      - forwarded to every sub-plot
+  #         p_include_text / _main - forwarded to every sub-plot and to the parent
+  #         p_ylim_time_invariance - y limits for the two time-invariance plots
+  initialize = function(p_data_processors = NULL,
+                        p_include_text=TRUE,
+                        p_include_main=FALSE,
+                        p_ylim_time_invariance = c(-4,4), ...){
+    time_invariance_raw  <- Plot_Time_Invariance_Alt$new(p_data_processors   = p_data_processors,
+                                                         p_include_text      = p_include_text,
+                                                         p_include_main      = p_include_main,
+                                                         p_training_class    = "good_testing",
+                                                         p_field_of_interest = "value",
+                                                         p_ylim              = p_ylim_time_invariance)
+    time_invariance_norm <- Plot_Time_Invariance_Alt$new(p_data_processors   = p_data_processors,
+                                                         p_include_text      = p_include_text,
+                                                         p_include_main      = p_include_main,
+                                                         p_training_class    = "good_testing",
+                                                         p_field_of_interest = "value_norm",
+                                                         p_ylim              = p_ylim_time_invariance)
+    density_score_raw    <- Plot_Density_PMD_by_Score$new(p_data_processors = p_data_processors,
+                                                          p_show_norm       = FALSE,
+                                                          p_include_text    = p_include_text,
+                                                          p_include_main    = p_include_main)
+    density_length_raw   <- Plot_Density_PMD_and_Norm_Decoy_by_AA_Length$new(p_data_processors = p_data_processors,
+                                                                             p_show_norm       = FALSE,
+                                                                             p_include_text    = p_include_text,
+                                                                             p_include_main    = p_include_main)
+    density_score_norm   <- Plot_Density_PMD_by_Score$new(p_data_processors = p_data_processors,
+                                                          p_show_norm       = TRUE,
+                                                          p_include_text    = p_include_text,
+                                                          p_include_main    = p_include_main)
+    density_length_norm  <- Plot_Density_PMD_and_Norm_Decoy_by_AA_Length$new(p_data_processors = p_data_processors,
+                                                                             p_show_norm       = TRUE,
+                                                                             p_include_text    = p_include_text,
+                                                                             p_include_main    = p_include_main)
+    # Images are listed row by row
+    callSuper(p_n_images_wide = 2,
+              p_n_images_tall = 3,
+              p_include_text  = p_include_text,
+              p_include_main  = p_include_main,
+              p_image_list    = list(time_invariance_raw, time_invariance_norm,
+                                     density_score_raw  , density_length_raw,
+                                     density_score_norm , density_length_norm), ...)
+  }
+)
+###############################################################################
+#            Class: Plots_for_Paper
+###############################################################################
+# Driver class that builds the four data processors used for the publication and
+# produces the figures / printed statistics derived from them.
+Plots_for_Paper <- setRefClass("Plots_for_Paper",
+  fields = list(data_processor_a = "Data_Processor",
+                data_processor_b = "Data_Processor",
+                data_processor_c = "Data_Processor",
+                data_processor_d = "Data_Processor",
+                include_text     = "logical",
+                include_main     = "logical",
+                mai              = "numeric"))
+Plots_for_Paper$methods(
+  # Creates one Data_Processor per data set (a-d), each with its own info object.
+  initialize = function(){
+    data_processor_a <<- Data_Processor$new(p_info = Data_Object_Info_737_two_step$new())
+    data_processor_b <<- Data_Processor$new(p_info = Data_Object_Info_737_combined$new())
+    data_processor_c <<- Data_Processor$new(p_info = Data_Object_Pyrococcus_tr$new())
+    data_processor_d <<- Data_Processor$new(p_info = Data_Object_Mouse_Mutations$new())
+  },
+  # Prints the table/figure statistics and draws every figure (D, C, B, A, 8, 7, ..., 3).
+  # Params: include_main - forwarded to each figure as p_include_main
+  #         finalize     - when TRUE figures are drawn at scale 2 (figures 3 and 6 at scale 4), otherwise at scale 1
+  create_plots_for_paper = function(include_main=TRUE, finalize=TRUE){
+    # finalize is a scalar, so plain if/else replaces the vectorized ifelse()
+    scale_standard <- if (finalize) 2 else 1
+    scale_large    <- if (finalize) 4 else 1
+    print_table_4_data()
+    print_figure_2_data()
+    plot_figure_D(p_scale=scale_standard, p_include_main = include_main)
+    plot_figure_C(p_scale=scale_standard, p_include_main = include_main)
+    plot_figure_B(p_scale=scale_standard, p_include_main = include_main)
+    plot_figure_A(p_scale=scale_standard, p_include_main = include_main)
+    plot_figure_8(p_scale=scale_standard, p_include_main = include_main)
+    plot_figure_7(p_scale=scale_standard, p_include_main = include_main)
+    plot_figure_6(p_scale=scale_large   , p_include_main = include_main)
+    plot_figure_5(p_scale=scale_standard, p_include_main = include_main)
+    plot_figure_4(p_scale=scale_standard, p_include_main = include_main)
+    plot_figure_3(p_scale=scale_large   , p_include_main = include_main)
+  },
+  # Prints the grouping statistics computed from data set a.
+  print_figure_2_data = function(){
+    print(create_stats_for_grouping_figure(list(data_processor_a)))
+  },
+  # Prints the confidence-vs-PMD comparison for data sets a and c.
+  print_table_4_data = function(){
+    report_ranges_of_comparisons(processors = list(data_processor_a))
+    report_ranges_of_comparisons(processors = list(data_processor_c))
+  },
+  # Figure 3: PMD density comparison for data set a (small window).
+  # NOTE(review): this call names the argument p_data_processor (no trailing "s") while every other
+  #               figure uses p_data_processors - confirm the constructor accepts this spelling.
+  plot_figure_3 = function(p_scale=NULL, p_include_main=NULL){
+    plot_object <- Plot_Compare_PMD_and_Norm_Density$new(p_data_processor  = list(data_processor_a),
+                                                         p_show_norm       = FALSE,
+                                                         p_include_text    = TRUE,
+                                                         p_include_main    = p_include_main,
+                                                         p_display_n_psms  = FALSE)
+    plot_object$plot_image_in_small_window(p_scale=p_scale)
+  },
+  # Figure 4: time invariance before and after, data set a (large window).
+  plot_figure_4 = function(p_scale=NULL, p_include_main=NULL){
+    plot_object <- Plot_Time_Invariance_Alt_Before_and_After$new(p_data_processors = list(data_processor_a),
+                                                                 p_include_text=TRUE,
+                                                                 p_include_main=p_include_main,
+                                                                 p_ylim = c(-4,4))
+    plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale)
+  },
+  # Figure 5: PMD density by AA length, data set a (large window).
+  plot_figure_5 = function(p_scale=NULL, p_include_main=NULL){
+    plot_object <- Plot_Density_PMD_and_Norm_Decoy_by_AA_Length$new(p_data_processors = list(data_processor_a),
+                                                                    p_include_text=TRUE,
+                                                                    p_include_main=p_include_main)
+    plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale)
+  },
+  # Figure 6: Plot_Bad_CI for data set a (small window).
+  plot_figure_6 = function(p_scale=NULL, p_include_main=NULL){
+    plot_object <- Plot_Bad_CI$new(p_data_processors = list(data_processor_a),
+                                   p_include_text=TRUE,
+                                   p_include_main=p_include_main)
+    plot_object$plot_image_in_small_window(p_scale=p_scale)
+  },
+  # Figure 7: iFDR vs confidence vs 1% TD-FDR comparison, data set a (large window).
+  plot_figure_7 = function(p_scale=NULL, p_include_main=NULL){
+    plot_object <- Plot_Compare_iFDR_Confidence_1_Percent_TD_FDR$new(p_data_processors = list(data_processor_a),
+                                                                     p_include_text=TRUE,
+                                                                     p_include_main=p_include_main)
+    plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale)
+  },
+  # Figure 8: Plot_Selective_Loss for data set c (large window).
+  plot_figure_8 = function(p_scale=NULL, p_include_main=NULL){
+    plot_object <- Plot_Selective_Loss$new(p_data_processors = list(data_processor_c),
+                                           p_include_text=TRUE,
+                                           p_include_main=p_include_main)
+    plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale)
+  },
+  # Figures A-D: data set description grids for data sets a, b, c and d respectively (large window).
+  plot_figure_A = function(p_scale=NULL, p_include_main=NULL){
+    plot_object <- Plot_Dataset_Description$new(p_data_processors=list(data_processor_a),
+                                                p_include_text=TRUE,
+                                                p_include_main=p_include_main,
+                                                p_ylim_time_invariance=c(-4,4) )
+    plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale)
+  },
+  plot_figure_B = function(p_scale=NULL, p_include_main=NULL){
+    plot_object <- Plot_Dataset_Description$new(p_data_processors=list(data_processor_b),
+                                                p_include_text=TRUE,
+                                                p_include_main=p_include_main,
+                                                p_ylim_time_invariance=c(-4,4) )
+    plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale)
+  },
+  plot_figure_C = function(p_scale=NULL, p_include_main=NULL){
+    plot_object <- Plot_Dataset_Description$new(p_data_processors=list(data_processor_c),
+                                                p_include_text=TRUE,
+                                                p_include_main=p_include_main,
+                                                p_ylim_time_invariance=c(-4,4) )
+    plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale)
+  },
+  plot_figure_D = function(p_scale=NULL, p_include_main=NULL){
+    plot_object <- Plot_Dataset_Description$new(p_data_processors=list(data_processor_d),
+                                                p_include_text=TRUE,
+                                                p_include_main=p_include_main,
+                                                p_ylim_time_invariance=c(-4,4) )
+    plot_object$plot_image_in_large_window(window_height=4, p_scale=p_scale)
+  },
+  # Aggregates i_fdr for the first processor at three levels (training class, its prefix before
+  # the first "_", and everything) and attaches r/g/b values interpolated from the (capped) mean FDR.
+  # Param:   processors - list whose first element is the processor to summarize
+  # Returns: data.frame with one row per group: group, fdr, n, PMD_FDR_peptide_length, r, g, b
+  create_stats_for_grouping_figure = function(processors=NULL){
+    processor <- processors[[1]]
+    processor$i_fdr$ensure()
+    aug_i_fdr                      <- processor$i_fdr$df
+    aug_i_fdr$group_good_bad_other <- gsub("_.*", "", aug_i_fdr$group_training_class)
+    aug_i_fdr$group_null           <- "all"
+    # Mean i_fdr, count and mean peptide length per level of grouping_var_name
+    create_agg_fdr_stats <- function(i_fdr=NULL, grouping_var_name = NULL){
+      formula_fdr <- as.formula(sprintf("%s~%s", "i_fdr", grouping_var_name))
+      formula_len <- as.formula(sprintf("%s~%s", "PMD_FDR_peptide_length", grouping_var_name))
+      # The formula is passed positionally: the first argument of aggregate()'s formula method
+      # is named "formula" before R 4.2 and "x" from R 4.2 on, so naming it is not portable.
+      agg_fdr <- aggregate(formula_fdr, data=i_fdr, FUN=mean)
+      agg_n   <- aggregate(formula_fdr, data=i_fdr, FUN=length)
+      agg_len <- aggregate(formula_len, data=i_fdr, FUN=mean)
+      agg_fdr <- rename_columns(df = agg_fdr,
+                                names_before = c(grouping_var_name, "i_fdr"),
+                                names_after  = c("group"          , "fdr"))
+      agg_n   <- rename_columns(df = agg_n,
+                                names_before = c(grouping_var_name, "i_fdr"),
+                                names_after  = c("group"          , "n"))
+      agg_len <- rename_columns(df = agg_len,
+                                names_before = c(grouping_var_name),
+                                names_after  = c("group"          ))
+      agg <- merge(agg_fdr, agg_n)
+      agg <- merge(agg    , agg_len)
+      return(agg)
+    }
+    agg_detail  <- create_agg_fdr_stats(i_fdr = aug_i_fdr, grouping_var_name = "group_training_class")
+    agg_grouped <- create_agg_fdr_stats(i_fdr = aug_i_fdr, grouping_var_name = "group_good_bad_other")
+    agg_all     <- create_agg_fdr_stats(i_fdr = aug_i_fdr, grouping_var_name = "group_null")
+    agg <- rbind(agg_detail, agg_grouped)
+    agg <- rbind(agg, agg_all)
+    # Cap the mean FDR at 1 so the color interpolation below stays between its two endpoints
+    agg$fdr <- ifelse(agg$fdr < 1, agg$fdr, 1)
+    linear_combo <- function(x=NULL, a0=NULL, a1=NULL){
+      result <- (a0 * (1-x) + a1 * x)
+      return(result)
+    }
+    agg$r <- linear_combo(agg$fdr, a0=197, a1= 47)
+    agg$g <- linear_combo(agg$fdr, a0= 90, a1= 85)
+    agg$b <- linear_combo(agg$fdr, a0= 17, a1=151)
+    return(agg)
+  },
+  # Prints, for several confidence-score ranges of the first processor's i_fdr table, the number of
+  # PSMs, the mean confidence score, 100 minus that mean, and 100 times the mean i_fdr.
+  # Param: processors - list whose first element is the processor to report on
+  report_ranges_of_comparisons = function(processors=NULL){
+    # Selects the rows for one score range and prints its summary:
+    #   max_conf NULL       -> score == min_conf
+    #   include_max TRUE    -> min_conf <= score <= max_conf
+    #   otherwise           -> min_conf <= score <  max_conf
+    report_comparison_of_Confidence_and_PMD = function (i_fdr = NULL, min_conf=NULL, max_conf=NULL, include_max=FALSE){
+      report_PMD_confidence_comparison_from_subset = function(data_subset=NULL, group_name=NULL){
+        print(group_name)
+        print(sprintf("    Number of PSMs: %d", nrow(data_subset)))
+        mean_confidence <- mean(data_subset$PMD_FDR_input_score)
+        print(sprintf("    Mean Confidence Score: %3.1f", mean_confidence))
+        print(sprintf("    PeptideShaker g-FDR: %3.1f", 100-mean_confidence))
+        mean_PMD_FDR = mean(data_subset$i_fdr)
+        print(sprintf("    PMD g-FDR: %3.1f", 100*mean_PMD_FDR))
+        #col <- col2hex("black", 0.2)
+        #plot(data_subset$i_fdr, pch=".", cex=2, col=col)
+        #abline(h=0)
+      }
+      if (is.null(max_conf)) {
+        data_subset <- subset(i_fdr, PMD_FDR_input_score == min_conf)
+        group_name <- sprintf("Group %d", min_conf)
+      } else if (include_max){
+        data_subset <- subset(i_fdr, (PMD_FDR_input_score >= min_conf) & (PMD_FDR_input_score <= max_conf))
+        group_name <- sprintf("Group %d through %d", min_conf, max_conf)
+      } else {
+        data_subset <- subset(i_fdr, (PMD_FDR_input_score >= min_conf) & (PMD_FDR_input_score < max_conf))
+        group_name <- sprintf("Group %d to %d", min_conf, max_conf)
+      }
+      report_PMD_confidence_comparison_from_subset(data_subset=data_subset, group_name=group_name)
+    }
+    processor <- processors[[1]]
+    processor$i_fdr$ensure()
+    i_fdr <- processor$i_fdr$df
+    info  <- processor$info
+    print(sprintf("PMD and Confidence comparison for -- %s",  info$collection_name()))
+    report_comparison_of_Confidence_and_PMD(i_fdr = i_fdr, min_conf=100, max_conf=NULL, include_max=TRUE)
+    report_comparison_of_Confidence_and_PMD(i_fdr = i_fdr, min_conf= 99, max_conf=100 , include_max=FALSE)
+    report_comparison_of_Confidence_and_PMD(i_fdr = i_fdr, min_conf= 90, max_conf= 99 , include_max=FALSE)
+    report_comparison_of_Confidence_and_PMD(i_fdr = i_fdr, min_conf=  0, max_conf=100 , include_max=TRUE)
+  }
+)
+###############################################################################
+# C - 021 - PMD-FDR Wrapper - functions.R                                     #
+#                                                                             #
+# Creates the necessary structure to convert the PMD-FDR code into one that   #
+# can run as a batch file                                                     #
+#                                                                             #
+###############################################################################
+###############################################################################
+#            Class: ModuleArgParser_PMD_FDR
+###############################################################################
+# Argument parser (extends ArgParser) declaring the command-line options of the PMD-FDR module.
+ModuleArgParser_PMD_FDR <- setRefClass("ModuleArgParser_PMD_FDR",
+  contains = c("ArgParser"),
+  fields   = list(args = "character"))
+ModuleArgParser_PMD_FDR$methods(
+  # Initializes the parent parser with the module description, then registers each option.
+  # --psm_report is the only option registered without a default value.
+  initialize = function(description = "Computes individual and global FDR using Precursor Mass Discrepancy (PMD-FDR)", ...){
+    callSuper(description=description, ...)
+    local_add_argument("--psm_report",
+                       help="full name and path to the PSM report")
+    local_add_argument("--psm_report_1_percent",
+                       default = "",
+                       help="full name and path to the PSM report for 1% FDR")
+    local_add_argument("--output_i_fdr",
+                       default = "",
+                       help="full name and path to the i-FDR output file ")
+    local_add_argument("--output_g_fdr",
+                       default = "",
+                       help="full name and path to the g-FDR output file ")
+    local_add_argument("--output_densities",
+                       default = "",
+                       help="full name and path to the densities output file ")
+    #local_add_argument("--score_field_name"    , default = ""                  , help="name of score field (in R format)")
+    local_add_argument("--input_file_type",
+                       default = "PMD_FDR_input_file",
+                       help="type of input file (currently supports: PSM_Report)")
+  }
+)
+###############################################################################
+#            Class: Data_Object_Parser
+###############################################################################
+# Data object (extends Data_Object) that runs the module's argument parser and keeps the result in parsing_results.
+Data_Object_Parser <- setRefClass("Data_Object_Parser",
+  contains = c("Data_Object"),
+  fields   = list(parser          = "ModuleArgParser_PMD_FDR",
+                  args            = "character",
+                  parsing_results = "list"))
+Data_Object_Parser$methods(
+  # Runs the parent initializer and records this object's class name.
+  initialize = function(){
+    callSuper()
+    class_name <<- "Data_Object_Parser"
+  },
+  verify = function(){
+    # Nothing to do here - parser handles verification during load
+  },
+  # Parses the stored arguments; NULL is handed to the parser when no arguments have been set.
+  m_load_data = function(){
+    arguments_to_parse <- if (length(args) == 0) NULL else args
+    parsing_results <<- parser$parse_arguments(arguments_to_parse)
+  },
+  # Stores p_args and marks this object dirty.
+  set_args = function(p_args=NULL){
+    # This is primarily used for testing.  In operation arguments will be passed automatically (through use of commandArgs)
+    args <<- p_args
+    set_dirty(TRUE)
+  }
+)
+###############################################################################
+#            Class: Data_Object_Info_Parser
+###############################################################################
+# Info object (extends Data_Object_Info) whose file names and output paths come from the
+# parsed command-line arguments rather than from hard-coded settings.
+Data_Object_Info_Parser <- setRefClass("Data_Object_Info_Parser",
+  contains = c("Data_Object_Info"),
+  fields   = list(output_i_fdr     = "character",
+                  output_g_fdr     = "character",
+                  output_densities = "character"))
+Data_Object_Info_Parser$methods(
+  # Runs the parent initializer and records this object's class name.
+  initialize = function(){
+    callSuper()
+    class_name <<- "Data_Object_Info_Parser"
+  },
+  # Checks that every expected parameter is present in the parser's results and that the
+  # mandatory ones (psm_report, input_file_type) are not empty.
+  # NOTE(review): assumes the parser has been loaded before verify() runs - confirm in Data_Object.
+  verify = function(){
+    check_field_exists = function(field_name=NULL, check_empty = TRUE){
+      # [[ ]] yields NULL for a missing name; single-bracket indexing always returns a
+      # one-element list, which made the original NULL test impossible to fail
+      field_value <- get_parser()$parsing_results[[field_name]]
+      checkTrue(! is.null(field_value),
+                msg = sprintf("Parameter %s was not passed to PMD_FDR", field_name))
+      if (check_empty){
+        has_content <- (length(field_value) > 0) &&
+                       !is.na(field_value[1]) &&
+                       nzchar(as.character(field_value[1]))
+        checkTrue(has_content,
+                  msg = sprintf("Parameter %s was passed to PMD_FDR but is empty", field_name))
+      }
+    }
+    # Check parameters passed in
+    check_field_exists("psm_report")
+    check_field_exists("psm_report_1_percent", check_empty = FALSE)
+    check_field_exists("output_i_fdr"        , check_empty = FALSE)
+    check_field_exists("output_g_fdr"        , check_empty = FALSE)
+    check_field_exists("output_densities"    , check_empty = FALSE)
+    #check_field_exists("score_field_name")
+    check_field_exists("input_file_type")
+  },
+  # Copies the parsed argument values into this object's fields.
+  m_load_data = function(){
+    parsing_results <- get_parser()$parsing_results
+    data_file_name               <<- as.character(parsing_results["psm_report"])
+    data_file_name_1_percent_FDR <<- as.character(parsing_results["psm_report_1_percent"])
+    # NOTE(review): indexing with "" matches no element - confirm data_path_name is unused downstream
+    data_path_name               <<- as.character(parsing_results[""])
+    #experiment_name              <<- data_file_name
+    #designation                  <<- ""
+    output_i_fdr                 <<- as.character(parsing_results["output_i_fdr"])
+    output_g_fdr                 <<- as.character(parsing_results["output_g_fdr"])
+    output_densities             <<- as.character(parsing_results["output_densities"])
+    input_file_type              <<- as.character(parsing_results["input_file_type"])
+    #score_field_name             <<- as.character(parsing_results["score_field_name"])
+  },
+  # Registers the parser object under the "parser" key of parents.
+  set_parser = function(parser){
+    parents[["parser"]] <<- parser
+  },
+  # Returns the registered parser, looked up through verified_element_of_list().
+  get_parser = function(){
+    return(verified_element_of_list(parents, "parser", "Data_Object_Info_Parser$parents"))
+  },
+  # Returns the PSM report file name; stops when it has length zero.
+  file_path = function(){
+    result <- data_file_name # Now assumes that full path is provided
+    if (length(result) == 0){
+      stop("Unable to validate file path - file name is missing")
+    }
+    return(result)
+  },
+  # Returns the 1 percent FDR file name, or "" when it has length zero.
+  file_path_1_percent_FDR = function(){
+    local_file_name <- get_data_file_name_1_percent_FDR()
+    if (length(local_file_name) == 0){
+      result <- ""
+    } else {
+      result <- local_file_name # path name is no longer relevant
+    }
+    # Continue even if file name is missing - not all analyses have a 1 percent FDR file; this is managed downstream
+    # if (length(result) == 0){
+    #   stop("Unable to validate file path - one or both of path name and file name (of 1 percent FDR file) are missing")
+    # }
+    return(result)
+  },
+  # Accessor for data_file_name_1_percent_FDR.
+  get_data_file_name_1_percent_FDR = function(){
+    return(data_file_name_1_percent_FDR)
+  },
+  # This info object has no collection name; always returns "".
+  collection_name = function(){
+    result <- ""
+    return(result)
+  }
+)
+###############################################################################
+#            Class: Processor_PMD_FDR_for_Galaxy
+# Purpose: Wrapper on tools from Project 019 to enable a Galaxy-based interface
+###############################################################################
+# Holds one instance of every data object in the PMD-FDR pipeline, wires their
+# parent/child dependencies together and writes the three result tables.
+Processor_PMD_FDR_for_Galaxy <- setRefClass("Processor_PMD_FDR_for_Galaxy",
+  fields = list(parser         = "Data_Object_Parser",
+                info           = "Data_Object_Info_Parser",
+                raw_data       = "Data_Object_Raw_Data",
+                raw_1_percent  = "Data_Object_Raw_1_Percent",
+                data_converter = "Data_Object_Data_Converter",
+                data_groups    = "Data_Object_Groupings",
+                densities      = "Data_Object_Densities",
+                alpha          = "Data_Object_Alpha",
+                i_fdr          = "Data_Object_Individual_FDR"))
+Processor_PMD_FDR_for_Galaxy$methods(
+  initialize = function(){
+    # This initialization defines all of the dependencies between the various components
+    # (Unfortunately, inheriting from Data_Processor leads to issues - I had to reimplement it here with a change to "info")
+    # Each component is first told about the objects it reads from (set_*), then registered
+    # as a child of each of those objects (append_child).
+
+    # info <- parser
+    info$set_parser(parser)
+    parser$append_child(info)
+
+    # raw_data <- info
+    raw_data$set_info(info)
+    info$append_child(raw_data)
+
+    # raw_1_percent <- info
+    raw_1_percent$set_info(info)
+    info$append_child(raw_1_percent)
+
+    # data_converter <- info, raw_data
+    data_converter$set_info(info)
+    data_converter$set_raw_data(raw_data)
+    info$append_child(data_converter)
+    raw_data$append_child(data_converter)
+
+    # data_groups <- info, data_converter, raw_1_percent
+    data_groups$set_info(info)
+    data_groups$set_data_converter(data_converter)
+    data_groups$set_raw_1_percent(raw_1_percent)
+    info$append_child(data_groups)
+    data_converter$append_child(data_groups)
+    raw_1_percent$append_child(data_groups)
+
+    # densities <- data_groups
+    densities$set_data_groups(data_groups)
+    data_groups$append_child(densities)
+
+    # alpha <- densities
+    alpha$set_densities(densities)
+    densities$append_child(alpha)
+
+    # i_fdr <- data_groups, densities, alpha
+    i_fdr$set_data_groups(data_groups)
+    i_fdr$set_densities(densities)
+    i_fdr$set_alpha(alpha)
+    data_groups$append_child(i_fdr)
+    densities$append_child(i_fdr)
+    alpha$append_child(i_fdr)
+  },
+  # Brings i_fdr up to date, then saves the densities, alpha and i_fdr tables to the
+  # output paths held by info.
+  compute = function(){
+    #i_fdr is currently the lowest level object - it ultimately depends on everything else.
+    i_fdr$ensure() # All pieces on which i_fdr depends are automatically verified and computed (through their verify() and ensure())
+    save_standard_df(x = densities$df, file_path = info$output_densities)
+    save_standard_df(x = alpha$df    , file_path = info$output_g_fdr)
+    save_standard_df(x = i_fdr$df    , file_path = info$output_i_fdr)
+  }
+)
+###############################################################################
+# D - 021 - PMD-FDR Main.R                                                    #
+#                                                                             #
+# File Description: Contains the base code that interprets the parameters     #
+#                   and computes i-FDR and g-FDR for a mass spec project      #
+#                                                                             #
+###############################################################################
+# Script entry point: collect the command-line parameters, hand them to the
+# processor's parser, then run the full computation.
+argv <- commandArgs(trailingOnly = TRUE) # Saves the parameters (command code)
+processor <- Processor_PMD_FDR_for_Galaxy$new()
+processor$parser$set_args(p_args = argv)
+processor$compute()

Mercurial > repos > galaxyp > pmd_fdr

comparison PMD_FDR_package_for_Galaxy.R @ 0:5cc0c32d05a2 draft