diff utils.R @ 2:472dc85ce7c5 draft
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/recetox_aplcms commit 506df2aef355b3791567283e1a175914f06b405a
| author   | recetox                         |
| -------- | ------------------------------- |
| date     | Mon, 13 Feb 2023 10:28:35 +0000 |
| parents  | 067a308223e3                    |
| children | c69a12bfc2fb                    |
--- a/utils.R   Thu Jun 16 10:27:28 2022 +0000
+++ b/utils.R   Mon Feb 13 10:28:35 2023 +0000
@@ -1,149 +1,125 @@
 library(recetox.aplcms)

-align_features <- function(sample_names, ...) {
-  aligned <- feature.align(...)
-  feature_names <- seq_len(nrow(aligned$pk.times))
-
-  list(
-    mz_tolerance = as.numeric(aligned$mz.tol),
-    rt_tolerance = as.numeric(aligned$chr.tol),
-    rt_crosstab = as_feature_crosstab(feature_names, sample_names, aligned$pk.times),
-    int_crosstab = as_feature_crosstab(feature_names, sample_names, aligned$aligned.ftrs)
-  )
+get_env_sample_name <- function() {
+  sample_name <- Sys.getenv("SAMPLE_NAME", unset = NA)
+  if (nchar(sample_name) == 0) {
+    sample_name <- NA
+  }
+  if (is.na(sample_name)) {
+    message("The mzML file does not contain run ID.")
+  }
+  return(sample_name)
 }

-get_sample_name <- function(filename) {
-  tools::file_path_sans_ext(basename(filename))
-}
-
-as_feature_crosstab <- function(feature_names, sample_names, data) {
-  colnames(data) <- c("mz", "rt", "mz_min", "mz_max", sample_names)
-  rownames(data) <- feature_names
-  as.data.frame(data)
+save_sample_name <- function(df, sample_name) {
+  attr(df, "sample_name") <- sample_name
+  return(df)
 }

-as_feature_sample_table <- function(rt_crosstab, int_crosstab) {
-  feature_names <- rownames(rt_crosstab)
-  sample_names <- colnames(rt_crosstab)[-(1:4)]
-
-  feature_table <- data.frame(
-    feature = feature_names,
-    mz = rt_crosstab[, 1],
-    rt = rt_crosstab[, 2]
-  )
-
-  # series of conversions to produce a table type from data.frame
-  rt_crosstab <- as.table(as.matrix(rt_crosstab[, -(1:4)]))
-  int_crosstab <- as.table(as.matrix(int_crosstab[, -(1:4)]))
-
-  crosstab_axes <- list(feature = feature_names, sample = sample_names)
-  dimnames(rt_crosstab) <- dimnames(int_crosstab) <- crosstab_axes
-
-  x <- as.data.frame(rt_crosstab, responseName = "sample_rt")
-  y <- as.data.frame(int_crosstab, responseName = "sample_intensity")
-
-  data <- merge(x, y, by = c("feature", "sample"))
-  data <- merge(feature_table, data, by = "feature")
-  data
+load_sample_name <- function(df) {
+  sample_name <- attr(df, "sample_name")
+  if (is.null(sample_name)) {
+    return(NA)
+  } else {
+    return(sample_name)
+  }
 }

-load_features <- function(files) {
-  files_list <- sort_samples_by_acquisition_number(files)
-  features <- lapply(files_list, arrow::read_parquet)
-  features <- lapply(features, as.matrix)
+save_data_as_parquet_file <- function(data, filename) {
+  arrow::write_parquet(data, filename)
+}
+
+load_data_from_parquet_file <- function(filename) {
+  return(arrow::read_parquet(filename))
+}
+
+load_parquet_collection <- function(files) {
+  features <- lapply(files, arrow::read_parquet)
+  features <- lapply(features, tibble::as_tibble)
   return(features)
 }

-save_data_as_parquet_files <- function(data, subdir) {
-  dir.create(subdir)
-  for (i in 0:(length(data) - 1)) {
-    filename <- file.path(subdir, paste0(subdir, "_features_", i, ".parquet"))
-    arrow::write_parquet(as.data.frame(data[i + 1]), filename)
-  }
+save_parquet_collection <- function(table, sample_names, subdir) {
+  dir.create(subdir)
+  for (i in seq_len(length(table$feature_tables))) {
+    filename <- file.path(subdir, paste0(subdir, "_", sample_names[i], ".parquet"))
+    feature_table <- as.data.frame(table$feature_tables[[i]])
+    feature_table <- save_sample_name(feature_table, sample_names[i])
+    arrow::write_parquet(feature_table, filename)
+  }
+}
+
+sort_by_sample_name <- function(tables, sample_names) {
+  return(tables[order(sample_names)])
 }

-save_aligned_features <- function(aligned, rt_file, int_file, tol_file) {
-  arrow::write_parquet(as.data.frame(aligned$rt_crosstab), rt_file)
-  arrow::write_parquet(as.data.frame(aligned$int_crosstab), int_file)
-
-  mz_tolerance <- c(aligned$mz_tolerance)
-  rt_tolerance <- c(aligned$rt_tolerance)
-  arrow::write_parquet(data.frame(mz_tolerance, rt_tolerance), tol_file)
+save_tolerances <- function(table, tol_file) {
+  mz_tolerance <- c(table$mz_tol_relative)
+  rt_tolerance <- c(table$rt_tol_relative)
+  arrow::write_parquet(data.frame(mz_tolerance, rt_tolerance), tol_file)
 }

-load_aligned_features <- function(rt_file, int_file, tol_file) {
-  rt_cross_table <- arrow::read_parquet(rt_file)
-  int_cross_table <- arrow::read_parquet(int_file)
-  tolerances_table <- arrow::read_parquet(tol_file)
+get_mz_tol <- function(tolerances) {
+  return(tolerances$mz_tolerance)
+}

-  result <- list()
-  result$mz_tolerance <- tolerances_table$mz_tolerance
-  result$rt_tolerance <- tolerances_table$rt_tolerance
-  result$rt_crosstab <- rt_cross_table
-  result$int_crosstab <- int_cross_table
-  return(result)
+get_rt_tol <- function(tolerances) {
+  return(tolerances$rt_tolerance)
+}
+
+save_aligned_features <- function(aligned_features, metadata_file, rt_file, intensity_file) {
+  save_data_as_parquet_file(aligned_features$metadata, metadata_file)
+  save_data_as_parquet_file(aligned_features$rt, rt_file)
+  save_data_as_parquet_file(aligned_features$intensity, intensity_file)
 }

-recover_signals <- function(cluster,
-                            filenames,
-                            extracted,
-                            corrected,
-                            aligned,
-                            mz_tol = 1e-05,
-                            mz_range = NA,
-                            rt_range = NA,
-                            use_observed_range = TRUE,
-                            min_bandwidth = NA,
-                            max_bandwidth = NA,
-                            recover_min_count = 3) {
-  if (!is(cluster, "cluster")) {
-    cluster <- parallel::makeCluster(cluster)
-    on.exit(parallel::stopCluster(cluster))
-  }
-
-  clusterExport(cluster, c("extracted", "corrected", "aligned", "recover.weaker"))
-  clusterEvalQ(cluster, library("splines"))
+select_table_with_sample_name <- function(tables, sample_name) {
+  sample_names <- lapply(tables, load_sample_name)
+  index <- which(sample_names == sample_name)
+  if (length(index) > 0) {
+    return(tables[[index]])
+  } else {
+    stop(sprintf("Mismatch - sample name '%s' not present in %s",
+                 sample_name, paste(sample_names, collapse = ", ")))
+  }
 }

-  recovered <- parLapply(cluster, seq_along(filenames), function(i) {
-    recover.weaker(
-      loc = i,
-      filename = filenames[[i]],
-      this.f1 = extracted[[i]],
-      this.f2 = corrected[[i]],
-      pk.times = aligned$rt_crosstab,
-      aligned.ftrs = aligned$int_crosstab,
-      orig.tol = mz_tol,
-      align.mz.tol = aligned$mz_tolerance,
-      align.chr.tol = aligned$rt_tolerance,
-      mz.range = mz_range,
-      chr.range = rt_range,
-      use.observed.range = use_observed_range,
-      bandwidth = 0.5,
-      min.bw = min_bandwidth,
-      max.bw = max_bandwidth,
-      recover.min.count = recover_min_count
-    )
-  })
+select_adjusted <- function(recovered_features) {
+  return(recovered_features$adjusted_features)
 }

-  feature_table <- aligned$rt_crosstab[, 1:4]
-  rt_crosstab <- cbind(feature_table, sapply(recovered, function(x) x$this.times))
-  int_crosstab <- cbind(feature_table, sapply(recovered, function(x) x$this.ftrs))
-
-  feature_names <- rownames(feature_table)
-  sample_names <- colnames(aligned$rt_crosstab[, -(1:4)])
-
-  list(
-    extracted_features = lapply(recovered, function(x) x$this.f1),
-    corrected_features = lapply(recovered, function(x) x$this.f2),
-    rt_crosstab = as_feature_crosstab(feature_names, sample_names, rt_crosstab),
-    int_crosstab = as_feature_crosstab(feature_names, sample_names, int_crosstab)
-  )
+known_table_columns <- function() {
+  c("chemical_formula", "HMDB_ID", "KEGG_compound_ID", "mass", "ion.type",
+    "m.z", "Number_profiles_processed", "Percent_found", "mz_min", "mz_max",
+    "RT_mean", "RT_sd", "RT_min", "RT_max", "int_mean(log)", "int_sd(log)",
+    "int_min(log)", "int_max(log)")
 }

-create_feature_sample_table <- function(features) {
-  table <- as_feature_sample_table(
-    rt_crosstab = features$rt_crosstab,
-    int_crosstab = features$int_crosstab
-  )
-  return(table)
+save_known_table <- function(table, filename) {
+  columns <- known_table_columns()
+  arrow::write_parquet(table$known_table[columns], filename)
+}
+
+read_known_table <- function(filename) {
+  arrow::read_parquet(filename, col_select = known_table_columns())
+}
+
+save_pairing <- function(table, filename) {
+  df <- table$pairing %>% as_tibble() %>% setNames(c("new", "old"))
+  arrow::write_parquet(df, filename)
 }
+
+join_tables_to_list <- function(metadata, rt_table, intensity_table) {
+  features <- new("list")
+  features$metadata <- metadata
+  features$intensity <- intensity_table
+  features$rt <- rt_table
+  return(features)
+}
+
+validate_sample_names <- function(sample_names) {
+  if ((any(is.na(sample_names))) || (length(unique(sample_names)) != length(sample_names))) {
+    stop(sprintf("Sample names absent or not unique - provided sample names: %s",
+                 paste(sample_names, collapse = ", ")))
+  }
+}
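For orientation, here is a minimal, hypothetical usage sketch of the sample-name helpers introduced in this revision (save_sample_name, load_sample_name, select_table_with_sample_name, validate_sample_names, and the Parquet wrappers). It is not part of the repository: the toy feature tables, column names, and file name are invented, and the Parquet round trip assumes arrow preserves data.frame attributes in its R metadata.

```r
# Assumes the new utils.R above has been sourced and the arrow package is installed.

# Two toy feature tables, each tagged with its sample name (columns are illustrative only).
tables <- list(
  save_sample_name(data.frame(mz = 100.05, rt = 65.2, intensity = 1e5), "sample_A"),
  save_sample_name(data.frame(mz = 200.10, rt = 120.8, intensity = 2e5), "sample_B")
)

# Passes silently; duplicated or missing names would stop() with an error message.
validate_sample_names(c("sample_A", "sample_B"))

# The sample name travels as a data.frame attribute.
load_sample_name(tables[[1]])                        # "sample_A"
table_b <- select_table_with_sample_name(tables, "sample_B")

# Round trip through Parquet (assumption: arrow keeps the attribute in its R metadata).
save_data_as_parquet_file(tables[[1]], "sample_A.parquet")
load_sample_name(load_data_from_parquet_file("sample_A.parquet"))
```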
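In the same hedged spirit, a sketch of how the aligned-feature and tolerance helpers might fit together; the column layout, the mz_tol_relative / rt_tol_relative values, and the file names are made-up stand-ins for what the real alignment step produces.

```r
# Pack per-feature metadata, retention times and intensities into the list
# layout that save_aligned_features() writes out as three Parquet files.
metadata  <- data.frame(id = c(1, 2), mz = c(100.05, 200.10), rt = c(65.2, 120.8))
rt        <- data.frame(id = c(1, 2), sample_A = c(65.1, 120.9))
intensity <- data.frame(id = c(1, 2), sample_A = c(1e5, 2e5))

aligned <- join_tables_to_list(metadata, rt, intensity)
save_aligned_features(aligned, "metadata.parquet", "rt.parquet", "intensity.parquet")

# Tolerances go out as a one-row table and come back through the getters.
save_tolerances(list(mz_tol_relative = 1e-05, rt_tol_relative = 0.3), "tolerances.parquet")
tolerances <- load_data_from_parquet_file("tolerances.parquet")
get_mz_tol(tolerances)   # 1e-05
get_rt_tol(tolerances)   # 0.3
```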