Mercurial > repos > recetox > recetox_aplcms_compute_clusters
comparison utils.R @ 7:550667ce03f1 draft
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/recetox_aplcms commit 97249a1af94ac5c387e1ede274dec5364f71cde9
| author | recetox |
|---|---|
| date | Wed, 11 Oct 2023 11:17:39 +0000 |
| parents | 149b9cc7499b |
| children | b9b19a74ac01 |
comparison
equal
deleted
inserted
replaced
| 6:149b9cc7499b | 7:550667ce03f1 |
|---|---|
| 1 library(recetox.aplcms) | 1 library(recetox.aplcms) |
| 2 | 2 |
| 3 get_env_sample_name <- function() { | 3 get_env_sample_name <- function() { |
| 4 sample_name <- Sys.getenv("SAMPLE_NAME", unset = NA) | 4 sample_name <- Sys.getenv("SAMPLE_NAME", unset = NA) |
| 5 if (nchar(sample_name) == 0) { | 5 if (nchar(sample_name) == 0) { |
| 6 sample_name <- NA | 6 sample_name <- NA |
| 7 } | 7 } |
| 8 if (is.na(sample_name)) { | 8 if (is.na(sample_name)) { |
| 9 message("The mzML file does not contain run ID.") | 9 message("The mzML file does not contain run ID.") |
| 10 } | 10 } |
| 11 return(sample_name) | 11 return(sample_name) |
| 12 } | 12 } |
| 13 | 13 |
| 14 save_sample_name <- function(df, sample_name) { | 14 save_sample_name <- function(df, sample_name) { |
| 15 attr(df, "sample_name") <- sample_name | 15 attr(df, "sample_name") <- sample_name |
| 16 return(df) | 16 return(df) |
| 17 } | 17 } |
| 18 | 18 |
| 19 restore_sample_name <- function(df) { | 19 restore_sample_name <- function(df) { |
| 20 return(df$sample_id[1]) | 20 return(df$sample_id[1]) |
| 21 } | 21 } |
| 22 | 22 |
| 23 load_sample_name <- function(df) { | 23 load_sample_name <- function(df) { |
| 24 sample_name <- attr(df, "sample_name") | 24 sample_name <- attr(df, "sample_name") |
| 25 if (is.null(sample_name)) { | 25 if (is.null(sample_name)) { |
| 26 return(NA) | 26 return(NA) |
| 27 } else { | 27 } else { |
| 28 return(sample_name) | 28 return(sample_name) |
| 29 } | 29 } |
| 30 } | 30 } |
| 31 | 31 |
| 32 save_data_as_parquet_file <- function(data, filename) { | 32 save_data_as_parquet_file <- function(data, filename) { |
| 33 arrow::write_parquet(data, filename) | 33 arrow::write_parquet(data, filename) |
| 34 } | 34 } |
| 35 | 35 |
| 36 load_data_from_parquet_file <- function(filename) { | 36 load_data_from_parquet_file <- function(filename) { |
| 37 return(arrow::read_parquet(filename)) | 37 return(arrow::read_parquet(filename)) |
| 38 } | 38 } |
| 39 | 39 |
| 40 load_parquet_collection <- function(files) { | 40 load_parquet_collection <- function(files) { |
| 41 features <- lapply(files, arrow::read_parquet) | 41 features <- lapply(files, arrow::read_parquet) |
| 42 features <- lapply(features, tibble::as_tibble) | 42 features <- lapply(features, tibble::as_tibble) |
| 43 return(features) | 43 return(features) |
| 44 } | 44 } |
| 45 | 45 |
| 46 save_parquet_collection <- function(feature_tables, sample_names, subdir) { | 46 save_parquet_collection <- function(feature_tables, sample_names, subdir) { |
| 47 dir.create(subdir) | 47 dir.create(subdir) |
| 48 for (i in seq_len(length(feature_tables))) { | 48 for (i in seq_len(length(feature_tables))) { |
| 49 filename <- file.path(subdir, paste0(sample_names[i], ".parquet")) | 49 filename <- file.path(subdir, paste0(sample_names[i], ".parquet")) |
| 50 feature_table <- as.data.frame(feature_tables[[i]]) | 50 feature_table <- as.data.frame(feature_tables[[i]]) |
| 51 feature_table <- save_sample_name(feature_table, sample_names[i]) | 51 feature_table <- save_sample_name(feature_table, sample_names[i]) |
| 52 arrow::write_parquet(feature_table, filename) | 52 arrow::write_parquet(feature_table, filename) |
| 53 } | 53 } |
| 54 } | 54 } |
| 55 | 55 |
| 56 sort_by_sample_name <- function(tables, sample_names) { | 56 sort_by_sample_name <- function(tables, sample_names) { |
| 57 return(tables[order(sample_names)]) | 57 return(tables[order(sample_names)]) |
| 58 } | 58 } |
| 59 | 59 |
| 60 save_tolerances <- function(table, tol_file) { | 60 save_tolerances <- function(table, tol_file) { |
| 61 mz_tolerance <- c(table$mz_tol_relative) | 61 mz_tolerance <- c(table$mz_tol_relative) |
| 62 rt_tolerance <- c(table$rt_tol_relative) | 62 rt_tolerance <- c(table$rt_tol_relative) |
| 63 arrow::write_parquet(data.frame(mz_tolerance, rt_tolerance), tol_file) | 63 arrow::write_parquet(data.frame(mz_tolerance, rt_tolerance), tol_file) |
| 64 } | 64 } |
| 65 | 65 |
| 66 save_aligned_features <- function(aligned_features, metadata_file, rt_file, intensity_file) { | 66 save_aligned_features <- function(aligned_features, metadata_file, rt_file, intensity_file) { |
| 67 save_data_as_parquet_file(aligned_features$metadata, metadata_file) | 67 save_data_as_parquet_file(aligned_features$metadata, metadata_file) |
| 68 save_data_as_parquet_file(aligned_features$rt, rt_file) | 68 save_data_as_parquet_file(aligned_features$rt, rt_file) |
| 69 save_data_as_parquet_file(aligned_features$intensity, intensity_file) | 69 save_data_as_parquet_file(aligned_features$intensity, intensity_file) |
| 70 } | 70 } |
| 71 | 71 |
| 72 select_table_with_sample_name <- function(tables, sample_name) { | 72 select_table_with_sample_name <- function(tables, sample_name) { |
| 73 sample_names <- lapply(tables, load_sample_name) | 73 sample_names <- lapply(tables, load_sample_name) |
| 74 index <- which(sample_names == sample_name) | 74 index <- which(sample_names == sample_name) |
| 75 if (length(index) > 0) { | 75 if (length(index) > 0) { |
| 76 return(tables[[index]]) | 76 return(tables[[index]]) |
| 77 } else { | 77 } else { |
| 78 stop(sprintf("Mismatch - sample name '%s' not present in %s", | 78 stop(sprintf( |
| 79 sample_name, paste(sample_names, collapse = ", "))) | 79 "Mismatch - sample name '%s' not present in %s", |
| 80 } | 80 sample_name, paste(sample_names, collapse = ", ") |
| 81 )) | |
| 82 } | |
| 81 } | 83 } |
| 82 | 84 |
| 83 select_adjusted <- function(recovered_features) { | 85 select_adjusted <- function(recovered_features) { |
| 84 return(recovered_features$adjusted_features) | 86 return(recovered_features$adjusted_features) |
| 85 } | 87 } |
| 86 | 88 |
| 87 known_table_columns <- function() { | 89 known_table_columns <- function() { |
| 88 c("chemical_formula", "HMDB_ID", "KEGG_compound_ID", "mass", "ion.type", | 90 c( |
| 91 "chemical_formula", "HMDB_ID", "KEGG_compound_ID", "mass", "ion.type", | |
| 89 "m.z", "Number_profiles_processed", "Percent_found", "mz_min", "mz_max", | 92 "m.z", "Number_profiles_processed", "Percent_found", "mz_min", "mz_max", |
| 90 "RT_mean", "RT_sd", "RT_min", "RT_max", "int_mean(log)", "int_sd(log)", | 93 "RT_mean", "RT_sd", "RT_min", "RT_max", "int_mean(log)", "int_sd(log)", |
| 91 "int_min(log)", "int_max(log)") | 94 "int_min(log)", "int_max(log)" |
| 95 ) | |
| 92 } | 96 } |
| 93 | 97 |
| 94 save_known_table <- function(table, filename) { | 98 save_known_table <- function(table, filename) { |
| 95 columns <- known_table_columns() | 99 columns <- known_table_columns() |
| 96 arrow::write_parquet(table$known_table[columns], filename) | 100 arrow::write_parquet(table$known_table[columns], filename) |
| 99 read_known_table <- function(filename) { | 103 read_known_table <- function(filename) { |
| 100 arrow::read_parquet(filename, col_select = known_table_columns()) | 104 arrow::read_parquet(filename, col_select = known_table_columns()) |
| 101 } | 105 } |
| 102 | 106 |
| 103 save_pairing <- function(table, filename) { | 107 save_pairing <- function(table, filename) { |
| 104 df <- table$pairing %>% as_tibble() %>% setNames(c("new", "old")) | 108 df <- table$pairing %>% |
| 109 as_tibble() %>% | |
| 110 setNames(c("new", "old")) | |
| 105 arrow::write_parquet(df, filename) | 111 arrow::write_parquet(df, filename) |
| 106 } | 112 } |
| 107 | 113 |
| 108 join_tables_to_list <- function(metadata, rt_table, intensity_table) { | 114 join_tables_to_list <- function(metadata, rt_table, intensity_table) { |
| 109 features <- new("list") | 115 features <- new("list") |
| 112 features$rt <- rt_table | 118 features$rt <- rt_table |
| 113 return(features) | 119 return(features) |
| 114 } | 120 } |
| 115 | 121 |
| 116 validate_sample_names <- function(sample_names) { | 122 validate_sample_names <- function(sample_names) { |
| 117 if ((any(is.na(sample_names))) || (length(unique(sample_names)) != length(sample_names))) { | 123 if ((any(is.na(sample_names))) || (length(unique(sample_names)) != length(sample_names))) { |
| 118 stop(sprintf("Sample names absent or not unique - provided sample names: %s", | 124 stop(sprintf( |
| 119 paste(sample_names, collapse = ", "))) | 125 "Sample names absent or not unique - provided sample names: %s", |
| 120 } | 126 paste(sample_names, collapse = ", ") |
| 127 )) | |
| 128 } | |
| 121 } | 129 } |
| 122 | 130 |
| 123 determine_sigma_ratios <- function(sigma_ratio_lim_min = NA, sigma_ratio_lim_max = NA) { | 131 determine_sigma_ratios <- function(sigma_ratio_lim_min = NA, sigma_ratio_lim_max = NA) { |
| 124 if (is.na(sigma_ratio_lim_min)) { | 132 if (is.na(sigma_ratio_lim_min)) { |
| 125 sigma_ratio_lim_min <- 0 | 133 sigma_ratio_lim_min <- 0 |
| 126 } | 134 } |
| 127 if (is.na(sigma_ratio_lim_max)) { | 135 if (is.na(sigma_ratio_lim_max)) { |
| 128 sigma_ratio_lim_max <- Inf | 136 sigma_ratio_lim_max <- Inf |
| 129 } | 137 } |
| 130 return(c(sigma_ratio_lim_min, sigma_ratio_lim_max)) | 138 return(c(sigma_ratio_lim_min, sigma_ratio_lim_max)) |
| 131 } | 139 } |
