Mercurial > repos > recetox > recetox_aplcms_recover_weaker_signals
comparison utils.R @ 2:472dc85ce7c5 draft
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/recetox_aplcms commit 506df2aef355b3791567283e1a175914f06b405a
author | recetox |
---|---|
date | Mon, 13 Feb 2023 10:28:35 +0000 |
parents | 067a308223e3 |
children | c69a12bfc2fb |
comparison
equal
deleted
inserted
replaced
1:f9fb9d8fb710 | 2:472dc85ce7c5 |
---|---|
1 library(recetox.aplcms) | 1 library(recetox.aplcms) |
2 | 2 |
3 align_features <- function(sample_names, ...) { | 3 get_env_sample_name <- function() { |
4 aligned <- feature.align(...) | 4 sample_name <- Sys.getenv("SAMPLE_NAME", unset = NA) |
5 feature_names <- seq_len(nrow(aligned$pk.times)) | 5 if (nchar(sample_name) == 0) { |
6 | 6 sample_name <- NA |
7 list( | 7 } |
8 mz_tolerance = as.numeric(aligned$mz.tol), | 8 if (is.na(sample_name)) { |
9 rt_tolerance = as.numeric(aligned$chr.tol), | 9 message("The mzML file does not contain run ID.") |
10 rt_crosstab = as_feature_crosstab(feature_names, sample_names, aligned$pk.times), | 10 } |
11 int_crosstab = as_feature_crosstab(feature_names, sample_names, aligned$aligned.ftrs) | 11 return(sample_name) |
12 ) | |
13 } | 12 } |
14 | 13 |
15 get_sample_name <- function(filename) { | 14 save_sample_name <- function(df, sample_name) { |
16 tools::file_path_sans_ext(basename(filename)) | 15 attr(df, "sample_name") <- sample_name |
16 return(df) | |
17 } | 17 } |
18 | 18 |
19 as_feature_crosstab <- function(feature_names, sample_names, data) { | 19 load_sample_name <- function(df) { |
20 colnames(data) <- c("mz", "rt", "mz_min", "mz_max", sample_names) | 20 sample_name <- attr(df, "sample_name") |
21 rownames(data) <- feature_names | 21 if (is.null(sample_name)) { |
22 as.data.frame(data) | 22 return(NA) |
23 } else { | |
24 return(sample_name) | |
25 } | |
23 } | 26 } |
24 | 27 |
25 as_feature_sample_table <- function(rt_crosstab, int_crosstab) { | 28 save_data_as_parquet_file <- function(data, filename) { |
26 feature_names <- rownames(rt_crosstab) | 29 arrow::write_parquet(data, filename) |
27 sample_names <- colnames(rt_crosstab)[- (1:4)] | |
28 | |
29 feature_table <- data.frame( | |
30 feature = feature_names, | |
31 mz = rt_crosstab[, 1], | |
32 rt = rt_crosstab[, 2] | |
33 ) | |
34 | |
35 # series of conversions to produce a table type from data.frame | |
36 rt_crosstab <- as.table(as.matrix(rt_crosstab[, - (1:4)])) | |
37 int_crosstab <- as.table(as.matrix(int_crosstab[, - (1:4)])) | |
38 | |
39 crosstab_axes <- list(feature = feature_names, sample = sample_names) | |
40 dimnames(rt_crosstab) <- dimnames(int_crosstab) <- crosstab_axes | |
41 | |
42 x <- as.data.frame(rt_crosstab, responseName = "sample_rt") | |
43 y <- as.data.frame(int_crosstab, responseName = "sample_intensity") | |
44 | |
45 data <- merge(x, y, by = c("feature", "sample")) | |
46 data <- merge(feature_table, data, by = "feature") | |
47 data | |
48 } | 30 } |
49 | 31 |
50 load_features <- function(files) { | 32 load_data_from_parquet_file <- function(filename) { |
51 files_list <- sort_samples_by_acquisition_number(files) | 33 return(arrow::read_parquet(filename)) |
52 features <- lapply(files_list, arrow::read_parquet) | 34 } |
53 features <- lapply(features, as.matrix) | 35 |
36 load_parquet_collection <- function(files) { | |
37 features <- lapply(files, arrow::read_parquet) | |
38 features <- lapply(features, tibble::as_tibble) | |
54 return(features) | 39 return(features) |
55 } | 40 } |
56 | 41 |
57 save_data_as_parquet_files <- function(data, subdir) { | 42 save_parquet_collection <- function(table, sample_names, subdir) { |
58 dir.create(subdir) | 43 dir.create(subdir) |
59 for (i in 0:(length(data) - 1)) { | 44 for (i in seq_len(length(table$feature_tables))) { |
60 filename <- file.path(subdir, paste0(subdir, "_features_", i, ".parquet")) | 45 filename <- file.path(subdir, paste0(subdir, "_", sample_names[i], ".parquet")) |
61 arrow::write_parquet(as.data.frame(data[i + 1]), filename) | 46 feature_table <- as.data.frame(table$feature_tables[[i]]) |
62 } | 47 feature_table <- save_sample_name(feature_table, sample_names[i]) |
48 arrow::write_parquet(feature_table, filename) | |
49 } | |
63 } | 50 } |
64 | 51 |
65 save_aligned_features <- function(aligned, rt_file, int_file, tol_file) { | 52 sort_by_sample_name <- function(tables, sample_names) { |
66 arrow::write_parquet(as.data.frame(aligned$rt_crosstab), rt_file) | 53 return(tables[order(sample_names)]) |
67 arrow::write_parquet(as.data.frame(aligned$int_crosstab), int_file) | |
68 | |
69 mz_tolerance <- c(aligned$mz_tolerance) | |
70 rt_tolerance <- c(aligned$rt_tolerance) | |
71 arrow::write_parquet(data.frame(mz_tolerance, rt_tolerance), tol_file) | |
72 } | 54 } |
73 | 55 |
74 load_aligned_features <- function(rt_file, int_file, tol_file) { | 56 save_tolerances <- function(table, tol_file) { |
75 rt_cross_table <- arrow::read_parquet(rt_file) | 57 mz_tolerance <- c(table$mz_tol_relative) |
76 int_cross_table <- arrow::read_parquet(int_file) | 58 rt_tolerance <- c(table$rt_tol_relative) |
77 tolerances_table <- arrow::read_parquet(tol_file) | 59 arrow::write_parquet(data.frame(mz_tolerance, rt_tolerance), tol_file) |
78 | |
79 result <- list() | |
80 result$mz_tolerance <- tolerances_table$mz_tolerance | |
81 result$rt_tolerance <- tolerances_table$rt_tolerance | |
82 result$rt_crosstab <- rt_cross_table | |
83 result$int_crosstab <- int_cross_table | |
84 return(result) | |
85 } | 60 } |
86 | 61 |
87 recover_signals <- function(cluster, | 62 get_mz_tol <- function(tolerances) { |
88 filenames, | 63 return(tolerances$mz_tolerance) |
89 extracted, | |
90 corrected, | |
91 aligned, | |
92 mz_tol = 1e-05, | |
93 mz_range = NA, | |
94 rt_range = NA, | |
95 use_observed_range = TRUE, | |
96 min_bandwidth = NA, | |
97 max_bandwidth = NA, | |
98 recover_min_count = 3) { | |
99 if (!is(cluster, "cluster")) { | |
100 cluster <- parallel::makeCluster(cluster) | |
101 on.exit(parallel::stopCluster(cluster)) | |
102 } | |
103 | |
104 clusterExport(cluster, c("extracted", "corrected", "aligned", "recover.weaker")) | |
105 clusterEvalQ(cluster, library("splines")) | |
106 | |
107 recovered <- parLapply(cluster, seq_along(filenames), function(i) { | |
108 recover.weaker( | |
109 loc = i, | |
110 filename = filenames[[i]], | |
111 this.f1 = extracted[[i]], | |
112 this.f2 = corrected[[i]], | |
113 pk.times = aligned$rt_crosstab, | |
114 aligned.ftrs = aligned$int_crosstab, | |
115 orig.tol = mz_tol, | |
116 align.mz.tol = aligned$mz_tolerance, | |
117 align.chr.tol = aligned$rt_tolerance, | |
118 mz.range = mz_range, | |
119 chr.range = rt_range, | |
120 use.observed.range = use_observed_range, | |
121 bandwidth = 0.5, | |
122 min.bw = min_bandwidth, | |
123 max.bw = max_bandwidth, | |
124 recover.min.count = recover_min_count | |
125 ) | |
126 }) | |
127 | |
128 feature_table <- aligned$rt_crosstab[, 1:4] | |
129 rt_crosstab <- cbind(feature_table, sapply(recovered, function(x) x$this.times)) | |
130 int_crosstab <- cbind(feature_table, sapply(recovered, function(x) x$this.ftrs)) | |
131 | |
132 feature_names <- rownames(feature_table) | |
133 sample_names <- colnames(aligned$rt_crosstab[, - (1:4)]) | |
134 | |
135 list( | |
136 extracted_features = lapply(recovered, function(x) x$this.f1), | |
137 corrected_features = lapply(recovered, function(x) x$this.f2), | |
138 rt_crosstab = as_feature_crosstab(feature_names, sample_names, rt_crosstab), | |
139 int_crosstab = as_feature_crosstab(feature_names, sample_names, int_crosstab) | |
140 ) | |
141 } | 64 } |
142 | 65 |
143 create_feature_sample_table <- function(features) { | 66 get_rt_tol <- function(tolerances) { |
144 table <- as_feature_sample_table( | 67 return(tolerances$rt_tolerance) |
145 rt_crosstab = features$rt_crosstab, | |
146 int_crosstab = features$int_crosstab | |
147 ) | |
148 return(table) | |
149 } | 68 } |
69 | |
70 save_aligned_features <- function(aligned_features, metadata_file, rt_file, intensity_file) { | |
71 save_data_as_parquet_file(aligned_features$metadata, metadata_file) | |
72 save_data_as_parquet_file(aligned_features$rt, rt_file) | |
73 save_data_as_parquet_file(aligned_features$intensity, intensity_file) | |
74 } | |
75 | |
76 select_table_with_sample_name <- function(tables, sample_name) { | |
77 sample_names <- lapply(tables, load_sample_name) | |
78 index <- which(sample_names == sample_name) | |
79 if (length(index) > 0) { | |
80 return(tables[[index]]) | |
81 } else { | |
82 stop(sprintf("Mismatch - sample name '%s' not present in %s", | |
83 sample_name, paste(sample_names, collapse = ", "))) | |
84 } | |
85 } | |
86 | |
87 select_adjusted <- function(recovered_features) { | |
88 return(recovered_features$adjusted_features) | |
89 } | |
90 | |
91 known_table_columns <- function() { | |
92 c("chemical_formula", "HMDB_ID", "KEGG_compound_ID", "mass", "ion.type", | |
93 "m.z", "Number_profiles_processed", "Percent_found", "mz_min", "mz_max", | |
94 "RT_mean", "RT_sd", "RT_min", "RT_max", "int_mean(log)", "int_sd(log)", | |
95 "int_min(log)", "int_max(log)") | |
96 } | |
97 | |
98 save_known_table <- function(table, filename) { | |
99 columns <- known_table_columns() | |
100 arrow::write_parquet(table$known_table[columns], filename) | |
101 } | |
102 | |
103 read_known_table <- function(filename) { | |
104 arrow::read_parquet(filename, col_select = known_table_columns()) | |
105 } | |
106 | |
107 save_pairing <- function(table, filename) { | |
108 df <- table$pairing %>% as_tibble() %>% setNames(c("new", "old")) | |
109 arrow::write_parquet(df, filename) | |
110 } | |
111 | |
112 join_tables_to_list <- function(metadata, rt_table, intensity_table) { | |
113 features <- new("list") | |
114 features$metadata <- metadata | |
115 features$intensity <- intensity_table | |
116 features$rt <- rt_table | |
117 return(features) | |
118 } | |
119 | |
120 validate_sample_names <- function(sample_names) { | |
121 if ((any(is.na(sample_names))) || (length(unique(sample_names)) != length(sample_names))) { | |
122 stop(sprintf("Sample names absent or not unique - provided sample names: %s", | |
123 paste(sample_names, collapse = ", "))) | |
124 } | |
125 } |