Mercurial > repos > metaboflow_cam > ionflow
comparison ionflow/ionflow.R @ 0:3b461dc9568b draft default tip
Uploaded
author | metaboflow_cam |
---|---|
date | Mon, 09 Aug 2021 09:41:22 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:3b461dc9568b |
---|---|
1 #' wl-07-06-2021, Mon: The fourth version: based on Jacopo's new changes in | |
2 #' 'ionflow_funcs.R' and new pipeline 'tutorial_galaxy_ionflow.R' | |
3 #' wl-08-06-2021, Tue: finalise | |
4 | |
5 ## ==== General settings ==== | |
6 rm(list = ls(all = T)) | |
7 | |
8 #' flag for command-line use or not. If false, only for debug interactively. | |
9 com_f <- T | |
10 | |
11 #' galaxy will stop even if R has warning message | |
12 options(warn = -1) #' disable R warning. Turn back: options(warn=0) | |
13 | |
14 #' ------------------------------------------------------------------------ | |
15 #' Setup R error handling to go to stderr | |
16 #' options( show.error.messages=F, error = function () { | |
17 #' cat( geterrmessage(), file=stderr() ) | |
18 #' q( "no", 1, F ) | |
19 #' }) | |
20 | |
21 #' we need that to not crash galaxy with an UTF8 error on German LC settings. | |
22 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") | |
23 | |
24 #' wl-28-08-2018, Tue: Convert a string separated by comma into character vector | |
25 str_vec <- function(x) { | |
26 x <- unlist(strsplit(x, ",")) | |
27 x <- gsub("^[ \t]+|[ \t]+$", "", x) #' trim white spaces | |
28 } | |
29 | |
30 pkgs <- c("optparse", "reshape2", "plyr", "dplyr", "tidyr", "ggplot2", | |
31 "ggrepel", "corrplot", "gplots", "network", "sna", "GGally", | |
32 "org.Sc.sgd.db","org.Hs.eg.db","GO.db", "GOstats", "KEGG.db", | |
33 "pheatmap") | |
34 suppressPackageStartupMessages(invisible(lapply(pkgs, library, | |
35 character.only = TRUE))) | |
36 | |
37 ## ==== Command line or interactive setting ==== | |
38 if (com_f) { | |
39 | |
40 func <- function() { | |
41 argv <- commandArgs(trailingOnly = FALSE) | |
42 path <- sub("--file=", "", argv[grep("--file=", argv)]) | |
43 } | |
44 #' prog_name <- basename(func()) | |
45 tool_dir <- paste0(dirname(func()), "/") | |
46 | |
47 option_list <- | |
48 list( | |
49 make_option(c("-v", "--verbose"), | |
50 action = "store_true", default = TRUE, | |
51 help = "Print extra output [default]" | |
52 ), | |
53 make_option(c("-q", "--quietly"), | |
54 action = "store_false", | |
55 dest = "verbose", help = "Print little output" | |
56 ), | |
57 | |
58 #' Data pre-processing | |
59 make_option("--ion_file", type = "character", | |
60 help = "ion concentration file in tabular format"), | |
61 make_option("--var_id", type = "integer", default = 1, | |
62 help = "Column index of variable"), | |
63 make_option("--batch_id", type = "integer", default = 3, | |
64 help = "Column index of batch ID"), | |
65 make_option("--data_id", type = "integer", default = 5, | |
66 help = "Start column index of data matrix"), | |
67 make_option("--method_norm", type = "character", default = "median", | |
68 help = "Batch correction methods. Support: median, | |
69 median+std and none"), | |
70 make_option("--batch_control", type = "character", default = "yes", | |
71 help = "Use control lines for batch correction or not"), | |
72 make_option("--control_lines", type = "character", default = "BY4741", | |
73 help = "Batch control lines"), | |
74 make_option("--control_use", type = "character", default = "all", | |
75 help = "Select lines used for batch correction control. | |
76 Three selection: control, all and control.out"), | |
77 make_option("--method_outliers", type = "character", | |
78 default = "log.FC.dist", | |
79 help = "Outlier detection method. Currently support: | |
80 mad, IQR, log.FC.dist and none."), | |
81 make_option("--thres_outl", type = "double", default = 3.0, | |
82 help = "Outlier detection threshold"), | |
83 make_option("--stand_method", type = "character", default = "std", | |
84 help = "Standardisation method. Currently support: | |
85 std, mad and custom."), | |
86 make_option("--std_file", type = "character", | |
87 help = "user predifined std file with respect to ions"), | |
88 make_option("--thres_symb", type = "double", default = 2.0, | |
89 help = "Symbolisation threshold"), | |
90 | |
91 #' Exploratory analysis | |
92 make_option("--thres_ion_corr", type = "double", default = 0.15, | |
93 help = "Threshold for Ion correlation (0 - 1)"), | |
94 | |
95 #' Clustering analysis | |
96 make_option("--min_clust_size", type = "double", default = 10.0, | |
97 help = "Minimal cluster size."), | |
98 make_option("--h_tree", type = "double", default = 0.0, | |
99 help = "Cutting height for hierarchical clustering."), | |
100 make_option("--filter_zero_string", type = "logical", default = TRUE, | |
101 help = "Filter the zero string or not"), | |
102 | |
103 #' Enrichment analysis | |
104 make_option("--pval", type = "double", default = 0.05, | |
105 help = "P-values for enrichment analysis."), | |
106 make_option("--min_count", type = "double", default = 3.0, | |
107 help = "Minimal count number for enrichment analysis."), | |
108 make_option("--ont", type = "character", default = "BP", | |
109 help = "Ontology method: BP, MF and CC."), | |
110 make_option("--annot_pkg", type = "character", default = "org.Sc.sgd.db", | |
111 help = "Annotation package"), | |
112 | |
113 #' Network analysis | |
114 make_option("--method_corr", type = "character", default = "cosine", | |
115 help = "Similarity measure method. Currently support: | |
116 pearson, spearman, kendall, cosine, mahal_cosine, | |
117 hybrid_mahal_cosine"), | |
118 make_option("--thres_corr", type = "double", default = 0.70, | |
119 help = "Similarity threshold for network analysis (0 - 1). | |
120 Features large than threshold will be kept."), | |
121 | |
122 #' output: pre-processing | |
123 make_option("--pre_proc_pdf", | |
124 type = "character", default = "pre_proc.pdf", | |
125 help = "Save plots from pre-processing" | |
126 ), | |
127 make_option("--df_stats_out", | |
128 type = "character", default = "df_stats.tsv", | |
129 help = "Save stats summary of raw, batch corrected and | |
130 standardised data" | |
131 ), | |
132 make_option("--outl_out", | |
133 type = "character", default = "outl.tsv", | |
134 help = "Save outliers summary" | |
135 ), | |
136 make_option("--data_wide_out", | |
137 type = "character", default = "data_wide.tsv", | |
138 help = "Save pre-processed data in wide format" | |
139 ), | |
140 make_option("--data_wide_symb_out", | |
141 type = "character", default = "data_wide_symb.tsv", | |
142 help = "Save pre-processed data Symbolization in wide format" | |
143 ), | |
144 | |
145 #' output: exploratory analysis | |
146 make_option("--expl_anal_pdf", | |
147 type = "character", default = "expl_anal.pdf", | |
148 help = "Save plots from exploratory analysis" | |
149 ), | |
150 | |
151 #' output: clustering analysis | |
152 make_option("--clus_anal_pdf", | |
153 type = "character", default = "clus_anal.pdf", | |
154 help = "Save plots from clustering analysis" | |
155 ), | |
156 | |
157 #' output: enrichment analysis | |
158 make_option("--go_en_out", | |
159 type = "character", default = "go_en.tsv", | |
160 help = "Save GO enrichment table" | |
161 ), | |
162 | |
163 #' output: network analysis | |
164 make_option("--gene_net_pdf", | |
165 type = "character", default = "gene_net.pdf", | |
166 help = "Save plots from gene network" | |
167 ), | |
168 make_option("--imbe_out", | |
169 type = "character", default = "impact_betweenness.tsv", | |
170 help = "Save impact and betweenness table" | |
171 ) | |
172 ) | |
173 | |
174 opt <- parse_args( | |
175 object = OptionParser(option_list = option_list), | |
176 args = commandArgs(trailingOnly = TRUE) | |
177 ) | |
178 } else { | |
179 #' tool_dir <- "C:/R_lwc/my_galaxy/ionflow/" | |
180 tool_dir <- "~/my_galaxy/ionflow/" | |
181 | |
182 opt <- list( | |
183 | |
184 #' Input | |
185 ion_file = paste0(tool_dir, "test-data/Dataset_IonFlow_Ionome_KO_short.csv"), | |
186 var_id = 1, | |
187 batch_id = 3, | |
188 data_id = 5, | |
189 method_norm = "median", | |
190 batch_control = "yes", | |
191 control_lines = "BY4741", | |
192 control_use = "all", | |
193 method_outliers = "log.FC.dist", | |
194 thres_outl = 3.0, | |
195 stand_method = "std", | |
196 thres_symb = 2, | |
197 | |
198 #' Exploratory analysis | |
199 thres_ion_corr = 0.15, | |
200 | |
201 #' Clustering analysis | |
202 min_clust_size = 10.0, | |
203 h_tree = 0.0, | |
204 filter_zero_string = TRUE, | |
205 | |
206 #' Enrichment analysis | |
207 pval = 0.05, | |
208 min_count = 3, | |
209 ont = "BP", | |
210 annot_pkg = "org.Sc.sgd.db", | |
211 | |
212 #' Network analysis | |
213 method_corr = "cosine", | |
214 thres_corr = 0.7, | |
215 | |
216 #' output: pre-processing | |
217 pre_proc_pdf = paste0(tool_dir, "test-data/res/pre_proc.pdf"), | |
218 df_stats_out = paste0(tool_dir, "test-data/res/df_stats.tsv"), | |
219 outl_out = paste0(tool_dir, "test-data/res/outl.tsv"), | |
220 data_wide_out = paste0(tool_dir, "test-data/res/data_wide.tsv"), | |
221 data_wide_symb_out = paste0(tool_dir, "test-data/res/data_wide_symb.tsv"), | |
222 | |
223 #' output: exploratory analysis | |
224 expl_anal_pdf = paste0(tool_dir, "test-data/res/expl_anal.pdf"), | |
225 | |
226 #' output: clustering analysis | |
227 clus_anal_pdf = paste0(tool_dir, "test-data/res/clus_anal.pdf"), | |
228 | |
229 #' output: enrichment analysis | |
230 go_en_out = paste0(tool_dir, "test-data/res/go_en.tsv"), | |
231 | |
232 #' output: network analysis | |
233 gene_net_pdf = paste0(tool_dir, "test-data/res/gene_net.pdf"), | |
234 imbe_out = paste0(tool_dir, "test-data/res/impact_betweenness.tsv") | |
235 ) | |
236 } | |
237 #' print(opt) | |
238 | |
239 suppressPackageStartupMessages({ | |
240 source(paste0(tool_dir, "ionflow_funcs.R")) | |
241 }) | |
242 | |
243 ## ==== Data preparation ==== | |
244 ion_data <- read.table(opt$ion_file, header = T, sep = ",") | |
245 | |
246 if (opt$batch_control == "yes") { | |
247 control_lines <- opt$control_line | |
248 } else { | |
249 control_lines <- NULL | |
250 } | |
251 | |
252 if (opt$stand_method == "custom") { #' if (lenth(opt$std_file) > 0) { | |
253 stdev <- read.table(opt$std_file, header = T, sep = "\t") | |
254 } else { | |
255 stdev <- NULL | |
256 } | |
257 | |
258 ## ==== Pre-processing ==== | |
259 pre <- PreProcessing(data = ion_data, | |
260 var_id = opt$var_id, | |
261 batch_id = opt$batch_id, | |
262 data_id = opt$data_id, | |
263 method_norm = opt$method_norm, | |
264 control_lines = control_lines, | |
265 control_use = opt$control_use, | |
266 method_outliers = opt$method_outliers, | |
267 thres_outl = opt$thres_outl, | |
268 stand_method = opt$stand_method, | |
269 stdev = stdev, | |
270 thres_symb = opt$thres_symb) | |
271 | |
272 #' save plot in pdf | |
273 pdf(file = opt$pre_proc_pdf, onefile = T) # width = 15, height = 10 | |
274 plot(pre$plot.hist) | |
275 plot(pre$plot.overview) | |
276 plot(pre$plot.medians) | |
277 plot(pre$plot.CV) | |
278 plot(pre$plot.change.stat) | |
279 plot(pre$plot.change.dir) | |
280 dev.off() | |
281 | |
282 #' combine stats | |
283 df_stats <- list(raw_data = pre$stats.raw.data, | |
284 bat_data = pre$stats.batches) | |
285 df_stats <- dplyr::bind_rows(df_stats, .id = "Data_Set") | |
286 row.names(df_stats) = NULL | |
287 | |
288 #' save tables | |
289 write.table(df_stats, file = opt$df_stats_out, sep = "\t", row.names = F) | |
290 write.table(pre$stats.outliers, file = opt$outl_out, sep = "\t", | |
291 row.names = F) | |
292 write.table(pre$data.line.zscores, file = opt$data_wide_out, sep = "\t", | |
293 row.names = F) | |
294 write.table(pre$data.line.symb, file = opt$data_wide_symb_out, | |
295 sep = "\t", row.names = F) | |
296 | |
297 ## ==== Exploratory analysis ==== | |
298 | |
299 pdf(file = opt$expl_anal_pdf, onefile = T) | |
300 expl <- IonAnalysis(data = pre$data.line.zscores, | |
301 thres_ion_corr = opt$thres_ion_corr) | |
302 plot(expl$plot.pca) | |
303 plot(expl$plot.net) | |
304 dev.off() | |
305 | |
306 ## ==== Clustering analysis ==== | |
307 | |
308 gcl <- ProfileClustering(pre$data.line.symb, | |
309 min_clust_size = opt$min_clust_size, | |
310 h_tree = opt$h_tree, | |
311 filter_zero_string = opt$filter_zero_string) | |
312 | |
313 #' select larger clusters | |
314 cluster_vector <- | |
315 gcl$clusters.vector[gcl$clusters.vector$Cluster %in% | |
316 gcl$tab.clusters.subset$Cluster, ] | |
317 | |
318 #' extract symbolic and z-score prifiles for lines in selected clusters | |
319 symbol_profiles <- pre$data.line.symb | |
320 symbol_profiles$Cluster <- | |
321 cluster_vector$Cluster[match(symbol_profiles$Line, cluster_vector$Line)] | |
322 | |
323 zscore_profiles <- pre$data.line.zscores | |
324 zscore_profiles$Cluster <- | |
325 cluster_vector$Cluster[match(zscore_profiles$Line, cluster_vector$Line)] | |
326 | |
327 #' remove lines showing no phenotype | |
328 symbol_profiles <- symbol_profiles[!is.na(symbol_profiles$Cluster),] | |
329 zscore_profiles <- zscore_profiles[!is.na(zscore_profiles$Cluster),] | |
330 | |
331 mat_long <- reshape2::melt(zscore_profiles, id = c("Line", "Cluster"), | |
332 variable.name = "Ion", value.name = "zscore") | |
333 | |
334 mat_long$n.genes <- | |
335 gcl$tab.clusters.subset$Number.of.genes[match(mat_long$Cluster, | |
336 gcl$tab.clusters.subset$Cluster)] | |
337 mat_long$title <- paste0('Cluster ', mat_long$Cluster,' (', | |
338 mat_long$n.genes, ' genes)') | |
339 | |
340 p_gcl <- | |
341 ggplot(data = mat_long, aes(x = Ion, y = zscore, group = Line), color = "gray") + | |
342 geom_line() + | |
343 stat_summary(fun.data = "mean_se", color = "red", geom = "line", group = 1) + | |
344 labs(x = "", y = "z-score") + | |
345 coord_cartesian(ylim = c(-8, 8)) + | |
346 facet_wrap(~title) + | |
347 theme(legend.position = "none", | |
348 axis.text.x = element_text(angle = 90, hjust = 1), | |
349 axis.text = element_text(size = 10)) | |
350 | |
351 pdf(file = opt$clus_anal_pdf, onefile = T) | |
352 plot(p_gcl) | |
353 dev.off() | |
354 | |
355 ## ==== Enrichment analysis ==== | |
356 | |
357 ge <- GOEnricher(cluster_vector, | |
358 pval = opt$pval, | |
359 min_count = opt$min_count, | |
360 annot_pkg = opt$annot_pkg, | |
361 ont = opt$ont, | |
362 gene_uni = as.character(pre$data.line.zscores$Line)) | |
363 | |
364 if (nrow(ge$enrichment.summary) > 0) { | |
365 write.table(ge$enrichment.summary, file = opt$go_en_out, sep = "\t", | |
366 row.names = FALSE) | |
367 } | |
368 | |
369 ## ==== Network analysis ==== | |
370 | |
371 gn <- GeneticNetwork(data = zscore_profiles, | |
372 method_corr = opt$method_corr, | |
373 thres_corr = opt$thres_corr, | |
374 network_modules = "input", | |
375 cluster_vector = cluster_vector, | |
376 cluster_label_vector = NULL) | |
377 | |
378 pdf(file = opt$gene_net_pdf, onefile = T) # width = 15, height = 10 | |
379 plot(gn$plot.network) | |
380 plot(gn$plot.impact_betweenness) | |
381 dev.off() | |
382 | |
383 write.table(gn$stats.impact_betweenness, file = opt$imbe_out, | |
384 sep = "\t", row.names = FALSE) |