comparison ionflow/ionflow.R @ 0:3b461dc9568b draft default tip

Uploaded
author metaboflow_cam
date Mon, 09 Aug 2021 09:41:22 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:3b461dc9568b
1 #' wl-07-06-2021, Mon: The fourth version: based on Jacopo's new changes in
2 #' 'ionflow_funcs.R' and new pipeline 'tutorial_galaxy_ionflow.R'
3 #' wl-08-06-2021, Tue: finalise
4
5 ## ==== General settings ====
6 rm(list = ls(all = T))
7
8 #' flag for command-line use or not. If false, only for debug interactively.
9 com_f <- T
10
11 #' galaxy will stop even if R has warning message
12 options(warn = -1) #' disable R warning. Turn back: options(warn=0)
13
14 #' ------------------------------------------------------------------------
15 #' Setup R error handling to go to stderr
16 #' options( show.error.messages=F, error = function () {
17 #' cat( geterrmessage(), file=stderr() )
18 #' q( "no", 1, F )
19 #' })
20
21 #' we need that to not crash galaxy with an UTF8 error on German LC settings.
22 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
23
24 #' wl-28-08-2018, Tue: Convert a string separated by comma into character vector
25 str_vec <- function(x) {
26 x <- unlist(strsplit(x, ","))
27 x <- gsub("^[ \t]+|[ \t]+$", "", x) #' trim white spaces
28 }
29
30 pkgs <- c("optparse", "reshape2", "plyr", "dplyr", "tidyr", "ggplot2",
31 "ggrepel", "corrplot", "gplots", "network", "sna", "GGally",
32 "org.Sc.sgd.db","org.Hs.eg.db","GO.db", "GOstats", "KEGG.db",
33 "pheatmap")
34 suppressPackageStartupMessages(invisible(lapply(pkgs, library,
35 character.only = TRUE)))
36
37 ## ==== Command line or interactive setting ====
38 if (com_f) {
39
40 func <- function() {
41 argv <- commandArgs(trailingOnly = FALSE)
42 path <- sub("--file=", "", argv[grep("--file=", argv)])
43 }
44 #' prog_name <- basename(func())
45 tool_dir <- paste0(dirname(func()), "/")
46
47 option_list <-
48 list(
49 make_option(c("-v", "--verbose"),
50 action = "store_true", default = TRUE,
51 help = "Print extra output [default]"
52 ),
53 make_option(c("-q", "--quietly"),
54 action = "store_false",
55 dest = "verbose", help = "Print little output"
56 ),
57
58 #' Data pre-processing
59 make_option("--ion_file", type = "character",
60 help = "ion concentration file in tabular format"),
61 make_option("--var_id", type = "integer", default = 1,
62 help = "Column index of variable"),
63 make_option("--batch_id", type = "integer", default = 3,
64 help = "Column index of batch ID"),
65 make_option("--data_id", type = "integer", default = 5,
66 help = "Start column index of data matrix"),
67 make_option("--method_norm", type = "character", default = "median",
68 help = "Batch correction methods. Support: median,
69 median+std and none"),
70 make_option("--batch_control", type = "character", default = "yes",
71 help = "Use control lines for batch correction or not"),
72 make_option("--control_lines", type = "character", default = "BY4741",
73 help = "Batch control lines"),
74 make_option("--control_use", type = "character", default = "all",
75 help = "Select lines used for batch correction control.
76 Three selection: control, all and control.out"),
77 make_option("--method_outliers", type = "character",
78 default = "log.FC.dist",
79 help = "Outlier detection method. Currently support:
80 mad, IQR, log.FC.dist and none."),
81 make_option("--thres_outl", type = "double", default = 3.0,
82 help = "Outlier detection threshold"),
83 make_option("--stand_method", type = "character", default = "std",
84 help = "Standardisation method. Currently support:
85 std, mad and custom."),
86 make_option("--std_file", type = "character",
87 help = "user predifined std file with respect to ions"),
88 make_option("--thres_symb", type = "double", default = 2.0,
89 help = "Symbolisation threshold"),
90
91 #' Exploratory analysis
92 make_option("--thres_ion_corr", type = "double", default = 0.15,
93 help = "Threshold for Ion correlation (0 - 1)"),
94
95 #' Clustering analysis
96 make_option("--min_clust_size", type = "double", default = 10.0,
97 help = "Minimal cluster size."),
98 make_option("--h_tree", type = "double", default = 0.0,
99 help = "Cutting height for hierarchical clustering."),
100 make_option("--filter_zero_string", type = "logical", default = TRUE,
101 help = "Filter the zero string or not"),
102
103 #' Enrichment analysis
104 make_option("--pval", type = "double", default = 0.05,
105 help = "P-values for enrichment analysis."),
106 make_option("--min_count", type = "double", default = 3.0,
107 help = "Minimal count number for enrichment analysis."),
108 make_option("--ont", type = "character", default = "BP",
109 help = "Ontology method: BP, MF and CC."),
110 make_option("--annot_pkg", type = "character", default = "org.Sc.sgd.db",
111 help = "Annotation package"),
112
113 #' Network analysis
114 make_option("--method_corr", type = "character", default = "cosine",
115 help = "Similarity measure method. Currently support:
116 pearson, spearman, kendall, cosine, mahal_cosine,
117 hybrid_mahal_cosine"),
118 make_option("--thres_corr", type = "double", default = 0.70,
119 help = "Similarity threshold for network analysis (0 - 1).
120 Features large than threshold will be kept."),
121
122 #' output: pre-processing
123 make_option("--pre_proc_pdf",
124 type = "character", default = "pre_proc.pdf",
125 help = "Save plots from pre-processing"
126 ),
127 make_option("--df_stats_out",
128 type = "character", default = "df_stats.tsv",
129 help = "Save stats summary of raw, batch corrected and
130 standardised data"
131 ),
132 make_option("--outl_out",
133 type = "character", default = "outl.tsv",
134 help = "Save outliers summary"
135 ),
136 make_option("--data_wide_out",
137 type = "character", default = "data_wide.tsv",
138 help = "Save pre-processed data in wide format"
139 ),
140 make_option("--data_wide_symb_out",
141 type = "character", default = "data_wide_symb.tsv",
142 help = "Save pre-processed data Symbolization in wide format"
143 ),
144
145 #' output: exploratory analysis
146 make_option("--expl_anal_pdf",
147 type = "character", default = "expl_anal.pdf",
148 help = "Save plots from exploratory analysis"
149 ),
150
151 #' output: clustering analysis
152 make_option("--clus_anal_pdf",
153 type = "character", default = "clus_anal.pdf",
154 help = "Save plots from clustering analysis"
155 ),
156
157 #' output: enrichment analysis
158 make_option("--go_en_out",
159 type = "character", default = "go_en.tsv",
160 help = "Save GO enrichment table"
161 ),
162
163 #' output: network analysis
164 make_option("--gene_net_pdf",
165 type = "character", default = "gene_net.pdf",
166 help = "Save plots from gene network"
167 ),
168 make_option("--imbe_out",
169 type = "character", default = "impact_betweenness.tsv",
170 help = "Save impact and betweenness table"
171 )
172 )
173
174 opt <- parse_args(
175 object = OptionParser(option_list = option_list),
176 args = commandArgs(trailingOnly = TRUE)
177 )
178 } else {
179 #' tool_dir <- "C:/R_lwc/my_galaxy/ionflow/"
180 tool_dir <- "~/my_galaxy/ionflow/"
181
182 opt <- list(
183
184 #' Input
185 ion_file = paste0(tool_dir, "test-data/Dataset_IonFlow_Ionome_KO_short.csv"),
186 var_id = 1,
187 batch_id = 3,
188 data_id = 5,
189 method_norm = "median",
190 batch_control = "yes",
191 control_lines = "BY4741",
192 control_use = "all",
193 method_outliers = "log.FC.dist",
194 thres_outl = 3.0,
195 stand_method = "std",
196 thres_symb = 2,
197
198 #' Exploratory analysis
199 thres_ion_corr = 0.15,
200
201 #' Clustering analysis
202 min_clust_size = 10.0,
203 h_tree = 0.0,
204 filter_zero_string = TRUE,
205
206 #' Enrichment analysis
207 pval = 0.05,
208 min_count = 3,
209 ont = "BP",
210 annot_pkg = "org.Sc.sgd.db",
211
212 #' Network analysis
213 method_corr = "cosine",
214 thres_corr = 0.7,
215
216 #' output: pre-processing
217 pre_proc_pdf = paste0(tool_dir, "test-data/res/pre_proc.pdf"),
218 df_stats_out = paste0(tool_dir, "test-data/res/df_stats.tsv"),
219 outl_out = paste0(tool_dir, "test-data/res/outl.tsv"),
220 data_wide_out = paste0(tool_dir, "test-data/res/data_wide.tsv"),
221 data_wide_symb_out = paste0(tool_dir, "test-data/res/data_wide_symb.tsv"),
222
223 #' output: exploratory analysis
224 expl_anal_pdf = paste0(tool_dir, "test-data/res/expl_anal.pdf"),
225
226 #' output: clustering analysis
227 clus_anal_pdf = paste0(tool_dir, "test-data/res/clus_anal.pdf"),
228
229 #' output: enrichment analysis
230 go_en_out = paste0(tool_dir, "test-data/res/go_en.tsv"),
231
232 #' output: network analysis
233 gene_net_pdf = paste0(tool_dir, "test-data/res/gene_net.pdf"),
234 imbe_out = paste0(tool_dir, "test-data/res/impact_betweenness.tsv")
235 )
236 }
237 #' print(opt)
238
239 suppressPackageStartupMessages({
240 source(paste0(tool_dir, "ionflow_funcs.R"))
241 })
242
243 ## ==== Data preparation ====
244 ion_data <- read.table(opt$ion_file, header = T, sep = ",")
245
246 if (opt$batch_control == "yes") {
247 control_lines <- opt$control_line
248 } else {
249 control_lines <- NULL
250 }
251
252 if (opt$stand_method == "custom") { #' if (lenth(opt$std_file) > 0) {
253 stdev <- read.table(opt$std_file, header = T, sep = "\t")
254 } else {
255 stdev <- NULL
256 }
257
258 ## ==== Pre-processing ====
259 pre <- PreProcessing(data = ion_data,
260 var_id = opt$var_id,
261 batch_id = opt$batch_id,
262 data_id = opt$data_id,
263 method_norm = opt$method_norm,
264 control_lines = control_lines,
265 control_use = opt$control_use,
266 method_outliers = opt$method_outliers,
267 thres_outl = opt$thres_outl,
268 stand_method = opt$stand_method,
269 stdev = stdev,
270 thres_symb = opt$thres_symb)
271
272 #' save plot in pdf
273 pdf(file = opt$pre_proc_pdf, onefile = T) # width = 15, height = 10
274 plot(pre$plot.hist)
275 plot(pre$plot.overview)
276 plot(pre$plot.medians)
277 plot(pre$plot.CV)
278 plot(pre$plot.change.stat)
279 plot(pre$plot.change.dir)
280 dev.off()
281
282 #' combine stats
283 df_stats <- list(raw_data = pre$stats.raw.data,
284 bat_data = pre$stats.batches)
285 df_stats <- dplyr::bind_rows(df_stats, .id = "Data_Set")
286 row.names(df_stats) = NULL
287
288 #' save tables
289 write.table(df_stats, file = opt$df_stats_out, sep = "\t", row.names = F)
290 write.table(pre$stats.outliers, file = opt$outl_out, sep = "\t",
291 row.names = F)
292 write.table(pre$data.line.zscores, file = opt$data_wide_out, sep = "\t",
293 row.names = F)
294 write.table(pre$data.line.symb, file = opt$data_wide_symb_out,
295 sep = "\t", row.names = F)
296
297 ## ==== Exploratory analysis ====
298
299 pdf(file = opt$expl_anal_pdf, onefile = T)
300 expl <- IonAnalysis(data = pre$data.line.zscores,
301 thres_ion_corr = opt$thres_ion_corr)
302 plot(expl$plot.pca)
303 plot(expl$plot.net)
304 dev.off()
305
306 ## ==== Clustering analysis ====
307
308 gcl <- ProfileClustering(pre$data.line.symb,
309 min_clust_size = opt$min_clust_size,
310 h_tree = opt$h_tree,
311 filter_zero_string = opt$filter_zero_string)
312
313 #' select larger clusters
314 cluster_vector <-
315 gcl$clusters.vector[gcl$clusters.vector$Cluster %in%
316 gcl$tab.clusters.subset$Cluster, ]
317
318 #' extract symbolic and z-score prifiles for lines in selected clusters
319 symbol_profiles <- pre$data.line.symb
320 symbol_profiles$Cluster <-
321 cluster_vector$Cluster[match(symbol_profiles$Line, cluster_vector$Line)]
322
323 zscore_profiles <- pre$data.line.zscores
324 zscore_profiles$Cluster <-
325 cluster_vector$Cluster[match(zscore_profiles$Line, cluster_vector$Line)]
326
327 #' remove lines showing no phenotype
328 symbol_profiles <- symbol_profiles[!is.na(symbol_profiles$Cluster),]
329 zscore_profiles <- zscore_profiles[!is.na(zscore_profiles$Cluster),]
330
331 mat_long <- reshape2::melt(zscore_profiles, id = c("Line", "Cluster"),
332 variable.name = "Ion", value.name = "zscore")
333
334 mat_long$n.genes <-
335 gcl$tab.clusters.subset$Number.of.genes[match(mat_long$Cluster,
336 gcl$tab.clusters.subset$Cluster)]
337 mat_long$title <- paste0('Cluster ', mat_long$Cluster,' (',
338 mat_long$n.genes, ' genes)')
339
340 p_gcl <-
341 ggplot(data = mat_long, aes(x = Ion, y = zscore, group = Line), color = "gray") +
342 geom_line() +
343 stat_summary(fun.data = "mean_se", color = "red", geom = "line", group = 1) +
344 labs(x = "", y = "z-score") +
345 coord_cartesian(ylim = c(-8, 8)) +
346 facet_wrap(~title) +
347 theme(legend.position = "none",
348 axis.text.x = element_text(angle = 90, hjust = 1),
349 axis.text = element_text(size = 10))
350
351 pdf(file = opt$clus_anal_pdf, onefile = T)
352 plot(p_gcl)
353 dev.off()
354
355 ## ==== Enrichment analysis ====
356
357 ge <- GOEnricher(cluster_vector,
358 pval = opt$pval,
359 min_count = opt$min_count,
360 annot_pkg = opt$annot_pkg,
361 ont = opt$ont,
362 gene_uni = as.character(pre$data.line.zscores$Line))
363
364 if (nrow(ge$enrichment.summary) > 0) {
365 write.table(ge$enrichment.summary, file = opt$go_en_out, sep = "\t",
366 row.names = FALSE)
367 }
368
369 ## ==== Network analysis ====
370
371 gn <- GeneticNetwork(data = zscore_profiles,
372 method_corr = opt$method_corr,
373 thres_corr = opt$thres_corr,
374 network_modules = "input",
375 cluster_vector = cluster_vector,
376 cluster_label_vector = NULL)
377
378 pdf(file = opt$gene_net_pdf, onefile = T) # width = 15, height = 10
379 plot(gn$plot.network)
380 plot(gn$plot.impact_betweenness)
381 dev.off()
382
383 write.table(gn$stats.impact_betweenness, file = opt$imbe_out,
384 sep = "\t", row.names = FALSE)