0
|
1 #' wl-07-06-2021, Mon: The fourth version: based on Jacopo's new changes in
|
|
2 #' 'ionflow_funcs.R' and new pipeline 'tutorial_galaxy_ionflow.R'
|
|
3 #' wl-08-06-2021, Tue: finalise
|
|
4
|
|
5 ## ==== General settings ====
|
|
6 rm(list = ls(all = T))
|
|
7
|
|
8 #' flag for command-line use or not. If false, only for debug interactively.
|
|
9 com_f <- T
|
|
10
|
|
11 #' galaxy will stop even if R has warning message
|
|
12 options(warn = -1) #' disable R warning. Turn back: options(warn=0)
|
|
13
|
|
14 #' ------------------------------------------------------------------------
|
|
15 #' Setup R error handling to go to stderr
|
|
16 #' options( show.error.messages=F, error = function () {
|
|
17 #' cat( geterrmessage(), file=stderr() )
|
|
18 #' q( "no", 1, F )
|
|
19 #' })
|
|
20
|
|
21 #' we need that to not crash galaxy with an UTF8 error on German LC settings.
|
|
22 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
|
|
23
|
|
24 #' wl-28-08-2018, Tue: Convert a string separated by comma into character vector
|
|
25 str_vec <- function(x) {
|
|
26 x <- unlist(strsplit(x, ","))
|
|
27 x <- gsub("^[ \t]+|[ \t]+$", "", x) #' trim white spaces
|
|
28 }
|
|
29
|
|
30 pkgs <- c("optparse", "reshape2", "plyr", "dplyr", "tidyr", "ggplot2",
|
|
31 "ggrepel", "corrplot", "gplots", "network", "sna", "GGally",
|
|
32 "org.Sc.sgd.db","org.Hs.eg.db","GO.db", "GOstats", "KEGG.db",
|
|
33 "pheatmap")
|
|
34 suppressPackageStartupMessages(invisible(lapply(pkgs, library,
|
|
35 character.only = TRUE)))
|
|
36
|
|
37 ## ==== Command line or interactive setting ====
|
|
38 if (com_f) {
|
|
39
|
|
40 func <- function() {
|
|
41 argv <- commandArgs(trailingOnly = FALSE)
|
|
42 path <- sub("--file=", "", argv[grep("--file=", argv)])
|
|
43 }
|
|
44 #' prog_name <- basename(func())
|
|
45 tool_dir <- paste0(dirname(func()), "/")
|
|
46
|
|
47 option_list <-
|
|
48 list(
|
|
49 make_option(c("-v", "--verbose"),
|
|
50 action = "store_true", default = TRUE,
|
|
51 help = "Print extra output [default]"
|
|
52 ),
|
|
53 make_option(c("-q", "--quietly"),
|
|
54 action = "store_false",
|
|
55 dest = "verbose", help = "Print little output"
|
|
56 ),
|
|
57
|
|
58 #' Data pre-processing
|
|
59 make_option("--ion_file", type = "character",
|
|
60 help = "ion concentration file in tabular format"),
|
|
61 make_option("--var_id", type = "integer", default = 1,
|
|
62 help = "Column index of variable"),
|
|
63 make_option("--batch_id", type = "integer", default = 3,
|
|
64 help = "Column index of batch ID"),
|
|
65 make_option("--data_id", type = "integer", default = 5,
|
|
66 help = "Start column index of data matrix"),
|
|
67 make_option("--method_norm", type = "character", default = "median",
|
|
68 help = "Batch correction methods. Support: median,
|
|
69 median+std and none"),
|
|
70 make_option("--batch_control", type = "character", default = "yes",
|
|
71 help = "Use control lines for batch correction or not"),
|
|
72 make_option("--control_lines", type = "character", default = "BY4741",
|
|
73 help = "Batch control lines"),
|
|
74 make_option("--control_use", type = "character", default = "all",
|
|
75 help = "Select lines used for batch correction control.
|
|
76 Three selection: control, all and control.out"),
|
|
77 make_option("--method_outliers", type = "character",
|
|
78 default = "log.FC.dist",
|
|
79 help = "Outlier detection method. Currently support:
|
|
80 mad, IQR, log.FC.dist and none."),
|
|
81 make_option("--thres_outl", type = "double", default = 3.0,
|
|
82 help = "Outlier detection threshold"),
|
|
83 make_option("--stand_method", type = "character", default = "std",
|
|
84 help = "Standardisation method. Currently support:
|
|
85 std, mad and custom."),
|
|
86 make_option("--std_file", type = "character",
|
|
87 help = "user predifined std file with respect to ions"),
|
|
88 make_option("--thres_symb", type = "double", default = 2.0,
|
|
89 help = "Symbolisation threshold"),
|
|
90
|
|
91 #' Exploratory analysis
|
|
92 make_option("--thres_ion_corr", type = "double", default = 0.15,
|
|
93 help = "Threshold for Ion correlation (0 - 1)"),
|
|
94
|
|
95 #' Clustering analysis
|
|
96 make_option("--min_clust_size", type = "double", default = 10.0,
|
|
97 help = "Minimal cluster size."),
|
|
98 make_option("--h_tree", type = "double", default = 0.0,
|
|
99 help = "Cutting height for hierarchical clustering."),
|
|
100 make_option("--filter_zero_string", type = "logical", default = TRUE,
|
|
101 help = "Filter the zero string or not"),
|
|
102
|
|
103 #' Enrichment analysis
|
|
104 make_option("--pval", type = "double", default = 0.05,
|
|
105 help = "P-values for enrichment analysis."),
|
|
106 make_option("--min_count", type = "double", default = 3.0,
|
|
107 help = "Minimal count number for enrichment analysis."),
|
|
108 make_option("--ont", type = "character", default = "BP",
|
|
109 help = "Ontology method: BP, MF and CC."),
|
|
110 make_option("--annot_pkg", type = "character", default = "org.Sc.sgd.db",
|
|
111 help = "Annotation package"),
|
|
112
|
|
113 #' Network analysis
|
|
114 make_option("--method_corr", type = "character", default = "cosine",
|
|
115 help = "Similarity measure method. Currently support:
|
|
116 pearson, spearman, kendall, cosine, mahal_cosine,
|
|
117 hybrid_mahal_cosine"),
|
|
118 make_option("--thres_corr", type = "double", default = 0.70,
|
|
119 help = "Similarity threshold for network analysis (0 - 1).
|
|
120 Features large than threshold will be kept."),
|
|
121
|
|
122 #' output: pre-processing
|
|
123 make_option("--pre_proc_pdf",
|
|
124 type = "character", default = "pre_proc.pdf",
|
|
125 help = "Save plots from pre-processing"
|
|
126 ),
|
|
127 make_option("--df_stats_out",
|
|
128 type = "character", default = "df_stats.tsv",
|
|
129 help = "Save stats summary of raw, batch corrected and
|
|
130 standardised data"
|
|
131 ),
|
|
132 make_option("--outl_out",
|
|
133 type = "character", default = "outl.tsv",
|
|
134 help = "Save outliers summary"
|
|
135 ),
|
|
136 make_option("--data_wide_out",
|
|
137 type = "character", default = "data_wide.tsv",
|
|
138 help = "Save pre-processed data in wide format"
|
|
139 ),
|
|
140 make_option("--data_wide_symb_out",
|
|
141 type = "character", default = "data_wide_symb.tsv",
|
|
142 help = "Save pre-processed data Symbolization in wide format"
|
|
143 ),
|
|
144
|
|
145 #' output: exploratory analysis
|
|
146 make_option("--expl_anal_pdf",
|
|
147 type = "character", default = "expl_anal.pdf",
|
|
148 help = "Save plots from exploratory analysis"
|
|
149 ),
|
|
150
|
|
151 #' output: clustering analysis
|
|
152 make_option("--clus_anal_pdf",
|
|
153 type = "character", default = "clus_anal.pdf",
|
|
154 help = "Save plots from clustering analysis"
|
|
155 ),
|
|
156
|
|
157 #' output: enrichment analysis
|
|
158 make_option("--go_en_out",
|
|
159 type = "character", default = "go_en.tsv",
|
|
160 help = "Save GO enrichment table"
|
|
161 ),
|
|
162
|
|
163 #' output: network analysis
|
|
164 make_option("--gene_net_pdf",
|
|
165 type = "character", default = "gene_net.pdf",
|
|
166 help = "Save plots from gene network"
|
|
167 ),
|
|
168 make_option("--imbe_out",
|
|
169 type = "character", default = "impact_betweenness.tsv",
|
|
170 help = "Save impact and betweenness table"
|
|
171 )
|
|
172 )
|
|
173
|
|
174 opt <- parse_args(
|
|
175 object = OptionParser(option_list = option_list),
|
|
176 args = commandArgs(trailingOnly = TRUE)
|
|
177 )
|
|
178 } else {
|
|
179 #' tool_dir <- "C:/R_lwc/my_galaxy/ionflow/"
|
|
180 tool_dir <- "~/my_galaxy/ionflow/"
|
|
181
|
|
182 opt <- list(
|
|
183
|
|
184 #' Input
|
|
185 ion_file = paste0(tool_dir, "test-data/Dataset_IonFlow_Ionome_KO_short.csv"),
|
|
186 var_id = 1,
|
|
187 batch_id = 3,
|
|
188 data_id = 5,
|
|
189 method_norm = "median",
|
|
190 batch_control = "yes",
|
|
191 control_lines = "BY4741",
|
|
192 control_use = "all",
|
|
193 method_outliers = "log.FC.dist",
|
|
194 thres_outl = 3.0,
|
|
195 stand_method = "std",
|
|
196 thres_symb = 2,
|
|
197
|
|
198 #' Exploratory analysis
|
|
199 thres_ion_corr = 0.15,
|
|
200
|
|
201 #' Clustering analysis
|
|
202 min_clust_size = 10.0,
|
|
203 h_tree = 0.0,
|
|
204 filter_zero_string = TRUE,
|
|
205
|
|
206 #' Enrichment analysis
|
|
207 pval = 0.05,
|
|
208 min_count = 3,
|
|
209 ont = "BP",
|
|
210 annot_pkg = "org.Sc.sgd.db",
|
|
211
|
|
212 #' Network analysis
|
|
213 method_corr = "cosine",
|
|
214 thres_corr = 0.7,
|
|
215
|
|
216 #' output: pre-processing
|
|
217 pre_proc_pdf = paste0(tool_dir, "test-data/res/pre_proc.pdf"),
|
|
218 df_stats_out = paste0(tool_dir, "test-data/res/df_stats.tsv"),
|
|
219 outl_out = paste0(tool_dir, "test-data/res/outl.tsv"),
|
|
220 data_wide_out = paste0(tool_dir, "test-data/res/data_wide.tsv"),
|
|
221 data_wide_symb_out = paste0(tool_dir, "test-data/res/data_wide_symb.tsv"),
|
|
222
|
|
223 #' output: exploratory analysis
|
|
224 expl_anal_pdf = paste0(tool_dir, "test-data/res/expl_anal.pdf"),
|
|
225
|
|
226 #' output: clustering analysis
|
|
227 clus_anal_pdf = paste0(tool_dir, "test-data/res/clus_anal.pdf"),
|
|
228
|
|
229 #' output: enrichment analysis
|
|
230 go_en_out = paste0(tool_dir, "test-data/res/go_en.tsv"),
|
|
231
|
|
232 #' output: network analysis
|
|
233 gene_net_pdf = paste0(tool_dir, "test-data/res/gene_net.pdf"),
|
|
234 imbe_out = paste0(tool_dir, "test-data/res/impact_betweenness.tsv")
|
|
235 )
|
|
236 }
|
|
237 #' print(opt)
|
|
238
|
|
239 suppressPackageStartupMessages({
|
|
240 source(paste0(tool_dir, "ionflow_funcs.R"))
|
|
241 })
|
|
242
|
|
243 ## ==== Data preparation ====
|
|
244 ion_data <- read.table(opt$ion_file, header = T, sep = ",")
|
|
245
|
|
246 if (opt$batch_control == "yes") {
|
|
247 control_lines <- opt$control_line
|
|
248 } else {
|
|
249 control_lines <- NULL
|
|
250 }
|
|
251
|
|
252 if (opt$stand_method == "custom") { #' if (lenth(opt$std_file) > 0) {
|
|
253 stdev <- read.table(opt$std_file, header = T, sep = "\t")
|
|
254 } else {
|
|
255 stdev <- NULL
|
|
256 }
|
|
257
|
|
258 ## ==== Pre-processing ====
|
|
259 pre <- PreProcessing(data = ion_data,
|
|
260 var_id = opt$var_id,
|
|
261 batch_id = opt$batch_id,
|
|
262 data_id = opt$data_id,
|
|
263 method_norm = opt$method_norm,
|
|
264 control_lines = control_lines,
|
|
265 control_use = opt$control_use,
|
|
266 method_outliers = opt$method_outliers,
|
|
267 thres_outl = opt$thres_outl,
|
|
268 stand_method = opt$stand_method,
|
|
269 stdev = stdev,
|
|
270 thres_symb = opt$thres_symb)
|
|
271
|
|
272 #' save plot in pdf
|
|
273 pdf(file = opt$pre_proc_pdf, onefile = T) # width = 15, height = 10
|
|
274 plot(pre$plot.hist)
|
|
275 plot(pre$plot.overview)
|
|
276 plot(pre$plot.medians)
|
|
277 plot(pre$plot.CV)
|
|
278 plot(pre$plot.change.stat)
|
|
279 plot(pre$plot.change.dir)
|
|
280 dev.off()
|
|
281
|
|
282 #' combine stats
|
|
283 df_stats <- list(raw_data = pre$stats.raw.data,
|
|
284 bat_data = pre$stats.batches)
|
|
285 df_stats <- dplyr::bind_rows(df_stats, .id = "Data_Set")
|
|
286 row.names(df_stats) = NULL
|
|
287
|
|
288 #' save tables
|
|
289 write.table(df_stats, file = opt$df_stats_out, sep = "\t", row.names = F)
|
|
290 write.table(pre$stats.outliers, file = opt$outl_out, sep = "\t",
|
|
291 row.names = F)
|
|
292 write.table(pre$data.line.zscores, file = opt$data_wide_out, sep = "\t",
|
|
293 row.names = F)
|
|
294 write.table(pre$data.line.symb, file = opt$data_wide_symb_out,
|
|
295 sep = "\t", row.names = F)
|
|
296
|
|
297 ## ==== Exploratory analysis ====
|
|
298
|
|
299 pdf(file = opt$expl_anal_pdf, onefile = T)
|
|
300 expl <- IonAnalysis(data = pre$data.line.zscores,
|
|
301 thres_ion_corr = opt$thres_ion_corr)
|
|
302 plot(expl$plot.pca)
|
|
303 plot(expl$plot.net)
|
|
304 dev.off()
|
|
305
|
|
306 ## ==== Clustering analysis ====
|
|
307
|
|
308 gcl <- ProfileClustering(pre$data.line.symb,
|
|
309 min_clust_size = opt$min_clust_size,
|
|
310 h_tree = opt$h_tree,
|
|
311 filter_zero_string = opt$filter_zero_string)
|
|
312
|
|
313 #' select larger clusters
|
|
314 cluster_vector <-
|
|
315 gcl$clusters.vector[gcl$clusters.vector$Cluster %in%
|
|
316 gcl$tab.clusters.subset$Cluster, ]
|
|
317
|
|
318 #' extract symbolic and z-score prifiles for lines in selected clusters
|
|
319 symbol_profiles <- pre$data.line.symb
|
|
320 symbol_profiles$Cluster <-
|
|
321 cluster_vector$Cluster[match(symbol_profiles$Line, cluster_vector$Line)]
|
|
322
|
|
323 zscore_profiles <- pre$data.line.zscores
|
|
324 zscore_profiles$Cluster <-
|
|
325 cluster_vector$Cluster[match(zscore_profiles$Line, cluster_vector$Line)]
|
|
326
|
|
327 #' remove lines showing no phenotype
|
|
328 symbol_profiles <- symbol_profiles[!is.na(symbol_profiles$Cluster),]
|
|
329 zscore_profiles <- zscore_profiles[!is.na(zscore_profiles$Cluster),]
|
|
330
|
|
331 mat_long <- reshape2::melt(zscore_profiles, id = c("Line", "Cluster"),
|
|
332 variable.name = "Ion", value.name = "zscore")
|
|
333
|
|
334 mat_long$n.genes <-
|
|
335 gcl$tab.clusters.subset$Number.of.genes[match(mat_long$Cluster,
|
|
336 gcl$tab.clusters.subset$Cluster)]
|
|
337 mat_long$title <- paste0('Cluster ', mat_long$Cluster,' (',
|
|
338 mat_long$n.genes, ' genes)')
|
|
339
|
|
340 p_gcl <-
|
|
341 ggplot(data = mat_long, aes(x = Ion, y = zscore, group = Line), color = "gray") +
|
|
342 geom_line() +
|
|
343 stat_summary(fun.data = "mean_se", color = "red", geom = "line", group = 1) +
|
|
344 labs(x = "", y = "z-score") +
|
|
345 coord_cartesian(ylim = c(-8, 8)) +
|
|
346 facet_wrap(~title) +
|
|
347 theme(legend.position = "none",
|
|
348 axis.text.x = element_text(angle = 90, hjust = 1),
|
|
349 axis.text = element_text(size = 10))
|
|
350
|
|
351 pdf(file = opt$clus_anal_pdf, onefile = T)
|
|
352 plot(p_gcl)
|
|
353 dev.off()
|
|
354
|
|
355 ## ==== Enrichment analysis ====
|
|
356
|
|
357 ge <- GOEnricher(cluster_vector,
|
|
358 pval = opt$pval,
|
|
359 min_count = opt$min_count,
|
|
360 annot_pkg = opt$annot_pkg,
|
|
361 ont = opt$ont,
|
|
362 gene_uni = as.character(pre$data.line.zscores$Line))
|
|
363
|
|
364 if (nrow(ge$enrichment.summary) > 0) {
|
|
365 write.table(ge$enrichment.summary, file = opt$go_en_out, sep = "\t",
|
|
366 row.names = FALSE)
|
|
367 }
|
|
368
|
|
369 ## ==== Network analysis ====
|
|
370
|
|
371 gn <- GeneticNetwork(data = zscore_profiles,
|
|
372 method_corr = opt$method_corr,
|
|
373 thres_corr = opt$thres_corr,
|
|
374 network_modules = "input",
|
|
375 cluster_vector = cluster_vector,
|
|
376 cluster_label_vector = NULL)
|
|
377
|
|
378 pdf(file = opt$gene_net_pdf, onefile = T) # width = 15, height = 10
|
|
379 plot(gn$plot.network)
|
|
380 plot(gn$plot.impact_betweenness)
|
|
381 dev.off()
|
|
382
|
|
383 write.table(gn$stats.impact_betweenness, file = opt$imbe_out,
|
|
384 sep = "\t", row.names = FALSE)
|