comparison pathifier.R @ 1:0960bd1161fa draft default tip

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/pathifier commit f7363f7c391bd501fc4d1c10aaf372ec2bd82732
author artbio
date Mon, 12 Feb 2024 23:57:01 +0000
parents fec313f5c889
children
comparison
equal deleted inserted replaced
0:fec313f5c889 1:0960bd1161fa
2 # Running PATHIFIER (Drier et al., 2013) 2 # Running PATHIFIER (Drier et al., 2013)
3 # Based on the work of Author: Miguel Angel Garcia-Campos - Github: https://github.com/AngelCampos 3 # Based on the work of Author: Miguel Angel Garcia-Campos - Github: https://github.com/AngelCampos
4 ################################################################################################## 4 ##################################################################################################
5 5
6 6
7 options(show.error.messages = F, error = function() { 7 options(show.error.messages = FALSE, error = function() {
8 cat(geterrmessage(), file = stderr()); q("no", 1, F) 8 cat(geterrmessage(), file = stderr())
9 } 9 q("no", 1, FALSE)
10 ) 10 })
11 # we need that to not crash galaxy with an UTF8 error on German LC settings. 11 # we need that to not crash galaxy with an UTF8 error on German LC settings.
12 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") 12 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
13 13
14 library(pathifier) 14 library(pathifier)
15 library(optparse) 15 library(optparse)
16 library(pheatmap) 16 library(pheatmap)
17 library(scatterplot3d) 17 library(scatterplot3d)
18 library(circlize) 18 library(circlize)
19 19
20 option_list <- list( 20 option_list <- list(
21 make_option( 21 make_option(
22 "--exp", 22 "--exp",
23 type = "character", 23 type = "character",
24 help = "Expression matrix"), 24 help = "Expression matrix"
25 make_option( 25 ),
26 "--sep", 26 make_option(
27 type = "character", 27 "--sep",
28 default = "\t", 28 type = "character",
29 help = "File separator [default : '%default']" 29 default = "\t",
30 ), 30 help = "File separator [default : '%default']"
31 make_option( 31 ),
32 "--genes", 32 make_option(
33 type = "character", 33 "--genes",
34 help = "Gene sets Pathways : gmt format (one pathway per line : Name, description, genes (one by column), tab separated)"), 34 type = "character",
35 make_option( 35 help = "Gene sets Pathways : gmt format (one pathway per line : Name, description, genes (one by column), tab separated)"
36 "--is_normal", 36 ),
37 default = F, 37 make_option(
38 type = "logical", 38 "--is_normal",
39 help = "Define normals cells in your data"), 39 default = FALSE,
40 make_option( 40 type = "logical",
41 "--normals", 41 help = "Define normals cells in your data"
42 type = "character", 42 ),
43 help = "A vector of sample status : 1 = Healthy, 0 = Tumor. Must be in the same order as in expression data"), 43 make_option(
44 make_option( 44 "--normals",
45 "--logfile", 45 type = "character",
46 type = "character", 46 help = "A vector of sample status : 1 = Healthy, 0 = Tumor. Must be in the same order as in expression data"
47 default = "log.txt", 47 ),
48 help = "Log file name [default : '%default']" 48 make_option(
49 ), 49 "--logfile",
50 make_option( 50 type = "character",
51 "--max_stability", 51 default = "log.txt",
52 type = "logical", 52 help = "Log file name [default : '%default']"
53 default = T, 53 ),
54 help = "If true, throw away components leading to low stability of sampling noise [default : '%default']" 54 make_option(
55 ), 55 "--max_stability",
56 make_option( 56 type = "logical",
57 "--attempts", 57 default = TRUE,
58 type = "integer", 58 help = "If true, throw away components leading to low stability of sampling noise [default : '%default']"
59 default = 10, 59 ),
60 help = "Number of runs to determine stability. [default : '%default']" 60 make_option(
61 ), 61 "--attempts",
62 make_option( 62 type = "integer",
63 "--min_std", 63 default = 10,
64 type = "character", 64 help = "Number of runs to determine stability. [default : '%default']"
65 default = "0.4", 65 ),
66 help = "Minimum of standard deviation to filter out low variable genes. 66 make_option(
67 "--min_std",
68 type = "character",
69 default = "0.4",
70 help = "Minimum of standard deviation to filter out low variable genes.
67 Use --min.std data, to use the minimum std of your data [default : '%default']" 71 Use --min.std data, to use the minimum std of your data [default : '%default']"
68 ), 72 ),
69 make_option( 73 make_option(
70 "--min_exp", 74 "--min_exp",
71 type = "character", 75 type = "character",
72 default = "4", 76 default = "4",
73 help = "Minimum of gene expression to filter out low expressed genes. 77 help = "Minimum of gene expression to filter out low expressed genes.
74 Use --min.exp data, to use the minimum expression of your data [default : '%default']" 78 Use --min.exp data, to use the minimum expression of your data [default : '%default']"
75 ), 79 ),
76 make_option( 80 make_option(
77 "--pds", 81 "--pds",
78 type = "character", 82 type = "character",
79 default = "PDS.tsv", 83 default = "PDS.tsv",
80 help = "Output PDS (Pathway deregulation score) of Pathifier in tabular file [default : '%default']" 84 help = "Output PDS (Pathway deregulation score) of Pathifier in tabular file [default : '%default']"
81 ), 85 ),
82 make_option( 86 make_option(
83 "--heatmap_cluster_cells", 87 "--heatmap_cluster_cells",
84 type = "logical", 88 type = "logical",
85 default = TRUE, 89 default = TRUE,
86 help = "Cluster columns (cells) in the heatmap [default : '%default']" 90 help = "Cluster columns (cells) in the heatmap [default : '%default']"
87 ), 91 ),
88 make_option( 92 make_option(
89 "--heatmap_cluster_pathways", 93 "--heatmap_cluster_pathways",
90 type = "logical", 94 type = "logical",
91 default = TRUE, 95 default = TRUE,
92 help = "Cluster rows (pathways) in the heatmap [default : '%default']" 96 help = "Cluster rows (pathways) in the heatmap [default : '%default']"
93 ), 97 ),
94 make_option( 98 make_option(
95 "--heatmap_show_cell_labels", 99 "--heatmap_show_cell_labels",
96 type = "logical", 100 type = "logical",
97 default = FALSE, 101 default = FALSE,
98 help = "Print column names (cells) on the heatmap [default : '%default']" 102 help = "Print column names (cells) on the heatmap [default : '%default']"
99 ), 103 ),
100 make_option( 104 make_option(
101 "--heatmap_show_pathway_labels", 105 "--heatmap_show_pathway_labels",
102 type = "logical", 106 type = "logical",
103 default = FALSE, 107 default = FALSE,
104 help = "Print row names (pathways) on the heatmap [default : '%default']" 108 help = "Print row names (pathways) on the heatmap [default : '%default']"
105 ), 109 ),
106 make_option( 110 make_option(
107 "--plot", 111 "--plot",
108 type = "character", 112 type = "character",
109 default = "./plot.pdf", 113 default = "./plot.pdf",
110 help = "Pathifier visualization [default : '%default']" 114 help = "Pathifier visualization [default : '%default']"
111 ), 115 ),
112 make_option( 116 make_option(
113 "--rdata", 117 "--rdata",
114 type = "character", 118 type = "character",
115 default = "./results.rdata", 119 default = "./results.rdata",
116 help = "Pathifier object (S4) [default : '%default']")) 120 help = "Pathifier object (S4) [default : '%default']"
121 )
122 )
117 parser <- OptionParser(usage = "%prog [options] file", option_list = option_list) 123 parser <- OptionParser(usage = "%prog [options] file", option_list = option_list)
118 args <- parse_args(parser) 124 args <- parse_args(parser)
119 if (args$sep == "tab") { 125 if (args$sep == "tab") {
120 args$sep <- "\t" 126 args$sep <- "\t"
121 } 127 }
122 128
123 129
124 # set seed for reproducibility 130 # set seed for reproducibility
125 set.seed(123) 131 set.seed(123)
126 132
127 # Load expression data for PATHIFIER 133 # Load expression data for PATHIFIER
128 exp_matrix <- as.matrix(read.delim(file = args$exp, 134 exp_matrix <- as.matrix(read.delim(
129 as.is = T, 135 file = args$exp,
130 row.names = 1, 136 as.is = TRUE,
131 sep = args$sep, 137 row.names = 1,
132 check.names = F)) 138 sep = args$sep,
139 check.names = FALSE
140 ))
133 141
134 # Load Genesets annotation 142 # Load Genesets annotation
135 gene_sets_file <- file(args$genes, open = "r") 143 gene_sets_file <- file(args$genes, open = "r")
136 gene_sets <- readLines(gene_sets_file) 144 gene_sets <- readLines(gene_sets_file)
137 close(gene_sets_file) 145 close(gene_sets_file)
144 # Generate a list that contains the names of the genesets used 152 # Generate a list that contains the names of the genesets used
145 pathwaynames <- names(gs) 153 pathwaynames <- names(gs)
146 154
147 # Prepare data and parameters ################################################## 155 # Prepare data and parameters ##################################################
148 # Extract information from binary phenotypes. 1 = Normal, 0 = Tumor 156 # Extract information from binary phenotypes. 1 = Normal, 0 = Tumor
149 if (args$is_normal == T) { 157 if (args$is_normal == TRUE) {
150 normals <- read.delim(file = args$normals, h = F) 158 normals <- read.delim(file = args$normals, h = FALSE)
151 normals <- as.logical(normals[, 2]) 159 normals <- as.logical(normals[, 2])
152 n_exp_matrix <- exp_matrix[, normals] 160 n_exp_matrix <- exp_matrix[, normals]
153 } else n_exp_matrix <- exp_matrix 161 } else {
162 n_exp_matrix <- exp_matrix
163 }
154 164
155 # Calculate MIN_STD 165 # Calculate MIN_STD
156 rsd <- apply(n_exp_matrix, 1, sd) 166 rsd <- apply(n_exp_matrix, 1, sd)
157 min_std <- quantile(rsd, 0.25) 167 min_std <- quantile(rsd, 0.25)
158 168
159 # Calculate MIN_EXP 169 # Calculate MIN_EXP
160 min_exp <- quantile(as.vector(as.matrix(exp_matrix)), 170 min_exp <- quantile(
161 0.1) # Percentile 10 of data 171 as.vector(as.matrix(exp_matrix)),
172 0.1
173 ) # Percentile 10 of data
162 174
163 # Filter low value genes. At least 10% of samples with values over min_exp 175 # Filter low value genes. At least 10% of samples with values over min_exp
164 # Set expression levels < MIN_EXP to MIN_EXP 176 # Set expression levels < MIN_EXP to MIN_EXP
165 over <- apply(exp_matrix, 1, function(x) x > min_exp) 177 over <- apply(exp_matrix, 1, function(x) x > min_exp)
166 g_over <- apply(over, 2, mean) 178 g_over <- apply(over, 2, mean)
167 g_over <- names(g_over)[g_over > 0.1] 179 g_over <- names(g_over)[g_over > 0.1]
168 exp_matrix_filtered <- exp_matrix[g_over, ] 180 exp_matrix_filtered <- exp_matrix[g_over, ]
169 exp_matrix_filtered[exp_matrix_filtered < min_exp] <- min_exp 181 exp_matrix_filtered[exp_matrix_filtered < min_exp] <- min_exp
170 182
171 # Set maximum 5000 genes with more variance 183 # Set maximum 5000 genes with more variance
172 variable_genes <- names(sort(apply(exp_matrix_filtered, 1, var), decreasing = T))[1:5000] 184 variable_genes <- names(sort(apply(exp_matrix_filtered, 1, var), decreasing = TRUE))[1:5000]
173 variable_genes <- variable_genes[!is.na(variable_genes)] 185 variable_genes <- variable_genes[!is.na(variable_genes)]
174 exp_matrix_filtered <- exp_matrix_filtered[variable_genes, ] 186 exp_matrix_filtered <- exp_matrix_filtered[variable_genes, ]
175 allgenes <- as.vector(rownames(exp_matrix_filtered)) 187 allgenes <- as.vector(rownames(exp_matrix_filtered))
176 188
177 189
178 if (args$min_std == "data") { 190 if (args$min_std == "data") {
179 args$min_std <- min_std 191 args$min_std <- min_std
180 } else args$min_std <- as.numeric(args$min_std) 192 } else {
193 args$min_std <- as.numeric(args$min_std)
194 }
181 195
182 if (args$min_exp == "data") { 196 if (args$min_exp == "data") {
183 args$min_exp <- min_exp 197 args$min_exp <- min_exp
184 } else args$min_exp <- as.numeric(args$min_exp) 198 } else {
199 args$min_exp <- as.numeric(args$min_exp)
200 }
185 201
186 202
187 # Open pdf 203 # Open pdf
188 pdf(args$plot) 204 pdf(args$plot)
189 205
190 # Construct continuous color scale 206 # Construct continuous color scale
191 col_score_fun <- colorRamp2(c(0, 0.5, 1), c("#4575B4", "#FFFFBF", "#D73027")) 207 col_score_fun <- colorRamp2(c(0, 0.5, 1), c("#4575B4", "#FFFFBF", "#D73027"))
192 208
193 # Run Pathifier 209 # Run Pathifier
194 if (args$is_normal == T) { 210 if (args$is_normal == TRUE) {
195 pds <- quantify_pathways_deregulation(exp_matrix_filtered, 211 pds <- quantify_pathways_deregulation(exp_matrix_filtered,
196 allgenes, 212 allgenes,
197 gs, 213 gs,
198 pathwaynames, 214 pathwaynames,
199 normals, 215 normals,
200 maximize_stability = args$max_stability, 216 maximize_stability = args$max_stability,
201 attempts = args$attempts, 217 attempts = args$attempts,
202 logfile = args$logfile, 218 logfile = args$logfile,
203 min_std = args$min_std, 219 min_std = args$min_std,
204 min_exp = args$min_exp) 220 min_exp = args$min_exp
205 for (i in pathwaynames) { 221 )
206 df <- data.frame(pds$curves[[i]][, 1:3], 222 for (i in pathwaynames) {
207 normal = normals, 223 df <- data.frame(pds$curves[[i]][, 1:3],
208 PDS = as.numeric(pds$scores[[i]]), 224 normal = normals,
209 curve_order = as.vector(pds$curves_order[[i]])) 225 PDS = as.numeric(pds$scores[[i]]),
210 ordered <- df[df$curve_order, ] 226 curve_order = as.vector(pds$curves_order[[i]])
211 227 )
212 228 ordered <- df[df$curve_order, ]
213 layout(cbind(1:2, 1:2), heights = c(7, 1)) 229
214 sc3 <- scatterplot3d(ordered[, 1:3], 230
215 main = paste("Principal curve of", i), 231 layout(cbind(1:2, 1:2), heights = c(7, 1))
216 box = F, pch = 19, type = "l") 232 sc3 <- scatterplot3d(ordered[, 1:3],
217 sc3$points3d(ordered[, 1:3], box = F, pch = 19, 233 main = paste("Principal curve of", i),
218 col = col_score_fun(ordered$PDS)) 234 box = FALSE, pch = 19, type = "l"
219 235 )
220 # Plot color scale legend 236 sc3$points3d(ordered[, 1:3],
221 par(mar = c(5, 3, 0, 3)) 237 box = FALSE, pch = 19,
222 plot(seq(min(ordered$PDS), max(ordered$PDS), length = 100), rep(0, 100), pch = 15, 238 col = col_score_fun(ordered$PDS)
223 axes = TRUE, yaxt = "n", xlab = "Color scale of PDS", ylab = "", bty = "n", 239 )
224 col = col_score_fun(seq(min(ordered$PDS), max(ordered$PDS), length = 100))) 240
225 241 # Plot color scale legend
226 242 par(mar = c(5, 3, 0, 3))
227 cols_status <- ifelse(ordered$normal, "blue", "red") 243 plot(seq(min(ordered$PDS), max(ordered$PDS), length = 100), rep(0, 100),
228 sc3 <- scatterplot3d(ordered[, 1:3], 244 pch = 15,
229 main = paste("Principal curve of", i), 245 axes = TRUE, yaxt = "n", xlab = "Color scale of PDS", ylab = "", bty = "n",
230 box = F, pch = "", type = "l") 246 col = col_score_fun(seq(min(ordered$PDS), max(ordered$PDS), length = 100))
231 sc3$points3d(ordered[, 1:3], box = F, 247 )
232 pch = ifelse(ordered$normal, 19, 8), 248
233 col = ifelse(ordered$normal, "blue", "red")) 249
234 legend("topright", pch = c(19, 8), yjust = 0, 250 cols_status <- ifelse(ordered$normal, "blue", "red")
235 legend = c("normal", "cancer"), 251 sc3 <- scatterplot3d(ordered[, 1:3],
236 col = c("blue", "red"), cex = 1.1) 252 main = paste("Principal curve of", i),
237 253 box = FALSE, pch = "", type = "l"
238 ## annotation for heatmap 254 )
239 sample_status <- data.frame(Status = factor(ifelse(df$normal, "normal", "tumor"))) 255 sc3$points3d(ordered[, 1:3],
240 rownames(sample_status) <- colnames(exp_matrix_filtered) 256 box = FALSE,
241 color_status_heatmap <- list(Status = c(normal = "blue", tumor = "red")) 257 pch = ifelse(ordered$normal, 19, 8),
242 } 258 col = ifelse(ordered$normal, "blue", "red")
243 } else{ 259 )
244 pds <- quantify_pathways_deregulation(exp_matrix_filtered, 260 legend("topright",
245 allgenes, 261 pch = c(19, 8), yjust = 0,
246 gs, 262 legend = c("normal", "cancer"),
247 pathwaynames, 263 col = c("blue", "red"), cex = 1.1
248 maximize_stability = args$max_stability, 264 )
249 attempts = args$attempts, 265
250 logfile = args$logfile, 266 ## annotation for heatmap
251 min_std = args$min_std, 267 sample_status <- data.frame(Status = factor(ifelse(df$normal, "normal", "tumor")))
252 min_exp = args$min_exp) 268 rownames(sample_status) <- colnames(exp_matrix_filtered)
253 for (i in pathwaynames) { 269 color_status_heatmap <- list(Status = c(normal = "blue", tumor = "red"))
254 df <- data.frame(pds$curves[[i]][, 1:3], 270 }
255 PDS = as.numeric(pds$scores[[i]]), 271 } else {
256 curve_order = as.vector(pds$curves_order[[i]])) 272 pds <- quantify_pathways_deregulation(exp_matrix_filtered,
257 ordered <- df[df$curve_order, ] 273 allgenes,
258 274 gs,
259 layout(cbind(1:2, 1:2), heights = c(7, 1)) 275 pathwaynames,
260 sc3 <- scatterplot3d(ordered[, 1:3], 276 maximize_stability = args$max_stability,
261 main = paste("Principal curve of", i), 277 attempts = args$attempts,
262 box = F, pch = 19, type = "l") 278 logfile = args$logfile,
263 sc3$points3d(ordered[, 1:3], box = F, pch = 19, 279 min_std = args$min_std,
264 col = col_score_fun(ordered$PDS)) 280 min_exp = args$min_exp
265 281 )
266 # Plot color scale legend 282 for (i in pathwaynames) {
267 par(mar = c(5, 3, 0, 3)) 283 df <- data.frame(pds$curves[[i]][, 1:3],
268 plot(seq(min(ordered$PDS), max(ordered$PDS), length = 100), rep(0, 100), pch = 15, 284 PDS = as.numeric(pds$scores[[i]]),
269 axes = TRUE, yaxt = "n", xlab = "Color scale of PDS", ylab = "", bty = "n", 285 curve_order = as.vector(pds$curves_order[[i]])
270 col = col_score_fun(seq(min(ordered$PDS), max(ordered$PDS), length = 100))) 286 )
271 287 ordered <- df[df$curve_order, ]
272 288
273 ## annotation for heatmap (for the moment none for this situation) 289 layout(cbind(1:2, 1:2), heights = c(7, 1))
274 sample_status <- NA 290 sc3 <- scatterplot3d(ordered[, 1:3],
275 color_status_heatmap <- NA 291 main = paste("Principal curve of", i),
276 } 292 box = FALSE, pch = 19, type = "l"
293 )
294 sc3$points3d(ordered[, 1:3],
295 box = FALSE, pch = 19,
296 col = col_score_fun(ordered$PDS)
297 )
298
299 # Plot color scale legend
300 par(mar = c(5, 3, 0, 3))
301 plot(seq(min(ordered$PDS), max(ordered$PDS), length = 100), rep(0, 100),
302 pch = 15,
303 axes = TRUE, yaxt = "n", xlab = "Color scale of PDS", ylab = "", bty = "n",
304 col = col_score_fun(seq(min(ordered$PDS), max(ordered$PDS), length = 100))
305 )
306
307
308 ## annotation for heatmap (for the moment none for this situation)
309 sample_status <- NA
310 color_status_heatmap <- NA
311 }
277 } 312 }
278 313
279 ## Create dataframe from Pathifier list and round score to 4 digits 314 ## Create dataframe from Pathifier list and round score to 4 digits
280 pds_scores <- mapply(FUN = function(x) cbind(round(x, 4)), pds$scores) 315 pds_scores <- mapply(FUN = function(x) cbind(round(x, 4)), pds$scores)
281 dimnames(pds_scores) <- list(colnames(exp_matrix_filtered), names(pds$scores)) 316 dimnames(pds_scores) <- list(colnames(exp_matrix_filtered), names(pds$scores))
282 317
283 ## plot heatmap 318 ## plot heatmap
284 if (ncol(pds_scores) > 1) { 319 if (ncol(pds_scores) > 1) {
285 pheatmap(t(pds_scores), 320 pheatmap(t(pds_scores),
286 main = "Heatmap of Pathway Deregulation Score", # heat map title 321 main = "Heatmap of Pathway Deregulation Score", # heat map title
287 cluster_rows = args$heatmap_cluster_pathways, # apply clustering method 322 cluster_rows = args$heatmap_cluster_pathways, # apply clustering method
288 cluster_cols = args$heatmap_cluster_cells, # apply clustering method 323 cluster_cols = args$heatmap_cluster_cells, # apply clustering method
289 324
290 #Additional Options 325 # Additional Options
291 ## Color labeled columns 326 ## Color labeled columns
292 annotation_col = sample_status, 327 annotation_col = sample_status,
293 annotation_colors = color_status_heatmap, 328 annotation_colors = color_status_heatmap,
294 show_rownames = args$heatmap_show_pathway_labels, 329 show_rownames = args$heatmap_show_pathway_labels,
295 show_colnames = args$heatmap_show_cell_labels, 330 show_colnames = args$heatmap_show_cell_labels,
296 border_color = NA, 331 border_color = NA,
297 legend = TRUE) 332 legend = TRUE
333 )
298 } 334 }
299 dev.off() 335 dev.off()
300 336
301 337
302 ## write table 338 ## write table
303 write.table(pds_scores, 339 write.table(pds_scores,
304 args$pds, 340 args$pds,
305 row.names = T, 341 row.names = TRUE,
306 col.names = T, 342 col.names = TRUE,
307 quote = F, 343 quote = FALSE,
308 sep = "\t") 344 sep = "\t"
345 )
309 346
310 ## write S4 pathifier object 347 ## write S4 pathifier object
311 save(pds, file = args$rdata) 348 save(pds, file = args$rdata)