Mercurial > repos > artbio > ez_histograms
comparison ez_histograms.R @ 0:bdf40b0924cb draft
planemo upload for repository https://github.com/artbio/tools-artbio/tree/main/tools/ez_histograms commit 443759a746f78d67dc4ffcafdc6610d09d278846
author | artbio |
---|---|
date | Wed, 07 Feb 2024 19:49:56 +0000 |
parents | |
children | fbedb212982d |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:bdf40b0924cb |
---|---|
1 library(ggplot2) | |
2 library(reshape2) | |
3 library(dplyr) | |
4 library(scales) | |
5 library(vtable) | |
6 library(optparse) | |
7 | |
8 options(show.error.messages = FALSE, | |
9 error = function() { | |
10 cat(geterrmessage(), file = stderr()) | |
11 q("no", 1, FALSE) | |
12 } | |
13 ) | |
14 | |
15 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") | |
16 warnings() | |
17 | |
18 option_list <- list( | |
19 make_option( | |
20 c("-f", "--file"), | |
21 default = NA, | |
22 type = "character", | |
23 help = "Input file that contains count values to transform" | |
24 ), | |
25 make_option( | |
26 c("-d", "--profile"), | |
27 default = "count", | |
28 type = "character", | |
29 help = "Whether y-axis shows absolute counts or density: 'count' or 'density' [default : '%default' ]" | |
30 ), | |
31 make_option( | |
32 "--xscale", | |
33 default = "cartesian", | |
34 type = "character", | |
35 help = "Whether x-axis is 'cartesian', 'log2' or 'log10' [default : '%default' ]" | |
36 ), | |
37 make_option( | |
38 "--yscale", | |
39 default = "cartesian", | |
40 type = "character", | |
41 help = "Whether y-axis is 'cartesian', 'log2' or 'log10' [default : '%default' ]" | |
42 ), | |
43 make_option( | |
44 c("-p", "--pdf"), | |
45 default = "histograms.pdf", | |
46 type = "character", | |
47 help = "Output pdf file name [default : '%default' ]" | |
48 ), | |
49 make_option( | |
50 c("-s", "--summary"), | |
51 default = "summary.tsv", | |
52 type = "character", | |
53 help = "statistics summary file name [default : '%default' ]" | |
54 ) | |
55 ) | |
56 | |
57 opt <- parse_args(OptionParser(option_list = option_list), | |
58 args = commandArgs(trailingOnly = TRUE)) | |
59 | |
60 plot_histograms <- function(mdata, profile = "count", xscale = "cartesian", yscale = "cartesian", bins = 30) { | |
61 if (profile == "count") { | |
62 # count histogram | |
63 p <- ggplot(mdata, aes(x = value, fill = variable, color = variable, y = after_stat(count)), show.legend = FALSE) + | |
64 geom_histogram(bins = bins) + theme(legend.position = "none") | |
65 if (xscale == "cartesian") { | |
66 if (yscale == "log2") { | |
67 p <- p + scale_y_continuous(trans = "log2", labels = trans_format("log2", math_format(2^.x))) | |
68 } else { | |
69 if (yscale == "log10") { | |
70 p <- p + scale_y_continuous(trans = "log10", labels = trans_format("log10", math_format(10^.x))) | |
71 } | |
72 } | |
73 } | |
74 if (xscale == "log2") { | |
75 p <- p + scale_x_continuous(trans = "log2", labels = trans_format("log2", math_format(2^.x))) | |
76 if (yscale == "log2") { | |
77 p <- p + scale_y_continuous(trans = "log2", labels = trans_format("log2", math_format(2^.x))) | |
78 } else { | |
79 if (yscale == "log10") { | |
80 p <- p + scale_y_continuous(trans = "log10", labels = trans_format("log10", math_format(10^.x))) | |
81 } | |
82 } | |
83 } | |
84 if (xscale == "log10") { | |
85 p <- p + scale_x_continuous(trans = "log10", labels = trans_format("log10", math_format(10^.x))) | |
86 if (yscale == "log2") { | |
87 p <- p + scale_y_continuous(trans = "log2", labels = trans_format("log2", math_format(2^.x))) | |
88 } else { | |
89 if (yscale == "log10") { | |
90 p <- p + scale_y_continuous(trans = "log10", labels = trans_format("log10", math_format(10^.x))) | |
91 } | |
92 } | |
93 } | |
94 } | |
95 | |
96 if (profile == "density") { | |
97 # density histogram | |
98 p <- ggplot(mdata, aes(x = value, fill = variable, color = variable)) + | |
99 geom_density() + theme(legend.position = "none") | |
100 if (xscale == "log2") { | |
101 p <- p + scale_x_continuous(trans = "log2", labels = trans_format("log2", math_format(2^.x))) | |
102 } | |
103 if (xscale == "log10") { | |
104 p <- p + scale_x_continuous(trans = "log10", labels = trans_format("log10", math_format(10^.x))) | |
105 } | |
106 } | |
107 return(p) | |
108 } | |
109 | |
110 test_header <- function(file) { | |
111 data <- read.delim(file = file, header = FALSE, row.names = 1, nrows = 2) | |
112 if (all(is.na(as.numeric(data[1, seq_len(ncol(data))])))) { | |
113 return(TRUE) | |
114 } else { | |
115 return(FALSE) | |
116 } | |
117 } | |
118 | |
119 test_rownames <- function(file) { | |
120 data <- read.delim(file = file, header = FALSE, row.names = NULL, nrows = 2) | |
121 if (is.na(as.numeric(data[2, 1]))) { | |
122 return(1) | |
123 } else { | |
124 return(NULL) | |
125 } | |
126 } | |
127 | |
128 ##### prepare input data | |
129 data <- read.delim(file = opt$file, header = test_header(opt$file), row.names = test_rownames(opt$file)) | |
130 data <- data %>% select(where(is.numeric)) # remove non numeric columns | |
131 mdata <- melt(data) | |
132 | |
133 ##### main | |
134 | |
135 # determine optimal number of bins (Sturges’ Rule) | |
136 bins <- ceiling(log2(nrow(data)) + 1) | |
137 # plot | |
138 p <- plot_histograms(mdata, profile = opt$profile, xscale = opt$xscale, bins = bins, yscale = opt$yscale) | |
139 | |
140 # determine optimal width for the graph | |
141 width <- length(data) | |
142 width <- case_when( | |
143 width == 1 ~ 14 / 3, | |
144 width == 2 ~ (2 / 3) * 14, | |
145 TRUE ~ 14 | |
146 ) | |
147 # determine optimal height for the graph | |
148 height <- length(data) | |
149 height <- case_when( | |
150 height <= 3 ~ 3, | |
151 height <= 6 ~ 6, | |
152 TRUE ~ (floor(height / 3) + 1) * 3 | |
153 ) | |
154 # determine optimal number of col for the graph | |
155 ncol <- length(data) | |
156 ncol <- case_when( | |
157 ncol == 1 ~ 1, | |
158 ncol == 2 ~ 2, | |
159 TRUE ~ 3 | |
160 ) | |
161 pdf(opt$pdf, width = width, height = height) | |
162 print(p + facet_wrap(~variable, ncol = ncol, scales = "free")) | |
163 dev.off() | |
164 | |
165 # Summary statistics with vtable package | |
166 summary_df <- sumtable(data, digits = 8, out = "return", add.median = TRUE, | |
167 summ.names = c("N", "Mean", "Std. Dev.", "Min", "Pctl. 25", | |
168 "Median", "Pctl. 75", "Max")) | |
169 write.table(summary_df, file = opt$summary, sep = "\t", quote = FALSE, row.names = FALSE) |