comparison ez_histograms.R @ 0:bdf40b0924cb draft

planemo upload for repository https://github.com/artbio/tools-artbio/tree/main/tools/ez_histograms commit 443759a746f78d67dc4ffcafdc6610d09d278846
author artbio
date Wed, 07 Feb 2024 19:49:56 +0000
parents
children fbedb212982d
comparison
equal deleted inserted replaced
-1:000000000000 0:bdf40b0924cb
1 library(ggplot2)
2 library(reshape2)
3 library(dplyr)
4 library(scales)
5 library(vtable)
6 library(optparse)
7
8 options(show.error.messages = FALSE,
9 error = function() {
10 cat(geterrmessage(), file = stderr())
11 q("no", 1, FALSE)
12 }
13 )
14
15 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
16 warnings()
17
18 option_list <- list(
19 make_option(
20 c("-f", "--file"),
21 default = NA,
22 type = "character",
23 help = "Input file that contains count values to transform"
24 ),
25 make_option(
26 c("-d", "--profile"),
27 default = "count",
28 type = "character",
29 help = "Whether y-axis shows absolute counts or density: 'count' or 'density' [default : '%default' ]"
30 ),
31 make_option(
32 "--xscale",
33 default = "cartesian",
34 type = "character",
35 help = "Whether x-axis is 'cartesian', 'log2' or 'log10' [default : '%default' ]"
36 ),
37 make_option(
38 "--yscale",
39 default = "cartesian",
40 type = "character",
41 help = "Whether y-axis is 'cartesian', 'log2' or 'log10' [default : '%default' ]"
42 ),
43 make_option(
44 c("-p", "--pdf"),
45 default = "histograms.pdf",
46 type = "character",
47 help = "Output pdf file name [default : '%default' ]"
48 ),
49 make_option(
50 c("-s", "--summary"),
51 default = "summary.tsv",
52 type = "character",
53 help = "statistics summary file name [default : '%default' ]"
54 )
55 )
56
57 opt <- parse_args(OptionParser(option_list = option_list),
58 args = commandArgs(trailingOnly = TRUE))
59
60 plot_histograms <- function(mdata, profile = "count", xscale = "cartesian", yscale = "cartesian", bins = 30) {
61 if (profile == "count") {
62 # count histogram
63 p <- ggplot(mdata, aes(x = value, fill = variable, color = variable, y = after_stat(count)), show.legend = FALSE) +
64 geom_histogram(bins = bins) + theme(legend.position = "none")
65 if (xscale == "cartesian") {
66 if (yscale == "log2") {
67 p <- p + scale_y_continuous(trans = "log2", labels = trans_format("log2", math_format(2^.x)))
68 } else {
69 if (yscale == "log10") {
70 p <- p + scale_y_continuous(trans = "log10", labels = trans_format("log10", math_format(10^.x)))
71 }
72 }
73 }
74 if (xscale == "log2") {
75 p <- p + scale_x_continuous(trans = "log2", labels = trans_format("log2", math_format(2^.x)))
76 if (yscale == "log2") {
77 p <- p + scale_y_continuous(trans = "log2", labels = trans_format("log2", math_format(2^.x)))
78 } else {
79 if (yscale == "log10") {
80 p <- p + scale_y_continuous(trans = "log10", labels = trans_format("log10", math_format(10^.x)))
81 }
82 }
83 }
84 if (xscale == "log10") {
85 p <- p + scale_x_continuous(trans = "log10", labels = trans_format("log10", math_format(10^.x)))
86 if (yscale == "log2") {
87 p <- p + scale_y_continuous(trans = "log2", labels = trans_format("log2", math_format(2^.x)))
88 } else {
89 if (yscale == "log10") {
90 p <- p + scale_y_continuous(trans = "log10", labels = trans_format("log10", math_format(10^.x)))
91 }
92 }
93 }
94 }
95
96 if (profile == "density") {
97 # density histogram
98 p <- ggplot(mdata, aes(x = value, fill = variable, color = variable)) +
99 geom_density() + theme(legend.position = "none")
100 if (xscale == "log2") {
101 p <- p + scale_x_continuous(trans = "log2", labels = trans_format("log2", math_format(2^.x)))
102 }
103 if (xscale == "log10") {
104 p <- p + scale_x_continuous(trans = "log10", labels = trans_format("log10", math_format(10^.x)))
105 }
106 }
107 return(p)
108 }
109
110 test_header <- function(file) {
111 data <- read.delim(file = file, header = FALSE, row.names = 1, nrows = 2)
112 if (all(is.na(as.numeric(data[1, seq_len(ncol(data))])))) {
113 return(TRUE)
114 } else {
115 return(FALSE)
116 }
117 }
118
119 test_rownames <- function(file) {
120 data <- read.delim(file = file, header = FALSE, row.names = NULL, nrows = 2)
121 if (is.na(as.numeric(data[2, 1]))) {
122 return(1)
123 } else {
124 return(NULL)
125 }
126 }
127
128 ##### prepare input data
129 data <- read.delim(file = opt$file, header = test_header(opt$file), row.names = test_rownames(opt$file))
130 data <- data %>% select(where(is.numeric)) # remove non numeric columns
131 mdata <- melt(data)
132
133 ##### main
134
135 # determine optimal number of bins (Sturges’ Rule)
136 bins <- ceiling(log2(nrow(data)) + 1)
137 # plot
138 p <- plot_histograms(mdata, profile = opt$profile, xscale = opt$xscale, bins = bins, yscale = opt$yscale)
139
140 # determine optimal width for the graph
141 width <- length(data)
142 width <- case_when(
143 width == 1 ~ 14 / 3,
144 width == 2 ~ (2 / 3) * 14,
145 TRUE ~ 14
146 )
147 # determine optimal height for the graph
148 height <- length(data)
149 height <- case_when(
150 height <= 3 ~ 3,
151 height <= 6 ~ 6,
152 TRUE ~ (floor(height / 3) + 1) * 3
153 )
154 # determine optimal number of col for the graph
155 ncol <- length(data)
156 ncol <- case_when(
157 ncol == 1 ~ 1,
158 ncol == 2 ~ 2,
159 TRUE ~ 3
160 )
161 pdf(opt$pdf, width = width, height = height)
162 print(p + facet_wrap(~variable, ncol = ncol, scales = "free"))
163 dev.off()
164
165 # Summary statistics with vtable package
166 summary_df <- sumtable(data, digits = 8, out = "return", add.median = TRUE,
167 summ.names = c("N", "Mean", "Std. Dev.", "Min", "Pctl. 25",
168 "Median", "Pctl. 75", "Max"))
169 write.table(summary_df, file = opt$summary, sep = "\t", quote = FALSE, row.names = FALSE)