Mercurial > repos > artbio > ez_histograms
view ez_histograms.R @ 2:d375c9df9c34 draft default tip
planemo upload for repository https://github.com/artbio/tools-artbio/tree/main/tools/ez_histograms commit f8dffb887b68fdc2c9b35f30676539b8396abd54
author | artbio |
---|---|
date | Thu, 07 Nov 2024 15:40:40 +0000 |
parents | fbedb212982d |
children |
line wrap: on
line source
library(ggplot2) library(reshape2) library(dplyr) library(scales) library(psych) library(optparse) options( show.error.messages = FALSE, error = function() { cat(geterrmessage(), file = stderr()) q("no", 1, FALSE) } ) loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") warnings() option_list <- list( make_option( c("-f", "--file"), default = NA, type = "character", help = "Input file that contains count values to transform" ), make_option( c("-d", "--profile"), default = "count", type = "character", help = "Whether y-axis shows absolute counts or density: 'count' or 'density' [default : '%default' ]" ), make_option( "--xscale", default = "cartesian", type = "character", help = "Whether x-axis is 'cartesian', 'log2' or 'log10' [default : '%default' ]" ), make_option( "--yscale", default = "cartesian", type = "character", help = "Whether y-axis is 'cartesian', 'log2' or 'log10' [default : '%default' ]" ), make_option( c("-p", "--pdf"), default = "histograms.pdf", type = "character", help = "Output pdf file name [default : '%default' ]" ), make_option( c("-s", "--summary"), default = "summary.tsv", type = "character", help = "statistics summary file name [default : '%default' ]" ) ) opt <- parse_args(OptionParser(option_list = option_list), args = commandArgs(trailingOnly = TRUE) ) plot_histograms <- function(mdata, profile = "count", xscale = "cartesian", yscale = "cartesian", bins = 30) { if (profile == "count") { # count histogram p <- ggplot(mdata, aes(x = value, fill = variable, color = variable, y = after_stat(count)), show.legend = FALSE) + geom_histogram(bins = bins) + theme(legend.position = "none") if (xscale == "cartesian") { if (yscale == "log2") { p <- p + scale_y_continuous(trans = "log2", labels = trans_format("log2", math_format(2^.x))) } else { if (yscale == "log10") { p <- p + scale_y_continuous(trans = "log10", labels = trans_format("log10", math_format(10^.x))) } } } if (xscale == "log2") { p <- p + scale_x_continuous(trans = "log2", labels = trans_format("log2", math_format(2^.x))) if (yscale == "log2") { p <- p + scale_y_continuous(trans = "log2", labels = trans_format("log2", math_format(2^.x))) } else { if (yscale == "log10") { p <- p + scale_y_continuous(trans = "log10", labels = trans_format("log10", math_format(10^.x))) } } } if (xscale == "log10") { p <- p + scale_x_continuous(trans = "log10", labels = trans_format("log10", math_format(10^.x))) if (yscale == "log2") { p <- p + scale_y_continuous(trans = "log2", labels = trans_format("log2", math_format(2^.x))) } else { if (yscale == "log10") { p <- p + scale_y_continuous(trans = "log10", labels = trans_format("log10", math_format(10^.x))) } } } } if (profile == "density") { # density histogram p <- ggplot(mdata, aes(x = value, fill = variable, color = variable)) + geom_density() + theme(legend.position = "none") if (xscale == "log2") { p <- p + scale_x_continuous(trans = "log2", labels = trans_format("log2", math_format(2^.x))) } if (xscale == "log10") { p <- p + scale_x_continuous(trans = "log10", labels = trans_format("log10", math_format(10^.x))) } } return(p) } test_header <- function(file) { data <- read.delim(file = file, header = FALSE, row.names = 1, nrows = 2) if (all(is.na(as.numeric(data[1, seq_len(ncol(data))])))) { return(TRUE) } else { return(FALSE) } } ##### prepare input data data <- read.delim(file = opt$file, header = test_header(opt$file)) data <- data %>% select(where(is.numeric)) # remove non numeric columns mdata <- melt(data) ##### main # determine optimal number of bins (Sturges’ Rule) bins <- ceiling(log2(nrow(data)) + 1) # plot p <- plot_histograms(mdata, profile = opt$profile, xscale = opt$xscale, bins = bins, yscale = opt$yscale) # determine optimal width for the graph width <- length(data) width <- case_when( width == 1 ~ 14 / 3, width == 2 ~ (2 / 3) * 14, TRUE ~ 14 ) # determine optimal height for the graph height <- length(data) height <- case_when( height <= 3 ~ 3, height <= 6 ~ 6, TRUE ~ (floor(height / 3) + 1) * 3 ) # determine optimal number of col for the graph ncol <- length(data) ncol <- case_when( ncol == 1 ~ 1, ncol == 2 ~ 2, TRUE ~ 3 ) pdf(opt$pdf, width = width, height = height) print(p + facet_wrap(~variable, ncol = ncol, scales = "free")) dev.off() # Summary statistics with psych package summary_df <- describe(x = data, skew = FALSE, ranges = FALSE, quant = c(.25, .50, .75)) summary_df <- cbind(var_names = rownames(summary_df), summary_df) colnames(summary_df)[2] <- "var_num" summary_df <- summary_df[, -6] summary_df[, 4:8] <- format(summary_df[, 4:8], scientific = TRUE) write.table(summary_df, file = opt$summary, sep = "\t", quote = FALSE, row.names = FALSE)