Mercurial > repos > artbio > cpm_tpm_rpk
diff cpm_tpm_rpk.R @ 7:f00c1c34565e draft default tip
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/cpm_tpm_rpk commit 8cca5f05212cd25113955fb9c11801f4047c12b4
author | artbio |
---|---|
date | Wed, 03 Jul 2024 14:39:26 +0000 |
parents | bcff1eb6fdb5 |
children |
line wrap: on
line diff
--- a/cpm_tpm_rpk.R Sat Oct 14 16:24:48 2023 +0000 +++ b/cpm_tpm_rpk.R Wed Jul 03 14:39:26 2024 +0000 @@ -1,15 +1,15 @@ if (length(commandArgs(TRUE)) == 0) { - system("Rscript cpm_tpm_rpk.R -h", intern = FALSE) - q("no") + system("Rscript cpm_tpm_rpk.R -h", intern = FALSE) + q("no") } # load packages that are provided in the conda env options(show.error.messages = FALSE, - error = function() { - cat(geterrmessage(), file = stderr()) - q("no", 1, FALSE) - } + error = function() { + cat(geterrmessage(), file = stderr()) + q("no", 1, FALSE) + } ) loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") # nolintā warnings() @@ -20,105 +20,104 @@ library(ggfortify) - #Arguments option_list <- list( - make_option( - c("-d", "--data"), - default = NA, - type = "character", - help = "Input file that contains count values to transform" - ), - make_option( - c("-t", "--type"), - default = "cpm", - type = "character", - help = "Transformation type, either cpm, tpm, rpk or none[default : '%default' ]" - ), - make_option( - c("-s", "--sep"), - default = "\t", - type = "character", - help = "File separator [default : '%default' ]" - ), - make_option( - c("-c", "--colnames"), - default = TRUE, - type = "logical", - help = "Consider first line as header ? [default : '%default' ]" - ), - make_option( - c("-f", "--gene"), - default = NA, - type = "character", - help = "Two column of gene length file" - ), - make_option( - c("-a", "--gene_sep"), - default = "\t", - type = "character", - help = "Gene length file separator [default : '%default' ]" - ), - make_option( - c("-b", "--gene_header"), - default = TRUE, - type = "logical", - help = "Consider first line of gene length as header ? [default : '%default' ]" - ), - make_option( - c("-l", "--log"), - default = FALSE, - type = "logical", - help = "Should be log transformed as well ? (log2(data +1)) [default : '%default' ]" - ), - make_option( - c("-o", "--out"), - default = "res.tab", - type = "character", - help = "Output name [default : '%default' ]" - ), - make_option( - "--visu", - default = FALSE, - type = "logical", - help = "performs T-SNE [default : '%default' ]" - ), - make_option( - "--tsne_labels", - default = FALSE, - type = "logical", - help = "add labels to t-SNE plot [default : '%default' ]" - ), - make_option( - "--seed", - default = 42, - type = "integer", - help = "Seed value for reproducibility [default : '%default' ]" - ), - make_option( - "--perp", - default = 5.0, - type = "numeric", - help = "perplexity [default : '%default' ]" - ), - make_option( - "--theta", - default = 1.0, - type = "numeric", - help = "theta [default : '%default' ]" - ), - make_option( - c("-D", "--tsne_out"), - default = "tsne.pdf", - type = "character", - help = "T-SNE pdf [default : '%default' ]" - ), - make_option( - "--pca_out", - default = "pca.pdf", - type = "character", - help = "PCA pdf [default : '%default' ]" - ) + make_option( + c("-d", "--data"), + default = NA, + type = "character", + help = "Input file that contains count values to transform" + ), + make_option( + c("-t", "--type"), + default = "cpm", + type = "character", + help = "Transformation type, either cpm, tpm, rpkm or none[default : '%default' ]" + ), + make_option( + c("-s", "--sep"), + default = "\t", + type = "character", + help = "File separator [default : '%default' ]" + ), + make_option( + c("-c", "--colnames"), + default = TRUE, + type = "logical", + help = "Consider first line as header ? [default : '%default' ]" + ), + make_option( + c("-f", "--gene"), + default = NA, + type = "character", + help = "Two column of gene length file" + ), + make_option( + c("-a", "--gene_sep"), + default = "\t", + type = "character", + help = "Gene length file separator [default : '%default' ]" + ), + make_option( + c("-b", "--gene_header"), + default = TRUE, + type = "logical", + help = "Consider first line of gene length as header ? [default : '%default' ]" + ), + make_option( + c("-l", "--log"), + default = FALSE, + type = "logical", + help = "Should be log transformed as well ? (log2(data +1)) [default : '%default' ]" + ), + make_option( + c("-o", "--out"), + default = "res.tab", + type = "character", + help = "Output name [default : '%default' ]" + ), + make_option( + "--visu", + default = FALSE, + type = "logical", + help = "performs T-SNE [default : '%default' ]" + ), + make_option( + "--tsne_labels", + default = FALSE, + type = "logical", + help = "add labels to t-SNE plot [default : '%default' ]" + ), + make_option( + "--seed", + default = 42, + type = "integer", + help = "Seed value for reproducibility [default : '%default' ]" + ), + make_option( + "--perp", + default = 5.0, + type = "numeric", + help = "perplexity [default : '%default' ]" + ), + make_option( + "--theta", + default = 1.0, + type = "numeric", + help = "theta [default : '%default' ]" + ), + make_option( + c("-D", "--tsne_out"), + default = "tsne.pdf", + type = "character", + help = "T-SNE pdf [default : '%default' ]" + ), + make_option( + "--pca_out", + default = "pca.pdf", + type = "character", + help = "PCA pdf [default : '%default' ]" + ) ) @@ -126,116 +125,127 @@ args = commandArgs(trailingOnly = TRUE)) if (opt$data == "" && !(opt$help)) { - stop("At least one argument must be supplied (count data).\n", - call. = FALSE) -} else if ((opt$type == "tpm" || opt$type == "rpk") && opt$gene == "") { - stop("At least two arguments must be supplied (count data and gene length file).\n", - call. = FALSE) -} else if (opt$type != "tpm" && opt$type != "rpk" && opt$type != "cpm" && opt$type != "none") { - stop("Wrong transformation requested (--type option) must be : cpm, tpm or rpk.\n", - call. = FALSE) + stop("At least one argument must be supplied (count data).\n", + call. = FALSE) +} else if ((opt$type == "tpm" || opt$type == "rpkm") && opt$gene == "") { + stop("At least two arguments must be supplied (count data and gene length file).\n", + call. = FALSE) +} else if (opt$type != "tpm" && opt$type != "rpkm" && opt$type != "cpm" && opt$type != "none") { + stop("Wrong transformation requested (--type option) must be : cpm, tpm or rpkm.\n", + call. = FALSE) } if (opt$sep == "tab") { - opt$sep <- "\t" + opt$sep <- "\t" } if (opt$gene_sep == "tab") { - opt$gene_sep <- "\t" + opt$gene_sep <- "\t" } cpm <- function(count) { - (count / colSums(count)) * 1000000 + (count / colSums(count)) * 1000000 } rpk <- function(count, length) { - count / (length / 1000) + count / (length / 1000) } tpm <- function(count, length) { - rpk <- rpk(count, length) - per_million_factor <- colSums(rpk) / 1000000 - tpm <- rpk / per_million_factor - return(tpm) + rpk <- rpk(count, length) + per_million_factor <- colSums(rpk) / 1000000 + tpm <- rpk / per_million_factor + return(tpm) +} + +rpkm <- function(count, length) { + rpk <- rpk(count, length) + per_million_factor <- colSums(as.data.frame(count)) / 1000000 + rpkm <- rpk / per_million_factor + return(rpkm) } #### running code #### data <- read.delim( - opt$data, - check.names = FALSE, - header = opt$colnames, - row.names = 1, - sep = opt$sep + opt$data, + check.names = FALSE, + header = opt$colnames, + row.names = 1, + sep = opt$sep ) -if (opt$type == "tpm" || opt$type == "rpk") { - gene_length <- as.data.frame( - read.delim( - opt$gene, - header = opt$gene_header, - row.names = 1, - sep = opt$gene_sep +if (opt$type == "tpm" || opt$type == "rpkm") { + gene_length <- as.data.frame( + read.delim( + opt$gene, + header = opt$gene_header, + row.names = 1, + sep = opt$gene_sep + ) ) - ) - gene_length <- as.data.frame(gene_length[match(rownames(data), rownames(gene_length)), ], rownames(data)) + gene_length <- as.data.frame(gene_length[match(rownames(data), rownames(gene_length)), ], rownames(data)) } -if (opt$type == "cpm") - res <- cpm(data) -if (opt$type == "tpm") - res <- as.data.frame(apply(data, 2, tpm, length = gene_length), row.names = rownames(data)) -if (opt$type == "rpk") - res <- as.data.frame(apply(data, 2, rpk, length = gene_length), row.names = rownames(data)) -if (opt$type == "none") - res <- data +if (opt$type == "cpm") { + res <- cpm(data) +} +if (opt$type == "tpm") { + res <- as.data.frame(apply(data, 2, tpm, length = gene_length), row.names = rownames(data)) +} +if (opt$type == "rpkm") { + res <- as.data.frame(apply(data, 2, rpkm, length = gene_length), row.names = rownames(data)) +} +if (opt$type == "none") { + res <- data +} colnames(res) <- colnames(data) if (opt$log == TRUE) { - res <- log2(res + 1) + res <- log2(res + 1) } if (opt$visu == TRUE) { - df <- res - # filter and transpose df for tsne and pca - df <- df[rowSums(df) != 0, ] # remove lines without information (with only 0 counts) - tdf <- t(df) - # make tsne and plot results - set.seed(opt$seed) ## Sets seed for reproducibility - tsne_out <- Rtsne(tdf, perplexity = opt$perp, theta = opt$theta) - embedding <- as.data.frame(tsne_out$Y) - embedding$Class <- as.factor(sub("Class_", "", rownames(tdf))) - gg_legend <- theme(legend.position = "none") - ggplot(embedding, aes(x = V1, y = V2)) + - geom_point(size = 1, color = "red") + - gg_legend + - xlab("") + - ylab("") + - ggtitle("t-SNE") + + df <- res + # filter and transpose df for tsne and pca + df <- df[rowSums(df) != 0, ] # remove lines without information (with only 0 counts) + tdf <- t(df) + # make tsne and plot results + set.seed(opt$seed) ## Sets seed for reproducibility + tsne_out <- Rtsne(tdf, perplexity = opt$perp, theta = opt$theta) + embedding <- as.data.frame(tsne_out$Y) + embedding$Class <- as.factor(sub("Class_", "", rownames(tdf))) + gg_legend <- theme(legend.position = "none") + ggplot(embedding, aes(x = V1, y = V2)) + + geom_point(size = 1, color = "red") + + gg_legend + + xlab("") + + ylab("") + + ggtitle("t-SNE") + + if (opt$tsne_labels == TRUE) { + geom_text(aes(label = Class), hjust = -0.2, vjust = -0.5, size = 2.5, color = "darkblue") + } + ggsave(file = opt$tsne_out, device = "pdf") + # make PCA and plot result with ggfortify (autoplot) + tdf_pca <- prcomp(tdf, center = TRUE, scale. = TRUE) if (opt$tsne_labels == TRUE) { - geom_text(aes(label = Class), hjust = -0.2, vjust = -0.5, size = 2.5, color = "darkblue") + autoplot(tdf_pca, shape = FALSE, label = TRUE, label.size = 2.5, label.vjust = 1.2, + label.hjust = 1.2, + colour = "darkblue") + + geom_point(size = 1, color = "red") + + xlab(paste("PC1", summary(tdf_pca)$importance[2, 1] * 100, "%")) + + ylab(paste("PC2", summary(tdf_pca)$importance[2, 2] * 100, "%")) + + ggtitle("PCA") + ggsave(file = opt$pca_out, device = "pdf") + } else { + autoplot(tdf_pca, shape = TRUE, colour = "darkblue") + + geom_point(size = 1, color = "red") + + xlab(paste("PC1", summary(tdf_pca)$importance[2, 1] * 100, "%")) + + ylab(paste("PC2", summary(tdf_pca)$importance[2, 2] * 100, "%")) + + ggtitle("PCA") + ggsave(file = opt$pca_out, device = "pdf") } - ggsave(file = opt$tsne_out, device = "pdf") - # make PCA and plot result with ggfortify (autoplot) - tdf_pca <- prcomp(tdf, center = TRUE, scale. = TRUE) - if (opt$tsne_labels == TRUE) { - autoplot(tdf_pca, shape = FALSE, label = TRUE, label.size = 2.5, label.vjust = 1.2, - label.hjust = 1.2, - colour = "darkblue") + - geom_point(size = 1, color = "red") + - xlab(paste("PC1", summary(tdf_pca)$importance[2, 1] * 100, "%")) + - ylab(paste("PC2", summary(tdf_pca)$importance[2, 2] * 100, "%")) + - ggtitle("PCA") - ggsave(file = opt$pca_out, device = "pdf") - } else { - autoplot(tdf_pca, shape = TRUE, colour = "darkblue") + - geom_point(size = 1, color = "red") + - xlab(paste("PC1", summary(tdf_pca)$importance[2, 1] * 100, "%")) + - ylab(paste("PC2", summary(tdf_pca)$importance[2, 2] * 100, "%")) + - ggtitle("PCA") - ggsave(file = opt$pca_out, device = "pdf") - } } # at this stage, we select numeric columns and round theirs values to 8 decimals for cleaner output @@ -243,10 +253,10 @@ res[is_num] <- lapply(res[is_num], round, 8) write.table( - cbind(Features = rownames(res), res), - opt$out, - col.names = opt$colnames, - row.names = FALSE, - quote = FALSE, - sep = "\t" + cbind(Features = rownames(res), res), + opt$out, + col.names = opt$colnames, + row.names = FALSE, + quote = FALSE, + sep = "\t" )