Mercurial > repos > artbio > cpm_tpm_rpk
diff cpm_tpm_rpk.R @ 4:be358a1ebf67 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/cpm_tpm_rpk commit a8486c89b7ddabfb1e814c2c42f6c04d3896904c
author | artbio |
---|---|
date | Thu, 05 Oct 2023 13:50:22 +0000 |
parents | 8b1020c25f0f |
children | bcff1eb6fdb5 |
line wrap: on
line diff
--- a/cpm_tpm_rpk.R Fri Apr 12 12:01:35 2019 -0400 +++ b/cpm_tpm_rpk.R Thu Oct 05 13:50:22 2023 +0000 @@ -1,142 +1,150 @@ if (length(commandArgs(TRUE)) == 0) { - system("Rscript cpm_tpm_rpk.R -h", intern = F) + system("Rscript cpm_tpm_rpk.R -h", intern = FALSE) q("no") } # load packages that are provided in the conda env -options( show.error.messages=F, - error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) -loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") +options(show.error.messages = FALSE, + error = function() { + cat(geterrmessage(), file = stderr()) + q("no", 1, FALSE) + } +) +loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") # nolint warnings() library(optparse) library(ggplot2) library(reshape2) -library(Rtsne) +library(Rtsne) # nolint library(ggfortify) #Arguments -option_list = list( +option_list <- list( make_option( c("-d", "--data"), default = NA, - type = 'character', + type = "character", help = "Input file that contains count values to transform" ), make_option( c("-t", "--type"), - default = 'cpm', - type = 'character', + default = "cpm", + type = "character", help = "Transformation type, either cpm, tpm, rpk or none[default : '%default' ]" ), make_option( c("-s", "--sep"), - default = '\t', - type = 'character', + default = "\t", + type = "character", help = "File separator [default : '%default' ]" ), make_option( c("-c", "--colnames"), default = TRUE, - type = 'logical', + type = "logical", help = "Consider first line as header ? [default : '%default' ]" ), make_option( c("-f", "--gene"), default = NA, - type = 'character', + type = "character", help = "Two column of gene length file" ), make_option( c("-a", "--gene_sep"), - default = '\t', - type = 'character', + default = "\t", + type = "character", help = "Gene length file separator [default : '%default' ]" ), make_option( c("-b", "--gene_header"), default = TRUE, - type = 'logical', + type = "logical", help = "Consider first line of gene length as header ? [default : '%default' ]" ), make_option( c("-l", "--log"), default = FALSE, - type = 'logical', + type = "logical", help = "Should be log transformed as well ? (log2(data +1)) [default : '%default' ]" ), make_option( c("-o", "--out"), default = "res.tab", - type = 'character', + type = "character", help = "Output name [default : '%default' ]" ), make_option( "--visu", default = FALSE, - type = 'logical', + type = "logical", help = "performs T-SNE [default : '%default' ]" ), make_option( "--tsne_labels", default = FALSE, - type = 'logical', + type = "logical", help = "add labels to t-SNE plot [default : '%default' ]" ), make_option( "--seed", default = 42, - type = 'integer', + type = "integer", help = "Seed value for reproducibility [default : '%default' ]" ), make_option( "--perp", default = 5.0, - type = 'numeric', + type = "numeric", help = "perplexity [default : '%default' ]" ), make_option( "--theta", default = 1.0, - type = 'numeric', + type = "numeric", help = "theta [default : '%default' ]" ), make_option( c("-D", "--tsne_out"), default = "tsne.pdf", - type = 'character', + type = "character", help = "T-SNE pdf [default : '%default' ]" ), make_option( "--pca_out", default = "pca.pdf", - type = 'character', + type = "character", help = "PCA pdf [default : '%default' ]" ) ) -opt = parse_args(OptionParser(option_list = option_list), - args = commandArgs(trailingOnly = TRUE)) +opt <- parse_args(OptionParser(option_list = option_list), + args = commandArgs(trailingOnly = TRUE)) -if (opt$data == "" & !(opt$help)) { +if (opt$data == "" && !(opt$help)) { stop("At least one argument must be supplied (count data).\n", call. = FALSE) -} else if ((opt$type == "tpm" | opt$type == "rpk") & opt$gene == "") { +} else if ((opt$type == "tpm" || opt$type == "rpk") && opt$gene == "") { stop("At least two arguments must be supplied (count data and gene length file).\n", call. = FALSE) -} else if (opt$type != "tpm" & opt$type != "rpk" & opt$type != "cpm" & opt$type != "none") { +} else if (opt$type != "tpm" && opt$type != "rpk" && opt$type != "cpm" && opt$type != "none") { stop("Wrong transformation requested (--type option) must be : cpm, tpm or rpk.\n", call. = FALSE) } -if (opt$sep == "tab") {opt$sep = "\t"} -if (opt$gene_sep == "tab") {opt$gene_sep = "\t"} +if (opt$sep == "tab") { + opt$sep <- "\t" +} +if (opt$gene_sep == "tab") { + opt$gene_sep <- "\t" +} cpm <- function(count) { - t(t(count) / colSums(count)) * 1000000 + (count / colSums(count)) * 1000000 } @@ -145,13 +153,15 @@ } tpm <- function(count, length) { - RPK = rpk(count, length) - perMillion_factor = colSums(RPK) / 1000000 - TPM = RPK / perMillion_factor - return(TPM) + rpk <- rpk(count, length) + per_million_factor <- colSums(rpk) / 1000000 + tpm <- rpk / per_million_factor + return(tpm) } -data = read.table( +#### running code #### + +data <- read.table( opt$data, check.names = FALSE, header = opt$colnames, @@ -159,94 +169,84 @@ sep = opt$sep ) -if (opt$type == "tpm" | opt$type == "rpk") { - gene_length = as.data.frame( +if (opt$type == "tpm" || opt$type == "rpk") { + gene_length <- as.data.frame( read.table( opt$gene, - h = opt$gene_header, + header = opt$gene_header, row.names = 1, sep = opt$gene_sep ) ) - gene_length = as.data.frame(gene_length[match(rownames(data), rownames(gene_length)), ], rownames(data)) + gene_length <- as.data.frame(gene_length[match(rownames(data), rownames(gene_length)), ], rownames(data)) } if (opt$type == "cpm") - res = cpm(data) + res <- cpm(data) if (opt$type == "tpm") - res = as.data.frame(apply(data, 2, tpm, length = gene_length), row.names = rownames(data)) + res <- as.data.frame(apply(data, 2, tpm, length = gene_length), row.names = rownames(data)) if (opt$type == "rpk") - res = as.data.frame(apply(data, 2, rpk, length = gene_length), row.names = rownames(data)) + res <- as.data.frame(apply(data, 2, rpk, length = gene_length), row.names = rownames(data)) if (opt$type == "none") - res = data -colnames(res) = colnames(data) - + res <- data +colnames(res) <- colnames(data) if (opt$log == TRUE) { - res = log2(res + 1) + res <- log2(res + 1) } +if (opt$visu == TRUE) { + df <- res + # filter and transpose df for tsne and pca + df <- df[rowSums(df) != 0, ] # remove lines without information (with only 0 counts) + tdf <- t(df) + # make tsne and plot results + set.seed(opt$seed) ## Sets seed for reproducibility + tsne_out <- Rtsne(tdf, perplexity = opt$perp, theta = opt$theta) + embedding <- as.data.frame(tsne_out$Y) + embedding$Class <- as.factor(sub("Class_", "", rownames(tdf))) + gg_legend <- theme(legend.position = "none") + ggplot(embedding, aes(x = V1, y = V2)) + + geom_point(size = 1, color = "red") + + gg_legend + + xlab("") + + ylab("") + + ggtitle("t-SNE") + + if (opt$tsne_labels == TRUE) { + geom_text(aes(label = Class), hjust = -0.2, vjust = -0.5, size = 2.5, color = "darkblue") + } + ggsave(file = opt$tsne_out, device = "pdf") + # make PCA and plot result with ggfortify (autoplot) + tdf_pca <- prcomp(tdf, center = TRUE, scale. = TRUE) + if (opt$tsne_labels == TRUE) { + autoplot(tdf_pca, shape = FALSE, label = TRUE, label.size = 2.5, label.vjust = 1.2, + label.hjust = 1.2, + colour = "darkblue") + + geom_point(size = 1, color = "red") + + xlab(paste("PC1", summary(tdf_pca)$importance[2, 1] * 100, "%")) + + ylab(paste("PC2", summary(tdf_pca)$importance[2, 2] * 100, "%")) + + ggtitle("PCA") + ggsave(file = opt$pca_out, device = "pdf") + } else { + autoplot(tdf_pca, shape = TRUE, colour = "darkblue") + + geom_point(size = 1, color = "red") + + xlab(paste("PC1", summary(tdf_pca)$importance[2, 1] * 100, "%")) + + ylab(paste("PC2", summary(tdf_pca)$importance[2, 2] * 100, "%")) + + ggtitle("PCA") + ggsave(file = opt$pca_out, device = "pdf") + } +} + +# at this stage, we select numeric columns and round theirs values to 8 decimals for cleaner output +is_num <- sapply(res, is.numeric) +res[is_num] <- lapply(res[is_num], round, 8) + write.table( cbind(Features = rownames(res), res), opt$out, col.names = opt$colnames, - row.names = F, - quote = F, + row.names = FALSE, + quote = FALSE, sep = "\t" ) - -## -if (opt$visu == TRUE) { - df = res - # filter and transpose df for tsne and pca - df = df[rowSums(df) != 0,] # remove lines without information (with only 0 counts) - tdf = t(df) - # make tsne and plot results - set.seed(opt$seed) ## Sets seed for reproducibility - tsne_out <- Rtsne(tdf, perplexity=opt$perp, theta=opt$theta) # - embedding <- as.data.frame(tsne_out$Y) - embedding$Class <- as.factor(sub("Class_", "", rownames(tdf))) - gg_legend = theme(legend.position="none") - ggplot(embedding, aes(x=V1, y=V2)) + - geom_point(size=1, color='red') + - gg_legend + - xlab("") + - ylab("") + - ggtitle('t-SNE') + - if (opt$tsne_labels == TRUE) { - geom_text(aes(label=Class),hjust=-0.2, vjust=-0.5, size=2.5, color='darkblue') - } - ggsave(file=opt$tsne_out, device="pdf") - # make PCA and plot result with ggfortify (autoplot) - tdf.pca <- prcomp(tdf, center = TRUE, scale. = T) - if (opt$tsne_labels == TRUE) { - autoplot(tdf.pca, shape=F, label=T, label.size=2.5, label.vjust=1.2, - label.hjust=1.2, - colour="darkblue") + - geom_point(size=1, color='red') + - xlab(paste("PC1",summary(tdf.pca)$importance[2,1]*100, "%")) + - ylab(paste("PC2",summary(tdf.pca)$importance[2,2]*100, "%")) + - ggtitle('PCA') - ggsave(file=opt$pca_out, device="pdf") - } else { - autoplot(tdf.pca, shape=T, colour="darkblue") + - geom_point(size=1, color='red') + - xlab(paste("PC1",summary(tdf.pca)$importance[2,1]*100, "%")) + - ylab(paste("PC2",summary(tdf.pca)$importance[2,2]*100, "%")) + - ggtitle('PCA') - ggsave(file=opt$pca_out, device="pdf") - } -} - - - - - - - - - - - -