view cpm_tpm_rpk.R @ 5:bcff1eb6fdb5 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/cpm_tpm_rpk commit d6ef3d1c1d490967b7b79138573a86a8d5235c43
author artbio
date Fri, 06 Oct 2023 18:00:05 +0000
parents be358a1ebf67
children f00c1c34565e
line wrap: on
line source

if (length(commandArgs(TRUE)) == 0) {
  system("Rscript cpm_tpm_rpk.R -h", intern = FALSE)
  q("no")
}


# load packages that are provided in the conda env
options(show.error.messages = FALSE,
  error = function() {
    cat(geterrmessage(), file = stderr())
    q("no", 1, FALSE)
  }
)
loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") # nolint⁠
warnings()
library(optparse)
library(ggplot2)
library(reshape2)
library(Rtsne) # nolint⁠
library(ggfortify)



#Arguments
option_list <- list(
  make_option(
    c("-d", "--data"),
    default = NA,
    type = "character",
    help = "Input file that contains count values to transform"
  ),
  make_option(
    c("-t", "--type"),
    default = "cpm",
    type = "character",
    help = "Transformation type, either cpm, tpm, rpk or none[default : '%default' ]"
  ),
  make_option(
    c("-s", "--sep"),
    default = "\t",
    type = "character",
    help = "File separator [default : '%default' ]"
  ),
  make_option(
    c("-c", "--colnames"),
    default = TRUE,
    type = "logical",
    help = "Consider first line as header ? [default : '%default' ]"
  ),
  make_option(
    c("-f", "--gene"),
    default = NA,
    type = "character",
    help = "Two column of gene length file"
  ),
  make_option(
    c("-a", "--gene_sep"),
    default = "\t",
    type = "character",
    help = "Gene length file separator [default : '%default' ]"
  ),
  make_option(
    c("-b", "--gene_header"),
    default = TRUE,
    type = "logical",
    help = "Consider first line of gene length as header ? [default : '%default' ]"
  ),
  make_option(
    c("-l", "--log"),
    default = FALSE,
    type = "logical",
    help = "Should be log transformed as well ? (log2(data +1)) [default : '%default' ]"
  ),
  make_option(
    c("-o", "--out"),
    default = "res.tab",
    type = "character",
    help = "Output name [default : '%default' ]"
  ),
  make_option(
    "--visu",
    default = FALSE,
    type = "logical",
    help = "performs T-SNE [default : '%default' ]"
  ),
  make_option(
    "--tsne_labels",
    default = FALSE,
    type = "logical",
    help = "add labels to t-SNE plot [default : '%default' ]"
  ),
  make_option(
    "--seed",
    default = 42,
    type = "integer",
    help = "Seed value for reproducibility [default : '%default' ]"
  ),
  make_option(
    "--perp",
    default = 5.0,
    type = "numeric",
    help = "perplexity [default : '%default' ]"
  ),
  make_option(
    "--theta",
    default = 1.0,
    type = "numeric",
    help = "theta [default : '%default' ]"
  ),
  make_option(
    c("-D", "--tsne_out"),
    default = "tsne.pdf",
    type = "character",
    help = "T-SNE pdf [default : '%default' ]"
  ),
  make_option(
    "--pca_out",
    default = "pca.pdf",
    type = "character",
    help = "PCA pdf [default : '%default' ]"
  )

)

opt <- parse_args(OptionParser(option_list = option_list),
                  args = commandArgs(trailingOnly = TRUE))

if (opt$data == "" && !(opt$help)) {
  stop("At least one argument must be supplied (count data).\n",
       call. = FALSE)
} else if ((opt$type == "tpm" || opt$type == "rpk") && opt$gene == "") {
  stop("At least two arguments must be supplied (count data and gene length file).\n",
       call. = FALSE)
} else if (opt$type != "tpm" && opt$type != "rpk" && opt$type != "cpm" && opt$type != "none") {
  stop("Wrong transformation requested (--type option) must be : cpm, tpm or rpk.\n",
       call. = FALSE)
}

if (opt$sep == "tab") {
  opt$sep <- "\t"
}
if (opt$gene_sep == "tab") {
  opt$gene_sep <- "\t"
}

cpm <- function(count) {
  (count / colSums(count)) * 1000000
}


rpk <- function(count, length) {
  count / (length / 1000)
}

tpm <- function(count, length) {
  rpk <- rpk(count, length)
  per_million_factor <- colSums(rpk) / 1000000
  tpm <- rpk / per_million_factor
  return(tpm)
}

#### running code ####

data <- read.delim(
  opt$data,
  check.names = FALSE,
  header = opt$colnames,
  row.names = 1,
  sep = opt$sep
)

if (opt$type == "tpm" || opt$type == "rpk") {
  gene_length <- as.data.frame(
    read.delim(
      opt$gene,
      header = opt$gene_header,
      row.names = 1,
      sep = opt$gene_sep
    )
  )
  gene_length <- as.data.frame(gene_length[match(rownames(data), rownames(gene_length)), ], rownames(data))
}


if (opt$type == "cpm")
  res <- cpm(data)
if (opt$type == "tpm")
  res <- as.data.frame(apply(data, 2, tpm, length = gene_length), row.names = rownames(data))
if (opt$type == "rpk")
  res <- as.data.frame(apply(data, 2, rpk, length = gene_length), row.names = rownames(data))
if (opt$type == "none")
  res <- data
colnames(res) <- colnames(data)

if (opt$log == TRUE) {
  res <- log2(res + 1)
}

if (opt$visu == TRUE) {
  df <- res
  # filter and transpose df for tsne and pca
  df <- df[rowSums(df) != 0, ] # remove lines without information (with only 0 counts)
  tdf <- t(df)
  # make tsne and plot results
  set.seed(opt$seed) ## Sets seed for reproducibility
  tsne_out <- Rtsne(tdf, perplexity = opt$perp, theta = opt$theta)
  embedding <- as.data.frame(tsne_out$Y)
  embedding$Class <- as.factor(sub("Class_", "", rownames(tdf)))
  gg_legend <- theme(legend.position = "none")
  ggplot(embedding, aes(x = V1, y = V2)) +
    geom_point(size = 1, color = "red") +
    gg_legend +
    xlab("") +
    ylab("") +
    ggtitle("t-SNE") +
    if (opt$tsne_labels == TRUE) {
      geom_text(aes(label = Class), hjust = -0.2, vjust = -0.5, size = 2.5, color = "darkblue")
    }
  ggsave(file = opt$tsne_out, device = "pdf")
  # make PCA and plot result with ggfortify (autoplot)
  tdf_pca <- prcomp(tdf, center = TRUE, scale. = TRUE)
  if (opt$tsne_labels == TRUE) {
    autoplot(tdf_pca, shape = FALSE, label = TRUE, label.size = 2.5, label.vjust = 1.2,
             label.hjust = 1.2,
             colour = "darkblue") +
      geom_point(size = 1, color = "red") +
      xlab(paste("PC1", summary(tdf_pca)$importance[2, 1] * 100, "%")) +
      ylab(paste("PC2", summary(tdf_pca)$importance[2, 2] * 100, "%")) +
      ggtitle("PCA")
    ggsave(file = opt$pca_out, device = "pdf")
  } else {
    autoplot(tdf_pca, shape = TRUE, colour = "darkblue") +
      geom_point(size = 1, color = "red") +
      xlab(paste("PC1", summary(tdf_pca)$importance[2, 1] * 100, "%")) +
      ylab(paste("PC2", summary(tdf_pca)$importance[2, 2] * 100, "%")) +
      ggtitle("PCA")
    ggsave(file = opt$pca_out, device = "pdf")
  }
}

# at this stage, we select numeric columns and round theirs values to 8 decimals for cleaner output
is_num <- sapply(res, is.numeric)
res[is_num] <- lapply(res[is_num], round, 8)

write.table(
  cbind(Features = rownames(res), res),
  opt$out,
  col.names = opt$colnames,
  row.names = FALSE,
  quote = FALSE,
  sep = "\t"
)