diff MannWhitney_DE.R @ 4:6916ac5a9ef0 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/main/tools/gsc_mannwhitney_de commit c394391dcf541d91ee1dfdc0c3d80cd7a21942ff
author artbio
date Thu, 30 Nov 2023 02:03:53 +0000
parents 3d86c89f15bf
children a7eb04240b8b
line wrap: on
line diff
--- a/MannWhitney_DE.R	Thu Aug 29 05:35:04 2019 -0400
+++ b/MannWhitney_DE.R	Thu Nov 30 02:03:53 2023 +0000
@@ -1,83 +1,88 @@
-####################
-#   Differential   #
-#     analysis     #
-####################
-
-# Perform a differential analysis between 2
-# groups of cells.
+# Perform a differential analysis between 2 groups of cells.
 
 # Example of command
 # Rscript MannWhitney_DE.R --input <input.tsv> --sep <tab> --colnames <TRUE> --metadata <signature.tsv> --column_name <rate> --fdr <0.01> --output <diff_analysis.tsv>
 
-# load packages that are provided in the conda env
-options( show.error.messages=F,
-       error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
+options(show.error.messages = FALSE,
+  error = function() {
+    cat(geterrmessage(), file = stderr())
+    q("no", 1, FALSE)
+  }
+)
+
 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
-warnings()
-library(optparse)
 
-#Arguments
-option_list = list(
+suppressPackageStartupMessages({
+  library(optparse)
+})
+
+sessionInfo()
+
+option_list <- list(
   make_option(
     "--input",
     default = NA,
-    type = 'character',
+    type = "character",
     help = "Input file that contains log2(CPM +1) values"
   ),
   make_option(
     "--sep",
-    default = '\t',
-    type = 'character',
+    default = "\t",
+    type = "character",
     help = "File separator [default : '%default' ]"
   ),
   make_option(
     "--colnames",
     default = TRUE,
-    type = 'logical',
+    type = "logical",
     help = "Consider first line as header ? [default : '%default' ]"
-  ),  
+  ),
   make_option(
     "--comparison_factor_file",
     default = NA,
-    type = 'character',
+    type = "character",
     help = " A two column table : cell identifiers and a comparison factor that split cells in two categories (high/low, HOM/HET,...)"
   ),
   make_option(
     "--factor1",
-    type = 'character',
+    type = "character",
     help = "level associated to the control condition in the factor file"
-  ), 
+  ),
   make_option(
     "--factor2",
-    type = 'character',
+    type = "character",
     help = "level associated to the test condition in the factor file"
   ),
   make_option(
     "--fdr",
     default = 0.01,
-    type = 'numeric',
+    type = "numeric",
     help = "FDR threshold [default : '%default' ]"
   ),
   make_option(
     "--log",
-    default=FALSE,
-    action="store_true",
-    type = 'logical',
+    default = FALSE,
+    action = "store_true",
+    type = "logical",
     help = "Expression data are log-transformed [default : '%default' ]"
   ),
   make_option(
     "--output",
     default = "results.tsv",
-    type = 'character',
+    type = "character",
     help = "Output name [default : '%default' ]"
   )
 )
 
-opt = parse_args(OptionParser(option_list = option_list),
-                 args = commandArgs(trailingOnly = TRUE))
+opt <- parse_args(OptionParser(option_list = option_list),
+                  args = commandArgs(trailingOnly = TRUE))
 
-if (opt$sep == "tab") {opt$sep = "\t"}
-if (opt$sep == "comma") {opt$sep = ","}
+if (opt$sep == "tab") {
+  opt$sep <- "\t"
+}
+if (opt$sep == "comma") {
+  opt$sep <- ","
+}
 
 #Open files
 data.counts <- read.table(
@@ -85,13 +90,13 @@
   h = opt$colnames,
   row.names = 1,
   sep = opt$sep,
-  check.names = F
+  check.names = FALSE
 )
 
 metadata <- read.table(
   opt$comparison_factor_file,
   header = TRUE,
-  stringsAsFactors = F,
+  stringsAsFactors = FALSE,
   sep = "\t",
   check.names = FALSE,
   row.names = 1
@@ -100,35 +105,34 @@
 metadata <- subset(metadata, rownames(metadata) %in% colnames(data.counts))
 
 # Create two logical named vectors for each factor level of cell signature
-factor1_cells <- setNames(metadata[,1] == opt$factor1, rownames(metadata))
-factor2_cells <- setNames(metadata[,1] == opt$factor2, rownames(metadata))
+factor1_cells <- setNames(metadata[, 1] == opt$factor1, rownames(metadata))
+factor2_cells <- setNames(metadata[, 1] == opt$factor2, rownames(metadata))
 
 ## Mann-Whitney test (Two-sample Wilcoxon test)
 MW_test <- data.frame(t(apply(data.counts, 1, function(x) {
   do.call("cbind", wilcox.test(x[names(factor1_cells)[factor1_cells]], x[names(factor2_cells)[factor2_cells]]))[, 1:2]
-})), stringsAsFactors = F)
+})), stringsAsFactors = FALSE)
 
 # Benjamini-Hochberg correction and significativity
-MW_test$p.adjust <- p.adjust(as.numeric(MW_test$p.value), method = "BH" , n = nrow(MW_test))
-# MW_test$Critical.value <- (rank(MW_test$p.value) / nrow(MW_test)) * opt$fdr
+MW_test$p.adjust <- p.adjust(as.numeric(MW_test$p.value), method = "BH", n = nrow(MW_test))
 MW_test$Significant <- MW_test$p.adjust < opt$fdr
 
 ## Descriptive Statistics Function
 descriptive_stats <- function(InputData) {
-  SummaryData = data.frame(
+  SummaryData <- data.frame(
     mean = rowMeans(InputData),
     SD = apply(InputData, 1, sd),
     Variance = apply(InputData, 1, var),
     Percentage_Detection = apply(InputData, 1, function(x, y = InputData) {
       (sum(x != 0) / ncol(y)) * 100
     }),
-    mean_condition2 = rowMeans(InputData[,factor2_cells]),
+    mean_condition2 = rowMeans(InputData[, factor2_cells]),
     mean_condition1 = rowMeans(InputData[, factor1_cells])
   )
-  if(opt$log) {
-  SummaryData$log2FC <- SummaryData$mean_condition2 - SummaryData$mean_condition1
+  if (opt$log) {
+    SummaryData$log2FC <- SummaryData$mean_condition2 - SummaryData$mean_condition1
   } else {
-  SummaryData$log2FC <- log2(SummaryData$mean_condition2 / SummaryData$mean_condition1)
+    SummaryData$log2FC <- log2(SummaryData$mean_condition2 / SummaryData$mean_condition1)
   }
   return(SummaryData)
 }
@@ -139,16 +143,16 @@
 colnames(results)[1] <- "genes"
 
 ## Annotate Significant column
-results$Significant[results$Significant == T & !is.na(results$Significant)] <- ifelse(subset(results, Significant == T)$log2FC > 0, "UP", "DOWN")
-results$Significant[results$Significant == F & !is.na(results$Significant)] <- "NS"
+results$Significant[results$Significant == TRUE & !is.na(results$Significant)] <- ifelse(subset(results, Significant == TRUE)$log2FC > 0, "UP", "DOWN")
+results$Significant[results$Significant == FALSE & !is.na(results$Significant)] <- "NS"
 
 
 # Save files
 write.table(
-  results[order(results$p.adjust),],
+  results[order(results$p.adjust), ],
   opt$output,
   sep = "\t",
-  quote = F,
-  col.names = T,
-  row.names = F
+  quote = FALSE,
+  col.names = TRUE,
+  row.names = FALSE
 )