Mercurial > repos > artbio > gsc_mannwhitney_de
diff MannWhitney_DE.R @ 0:c67dba545a37 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/gsc_mannwhitney_de commit 09dcd74dbc01f448518cf3db3e646afb0675a6fe
author | artbio |
---|---|
date | Mon, 24 Jun 2019 13:39:39 -0400 |
parents | |
children | 3d86c89f15bf |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MannWhitney_DE.R Mon Jun 24 13:39:39 2019 -0400 @@ -0,0 +1,149 @@ +#################### +# Differential # +# analysis # +#################### + +# Perform a differential analysis between 2 +# groups of cells. + +# Example of command +# Rscript MannWhitney_DE.R --input <input.tsv> --sep <tab> --colnames <TRUE> --metadata <signature.tsv> --column_name <rate> --fdr <0.01> --output <diff_analysis.tsv> + +# load packages that are provided in the conda env +options( show.error.messages=F, + error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) +loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") +warnings() +library(optparse) + +#Arguments +option_list = list( + make_option( + "--input", + default = NA, + type = 'character', + help = "Input file that contains log2(CPM +1) values" + ), + make_option( + "--sep", + default = '\t', + type = 'character', + help = "File separator [default : '%default' ]" + ), + make_option( + "--colnames", + default = TRUE, + type = 'logical', + help = "Consider first line as header ? [default : '%default' ]" + ), + make_option( + "--comparison_factor_file", + default = NA, + type = 'character', + help = " A two column table : cell identifiers and a comparison factor that split cells in two categories (high/low, HOM/HET,...)" + ), + make_option( + "--factor1", + type = 'character', + help = "level associated to the control condition in the factor file" + ), + make_option( + "--factor2", + type = 'character', + help = "level associated to the test condition in the factor file" + ), + make_option( + "--fdr", + default = 0.01, + type = 'numeric', + help = "FDR threshold [default : '%default' ]" + ), + make_option( + "--log", + default=FALSE, + action="store_true", + type = 'logical', + help = "Expression data are log-transformed [default : '%default' ]" + ), + make_option( + "--output", + default = "results.tsv", + type = 'character', + help = "Output name [default : '%default' ]" + ) +) + +opt = parse_args(OptionParser(option_list = option_list), + args = commandArgs(trailingOnly = TRUE)) + +if (opt$sep == "tab") {opt$sep = "\t"} +if (opt$sep == "comma") {opt$sep = ","} + +#Open files +data.counts <- read.table( + opt$input, + h = opt$colnames, + row.names = 1, + sep = opt$sep, + check.names = F +) + +metadata <- read.table( + opt$comparison_factor_file, + header = TRUE, + stringsAsFactors = F, + sep = "\t", + check.names = FALSE, + row.names = 1 +) + +metadata <- subset(metadata, rownames(metadata) %in% colnames(data.counts)) + +# Create two logical named vectors for each factor level of cell signature +factor1_cells <- setNames(metadata[,1] == opt$factor1, rownames(metadata)) +factor2_cells <- setNames(metadata[,1] == opt$factor2, rownames(metadata)) + +## Mann-Whitney test (Two-sample Wilcoxon test) +MW_test <- data.frame(t(apply(data.counts, 1, function(x) { + do.call("cbind", wilcox.test(x[names(factor1_cells)[factor1_cells]], x[names(factor2_cells)[factor2_cells]]))[, 1:2] +})), stringsAsFactors = F) + +# Benjamini-Hochberg correction and significativity +MW_test$p.adjust <- p.adjust(as.numeric(MW_test$p.value), method = "BH" , n = nrow(MW_test)) +# MW_test$Critical.value <- (rank(MW_test$p.value) / nrow(MW_test)) * opt$fdr +MW_test$Significant <- MW_test$p.adjust < opt$fdr + +## Descriptive Statistics Function +descriptive_stats <- function(InputData) { + SummaryData = data.frame( + mean = rowMeans(InputData), + SD = apply(InputData, 1, sd), + Variance = apply(InputData, 1, var), + Percentage_Detection = apply(InputData, 1, function(x, y = InputData) { + (sum(x != 0) / ncol(y)) * 100 + }), + mean_condition2 = rowMeans(InputData[,factor2_cells]), + mean_condition1 = rowMeans(InputData[, factor1_cells]) + ) + if(opt$log) { + SummaryData$log2FC <- SummaryData$mean_condition2 - SummaryData$mean_condition1 + } else { + SummaryData$log2FC <- log2(SummaryData$mean_condition2 / SummaryData$mean_condition1) + } + return(SummaryData) +} + +gene_stats <- descriptive_stats(data.counts) + +results <- merge(gene_stats, MW_test, by = "row.names") +colnames(results)[1] <- "genes" + +# Save files +write.table( + results, + opt$output, + sep = "\t", + quote = F, + col.names = T, + row.names = F +)