Mercurial > repos > iuc > egsea
diff egsea.R @ 4:fba1660fb717 draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/egsea commit c2313b506b3b8ae860bb844b979397d87de4fb44"
author | iuc |
---|---|
date | Mon, 28 Jun 2021 09:45:14 +0000 |
parents | ba2111ae6eb4 |
children |
line wrap: on
line diff
--- a/egsea.R Sat Feb 09 09:15:52 2019 -0500 +++ b/egsea.R Mon Jun 28 09:45:14 2021 +0000 @@ -1,6 +1,8 @@ # Code based on (and inspired by) the Galaxy limma-voom/edgeR/DESeq2 wrappers -options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) +options(show.error.messages = F, error = function() { + cat(geterrmessage(), file = stderr()); q("no", 1, F) +}) # we need that to not crash galaxy with an UTF8 error on German LC settings. loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") @@ -15,7 +17,7 @@ ## Function Declaration -sanitiseEquation <- function(equation) { +sanitise_equation <- function(equation) { equation <- gsub(" *[+] *", "+", equation) equation <- gsub(" *[-] *", "-", equation) equation <- gsub(" *[/] *", "/", equation) @@ -25,43 +27,43 @@ } # Function to sanitise group information -sanitiseGroups <- function(string) { +sanitise_groups <- function(string) { string <- gsub(" *[,] *", ",", string) string <- gsub("^\\s+|\\s+$", "", string) return(string) } # Generating design information -pasteListName <- function(string) { +paste_listname <- function(string) { return(paste0("factors$", string)) } ## Input Processing option_list <- list( - make_option(c("-threads", "--threads"), default=2, type="integer", help="Number of threads for egsea"), - make_option(c("-filesPath", "--filesPath"), type="character", help="JSON list object if multiple files input"), - make_option(c("-matrixPath", "--matrixPath"), type="character", help="Path to count matrix"), - make_option(c("-factFile", "--factFile"), type="character", help="Path to factor information file"), - make_option(c("-factInput", "--factInput"), type="character", help="String containing factors if manually input"), - make_option(c("-contrastData", "--contrastData"), type="character", help="Contrasts of Interest (Groups to compare)"), - make_option(c("-genes", "--genes"), type="character", help="Path to genes file"), - make_option(c("-species", "--species"), type="character"), - make_option(c("-base_methods", "--base_methods"), type="character", help="Gene set testing methods"), - make_option(c("-msigdb", "--msigdb"), type="character", help="MSigDB Gene Set Collections"), - make_option(c("-keggdb", "--keggdb"), type="character", help="KEGG Pathways"), - make_option(c("-keggupdated", "--keggupdated"), type="logical", help="Use updated KEGG"), - make_option(c("-gsdb", "--gsdb"), type="character", help = "GeneSetDB Gene Sets"), - make_option(c("-display_top", "--display_top"), type="integer", help = "Number of top Gene Sets to display"), - make_option(c("-min_size", "--min_size"), type="integer", help = "Minimum Size of Gene Set"), - make_option(c("-fdr_cutoff", "--fdr_cutoff"), type="double", help = "FDR cutoff"), - make_option(c("-combine_method", "--combine_method"), type="character", help="Method to use to combine the p-values"), - make_option(c("-sort_method", "--sort_method"), type="character", help="Method to sort the results"), - make_option(c("-rdaOpt", "--rdaOpt"), type="character", help="Output RData file") + make_option("--threads", default = 2, type = "integer", help = "Number of threads for egsea"), + make_option("--filesPath", type = "character", help = "JSON list object if multiple files input"), + make_option("--matrixPath", type = "character", help = "Path to count matrix"), + make_option("--factFile", type = "character", help = "Path to factor information file"), + make_option("--factInput", type = "character", help = "String containing factors if manually input"), + make_option("--contrastData", type = "character", help = "Contrasts of Interest (Groups to compare)"), + make_option("--genes", type = "character", help = "Path to genes file"), + make_option("--species", type = "character"), + make_option("--base_methods", type = "character", help = "Gene set testing methods"), + make_option("--msigdb", type = "character", help = "MSigDB Gene Set Collections"), + make_option("--keggdb", type = "character", help = "KEGG Pathways"), + make_option("--keggupdated", type = "logical", help = "Use updated KEGG"), + make_option("--gsdb", type = "character", help = "GeneSetDB Gene Sets"), + make_option("--display_top", type = "integer", help = "Number of top Gene Sets to display"), + make_option("--min_size", type = "integer", help = "Minimum Size of Gene Set"), + make_option("--fdr_cutoff", type = "double", help = "FDR cutoff"), + make_option("--combine_method", type = "character", help = "Method to use to combine the p-values"), + make_option("--sort_method", type = "character", help = "Method to sort the results"), + make_option("--rdaOpt", type = "character", help = "Output RData file") ) -parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) -args = parse_args(parser) +parser <- OptionParser(usage = "%prog [options] file", option_list = option_list) +args <- parse_args(parser) ## Read in Files @@ -71,63 +73,65 @@ library("rjson") parser <- newJSONParser() parser$addData(args$filesPath) - factorList <- parser$getObject() - factors <- sapply(factorList, function(x) x[[1]]) - filenamesIn <- unname(unlist(factorList[[1]][[2]])) - sampleTable <- data.frame(sample=basename(filenamesIn), - filename=filenamesIn, - row.names=filenamesIn, - stringsAsFactors=FALSE) - for (factor in factorList) { - factorName <- factor[[1]] - sampleTable[[factorName]] <- character(nrow(sampleTable)) + factor_list <- parser$getObject() + factors <- sapply(factor_list, function(x) x[[1]]) + filenames_in <- unname(unlist(factor_list[[1]][[2]])) + sampletable <- data.frame(sample = basename(filenames_in), + filename = filenames_in, + row.names = filenames_in, + stringsAsFactors = FALSE) + for (factor in factor_list) { + factorname <- factor[[1]] + sampletable[[factorname]] <- character(nrow(sampletable)) lvls <- sapply(factor[[2]], function(x) names(x)) for (i in seq_along(factor[[2]])) { files <- factor[[2]][[i]][[1]] - sampleTable[files,factorName] <- lvls[i] + sampletable[files, factorname] <- lvls[i] } - sampleTable[[factorName]] <- factor(sampleTable[[factorName]], levels=lvls) + sampletable[[factorname]] <- factor(sampletable[[factorname]], levels = lvls) } - rownames(sampleTable) <- sampleTable$sample - rem <- c("sample","filename") - factors <- sampleTable[, !(names(sampleTable) %in% rem), drop=FALSE] + rownames(sampletable) <- sampletable$sample + rem <- c("sample", "filename") + factors <- sampletable[, !(names(sampletable) %in% rem), drop = FALSE] #read in count files and create single table - countfiles <- lapply(sampleTable$filename, function(x){read.delim(x, row.names=1)}) + countfiles <- lapply(sampletable$filename, function(x) { + read.delim(x, row.names = 1) + }) counts <- do.call("cbind", countfiles) } else { # Process the single count matrix - counts <- read.table(args$matrixPath, header=TRUE, sep="\t", stringsAsFactors=FALSE, check.names=FALSE) + counts <- read.table(args$matrixPath, header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE) row.names(counts) <- counts[, 1] - counts <- counts[ , -1] - countsRows <- nrow(counts) + counts <- counts[, -1] + countsrows <- nrow(counts) # Process factors if (is.null(args$factInput)) { - factorData <- read.table(args$factFile, header=TRUE, sep="\t", strip.white=TRUE) + factordata <- read.table(args$factFile, header = TRUE, sep = "\t", strip.white = TRUE, stringsAsFactors = TRUE) # check samples names match - if(!any(factorData[, 1] %in% colnames(counts))) + if (!any(factordata[, 1] %in% colnames(counts))) stop("Sample IDs in factors file and count matrix don't match") # order samples as in counts matrix - factorData <- factorData[match(colnames(counts), factorData[, 1]), ] - factors <- factorData[, -1, drop=FALSE] + factordata <- factordata[match(colnames(counts), factordata[, 1]), ] + factors <- factordata[, -1, drop = FALSE] } else { - factors <- unlist(strsplit(args$factInput, "|", fixed=TRUE)) - factorData <- list() + factors <- unlist(strsplit(args$factInput, "|", fixed = TRUE)) + factordata <- list() for (fact in factors) { - newFact <- unlist(strsplit(fact, split="::")) - factorData <- rbind(factorData, newFact) + newfact <- unlist(strsplit(fact, split = "::")) + factordata <- rbind(factordata, newfact) } # Factors have the form: FACT_NAME::LEVEL,LEVEL,LEVEL,LEVEL,... The first factor is the Primary Factor. # Set the row names to be the name of the factor and delete first row - row.names(factorData) <- factorData[, 1] - factorData <- factorData[, -1] - factorData <- sapply(factorData, sanitiseGroups) - factorData <- sapply(factorData, strsplit, split=",") - factorData <- sapply(factorData, make.names) + row.names(factordata) <- factordata[, 1] + factordata <- factordata[, -1] + factordata <- sapply(factordata, sanitise_groups) + factordata <- sapply(factordata, strsplit, split = ",") + factordata <- sapply(factordata, make.names) # Transform factor data into data frame of R factor objects - factors <- data.frame(factorData) + factors <- data.frame(factordata, stringsAsFactors = TRUE) } } @@ -135,36 +139,36 @@ counts <- DGEList(counts) # Set group to be the Primary Factor input -group <- factors[, 1, drop=FALSE] +group <- factors[, 1, drop = FALSE] # Split up contrasts separated by comma into a vector then sanitise -contrastData <- unlist(strsplit(args$contrastData, split=",")) -contrastData <- sanitiseEquation(contrastData) -contrastData <- gsub(" ", ".", contrastData, fixed=TRUE) +contrast_data <- unlist(strsplit(args$contrastData, split = ",")) +contrast_data <- sanitise_equation(contrast_data) +contrast_data <- gsub(" ", ".", contrast_data, fixed = TRUE) # Creating design row.names(factors) <- colnames(counts) -factorList <- sapply(names(factors), pasteListName) +factor_list <- sapply(names(factors), paste_listname) formula <- "~0" -for (i in 1:length(factorList)) { - formula <- paste(formula, factorList[i], sep="+") +for (i in seq_along(factor_list)) { + formula <- paste(formula, factor_list[i], sep = "+") } formula <- formula(formula) design <- model.matrix(formula) -for (i in 1:length(factorList)) { - colnames(design) <- gsub(factorList[i], "", colnames(design), fixed=TRUE) +for (i in seq_along(factor_list)) { + colnames(design) <- gsub(factor_list[i], "", colnames(design), fixed = TRUE) } ## Generate Contrasts information -contrasts <- makeContrasts(contrasts=contrastData, levels=design) +contrasts <- makeContrasts(contrasts = contrast_data, levels = design) ## Add Gene Symbol information -genes <- read.table(args$genes, sep='\t', header=TRUE) +genes <- read.table(args$genes, sep = "\t", header = TRUE) ## Set Gene Set Testing Methods @@ -182,7 +186,7 @@ if (args$keggdb != "None") { keggdb <- unlist(strsplit(args$keggdb, ",")) - kegg_all <- c("Metabolism"="keggmet", "Signaling"="keggsig", "Disease"="keggdis") + kegg_all <- c("Metabolism" = "keggmet", "Signaling" = "keggsig", "Disease" = "keggdis") kegg_exclude <- names(kegg_all[!(kegg_all %in% keggdb)]) } else { kegg_exclude <- "all" @@ -196,16 +200,16 @@ ## Index gene sets -gs.annots <- buildIdx(entrezIDs=rownames(counts), species=args$species, msigdb.gsets=msigdb, gsdb.gsets=gsdb, kegg.exclude=kegg_exclude, kegg.updated=args$keggupdated) +gs_annots <- buildIdx(entrezIDs = rownames(counts), species = args$species, msigdb.gsets = msigdb, gsdb.gsets = gsdb, kegg.exclude = kegg_exclude, kegg.updated = args$keggupdated) ## Run egsea.cnt -gsa <- egsea.cnt(counts=counts, group=group, design=design, contrasts=contrasts, gs.annots=gs.annots, symbolsMap=genes, baseGSEAs=base_methods, minSize=args$min_size, display.top=args$display_top, combineMethod=args$combine_method, sort.by=args$sort_method, report.dir='./report_dir', fdr.cutoff=args$fdr_cutoff, num.threads=args$threads, report=TRUE) +gsa <- egsea.cnt(counts = counts, group = group, design = design, contrasts = contrasts, gs.annots = gs_annots, symbolsMap = genes, baseGSEAs = base_methods, minSize = args$min_size, display.top = args$display_top, combineMethod = args$combine_method, sort.by = args$sort_method, report.dir = "./report_dir", fdr.cutoff = args$fdr_cutoff, num.threads = args$threads, report = TRUE) ## Output RData file if (!is.null(args$rdaOpt)) { save.image(file = "EGSEA_analysis.RData") -} \ No newline at end of file +}