diff egsea.R @ 0:a8a083193440 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/egsea commit 7d0c7d850cd56ea3e54d8c03266f719241b20b8b
author iuc
date Thu, 25 Jan 2018 02:23:23 -0500
parents
children 73281fbdf6c1
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/egsea.R	Thu Jan 25 02:23:23 2018 -0500
@@ -0,0 +1,206 @@
+# Code based on (and inspired by) the Galaxy limma-voom/edgeR/DESeq2 wrappers
+
+options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
+
+# we need that to not crash galaxy with an UTF8 error on German LC settings.
+loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
+
+suppressPackageStartupMessages({
+    library(EGSEA)
+    library(limma)
+    library(edgeR)
+    library(optparse)
+})
+
+
+## Function Declaration
+
+sanitiseEquation <- function(equation) {
+    equation <- gsub(" *[+] *", "+", equation)
+    equation <- gsub(" *[-] *", "-", equation)
+    equation <- gsub(" *[/] *", "/", equation)
+    equation <- gsub(" *[*] *", "*", equation)
+    equation <- gsub("^\\s+|\\s+$", "", equation)
+    return(equation)
+}
+
+# Function to sanitise group information
+sanitiseGroups <- function(string) {
+    string <- gsub(" *[,] *", ",", string)
+    string <- gsub("^\\s+|\\s+$", "", string)
+    return(string)
+}
+
+# Generating design information
+pasteListName <- function(string) {
+    return(paste0("factors$", string))
+}
+
+## Input Processing
+
+option_list <- list(
+    make_option(c("-threads", "--threads"), default=2, type="integer", help="Number of threads for egsea"),
+    make_option(c("-filesPath", "--filesPath"), type="character", help="JSON list object if multiple files input"),
+    make_option(c("-matrixPath", "--matrixPath"), type="character", help="Path to count matrix"),
+    make_option(c("-factFile", "--factFile"), type="character", help="Path to factor information file"),
+    make_option(c("-factInput", "--factInput"), type="character", help="String containing factors if manually input"),
+    make_option(c("-contrastData", "--contrastData"), type="character", help="Contrasts of Interest (Groups to compare)"),
+    make_option(c("-genes", "--genes"), type="character", help="Path to genes file"),
+    make_option(c("-species", "--species"), type="character"),
+    make_option(c("-base_methods", "--base_methods"), type="character", help="Gene set testing methods"),
+    make_option(c("-msigdb", "--msigdb"), type="character", help="MSigDB Gene Set Collections"),
+    make_option(c("-keggdb", "--keggdb"), type="character", help="KEGG Pathways"),
+    make_option(c("-gsdb", "--gsdb"), type="character", help = "GeneSetDB Gene Sets"),
+    make_option(c("-display_top", "--display_top"), type="integer", help = "Number of top Gene Sets to display"),
+    make_option(c("-min_size", "--min_size"), type="integer", help = "Minimum Size of Gene Set"),
+    make_option(c("-fdr_cutoff", "--fdr_cutoff"), type="double", help = "FDR cutoff"),
+    make_option(c("-combine_method", "--combine_method"), type="character", help="Method to use to combine the p-values"),
+    make_option(c("-sort_method", "--sort_method"), type="character", help="Method to sort the results"),
+    make_option(c("-rdata", "--rdaOpt"), type="character", help="Output RData file")
+    )
+
+parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
+args = parse_args(parser)
+
+
+## Read in Files
+
+if (!is.null(args$filesPath)) {
+    # Process the separate count files (adapted from DESeq2 wrapper)
+    library("rjson")
+    parser <- newJSONParser()
+    parser$addData(args$filesPath)
+    factorList <- parser$getObject()
+    factors <- sapply(factorList, function(x) x[[1]])
+    filenamesIn <- unname(unlist(factorList[[1]][[2]]))
+    sampleTable <- data.frame(sample=basename(filenamesIn),
+                            filename=filenamesIn,
+                            row.names=filenamesIn,
+                            stringsAsFactors=FALSE)
+    for (factor in factorList) {
+        factorName <- factor[[1]]
+        sampleTable[[factorName]] <- character(nrow(sampleTable))
+        lvls <- sapply(factor[[2]], function(x) names(x))
+        for (i in seq_along(factor[[2]])) {
+            files <- factor[[2]][[i]][[1]]
+            sampleTable[files,factorName] <- lvls[i]
+        }
+        sampleTable[[factorName]] <- factor(sampleTable[[factorName]], levels=lvls)
+    }
+    rownames(sampleTable) <- sampleTable$sample
+    rem <- c("sample","filename")
+    factors <- sampleTable[, !(names(sampleTable) %in% rem), drop=FALSE]
+
+    #read in count files and create single table
+    countfiles <- lapply(sampleTable$filename, function(x){read.delim(x, row.names=1)})
+    counts <- do.call("cbind", countfiles)
+
+} else {
+ # Process the single count matrix
+    counts <- read.table(args$matrixPath, header=TRUE, sep="\t", stringsAsFactors=FALSE)
+    row.names(counts) <- counts[, 1]
+    counts <- counts[ , -1]
+    countsRows <- nrow(counts)
+
+    # Process factors
+    if (is.null(args$factInput)) {
+            factorData <- read.table(args$factFile, header=TRUE, sep="\t")
+            factors <- factorData[, -1, drop=FALSE]
+    }  else {
+            factors <- unlist(strsplit(args$factInput, "|", fixed=TRUE))
+            factorData <- list()
+            for (fact in factors) {
+                newFact <- unlist(strsplit(fact, split="::"))
+                factorData <- rbind(factorData, newFact)
+            } # Factors have the form: FACT_NAME::LEVEL,LEVEL,LEVEL,LEVEL,... The first factor is the Primary Factor.
+
+            # Set the row names to be the name of the factor and delete first row
+            row.names(factorData) <- factorData[, 1]
+            factorData <- factorData[, -1]
+            factorData <- sapply(factorData, sanitiseGroups)
+            factorData <- sapply(factorData, strsplit, split=",")
+            factorData <- sapply(factorData, make.names)
+            # Transform factor data into data frame of R factor objects
+            factors <- data.frame(factorData)
+    }
+}
+
+# Create a DGEList object
+counts <- DGEList(counts)
+
+# Set group to be the Primary Factor input
+group <- factors[, 1, drop=FALSE]
+
+# Split up contrasts separated by comma into a vector then sanitise
+contrastData <- unlist(strsplit(args$contrastData, split=","))
+contrastData <- sanitiseEquation(contrastData)
+contrastData <- gsub(" ", ".", contrastData, fixed=TRUE)
+
+# Creating design
+row.names(factors) <- colnames(counts)
+factorList <- sapply(names(factors), pasteListName)
+
+formula <- "~0"
+for (i in 1:length(factorList)) {
+    formula <- paste(formula, factorList[i], sep="+")
+}
+formula <- formula(formula)
+
+design <- model.matrix(formula)
+
+for (i in 1:length(factorList)) {
+    colnames(design) <- gsub(factorList[i], "", colnames(design), fixed=TRUE)
+}
+
+## Generate Contrasts information
+contrasts <- makeContrasts(contrasts=contrastData, levels=design)
+
+
+## Add Gene Symbol information
+
+genes <- read.table(args$genes, sep='\t', header=TRUE)
+
+
+## Set Gene Set Testing Methods
+
+base_methods <- unlist(strsplit(args$base_methods, ","))
+
+
+## Set Gene Sets
+
+if (args$msigdb != "None") {
+    msigdb <- unlist(strsplit(args$msigdb, ","))
+} else {
+    msigdb <- "none"
+}
+
+if (args$keggdb != "None") {
+    keggdb <- unlist(strsplit(args$keggdb, ","))
+    kegg_all <- c("Metabolism"="keggmet", "Signaling"="keggsig", "Disease"="keggdis")
+    kegg_exclude <- names(kegg_all[!(kegg_all %in% keggdb)])
+} else {
+    kegg_exclude <- "all"
+}
+
+if (args$gsdb != "None") {
+    gsdb <- unlist(strsplit(args$gsdb, ","))
+} else {
+    gsdb <- "none"
+}
+
+
+## Index gene sets
+
+gs.annots <- buildIdx(entrezIDs=rownames(counts), species=args$species, msigdb.gsets=msigdb, gsdb.gsets=gsdb, kegg.exclude=kegg_exclude)
+
+
+## Run egsea.cnt
+
+gsa <- egsea.cnt(counts=counts, group=group, design=design, contrasts=contrasts, gs.annots=gs.annots, symbolsMap=genes, baseGSEAs=base_methods, minSize=args$min_size, display.top=args$display_top, combineMethod=args$combine_method, sort.by=args$sort_method, report.dir='./report_dir', fdr.cutoff=args$fdr_cutoff, num.threads=args$threads, report=TRUE)
+
+
+## Output RData file
+
+if (!is.null(args$rdata)) {
+  save.image(file = "EGSEA_analysis.RData")
+}
\ No newline at end of file