Mercurial > repos > iuc > egsea
comparison egsea.R @ 4:fba1660fb717 draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/egsea commit c2313b506b3b8ae860bb844b979397d87de4fb44"
author | iuc |
---|---|
date | Mon, 28 Jun 2021 09:45:14 +0000 |
parents | ba2111ae6eb4 |
children |
comparison
equal
deleted
inserted
replaced
3:31ea4992b948 | 4:fba1660fb717 |
---|---|
1 # Code based on (and inspired by) the Galaxy limma-voom/edgeR/DESeq2 wrappers | 1 # Code based on (and inspired by) the Galaxy limma-voom/edgeR/DESeq2 wrappers |
2 | 2 |
3 options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) | 3 options(show.error.messages = F, error = function() { |
4 cat(geterrmessage(), file = stderr()); q("no", 1, F) | |
5 }) | |
4 | 6 |
5 # we need that to not crash galaxy with an UTF8 error on German LC settings. | 7 # we need that to not crash galaxy with an UTF8 error on German LC settings. |
6 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") | 8 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") |
7 | 9 |
8 suppressPackageStartupMessages({ | 10 suppressPackageStartupMessages({ |
13 }) | 15 }) |
14 | 16 |
15 | 17 |
16 ## Function Declaration | 18 ## Function Declaration |
17 | 19 |
18 sanitiseEquation <- function(equation) { | 20 sanitise_equation <- function(equation) { |
19 equation <- gsub(" *[+] *", "+", equation) | 21 equation <- gsub(" *[+] *", "+", equation) |
20 equation <- gsub(" *[-] *", "-", equation) | 22 equation <- gsub(" *[-] *", "-", equation) |
21 equation <- gsub(" *[/] *", "/", equation) | 23 equation <- gsub(" *[/] *", "/", equation) |
22 equation <- gsub(" *[*] *", "*", equation) | 24 equation <- gsub(" *[*] *", "*", equation) |
23 equation <- gsub("^\\s+|\\s+$", "", equation) | 25 equation <- gsub("^\\s+|\\s+$", "", equation) |
24 return(equation) | 26 return(equation) |
25 } | 27 } |
26 | 28 |
27 # Function to sanitise group information | 29 # Function to sanitise group information |
28 sanitiseGroups <- function(string) { | 30 sanitise_groups <- function(string) { |
29 string <- gsub(" *[,] *", ",", string) | 31 string <- gsub(" *[,] *", ",", string) |
30 string <- gsub("^\\s+|\\s+$", "", string) | 32 string <- gsub("^\\s+|\\s+$", "", string) |
31 return(string) | 33 return(string) |
32 } | 34 } |
33 | 35 |
34 # Generating design information | 36 # Generating design information |
35 pasteListName <- function(string) { | 37 paste_listname <- function(string) { |
36 return(paste0("factors$", string)) | 38 return(paste0("factors$", string)) |
37 } | 39 } |
38 | 40 |
39 ## Input Processing | 41 ## Input Processing |
40 | 42 |
41 option_list <- list( | 43 option_list <- list( |
42 make_option(c("-threads", "--threads"), default=2, type="integer", help="Number of threads for egsea"), | 44 make_option("--threads", default = 2, type = "integer", help = "Number of threads for egsea"), |
43 make_option(c("-filesPath", "--filesPath"), type="character", help="JSON list object if multiple files input"), | 45 make_option("--filesPath", type = "character", help = "JSON list object if multiple files input"), |
44 make_option(c("-matrixPath", "--matrixPath"), type="character", help="Path to count matrix"), | 46 make_option("--matrixPath", type = "character", help = "Path to count matrix"), |
45 make_option(c("-factFile", "--factFile"), type="character", help="Path to factor information file"), | 47 make_option("--factFile", type = "character", help = "Path to factor information file"), |
46 make_option(c("-factInput", "--factInput"), type="character", help="String containing factors if manually input"), | 48 make_option("--factInput", type = "character", help = "String containing factors if manually input"), |
47 make_option(c("-contrastData", "--contrastData"), type="character", help="Contrasts of Interest (Groups to compare)"), | 49 make_option("--contrastData", type = "character", help = "Contrasts of Interest (Groups to compare)"), |
48 make_option(c("-genes", "--genes"), type="character", help="Path to genes file"), | 50 make_option("--genes", type = "character", help = "Path to genes file"), |
49 make_option(c("-species", "--species"), type="character"), | 51 make_option("--species", type = "character"), |
50 make_option(c("-base_methods", "--base_methods"), type="character", help="Gene set testing methods"), | 52 make_option("--base_methods", type = "character", help = "Gene set testing methods"), |
51 make_option(c("-msigdb", "--msigdb"), type="character", help="MSigDB Gene Set Collections"), | 53 make_option("--msigdb", type = "character", help = "MSigDB Gene Set Collections"), |
52 make_option(c("-keggdb", "--keggdb"), type="character", help="KEGG Pathways"), | 54 make_option("--keggdb", type = "character", help = "KEGG Pathways"), |
53 make_option(c("-keggupdated", "--keggupdated"), type="logical", help="Use updated KEGG"), | 55 make_option("--keggupdated", type = "logical", help = "Use updated KEGG"), |
54 make_option(c("-gsdb", "--gsdb"), type="character", help = "GeneSetDB Gene Sets"), | 56 make_option("--gsdb", type = "character", help = "GeneSetDB Gene Sets"), |
55 make_option(c("-display_top", "--display_top"), type="integer", help = "Number of top Gene Sets to display"), | 57 make_option("--display_top", type = "integer", help = "Number of top Gene Sets to display"), |
56 make_option(c("-min_size", "--min_size"), type="integer", help = "Minimum Size of Gene Set"), | 58 make_option("--min_size", type = "integer", help = "Minimum Size of Gene Set"), |
57 make_option(c("-fdr_cutoff", "--fdr_cutoff"), type="double", help = "FDR cutoff"), | 59 make_option("--fdr_cutoff", type = "double", help = "FDR cutoff"), |
58 make_option(c("-combine_method", "--combine_method"), type="character", help="Method to use to combine the p-values"), | 60 make_option("--combine_method", type = "character", help = "Method to use to combine the p-values"), |
59 make_option(c("-sort_method", "--sort_method"), type="character", help="Method to sort the results"), | 61 make_option("--sort_method", type = "character", help = "Method to sort the results"), |
60 make_option(c("-rdaOpt", "--rdaOpt"), type="character", help="Output RData file") | 62 make_option("--rdaOpt", type = "character", help = "Output RData file") |
61 ) | 63 ) |
62 | 64 |
63 parser <- OptionParser(usage = "%prog [options] file", option_list=option_list) | 65 parser <- OptionParser(usage = "%prog [options] file", option_list = option_list) |
64 args = parse_args(parser) | 66 args <- parse_args(parser) |
65 | 67 |
66 | 68 |
67 ## Read in Files | 69 ## Read in Files |
68 | 70 |
69 if (!is.null(args$filesPath)) { | 71 if (!is.null(args$filesPath)) { |
70 # Process the separate count files (adapted from DESeq2 wrapper) | 72 # Process the separate count files (adapted from DESeq2 wrapper) |
71 library("rjson") | 73 library("rjson") |
72 parser <- newJSONParser() | 74 parser <- newJSONParser() |
73 parser$addData(args$filesPath) | 75 parser$addData(args$filesPath) |
74 factorList <- parser$getObject() | 76 factor_list <- parser$getObject() |
75 factors <- sapply(factorList, function(x) x[[1]]) | 77 factors <- sapply(factor_list, function(x) x[[1]]) |
76 filenamesIn <- unname(unlist(factorList[[1]][[2]])) | 78 filenames_in <- unname(unlist(factor_list[[1]][[2]])) |
77 sampleTable <- data.frame(sample=basename(filenamesIn), | 79 sampletable <- data.frame(sample = basename(filenames_in), |
78 filename=filenamesIn, | 80 filename = filenames_in, |
79 row.names=filenamesIn, | 81 row.names = filenames_in, |
80 stringsAsFactors=FALSE) | 82 stringsAsFactors = FALSE) |
81 for (factor in factorList) { | 83 for (factor in factor_list) { |
82 factorName <- factor[[1]] | 84 factorname <- factor[[1]] |
83 sampleTable[[factorName]] <- character(nrow(sampleTable)) | 85 sampletable[[factorname]] <- character(nrow(sampletable)) |
84 lvls <- sapply(factor[[2]], function(x) names(x)) | 86 lvls <- sapply(factor[[2]], function(x) names(x)) |
85 for (i in seq_along(factor[[2]])) { | 87 for (i in seq_along(factor[[2]])) { |
86 files <- factor[[2]][[i]][[1]] | 88 files <- factor[[2]][[i]][[1]] |
87 sampleTable[files,factorName] <- lvls[i] | 89 sampletable[files, factorname] <- lvls[i] |
88 } | 90 } |
89 sampleTable[[factorName]] <- factor(sampleTable[[factorName]], levels=lvls) | 91 sampletable[[factorname]] <- factor(sampletable[[factorname]], levels = lvls) |
90 } | 92 } |
91 rownames(sampleTable) <- sampleTable$sample | 93 rownames(sampletable) <- sampletable$sample |
92 rem <- c("sample","filename") | 94 rem <- c("sample", "filename") |
93 factors <- sampleTable[, !(names(sampleTable) %in% rem), drop=FALSE] | 95 factors <- sampletable[, !(names(sampletable) %in% rem), drop = FALSE] |
94 | 96 |
95 #read in count files and create single table | 97 #read in count files and create single table |
96 countfiles <- lapply(sampleTable$filename, function(x){read.delim(x, row.names=1)}) | 98 countfiles <- lapply(sampletable$filename, function(x) { |
99 read.delim(x, row.names = 1) | |
100 }) | |
97 counts <- do.call("cbind", countfiles) | 101 counts <- do.call("cbind", countfiles) |
98 | 102 |
99 } else { | 103 } else { |
100 # Process the single count matrix | 104 # Process the single count matrix |
101 counts <- read.table(args$matrixPath, header=TRUE, sep="\t", stringsAsFactors=FALSE, check.names=FALSE) | 105 counts <- read.table(args$matrixPath, header = TRUE, sep = "\t", stringsAsFactors = FALSE, check.names = FALSE) |
102 row.names(counts) <- counts[, 1] | 106 row.names(counts) <- counts[, 1] |
103 counts <- counts[ , -1] | 107 counts <- counts[, -1] |
104 countsRows <- nrow(counts) | 108 countsrows <- nrow(counts) |
105 | 109 |
106 # Process factors | 110 # Process factors |
107 if (is.null(args$factInput)) { | 111 if (is.null(args$factInput)) { |
108 factorData <- read.table(args$factFile, header=TRUE, sep="\t", strip.white=TRUE) | 112 factordata <- read.table(args$factFile, header = TRUE, sep = "\t", strip.white = TRUE, stringsAsFactors = TRUE) |
109 # check samples names match | 113 # check samples names match |
110 if(!any(factorData[, 1] %in% colnames(counts))) | 114 if (!any(factordata[, 1] %in% colnames(counts))) |
111 stop("Sample IDs in factors file and count matrix don't match") | 115 stop("Sample IDs in factors file and count matrix don't match") |
112 # order samples as in counts matrix | 116 # order samples as in counts matrix |
113 factorData <- factorData[match(colnames(counts), factorData[, 1]), ] | 117 factordata <- factordata[match(colnames(counts), factordata[, 1]), ] |
114 factors <- factorData[, -1, drop=FALSE] | 118 factors <- factordata[, -1, drop = FALSE] |
115 } else { | 119 } else { |
116 factors <- unlist(strsplit(args$factInput, "|", fixed=TRUE)) | 120 factors <- unlist(strsplit(args$factInput, "|", fixed = TRUE)) |
117 factorData <- list() | 121 factordata <- list() |
118 for (fact in factors) { | 122 for (fact in factors) { |
119 newFact <- unlist(strsplit(fact, split="::")) | 123 newfact <- unlist(strsplit(fact, split = "::")) |
120 factorData <- rbind(factorData, newFact) | 124 factordata <- rbind(factordata, newfact) |
121 } # Factors have the form: FACT_NAME::LEVEL,LEVEL,LEVEL,LEVEL,... The first factor is the Primary Factor. | 125 } # Factors have the form: FACT_NAME::LEVEL,LEVEL,LEVEL,LEVEL,... The first factor is the Primary Factor. |
122 | 126 |
123 # Set the row names to be the name of the factor and delete first row | 127 # Set the row names to be the name of the factor and delete first row |
124 row.names(factorData) <- factorData[, 1] | 128 row.names(factordata) <- factordata[, 1] |
125 factorData <- factorData[, -1] | 129 factordata <- factordata[, -1] |
126 factorData <- sapply(factorData, sanitiseGroups) | 130 factordata <- sapply(factordata, sanitise_groups) |
127 factorData <- sapply(factorData, strsplit, split=",") | 131 factordata <- sapply(factordata, strsplit, split = ",") |
128 factorData <- sapply(factorData, make.names) | 132 factordata <- sapply(factordata, make.names) |
129 # Transform factor data into data frame of R factor objects | 133 # Transform factor data into data frame of R factor objects |
130 factors <- data.frame(factorData) | 134 factors <- data.frame(factordata, stringsAsFactors = TRUE) |
131 } | 135 } |
132 } | 136 } |
133 | 137 |
134 # Create a DGEList object | 138 # Create a DGEList object |
135 counts <- DGEList(counts) | 139 counts <- DGEList(counts) |
136 | 140 |
137 # Set group to be the Primary Factor input | 141 # Set group to be the Primary Factor input |
138 group <- factors[, 1, drop=FALSE] | 142 group <- factors[, 1, drop = FALSE] |
139 | 143 |
140 # Split up contrasts separated by comma into a vector then sanitise | 144 # Split up contrasts separated by comma into a vector then sanitise |
141 contrastData <- unlist(strsplit(args$contrastData, split=",")) | 145 contrast_data <- unlist(strsplit(args$contrastData, split = ",")) |
142 contrastData <- sanitiseEquation(contrastData) | 146 contrast_data <- sanitise_equation(contrast_data) |
143 contrastData <- gsub(" ", ".", contrastData, fixed=TRUE) | 147 contrast_data <- gsub(" ", ".", contrast_data, fixed = TRUE) |
144 | 148 |
145 # Creating design | 149 # Creating design |
146 row.names(factors) <- colnames(counts) | 150 row.names(factors) <- colnames(counts) |
147 factorList <- sapply(names(factors), pasteListName) | 151 factor_list <- sapply(names(factors), paste_listname) |
148 | 152 |
149 formula <- "~0" | 153 formula <- "~0" |
150 for (i in 1:length(factorList)) { | 154 for (i in seq_along(factor_list)) { |
151 formula <- paste(formula, factorList[i], sep="+") | 155 formula <- paste(formula, factor_list[i], sep = "+") |
152 } | 156 } |
153 formula <- formula(formula) | 157 formula <- formula(formula) |
154 | 158 |
155 design <- model.matrix(formula) | 159 design <- model.matrix(formula) |
156 | 160 |
157 for (i in 1:length(factorList)) { | 161 for (i in seq_along(factor_list)) { |
158 colnames(design) <- gsub(factorList[i], "", colnames(design), fixed=TRUE) | 162 colnames(design) <- gsub(factor_list[i], "", colnames(design), fixed = TRUE) |
159 } | 163 } |
160 | 164 |
161 ## Generate Contrasts information | 165 ## Generate Contrasts information |
162 contrasts <- makeContrasts(contrasts=contrastData, levels=design) | 166 contrasts <- makeContrasts(contrasts = contrast_data, levels = design) |
163 | 167 |
164 | 168 |
165 ## Add Gene Symbol information | 169 ## Add Gene Symbol information |
166 | 170 |
167 genes <- read.table(args$genes, sep='\t', header=TRUE) | 171 genes <- read.table(args$genes, sep = "\t", header = TRUE) |
168 | 172 |
169 | 173 |
170 ## Set Gene Set Testing Methods | 174 ## Set Gene Set Testing Methods |
171 | 175 |
172 base_methods <- unlist(strsplit(args$base_methods, ",")) | 176 base_methods <- unlist(strsplit(args$base_methods, ",")) |
180 msigdb <- "none" | 184 msigdb <- "none" |
181 } | 185 } |
182 | 186 |
183 if (args$keggdb != "None") { | 187 if (args$keggdb != "None") { |
184 keggdb <- unlist(strsplit(args$keggdb, ",")) | 188 keggdb <- unlist(strsplit(args$keggdb, ",")) |
185 kegg_all <- c("Metabolism"="keggmet", "Signaling"="keggsig", "Disease"="keggdis") | 189 kegg_all <- c("Metabolism" = "keggmet", "Signaling" = "keggsig", "Disease" = "keggdis") |
186 kegg_exclude <- names(kegg_all[!(kegg_all %in% keggdb)]) | 190 kegg_exclude <- names(kegg_all[!(kegg_all %in% keggdb)]) |
187 } else { | 191 } else { |
188 kegg_exclude <- "all" | 192 kegg_exclude <- "all" |
189 } | 193 } |
190 | 194 |
194 gsdb <- "none" | 198 gsdb <- "none" |
195 } | 199 } |
196 | 200 |
197 ## Index gene sets | 201 ## Index gene sets |
198 | 202 |
199 gs.annots <- buildIdx(entrezIDs=rownames(counts), species=args$species, msigdb.gsets=msigdb, gsdb.gsets=gsdb, kegg.exclude=kegg_exclude, kegg.updated=args$keggupdated) | 203 gs_annots <- buildIdx(entrezIDs = rownames(counts), species = args$species, msigdb.gsets = msigdb, gsdb.gsets = gsdb, kegg.exclude = kegg_exclude, kegg.updated = args$keggupdated) |
200 | 204 |
201 | 205 |
202 ## Run egsea.cnt | 206 ## Run egsea.cnt |
203 | 207 |
204 gsa <- egsea.cnt(counts=counts, group=group, design=design, contrasts=contrasts, gs.annots=gs.annots, symbolsMap=genes, baseGSEAs=base_methods, minSize=args$min_size, display.top=args$display_top, combineMethod=args$combine_method, sort.by=args$sort_method, report.dir='./report_dir', fdr.cutoff=args$fdr_cutoff, num.threads=args$threads, report=TRUE) | 208 gsa <- egsea.cnt(counts = counts, group = group, design = design, contrasts = contrasts, gs.annots = gs_annots, symbolsMap = genes, baseGSEAs = base_methods, minSize = args$min_size, display.top = args$display_top, combineMethod = args$combine_method, sort.by = args$sort_method, report.dir = "./report_dir", fdr.cutoff = args$fdr_cutoff, num.threads = args$threads, report = TRUE) |
205 | 209 |
206 | 210 |
207 ## Output RData file | 211 ## Output RData file |
208 | 212 |
209 if (!is.null(args$rdaOpt)) { | 213 if (!is.null(args$rdaOpt)) { |