comparison ruvseq.R @ 0:61dffb20b6f9 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ruvseq commit b95582cea8320d5488056a9576474f79cec53be8
author iuc
date Wed, 05 Sep 2018 15:54:16 -0400
parents
children c24765926774
comparison
equal deleted inserted replaced
-1:000000000000 0:61dffb20b6f9
1 # setup R error handling to go to stderr
2 library("getopt")
3 options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
4 options(stringAsFactors = FALSE, useFancyQuotes = FALSE)
5
6 setup_cmdline_options <- function() {
7 args <- commandArgs(trailingOnly = TRUE)
8 spec <- matrix(c(
9 "help", "h", 0, "logical",
10 "alpha", "a", 1, "double",
11 "min_mean_count", "min_c", 1, "double",
12 "min_k", "min_k", 1, "double",
13 "max_k", "max_k", 1, "double",
14 "sample_json", "s", 1, "character",
15 "plots" , "p", 1, "character",
16 "header", "H", 0, "logical",
17 "txtype", "y", 1, "character",
18 "tx2gene", "x", 1, "character"), # a space-sep tx-to-gene map or GTF file (auto detect .gtf/.GTF)
19 byrow=TRUE, ncol=4)
20
21 opt <- getopt(spec)
22 # if help was asked for print a friendly message
23 # and exit with a non-zero error code
24 if (!is.null(opt$help)) {
25 cat(getopt(spec, usage=TRUE))
26 q(status=1)
27 } else {
28 load_libraries()
29 }
30 return(opt)
31 }
32
33 load_libraries <- function() {
34 # Allows displaying help without waiting for libraries to load
35 library("tools")
36 library("jsonlite")
37 library("reshape2")
38 library("RUVSeq")
39 library("RColorBrewer")
40 library("tximport")
41 library("DESeq2")
42 library("ggrepel")
43 }
44
45 source_local <- function(fname){
46 argv <- commandArgs(trailingOnly = FALSE)
47 base_dir <- dirname(substring(argv[grep("--file=", argv)], 8))
48 source(paste(base_dir, fname, sep="/"))
49 }
50
51 # Source get_deseq_dataset.R for getting deseq dataset from htseq/featurecounts/tximport
52 source_local('get_deseq_dataset.R')
53
54 # RUVseq function definitions
55
56 plot_pca_rle <- function (set, title) {
57 x <- pData(set)[,1]
58 colors <- brewer.pal(3, "Set2")
59 label <- paste0(' for ', title)
60 plotRLE(set, outline=FALSE, ylim=c(-4, 4), col=colors[x])
61 title(main=paste0("RLE", label))
62 plotPCA(set, col=colors[x], cex=1.2)
63 title(main=paste0("PCA", label))
64 }
65
66 plot_factors_of_unwanted_variation <- function(set, method, k){
67 pd <- pData(set)
68 pd['sample'] <- row.names(pd)
69 colnames(pd)[1] <- 'condition'
70 d = melt(pd, id.vars = c('sample', 'condition'))
71 d['x'] <- 1 # There is no information on the X, so we just fake it to be able to do a scatterplot
72 print(ggplot(d, aes(x=x, y=value, color=condition, label=sample)) +
73 geom_point() +
74 ggtitle(paste0('Factors of unwanted variation for method: ', method, ", k=", k)) +
75 facet_wrap( ~ variable, scales = "free_x") +
76 geom_text_repel() +
77 theme(axis.title.x=element_blank(),
78 axis.text.x=element_blank(),
79 axis.ticks.x=element_blank(),
80 plot.title = element_text(hjust = 0.5))
81 )
82 }
83
84 create_seq_expression_set <- function (dds, min_mean_count) {
85 count_values <- counts(dds)
86 filter <- apply(count_values, 1, function(x) mean(x) > min_mean_count)
87 filtered <- count_values[filter,]
88 set = newSeqExpressionSet(as.matrix(count_values),
89 phenoData = data.frame(colData(dds)$condition, row.names=colnames(filtered)))
90 plot_pca_rle(set = set, title = 'raw data')
91 set <- betweenLaneNormalization(set, which="upper")
92 plot_pca_rle(set = set, title = 'upper quartile normalized')
93 return(set)
94 }
95
96 get_empirical_control_genes <- function(set, cutoff_p) {
97 x <- pData(set)[,1]
98 design <- model.matrix(~x, data=pData(set))
99 y <- DGEList(counts=counts(set), group=x)
100 y <- calcNormFactors(y, method="upperquartile")
101 y <- estimateGLMCommonDisp(y, design)
102 y <- estimateGLMTagwiseDisp(y, design)
103 fit <- glmFit(y, design)
104 lrt <- glmLRT(fit, coef=2)
105 top <- topTags(lrt, n=nrow(set))$table
106 top_rows <- rownames(top)[which(top$PValue > cutoff_p)]
107 empirical <- rownames(set)[which(!(rownames(set) %in% top_rows))]
108 return(empirical)
109 }
110
111 ruv_control_gene_method <- function(set, k, control_genes='empirical', cutoff_p=0.2) {
112 if (control_genes == 'empirical') {
113 control_genes = get_empirical_control_genes(set, cutoff_p=cutoff_p)
114 }
115 set <- RUVg(set, control_genes, k=k)
116 plot_pca_rle(set, paste0("RUVg with empirical control genes, k=", k))
117 plot_factors_of_unwanted_variation(set, method="RUVg with empirical control genes", k=k)
118 return(set)
119 }
120
121 ruv_residual_method <- function(set, k) {
122 genes <- rownames(counts(set))
123 x <- pData(set)[,1]
124 # Initial edger residuals
125 design <- model.matrix(~x, data=pData(set))
126 y <- DGEList(counts=counts(set), group=x)
127 y <- calcNormFactors(y, method="upperquartile")
128 y <- estimateGLMCommonDisp(y, design)
129 y <- estimateGLMTagwiseDisp(y, design)
130 fit <- glmFit(y, design)
131 res <- residuals(fit, type="deviance")
132 set <- RUVr(set, genes, k=k, res)
133 plot_pca_rle(set = set, title = paste0('RUVr using residuals, k=', k))
134 plot_factors_of_unwanted_variation(set, method="RUVr using residuals", k=k)
135 return(set)
136 }
137
138 ruv_replicate_method <- function (set, k) {
139 genes <- rownames(counts(set))
140 x <- pData(set)[,1]
141 differences <- makeGroups(x)
142 set <- RUVs(set, genes, k=k, differences)
143 plot_pca_rle(set, paste0('RUVs with replicate samples, k=', k))
144 plot_factors_of_unwanted_variation(set, method="RUVs using replicates", k=k)
145 return(set)
146 }
147
148 get_differentially_expressed_genes <- function(dds, contrast, alpha=0.01) {
149 r <- results(dds, contrast=contrast, alpha=alpha)
150 return(rownames(r[which(r$padj < alpha),]))
151 }
152
153 opt <- setup_cmdline_options()
154 alpha <- opt$alpha
155 min_k <- opt$min_k
156 max_k <- opt$max_k
157 sample_json <- fromJSON(opt$sample_json)
158 sample_paths <- sample_json$path
159 sample_names <- sample_json$label
160 condition <- as.factor(sample_json$condition)
161 sampleTable <- data.frame(samplename=sample_names,
162 filename = sample_paths,
163 condition=condition)
164 rownames(sampleTable) <- sample_names
165
166 dds <- get_deseq_dataset(sampleTable, header=opt$header, designFormula= ~ condition, tximport=opt$txtype, txtype=opt$txtype, tx2gene=opt$tx2gene)
167 if (!is.null(opt$plots)) {
168 pdf(opt$plots)
169 }
170
171 # Run through the ruvseq variants
172 set <- create_seq_expression_set(dds, min_mean_count = opt$min_mean_count)
173 result <- list(no_correction = set)
174 for (k in seq(min_k, max_k)) {
175 result[[paste0('residual_method_k', k)]] <- ruv_residual_method(set, k=k)
176 result[[paste0('replicate_method_k', k)]] <- ruv_replicate_method(set, k=k)
177 result[[paste0('control_method_k', k)]] <- ruv_control_gene_method(set, k=k, cutoff_p=0.5)
178 }
179
180 for (name in names(result)) {
181 if (!startsWith(name, "no_correction")) {
182 set <- result[[name]]
183 unwanted_variation <- pData(set)
184 df <- data.frame(identifier = rownames(unwanted_variation))
185 df <- cbind(df, unwanted_variation)
186 colnames(df)[2] <- 'condition'
187 write.table(df, file=paste0("batch_effects_", name, ".tabular"), sep="\t", quote=F, row.names=F)
188 }
189 }
190
191 # close the plot device
192 if (!is.null(opt$plots)) {
193 cat("closing plot device\n")
194 dev.off()
195 }
196
197 cat("Session information:\n\n")
198 sessionInfo()