comparison ruvseq.R @ 2:fed9d0350d72 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ruvseq commit 4daa375d022673d2437d609b1865b78c64b04415"
author iuc
date Fri, 15 Jan 2021 17:53:15 +0000
parents c24765926774
children 3a083c78896e
comparison
equal deleted inserted replaced
1:c24765926774 2:fed9d0350d72
1 # setup R error handling to go to stderr 1 # setup R error handling to go to stderr
2 library("getopt") 2 library("getopt")
3 options( show.error.messages=F, error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) 3 options(show.error.messages = F, error = function() {
4 cat(geterrmessage(), file = stderr()); q("no", 1, F)
5 })
4 options(stringAsFactors = FALSE, useFancyQuotes = FALSE) 6 options(stringAsFactors = FALSE, useFancyQuotes = FALSE)
5 7
6 setup_cmdline_options <- function() { 8 setup_cmdline_options <- function() {
7 args <- commandArgs(trailingOnly = TRUE) 9 args <- commandArgs(trailingOnly = TRUE)
8 spec <- matrix(c( 10 spec <- matrix(c(
10 "alpha", "a", 1, "double", 12 "alpha", "a", 1, "double",
11 "min_mean_count", "min_c", 1, "double", 13 "min_mean_count", "min_c", 1, "double",
12 "min_k", "min_k", 1, "double", 14 "min_k", "min_k", 1, "double",
13 "max_k", "max_k", 1, "double", 15 "max_k", "max_k", 1, "double",
14 "sample_json", "s", 1, "character", 16 "sample_json", "s", 1, "character",
15 "plots" , "p", 1, "character", 17 "plots", "p", 1, "character",
16 "header", "H", 0, "logical", 18 "header", "H", 0, "logical",
17 "txtype", "y", 1, "character", 19 "txtype", "y", 1, "character",
18 "tx2gene", "x", 1, "character"), # a space-sep tx-to-gene map or GTF file (auto detect .gtf/.GTF) 20 "tx2gene", "x", 1, "character"), # a space-sep tx-to-gene map or GTF file (auto detect .gtf/.GTF)
19 byrow=TRUE, ncol=4) 21 byrow = TRUE, ncol = 4)
20 22
21 opt <- getopt(spec) 23 opt <- getopt(spec)
22 # if help was asked for print a friendly message 24 # if help was asked for print a friendly message
23 # and exit with a non-zero error code 25 # and exit with a non-zero error code
24 if (!is.null(opt$help)) { 26 if (!is.null(opt$help)) {
25 cat(getopt(spec, usage=TRUE)) 27 cat(getopt(spec, usage = TRUE))
26 q(status=1) 28 q(status = 1)
27 } else { 29 } else {
28 load_libraries() 30 load_libraries()
29 } 31 }
30 return(opt) 32 return(opt)
31 } 33 }
40 library("tximport") 42 library("tximport")
41 library("DESeq2") 43 library("DESeq2")
42 library("ggrepel") 44 library("ggrepel")
43 } 45 }
44 46
45 source_local <- function(fname){ 47 source_local <- function(fname) {
46 argv <- commandArgs(trailingOnly = FALSE) 48 argv <- commandArgs(trailingOnly = FALSE)
47 base_dir <- dirname(substring(argv[grep("--file=", argv)], 8)) 49 base_dir <- dirname(substring(argv[grep("--file=", argv)], 8))
48 source(paste(base_dir, fname, sep="/")) 50 source(paste(base_dir, fname, sep = "/"))
49 } 51 }
50 52
51 # Source get_deseq_dataset.R for getting deseq dataset from htseq/featurecounts/tximport 53 # Source get_deseq_dataset.R for getting deseq dataset from htseq/featurecounts/tximport
52 source_local('get_deseq_dataset.R') 54 source_local("get_deseq_dataset.R")
53 55
54 # RUVseq function definitions 56 # RUVseq function definitions
55 57
56 plot_pca_rle <- function (set, title) { 58 plot_pca_rle <- function(set, title) {
57 x <- pData(set)[,1] 59 x <- pData(set)[, 1]
58 colors <- brewer.pal(3, "Set2") 60 colors <- brewer.pal(3, "Set2")
59 label <- paste0(' for ', title) 61 label <- paste0(" for ", title)
60 plotRLE(set, outline=FALSE, ylim=c(-4, 4), col=colors[x]) 62 plotRLE(set, outline = FALSE, ylim = c(-4, 4), col = colors[x])
61 title(main=paste0("RLE", label)) 63 title(main = paste0("RLE", label))
62 plotPCA(set, col=colors[x], cex=1.2) 64 plotPCA(set, col = colors[x], cex = 1.2)
63 title(main=paste0("PCA", label)) 65 title(main = paste0("PCA", label))
64 } 66 }
65 67
66 plot_factors_of_unwanted_variation <- function(set, method, k){ 68 plot_factors_of_unwanted_var <- function(set, method, k) {
67 pd <- pData(set) 69 pd <- pData(set)
68 pd['sample'] <- row.names(pd) 70 pd["sample"] <- row.names(pd)
69 colnames(pd)[1] <- 'condition' 71 colnames(pd)[1] <- "condition"
70 d = melt(pd, id.vars = c('sample', 'condition')) 72 d <- melt(pd, id.vars = c("sample", "condition"))
71 d['x'] <- 1 # There is no information on the X, so we just fake it to be able to do a scatterplot 73 d["x"] <- 1 # There is no information on the X, so we just fake it to be able to do a scatterplot
72 print(ggplot(d, aes(x=x, y=value, color=condition, label=sample)) + 74 print(ggplot(d, aes(x = x, y = value, color = condition, label = sample)) +
73 geom_point() + 75 geom_point() +
74 ggtitle(paste0('Factors of unwanted variation for method: ', method, ", k=", k)) + 76 ggtitle(paste0("Factors of unwanted variation for method: ", method, ", k=", k)) +
75 facet_wrap( ~ variable, scales = "free_x") + 77 facet_wrap(~ variable, scales = "free_x") +
76 geom_text_repel() + 78 geom_text_repel() +
77 theme(axis.title.x=element_blank(), 79 theme(axis.title.x = element_blank(),
78 axis.text.x=element_blank(), 80 axis.text.x = element_blank(),
79 axis.ticks.x=element_blank(), 81 axis.ticks.x = element_blank(),
80 plot.title = element_text(hjust = 0.5)) 82 plot.title = element_text(hjust = 0.5))
81 ) 83 )
82 } 84 }
83 85
84 create_seq_expression_set <- function (dds, min_mean_count) { 86 create_seq_expression_set <- function(dds, min_mean_count) {
85 count_values <- counts(dds) 87 count_values <- counts(dds)
86 print(paste0("feature count before filtering :",nrow(count_values),"\n")) 88 print(paste0("feature count before filtering :", nrow(count_values), "\n"))
87 print(paste0("Filtering features which mean expression is less or eq. than ", min_mean_count, " counts\n")) 89 print(paste0("Filtering features which mean expression is less or eq. than ", min_mean_count, " counts\n"))
88 filter <- apply(count_values, 1, function(x) mean(x) > min_mean_count) 90 filter <- apply(count_values, 1, function(x) mean(x) > min_mean_count)
89 filtered <- count_values[filter,] 91 filtered <- count_values[filter, ]
90 print(paste0("feature count after filtering :",nrow(filtered),"\n")) 92 print(paste0("feature count after filtering :", nrow(filtered), "\n"))
91 set = newSeqExpressionSet(as.matrix(filtered), 93 set <- newSeqExpressionSet(as.matrix(filtered),
92 phenoData = data.frame(colData(dds)$condition, row.names=colnames(filtered))) 94 phenoData = data.frame(colData(dds)$condition, row.names = colnames(filtered)))
93 plot_pca_rle(set = set, title = 'raw data') 95 plot_pca_rle(set = set, title = "raw data")
94 set <- betweenLaneNormalization(set, which="upper") 96 set <- betweenLaneNormalization(set, which = "upper")
95 plot_pca_rle(set = set, title = 'upper quartile normalized') 97 plot_pca_rle(set = set, title = "upper quartile normalized")
96 return(set) 98 return(set)
97 } 99 }
98 100
99 get_empirical_control_genes <- function(set, cutoff_p) { 101 get_empirical_control_genes <- function(set, cutoff_p) {
100 x <- pData(set)[,1] 102 x <- pData(set)[, 1]
101 design <- model.matrix(~x, data=pData(set)) 103 design <- model.matrix(~x, data = pData(set))
102 y <- DGEList(counts=counts(set), group=x) 104 y <- DGEList(counts = counts(set), group = x)
103 y <- calcNormFactors(y, method="upperquartile") 105 y <- calcNormFactors(y, method = "upperquartile")
104 y <- estimateGLMCommonDisp(y, design) 106 y <- estimateGLMCommonDisp(y, design)
105 y <- estimateGLMTagwiseDisp(y, design) 107 y <- estimateGLMTagwiseDisp(y, design)
106 fit <- glmFit(y, design) 108 fit <- glmFit(y, design)
107 lrt <- glmLRT(fit, coef=2) 109 lrt <- glmLRT(fit, coef = 2)
108 top <- topTags(lrt, n=nrow(set))$table 110 top <- topTags(lrt, n = nrow(set))$table
109 top_rows <- rownames(top)[which(top$PValue < cutoff_p)] 111 top_rows <- rownames(top)[which(top$PValue < cutoff_p)]
110 empirical <- rownames(set)[which(!(rownames(set) %in% top_rows))] 112 empirical <- rownames(set)[which(!(rownames(set) %in% top_rows))]
111 return(empirical) 113 return(empirical)
112 } 114 }
113 115
114 ruv_control_gene_method <- function(set, k, control_genes='empirical', cutoff_p=0.2) { 116 ruv_control_gene_method <- function(set, k, control_genes = "empirical", cutoff_p = 0.2) {
115 if (control_genes == 'empirical') { 117 if (control_genes == "empirical") {
116 control_genes = get_empirical_control_genes(set, cutoff_p=cutoff_p) 118 control_genes <- get_empirical_control_genes(set, cutoff_p = cutoff_p)
117 } 119 }
118 set <- RUVg(set, control_genes, k=k) 120 set <- RUVg(set, control_genes, k = k)
119 plot_pca_rle(set, paste0("RUVg with empirical control genes, k=", k)) 121 plot_pca_rle(set, paste0("RUVg with empirical control genes, k=", k))
120 plot_factors_of_unwanted_variation(set, method="RUVg with empirical control genes", k=k) 122 plot_factors_of_unwanted_var(set, method = "RUVg with empirical control genes", k = k)
121 return(set) 123 return(set)
122 } 124 }
123 125
124 ruv_residual_method <- function(set, k) { 126 ruv_residual_method <- function(set, k) {
125 genes <- rownames(counts(set)) 127 genes <- rownames(counts(set))
126 x <- pData(set)[,1] 128 x <- pData(set)[, 1]
127 # Initial edger residuals 129 # Initial edger residuals
128 design <- model.matrix(~x, data=pData(set)) 130 design <- model.matrix(~x, data = pData(set))
129 y <- DGEList(counts=counts(set), group=x) 131 y <- DGEList(counts = counts(set), group = x)
130 y <- calcNormFactors(y, method="upperquartile") 132 y <- calcNormFactors(y, method = "upperquartile")
131 y <- estimateGLMCommonDisp(y, design) 133 y <- estimateGLMCommonDisp(y, design)
132 y <- estimateGLMTagwiseDisp(y, design) 134 y <- estimateGLMTagwiseDisp(y, design)
133 fit <- glmFit(y, design) 135 fit <- glmFit(y, design)
134 res <- residuals(fit, type="deviance") 136 res <- residuals(fit, type = "deviance")
135 set <- RUVr(set, genes, k=k, res) 137 set <- RUVr(set, genes, k = k, res)
136 plot_pca_rle(set = set, title = paste0('RUVr using residuals, k=', k)) 138 plot_pca_rle(set = set, title = paste0("RUVr using residuals, k=", k))
137 plot_factors_of_unwanted_variation(set, method="RUVr using residuals", k=k) 139 plot_factors_of_unwanted_var(set, method = "RUVr using residuals", k = k)
138 return(set) 140 return(set)
139 } 141 }
140 142
141 ruv_replicate_method <- function (set, k) { 143 ruv_replicate_method <- function(set, k) {
142 genes <- rownames(counts(set)) 144 genes <- rownames(counts(set))
143 x <- pData(set)[,1] 145 x <- pData(set)[, 1]
144 differences <- makeGroups(x) 146 differences <- makeGroups(x)
145 set <- RUVs(set, genes, k=k, differences) 147 set <- RUVs(set, genes, k = k, differences)
146 plot_pca_rle(set, paste0('RUVs with replicate samples, k=', k)) 148 plot_pca_rle(set, paste0("RUVs with replicate samples, k=", k))
147 plot_factors_of_unwanted_variation(set, method="RUVs using replicates", k=k) 149 plot_factors_of_unwanted_var(set, method = "RUVs using replicates", k = k)
148 return(set) 150 return(set)
149 }
150
151 get_differentially_expressed_genes <- function(dds, contrast, alpha=0.01) {
152 r <- results(dds, contrast=contrast, alpha=alpha)
153 return(rownames(r[which(r$padj < alpha),]))
154 } 151 }
155 152
156 opt <- setup_cmdline_options() 153 opt <- setup_cmdline_options()
157 alpha <- opt$alpha 154 alpha <- opt$alpha
158 min_k <- opt$min_k 155 min_k <- opt$min_k
160 min_c <- opt$min_mean_count 157 min_c <- opt$min_mean_count
161 sample_json <- fromJSON(opt$sample_json) 158 sample_json <- fromJSON(opt$sample_json)
162 sample_paths <- sample_json$path 159 sample_paths <- sample_json$path
163 sample_names <- sample_json$label 160 sample_names <- sample_json$label
164 condition <- as.factor(sample_json$condition) 161 condition <- as.factor(sample_json$condition)
165 sampleTable <- data.frame(samplename=sample_names, 162 sample_table <- data.frame(samplename = sample_names, filename = sample_paths, condition = condition)
166 filename = sample_paths, 163 rownames(sample_table) <- sample_names
167 condition=condition)
168 rownames(sampleTable) <- sample_names
169 164
170 dds <- get_deseq_dataset(sampleTable, header=opt$header, designFormula= ~ condition, tximport=opt$txtype, txtype=opt$txtype, tx2gene=opt$tx2gene) 165 dds <- get_deseq_dataset(sample_table, header = opt$header, design_formula = ~ condition, tximport = opt$txtype, txtype = opt$txtype, tx2gene = opt$tx2gene)
171 if (!is.null(opt$plots)) { 166 if (!is.null(opt$plots)) {
172 pdf(opt$plots) 167 pdf(opt$plots)
173 } 168 }
174 169
175 # Run through the ruvseq variants 170 # Run through the ruvseq variants
176 set <- create_seq_expression_set(dds, min_mean_count = min_c) 171 set <- create_seq_expression_set(dds, min_mean_count = min_c)
177 result <- list(no_correction = set) 172 result <- list(no_correction = set)
178 for (k in seq(min_k, max_k)) { 173 for (k in seq(min_k, max_k)) {
179 result[[paste0('residual_method_k', k)]] <- ruv_residual_method(set, k=k) 174 result[[paste0("residual_method_k", k)]] <- ruv_residual_method(set, k = k)
180 result[[paste0('replicate_method_k', k)]] <- ruv_replicate_method(set, k=k) 175 result[[paste0("replicate_method_k", k)]] <- ruv_replicate_method(set, k = k)
181 result[[paste0('control_method_k', k)]] <- ruv_control_gene_method(set, k=k, cutoff_p=0.5) 176 result[[paste0("control_method_k", k)]] <- ruv_control_gene_method(set, k = k, cutoff_p = 0.5)
182 } 177 }
183 178
184 for (name in names(result)) { 179 for (name in names(result)) {
185 if (!startsWith(name, "no_correction")) { 180 if (!startsWith(name, "no_correction")) {
186 set <- result[[name]] 181 set <- result[[name]]
187 unwanted_variation <- pData(set) 182 unwanted_variation <- pData(set)
188 df <- data.frame(identifier = rownames(unwanted_variation)) 183 df <- data.frame(identifier = rownames(unwanted_variation))
189 df <- cbind(df, unwanted_variation) 184 df <- cbind(df, unwanted_variation)
190 colnames(df)[2] <- 'condition' 185 colnames(df)[2] <- "condition"
191 write.table(df, file=paste0("batch_effects_", name, ".tabular"), sep="\t", quote=F, row.names=F) 186 write.table(df, file = paste0("batch_effects_", name, ".tabular"), sep = "\t", quote = F, row.names = F)
192 } 187 }
193 } 188 }
194 189
195 # close the plot device 190 # close the plot device
196 if (!is.null(opt$plots)) { 191 if (!is.null(opt$plots)) {