comparison analyseStudy.Rmd @ 0:e1cb2e012307 draft default tip

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/dewseq commit 71db0e65b3b306904ae2b17ce3de677244aea776"
author rnateam
date Thu, 20 Oct 2022 08:18:30 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e1cb2e012307
1 ---
2 title: "DEWSeq Analysis"
3 author: "Thomas Schwarzl and Sudeep Sahadevan"
4 date: "07/10/2020"
5 output:
6 BiocStyle::html_document:
7 toc: true
8 toc_float: true
9 toc_depth: 5
10 params:
11 protein: ""
12 sampleinfo_file: ""
13 countmatrix_file: ""
14 annotation_file: ""
15 output_windows_file: ""
16 output_regions_file: ""
17 output_bed_file: ""
18 output_rdata: ""
19 min_count: 2
20 min_sample: 2
21 LRT: FALSE
22 p_value_cutoff: 0.05
23 lfc_cutoff: 1
24 overlap_correction: FALSE
25 IHW: TRUE
26 decide_fit: TRUE
27 ---
28 ```{r setup, include=FALSE}
29 knitr::opts_chunk$set(echo = TRUE)
30 ```
31
32
33 ```{r assignData, echo=FALSE, eval=TRUE}
34 protein <- params$protein
35 sampleinfo_file <- params$sampleinfo_file
36 countmatrix_file <- params$countmatrix_file
37 annotation_file <- params$annotation_file
38 output_windows_file <- params$output_windows_file
39 output_regions_file <- params$output_regions_file
40 output_bed_file <- params$output_bed_file
41 output_rdata <- params$output_rdata
42 min_count <- params$min_count
43 min_sample <- params$min_sample
44 p_value_cutoff <- params$p_value_cutoff
45 lfc_cutoff <- params$lfc_cutoff
46 ```
47
48 Sanity check input parameter values
49
50 ```{r sanityCheck, echo=FALSE, eval=TRUE}
51 # round of min_count and min_sample values first
52 message("Any decimals given as values for min_count and min_sample parameters will be rounded off to the nearest integer.")
53 min_count <- round(min_count)
54 min_sample <- round(min_sample)
55 # receive and sanity check p_value_cutoff
56 if (p_value_cutoff <= 0 || p_value_cutoff >= 1) {
57 warning("p_value_cutoff must satisfy: 0<=p_value_cutoff<=1. Resetting to default value: 0.05")
58 p_value_cutoff <- 0.05
59 }
60 # sanity check log2Foldchange cutoff
61 if (lfc_cutoff < 0) {
62 warning("lfc_cutoff must be a value >=0. Resetting to default value: 1.00")
63 lfc_cutoff <- 1.0
64 }
65 # sanity check LRT vs Wald
66 if (is(params$LRT, "logical")) {
67 lrt <- params$LRT
68 }else {
69 warning("LRT must be TRUE or FALSE, setting this parameter to default: FALSE")
70 lrt <- FALSE
71 }
72 # sanity check overlap correction parameter
73 if (is(params$overlap_correction, "logical")) {
74 overlap_correction <- params$overlap_correction
75 }else {
76 warning("overlap_correction must be TRUE or FALSE, setting this parameter to default: TRUE")
77 overlap_correction <- TRUE
78 }
79 # sanity check IHW vs BH correction parameter
80 if (is(params$IHW, "logical")) {
81 ihw_filt <- params$IHW
82 }else {
83 warning("IHW must be TRUE or FALSE, setting this parameter to default: TRUE")
84 ihw_filt <- TRUE
85 }
86 # sanity check automated fit vs parametric fit paramter
87 if (is(params$decide_fit, "logical")) {
88 decide_fit <- params$decide_fit
89 }else {
90 warning("decide_fit must be TRUE or FALSE, setting this parameter to default: TRUE")
91 decide_fit <- TRUE
92 }
93 ```
94
95 # eCLIP analysis of `r protein`
96
97
98 ## Setup
99
100 This is the analysis of __`r protein`__ with
101 sampleinfo file: ``r sampleinfo_file``
102 countmatrix file: ``r countmatrix_file`` and
103 annotation file: ``r annotation_file``
104
105 with the following threshold:
106
107 minimum read count per window per sample: ``r min_count``
108 number of samples with minimum read count per window: ``r min_sample``
109
110 using the following parameters:
111 p-value cut-off: ``r p_value_cutoff``
112 Log2FoldChange cut-off: ``r lfc_cutoff``
113 use automated method for dispersion estmation: ``r decide_fit``
114 use LRT test : ``r lrt``
115 use overlap correction: ``r overlap_correction``
116 use IHW for FDR correction: ``r ihw_filt``
117
118
119 ``` {r check if files exists, echo=FALSE}
120 file_exists <- function(x) {
121 if (!file.exists(x))
122 stop(paste0("'", x, "' - file does not exist."))
123 }
124
125 file_exists(countmatrix_file)
126 file_exists(annotation_file)
127 file_exists(sampleinfo_file)
128 ```
129
130 First, we load the libraries.
131
132 ```{r load DEWSeq}
133 required_packages <- c("DEWSeq", "data.table", "IHW", "R.utils", "tidyverse", "ggrepel")
134 installed_packages <- installed.packages()[, 1]
135 missing_packages <- setdiff(required_packages, installed_packages)
136 if (length(missing_packages) != 0) {
137 stop("Found missing dependencies! Please install the following package(s): ", paste(missing_packages, collapse = ", "))
138 }
139 suppressPackageStartupMessages({
140 require(DEWSeq)
141 require(tidyverse)
142 require(data.table)
143 require(IHW)
144 require(R.utils)
145 require(ggrepel)
146 })
147 ```
148
149 ## Read in data
150
151 Here we read in the window counts
152
153 ```{r read window counts}
154 window_counts <- fread(countmatrix_file, sep = "\t", stringsAsFactors = FALSE) %>% as.data.frame()
155 rownames(window_counts) <- window_counts[, 1]
156 window_counts <- window_counts[, -1]
157 ```
158
159 and the sample info file
160
161 ```{r read sample info}
162 sample_info <- read.table(sampleinfo_file, sep = "\t", stringsAsFactors = FALSE)
163 if (ncol(sample_info) < 2) {
164 stop("sampleinfo_file ", sampleinfo_file, " MUST have atleast two columns: first column should be the sample names used in ", countmatrix_file,
165 " and second column must be the experiment type: IP or SMI")
166 }else if (ncol(sample_info) > 2) {
167 message("Found ", ncol(sample_info), " columns in ", sampleinfo_file, " using the first column as sample name and second column as experiment name")
168 sample_info <- sample_info[, c(1, 2)]
169 }
170 colnames(sample_info) <- c("samples", "type")
171 rownames(sample_info) <- sample_info[, 1]
172 ```
173
174 Now we make sure that the sampleinfo file contains the column "type" with values "SMI" and "IP" only.
175
176 ```{r sampleSanity}
177 # make sure that sample_info rows and window_counts columns are in same order
178 common_samples <- sort(intersect(colnames(window_counts), rownames(sample_info)))
179 if (length(common_samples) != ncol(window_counts)) {
180 stop("The number of samples in ", countmatrix_file, " and ", sampleinfo_file, " do not MATCH!")
181 }
182 sample_info <- sample_info[common_samples, ]
183 window_counts <- window_counts[, common_samples]
184 # Now make sure that sample_info$type contains only "IP" and "SMI"
185 type_check <- setdiff(unique(sample_info$type), c("IP", "SMI"))
186 if (length(type_check) != 0) {
187 stop("The second column in ", sampleinfo_file, " should contain analysis types:'IP' or 'SMI' only. Found unknown value(s): ",
188 paste(type_check, collapse = ", "))
189 }
190 ```
191
192 We make sure that only IP and SMI are in the right factor level order
193
194 ```{r sampleFactor}
195 sample_info <- sample_info %>% mutate(type = factor(type, levels = c("SMI", "IP")))
196 ```
197
198
199
200 We create the DEWSeq object
201
202 ```{r dewseqInit}
203 ddw <- DESeqDataSetFromSlidingWindows(countData = window_counts,
204 colData = sample_info,
205 annotObj = annotation_file,
206 tidy = FALSE,
207 design = ~type)
208 ```
209
210
211 ## Prefiltering
212
213 ```{r prefiltering1}
214 # remove all empty windows
215 keep <- rowSums(counts(ddw)) >= 1
216 ddw <- ddw[keep, ]
217 ```
218
219
220 ## Estimating size factors
221
222
223 ```{r size factors}
224 ddw <- estimateSizeFactors(ddw)
225 sizeFactors(ddw)
226 ```
227
228 ### estimate size factors for only protein_coding genes
229
230 ```{r protein_coding_size_factors}
231 ddw_mrnas <- ddw[rowData(ddw)[, "gene_type"] == "protein_coding", ]
232 ddw_mrnas <- estimateSizeFactors(ddw_mrnas)
233 ```
234
235 ### estimate size factors without significant windows
236
237 ```{r size_factors_no_sig_windows}
238 ddw_tmp <- ddw
239 ddw_tmp <- estimateDispersions(ddw_tmp, fitType = "local", quiet = TRUE)
240 if (lrt) {
241 ddw_tmp <- nbinomLRT(ddw_tmp, full = ~type, reduced = ~1)
242 }else {
243 ddw_tmp <- nbinomWaldTest(ddw_tmp)
244 }
245
246 tmp_significant_windows <-
247 results(ddw_tmp,
248 contrast = c("type", "IP", "SMI"),
249 tidy = TRUE,
250 filterFun = ihw) %>%
251 dplyr::filter(padj < p_value_cutoff) %>%
252 .[["row"]]
253 rm(ddw_tmp)
254 ```
255
256 estimate the size factors without the significant windows.
257
258 ```{r final_size_factors}
259 ddw_mrnas <- ddw_mrnas[!rownames(ddw_mrnas) %in% tmp_significant_windows, ]
260 ddw_mrnas <- estimateSizeFactors(ddw_mrnas)
261 ```
262
263 before thresholding:
264 ```{r threshold1}
265 dim(ddw)
266 ```
267
268 Now threshold the windows read count table.
269 ```{r threshold2}
270 keep_exp <- which(rowSums(counts(ddw) > min_count) >= min_sample)
271 ddw <- ddw[keep_exp, ]
272 ```
273
274 after thresholding:
275 ```{r threshold3}
276 dim(ddw)
277 ```
278 assign size factors
279
280 ```{r final_assign}
281 sizeFactors(ddw) <- sizeFactors(ddw_mrnas)
282 rm(list = c("tmp_significant_windows", "ddw_mrnas"))
283 sizeFactors(ddw)
284 ```
285
286
287 ## Differential window analysis
288
289 ### Dispersion estimates
290
291 ```{r source, echo = FALSE, eval = FALSE}
292 # source: https://support.bioconductor.org/p/81094/
293 ```
294 We fit parametric and local fit, and decide the best fit following this [Bioconductor post](https://support.bioconductor.org/p/81094/)
295
296 ```{r parametric_dispersion}
297 parametric_ddw <- estimateDispersions(ddw, fitType = "parametric")
298 if (decide_fit) {
299 local_ddw <- estimateDispersions(ddw, fitType = "local")
300 }
301
302 ```
303
304 This is the dispersion estimate for parametric fit
305 ```{r plot parametric fit, fig.wide=TRUE}
306 plotDispEsts(parametric_ddw, main = "Parametric fit")
307 ```
308
309 This is the dispersion estimate for local fit, given automated decision fitting is enabled:
310 ```{r plot local fit, fig.wide = TRUE}
311 if (decide_fit) {
312 plotDispEsts(local_ddw, main = "Local fit")
313 }
314
315 ```
316
317 This will get the residuals for either fit, only for automated decision fitting
318 ```{r residual fit}
319 parametric_resid <- na.omit(with(mcols(parametric_ddw), abs(log(dispGeneEst) - log(dispFit))))
320 if (decide_fit) {
321 local_resid <- na.omit(with(mcols(local_ddw), abs(log(dispGeneEst) - log(dispFit))))
322 resid_df <- data.frame(residuals = c(parametric_resid, local_resid),
323 fitType = c(rep("parametric", length(parametric_resid)),
324 rep("local", length(local_resid))))
325 summary(resid_df)
326 }
327
328 ```
329
330 and we plot histograms of the fits
331
332 ```{r plot residual histograms, fig.wide = TRUE}
333 if (decide_fit) {
334 ggplot(resid_df, aes(x = residuals, fill = fitType)) +
335 scale_fill_manual(values = c("darkred", "darkblue")) + geom_histogram(alpha = 0.5, position = "identity", bins = 100) +
336 theme_bw()
337 }
338
339 ```
340
341 Now, we will decide for the better fit based on median
342
343 ```{r choose_fit}
344 summary(parametric_resid)
345 if (decide_fit) {
346 summary(local_resid)
347 if (median(local_resid) <= median(parametric_resid)) {
348 cat("chosen fitType: local")
349 ddw <- local_ddw
350 }else {
351 cat("chosen fitType: parametric")
352 ddw <- parametric_ddw
353 }
354 rm(local_ddw, parametric_ddw, resid_df, parametric_resid, local_resid)
355 }else {
356 ddw <- parametric_ddw
357 rm(parametric_ddw)
358 }
359
360 ```
361
362
363
364 ### Wald test or LRT
365
366 ```{r wald_or_LRT}
367 if (lrt) {
368 ddw <- nbinomLRT(ddw, full = ~type, reduced = ~1)
369 }else {
370 ddw <- nbinomWaldTest(ddw)
371 }
372
373 ```
374
375
376 ### Significance testing
377
378 ```{r sig_windows}
379 result_windows <- resultsDEWSeq(ddw,
380 contrast = c("type", "IP", "SMI"),
381 tidy = TRUE) %>% as_tibble
382
383 result_windows
384 ```
385
386
387 ### Multiple hypothesis correction with IHW
388
389 You might be interested to correct for multiple hypothesis testing with IHW.
390
391 Decide on overlap correction based on the parameter `overlap_correction`
392
393 ```{r ihw}
394 if (overlap_correction && ihw_filt) {
395 result_windows[, "p_adj_IHW"] <- adj_pvalues(ihw(pSlidingWindows ~ baseMean,
396 data = result_windows,
397 alpha = p_value_cutoff,
398 nfolds = 10))
399 padj_col <- "p_adj_IHW"
400 }else if (!overlap_correction && ihw_filt) {
401 result_windows[, "p_adj_IHW"] <- adj_pvalues(ihw(pvalue ~ baseMean,
402 data = result_windows,
403 alpha = p_value_cutoff,
404 nfolds = 10))
405 padj_col <- "p_adj_IHW"
406 }else if (overlap_correction && !ihw_filt) {
407 padj_col <- "pSlidingWindows.adj"
408 }else {
409 result_windows[, "p_adj"] <- p.adjust(result_windows$pvalue, method = "BH")
410 padj_col <- "p_adj"
411 }
412
413 ```
414
415 Determine significant windows
416
417 ```{r filter_sig_windows}
418 result_windows <- result_windows %>%
419 mutate(significant = result_windows[, padj_col] < p_value_cutoff)
420 sig_windows <- sum(result_windows$significant)
421 ```
422
423
424 `r sig_windows` windows are significant
425
426
427 ### Combining windows
428
429 ```{r, reg1, message=FALSE, eval=TRUE, include=FALSE}
430 if (sig_windows > 0) {
431 result_regions <- extractRegions(windowRes = result_windows, padjCol = padj_col, padjThresh = p_value_cutoff, log2FoldChangeThresh = lfc_cutoff) %>% as_tibble
432 }else {
433 message("Cannot find significant windows in this dataset. Try lowering the p-value and log2FoldChange thresholds!")
434 }
435 ```
436
437 ```{r extractRegion, eval=FALSE}
438 if (sig_windows > 0) {
439 result_regions <- extractRegions(windowRes = result_windows, padjCol = padj_col, padjThresh = p_value_cutoff, log2FoldChangeThresh = lfc_cutoff) %>% as_tibble
440 }
441 ```
442
443
444 ### Writing Bed file
445
446 ```{r writing bed file}
447 if (sig_windows > 1) {
448 toBED(windowRes = result_windows, regionRes = result_regions, padjThresh = p_value_cutoff, padjCol = padj_col, fileName = output_bed_file)
449 }else {
450 message("This analysis does not have enough <=1 significant windows to create BED file for visualization")
451 }
452 ```
453
454
455 ## Save Session
456 ```{r save_data}
457 # save enriched windows, gzip results file if the file suffix is .gz
458 if (grepl("\\.gz$", output_windows_file, ignore.case = TRUE)) {
459 gz_out <- gzfile(output_windows_file, "w")
460 write.table(result_windows, file = gz_out, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
461 close(gz_out)
462 }else {
463 write.table(result_windows, file = output_windows_file, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
464 }
465 # save enriched regions
466 if (sig_windows > 0) {
467 if (grepl("\\.gz$", output_regions_file, ignore.case = TRUE)) {
468 gz_out <- gzfile(output_regions_file, "w")
469 write.table(result_regions, file = gz_out, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
470 close(gz_out)
471 }else {
472 write.table(result_regions, file = output_regions_file, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
473 }
474 }
475 # save session
476 # Warning! session images can be heavy!
477 if (nchar(output_rdata) > 5) {
478 save.image(file = output_rdata)
479 }
480 ```
481
482 ## Session Info
483
484 ```{r session info}
485 sessionInfo()
486 ```