Mercurial > repos > rnateam > dewseq
comparison analyseStudy.Rmd @ 0:e1cb2e012307 draft default tip
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/dewseq commit 71db0e65b3b306904ae2b17ce3de677244aea776"
author | rnateam |
---|---|
date | Thu, 20 Oct 2022 08:18:30 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:e1cb2e012307 |
---|---|
1 --- | |
2 title: "DEWSeq Analysis" | |
3 author: "Thomas Schwarzl and Sudeep Sahadevan" | |
4 date: "07/10/2020" | |
5 output: | |
6 BiocStyle::html_document: | |
7 toc: true | |
8 toc_float: true | |
9 toc_depth: 5 | |
10 params: | |
11 protein: "" | |
12 sampleinfo_file: "" | |
13 countmatrix_file: "" | |
14 annotation_file: "" | |
15 output_windows_file: "" | |
16 output_regions_file: "" | |
17 output_bed_file: "" | |
18 output_rdata: "" | |
19 min_count: 2 | |
20 min_sample: 2 | |
21 LRT: FALSE | |
22 p_value_cutoff: 0.05 | |
23 lfc_cutoff: 1 | |
24 overlap_correction: FALSE | |
25 IHW: TRUE | |
26 decide_fit: TRUE | |
27 --- | |
28 ```{r setup, include=FALSE} | |
29 knitr::opts_chunk$set(echo = TRUE) | |
30 ``` | |
31 | |
32 | |
33 ```{r assignData, echo=FALSE, eval=TRUE} | |
34 protein <- params$protein | |
35 sampleinfo_file <- params$sampleinfo_file | |
36 countmatrix_file <- params$countmatrix_file | |
37 annotation_file <- params$annotation_file | |
38 output_windows_file <- params$output_windows_file | |
39 output_regions_file <- params$output_regions_file | |
40 output_bed_file <- params$output_bed_file | |
41 output_rdata <- params$output_rdata | |
42 min_count <- params$min_count | |
43 min_sample <- params$min_sample | |
44 p_value_cutoff <- params$p_value_cutoff | |
45 lfc_cutoff <- params$lfc_cutoff | |
46 ``` | |
47 | |
48 Sanity check input parameter values | |
49 | |
50 ```{r sanityCheck, echo=FALSE, eval=TRUE} | |
51 # round of min_count and min_sample values first | |
52 message("Any decimals given as values for min_count and min_sample parameters will be rounded off to the nearest integer.") | |
53 min_count <- round(min_count) | |
54 min_sample <- round(min_sample) | |
55 # receive and sanity check p_value_cutoff | |
56 if (p_value_cutoff <= 0 || p_value_cutoff >= 1) { | |
57 warning("p_value_cutoff must satisfy: 0<=p_value_cutoff<=1. Resetting to default value: 0.05") | |
58 p_value_cutoff <- 0.05 | |
59 } | |
60 # sanity check log2Foldchange cutoff | |
61 if (lfc_cutoff < 0) { | |
62 warning("lfc_cutoff must be a value >=0. Resetting to default value: 1.00") | |
63 lfc_cutoff <- 1.0 | |
64 } | |
65 # sanity check LRT vs Wald | |
66 if (is(params$LRT, "logical")) { | |
67 lrt <- params$LRT | |
68 }else { | |
69 warning("LRT must be TRUE or FALSE, setting this parameter to default: FALSE") | |
70 lrt <- FALSE | |
71 } | |
72 # sanity check overlap correction parameter | |
73 if (is(params$overlap_correction, "logical")) { | |
74 overlap_correction <- params$overlap_correction | |
75 }else { | |
76 warning("overlap_correction must be TRUE or FALSE, setting this parameter to default: TRUE") | |
77 overlap_correction <- TRUE | |
78 } | |
79 # sanity check IHW vs BH correction parameter | |
80 if (is(params$IHW, "logical")) { | |
81 ihw_filt <- params$IHW | |
82 }else { | |
83 warning("IHW must be TRUE or FALSE, setting this parameter to default: TRUE") | |
84 ihw_filt <- TRUE | |
85 } | |
86 # sanity check automated fit vs parametric fit paramter | |
87 if (is(params$decide_fit, "logical")) { | |
88 decide_fit <- params$decide_fit | |
89 }else { | |
90 warning("decide_fit must be TRUE or FALSE, setting this parameter to default: TRUE") | |
91 decide_fit <- TRUE | |
92 } | |
93 ``` | |
94 | |
95 # eCLIP analysis of `r protein` | |
96 | |
97 | |
98 ## Setup | |
99 | |
100 This is the analysis of __`r protein`__ with | |
101 sampleinfo file: ``r sampleinfo_file`` | |
102 countmatrix file: ``r countmatrix_file`` and | |
103 annotation file: ``r annotation_file`` | |
104 | |
105 with the following threshold: | |
106 | |
107 minimum read count per window per sample: ``r min_count`` | |
108 number of samples with minimum read count per window: ``r min_sample`` | |
109 | |
110 using the following parameters: | |
111 p-value cut-off: ``r p_value_cutoff`` | |
112 Log2FoldChange cut-off: ``r lfc_cutoff`` | |
113 use automated method for dispersion estmation: ``r decide_fit`` | |
114 use LRT test : ``r lrt`` | |
115 use overlap correction: ``r overlap_correction`` | |
116 use IHW for FDR correction: ``r ihw_filt`` | |
117 | |
118 | |
119 ``` {r check if files exists, echo=FALSE} | |
120 file_exists <- function(x) { | |
121 if (!file.exists(x)) | |
122 stop(paste0("'", x, "' - file does not exist.")) | |
123 } | |
124 | |
125 file_exists(countmatrix_file) | |
126 file_exists(annotation_file) | |
127 file_exists(sampleinfo_file) | |
128 ``` | |
129 | |
130 First, we load the libraries. | |
131 | |
132 ```{r load DEWSeq} | |
133 required_packages <- c("DEWSeq", "data.table", "IHW", "R.utils", "tidyverse", "ggrepel") | |
134 installed_packages <- installed.packages()[, 1] | |
135 missing_packages <- setdiff(required_packages, installed_packages) | |
136 if (length(missing_packages) != 0) { | |
137 stop("Found missing dependencies! Please install the following package(s): ", paste(missing_packages, collapse = ", ")) | |
138 } | |
139 suppressPackageStartupMessages({ | |
140 require(DEWSeq) | |
141 require(tidyverse) | |
142 require(data.table) | |
143 require(IHW) | |
144 require(R.utils) | |
145 require(ggrepel) | |
146 }) | |
147 ``` | |
148 | |
149 ## Read in data | |
150 | |
151 Here we read in the window counts | |
152 | |
153 ```{r read window counts} | |
154 window_counts <- fread(countmatrix_file, sep = "\t", stringsAsFactors = FALSE) %>% as.data.frame() | |
155 rownames(window_counts) <- window_counts[, 1] | |
156 window_counts <- window_counts[, -1] | |
157 ``` | |
158 | |
159 and the sample info file | |
160 | |
161 ```{r read sample info} | |
162 sample_info <- read.table(sampleinfo_file, sep = "\t", stringsAsFactors = FALSE) | |
163 if (ncol(sample_info) < 2) { | |
164 stop("sampleinfo_file ", sampleinfo_file, " MUST have atleast two columns: first column should be the sample names used in ", countmatrix_file, | |
165 " and second column must be the experiment type: IP or SMI") | |
166 }else if (ncol(sample_info) > 2) { | |
167 message("Found ", ncol(sample_info), " columns in ", sampleinfo_file, " using the first column as sample name and second column as experiment name") | |
168 sample_info <- sample_info[, c(1, 2)] | |
169 } | |
170 colnames(sample_info) <- c("samples", "type") | |
171 rownames(sample_info) <- sample_info[, 1] | |
172 ``` | |
173 | |
174 Now we make sure that the sampleinfo file contains the column "type" with values "SMI" and "IP" only. | |
175 | |
176 ```{r sampleSanity} | |
177 # make sure that sample_info rows and window_counts columns are in same order | |
178 common_samples <- sort(intersect(colnames(window_counts), rownames(sample_info))) | |
179 if (length(common_samples) != ncol(window_counts)) { | |
180 stop("The number of samples in ", countmatrix_file, " and ", sampleinfo_file, " do not MATCH!") | |
181 } | |
182 sample_info <- sample_info[common_samples, ] | |
183 window_counts <- window_counts[, common_samples] | |
184 # Now make sure that sample_info$type contains only "IP" and "SMI" | |
185 type_check <- setdiff(unique(sample_info$type), c("IP", "SMI")) | |
186 if (length(type_check) != 0) { | |
187 stop("The second column in ", sampleinfo_file, " should contain analysis types:'IP' or 'SMI' only. Found unknown value(s): ", | |
188 paste(type_check, collapse = ", ")) | |
189 } | |
190 ``` | |
191 | |
192 We make sure that only IP and SMI are in the right factor level order | |
193 | |
194 ```{r sampleFactor} | |
195 sample_info <- sample_info %>% mutate(type = factor(type, levels = c("SMI", "IP"))) | |
196 ``` | |
197 | |
198 | |
199 | |
200 We create the DEWSeq object | |
201 | |
202 ```{r dewseqInit} | |
203 ddw <- DESeqDataSetFromSlidingWindows(countData = window_counts, | |
204 colData = sample_info, | |
205 annotObj = annotation_file, | |
206 tidy = FALSE, | |
207 design = ~type) | |
208 ``` | |
209 | |
210 | |
211 ## Prefiltering | |
212 | |
213 ```{r prefiltering1} | |
214 # remove all empty windows | |
215 keep <- rowSums(counts(ddw)) >= 1 | |
216 ddw <- ddw[keep, ] | |
217 ``` | |
218 | |
219 | |
220 ## Estimating size factors | |
221 | |
222 | |
223 ```{r size factors} | |
224 ddw <- estimateSizeFactors(ddw) | |
225 sizeFactors(ddw) | |
226 ``` | |
227 | |
228 ### estimate size factors for only protein_coding genes | |
229 | |
230 ```{r protein_coding_size_factors} | |
231 ddw_mrnas <- ddw[rowData(ddw)[, "gene_type"] == "protein_coding", ] | |
232 ddw_mrnas <- estimateSizeFactors(ddw_mrnas) | |
233 ``` | |
234 | |
235 ### estimate size factors without significant windows | |
236 | |
237 ```{r size_factors_no_sig_windows} | |
238 ddw_tmp <- ddw | |
239 ddw_tmp <- estimateDispersions(ddw_tmp, fitType = "local", quiet = TRUE) | |
240 if (lrt) { | |
241 ddw_tmp <- nbinomLRT(ddw_tmp, full = ~type, reduced = ~1) | |
242 }else { | |
243 ddw_tmp <- nbinomWaldTest(ddw_tmp) | |
244 } | |
245 | |
246 tmp_significant_windows <- | |
247 results(ddw_tmp, | |
248 contrast = c("type", "IP", "SMI"), | |
249 tidy = TRUE, | |
250 filterFun = ihw) %>% | |
251 dplyr::filter(padj < p_value_cutoff) %>% | |
252 .[["row"]] | |
253 rm(ddw_tmp) | |
254 ``` | |
255 | |
256 estimate the size factors without the significant windows. | |
257 | |
258 ```{r final_size_factors} | |
259 ddw_mrnas <- ddw_mrnas[!rownames(ddw_mrnas) %in% tmp_significant_windows, ] | |
260 ddw_mrnas <- estimateSizeFactors(ddw_mrnas) | |
261 ``` | |
262 | |
263 before thresholding: | |
264 ```{r threshold1} | |
265 dim(ddw) | |
266 ``` | |
267 | |
268 Now threshold the windows read count table. | |
269 ```{r threshold2} | |
270 keep_exp <- which(rowSums(counts(ddw) > min_count) >= min_sample) | |
271 ddw <- ddw[keep_exp, ] | |
272 ``` | |
273 | |
274 after thresholding: | |
275 ```{r threshold3} | |
276 dim(ddw) | |
277 ``` | |
278 assign size factors | |
279 | |
280 ```{r final_assign} | |
281 sizeFactors(ddw) <- sizeFactors(ddw_mrnas) | |
282 rm(list = c("tmp_significant_windows", "ddw_mrnas")) | |
283 sizeFactors(ddw) | |
284 ``` | |
285 | |
286 | |
287 ## Differential window analysis | |
288 | |
289 ### Dispersion estimates | |
290 | |
291 ```{r source, echo = FALSE, eval = FALSE} | |
292 # source: https://support.bioconductor.org/p/81094/ | |
293 ``` | |
294 We fit parametric and local fit, and decide the best fit following this [Bioconductor post](https://support.bioconductor.org/p/81094/) | |
295 | |
296 ```{r parametric_dispersion} | |
297 parametric_ddw <- estimateDispersions(ddw, fitType = "parametric") | |
298 if (decide_fit) { | |
299 local_ddw <- estimateDispersions(ddw, fitType = "local") | |
300 } | |
301 | |
302 ``` | |
303 | |
304 This is the dispersion estimate for parametric fit | |
305 ```{r plot parametric fit, fig.wide=TRUE} | |
306 plotDispEsts(parametric_ddw, main = "Parametric fit") | |
307 ``` | |
308 | |
309 This is the dispersion estimate for local fit, given automated decision fitting is enabled: | |
310 ```{r plot local fit, fig.wide = TRUE} | |
311 if (decide_fit) { | |
312 plotDispEsts(local_ddw, main = "Local fit") | |
313 } | |
314 | |
315 ``` | |
316 | |
317 This will get the residuals for either fit, only for automated decision fitting | |
318 ```{r residual fit} | |
319 parametric_resid <- na.omit(with(mcols(parametric_ddw), abs(log(dispGeneEst) - log(dispFit)))) | |
320 if (decide_fit) { | |
321 local_resid <- na.omit(with(mcols(local_ddw), abs(log(dispGeneEst) - log(dispFit)))) | |
322 resid_df <- data.frame(residuals = c(parametric_resid, local_resid), | |
323 fitType = c(rep("parametric", length(parametric_resid)), | |
324 rep("local", length(local_resid)))) | |
325 summary(resid_df) | |
326 } | |
327 | |
328 ``` | |
329 | |
330 and we plot histograms of the fits | |
331 | |
332 ```{r plot residual histograms, fig.wide = TRUE} | |
333 if (decide_fit) { | |
334 ggplot(resid_df, aes(x = residuals, fill = fitType)) + | |
335 scale_fill_manual(values = c("darkred", "darkblue")) + geom_histogram(alpha = 0.5, position = "identity", bins = 100) + | |
336 theme_bw() | |
337 } | |
338 | |
339 ``` | |
340 | |
341 Now, we will decide for the better fit based on median | |
342 | |
343 ```{r choose_fit} | |
344 summary(parametric_resid) | |
345 if (decide_fit) { | |
346 summary(local_resid) | |
347 if (median(local_resid) <= median(parametric_resid)) { | |
348 cat("chosen fitType: local") | |
349 ddw <- local_ddw | |
350 }else { | |
351 cat("chosen fitType: parametric") | |
352 ddw <- parametric_ddw | |
353 } | |
354 rm(local_ddw, parametric_ddw, resid_df, parametric_resid, local_resid) | |
355 }else { | |
356 ddw <- parametric_ddw | |
357 rm(parametric_ddw) | |
358 } | |
359 | |
360 ``` | |
361 | |
362 | |
363 | |
364 ### Wald test or LRT | |
365 | |
366 ```{r wald_or_LRT} | |
367 if (lrt) { | |
368 ddw <- nbinomLRT(ddw, full = ~type, reduced = ~1) | |
369 }else { | |
370 ddw <- nbinomWaldTest(ddw) | |
371 } | |
372 | |
373 ``` | |
374 | |
375 | |
376 ### Significance testing | |
377 | |
378 ```{r sig_windows} | |
379 result_windows <- resultsDEWSeq(ddw, | |
380 contrast = c("type", "IP", "SMI"), | |
381 tidy = TRUE) %>% as_tibble | |
382 | |
383 result_windows | |
384 ``` | |
385 | |
386 | |
387 ### Multiple hypothesis correction with IHW | |
388 | |
389 You might be interested to correct for multiple hypothesis testing with IHW. | |
390 | |
391 Decide on overlap correction based on the parameter `overlap_correction` | |
392 | |
393 ```{r ihw} | |
394 if (overlap_correction && ihw_filt) { | |
395 result_windows[, "p_adj_IHW"] <- adj_pvalues(ihw(pSlidingWindows ~ baseMean, | |
396 data = result_windows, | |
397 alpha = p_value_cutoff, | |
398 nfolds = 10)) | |
399 padj_col <- "p_adj_IHW" | |
400 }else if (!overlap_correction && ihw_filt) { | |
401 result_windows[, "p_adj_IHW"] <- adj_pvalues(ihw(pvalue ~ baseMean, | |
402 data = result_windows, | |
403 alpha = p_value_cutoff, | |
404 nfolds = 10)) | |
405 padj_col <- "p_adj_IHW" | |
406 }else if (overlap_correction && !ihw_filt) { | |
407 padj_col <- "pSlidingWindows.adj" | |
408 }else { | |
409 result_windows[, "p_adj"] <- p.adjust(result_windows$pvalue, method = "BH") | |
410 padj_col <- "p_adj" | |
411 } | |
412 | |
413 ``` | |
414 | |
415 Determine significant windows | |
416 | |
417 ```{r filter_sig_windows} | |
418 result_windows <- result_windows %>% | |
419 mutate(significant = result_windows[, padj_col] < p_value_cutoff) | |
420 sig_windows <- sum(result_windows$significant) | |
421 ``` | |
422 | |
423 | |
424 `r sig_windows` windows are significant | |
425 | |
426 | |
427 ### Combining windows | |
428 | |
429 ```{r, reg1, message=FALSE, eval=TRUE, include=FALSE} | |
430 if (sig_windows > 0) { | |
431 result_regions <- extractRegions(windowRes = result_windows, padjCol = padj_col, padjThresh = p_value_cutoff, log2FoldChangeThresh = lfc_cutoff) %>% as_tibble | |
432 }else { | |
433 message("Cannot find significant windows in this dataset. Try lowering the p-value and log2FoldChange thresholds!") | |
434 } | |
435 ``` | |
436 | |
437 ```{r extractRegion, eval=FALSE} | |
438 if (sig_windows > 0) { | |
439 result_regions <- extractRegions(windowRes = result_windows, padjCol = padj_col, padjThresh = p_value_cutoff, log2FoldChangeThresh = lfc_cutoff) %>% as_tibble | |
440 } | |
441 ``` | |
442 | |
443 | |
444 ### Writing Bed file | |
445 | |
446 ```{r writing bed file} | |
447 if (sig_windows > 1) { | |
448 toBED(windowRes = result_windows, regionRes = result_regions, padjThresh = p_value_cutoff, padjCol = padj_col, fileName = output_bed_file) | |
449 }else { | |
450 message("This analysis does not have enough <=1 significant windows to create BED file for visualization") | |
451 } | |
452 ``` | |
453 | |
454 | |
455 ## Save Session | |
456 ```{r save_data} | |
457 # save enriched windows, gzip results file if the file suffix is .gz | |
458 if (grepl("\\.gz$", output_windows_file, ignore.case = TRUE)) { | |
459 gz_out <- gzfile(output_windows_file, "w") | |
460 write.table(result_windows, file = gz_out, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE) | |
461 close(gz_out) | |
462 }else { | |
463 write.table(result_windows, file = output_windows_file, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE) | |
464 } | |
465 # save enriched regions | |
466 if (sig_windows > 0) { | |
467 if (grepl("\\.gz$", output_regions_file, ignore.case = TRUE)) { | |
468 gz_out <- gzfile(output_regions_file, "w") | |
469 write.table(result_regions, file = gz_out, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE) | |
470 close(gz_out) | |
471 }else { | |
472 write.table(result_regions, file = output_regions_file, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE) | |
473 } | |
474 } | |
475 # save session | |
476 # Warning! session images can be heavy! | |
477 if (nchar(output_rdata) > 5) { | |
478 save.image(file = output_rdata) | |
479 } | |
480 ``` | |
481 | |
482 ## Session Info | |
483 | |
484 ```{r session info} | |
485 sessionInfo() | |
486 ``` |