Mercurial > repos > rnateam > dewseq

diff analyseStudy.Rmd @ 0:e1cb2e012307 draft default tip
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/dewseq commit 71db0e65b3b306904ae2b17ce3de677244aea776"
author: rnateam
date: Thu, 20 Oct 2022 08:18:30 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/analyseStudy.Rmd	Thu Oct 20 08:18:30 2022 +0000
@@ -0,0 +1,486 @@
+---
+title: "DEWSeq Analysis"
+author: "Thomas Schwarzl and Sudeep Sahadevan"
+date: "07/10/2020"
+output:
+  BiocStyle::html_document:
+    toc: true
+    toc_float: true
+    toc_depth: 5
+params:
+  protein: ""
+  sampleinfo_file: ""
+  countmatrix_file: ""
+  annotation_file: ""
+  output_windows_file: ""
+  output_regions_file: ""
+  output_bed_file: ""
+  output_rdata: ""
+  min_count: 2
+  min_sample: 2
+  LRT: FALSE
+  p_value_cutoff: 0.05
+  lfc_cutoff: 1
+  overlap_correction: FALSE
+  IHW: TRUE
+  decide_fit: TRUE
+---
+```{r setup,  include=FALSE}
+knitr::opts_chunk$set(echo = TRUE)
+```
+
+
+```{r assignData,  echo=FALSE,  eval=TRUE}
+protein <- params$protein
+sampleinfo_file <- params$sampleinfo_file
+countmatrix_file <- params$countmatrix_file
+annotation_file <- params$annotation_file
+output_windows_file <- params$output_windows_file
+output_regions_file <- params$output_regions_file
+output_bed_file <- params$output_bed_file
+output_rdata <- params$output_rdata
+min_count <- params$min_count
+min_sample <- params$min_sample
+p_value_cutoff <- params$p_value_cutoff
+lfc_cutoff <- params$lfc_cutoff
+```
+
+Sanity check input parameter values
+
+```{r sanityCheck,  echo=FALSE,  eval=TRUE}
+# round of min_count and min_sample values first
+message("Any decimals given as values for min_count and min_sample parameters will be rounded off to the nearest integer.")
+min_count <- round(min_count)
+min_sample <- round(min_sample)
+# receive and sanity check p_value_cutoff
+if (p_value_cutoff <= 0 || p_value_cutoff >= 1) {
+  warning("p_value_cutoff must satisfy: 0<=p_value_cutoff<=1. Resetting to default value: 0.05")
+  p_value_cutoff <- 0.05
+}
+# sanity check log2Foldchange cutoff
+if (lfc_cutoff < 0) {
+  warning("lfc_cutoff must be a value >=0. Resetting to default value: 1.00")
+  lfc_cutoff <- 1.0
+}
+# sanity check LRT vs Wald
+if (is(params$LRT,  "logical")) {
+  lrt <- params$LRT
+}else {
+  warning("LRT must be TRUE or FALSE,   setting this parameter to default: FALSE")
+  lrt <- FALSE
+}
+# sanity check overlap correction parameter
+if (is(params$overlap_correction,  "logical")) {
+  overlap_correction <- params$overlap_correction
+}else {
+  warning("overlap_correction must be TRUE or FALSE,  setting this parameter to default: TRUE")
+  overlap_correction <- TRUE
+}
+# sanity check IHW vs BH correction parameter
+if (is(params$IHW, "logical")) {
+  ihw_filt <- params$IHW
+}else {
+  warning("IHW must be TRUE or FALSE,  setting this parameter to default: TRUE")
+  ihw_filt <- TRUE
+}
+# sanity check automated fit vs parametric fit paramter
+if (is(params$decide_fit, "logical")) {
+  decide_fit <- params$decide_fit
+}else {
+  warning("decide_fit must be TRUE or FALSE,  setting this parameter to default: TRUE")
+  decide_fit <- TRUE
+}
+```
+
+# eCLIP analysis of `r protein` 
+
+
+## Setup
+
+This is the analysis of __`r protein`__ with    
+sampleinfo file: ``r sampleinfo_file``   
+countmatrix file: ``r countmatrix_file`` and     
+annotation file: ``r annotation_file`` 
+
+with the following threshold:
+
+minimum read count per window per sample: ``r min_count``
+number of samples with minimum read count per window: ``r min_sample``
+
+using the following parameters:   
+p-value cut-off: ``r p_value_cutoff``  
+Log2FoldChange cut-off: ``r lfc_cutoff``
+use automated method for dispersion estmation: ``r decide_fit``    
+use LRT test : ``r lrt``  
+use overlap correction: ``r overlap_correction``   
+use IHW for FDR correction: ``r ihw_filt``   
+
+
+``` {r check if files exists,  echo=FALSE}
+file_exists <- function(x) {
+  if (!file.exists(x))
+    stop(paste0("'",  x,  "' - file does not exist."))
+}
+
+file_exists(countmatrix_file)
+file_exists(annotation_file)
+file_exists(sampleinfo_file)
+```
+
+First,  we load the libraries.
+
+```{r load DEWSeq}
+required_packages <- c("DEWSeq", "data.table", "IHW", "R.utils", "tidyverse", "ggrepel")
+installed_packages <- installed.packages()[, 1]
+missing_packages <- setdiff(required_packages, installed_packages)
+if (length(missing_packages) != 0) {
+  stop("Found missing dependencies! Please install the following package(s): ", paste(missing_packages, collapse = ", "))
+}
+suppressPackageStartupMessages({
+  require(DEWSeq)
+  require(tidyverse)
+  require(data.table)
+  require(IHW)
+  require(R.utils)
+  require(ggrepel)
+})
+```
+
+## Read in data
+
+Here we read in the window counts
+
+```{r read window counts}
+window_counts <- fread(countmatrix_file,  sep = "\t", stringsAsFactors = FALSE) %>% as.data.frame()
+rownames(window_counts) <- window_counts[, 1]
+window_counts <- window_counts[, -1]
+```
+
+and the sample info file
+
+```{r read sample info}
+sample_info <- read.table(sampleinfo_file, sep = "\t", stringsAsFactors = FALSE)
+if (ncol(sample_info) < 2) {
+  stop("sampleinfo_file ", sampleinfo_file, " MUST have atleast two columns: first column should be the sample names used in ", countmatrix_file,
+       " and second column must be the experiment type: IP or SMI")
+}else if (ncol(sample_info) > 2) {
+  message("Found ", ncol(sample_info), " columns in ", sampleinfo_file, " using the first column as sample name and second column as experiment name")
+  sample_info <- sample_info[, c(1, 2)]
+}
+colnames(sample_info) <- c("samples", "type")
+rownames(sample_info) <- sample_info[, 1]
+```
+
+Now we make sure that the sampleinfo file contains the column "type" with values "SMI" and "IP" only. 
+
+```{r sampleSanity}
+# make sure that sample_info rows and window_counts columns are in same order
+common_samples <- sort(intersect(colnames(window_counts), rownames(sample_info)))
+if (length(common_samples) != ncol(window_counts)) {
+  stop("The number of samples in ", countmatrix_file, " and ", sampleinfo_file, " do not MATCH!")
+}
+sample_info <- sample_info[common_samples,  ]
+window_counts <- window_counts[, common_samples]
+# Now make sure that sample_info$type contains only "IP" and "SMI"
+type_check <- setdiff(unique(sample_info$type), c("IP", "SMI"))
+if (length(type_check) != 0) {
+  stop("The second column in ", sampleinfo_file, " should contain analysis types:'IP' or 'SMI' only. Found unknown value(s): ",
+       paste(type_check, collapse = ",  "))
+}
+```
+
+We make sure that only IP and SMI are in the right factor level order
+
+```{r sampleFactor}
+sample_info <- sample_info %>% mutate(type = factor(type,  levels = c("SMI",  "IP")))
+```
+
+
+
+We create the DEWSeq object
+
+```{r dewseqInit}
+ddw <- DESeqDataSetFromSlidingWindows(countData  = window_counts,
+                                      colData    = sample_info,
+                                      annotObj   = annotation_file,
+                                      tidy       = FALSE,
+                                      design     = ~type)
+```
+
+
+## Prefiltering
+
+```{r prefiltering1}
+# remove all empty windows
+keep <- rowSums(counts(ddw)) >= 1
+ddw <- ddw[keep, ]
+```
+
+
+## Estimating size factors
+
+
+```{r size factors}
+ddw <- estimateSizeFactors(ddw)
+sizeFactors(ddw)
+```
+
+### estimate size factors for only protein_coding genes
+
+```{r protein_coding_size_factors}
+ddw_mrnas <- ddw[rowData(ddw)[, "gene_type"] == "protein_coding",  ]
+ddw_mrnas <- estimateSizeFactors(ddw_mrnas)
+```
+
+### estimate size factors without significant windows
+
+```{r size_factors_no_sig_windows}
+ddw_tmp <- ddw
+ddw_tmp <- estimateDispersions(ddw_tmp,  fitType = "local",  quiet = TRUE)
+if (lrt) {
+  ddw_tmp <- nbinomLRT(ddw_tmp, full = ~type, reduced = ~1)
+}else {
+  ddw_tmp <- nbinomWaldTest(ddw_tmp)
+}
+
+tmp_significant_windows <-
+                results(ddw_tmp,
+                    contrast = c("type",  "IP",  "SMI"),
+                    tidy = TRUE,
+                    filterFun = ihw) %>%
+                dplyr::filter(padj < p_value_cutoff) %>%
+                .[["row"]]
+rm(ddw_tmp)
+```
+
+estimate the size factors without the significant windows.
+
+```{r final_size_factors}
+ddw_mrnas <- ddw_mrnas[!rownames(ddw_mrnas) %in% tmp_significant_windows,  ]
+ddw_mrnas <- estimateSizeFactors(ddw_mrnas)
+```
+
+before thresholding:
+```{r threshold1}
+dim(ddw)
+```
+
+Now threshold the windows read count table.
+```{r threshold2}
+keep_exp <-  which(rowSums(counts(ddw) > min_count) >= min_sample)
+ddw <- ddw[keep_exp, ]
+```
+
+after thresholding:
+```{r threshold3}
+dim(ddw)
+```
+assign size factors
+
+```{r final_assign}
+sizeFactors(ddw) <- sizeFactors(ddw_mrnas)
+rm(list = c("tmp_significant_windows",  "ddw_mrnas"))
+sizeFactors(ddw)
+```
+
+
+## Differential window analysis
+
+### Dispersion estimates
+
+```{r source,  echo = FALSE,  eval = FALSE}
+# source: https://support.bioconductor.org/p/81094/
+```
+We fit parametric and local fit,  and decide the best fit following this [Bioconductor post](https://support.bioconductor.org/p/81094/)
+
+```{r parametric_dispersion}
+parametric_ddw  <- estimateDispersions(ddw,  fitType = "parametric")
+if (decide_fit) {
+  local_ddw  <- estimateDispersions(ddw,  fitType = "local")
+}
+
+```
+
+This is the dispersion estimate for parametric fit
+```{r plot parametric fit,  fig.wide=TRUE}
+plotDispEsts(parametric_ddw,  main = "Parametric fit")
+```
+
+This is the dispersion estimate for local fit,  given automated decision fitting is enabled:
+```{r plot local fit,  fig.wide = TRUE}
+if (decide_fit) {
+  plotDispEsts(local_ddw,  main = "Local fit")
+}
+
+```
+
+This will get the residuals for either fit,  only for automated decision fitting
+```{r residual fit}
+parametric_resid <- na.omit(with(mcols(parametric_ddw), abs(log(dispGeneEst) - log(dispFit))))
+if (decide_fit) {
+  local_resid <- na.omit(with(mcols(local_ddw), abs(log(dispGeneEst) - log(dispFit))))
+  resid_df <- data.frame(residuals = c(parametric_resid, local_resid),
+                         fitType = c(rep("parametric", length(parametric_resid)),
+                                   rep("local", length(local_resid))))
+  summary(resid_df)
+}
+
+```
+
+and we plot histograms of the fits
+
+```{r plot residual histograms,  fig.wide = TRUE}
+if (decide_fit) {
+  ggplot(resid_df, aes(x = residuals, fill = fitType)) +
+    scale_fill_manual(values = c("darkred", "darkblue")) + geom_histogram(alpha = 0.5, position = "identity", bins = 100) +
+    theme_bw()
+}
+
+```
+
+Now,  we will decide for the better fit based on median
+
+```{r choose_fit}
+summary(parametric_resid)
+if (decide_fit) {
+  summary(local_resid)
+  if (median(local_resid) <= median(parametric_resid)) {
+    cat("chosen fitType: local")
+    ddw <- local_ddw
+  }else {
+    cat("chosen fitType: parametric")
+    ddw <- parametric_ddw
+  }
+  rm(local_ddw, parametric_ddw, resid_df, parametric_resid, local_resid)
+}else {
+  ddw <- parametric_ddw
+  rm(parametric_ddw)
+}
+
+```
+
+
+
+### Wald test or LRT
+
+```{r wald_or_LRT}
+if (lrt) {
+  ddw <- nbinomLRT(ddw, full = ~type,  reduced = ~1)
+}else {
+  ddw <- nbinomWaldTest(ddw)
+}
+
+```
+
+
+### Significance testing
+
+```{r sig_windows}
+result_windows <- resultsDEWSeq(ddw,
+                              contrast = c("type",  "IP",  "SMI"),
+                              tidy = TRUE) %>% as_tibble
+
+result_windows
+```
+
+
+### Multiple hypothesis correction with IHW
+
+You might be interested to correct for multiple hypothesis testing with IHW.
+
+Decide on overlap correction based on the parameter `overlap_correction`
+
+```{r ihw}
+if (overlap_correction && ihw_filt) {
+  result_windows[, "p_adj_IHW"] <- adj_pvalues(ihw(pSlidingWindows ~ baseMean,
+                     data = result_windows,
+                     alpha = p_value_cutoff,
+                     nfolds = 10))
+  padj_col <- "p_adj_IHW"
+}else if (!overlap_correction && ihw_filt) {
+  result_windows[, "p_adj_IHW"] <- adj_pvalues(ihw(pvalue ~ baseMean,
+                     data = result_windows,
+                     alpha = p_value_cutoff,
+                     nfolds = 10))
+  padj_col <- "p_adj_IHW"
+}else if (overlap_correction && !ihw_filt) {
+  padj_col <- "pSlidingWindows.adj"
+}else {
+  result_windows[, "p_adj"] <- p.adjust(result_windows$pvalue, method = "BH")
+  padj_col <- "p_adj"
+}
+
+```
+
+Determine significant windows
+
+```{r filter_sig_windows}
+result_windows <- result_windows %>%
+                      mutate(significant = result_windows[, padj_col] < p_value_cutoff)
+sig_windows <- sum(result_windows$significant)
+```
+
+
+`r sig_windows` windows are significant
+
+
+### Combining windows
+
+```{r,  reg1,  message=FALSE,  eval=TRUE, include=FALSE}
+if (sig_windows > 0) {
+ result_regions <- extractRegions(windowRes  = result_windows,  padjCol    = padj_col,  padjThresh = p_value_cutoff,  log2FoldChangeThresh = lfc_cutoff) %>% as_tibble
+}else {
+ message("Cannot find significant windows in this dataset. Try lowering the p-value and log2FoldChange thresholds!")
+}
+```
+
+```{r extractRegion, eval=FALSE}
+if (sig_windows > 0) {
+ result_regions <- extractRegions(windowRes  = result_windows,  padjCol    = padj_col,  padjThresh = p_value_cutoff,  log2FoldChangeThresh = lfc_cutoff) %>% as_tibble
+}
+```
+
+                     
+### Writing Bed file
+
+```{r writing bed file}
+if (sig_windows > 1) {
+ toBED(windowRes = result_windows,  regionRes = result_regions, padjThresh = p_value_cutoff, padjCol   = padj_col,  fileName  = output_bed_file)
+}else {
+ message("This analysis does not have enough <=1 significant windows to create BED file for visualization")
+}
+```
+
+
+## Save Session
+```{r save_data}
+# save enriched windows,  gzip results file if the file suffix is .gz
+if (grepl("\\.gz$", output_windows_file, ignore.case = TRUE)) {
+  gz_out <- gzfile(output_windows_file, "w")
+  write.table(result_windows, file = gz_out, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
+  close(gz_out)
+}else {
+  write.table(result_windows, file = output_windows_file, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
+}
+# save enriched regions
+if (sig_windows > 0) {
+  if (grepl("\\.gz$", output_regions_file, ignore.case = TRUE)) {
+    gz_out <- gzfile(output_regions_file, "w")
+    write.table(result_regions, file = gz_out, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
+    close(gz_out)
+  }else {
+    write.table(result_regions, file = output_regions_file, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
+  }
+}
+# save session
+# Warning! session images can be heavy!
+if (nchar(output_rdata) > 5) {
+  save.image(file = output_rdata)
+}
+```
+
+## Session Info
+
+```{r session info}
+sessionInfo()
+```
author	rnateam
date	Thu, 20 Oct 2022 08:18:30 +0000
parents
children