Mercurial > repos > rnateam > dewseq
view analyseStudy.Rmd @ 0:e1cb2e012307 draft default tip
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/rna_tools/dewseq commit 71db0e65b3b306904ae2b17ce3de677244aea776"
author | rnateam |
---|---|
date | Thu, 20 Oct 2022 08:18:30 +0000 |
parents | |
children |
line wrap: on
line source
--- title: "DEWSeq Analysis" author: "Thomas Schwarzl and Sudeep Sahadevan" date: "07/10/2020" output: BiocStyle::html_document: toc: true toc_float: true toc_depth: 5 params: protein: "" sampleinfo_file: "" countmatrix_file: "" annotation_file: "" output_windows_file: "" output_regions_file: "" output_bed_file: "" output_rdata: "" min_count: 2 min_sample: 2 LRT: FALSE p_value_cutoff: 0.05 lfc_cutoff: 1 overlap_correction: FALSE IHW: TRUE decide_fit: TRUE --- ```{r setup, include=FALSE} knitr::opts_chunk$set(echo = TRUE) ``` ```{r assignData, echo=FALSE, eval=TRUE} protein <- params$protein sampleinfo_file <- params$sampleinfo_file countmatrix_file <- params$countmatrix_file annotation_file <- params$annotation_file output_windows_file <- params$output_windows_file output_regions_file <- params$output_regions_file output_bed_file <- params$output_bed_file output_rdata <- params$output_rdata min_count <- params$min_count min_sample <- params$min_sample p_value_cutoff <- params$p_value_cutoff lfc_cutoff <- params$lfc_cutoff ``` Sanity check input parameter values ```{r sanityCheck, echo=FALSE, eval=TRUE} # round of min_count and min_sample values first message("Any decimals given as values for min_count and min_sample parameters will be rounded off to the nearest integer.") min_count <- round(min_count) min_sample <- round(min_sample) # receive and sanity check p_value_cutoff if (p_value_cutoff <= 0 || p_value_cutoff >= 1) { warning("p_value_cutoff must satisfy: 0<=p_value_cutoff<=1. Resetting to default value: 0.05") p_value_cutoff <- 0.05 } # sanity check log2Foldchange cutoff if (lfc_cutoff < 0) { warning("lfc_cutoff must be a value >=0. Resetting to default value: 1.00") lfc_cutoff <- 1.0 } # sanity check LRT vs Wald if (is(params$LRT, "logical")) { lrt <- params$LRT }else { warning("LRT must be TRUE or FALSE, setting this parameter to default: FALSE") lrt <- FALSE } # sanity check overlap correction parameter if (is(params$overlap_correction, "logical")) { overlap_correction <- params$overlap_correction }else { warning("overlap_correction must be TRUE or FALSE, setting this parameter to default: TRUE") overlap_correction <- TRUE } # sanity check IHW vs BH correction parameter if (is(params$IHW, "logical")) { ihw_filt <- params$IHW }else { warning("IHW must be TRUE or FALSE, setting this parameter to default: TRUE") ihw_filt <- TRUE } # sanity check automated fit vs parametric fit paramter if (is(params$decide_fit, "logical")) { decide_fit <- params$decide_fit }else { warning("decide_fit must be TRUE or FALSE, setting this parameter to default: TRUE") decide_fit <- TRUE } ``` # eCLIP analysis of `r protein` ## Setup This is the analysis of __`r protein`__ with sampleinfo file: ``r sampleinfo_file`` countmatrix file: ``r countmatrix_file`` and annotation file: ``r annotation_file`` with the following threshold: minimum read count per window per sample: ``r min_count`` number of samples with minimum read count per window: ``r min_sample`` using the following parameters: p-value cut-off: ``r p_value_cutoff`` Log2FoldChange cut-off: ``r lfc_cutoff`` use automated method for dispersion estmation: ``r decide_fit`` use LRT test : ``r lrt`` use overlap correction: ``r overlap_correction`` use IHW for FDR correction: ``r ihw_filt`` ``` {r check if files exists, echo=FALSE} file_exists <- function(x) { if (!file.exists(x)) stop(paste0("'", x, "' - file does not exist.")) } file_exists(countmatrix_file) file_exists(annotation_file) file_exists(sampleinfo_file) ``` First, we load the libraries. ```{r load DEWSeq} required_packages <- c("DEWSeq", "data.table", "IHW", "R.utils", "tidyverse", "ggrepel") installed_packages <- installed.packages()[, 1] missing_packages <- setdiff(required_packages, installed_packages) if (length(missing_packages) != 0) { stop("Found missing dependencies! Please install the following package(s): ", paste(missing_packages, collapse = ", ")) } suppressPackageStartupMessages({ require(DEWSeq) require(tidyverse) require(data.table) require(IHW) require(R.utils) require(ggrepel) }) ``` ## Read in data Here we read in the window counts ```{r read window counts} window_counts <- fread(countmatrix_file, sep = "\t", stringsAsFactors = FALSE) %>% as.data.frame() rownames(window_counts) <- window_counts[, 1] window_counts <- window_counts[, -1] ``` and the sample info file ```{r read sample info} sample_info <- read.table(sampleinfo_file, sep = "\t", stringsAsFactors = FALSE) if (ncol(sample_info) < 2) { stop("sampleinfo_file ", sampleinfo_file, " MUST have atleast two columns: first column should be the sample names used in ", countmatrix_file, " and second column must be the experiment type: IP or SMI") }else if (ncol(sample_info) > 2) { message("Found ", ncol(sample_info), " columns in ", sampleinfo_file, " using the first column as sample name and second column as experiment name") sample_info <- sample_info[, c(1, 2)] } colnames(sample_info) <- c("samples", "type") rownames(sample_info) <- sample_info[, 1] ``` Now we make sure that the sampleinfo file contains the column "type" with values "SMI" and "IP" only. ```{r sampleSanity} # make sure that sample_info rows and window_counts columns are in same order common_samples <- sort(intersect(colnames(window_counts), rownames(sample_info))) if (length(common_samples) != ncol(window_counts)) { stop("The number of samples in ", countmatrix_file, " and ", sampleinfo_file, " do not MATCH!") } sample_info <- sample_info[common_samples, ] window_counts <- window_counts[, common_samples] # Now make sure that sample_info$type contains only "IP" and "SMI" type_check <- setdiff(unique(sample_info$type), c("IP", "SMI")) if (length(type_check) != 0) { stop("The second column in ", sampleinfo_file, " should contain analysis types:'IP' or 'SMI' only. Found unknown value(s): ", paste(type_check, collapse = ", ")) } ``` We make sure that only IP and SMI are in the right factor level order ```{r sampleFactor} sample_info <- sample_info %>% mutate(type = factor(type, levels = c("SMI", "IP"))) ``` We create the DEWSeq object ```{r dewseqInit} ddw <- DESeqDataSetFromSlidingWindows(countData = window_counts, colData = sample_info, annotObj = annotation_file, tidy = FALSE, design = ~type) ``` ## Prefiltering ```{r prefiltering1} # remove all empty windows keep <- rowSums(counts(ddw)) >= 1 ddw <- ddw[keep, ] ``` ## Estimating size factors ```{r size factors} ddw <- estimateSizeFactors(ddw) sizeFactors(ddw) ``` ### estimate size factors for only protein_coding genes ```{r protein_coding_size_factors} ddw_mrnas <- ddw[rowData(ddw)[, "gene_type"] == "protein_coding", ] ddw_mrnas <- estimateSizeFactors(ddw_mrnas) ``` ### estimate size factors without significant windows ```{r size_factors_no_sig_windows} ddw_tmp <- ddw ddw_tmp <- estimateDispersions(ddw_tmp, fitType = "local", quiet = TRUE) if (lrt) { ddw_tmp <- nbinomLRT(ddw_tmp, full = ~type, reduced = ~1) }else { ddw_tmp <- nbinomWaldTest(ddw_tmp) } tmp_significant_windows <- results(ddw_tmp, contrast = c("type", "IP", "SMI"), tidy = TRUE, filterFun = ihw) %>% dplyr::filter(padj < p_value_cutoff) %>% .[["row"]] rm(ddw_tmp) ``` estimate the size factors without the significant windows. ```{r final_size_factors} ddw_mrnas <- ddw_mrnas[!rownames(ddw_mrnas) %in% tmp_significant_windows, ] ddw_mrnas <- estimateSizeFactors(ddw_mrnas) ``` before thresholding: ```{r threshold1} dim(ddw) ``` Now threshold the windows read count table. ```{r threshold2} keep_exp <- which(rowSums(counts(ddw) > min_count) >= min_sample) ddw <- ddw[keep_exp, ] ``` after thresholding: ```{r threshold3} dim(ddw) ``` assign size factors ```{r final_assign} sizeFactors(ddw) <- sizeFactors(ddw_mrnas) rm(list = c("tmp_significant_windows", "ddw_mrnas")) sizeFactors(ddw) ``` ## Differential window analysis ### Dispersion estimates ```{r source, echo = FALSE, eval = FALSE} # source: https://support.bioconductor.org/p/81094/ ``` We fit parametric and local fit, and decide the best fit following this [Bioconductor post](https://support.bioconductor.org/p/81094/) ```{r parametric_dispersion} parametric_ddw <- estimateDispersions(ddw, fitType = "parametric") if (decide_fit) { local_ddw <- estimateDispersions(ddw, fitType = "local") } ``` This is the dispersion estimate for parametric fit ```{r plot parametric fit, fig.wide=TRUE} plotDispEsts(parametric_ddw, main = "Parametric fit") ``` This is the dispersion estimate for local fit, given automated decision fitting is enabled: ```{r plot local fit, fig.wide = TRUE} if (decide_fit) { plotDispEsts(local_ddw, main = "Local fit") } ``` This will get the residuals for either fit, only for automated decision fitting ```{r residual fit} parametric_resid <- na.omit(with(mcols(parametric_ddw), abs(log(dispGeneEst) - log(dispFit)))) if (decide_fit) { local_resid <- na.omit(with(mcols(local_ddw), abs(log(dispGeneEst) - log(dispFit)))) resid_df <- data.frame(residuals = c(parametric_resid, local_resid), fitType = c(rep("parametric", length(parametric_resid)), rep("local", length(local_resid)))) summary(resid_df) } ``` and we plot histograms of the fits ```{r plot residual histograms, fig.wide = TRUE} if (decide_fit) { ggplot(resid_df, aes(x = residuals, fill = fitType)) + scale_fill_manual(values = c("darkred", "darkblue")) + geom_histogram(alpha = 0.5, position = "identity", bins = 100) + theme_bw() } ``` Now, we will decide for the better fit based on median ```{r choose_fit} summary(parametric_resid) if (decide_fit) { summary(local_resid) if (median(local_resid) <= median(parametric_resid)) { cat("chosen fitType: local") ddw <- local_ddw }else { cat("chosen fitType: parametric") ddw <- parametric_ddw } rm(local_ddw, parametric_ddw, resid_df, parametric_resid, local_resid) }else { ddw <- parametric_ddw rm(parametric_ddw) } ``` ### Wald test or LRT ```{r wald_or_LRT} if (lrt) { ddw <- nbinomLRT(ddw, full = ~type, reduced = ~1) }else { ddw <- nbinomWaldTest(ddw) } ``` ### Significance testing ```{r sig_windows} result_windows <- resultsDEWSeq(ddw, contrast = c("type", "IP", "SMI"), tidy = TRUE) %>% as_tibble result_windows ``` ### Multiple hypothesis correction with IHW You might be interested to correct for multiple hypothesis testing with IHW. Decide on overlap correction based on the parameter `overlap_correction` ```{r ihw} if (overlap_correction && ihw_filt) { result_windows[, "p_adj_IHW"] <- adj_pvalues(ihw(pSlidingWindows ~ baseMean, data = result_windows, alpha = p_value_cutoff, nfolds = 10)) padj_col <- "p_adj_IHW" }else if (!overlap_correction && ihw_filt) { result_windows[, "p_adj_IHW"] <- adj_pvalues(ihw(pvalue ~ baseMean, data = result_windows, alpha = p_value_cutoff, nfolds = 10)) padj_col <- "p_adj_IHW" }else if (overlap_correction && !ihw_filt) { padj_col <- "pSlidingWindows.adj" }else { result_windows[, "p_adj"] <- p.adjust(result_windows$pvalue, method = "BH") padj_col <- "p_adj" } ``` Determine significant windows ```{r filter_sig_windows} result_windows <- result_windows %>% mutate(significant = result_windows[, padj_col] < p_value_cutoff) sig_windows <- sum(result_windows$significant) ``` `r sig_windows` windows are significant ### Combining windows ```{r, reg1, message=FALSE, eval=TRUE, include=FALSE} if (sig_windows > 0) { result_regions <- extractRegions(windowRes = result_windows, padjCol = padj_col, padjThresh = p_value_cutoff, log2FoldChangeThresh = lfc_cutoff) %>% as_tibble }else { message("Cannot find significant windows in this dataset. Try lowering the p-value and log2FoldChange thresholds!") } ``` ```{r extractRegion, eval=FALSE} if (sig_windows > 0) { result_regions <- extractRegions(windowRes = result_windows, padjCol = padj_col, padjThresh = p_value_cutoff, log2FoldChangeThresh = lfc_cutoff) %>% as_tibble } ``` ### Writing Bed file ```{r writing bed file} if (sig_windows > 1) { toBED(windowRes = result_windows, regionRes = result_regions, padjThresh = p_value_cutoff, padjCol = padj_col, fileName = output_bed_file) }else { message("This analysis does not have enough <=1 significant windows to create BED file for visualization") } ``` ## Save Session ```{r save_data} # save enriched windows, gzip results file if the file suffix is .gz if (grepl("\\.gz$", output_windows_file, ignore.case = TRUE)) { gz_out <- gzfile(output_windows_file, "w") write.table(result_windows, file = gz_out, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE) close(gz_out) }else { write.table(result_windows, file = output_windows_file, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE) } # save enriched regions if (sig_windows > 0) { if (grepl("\\.gz$", output_regions_file, ignore.case = TRUE)) { gz_out <- gzfile(output_regions_file, "w") write.table(result_regions, file = gz_out, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE) close(gz_out) }else { write.table(result_regions, file = output_regions_file, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE) } } # save session # Warning! session images can be heavy! if (nchar(output_rdata) > 5) { save.image(file = output_rdata) } ``` ## Session Info ```{r session info} sessionInfo() ```