Sanity check input parameter values
## Any decimals given as values for min_count and min_sample parameters will be rounded off to the nearest integer.
This is the analysis of RBP with
sampleinfo file: /home/malong/Repos/dewseq_galaxy/test-data/sample_info.exp.txt
countmatrix file: /home/malong/Repos/dewseq_galaxy/test-data/Rbp_count_matrix.exp.txt
and
annotation file: /home/malong/Repos/dewseq_galaxy/test-data/windows.exp.txt
with the following threshold:
minimum read count per window per sample: 2
number of samples with minimum read count per window: 2
using the following parameters:
p-value cut-off: 0.5
Log2FoldChange cut-off: 1
use automated method for dispersion estmation: TRUE
use LRT test : FALSE
use overlap correction: FALSE
use IHW for FDR correction: TRUE
First, we load the libraries.
requiredPackages <- c('DEWSeq','data.table','IHW','R.utils','tidyverse')
installedPackages <- installed.packages()[,1]
diffPackages <- setdiff(requiredPackages,installedPackages)
if(length(diffPackages)!=0){
stop('Found missing dependencies! Please install the following package(s): ',paste(diffPackages,collapse = ", "))
}
suppressPackageStartupMessages({
require(DEWSeq)
require(tidyverse)
require(data.table)
require(IHW)
require(R.utils)
})
Here we read in the window counts
WINDOWCOUNTS <- fread(countmatrix_file, sep = "\t",stringsAsFactors = FALSE) %>% as.data.frame()
rownames(WINDOWCOUNTS) <- WINDOWCOUNTS[,1]
WINDOWCOUNTS <- WINDOWCOUNTS[,-1]
and the sample info file
SAMPLEINFO <- read.table(sampleinfo_file,sep="\t",stringsAsFactors = FALSE)
if(ncol(SAMPLEINFO)<2){
stop("sampleinfo_file ",sampleinfo_file," MUST have atleast two columns: first column should be the sample names used in ",countmatrix_file,
" and second column must be the experiment type: IP or SMI")
}else if(ncol(SAMPLEINFO)>2){
message("Found ",ncol(SAMPLEINFO)," columns in ",sampleinfo_file," using the first column as sample name and second column as experiment name")
SAMPLEINFO <- SAMPLEINFO[,c(1,2)]
}
colnames(SAMPLEINFO) <- c("samples","type")
rownames(SAMPLEINFO) <- SAMPLEINFO[,1]
Now we make sure that the sampleinfo file contains the column “type” with values “SMI” and “IP” only.
# make sure that SAMPLEINFO rows and WINDOWCOUNTS columns are in same order
commonSamples <- sort(intersect(colnames(WINDOWCOUNTS),rownames(SAMPLEINFO)))
if(length(commonSamples)!=ncol(WINDOWCOUNTS)){
stop("The number of samples in ",countmatrix_file," and ",sampleinfo_file," do not MATCH!")
}
SAMPLEINFO <- SAMPLEINFO[ commonSamples, ]
WINDOWCOUNTS <- WINDOWCOUNTS[,commonSamples]
# Now make sure that SAMPLEINFO$type contains only "IP" and "SMI"
typeCheck <- setdiff(unique(SAMPLEINFO$type),c("IP","SMI"))
if(length(typeCheck)!=0){
stop("The second column in ",sampleinfo_file," should contain analysis types: 'IP' or 'SMI' only. Found unknown value(s): ",
paste(typeCheck,collapse=", "))
}
We make sure that only IP and SMI are in the right factor level order
SAMPLEINFO <- SAMPLEINFO %>% mutate(type = factor(type, levels = c("SMI", "IP")))
We create the DEWSeq object
ddw <- DESeqDataSetFromSlidingWindows(countData = WINDOWCOUNTS,
colData = SAMPLEINFO,
annotObj = annotation_file,
tidy = FALSE,
design = ~type)
# remove all empty windows
keep <- rowSums(counts(ddw)) >= 1
ddw <- ddw[keep,]
ddw <- estimateSizeFactors(ddw)
sizeFactors(ddw)
## Rbp_ctrl_rep1 Rbp_exp_rep1 Rbp_exp_rep2
## 0.6299605 1.2599210 1.3867225
ddw_mRNAs <- ddw[ rowData(ddw)[,"gene_type"] == "protein_coding", ]
ddw_mRNAs <- estimateSizeFactors(ddw_mRNAs)
ddw_tmp <- ddw
ddw_tmp <- estimateDispersions(ddw_tmp, fitType = "local", quiet = TRUE)
if(LRT){
ddw_tmp <- nbinomLRT(ddw_tmp,full = ~type,reduced = ~1)
}else{
ddw_tmp <- nbinomWaldTest(ddw_tmp)
}
tmp_significant_windows <-
results(ddw_tmp,
contrast = c("type", "IP", "SMI"),
tidy = TRUE,
filterFun = ihw) %>%
dplyr::filter(padj < p_value_cutoff) %>%
.[["row"]]
rm(ddw_tmp)
estimate the size factors without the significant windows.
ddw_mRNAs <- ddw_mRNAs[ !rownames(ddw_mRNAs) %in% tmp_significant_windows, ]
ddw_mRNAs <- estimateSizeFactors(ddw_mRNAs)
before thresholding:
dim(ddw)
## [1] 3245 3
Now threshold the windows read count table.
keep_exp <- which(rowSums(counts(ddw)>minCount)>=minSample)
ddw <- ddw[keep_exp,]
after thresholding:
dim(ddw)
## [1] 1008 3
assign size factors
sizeFactors(ddw) <- sizeFactors(ddw_mRNAs)
rm( list = c("tmp_significant_windows", "ddw_mRNAs"))
sizeFactors(ddw)
## Rbp_ctrl_rep1 Rbp_exp_rep1 Rbp_exp_rep2
## 0.6299605 1.2599210 1.3867225
We fit parametric and local fit, and decide the best fit following this Bioconductor post
parametric_ddw <- estimateDispersions(ddw, fitType="parametric")
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
if(decide_fit){
local_ddw <- estimateDispersions(ddw, fitType="local")
}
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
This is the dispersion estimate for parametric fit
plotDispEsts(parametric_ddw, main="Parametric fit")
This is the dispersion estimate for local fit, given automated decision fitting is enabled:
if(decide_fit){
plotDispEsts(local_ddw, main="Local fit")
}
This will get the residuals for either fit, only for automated decision fitting
parametricResid <- na.omit(with(mcols(parametric_ddw),abs(log(dispGeneEst)-log(dispFit))))
if(decide_fit){
localResid <- na.omit(with(mcols(local_ddw),abs(log(dispGeneEst)-log(dispFit))))
residDf <- data.frame(residuals=c(parametricResid,localResid),fitType=c(rep("parametric",length(parametricResid)),rep("local",length(localResid))))
summary(residDf)
}
## residuals fitType
## Min. : 0.000899 Length:2016
## 1st Qu.:17.894458 Class :character
## Median :18.468848 Mode :character
## Mean :16.776221
## 3rd Qu.:18.926598
## Max. :20.945520
and we plot histograms of the fits
if(decide_fit){
ggplot(residDf, aes(x = residuals, fill = fitType)) + scale_fill_manual(values = c("darkred", "darkblue")) + geom_histogram(alpha = 0.5, position='identity', bins = 100) + theme_bw()
}
Now, we will decide for the better fit based on median
summary(parametricResid)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.09595 18.18429 18.59938 16.85875 18.92660 19.52154
if(decide_fit){
summary(localResid)
if (median(localResid) <= median(parametricResid)){
cat("chosen fitType: local")
ddw <- local_ddw
}else{
cat("chosen fitType: parametric")
ddw <- parametric_ddw
}
rm(local_ddw,parametric_ddw,residDf,parametricResid,localResid)
}else{
ddw <- parametric_ddw
rm(parametric_ddw)
}
## chosen fitType: local
if(LRT){
ddw <- nbinomLRT(ddw,full = ~type, reduced = ~1)
}else{
ddw <- nbinomWaldTest(ddw)
}
resultWindows <- resultsDEWSeq(ddw,
contrast = c("type", "IP", "SMI"),
tidy = TRUE) %>% as_tibble
resultWindows
## # A tibble: 672 × 20
## chromosome begin end width strand uniqu…¹ gene_id gene_…² gene_…³ gene_…⁴
## <chr> <dbl> <int> <int> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Synechocyst… 2467 2517 50 + TU5005… TU5005 TU5005 protei… exon
## 2 Synechocyst… 2487 2537 50 + TU5005… TU5005 TU5005 protei… exon
## 3 Synechocyst… 2507 2557 50 + TU5005… TU5005 TU5005 protei… exon
## 4 Synechocyst… 2527 2577 50 + TU5005… TU5005 TU5005 protei… exon
## 5 Synechocyst… 2547 2597 50 + TU5005… TU5005 TU5005 protei… exon
## 6 Synechocyst… 2567 2617 50 + TU5005… TU5005 TU5005 protei… exon
## 7 Synechocyst… 2587 2637 50 + TU5005… TU5005 TU5005 protei… exon
## 8 Synechocyst… 2607 2657 50 + TU5005… TU5005 TU5005 protei… exon
## 9 Synechocyst… 2627 2677 50 + TU5005… TU5005 TU5005 protei… exon
## 10 Synechocyst… 2647 2697 50 + TU5005… TU5005 TU5005 protei… exon
## # … with 662 more rows, 10 more variables: Nr_of_region <int>,
## # Total_nr_of_region <int>, window_number <int>, baseMean <dbl>,
## # log2FoldChange <dbl>, lfcSE <dbl>, stat <dbl>, pvalue <dbl>,
## # pSlidingWindows <dbl>, pSlidingWindows.adj <dbl>, and abbreviated variable
## # names ¹unique_id, ²gene_name, ³gene_type, ⁴gene_region
You might be interested to correct for multiple hypothesis testing with IHW.
Decide on overlap correction based on the parameter overlap_correction
if(overlap_correction & IHW){
resultWindows[,"p_adj_IHW"] <- adj_pvalues(ihw(pSlidingWindows ~ baseMean,
data = resultWindows,
alpha = p_value_cutoff,
nfolds = 10))
padjCol <- "p_adj_IHW"
}else if(!overlap_correction & IHW){
resultWindows[,"p_adj_IHW"] <- adj_pvalues(ihw(pvalue ~ baseMean,
data = resultWindows,
alpha = p_value_cutoff,
nfolds = 10))
padjCol <- "p_adj_IHW"
}else if(overlap_correction & !IHW){
padjCol <- "pSlidingWindows.adj"
}else{
resultWindows[,'p_adj'] <- p.adjust(resultWindows$pvalue,method="BH")
padjCol <- 'p_adj'
}
## Only 1 bin; IHW reduces to Benjamini Hochberg (uniform weights)
Determine significant windows
resultWindows <- resultWindows %>%
mutate(significant = resultWindows[ ,padjCol] < p_value_cutoff)
sigWindows <- sum(resultWindows$significant)
672 windows are significant
resultWindows %>%
filter(significant) %>%
arrange(desc(log2FoldChange)) %>%
.[["gene_name"]] %>%
unique %>%
head(20)
## [1] "TU5062" "TU5031" "TU5077" "TU5086" "TU5122" "TU5102" "TU5088" "TU5089"
## [9] "TU5111" "TU5120" "TU5094" "TU5119" "TU5124" "TU5082" "TU5032" "TU5054"
## [17] "TU5036" "TU5092" "TU5106" "TU5013"
if(sigWindows>0){
resultRegions <- extractRegions(windowRes = resultWindows, padjCol = padjCol, padjThresh = p_value_cutoff, log2FoldChangeThresh = lfc_cutoff) %>% as_tibble
}
if(sigWindows>1){
toBED(windowRes = resultWindows, regionRes = resultRegions,padjThresh=p_value_cutoff,
padjCol = padjCol, fileName = output_bed_file)
}else{
message("This analysis does not have enough <=2 significant windows to create BED file for visualization")
}
# save enriched windows, gzip results file if the file suffix is .gz
if(grepl("\\.gz$",output_windows_file,ignore.case = TRUE)){
gz_out <- gzfile(output_windows_file,"w")
write.table(resultWindows,file=gz_out,sep="\t",quote=FALSE,row.names=FALSE,col.names=TRUE)
close(gz_out)
}else{
write.table(resultWindows,file=output_windows_file,sep="\t",quote=FALSE,row.names=FALSE,col.names=TRUE)
}
# save enriched regions
if(sigWindows>0){
if(grepl("\\.gz$",output_regions_file,ignore.case = TRUE)){
gz_out <- gzfile(output_regions_file,"w")
write.table(resultRegions,file=gz_out,sep="\t",quote=FALSE,row.names=FALSE,col.names=TRUE)
close(gz_out)
}else{
write.table(resultRegions,file=output_regions_file,sep="\t",quote=FALSE,row.names=FALSE,col.names=TRUE)
}
}
# save session
# Warning! session images can be heavy!
if(nchar(output_Rdata)>5){
save.image( file = output_Rdata)
}
sessionInfo()
## R version 4.1.3 (2022-03-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.1 LTS
##
## Matrix products: default
## BLAS/LAPACK: /home/malong/miniconda3/envs/dewseq/lib/libopenblasp-r0.3.21.so
##
## locale:
## [1] LC_CTYPE=de_DE.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=de_DE.UTF-8 LC_COLLATE=de_DE.UTF-8
## [5] LC_MONETARY=de_DE.UTF-8 LC_MESSAGES=de_DE.UTF-8
## [7] LC_PAPER=de_DE.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] IHW_1.22.0 data.table_1.14.2
## [3] forcats_0.5.2 stringr_1.4.1
## [5] dplyr_1.0.10 purrr_0.3.5
## [7] readr_2.1.3 tidyr_1.2.1
## [9] tibble_3.1.8 ggplot2_3.3.6
## [11] tidyverse_1.3.2 DEWSeq_1.8.0
## [13] BiocParallel_1.28.3 DESeq2_1.34.0
## [15] SummarizedExperiment_1.24.0 Biobase_2.54.0
## [17] MatrixGenerics_1.6.0 matrixStats_0.62.0
## [19] GenomicRanges_1.46.1 GenomeInfoDb_1.30.0
## [21] IRanges_2.28.0 S4Vectors_0.32.4
## [23] BiocGenerics_0.40.0 R.utils_2.12.0
## [25] R.oo_1.25.0 R.methodsS3_1.8.2
## [27] BiocStyle_2.22.0
##
## loaded via a namespace (and not attached):
## [1] googledrive_2.0.0 colorspace_2.0-3 ellipsis_0.3.2
## [4] XVector_0.34.0 fs_1.5.2 farver_2.1.1
## [7] bit64_4.0.5 AnnotationDbi_1.56.1 fansi_1.0.3
## [10] lubridate_1.8.0 xml2_1.3.3 splines_4.1.3
## [13] cachem_1.0.6 geneplotter_1.72.0 knitr_1.40
## [16] jsonlite_1.8.2 broom_1.0.1 annotate_1.72.0
## [19] dbplyr_2.2.1 png_0.1-7 BiocManager_1.30.18
## [22] compiler_4.1.3 httr_1.4.4 backports_1.4.1
## [25] assertthat_0.2.1 Matrix_1.4-1 fastmap_1.1.0
## [28] gargle_1.2.1 cli_3.4.1 htmltools_0.5.3
## [31] tools_4.1.3 gtable_0.3.1 glue_1.6.2
## [34] GenomeInfoDbData_1.2.7 Rcpp_1.0.9 slam_0.1-50
## [37] cellranger_1.1.0 jquerylib_0.1.4 vctrs_0.4.2
## [40] Biostrings_2.62.0 xfun_0.33 rvest_1.0.3
## [43] lifecycle_1.0.3 XML_3.99-0.11 googlesheets4_1.0.1
## [46] zlibbioc_1.40.0 scales_1.2.1 hms_1.1.2
## [49] parallel_4.1.3 RColorBrewer_1.1-3 yaml_2.3.5
## [52] memoise_2.0.1 sass_0.4.2 stringi_1.7.8
## [55] RSQLite_2.2.8 highr_0.9 genefilter_1.76.0
## [58] rlang_1.0.6 pkgconfig_2.0.3 bitops_1.0-7
## [61] lpsymphony_1.22.0 evaluate_0.17 lattice_0.20-45
## [64] labeling_0.4.2 bit_4.0.4 tidyselect_1.1.2
## [67] magrittr_2.0.3 bookdown_0.29 R6_2.5.1
## [70] generics_0.1.3 DelayedArray_0.20.0 DBI_1.1.3
## [73] pillar_1.8.1 haven_2.5.1 withr_2.5.0
## [76] survival_3.4-0 KEGGREST_1.34.0 RCurl_1.98-1.9
## [79] modelr_0.1.9 crayon_1.5.2 fdrtool_1.2.17
## [82] utf8_1.2.2 tzdb_0.3.0 rmarkdown_2.17
## [85] locfit_1.5-9.6 grid_4.1.3 readxl_1.4.1
## [88] blob_1.2.3 reprex_2.0.2 digest_0.6.29
## [91] xtable_1.8-4 munsell_0.5.0 bslib_0.4.0