Sanity check input parameter values

## Any decimals given as values for min_count and min_sample parameters will be rounded off to the nearest integer.

1 eCLIP analysis of RBP

1.1 Setup

This is the analysis of RBP with
sampleinfo file: /home/malong/Repos/dewseq_galaxy/test-data/sample_info.exp.txt
countmatrix file: /home/malong/Repos/dewseq_galaxy/test-data/Rbp_count_matrix.exp.txt and
annotation file: /home/malong/Repos/dewseq_galaxy/test-data/windows.exp.txt

with the following threshold:

minimum read count per window per sample: 2 number of samples with minimum read count per window: 2

using the following parameters:
p-value cut-off: 0.5
Log2FoldChange cut-off: 1 use automated method for dispersion estmation: TRUE
use LRT test : FALSE
use overlap correction: FALSE
use IHW for FDR correction: TRUE

First, we load the libraries.

requiredPackages <- c('DEWSeq','data.table','IHW','R.utils','tidyverse')
installedPackages <- installed.packages()[,1]
diffPackages <- setdiff(requiredPackages,installedPackages)
if(length(diffPackages)!=0){
  stop('Found missing dependencies! Please install the following package(s): ',paste(diffPackages,collapse = ", "))
}
suppressPackageStartupMessages({
  require(DEWSeq)
  require(tidyverse)
  require(data.table)
  require(IHW)
  require(R.utils)
})

1.2 Read in data

Here we read in the window counts

WINDOWCOUNTS <- fread(countmatrix_file, sep = "\t",stringsAsFactors = FALSE) %>% as.data.frame()
rownames(WINDOWCOUNTS) <- WINDOWCOUNTS[,1]
WINDOWCOUNTS <- WINDOWCOUNTS[,-1]

and the sample info file

SAMPLEINFO <- read.table(sampleinfo_file,sep="\t",stringsAsFactors = FALSE)
if(ncol(SAMPLEINFO)<2){
  stop("sampleinfo_file ",sampleinfo_file," MUST have atleast two columns: first column should be the sample names used in ",countmatrix_file,
       " and second column must be the experiment type: IP or SMI")
}else if(ncol(SAMPLEINFO)>2){
  message("Found ",ncol(SAMPLEINFO)," columns in ",sampleinfo_file," using the first column as sample name and second column as experiment name")
  SAMPLEINFO <- SAMPLEINFO[,c(1,2)]
}
colnames(SAMPLEINFO) <- c("samples","type")
rownames(SAMPLEINFO) <- SAMPLEINFO[,1]

Now we make sure that the sampleinfo file contains the column “type” with values “SMI” and “IP” only.

# make sure that SAMPLEINFO rows and WINDOWCOUNTS columns are in same order
commonSamples <- sort(intersect(colnames(WINDOWCOUNTS),rownames(SAMPLEINFO)))
if(length(commonSamples)!=ncol(WINDOWCOUNTS)){
  stop("The number of samples in ",countmatrix_file," and ",sampleinfo_file," do not MATCH!")
}
SAMPLEINFO <- SAMPLEINFO[ commonSamples, ]
WINDOWCOUNTS <- WINDOWCOUNTS[,commonSamples]
# Now make sure that SAMPLEINFO$type contains only "IP" and "SMI"
typeCheck <- setdiff(unique(SAMPLEINFO$type),c("IP","SMI"))
if(length(typeCheck)!=0){
  stop("The second column in ",sampleinfo_file," should contain analysis types: 'IP' or 'SMI' only. Found unknown value(s): ",
       paste(typeCheck,collapse=", "))
}

We make sure that only IP and SMI are in the right factor level order

SAMPLEINFO <- SAMPLEINFO %>% mutate(type = factor(type, levels = c("SMI", "IP")))

We create the DEWSeq object

ddw <- DESeqDataSetFromSlidingWindows(countData  = WINDOWCOUNTS,
                                      colData    = SAMPLEINFO,
                                      annotObj   = annotation_file,
                                      tidy       = FALSE,
                                      design     = ~type)

1.3 Prefiltering

# remove all empty windows
keep <- rowSums(counts(ddw)) >= 1
ddw <- ddw[keep,]

1.4 Estimating size factors

ddw <- estimateSizeFactors(ddw)
sizeFactors(ddw)
## Rbp_ctrl_rep1  Rbp_exp_rep1  Rbp_exp_rep2 
##     0.6299605     1.2599210     1.3867225

1.4.1 estimate size factors for only protein_coding genes

ddw_mRNAs <- ddw[ rowData(ddw)[,"gene_type"] == "protein_coding", ]
ddw_mRNAs <- estimateSizeFactors(ddw_mRNAs)

1.4.2 estimate size factors without significant windows

ddw_tmp <- ddw
ddw_tmp <- estimateDispersions(ddw_tmp, fitType = "local", quiet = TRUE)
if(LRT){
  ddw_tmp <- nbinomLRT(ddw_tmp,full = ~type,reduced = ~1)
}else{
  ddw_tmp <- nbinomWaldTest(ddw_tmp)
}

tmp_significant_windows <- 
                results(ddw_tmp,
                    contrast = c("type", "IP", "SMI"),
                    tidy = TRUE,
                    filterFun = ihw) %>% 
                dplyr::filter(padj < p_value_cutoff) %>% 
                .[["row"]]
rm(ddw_tmp)

estimate the size factors without the significant windows.

ddw_mRNAs <- ddw_mRNAs[ !rownames(ddw_mRNAs) %in% tmp_significant_windows, ]
ddw_mRNAs <- estimateSizeFactors(ddw_mRNAs)

before thresholding:

dim(ddw)
## [1] 3245    3

Now threshold the windows read count table.

keep_exp <-  which(rowSums(counts(ddw)>minCount)>=minSample)
ddw <- ddw[keep_exp,]

after thresholding:

dim(ddw)
## [1] 1008    3

assign size factors

sizeFactors(ddw) <- sizeFactors(ddw_mRNAs)
rm( list = c("tmp_significant_windows", "ddw_mRNAs"))
sizeFactors(ddw)
## Rbp_ctrl_rep1  Rbp_exp_rep1  Rbp_exp_rep2 
##     0.6299605     1.2599210     1.3867225

1.5 Differential window analysis

1.5.1 Dispersion estimates

We fit parametric and local fit, and decide the best fit following this Bioconductor post

parametric_ddw  <- estimateDispersions(ddw, fitType="parametric")
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
if(decide_fit){
  local_ddw  <- estimateDispersions(ddw, fitType="local")
}
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates

This is the dispersion estimate for parametric fit

plotDispEsts(parametric_ddw, main="Parametric fit")

This is the dispersion estimate for local fit, given automated decision fitting is enabled:

if(decide_fit){
  plotDispEsts(local_ddw, main="Local fit")
}

This will get the residuals for either fit, only for automated decision fitting

parametricResid <- na.omit(with(mcols(parametric_ddw),abs(log(dispGeneEst)-log(dispFit))))
if(decide_fit){
  localResid <- na.omit(with(mcols(local_ddw),abs(log(dispGeneEst)-log(dispFit))))
  residDf <- data.frame(residuals=c(parametricResid,localResid),fitType=c(rep("parametric",length(parametricResid)),rep("local",length(localResid))))
  summary(residDf)
}
##    residuals           fitType         
##  Min.   : 0.000899   Length:2016       
##  1st Qu.:17.894458   Class :character  
##  Median :18.468848   Mode  :character  
##  Mean   :16.776221                     
##  3rd Qu.:18.926598                     
##  Max.   :20.945520

and we plot histograms of the fits

if(decide_fit){
  ggplot(residDf, aes(x = residuals, fill = fitType)) + scale_fill_manual(values = c("darkred", "darkblue")) + geom_histogram(alpha = 0.5, position='identity', bins = 100) + theme_bw()
}

Now, we will decide for the better fit based on median

summary(parametricResid)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.09595 18.18429 18.59938 16.85875 18.92660 19.52154
if(decide_fit){
  summary(localResid)
  if (median(localResid) <= median(parametricResid)){
    cat("chosen fitType: local")
    ddw <- local_ddw
  }else{
    cat("chosen fitType: parametric")
    ddw <- parametric_ddw
  }
  rm(local_ddw,parametric_ddw,residDf,parametricResid,localResid)
}else{
  ddw <- parametric_ddw
  rm(parametric_ddw)
}
## chosen fitType: local

1.5.2 Wald test or LRT

if(LRT){
  ddw <- nbinomLRT(ddw,full = ~type, reduced = ~1)
}else{
  ddw <- nbinomWaldTest(ddw)
}

1.5.3 Significance testing

resultWindows <- resultsDEWSeq(ddw,
                              contrast = c("type", "IP", "SMI"),
                              tidy = TRUE) %>% as_tibble

resultWindows
## # A tibble: 672 × 20
##    chromosome   begin   end width strand uniqu…¹ gene_id gene_…² gene_…³ gene_…⁴
##    <chr>        <dbl> <int> <int> <chr>  <chr>   <chr>   <chr>   <chr>   <chr>  
##  1 Synechocyst…  2467  2517    50 +      TU5005… TU5005  TU5005  protei… exon   
##  2 Synechocyst…  2487  2537    50 +      TU5005… TU5005  TU5005  protei… exon   
##  3 Synechocyst…  2507  2557    50 +      TU5005… TU5005  TU5005  protei… exon   
##  4 Synechocyst…  2527  2577    50 +      TU5005… TU5005  TU5005  protei… exon   
##  5 Synechocyst…  2547  2597    50 +      TU5005… TU5005  TU5005  protei… exon   
##  6 Synechocyst…  2567  2617    50 +      TU5005… TU5005  TU5005  protei… exon   
##  7 Synechocyst…  2587  2637    50 +      TU5005… TU5005  TU5005  protei… exon   
##  8 Synechocyst…  2607  2657    50 +      TU5005… TU5005  TU5005  protei… exon   
##  9 Synechocyst…  2627  2677    50 +      TU5005… TU5005  TU5005  protei… exon   
## 10 Synechocyst…  2647  2697    50 +      TU5005… TU5005  TU5005  protei… exon   
## # … with 662 more rows, 10 more variables: Nr_of_region <int>,
## #   Total_nr_of_region <int>, window_number <int>, baseMean <dbl>,
## #   log2FoldChange <dbl>, lfcSE <dbl>, stat <dbl>, pvalue <dbl>,
## #   pSlidingWindows <dbl>, pSlidingWindows.adj <dbl>, and abbreviated variable
## #   names ¹​unique_id, ²​gene_name, ³​gene_type, ⁴​gene_region

1.5.4 Multiple hypothesis correction with IHW

You might be interested to correct for multiple hypothesis testing with IHW.

Decide on overlap correction based on the parameter overlap_correction

if(overlap_correction & IHW){
  resultWindows[,"p_adj_IHW"] <- adj_pvalues(ihw(pSlidingWindows ~ baseMean, 
                     data = resultWindows,
                     alpha = p_value_cutoff,
                     nfolds = 10))
  padjCol <- "p_adj_IHW"
}else if(!overlap_correction & IHW){
  resultWindows[,"p_adj_IHW"] <- adj_pvalues(ihw(pvalue ~ baseMean, 
                     data = resultWindows,
                     alpha = p_value_cutoff,
                     nfolds = 10))
  padjCol <- "p_adj_IHW"
}else if(overlap_correction & !IHW){
  padjCol <- "pSlidingWindows.adj"
}else{
  resultWindows[,'p_adj'] <- p.adjust(resultWindows$pvalue,method="BH")
  padjCol <- 'p_adj'
}
## Only 1 bin; IHW reduces to Benjamini Hochberg (uniform weights)

Determine significant windows

resultWindows <- resultWindows %>% 
                      mutate(significant = resultWindows[ ,padjCol] < p_value_cutoff)
sigWindows <- sum(resultWindows$significant)

672 windows are significant

resultWindows %>%
   filter(significant) %>% 
   arrange(desc(log2FoldChange)) %>% 
   .[["gene_name"]] %>% 
   unique %>% 
   head(20)
##  [1] "TU5062" "TU5031" "TU5077" "TU5086" "TU5122" "TU5102" "TU5088" "TU5089"
##  [9] "TU5111" "TU5120" "TU5094" "TU5119" "TU5124" "TU5082" "TU5032" "TU5054"
## [17] "TU5036" "TU5092" "TU5106" "TU5013"

1.5.5 Combining windows

if(sigWindows>0){
    resultRegions <- extractRegions(windowRes  = resultWindows, padjCol    = padjCol, padjThresh = p_value_cutoff, log2FoldChangeThresh = lfc_cutoff) %>% as_tibble
}

1.5.6 Writing Bed file

if(sigWindows>1){
    toBED(windowRes = resultWindows, regionRes = resultRegions,padjThresh=p_value_cutoff,
        padjCol   = padjCol, fileName  = output_bed_file)
}else{
  message("This analysis does not have enough <=2 significant windows to create BED file for visualization")
}

1.6 Save Session

# save enriched windows, gzip results file if the file suffix is .gz
if(grepl("\\.gz$",output_windows_file,ignore.case = TRUE)){
  gz_out <- gzfile(output_windows_file,"w")
  write.table(resultWindows,file=gz_out,sep="\t",quote=FALSE,row.names=FALSE,col.names=TRUE)
  close(gz_out)
}else{
  write.table(resultWindows,file=output_windows_file,sep="\t",quote=FALSE,row.names=FALSE,col.names=TRUE)
}
# save enriched regions
if(sigWindows>0){
  if(grepl("\\.gz$",output_regions_file,ignore.case = TRUE)){
    gz_out <- gzfile(output_regions_file,"w")
    write.table(resultRegions,file=gz_out,sep="\t",quote=FALSE,row.names=FALSE,col.names=TRUE)
    close(gz_out)
  }else{
    write.table(resultRegions,file=output_regions_file,sep="\t",quote=FALSE,row.names=FALSE,col.names=TRUE)
  }
}
# save session
# Warning! session images can be heavy!
if(nchar(output_Rdata)>5){
  save.image( file = output_Rdata)
}

1.7 Session Info

sessionInfo()
## R version 4.1.3 (2022-03-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.1 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /home/malong/miniconda3/envs/dewseq/lib/libopenblasp-r0.3.21.so
## 
## locale:
##  [1] LC_CTYPE=de_DE.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=de_DE.UTF-8        LC_COLLATE=de_DE.UTF-8    
##  [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=de_DE.UTF-8   
##  [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] IHW_1.22.0                  data.table_1.14.2          
##  [3] forcats_0.5.2               stringr_1.4.1              
##  [5] dplyr_1.0.10                purrr_0.3.5                
##  [7] readr_2.1.3                 tidyr_1.2.1                
##  [9] tibble_3.1.8                ggplot2_3.3.6              
## [11] tidyverse_1.3.2             DEWSeq_1.8.0               
## [13] BiocParallel_1.28.3         DESeq2_1.34.0              
## [15] SummarizedExperiment_1.24.0 Biobase_2.54.0             
## [17] MatrixGenerics_1.6.0        matrixStats_0.62.0         
## [19] GenomicRanges_1.46.1        GenomeInfoDb_1.30.0        
## [21] IRanges_2.28.0              S4Vectors_0.32.4           
## [23] BiocGenerics_0.40.0         R.utils_2.12.0             
## [25] R.oo_1.25.0                 R.methodsS3_1.8.2          
## [27] BiocStyle_2.22.0           
## 
## loaded via a namespace (and not attached):
##  [1] googledrive_2.0.0      colorspace_2.0-3       ellipsis_0.3.2        
##  [4] XVector_0.34.0         fs_1.5.2               farver_2.1.1          
##  [7] bit64_4.0.5            AnnotationDbi_1.56.1   fansi_1.0.3           
## [10] lubridate_1.8.0        xml2_1.3.3             splines_4.1.3         
## [13] cachem_1.0.6           geneplotter_1.72.0     knitr_1.40            
## [16] jsonlite_1.8.2         broom_1.0.1            annotate_1.72.0       
## [19] dbplyr_2.2.1           png_0.1-7              BiocManager_1.30.18   
## [22] compiler_4.1.3         httr_1.4.4             backports_1.4.1       
## [25] assertthat_0.2.1       Matrix_1.4-1           fastmap_1.1.0         
## [28] gargle_1.2.1           cli_3.4.1              htmltools_0.5.3       
## [31] tools_4.1.3            gtable_0.3.1           glue_1.6.2            
## [34] GenomeInfoDbData_1.2.7 Rcpp_1.0.9             slam_0.1-50           
## [37] cellranger_1.1.0       jquerylib_0.1.4        vctrs_0.4.2           
## [40] Biostrings_2.62.0      xfun_0.33              rvest_1.0.3           
## [43] lifecycle_1.0.3        XML_3.99-0.11          googlesheets4_1.0.1   
## [46] zlibbioc_1.40.0        scales_1.2.1           hms_1.1.2             
## [49] parallel_4.1.3         RColorBrewer_1.1-3     yaml_2.3.5            
## [52] memoise_2.0.1          sass_0.4.2             stringi_1.7.8         
## [55] RSQLite_2.2.8          highr_0.9              genefilter_1.76.0     
## [58] rlang_1.0.6            pkgconfig_2.0.3        bitops_1.0-7          
## [61] lpsymphony_1.22.0      evaluate_0.17          lattice_0.20-45       
## [64] labeling_0.4.2         bit_4.0.4              tidyselect_1.1.2      
## [67] magrittr_2.0.3         bookdown_0.29          R6_2.5.1              
## [70] generics_0.1.3         DelayedArray_0.20.0    DBI_1.1.3             
## [73] pillar_1.8.1           haven_2.5.1            withr_2.5.0           
## [76] survival_3.4-0         KEGGREST_1.34.0        RCurl_1.98-1.9        
## [79] modelr_0.1.9           crayon_1.5.2           fdrtool_1.2.17        
## [82] utf8_1.2.2             tzdb_0.3.0             rmarkdown_2.17        
## [85] locfit_1.5-9.6         grid_4.1.3             readxl_1.4.1          
## [88] blob_1.2.3             reprex_2.0.2           digest_0.6.29         
## [91] xtable_1.8-4           munsell_0.5.0          bslib_0.4.0