diff 01_evaluation_overview.Rmd @ 11:507eec497730 draft

update fastqc site
author mingchen0919
date Tue, 07 Nov 2017 16:52:24 -0500
parents d732d4526c6d
children
line wrap: on
line diff
--- a/01_evaluation_overview.Rmd	Tue Aug 15 15:50:21 2017 -0400
+++ b/01_evaluation_overview.Rmd	Tue Nov 07 16:52:24 2017 -0500
@@ -1,123 +1,124 @@
 ---
-title: "Evaluation Overview"
-output: html_document
+title: 'Short reads evaluation with [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)'
+output:
+    html_document:
+      number_sections: true
+      toc: true
+      theme: cosmo
+      highlight: tango
 ---
 
 ```{r setup, include=FALSE, warning=FALSE, message=FALSE}
-knitr::opts_chunk$set(echo = ECHO)
-```
-
-```{bash 'copy data from datasets directory to working directory', echo=FALSE}
-# Copy uploaded data to the working directory
-for f in $(echo READS | sed "s/,/ /g")
-do
-    cp $f ./
-done
-```
-
-```{bash 'run fastqc', echo=FALSE}
-# run fastqc and place outputs into the report directory
-for r in $(ls *.dat)
-do
-    fastqc -o REPORT_OUTPUT_DIR $r > /dev/null 2>&1
-done
-```
-
-```{bash 'parse fastqc results', echo=FALSE}
-##==== copy fastqc generated zip files from report output directory to job work directory ==
-cp -r REPORT_OUTPUT_DIR/*zip ./
-
-# create a file to store data file paths
-echo "sample_id,file_path" > PWF_file_paths.txt # Pass, Warning, Fail
-echo "sample_id,file_path" > PBQS_file_paths.txt # Per Base Quality Score
-echo "sample_id,file_path" > PSQS_file_paths.txt # Per Sequence Quality Score
-echo "sample_id,file_path" > PSGC_file_paths.txt # Per Sequence GC Content
-echo "sample_id,file_path" > PBSC_file_paths.txt # Per Base Sequence Content
-echo "sample_id,file_path" > PBNC_file_paths.txt # Per Base N Content
-echo "sample_id,file_path" > SDL_file_paths.txt # Sequence Duplication Level
-echo "sample_id,file_path" > SLD_file_paths.txt # Sequence Length Distribution
-echo "sample_id,file_path" > KMC_file_paths.txt # Kmer Content
-
-for i in $(ls *.zip)
-do
-    BASE=$(echo $i | sed 's/\(.*\)\.zip/\1/g')
-    echo $BASE
-    unzip ${BASE}.zip > /dev/null 2>&1
-    
-    ##====== pass,warning,fail (WSF) =============
-    awk '/^>>/ {print}' "$BASE"/fastqc_data.txt | grep -v 'END_MODULE' | sed 's/>>//' > "$BASE"-PWF.txt
-    echo "${BASE},${BASE}-PWF.txt" >> PWF_file_paths.txt
-
-    ##====== per base quality scores (PBQS) ======
-    awk '/^>>Per base sequence quality/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBQS.txt
-    echo "${BASE},${BASE}-PBQS.txt" >> PBQS_file_paths.txt
-
-    ##====== per sequence quality scores (PSQS)
-    awk '/^>>Per sequence quality scores/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PSQS.txt
-    echo "${BASE},${BASE}-PSQS.txt" >> PSQS_file_paths.txt
-
-    ##====== Per sequence GC content (PSGC)
-    awk '/^>>Per sequence GC content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PSGC.txt
-    echo "${BASE},${BASE}-PSGC.txt" >> PSGC_file_paths.txt
-    
-    ##====== Per Base Sequence Content (PBSC)
-    awk '/^>>Per base sequence content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBSC.txt
-    echo "${BASE},${BASE}-PBSC.txt" >> PBSC_file_paths.txt
-    
-    ##====== Per Base N Content (PBNC)
-    awk '/^>>Per base N content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBNC.txt
-    echo "${BASE},${BASE}-PBNC.txt" >> PBNC_file_paths.txt
-    
-    ##====== Sequence Duplication Level (SDL)
-    awk '/^>>Sequence Duplication Levels/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-SDL.txt
-    echo "${BASE},${BASE}-SDL.txt" >> SDL_file_paths.txt
-    
-    ##====== Sequence Length Distribution (SLD)
-    awk '/^>>Sequence Length Distribution/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-SLD.txt
-    echo "${BASE},${BASE}-SLD.txt" >> SLD_file_paths.txt
-    
-    ##====== Kmer Content ============
-    awk '/^>>Kmer Content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-KMC.txt
-    echo "${BASE},${BASE}-KMC.txt" >> KMC_file_paths.txt
-    
-done
+knitr::opts_chunk$set(
+  echo = ECHO,
+  error = TRUE
+)
 ```
 
 
-## Evaluation Overview
+# Fastqc Evaluation
+
+## Evaluation of reads before trimming
 
-```{r 'overview'}
-PWF_file_paths = read.csv('PWF_file_paths.txt',
-                           header = TRUE, stringsAsFactors = FALSE)
-rm('PWF_df')
-for(i in 1:nrow(PWF_file_paths)) {
-  file_path = PWF_file_paths[i,2]
-  pwf_df = read.csv(file_path,
-                     sep='\t', header=FALSE, stringsAsFactors = FALSE)
-  colnames(pwf_df) = c('item', PWF_file_paths[i,1])
-  if (!exists('PWF_df')) {
-    PWF_df = pwf_df
-  } else {
-    PWF_df = cbind(PWF_df, pwf_df[,2,drop=FALSE])
-  }
+```{r}
+if ('READS_1' == 'None') {
+  stop("No pre-trimming reads provided!")
+} else {
+  ## run fastqc evaluation
+  fastqc_command = paste0('fastqc ') %>% 
+  (function(x) {
+    ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x)
+  }) %>% 
+  (function(x) {
+    ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x)
+  }) %>% 
+  (function(x) {
+    paste0(x, '-o REPORT_DIR ')
+  })
+  fastqc_command_reads_1 = paste0(fastqc_command, 'READS_1 > /dev/null 2>&1')
+  system(fastqc_command_reads_1, intern = TRUE)
+  
+  # Original html report
+  reads_1_base = tail(strsplit('READS_1', '/')[[1]], 1)
+  original_html = tags$a(href=paste0(reads_1_base, '_fastqc.html'), paste0('HTML report: ', opt$name_1))
+  
+  unzip(paste0('REPORT_DIR/', reads_1_base, '_fastqc.zip'), exdir = 'REPORT_DIR')
+  reads_1_unzip = paste0('REPORT_DIR/', reads_1_base, '_fastqc/')
+  # fastqc_data.txt
+  file.copy(paste0(reads_1_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_1_fastqc_data.txt')
+  fastqc_data = tags$a(href='reads_1_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_1))
+  # summary.txt
+  file.copy(paste0(reads_1_unzip, 'summary.txt'), 'REPORT_DIR/reads_1_summary.txt')
+  summary_data = tags$a(href='reads_1_summary.txt', paste0('summary.txt: ', opt$name_1))
+  
+  tags$ul(
+    tags$li(original_html),
+    tags$li(fastqc_data),
+    tags$li(summary_data)
+  )
 }
 ```
 
 
+## Evaluation of reads after trimming
+
 ```{r}
-my_icon = c('ok', 'remove', 'star')
-names(my_icon) = c('pass', 'fail', 'warn')
-evaluate_list = list()
-for (i in colnames(PWF_df)[-1]) {
-  evaluate_list[[i]] = formatter(
-      "span", 
-      style = x ~ style("background-color" = ifelse(x =='pass', '#9CD027', ifelse(x == 'fail', '#CC0000', '#FF4E00')), 
-                        "color" = "white",
-                        "width" = "50px",
-                        "float" = "left",
-                        "padding-right" = "5px")
-    )
+if ('READS_2' == 'None') {
+  stop("No pre-trimming reads provided!")
+} else {
+  ## run fastqc evaluation
+  fastqc_command = paste0('fastqc ') %>% 
+  (function(x) {
+    ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x)
+  }) %>% 
+  (function(x) {
+    ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x)
+  }) %>% 
+  (function(x) {
+    paste0(x, '-o REPORT_DIR ')
+  })
+  fastqc_command_reads_2 = paste0(fastqc_command, 'READS_2 > /dev/null 2>&1')
+  system(fastqc_command_reads_2, intern = TRUE)
+  
+  # Original html report
+  reads_2_base = tail(strsplit('READS_2', '/')[[1]], 1)
+  original_html = tags$a(href=paste0(reads_2_base, '_fastqc.html'), paste0('HTML report: ', opt$name_2))
+  
+  unzip(paste0('REPORT_DIR/', reads_2_base, '_fastqc.zip'), exdir = 'REPORT_DIR')
+  reads_2_unzip = paste0('REPORT_DIR/', reads_2_base, '_fastqc/')
+  # fastqc_data.txt
+  file.copy(paste0(reads_2_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_2_fastqc_data.txt')
+  fastqc_data = tags$a(href='reads_2_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_2))
+  # summary.txt
+  file.copy(paste0(reads_2_unzip, 'summary.txt'), 'REPORT_DIR/reads_2_summary.txt')
+  summary_data = tags$a(href='reads_2_summary.txt', paste0('summary.txt: ', opt$name_2))
+  
+  tags$ul(
+    tags$li(original_html),
+    tags$li(fastqc_data),
+    tags$li(summary_data)
+  )
 }
+```
 
-formattable(PWF_df, evaluate_list)
+
+
+# Fastqc output visualization
+
+## Overview
+
+```{r}
+reads_1_summary = read.csv('REPORT_DIR/reads_1_summary.txt', header = FALSE, sep = '\t')[, 2:1]
+reads_2_summary = read.csv('REPORT_DIR/reads_2_summary.txt', header = FALSE, sep = '\t')[, 1]
+combined_summary = cbind(reads_1_summary, reads_2_summary)
+names(combined_summary) = c('MODULE', paste0(opt$name_1, '(before)'), paste0(opt$name_2, '(after)'))
+combined_summary[combined_summary == 'FAIL'] = 'FAIL (X)'
+combined_summary[combined_summary == 'WARN'] = 'WARN (!)'
+knitr::kable(combined_summary)
+```
+
+# Session Info
+
+```{r 'session info'}
+sessionInfo()
 ```
\ No newline at end of file