rmarkdown_fastqc_site: 01_evaluation

comparison 01_evaluation_overview.Rmd @ 11:507eec497730 draft

update fastqc site

author	mingchen0919
date	Tue, 07 Nov 2017 16:52:24 -0500
parents	d732d4526c6d
children

comparison

equal deleted inserted replaced

-:600c39b11913
+:507eec497730
 ---
-title: "Evaluation Overview"
+title: 'Short reads evaluation with [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)'
-output: html_document
+output:
+html_document:
+number_sections: true
+toc: true
+theme: cosmo
+highlight: tango
 ---
 ```{r setup, include=FALSE, warning=FALSE, message=FALSE}
-knitr::opts_chunk$set(echo = ECHO)
+knitr::opts_chunk$set(
-```
+echo = ECHO,
+error = TRUE
-```{bash 'copy data from datasets directory to working directory', echo=FALSE}
+)
-# Copy uploaded data to the working directory
-for f in $(echo READS | sed "s/,/ /g")
-do
-cp $f ./
-done
-```
-```{bash 'run fastqc', echo=FALSE}
-# run fastqc and place outputs into the report directory
-for r in $(ls *.dat)
-do
-fastqc -o REPORT_OUTPUT_DIR $r > /dev/null 2>&1
-done
-```
-```{bash 'parse fastqc results', echo=FALSE}
-##==== copy fastqc generated zip files from report output directory to job work directory ==
-cp -r REPORT_OUTPUT_DIR/*zip ./
-# create a file to store data file paths
-echo "sample_id,file_path" > PWF_file_paths.txt # Pass, Warning, Fail
-echo "sample_id,file_path" > PBQS_file_paths.txt # Per Base Quality Score
-echo "sample_id,file_path" > PSQS_file_paths.txt # Per Sequence Quality Score
-echo "sample_id,file_path" > PSGC_file_paths.txt # Per Sequence GC Content
-echo "sample_id,file_path" > PBSC_file_paths.txt # Per Base Sequence Content
-echo "sample_id,file_path" > PBNC_file_paths.txt # Per Base N Content
-echo "sample_id,file_path" > SDL_file_paths.txt # Sequence Duplication Level
-echo "sample_id,file_path" > SLD_file_paths.txt # Sequence Length Distribution
-echo "sample_id,file_path" > KMC_file_paths.txt # Kmer Content
-for i in $(ls *.zip)
-do
-BASE=$(echo $i | sed 's/\(.*\)\.zip/\1/g')
-echo $BASE
-unzip ${BASE}.zip > /dev/null 2>&1
-##====== pass,warning,fail (WSF) =============
-awk '/^>>/ {print}' "$BASE"/fastqc_data.txt | grep -v 'END_MODULE' | sed 's/>>//' > "$BASE"-PWF.txt
-echo "${BASE},${BASE}-PWF.txt" >> PWF_file_paths.txt
-##====== per base quality scores (PBQS) ======
-awk '/^>>Per base sequence quality/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBQS.txt
-echo "${BASE},${BASE}-PBQS.txt" >> PBQS_file_paths.txt
-##====== per sequence quality scores (PSQS)
-awk '/^>>Per sequence quality scores/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PSQS.txt
-echo "${BASE},${BASE}-PSQS.txt" >> PSQS_file_paths.txt
-##====== Per sequence GC content (PSGC)
-awk '/^>>Per sequence GC content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PSGC.txt
-echo "${BASE},${BASE}-PSGC.txt" >> PSGC_file_paths.txt
-##====== Per Base Sequence Content (PBSC)
-awk '/^>>Per base sequence content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBSC.txt
-echo "${BASE},${BASE}-PBSC.txt" >> PBSC_file_paths.txt
-##====== Per Base N Content (PBNC)
-awk '/^>>Per base N content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBNC.txt
-echo "${BASE},${BASE}-PBNC.txt" >> PBNC_file_paths.txt
-##====== Sequence Duplication Level (SDL)
-awk '/^>>Sequence Duplication Levels/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-SDL.txt
-echo "${BASE},${BASE}-SDL.txt" >> SDL_file_paths.txt
-##====== Sequence Length Distribution (SLD)
-awk '/^>>Sequence Length Distribution/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-SLD.txt
-echo "${BASE},${BASE}-SLD.txt" >> SLD_file_paths.txt
-##====== Kmer Content ============
-awk '/^>>Kmer Content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-KMC.txt
-echo "${BASE},${BASE}-KMC.txt" >> KMC_file_paths.txt
-done
 ```
-## Evaluation Overview
+# Fastqc Evaluation
-```{r 'overview'}
+## Evaluation of reads before trimming
-PWF_file_paths = read.csv('PWF_file_paths.txt',
-header = TRUE, stringsAsFactors = FALSE)
+```{r}
-rm('PWF_df')
+if ('READS_1' == 'None') {
-for(i in 1:nrow(PWF_file_paths)) {
+stop("No pre-trimming reads provided!")
-file_path = PWF_file_paths[i,2]
+} else {
-pwf_df = read.csv(file_path,
+## run fastqc evaluation
-sep='\t', header=FALSE, stringsAsFactors = FALSE)
+fastqc_command = paste0('fastqc ') %>%
-colnames(pwf_df) = c('item', PWF_file_paths[i,1])
+(function(x) {
-if (!exists('PWF_df')) {
+ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x)
-PWF_df = pwf_df
+}) %>%
-} else {
+(function(x) {
-PWF_df = cbind(PWF_df, pwf_df[,2,drop=FALSE])
+ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x)
-}
+}) %>%
+(function(x) {
+paste0(x, '-o REPORT_DIR ')
+})
+fastqc_command_reads_1 = paste0(fastqc_command, 'READS_1 > /dev/null 2>&1')
+system(fastqc_command_reads_1, intern = TRUE)
+# Original html report
+reads_1_base = tail(strsplit('READS_1', '/')[[1]], 1)
+original_html = tags$a(href=paste0(reads_1_base, '_fastqc.html'), paste0('HTML report: ', opt$name_1))
+unzip(paste0('REPORT_DIR/', reads_1_base, '_fastqc.zip'), exdir = 'REPORT_DIR')
+reads_1_unzip = paste0('REPORT_DIR/', reads_1_base, '_fastqc/')
+# fastqc_data.txt
+file.copy(paste0(reads_1_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_1_fastqc_data.txt')
+fastqc_data = tags$a(href='reads_1_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_1))
+# summary.txt
+file.copy(paste0(reads_1_unzip, 'summary.txt'), 'REPORT_DIR/reads_1_summary.txt')
+summary_data = tags$a(href='reads_1_summary.txt', paste0('summary.txt: ', opt$name_1))
+tags$ul(
+tags$li(original_html),
+tags$li(fastqc_data),
+tags$li(summary_data)
+)
 }
 ```
+## Evaluation of reads after trimming
 ```{r}
-my_icon = c('ok', 'remove', 'star')
+if ('READS_2' == 'None') {
-names(my_icon) = c('pass', 'fail', 'warn')
+stop("No pre-trimming reads provided!")
-evaluate_list = list()
+} else {
-for (i in colnames(PWF_df)[-1]) {
+## run fastqc evaluation
-evaluate_list[[i]] = formatter(
+fastqc_command = paste0('fastqc ') %>%
-"span",
+(function(x) {
-style = x ~ style("background-color" = ifelse(x =='pass', '#9CD027', ifelse(x == 'fail', '#CC0000', '#FF4E00')),
+ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x)
-"color" = "white",
+}) %>%
-"width" = "50px",
+(function(x) {
-"float" = "left",
+ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x)
-"padding-right" = "5px")
+}) %>%
-)
+(function(x) {
+paste0(x, '-o REPORT_DIR ')
+})
+fastqc_command_reads_2 = paste0(fastqc_command, 'READS_2 > /dev/null 2>&1')
+system(fastqc_command_reads_2, intern = TRUE)
+# Original html report
+reads_2_base = tail(strsplit('READS_2', '/')[[1]], 1)
+original_html = tags$a(href=paste0(reads_2_base, '_fastqc.html'), paste0('HTML report: ', opt$name_2))
+unzip(paste0('REPORT_DIR/', reads_2_base, '_fastqc.zip'), exdir = 'REPORT_DIR')
+reads_2_unzip = paste0('REPORT_DIR/', reads_2_base, '_fastqc/')
+# fastqc_data.txt
+file.copy(paste0(reads_2_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_2_fastqc_data.txt')
+fastqc_data = tags$a(href='reads_2_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_2))
+# summary.txt
+file.copy(paste0(reads_2_unzip, 'summary.txt'), 'REPORT_DIR/reads_2_summary.txt')
+summary_data = tags$a(href='reads_2_summary.txt', paste0('summary.txt: ', opt$name_2))
+tags$ul(
+tags$li(original_html),
+tags$li(fastqc_data),
+tags$li(summary_data)
+)
 }
+```
-formattable(PWF_df, evaluate_list)
+# Fastqc output visualization
+## Overview
+```{r}
+reads_1_summary = read.csv('REPORT_DIR/reads_1_summary.txt', header = FALSE, sep = '\t')[, 2:1]
+reads_2_summary = read.csv('REPORT_DIR/reads_2_summary.txt', header = FALSE, sep = '\t')[, 1]
+combined_summary = cbind(reads_1_summary, reads_2_summary)
+names(combined_summary) = c('MODULE', paste0(opt$name_1, '(before)'), paste0(opt$name_2, '(after)'))
+combined_summary[combined_summary == 'FAIL'] = 'FAIL (X)'
+combined_summary[combined_summary == 'WARN'] = 'WARN (!)'
+knitr::kable(combined_summary)
 ```
+# Session Info
+```{r 'session info'}
+sessionInfo()
+```

Mercurial > repos > mingchen0919 > rmarkdown_fastqc_site

comparison 01_evaluation_overview.Rmd @ 11:507eec497730 draft