Mercurial > repos > mingchen0919 > rmarkdown_fastqc_site
diff 01_evaluation_overview.Rmd @ 11:507eec497730 draft
update fastqc site
author | mingchen0919 |
---|---|
date | Tue, 07 Nov 2017 16:52:24 -0500 |
parents | d732d4526c6d |
children |
line wrap: on
line diff
--- a/01_evaluation_overview.Rmd Tue Aug 15 15:50:21 2017 -0400 +++ b/01_evaluation_overview.Rmd Tue Nov 07 16:52:24 2017 -0500 @@ -1,123 +1,124 @@ --- -title: "Evaluation Overview" -output: html_document +title: 'Short reads evaluation with [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)' +output: + html_document: + number_sections: true + toc: true + theme: cosmo + highlight: tango --- ```{r setup, include=FALSE, warning=FALSE, message=FALSE} -knitr::opts_chunk$set(echo = ECHO) -``` - -```{bash 'copy data from datasets directory to working directory', echo=FALSE} -# Copy uploaded data to the working directory -for f in $(echo READS | sed "s/,/ /g") -do - cp $f ./ -done -``` - -```{bash 'run fastqc', echo=FALSE} -# run fastqc and place outputs into the report directory -for r in $(ls *.dat) -do - fastqc -o REPORT_OUTPUT_DIR $r > /dev/null 2>&1 -done -``` - -```{bash 'parse fastqc results', echo=FALSE} -##==== copy fastqc generated zip files from report output directory to job work directory == -cp -r REPORT_OUTPUT_DIR/*zip ./ - -# create a file to store data file paths -echo "sample_id,file_path" > PWF_file_paths.txt # Pass, Warning, Fail -echo "sample_id,file_path" > PBQS_file_paths.txt # Per Base Quality Score -echo "sample_id,file_path" > PSQS_file_paths.txt # Per Sequence Quality Score -echo "sample_id,file_path" > PSGC_file_paths.txt # Per Sequence GC Content -echo "sample_id,file_path" > PBSC_file_paths.txt # Per Base Sequence Content -echo "sample_id,file_path" > PBNC_file_paths.txt # Per Base N Content -echo "sample_id,file_path" > SDL_file_paths.txt # Sequence Duplication Level -echo "sample_id,file_path" > SLD_file_paths.txt # Sequence Length Distribution -echo "sample_id,file_path" > KMC_file_paths.txt # Kmer Content - -for i in $(ls *.zip) -do - BASE=$(echo $i | sed 's/\(.*\)\.zip/\1/g') - echo $BASE - unzip ${BASE}.zip > /dev/null 2>&1 - - ##====== pass,warning,fail (WSF) ============= - awk '/^>>/ {print}' "$BASE"/fastqc_data.txt | grep -v 'END_MODULE' | sed 's/>>//' > "$BASE"-PWF.txt - echo "${BASE},${BASE}-PWF.txt" >> PWF_file_paths.txt - - ##====== per base quality scores (PBQS) ====== - awk '/^>>Per base sequence quality/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBQS.txt - echo "${BASE},${BASE}-PBQS.txt" >> PBQS_file_paths.txt - - ##====== per sequence quality scores (PSQS) - awk '/^>>Per sequence quality scores/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PSQS.txt - echo "${BASE},${BASE}-PSQS.txt" >> PSQS_file_paths.txt - - ##====== Per sequence GC content (PSGC) - awk '/^>>Per sequence GC content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PSGC.txt - echo "${BASE},${BASE}-PSGC.txt" >> PSGC_file_paths.txt - - ##====== Per Base Sequence Content (PBSC) - awk '/^>>Per base sequence content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBSC.txt - echo "${BASE},${BASE}-PBSC.txt" >> PBSC_file_paths.txt - - ##====== Per Base N Content (PBNC) - awk '/^>>Per base N content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBNC.txt - echo "${BASE},${BASE}-PBNC.txt" >> PBNC_file_paths.txt - - ##====== Sequence Duplication Level (SDL) - awk '/^>>Sequence Duplication Levels/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-SDL.txt - echo "${BASE},${BASE}-SDL.txt" >> SDL_file_paths.txt - - ##====== Sequence Length Distribution (SLD) - awk '/^>>Sequence Length Distribution/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-SLD.txt - echo "${BASE},${BASE}-SLD.txt" >> SLD_file_paths.txt - - ##====== Kmer Content ============ - awk '/^>>Kmer Content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-KMC.txt - echo "${BASE},${BASE}-KMC.txt" >> KMC_file_paths.txt - -done +knitr::opts_chunk$set( + echo = ECHO, + error = TRUE +) ``` -## Evaluation Overview +# Fastqc Evaluation + +## Evaluation of reads before trimming -```{r 'overview'} -PWF_file_paths = read.csv('PWF_file_paths.txt', - header = TRUE, stringsAsFactors = FALSE) -rm('PWF_df') -for(i in 1:nrow(PWF_file_paths)) { - file_path = PWF_file_paths[i,2] - pwf_df = read.csv(file_path, - sep='\t', header=FALSE, stringsAsFactors = FALSE) - colnames(pwf_df) = c('item', PWF_file_paths[i,1]) - if (!exists('PWF_df')) { - PWF_df = pwf_df - } else { - PWF_df = cbind(PWF_df, pwf_df[,2,drop=FALSE]) - } +```{r} +if ('READS_1' == 'None') { + stop("No pre-trimming reads provided!") +} else { + ## run fastqc evaluation + fastqc_command = paste0('fastqc ') %>% + (function(x) { + ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x) + }) %>% + (function(x) { + ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x) + }) %>% + (function(x) { + paste0(x, '-o REPORT_DIR ') + }) + fastqc_command_reads_1 = paste0(fastqc_command, 'READS_1 > /dev/null 2>&1') + system(fastqc_command_reads_1, intern = TRUE) + + # Original html report + reads_1_base = tail(strsplit('READS_1', '/')[[1]], 1) + original_html = tags$a(href=paste0(reads_1_base, '_fastqc.html'), paste0('HTML report: ', opt$name_1)) + + unzip(paste0('REPORT_DIR/', reads_1_base, '_fastqc.zip'), exdir = 'REPORT_DIR') + reads_1_unzip = paste0('REPORT_DIR/', reads_1_base, '_fastqc/') + # fastqc_data.txt + file.copy(paste0(reads_1_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_1_fastqc_data.txt') + fastqc_data = tags$a(href='reads_1_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_1)) + # summary.txt + file.copy(paste0(reads_1_unzip, 'summary.txt'), 'REPORT_DIR/reads_1_summary.txt') + summary_data = tags$a(href='reads_1_summary.txt', paste0('summary.txt: ', opt$name_1)) + + tags$ul( + tags$li(original_html), + tags$li(fastqc_data), + tags$li(summary_data) + ) } ``` +## Evaluation of reads after trimming + ```{r} -my_icon = c('ok', 'remove', 'star') -names(my_icon) = c('pass', 'fail', 'warn') -evaluate_list = list() -for (i in colnames(PWF_df)[-1]) { - evaluate_list[[i]] = formatter( - "span", - style = x ~ style("background-color" = ifelse(x =='pass', '#9CD027', ifelse(x == 'fail', '#CC0000', '#FF4E00')), - "color" = "white", - "width" = "50px", - "float" = "left", - "padding-right" = "5px") - ) +if ('READS_2' == 'None') { + stop("No pre-trimming reads provided!") +} else { + ## run fastqc evaluation + fastqc_command = paste0('fastqc ') %>% + (function(x) { + ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x) + }) %>% + (function(x) { + ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x) + }) %>% + (function(x) { + paste0(x, '-o REPORT_DIR ') + }) + fastqc_command_reads_2 = paste0(fastqc_command, 'READS_2 > /dev/null 2>&1') + system(fastqc_command_reads_2, intern = TRUE) + + # Original html report + reads_2_base = tail(strsplit('READS_2', '/')[[1]], 1) + original_html = tags$a(href=paste0(reads_2_base, '_fastqc.html'), paste0('HTML report: ', opt$name_2)) + + unzip(paste0('REPORT_DIR/', reads_2_base, '_fastqc.zip'), exdir = 'REPORT_DIR') + reads_2_unzip = paste0('REPORT_DIR/', reads_2_base, '_fastqc/') + # fastqc_data.txt + file.copy(paste0(reads_2_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_2_fastqc_data.txt') + fastqc_data = tags$a(href='reads_2_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_2)) + # summary.txt + file.copy(paste0(reads_2_unzip, 'summary.txt'), 'REPORT_DIR/reads_2_summary.txt') + summary_data = tags$a(href='reads_2_summary.txt', paste0('summary.txt: ', opt$name_2)) + + tags$ul( + tags$li(original_html), + tags$li(fastqc_data), + tags$li(summary_data) + ) } +``` -formattable(PWF_df, evaluate_list) + + +# Fastqc output visualization + +## Overview + +```{r} +reads_1_summary = read.csv('REPORT_DIR/reads_1_summary.txt', header = FALSE, sep = '\t')[, 2:1] +reads_2_summary = read.csv('REPORT_DIR/reads_2_summary.txt', header = FALSE, sep = '\t')[, 1] +combined_summary = cbind(reads_1_summary, reads_2_summary) +names(combined_summary) = c('MODULE', paste0(opt$name_1, '(before)'), paste0(opt$name_2, '(after)')) +combined_summary[combined_summary == 'FAIL'] = 'FAIL (X)' +combined_summary[combined_summary == 'WARN'] = 'WARN (!)' +knitr::kable(combined_summary) +``` + +# Session Info + +```{r 'session info'} +sessionInfo() ``` \ No newline at end of file