# HG changeset patch # User mingchen0919 # Date 1508592349 14400 # Node ID 1710b0e874f1f5e434b519d48d94533d759875e7 # Parent d1d20f3416326d70afb7d6bf4ac5914ddf369f5b fix file name issue diff -r d1d20f341632 -r 1710b0e874f1 fastqc_report.Rmd --- a/fastqc_report.Rmd Thu Oct 19 00:11:14 2017 -0400 +++ b/fastqc_report.Rmd Sat Oct 21 09:25:49 2017 -0400 @@ -10,72 +10,112 @@ ```{r setup, include=FALSE, warning=FALSE, message=FALSE} knitr::opts_chunk$set( - echo = ECHO + echo = ECHO, + error = TRUE ) ``` -# Fastqc Analysis - -* Copy fastq files to job working directory +# Fastqc Evaluation -```{bash 'copy files'} -for f in $(echo READS | sed "s/,/ /g") -do - cp $f ./ -done -``` - -* Run fastqc +## Evaluation of reads before trimming -```{bash 'run fastqc'} -for r in $(ls *.dat) -do - fastqc -o REPORT_DIR $r > /dev/null 2>&1 -done -``` - -## Evaluation results - -```{r 'html report links'} -html_file = list.files('REPORT_DIR', pattern = '.*html') -tags$ul(tags$a(href=html_file, paste0('HTML report', opt$name))) +```{r} +if ('READS_1' == 'None') { + stop("No pre-trimming reads provided!") +} else { + ## run fastqc evaluation + fastqc_command = paste0('fastqc ') %>% + (function(x) { + ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x) + }) %>% + (function(x) { + ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x) + }) %>% + (function(x) { + paste0(x, '-o REPORT_DIR ') + }) + fastqc_command_reads_1 = paste0(fastqc_command, 'READS_1 > /dev/null 2>&1') + system(fastqc_command_reads_1, intern = TRUE) + + # Original html report + reads_1_base = tail(strsplit('READS_1', '/')[[1]], 1) + original_html = tags$a(href=paste0(reads_1_base, '_fastqc.html'), paste0('HTML report: ', opt$name_1)) + + unzip(paste0('REPORT_DIR/', reads_1_base, '_fastqc.zip'), exdir = 'REPORT_DIR') + reads_1_unzip = paste0('REPORT_DIR/', reads_1_base, '_fastqc/') + # fastqc_data.txt + file.copy(paste0(reads_1_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_1_fastqc_data.txt') + fastqc_data = tags$a(href='reads_1_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_1)) + # summary.txt + file.copy(paste0(reads_1_unzip, 'summary.txt'), 'REPORT_DIR/reads_1_summary.txt') + summary_data = tags$a(href='reads_1_summary.txt', paste0('summary.txt: ', opt$name_1)) + + tags$ul( + tags$li(original_html), + tags$li(fastqc_data), + tags$li(summary_data) + ) +} ``` -```{r 'extract fastqc_data.txt and summary.txt'} -# list all zip files -zip_file = list.files(path = 'REPORT_DIR', pattern = '.zip') -unzip(paste0('REPORT_DIR/', zip_file), exdir = 'REPORT_DIR') +## Evaluation of reads after trimming -unzip_directory = paste0(tail(strsplit(opt$reads, '/')[[1]], 1), '_fastqc/') -fastqc_data_txt_path = paste0('REPORT_DIR/', unzip_directory, 'fastqc_data.txt') -summary_txt_path = paste0('REPORT_DIR/', unzip_directory, 'summary.txt') +```{r} +if ('READS_2' == 'None') { + stop("No pre-trimming reads provided!") +} else { + ## run fastqc evaluation + fastqc_command = paste0('fastqc ') %>% + (function(x) { + ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x) + }) %>% + (function(x) { + ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x) + }) %>% + (function(x) { + paste0(x, '-o REPORT_DIR ') + }) + fastqc_command_reads_2 = paste0(fastqc_command, 'READS_2 > /dev/null 2>&1') + system(fastqc_command_reads_2, intern = TRUE) + + # Original html report + reads_2_base = tail(strsplit('READS_2', '/')[[1]], 1) + original_html = tags$a(href=paste0(reads_2_base, '_fastqc.html'), paste0('HTML report: ', opt$name_2)) + + unzip(paste0('REPORT_DIR/', reads_2_base, '_fastqc.zip'), exdir = 'REPORT_DIR') + reads_2_unzip = paste0('REPORT_DIR/', reads_2_base, '_fastqc/') + # fastqc_data.txt + file.copy(paste0(reads_2_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_2_fastqc_data.txt') + fastqc_data = tags$a(href='reads_2_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_2)) + # summary.txt + file.copy(paste0(reads_2_unzip, 'summary.txt'), 'REPORT_DIR/reads_2_summary.txt') + summary_data = tags$a(href='reads_2_summary.txt', paste0('summary.txt: ', opt$name_2)) + + tags$ul( + tags$li(original_html), + tags$li(fastqc_data), + tags$li(summary_data) + ) +} ``` -```{r 'summary.txt'} -tags$ul(tags$a(href=paste0(unzip_directory, 'summary.txt'), 'summary.txt')) -``` - - -```{r 'fastqc_data.txt'} -tags$ul(tags$a(href=paste0(unzip_directory, 'fastqc_data.txt'), 'fastqc_data.txt')) -``` - # Fastqc output visualization ## Overview ```{r} -# read.table(fastqc_data_txt_path) -summary_txt = read.csv(summary_txt_path, header = FALSE, sep = '\t')[, 2:1] -names(summary_txt) = c('MODULE', 'PASS/FAIL') -knitr::kable(summary_txt) +reads_1_summary = read.csv('REPORT_DIR/reads_1_summary.txt', header = FALSE, sep = '\t')[, 2:1] +reads_2_summary = read.csv('REPORT_DIR/reads_1_summary.txt', header = FALSE, sep = '\t')[, 1] +combined_summary = cbind(reads_1_summary, reads_2_summary) +names(combined_summary) = c('MODULE', paste0(opt$name_1, '(before)'), paste0(opt$name_2, '(after)')) +knitr::kable(combined_summary) ``` -## Summary by module {.tabset} +## Visualization by module {.tabset} * Define a function to extract outputs for each module from fastqc output @@ -93,16 +133,53 @@ ### Per base sequence quality -```{r} -pbsq = extract_data_module(fastqc_data_txt_path, 'Per base sequence quality') -knitr::kable(pbsq) +```{r 'per base sequence quality', fig.width=10} +## reads 1 +pbsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base sequence quality') +pbsq_1$id = 1:length(pbsq_1$X.Base) + +melt_pbsq_1 = filter(melt(pbsq_1, id=c('X.Base', 'id')), variable != 'X90th.Percentile' & variable != 'X10th.Percentile') +melt_pbsq_1$trim = 'before' + + +## reads 2 +pbsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base sequence quality') +pbsq_2$id = 1:length(pbsq_2$X.Base) + +melt_pbsq_2 = filter(melt(pbsq_2, id=c('X.Base', 'id')), variable != 'X90th.Percentile' & variable != 'X10th.Percentile') +melt_pbsq_2$trim = 'after' + +comb_pbsq = rbind(melt_pbsq_1, melt_pbsq_2) +comb_pbsq$trim = factor(levels = c('before', 'after'), comb_pbsq$trim) +p = ggplot(data = comb_pbsq) + + geom_line(mapping = aes(x = id, y = value, group = variable, color = variable)) + + scale_x_continuous(breaks = pbsq_2$id, labels = pbsq_2$X.Base) + + facet_grid(. ~ trim) + + theme(axis.text.x = element_text(angle=45)) +ggplotly(p) + ``` ### Per tile sequence quality -```{r} -ptsq = extract_data_module(fastqc_data_txt_path, 'Per tile sequence quality') -knitr::kable(ptsq) +```{r 'per tile sequence quality'} +## reads 1 +ptsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per tile sequence quality') +ptsq_1$trim = 'before' + +## reads 2 +ptsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per tile sequence quality') +ptsq_2$trim = 'after' + +comb_ptsq = rbind(ptsq_1, ptsq_2) +comb_ptsq$trim = factor(levels = c('before', 'after'), comb_ptsq$trim) +comb_pbsq$Base = factor(levels = unique(comb_ptsq$Base), comb_ptsq$Base) + +p = ggplot(data = comb_ptsq, aes(x = Base, y = X.Tile, fill = Mean)) + + geom_raster() + + facet_grid(. ~ trim) + + theme(axis.text.x = element_text(angle=45)) +ggplotly(p) ``` diff -r d1d20f341632 -r 1710b0e874f1 fastqc_report.xml --- a/fastqc_report.xml Thu Oct 19 00:11:14 2017 -0400 +++ b/fastqc_report.xml Sat Oct 21 09:25:49 2017 -0400 @@ -1,6 +1,7 @@ - Implements FastQC analysis and display results in R Markdown html. + Evaluate short reads with FastQC software on a single reads file or a paired of untrimmed and trimmed reads + files. pandoc @@ -29,8 +30,12 @@ Rscript '${__tool_directory__}/fastqc_report_render.R' -e $echo - -r $reads - -n $reads.name + -r $reads_1 + -n '$reads_1.name' + -R $reads_2 + -N '$reads_2.name' + -c $contaminants + -l $limits -o $report -d $report.files_path @@ -40,8 +45,23 @@ ]]> - + + + + diff -r d1d20f341632 -r 1710b0e874f1 fastqc_report_render.R --- a/fastqc_report_render.R Thu Oct 19 00:11:14 2017 -0400 +++ b/fastqc_report_render.R Sat Oct 21 09:25:49 2017 -0400 @@ -9,6 +9,7 @@ library(reshape2) library(plotly) library(formattable) +options(stringsAsFactors=FALSE, useFancyQuotes=FALSE) ##============ Sink warnings and errors to a file ============== ## use the sink() function to wrap all code within it. @@ -39,8 +40,12 @@ args_list=list() ##------- 1. input data --------------------- args_list$ECHO = c('echo', 'e', '1', 'character') - args_list$READS = c('reads', 'r', '1', 'character') - args_list$NAMES = c('names', 'n', '1', 'character') + args_list$READS_1 = c('reads_1', 'r', '1', 'character') + args_list$NAME_1 = c('name_1', 'n', '1', 'character') + args_list$READS_2 = c('reads_2', 'R', '1', 'character') + args_list$NAME_2 = c('name_2', 'N', '1', 'character') + args_list$CONTAMINANTS = c('contaminants', 'c', '1', 'character') + args_list$LIMITS = c('limits', 'l', '1', 'character') ##--------2. output report and outputs -------------- args_list$REPORT_HTML = c('report_html', 'o', '1', 'character') args_list$REPORT_DIR = c('report_dir', 'd', '1', 'character') @@ -66,7 +71,22 @@ gsub('ECHO', opt$echo, x) }) %>% (function(x) { - gsub('READS', opt$reads, x) + gsub('READS_1', opt$reads_1, x) + }) %>% + (function(x) { + gsub('NAME_1', opt$name_1, x) + }) %>% + (function(x) { + gsub('READS_2', opt$reads_2, x) + }) %>% + (function(x) { + gsub('NAME_2', opt$name_1, x) + }) %>% + (function(x) { + gsub('CONTAMINANTS', opt$contaminants, x) + }) %>% + (function(x) { + gsub('LIMITS', opt$limits, x) }) %>% (function(x) { gsub('REPORT_DIR', opt$report_dir, x)