Mercurial > repos > mingchen0919 > aurora_fastqc
diff rmarkdown_report.Rmd @ 0:0aeed70b3bc5 draft default tip
planemo upload commit 841d8b22bf9f1aaed6bfe8344b60617f45b275b2-dirty
author | mingchen0919 |
---|---|
date | Fri, 14 Dec 2018 00:38:44 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/rmarkdown_report.Rmd Fri Dec 14 00:38:44 2018 -0500 @@ -0,0 +1,458 @@ +--- +title: '[FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) report' +output: + html_document: + highlight: pygments +--- + + +```{r setup, include=FALSE, warning=FALSE, message=FALSE} +knitr::opts_chunk$set(error = TRUE, echo = FALSE) +``` + +```{css, echo=FALSE} +pre code, pre, code { + white-space: pre !important; + overflow-x: scroll !important; + word-break: keep-all !important; + word-wrap: initial !important; +} +``` + +```{r, echo=FALSE} +# to make the css theme to work, <link></link> tags cannot be added directly +# as <script></script> tags as below. +# it has to be added using a code chunk with the htmltool functions!!! +css_link = tags$link() +css_link$attribs = list(rel="stylesheet", href="vakata-jstree-3.3.5/dist/themes/default/style.min.css") +css_link +``` + +```{r, eval=FALSE, echo=FALSE} +# this code chunk is purely for adding comments +# below is to add jQuery and jstree javascripts +``` + +<script src="vakata-jstree-3.3.5/dist/jstree.min.js"></script> + + +```{r, eval=FALSE, echo=FALSE} +# this code chunk is purely for adding comments +# javascript code below is to build the file tree interface +# see this for how to implement opening hyperlink: https://stackoverflow.com/questions/18611317/how-to-get-i-get-leaf-nodes-in-jstree-to-open-their-hyperlink-when-clicked-when +``` +<script> + jQuery(function () { + // create an instance when the DOM is ready + jQuery('#jstree').jstree().bind("select_node.jstree", function (e, data) { + window.open( data.node.a_attr.href, data.node.a_attr.target ) + }); + }); +</script> + + +```{r, eval=FALSE, echo=FALSE} +--- +# ADD YOUR DATA ANALYSIS CODE AND MARKUP TEXT BELOW TO EXTEND THIS R MARKDOWN FILE +--- +``` + + +# Run FastQC + +```{bash} +sh ${TOOL_INSTALL_DIR}/build-and-run-job-scripts.sh +``` + +```{r echo=FALSE,results='asis'} +# display fastqc job script +cat('```bash\n') +cat(readLines(paste0(Sys.getenv('REPORT_FILES_PATH'), '/job-1-script.sh')), sep = '\n') +cat('\n```') +``` + +# Fastqc Output Visualization + +## Overview + +```{r eval=TRUE} +read_1_summary = read.csv(paste0(opt$X_d, '/read_1_fastqc/summary.txt'), + stringsAsFactors = FALSE, + header = FALSE, sep = '\t')[, 2:1] +read_2_summary = read.csv(paste0(opt$X_d, '/read_2_fastqc/summary.txt'), + stringsAsFactors = FALSE, + header = FALSE, sep = '\t')[, 1] +combined_summary = data.frame(read_1_summary, read_2_summary, stringsAsFactors = FALSE) +names(combined_summary) = c('MODULE', 'Pre-trimming', 'Post-trimming') +combined_summary[combined_summary == 'FAIL'] = 'FAIL (X)' +combined_summary[combined_summary == 'WARN'] = 'WARN (!)' +DT::datatable(combined_summary) +``` + +```{r 'function definition', echo=FALSE} +extract_data_module = function(fastqc_data, module_name, header = TRUE, comment.char = "") { + f = readLines(fastqc_data) + start_line = grep(module_name, f) + end_module_lines = grep('END_MODULE', f) + end_line = end_module_lines[which(end_module_lines > start_line)[1]] + module_data = f[(start_line+1):(end_line-1)] + writeLines(module_data, '/tmp/temp.txt') + read.csv('/tmp/temp.txt', sep = '\t', header = header, comment.char = comment.char) +} +``` + + +### Basic Statistics {.tabset} + +#### Before + +```{r} +fastqc_data_1 = paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt') +module_name = 'Basic Statistics pass' + +basic_statistics = extract_data_module(fastqc_data_1, module_name) +colnames(basic_statistics) = c('Measure', 'Value') +DT::datatable(basic_statistics) +``` + +#### After + +```{r} +fastqc_data_2 = paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt') +module_name = 'Basic Statistics pass' + +basic_statistics = extract_data_module(fastqc_data_2, module_name) +colnames(basic_statistics) = c('Measure', 'Value') +DT::datatable(basic_statistics) +``` + + +### Per base sequence quality + +```{r 'per base sequence quality'} +## reads 1 +pbsq_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per base sequence quality') +pbsq_1$id = 1:length(pbsq_1$X.Base) +pbsq_1$trim = 'before' + +## reads 2 +pbsq_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per base sequence quality') +pbsq_2$id = 1:length(pbsq_2$X.Base) +pbsq_2$trim = 'after' + +comb_pbsq = rbind(pbsq_1, pbsq_2) +comb_pbsq$trim = factor(levels = c('before', 'after'), comb_pbsq$trim) + +p = ggplot(data = comb_pbsq) + + geom_boxplot(mapping = aes(x = id, + lower = Lower.Quartile, + upper = Upper.Quartile, + middle = Median, + ymin = X10th.Percentile, + ymax = X90th.Percentile, + fill = "yellow"), + stat = 'identity') + + geom_line(mapping = aes(x = id, y = Mean, color = "red")) + + scale_x_continuous(name = '\nPosition in read (bp)', breaks = pbsq_2$id, labels = pbsq_2$X.Base) + + scale_y_continuous(limits = c(0, max(comb_pbsq$Upper.Quartile) + 5)) + + scale_fill_identity() + + scale_color_identity() + + facet_grid(. ~ trim) + + theme(axis.text.x = element_text(size = 5), + panel.background = element_rect(fill = NA), + panel.grid.major.y = element_line(color = 'blue', size = 0.1)) +p +``` + + +### Per tile sequence quality + +```{r 'per tile sequence quality'} +## check if 'per tile sequence quality' module exits or not +check_ptsq = grep('Per tile sequence quality', readLines(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'))) +if (length(check_ptsq) > 0) { + ## reads 1 + ptsq_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per tile sequence quality') + ptsq_1$trim = 'before' + + ## reads 2 + ptsq_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per tile sequence quality') + ptsq_2$trim = 'after' + + comb_ptsq = rbind(ptsq_1, ptsq_2) + comb_ptsq$trim = factor(levels = c('before', 'after'), comb_ptsq$trim) + comb_ptsq$Base = factor(levels = unique(comb_ptsq$Base), comb_ptsq$Base) + + # convert integers to charaters + # comb_ptsq$Tile = as.character(comb_ptsq$X.Tile) + + p = ggplot(data = comb_ptsq) + + geom_raster(mapping = aes(x = Base, y = X.Tile, fill = Mean)) + + facet_grid(. ~ trim) + + scale_x_discrete(name = "\nPosition in read (bp)") + + scale_y_continuous(name = "") + + scale_fill_gradient(low = "blue", high = "red") + + theme(axis.text.x = element_text(size = 5, angle = 90), + axis.text.y = element_text(size = 5), + panel.background = element_rect(fill = NA)) + ggplotly(p) +} else { + print('No "per tile sequence quality" data') +} +``` + +### Per sequence quality score + +```{r 'Per sequence quality score'} +## reads 1 +psqs_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per sequence quality scores') +psqs_1$trim = 'before' + +## reads 2 +psqs_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per sequence quality scores') +psqs_2$trim = 'after' + +comb_psqs = rbind(psqs_1, psqs_2) +comb_psqs$trim = factor(levels = c('before', 'after'), comb_psqs$trim) + +p = ggplot(data = comb_psqs) + + geom_line(mapping = aes(x = X.Quality, y = Count), color = 'red') + + facet_grid(. ~ trim) + + scale_x_continuous(name = '\nMean Sequence Qaulity (Phred Score)', + limits = c(min(comb_psqs$X.Quality), max(comb_psqs$X.Quality))) + + scale_y_continuous(name = '') + + theme(panel.background = element_rect(fill = NA), + axis.line = element_line(), + panel.grid.major.y = element_line(color = 'blue', size = 0.1)) +p +``` + +### Per base sequence content + +```{r 'Per base sequence content'} +## reads 1 +pbsc_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per base sequence content') +pbsc_1$id = 1:length(pbsc_1$X.Base) + +melt_pbsc_1 = melt(pbsc_1, id=c('X.Base', 'id')) +melt_pbsc_1$trim = 'before' + + +## reads 2 +pbsc_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per base sequence content') +pbsc_2$id = 1:length(pbsc_2$X.Base) + +melt_pbsc_2 = melt(pbsc_2, id=c('X.Base', 'id')) +melt_pbsc_2$trim = 'after' + +comb_pbsc = rbind(melt_pbsc_1, melt_pbsc_2) +comb_pbsc$trim = factor(levels = c('before', 'after'), comb_pbsc$trim) + +p = ggplot(data = comb_pbsc) + + geom_line(mapping = aes(x = id, y = value, color = variable)) + + facet_grid(. ~ trim) + + xlim(min(comb_pbsc$id), max(comb_pbsc$id)) + + ylim(0, 100) + + xlab('\nPosition in read (bp)') + + ylab('') + + scale_color_discrete(name = '') + + theme_classic() +p +``` + +### Per sequence GC content + +```{r 'Per sequence GC content'} +## reads 1 +psGCc_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per sequence GC content') +psGCc_1$trim = 'before' + +## reads 2 +psGCc_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per sequence GC content') +psGCc_2$trim = 'after' + +comb_psGCc = rbind(psGCc_1, psGCc_2) +comb_psGCc$trim = factor(levels = c('before', 'after'), comb_psGCc$trim) + +p = ggplot(data = comb_psGCc, aes(x = X.GC.Content, y = Count)) + + geom_line(color = 'red') + + facet_grid(. ~ trim) + + xlab('\nMean Sequence Qaulity (Phred Score)') + + ylab('') + + scale_color_discrete(name = '') + + theme_classic() +p +``` + + +### Per base N content + +```{r 'Per base N content'} +## reads 1 +pbNc_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per base N content') +pbNc_1$id = 1:length(pbNc_1$X.Base) +pbNc_1$trim = 'before' + +## reads 2 +pbNc_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per base N content') +pbNc_2$id = 1:length(pbNc_2$X.Base) +pbNc_2$trim = 'after' + +comb_pbNc = rbind(pbNc_1, pbNc_2) +comb_pbNc$trim = factor(levels = c('before', 'after'), comb_pbNc$trim) + +p = ggplot(data = comb_pbNc, aes(x = id, y = N.Count)) + + geom_line(color = 'red') + + scale_x_continuous(breaks = pbNc_2$id, labels = pbNc_2$X.Base) + + facet_grid(. ~ trim) + + ylim(0, 1) + + xlab('\nN-Count') + + ylab('') + + theme(axis.text.x = element_text(size = 5), + axis.line = element_line(), + panel.background = element_rect(fill = NA)) +p +``` + + +### Sequence Length Distribution + +```{r 'Sequence Length Distribution'} +## reads 1 +sld_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Sequence Length Distribution') +sld_1$id = 1:length(sld_1$X.Length) +sld_1$trim = 'before' + +## reads 2 +sld_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Sequence Length Distribution') +sld_2$id = 1:length(sld_2$X.Length) +sld_2$trim = 'after' + +comb_sld = rbind(sld_1, sld_2) +comb_sld$trim = factor(levels = c('before', 'after'), comb_sld$trim) + +p = ggplot(data = comb_sld, aes(x = id, y = Count)) + + geom_line(color = 'red') + + scale_x_continuous(breaks = sld_2$id, labels = sld_2$X.Length) + + facet_grid(. ~ trim) + + xlab('\nSequence Length (bp)') + + ylab('') + + theme(axis.text.x = element_text(size = 5), + panel.background = element_rect(fill = NA), + axis.line = element_line(), + plot.margin = margin(2,2,2,10) ) +p +``` + +### Sequence Duplication Levels + +```{r 'Sequence Duplication Levels'} +## reads 1 +sdl_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Sequence Duplication Levels', header = FALSE, comment.char = '#') +names(sdl_1) = c('Duplication_Level', 'Percentage_of_deduplicated', 'Percentage_of_total') +sdl_1$id = 1:length(sdl_1$Duplication_Level) + +melt_sdl_1 = melt(sdl_1, id=c('Duplication_Level', 'id')) +melt_sdl_1$trim = 'before' + + +## reads 2 +sdl_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Sequence Duplication Levels', header = FALSE, comment.char = '#') +names(sdl_2) = c('Duplication_Level', 'Percentage_of_deduplicated', 'Percentage_of_total') +sdl_2$id = 1:length(sdl_2$Duplication_Level) + +melt_sdl_2 = melt(sdl_2, id=c('Duplication_Level', 'id')) +melt_sdl_2$trim = 'after' + +comb_sdl = rbind(melt_sdl_1, melt_sdl_2) +comb_sdl$trim = factor(levels = c('before', 'after'), comb_sdl$trim) + +p = ggplot(data = comb_sdl) + + geom_line(mapping = aes(x = id, y = value, color = variable)) + + scale_x_continuous(breaks = sdl_2$id, labels = sdl_2$Duplication_Level) + + facet_grid(. ~ trim) + + xlab('\nSequence Duplication Level') + + ylab('') + + scale_color_discrete(name = '') + + theme(axis.text.x = element_text(size = 5), + panel.background = element_rect(fill = NA), + axis.line = element_line(), + legend.position="top") +p +``` + + +### Overrepresented sequences {.tabset} + +#### Before + +```{r} +fastqc_data_1 = paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt') +module_name = 'Overrepresented sequences' + +overrepresented_seq = extract_data_module(fastqc_data_1, module_name) +colnames(overrepresented_seq) = c('Sequence', 'Count', 'Percentage', 'Possible Source') +DT::datatable(overrepresented_seq) +``` + +#### After + +```{r} +fastqc_data_2 = paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt') +module_name = 'Overrepresented sequences' + +overrepresented_seq = extract_data_module(fastqc_data_2, module_name) +colnames(overrepresented_seq) = c('Sequence', 'Count', 'Percentage', 'Possible Source') +DT::datatable(overrepresented_seq) +``` + + +### Adapter Content + +```{r 'Adapter Content'} +## reads 1 +ac_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Adapter Content') +ac_1$id = 1:length(ac_1$X.Position) + +melt_ac_1 = melt(ac_1, id=c('X.Position', 'id')) +melt_ac_1$trim = 'before' + +## reads 2 +ac_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Adapter Content') +ac_2$id = 1:length(ac_2$X.Position) + +melt_ac_2 = melt(ac_2, id=c('X.Position', 'id')) +melt_ac_2$trim = 'after' + +comb_ac = rbind(melt_ac_1, melt_ac_2) +comb_ac$trim = factor(levels = c('before', 'after'), comb_ac$trim) + +p = ggplot(data = comb_ac, aes(x = id, y = value, color = variable)) + + geom_line() + + facet_grid(. ~ trim) + + xlim(min(comb_ac$id), max(comb_ac$id)) + + ylim(0, 1) + + xlab('\nPosition in read (bp)') + + ylab('') + + scale_color_discrete(name = '') + + theme(axis.text.x = element_text(size = 5), + panel.background = element_rect(fill = NA), + axis.line = element_line()) +ggplotly(p) +``` + +### Kmer Content {.tabset} + +#### Before + +```{r 'Kmer Content (before)'} +kc_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Kmer Content') +DT::datatable(kc_1) +``` + +#### After +```{r 'Kmer Content (after)'} +kc_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Kmer Content') +DT::datatable(kc_2) +``` +