Mercurial > repos > mingchen0919 > aurora_fastqc

diff rmarkdown_report.Rmd @ 0:0aeed70b3bc5 draft default tip
planemo upload commit 841d8b22bf9f1aaed6bfe8344b60617f45b275b2-dirty
author: mingchen0919
date: Fri, 14 Dec 2018 00:38:44 -0500
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rmarkdown_report.Rmd	Fri Dec 14 00:38:44 2018 -0500
@@ -0,0 +1,458 @@
+---
+title: '[FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) report'
+output:
+    html_document:
+      highlight: pygments
+---
+
+
+```{r setup, include=FALSE, warning=FALSE, message=FALSE}
+knitr::opts_chunk$set(error = TRUE, echo = FALSE)
+```
+
+```{css, echo=FALSE}
+pre code, pre, code {
+  white-space: pre !important;
+  overflow-x: scroll !important;
+  word-break: keep-all !important;
+  word-wrap: initial !important;
+}
+```
+
+```{r, echo=FALSE}
+# to make the css theme to work, <link></link> tags cannot be added directly 
+# as <script></script> tags as below.
+# it has to be added using a code chunk with the htmltool functions!!!
+css_link = tags$link()
+css_link$attribs = list(rel="stylesheet", href="vakata-jstree-3.3.5/dist/themes/default/style.min.css")
+css_link
+```
+
+```{r, eval=FALSE, echo=FALSE}
+# this code chunk is purely for adding comments
+# below is to add jQuery and jstree javascripts
+```
+
+<script src="vakata-jstree-3.3.5/dist/jstree.min.js"></script>
+
+
+```{r, eval=FALSE, echo=FALSE}
+# this code chunk is purely for adding comments
+# javascript code below is to build the file tree interface
+# see this for how to implement opening hyperlink: https://stackoverflow.com/questions/18611317/how-to-get-i-get-leaf-nodes-in-jstree-to-open-their-hyperlink-when-clicked-when
+```
+<script>
+  jQuery(function () {
+    // create an instance when the DOM is ready
+    jQuery('#jstree').jstree().bind("select_node.jstree", function (e, data) {
+     window.open( data.node.a_attr.href, data.node.a_attr.target )
+    });
+  });
+</script>
+
+
+```{r, eval=FALSE, echo=FALSE}
+---
+# ADD YOUR DATA ANALYSIS CODE AND MARKUP TEXT BELOW TO EXTEND THIS R MARKDOWN FILE
+---
+```
+
+
+# Run FastQC
+
+```{bash}
+sh ${TOOL_INSTALL_DIR}/build-and-run-job-scripts.sh
+```
+
+```{r echo=FALSE,results='asis'}
+# display fastqc job script
+cat('```bash\n')
+cat(readLines(paste0(Sys.getenv('REPORT_FILES_PATH'), '/job-1-script.sh')), sep = '\n')
+cat('\n```')
+```
+
+# Fastqc Output Visualization
+
+## Overview
+
+```{r eval=TRUE}
+read_1_summary = read.csv(paste0(opt$X_d, '/read_1_fastqc/summary.txt'),
+                          stringsAsFactors = FALSE,
+                          header = FALSE, sep = '\t')[, 2:1]
+read_2_summary = read.csv(paste0(opt$X_d, '/read_2_fastqc/summary.txt'),
+                          stringsAsFactors = FALSE,
+                          header = FALSE, sep = '\t')[, 1]
+combined_summary = data.frame(read_1_summary, read_2_summary, stringsAsFactors = FALSE)
+names(combined_summary) = c('MODULE', 'Pre-trimming', 'Post-trimming')
+combined_summary[combined_summary == 'FAIL'] = 'FAIL (X)'
+combined_summary[combined_summary == 'WARN'] = 'WARN (!)'
+DT::datatable(combined_summary)
+```
+
+```{r 'function definition', echo=FALSE}
+extract_data_module = function(fastqc_data, module_name, header = TRUE, comment.char = "") {
+  f = readLines(fastqc_data)
+  start_line = grep(module_name, f)
+  end_module_lines = grep('END_MODULE', f)
+  end_line = end_module_lines[which(end_module_lines > start_line)[1]]
+  module_data = f[(start_line+1):(end_line-1)]
+  writeLines(module_data, '/tmp/temp.txt')
+  read.csv('/tmp/temp.txt', sep = '\t', header = header, comment.char = comment.char)
+}
+```
+
+
+### Basic Statistics {.tabset}
+
+#### Before
+
+```{r}
+fastqc_data_1 = paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt')
+module_name = 'Basic Statistics	pass'
+
+basic_statistics = extract_data_module(fastqc_data_1, module_name)
+colnames(basic_statistics) = c('Measure', 'Value')
+DT::datatable(basic_statistics)
+```
+
+#### After
+
+```{r}
+fastqc_data_2 = paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt')
+module_name = 'Basic Statistics	pass'
+
+basic_statistics = extract_data_module(fastqc_data_2, module_name)
+colnames(basic_statistics) = c('Measure', 'Value')
+DT::datatable(basic_statistics)
+```
+
+
+### Per base sequence quality
+
+```{r 'per base sequence quality'}
+## reads 1
+pbsq_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per base sequence quality')
+pbsq_1$id = 1:length(pbsq_1$X.Base)
+pbsq_1$trim = 'before'
+
+## reads 2
+pbsq_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per base sequence quality')
+pbsq_2$id = 1:length(pbsq_2$X.Base)
+pbsq_2$trim = 'after'
+
+comb_pbsq = rbind(pbsq_1, pbsq_2)
+comb_pbsq$trim = factor(levels = c('before', 'after'), comb_pbsq$trim)
+
+p = ggplot(data = comb_pbsq) +
+  geom_boxplot(mapping = aes(x = id, 
+                             lower = Lower.Quartile, 
+                             upper = Upper.Quartile, 
+                             middle = Median, 
+                             ymin = X10th.Percentile, 
+                             ymax = X90th.Percentile,
+                             fill = "yellow"),
+               stat = 'identity') +
+  geom_line(mapping = aes(x = id, y = Mean, color = "red")) +
+  scale_x_continuous(name = '\nPosition in read (bp)', breaks = pbsq_2$id, labels = pbsq_2$X.Base) +
+  scale_y_continuous(limits = c(0, max(comb_pbsq$Upper.Quartile) + 5)) +
+  scale_fill_identity() +
+  scale_color_identity() + 
+  facet_grid(. ~ trim) +
+  theme(axis.text.x = element_text(size = 5),
+        panel.background = element_rect(fill = NA),
+        panel.grid.major.y = element_line(color = 'blue', size = 0.1))
+p
+```
+
+
+### Per tile sequence quality
+
+```{r 'per tile sequence quality'}
+## check if 'per tile sequence quality' module exits or not
+check_ptsq = grep('Per tile sequence quality', readLines(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt')))
+if (length(check_ptsq) > 0) {
+    ## reads 1
+  ptsq_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per tile sequence quality')
+  ptsq_1$trim = 'before'
+  
+  ## reads 2
+  ptsq_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per tile sequence quality')
+  ptsq_2$trim = 'after'
+  
+  comb_ptsq = rbind(ptsq_1, ptsq_2)
+  comb_ptsq$trim = factor(levels = c('before', 'after'), comb_ptsq$trim)
+  comb_ptsq$Base = factor(levels = unique(comb_ptsq$Base), comb_ptsq$Base)
+  
+  # convert integers to charaters
+  # comb_ptsq$Tile = as.character(comb_ptsq$X.Tile)
+  
+  p = ggplot(data = comb_ptsq) +
+    geom_raster(mapping = aes(x = Base, y = X.Tile, fill = Mean)) + 
+    facet_grid(. ~ trim) + 
+    scale_x_discrete(name = "\nPosition in read (bp)") +
+    scale_y_continuous(name = "") +
+    scale_fill_gradient(low = "blue", high = "red") +
+    theme(axis.text.x = element_text(size = 5, angle = 90),
+          axis.text.y = element_text(size = 5),
+        panel.background = element_rect(fill = NA))
+  ggplotly(p)
+} else {
+  print('No "per tile sequence quality" data')
+}
+```
+
+### Per sequence quality score
+
+```{r 'Per sequence quality score'}
+## reads 1
+psqs_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per sequence quality scores')
+psqs_1$trim = 'before'
+
+## reads 2
+psqs_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per sequence quality scores')
+psqs_2$trim = 'after'
+
+comb_psqs = rbind(psqs_1, psqs_2)
+comb_psqs$trim = factor(levels = c('before', 'after'), comb_psqs$trim)
+
+p = ggplot(data = comb_psqs) + 
+  geom_line(mapping = aes(x = X.Quality, y = Count), color = 'red') + 
+  facet_grid(. ~ trim) + 
+  scale_x_continuous(name = '\nMean Sequence Qaulity (Phred Score)',
+                     limits = c(min(comb_psqs$X.Quality), max(comb_psqs$X.Quality))) +
+  scale_y_continuous(name = '') +
+  theme(panel.background = element_rect(fill = NA),
+        axis.line = element_line(),
+        panel.grid.major.y = element_line(color = 'blue', size = 0.1))
+p
+```
+
+### Per base sequence content
+
+```{r 'Per base sequence content'}
+## reads 1
+pbsc_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per base sequence content')
+pbsc_1$id = 1:length(pbsc_1$X.Base)
+
+melt_pbsc_1 = melt(pbsc_1, id=c('X.Base', 'id'))
+melt_pbsc_1$trim = 'before'
+
+
+## reads 2
+pbsc_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per base sequence content')
+pbsc_2$id = 1:length(pbsc_2$X.Base)
+
+melt_pbsc_2 = melt(pbsc_2, id=c('X.Base', 'id'))
+melt_pbsc_2$trim = 'after'
+
+comb_pbsc = rbind(melt_pbsc_1, melt_pbsc_2)
+comb_pbsc$trim = factor(levels = c('before', 'after'), comb_pbsc$trim)
+
+p = ggplot(data = comb_pbsc) +
+  geom_line(mapping = aes(x = id, y = value, color = variable)) +
+  facet_grid(. ~ trim) +
+  xlim(min(comb_pbsc$id), max(comb_pbsc$id)) + 
+  ylim(0, 100) +
+  xlab('\nPosition in read (bp)') +
+  ylab('') +
+  scale_color_discrete(name = '') +
+  theme_classic()
+p
+```
+
+### Per sequence GC content
+
+```{r 'Per sequence GC content'}
+## reads 1
+psGCc_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per sequence GC content')
+psGCc_1$trim = 'before'
+
+## reads 2
+psGCc_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per sequence GC content')
+psGCc_2$trim = 'after'
+
+comb_psGCc = rbind(psGCc_1, psGCc_2)
+comb_psGCc$trim = factor(levels = c('before', 'after'), comb_psGCc$trim)
+
+p = ggplot(data = comb_psGCc, aes(x = X.GC.Content, y = Count)) +
+  geom_line(color = 'red') +
+  facet_grid(. ~ trim) +
+  xlab('\nMean Sequence Qaulity (Phred Score)') +
+  ylab('') +
+  scale_color_discrete(name = '') +
+  theme_classic()
+p
+```
+
+
+### Per base N content
+
+```{r 'Per base N content'}
+## reads 1
+pbNc_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Per base N content')
+pbNc_1$id = 1:length(pbNc_1$X.Base)
+pbNc_1$trim = 'before'
+
+## reads 2
+pbNc_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Per base N content')
+pbNc_2$id = 1:length(pbNc_2$X.Base)
+pbNc_2$trim = 'after'
+
+comb_pbNc = rbind(pbNc_1, pbNc_2)
+comb_pbNc$trim = factor(levels = c('before', 'after'), comb_pbNc$trim)
+
+p = ggplot(data = comb_pbNc, aes(x = id, y = N.Count)) +
+  geom_line(color = 'red') +
+  scale_x_continuous(breaks = pbNc_2$id, labels = pbNc_2$X.Base) + 
+  facet_grid(. ~ trim) +
+  ylim(0, 1) + 
+  xlab('\nN-Count') +
+  ylab('') + 
+  theme(axis.text.x = element_text(size = 5),
+        axis.line = element_line(),
+        panel.background = element_rect(fill = NA))
+p
+```
+
+
+### Sequence Length Distribution
+
+```{r 'Sequence Length Distribution'}
+## reads 1
+sld_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Sequence Length Distribution')
+sld_1$id = 1:length(sld_1$X.Length)
+sld_1$trim = 'before'
+
+## reads 2
+sld_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Sequence Length Distribution')
+sld_2$id = 1:length(sld_2$X.Length)
+sld_2$trim = 'after'
+
+comb_sld = rbind(sld_1, sld_2)
+comb_sld$trim = factor(levels = c('before', 'after'), comb_sld$trim)
+
+p = ggplot(data = comb_sld, aes(x = id, y = Count)) +
+  geom_line(color = 'red') +
+  scale_x_continuous(breaks = sld_2$id, labels = sld_2$X.Length) + 
+  facet_grid(. ~ trim) +
+  xlab('\nSequence Length (bp)') +
+  ylab('') + 
+  theme(axis.text.x = element_text(size = 5),
+        panel.background = element_rect(fill = NA),
+        axis.line = element_line(), 
+        plot.margin = margin(2,2,2,10) )
+p
+```
+
+### Sequence Duplication Levels
+
+```{r 'Sequence Duplication Levels'}
+## reads 1
+sdl_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Sequence Duplication Levels', header = FALSE, comment.char = '#')
+names(sdl_1) = c('Duplication_Level', 'Percentage_of_deduplicated', 'Percentage_of_total')
+sdl_1$id = 1:length(sdl_1$Duplication_Level)
+
+melt_sdl_1 = melt(sdl_1, id=c('Duplication_Level', 'id'))
+melt_sdl_1$trim = 'before'
+
+
+## reads 2
+sdl_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Sequence Duplication Levels', header = FALSE, comment.char = '#')
+names(sdl_2) = c('Duplication_Level', 'Percentage_of_deduplicated', 'Percentage_of_total')
+sdl_2$id = 1:length(sdl_2$Duplication_Level)
+
+melt_sdl_2 = melt(sdl_2, id=c('Duplication_Level', 'id'))
+melt_sdl_2$trim = 'after'
+
+comb_sdl = rbind(melt_sdl_1, melt_sdl_2)
+comb_sdl$trim = factor(levels = c('before', 'after'), comb_sdl$trim)
+
+p = ggplot(data = comb_sdl) +
+  geom_line(mapping = aes(x = id, y = value, color = variable)) +
+  scale_x_continuous(breaks = sdl_2$id, labels = sdl_2$Duplication_Level) +
+  facet_grid(. ~ trim) +
+  xlab('\nSequence Duplication Level') +
+  ylab('') + 
+  scale_color_discrete(name = '') +
+  theme(axis.text.x = element_text(size = 5),
+        panel.background = element_rect(fill = NA),
+        axis.line = element_line(),
+        legend.position="top")
+p
+```
+
+
+### Overrepresented sequences {.tabset}
+
+#### Before
+
+```{r}
+fastqc_data_1 = paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt')
+module_name = 'Overrepresented sequences'
+
+overrepresented_seq = extract_data_module(fastqc_data_1, module_name)
+colnames(overrepresented_seq) = c('Sequence', 'Count', 'Percentage', 'Possible Source')
+DT::datatable(overrepresented_seq)
+```
+
+#### After
+
+```{r}
+fastqc_data_2 = paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt')
+module_name = 'Overrepresented sequences'
+
+overrepresented_seq = extract_data_module(fastqc_data_2, module_name)
+colnames(overrepresented_seq) = c('Sequence', 'Count', 'Percentage', 'Possible Source')
+DT::datatable(overrepresented_seq)
+```
+
+
+### Adapter Content
+
+```{r 'Adapter Content'}
+## reads 1
+ac_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Adapter Content')
+ac_1$id = 1:length(ac_1$X.Position)
+
+melt_ac_1 = melt(ac_1, id=c('X.Position', 'id'))
+melt_ac_1$trim = 'before'
+
+## reads 2
+ac_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Adapter Content')
+ac_2$id = 1:length(ac_2$X.Position)
+
+melt_ac_2 = melt(ac_2, id=c('X.Position', 'id'))
+melt_ac_2$trim = 'after'
+
+comb_ac = rbind(melt_ac_1, melt_ac_2)
+comb_ac$trim = factor(levels = c('before', 'after'), comb_ac$trim)
+
+p = ggplot(data = comb_ac, aes(x = id, y = value, color = variable)) +
+  geom_line() +
+  facet_grid(. ~ trim) +
+  xlim(min(comb_ac$id), max(comb_ac$id)) + 
+  ylim(0, 1) +
+  xlab('\nPosition in read (bp)') +
+  ylab('') +
+  scale_color_discrete(name = '') +
+  theme(axis.text.x = element_text(size = 5),
+        panel.background = element_rect(fill = NA),
+        axis.line = element_line())
+ggplotly(p)
+```
+
+### Kmer Content {.tabset}
+
+#### Before
+
+```{r 'Kmer Content (before)'}
+kc_1 = extract_data_module(paste0(opt$X_d, '/read_1_fastqc/fastqc_data.txt'), 'Kmer Content')
+DT::datatable(kc_1)
+```
+
+#### After
+```{r 'Kmer Content (after)'}
+kc_2 = extract_data_module(paste0(opt$X_d, '/read_2_fastqc/fastqc_data.txt'), 'Kmer Content')
+DT::datatable(kc_2)
+```
+
author	mingchen0919
date	Fri, 14 Dec 2018 00:38:44 -0500
parents
children