2
+ − 1 ---
15
+ − 2 title: 'Short reads evaluation with [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)'
14
+ − 3 output:
+ − 4 html_document:
+ − 5 number_sections: true
+ − 6 toc: true
+ − 7 theme: cosmo
+ − 8 highlight: tango
2
+ − 9 ---
+ − 10
14
+ − 11 ```{r setup, include=FALSE, warning=FALSE, message=FALSE}
+ − 12 knitr::opts_chunk$set(
16
+ − 13 echo = ECHO,
+ − 14 error = TRUE
14
+ − 15 )
2
+ − 16 ```
+ − 17
+ − 18
16
+ − 19 # Fastqc Evaluation
14
+ − 20
16
+ − 21 ## Evaluation of reads before trimming
2
+ − 22
16
+ − 23 ```{r}
+ − 24 if ('READS_1' == 'None') {
+ − 25 stop("No pre-trimming reads provided!")
+ − 26 } else {
+ − 27 ## run fastqc evaluation
+ − 28 fastqc_command = paste0('fastqc ') %>%
+ − 29 (function(x) {
+ − 30 ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x)
+ − 31 }) %>%
+ − 32 (function(x) {
+ − 33 ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x)
+ − 34 }) %>%
+ − 35 (function(x) {
+ − 36 paste0(x, '-o REPORT_DIR ')
+ − 37 })
+ − 38 fastqc_command_reads_1 = paste0(fastqc_command, 'READS_1 > /dev/null 2>&1')
+ − 39 system(fastqc_command_reads_1, intern = TRUE)
+ − 40
+ − 41 # Original html report
+ − 42 reads_1_base = tail(strsplit('READS_1', '/')[[1]], 1)
+ − 43 original_html = tags$a(href=paste0(reads_1_base, '_fastqc.html'), paste0('HTML report: ', opt$name_1))
+ − 44
+ − 45 unzip(paste0('REPORT_DIR/', reads_1_base, '_fastqc.zip'), exdir = 'REPORT_DIR')
+ − 46 reads_1_unzip = paste0('REPORT_DIR/', reads_1_base, '_fastqc/')
+ − 47 # fastqc_data.txt
+ − 48 file.copy(paste0(reads_1_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_1_fastqc_data.txt')
+ − 49 fastqc_data = tags$a(href='reads_1_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_1))
+ − 50 # summary.txt
+ − 51 file.copy(paste0(reads_1_unzip, 'summary.txt'), 'REPORT_DIR/reads_1_summary.txt')
+ − 52 summary_data = tags$a(href='reads_1_summary.txt', paste0('summary.txt: ', opt$name_1))
+ − 53
+ − 54 tags$ul(
+ − 55 tags$li(original_html),
+ − 56 tags$li(fastqc_data),
+ − 57 tags$li(summary_data)
+ − 58 )
+ − 59 }
15
+ − 60 ```
+ − 61
+ − 62
16
+ − 63 ## Evaluation of reads after trimming
15
+ − 64
16
+ − 65 ```{r}
+ − 66 if ('READS_2' == 'None') {
+ − 67 stop("No pre-trimming reads provided!")
+ − 68 } else {
+ − 69 ## run fastqc evaluation
+ − 70 fastqc_command = paste0('fastqc ') %>%
+ − 71 (function(x) {
+ − 72 ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x)
+ − 73 }) %>%
+ − 74 (function(x) {
+ − 75 ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x)
+ − 76 }) %>%
+ − 77 (function(x) {
+ − 78 paste0(x, '-o REPORT_DIR ')
+ − 79 })
+ − 80 fastqc_command_reads_2 = paste0(fastqc_command, 'READS_2 > /dev/null 2>&1')
+ − 81 system(fastqc_command_reads_2, intern = TRUE)
+ − 82
+ − 83 # Original html report
+ − 84 reads_2_base = tail(strsplit('READS_2', '/')[[1]], 1)
+ − 85 original_html = tags$a(href=paste0(reads_2_base, '_fastqc.html'), paste0('HTML report: ', opt$name_2))
+ − 86
+ − 87 unzip(paste0('REPORT_DIR/', reads_2_base, '_fastqc.zip'), exdir = 'REPORT_DIR')
+ − 88 reads_2_unzip = paste0('REPORT_DIR/', reads_2_base, '_fastqc/')
+ − 89 # fastqc_data.txt
+ − 90 file.copy(paste0(reads_2_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_2_fastqc_data.txt')
+ − 91 fastqc_data = tags$a(href='reads_2_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_2))
+ − 92 # summary.txt
+ − 93 file.copy(paste0(reads_2_unzip, 'summary.txt'), 'REPORT_DIR/reads_2_summary.txt')
+ − 94 summary_data = tags$a(href='reads_2_summary.txt', paste0('summary.txt: ', opt$name_2))
+ − 95
+ − 96 tags$ul(
+ − 97 tags$li(original_html),
+ − 98 tags$li(fastqc_data),
+ − 99 tags$li(summary_data)
+ − 100 )
+ − 101 }
2
+ − 102 ```
+ − 103
15
+ − 104
+ − 105
+ − 106 # Fastqc output visualization
+ − 107
+ − 108 ## Overview
+ − 109
+ − 110 ```{r}
16
+ − 111 reads_1_summary = read.csv('REPORT_DIR/reads_1_summary.txt', header = FALSE, sep = '\t')[, 2:1]
17
+ − 112 reads_2_summary = read.csv('REPORT_DIR/reads_2_summary.txt', header = FALSE, sep = '\t')[, 1]
16
+ − 113 combined_summary = cbind(reads_1_summary, reads_2_summary)
+ − 114 names(combined_summary) = c('MODULE', paste0(opt$name_1, '(before)'), paste0(opt$name_2, '(after)'))
17
+ − 115 combined_summary[combined_summary == 'FAIL'] = 'FAIL (X)'
+ − 116 combined_summary[combined_summary == 'WARN'] = 'WARN (!)'
16
+ − 117 knitr::kable(combined_summary)
15
+ − 118 ```
+ − 119
17
+ − 120 ## Visualization by data module {.tabset}
2
+ − 121
14
+ − 122 * Define a function to extract outputs for each module from fastqc output
2
+ − 123
14
+ − 124 ```{r 'function definition'}
17
+ − 125 extract_data_module = function(fastqc_data, module_name, header = TRUE, comment.char = "") {
14
+ − 126 f = readLines(fastqc_data)
+ − 127 start_line = grep(module_name, f)
+ − 128 end_module_lines = grep('END_MODULE', f)
+ − 129 end_line = end_module_lines[which(end_module_lines > start_line)[1]]
+ − 130 module_data = f[(start_line+1):(end_line-1)]
+ − 131 writeLines(module_data, 'temp.txt')
17
+ − 132 read.csv('temp.txt', sep = '\t', header = header, comment.char = comment.char)
2
+ − 133 }
+ − 134 ```
+ − 135
15
+ − 136 ### Per base sequence quality
+ − 137
16
+ − 138 ```{r 'per base sequence quality', fig.width=10}
+ − 139 ## reads 1
+ − 140 pbsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base sequence quality')
+ − 141 pbsq_1$id = 1:length(pbsq_1$X.Base)
18
+ − 142 pbsq_1$trim = 'before'
16
+ − 143
+ − 144 ## reads 2
+ − 145 pbsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base sequence quality')
+ − 146 pbsq_2$id = 1:length(pbsq_2$X.Base)
18
+ − 147 pbsq_2$trim = 'after'
16
+ − 148
18
+ − 149 comb_pbsq = rbind(pbsq_1, pbsq_2)
16
+ − 150 comb_pbsq$trim = factor(levels = c('before', 'after'), comb_pbsq$trim)
17
+ − 151
16
+ − 152 p = ggplot(data = comb_pbsq) +
18
+ − 153 geom_boxplot(mapping = aes(x = id,
+ − 154 lower = Lower.Quartile,
+ − 155 upper = Upper.Quartile,
+ − 156 middle = Median,
+ − 157 ymin = X10th.Percentile,
+ − 158 ymax = X90th.Percentile,
+ − 159 fill = "yellow"),
+ − 160 stat = 'identity') +
+ − 161 geom_line(mapping = aes(x = id, y = Mean, color = "red")) +
+ − 162 scale_x_continuous(breaks = pbsq_2$id, labels = pbsq_2$X.Base) +
+ − 163 scale_fill_identity() +
+ − 164 scale_color_identity() +
+ − 165 ylim(0, max(comb_pbsq$Upper.Quartile) + 5) +
19
+ − 166 xlab('Position in read (bp)') +
18
+ − 167 facet_grid(. ~ trim) +
16
+ − 168 theme(axis.text.x = element_text(angle=45))
18
+ − 169 p
16
+ − 170
15
+ − 171 ```
+ − 172
+ − 173 ### Per tile sequence quality
+ − 174
17
+ − 175 ```{r 'per tile sequence quality', fig.width=10}
+ − 176 ## check if 'per tile sequence quality' module exits or not
+ − 177 check_ptsq = grep('Per tile sequence quality', readLines('REPORT_DIR/reads_1_fastqc_data.txt'))
+ − 178 if (length(check_ptsq) > 0) {
+ − 179 ## reads 1
+ − 180 ptsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per tile sequence quality')
+ − 181 ptsq_1$trim = 'before'
+ − 182
+ − 183 ## reads 2
+ − 184 ptsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per tile sequence quality')
+ − 185 ptsq_2$trim = 'after'
+ − 186
+ − 187 comb_ptsq = rbind(ptsq_1, ptsq_2)
+ − 188 comb_ptsq$trim = factor(levels = c('before', 'after'), comb_ptsq$trim)
+ − 189 comb_ptsq$Base = factor(levels = unique(comb_ptsq$Base), comb_ptsq$Base)
+ − 190
+ − 191 # convert integers to charaters
+ − 192 comb_ptsq$Tile = as.character(comb_ptsq$X.Tile)
+ − 193
+ − 194 p = ggplot(data = comb_ptsq, aes(x = Base, y = Tile, fill = Mean)) +
+ − 195 geom_raster() +
+ − 196 facet_grid(. ~ trim) +
+ − 197 xlab('Position in read (bp)') +
+ − 198 ylab('') +
+ − 199 theme(axis.text.x = element_text(angle=45))
+ − 200 ggplotly(p)
+ − 201 } else {
+ − 202 print('No "per tile sequence quality" data')
+ − 203 }
+ − 204
+ − 205
+ − 206 ```
+ − 207
+ − 208 ### Per sequence quality score
+ − 209
+ − 210 ```{r 'Per sequence quality score', fig.width=10}
16
+ − 211 ## reads 1
17
+ − 212 psqs_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per sequence quality scores')
+ − 213 psqs_1$trim = 'before'
16
+ − 214
+ − 215 ## reads 2
17
+ − 216 psqs_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per sequence quality scores')
+ − 217 psqs_2$trim = 'after'
+ − 218
+ − 219 comb_psqs = rbind(psqs_1, psqs_2)
+ − 220 comb_psqs$trim = factor(levels = c('before', 'after'), comb_psqs$trim)
+ − 221
+ − 222 p = ggplot(data = comb_psqs, aes(x = X.Quality, y = Count)) +
+ − 223 geom_line(color = 'red') +
+ − 224 facet_grid(. ~ trim) +
+ − 225 xlim(min(comb_psqs$X.Quality), max(comb_psqs$X.Quality)) +
+ − 226 xlab('Mean Sequence Qaulity (Phred Score)') +
+ − 227 ylab('')
+ − 228 ggplotly(p)
+ − 229 ```
+ − 230
+ − 231
+ − 232 ### Per base sequence content
+ − 233
+ − 234 ```{r 'Per base sequence content', fig.width=10}
+ − 235 ## reads 1
+ − 236 pbsc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base sequence content')
+ − 237 pbsc_1$id = 1:length(pbsc_1$X.Base)
+ − 238
+ − 239 melt_pbsc_1 = melt(pbsc_1, id=c('X.Base', 'id'))
+ − 240 melt_pbsc_1$trim = 'before'
+ − 241
+ − 242
+ − 243 ## reads 2
+ − 244 pbsc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base sequence content')
+ − 245 pbsc_2$id = 1:length(pbsc_2$X.Base)
+ − 246
+ − 247 melt_pbsc_2 = melt(pbsc_2, id=c('X.Base', 'id'))
+ − 248 melt_pbsc_2$trim = 'after'
+ − 249
+ − 250 comb_pbsc = rbind(melt_pbsc_1, melt_pbsc_2)
+ − 251 comb_pbsc$trim = factor(levels = c('before', 'after'), comb_pbsc$trim)
+ − 252
+ − 253 p = ggplot(data = comb_pbsc, aes(x = id, y = value, color = variable)) +
+ − 254 geom_line() +
+ − 255 facet_grid(. ~ trim) +
+ − 256 xlim(min(comb_pbsc$id), max(comb_pbsc$id)) +
+ − 257 ylim(0, 100) +
+ − 258 xlab('Position in read (bp)') +
+ − 259 ylab('')
+ − 260 ggplotly(p)
+ − 261 ```
16
+ − 262
17
+ − 263 ### Per sequence GC content
+ − 264
+ − 265 ```{r 'Per sequence GC content', fig.width=10}
+ − 266 ## reads 1
+ − 267 psGCc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per sequence GC content')
+ − 268 psGCc_1$trim = 'before'
+ − 269
+ − 270 ## reads 2
+ − 271 psGCc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per sequence GC content')
+ − 272 psGCc_2$trim = 'after'
+ − 273
+ − 274 comb_psGCc = rbind(psGCc_1, psGCc_2)
+ − 275 comb_psGCc$trim = factor(levels = c('before', 'after'), comb_psGCc$trim)
+ − 276
+ − 277 p = ggplot(data = comb_psGCc, aes(x = X.GC.Content, y = Count)) +
+ − 278 geom_line(color = 'red') +
+ − 279 facet_grid(. ~ trim) +
+ − 280 xlab('Mean Sequence Qaulity (Phred Score)') +
+ − 281 ylab('')
+ − 282 ggplotly(p)
+ − 283 ```
+ − 284
16
+ − 285
17
+ − 286 ### Per base N content
+ − 287
+ − 288 ```{r 'Per base N content', fig.width=10}
+ − 289 ## reads 1
+ − 290 pbNc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base N content')
+ − 291 pbNc_1$id = 1:length(pbNc_1$X.Base)
+ − 292 pbNc_1$trim = 'before'
+ − 293
+ − 294 ## reads 2
+ − 295 pbNc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base N content')
+ − 296 pbNc_2$id = 1:length(pbNc_2$X.Base)
+ − 297 pbNc_2$trim = 'after'
+ − 298
+ − 299 comb_pbNc = rbind(pbNc_1, pbNc_2)
+ − 300 comb_pbNc$trim = factor(levels = c('before', 'after'), comb_pbNc$trim)
+ − 301
+ − 302 p = ggplot(data = comb_pbNc, aes(x = id, y = N.Count)) +
+ − 303 geom_line(color = 'red') +
+ − 304 scale_x_continuous(breaks = pbNc_2$id, labels = pbNc_2$X.Base) +
+ − 305 facet_grid(. ~ trim) +
+ − 306 ylim(0, 1) +
+ − 307 xlab('N-Count') +
+ − 308 ylab('') +
16
+ − 309 theme(axis.text.x = element_text(angle=45))
+ − 310 ggplotly(p)
15
+ − 311 ```
+ − 312
+ − 313
17
+ − 314 ### Sequence Length Distribution
+ − 315
+ − 316 ```{r 'Sequence Length Distribution', fig.width=10}
+ − 317 ## reads 1
+ − 318 sld_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Sequence Length Distribution')
+ − 319 sld_1$id = 1:length(sld_1$X.Length)
+ − 320 sld_1$trim = 'before'
+ − 321
+ − 322 ## reads 2
+ − 323 sld_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Sequence Length Distribution')
+ − 324 sld_2$id = 1:length(sld_2$X.Length)
+ − 325 sld_2$trim = 'after'
+ − 326
+ − 327 comb_sld = rbind(sld_1, sld_2)
+ − 328 comb_sld$trim = factor(levels = c('before', 'after'), comb_sld$trim)
+ − 329
+ − 330 p = ggplot(data = comb_sld, aes(x = id, y = Count)) +
+ − 331 geom_line(color = 'red') +
+ − 332 scale_x_continuous(breaks = sld_2$id, labels = sld_2$X.Length) +
+ − 333 facet_grid(. ~ trim) +
+ − 334 xlab('Sequence Length (bp)') +
+ − 335 ylab('') +
+ − 336 theme(axis.text.x = element_text(angle=45))
+ − 337 ggplotly(p)
+ − 338 ```
+ − 339
+ − 340 ### Sequence Duplication Levels
+ − 341
+ − 342 ```{r 'Sequence Duplication Levels', fig.width=10}
+ − 343 ## reads 1
+ − 344 sdl_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Sequence Duplication Levels', header = FALSE, comment.char = '#')
+ − 345 names(sdl_1) = c('Duplication_Level', 'Percentage_of_deduplicated', 'Percentage_of_total')
+ − 346 sdl_1$id = 1:length(sdl_1$Duplication_Level)
+ − 347
+ − 348 melt_sdl_1 = melt(sdl_1, id=c('Duplication_Level', 'id'))
+ − 349 melt_sdl_1$trim = 'before'
+ − 350
+ − 351
+ − 352 ## reads 2
+ − 353 sdl_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Sequence Duplication Levels', header = FALSE, comment.char = '#')
+ − 354 names(sdl_2) = c('Duplication_Level', 'Percentage_of_deduplicated', 'Percentage_of_total')
+ − 355 sdl_2$id = 1:length(sdl_2$Duplication_Level)
+ − 356
+ − 357 melt_sdl_2 = melt(sdl_2, id=c('Duplication_Level', 'id'))
+ − 358 melt_sdl_2$trim = 'after'
+ − 359
+ − 360 comb_sdl = rbind(melt_sdl_1, melt_sdl_2)
+ − 361 comb_sdl$trim = factor(levels = c('before', 'after'), comb_sdl$trim)
+ − 362
+ − 363 p = ggplot(data = comb_sdl, aes(x = id, y = value, color = variable)) +
+ − 364 geom_line() +
+ − 365 scale_x_continuous(breaks = sdl_2$id, labels = sdl_2$Duplication_Level) +
+ − 366 facet_grid(. ~ trim) +
+ − 367 xlab('Sequence Duplication Level') +
+ − 368 ylab('') +
+ − 369 theme(axis.text.x = element_text(angle=45))
+ − 370 ggplotly(p)
+ − 371 ```
+ − 372
+ − 373 ### Adapter Content
+ − 374
+ − 375 ```{r 'Adapter Content', fig.width=10}
+ − 376 ## reads 1
+ − 377 ac_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Adapter Content')
+ − 378 ac_1$id = 1:length(ac_1$X.Position)
+ − 379
+ − 380 melt_ac_1 = melt(ac_1, id=c('X.Position', 'id'))
+ − 381 melt_ac_1$trim = 'before'
+ − 382
+ − 383 ## reads 2
+ − 384 ac_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Adapter Content')
+ − 385 ac_2$id = 1:length(ac_2$X.Position)
+ − 386
+ − 387 melt_ac_2 = melt(ac_2, id=c('X.Position', 'id'))
+ − 388 melt_ac_2$trim = 'after'
+ − 389
+ − 390 comb_ac = rbind(melt_ac_1, melt_ac_2)
+ − 391 comb_ac$trim = factor(levels = c('before', 'after'), comb_ac$trim)
+ − 392
+ − 393 p = ggplot(data = comb_ac, aes(x = id, y = value, color = variable)) +
+ − 394 geom_line() +
+ − 395 facet_grid(. ~ trim) +
+ − 396 xlim(min(comb_ac$id), max(comb_ac$id)) +
+ − 397 ylim(0, 1) +
+ − 398 xlab('Position in read (bp)') +
+ − 399 ylab('')
+ − 400 ggplotly(p)
+ − 401 ```
+ − 402
+ − 403 ### Kmer Content {.tabset}
+ − 404
+ − 405 #### Before
+ − 406
+ − 407 ```{r 'Kmer Content (before)', fig.width=10}
+ − 408 kc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Kmer Content')
+ − 409 knitr::kable(kc_1)
+ − 410 ```
+ − 411
+ − 412 #### After
+ − 413 ```{r 'Kmer Content (after)', fig.width=10}
+ − 414 kc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Kmer Content')
+ − 415 knitr::kable(kc_2)
+ − 416 ```
+ − 417
2
+ − 418
14
+ − 419 # Session Info
2
+ − 420
14
+ − 421 ```{r 'session info'}
+ − 422 sessionInfo()
2
+ − 423 ```
+ − 424
19
+ − 425 # References
+ − 426
+ − 427 * Bioinformatics, Babraham (2014). FastQC.
+ − 428
+ − 429 * Allaire, J and Cheng, Joe and Xie, Yihui and McPherson, Jonathan and Chang, Winston and Allen, Jeff and Wickham, Hadley and Atkins, Aron and Hyndman, Rob (2016). rmarkdown: Dynamic Documents for R, 2016. In R package version 0.9, 6.
+ − 430
+ − 431 * Xie, Yihui (2015). Dynamic Documents with R and knitr, CRC Press, Vol.29.
+ − 432
+ − 433 * Carson Sievert and Chris Parmer and Toby Hocking and Scott Chamberlain and Karthik Ram and Marianne Corvellec and Pedro Despouy (2017). plotly: Create Interactive Web Graphics via 'plotly.js'. R package version 4.6.0. [Link]
+ − 434
+ − 435 * Wickham, H. (2016). ggplot2: elegant graphics for data analysis. Springer. Chicago