2
|
1 ---
|
15
|
2 title: 'Short reads evaluation with [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)'
|
14
|
3 output:
|
|
4 html_document:
|
|
5 number_sections: true
|
|
6 toc: true
|
|
7 theme: cosmo
|
|
8 highlight: tango
|
2
|
9 ---
|
|
10
|
14
|
11 ```{r setup, include=FALSE, warning=FALSE, message=FALSE}
|
|
12 knitr::opts_chunk$set(
|
16
|
13 echo = ECHO,
|
|
14 error = TRUE
|
14
|
15 )
|
2
|
16 ```
|
|
17
|
|
18
|
16
|
19 # Fastqc Evaluation
|
14
|
20
|
16
|
21 ## Evaluation of reads before trimming
|
2
|
22
|
16
|
23 ```{r}
|
|
24 if ('READS_1' == 'None') {
|
|
25 stop("No pre-trimming reads provided!")
|
|
26 } else {
|
|
27 ## run fastqc evaluation
|
|
28 fastqc_command = paste0('fastqc ') %>%
|
|
29 (function(x) {
|
|
30 ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x)
|
|
31 }) %>%
|
|
32 (function(x) {
|
|
33 ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x)
|
|
34 }) %>%
|
|
35 (function(x) {
|
|
36 paste0(x, '-o REPORT_DIR ')
|
|
37 })
|
|
38 fastqc_command_reads_1 = paste0(fastqc_command, 'READS_1 > /dev/null 2>&1')
|
|
39 system(fastqc_command_reads_1, intern = TRUE)
|
|
40
|
|
41 # Original html report
|
|
42 reads_1_base = tail(strsplit('READS_1', '/')[[1]], 1)
|
|
43 original_html = tags$a(href=paste0(reads_1_base, '_fastqc.html'), paste0('HTML report: ', opt$name_1))
|
|
44
|
|
45 unzip(paste0('REPORT_DIR/', reads_1_base, '_fastqc.zip'), exdir = 'REPORT_DIR')
|
|
46 reads_1_unzip = paste0('REPORT_DIR/', reads_1_base, '_fastqc/')
|
|
47 # fastqc_data.txt
|
|
48 file.copy(paste0(reads_1_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_1_fastqc_data.txt')
|
|
49 fastqc_data = tags$a(href='reads_1_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_1))
|
|
50 # summary.txt
|
|
51 file.copy(paste0(reads_1_unzip, 'summary.txt'), 'REPORT_DIR/reads_1_summary.txt')
|
|
52 summary_data = tags$a(href='reads_1_summary.txt', paste0('summary.txt: ', opt$name_1))
|
|
53
|
|
54 tags$ul(
|
|
55 tags$li(original_html),
|
|
56 tags$li(fastqc_data),
|
|
57 tags$li(summary_data)
|
|
58 )
|
|
59 }
|
15
|
60 ```
|
|
61
|
|
62
|
16
|
63 ## Evaluation of reads after trimming
|
15
|
64
|
16
|
65 ```{r}
|
|
66 if ('READS_2' == 'None') {
|
|
67 stop("No pre-trimming reads provided!")
|
|
68 } else {
|
|
69 ## run fastqc evaluation
|
|
70 fastqc_command = paste0('fastqc ') %>%
|
|
71 (function(x) {
|
|
72 ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x)
|
|
73 }) %>%
|
|
74 (function(x) {
|
|
75 ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x)
|
|
76 }) %>%
|
|
77 (function(x) {
|
|
78 paste0(x, '-o REPORT_DIR ')
|
|
79 })
|
|
80 fastqc_command_reads_2 = paste0(fastqc_command, 'READS_2 > /dev/null 2>&1')
|
|
81 system(fastqc_command_reads_2, intern = TRUE)
|
|
82
|
|
83 # Original html report
|
|
84 reads_2_base = tail(strsplit('READS_2', '/')[[1]], 1)
|
|
85 original_html = tags$a(href=paste0(reads_2_base, '_fastqc.html'), paste0('HTML report: ', opt$name_2))
|
|
86
|
|
87 unzip(paste0('REPORT_DIR/', reads_2_base, '_fastqc.zip'), exdir = 'REPORT_DIR')
|
|
88 reads_2_unzip = paste0('REPORT_DIR/', reads_2_base, '_fastqc/')
|
|
89 # fastqc_data.txt
|
|
90 file.copy(paste0(reads_2_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_2_fastqc_data.txt')
|
|
91 fastqc_data = tags$a(href='reads_2_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_2))
|
|
92 # summary.txt
|
|
93 file.copy(paste0(reads_2_unzip, 'summary.txt'), 'REPORT_DIR/reads_2_summary.txt')
|
|
94 summary_data = tags$a(href='reads_2_summary.txt', paste0('summary.txt: ', opt$name_2))
|
|
95
|
|
96 tags$ul(
|
|
97 tags$li(original_html),
|
|
98 tags$li(fastqc_data),
|
|
99 tags$li(summary_data)
|
|
100 )
|
|
101 }
|
2
|
102 ```
|
|
103
|
15
|
104
|
|
105
|
|
106 # Fastqc output visualization
|
|
107
|
|
108 ## Overview
|
|
109
|
|
110 ```{r}
|
16
|
111 reads_1_summary = read.csv('REPORT_DIR/reads_1_summary.txt', header = FALSE, sep = '\t')[, 2:1]
|
|
112 reads_2_summary = read.csv('REPORT_DIR/reads_1_summary.txt', header = FALSE, sep = '\t')[, 1]
|
|
113 combined_summary = cbind(reads_1_summary, reads_2_summary)
|
|
114 names(combined_summary) = c('MODULE', paste0(opt$name_1, '(before)'), paste0(opt$name_2, '(after)'))
|
|
115 knitr::kable(combined_summary)
|
15
|
116 ```
|
|
117
|
16
|
118 ## Visualization by module {.tabset}
|
2
|
119
|
14
|
120 * Define a function to extract outputs for each module from fastqc output
|
2
|
121
|
14
|
122 ```{r 'function definition'}
|
|
123 extract_data_module = function(fastqc_data, module_name) {
|
|
124 f = readLines(fastqc_data)
|
|
125 start_line = grep(module_name, f)
|
|
126 end_module_lines = grep('END_MODULE', f)
|
|
127 end_line = end_module_lines[which(end_module_lines > start_line)[1]]
|
|
128 module_data = f[(start_line+1):(end_line-1)]
|
|
129 writeLines(module_data, 'temp.txt')
|
|
130 read.csv('temp.txt', sep = '\t')
|
2
|
131 }
|
|
132 ```
|
|
133
|
15
|
134 ### Per base sequence quality
|
|
135
|
16
|
136 ```{r 'per base sequence quality', fig.width=10}
|
|
137 ## reads 1
|
|
138 pbsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base sequence quality')
|
|
139 pbsq_1$id = 1:length(pbsq_1$X.Base)
|
|
140
|
|
141 melt_pbsq_1 = filter(melt(pbsq_1, id=c('X.Base', 'id')), variable != 'X90th.Percentile' & variable != 'X10th.Percentile')
|
|
142 melt_pbsq_1$trim = 'before'
|
|
143
|
|
144
|
|
145 ## reads 2
|
|
146 pbsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base sequence quality')
|
|
147 pbsq_2$id = 1:length(pbsq_2$X.Base)
|
|
148
|
|
149 melt_pbsq_2 = filter(melt(pbsq_2, id=c('X.Base', 'id')), variable != 'X90th.Percentile' & variable != 'X10th.Percentile')
|
|
150 melt_pbsq_2$trim = 'after'
|
|
151
|
|
152 comb_pbsq = rbind(melt_pbsq_1, melt_pbsq_2)
|
|
153 comb_pbsq$trim = factor(levels = c('before', 'after'), comb_pbsq$trim)
|
|
154 p = ggplot(data = comb_pbsq) +
|
|
155 geom_line(mapping = aes(x = id, y = value, group = variable, color = variable)) +
|
|
156 scale_x_continuous(breaks = pbsq_2$id, labels = pbsq_2$X.Base) +
|
|
157 facet_grid(. ~ trim) +
|
|
158 theme(axis.text.x = element_text(angle=45))
|
|
159 ggplotly(p)
|
|
160
|
15
|
161 ```
|
|
162
|
|
163 ### Per tile sequence quality
|
|
164
|
16
|
165 ```{r 'per tile sequence quality'}
|
|
166 ## reads 1
|
|
167 ptsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per tile sequence quality')
|
|
168 ptsq_1$trim = 'before'
|
|
169
|
|
170 ## reads 2
|
|
171 ptsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per tile sequence quality')
|
|
172 ptsq_2$trim = 'after'
|
|
173
|
|
174 comb_ptsq = rbind(ptsq_1, ptsq_2)
|
|
175 comb_ptsq$trim = factor(levels = c('before', 'after'), comb_ptsq$trim)
|
|
176 comb_pbsq$Base = factor(levels = unique(comb_ptsq$Base), comb_ptsq$Base)
|
|
177
|
|
178 p = ggplot(data = comb_ptsq, aes(x = Base, y = X.Tile, fill = Mean)) +
|
|
179 geom_raster() +
|
|
180 facet_grid(. ~ trim) +
|
|
181 theme(axis.text.x = element_text(angle=45))
|
|
182 ggplotly(p)
|
15
|
183 ```
|
|
184
|
|
185
|
2
|
186
|
14
|
187 # Session Info
|
2
|
188
|
14
|
189 ```{r 'session info'}
|
|
190 sessionInfo()
|
2
|
191 ```
|
|
192
|