comparison fastqc_report.Rmd @ 17:ac5c618e4d97 draft

compare evaluation before and after trimming
author mingchen0919
date Mon, 06 Nov 2017 16:53:14 -0500
parents 1710b0e874f1
children 8635a4cee6dd
comparison
equal deleted inserted replaced
16:1710b0e874f1 17:ac5c618e4d97
107 107
108 ## Overview 108 ## Overview
109 109
110 ```{r} 110 ```{r}
111 reads_1_summary = read.csv('REPORT_DIR/reads_1_summary.txt', header = FALSE, sep = '\t')[, 2:1] 111 reads_1_summary = read.csv('REPORT_DIR/reads_1_summary.txt', header = FALSE, sep = '\t')[, 2:1]
112 reads_2_summary = read.csv('REPORT_DIR/reads_1_summary.txt', header = FALSE, sep = '\t')[, 1] 112 reads_2_summary = read.csv('REPORT_DIR/reads_2_summary.txt', header = FALSE, sep = '\t')[, 1]
113 combined_summary = cbind(reads_1_summary, reads_2_summary) 113 combined_summary = cbind(reads_1_summary, reads_2_summary)
114 names(combined_summary) = c('MODULE', paste0(opt$name_1, '(before)'), paste0(opt$name_2, '(after)')) 114 names(combined_summary) = c('MODULE', paste0(opt$name_1, '(before)'), paste0(opt$name_2, '(after)'))
115 combined_summary[combined_summary == 'FAIL'] = 'FAIL (X)'
116 combined_summary[combined_summary == 'WARN'] = 'WARN (!)'
115 knitr::kable(combined_summary) 117 knitr::kable(combined_summary)
116 ``` 118 ```
117 119
118 ## Visualization by module {.tabset} 120 ## Visualization by data module {.tabset}
119 121
120 * Define a function to extract outputs for each module from fastqc output 122 * Define a function to extract outputs for each module from fastqc output
121 123
122 ```{r 'function definition'} 124 ```{r 'function definition'}
123 extract_data_module = function(fastqc_data, module_name) { 125 extract_data_module = function(fastqc_data, module_name, header = TRUE, comment.char = "") {
124 f = readLines(fastqc_data) 126 f = readLines(fastqc_data)
125 start_line = grep(module_name, f) 127 start_line = grep(module_name, f)
126 end_module_lines = grep('END_MODULE', f) 128 end_module_lines = grep('END_MODULE', f)
127 end_line = end_module_lines[which(end_module_lines > start_line)[1]] 129 end_line = end_module_lines[which(end_module_lines > start_line)[1]]
128 module_data = f[(start_line+1):(end_line-1)] 130 module_data = f[(start_line+1):(end_line-1)]
129 writeLines(module_data, 'temp.txt') 131 writeLines(module_data, 'temp.txt')
130 read.csv('temp.txt', sep = '\t') 132 read.csv('temp.txt', sep = '\t', header = header, comment.char = comment.char)
131 } 133 }
132 ``` 134 ```
133 135
134 ### Per base sequence quality 136 ### Per base sequence quality
135 137
136 ```{r 'per base sequence quality', fig.width=10} 138 ```{r 'per base sequence quality', fig.width=10}
137 ## reads 1 139 ## reads 1
138 pbsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base sequence quality') 140 pbsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base sequence quality')
139 pbsq_1$id = 1:length(pbsq_1$X.Base) 141 pbsq_1$id = 1:length(pbsq_1$X.Base)
140 142
141 melt_pbsq_1 = filter(melt(pbsq_1, id=c('X.Base', 'id')), variable != 'X90th.Percentile' & variable != 'X10th.Percentile') 143 melt_pbsq_1 = filter(melt(pbsq_1, id=c('X.Base', 'id')), variable == 'Mean')
142 melt_pbsq_1$trim = 'before' 144 melt_pbsq_1$trim = 'before'
143 145
144 146
145 ## reads 2 147 ## reads 2
146 pbsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base sequence quality') 148 pbsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base sequence quality')
147 pbsq_2$id = 1:length(pbsq_2$X.Base) 149 pbsq_2$id = 1:length(pbsq_2$X.Base)
148 150
149 melt_pbsq_2 = filter(melt(pbsq_2, id=c('X.Base', 'id')), variable != 'X90th.Percentile' & variable != 'X10th.Percentile') 151 melt_pbsq_2 = filter(melt(pbsq_2, id=c('X.Base', 'id')), variable == 'Mean')
150 melt_pbsq_2$trim = 'after' 152 melt_pbsq_2$trim = 'after'
151 153
152 comb_pbsq = rbind(melt_pbsq_1, melt_pbsq_2) 154 comb_pbsq = rbind(melt_pbsq_1, melt_pbsq_2)
153 comb_pbsq$trim = factor(levels = c('before', 'after'), comb_pbsq$trim) 155 comb_pbsq$trim = factor(levels = c('before', 'after'), comb_pbsq$trim)
156
154 p = ggplot(data = comb_pbsq) + 157 p = ggplot(data = comb_pbsq) +
155 geom_line(mapping = aes(x = id, y = value, group = variable, color = variable)) + 158 geom_line(mapping = aes(x = id, y = value, group = variable, color = variable)) +
156 scale_x_continuous(breaks = pbsq_2$id, labels = pbsq_2$X.Base) + 159 scale_x_continuous(breaks = pbsq_2$id, labels = pbsq_2$X.Base) +
157 facet_grid(. ~ trim) + 160 facet_grid(. ~ trim) +
161 ylim(0, max(comb_pbsq$value) + 5) +
158 theme(axis.text.x = element_text(angle=45)) 162 theme(axis.text.x = element_text(angle=45))
159 ggplotly(p) 163 ggplotly(p)
160 164
161 ``` 165 ```
162 166
163 ### Per tile sequence quality 167 ### Per tile sequence quality
164 168
165 ```{r 'per tile sequence quality'} 169 ```{r 'per tile sequence quality', fig.width=10}
166 ## reads 1 170 ## check if 'per tile sequence quality' module exits or not
167 ptsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per tile sequence quality') 171 check_ptsq = grep('Per tile sequence quality', readLines('REPORT_DIR/reads_1_fastqc_data.txt'))
168 ptsq_1$trim = 'before' 172 if (length(check_ptsq) > 0) {
169 173 ## reads 1
170 ## reads 2 174 ptsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per tile sequence quality')
171 ptsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per tile sequence quality') 175 ptsq_1$trim = 'before'
172 ptsq_2$trim = 'after' 176
173 177 ## reads 2
174 comb_ptsq = rbind(ptsq_1, ptsq_2) 178 ptsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per tile sequence quality')
175 comb_ptsq$trim = factor(levels = c('before', 'after'), comb_ptsq$trim) 179 ptsq_2$trim = 'after'
176 comb_pbsq$Base = factor(levels = unique(comb_ptsq$Base), comb_ptsq$Base) 180
177 181 comb_ptsq = rbind(ptsq_1, ptsq_2)
178 p = ggplot(data = comb_ptsq, aes(x = Base, y = X.Tile, fill = Mean)) + 182 comb_ptsq$trim = factor(levels = c('before', 'after'), comb_ptsq$trim)
179 geom_raster() + 183 comb_ptsq$Base = factor(levels = unique(comb_ptsq$Base), comb_ptsq$Base)
184
185 # convert integers to charaters
186 comb_ptsq$Tile = as.character(comb_ptsq$X.Tile)
187
188 p = ggplot(data = comb_ptsq, aes(x = Base, y = Tile, fill = Mean)) +
189 geom_raster() +
190 facet_grid(. ~ trim) +
191 xlab('Position in read (bp)') +
192 ylab('') +
193 theme(axis.text.x = element_text(angle=45))
194 ggplotly(p)
195 } else {
196 print('No "per tile sequence quality" data')
197 }
198
199
200 ```
201
202 ### Per sequence quality score
203
204 ```{r 'Per sequence quality score', fig.width=10}
205 ## reads 1
206 psqs_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per sequence quality scores')
207 psqs_1$trim = 'before'
208
209 ## reads 2
210 psqs_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per sequence quality scores')
211 psqs_2$trim = 'after'
212
213 comb_psqs = rbind(psqs_1, psqs_2)
214 comb_psqs$trim = factor(levels = c('before', 'after'), comb_psqs$trim)
215
216 p = ggplot(data = comb_psqs, aes(x = X.Quality, y = Count)) +
217 geom_line(color = 'red') +
180 facet_grid(. ~ trim) + 218 facet_grid(. ~ trim) +
219 xlim(min(comb_psqs$X.Quality), max(comb_psqs$X.Quality)) +
220 xlab('Mean Sequence Qaulity (Phred Score)') +
221 ylab('')
222 ggplotly(p)
223 ```
224
225
226 ### Per base sequence content
227
228 ```{r 'Per base sequence content', fig.width=10}
229 ## reads 1
230 pbsc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base sequence content')
231 pbsc_1$id = 1:length(pbsc_1$X.Base)
232
233 melt_pbsc_1 = melt(pbsc_1, id=c('X.Base', 'id'))
234 melt_pbsc_1$trim = 'before'
235
236
237 ## reads 2
238 pbsc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base sequence content')
239 pbsc_2$id = 1:length(pbsc_2$X.Base)
240
241 melt_pbsc_2 = melt(pbsc_2, id=c('X.Base', 'id'))
242 melt_pbsc_2$trim = 'after'
243
244 comb_pbsc = rbind(melt_pbsc_1, melt_pbsc_2)
245 comb_pbsc$trim = factor(levels = c('before', 'after'), comb_pbsc$trim)
246
247 p = ggplot(data = comb_pbsc, aes(x = id, y = value, color = variable)) +
248 geom_line() +
249 facet_grid(. ~ trim) +
250 xlim(min(comb_pbsc$id), max(comb_pbsc$id)) +
251 ylim(0, 100) +
252 xlab('Position in read (bp)') +
253 ylab('')
254 ggplotly(p)
255 ```
256
257 ### Per sequence GC content
258
259 ```{r 'Per sequence GC content', fig.width=10}
260 ## reads 1
261 psGCc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per sequence GC content')
262 psGCc_1$trim = 'before'
263
264 ## reads 2
265 psGCc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per sequence GC content')
266 psGCc_2$trim = 'after'
267
268 comb_psGCc = rbind(psGCc_1, psGCc_2)
269 comb_psGCc$trim = factor(levels = c('before', 'after'), comb_psGCc$trim)
270
271 p = ggplot(data = comb_psGCc, aes(x = X.GC.Content, y = Count)) +
272 geom_line(color = 'red') +
273 facet_grid(. ~ trim) +
274 xlab('Mean Sequence Qaulity (Phred Score)') +
275 ylab('')
276 ggplotly(p)
277 ```
278
279
280 ### Per base N content
281
282 ```{r 'Per base N content', fig.width=10}
283 ## reads 1
284 pbNc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base N content')
285 pbNc_1$id = 1:length(pbNc_1$X.Base)
286 pbNc_1$trim = 'before'
287
288 ## reads 2
289 pbNc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base N content')
290 pbNc_2$id = 1:length(pbNc_2$X.Base)
291 pbNc_2$trim = 'after'
292
293 comb_pbNc = rbind(pbNc_1, pbNc_2)
294 comb_pbNc$trim = factor(levels = c('before', 'after'), comb_pbNc$trim)
295
296 p = ggplot(data = comb_pbNc, aes(x = id, y = N.Count)) +
297 geom_line(color = 'red') +
298 scale_x_continuous(breaks = pbNc_2$id, labels = pbNc_2$X.Base) +
299 facet_grid(. ~ trim) +
300 ylim(0, 1) +
301 xlab('N-Count') +
302 ylab('') +
181 theme(axis.text.x = element_text(angle=45)) 303 theme(axis.text.x = element_text(angle=45))
182 ggplotly(p) 304 ggplotly(p)
183 ``` 305 ```
184 306
307
308 ### Sequence Length Distribution
309
310 ```{r 'Sequence Length Distribution', fig.width=10}
311 ## reads 1
312 sld_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Sequence Length Distribution')
313 sld_1$id = 1:length(sld_1$X.Length)
314 sld_1$trim = 'before'
315
316 ## reads 2
317 sld_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Sequence Length Distribution')
318 sld_2$id = 1:length(sld_2$X.Length)
319 sld_2$trim = 'after'
320
321 comb_sld = rbind(sld_1, sld_2)
322 comb_sld$trim = factor(levels = c('before', 'after'), comb_sld$trim)
323
324 p = ggplot(data = comb_sld, aes(x = id, y = Count)) +
325 geom_line(color = 'red') +
326 scale_x_continuous(breaks = sld_2$id, labels = sld_2$X.Length) +
327 facet_grid(. ~ trim) +
328 xlab('Sequence Length (bp)') +
329 ylab('') +
330 theme(axis.text.x = element_text(angle=45))
331 ggplotly(p)
332 ```
333
334 ### Sequence Duplication Levels
335
336 ```{r 'Sequence Duplication Levels', fig.width=10}
337 ## reads 1
338 sdl_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Sequence Duplication Levels', header = FALSE, comment.char = '#')
339 names(sdl_1) = c('Duplication_Level', 'Percentage_of_deduplicated', 'Percentage_of_total')
340 sdl_1$id = 1:length(sdl_1$Duplication_Level)
341
342 melt_sdl_1 = melt(sdl_1, id=c('Duplication_Level', 'id'))
343 melt_sdl_1$trim = 'before'
344
345
346 ## reads 2
347 sdl_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Sequence Duplication Levels', header = FALSE, comment.char = '#')
348 names(sdl_2) = c('Duplication_Level', 'Percentage_of_deduplicated', 'Percentage_of_total')
349 sdl_2$id = 1:length(sdl_2$Duplication_Level)
350
351 melt_sdl_2 = melt(sdl_2, id=c('Duplication_Level', 'id'))
352 melt_sdl_2$trim = 'after'
353
354 comb_sdl = rbind(melt_sdl_1, melt_sdl_2)
355 comb_sdl$trim = factor(levels = c('before', 'after'), comb_sdl$trim)
356
357 p = ggplot(data = comb_sdl, aes(x = id, y = value, color = variable)) +
358 geom_line() +
359 scale_x_continuous(breaks = sdl_2$id, labels = sdl_2$Duplication_Level) +
360 facet_grid(. ~ trim) +
361 xlab('Sequence Duplication Level') +
362 ylab('') +
363 theme(axis.text.x = element_text(angle=45))
364 ggplotly(p)
365 ```
366
367 ### Adapter Content
368
369 ```{r 'Adapter Content', fig.width=10}
370 ## reads 1
371 ac_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Adapter Content')
372 ac_1$id = 1:length(ac_1$X.Position)
373
374 melt_ac_1 = melt(ac_1, id=c('X.Position', 'id'))
375 melt_ac_1$trim = 'before'
376
377 ## reads 2
378 ac_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Adapter Content')
379 ac_2$id = 1:length(ac_2$X.Position)
380
381 melt_ac_2 = melt(ac_2, id=c('X.Position', 'id'))
382 melt_ac_2$trim = 'after'
383
384 comb_ac = rbind(melt_ac_1, melt_ac_2)
385 comb_ac$trim = factor(levels = c('before', 'after'), comb_ac$trim)
386
387 p = ggplot(data = comb_ac, aes(x = id, y = value, color = variable)) +
388 geom_line() +
389 facet_grid(. ~ trim) +
390 xlim(min(comb_ac$id), max(comb_ac$id)) +
391 ylim(0, 1) +
392 xlab('Position in read (bp)') +
393 ylab('')
394 ggplotly(p)
395 ```
396
397 ### Kmer Content {.tabset}
398
399 #### Before
400
401 ```{r 'Kmer Content (before)', fig.width=10}
402 kc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Kmer Content')
403 knitr::kable(kc_1)
404 ```
405
406 #### After
407 ```{r 'Kmer Content (after)', fig.width=10}
408 kc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Kmer Content')
409 knitr::kable(kc_2)
410 ```
185 411
186 412
187 # Session Info 413 # Session Info
188 414
189 ```{r 'session info'} 415 ```{r 'session info'}