Mercurial > repos > mingchen0919 > rmarkdown_fastqc_report
comparison fastqc_report.Rmd @ 17:ac5c618e4d97 draft
compare evaluation before and after trimming
author | mingchen0919 |
---|---|
date | Mon, 06 Nov 2017 16:53:14 -0500 |
parents | 1710b0e874f1 |
children | 8635a4cee6dd |
comparison
equal
deleted
inserted
replaced
16:1710b0e874f1 | 17:ac5c618e4d97 |
---|---|
107 | 107 |
108 ## Overview | 108 ## Overview |
109 | 109 |
110 ```{r} | 110 ```{r} |
111 reads_1_summary = read.csv('REPORT_DIR/reads_1_summary.txt', header = FALSE, sep = '\t')[, 2:1] | 111 reads_1_summary = read.csv('REPORT_DIR/reads_1_summary.txt', header = FALSE, sep = '\t')[, 2:1] |
112 reads_2_summary = read.csv('REPORT_DIR/reads_1_summary.txt', header = FALSE, sep = '\t')[, 1] | 112 reads_2_summary = read.csv('REPORT_DIR/reads_2_summary.txt', header = FALSE, sep = '\t')[, 1] |
113 combined_summary = cbind(reads_1_summary, reads_2_summary) | 113 combined_summary = cbind(reads_1_summary, reads_2_summary) |
114 names(combined_summary) = c('MODULE', paste0(opt$name_1, '(before)'), paste0(opt$name_2, '(after)')) | 114 names(combined_summary) = c('MODULE', paste0(opt$name_1, '(before)'), paste0(opt$name_2, '(after)')) |
115 combined_summary[combined_summary == 'FAIL'] = 'FAIL (X)' | |
116 combined_summary[combined_summary == 'WARN'] = 'WARN (!)' | |
115 knitr::kable(combined_summary) | 117 knitr::kable(combined_summary) |
116 ``` | 118 ``` |
117 | 119 |
118 ## Visualization by module {.tabset} | 120 ## Visualization by data module {.tabset} |
119 | 121 |
120 * Define a function to extract outputs for each module from fastqc output | 122 * Define a function to extract outputs for each module from fastqc output |
121 | 123 |
122 ```{r 'function definition'} | 124 ```{r 'function definition'} |
123 extract_data_module = function(fastqc_data, module_name) { | 125 extract_data_module = function(fastqc_data, module_name, header = TRUE, comment.char = "") { |
124 f = readLines(fastqc_data) | 126 f = readLines(fastqc_data) |
125 start_line = grep(module_name, f) | 127 start_line = grep(module_name, f) |
126 end_module_lines = grep('END_MODULE', f) | 128 end_module_lines = grep('END_MODULE', f) |
127 end_line = end_module_lines[which(end_module_lines > start_line)[1]] | 129 end_line = end_module_lines[which(end_module_lines > start_line)[1]] |
128 module_data = f[(start_line+1):(end_line-1)] | 130 module_data = f[(start_line+1):(end_line-1)] |
129 writeLines(module_data, 'temp.txt') | 131 writeLines(module_data, 'temp.txt') |
130 read.csv('temp.txt', sep = '\t') | 132 read.csv('temp.txt', sep = '\t', header = header, comment.char = comment.char) |
131 } | 133 } |
132 ``` | 134 ``` |
133 | 135 |
134 ### Per base sequence quality | 136 ### Per base sequence quality |
135 | 137 |
136 ```{r 'per base sequence quality', fig.width=10} | 138 ```{r 'per base sequence quality', fig.width=10} |
137 ## reads 1 | 139 ## reads 1 |
138 pbsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base sequence quality') | 140 pbsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base sequence quality') |
139 pbsq_1$id = 1:length(pbsq_1$X.Base) | 141 pbsq_1$id = 1:length(pbsq_1$X.Base) |
140 | 142 |
141 melt_pbsq_1 = filter(melt(pbsq_1, id=c('X.Base', 'id')), variable != 'X90th.Percentile' & variable != 'X10th.Percentile') | 143 melt_pbsq_1 = filter(melt(pbsq_1, id=c('X.Base', 'id')), variable == 'Mean') |
142 melt_pbsq_1$trim = 'before' | 144 melt_pbsq_1$trim = 'before' |
143 | 145 |
144 | 146 |
145 ## reads 2 | 147 ## reads 2 |
146 pbsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base sequence quality') | 148 pbsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base sequence quality') |
147 pbsq_2$id = 1:length(pbsq_2$X.Base) | 149 pbsq_2$id = 1:length(pbsq_2$X.Base) |
148 | 150 |
149 melt_pbsq_2 = filter(melt(pbsq_2, id=c('X.Base', 'id')), variable != 'X90th.Percentile' & variable != 'X10th.Percentile') | 151 melt_pbsq_2 = filter(melt(pbsq_2, id=c('X.Base', 'id')), variable == 'Mean') |
150 melt_pbsq_2$trim = 'after' | 152 melt_pbsq_2$trim = 'after' |
151 | 153 |
152 comb_pbsq = rbind(melt_pbsq_1, melt_pbsq_2) | 154 comb_pbsq = rbind(melt_pbsq_1, melt_pbsq_2) |
153 comb_pbsq$trim = factor(levels = c('before', 'after'), comb_pbsq$trim) | 155 comb_pbsq$trim = factor(levels = c('before', 'after'), comb_pbsq$trim) |
156 | |
154 p = ggplot(data = comb_pbsq) + | 157 p = ggplot(data = comb_pbsq) + |
155 geom_line(mapping = aes(x = id, y = value, group = variable, color = variable)) + | 158 geom_line(mapping = aes(x = id, y = value, group = variable, color = variable)) + |
156 scale_x_continuous(breaks = pbsq_2$id, labels = pbsq_2$X.Base) + | 159 scale_x_continuous(breaks = pbsq_2$id, labels = pbsq_2$X.Base) + |
157 facet_grid(. ~ trim) + | 160 facet_grid(. ~ trim) + |
161 ylim(0, max(comb_pbsq$value) + 5) + | |
158 theme(axis.text.x = element_text(angle=45)) | 162 theme(axis.text.x = element_text(angle=45)) |
159 ggplotly(p) | 163 ggplotly(p) |
160 | 164 |
161 ``` | 165 ``` |
162 | 166 |
163 ### Per tile sequence quality | 167 ### Per tile sequence quality |
164 | 168 |
165 ```{r 'per tile sequence quality'} | 169 ```{r 'per tile sequence quality', fig.width=10} |
166 ## reads 1 | 170 ## check if 'per tile sequence quality' module exits or not |
167 ptsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per tile sequence quality') | 171 check_ptsq = grep('Per tile sequence quality', readLines('REPORT_DIR/reads_1_fastqc_data.txt')) |
168 ptsq_1$trim = 'before' | 172 if (length(check_ptsq) > 0) { |
169 | 173 ## reads 1 |
170 ## reads 2 | 174 ptsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per tile sequence quality') |
171 ptsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per tile sequence quality') | 175 ptsq_1$trim = 'before' |
172 ptsq_2$trim = 'after' | 176 |
173 | 177 ## reads 2 |
174 comb_ptsq = rbind(ptsq_1, ptsq_2) | 178 ptsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per tile sequence quality') |
175 comb_ptsq$trim = factor(levels = c('before', 'after'), comb_ptsq$trim) | 179 ptsq_2$trim = 'after' |
176 comb_pbsq$Base = factor(levels = unique(comb_ptsq$Base), comb_ptsq$Base) | 180 |
177 | 181 comb_ptsq = rbind(ptsq_1, ptsq_2) |
178 p = ggplot(data = comb_ptsq, aes(x = Base, y = X.Tile, fill = Mean)) + | 182 comb_ptsq$trim = factor(levels = c('before', 'after'), comb_ptsq$trim) |
179 geom_raster() + | 183 comb_ptsq$Base = factor(levels = unique(comb_ptsq$Base), comb_ptsq$Base) |
184 | |
185 # convert integers to charaters | |
186 comb_ptsq$Tile = as.character(comb_ptsq$X.Tile) | |
187 | |
188 p = ggplot(data = comb_ptsq, aes(x = Base, y = Tile, fill = Mean)) + | |
189 geom_raster() + | |
190 facet_grid(. ~ trim) + | |
191 xlab('Position in read (bp)') + | |
192 ylab('') + | |
193 theme(axis.text.x = element_text(angle=45)) | |
194 ggplotly(p) | |
195 } else { | |
196 print('No "per tile sequence quality" data') | |
197 } | |
198 | |
199 | |
200 ``` | |
201 | |
202 ### Per sequence quality score | |
203 | |
204 ```{r 'Per sequence quality score', fig.width=10} | |
205 ## reads 1 | |
206 psqs_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per sequence quality scores') | |
207 psqs_1$trim = 'before' | |
208 | |
209 ## reads 2 | |
210 psqs_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per sequence quality scores') | |
211 psqs_2$trim = 'after' | |
212 | |
213 comb_psqs = rbind(psqs_1, psqs_2) | |
214 comb_psqs$trim = factor(levels = c('before', 'after'), comb_psqs$trim) | |
215 | |
216 p = ggplot(data = comb_psqs, aes(x = X.Quality, y = Count)) + | |
217 geom_line(color = 'red') + | |
180 facet_grid(. ~ trim) + | 218 facet_grid(. ~ trim) + |
219 xlim(min(comb_psqs$X.Quality), max(comb_psqs$X.Quality)) + | |
220 xlab('Mean Sequence Qaulity (Phred Score)') + | |
221 ylab('') | |
222 ggplotly(p) | |
223 ``` | |
224 | |
225 | |
226 ### Per base sequence content | |
227 | |
228 ```{r 'Per base sequence content', fig.width=10} | |
229 ## reads 1 | |
230 pbsc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base sequence content') | |
231 pbsc_1$id = 1:length(pbsc_1$X.Base) | |
232 | |
233 melt_pbsc_1 = melt(pbsc_1, id=c('X.Base', 'id')) | |
234 melt_pbsc_1$trim = 'before' | |
235 | |
236 | |
237 ## reads 2 | |
238 pbsc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base sequence content') | |
239 pbsc_2$id = 1:length(pbsc_2$X.Base) | |
240 | |
241 melt_pbsc_2 = melt(pbsc_2, id=c('X.Base', 'id')) | |
242 melt_pbsc_2$trim = 'after' | |
243 | |
244 comb_pbsc = rbind(melt_pbsc_1, melt_pbsc_2) | |
245 comb_pbsc$trim = factor(levels = c('before', 'after'), comb_pbsc$trim) | |
246 | |
247 p = ggplot(data = comb_pbsc, aes(x = id, y = value, color = variable)) + | |
248 geom_line() + | |
249 facet_grid(. ~ trim) + | |
250 xlim(min(comb_pbsc$id), max(comb_pbsc$id)) + | |
251 ylim(0, 100) + | |
252 xlab('Position in read (bp)') + | |
253 ylab('') | |
254 ggplotly(p) | |
255 ``` | |
256 | |
257 ### Per sequence GC content | |
258 | |
259 ```{r 'Per sequence GC content', fig.width=10} | |
260 ## reads 1 | |
261 psGCc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per sequence GC content') | |
262 psGCc_1$trim = 'before' | |
263 | |
264 ## reads 2 | |
265 psGCc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per sequence GC content') | |
266 psGCc_2$trim = 'after' | |
267 | |
268 comb_psGCc = rbind(psGCc_1, psGCc_2) | |
269 comb_psGCc$trim = factor(levels = c('before', 'after'), comb_psGCc$trim) | |
270 | |
271 p = ggplot(data = comb_psGCc, aes(x = X.GC.Content, y = Count)) + | |
272 geom_line(color = 'red') + | |
273 facet_grid(. ~ trim) + | |
274 xlab('Mean Sequence Qaulity (Phred Score)') + | |
275 ylab('') | |
276 ggplotly(p) | |
277 ``` | |
278 | |
279 | |
280 ### Per base N content | |
281 | |
282 ```{r 'Per base N content', fig.width=10} | |
283 ## reads 1 | |
284 pbNc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base N content') | |
285 pbNc_1$id = 1:length(pbNc_1$X.Base) | |
286 pbNc_1$trim = 'before' | |
287 | |
288 ## reads 2 | |
289 pbNc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base N content') | |
290 pbNc_2$id = 1:length(pbNc_2$X.Base) | |
291 pbNc_2$trim = 'after' | |
292 | |
293 comb_pbNc = rbind(pbNc_1, pbNc_2) | |
294 comb_pbNc$trim = factor(levels = c('before', 'after'), comb_pbNc$trim) | |
295 | |
296 p = ggplot(data = comb_pbNc, aes(x = id, y = N.Count)) + | |
297 geom_line(color = 'red') + | |
298 scale_x_continuous(breaks = pbNc_2$id, labels = pbNc_2$X.Base) + | |
299 facet_grid(. ~ trim) + | |
300 ylim(0, 1) + | |
301 xlab('N-Count') + | |
302 ylab('') + | |
181 theme(axis.text.x = element_text(angle=45)) | 303 theme(axis.text.x = element_text(angle=45)) |
182 ggplotly(p) | 304 ggplotly(p) |
183 ``` | 305 ``` |
184 | 306 |
307 | |
308 ### Sequence Length Distribution | |
309 | |
310 ```{r 'Sequence Length Distribution', fig.width=10} | |
311 ## reads 1 | |
312 sld_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Sequence Length Distribution') | |
313 sld_1$id = 1:length(sld_1$X.Length) | |
314 sld_1$trim = 'before' | |
315 | |
316 ## reads 2 | |
317 sld_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Sequence Length Distribution') | |
318 sld_2$id = 1:length(sld_2$X.Length) | |
319 sld_2$trim = 'after' | |
320 | |
321 comb_sld = rbind(sld_1, sld_2) | |
322 comb_sld$trim = factor(levels = c('before', 'after'), comb_sld$trim) | |
323 | |
324 p = ggplot(data = comb_sld, aes(x = id, y = Count)) + | |
325 geom_line(color = 'red') + | |
326 scale_x_continuous(breaks = sld_2$id, labels = sld_2$X.Length) + | |
327 facet_grid(. ~ trim) + | |
328 xlab('Sequence Length (bp)') + | |
329 ylab('') + | |
330 theme(axis.text.x = element_text(angle=45)) | |
331 ggplotly(p) | |
332 ``` | |
333 | |
334 ### Sequence Duplication Levels | |
335 | |
336 ```{r 'Sequence Duplication Levels', fig.width=10} | |
337 ## reads 1 | |
338 sdl_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Sequence Duplication Levels', header = FALSE, comment.char = '#') | |
339 names(sdl_1) = c('Duplication_Level', 'Percentage_of_deduplicated', 'Percentage_of_total') | |
340 sdl_1$id = 1:length(sdl_1$Duplication_Level) | |
341 | |
342 melt_sdl_1 = melt(sdl_1, id=c('Duplication_Level', 'id')) | |
343 melt_sdl_1$trim = 'before' | |
344 | |
345 | |
346 ## reads 2 | |
347 sdl_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Sequence Duplication Levels', header = FALSE, comment.char = '#') | |
348 names(sdl_2) = c('Duplication_Level', 'Percentage_of_deduplicated', 'Percentage_of_total') | |
349 sdl_2$id = 1:length(sdl_2$Duplication_Level) | |
350 | |
351 melt_sdl_2 = melt(sdl_2, id=c('Duplication_Level', 'id')) | |
352 melt_sdl_2$trim = 'after' | |
353 | |
354 comb_sdl = rbind(melt_sdl_1, melt_sdl_2) | |
355 comb_sdl$trim = factor(levels = c('before', 'after'), comb_sdl$trim) | |
356 | |
357 p = ggplot(data = comb_sdl, aes(x = id, y = value, color = variable)) + | |
358 geom_line() + | |
359 scale_x_continuous(breaks = sdl_2$id, labels = sdl_2$Duplication_Level) + | |
360 facet_grid(. ~ trim) + | |
361 xlab('Sequence Duplication Level') + | |
362 ylab('') + | |
363 theme(axis.text.x = element_text(angle=45)) | |
364 ggplotly(p) | |
365 ``` | |
366 | |
367 ### Adapter Content | |
368 | |
369 ```{r 'Adapter Content', fig.width=10} | |
370 ## reads 1 | |
371 ac_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Adapter Content') | |
372 ac_1$id = 1:length(ac_1$X.Position) | |
373 | |
374 melt_ac_1 = melt(ac_1, id=c('X.Position', 'id')) | |
375 melt_ac_1$trim = 'before' | |
376 | |
377 ## reads 2 | |
378 ac_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Adapter Content') | |
379 ac_2$id = 1:length(ac_2$X.Position) | |
380 | |
381 melt_ac_2 = melt(ac_2, id=c('X.Position', 'id')) | |
382 melt_ac_2$trim = 'after' | |
383 | |
384 comb_ac = rbind(melt_ac_1, melt_ac_2) | |
385 comb_ac$trim = factor(levels = c('before', 'after'), comb_ac$trim) | |
386 | |
387 p = ggplot(data = comb_ac, aes(x = id, y = value, color = variable)) + | |
388 geom_line() + | |
389 facet_grid(. ~ trim) + | |
390 xlim(min(comb_ac$id), max(comb_ac$id)) + | |
391 ylim(0, 1) + | |
392 xlab('Position in read (bp)') + | |
393 ylab('') | |
394 ggplotly(p) | |
395 ``` | |
396 | |
397 ### Kmer Content {.tabset} | |
398 | |
399 #### Before | |
400 | |
401 ```{r 'Kmer Content (before)', fig.width=10} | |
402 kc_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Kmer Content') | |
403 knitr::kable(kc_1) | |
404 ``` | |
405 | |
406 #### After | |
407 ```{r 'Kmer Content (after)', fig.width=10} | |
408 kc_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Kmer Content') | |
409 knitr::kable(kc_2) | |
410 ``` | |
185 | 411 |
186 | 412 |
187 # Session Info | 413 # Session Info |
188 | 414 |
189 ```{r 'session info'} | 415 ```{r 'session info'} |