comparison fastqc_report.Rmd @ 14:2efa46ce2c4c draft

upgrade fastqc_report
author mingchen0919
date Wed, 18 Oct 2017 22:06:39 -0400
parents e629c2288316
children d1d20f341632
comparison
equal deleted inserted replaced
13:9d3586701985 14:2efa46ce2c4c
1 --- 1 ---
2 title: "Fastqc report: short reads quality evaluation" 2 title: 'HTML report title'
3 author: "Ming Chen" 3 output:
4 output: html_document 4 html_document:
5 number_sections: true
6 toc: true
7 theme: cosmo
8 highlight: tango
5 --- 9 ---
6 10
7 ```{r setup, include=FALSE} 11 ```{r setup, include=FALSE, warning=FALSE, message=FALSE}
8 knitr::opts_chunk$set(echo=ECHO, warning=FALSE, message=FALSE) 12 knitr::opts_chunk$set(
9 library(plyr) 13 echo = ECHO
10 library(stringr) 14 )
11 library(dplyr)
12 library(highcharter)
13 library(DT)
14 library(reshape2)
15 library(plotly)
16 library(formattable)
17 library(htmltools)
18 ``` 15 ```
19 16
20 17
21 ```{bash 'create output directory', echo=FALSE} 18 # Fastqc Analysis
22 # create extra files directory. very important!
23 mkdir REPORT_OUTPUT_DIR
24 ```
25 19
26 # Fastqc analysis 20 * Copy fastq files to job working directory
27 ```{bash 'copy data to working directory', echo=FALSE} 21
28 # Copy uploaded data to the working directory 22 ```{bash 'copy files'}
29 for f in $(echo READS | sed "s/,/ /g") 23 for f in $(echo READS | sed "s/,/ /g")
30 do 24 do
31 cp $f ./ 25 cp $f ./
32 done 26 done
33 ``` 27 ```
34 28
29 * Run fastqc
35 30
36 ```{bash 'run fastqc', echo=FALSE} 31 ```{bash 'run fastqc'}
37 for r in $(ls *.dat) 32 for r in $(ls *.dat)
38 do 33 do
39 fastqc -o REPORT_OUTPUT_DIR $r > /dev/null 2>&1 34 fastqc -o REPORT_DIR $r > /dev/null 2>&1
40 done 35 done
41 ``` 36 ```
42 37
43 ## Fastqc html reports 38 * Create links to original HTML reports
44 39
45 Below are links to the original ***Fastqc*** HTML reports.
46 ```{r 'html report links'} 40 ```{r 'html report links'}
47 html_report_list = list() 41 html_report_list = list()
48 html_files = list.files('REPORT_OUTPUT_DIR', pattern = '.*html') 42 html_files = list.files('REPORT_DIR', pattern = '.*html')
49 for (i in html_files) { 43 for (i in html_files) {
50 html_report_list[[i]] = tags$li(tags$a(href=i, i)) 44 html_report_list[[i]] = tags$li(tags$a(href=i, i))
51 } 45 }
52 tags$ul(html_report_list) 46 tags$ul(html_report_list)
53 ``` 47 ```
54 48
49 # Fastqc output summary
55 50
56 ## Parsing fastqc data 51 * Define a function to extract outputs for each module from fastqc output
57 52
58 ```{bash echo=FALSE} 53 ```{r 'function definition'}
59 ##==== copy fastqc generated zip files from report output directory to job work directory == 54 extract_data_module = function(fastqc_data, module_name) {
60 cp -r REPORT_OUTPUT_DIR/*zip ./ 55 f = readLines(fastqc_data)
61 56 start_line = grep(module_name, f)
62 # create a file to store data file paths 57 end_module_lines = grep('END_MODULE', f)
63 echo "sample_id,file_path" > PWF_file_paths.txt # Pass, Warning, Fail 58 end_line = end_module_lines[which(end_module_lines > start_line)[1]]
64 echo "sample_id,file_path" > PBQS_file_paths.txt # Per Base Quality Score 59 module_data = f[(start_line+1):(end_line-1)]
65 echo "sample_id,file_path" > PSQS_file_paths.txt # Per Sequence Quality Score 60 writeLines(module_data, 'temp.txt')
66 echo "sample_id,file_path" > PSGC_file_paths.txt # Per Sequence GC Content 61 read.csv('temp.txt', sep = '\t')
67 echo "sample_id,file_path" > PBSC_file_paths.txt # Per Base Sequence Content
68 echo "sample_id,file_path" > PBNC_file_paths.txt # Per Base N Content
69 echo "sample_id,file_path" > SDL_file_paths.txt # Sequence Duplication Level
70 echo "sample_id,file_path" > SLD_file_paths.txt # Sequence Length Distribution
71 echo "sample_id,file_path" > KMC_file_paths.txt # Kmer Content
72
73 for i in $(ls *.zip)
74 do
75 BASE=$(echo $i | sed 's/\(.*\)\.zip/\1/g')
76 echo $BASE
77 unzip ${BASE}.zip > /dev/null 2>&1
78
79 ##====== pass,warning,fail (PWF) =============
80 awk '/^>>/ {print}' "$BASE"/fastqc_data.txt | grep -v 'END_MODULE' | sed 's/>>//' > "$BASE"-PWF.txt
81 echo "${BASE},${BASE}-PWF.txt" >> PWF_file_paths.txt
82
83 ##====== per base quality scores (PBQS) ======
84 awk '/^>>Per base sequence quality/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBQS.txt
85 echo "${BASE},${BASE}-PBQS.txt" >> PBQS_file_paths.txt
86
87 ##====== per sequence quality scores (PSQS)
88 awk '/^>>Per sequence quality scores/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PSQS.txt
89 echo "${BASE},${BASE}-PSQS.txt" >> PSQS_file_paths.txt
90
91 ##====== Per sequence GC content (PSGC)
92 awk '/^>>Per sequence GC content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PSGC.txt
93 echo "${BASE},${BASE}-PSGC.txt" >> PSGC_file_paths.txt
94
95 ##====== Per Base Sequence Content (PBSC)
96 awk '/^>>Per base sequence content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBSC.txt
97 echo "${BASE},${BASE}-PBSC.txt" >> PBSC_file_paths.txt
98
99 ##====== Per Base N Content (PBNC)
100 awk '/^>>Per base N content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBNC.txt
101 echo "${BASE},${BASE}-PBNC.txt" >> PBNC_file_paths.txt
102
103 ##====== Sequence Duplication Level (SDL)
104 awk '/^>>Sequence Duplication Levels/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-SDL.txt
105 echo "${BASE},${BASE}-SDL.txt" >> SDL_file_paths.txt
106
107 ##====== Sequence Length Distribution (SLD)
108 awk '/^>>Sequence Length Distribution/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-SLD.txt
109 echo "${BASE},${BASE}-SLD.txt" >> SLD_file_paths.txt
110
111 ##====== Kmer Content ============
112 awk '/^>>Kmer Content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-KMC.txt
113 echo "${BASE},${BASE}-KMC.txt" >> KMC_file_paths.txt
114
115 done
116 ```
117
118
119 ## Evaluation Overview
120
121 ```{r 'overview'}
122 PWF_file_paths = read.csv('PWF_file_paths.txt',
123 header = TRUE, stringsAsFactors = FALSE)
124 rm('PWF_df')
125 for(i in 1:nrow(PWF_file_paths)) {
126 file_path = PWF_file_paths[i,2]
127 pwf_df = read.csv(file_path,
128 sep='\t', header=FALSE, stringsAsFactors = FALSE)
129 colnames(pwf_df) = c('item', PWF_file_paths[i,1])
130 if (!exists('PWF_df')) {
131 PWF_df = pwf_df
132 } else {
133 PWF_df = cbind(PWF_df, pwf_df[,2,drop=FALSE])
134 }
135 } 62 }
136 ``` 63 ```
137 64
138 ```{r} 65 ##
139 my_icon = c('ok', 'remove', 'star')
140 names(my_icon) = c('pass', 'fail', 'warn')
141 evaluate_list = list()
142 for (i in colnames(PWF_df)[-1]) {
143 evaluate_list[[i]] = formatter(
144 "span",
145 style = x ~ style("background-color" = ifelse(x =='pass', '#9CD027', ifelse(x == 'fail', '#CC0000', '#FF4E00')),
146 "color" = "white",
147 "width" = "50px",
148 "float" = "left",
149 "padding-right" = "5px")
150 )
151 }
152 66
153 formattable(PWF_df, evaluate_list) 67 # Session Info
68
69 ```{r 'session info'}
70 sessionInfo()
154 ``` 71 ```
155 72
156
157 ## Per Base Quality Scores
158
159 ```{r}
160 PBQS_df = data.frame()
161 PBQS_file_paths = read.csv('PBQS_file_paths.txt',
162 header = TRUE, stringsAsFactors = FALSE)
163 for(i in 1:nrow(PBQS_file_paths)) {
164 # file_path = paste0('REPORT_OUTPUT_DIR/', PBQS_file_paths[i,2])
165 file_path = PBQS_file_paths[i,2]
166 pbqs_df = read.csv(file_path,
167 sep='\t', header=TRUE, stringsAsFactors = FALSE) %>%
168 mutate(Base1=as.numeric(str_split_fixed(X.Base, '-', 2)[,1]),
169 Base2=as.numeric(str_split_fixed(X.Base, '-', 2)[,2])) %>%
170 (function (df) {
171 df1 = select(df, -Base2)
172 df2 = select(df, -Base1) %>% filter(Base2 != '')
173 colnames(df1) = c(colnames(df1)[1:7], 'Base')
174 colnames(df2) = c(colnames(df2)[1:7], 'Base')
175 res = rbind(df1, df2) %>% arrange(Base)
176 return(res)
177 })
178 pbqs_df$sample_id = rep(PBQS_file_paths[i,1], nrow(pbqs_df))
179 PBQS_df = rbind(PBQS_df, pbqs_df)
180 }
181 ```
182
183
184 ```{r}
185 # datatable(PBQS_df)
186 max_phred = max(PBQS_df$Mean) + 10
187 hchart(PBQS_df, "line", hcaes(x = Base, y = Mean, group = sample_id)) %>%
188 hc_title(
189 text = "Per Base Quality Score"
190 ) %>%
191 hc_yAxis(
192 title = list(text = "Mean Base Quality Score"),
193 min = 0,
194 max = max_phred,
195 plotLines = list(
196 list(label = list(text = "Phred Score = 27"),
197 width = 2,
198 dashStyle = "dash",
199 color = "green",
200 value = 27),
201 list(label = list(text = "Phred Score = 20"),
202 width = 2,
203 color = "red",
204 value = 20)
205 )
206 ) %>%
207 hc_exporting(enabled = TRUE)
208 ```
209
210
211 ## Per Base N Content
212
213 ```{r}
214 PBNC_df = data.frame()
215 PBNC_file_paths = read.csv('PBNC_file_paths.txt',
216 header = TRUE, stringsAsFactors = FALSE)
217 for(i in 1:nrow(PBNC_file_paths)) {
218 # file_path = paste0('REPORT_OUTPUT_DIR/', PBNC_file_paths[i,2])
219 file_path = PBNC_file_paths[i,2]
220 pbnc_df = read.csv(file_path,
221 sep='\t', header=TRUE, stringsAsFactors = FALSE) %>%
222 mutate(Base1=as.numeric(str_split_fixed(X.Base, '-', 2)[,1]),
223 Base2=as.numeric(str_split_fixed(X.Base, '-', 2)[,2])) %>%
224 (function (df) {
225 df1 = select(df, -Base2)
226 df2 = select(df, -Base1) %>% filter(Base2 != '')
227 colnames(df1) = c(colnames(df1)[1:2], 'Base')
228 colnames(df2) = c(colnames(df2)[1:2], 'Base')
229 res = rbind(df1, df2) %>% arrange(Base)
230 return(res)
231 })
232 pbnc_df$sample_id = rep(PBNC_file_paths[i,1], nrow(pbnc_df))
233 PBNC_df = rbind(PBNC_df, pbnc_df)
234 }
235 ```
236
237
238 ```{r}
239 PBNC_df$N.Count = PBNC_df$N.Count * 100
240 max_phred = max(PBNC_df$N.Count) + 5
241 hchart(PBNC_df, "line", hcaes(x = as.character(Base), y = N.Count, group = sample_id)) %>%
242 hc_title(
243 text = "Per Base N Content"
244 ) %>%
245 hc_xAxis(
246 title = list(text = "Base Position")
247 ) %>%
248 hc_yAxis(
249 title = list(text = "N %"),
250 plotLines = list(
251 list(label = list(text = "N = 5%"),
252 width = 2,
253 dashStyle = "dash",
254 color = "red",
255 value = 5)
256 )
257 ) %>%
258 hc_exporting(enabled = TRUE)
259 ```
260
261
262
263
264 ## Per Sequence Quality Scores
265
266 ```{r}
267 PSQS_df = data.frame()
268 PSQS_file_paths = read.csv('PSQS_file_paths.txt',
269 header = TRUE, stringsAsFactors = FALSE)
270 for(i in 1:nrow(PSQS_file_paths)) {
271 # file_path = paste0('REPORT_OUTPUT_DIR/', PSQS_file_paths[i,2])
272 file_path = PSQS_file_paths[i,2]
273 psqs_df = read.csv(file_path,
274 sep='\t', header=TRUE, stringsAsFactors = FALSE)
275 psqs_df$sample_id = rep(PSQS_file_paths[i,1], nrow(psqs_df))
276 PSQS_df = rbind(PSQS_df, psqs_df)
277 }
278 ```
279
280
281 ```{r}
282 max_phred = max(PSQS_df$X.Quality) + 5
283 hchart(PSQS_df, "line", hcaes(x = X.Quality, y = Count, group = sample_id)) %>%
284 hc_title(
285 text = "Per Sequence Quality Score"
286 ) %>%
287 hc_xAxis(
288 title = list(text = "Mean Sequence Quality Score"),
289 min = 0,
290 max = max_phred,
291 plotLines = list(
292 list(label = list(text = "Phred Score = 27"),
293 width = 2,
294 dashStyle = "dash",
295 color = "green",
296 value = 27),
297 list(label = list(text = "Phred Score = 20"),
298 width = 2,
299 color = "red",
300 value = 20)
301 )
302 ) %>%
303 hc_exporting(enabled = TRUE)
304 ```
305
306
307 ## Per Sequence GC Content
308
309
310 ```{r}
311 PSGC_df = data.frame()
312 PSGC_file_paths = read.csv('PSGC_file_paths.txt',
313 header = TRUE, stringsAsFactors = FALSE)
314 for(i in 1:nrow(PSGC_file_paths)) {
315 # file_path = paste0('REPORT_OUTPUT_DIR/', PSGC_file_paths[i,2])
316 file_path = PSGC_file_paths[i,2]
317 psgc_df = read.csv(file_path,
318 sep='\t', header=TRUE, stringsAsFactors = FALSE)
319 psgc_df$sample_id = rep(PSGC_file_paths[i,1], nrow(psgc_df))
320 PSGC_df = rbind(PSGC_df, psgc_df)
321 }
322 ```
323
324
325 ```{r}
326 max_phred = max(PSGC_df$Count) + 5
327 hchart(PSGC_df, "line", hcaes(x = X.GC.Content, y = Count, group = sample_id)) %>%
328 hc_title(
329 text = "Per Sequence GC Content"
330 ) %>%
331 hc_xAxis(
332 title = list(text = "% GC")
333 ) %>%
334 hc_exporting(enabled = TRUE)
335 ```
336
337
338 ## Per Base Sequence Content
339
340 ```{r}
341 PBSC_df = data.frame()
342 PBSC_file_paths = read.csv('PBSC_file_paths.txt',
343 header = TRUE, stringsAsFactors = FALSE)
344 for(i in 1:nrow(PBSC_file_paths)) {
345 # file_path = paste0('REPORT_OUTPUT_DIR/', PBSC_file_paths[i,2])
346 file_path = PBSC_file_paths[i,2]
347 pbsc_df = read.csv(file_path,
348 sep='\t', header=TRUE, stringsAsFactors = FALSE) %>%
349 mutate(Base1=as.numeric(str_split_fixed(X.Base, '-', 2)[,1]),
350 Base2=as.numeric(str_split_fixed(X.Base, '-', 2)[,2])) %>%
351 (function (df) {
352 df1 = select(df, -Base2)
353 df2 = select(df, -Base1) %>% filter(Base2 != '')
354 colnames(df1) = c(colnames(df1)[1:5], 'Base')
355 colnames(df2) = c(colnames(df2)[1:5], 'Base')
356 res = rbind(df1, df2) %>% arrange(Base)
357 return(res)
358 })
359 pbsc_df$sample_id = rep(PBSC_file_paths[i,1], nrow(pbsc_df))
360 PBSC_df = rbind(PBSC_df, pbsc_df)
361 }
362 ```
363
364
365 ```{r out.width="100%"}
366 PBSC_df_2 = select(PBSC_df, -X.Base) %>%
367 melt(id = c('Base', 'sample_id'), value.name = 'base_percentage')
368 p = ggplot(data = PBSC_df_2, aes(x = Base, y = base_percentage, group = variable, color = variable)) +
369 geom_line() +
370 facet_wrap(~ sample_id)
371 ggplotly(p)
372 ```
373
374
375 ## References
376
377 * Andrews, Simon. "FastQC: a quality control tool for high throughput sequence data." (2010): 175-176.
378 * Goecks, Jeremy, Anton Nekrutenko, and James Taylor. "Galaxy: a comprehensive approach for supporting accessible, reproducible, and transparent computational research in the life sciences." Genome biology 11.8 (2010): R86.
379 * Afgan, Enis, et al. "The Galaxy platform for accessible, reproducible and collaborative biomedical analyses: 2016 update." Nucleic acids research (2016): gkw343.
380 * Highcharts. https://www.highcharts.com/. (accessed May 26, 2017).
381 * R Core Team (2017). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/.
382 * Joshua Kunst (2017). highcharter: A Wrapper for the 'Highcharts' Library. R package version 0.5.0. https://CRAN.R-project.org/package=highcharter
383 * Carson Sievert, Chris Parmer, Toby Hocking, Scott Chamberlain, Karthik Ram, Marianne Corvellec and Pedro Despouy (2017). plotly: Create Interactive Web Graphics via 'plotly.js'. R package version 4.6.0. https://CRAN.R-project.org/package=plotly