Mercurial > repos > mingchen0919 > rmarkdown_fastqc_report
comparison fastqc_report.Rmd @ 14:2efa46ce2c4c draft
upgrade fastqc_report
author | mingchen0919 |
---|---|
date | Wed, 18 Oct 2017 22:06:39 -0400 |
parents | e629c2288316 |
children | d1d20f341632 |
comparison
equal
deleted
inserted
replaced
13:9d3586701985 | 14:2efa46ce2c4c |
---|---|
1 --- | 1 --- |
2 title: "Fastqc report: short reads quality evaluation" | 2 title: 'HTML report title' |
3 author: "Ming Chen" | 3 output: |
4 output: html_document | 4 html_document: |
5 number_sections: true | |
6 toc: true | |
7 theme: cosmo | |
8 highlight: tango | |
5 --- | 9 --- |
6 | 10 |
7 ```{r setup, include=FALSE} | 11 ```{r setup, include=FALSE, warning=FALSE, message=FALSE} |
8 knitr::opts_chunk$set(echo=ECHO, warning=FALSE, message=FALSE) | 12 knitr::opts_chunk$set( |
9 library(plyr) | 13 echo = ECHO |
10 library(stringr) | 14 ) |
11 library(dplyr) | |
12 library(highcharter) | |
13 library(DT) | |
14 library(reshape2) | |
15 library(plotly) | |
16 library(formattable) | |
17 library(htmltools) | |
18 ``` | 15 ``` |
19 | 16 |
20 | 17 |
21 ```{bash 'create output directory', echo=FALSE} | 18 # Fastqc Analysis |
22 # create extra files directory. very important! | |
23 mkdir REPORT_OUTPUT_DIR | |
24 ``` | |
25 | 19 |
26 # Fastqc analysis | 20 * Copy fastq files to job working directory |
27 ```{bash 'copy data to working directory', echo=FALSE} | 21 |
28 # Copy uploaded data to the working directory | 22 ```{bash 'copy files'} |
29 for f in $(echo READS | sed "s/,/ /g") | 23 for f in $(echo READS | sed "s/,/ /g") |
30 do | 24 do |
31 cp $f ./ | 25 cp $f ./ |
32 done | 26 done |
33 ``` | 27 ``` |
34 | 28 |
29 * Run fastqc | |
35 | 30 |
36 ```{bash 'run fastqc', echo=FALSE} | 31 ```{bash 'run fastqc'} |
37 for r in $(ls *.dat) | 32 for r in $(ls *.dat) |
38 do | 33 do |
39 fastqc -o REPORT_OUTPUT_DIR $r > /dev/null 2>&1 | 34 fastqc -o REPORT_DIR $r > /dev/null 2>&1 |
40 done | 35 done |
41 ``` | 36 ``` |
42 | 37 |
43 ## Fastqc html reports | 38 * Create links to original HTML reports |
44 | 39 |
45 Below are links to the original ***Fastqc*** HTML reports. | |
46 ```{r 'html report links'} | 40 ```{r 'html report links'} |
47 html_report_list = list() | 41 html_report_list = list() |
48 html_files = list.files('REPORT_OUTPUT_DIR', pattern = '.*html') | 42 html_files = list.files('REPORT_DIR', pattern = '.*html') |
49 for (i in html_files) { | 43 for (i in html_files) { |
50 html_report_list[[i]] = tags$li(tags$a(href=i, i)) | 44 html_report_list[[i]] = tags$li(tags$a(href=i, i)) |
51 } | 45 } |
52 tags$ul(html_report_list) | 46 tags$ul(html_report_list) |
53 ``` | 47 ``` |
54 | 48 |
49 # Fastqc output summary | |
55 | 50 |
56 ## Parsing fastqc data | 51 * Define a function to extract outputs for each module from fastqc output |
57 | 52 |
58 ```{bash echo=FALSE} | 53 ```{r 'function definition'} |
59 ##==== copy fastqc generated zip files from report output directory to job work directory == | 54 extract_data_module = function(fastqc_data, module_name) { |
60 cp -r REPORT_OUTPUT_DIR/*zip ./ | 55 f = readLines(fastqc_data) |
61 | 56 start_line = grep(module_name, f) |
62 # create a file to store data file paths | 57 end_module_lines = grep('END_MODULE', f) |
63 echo "sample_id,file_path" > PWF_file_paths.txt # Pass, Warning, Fail | 58 end_line = end_module_lines[which(end_module_lines > start_line)[1]] |
64 echo "sample_id,file_path" > PBQS_file_paths.txt # Per Base Quality Score | 59 module_data = f[(start_line+1):(end_line-1)] |
65 echo "sample_id,file_path" > PSQS_file_paths.txt # Per Sequence Quality Score | 60 writeLines(module_data, 'temp.txt') |
66 echo "sample_id,file_path" > PSGC_file_paths.txt # Per Sequence GC Content | 61 read.csv('temp.txt', sep = '\t') |
67 echo "sample_id,file_path" > PBSC_file_paths.txt # Per Base Sequence Content | |
68 echo "sample_id,file_path" > PBNC_file_paths.txt # Per Base N Content | |
69 echo "sample_id,file_path" > SDL_file_paths.txt # Sequence Duplication Level | |
70 echo "sample_id,file_path" > SLD_file_paths.txt # Sequence Length Distribution | |
71 echo "sample_id,file_path" > KMC_file_paths.txt # Kmer Content | |
72 | |
73 for i in $(ls *.zip) | |
74 do | |
75 BASE=$(echo $i | sed 's/\(.*\)\.zip/\1/g') | |
76 echo $BASE | |
77 unzip ${BASE}.zip > /dev/null 2>&1 | |
78 | |
79 ##====== pass,warning,fail (PWF) ============= | |
80 awk '/^>>/ {print}' "$BASE"/fastqc_data.txt | grep -v 'END_MODULE' | sed 's/>>//' > "$BASE"-PWF.txt | |
81 echo "${BASE},${BASE}-PWF.txt" >> PWF_file_paths.txt | |
82 | |
83 ##====== per base quality scores (PBQS) ====== | |
84 awk '/^>>Per base sequence quality/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBQS.txt | |
85 echo "${BASE},${BASE}-PBQS.txt" >> PBQS_file_paths.txt | |
86 | |
87 ##====== per sequence quality scores (PSQS) | |
88 awk '/^>>Per sequence quality scores/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PSQS.txt | |
89 echo "${BASE},${BASE}-PSQS.txt" >> PSQS_file_paths.txt | |
90 | |
91 ##====== Per sequence GC content (PSGC) | |
92 awk '/^>>Per sequence GC content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PSGC.txt | |
93 echo "${BASE},${BASE}-PSGC.txt" >> PSGC_file_paths.txt | |
94 | |
95 ##====== Per Base Sequence Content (PBSC) | |
96 awk '/^>>Per base sequence content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBSC.txt | |
97 echo "${BASE},${BASE}-PBSC.txt" >> PBSC_file_paths.txt | |
98 | |
99 ##====== Per Base N Content (PBNC) | |
100 awk '/^>>Per base N content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBNC.txt | |
101 echo "${BASE},${BASE}-PBNC.txt" >> PBNC_file_paths.txt | |
102 | |
103 ##====== Sequence Duplication Level (SDL) | |
104 awk '/^>>Sequence Duplication Levels/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-SDL.txt | |
105 echo "${BASE},${BASE}-SDL.txt" >> SDL_file_paths.txt | |
106 | |
107 ##====== Sequence Length Distribution (SLD) | |
108 awk '/^>>Sequence Length Distribution/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-SLD.txt | |
109 echo "${BASE},${BASE}-SLD.txt" >> SLD_file_paths.txt | |
110 | |
111 ##====== Kmer Content ============ | |
112 awk '/^>>Kmer Content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-KMC.txt | |
113 echo "${BASE},${BASE}-KMC.txt" >> KMC_file_paths.txt | |
114 | |
115 done | |
116 ``` | |
117 | |
118 | |
119 ## Evaluation Overview | |
120 | |
121 ```{r 'overview'} | |
122 PWF_file_paths = read.csv('PWF_file_paths.txt', | |
123 header = TRUE, stringsAsFactors = FALSE) | |
124 rm('PWF_df') | |
125 for(i in 1:nrow(PWF_file_paths)) { | |
126 file_path = PWF_file_paths[i,2] | |
127 pwf_df = read.csv(file_path, | |
128 sep='\t', header=FALSE, stringsAsFactors = FALSE) | |
129 colnames(pwf_df) = c('item', PWF_file_paths[i,1]) | |
130 if (!exists('PWF_df')) { | |
131 PWF_df = pwf_df | |
132 } else { | |
133 PWF_df = cbind(PWF_df, pwf_df[,2,drop=FALSE]) | |
134 } | |
135 } | 62 } |
136 ``` | 63 ``` |
137 | 64 |
138 ```{r} | 65 ## |
139 my_icon = c('ok', 'remove', 'star') | |
140 names(my_icon) = c('pass', 'fail', 'warn') | |
141 evaluate_list = list() | |
142 for (i in colnames(PWF_df)[-1]) { | |
143 evaluate_list[[i]] = formatter( | |
144 "span", | |
145 style = x ~ style("background-color" = ifelse(x =='pass', '#9CD027', ifelse(x == 'fail', '#CC0000', '#FF4E00')), | |
146 "color" = "white", | |
147 "width" = "50px", | |
148 "float" = "left", | |
149 "padding-right" = "5px") | |
150 ) | |
151 } | |
152 | 66 |
153 formattable(PWF_df, evaluate_list) | 67 # Session Info |
68 | |
69 ```{r 'session info'} | |
70 sessionInfo() | |
154 ``` | 71 ``` |
155 | 72 |
156 | |
157 ## Per Base Quality Scores | |
158 | |
159 ```{r} | |
160 PBQS_df = data.frame() | |
161 PBQS_file_paths = read.csv('PBQS_file_paths.txt', | |
162 header = TRUE, stringsAsFactors = FALSE) | |
163 for(i in 1:nrow(PBQS_file_paths)) { | |
164 # file_path = paste0('REPORT_OUTPUT_DIR/', PBQS_file_paths[i,2]) | |
165 file_path = PBQS_file_paths[i,2] | |
166 pbqs_df = read.csv(file_path, | |
167 sep='\t', header=TRUE, stringsAsFactors = FALSE) %>% | |
168 mutate(Base1=as.numeric(str_split_fixed(X.Base, '-', 2)[,1]), | |
169 Base2=as.numeric(str_split_fixed(X.Base, '-', 2)[,2])) %>% | |
170 (function (df) { | |
171 df1 = select(df, -Base2) | |
172 df2 = select(df, -Base1) %>% filter(Base2 != '') | |
173 colnames(df1) = c(colnames(df1)[1:7], 'Base') | |
174 colnames(df2) = c(colnames(df2)[1:7], 'Base') | |
175 res = rbind(df1, df2) %>% arrange(Base) | |
176 return(res) | |
177 }) | |
178 pbqs_df$sample_id = rep(PBQS_file_paths[i,1], nrow(pbqs_df)) | |
179 PBQS_df = rbind(PBQS_df, pbqs_df) | |
180 } | |
181 ``` | |
182 | |
183 | |
184 ```{r} | |
185 # datatable(PBQS_df) | |
186 max_phred = max(PBQS_df$Mean) + 10 | |
187 hchart(PBQS_df, "line", hcaes(x = Base, y = Mean, group = sample_id)) %>% | |
188 hc_title( | |
189 text = "Per Base Quality Score" | |
190 ) %>% | |
191 hc_yAxis( | |
192 title = list(text = "Mean Base Quality Score"), | |
193 min = 0, | |
194 max = max_phred, | |
195 plotLines = list( | |
196 list(label = list(text = "Phred Score = 27"), | |
197 width = 2, | |
198 dashStyle = "dash", | |
199 color = "green", | |
200 value = 27), | |
201 list(label = list(text = "Phred Score = 20"), | |
202 width = 2, | |
203 color = "red", | |
204 value = 20) | |
205 ) | |
206 ) %>% | |
207 hc_exporting(enabled = TRUE) | |
208 ``` | |
209 | |
210 | |
211 ## Per Base N Content | |
212 | |
213 ```{r} | |
214 PBNC_df = data.frame() | |
215 PBNC_file_paths = read.csv('PBNC_file_paths.txt', | |
216 header = TRUE, stringsAsFactors = FALSE) | |
217 for(i in 1:nrow(PBNC_file_paths)) { | |
218 # file_path = paste0('REPORT_OUTPUT_DIR/', PBNC_file_paths[i,2]) | |
219 file_path = PBNC_file_paths[i,2] | |
220 pbnc_df = read.csv(file_path, | |
221 sep='\t', header=TRUE, stringsAsFactors = FALSE) %>% | |
222 mutate(Base1=as.numeric(str_split_fixed(X.Base, '-', 2)[,1]), | |
223 Base2=as.numeric(str_split_fixed(X.Base, '-', 2)[,2])) %>% | |
224 (function (df) { | |
225 df1 = select(df, -Base2) | |
226 df2 = select(df, -Base1) %>% filter(Base2 != '') | |
227 colnames(df1) = c(colnames(df1)[1:2], 'Base') | |
228 colnames(df2) = c(colnames(df2)[1:2], 'Base') | |
229 res = rbind(df1, df2) %>% arrange(Base) | |
230 return(res) | |
231 }) | |
232 pbnc_df$sample_id = rep(PBNC_file_paths[i,1], nrow(pbnc_df)) | |
233 PBNC_df = rbind(PBNC_df, pbnc_df) | |
234 } | |
235 ``` | |
236 | |
237 | |
238 ```{r} | |
239 PBNC_df$N.Count = PBNC_df$N.Count * 100 | |
240 max_phred = max(PBNC_df$N.Count) + 5 | |
241 hchart(PBNC_df, "line", hcaes(x = as.character(Base), y = N.Count, group = sample_id)) %>% | |
242 hc_title( | |
243 text = "Per Base N Content" | |
244 ) %>% | |
245 hc_xAxis( | |
246 title = list(text = "Base Position") | |
247 ) %>% | |
248 hc_yAxis( | |
249 title = list(text = "N %"), | |
250 plotLines = list( | |
251 list(label = list(text = "N = 5%"), | |
252 width = 2, | |
253 dashStyle = "dash", | |
254 color = "red", | |
255 value = 5) | |
256 ) | |
257 ) %>% | |
258 hc_exporting(enabled = TRUE) | |
259 ``` | |
260 | |
261 | |
262 | |
263 | |
264 ## Per Sequence Quality Scores | |
265 | |
266 ```{r} | |
267 PSQS_df = data.frame() | |
268 PSQS_file_paths = read.csv('PSQS_file_paths.txt', | |
269 header = TRUE, stringsAsFactors = FALSE) | |
270 for(i in 1:nrow(PSQS_file_paths)) { | |
271 # file_path = paste0('REPORT_OUTPUT_DIR/', PSQS_file_paths[i,2]) | |
272 file_path = PSQS_file_paths[i,2] | |
273 psqs_df = read.csv(file_path, | |
274 sep='\t', header=TRUE, stringsAsFactors = FALSE) | |
275 psqs_df$sample_id = rep(PSQS_file_paths[i,1], nrow(psqs_df)) | |
276 PSQS_df = rbind(PSQS_df, psqs_df) | |
277 } | |
278 ``` | |
279 | |
280 | |
281 ```{r} | |
282 max_phred = max(PSQS_df$X.Quality) + 5 | |
283 hchart(PSQS_df, "line", hcaes(x = X.Quality, y = Count, group = sample_id)) %>% | |
284 hc_title( | |
285 text = "Per Sequence Quality Score" | |
286 ) %>% | |
287 hc_xAxis( | |
288 title = list(text = "Mean Sequence Quality Score"), | |
289 min = 0, | |
290 max = max_phred, | |
291 plotLines = list( | |
292 list(label = list(text = "Phred Score = 27"), | |
293 width = 2, | |
294 dashStyle = "dash", | |
295 color = "green", | |
296 value = 27), | |
297 list(label = list(text = "Phred Score = 20"), | |
298 width = 2, | |
299 color = "red", | |
300 value = 20) | |
301 ) | |
302 ) %>% | |
303 hc_exporting(enabled = TRUE) | |
304 ``` | |
305 | |
306 | |
307 ## Per Sequence GC Content | |
308 | |
309 | |
310 ```{r} | |
311 PSGC_df = data.frame() | |
312 PSGC_file_paths = read.csv('PSGC_file_paths.txt', | |
313 header = TRUE, stringsAsFactors = FALSE) | |
314 for(i in 1:nrow(PSGC_file_paths)) { | |
315 # file_path = paste0('REPORT_OUTPUT_DIR/', PSGC_file_paths[i,2]) | |
316 file_path = PSGC_file_paths[i,2] | |
317 psgc_df = read.csv(file_path, | |
318 sep='\t', header=TRUE, stringsAsFactors = FALSE) | |
319 psgc_df$sample_id = rep(PSGC_file_paths[i,1], nrow(psgc_df)) | |
320 PSGC_df = rbind(PSGC_df, psgc_df) | |
321 } | |
322 ``` | |
323 | |
324 | |
325 ```{r} | |
326 max_phred = max(PSGC_df$Count) + 5 | |
327 hchart(PSGC_df, "line", hcaes(x = X.GC.Content, y = Count, group = sample_id)) %>% | |
328 hc_title( | |
329 text = "Per Sequence GC Content" | |
330 ) %>% | |
331 hc_xAxis( | |
332 title = list(text = "% GC") | |
333 ) %>% | |
334 hc_exporting(enabled = TRUE) | |
335 ``` | |
336 | |
337 | |
338 ## Per Base Sequence Content | |
339 | |
340 ```{r} | |
341 PBSC_df = data.frame() | |
342 PBSC_file_paths = read.csv('PBSC_file_paths.txt', | |
343 header = TRUE, stringsAsFactors = FALSE) | |
344 for(i in 1:nrow(PBSC_file_paths)) { | |
345 # file_path = paste0('REPORT_OUTPUT_DIR/', PBSC_file_paths[i,2]) | |
346 file_path = PBSC_file_paths[i,2] | |
347 pbsc_df = read.csv(file_path, | |
348 sep='\t', header=TRUE, stringsAsFactors = FALSE) %>% | |
349 mutate(Base1=as.numeric(str_split_fixed(X.Base, '-', 2)[,1]), | |
350 Base2=as.numeric(str_split_fixed(X.Base, '-', 2)[,2])) %>% | |
351 (function (df) { | |
352 df1 = select(df, -Base2) | |
353 df2 = select(df, -Base1) %>% filter(Base2 != '') | |
354 colnames(df1) = c(colnames(df1)[1:5], 'Base') | |
355 colnames(df2) = c(colnames(df2)[1:5], 'Base') | |
356 res = rbind(df1, df2) %>% arrange(Base) | |
357 return(res) | |
358 }) | |
359 pbsc_df$sample_id = rep(PBSC_file_paths[i,1], nrow(pbsc_df)) | |
360 PBSC_df = rbind(PBSC_df, pbsc_df) | |
361 } | |
362 ``` | |
363 | |
364 | |
365 ```{r out.width="100%"} | |
366 PBSC_df_2 = select(PBSC_df, -X.Base) %>% | |
367 melt(id = c('Base', 'sample_id'), value.name = 'base_percentage') | |
368 p = ggplot(data = PBSC_df_2, aes(x = Base, y = base_percentage, group = variable, color = variable)) + | |
369 geom_line() + | |
370 facet_wrap(~ sample_id) | |
371 ggplotly(p) | |
372 ``` | |
373 | |
374 | |
375 ## References | |
376 | |
377 * Andrews, Simon. "FastQC: a quality control tool for high throughput sequence data." (2010): 175-176. | |
378 * Goecks, Jeremy, Anton Nekrutenko, and James Taylor. "Galaxy: a comprehensive approach for supporting accessible, reproducible, and transparent computational research in the life sciences." Genome biology 11.8 (2010): R86. | |
379 * Afgan, Enis, et al. "The Galaxy platform for accessible, reproducible and collaborative biomedical analyses: 2016 update." Nucleic acids research (2016): gkw343. | |
380 * Highcharts. https://www.highcharts.com/. (accessed May 26, 2017). | |
381 * R Core Team (2017). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/. | |
382 * Joshua Kunst (2017). highcharter: A Wrapper for the 'Highcharts' Library. R package version 0.5.0. https://CRAN.R-project.org/package=highcharter | |
383 * Carson Sievert, Chris Parmer, Toby Hocking, Scott Chamberlain, Karthik Ram, Marianne Corvellec and Pedro Despouy (2017). plotly: Create Interactive Web Graphics via 'plotly.js'. R package version 4.6.0. https://CRAN.R-project.org/package=plotly |