comparison 01_evaluation_overview.Rmd @ 11:507eec497730 draft

update fastqc site
author mingchen0919
date Tue, 07 Nov 2017 16:52:24 -0500
parents d732d4526c6d
children
comparison
equal deleted inserted replaced
10:600c39b11913 11:507eec497730
1 --- 1 ---
2 title: "Evaluation Overview" 2 title: 'Short reads evaluation with [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)'
3 output: html_document 3 output:
4 html_document:
5 number_sections: true
6 toc: true
7 theme: cosmo
8 highlight: tango
4 --- 9 ---
5 10
6 ```{r setup, include=FALSE, warning=FALSE, message=FALSE} 11 ```{r setup, include=FALSE, warning=FALSE, message=FALSE}
7 knitr::opts_chunk$set(echo = ECHO) 12 knitr::opts_chunk$set(
8 ``` 13 echo = ECHO,
9 14 error = TRUE
10 ```{bash 'copy data from datasets directory to working directory', echo=FALSE} 15 )
11 # Copy uploaded data to the working directory
12 for f in $(echo READS | sed "s/,/ /g")
13 do
14 cp $f ./
15 done
16 ```
17
18 ```{bash 'run fastqc', echo=FALSE}
19 # run fastqc and place outputs into the report directory
20 for r in $(ls *.dat)
21 do
22 fastqc -o REPORT_OUTPUT_DIR $r > /dev/null 2>&1
23 done
24 ```
25
26 ```{bash 'parse fastqc results', echo=FALSE}
27 ##==== copy fastqc generated zip files from report output directory to job work directory ==
28 cp -r REPORT_OUTPUT_DIR/*zip ./
29
30 # create a file to store data file paths
31 echo "sample_id,file_path" > PWF_file_paths.txt # Pass, Warning, Fail
32 echo "sample_id,file_path" > PBQS_file_paths.txt # Per Base Quality Score
33 echo "sample_id,file_path" > PSQS_file_paths.txt # Per Sequence Quality Score
34 echo "sample_id,file_path" > PSGC_file_paths.txt # Per Sequence GC Content
35 echo "sample_id,file_path" > PBSC_file_paths.txt # Per Base Sequence Content
36 echo "sample_id,file_path" > PBNC_file_paths.txt # Per Base N Content
37 echo "sample_id,file_path" > SDL_file_paths.txt # Sequence Duplication Level
38 echo "sample_id,file_path" > SLD_file_paths.txt # Sequence Length Distribution
39 echo "sample_id,file_path" > KMC_file_paths.txt # Kmer Content
40
41 for i in $(ls *.zip)
42 do
43 BASE=$(echo $i | sed 's/\(.*\)\.zip/\1/g')
44 echo $BASE
45 unzip ${BASE}.zip > /dev/null 2>&1
46
47 ##====== pass,warning,fail (PWF) =============
48 awk '/^>>/ {print}' "$BASE"/fastqc_data.txt | grep -v 'END_MODULE' | sed 's/>>//' > "$BASE"-PWF.txt
49 echo "${BASE},${BASE}-PWF.txt" >> PWF_file_paths.txt
50
51 ##====== per base quality scores (PBQS) ======
52 awk '/^>>Per base sequence quality/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBQS.txt
53 echo "${BASE},${BASE}-PBQS.txt" >> PBQS_file_paths.txt
54
55 ##====== per sequence quality scores (PSQS)
56 awk '/^>>Per sequence quality scores/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PSQS.txt
57 echo "${BASE},${BASE}-PSQS.txt" >> PSQS_file_paths.txt
58
59 ##====== Per sequence GC content (PSGC)
60 awk '/^>>Per sequence GC content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PSGC.txt
61 echo "${BASE},${BASE}-PSGC.txt" >> PSGC_file_paths.txt
62
63 ##====== Per Base Sequence Content (PBSC)
64 awk '/^>>Per base sequence content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBSC.txt
65 echo "${BASE},${BASE}-PBSC.txt" >> PBSC_file_paths.txt
66
67 ##====== Per Base N Content (PBNC)
68 awk '/^>>Per base N content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBNC.txt
69 echo "${BASE},${BASE}-PBNC.txt" >> PBNC_file_paths.txt
70
71 ##====== Sequence Duplication Level (SDL)
72 awk '/^>>Sequence Duplication Levels/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-SDL.txt
73 echo "${BASE},${BASE}-SDL.txt" >> SDL_file_paths.txt
74
75 ##====== Sequence Length Distribution (SLD)
76 awk '/^>>Sequence Length Distribution/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-SLD.txt
77 echo "${BASE},${BASE}-SLD.txt" >> SLD_file_paths.txt
78
79 ##====== Kmer Content ============
80 awk '/^>>Kmer Content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-KMC.txt
81 echo "${BASE},${BASE}-KMC.txt" >> KMC_file_paths.txt
82
83 done
84 ``` 16 ```
85 17
86 18
87 ## Evaluation Overview 19 # FastQC Evaluation
88 20
89 ```{r 'overview'} 21 ## Evaluation of reads before trimming
90 PWF_file_paths = read.csv('PWF_file_paths.txt', 22
91 header = TRUE, stringsAsFactors = FALSE) 23 ```{r}
92 rm('PWF_df') 24 if ('READS_1' == 'None') {
93 for(i in 1:nrow(PWF_file_paths)) { 25 stop("No pre-trimming reads provided!")
94 file_path = PWF_file_paths[i,2] 26 } else {
95 pwf_df = read.csv(file_path, 27 ## build the fastqc command (optional -c / -l flags, output to REPORT_DIR) and run it on READS_1
96 sep='\t', header=FALSE, stringsAsFactors = FALSE) 28 fastqc_command = paste0('fastqc ') %>%
97 colnames(pwf_df) = c('item', PWF_file_paths[i,1]) 29 (function(x) {
98 if (!exists('PWF_df')) { 30 ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x)
99 PWF_df = pwf_df 31 }) %>%
100 } else { 32 (function(x) {
101 PWF_df = cbind(PWF_df, pwf_df[,2,drop=FALSE]) 33 ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x)
102 } 34 }) %>%
35 (function(x) {
36 paste0(x, '-o REPORT_DIR ')
37 })
38 fastqc_command_reads_1 = paste0(fastqc_command, 'READS_1 > /dev/null 2>&1')
39 system(fastqc_command_reads_1, intern = TRUE)
40
41 # Link to the original FastQC HTML report, labelled with opt$name_1
42 reads_1_base = tail(strsplit('READS_1', '/')[[1]], 1)
43 original_html = tags$a(href=paste0(reads_1_base, '_fastqc.html'), paste0('HTML report: ', opt$name_1))
44
45 unzip(paste0('REPORT_DIR/', reads_1_base, '_fastqc.zip'), exdir = 'REPORT_DIR')
46 reads_1_unzip = paste0('REPORT_DIR/', reads_1_base, '_fastqc/')
47 # Copy fastqc_data.txt out of the unzipped folder into REPORT_DIR and link to the copy
48 file.copy(paste0(reads_1_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_1_fastqc_data.txt')
49 fastqc_data = tags$a(href='reads_1_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_1))
50 # Copy summary.txt out of the unzipped folder into REPORT_DIR and link to the copy
51 file.copy(paste0(reads_1_unzip, 'summary.txt'), 'REPORT_DIR/reads_1_summary.txt')
52 summary_data = tags$a(href='reads_1_summary.txt', paste0('summary.txt: ', opt$name_1))
53
54 tags$ul(
55 tags$li(original_html),
56 tags$li(fastqc_data),
57 tags$li(summary_data)
58 )
103 } 59 }
104 ``` 60 ```
105 61
106 62
63 ## Evaluation of reads after trimming
64
107 ```{r} 65 ```{r}
108 my_icon = c('ok', 'remove', 'star') 66 if ('READS_2' == 'None') {
109 names(my_icon) = c('pass', 'fail', 'warn') 67 stop("No post-trimming reads provided!")
110 evaluate_list = list() 68 } else {
111 for (i in colnames(PWF_df)[-1]) { 69 ## build the fastqc command (optional -c / -l flags, output to REPORT_DIR) and run it on READS_2
112 evaluate_list[[i]] = formatter( 70 fastqc_command = paste0('fastqc ') %>%
113 "span", 71 (function(x) {
114 style = x ~ style("background-color" = ifelse(x =='pass', '#9CD027', ifelse(x == 'fail', '#CC0000', '#FF4E00')), 72 ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x)
115 "color" = "white", 73 }) %>%
116 "width" = "50px", 74 (function(x) {
117 "float" = "left", 75 ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x)
118 "padding-right" = "5px") 76 }) %>%
119 ) 77 (function(x) {
78 paste0(x, '-o REPORT_DIR ')
79 })
80 fastqc_command_reads_2 = paste0(fastqc_command, 'READS_2 > /dev/null 2>&1')
81 system(fastqc_command_reads_2, intern = TRUE)
82
83 # Link to the original FastQC HTML report, labelled with opt$name_2
84 reads_2_base = tail(strsplit('READS_2', '/')[[1]], 1)
85 original_html = tags$a(href=paste0(reads_2_base, '_fastqc.html'), paste0('HTML report: ', opt$name_2))
86
87 unzip(paste0('REPORT_DIR/', reads_2_base, '_fastqc.zip'), exdir = 'REPORT_DIR')
88 reads_2_unzip = paste0('REPORT_DIR/', reads_2_base, '_fastqc/')
89 # Copy fastqc_data.txt out of the unzipped folder into REPORT_DIR and link to the copy
90 file.copy(paste0(reads_2_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_2_fastqc_data.txt')
91 fastqc_data = tags$a(href='reads_2_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_2))
92 # Copy summary.txt out of the unzipped folder into REPORT_DIR and link to the copy
93 file.copy(paste0(reads_2_unzip, 'summary.txt'), 'REPORT_DIR/reads_2_summary.txt')
94 summary_data = tags$a(href='reads_2_summary.txt', paste0('summary.txt: ', opt$name_2))
95
96 tags$ul(
97 tags$li(original_html),
98 tags$li(fastqc_data),
99 tags$li(summary_data)
100 )
120 } 101 }
102 ```
121 103
122 formattable(PWF_df, evaluate_list) 104
105
106 # FastQC output visualization
107
108 ## Overview
109
110 ```{r}
111 reads_1_summary = read.csv('REPORT_DIR/reads_1_summary.txt', header = FALSE, sep = '\t')[, 2:1]
112 reads_2_summary = read.csv('REPORT_DIR/reads_2_summary.txt', header = FALSE, sep = '\t')[, 1]
113 combined_summary = cbind(reads_1_summary, reads_2_summary)
114 names(combined_summary) = c('MODULE', paste0(opt$name_1, '(before)'), paste0(opt$name_2, '(after)'))
115 combined_summary[combined_summary == 'FAIL'] = 'FAIL (X)'
116 combined_summary[combined_summary == 'WARN'] = 'WARN (!)'
117 knitr::kable(combined_summary)
123 ``` 118 ```
119
120 # Session Info
121
122 ```{r 'session info'}
123 sessionInfo()
124 ```