Mercurial > repos > mingchen0919 > rmarkdown_fastqc_site
comparison 01_evaluation_overview.Rmd @ 11:507eec497730 draft
update fastqc site
author | mingchen0919 |
---|---|
date | Tue, 07 Nov 2017 16:52:24 -0500 |
parents | d732d4526c6d |
children |
comparison
equal
deleted
inserted
replaced
10:600c39b11913 | 11:507eec497730 |
---|---|
1 --- | 1 --- |
2 title: "Evaluation Overview" | 2 title: 'Short reads evaluation with [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)' |
3 output: html_document | 3 output: |
4 html_document: | |
5 number_sections: true | |
6 toc: true | |
7 theme: cosmo | |
8 highlight: tango | |
4 --- | 9 --- |
5 | 10 |
6 ```{r setup, include=FALSE, warning=FALSE, message=FALSE} | 11 ```{r setup, include=FALSE, warning=FALSE, message=FALSE} |
7 knitr::opts_chunk$set(echo = ECHO) | 12 knitr::opts_chunk$set( |
8 ``` | 13 echo = ECHO, |
9 | 14 error = TRUE |
10 ```{bash 'copy data from datasets directory to working directory', echo=FALSE} | 15 ) |
11 # Copy uploaded data to the working directory | |
12 for f in $(echo READS | sed "s/,/ /g") | |
13 do | |
14 cp $f ./ | |
15 done | |
16 ``` | |
17 | |
18 ```{bash 'run fastqc', echo=FALSE} | |
19 # run fastqc and place outputs into the report directory | |
20 for r in $(ls *.dat) | |
21 do | |
22 fastqc -o REPORT_OUTPUT_DIR $r > /dev/null 2>&1 | |
23 done | |
24 ``` | |
25 | |
26 ```{bash 'parse fastqc results', echo=FALSE} | |
27 ##==== copy fastqc generated zip files from report output directory to job work directory == | |
28 cp -r REPORT_OUTPUT_DIR/*zip ./ | |
29 | |
30 # create a file to store data file paths | |
31 echo "sample_id,file_path" > PWF_file_paths.txt # Pass, Warning, Fail | |
32 echo "sample_id,file_path" > PBQS_file_paths.txt # Per Base Quality Score | |
33 echo "sample_id,file_path" > PSQS_file_paths.txt # Per Sequence Quality Score | |
34 echo "sample_id,file_path" > PSGC_file_paths.txt # Per Sequence GC Content | |
35 echo "sample_id,file_path" > PBSC_file_paths.txt # Per Base Sequence Content | |
36 echo "sample_id,file_path" > PBNC_file_paths.txt # Per Base N Content | |
37 echo "sample_id,file_path" > SDL_file_paths.txt # Sequence Duplication Level | |
38 echo "sample_id,file_path" > SLD_file_paths.txt # Sequence Length Distribution | |
39 echo "sample_id,file_path" > KMC_file_paths.txt # Kmer Content | |
40 | |
41 for i in $(ls *.zip) | |
42 do | |
43 BASE=$(echo $i | sed 's/\(.*\)\.zip/\1/g') | |
44 echo $BASE | |
45 unzip ${BASE}.zip > /dev/null 2>&1 | |
46 | |
47 ##====== pass,warning,fail (WSF) ============= | |
48 awk '/^>>/ {print}' "$BASE"/fastqc_data.txt | grep -v 'END_MODULE' | sed 's/>>//' > "$BASE"-PWF.txt | |
49 echo "${BASE},${BASE}-PWF.txt" >> PWF_file_paths.txt | |
50 | |
51 ##====== per base quality scores (PBQS) ====== | |
52 awk '/^>>Per base sequence quality/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBQS.txt | |
53 echo "${BASE},${BASE}-PBQS.txt" >> PBQS_file_paths.txt | |
54 | |
55 ##====== per sequence quality scores (PSQS) | |
56 awk '/^>>Per sequence quality scores/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PSQS.txt | |
57 echo "${BASE},${BASE}-PSQS.txt" >> PSQS_file_paths.txt | |
58 | |
59 ##====== Per sequence GC content (PSGC) | |
60 awk '/^>>Per sequence GC content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PSGC.txt | |
61 echo "${BASE},${BASE}-PSGC.txt" >> PSGC_file_paths.txt | |
62 | |
63 ##====== Per Base Sequence Content (PBSC) | |
64 awk '/^>>Per base sequence content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBSC.txt | |
65 echo "${BASE},${BASE}-PBSC.txt" >> PBSC_file_paths.txt | |
66 | |
67 ##====== Per Base N Content (PBNC) | |
68 awk '/^>>Per base N content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-PBNC.txt | |
69 echo "${BASE},${BASE}-PBNC.txt" >> PBNC_file_paths.txt | |
70 | |
71 ##====== Sequence Duplication Level (SDL) | |
72 awk '/^>>Sequence Duplication Levels/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-SDL.txt | |
73 echo "${BASE},${BASE}-SDL.txt" >> SDL_file_paths.txt | |
74 | |
75 ##====== Sequence Length Distribution (SLD) | |
76 awk '/^>>Sequence Length Distribution/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-SLD.txt | |
77 echo "${BASE},${BASE}-SLD.txt" >> SLD_file_paths.txt | |
78 | |
79 ##====== Kmer Content ============ | |
80 awk '/^>>Kmer Content/ {flag=1; next} /END_MODULE/ {flag=0} flag' "$BASE"/fastqc_data.txt >"$BASE"-KMC.txt | |
81 echo "${BASE},${BASE}-KMC.txt" >> KMC_file_paths.txt | |
82 | |
83 done | |
84 ``` | 16 ``` |
85 | 17 |
86 | 18 |
87 ## Evaluation Overview | 19 # Fastqc Evaluation |
88 | 20 |
89 ```{r 'overview'} | 21 ## Evaluation of reads before trimming |
90 PWF_file_paths = read.csv('PWF_file_paths.txt', | 22 |
91 header = TRUE, stringsAsFactors = FALSE) | 23 ```{r} |
92 rm('PWF_df') | 24 if ('READS_1' == 'None') { |
93 for(i in 1:nrow(PWF_file_paths)) { | 25 stop("No pre-trimming reads provided!") |
94 file_path = PWF_file_paths[i,2] | 26 } else { |
95 pwf_df = read.csv(file_path, | 27 ## run fastqc evaluation |
96 sep='\t', header=FALSE, stringsAsFactors = FALSE) | 28 fastqc_command = paste0('fastqc ') %>% |
97 colnames(pwf_df) = c('item', PWF_file_paths[i,1]) | 29 (function(x) { |
98 if (!exists('PWF_df')) { | 30 ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x) |
99 PWF_df = pwf_df | 31 }) %>% |
100 } else { | 32 (function(x) { |
101 PWF_df = cbind(PWF_df, pwf_df[,2,drop=FALSE]) | 33 ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x) |
102 } | 34 }) %>% |
35 (function(x) { | |
36 paste0(x, '-o REPORT_DIR ') | |
37 }) | |
38 fastqc_command_reads_1 = paste0(fastqc_command, 'READS_1 > /dev/null 2>&1') | |
39 system(fastqc_command_reads_1, intern = TRUE) | |
40 | |
41 # Original html report | |
42 reads_1_base = tail(strsplit('READS_1', '/')[[1]], 1) | |
43 original_html = tags$a(href=paste0(reads_1_base, '_fastqc.html'), paste0('HTML report: ', opt$name_1)) | |
44 | |
45 unzip(paste0('REPORT_DIR/', reads_1_base, '_fastqc.zip'), exdir = 'REPORT_DIR') | |
46 reads_1_unzip = paste0('REPORT_DIR/', reads_1_base, '_fastqc/') | |
47 # fastqc_data.txt | |
48 file.copy(paste0(reads_1_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_1_fastqc_data.txt') | |
49 fastqc_data = tags$a(href='reads_1_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_1)) | |
50 # summary.txt | |
51 file.copy(paste0(reads_1_unzip, 'summary.txt'), 'REPORT_DIR/reads_1_summary.txt') | |
52 summary_data = tags$a(href='reads_1_summary.txt', paste0('summary.txt: ', opt$name_1)) | |
53 | |
54 tags$ul( | |
55 tags$li(original_html), | |
56 tags$li(fastqc_data), | |
57 tags$li(summary_data) | |
58 ) | |
103 } | 59 } |
104 ``` | 60 ``` |
105 | 61 |
106 | 62 |
63 ## Evaluation of reads after trimming | |
64 | |
107 ```{r} | 65 ```{r} |
108 my_icon = c('ok', 'remove', 'star') | 66 if ('READS_2' == 'None') { |
109 names(my_icon) = c('pass', 'fail', 'warn') | 67 stop("No pre-trimming reads provided!") |
110 evaluate_list = list() | 68 } else { |
111 for (i in colnames(PWF_df)[-1]) { | 69 ## run fastqc evaluation |
112 evaluate_list[[i]] = formatter( | 70 fastqc_command = paste0('fastqc ') %>% |
113 "span", | 71 (function(x) { |
114 style = x ~ style("background-color" = ifelse(x =='pass', '#9CD027', ifelse(x == 'fail', '#CC0000', '#FF4E00')), | 72 ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x) |
115 "color" = "white", | 73 }) %>% |
116 "width" = "50px", | 74 (function(x) { |
117 "float" = "left", | 75 ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x) |
118 "padding-right" = "5px") | 76 }) %>% |
119 ) | 77 (function(x) { |
78 paste0(x, '-o REPORT_DIR ') | |
79 }) | |
80 fastqc_command_reads_2 = paste0(fastqc_command, 'READS_2 > /dev/null 2>&1') | |
81 system(fastqc_command_reads_2, intern = TRUE) | |
82 | |
83 # Original html report | |
84 reads_2_base = tail(strsplit('READS_2', '/')[[1]], 1) | |
85 original_html = tags$a(href=paste0(reads_2_base, '_fastqc.html'), paste0('HTML report: ', opt$name_2)) | |
86 | |
87 unzip(paste0('REPORT_DIR/', reads_2_base, '_fastqc.zip'), exdir = 'REPORT_DIR') | |
88 reads_2_unzip = paste0('REPORT_DIR/', reads_2_base, '_fastqc/') | |
89 # fastqc_data.txt | |
90 file.copy(paste0(reads_2_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_2_fastqc_data.txt') | |
91 fastqc_data = tags$a(href='reads_2_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_2)) | |
92 # summary.txt | |
93 file.copy(paste0(reads_2_unzip, 'summary.txt'), 'REPORT_DIR/reads_2_summary.txt') | |
94 summary_data = tags$a(href='reads_2_summary.txt', paste0('summary.txt: ', opt$name_2)) | |
95 | |
96 tags$ul( | |
97 tags$li(original_html), | |
98 tags$li(fastqc_data), | |
99 tags$li(summary_data) | |
100 ) | |
120 } | 101 } |
102 ``` | |
121 | 103 |
122 formattable(PWF_df, evaluate_list) | 104 |
105 | |
106 # Fastqc output visualization | |
107 | |
108 ## Overview | |
109 | |
110 ```{r} | |
111 reads_1_summary = read.csv('REPORT_DIR/reads_1_summary.txt', header = FALSE, sep = '\t')[, 2:1] | |
112 reads_2_summary = read.csv('REPORT_DIR/reads_2_summary.txt', header = FALSE, sep = '\t')[, 1] | |
113 combined_summary = cbind(reads_1_summary, reads_2_summary) | |
114 names(combined_summary) = c('MODULE', paste0(opt$name_1, '(before)'), paste0(opt$name_2, '(after)')) | |
115 combined_summary[combined_summary == 'FAIL'] = 'FAIL (X)' | |
116 combined_summary[combined_summary == 'WARN'] = 'WARN (!)' | |
117 knitr::kable(combined_summary) | |
123 ``` | 118 ``` |
119 | |
120 # Session Info | |
121 | |
122 ```{r 'session info'} | |
123 sessionInfo() | |
124 ``` |