comparison fastqc_report.Rmd @ 16:1710b0e874f1 draft

fix file name issue
author mingchen0919
date Sat, 21 Oct 2017 09:25:49 -0400
parents d1d20f341632
children ac5c618e4d97
comparison
equal deleted inserted replaced
15:d1d20f341632 16:1710b0e874f1
8 highlight: tango 8 highlight: tango
9 --- 9 ---
10 10
11 ```{r setup, include=FALSE, warning=FALSE, message=FALSE} 11 ```{r setup, include=FALSE, warning=FALSE, message=FALSE}
12 knitr::opts_chunk$set( 12 knitr::opts_chunk$set(
13 echo = ECHO 13 echo = ECHO,
14 error = TRUE
14 ) 15 )
15 ``` 16 ```
16 17
17 18
18 # Fastqc Analysis 19 # Fastqc Evaluation
19 20
20 * Copy fastq files to job working directory 21 ## Evaluation of reads before trimming
21 22
22 ```{bash 'copy files'} 23 ```{r}
23 for f in $(echo READS | sed "s/,/ /g") 24 if ('READS_1' == 'None') {
24 do 25 stop("No pre-trimming reads provided!")
25 cp $f ./ 26 } else {
26 done 27 ## run fastqc evaluation
27 ``` 28 fastqc_command = paste0('fastqc ') %>%
28 29 (function(x) {
29 * Run fastqc 30 ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x)
30 31 }) %>%
31 ```{bash 'run fastqc'} 32 (function(x) {
32 for r in $(ls *.dat) 33 ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x)
33 do 34 }) %>%
34 fastqc -o REPORT_DIR $r > /dev/null 2>&1 35 (function(x) {
35 done 36 paste0(x, '-o REPORT_DIR ')
36 ``` 37 })
37 38 fastqc_command_reads_1 = paste0(fastqc_command, 'READS_1 > /dev/null 2>&1')
38 ## Evaluation results 39 system(fastqc_command_reads_1, intern = TRUE)
39 40
40 ```{r 'html report links'} 41 # Original html report
41 html_file = list.files('REPORT_DIR', pattern = '.*html') 42 reads_1_base = tail(strsplit('READS_1', '/')[[1]], 1)
42 tags$ul(tags$a(href=html_file, paste0('HTML report', opt$name))) 43 original_html = tags$a(href=paste0(reads_1_base, '_fastqc.html'), paste0('HTML report: ', opt$name_1))
44
45 unzip(paste0('REPORT_DIR/', reads_1_base, '_fastqc.zip'), exdir = 'REPORT_DIR')
46 reads_1_unzip = paste0('REPORT_DIR/', reads_1_base, '_fastqc/')
47 # fastqc_data.txt
48 file.copy(paste0(reads_1_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_1_fastqc_data.txt')
49 fastqc_data = tags$a(href='reads_1_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_1))
50 # summary.txt
51 file.copy(paste0(reads_1_unzip, 'summary.txt'), 'REPORT_DIR/reads_1_summary.txt')
52 summary_data = tags$a(href='reads_1_summary.txt', paste0('summary.txt: ', opt$name_1))
53
54 tags$ul(
55 tags$li(original_html),
56 tags$li(fastqc_data),
57 tags$li(summary_data)
58 )
59 }
43 ``` 60 ```
44 61
45 62
46 ```{r 'extract fastqc_data.txt and summary.txt'} 63 ## Evaluation of reads after trimming
47 # list all zip files
48 zip_file = list.files(path = 'REPORT_DIR', pattern = '.zip')
49 unzip(paste0('REPORT_DIR/', zip_file), exdir = 'REPORT_DIR')
50 64
51 unzip_directory = paste0(tail(strsplit(opt$reads, '/')[[1]], 1), '_fastqc/') 65 ```{r}
52 fastqc_data_txt_path = paste0('REPORT_DIR/', unzip_directory, 'fastqc_data.txt') 66 if ('READS_2' == 'None') {
53 summary_txt_path = paste0('REPORT_DIR/', unzip_directory, 'summary.txt') 67 stop("No pre-trimming reads provided!")
68 } else {
69 ## run fastqc evaluation
70 fastqc_command = paste0('fastqc ') %>%
71 (function(x) {
72 ifelse('CONTAMINANTS' != 'None', paste0(x, '-c CONTAMINANTS '), x)
73 }) %>%
74 (function(x) {
75 ifelse('LIMITS' != 'None', paste0(x, '-l LIMITS '), x)
76 }) %>%
77 (function(x) {
78 paste0(x, '-o REPORT_DIR ')
79 })
80 fastqc_command_reads_2 = paste0(fastqc_command, 'READS_2 > /dev/null 2>&1')
81 system(fastqc_command_reads_2, intern = TRUE)
82
83 # Original html report
84 reads_2_base = tail(strsplit('READS_2', '/')[[1]], 1)
85 original_html = tags$a(href=paste0(reads_2_base, '_fastqc.html'), paste0('HTML report: ', opt$name_2))
86
87 unzip(paste0('REPORT_DIR/', reads_2_base, '_fastqc.zip'), exdir = 'REPORT_DIR')
88 reads_2_unzip = paste0('REPORT_DIR/', reads_2_base, '_fastqc/')
89 # fastqc_data.txt
90 file.copy(paste0(reads_2_unzip, 'fastqc_data.txt'), 'REPORT_DIR/reads_2_fastqc_data.txt')
91 fastqc_data = tags$a(href='reads_2_fastqc_data.txt', paste0('fastqc_data.txt: ', opt$name_2))
92 # summary.txt
93 file.copy(paste0(reads_2_unzip, 'summary.txt'), 'REPORT_DIR/reads_2_summary.txt')
94 summary_data = tags$a(href='reads_2_summary.txt', paste0('summary.txt: ', opt$name_2))
95
96 tags$ul(
97 tags$li(original_html),
98 tags$li(fastqc_data),
99 tags$li(summary_data)
100 )
101 }
54 ``` 102 ```
55 103
56
57 ```{r 'summary.txt'}
58 tags$ul(tags$a(href=paste0(unzip_directory, 'summary.txt'), 'summary.txt'))
59 ```
60
61
62 ```{r 'fastqc_data.txt'}
63 tags$ul(tags$a(href=paste0(unzip_directory, 'fastqc_data.txt'), 'fastqc_data.txt'))
64 ```
65 104
66 105
67 # Fastqc output visualization 106 # Fastqc output visualization
68 107
69 ## Overview 108 ## Overview
70 109
71 ```{r} 110 ```{r}
72 # read.table(fastqc_data_txt_path) 111 reads_1_summary = read.csv('REPORT_DIR/reads_1_summary.txt', header = FALSE, sep = '\t')[, 2:1]
73 summary_txt = read.csv(summary_txt_path, header = FALSE, sep = '\t')[, 2:1] 112 reads_2_summary = read.csv('REPORT_DIR/reads_1_summary.txt', header = FALSE, sep = '\t')[, 1]
74 names(summary_txt) = c('MODULE', 'PASS/FAIL') 113 combined_summary = cbind(reads_1_summary, reads_2_summary)
75 knitr::kable(summary_txt) 114 names(combined_summary) = c('MODULE', paste0(opt$name_1, '(before)'), paste0(opt$name_2, '(after)'))
115 knitr::kable(combined_summary)
76 ``` 116 ```
77 117
78 ## Summary by module {.tabset} 118 ## Visualization by module {.tabset}
79 119
80 * Define a function to extract outputs for each module from fastqc output 120 * Define a function to extract outputs for each module from fastqc output
81 121
82 ```{r 'function definition'} 122 ```{r 'function definition'}
83 extract_data_module = function(fastqc_data, module_name) { 123 extract_data_module = function(fastqc_data, module_name) {
91 } 131 }
92 ``` 132 ```
93 133
94 ### Per base sequence quality 134 ### Per base sequence quality
95 135
96 ```{r} 136 ```{r 'per base sequence quality', fig.width=10}
97 pbsq = extract_data_module(fastqc_data_txt_path, 'Per base sequence quality') 137 ## reads 1
98 knitr::kable(pbsq) 138 pbsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per base sequence quality')
139 pbsq_1$id = 1:length(pbsq_1$X.Base)
140
141 melt_pbsq_1 = filter(melt(pbsq_1, id=c('X.Base', 'id')), variable != 'X90th.Percentile' & variable != 'X10th.Percentile')
142 melt_pbsq_1$trim = 'before'
143
144
145 ## reads 2
146 pbsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per base sequence quality')
147 pbsq_2$id = 1:length(pbsq_2$X.Base)
148
149 melt_pbsq_2 = filter(melt(pbsq_2, id=c('X.Base', 'id')), variable != 'X90th.Percentile' & variable != 'X10th.Percentile')
150 melt_pbsq_2$trim = 'after'
151
152 comb_pbsq = rbind(melt_pbsq_1, melt_pbsq_2)
153 comb_pbsq$trim = factor(levels = c('before', 'after'), comb_pbsq$trim)
154 p = ggplot(data = comb_pbsq) +
155 geom_line(mapping = aes(x = id, y = value, group = variable, color = variable)) +
156 scale_x_continuous(breaks = pbsq_2$id, labels = pbsq_2$X.Base) +
157 facet_grid(. ~ trim) +
158 theme(axis.text.x = element_text(angle=45))
159 ggplotly(p)
160
99 ``` 161 ```
100 162
101 ### Per tile sequence quality 163 ### Per tile sequence quality
102 164
103 ```{r} 165 ```{r 'per tile sequence quality'}
104 ptsq = extract_data_module(fastqc_data_txt_path, 'Per tile sequence quality') 166 ## reads 1
105 knitr::kable(ptsq) 167 ptsq_1 = extract_data_module('REPORT_DIR/reads_1_fastqc_data.txt', 'Per tile sequence quality')
168 ptsq_1$trim = 'before'
169
170 ## reads 2
171 ptsq_2 = extract_data_module('REPORT_DIR/reads_2_fastqc_data.txt', 'Per tile sequence quality')
172 ptsq_2$trim = 'after'
173
174 comb_ptsq = rbind(ptsq_1, ptsq_2)
175 comb_ptsq$trim = factor(levels = c('before', 'after'), comb_ptsq$trim)
176 comb_pbsq$Base = factor(levels = unique(comb_ptsq$Base), comb_ptsq$Base)
177
178 p = ggplot(data = comb_ptsq, aes(x = Base, y = X.Tile, fill = Mean)) +
179 geom_raster() +
180 facet_grid(. ~ trim) +
181 theme(axis.text.x = element_text(angle=45))
182 ggplotly(p)
106 ``` 183 ```
107 184
108 185
109 186
110 # Session Info 187 # Session Info