Mercurial > repos > jvolkening > nanopore_qc

nanopore_qc.R @ 0:e0006d8bf849 (draft)

planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/nanopore_qc
commit b99a7d95d62b95ececc9d808f5f183b9eb718f80-dirty

author: jvolkening
date:   Sat, 02 Mar 2024 03:35:34 +0000
#!/usr/bin/Rscript

# NanoporeQC was forked from:
# MinionQC version 1.0
# Copyright (C) 2017 Robert Lanfear
#
# Modifications (c) 2018-2024 Jeremy Volkening
#
# Released under the MIT license (see LICENSE)

# suppress warnings
options(warn=-1)

libs <- c(
    'ggplot2',
    'tidyr',
    'dplyr',
    'readr',
    'yaml',
    'scales',
    'futile.logger',
    'data.table',
    'optparse'
)
for (l in libs) {
    library(l, character.only = TRUE, quietly = TRUE)
}

# option parsing #

parser <- OptionParser()

parser <- add_option(parser,
    opt_str = c("-i", "--input"),
    type = "character",
    dest = 'input.file',
    help = "Input file or directory (required). Either a full path to a sequencing_summary.txt file, or a full path to a directory containing one or more such files. In the latter case the directory is searched recursively."
)

parser <- add_option(parser,
    opt_str = c("-o", "--outputdirectory"),
    type = "character",
    dest = 'output.dir',
    help = "Output directory (required). If a single sequencing_summary.txt file is passed as input, the output directory will contain just the plots associated with that file. If a directory containing more than one sequencing_summary.txt file is passed as input, the plots will be put into sub-directories named after the parent directories of each sequencing_summary.txt file."
)

parser <- add_option(parser,
    opt_str = c("-q", "--qscore_cutoff"),
    type = "double",
    default = 7.0,
    dest = 'q',
    help = "The cutoff value for the mean Q score of a read (default 7). Used to create separate plots for reads above and below this threshold."
)

parser <- add_option(parser,
    opt_str = c("-d", "--discard_failed"),
    type = "logical",
    default = FALSE,
    dest = 'filt.failed',
    help = "Discard reads that failed Albacore filtering."
)

opt = parse_args(parser)

input.file = opt$input.file
output.dir = opt$output.dir
filt.failed = opt$filt.failed
q = opt$q

# label used for the subset of reads with mean Q score at least q
q_title = paste("Q>=", q, sep="")


# build the map for R9.5, which has this unusual non-sequential numbering layout
map <- rbind(
    data.frame( channel=33:64,   row=rep(1:4,   each=8), col=rep(1:8,  4) ),
    data.frame( channel=481:512, row=rep(5:8,   each=8), col=rep(1:8,  4) ),
    data.frame( channel=417:448, row=rep(9:12,  each=8), col=rep(1:8,  4) ),
    data.frame( channel=353:384, row=rep(13:16, each=8), col=rep(1:8,  4) ),
    data.frame( channel=289:320, row=rep(17:20, each=8), col=rep(1:8,  4) ),
    data.frame( channel=225:256, row=rep(21:24, each=8), col=rep(1:8,  4) ),
    data.frame( channel=161:192, row=rep(25:28, each=8), col=rep(1:8,  4) ),
    data.frame( channel=97:128,  row=rep(29:32, each=8), col=rep(1:8,  4) ),
    data.frame( channel=1:32,    row=rep(1:4,   each=8), col=rep(16:9, 4) ),
    data.frame( channel=449:480, row=rep(5:8,   each=8), col=rep(16:9, 4) ),
    data.frame( channel=385:416, row=rep(9:12,  each=8), col=rep(16:9, 4) ),
    data.frame( channel=321:352, row=rep(13:16, each=8), col=rep(16:9, 4) ),
    data.frame( channel=257:288, row=rep(17:20, each=8), col=rep(16:9, 4) ),
    data.frame( channel=193:224, row=rep(21:24, each=8), col=rep(16:9, 4) ),
    data.frame( channel=129:160, row=rep(25:28, each=8), col=rep(16:9, 4) ),
    data.frame( channel=65:96,   row=rep(29:32, each=8), col=rep(16:9, 4) )
)
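
# A minimal sanity check of the map above (assumption: a valid R9.5 layout
# covers channels 1..512 exactly once on a 32-row x 16-column grid):
stopifnot(
    nrow(map) == 512,
    setequal(map$channel, 1:512),
    all(map$row %in% 1:32),
    all(map$col %in% 1:16)
)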

add_cols <- function(d, min.q){
    # take a sequencing summary data frame (d) and a minimum Q value of interest (min.q)
    # return the same data frame with the following columns added:
    #   cumulative.bases
    #   hour of run
    #   reads.per.hour

    d = subset(d, mean_qscore_template >= min.q)

    if(nrow(d)==0){
        flog.error(paste("There are no reads with a mean Q score higher than your cutoff of ", min.q, ". Please choose a lower cutoff and try again.", sep = ""))
        quit()
    }

    d = merge(d, map, by="channel")
    d = d[with(d, order(start_time)), ] # sort by start time
    d$cumulative.bases.time = cumsum(as.numeric(d$start_time))
    d = d[with(d, order(-sequence_length_template)), ] # sort by read length, descending
    d$cumulative.bases = cumsum(as.numeric(d$sequence_length_template))
    d$hour = d$start_time %/% 3600

    # add the number of reads generated in each hour
    reads.per.hour = as.data.frame(table(d$hour))
    names(reads.per.hour) = c("hour", "reads_per_hour")
    reads.per.hour$hour = as.numeric(as.character(reads.per.hour$hour))
    d = merge(d, reads.per.hour, by = c("hour"))
    return(d)
}
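
# Hypothetical example of the two-pass pattern used below in load_summary():
#   add_cols(d, -Inf)  # annotate all reads
#   add_cols(d, 7)     # annotate only reads with mean Q score >= 7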

load_summary <- function(filepath, min.q){
    # load a sequencing summary and add some info
    # min.q is a vector of length 2 defining two Q-score cutoffs: the lower
    # is -Inf by default (i.e. it includes all reads); the other is the
    # user-supplied cutoff from the command line
    d = read_tsv(
        filepath,
        col_types = cols_only(
            channel = 'i',
            passes_filtering = 'c',
            num_events_template = 'i',
            sequence_length_template = 'i',
            mean_qscore_template = 'n',
            sequence_length_2d = 'i',
            mean_qscore_2d = 'n',
            start_time = 'n'
        )
    )

    if("sequence_length_2d" %in% names(d)){
        # it's a 1D2 or 2D run
        d$sequence_length_template = as.numeric(as.character(d$sequence_length_2d))
        d$mean_qscore_template = as.numeric(as.character(d$mean_qscore_2d))
        d$num_events_template = NA
        d$start_time = as.numeric(as.character(d$start_time))

    }else{
        d$sequence_length_template = as.numeric(as.character(d$sequence_length_template))
        d$mean_qscore_template = as.numeric(as.character(d$mean_qscore_template))
        d$num_events_template = as.numeric(as.character(d$num_events_template))
        d$start_time = as.numeric(as.character(d$start_time))
    }

    # ignore 0-length reads
    d <- d[d$sequence_length_template > 0,]
    # ignore reads failing filtering
    if (filt.failed) {
        d <- d[d$passes_filtering == 'True',]
    }

    d$events_per_base = d$num_events_template/d$sequence_length_template

    flowcell = basename(dirname(filepath))

    # add columns for all the reads
    d1 = add_cols(d, min.q[1])
    d1$Q_cutoff = "All reads"

    # add columns for just the reads that pass the user Q threshold
    d2 = add_cols(d, min.q[2])
    d2$Q_cutoff = q_title

    # bind those two together into one data frame
    d = as.data.frame(rbindlist(list(d1, d2)))

    # name the flowcell (useful for analyses with >1 flowcell)
    d$flowcell = flowcell

    # make sure this is a factor
    d$Q_cutoff = as.factor(d$Q_cutoff)

    keep = c("hour", "start_time", "channel", "sequence_length_template", "mean_qscore_template", "row", "col", "cumulative.bases", "cumulative.bases.time", "reads_per_hour", "Q_cutoff", "flowcell", "events_per_base")
    d = d[keep]

    # bin the run into 9 equal time windows (used for the flowcell overview plots)
    d$start_bin = cut(d$start_time, 9, labels = c(1:9))

    return(d)
}

reads.gt <- function(d, len){
    # return the number of reads in data frame d
    # that are at least as long as length len
    return(length(which(d$sequence_length_template >= len)))
}

bases.gt <- function(d, len){
    # return the number of bases contained in reads from data frame d
    # that are at least as long as length len
    reads = subset(d, sequence_length_template >= len)
    return(sum(as.numeric(reads$sequence_length_template)))
}

log10_minor_break = function (...){
    # function to add minor breaks to a log10 graph
    # hat-tip: https://stackoverflow.com/questions/30179442/plotting-minor-breaks-on-a-log-scale-with-ggplot
    function(x) {
        minx = floor(min(log10(x), na.rm = TRUE)) - 1
        maxx = ceiling(max(log10(x), na.rm = TRUE)) + 1
        n_major = maxx - minx + 1
        major_breaks = seq(minx, maxx, by = 1)
        minor_breaks =
            rep(log10(seq(1, 9, by = 1)), times = n_major) +
            rep(major_breaks, each = 9)
        return(10^(minor_breaks))
    }
}
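
# Usage sketch (hypothetical data, for illustration only): the closure
# returned above is passed to a ggplot2 log10 scale so that minor ticks
# fall at 1x-9x of each decade, e.g.
#   ggplot(data.frame(x = 10^runif(1000, 0, 5)), aes(x = x)) +
#       geom_histogram(bins = 50) +
#       scale_x_log10(minor_breaks = log10_minor_break())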

binSearch <- function(min, max, df, t = 100000) {
    # binary search algorithm, thanks to https://stackoverflow.com/questions/46292438/optimising-a-calculation-on-every-cumulative-subset-of-a-vector-in-r/46303384#46303384
    # the aim is to return the number of reads in a dataset (df)
    # that comprise the largest subset of reads with an N50 of t
    # we use this to calculate the number of 'ultra-long' reads,
    # which are defined as the largest subset with N50 > 100 kb
    mid = floor(mean(c(min, max)))
    if (mid == min) {
        if (df$sequence_length_template[min(which(df$cumulative.bases > df$cumulative.bases[min]/2))] < t) {
            return(min - 1)
        } else {
            return(max - 1)
        }
    }

    n = df$sequence_length_template[min(which(df$cumulative.bases > df$cumulative.bases[mid]/2))]
    if (n >= t) {
        return(binSearch(mid, max, df, t))
    } else {
        return(binSearch(min, mid, df, t))
    }
}
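
# Worked sketch of the search above (illustrative): df must be sorted by
# descending read length with cumulative.bases precomputed. The N50 of the
# prefix df[1:i, ] is the length of the read at which cumulative bases first
# exceed cumulative.bases[i]/2; since shorter prefixes keep only the longer
# reads, the predicate "prefix N50 >= t" is monotone in i, and
#   binSearch(1, nrow(df), df, t = 100000)
# can bisect for the largest prefix that still satisfies it.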


summary.stats <- function(d, Q_cutoff="All reads"){
    # Calculate summary stats for a single value of min.q

    rows = which(as.character(d$Q_cutoff)==Q_cutoff)
    d = d[rows,]
    d = d[with(d, order(-sequence_length_template)), ] # sort by read length, just in case

    total.bases = sum(as.numeric(d$sequence_length_template))
    total.reads = nrow(d)
    # N50: the length of the read at which cumulative bases first exceed half the total
    N50.length = d$sequence_length_template[min(which(d$cumulative.bases > (total.bases/2)))]
    mean.length = round(mean(as.numeric(d$sequence_length_template)), digits = 1)
    median.length = round(median(as.numeric(d$sequence_length_template)), digits = 1)
    max.length = max(as.numeric(d$sequence_length_template))
    mean.q = round(mean(d$mean_qscore_template), digits = 1)
    median.q = round(median(d$mean_qscore_template), digits = 1)

    # calculate ultra-long reads and bases (max amount of data with N50 > 100 kb)
    ultra.reads = binSearch(1, nrow(d), d, t = 100000)
    if(ultra.reads >= 1){
        ultra.gigabases = sum(as.numeric(d$sequence_length_template[1:ultra.reads]))/1000000000
    }else{
        ultra.gigabases = 0
    }

    reads = list(
        reads.gt(d, 10000),
        reads.gt(d, 20000),
        reads.gt(d, 50000),
        reads.gt(d, 100000),
        reads.gt(d, 200000),
        reads.gt(d, 500000),
        reads.gt(d, 1000000),
        ultra.reads)
    names(reads) = c(">10kb", ">20kb", ">50kb", ">100kb", ">200kb", ">500kb", ">1m", "ultralong")

    bases = list(
        bases.gt(d, 10000)/1000000000,
        bases.gt(d, 20000)/1000000000,
        bases.gt(d, 50000)/1000000000,
        bases.gt(d, 100000)/1000000000,
        bases.gt(d, 200000)/1000000000,
        bases.gt(d, 500000)/1000000000,
        bases.gt(d, 1000000)/1000000000,
        ultra.gigabases)
    names(bases) = c(">10kb", ">20kb", ">50kb", ">100kb", ">200kb", ">500kb", ">1m", "ultralong")

    return(list('total.gigabases' = total.bases/1000000000,
                'total.reads' = total.reads,
                'N50.length' = N50.length,
                'mean.length' = mean.length,
                'median.length' = median.length,
                'max.length' = max.length,
                'mean.q' = mean.q,
                'median.q' = median.q,
                'reads' = reads,
                'gigabases' = bases
    ))
}

channel.summary <- function(d){
    # calculate summaries of what happened in each of the channels
    # of a flowcell

    a <- d %>% group_by(channel) %>% summarize(
        total.bases = sum(sequence_length_template),
        total.reads = length(which(sequence_length_template >= 0)),
        mean.read.length = mean(sequence_length_template),
        median.read.length = median(sequence_length_template)
    )

    b = gather(a, key, value, -channel)
    return(b)
}
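
# Hypothetical usage (channel.summary is not called elsewhere in this script;
# it is kept for per-channel diagnostics):
#   per.channel <- channel.summary(subset(d, Q_cutoff == "All reads"))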


single.flowcell <- function(input.file, output.dir, q=8){
    # wrapper function to analyse data from a single flowcell
    # input.file is a sequencing_summary.txt file from a 1D run
    # output.dir is the output directory into which to write results
    # q is the cutoff used for Q values, set by the user
    flog.info(paste("Loading input file:", input.file))
    d = load_summary(input.file, min.q=c(-Inf, q))

    flowcell = unique(d$flowcell)

    flog.info(paste(sep = "", flowcell, ": creating output directory: ", output.dir))
    dir.create(output.dir)
    out.txt = file.path(output.dir, "summary.yaml")

    flog.info(paste(sep = "", flowcell, ": summarising input file for flowcell"))
    all.reads.summary = summary.stats(d, Q_cutoff = "All reads")
    q.reads.summary = summary.stats(d, Q_cutoff = q_title)

    summary = list("input file" = input.file,
                   "All reads" = all.reads.summary,
                   cutoff = q.reads.summary,
                   "notes" = 'ultralong reads refers to the largest set of reads with N50>100KB')

    names(summary)[3] = q_title

    write(as.yaml(summary), out.txt)
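
    # Illustrative shape of summary.yaml (values are made up):
    #   input file: /path/to/sequencing_summary.txt
    #   All reads:
    #     total.gigabases: 4.2
    #     total.reads: 512345
    #     N50.length: 14502
    #     ...
    #   Q>=7: {...}
    #   notes: ultralong reads refers to the largest set of reads with N50>100KB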

    muxes = seq(from = 0, to = max(d$hour), by = 8)

    # make plots
    flog.info(paste(sep = "", flowcell, ": plotting length histogram"))
    len.lims <- quantile(d$sequence_length_template, c(0.01, 0.99))
    len.lims[1] <- max(1, len.lims[1])

    p1 = ggplot(d, aes(x = sequence_length_template)) +
        geom_histogram(bins = 200, fill="steelblue") +
        scale_x_log10(breaks=round(10^pretty(log10(len.lims), n=5), 0), limits=len.lims) +
        facet_wrap(~Q_cutoff, ncol = 1, scales = "free_y") +
        #theme(text = element_text(size = 15)) +
        theme_linedraw() +
        xlab("Read length") +
        ylab("Number of reads")
    ggsave(filename = file.path(output.dir, "length_histogram.png"), width = 7, height = 4, dpi=300, plot = p1)
    ggsave(filename = file.path(output.dir, "length_histogram.screen.png"), width = 7, height = 4, dpi=90, plot = p1)

    flog.info(paste(sep = "", flowcell, ": plotting mean Q score histogram"))
    q.lims <- quantile(d$mean_qscore_template, c(0.005, 0.995))
    p2 = ggplot(subset(d, Q_cutoff=="All reads"), aes(x = mean_qscore_template)) +
        geom_histogram(bins = 200, fill="steelblue") +
        xlim(q.lims) +
        geom_vline(xintercept=q) +
        #theme(text = element_text(size = 15)) +
        theme_linedraw() +
        xlab("Mean Q score of read") +
        ylab("Number of reads")
    ggsave(filename = file.path(output.dir, "q_histogram.png"), width = 7, height = 4, dpi=300, plot = p2)
    ggsave(filename = file.path(output.dir, "q_histogram.screen.png"), width = 7, height = 4, dpi=90, plot = p2)


    flog.info(paste(sep = "", flowcell, ": plotting flowcell overview"))

    df <- data.frame(row=integer(), col=integer(), bin=integer(), quality=numeric())

    # compute the median Q score for each flowcell position (row, col) at the
    # start, middle, and end of the run (start_time bins 1, 5, and 9)
    for (b in c(1, 5, 9)) {
        for (r in 1:32) {
            for (c in 1:16) {
                sub <- subset(d, Q_cutoff=="All reads" & col == c & row == r & start_bin == b)
                Q <- median(sub$mean_qscore_template)
                df <- rbind(df, list(row=r, col=c, bin=b, quality=Q))
            }
        }
    }

    df$bin <- factor(df$bin)
    levels(df$bin) <- c("Run start", "Run middle", "Run end")
    p5 = ggplot(df, aes(col, row)) +
        geom_tile(aes(fill = quality), color="white") +
        scale_fill_gradient(low = "white", high="steelblue") +
        xlab("Column") +
        ylab("Row") +
        facet_wrap(~bin, ncol=3) +
        theme_linedraw() +
        theme(panel.grid.minor = element_blank(), panel.grid.major=element_blank())
        #theme(text = element_text(size = 40), axis.text.x = element_text(size=12), axis.text.y = element_text(size=12), legend.text=element_text(size=12))
    ggsave(filename = file.path(output.dir, "flowcell_overview.png"), width = 7, height = 4, dpi=300, plot = p5)
    ggsave(filename = file.path(output.dir, "flowcell_overview.screen.png"), width = 7, height = 4, dpi=90, plot = p5)

    flog.info(paste(sep = "", flowcell, ": plotting cumulative yield summary"))
    p5a = ggplot(d, aes(x=start_time/3600, y=cumulative.bases.time, colour = Q_cutoff)) +
        geom_line(size = 1) +
        geom_vline(xintercept = muxes, colour = 'red', linetype = 'dashed', alpha = 0.5) +
        xlab("Elapsed time (hrs)") +
        ylab("Total yield in bases") +
        scale_colour_discrete(guide = guide_legend(title = "Reads")) +
        scale_x_continuous() +
        theme_linedraw()
        #theme(text = element_text(size = 15))

    ggsave(filename = file.path(output.dir, "cumulative_yield.png"), width = 7, height = 4, dpi=300, plot = p5a)
    ggsave(filename = file.path(output.dir, "cumulative_yield.screen.png"), width = 7, height = 4, dpi=90, plot = p5a)

    flog.info(paste(sep = "", flowcell, ": plotting flowcell yield summary"))
    p6 = ggplot(d, aes(x=sequence_length_template, y=cumulative.bases, colour = Q_cutoff)) +
        geom_line(size = 1) +
        xlab("Minimum read length") +
        ylab("Total yield in bases") +
        scale_colour_discrete(guide = guide_legend(title = "Reads")) +
        #theme(text = element_text(size = 15))
        theme_linedraw()
    xmax = max(d$sequence_length_template[which(d$cumulative.bases > 0.01 * max(d$cumulative.bases))])
    p6 = p6 + scale_x_continuous(limits = c(0, xmax))

    ggsave(filename = file.path(output.dir, "yield_summary.png"), width = 7, height = 4, dpi=300, plot = p6)
    ggsave(filename = file.path(output.dir, "yield_summary.screen.png"), width = 7, height = 4, dpi=90, plot = p6)

    flog.info(paste(sep = "", flowcell, ": plotting sequence length over time"))
    e = subset(d, Q_cutoff=="All reads")
    e$Q = paste(">=", q, sep="")
    e$Q[which(e$mean_qscore_template < q)] = paste("<", q, sep="")
    p7 = ggplot(e, aes(x=start_time/3600, y=sequence_length_template, colour = Q, group = Q)) +
        geom_vline(xintercept = muxes, colour = 'red', linetype = 'dashed', alpha = 0.5) +
        geom_smooth() +
        xlab("Elapsed time (hrs)") +
        ylab("Mean read length") +
        ylim(0, NA) +
        theme_linedraw()
    ggsave(filename = file.path(output.dir, "length_by_hour.png"), width = 7, height = 4, dpi=300, plot = p7)
    ggsave(filename = file.path(output.dir, "length_by_hour.screen.png"), width = 7, height = 4, dpi=90, plot = p7)

    flog.info(paste(sep = "", flowcell, ": plotting Q score over time"))
    p8 = ggplot(e, aes(x=start_time/3600, y=mean_qscore_template, colour = Q, group = Q)) +
        geom_vline(xintercept = muxes, colour = 'red', linetype = 'dashed', alpha = 0.5) +
        geom_smooth() +
        xlab("Elapsed time (hrs)") +
        ylab("Mean Q score") +
        ylim(0, NA) +
        theme_linedraw()
    ggsave(filename = file.path(output.dir, "q_by_hour.png"), width = 7, height = 4, dpi=300, plot = p8)
    ggsave(filename = file.path(output.dir, "q_by_hour.screen.png"), width = 7, height = 4, dpi=90, plot = p8)

    flog.info(paste(sep = "", flowcell, ": plotting reads per hour"))
    f = d[c("hour", "reads_per_hour", "Q_cutoff")]
    f = f[!duplicated(f),]
    g = subset(f, Q_cutoff=="All reads")
    h = subset(f, Q_cutoff==q_title)
    max.hour = max(f$hour)
    # all of this is just to fill in hours with no reads recorded
    all.hours = 0:max.hour
    add.g = all.hours[which(all.hours %in% g$hour == FALSE)]
    if(length(add.g) > 0){
        add.g = data.frame(hour = add.g, reads_per_hour = 0, Q_cutoff = "All reads")
        g = rbind(g, add.g)
    }
    add.h = all.hours[which(all.hours %in% h$hour == FALSE)]
    if(length(add.h) > 0){
        add.h = data.frame(hour = add.h, reads_per_hour = 0, Q_cutoff = q_title)
        h = rbind(h, add.h)
    }
    i = rbind(g, h)
    i$Q_cutoff = as.character(i$Q_cutoff)
    i$Q_cutoff[which(i$Q_cutoff==q_title)] = paste("Q>=", q, sep="")
    p9 = ggplot(i, aes(x=hour, y=reads_per_hour, colour = Q_cutoff, group = Q_cutoff)) +
        geom_vline(xintercept = muxes, colour = 'red', linetype = 'dashed', alpha = 0.5) +
        #geom_point() +
        geom_line(size=1) +
        xlab("Elapsed time (hrs)") +
        ylab("Number of reads per hour") +
        ylim(0, NA) +
        scale_color_discrete(guide = guide_legend(title = "Reads")) +
        theme_linedraw()
    ggsave(filename = file.path(output.dir, "reads_per_hour.png"), width = 7, height = 4, dpi=300, plot = p9)
    ggsave(filename = file.path(output.dir, "reads_per_hour.screen.png"), width = 7, height = 4, dpi=90, plot = p9)


    flog.info(paste(sep = "", flowcell, ": plotting read length vs. q score scatterplot"))
    p10 = ggplot(subset(d, Q_cutoff=="All reads"), aes(x = sequence_length_template, y = mean_qscore_template, colour = events_per_base)) +
        geom_point(alpha=0.05, size = 0.4) +
        scale_x_log10(minor_breaks=log10_minor_break()) +
        labs(colour='Events per base\n(log scale)\n') +
        theme(text = element_text(size = 15)) +
        xlab("Read length") +
        ylab("Mean Q score of read") +
        theme_linedraw()

    if(max(d$events_per_base, na.rm = TRUE) > 0){
        # a catch for 1D2 runs, which don't have events per base
        p10 = p10 + scale_colour_distiller(trans = "log", labels = scientific, palette = 'Spectral')
    }

    ggsave(filename = file.path(output.dir, "length_vs_q.png"), width = 7, height = 4, dpi=300, plot = p10)
    ggsave(filename = file.path(output.dir, "length_vs_q.screen.png"), width = 7, height = 4, dpi=90, plot = p10)

    return(d)
}

if(file_test("-f", input.file)){
    # if it's an existing file (not a folder), just run one analysis
    d = single.flowcell(input.file, output.dir, q)
}else{
    # the input is not a regular file
    flog.warn(paste("Could not find a sequencing summary file in your input, which was: ",
                    input.file,
                    "\nThe input must be a sequencing_summary.txt file produced by Albacore"))
}