Mercurial > repos > jvolkening > nanopore_qc
diff nanopore_qc.R @ 0:e0006d8bf849 draft
planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/nanopore_qc commit b99a7d95d62b95ececc9d808f5f183b9eb718f80-dirty
author | jvolkening |
---|---|
date | Sat, 02 Mar 2024 03:35:34 +0000 (10 months ago) |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/nanopore_qc.R Sat Mar 02 03:35:34 2024 +0000 @@ -0,0 +1,527 @@ +#!/usr/bin/Rscript + +# NanoporeQC was forked from: +# MinionQC version 1.0 +# Copyright (C) 2017 Robert Lanfear +# +# Modifications (c) 2018-2024 Jeremy Volkening +# +# Released under the MIT license (see LICENSE) + +# supress warnings +options(warn=-1) + +libs <- c( + 'ggplot2', + 'tidyr', + 'dplyr', + 'readr', + 'yaml', + 'scales', + 'futile.logger', + 'data.table', + 'optparse' +) +for (l in libs) { + library(l, character.only = T, quietly = T) +} + +# option parsing # + +parser <- OptionParser() + +parser <- add_option(parser, + opt_str = c("-i", "--input"), + type = "character", + dest = 'input.file', + help="Input file or directory (required). Either a full path to a sequence_summary.txt file, or a full path to a directory containing one or more such files. In the latter case the directory is searched recursively." + ) + +parser <- add_option(parser, + opt_str = c("-o", "--outputdirectory"), + type = "character", + dest = 'output.dir', + help="Output directory (required). If a single sequencing_summary.txt file is passed as input, then the output directory will contain just the plots associated with that file. If a directory containing more than one sequencing_summary.txt files is passed as input, then the plots will be put into sub-directories that have the same names as the parent directories of each sequencing_summary.txt file" + ) + +parser <- add_option(parser, + opt_str = c("-q", "--qscore_cutoff"), + type="double", + default=7.0, + dest = 'q', + help="The cutoff value for the mean Q score of a read (default 7). Used to create separate plots for reads above and below this threshold" + ) + +parser <- add_option(parser, + opt_str = c("-d", "--discard_failed"), + type="logical", + default=FALSE, + dest = 'filt.failed', + help="Discard reads that failed Albacore filtering" + ) + +opt = parse_args(parser) + +input.file = opt$input.file +output.dir = opt$output.dir +filt.failed = opt$filt.failed +q = opt$q + +# this is how we label the reads at least as good as q +q_title = paste("Q>=", q, sep="") + + +# build the map for R9.5, which has this unusual non-sequential numbering layout +map <- rbind( + data.frame( channel=33:64, row=rep(1:4, each=8), col=rep(1:8, 4) ), + data.frame( channel=481:512, row=rep(5:8, each=8), col=rep(1:8, 4) ), + data.frame( channel=417:448, row=rep(9:12, each=8), col=rep(1:8, 4) ), + data.frame( channel=353:384, row=rep(13:16, each=8), col=rep(1:8, 4) ), + data.frame( channel=289:320, row=rep(17:20, each=8), col=rep(1:8, 4) ), + data.frame( channel=225:256, row=rep(21:24, each=8), col=rep(1:8, 4) ), + data.frame( channel=161:192, row=rep(25:28, each=8), col=rep(1:8, 4) ), + data.frame( channel=97:128, row=rep(29:32, each=8), col=rep(1:8, 4) ), + data.frame( channel=1:32, row=rep(1:4, each=8), col=rep(16:9, 4) ), + data.frame( channel=449:480, row=rep(5:8, each=8), col=rep(16:9, 4) ), + data.frame( channel=385:416, row=rep(9:12, each=8), col=rep(16:9, 4) ), + data.frame( channel=321:352, row=rep(13:16, each=8), col=rep(16:9, 4) ), + data.frame( channel=257:288, row=rep(17:20, each=8), col=rep(16:9, 4) ), + data.frame( channel=193:224, row=rep(21:24, each=8), col=rep(16:9, 4) ), + data.frame( channel=129:160, row=rep(25:28, each=8), col=rep(16:9, 4) ), + data.frame( channel=65:96, row=rep(29:32, each=8), col=rep(16:9, 4) ) +) + +add_cols <- function(d, min.q){ + # take a sequencing sumamry file (d), and a minimum Q value you are interested in (min.q) + # return the same data frame with the following columns added + # cumulative.bases + # hour of run + # reads.per.hour + + d = subset(d, mean_qscore_template >= min.q) + + if(nrow(d)==0){ + flog.error(paste("There are no reads with a mean Q score higher than your cutoff of ", min.q, ". Please choose a lower cutoff and try again.", sep = "")) + quit() + } + + d = merge(d, map, by="channel") + d = d[with(d, order(start_time)), ] # sort by read length + d$cumulative.bases.time = cumsum(as.numeric(d$start_time)) + d = d[with(d, order(-sequence_length_template)), ] # sort by read length + d$cumulative.bases = cumsum(as.numeric(d$sequence_length_template)) + d$hour = d$start_time %/% 3600 + + # add the reads generated for each hour + reads.per.hour = as.data.frame(table(d$hour)) + names(reads.per.hour) = c("hour", "reads_per_hour") + reads.per.hour$hour = as.numeric(as.character(reads.per.hour$hour)) + d = merge(d, reads.per.hour, by = c("hour")) + return(d) +} + + +load_summary <- function(filepath, min.q){ + # load a sequencing summary and add some info + # min.q is a vector of length 2 defining 2 levels of min.q to have + # by default the lowest value is -Inf, i.e. includes all reads. The + # other value in min.q is set by the user at the command line + d = read_tsv( + filepath, + col_types = cols_only( + channel = 'i', + passes_filtering = 'c', + num_events_template = 'i', + sequence_length_template = 'i', + mean_qscore_template = 'n', + sequence_length_2d = 'i', + mean_qscore_2d = 'n', + start_time = 'n' + ) + ) + + if("sequence_length_2d" %in% names(d)){ + # it's a 1D2 or 2D run + d$sequence_length_template = as.numeric(as.character(d$sequence_length_2d)) + d$mean_qscore_template = as.numeric(as.character(d$mean_qscore_2d)) + d$num_events_template = NA + d$start_time = as.numeric(as.character(d$start_time)) + + }else{ + d$sequence_length_template = as.numeric(as.character(d$sequence_length_template)) + d$mean_qscore_template = as.numeric(as.character(d$mean_qscore_template)) + d$num_events_template = as.numeric(as.character(d$num_events_template)) + d$start_time = as.numeric(as.character(d$start_time)) + } + + # ignore 0-length reads + d <- d[d$sequence_length_template > 0,] + # ignore reads failing filtering + if (filt.failed) { + d <- d[d$passes_filtering == 'True',] + } + + d$events_per_base = d$num_events_template/d$sequence_length_template + + flowcell = basename(dirname(filepath)) + + # add columns for all the reads + d1 = add_cols(d, min.q[1]) + d1$Q_cutoff = "All reads" + + # add columns for just the reads that pass the user Q threshold + d2 = add_cols(d, min.q[2]) + d2$Q_cutoff = q_title + + # bind those two together into one data frame + d = as.data.frame(rbindlist(list(d1, d2))) + + # name the flowcell (useful for analyses with >1 flowcell) + d$flowcell = flowcell + + # make sure this is a factor + d$Q_cutoff = as.factor(d$Q_cutoff) + + + keep = c("hour","start_time", "channel", "sequence_length_template", "mean_qscore_template", "row", "col", "cumulative.bases", "cumulative.bases.time", "reads_per_hour", "Q_cutoff", "flowcell", "events_per_base") + d = d[keep] + + d$start_bin = cut(d$start_time, 9,labels=c(1:9)) + + return(d) +} + +reads.gt <- function(d, len){ + # return the number of reads in data frame d + # that are at least as long as length len + return(length(which(d$sequence_length_template>=len))) +} + +bases.gt <- function(d, len){ + # return the number of bases contained in reads from + # data frame d + # that are at least as long as length len + reads = subset(d, sequence_length_template >= len) + return(sum(as.numeric(reads$sequence_length_template))) +} + +log10_minor_break = function (...){ + # function to add minor breaks to a log10 graph + # hat-tip: https://stackoverflow.com/questions/30179442/plotting-minor-breaks-on-a-log-scale-with-ggplot + function(x) { + minx = floor(min(log10(x), na.rm=T))-1; + maxx = ceiling(max(log10(x), na.rm=T))+1; + n_major = maxx-minx+1; + major_breaks = seq(minx, maxx, by=1) + minor_breaks = + rep(log10(seq(1, 9, by=1)), times = n_major)+ + rep(major_breaks, each = 9) + return(10^(minor_breaks)) + } +} + +binSearch <- function(min, max, df, t = 100000) { + # binary search algorithm, thanks to https://stackoverflow.com/questions/46292438/optimising-a-calculation-on-every-cumulative-subset-of-a-vector-in-r/46303384#46303384 + # the aim is to return the number of reads in a dataset (df) + # that comprise the largest subset of reads with an N50 of t + # we use this to calculte the number of 'ultra long' reads + # which are defined as those with N50 > 100KB + mid = floor(mean(c(min, max))) + if (mid == min) { + if (df$sequence_length_template[min(which(df$cumulative.bases>df$cumulative.bases[min]/2))] < t) { + return(min - 1) + } else { + return(max - 1) + } + } + + n = df$sequence_length_template[min(which(df$cumulative.bases>df$cumulative.bases[mid]/2))] + if (n >= t) { + return(binSearch(mid, max, df)) + } else { + return(binSearch(min, mid, df)) + } +} + + +summary.stats <- function(d, Q_cutoff="All reads"){ + # Calculate summary stats for a single value of min.q + + rows = which(as.character(d$Q_cutoff)==Q_cutoff) + d = d[rows,] + d = d[with(d, order(-sequence_length_template)), ] # sort by read length, just in case + + total.bases = sum(as.numeric(d$sequence_length_template)) + total.reads = nrow(d) + N50.length = d$sequence_length_template[min(which(d$cumulative.bases > (total.bases/2)))] + mean.length = round(mean(as.numeric(d$sequence_length_template)), digits = 1) + median.length = round(median(as.numeric(d$sequence_length_template)), digits = 1) + max.length = max(as.numeric(d$sequence_length_template)) + mean.q = round(mean(d$mean_qscore_template), digits = 1) + median.q = round(median(d$mean_qscore_template), digits = 1) + + #calculate ultra-long reads and bases (max amount of data with N50>100KB) + ultra.reads = binSearch(1, nrow(d), d, t = 100000) + if(ultra.reads>=1){ + ultra.gigabases = sum(as.numeric(d$sequence_length_template[1:ultra.reads]))/1000000000 + }else{ + ultra.gigabases = 0 + } + + reads = list( + reads.gt(d, 10000), + reads.gt(d, 20000), + reads.gt(d, 50000), + reads.gt(d, 100000), + reads.gt(d, 200000), + reads.gt(d, 500000), + reads.gt(d, 1000000), + ultra.reads) + names(reads) = c(">10kb", ">20kb", ">50kb", ">100kb", ">200kb", ">500kb", ">1m", "ultralong") + + bases = list( + bases.gt(d, 10000)/1000000000, + bases.gt(d, 20000)/1000000000, + bases.gt(d, 50000)/1000000000, + bases.gt(d, 100000)/1000000000, + bases.gt(d, 200000)/1000000000, + bases.gt(d, 500000)/1000000000, + bases.gt(d, 1000000)/1000000000, + ultra.gigabases) + names(bases) = c(">10kb", ">20kb", ">50kb", ">100kb", ">200kb", ">500kb", ">1m", "ultralong") + + return(list('total.gigabases' = total.bases/1000000000, + 'total.reads' = total.reads, + 'N50.length' = N50.length, + 'mean.length' = mean.length, + 'median.length' = median.length, + 'max.length' = max.length, + 'mean.q' = mean.q, + 'median.q' = median.q, + 'reads' = reads, + 'gigabases' = bases + )) +} + +channel.summary <- function(d){ + # calculate summaries of what happened in each of the channels + # of a flowcell + + a <- d %>% group_by(channel) %>% summarize( + total.bases = sum(sequence_length_template), + total.reads = sum(which(sequence_length_template>=0)), + mean.read.length = mean(sequence_length_template), + median.read.length = median(sequence_length_template) + ) + + #b = melt(a, id.vars = c("channel")) + b = gather(a, key, -channel) + return(b) +} + + +single.flowcell <- function(input.file, output.dir, q=8){ + # wrapper function to analyse data from a single flowcell + # input.file is a sequencing_summary.txt file from a 1D run + # output.dir is the output directory into which to write results + # q is the cutoff used for Q values, set by the user + flog.info(paste("Loading input file:", input.file)) + d = load_summary(input.file, min.q=c(-Inf, q)) + + flowcell = unique(d$flowcell) + + flog.info(paste(sep = "", flowcell, ": creating output directory:", output.dir)) + dir.create(output.dir) + out.txt = file.path(output.dir, "summary.yaml") + + flog.info(paste(sep = "", flowcell, ": summarising input file for flowcell")) + all.reads.summary = summary.stats(d, Q_cutoff = "All reads") + q10.reads.summary = summary.stats(d, Q_cutoff = q_title) + + summary = list("input file" = input.file, + "All reads" = all.reads.summary, + cutoff = q10.reads.summary, + "notes" = 'ultralong reads refers to the largest set of reads with N50>100KB') + + names(summary)[3] = q_title + + write(as.yaml(summary), out.txt) + + muxes = seq(from = 0, to = max(d$hour), by = 8) + + # make plots + flog.info(paste(sep = "", flowcell, ": plotting length histogram")) + len.lims <- quantile(d$sequence_length_template,c(0.01,0.99)) + len.lims[1] <- max(1,len.lims[1]) + + p1 = ggplot(d, aes(x = sequence_length_template)) + + geom_histogram(bins = 200, fill="steelblue") + + scale_x_log10(breaks=round(10^pretty(log10(len.lims),n=5),0),limits=len.lims) + + facet_wrap(~Q_cutoff, ncol = 1, scales = "free_y") + + #theme(text = element_text(size = 15)) + + theme_linedraw() + + xlab("Read length") + + ylab("Number of reads") + ggsave(filename = file.path(output.dir, "length_histogram.png"), width = 7, height = 4, dpi=300, plot = p1) + ggsave(filename = file.path(output.dir, "length_histogram.screen.png"), width = 7, height = 4, dpi=90, plot = p1) + + flog.info(paste(sep = "", flowcell, ": plotting mean Q score histogram")) + q.lims <- quantile(d$mean_qscore_template,c(0.005,0.995)) + p2 = ggplot(subset(d, Q_cutoff=="All reads"), aes(x = mean_qscore_template)) + + geom_histogram(bins = 200, fill="steelblue") + + xlim(q.lims) + + geom_vline(xintercept=q) + + #theme(text = element_text(size = 15)) + + theme_linedraw() + + xlab("Mean Q score of read") + + ylab("Number of reads") + ggsave(filename = file.path(output.dir, "q_histogram.png"), width = 7, height = 4, dpi=300, plot = p2) + ggsave(filename = file.path(output.dir, "q_histogram.screen.png"), width = 7, height = 4, dpi=90, plot = p2) + + + flog.info(paste(sep = "", flowcell, ": plotting flowcell overview")) + + df <- data.frame(row=integer(),col=integer(),bin=integer(), quality=numeric()) + + # swap rows and columns + for (b in c(1,5,9)) { + m <- matrix(data=rep(0,512),nrow=16,ncol=32) + for (r in 1:32) { + for (c in 1:16) { + sub <- subset(d, Q_cutoff=="All reads" & col == c & row == r & start_bin == b) + Q <- median(sub$mean_qscore_template) + df <- rbind(df, list(row=r, col=c, bin=b, quality=Q) ) + } + } + } + + df$bin <- factor(df$bin) + levels(df$bin) <- c("Run start", "Run middle", "Run end") + p5 = ggplot(df, aes(col, row)) + + geom_tile(aes(fill = quality), color="white") + + scale_fill_gradient(low = "white", high="steelblue") + + xlab("Column") + + ylab("Row") + + facet_wrap(~bin, ncol=3) + + theme_linedraw() + + theme(panel.grid.minor = element_blank(), panel.grid.major=element_blank()) + #theme(text = element_text(size = 40), axis.text.x = element_text(size=12), axis.text.y = element_text(size=12), legend.text=element_text(size=12)) + ggsave(filename = file.path(output.dir, "flowcell_overview.png"), width = 7, height = 4, dpi=300, plot = p5) + ggsave(filename = file.path(output.dir, "flowcell_overview.screen.png"), width = 7, height = 4, dpi=90, plot = p5) + + flog.info(paste(sep = "", flowcell, ": plotting cumulative yield summary")) + p5a = ggplot(d, aes(x=start_time/3600, y=cumulative.bases.time, colour = Q_cutoff)) + + geom_line(size = 1) + + geom_vline(xintercept = muxes, colour = 'red', linetype = 'dashed', alpha = 0.5) + + xlab("Elapsed time (hrs)") + + ylab("Total yield in bases") + + scale_colour_discrete(guide = guide_legend(title = "Reads")) + + scale_x_continuous() + + theme_linedraw() + #theme(text = element_text(size = 15)) + + ggsave(filename = file.path(output.dir, "cumulative_yield.png"), width = 7, height = 4, dpi=300, plot = p5a) + ggsave(filename = file.path(output.dir, "cumulative_yield.screen.png"), width = 7, height = 4, dpi=90, plot = p5a) + + flog.info(paste(sep = "", flowcell, ": plotting flowcell yield summary")) + p6 = ggplot(d, aes(x=sequence_length_template, y=cumulative.bases, colour = Q_cutoff)) + + geom_line(size = 1) + + xlab("Minimum read length") + + ylab("Total yield in bases") + + scale_colour_discrete(guide = guide_legend(title = "Reads")) + + #theme(text = element_text(size = 15)) + theme_linedraw() + xmax = max(d$sequence_length_template[which(d$cumulative.bases > 0.01 * max(d$cumulative.bases))]) + p6 = p6 + scale_x_continuous(limits = c(0, xmax)) + + ggsave(filename = file.path(output.dir, "yield_summary.png"), width = 7, height = 4, dpi=300, plot = p6) + ggsave(filename = file.path(output.dir, "yield_summary.screen.png"), width = 7, height = 4, dpi=90, plot = p6) + + flog.info(paste(sep = "", flowcell, ": plotting sequence length over time")) + e = subset(d, Q_cutoff=="All reads") + e$Q = paste(">=", q, sep="") + e$Q[which(e$mean_qscore_template<q)] = paste("<", q, sep="") + p7 = ggplot(e, aes(x=start_time/3600, y=sequence_length_template, colour = Q, group = Q)) + + geom_vline(xintercept = muxes, colour = 'red', linetype = 'dashed', alpha = 0.5) + + geom_smooth() + + xlab("Elapsed time (hrs)") + + ylab("Mean read length") + + ylim(0, NA) + + theme_linedraw() + ggsave(filename = file.path(output.dir, "length_by_hour.png"), width = 7, height = 4, dpi=300, plot = p7) + ggsave(filename = file.path(output.dir, "length_by_hour.screen.png"), width = 7, height = 4, dpi=90, plot = p7) + + flog.info(paste(sep = "", flowcell, ": plotting Q score over time")) + p8 = ggplot(e, aes(x=start_time/3600, y=mean_qscore_template, colour = Q, group = Q)) + + geom_vline(xintercept = muxes, colour = 'red', linetype = 'dashed', alpha = 0.5) + + geom_smooth() + + xlab("Elapsed time (hrs)") + + ylab("Mean Q score") + + ylim(0, NA) + + theme_linedraw() + ggsave(filename = file.path(output.dir, "q_by_hour.png"), width = 7, height = 4, dpi=300, plot = p8) + ggsave(filename = file.path(output.dir, "q_by_hour.screen.png"), width = 7, height = 4, dpi=90, plot = p8) + + flog.info(paste(sep = "", flowcell, ": plotting reads per hour")) + f = d[c("hour", "reads_per_hour", "Q_cutoff")] + f = f[!duplicated(f),] + g = subset(f, Q_cutoff=="All reads") + h = subset(f, Q_cutoff==q_title) + max = max(f$hour) + # all of this is just to fill in hours with no reads recorded + all = 0:max + add.g = all[which(all %in% g$hour == FALSE)] + if(length(add.g)>0){ + add.g = data.frame(hour = add.g, reads_per_hour = 0, Q_cutoff = "All reads") + g = rbind(g, add.g) + } + add.h = all[which(all %in% h$hour == FALSE)] + if(length(add.h)>0){ + add.h = data.frame(hour = add.h, reads_per_hour = 0, Q_cutoff = q_title) + h = rbind(h, add.h) + } + i = rbind(g, h) + i$Q_cutoff = as.character(i$Q_cutoff) + i$Q_cutoff[which(i$Q_cutoff==q_title)] = paste("Q>=", q, sep="") + p9 = ggplot(i, aes(x=hour, y=reads_per_hour, colour = Q_cutoff, group = Q_cutoff)) + + geom_vline(xintercept = muxes, colour = 'red', linetype = 'dashed', alpha = 0.5) + + #geom_point() + + geom_line(size=1) + + xlab("Elapsed time (hrs)") + + ylab("Number of reads per hour") + + ylim(0, NA) + + scale_color_discrete(guide = guide_legend(title = "Reads")) + + theme_linedraw() + ggsave(filename = file.path(output.dir, "reads_per_hour.png"), width = 7, height = 4, dpi=300, plot = p9) + ggsave(filename = file.path(output.dir, "reads_per_hour.screen.png"), width = 7, height = 4, dpi=90, plot = p9) + + flog.info(paste(sep = "", flowcell, ": plotting read length vs. q score scatterplot")) + p10 = ggplot(subset(d, Q_cutoff=="All reads"), aes(x = sequence_length_template, y = mean_qscore_template, colour = events_per_base)) + + geom_point(alpha=0.05, size = 0.4) + + scale_x_log10(minor_breaks=log10_minor_break()) + + labs(colour='Events per base\n(log scale)\n') + + theme(text = element_text(size = 15)) + + xlab("Read length") + + ylab("Mean Q score of read") + + theme_linedraw() + + if(max(d$events_per_base, na.rm=T)>0){ + # a catch for 1D2 runs which don't have events per base + p10 = p10 + scale_colour_distiller(trans = "log", labels = scientific, palette = 'Spectral') + } + + ggsave(filename = file.path(output.dir, "length_vs_q.png"), width = 7, height = 4, dpi=300, plot = p10) + ggsave(filename = file.path(output.dir, "length_vs_q.screen.png"), width = 7, height = 4, dpi=90, plot = p10) + + return(d) +} + +if(file_test("-f", input.file)==TRUE){ + # if it's an existing file (not a folder) just run one analysis + d = single.flowcell(input.file, output.dir, q) +}else{ + #WTF + flog.warn(paste("Could find a sequencing summary file in your input which was: ", + input.file, + "\nThe input must be a sequencing_summary.txt file produced by Albacore")) +}