comparison dropletBarcodePlot.R @ 0:04f32429dcf2 draft

planemo upload for repository https://github.com/ebi-gene-expression-group/container-galaxy-sc-tertiary/tree/develop/tools/droplet-rank-plot/.shed.yml commit a785b79f2b5689aba87c0f7072897bb23f6bda76
author ebi-gxa
date Fri, 08 Nov 2019 09:08:14 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:04f32429dcf2
1 #!/usr/bin/env Rscript
2
3 # This script parses the GTF file to create a feature-wise annotation file with
4 # mitochondrial features flagged, to assist in annotation and QC of single-cell
5 # expression data analysis.
6
7 suppressPackageStartupMessages(require(optparse))
8 suppressPackageStartupMessages(require(ggplot2))
9 suppressPackageStartupMessages(require(gridExtra))
10 suppressPackageStartupMessages(require(DropletUtils))
11 suppressPackageStartupMessages(require(Matrix))
12
13 die <- function(message){
14 write(message, stderr())
15 q(status = 1)
16 }
17
18 option_list = list(
19 make_option(
20 c("-b", "--barcode-frequencies"),
21 action = "store",
22 default = NA,
23 type = 'character',
24 help = "Path to a two-column tab-delimited file, with barcodes in the first column and frequencies in the second (ignored if --mtx-matrix supplied)"
25 ),
26 make_option(
27 c("-m", "--mtx-matrix"),
28 action = "store",
29 default = NA,
30 type = 'character',
31 help = 'Matrix-market format matrix file, with cells by column (overrides --barcode-frequencies if supplied)'
32 ),
33 make_option(
34 c("-r", "--cells-by-row"),
35 action = "store_true",
36 default = FALSE,
37 type = 'logical',
38 help = 'For use with --mtx-matrix: force interpretation of matrix to assume cells are by row, rather than by column (default)'
39 ),
40 make_option(
41 c("-l", "--label"),
42 action = "store",
43 default = '',
44 type = 'character',
45 help = 'Label to use in plot'
46 ),
47 make_option(
48 c("-d", "--density-bins"),
49 action = "store",
50 default = 50,
51 type = 'numeric',
52 help = "Number of bins used to calculate density plot"
53 ),
54 make_option(
55 c("-y", "--roryk-multiplier"),
56 action = "store",
57 default = 1.5,
58 type = 'numeric',
59 help = "Above-baseline multiplier to calculate roryk threshold"
60 ),
61 make_option(
62 c("-o", "--output-plot"),
63 action = "store",
64 default = 'barcode_plot.png',
65 type = 'character',
66 help = "File path for output plot"
67 ),
68 make_option(
69 c("-t", "--output-thresholds"),
70 action = "store",
71 default = 'barcode_thresholds.txt',
72 type = 'character',
73 help = "File path for output file containing calculted thresholds"
74 )
75 )
76
77 opt <- parse_args(OptionParser(option_list = option_list), convert_hyphens_to_underscores = TRUE)
78
79 # Process inputs dependent on what has been provided
80
81 if (is.na(opt$mtx_matrix)){
82 if (is.na(opt$barcode_frequencies)){
83 die('ERROR: must supply --mtx-matrix or --barcode-frequencies')
84 }else if (! file.exists(opt$barcode_frequencies)){
85 die(paste('ERROR: barcode frequencies file', opt$barcode_frequencies, 'does not exist'))
86 }else{
87 barcode_counts <- read.delim(opt$barcode_frequencies, header = FALSE)
88 }
89 }else if (! file.exists(opt$mtx_matrix)){
90 die(paste('ERROR: MTX matrix file', opt$mtx_matrix, 'does not exist'))
91 }else{
92 result_matrix <- Matrix::readMM(opt$mtx_matrix)
93 if (opt$cells_by_row){
94 barcode_counts <- data.frame(V1 = 1:nrow(result_matrix), V2=Matrix::rowSums(result_matrix))
95 }else{
96 barcode_counts <- data.frame(V1 = 1:ncol(result_matrix), V2=Matrix::colSums(result_matrix))
97 }
98 }
99
100 # Pick a cutoff on count as per https://github.com/COMBINE-lab/salmon/issues/362#issuecomment-490160480
101
102 pick_roryk_cutoff = function(bcs, above_baseline_multiplier = 1.5){
103 bcs_hist = hist(log10(bcs), plot=FALSE, n=opt$density_bins)
104 mids = bcs_hist$mids
105 vals = bcs_hist$count
106 wdensity = vals * (10^mids) / sum(vals * (10^mids))
107 baseline <- median(wdensity)
108
109 # Find highest density in upper half of barcode distribution
110
111 peak <- which(wdensity == max(wdensity[((length(wdensity)+1)/2):length(wdensity)]))
112
113 # Cutoff is the point before the peak at which density falls below the multiplier of baseline
114
115 10^mids[max(which(wdensity[1:peak] < (above_baseline_multiplier*baseline)))]
116 }
117
118 # Plot densities
119
120 barcode_density_plot = function(bcs, roryk_cutoff, knee, inflection, name = ' ') {
121 bcs_hist = hist(log10(bcs), plot=FALSE, n=opt$density_bins)
122 counts = bcs_hist$count
123 mids = bcs_hist$mids
124 y = counts * (10^mids) / sum(counts * (10^mids))
125 qplot(y, 10^mids) + geom_point() + theme_bw() + ggtitle(name) + ylab('Count') + xlab ('Density') +
126 geom_hline(aes(yintercept = roryk_cutoff, color = paste('roryk_cutoff =', length(which(bcs > roryk_cutoff)), 'cells'))) +
127 geom_hline(aes(yintercept = inflection, color = paste('dropletutils_inflection =', length(which(bcs > inflection)), 'cells'))) +
128 geom_hline(aes(yintercept = knee, color = paste('dropletutils_knee =', length(which(bcs > knee)), 'cells'))) +
129 scale_y_continuous(trans='log10') + theme(axis.title.y=element_blank()) + labs(color='Thresholds')
130 }
131
132 # Plot a more standard barcode rank plot
133
134 barcode_rank_plot <- function(br.out, roryk_total_cutoff, knee, inflection, name='no name'){
135 ggplot(data.frame(br.out), aes(x=rank, y=total)) + geom_line() + scale_x_continuous(trans='log10') + scale_y_continuous(trans='log10') + theme_bw() +
136 geom_hline(aes(yintercept = knee, color = 'dropletutils_knee')) +
137 geom_hline(aes(yintercept = inflection, color = 'dropletutils_inflection')) +
138 geom_hline(aes(yintercept = roryk_total_cutoff, color = 'roryk_cutoff')) +
139 ggtitle(name) + ylab('Count') + xlab('Rank') + theme(legend.position = "none")
140 }
141
142 # Sort barcodes by descending frequency
143
144 barcode_counts <- barcode_counts[order(barcode_counts$V2, decreasing = TRUE), ]
145
146 roryk_count_cutoff <- pick_roryk_cutoff(barcode_counts$V2, opt$roryk_multiplier)
147
148 # Run dropletUtils' barcodeRanks to get knee etc
149 br.out <- barcodeRanks(t(barcode_counts[,2,drop=FALSE]))
150
151 dropletutils_knee <- metadata(br.out)$knee
152 dropletutils_inflection <- metadata(br.out)$inflection
153
154 plot_label <- paste(format(nrow(barcode_counts), big.mark = ','), 'cell barcodes')
155 if ((! is.na(opt$label)) && opt$label != ''){
156 plot_label <- paste0(opt$label, ': ', plot_label)
157 }
158
159 plots <- list(
160 dropletutils = barcode_rank_plot(br.out, roryk_count_cutoff, dropletutils_knee, dropletutils_inflection, name = plot_label),
161 roryk = barcode_density_plot(barcode_counts$V2, roryk_count_cutoff, dropletutils_knee, dropletutils_inflection, name = ' ')
162 )
163
164 # Create output plot
165 png(width = 1000, height = 600, file=opt$output_plot)
166 grid.arrange(plots$dropletutils, plots$roryk, nrow=1)
167 dev.off()
168
169 # Return calculated thresholds
170 write.table(data.frame(dropletutils_knee = dropletutils_knee, dropletutils_inflection = dropletutils_inflection, roryk=roryk_count_cutoff), file = opt$output_thresholds, row.names = FALSE, quote = FALSE)