comparison allele-counts.py @ 6:df3b28364cd2

allele-counts.{py,xml}: Add strand bias, documentation updates.
author nicksto <nmapsy@gmail.com>
date Wed, 09 Dec 2015 11:20:51 -0500
parents 31361191d2d2
children 411adeff1eec
comparison
equal deleted inserted replaced
5:31361191d2d2 6:df3b28364cd2
1 #!/usr/bin/python 1 #!/usr/bin/python
2 # This parses the output of Dan's "Naive Variant Detector" (previously, 2 """
3 # "BAM Coverage"). It was forked from the code of "bam-coverage.py". 3 Run with -h option or see DESCRIPTION for description.
4 # Or, to put it briefly, 4 This script's functionality is being obsoleted by the new, and much more sanely
5 # cat variants.vcf | grep -v '^#' | cut -f 10 | cut -d ':' -f 4 | tr ',=' '\t:' 5 written, nvc-filter.py.
6 # 6
7 # New in this version: 7 New in this version:
8 # Made header line customizable 8 - Calculate strand bias
9 # - separate from internal column labels, which are used as dict keys 9 - Rename MINOR.FREQ.PERC to MAF
10
11 Naive Variant Caller variant count parsing one-liner:
12 $ cat variants.vcf | grep -v '^#' | cut -f 10 | cut -d ':' -f 4 | tr ',=' '\t:'
13 """
14 from __future__ import division
10 import os 15 import os
11 import sys 16 import sys
17 import errno
12 import random 18 import random
13 from optparse import OptionParser 19 from optparse import OptionParser
14 20
15 COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles', 'major', 'minor', 'freq'] #, 'bias'] 21 COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles',
16 COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS'] 22 'major', 'minor', 'freq', 'bias']
23 COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES',
24 'MAJOR', 'MINOR', 'MAF', 'BIAS']
17 CANONICAL_VARIANTS = ['A', 'C', 'G', 'T'] 25 CANONICAL_VARIANTS = ['A', 'C', 'G', 'T']
18 USAGE = """Usage: %prog [options] -i variants.vcf -o alleles.csv 26 USAGE = """Usage: %prog [options] -i variants.vcf -o alleles.tsv
19 cat variants.vcf | %prog [options] > alleles.csv""" 27 cat variants.vcf | %prog [options] > alleles.tsv"""
20 OPT_DEFAULTS = {'infile':'-', 'outfile':'-', 'freq_thres':1.0, 'covg_thres':100, 28 OPT_DEFAULTS = {'infile':'-', 'outfile':'-', 'freq_thres':1.0, 'covg_thres':100,
21 'print_header':False, 'stdin':False, 'stranded':False, 'no_filter':False, 29 'print_header':False, 'stdin':False, 'stranded':False, 'no_filter':False,
22 'debug_loc':'', 'seed':''} 30 'debug_loc':'', 'seed':''}
23 DESCRIPTION = """This will parse the VCF output of Dan's "Naive Variant Caller" (aka "BAM Coverage") Galaxy tool. For each position reported, it counts the number of reads of each base, determines the major allele, minor allele (second most frequent variant), and number of alleles above a threshold. So currently it only considers SNVs (ACGT), including in the coverage figure. By default it reads from stdin and prints to stdout.""" 31 DESCRIPTION = """This will parse the VCF output of the "Naive Variant Caller"
32 (aka "BAM Coverage") Galaxy tool. For each position reported, it counts the
33 number of reads of each base, determines the major allele, minor allele (second
34 most frequent variant), and number of alleles above a threshold. So currently
35 it only considers SNVs (ACGT), including in the coverage figure. By default it
36 reads from stdin and prints to stdout.
37 Prints a tab-delimited set of statistics to stdout.
38 To print output column labels, run "$ echo -n | ./allele-counts.py -H".
39 The columns are: 1:SAMPLE 2:CHR 3:POS 4:A 5:C 6:G 7:T 8:CVRG 9:ALLELES 10:MAJOR
40 11:MINOR 12:MAF 13:BIAS,
41 unless the --stranded option is used, in which case they are:
42 1:SAMPLE 2:CHR 3:POS 4:+A 5:+C 6:+G 7:+T 8:-A 9:-C 10:-G 11:-T 12:CVRG
43 13:ALLELES 14:MAJOR 15:MINOR 16:MAF 17:BIAS.
44 """
24 EPILOG = """Requirements: 45 EPILOG = """Requirements:
25 The input VCF must report the variants for each strand. 46 The input VCF must report the variants for each strand.
26 The variants should be case-sensitive (e.g. all capital base letters). 47 The variants should be case-sensitive (e.g. all capital base letters).
27 Strand bias: Both strands must show the same bases passing the frequency threshold (but not necessarily in the same order). If the site fails this test, the number of alleles is reported as 0.""" 48 Strand bias: Both strands must show the same bases passing the frequency
28 49 threshold (but not necessarily in the same order). If the site fails this test,
50 the number of alleles is reported as 0."""
29 51
30 def get_options(defaults, usage, description='', epilog=''): 52 def get_options(defaults, usage, description='', epilog=''):
31 """Get options, print usage text.""" 53 """Get options, print usage text."""
32 54
33 parser = OptionParser(usage=usage, description=description, epilog=epilog) 55 parser = OptionParser(usage=usage, description=description, epilog=epilog)
76 (options, args) = get_options(OPT_DEFAULTS, USAGE, DESCRIPTION, EPILOG) 98 (options, args) = get_options(OPT_DEFAULTS, USAGE, DESCRIPTION, EPILOG)
77 99
78 infile = options.infile 100 infile = options.infile
79 outfile = options.outfile 101 outfile = options.outfile
80 print_header = options.print_header 102 print_header = options.print_header
81 freq_thres = options.freq_thres / 100.0 103 freq_thres = options.freq_thres / 100
82 covg_thres = options.covg_thres 104 covg_thres = options.covg_thres
83 stranded = options.stranded 105 stranded = options.stranded
84 debug_loc = options.debug_loc 106 debug_loc = options.debug_loc
85 seed = options.seed 107 seed = options.seed
86 108
100 print_pos = '' 122 print_pos = ''
101 if len(coords) > 1: print_pos = coords[1] 123 if len(coords) > 1: print_pos = coords[1]
102 if len(coords) > 2: print_sample = coords[2] 124 if len(coords) > 2: print_sample = coords[2]
103 125
104 # set infile_handle to either stdin or the input file 126 # set infile_handle to either stdin or the input file
127 global infile_handle
105 if infile == OPT_DEFAULTS.get('infile'): 128 if infile == OPT_DEFAULTS.get('infile'):
106 infile_handle = sys.stdin 129 infile_handle = sys.stdin
107 sys.stderr.write("Reading from standard input..\n") 130 sys.stderr.write("Reading from standard input..\n")
108 else: 131 else:
109 if os.path.exists(infile): 132 if os.path.exists(infile):
110 infile_handle = open(infile, 'r') 133 infile_handle = open(infile, 'r')
111 else: 134 else:
112 fail('Error: Input VCF file '+infile+' not found.') 135 fail('Error: Input VCF file '+infile+' not found.')
113 136
114 # set outfile_handle to either stdout or the output file 137 # set outfile_handle to either stdout or the output file
138 global outfile_handle
115 if outfile == OPT_DEFAULTS.get('outfile'): 139 if outfile == OPT_DEFAULTS.get('outfile'):
116 outfile_handle = sys.stdout 140 outfile_handle = sys.stdout
117 else: 141 else:
118 try: 142 try:
119 outfile_handle = open(outfile, 'w') 143 outfile_handle = open(outfile, 'w')
120 except IOError, e: 144 except IOError:
121 fail('Error: The given output filename '+outfile+' could not be opened.') 145 fail('Error: The given output filename '+outfile+' could not be opened.')
122 146
123 # Take care of column names, print header 147 # Take care of column names, print header
124 if len(COLUMNS) != len(COLUMN_LABELS): 148 if len(COLUMNS) != len(COLUMN_LABELS):
125 fail('Error: Internal column names list do not match column labels list.') 149 fail('Error: Internal column names list do not match column labels list.')
167 freq_thres, covg_thres, stranded, debug=debug) 191 freq_thres, covg_thres, stranded, debug=debug)
168 192
169 if debug and site_summary[0]['print']: 193 if debug and site_summary[0]['print']:
170 print line.split('\t')[9].split(':')[-1] 194 print line.split('\t')[9].split(':')[-1]
171 195
172 print_site(outfile_handle, site_summary, COLUMNS) 196 try:
197 print_site(outfile_handle, site_summary, COLUMNS)
198 except IOError as ioe:
199 if ioe.errno == errno.EPIPE:
200 cleanup()
201 sys.exit(0)
173 202
174 # close any open filehandles 203 # close any open filehandles
175 if infile_handle is not sys.stdin: 204 cleanup()
176 infile_handle.close()
177 if outfile_handle is not sys.stdout:
178 outfile_handle.close()
179 205
180 # keeps Galaxy from giving an error if there were messages on stderr 206 # keeps Galaxy from giving an error if there were messages on stderr
181 sys.exit(0) 207 sys.exit(0)
182 208
183 209
245 if variant[0] != '-' and variant[0] != '+': 271 if variant[0] != '-' and variant[0] != '+':
246 fail("Error in input VCF: variant data not strand-specific. " 272 fail("Error in input VCF: variant data not strand-specific. "
247 +"Failed on line:\n"+line) 273 +"Failed on line:\n"+line)
248 try: 274 try:
249 variant_counts[variant] = int(float(reads)) 275 variant_counts[variant] = int(float(reads))
250 except ValueError, e: 276 except ValueError:
251 fail("Error in input VCF: Variant count not a valid number. Failed on variant count string '"+reads+"'\nIn the following line:\n"+line) 277 fail("Error in input VCF: Variant count not a valid number. Failed on variant count string '"+reads+"'\nIn the following line:\n"+line)
252 278
253 sample_counts[sample_names[i]] = variant_counts 279 sample_counts[sample_names[i]] = variant_counts
254 280
255 site['samples'] = sample_counts 281 site['samples'] = sample_counts
328 if debug: print "ranked +-: "+str(ranked_bases) 354 if debug: print "ranked +-: "+str(ranked_bases)
329 355
330 sample['coverage'] = coverage 356 sample['coverage'] = coverage
331 try: 357 try:
332 sample['major'] = ranked_bases[0][0] 358 sample['major'] = ranked_bases[0][0]
333 except IndexError, e: 359 except IndexError:
334 sample['major'] = '.' 360 sample['major'] = '.'
335 try: 361 try:
336 sample['minor'] = ranked_bases[1][0] 362 sample['minor'] = ranked_bases[1][0]
337 sample['freq'] = round(ranked_bases[1][1]/float(coverage), 5) 363 sample['freq'] = round(ranked_bases[1][1]/coverage, 5)
338 except IndexError, e: 364 except IndexError:
339 sample['minor'] = '.' 365 sample['minor'] = '.'
340 sample['freq'] = 0.0 366 sample['freq'] = 0.0
367
368 # Now that we've decided major and minor, we can calculate strand bias
369 bias = get_strand_bias(sample, variants)
370 if bias is None:
371 sample['bias'] = '.'
372 else:
373 sample['bias'] = round(bias, 5)
341 374
342 site_summary.append(sample) 375 site_summary.append(sample)
343 376
344 return site_summary 377 return site_summary
345 378
391 424
392 if debug: 425 if debug:
393 print 'coverage: '+str(coverage)+', freq_thres: '+str(freq_thres) 426 print 'coverage: '+str(coverage)+', freq_thres: '+str(freq_thres)
394 for variant in variant_counts: 427 for variant in variant_counts:
395 print (variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '+ 428 print (variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '+
396 str(variant[1]/float(coverage))) 429 str(variant[1]/coverage))
397 430
398 # remove bases below the frequency threshold 431 # remove bases below the frequency threshold
399 if freq_thres > 0: 432 if freq_thres > 0:
400 variant_counts = [variant for variant in variant_counts 433 variant_counts = [variant for variant in variant_counts
401 if variant[1]/float(coverage) >= freq_thres] 434 if variant[1]/coverage >= freq_thres]
402 435
403 return variant_counts 436 return variant_counts
404 437
405 438
406 def count_alleles(variant_counts, freq_thres, debug=False): 439 def count_alleles(variant_counts, freq_thres, debug=False):
430 allele_count = len(alleles_plus) 463 allele_count = len(alleles_plus)
431 464
432 return allele_count 465 return allele_count
433 466
434 467
468 def get_strand_bias(sample, variants):
469 """Based on method 1 (SB) of Guo et al., 2012
470 If there a denominator would be 0, there is no valid result and this will
471 return None. This occurs when there are no reads on one of the strands, or
472 when there are no minor allele reads."""
473 # using same variable names as in paper
474 a = variants.get('+'+sample['major'], 0) # forward major allele
475 b = variants.get('+'+sample['minor'], 0) # forward minor allele
476 c = variants.get('-'+sample['major'], 0) # reverse major allele
477 d = variants.get('-'+sample['minor'], 0) # reverse minor allele
478 # no minor allele reads
479 try:
480 return abs(b/(a+b) - d/(c+d)) / ((b+d) / (a+b+c+d))
481 except ZeroDivisionError:
482 return None
483
484
435 def print_site(filehandle, site, columns): 485 def print_site(filehandle, site, columns):
436 """Print the output lines for one site (one per sample). 486 """Print the output lines for one site (one per sample).
437 filehandle must be open.""" 487 filehandle must be open."""
438 for sample in site: 488 for sample in site:
439 if sample['print']: 489 if sample['print']:
440 fields = [str(sample.get(column)) for column in columns] 490 fields = [str(sample.get(column)) for column in columns]
441 filehandle.write('\t'.join(fields)+"\n") 491 filehandle.write('\t'.join(fields)+"\n")
442 492
443 493
444 def fail(message): 494 def fail(message):
495 cleanup()
445 sys.stderr.write(message+'\n') 496 sys.stderr.write(message+'\n')
446 sys.exit(1) 497 sys.exit(1)
447 498
448 499
500 def cleanup():
501 if isinstance(infile_handle, file):
502 infile_handle.close()
503 if isinstance(outfile_handle, file):
504 outfile_handle.close()
505
506
449 if __name__ == "__main__": 507 if __name__ == "__main__":
450 main() 508 main()