Mercurial > repos > nick > allele_counts
comparison allele-counts.py @ 6:df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
author | nicksto <nmapsy@gmail.com> |
---|---|
date | Wed, 09 Dec 2015 11:20:51 -0500 |
parents | 31361191d2d2 |
children | 411adeff1eec |
comparison
equal
deleted
inserted
replaced
5:31361191d2d2 | 6:df3b28364cd2 |
---|---|
1 #!/usr/bin/python | 1 #!/usr/bin/python |
2 # This parses the output of Dan's "Naive Variant Detector" (previously, | 2 """ |
3 # "BAM Coverage"). It was forked from the code of "bam-coverage.py". | 3 Run with -h option or see DESCRIPTION for description. |
4 # Or, to put it briefly, | 4 This script's functionality is being obsoleted by the new, and much more sanely |
5 # cat variants.vcf | grep -v '^#' | cut -f 10 | cut -d ':' -f 4 | tr ',=' '\t:' | 5 written, nvc-filter.py. |
6 # | 6 |
7 # New in this version: | 7 New in this version: |
8 # Made header line customizable | 8 - Calculate strand bias |
9 # - separate from internal column labels, which are used as dict keys | 9 - Rename MINOR.FREQ.PERC to MAF |
10 | |
11 Naive Variant Caller variant count parsing one-liner: | |
12 $ cat variants.vcf | grep -v '^#' | cut -f 10 | cut -d ':' -f 4 | tr ',=' '\t:' | |
13 """ | |
14 from __future__ import division | |
10 import os | 15 import os |
11 import sys | 16 import sys |
17 import errno | |
12 import random | 18 import random |
13 from optparse import OptionParser | 19 from optparse import OptionParser |
14 | 20 |
15 COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles', 'major', 'minor', 'freq'] #, 'bias'] | 21 COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles', |
16 COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS'] | 22 'major', 'minor', 'freq', 'bias'] |
23 COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', | |
24 'MAJOR', 'MINOR', 'MAF', 'BIAS'] | |
17 CANONICAL_VARIANTS = ['A', 'C', 'G', 'T'] | 25 CANONICAL_VARIANTS = ['A', 'C', 'G', 'T'] |
18 USAGE = """Usage: %prog [options] -i variants.vcf -o alleles.csv | 26 USAGE = """Usage: %prog [options] -i variants.vcf -o alleles.tsv |
19 cat variants.vcf | %prog [options] > alleles.csv""" | 27 cat variants.vcf | %prog [options] > alleles.tsv""" |
20 OPT_DEFAULTS = {'infile':'-', 'outfile':'-', 'freq_thres':1.0, 'covg_thres':100, | 28 OPT_DEFAULTS = {'infile':'-', 'outfile':'-', 'freq_thres':1.0, 'covg_thres':100, |
21 'print_header':False, 'stdin':False, 'stranded':False, 'no_filter':False, | 29 'print_header':False, 'stdin':False, 'stranded':False, 'no_filter':False, |
22 'debug_loc':'', 'seed':''} | 30 'debug_loc':'', 'seed':''} |
23 DESCRIPTION = """This will parse the VCF output of Dan's "Naive Variant Caller" (aka "BAM Coverage") Galaxy tool. For each position reported, it counts the number of reads of each base, determines the major allele, minor allele (second most frequent variant), and number of alleles above a threshold. So currently it only considers SNVs (ACGT), including in the coverage figure. By default it reads from stdin and prints to stdout.""" | 31 DESCRIPTION = """This will parse the VCF output of the "Naive Variant Caller" |
32 (aka "BAM Coverage") Galaxy tool. For each position reported, it counts the | |
33 number of reads of each base, determines the major allele, minor allele (second | |
34 most frequent variant), and number of alleles above a threshold. So currently | |
35 it only considers SNVs (ACGT), including in the coverage figure. By default it | |
36 reads from stdin and prints to stdout. | |
37 Prints a tab-delimited set of statistics to stdout. | |
38 To print output column labels, run "$ echo -n | ./allele-counts.py -H". | |
39 The columns are: 1:SAMPLE 2:CHR 3:POS 4:A 5:C 6:G 7:T 8:CVRG 9:ALLELES 10:MAJOR | |
40 11:MINOR 12:MAF 13:BIAS, | |
41 unless the --stranded option is used, in which case they are: | |
42 1:SAMPLE 2:CHR 3:POS 4:+A 5:+C 6:+G 7:+T 8:-A 9:-C 10:-G 11:-T 12:CVRG | |
43 13:ALLELES 14:MAJOR 15:MINOR 16:MAF 17:BIAS. | |
44 """ | |
24 EPILOG = """Requirements: | 45 EPILOG = """Requirements: |
25 The input VCF must report the variants for each strand. | 46 The input VCF must report the variants for each strand. |
26 The variants should be case-sensitive (e.g. all capital base letters). | 47 The variants should be case-sensitive (e.g. all capital base letters). |
27 Strand bias: Both strands must show the same bases passing the frequency threshold (but not necessarily in the same order). If the site fails this test, the number of alleles is reported as 0.""" | 48 Strand bias: Both strands must show the same bases passing the frequency |
28 | 49 threshold (but not necessarily in the same order). If the site fails this test, |
50 the number of alleles is reported as 0.""" | |
29 | 51 |
30 def get_options(defaults, usage, description='', epilog=''): | 52 def get_options(defaults, usage, description='', epilog=''): |
31 """Get options, print usage text.""" | 53 """Get options, print usage text.""" |
32 | 54 |
33 parser = OptionParser(usage=usage, description=description, epilog=epilog) | 55 parser = OptionParser(usage=usage, description=description, epilog=epilog) |
76 (options, args) = get_options(OPT_DEFAULTS, USAGE, DESCRIPTION, EPILOG) | 98 (options, args) = get_options(OPT_DEFAULTS, USAGE, DESCRIPTION, EPILOG) |
77 | 99 |
78 infile = options.infile | 100 infile = options.infile |
79 outfile = options.outfile | 101 outfile = options.outfile |
80 print_header = options.print_header | 102 print_header = options.print_header |
81 freq_thres = options.freq_thres / 100.0 | 103 freq_thres = options.freq_thres / 100 |
82 covg_thres = options.covg_thres | 104 covg_thres = options.covg_thres |
83 stranded = options.stranded | 105 stranded = options.stranded |
84 debug_loc = options.debug_loc | 106 debug_loc = options.debug_loc |
85 seed = options.seed | 107 seed = options.seed |
86 | 108 |
100 print_pos = '' | 122 print_pos = '' |
101 if len(coords) > 1: print_pos = coords[1] | 123 if len(coords) > 1: print_pos = coords[1] |
102 if len(coords) > 2: print_sample = coords[2] | 124 if len(coords) > 2: print_sample = coords[2] |
103 | 125 |
104 # set infile_handle to either stdin or the input file | 126 # set infile_handle to either stdin or the input file |
127 global infile_handle | |
105 if infile == OPT_DEFAULTS.get('infile'): | 128 if infile == OPT_DEFAULTS.get('infile'): |
106 infile_handle = sys.stdin | 129 infile_handle = sys.stdin |
107 sys.stderr.write("Reading from standard input..\n") | 130 sys.stderr.write("Reading from standard input..\n") |
108 else: | 131 else: |
109 if os.path.exists(infile): | 132 if os.path.exists(infile): |
110 infile_handle = open(infile, 'r') | 133 infile_handle = open(infile, 'r') |
111 else: | 134 else: |
112 fail('Error: Input VCF file '+infile+' not found.') | 135 fail('Error: Input VCF file '+infile+' not found.') |
113 | 136 |
114 # set outfile_handle to either stdout or the output file | 137 # set outfile_handle to either stdout or the output file |
138 global outfile_handle | |
115 if outfile == OPT_DEFAULTS.get('outfile'): | 139 if outfile == OPT_DEFAULTS.get('outfile'): |
116 outfile_handle = sys.stdout | 140 outfile_handle = sys.stdout |
117 else: | 141 else: |
118 try: | 142 try: |
119 outfile_handle = open(outfile, 'w') | 143 outfile_handle = open(outfile, 'w') |
120 except IOError, e: | 144 except IOError: |
121 fail('Error: The given output filename '+outfile+' could not be opened.') | 145 fail('Error: The given output filename '+outfile+' could not be opened.') |
122 | 146 |
123 # Take care of column names, print header | 147 # Take care of column names, print header |
124 if len(COLUMNS) != len(COLUMN_LABELS): | 148 if len(COLUMNS) != len(COLUMN_LABELS): |
125 fail('Error: Internal column names list do not match column labels list.') | 149 fail('Error: Internal column names list do not match column labels list.') |
167 freq_thres, covg_thres, stranded, debug=debug) | 191 freq_thres, covg_thres, stranded, debug=debug) |
168 | 192 |
169 if debug and site_summary[0]['print']: | 193 if debug and site_summary[0]['print']: |
170 print line.split('\t')[9].split(':')[-1] | 194 print line.split('\t')[9].split(':')[-1] |
171 | 195 |
172 print_site(outfile_handle, site_summary, COLUMNS) | 196 try: |
197 print_site(outfile_handle, site_summary, COLUMNS) | |
198 except IOError as ioe: | |
199 if ioe.errno == errno.EPIPE: | |
200 cleanup() | |
201 sys.exit(0) | |
173 | 202 |
174 # close any open filehandles | 203 # close any open filehandles |
175 if infile_handle is not sys.stdin: | 204 cleanup() |
176 infile_handle.close() | |
177 if outfile_handle is not sys.stdout: | |
178 outfile_handle.close() | |
179 | 205 |
180 # keeps Galaxy from giving an error if there were messages on stderr | 206 # keeps Galaxy from giving an error if there were messages on stderr |
181 sys.exit(0) | 207 sys.exit(0) |
182 | 208 |
183 | 209 |
245 if variant[0] != '-' and variant[0] != '+': | 271 if variant[0] != '-' and variant[0] != '+': |
246 fail("Error in input VCF: variant data not strand-specific. " | 272 fail("Error in input VCF: variant data not strand-specific. " |
247 +"Failed on line:\n"+line) | 273 +"Failed on line:\n"+line) |
248 try: | 274 try: |
249 variant_counts[variant] = int(float(reads)) | 275 variant_counts[variant] = int(float(reads)) |
250 except ValueError, e: | 276 except ValueError: |
251 fail("Error in input VCF: Variant count not a valid number. Failed on variant count string '"+reads+"'\nIn the following line:\n"+line) | 277 fail("Error in input VCF: Variant count not a valid number. Failed on variant count string '"+reads+"'\nIn the following line:\n"+line) |
252 | 278 |
253 sample_counts[sample_names[i]] = variant_counts | 279 sample_counts[sample_names[i]] = variant_counts |
254 | 280 |
255 site['samples'] = sample_counts | 281 site['samples'] = sample_counts |
328 if debug: print "ranked +-: "+str(ranked_bases) | 354 if debug: print "ranked +-: "+str(ranked_bases) |
329 | 355 |
330 sample['coverage'] = coverage | 356 sample['coverage'] = coverage |
331 try: | 357 try: |
332 sample['major'] = ranked_bases[0][0] | 358 sample['major'] = ranked_bases[0][0] |
333 except IndexError, e: | 359 except IndexError: |
334 sample['major'] = '.' | 360 sample['major'] = '.' |
335 try: | 361 try: |
336 sample['minor'] = ranked_bases[1][0] | 362 sample['minor'] = ranked_bases[1][0] |
337 sample['freq'] = round(ranked_bases[1][1]/float(coverage), 5) | 363 sample['freq'] = round(ranked_bases[1][1]/coverage, 5) |
338 except IndexError, e: | 364 except IndexError: |
339 sample['minor'] = '.' | 365 sample['minor'] = '.' |
340 sample['freq'] = 0.0 | 366 sample['freq'] = 0.0 |
367 | |
368 # Now that we've decided major and minor, we can calculate strand bias | |
369 bias = get_strand_bias(sample, variants) | |
370 if bias is None: | |
371 sample['bias'] = '.' | |
372 else: | |
373 sample['bias'] = round(bias, 5) | |
341 | 374 |
342 site_summary.append(sample) | 375 site_summary.append(sample) |
343 | 376 |
344 return site_summary | 377 return site_summary |
345 | 378 |
391 | 424 |
392 if debug: | 425 if debug: |
393 print 'coverage: '+str(coverage)+', freq_thres: '+str(freq_thres) | 426 print 'coverage: '+str(coverage)+', freq_thres: '+str(freq_thres) |
394 for variant in variant_counts: | 427 for variant in variant_counts: |
395 print (variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '+ | 428 print (variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '+ |
396 str(variant[1]/float(coverage))) | 429 str(variant[1]/coverage)) |
397 | 430 |
398 # remove bases below the frequency threshold | 431 # remove bases below the frequency threshold |
399 if freq_thres > 0: | 432 if freq_thres > 0: |
400 variant_counts = [variant for variant in variant_counts | 433 variant_counts = [variant for variant in variant_counts |
401 if variant[1]/float(coverage) >= freq_thres] | 434 if variant[1]/coverage >= freq_thres] |
402 | 435 |
403 return variant_counts | 436 return variant_counts |
404 | 437 |
405 | 438 |
406 def count_alleles(variant_counts, freq_thres, debug=False): | 439 def count_alleles(variant_counts, freq_thres, debug=False): |
430 allele_count = len(alleles_plus) | 463 allele_count = len(alleles_plus) |
431 | 464 |
432 return allele_count | 465 return allele_count |
433 | 466 |
434 | 467 |
468 def get_strand_bias(sample, variants): | |
469 """Based on method 1 (SB) of Guo et al., 2012 | |
470 If there a denominator would be 0, there is no valid result and this will | |
471 return None. This occurs when there are no reads on one of the strands, or | |
472 when there are no minor allele reads.""" | |
473 # using same variable names as in paper | |
474 a = variants.get('+'+sample['major'], 0) # forward major allele | |
475 b = variants.get('+'+sample['minor'], 0) # forward minor allele | |
476 c = variants.get('-'+sample['major'], 0) # reverse major allele | |
477 d = variants.get('-'+sample['minor'], 0) # reverse minor allele | |
478 # no minor allele reads | |
479 try: | |
480 return abs(b/(a+b) - d/(c+d)) / ((b+d) / (a+b+c+d)) | |
481 except ZeroDivisionError: | |
482 return None | |
483 | |
484 | |
435 def print_site(filehandle, site, columns): | 485 def print_site(filehandle, site, columns): |
436 """Print the output lines for one site (one per sample). | 486 """Print the output lines for one site (one per sample). |
437 filehandle must be open.""" | 487 filehandle must be open.""" |
438 for sample in site: | 488 for sample in site: |
439 if sample['print']: | 489 if sample['print']: |
440 fields = [str(sample.get(column)) for column in columns] | 490 fields = [str(sample.get(column)) for column in columns] |
441 filehandle.write('\t'.join(fields)+"\n") | 491 filehandle.write('\t'.join(fields)+"\n") |
442 | 492 |
443 | 493 |
444 def fail(message): | 494 def fail(message): |
495 cleanup() | |
445 sys.stderr.write(message+'\n') | 496 sys.stderr.write(message+'\n') |
446 sys.exit(1) | 497 sys.exit(1) |
447 | 498 |
448 | 499 |
500 def cleanup(): | |
501 if isinstance(infile_handle, file): | |
502 infile_handle.close() | |
503 if isinstance(outfile_handle, file): | |
504 outfile_handle.close() | |
505 | |
506 | |
449 if __name__ == "__main__": | 507 if __name__ == "__main__": |
450 main() | 508 main() |