allele_counts: allele-counts.py comparison

comparison allele-counts.py @ 6:df3b28364cd2

allele-counts.{py,xml}: Add strand bias, documentation updates.

author	nicksto <nmapsy@gmail.com>
date	Wed, 09 Dec 2015 11:20:51 -0500
parents	31361191d2d2
children	411adeff1eec

comparison

equal deleted inserted replaced

-:31361191d2d2
+:df3b28364cd2
 #!/usr/bin/python
-# This parses the output of Dan's "Naive Variant Detector" (previously,
+"""
-# "BAM Coverage"). It was forked from the code of "bam-coverage.py".
+Run with -h option or see DESCRIPTION for description.
-# Or, to put it briefly,
+This script's functionality is being obsoleted by the new, and much more sanely
-# cat variants.vcf | grep -v '^#' | cut -f 10 | cut -d ':' -f 4 | tr ',=' '\t:'
+written, nvc-filter.py.
-#
-# New in this version:
+New in this version:
-#   Made header line customizable
+- Calculate strand bias
-#     - separate from internal column labels, which are used as dict keys
+- Rename MINOR.FREQ.PERC to MAF
+Naive Variant Caller variant count parsing one-liner:
+$ cat variants.vcf | grep -v '^#' | cut -f 10 | cut -d ':' -f 4 | tr ',=' '\t:'
+"""
+from __future__ import division
 import os
 import sys
+import errno
 import random
 from optparse import OptionParser
-COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles', 'major', 'minor', 'freq'] #, 'bias']
+COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles',
-COLUMN_LABELS = ['SAMPLE', 'CHR',  'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS']
+'major', 'minor', 'freq', 'bias']
+COLUMN_LABELS = ['SAMPLE', 'CHR',  'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES',
+'MAJOR', 'MINOR', 'MAF', 'BIAS']
 CANONICAL_VARIANTS = ['A', 'C', 'G', 'T']
-USAGE = """Usage: %prog [options] -i variants.vcf -o alleles.csv
+USAGE = """Usage: %prog [options] -i variants.vcf -o alleles.tsv
-cat variants.vcf | %prog [options] > alleles.csv"""
+cat variants.vcf | %prog [options] > alleles.tsv"""
 OPT_DEFAULTS = {'infile':'-', 'outfile':'-', 'freq_thres':1.0, 'covg_thres':100,
 'print_header':False, 'stdin':False, 'stranded':False, 'no_filter':False,
 'debug_loc':'', 'seed':''}
-DESCRIPTION = """This will parse the VCF output of Dan's "Naive Variant Caller" (aka "BAM Coverage") Galaxy tool. For each position reported, it counts the number of reads of each base, determines the major allele, minor allele (second most frequent variant), and number of alleles above a threshold. So currently it only considers SNVs (ACGT), including in the coverage figure. By default it reads from stdin and prints to stdout."""
+DESCRIPTION = """This will parse the VCF output of the "Naive Variant Caller"
+(aka "BAM Coverage") Galaxy tool. For each position reported, it counts the
+number of reads of each base, determines the major allele, minor allele (second
+most frequent variant), and number of alleles above a threshold. So currently
+it only considers SNVs (ACGT), including in the coverage figure. By default it
+reads from stdin and prints to stdout.
+Prints a tab-delimited set of statistics to stdout.
+To print output column labels, run "$ echo -n | ./allele-counts.py -H".
+The columns are: 1:SAMPLE 2:CHR 3:POS 4:A 5:C 6:G 7:T 8:CVRG 9:ALLELES 10:MAJOR
+11:MINOR 12:MAF 13:BIAS,
+unless the --stranded option is used, in which case they are:
+1:SAMPLE 2:CHR 3:POS 4:+A 5:+C 6:+G 7:+T 8:-A 9:-C 10:-G 11:-T 12:CVRG
+13:ALLELES 14:MAJOR 15:MINOR 16:MAF 17:BIAS.
+"""
 EPILOG = """Requirements:
 The input VCF must report the variants for each strand.
 The variants should be case-sensitive (e.g. all capital base letters).
-Strand bias: Both strands must show the same bases passing the frequency threshold (but not necessarily in the same order). If the site fails this test, the number of alleles is reported as 0."""
+Strand bias: Both strands must show the same bases passing the frequency
+threshold (but not necessarily in the same order). If the site fails this test,
+the number of alleles is reported as 0."""
 def get_options(defaults, usage, description='', epilog=''):
 """Get options, print usage text."""
 parser = OptionParser(usage=usage, description=description, epilog=epilog)
 (options, args) = get_options(OPT_DEFAULTS, USAGE, DESCRIPTION, EPILOG)
 infile = options.infile
 outfile = options.outfile
 print_header = options.print_header
-freq_thres = options.freq_thres / 100.0
+freq_thres = options.freq_thres / 100
 covg_thres = options.covg_thres
 stranded = options.stranded
 debug_loc = options.debug_loc
 seed = options.seed
 print_pos = ''
 if len(coords) > 1: print_pos = coords[1]
 if len(coords) > 2: print_sample = coords[2]
 # set infile_handle to either stdin or the input file
+global infile_handle
 if infile == OPT_DEFAULTS.get('infile'):
 infile_handle = sys.stdin
 sys.stderr.write("Reading from standard input..\n")
 else:
 if os.path.exists(infile):
 infile_handle = open(infile, 'r')
 else:
 fail('Error: Input VCF file '+infile+' not found.')
 # set outfile_handle to either stdout or the output file
+global outfile_handle
 if outfile == OPT_DEFAULTS.get('outfile'):
 outfile_handle = sys.stdout
 else:
 try:
 outfile_handle = open(outfile, 'w')
-except IOError, e:
+except IOError:
 fail('Error: The given output filename '+outfile+' could not be opened.')
 # Take care of column names, print header
 if len(COLUMNS) != len(COLUMN_LABELS):
 fail('Error: Internal column names list do not match column labels list.')
 freq_thres, covg_thres, stranded, debug=debug)
 if debug and site_summary[0]['print']:
 print line.split('\t')[9].split(':')[-1]
-print_site(outfile_handle, site_summary, COLUMNS)
+try:
+print_site(outfile_handle, site_summary, COLUMNS)
+except IOError as ioe:
+if ioe.errno == errno.EPIPE:
+cleanup()
+sys.exit(0)
 # close any open filehandles
-if infile_handle is not sys.stdin:
+cleanup()
-infile_handle.close()
-if outfile_handle is not sys.stdout:
-outfile_handle.close()
 # keeps Galaxy from giving an error if there were messages on stderr
 sys.exit(0)
 if variant[0] != '-' and variant[0] != '+':
 fail("Error in input VCF: variant data not strand-specific. "
 +"Failed on line:\n"+line)
 try:
 variant_counts[variant] = int(float(reads))
-except ValueError, e:
+except ValueError:
 fail("Error in input VCF: Variant count not a valid number. Failed on variant count string '"+reads+"'\nIn the following line:\n"+line)
 sample_counts[sample_names[i]] = variant_counts
 site['samples'] = sample_counts
 if debug: print "ranked +-: "+str(ranked_bases)
 sample['coverage'] = coverage
 try:
 sample['major']  = ranked_bases[0][0]
-except IndexError, e:
+except IndexError:
 sample['major']  = '.'
 try:
 sample['minor']  = ranked_bases[1][0]
-sample['freq']   = round(ranked_bases[1][1]/float(coverage), 5)
+sample['freq']   = round(ranked_bases[1][1]/coverage, 5)
-except IndexError, e:
+except IndexError:
 sample['minor']  = '.'
 sample['freq']   = 0.0
+# Now that we've decided major and minor, we can calculate strand bias
+bias = get_strand_bias(sample, variants)
+if bias is None:
+sample['bias'] = '.'
+else:
+sample['bias'] = round(bias, 5)
 site_summary.append(sample)
 return site_summary
 if debug:
 print 'coverage: '+str(coverage)+', freq_thres: '+str(freq_thres)
 for variant in variant_counts:
 print (variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '+
-str(variant[1]/float(coverage)))
+str(variant[1]/coverage))
 # remove bases below the frequency threshold
 if freq_thres > 0:
 variant_counts = [variant for variant in variant_counts
-if variant[1]/float(coverage) >= freq_thres]
+if variant[1]/coverage >= freq_thres]
 return variant_counts
 def count_alleles(variant_counts, freq_thres, debug=False):
 allele_count = len(alleles_plus)
 return allele_count
+def get_strand_bias(sample, variants):
+"""Based on method 1 (SB) of Guo et al., 2012
+If there a denominator would be 0, there is no valid result and this will
+return None. This occurs when there are no reads on one of the strands, or
+when there are no minor allele reads."""
+# using same variable names as in paper
+a = variants.get('+'+sample['major'], 0) # forward major allele
+b = variants.get('+'+sample['minor'], 0) # forward minor allele
+c = variants.get('-'+sample['major'], 0) # reverse major allele
+d = variants.get('-'+sample['minor'], 0) # reverse minor allele
+# no minor allele reads
+try:
+return abs(b/(a+b) - d/(c+d)) / ((b+d) / (a+b+c+d))
+except ZeroDivisionError:
+return None
 def print_site(filehandle, site, columns):
 """Print the output lines for one site (one per sample).
 filehandle must be open."""
 for sample in site:
 if sample['print']:
 fields = [str(sample.get(column)) for column in columns]
 filehandle.write('\t'.join(fields)+"\n")
 def fail(message):
+cleanup()
 sys.stderr.write(message+'\n')
 sys.exit(1)
+def cleanup():
+if isinstance(infile_handle, file):
+infile_handle.close()
+if isinstance(outfile_handle, file):
+outfile_handle.close()
 if __name__ == "__main__":
 main()

Mercurial > repos > nick > allele_counts

comparison allele-counts.py @ 6:df3b28364cd2