annotate allele-counts.py @ 4:898eb3daab43

Complete documentation
author nick
date Tue, 04 Jun 2013 00:16:29 -0400
parents 318fdf77aa54
children 31361191d2d2
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
1 #!/usr/bin/python
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
2 # This parses the output of Dan's "Naive Variant Detector" (previously,
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
3 # "BAM Coverage"). It was forked from the code of "bam-coverage.py".
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
4 #
2
318fdf77aa54 Current version, from another toolshed
nick
parents: 1
diff changeset
5 # New in this version:
318fdf77aa54 Current version, from another toolshed
nick
parents: 1
diff changeset
6 # Made header line customizable
318fdf77aa54 Current version, from another toolshed
nick
parents: 1
diff changeset
7 # - separate from internal column labels, which are used as dict keys
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
8 #
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
9 # TODO:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
10 # - test handling of -c 0 (and -f 0?)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
11 # - should it technically handle data lines that start with a '#'?
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
12 import os
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
13 import sys
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
14 from optparse import OptionParser
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
15
2
318fdf77aa54 Current version, from another toolshed
nick
parents: 1
diff changeset
16 COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles', 'major', 'minor', 'freq'] #, 'bias']
318fdf77aa54 Current version, from another toolshed
nick
parents: 1
diff changeset
17 COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS']
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
18 CANONICAL_VARIANTS = ['A', 'C', 'G', 'T']
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
19 USAGE = """Usage: cat variants.vcf | %prog [options] > alleles.csv
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
20 %prog [options] -i variants.vcf -o alleles.csv"""
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
21 OPT_DEFAULTS = {'infile':'-', 'outfile':'-', 'freq_thres':1.0, 'covg_thres':100,
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
22 'print_header':False, 'stdin':False}
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
23 DESCRIPTION = """This will parse the VCF output of Dan's "Naive Variant Caller" (aka "BAM Coverage") Galaxy tool. For each position reported, it counts the number of reads of each base, determines the major allele, minor allele (second most frequent variant), and number of alleles above a threshold. So currently it only considers SNVs (ACGT), including in the coverage figure. By default it reads from stdin and prints to stdout."""
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
24 EPILOG = """Requirements:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
25 The input VCF must report the variants for each strand.
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
26 The variants should be case-sensitive (e.g. all capital base letters).
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
27 Strand bias: Both strands must show the same bases passing the frequency threshold (but not necessarily in the same order). If the site fails this test, the number of alleles is reported as 0."""
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
28
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
29
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
30 def get_options(defaults, usage, description='', epilog=''):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
31 """Get options, print usage text."""
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
32
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
33 parser = OptionParser(usage=usage, description=description, epilog=epilog)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
34
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
35 parser.add_option('-i', '--infile', dest='infile',
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
36 default=defaults.get('infile'),
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
37 help='Read input VCF data from this file instead of stdin.')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
38 parser.add_option('-o', '--outfile', dest='outfile',
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
39 default=defaults.get('outfile'),
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
40 help='Print output data to this file instead of stdout.')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
41 parser.add_option('-f', '--freq-thres', dest='freq_thres', type='float',
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
42 default=defaults.get('freq_thres'),
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
43 help='Frequency threshold for counting alleles, given in percentage: -f 1 = 1% frequency. Default is %default%.')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
44 parser.add_option('-c', '--covg-thres', dest='covg_thres', type='int',
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
45 default=defaults.get('covg_thres'),
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
46 help='Coverage threshold. Each site must be supported by at least this many reads on each strand. Otherwise the site will not be printed in the output. The default is %default reads per strand.')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
47 parser.add_option('-H', '--header', dest='print_header', action='store_const',
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
48 const=not(defaults.get('print_header')), default=defaults.get('print_header'),
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
49 help='Print header line. This is a #-commented line with the column labels. Off by default.')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
50 parser.add_option('-d', '--debug', dest='debug', action='store_true',
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
51 default=False,
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
52 help='Turn on debug mode. You must also specify a single site to process in a final argument using UCSC coordinate format.')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
53
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
54 (options, args) = parser.parse_args()
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
55
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
56 # read in positional arguments
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
57 arguments = {}
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
58 if options.debug:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
59 if len(args) >= 1:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
60 arguments['print_loc'] = args[0]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
61 args.remove(args[0])
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
62
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
63 return (options, arguments)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
64
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
65
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
66 def main():
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
67
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
68 (options, args) = get_options(OPT_DEFAULTS, USAGE, DESCRIPTION, EPILOG)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
69
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
70 infile = options.infile
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
71 outfile = options.outfile
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
72 print_header = options.print_header
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
73 freq_thres = options.freq_thres / 100.0
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
74 covg_thres = options.covg_thres
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
75 debug = options.debug
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
76
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
77 if debug:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
78 print_loc = args.get('print_loc')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
79 if print_loc:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
80 if ':' in print_loc:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
81 (print_chr, print_pos) = print_loc.split(':')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
82 else:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
83 print_pos = print_loc
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
84 else:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
85 sys.stderr.write("Warning: No site coordinate found in arguments. "
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
86 +"Turning off debug mode.\n")
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
87 debug = False
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
88
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
89 # set infile_handle to either stdin or the input file
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
90 if infile == OPT_DEFAULTS.get('infile'):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
91 infile_handle = sys.stdin
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
92 sys.stderr.write("Reading from standard input..\n")
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
93 else:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
94 if os.path.exists(infile):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
95 infile_handle = open(infile, 'r')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
96 else:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
97 fail('Error: Input VCF file '+infile+' not found.')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
98
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
99 # set outfile_handle to either stdout or the output file
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
100 if outfile == OPT_DEFAULTS.get('outfile'):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
101 outfile_handle = sys.stdout
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
102 else:
2
318fdf77aa54 Current version, from another toolshed
nick
parents: 1
diff changeset
103 try:
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
104 outfile_handle = open(outfile, 'w')
2
318fdf77aa54 Current version, from another toolshed
nick
parents: 1
diff changeset
105 except IOError, e:
318fdf77aa54 Current version, from another toolshed
nick
parents: 1
diff changeset
106 fail('Error: The given output filename '+outfile+' could not be opened.')
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
107
2
318fdf77aa54 Current version, from another toolshed
nick
parents: 1
diff changeset
108 if len(COLUMNS) != len(COLUMN_LABELS):
318fdf77aa54 Current version, from another toolshed
nick
parents: 1
diff changeset
109 fail('Error: Internal column names do not match column labels.')
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
110 if print_header:
2
318fdf77aa54 Current version, from another toolshed
nick
parents: 1
diff changeset
111 outfile_handle.write('#'+'\t'.join(COLUMN_LABELS)+"\n")
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
112
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
113 # main loop: process and print one line at a time
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
114 sample_names = []
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
115 for line in infile_handle:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
116 line = line.rstrip('\r\n')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
117
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
118 # header lines
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
119 if line[0] == '#':
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
120 if line[0:6].upper() == '#CHROM':
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
121 sample_names = line.split('\t')[9:]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
122 continue
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
123
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
124 if not sample_names:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
125 fail("Error in input VCF: Data line encountered before header line. "
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
126 +"Failed on line:\n"+line)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
127
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
128 site_data = read_site(line, sample_names, CANONICAL_VARIANTS)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
129
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
130 if debug:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
131 if site_data['pos'] != print_pos:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
132 continue
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
133 try:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
134 if site_data['chr'] != print_chr:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
135 continue
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
136 except NameError, e:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
137 pass # No chr specified. Just go ahead and print the line.
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
138
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
139 site_summary = summarize_site(site_data, sample_names, CANONICAL_VARIANTS,
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
140 freq_thres, covg_thres, debug=debug)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
141
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
142 if debug and site_summary[0]['print']:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
143 print line.split('\t')[9].split(':')[-1]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
144
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
145 print_site(outfile_handle, site_summary, COLUMNS)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
146
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
147 # close any open filehandles
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
148 if infile_handle is not sys.stdin:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
149 infile_handle.close()
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
150 if outfile_handle is not sys.stdout:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
151 outfile_handle.close()
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
152
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
153 # keeps Galaxy from giving an error if there were messages on stderr
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
154 sys.exit(0)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
155
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
156
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
157
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
158 def read_site(line, sample_names, canonical):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
159 """Read in a line, parse the variants into a data structure, and return it.
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
160 The line should be actual site data, not a header line, so check beforehand.
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
161 Notes:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
162 - The line is assumed to have been chomped."""
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
163
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
164 site = {}
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
165 fields = line.split('\t')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
166
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
167 if len(fields) < 9:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
168 fail("Error in input VCF: wrong number of fields in data line. "
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
169 +"Failed on line:\n"+line)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
170
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
171 site['chr'] = fields[0]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
172 site['pos'] = fields[1]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
173 samples = fields[9:]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
174
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
175 if len(samples) < len(sample_names):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
176 fail("Error in input VCF: missing sample fields in data line. "
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
177 +"Failed on line:\n"+line)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
178 elif len(samples) > len(sample_names):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
179 fail("Error in input VCF: more sample fields in data line than in header. "
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
180 +"Failed on line:\n"+line)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
181
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
182 sample_counts = {}
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
183 for i in range(len(samples)):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
184
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
185 variant_counts = {}
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
186 counts = samples[i].split(':')[-1]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
187 counts = counts.split(',')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
188
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
189 for count in counts:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
190 if not count:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
191 continue
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
192 fields = count.split('=')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
193 if len(fields) != 2:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
194 fail("Error in input VCF: Incorrect variant data format (must contain "
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
195 +"a single '='). Failed on line:\n"+line)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
196 (variant, reads) = fields
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
197 if variant[1:] not in canonical:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
198 continue
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
199 if variant[0] != '-' and variant[0] != '+':
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
200 fail("Error in input VCF: variant data not strand-specific. "
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
201 +"Failed on line:\n"+line)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
202 try:
2
318fdf77aa54 Current version, from another toolshed
nick
parents: 1
diff changeset
203 variant_counts[variant] = int(float(reads))
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
204 except ValueError, e:
2
318fdf77aa54 Current version, from another toolshed
nick
parents: 1
diff changeset
205 fail("Error in input VCF: Variant count not a valid number. Failed on variant count string '"+reads+"'\nIn the following line:\n"+line)
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
206
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
207 sample_counts[sample_names[i]] = variant_counts
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
208
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
209 site['samples'] = sample_counts
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
210
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
211 return site
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
212
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
213
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
214 def summarize_site(site, sample_names, canonical, freq_thres, covg_thres,
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
215 debug=False):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
216 """Take the raw data from the VCF line and transform it into the summary data
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
217 to be printed in the output format."""
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
218
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
219 site_summary = []
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
220 for sample_name in sample_names:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
221
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
222 sample = {'print':False}
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
223 variants = site['samples'].get(sample_name)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
224 if not variants:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
225 site_summary.append(sample)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
226 continue
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
227
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
228 sample['sample'] = sample_name
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
229 sample['chr'] = site['chr']
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
230 sample['pos'] = site['pos']
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
231
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
232 coverage = sum(variants.values())
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
233
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
234 # get stranded coverage
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
235 covg_plus = 0
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
236 covg_minus = 0
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
237 for variant in variants:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
238 if variant[0] == '+':
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
239 covg_plus += variants[variant]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
240 elif variant[0] == '-':
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
241 covg_minus += variants[variant]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
242 # stranded coverage threshold
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
243 if coverage <= 0 or covg_plus < covg_thres or covg_minus < covg_thres:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
244 site_summary.append(sample)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
245 continue
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
246 else:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
247 sample['print'] = True
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
248
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
249 # get an ordered list of read counts for all variants (either strand)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
250 ranked_bases = get_read_counts(variants, 0, strands='+-', debug=debug)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
251
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
252 # record read counts into dict for this sample
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
253 for base in ranked_bases:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
254 sample[base[0]] = base[1]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
255 # fill in any zeros
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
256 for variant in canonical:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
257 if not sample.has_key(variant):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
258 sample[variant] = 0
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
259
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
260 sample['alleles'] = count_alleles(variants, freq_thres, debug=debug)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
261
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
262 # set minor allele to N if there's a tie for 2nd
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
263 if len(ranked_bases) >= 3 and ranked_bases[1][1] == ranked_bases[2][1]:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
264 ranked_bases[1] = ('N', 0)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
265 sample['alleles'] = 1 if sample['alleles'] else 0
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
266
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
267 if debug: print ranked_bases
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
268
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
269 sample['coverage'] = coverage
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
270 try:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
271 sample['major'] = ranked_bases[0][0]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
272 except IndexError, e:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
273 sample['major'] = '.'
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
274 try:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
275 sample['minor'] = ranked_bases[1][0]
2
318fdf77aa54 Current version, from another toolshed
nick
parents: 1
diff changeset
276 sample['freq'] = round(ranked_bases[1][1]/float(coverage), 5)
1
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
277 except IndexError, e:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
278 sample['minor'] = '.'
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
279 sample['freq'] = 0.0
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
280
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
281 site_summary.append(sample)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
282
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
283 return site_summary
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
284
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
285
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
286 def print_site(filehandle, site, columns):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
287 """Print the output lines for one site (one per sample).
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
288 filehandle must be open."""
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
289 for sample in site:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
290 if sample['print']:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
291 fields = [str(sample.get(column)) for column in columns]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
292 filehandle.write('\t'.join(fields)+"\n")
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
293
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
294
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
295 def get_read_counts(variant_counts, freq_thres, strands='+-', debug=False):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
296 """Count the number of reads for each base, and create a ranked list of
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
297 alleles passing the frequency threshold.
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
298 Arguments:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
299 variant_counts: Dict of the stranded variants (keys) and their read counts (values).
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
300 freq_thres: The frequency threshold each allele needs to pass to be included.
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
301 strands: Which strand(s) to count. Can be '+', '-', or '+-' for both (default).
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
302 variants: A list of the variants of interest. Other types of variants will not
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
303 be included in the returned list. If no list is given, all variants found in
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
304 the variant_counts will be used.
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
305 Return value:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
306 ranked_bases: A list of the alleles and their read counts. The elements are
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
307 tuples (base, read count). The alleles are listed in descending order of
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
308 frequency, and only those passing the threshold are included."""
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
309
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
310 # Get list of all variants from variant_counts list
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
311 variants = [variant[1:] for variant in variant_counts]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
312 # deduplicate via a dict
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
313 variant_dict = dict((variant, 1) for variant in variants)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
314 variants = variant_dict.keys()
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
315
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
316 ranked_bases = []
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
317 for variant in variants:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
318 reads = 0
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
319 for strand in strands:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
320 reads += variant_counts.get(strand+variant, 0)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
321 ranked_bases.append((variant, reads))
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
322
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
323 # get coverage for the specified strands
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
324 coverage = 0
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
325 for variant in variant_counts:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
326 if variant[0] in strands:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
327 coverage += variant_counts.get(variant, 0)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
328 # if debug: print "strands: "+strands+', covg: '+str(coverage)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
329
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
330 if coverage < 1:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
331 return []
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
332
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
333 # sort the list of alleles by read count
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
334 ranked_bases.sort(reverse=True, key=lambda base: base[1])
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
335
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
336 if debug:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
337 print strands+' coverage: '+str(coverage)+', freq_thres: '+str(freq_thres)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
338 for base in ranked_bases:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
339 print (base[0]+': '+str(base[1])+'/'+str(float(coverage))+' = '+
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
340 str(base[1]/float(coverage)))
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
341
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
342 # remove bases below the frequency threshold
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
343 ranked_bases = [base for base in ranked_bases
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
344 if base[1]/float(coverage) >= freq_thres]
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
345
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
346 return ranked_bases
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
347
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
348
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
349 def count_alleles(variant_counts, freq_thres, debug=False):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
350 """Determine how many alleles to report, based on filtering rules.
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
351 The current rule determines which bases pass the frequency threshold on each
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
352 strand individually, then compares the two sets of bases. If they are the same
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
353 (regardless of order), the allele count is the number of bases. Otherwise it
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
354 is zero."""
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
355 allele_count = 0
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
356
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
357 alleles_plus = get_read_counts(variant_counts, freq_thres, debug=debug,
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
358 strands='+')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
359 alleles_minus = get_read_counts(variant_counts, freq_thres, debug=debug,
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
360 strands='-')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
361
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
362 if debug:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
363 print '+ '+str(alleles_plus)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
364 print '- '+str(alleles_minus)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
365
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
366 # check if each strand reports the same set of alleles
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
367 alleles_plus_sorted = sorted([base[0] for base in alleles_plus if base[1]])
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
368 alleles_minus_sorted = sorted([base[0] for base in alleles_minus if base[1]])
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
369 if alleles_plus_sorted == alleles_minus_sorted:
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
370 allele_count = len(alleles_plus)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
371
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
372 return allele_count
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
373
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
374
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
375 def fail(message):
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
376 sys.stderr.write(message+'\n')
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
377 sys.exit(1)
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
378
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
379 if __name__ == "__main__":
49bb46c3a1af Uploaded script
nick
parents:
diff changeset
380 main()