1
|
1 #!/usr/bin/python
|
|
2 # This parses the output of Dan's "Naive Variant Detector" (previously,
|
|
3 # "BAM Coverage"). It was forked from the code of "bam-coverage.py".
|
|
4 #
|
2
|
5 # New in this version:
|
|
6 # Made header line customizable
|
|
7 # - separate from internal column labels, which are used as dict keys
|
1
|
8 #
|
|
9 # TODO:
|
|
10 # - test handling of -c 0 (and -f 0?)
|
|
11 # - should it technically handle data lines that start with a '#'?
|
|
12 import os
|
|
13 import sys
|
|
14 from optparse import OptionParser
|
|
15
|
2
|
16 COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles', 'major', 'minor', 'freq'] #, 'bias']
|
|
17 COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS']
|
1
|
18 CANONICAL_VARIANTS = ['A', 'C', 'G', 'T']
|
|
19 USAGE = """Usage: cat variants.vcf | %prog [options] > alleles.csv
|
|
20 %prog [options] -i variants.vcf -o alleles.csv"""
|
|
21 OPT_DEFAULTS = {'infile':'-', 'outfile':'-', 'freq_thres':1.0, 'covg_thres':100,
|
|
22 'print_header':False, 'stdin':False}
|
|
23 DESCRIPTION = """This will parse the VCF output of Dan's "Naive Variant Caller" (aka "BAM Coverage") Galaxy tool. For each position reported, it counts the number of reads of each base, determines the major allele, minor allele (second most frequent variant), and number of alleles above a threshold. So currently it only considers SNVs (ACGT), including in the coverage figure. By default it reads from stdin and prints to stdout."""
|
|
24 EPILOG = """Requirements:
|
|
25 The input VCF must report the variants for each strand.
|
|
26 The variants should be case-sensitive (e.g. all capital base letters).
|
|
27 Strand bias: Both strands must show the same bases passing the frequency threshold (but not necessarily in the same order). If the site fails this test, the number of alleles is reported as 0."""
|
|
28
|
|
29
|
|
30 def get_options(defaults, usage, description='', epilog=''):
|
|
31 """Get options, print usage text."""
|
|
32
|
|
33 parser = OptionParser(usage=usage, description=description, epilog=epilog)
|
|
34
|
|
35 parser.add_option('-i', '--infile', dest='infile',
|
|
36 default=defaults.get('infile'),
|
|
37 help='Read input VCF data from this file instead of stdin.')
|
|
38 parser.add_option('-o', '--outfile', dest='outfile',
|
|
39 default=defaults.get('outfile'),
|
|
40 help='Print output data to this file instead of stdout.')
|
|
41 parser.add_option('-f', '--freq-thres', dest='freq_thres', type='float',
|
|
42 default=defaults.get('freq_thres'),
|
|
43 help='Frequency threshold for counting alleles, given in percentage: -f 1 = 1% frequency. Default is %default%.')
|
|
44 parser.add_option('-c', '--covg-thres', dest='covg_thres', type='int',
|
|
45 default=defaults.get('covg_thres'),
|
|
46 help='Coverage threshold. Each site must be supported by at least this many reads on each strand. Otherwise the site will not be printed in the output. The default is %default reads per strand.')
|
|
47 parser.add_option('-H', '--header', dest='print_header', action='store_const',
|
|
48 const=not(defaults.get('print_header')), default=defaults.get('print_header'),
|
|
49 help='Print header line. This is a #-commented line with the column labels. Off by default.')
|
|
50 parser.add_option('-d', '--debug', dest='debug', action='store_true',
|
|
51 default=False,
|
|
52 help='Turn on debug mode. You must also specify a single site to process in a final argument using UCSC coordinate format.')
|
|
53
|
|
54 (options, args) = parser.parse_args()
|
|
55
|
|
56 # read in positional arguments
|
|
57 arguments = {}
|
|
58 if options.debug:
|
|
59 if len(args) >= 1:
|
|
60 arguments['print_loc'] = args[0]
|
|
61 args.remove(args[0])
|
|
62
|
|
63 return (options, arguments)
|
|
64
|
|
65
|
|
66 def main():
|
|
67
|
|
68 (options, args) = get_options(OPT_DEFAULTS, USAGE, DESCRIPTION, EPILOG)
|
|
69
|
|
70 infile = options.infile
|
|
71 outfile = options.outfile
|
|
72 print_header = options.print_header
|
|
73 freq_thres = options.freq_thres / 100.0
|
|
74 covg_thres = options.covg_thres
|
|
75 debug = options.debug
|
|
76
|
|
77 if debug:
|
|
78 print_loc = args.get('print_loc')
|
|
79 if print_loc:
|
|
80 if ':' in print_loc:
|
|
81 (print_chr, print_pos) = print_loc.split(':')
|
|
82 else:
|
|
83 print_pos = print_loc
|
|
84 else:
|
|
85 sys.stderr.write("Warning: No site coordinate found in arguments. "
|
|
86 +"Turning off debug mode.\n")
|
|
87 debug = False
|
|
88
|
|
89 # set infile_handle to either stdin or the input file
|
|
90 if infile == OPT_DEFAULTS.get('infile'):
|
|
91 infile_handle = sys.stdin
|
|
92 sys.stderr.write("Reading from standard input..\n")
|
|
93 else:
|
|
94 if os.path.exists(infile):
|
|
95 infile_handle = open(infile, 'r')
|
|
96 else:
|
|
97 fail('Error: Input VCF file '+infile+' not found.')
|
|
98
|
|
99 # set outfile_handle to either stdout or the output file
|
|
100 if outfile == OPT_DEFAULTS.get('outfile'):
|
|
101 outfile_handle = sys.stdout
|
|
102 else:
|
2
|
103 try:
|
1
|
104 outfile_handle = open(outfile, 'w')
|
2
|
105 except IOError, e:
|
|
106 fail('Error: The given output filename '+outfile+' could not be opened.')
|
1
|
107
|
2
|
108 if len(COLUMNS) != len(COLUMN_LABELS):
|
|
109 fail('Error: Internal column names do not match column labels.')
|
1
|
110 if print_header:
|
2
|
111 outfile_handle.write('#'+'\t'.join(COLUMN_LABELS)+"\n")
|
1
|
112
|
|
113 # main loop: process and print one line at a time
|
|
114 sample_names = []
|
|
115 for line in infile_handle:
|
|
116 line = line.rstrip('\r\n')
|
|
117
|
|
118 # header lines
|
|
119 if line[0] == '#':
|
|
120 if line[0:6].upper() == '#CHROM':
|
|
121 sample_names = line.split('\t')[9:]
|
|
122 continue
|
|
123
|
|
124 if not sample_names:
|
|
125 fail("Error in input VCF: Data line encountered before header line. "
|
|
126 +"Failed on line:\n"+line)
|
|
127
|
|
128 site_data = read_site(line, sample_names, CANONICAL_VARIANTS)
|
|
129
|
|
130 if debug:
|
|
131 if site_data['pos'] != print_pos:
|
|
132 continue
|
|
133 try:
|
|
134 if site_data['chr'] != print_chr:
|
|
135 continue
|
|
136 except NameError, e:
|
|
137 pass # No chr specified. Just go ahead and print the line.
|
|
138
|
|
139 site_summary = summarize_site(site_data, sample_names, CANONICAL_VARIANTS,
|
|
140 freq_thres, covg_thres, debug=debug)
|
|
141
|
|
142 if debug and site_summary[0]['print']:
|
|
143 print line.split('\t')[9].split(':')[-1]
|
|
144
|
|
145 print_site(outfile_handle, site_summary, COLUMNS)
|
|
146
|
|
147 # close any open filehandles
|
|
148 if infile_handle is not sys.stdin:
|
|
149 infile_handle.close()
|
|
150 if outfile_handle is not sys.stdout:
|
|
151 outfile_handle.close()
|
|
152
|
|
153 # keeps Galaxy from giving an error if there were messages on stderr
|
|
154 sys.exit(0)
|
|
155
|
|
156
|
|
157
|
|
158 def read_site(line, sample_names, canonical):
|
|
159 """Read in a line, parse the variants into a data structure, and return it.
|
|
160 The line should be actual site data, not a header line, so check beforehand.
|
|
161 Notes:
|
|
162 - The line is assumed to have been chomped."""
|
|
163
|
|
164 site = {}
|
|
165 fields = line.split('\t')
|
|
166
|
|
167 if len(fields) < 9:
|
|
168 fail("Error in input VCF: wrong number of fields in data line. "
|
|
169 +"Failed on line:\n"+line)
|
|
170
|
|
171 site['chr'] = fields[0]
|
|
172 site['pos'] = fields[1]
|
|
173 samples = fields[9:]
|
|
174
|
|
175 if len(samples) < len(sample_names):
|
|
176 fail("Error in input VCF: missing sample fields in data line. "
|
|
177 +"Failed on line:\n"+line)
|
|
178 elif len(samples) > len(sample_names):
|
|
179 fail("Error in input VCF: more sample fields in data line than in header. "
|
|
180 +"Failed on line:\n"+line)
|
|
181
|
|
182 sample_counts = {}
|
|
183 for i in range(len(samples)):
|
|
184
|
|
185 variant_counts = {}
|
|
186 counts = samples[i].split(':')[-1]
|
|
187 counts = counts.split(',')
|
|
188
|
|
189 for count in counts:
|
|
190 if not count:
|
|
191 continue
|
|
192 fields = count.split('=')
|
|
193 if len(fields) != 2:
|
|
194 fail("Error in input VCF: Incorrect variant data format (must contain "
|
|
195 +"a single '='). Failed on line:\n"+line)
|
|
196 (variant, reads) = fields
|
|
197 if variant[1:] not in canonical:
|
|
198 continue
|
|
199 if variant[0] != '-' and variant[0] != '+':
|
|
200 fail("Error in input VCF: variant data not strand-specific. "
|
|
201 +"Failed on line:\n"+line)
|
|
202 try:
|
2
|
203 variant_counts[variant] = int(float(reads))
|
1
|
204 except ValueError, e:
|
2
|
205 fail("Error in input VCF: Variant count not a valid number. Failed on variant count string '"+reads+"'\nIn the following line:\n"+line)
|
1
|
206
|
|
207 sample_counts[sample_names[i]] = variant_counts
|
|
208
|
|
209 site['samples'] = sample_counts
|
|
210
|
|
211 return site
|
|
212
|
|
213
|
|
214 def summarize_site(site, sample_names, canonical, freq_thres, covg_thres,
|
|
215 debug=False):
|
|
216 """Take the raw data from the VCF line and transform it into the summary data
|
|
217 to be printed in the output format."""
|
|
218
|
|
219 site_summary = []
|
|
220 for sample_name in sample_names:
|
|
221
|
|
222 sample = {'print':False}
|
|
223 variants = site['samples'].get(sample_name)
|
|
224 if not variants:
|
|
225 site_summary.append(sample)
|
|
226 continue
|
|
227
|
|
228 sample['sample'] = sample_name
|
|
229 sample['chr'] = site['chr']
|
|
230 sample['pos'] = site['pos']
|
|
231
|
|
232 coverage = sum(variants.values())
|
|
233
|
|
234 # get stranded coverage
|
|
235 covg_plus = 0
|
|
236 covg_minus = 0
|
|
237 for variant in variants:
|
|
238 if variant[0] == '+':
|
|
239 covg_plus += variants[variant]
|
|
240 elif variant[0] == '-':
|
|
241 covg_minus += variants[variant]
|
|
242 # stranded coverage threshold
|
|
243 if coverage <= 0 or covg_plus < covg_thres or covg_minus < covg_thres:
|
|
244 site_summary.append(sample)
|
|
245 continue
|
|
246 else:
|
|
247 sample['print'] = True
|
|
248
|
|
249 # get an ordered list of read counts for all variants (either strand)
|
|
250 ranked_bases = get_read_counts(variants, 0, strands='+-', debug=debug)
|
|
251
|
|
252 # record read counts into dict for this sample
|
|
253 for base in ranked_bases:
|
|
254 sample[base[0]] = base[1]
|
|
255 # fill in any zeros
|
|
256 for variant in canonical:
|
|
257 if not sample.has_key(variant):
|
|
258 sample[variant] = 0
|
|
259
|
|
260 sample['alleles'] = count_alleles(variants, freq_thres, debug=debug)
|
|
261
|
|
262 # set minor allele to N if there's a tie for 2nd
|
|
263 if len(ranked_bases) >= 3 and ranked_bases[1][1] == ranked_bases[2][1]:
|
|
264 ranked_bases[1] = ('N', 0)
|
|
265 sample['alleles'] = 1 if sample['alleles'] else 0
|
|
266
|
|
267 if debug: print ranked_bases
|
|
268
|
|
269 sample['coverage'] = coverage
|
|
270 try:
|
|
271 sample['major'] = ranked_bases[0][0]
|
|
272 except IndexError, e:
|
|
273 sample['major'] = '.'
|
|
274 try:
|
|
275 sample['minor'] = ranked_bases[1][0]
|
2
|
276 sample['freq'] = round(ranked_bases[1][1]/float(coverage), 5)
|
1
|
277 except IndexError, e:
|
|
278 sample['minor'] = '.'
|
|
279 sample['freq'] = 0.0
|
|
280
|
|
281 site_summary.append(sample)
|
|
282
|
|
283 return site_summary
|
|
284
|
|
285
|
|
286 def print_site(filehandle, site, columns):
|
|
287 """Print the output lines for one site (one per sample).
|
|
288 filehandle must be open."""
|
|
289 for sample in site:
|
|
290 if sample['print']:
|
|
291 fields = [str(sample.get(column)) for column in columns]
|
|
292 filehandle.write('\t'.join(fields)+"\n")
|
|
293
|
|
294
|
|
295 def get_read_counts(variant_counts, freq_thres, strands='+-', debug=False):
|
|
296 """Count the number of reads for each base, and create a ranked list of
|
|
297 alleles passing the frequency threshold.
|
|
298 Arguments:
|
|
299 variant_counts: Dict of the stranded variants (keys) and their read counts (values).
|
|
300 freq_thres: The frequency threshold each allele needs to pass to be included.
|
|
301 strands: Which strand(s) to count. Can be '+', '-', or '+-' for both (default).
|
|
302 variants: A list of the variants of interest. Other types of variants will not
|
|
303 be included in the returned list. If no list is given, all variants found in
|
|
304 the variant_counts will be used.
|
|
305 Return value:
|
|
306 ranked_bases: A list of the alleles and their read counts. The elements are
|
|
307 tuples (base, read count). The alleles are listed in descending order of
|
|
308 frequency, and only those passing the threshold are included."""
|
|
309
|
|
310 # Get list of all variants from variant_counts list
|
|
311 variants = [variant[1:] for variant in variant_counts]
|
|
312 # deduplicate via a dict
|
|
313 variant_dict = dict((variant, 1) for variant in variants)
|
|
314 variants = variant_dict.keys()
|
|
315
|
|
316 ranked_bases = []
|
|
317 for variant in variants:
|
|
318 reads = 0
|
|
319 for strand in strands:
|
|
320 reads += variant_counts.get(strand+variant, 0)
|
|
321 ranked_bases.append((variant, reads))
|
|
322
|
|
323 # get coverage for the specified strands
|
|
324 coverage = 0
|
|
325 for variant in variant_counts:
|
|
326 if variant[0] in strands:
|
|
327 coverage += variant_counts.get(variant, 0)
|
|
328 # if debug: print "strands: "+strands+', covg: '+str(coverage)
|
|
329
|
|
330 if coverage < 1:
|
|
331 return []
|
|
332
|
|
333 # sort the list of alleles by read count
|
|
334 ranked_bases.sort(reverse=True, key=lambda base: base[1])
|
|
335
|
|
336 if debug:
|
|
337 print strands+' coverage: '+str(coverage)+', freq_thres: '+str(freq_thres)
|
|
338 for base in ranked_bases:
|
|
339 print (base[0]+': '+str(base[1])+'/'+str(float(coverage))+' = '+
|
|
340 str(base[1]/float(coverage)))
|
|
341
|
|
342 # remove bases below the frequency threshold
|
|
343 ranked_bases = [base for base in ranked_bases
|
|
344 if base[1]/float(coverage) >= freq_thres]
|
|
345
|
|
346 return ranked_bases
|
|
347
|
|
348
|
|
349 def count_alleles(variant_counts, freq_thres, debug=False):
|
|
350 """Determine how many alleles to report, based on filtering rules.
|
|
351 The current rule determines which bases pass the frequency threshold on each
|
|
352 strand individually, then compares the two sets of bases. If they are the same
|
|
353 (regardless of order), the allele count is the number of bases. Otherwise it
|
|
354 is zero."""
|
|
355 allele_count = 0
|
|
356
|
|
357 alleles_plus = get_read_counts(variant_counts, freq_thres, debug=debug,
|
|
358 strands='+')
|
|
359 alleles_minus = get_read_counts(variant_counts, freq_thres, debug=debug,
|
|
360 strands='-')
|
|
361
|
|
362 if debug:
|
|
363 print '+ '+str(alleles_plus)
|
|
364 print '- '+str(alleles_minus)
|
|
365
|
|
366 # check if each strand reports the same set of alleles
|
|
367 alleles_plus_sorted = sorted([base[0] for base in alleles_plus if base[1]])
|
|
368 alleles_minus_sorted = sorted([base[0] for base in alleles_minus if base[1]])
|
|
369 if alleles_plus_sorted == alleles_minus_sorted:
|
|
370 allele_count = len(alleles_plus)
|
|
371
|
|
372 return allele_count
|
|
373
|
|
374
|
|
375 def fail(message):
|
|
376 sys.stderr.write(message+'\n')
|
|
377 sys.exit(1)
|
|
378
|
|
379 if __name__ == "__main__":
|
|
380 main() |