1
|
1 #!/usr/bin/python
|
|
2 # This parses the output of Dan's "Naive Variant Detector" (previously,
|
|
3 # "BAM Coverage"). It was forked from the code of "bam-coverage.py".
|
5
|
4 # Or, to put it briefly,
|
|
5 # cat variants.vcf | grep -v '^#' | cut -f 10 | cut -d ':' -f 4 | tr ',=' '\t:'
|
1
|
6 #
|
2
|
7 # New in this version:
|
|
8 # Made header line customizable
|
|
9 # - separate from internal column labels, which are used as dict keys
|
1
|
10 import os
|
|
11 import sys
|
5
|
12 import random
|
1
|
13 from optparse import OptionParser
|
|
14
|
2
|
# Internal column keys (used as dict keys in the per-sample summary) and the
# user-visible labels printed in the optional header line.
# The two lists must stay index-aligned; main() checks their lengths match.
COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles', 'major', 'minor', 'freq'] #, 'bias']
COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS']
# The only variant bases that are counted; everything else in the VCF
# (indels, N, etc.) is ignored, including for the coverage figure.
CANONICAL_VARIANTS = ['A', 'C', 'G', 'T']
USAGE = """Usage: %prog [options] -i variants.vcf -o alleles.csv
cat variants.vcf | %prog [options] > alleles.csv"""
# Default value for every command-line option (see get_options()).
# NOTE(review): 'stdin' appears unused by the option definitions — confirm.
OPT_DEFAULTS = {'infile':'-', 'outfile':'-', 'freq_thres':1.0, 'covg_thres':100,
  'print_header':False, 'stdin':False, 'stranded':False, 'no_filter':False,
  'debug_loc':'', 'seed':''}
DESCRIPTION = """This will parse the VCF output of Dan's "Naive Variant Caller" (aka "BAM Coverage") Galaxy tool. For each position reported, it counts the number of reads of each base, determines the major allele, minor allele (second most frequent variant), and number of alleles above a threshold. So currently it only considers SNVs (ACGT), including in the coverage figure. By default it reads from stdin and prints to stdout."""
EPILOG = """Requirements:
The input VCF must report the variants for each strand.
The variants should be case-sensitive (e.g. all capital base letters).
Strand bias: Both strands must show the same bases passing the frequency threshold (but not necessarily in the same order). If the site fails this test, the number of alleles is reported as 0."""
|
|
28
|
|
29
|
|
def get_options(defaults, usage, description='', epilog=''):
  """Build the command-line parser, parse sys.argv, and return (options, args).
  'defaults' supplies the default value for every option."""

  parser = OptionParser(usage=usage, description=description, epilog=epilog)

  # input/output locations
  parser.add_option('-i', '--infile', dest='infile', default=defaults.get('infile'),
    help='Read input VCF data from this file instead of stdin.')
  parser.add_option('-o', '--outfile', dest='outfile', default=defaults.get('outfile'),
    help='Print output data to this file instead of stdout.')

  # filtering thresholds
  parser.add_option('-f', '--freq-thres', dest='freq_thres', type='float',
    default=defaults.get('freq_thres'),
    help=('Frequency threshold for counting alleles, given in percentage: -f 1 '
      '= 1% frequency. Default is %default%.'))
  parser.add_option('-c', '--covg-thres', dest='covg_thres', type='int',
    default=defaults.get('covg_thres'),
    help=('Coverage threshold. Each site must be supported by at least this '
      'many reads on each strand. Otherwise the site will not be printed in '
      'the output. The default is %default reads per strand.'))
  parser.add_option('-n', '--no-filter', dest='no_filter', action='store_const',
    const=not defaults.get('no_filter'), default=defaults.get('no_filter'),
    help=('Operate without a frequency threshold or coverage threshold. '
      'Equivalent to "-c 0 -f 0".'))

  # output-format toggles
  parser.add_option('-H', '--header', dest='print_header', action='store_const',
    const=not defaults.get('print_header'), default=defaults.get('print_header'),
    help=('Print header line. This is a #-commented line with the column '
      'labels. Off by default.'))
  parser.add_option('-s', '--stranded', dest='stranded', action='store_const',
    const=not defaults.get('stranded'), default=defaults.get('stranded'),
    help='Report variant counts by strand, in separate columns. Off by default.')

  # miscellaneous
  parser.add_option('-r', '--rand-seed', dest='seed', default=defaults.get('seed'),
    help='Seed for random number generator.')
  parser.add_option('-d', '--debug', dest='debug_loc', default=defaults.get('debug_loc'),
    help=('Turn on debug mode and specify a single site to process using UCSC '
      'coordinate format. You can also append a sample ID after another ":" '
      'to restrict it further.'))

  return parser.parse_args()
|
1
|
72
|
|
73
|
|
def main():
  """Parse the command line, read the VCF input, and write one output line per
  site per sample that passes the coverage/frequency filters."""

  (options, args) = get_options(OPT_DEFAULTS, USAGE, DESCRIPTION, EPILOG)

  infile = options.infile
  outfile = options.outfile
  print_header = options.print_header
  freq_thres = options.freq_thres / 100.0  # option is given as a percentage
  covg_thres = options.covg_thres
  stranded = options.stranded
  debug_loc = options.debug_loc
  seed = options.seed

  if options.no_filter:
    freq_thres = 0
    covg_thres = 0

  # seed the RNG used to break ties for the minor allele (reproducible runs)
  if seed:
    random.seed(seed)

  # parse the debug location, given as "chr[:pos[:sample]]"
  debug = False
  print_chr = ''
  print_pos = ''
  print_sample = ''
  if debug_loc:
    debug = True
    coords = debug_loc.split(':')
    print_chr = coords[0]
    if len(coords) > 1: print_pos = coords[1]
    if len(coords) > 2: print_sample = coords[2]

  # set infile_handle to either stdin or the input file
  if infile == OPT_DEFAULTS.get('infile'):
    infile_handle = sys.stdin
    sys.stderr.write("Reading from standard input..\n")
  else:
    if os.path.exists(infile):
      infile_handle = open(infile, 'r')
    else:
      fail('Error: Input VCF file '+infile+' not found.')

  # set outfile_handle to either stdout or the output file
  if outfile == OPT_DEFAULTS.get('outfile'):
    outfile_handle = sys.stdout
  else:
    try:
      outfile_handle = open(outfile, 'w')
    except IOError:
      fail('Error: The given output filename '+outfile+' could not be opened.')

  # Take care of column names, print header
  if len(COLUMNS) != len(COLUMN_LABELS):
    fail('Error: Internal column names list do not match column labels list.')
  if stranded:
    # replace the four unstranded base columns with eight stranded ones
    COLUMNS[3:7] = ['+A', '+C', '+G', '+T', '-A', '-C', '-G', '-T']
    COLUMN_LABELS[3:7] = ['+A', '+C', '+G', '+T', '-A', '-C', '-G', '-T']
  if print_header:
    outfile_handle.write('#'+'\t'.join(COLUMN_LABELS)+"\n")

  # main loop
  # each iteration processes one VCF line and prints one or more output lines
  # one VCF line = one site, one or more samples
  # one output line = one site, one sample
  sample_names = []
  for line in infile_handle:
    line = line.rstrip('\r\n')

    # skip blank lines (indexing line[0] on them was an IndexError before)
    if not line:
      continue
    # header lines
    if line.startswith('#'):
      if line[0:6].upper() == '#CHROM':
        sample_names = line.split('\t')[9:]
      continue

    if not sample_names:
      fail("Error in input VCF: Data line encountered before header line. "
        +"Failed on line:\n"+line)

    site_data = read_site(line, sample_names, CANONICAL_VARIANTS)

    if debug:
      # Filter on each coordinate component only when it was actually given.
      # (Previously a chr-only debug_loc matched nothing, because the empty
      # print_pos was compared against every site's position unconditionally.)
      if print_chr != '' and print_chr != site_data['chr']:
        continue
      if print_pos != '' and print_pos != site_data['pos']:
        continue
      if print_sample != '':
        # iterate over a copy: popping while iterating the live key view
        # is an error in Python 3
        for sample in list(site_data['samples'].keys()):
          if sample.lower() != print_sample.lower():
            site_data['samples'].pop(sample, None)
        if len(site_data['samples']) == 0:
          sys.stderr.write("Error: Sample '"+print_sample+"' not found.\n")
          sys.exit(1)

    site_summary = summarize_site(site_data, sample_names, CANONICAL_VARIANTS,
      freq_thres, covg_thres, stranded, debug=debug)

    # in debug mode, echo the raw variant-count string for printed sites
    if debug and site_summary and site_summary[0]['print']:
      print(line.split('\t')[9].split(':')[-1])

    print_site(outfile_handle, site_summary, COLUMNS)

  # close any open filehandles
  if infile_handle is not sys.stdin:
    infile_handle.close()
  if outfile_handle is not sys.stdout:
    outfile_handle.close()

  # keeps Galaxy from giving an error if there were messages on stderr
  sys.exit(0)
|
|
182
|
|
183
|
|
184
|
|
def read_site(line, sample_names, canonical):
  """Read in a line, parse the variants into a data structure, and return it.
  The line should be actual site data, not a header line, so check beforehand.
  Only the variants in 'canonical' will be read; all others are ignored.
  Note: the line is assumed to have been chomped.
  The returned data is stored in a dict, with the following structure:
  {
    'chr': 'chr1',
    'pos': '2617',
    'samples': {
      'THYROID': {
        '+A': 32,
        '-A': 45,
        '-G': 1,
      },
      'BLOOD': {
        '+A': 2,
        '-C': 1,
        '+G': 37,
        '-G': 42,
      },
    },
  }
  """

  site = {}
  fields = line.split('\t')

  # a VCF data line has 8 fixed columns, FORMAT, then one column per sample
  if len(fields) < 9:
    fail("Error in input VCF: wrong number of fields in data line. "
      +"Failed on line:\n"+line)

  site['chr'] = fields[0]
  site['pos'] = fields[1]
  samples = fields[9:]

  if len(samples) < len(sample_names):
    fail("Error in input VCF: missing sample fields in data line. "
      +"Failed on line:\n"+line)
  elif len(samples) > len(sample_names):
    fail("Error in input VCF: more sample fields in data line than in header. "
      +"Failed on line:\n"+line)

  sample_counts = {}
  for (i, sample_str) in enumerate(samples):

    variant_counts = {}
    # the variant counts are the last ':'-separated subfield,
    # e.g. "0/0:...:+A=10,-A=20"
    counts = sample_str.split(':')[-1]
    counts = counts.split(',')

    for count in counts:
      if not count:
        continue
      # use a distinct name: the original code clobbered the outer 'fields'
      parts = count.split('=')
      if len(parts) != 2:
        fail("Error in input VCF: Incorrect variant data format (must contain "
          +"a single '='). Failed on line:\n"+line)
      (variant, reads) = parts
      # drop non-canonical variants (indels, N, ...) before touching variant[0]
      if variant[1:] not in canonical:
        continue
      if variant[0] != '-' and variant[0] != '+':
        fail("Error in input VCF: variant data not strand-specific. "
          +"Failed on line:\n"+line)
      try:
        # counts may be written as floats (e.g. "10.0"); truncate to int
        variant_counts[variant] = int(float(reads))
      except ValueError:
        fail("Error in input VCF: Variant count not a valid number. Failed on variant count string '"+reads+"'\nIn the following line:\n"+line)

    sample_counts[sample_names[i]] = variant_counts

  site['samples'] = sample_counts

  return site
|
|
258
|
|
259
|
|
def summarize_site(site, sample_names, canonical, freq_thres, covg_thres,
  stranded=False, debug=False):
  """Take the raw data from the VCF line and transform it into the summary data
  to be printed in the output format.
  Returns a list of dicts, one per sample, keyed by the internal column names
  plus a 'print' flag saying whether the row passed the coverage filter."""

  site_summary = []
  for sample_name in sample_names:

    sample = {'print':False}
    variants = site['samples'].get(sample_name)
    # In debug mode samples can be popped from site['samples'] upstream; skip
    # them instead of crashing on None (previously an AttributeError here).
    if variants is None:
      continue

    sample['sample'] = sample_name
    sample['chr'] = site['chr']
    sample['pos'] = site['pos']

    # total coverage across both strands (canonical variants only)
    coverage = sum(variants.values())

    # get stranded coverage
    covg_plus = 0
    covg_minus = 0
    for variant in variants:
      if variant[0] == '+':
        covg_plus += variants[variant]
      elif variant[0] == '-':
        covg_minus += variants[variant]
    # stranded coverage threshold: each strand must meet covg_thres on its own
    if covg_plus < covg_thres or covg_minus < covg_thres:
      site_summary.append(sample)
      continue
    else:
      sample['print'] = True

    # get an ordered list of read counts for all variants (both strands)
    bases = get_read_counts(variants, '+-')
    ranked_bases = process_read_counts(bases, sort=True, debug=debug)

    # prepare stranded or unstranded lists of base counts
    base_count_lists = []
    if stranded:
      strands = ['+', '-']
      base_count_lists.append(get_read_counts(variants, '+'))
      base_count_lists.append(get_read_counts(variants, '-'))
    else:
      strands = ['']
      base_count_lists.append(ranked_bases)

    # record read counts into output dict
    # If stranded, this will loop twice, once for each strand, and prepend '+'
    # or '-' to the base name. If not stranded, it will loop once, and prepend
    # nothing ('').
    for (strand, base_count_list) in zip(strands, base_count_lists):
      for base_count in base_count_list:
        sample[strand+base_count[0]] = base_count[1]
      # fill in any zeros
      for base in canonical:
        if strand+base not in sample:
          sample[strand+base] = 0

    sample['alleles'] = count_alleles(variants, freq_thres, debug=debug)

    # If there's a tie for 2nd, randomly choose one to be 2nd
    if len(ranked_bases) >= 3 and ranked_bases[1][1] == ranked_bases[2][1]:
      swap = random.choice([True, False])
      if swap:
        tmp_base = ranked_bases[1]
        ranked_bases[1] = ranked_bases[2]
        ranked_bases[2] = tmp_base

    if debug: print("ranked +-: "+str(ranked_bases))

    sample['coverage'] = coverage
    try:
      sample['major'] = ranked_bases[0][0]
    except IndexError:
      sample['major'] = '.'
    try:
      sample['minor'] = ranked_bases[1][0]
      sample['freq'] = round(ranked_bases[1][1]/float(coverage), 5)
    except IndexError:
      # fewer than two observed bases: no minor allele
      sample['minor'] = '.'
      sample['freq'] = 0.0

    site_summary.append(sample)

  return site_summary
|
|
345
|
|
346
|
5
|
def get_read_counts(stranded_counts, strands='+-'):
  """Do a simple sum of the read counts per variant, on the specified strands.
  Arguments:
    stranded_counts: Dict of the stranded variants (keys) and their read counts
      (values).
    strands: Which strand(s) to count. Can be '+', '-', or '+-' for both (default).
  Return value:
    summed_counts: A list of the alleles and their read counts. The elements are
      tuples (variant, read count)."""

  summed_counts = {}
  for (variant, count) in stranded_counts.items():
    strand = variant[0]   # '+' or '-'
    base = variant[1:]    # e.g. 'A'
    if strand in strands:
      summed_counts[base] = count + summed_counts.get(base, 0)

  # Return a real list, not a dict view: callers sort the result in place,
  # which fails on the Python 3 view returned by dict.items().
  return list(summed_counts.items())
|
1
|
367
|
|
368
|
5
|
def process_read_counts(variant_counts, freq_thres=0, sort=False, debug=False):
  """Process a list of read counts by frequency filtering and/or sorting.
  Arguments:
    variant_counts: List of the non-stranded variants and their read counts. The
      elements are tuples (variant, read count). Note: when sort=True the input
      list is sorted in place.
    freq_thres: The frequency threshold each allele needs to pass to be included.
    sort: Whether to sort the list in descending order of read counts.
  Return value:
    variant_counts: A list of the processed alleles and their read counts. The
      elements are tuples (variant, read count)."""

  # total coverage over the given variants; frequencies are count/coverage
  coverage = sum(variant[1] for variant in variant_counts)

  # nothing observed: nothing to report (also avoids dividing by zero below)
  if coverage <= 0:
    return []

  # sort the list of alleles by read count (descending, in place)
  if sort:
    variant_counts.sort(reverse=True, key=lambda variant: variant[1])

  if debug:
    print('coverage: '+str(coverage)+', freq_thres: '+str(freq_thres))
    for variant in variant_counts:
      print(variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '+
        str(variant[1]/float(coverage)))

  # remove bases below the frequency threshold
  if freq_thres > 0:
    variant_counts = [variant for variant in variant_counts
      if variant[1]/float(coverage) >= freq_thres]

  return variant_counts
|
1
|
404
|
|
405
|
|
def count_alleles(variant_counts, freq_thres, debug=False):
  """Determine how many alleles to report, based on filtering rules.
  The current rule determines which bases pass the frequency threshold on each
  strand individually, then compares the two sets of bases. If they are the same
  (regardless of order), the allele count is the number of bases. Otherwise it
  is zero."""
  allele_count = 0

  # per-strand base counts, filtered by the frequency threshold
  alleles_plus = get_read_counts(variant_counts, '+')
  alleles_plus = process_read_counts(alleles_plus, freq_thres=freq_thres,
    sort=False, debug=debug)
  alleles_minus = get_read_counts(variant_counts, '-')
  alleles_minus = process_read_counts(alleles_minus, freq_thres=freq_thres,
    sort=False, debug=debug)

  if debug:
    print('+ '+str(alleles_plus))
    print('- '+str(alleles_minus))

  # Check if each strand reports the same set of alleles.
  # Sorting by base is to compare lists without regard to order (as sets).
  # Zero-count entries are excluded from the comparison, so exclude them from
  # the reported count as well (previously len(alleles_plus) could include
  # zero-count bases when freq_thres is 0).
  alleles_plus_sorted = sorted([base[0] for base in alleles_plus if base[1]])
  alleles_minus_sorted = sorted([base[0] for base in alleles_minus if base[1]])
  if alleles_plus_sorted == alleles_minus_sorted:
    allele_count = len(alleles_plus_sorted)

  return allele_count
|
|
433
|
|
434
|
5
|
def print_site(filehandle, site, columns):
  """Write the output rows for one site, one tab-separated line per sample.
  Only samples whose 'print' flag is set are emitted. filehandle must be open."""
  for sample in site:
    if not sample['print']:
      continue
    row = '\t'.join(str(sample.get(column)) for column in columns)
    filehandle.write(row+"\n")
|
|
442
|
|
443
|
1
|
def fail(message):
  """Report a fatal error on stderr, then abort with exit status 1."""
  sys.stderr.write('%s\n' % (message,))
  sys.exit(1)
|
|
447
|
5
|
448
|
1
|
# Standard script entry point: run main() only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
  main()