Mercurial > repos > nick > allele_counts
comparison allele-counts.py @ 5:31361191d2d2
Uploaded tarball.
Version 1.1: Stranded output, slightly different handling of minor allele ties and 0 coverage sites, revised help text, added test datasets.
author | nick |
---|---|
date | Thu, 12 Sep 2013 11:34:23 -0400 |
parents | 318fdf77aa54 |
children | df3b28364cd2 |
comparison
equal
deleted
inserted
replaced
4:898eb3daab43 | 5:31361191d2d2 |
---|---|
1 #!/usr/bin/python | 1 #!/usr/bin/python |
2 # This parses the output of Dan's "Naive Variant Detector" (previously, | 2 # This parses the output of Dan's "Naive Variant Detector" (previously, |
3 # "BAM Coverage"). It was forked from the code of "bam-coverage.py". | 3 # "BAM Coverage"). It was forked from the code of "bam-coverage.py". |
4 # Or, to put it briefly, | |
5 # cat variants.vcf | grep -v '^#' | cut -f 10 | cut -d ':' -f 4 | tr ',=' '\t:' | |
4 # | 6 # |
5 # New in this version: | 7 # New in this version: |
6 # Made header line customizable | 8 # Made header line customizable |
7 # - separate from internal column labels, which are used as dict keys | 9 # - separate from internal column labels, which are used as dict keys |
8 # | |
9 # TODO: | |
10 # - test handling of -c 0 (and -f 0?) | |
11 # - should it technically handle data lines that start with a '#'? | |
12 import os | 10 import os |
13 import sys | 11 import sys |
12 import random | |
14 from optparse import OptionParser | 13 from optparse import OptionParser |
15 | 14 |
16 COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles', 'major', 'minor', 'freq'] #, 'bias'] | 15 COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles', 'major', 'minor', 'freq'] #, 'bias'] |
17 COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS'] | 16 COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS'] |
18 CANONICAL_VARIANTS = ['A', 'C', 'G', 'T'] | 17 CANONICAL_VARIANTS = ['A', 'C', 'G', 'T'] |
19 USAGE = """Usage: cat variants.vcf | %prog [options] > alleles.csv | 18 USAGE = """Usage: %prog [options] -i variants.vcf -o alleles.csv |
20 %prog [options] -i variants.vcf -o alleles.csv""" | 19 cat variants.vcf | %prog [options] > alleles.csv""" |
21 OPT_DEFAULTS = {'infile':'-', 'outfile':'-', 'freq_thres':1.0, 'covg_thres':100, | 20 OPT_DEFAULTS = {'infile':'-', 'outfile':'-', 'freq_thres':1.0, 'covg_thres':100, |
22 'print_header':False, 'stdin':False} | 21 'print_header':False, 'stdin':False, 'stranded':False, 'no_filter':False, |
22 'debug_loc':'', 'seed':''} | |
23 DESCRIPTION = """This will parse the VCF output of Dan's "Naive Variant Caller" (aka "BAM Coverage") Galaxy tool. For each position reported, it counts the number of reads of each base, determines the major allele, minor allele (second most frequent variant), and number of alleles above a threshold. So currently it only considers SNVs (ACGT), including in the coverage figure. By default it reads from stdin and prints to stdout.""" | 23 DESCRIPTION = """This will parse the VCF output of Dan's "Naive Variant Caller" (aka "BAM Coverage") Galaxy tool. For each position reported, it counts the number of reads of each base, determines the major allele, minor allele (second most frequent variant), and number of alleles above a threshold. So currently it only considers SNVs (ACGT), including in the coverage figure. By default it reads from stdin and prints to stdout.""" |
24 EPILOG = """Requirements: | 24 EPILOG = """Requirements: |
25 The input VCF must report the variants for each strand. | 25 The input VCF must report the variants for each strand. |
26 The variants should be case-sensitive (e.g. all capital base letters). | 26 The variants should be case-sensitive (e.g. all capital base letters). |
27 Strand bias: Both strands must show the same bases passing the frequency threshold (but not necessarily in the same order). If the site fails this test, the number of alleles is reported as 0.""" | 27 Strand bias: Both strands must show the same bases passing the frequency threshold (but not necessarily in the same order). If the site fails this test, the number of alleles is reported as 0.""" |
38 parser.add_option('-o', '--outfile', dest='outfile', | 38 parser.add_option('-o', '--outfile', dest='outfile', |
39 default=defaults.get('outfile'), | 39 default=defaults.get('outfile'), |
40 help='Print output data to this file instead of stdout.') | 40 help='Print output data to this file instead of stdout.') |
41 parser.add_option('-f', '--freq-thres', dest='freq_thres', type='float', | 41 parser.add_option('-f', '--freq-thres', dest='freq_thres', type='float', |
42 default=defaults.get('freq_thres'), | 42 default=defaults.get('freq_thres'), |
43 help='Frequency threshold for counting alleles, given in percentage: -f 1 = 1% frequency. Default is %default%.') | 43 help=('Frequency threshold for counting alleles, given in percentage: -f 1 ' |
44 +'= 1% frequency. Default is %default%.')) | |
44 parser.add_option('-c', '--covg-thres', dest='covg_thres', type='int', | 45 parser.add_option('-c', '--covg-thres', dest='covg_thres', type='int', |
45 default=defaults.get('covg_thres'), | 46 default=defaults.get('covg_thres'), |
46 help='Coverage threshold. Each site must be supported by at least this many reads on each strand. Otherwise the site will not be printed in the output. The default is %default reads per strand.') | 47 help=('Coverage threshold. Each site must be supported by at least this ' |
48 +'many reads on each strand. Otherwise the site will not be printed in ' | |
49 +'the output. The default is %default reads per strand.')) | |
50 parser.add_option('-n', '--no-filter', dest='no_filter', action='store_const', | |
51 const=not(defaults.get('no_filter')), default=defaults.get('no_filter'), | |
52 help=('Operate without a frequency threshold or coverage threshold. ' | |
53 +'Equivalent to "-c 0 -f 0".')) | |
47 parser.add_option('-H', '--header', dest='print_header', action='store_const', | 54 parser.add_option('-H', '--header', dest='print_header', action='store_const', |
48 const=not(defaults.get('print_header')), default=defaults.get('print_header'), | 55 const=not(defaults.get('print_header')), default=defaults.get('print_header'), |
49 help='Print header line. This is a #-commented line with the column labels. Off by default.') | 56 help=('Print header line. This is a #-commented line with the column ' |
50 parser.add_option('-d', '--debug', dest='debug', action='store_true', | 57 +'labels. Off by default.')) |
51 default=False, | 58 parser.add_option('-s', '--stranded', dest='stranded', action='store_const', |
52 help='Turn on debug mode. You must also specify a single site to process in a final argument using UCSC coordinate format.') | 59 const=not(defaults.get('stranded')), default=defaults.get('stranded'), |
60 help='Report variant counts by strand, in separate columns. Off by default.') | |
61 parser.add_option('-r', '--rand-seed', dest='seed', | |
62 default=defaults.get('seed'), help=('Seed for random number generator.')) | |
63 parser.add_option('-d', '--debug', dest='debug_loc', | |
64 default=defaults.get('debug_loc'), | |
65 help=('Turn on debug mode and specify a single site to process using UCSC ' | |
66 +'coordinate format. You can also append a sample ID after another ":" ' | |
67 +'to restrict it further.')) | |
53 | 68 |
54 (options, args) = parser.parse_args() | 69 (options, args) = parser.parse_args() |
55 | 70 |
56 # read in positional arguments | 71 return (options, args) |
57 arguments = {} | |
58 if options.debug: | |
59 if len(args) >= 1: | |
60 arguments['print_loc'] = args[0] | |
61 args.remove(args[0]) | |
62 | |
63 return (options, arguments) | |
64 | 72 |
65 | 73 |
66 def main(): | 74 def main(): |
67 | 75 |
68 (options, args) = get_options(OPT_DEFAULTS, USAGE, DESCRIPTION, EPILOG) | 76 (options, args) = get_options(OPT_DEFAULTS, USAGE, DESCRIPTION, EPILOG) |
70 infile = options.infile | 78 infile = options.infile |
71 outfile = options.outfile | 79 outfile = options.outfile |
72 print_header = options.print_header | 80 print_header = options.print_header |
73 freq_thres = options.freq_thres / 100.0 | 81 freq_thres = options.freq_thres / 100.0 |
74 covg_thres = options.covg_thres | 82 covg_thres = options.covg_thres |
75 debug = options.debug | 83 stranded = options.stranded |
76 | 84 debug_loc = options.debug_loc |
77 if debug: | 85 seed = options.seed |
78 print_loc = args.get('print_loc') | 86 |
79 if print_loc: | 87 if options.no_filter: |
80 if ':' in print_loc: | 88 freq_thres = 0 |
81 (print_chr, print_pos) = print_loc.split(':') | 89 covg_thres = 0 |
82 else: | 90 |
83 print_pos = print_loc | 91 if seed: |
84 else: | 92 random.seed(seed) |
85 sys.stderr.write("Warning: No site coordinate found in arguments. " | 93 |
86 +"Turning off debug mode.\n") | 94 debug = False |
87 debug = False | 95 print_sample = '' |
96 if debug_loc: | |
97 debug = True | |
98 coords = debug_loc.split(':') | |
99 print_chr = coords[0] | |
100 print_pos = '' | |
101 if len(coords) > 1: print_pos = coords[1] | |
102 if len(coords) > 2: print_sample = coords[2] | |
88 | 103 |
89 # set infile_handle to either stdin or the input file | 104 # set infile_handle to either stdin or the input file |
90 if infile == OPT_DEFAULTS.get('infile'): | 105 if infile == OPT_DEFAULTS.get('infile'): |
91 infile_handle = sys.stdin | 106 infile_handle = sys.stdin |
92 sys.stderr.write("Reading from standard input..\n") | 107 sys.stderr.write("Reading from standard input..\n") |
103 try: | 118 try: |
104 outfile_handle = open(outfile, 'w') | 119 outfile_handle = open(outfile, 'w') |
105 except IOError, e: | 120 except IOError, e: |
106 fail('Error: The given output filename '+outfile+' could not be opened.') | 121 fail('Error: The given output filename '+outfile+' could not be opened.') |
107 | 122 |
123 # Take care of column names, print header | |
108 if len(COLUMNS) != len(COLUMN_LABELS): | 124 if len(COLUMNS) != len(COLUMN_LABELS): |
109 fail('Error: Internal column names do not match column labels.') | 125 fail('Error: Internal column names list do not match column labels list.') |
126 if stranded: | |
127 COLUMNS[3:7] = ['+A', '+C', '+G', '+T', '-A', '-C', '-G', '-T'] | |
128 COLUMN_LABELS[3:7] = ['+A', '+C', '+G', '+T', '-A', '-C', '-G', '-T'] | |
110 if print_header: | 129 if print_header: |
111 outfile_handle.write('#'+'\t'.join(COLUMN_LABELS)+"\n") | 130 outfile_handle.write('#'+'\t'.join(COLUMN_LABELS)+"\n") |
112 | 131 |
113 # main loop: process and print one line at a time | 132 # main loop |
133 # each iteration processes one VCF line and prints one or more output lines | |
134 # one VCF line = one site, one or more samples | |
135 # one output line = one site, one sample | |
114 sample_names = [] | 136 sample_names = [] |
115 for line in infile_handle: | 137 for line in infile_handle: |
116 line = line.rstrip('\r\n') | 138 line = line.rstrip('\r\n') |
117 | 139 |
118 # header lines | 140 # header lines |
126 +"Failed on line:\n"+line) | 148 +"Failed on line:\n"+line) |
127 | 149 |
128 site_data = read_site(line, sample_names, CANONICAL_VARIANTS) | 150 site_data = read_site(line, sample_names, CANONICAL_VARIANTS) |
129 | 151 |
130 if debug: | 152 if debug: |
131 if site_data['pos'] != print_pos: | 153 if print_pos != site_data['pos']: |
132 continue | 154 continue |
133 try: | 155 if print_chr != site_data['chr'] and print_chr != '': |
134 if site_data['chr'] != print_chr: | 156 continue |
135 continue | 157 if print_sample != '': |
136 except NameError, e: | 158 for sample in site_data['samples'].keys(): |
137 pass # No chr specified. Just go ahead and print the line. | 159 if sample.lower() != print_sample.lower(): |
160 site_data['samples'].pop(sample, None) | |
161 if len(site_data['samples']) == 0: | |
162 sys.stderr.write("Error: Sample '"+print_sample+"' not found.\n") | |
163 sys.exit(1) | |
164 | |
138 | 165 |
139 site_summary = summarize_site(site_data, sample_names, CANONICAL_VARIANTS, | 166 site_summary = summarize_site(site_data, sample_names, CANONICAL_VARIANTS, |
140 freq_thres, covg_thres, debug=debug) | 167 freq_thres, covg_thres, stranded, debug=debug) |
141 | 168 |
142 if debug and site_summary[0]['print']: | 169 if debug and site_summary[0]['print']: |
143 print line.split('\t')[9].split(':')[-1] | 170 print line.split('\t')[9].split(':')[-1] |
144 | 171 |
145 print_site(outfile_handle, site_summary, COLUMNS) | 172 print_site(outfile_handle, site_summary, COLUMNS) |
146 | 173 |
147 # close any open filehandles | 174 # close any open filehandles |
148 if infile_handle is not sys.stdin: | 175 if infile_handle is not sys.stdin: |
156 | 183 |
157 | 184 |
158 def read_site(line, sample_names, canonical): | 185 def read_site(line, sample_names, canonical): |
159 """Read in a line, parse the variants into a data structure, and return it. | 186 """Read in a line, parse the variants into a data structure, and return it. |
160 The line should be actual site data, not a header line, so check beforehand. | 187 The line should be actual site data, not a header line, so check beforehand. |
161 Notes: | 188 Only the variants in 'canonical' will be read; all others are ignored. |
162 - The line is assumed to have been chomped.""" | 189 Note: the line is assumed to have been chomped. |
190 The returned data is stored in a dict, with the following structure: | |
191 { | |
192 'chr': 'chr1', | |
193 'pos': '2617', | |
194 'samples': { | |
195 'THYROID': { | |
196 '+A': 32, | |
197 '-A': 45, | |
198 '-G': 1, | |
199 }, | |
200 'BLOOD': { | |
201 '+A': 2, | |
202 '-C': 1, | |
203 '+G': 37, | |
204 '-G': 42, | |
205 }, | |
206 }, | |
207 } | |
208 """ | |
163 | 209 |
164 site = {} | 210 site = {} |
165 fields = line.split('\t') | 211 fields = line.split('\t') |
166 | 212 |
167 if len(fields) < 9: | 213 if len(fields) < 9: |
210 | 256 |
211 return site | 257 return site |
212 | 258 |
213 | 259 |
214 def summarize_site(site, sample_names, canonical, freq_thres, covg_thres, | 260 def summarize_site(site, sample_names, canonical, freq_thres, covg_thres, |
215 debug=False): | 261 stranded=False, debug=False): |
216 """Take the raw data from the VCF line and transform it into the summary data | 262 """Take the raw data from the VCF line and transform it into the summary data |
217 to be printed in the output format.""" | 263 to be printed in the output format.""" |
218 | 264 |
219 site_summary = [] | 265 site_summary = [] |
220 for sample_name in sample_names: | 266 for sample_name in sample_names: |
221 | 267 |
222 sample = {'print':False} | 268 sample = {'print':False} |
223 variants = site['samples'].get(sample_name) | 269 variants = site['samples'].get(sample_name) |
224 if not variants: | |
225 site_summary.append(sample) | |
226 continue | |
227 | 270 |
228 sample['sample'] = sample_name | 271 sample['sample'] = sample_name |
229 sample['chr'] = site['chr'] | 272 sample['chr'] = site['chr'] |
230 sample['pos'] = site['pos'] | 273 sample['pos'] = site['pos'] |
231 | 274 |
238 if variant[0] == '+': | 281 if variant[0] == '+': |
239 covg_plus += variants[variant] | 282 covg_plus += variants[variant] |
240 elif variant[0] == '-': | 283 elif variant[0] == '-': |
241 covg_minus += variants[variant] | 284 covg_minus += variants[variant] |
242 # stranded coverage threshold | 285 # stranded coverage threshold |
243 if coverage <= 0 or covg_plus < covg_thres or covg_minus < covg_thres: | 286 if covg_plus < covg_thres or covg_minus < covg_thres: |
244 site_summary.append(sample) | 287 site_summary.append(sample) |
245 continue | 288 continue |
246 else: | 289 else: |
247 sample['print'] = True | 290 sample['print'] = True |
248 | 291 |
249 # get an ordered list of read counts for all variants (either strand) | 292 # get an ordered list of read counts for all variants (both strands) |
250 ranked_bases = get_read_counts(variants, 0, strands='+-', debug=debug) | 293 bases = get_read_counts(variants, '+-') |
251 | 294 ranked_bases = process_read_counts(bases, sort=True, debug=debug) |
252 # record read counts into dict for this sample | 295 |
253 for base in ranked_bases: | 296 # prepare stranded or unstranded lists of base counts |
254 sample[base[0]] = base[1] | 297 base_count_lists = [] |
255 # fill in any zeros | 298 if stranded: |
256 for variant in canonical: | 299 strands = ['+', '-'] |
257 if not sample.has_key(variant): | 300 base_count_lists.append(get_read_counts(variants, '+')) |
258 sample[variant] = 0 | 301 base_count_lists.append(get_read_counts(variants, '-')) |
259 | 302 else: |
260 sample['alleles'] = count_alleles(variants, freq_thres, debug=debug) | 303 strands = [''] |
261 | 304 base_count_lists.append(ranked_bases) |
262 # set minor allele to N if there's a tie for 2nd | 305 |
306 # record read counts into output dict | |
307 # If stranded, this will loop twice, once for each strand, and prepend '+' | |
308 # or '-' to the base name. If not stranded, it will loop once, and prepend | |
309 # nothing (''). | |
310 for (strand, base_count_list) in zip(strands, base_count_lists): | |
311 for base_count in base_count_list: | |
312 sample[strand+base_count[0]] = base_count[1] | |
313 # fill in any zeros | |
314 for base in canonical: | |
315 if not sample.has_key(strand+base): | |
316 sample[strand+base] = 0 | |
317 | |
318 sample['alleles'] = count_alleles(variants, freq_thres, debug=debug) | |
319 | |
320 # If there's a tie for 2nd, randomly choose one to be 2nd | |
263 if len(ranked_bases) >= 3 and ranked_bases[1][1] == ranked_bases[2][1]: | 321 if len(ranked_bases) >= 3 and ranked_bases[1][1] == ranked_bases[2][1]: |
264 ranked_bases[1] = ('N', 0) | 322 swap = random.choice([True,False]) |
265 sample['alleles'] = 1 if sample['alleles'] else 0 | 323 if swap: |
266 | 324 tmp_base = ranked_bases[1] |
267 if debug: print ranked_bases | 325 ranked_bases[1] = ranked_bases[2] |
326 ranked_bases[2] = tmp_base | |
327 | |
328 if debug: print "ranked +-: "+str(ranked_bases) | |
268 | 329 |
269 sample['coverage'] = coverage | 330 sample['coverage'] = coverage |
270 try: | 331 try: |
271 sample['major'] = ranked_bases[0][0] | 332 sample['major'] = ranked_bases[0][0] |
272 except IndexError, e: | 333 except IndexError, e: |
281 site_summary.append(sample) | 342 site_summary.append(sample) |
282 | 343 |
283 return site_summary | 344 return site_summary |
284 | 345 |
285 | 346 |
def get_read_counts(stranded_counts, strands='+-'):
  """Do a simple sum of the read counts per variant, on the specified strands.
  Arguments:
    stranded_counts: Dict of the stranded variants (keys, e.g. '+A', '-G') and
      their read counts (values).
    strands: Which strand(s) to count. Can be '+', '-', or '+-' for both (default).
  Return value:
    summed_counts: A list of the alleles and their read counts. The elements are
      tuples (variant, read count), with the strand prefix stripped and counts
      summed across the selected strands."""

  summed_counts = {}
  for (variant, count) in stranded_counts.items():
    strand = variant[0]
    base = variant[1:]
    if strand in strands:
      summed_counts[base] = count + summed_counts.get(base, 0)

  # Return an explicit list: callers sort this in place, which works on the
  # Python 2 list from dict.items() but not on a Python 3 dict view.
  return list(summed_counts.items())
367 | |
368 | |
def process_read_counts(variant_counts, freq_thres=0, sort=False, debug=False):
  """Process a list of read counts by frequency filtering and/or sorting.
  Arguments:
    variant_counts: List of the non-stranded variants and their read counts. The
      elements are tuples (variant, read count).
    freq_thres: The frequency threshold (as a fraction of total coverage) each
      allele needs to pass to be included. 0 disables filtering.
    sort: Whether to sort the list in descending order of read counts.
      Note: sorting is done in place on the caller's list.
    debug: Print each variant's frequency calculation to stdout.
  Return value:
    variant_counts: A list of the processed alleles and their read counts. The
      elements are tuples (variant, read count). An empty list is returned when
      there is no coverage at all."""

  # total coverage is the denominator for each variant's frequency
  coverage = 0
  for variant in variant_counts:
    coverage += variant[1]

  if coverage <= 0:
    return []

  # sort the list of alleles by read count, highest first
  if sort:
    variant_counts.sort(reverse=True, key=lambda variant: variant[1])

  if debug:
    # Parenthesized single-argument print calls work under Python 2 and 3.
    print('coverage: '+str(coverage)+', freq_thres: '+str(freq_thres))
    for variant in variant_counts:
      print(variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '
            +str(variant[1]/float(coverage)))

  # remove bases below the frequency threshold
  if freq_thres > 0:
    variant_counts = [variant for variant in variant_counts
      if variant[1]/float(coverage) >= freq_thres]

  return variant_counts
404 | |
405 | |
def count_alleles(variant_counts, freq_thres, debug=False):
  """Determine how many alleles to report, based on filtering rules.
  The current rule determines which bases pass the frequency threshold on each
  strand individually, then compares the two sets of bases. If they are the same
  (regardless of order), the allele count is the number of bases. Otherwise it
  is zero (the site fails the strand-bias test).
  Arguments:
    variant_counts: Dict of the stranded variants (keys, e.g. '+A') and their
      read counts (values).
    freq_thres: Frequency threshold (fraction) applied per strand.
  Return value:
    The number of alleles passing the threshold on both strands, or 0."""
  allele_count = 0

  # filter each strand independently, then compare the surviving base sets
  alleles_plus = get_read_counts(variant_counts, '+')
  alleles_plus = process_read_counts(alleles_plus, freq_thres=freq_thres,
    sort=False, debug=debug)
  alleles_minus = get_read_counts(variant_counts, '-')
  alleles_minus = process_read_counts(alleles_minus, freq_thres=freq_thres,
    sort=False, debug=debug)

  if debug:
    # Parenthesized single-argument print calls work under Python 2 and 3.
    print('+ '+str(alleles_plus))
    print('- '+str(alleles_minus))

  # Check if each strand reports the same set of alleles.
  # Sorting by base is to compare lists without regard to order (as sets).
  alleles_plus_sorted = sorted([base[0] for base in alleles_plus if base[1]])
  alleles_minus_sorted = sorted([base[0] for base in alleles_minus if base[1]])
  if alleles_plus_sorted == alleles_minus_sorted:
    allele_count = len(alleles_plus)

  return allele_count
433 | |
434 | |
def print_site(filehandle, site, columns):
  """Write the output lines for one site, one line per printable sample.
  filehandle must be an open, writable file object."""
  for summary in site:
    if not summary['print']:
      continue
    row = []
    for column in columns:
      row.append(str(summary.get(column)))
    filehandle.write('\t'.join(row)+"\n")
293 | 442 |
294 | 443 |
295 def get_read_counts(variant_counts, freq_thres, strands='+-', debug=False): | |
296 """Count the number of reads for each base, and create a ranked list of | |
297 alleles passing the frequency threshold. | |
298 Arguments: | |
299 variant_counts: Dict of the stranded variants (keys) and their read counts (values). | |
300 freq_thres: The frequency threshold each allele needs to pass to be included. | |
301 strands: Which strand(s) to count. Can be '+', '-', or '+-' for both (default). | |
302 variants: A list of the variants of interest. Other types of variants will not | |
303 be included in the returned list. If no list is given, all variants found in | |
304 the variant_counts will be used. | |
305 Return value: | |
306 ranked_bases: A list of the alleles and their read counts. The elements are | |
307 tuples (base, read count). The alleles are listed in descending order of | |
308 frequency, and only those passing the threshold are included.""" | |
309 | |
310 # Get list of all variants from variant_counts list | |
311 variants = [variant[1:] for variant in variant_counts] | |
312 # deduplicate via a dict | |
313 variant_dict = dict((variant, 1) for variant in variants) | |
314 variants = variant_dict.keys() | |
315 | |
316 ranked_bases = [] | |
317 for variant in variants: | |
318 reads = 0 | |
319 for strand in strands: | |
320 reads += variant_counts.get(strand+variant, 0) | |
321 ranked_bases.append((variant, reads)) | |
322 | |
323 # get coverage for the specified strands | |
324 coverage = 0 | |
325 for variant in variant_counts: | |
326 if variant[0] in strands: | |
327 coverage += variant_counts.get(variant, 0) | |
328 # if debug: print "strands: "+strands+', covg: '+str(coverage) | |
329 | |
330 if coverage < 1: | |
331 return [] | |
332 | |
333 # sort the list of alleles by read count | |
334 ranked_bases.sort(reverse=True, key=lambda base: base[1]) | |
335 | |
336 if debug: | |
337 print strands+' coverage: '+str(coverage)+', freq_thres: '+str(freq_thres) | |
338 for base in ranked_bases: | |
339 print (base[0]+': '+str(base[1])+'/'+str(float(coverage))+' = '+ | |
340 str(base[1]/float(coverage))) | |
341 | |
342 # remove bases below the frequency threshold | |
343 ranked_bases = [base for base in ranked_bases | |
344 if base[1]/float(coverage) >= freq_thres] | |
345 | |
346 return ranked_bases | |
347 | |
348 | |
349 def count_alleles(variant_counts, freq_thres, debug=False): | |
350 """Determine how many alleles to report, based on filtering rules. | |
351 The current rule determines which bases pass the frequency threshold on each | |
352 strand individually, then compares the two sets of bases. If they are the same | |
353 (regardless of order), the allele count is the number of bases. Otherwise it | |
354 is zero.""" | |
355 allele_count = 0 | |
356 | |
357 alleles_plus = get_read_counts(variant_counts, freq_thres, debug=debug, | |
358 strands='+') | |
359 alleles_minus = get_read_counts(variant_counts, freq_thres, debug=debug, | |
360 strands='-') | |
361 | |
362 if debug: | |
363 print '+ '+str(alleles_plus) | |
364 print '- '+str(alleles_minus) | |
365 | |
366 # check if each strand reports the same set of alleles | |
367 alleles_plus_sorted = sorted([base[0] for base in alleles_plus if base[1]]) | |
368 alleles_minus_sorted = sorted([base[0] for base in alleles_minus if base[1]]) | |
369 if alleles_plus_sorted == alleles_minus_sorted: | |
370 allele_count = len(alleles_plus) | |
371 | |
372 return allele_count | |
373 | |
374 | |
def fail(message):
  """Report an error message on stderr and terminate with exit status 1."""
  sys.stderr.write('%s\n' % message)
  sys.exit(1)
378 | 447 |
448 | |
# Run the script only when executed directly, not when imported as a module.
if __name__ == "__main__":
  main()