Mercurial > repos > nick > allele_counts
annotate allele-counts.py @ 6:df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
author | nicksto <nmapsy@gmail.com> |
---|---|
date | Wed, 09 Dec 2015 11:20:51 -0500 |
parents | 31361191d2d2 |
children | 411adeff1eec |
rev | line source |
---|---|
1 | 1 #!/usr/bin/python |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
2 """ |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
3 Run with -h option or see DESCRIPTION for description. |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
4 This script's functionality is being obsoleted by the new, and much more sanely |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
5 written, nvc-filter.py. |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
6 |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
7 New in this version: |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
8 - Calculate strand bias |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
9 - Rename MINOR.FREQ.PERC to MAF |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
10 |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
11 Naive Variant Caller variant count parsing one-liner: |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
12 $ cat variants.vcf | grep -v '^#' | cut -f 10 | cut -d ':' -f 4 | tr ',=' '\t:' |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
13 """ |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
14 from __future__ import division |
1 | 15 import os |
16 import sys | |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
17 import errno |
5 | 18 import random |
1 | 19 from optparse import OptionParser |
20 | |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
21 COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles', |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
22 'major', 'minor', 'freq', 'bias'] |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
23 COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
24 'MAJOR', 'MINOR', 'MAF', 'BIAS'] |
1 | 25 CANONICAL_VARIANTS = ['A', 'C', 'G', 'T'] |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
26 USAGE = """Usage: %prog [options] -i variants.vcf -o alleles.tsv |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
27 cat variants.vcf | %prog [options] > alleles.tsv""" |
1 | 28 OPT_DEFAULTS = {'infile':'-', 'outfile':'-', 'freq_thres':1.0, 'covg_thres':100, |
5 | 29 'print_header':False, 'stdin':False, 'stranded':False, 'no_filter':False, |
30 'debug_loc':'', 'seed':''} | |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
31 DESCRIPTION = """This will parse the VCF output of the "Naive Variant Caller" |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
32 (aka "BAM Coverage") Galaxy tool. For each position reported, it counts the |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
33 number of reads of each base, determines the major allele, minor allele (second |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
34 most frequent variant), and number of alleles above a threshold. So currently |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
35 it only considers SNVs (ACGT), including in the coverage figure. By default it |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
36 reads from stdin and prints to stdout. |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
37 Prints a tab-delimited set of statistics to stdout. |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
38 To print output column labels, run "$ echo -n | ./allele-counts.py -H". |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
39 The columns are: 1:SAMPLE 2:CHR 3:POS 4:A 5:C 6:G 7:T 8:CVRG 9:ALLELES 10:MAJOR |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
40 11:MINOR 12:MAF 13:BIAS, |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
41 unless the --stranded option is used, in which case they are: |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
42 1:SAMPLE 2:CHR 3:POS 4:+A 5:+C 6:+G 7:+T 8:-A 9:-C 10:-G 11:-T 12:CVRG |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
43 13:ALLELES 14:MAJOR 15:MINOR 16:MAF 17:BIAS. |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
44 """ |
1 | 45 EPILOG = """Requirements: |
46 The input VCF must report the variants for each strand. | |
47 The variants should be case-sensitive (e.g. all capital base letters). | |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
48 Strand bias: Both strands must show the same bases passing the frequency |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
49 threshold (but not necessarily in the same order). If the site fails this test, |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
50 the number of alleles is reported as 0.""" |
1 | 51 |
52 def get_options(defaults, usage, description='', epilog=''): | |
53 """Get options, print usage text.""" | |
54 | |
55 parser = OptionParser(usage=usage, description=description, epilog=epilog) | |
56 | |
57 parser.add_option('-i', '--infile', dest='infile', | |
58 default=defaults.get('infile'), | |
59 help='Read input VCF data from this file instead of stdin.') | |
60 parser.add_option('-o', '--outfile', dest='outfile', | |
61 default=defaults.get('outfile'), | |
62 help='Print output data to this file instead of stdout.') | |
63 parser.add_option('-f', '--freq-thres', dest='freq_thres', type='float', | |
64 default=defaults.get('freq_thres'), | |
5 | 65 help=('Frequency threshold for counting alleles, given in percentage: -f 1 ' |
66 +'= 1% frequency. Default is %default%.')) | |
1 | 67 parser.add_option('-c', '--covg-thres', dest='covg_thres', type='int', |
68 default=defaults.get('covg_thres'), | |
5 | 69 help=('Coverage threshold. Each site must be supported by at least this ' |
70 +'many reads on each strand. Otherwise the site will not be printed in ' | |
71 +'the output. The default is %default reads per strand.')) | |
72 parser.add_option('-n', '--no-filter', dest='no_filter', action='store_const', | |
73 const=not(defaults.get('no_filter')), default=defaults.get('no_filter'), | |
74 help=('Operate without a frequency threshold or coverage threshold. ' | |
75 +'Equivalent to "-c 0 -f 0".')) | |
1 | 76 parser.add_option('-H', '--header', dest='print_header', action='store_const', |
77 const=not(defaults.get('print_header')), default=defaults.get('print_header'), | |
5 | 78 help=('Print header line. This is a #-commented line with the column ' |
79 +'labels. Off by default.')) | |
80 parser.add_option('-s', '--stranded', dest='stranded', action='store_const', | |
81 const=not(defaults.get('stranded')), default=defaults.get('stranded'), | |
82 help='Report variant counts by strand, in separate columns. Off by default.') | |
83 parser.add_option('-r', '--rand-seed', dest='seed', | |
84 default=defaults.get('seed'), help=('Seed for random number generator.')) | |
85 parser.add_option('-d', '--debug', dest='debug_loc', | |
86 default=defaults.get('debug_loc'), | |
87 help=('Turn on debug mode and specify a single site to process using UCSC ' | |
88 +'coordinate format. You can also append a sample ID after another ":" ' | |
89 +'to restrict it further.')) | |
1 | 90 |
91 (options, args) = parser.parse_args() | |
92 | |
5 | 93 return (options, args) |
1 | 94 |
95 | |
96 def main(): | |
97 | |
98 (options, args) = get_options(OPT_DEFAULTS, USAGE, DESCRIPTION, EPILOG) | |
99 | |
100 infile = options.infile | |
101 outfile = options.outfile | |
102 print_header = options.print_header | |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
103 freq_thres = options.freq_thres / 100 |
1 | 104 covg_thres = options.covg_thres |
5 | 105 stranded = options.stranded |
106 debug_loc = options.debug_loc | |
107 seed = options.seed | |
108 | |
109 if options.no_filter: | |
110 freq_thres = 0 | |
111 covg_thres = 0 | |
1 | 112 |
5 | 113 if seed: |
114 random.seed(seed) | |
115 | |
116 debug = False | |
117 print_sample = '' | |
118 if debug_loc: | |
119 debug = True | |
120 coords = debug_loc.split(':') | |
121 print_chr = coords[0] | |
122 print_pos = '' | |
123 if len(coords) > 1: print_pos = coords[1] | |
124 if len(coords) > 2: print_sample = coords[2] | |
1 | 125 |
126 # set infile_handle to either stdin or the input file | |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
127 global infile_handle |
1 | 128 if infile == OPT_DEFAULTS.get('infile'): |
129 infile_handle = sys.stdin | |
130 sys.stderr.write("Reading from standard input..\n") | |
131 else: | |
132 if os.path.exists(infile): | |
133 infile_handle = open(infile, 'r') | |
134 else: | |
135 fail('Error: Input VCF file '+infile+' not found.') | |
136 | |
137 # set outfile_handle to either stdout or the output file | |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
138 global outfile_handle |
1 | 139 if outfile == OPT_DEFAULTS.get('outfile'): |
140 outfile_handle = sys.stdout | |
141 else: | |
2 | 142 try: |
1 | 143 outfile_handle = open(outfile, 'w') |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
144 except IOError: |
2 | 145 fail('Error: The given output filename '+outfile+' could not be opened.') |
1 | 146 |
5 | 147 # Take care of column names, print header |
2 | 148 if len(COLUMNS) != len(COLUMN_LABELS): |
5 | 149 fail('Error: Internal column names list do not match column labels list.') |
150 if stranded: | |
151 COLUMNS[3:7] = ['+A', '+C', '+G', '+T', '-A', '-C', '-G', '-T'] | |
152 COLUMN_LABELS[3:7] = ['+A', '+C', '+G', '+T', '-A', '-C', '-G', '-T'] | |
1 | 153 if print_header: |
2 | 154 outfile_handle.write('#'+'\t'.join(COLUMN_LABELS)+"\n") |
1 | 155 |
5 | 156 # main loop |
157 # each iteration processes one VCF line and prints one or more output lines | |
158 # one VCF line = one site, one or more samples | |
159 # one output line = one site, one sample | |
1 | 160 sample_names = [] |
161 for line in infile_handle: | |
162 line = line.rstrip('\r\n') | |
163 | |
164 # header lines | |
165 if line[0] == '#': | |
166 if line[0:6].upper() == '#CHROM': | |
167 sample_names = line.split('\t')[9:] | |
168 continue | |
169 | |
170 if not sample_names: | |
171 fail("Error in input VCF: Data line encountered before header line. " | |
172 +"Failed on line:\n"+line) | |
173 | |
174 site_data = read_site(line, sample_names, CANONICAL_VARIANTS) | |
175 | |
176 if debug: | |
5 | 177 if print_pos != site_data['pos']: |
178 continue | |
179 if print_chr != site_data['chr'] and print_chr != '': | |
1 | 180 continue |
5 | 181 if print_sample != '': |
182 for sample in site_data['samples'].keys(): | |
183 if sample.lower() != print_sample.lower(): | |
184 site_data['samples'].pop(sample, None) | |
185 if len(site_data['samples']) == 0: | |
186 sys.stderr.write("Error: Sample '"+print_sample+"' not found.\n") | |
187 sys.exit(1) | |
188 | |
1 | 189 |
190 site_summary = summarize_site(site_data, sample_names, CANONICAL_VARIANTS, | |
5 | 191 freq_thres, covg_thres, stranded, debug=debug) |
1 | 192 |
193 if debug and site_summary[0]['print']: | |
5 | 194 print line.split('\t')[9].split(':')[-1] |
1 | 195 |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
196 try: |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
197 print_site(outfile_handle, site_summary, COLUMNS) |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
198 except IOError as ioe: |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
199 if ioe.errno == errno.EPIPE: |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
200 cleanup() |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
201 sys.exit(0) |
1 | 202 |
203 # close any open filehandles | |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
204 cleanup() |
1 | 205 |
206 # keeps Galaxy from giving an error if there were messages on stderr | |
207 sys.exit(0) | |
208 | |
209 | |
210 | |
211 def read_site(line, sample_names, canonical): | |
212 """Read in a line, parse the variants into a data structure, and return it. | |
213 The line should be actual site data, not a header line, so check beforehand. | |
5 | 214 Only the variants in 'canonical' will be read; all others are ignored. |
215 Note: the line is assumed to have been chomped. | |
216 The returned data is stored in a dict, with the following structure: | |
217 { | |
218 'chr': 'chr1', | |
219 'pos': '2617', | |
220 'samples': { | |
221 'THYROID': { | |
222 '+A': 32, | |
223 '-A': 45, | |
224 '-G': 1, | |
225 }, | |
226 'BLOOD': { | |
227 '+A': 2, | |
228 '-C': 1, | |
229 '+G': 37, | |
230 '-G': 42, | |
231 }, | |
232 }, | |
233 } | |
234 """ | |
1 | 235 |
236 site = {} | |
237 fields = line.split('\t') | |
238 | |
239 if len(fields) < 9: | |
240 fail("Error in input VCF: wrong number of fields in data line. " | |
241 +"Failed on line:\n"+line) | |
242 | |
243 site['chr'] = fields[0] | |
244 site['pos'] = fields[1] | |
245 samples = fields[9:] | |
246 | |
247 if len(samples) < len(sample_names): | |
248 fail("Error in input VCF: missing sample fields in data line. " | |
249 +"Failed on line:\n"+line) | |
250 elif len(samples) > len(sample_names): | |
251 fail("Error in input VCF: more sample fields in data line than in header. " | |
252 +"Failed on line:\n"+line) | |
253 | |
254 sample_counts = {} | |
255 for i in range(len(samples)): | |
256 | |
257 variant_counts = {} | |
258 counts = samples[i].split(':')[-1] | |
259 counts = counts.split(',') | |
260 | |
261 for count in counts: | |
262 if not count: | |
263 continue | |
264 fields = count.split('=') | |
265 if len(fields) != 2: | |
266 fail("Error in input VCF: Incorrect variant data format (must contain " | |
267 +"a single '='). Failed on line:\n"+line) | |
268 (variant, reads) = fields | |
269 if variant[1:] not in canonical: | |
270 continue | |
271 if variant[0] != '-' and variant[0] != '+': | |
272 fail("Error in input VCF: variant data not strand-specific. " | |
273 +"Failed on line:\n"+line) | |
274 try: | |
2 | 275 variant_counts[variant] = int(float(reads)) |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
276 except ValueError: |
2 | 277 fail("Error in input VCF: Variant count not a valid number. Failed on variant count string '"+reads+"'\nIn the following line:\n"+line) |
1 | 278 |
279 sample_counts[sample_names[i]] = variant_counts | |
280 | |
281 site['samples'] = sample_counts | |
282 | |
283 return site | |
284 | |
285 | |
286 def summarize_site(site, sample_names, canonical, freq_thres, covg_thres, | |
5 | 287 stranded=False, debug=False): |
1 | 288 """Take the raw data from the VCF line and transform it into the summary data |
289 to be printed in the output format.""" | |
290 | |
291 site_summary = [] | |
292 for sample_name in sample_names: | |
293 | |
294 sample = {'print':False} | |
295 variants = site['samples'].get(sample_name) | |
296 | |
297 sample['sample'] = sample_name | |
298 sample['chr'] = site['chr'] | |
299 sample['pos'] = site['pos'] | |
300 | |
301 coverage = sum(variants.values()) | |
302 | |
303 # get stranded coverage | |
304 covg_plus = 0 | |
305 covg_minus = 0 | |
306 for variant in variants: | |
307 if variant[0] == '+': | |
308 covg_plus += variants[variant] | |
309 elif variant[0] == '-': | |
310 covg_minus += variants[variant] | |
311 # stranded coverage threshold | |
5 | 312 if covg_plus < covg_thres or covg_minus < covg_thres: |
1 | 313 site_summary.append(sample) |
314 continue | |
315 else: | |
316 sample['print'] = True | |
317 | |
5 | 318 # get an ordered list of read counts for all variants (both strands) |
319 bases = get_read_counts(variants, '+-') | |
320 ranked_bases = process_read_counts(bases, sort=True, debug=debug) | |
321 | |
322 # prepare stranded or unstranded lists of base counts | |
323 base_count_lists = [] | |
324 if stranded: | |
325 strands = ['+', '-'] | |
326 base_count_lists.append(get_read_counts(variants, '+')) | |
327 base_count_lists.append(get_read_counts(variants, '-')) | |
328 else: | |
329 strands = [''] | |
330 base_count_lists.append(ranked_bases) | |
1 | 331 |
5 | 332 # record read counts into output dict |
333 # If stranded, this will loop twice, once for each strand, and prepend '+' | |
334 # or '-' to the base name. If not stranded, it will loop once, and prepend | |
335 # nothing (''). | |
336 for (strand, base_count_list) in zip(strands, base_count_lists): | |
337 for base_count in base_count_list: | |
338 sample[strand+base_count[0]] = base_count[1] | |
339 # fill in any zeros | |
340 for base in canonical: | |
341 if not sample.has_key(strand+base): | |
342 sample[strand+base] = 0 | |
1 | 343 |
5 | 344 sample['alleles'] = count_alleles(variants, freq_thres, debug=debug) |
1 | 345 |
5 | 346 # If there's a tie for 2nd, randomly choose one to be 2nd |
1 | 347 if len(ranked_bases) >= 3 and ranked_bases[1][1] == ranked_bases[2][1]: |
5 | 348 swap = random.choice([True,False]) |
349 if swap: | |
350 tmp_base = ranked_bases[1] | |
351 ranked_bases[1] = ranked_bases[2] | |
352 ranked_bases[2] = tmp_base | |
1 | 353 |
5 | 354 if debug: print "ranked +-: "+str(ranked_bases) |
1 | 355 |
356 sample['coverage'] = coverage | |
357 try: | |
358 sample['major'] = ranked_bases[0][0] | |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
359 except IndexError: |
1 | 360 sample['major'] = '.' |
361 try: | |
362 sample['minor'] = ranked_bases[1][0] | |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
363 sample['freq'] = round(ranked_bases[1][1]/coverage, 5) |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
364 except IndexError: |
1 | 365 sample['minor'] = '.' |
366 sample['freq'] = 0.0 | |
367 | |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
368 # Now that we've decided major and minor, we can calculate strand bias |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
369 bias = get_strand_bias(sample, variants) |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
370 if bias is None: |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
371 sample['bias'] = '.' |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
372 else: |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
373 sample['bias'] = round(bias, 5) |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
374 |
1 | 375 site_summary.append(sample) |
376 | |
377 return site_summary | |
378 | |
379 | |
5 | 380 def get_read_counts(stranded_counts, strands='+-'): |
381 """Do a simple sum of the read counts per variant, on the specified strands. | |
382 Arguments: | |
383 stranded_counts: Dict of the stranded variants (keys) and their read counts | |
384 (values). | |
385 strands: Which strand(s) to count. Can be '+', '-', or '+-' for both (default). | |
386 Return value: | |
387 summed_counts: A list of the alleles and their read counts. The elements are | |
388 tuples (variant, read count).""" | |
389 | |
390 variants = stranded_counts.keys() | |
391 | |
392 summed_counts = {} | |
393 for variant in variants: | |
394 strand = variant[0] | |
395 base = variant[1:] | |
396 if strand in strands: | |
397 summed_counts[base] = stranded_counts[variant] + summed_counts.get(base, 0) | |
398 | |
399 return summed_counts.items() | |
1 | 400 |
401 | |
5 | 402 def process_read_counts(variant_counts, freq_thres=0, sort=False, debug=False): |
403 """Process a list of read counts by frequency filtering and/or sorting. | |
1 | 404 Arguments: |
5 | 405 variant_counts: List of the non-stranded variants and their read counts. The |
406 elements are tuples (variant, read count). | |
1 | 407 freq_thres: The frequency threshold each allele needs to pass to be included. |
5 | 408 sort: Whether to sort the list in descending order of read counts. |
1 | 409 Return value: |
5 | 410 variant_counts: A list of the processed alleles and their read counts. The |
411 elements are tuples (variant, read count).""" | |
1 | 412 |
413 # get coverage for the specified strands | |
414 coverage = 0 | |
415 for variant in variant_counts: | |
5 | 416 coverage += variant[1] |
1 | 417 |
5 | 418 if coverage <= 0: |
1 | 419 return [] |
420 | |
421 # sort the list of alleles by read count | |
5 | 422 if sort: |
423 variant_counts.sort(reverse=True, key=lambda variant: variant[1]) | |
1 | 424 |
425 if debug: | |
5 | 426 print 'coverage: '+str(coverage)+', freq_thres: '+str(freq_thres) |
427 for variant in variant_counts: | |
428 print (variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '+ | |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
429 str(variant[1]/coverage)) |
1 | 430 |
431 # remove bases below the frequency threshold | |
5 | 432 if freq_thres > 0: |
433 variant_counts = [variant for variant in variant_counts | |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
434 if variant[1]/coverage >= freq_thres] |
1 | 435 |
5 | 436 return variant_counts |
1 | 437 |
438 | |
439 def count_alleles(variant_counts, freq_thres, debug=False): | |
440 """Determine how many alleles to report, based on filtering rules. | |
441 The current rule determines which bases pass the frequency threshold on each | |
442 strand individually, then compares the two sets of bases. If they are the same | |
443 (regardless of order), the allele count is the number of bases. Otherwise it | |
444 is zero.""" | |
445 allele_count = 0 | |
446 | |
5 | 447 alleles_plus = get_read_counts(variant_counts, '+') |
448 alleles_plus = process_read_counts(alleles_plus, freq_thres=freq_thres, | |
449 sort=False, debug=debug) | |
450 alleles_minus = get_read_counts(variant_counts, '-') | |
451 alleles_minus = process_read_counts(alleles_minus, freq_thres=freq_thres, | |
452 sort=False, debug=debug) | |
1 | 453 |
454 if debug: | |
455 print '+ '+str(alleles_plus) | |
456 print '- '+str(alleles_minus) | |
457 | |
5 | 458 # Check if each strand reports the same set of alleles. |
459 # Sorting by base is to compare lists without regard to order (as sets). | |
1 | 460 alleles_plus_sorted = sorted([base[0] for base in alleles_plus if base[1]]) |
461 alleles_minus_sorted = sorted([base[0] for base in alleles_minus if base[1]]) | |
462 if alleles_plus_sorted == alleles_minus_sorted: | |
463 allele_count = len(alleles_plus) | |
464 | |
465 return allele_count | |
466 | |
467 | |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
468 def get_strand_bias(sample, variants): |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
469 """Based on method 1 (SB) of Guo et al., 2012 |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
470 If there a denominator would be 0, there is no valid result and this will |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
471 return None. This occurs when there are no reads on one of the strands, or |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
472 when there are no minor allele reads.""" |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
473 # using same variable names as in paper |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
474 a = variants.get('+'+sample['major'], 0) # forward major allele |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
475 b = variants.get('+'+sample['minor'], 0) # forward minor allele |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
476 c = variants.get('-'+sample['major'], 0) # reverse major allele |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
477 d = variants.get('-'+sample['minor'], 0) # reverse minor allele |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
478 # no minor allele reads |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
479 try: |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
480 return abs(b/(a+b) - d/(c+d)) / ((b+d) / (a+b+c+d)) |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
481 except ZeroDivisionError: |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
482 return None |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
483 |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
484 |
5 | 485 def print_site(filehandle, site, columns): |
486 """Print the output lines for one site (one per sample). | |
487 filehandle must be open.""" | |
488 for sample in site: | |
489 if sample['print']: | |
490 fields = [str(sample.get(column)) for column in columns] | |
491 filehandle.write('\t'.join(fields)+"\n") | |
492 | |
493 | |
1 | 494 def fail(message): |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
495 cleanup() |
1 | 496 sys.stderr.write(message+'\n') |
497 sys.exit(1) | |
498 | |
5 | 499 |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
500 def cleanup(): |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
501 if isinstance(infile_handle, file): |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
502 infile_handle.close() |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
503 if isinstance(outfile_handle, file): |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
504 outfile_handle.close() |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
505 |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
506 |
1 | 507 if __name__ == "__main__": |
508 main() |