Mercurial > repos > nick > allele_counts
annotate allele-counts.py @ 9:6cc488e11544 draft
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
author | nick |
---|---|
date | Tue, 31 Mar 2020 05:09:12 -0400 |
parents | 411adeff1eec |
children |
rev | line source |
---|---|
9
6cc488e11544
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
8
diff
changeset
|
1 #!/usr/bin/python3 |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
2 """ |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
3 Run with -h option or see DESCRIPTION for description. |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
4 This script's functionality is being obsoleted by the new, and much more sanely |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
5 written, nvc-filter.py. |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
6 |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
7 New in this version: |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
8 - Calculate strand bias |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
9 - Rename MINOR.FREQ.PERC to MAF |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
10 |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
11 Naive Variant Caller variant count parsing one-liner: |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
12 $ cat variants.vcf | grep -v '^#' | cut -f 10 | cut -d ':' -f 4 | tr ',=' '\t:' |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
13 """ |
1 | 14 import os |
15 import sys | |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
16 import errno |
5 | 17 import random |
1 | 18 from optparse import OptionParser |
19 | |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
20 COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles', |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
21 'major', 'minor', 'freq', 'bias'] |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
22 COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
23 'MAJOR', 'MINOR', 'MAF', 'BIAS'] |
1 | 24 CANONICAL_VARIANTS = ['A', 'C', 'G', 'T'] |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
25 USAGE = """Usage: %prog [options] -i variants.vcf -o alleles.tsv |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
26 cat variants.vcf | %prog [options] > alleles.tsv""" |
1 | 27 OPT_DEFAULTS = {'infile':'-', 'outfile':'-', 'freq_thres':1.0, 'covg_thres':100, |
5 | 28 'print_header':False, 'stdin':False, 'stranded':False, 'no_filter':False, |
29 'debug_loc':'', 'seed':''} | |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
30 DESCRIPTION = """This will parse the VCF output of the "Naive Variant Caller" |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
31 (aka "BAM Coverage") Galaxy tool. For each position reported, it counts the |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
32 number of reads of each base, determines the major allele, minor allele (second |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
33 most frequent variant), and number of alleles above a threshold. So currently |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
34 it only considers SNVs (ACGT), including in the coverage figure. By default it |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
35 reads from stdin and prints to stdout. |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
36 Prints a tab-delimited set of statistics to stdout. |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
37 To print output column labels, run "$ echo -n | ./allele-counts.py -H". |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
38 The columns are: 1:SAMPLE 2:CHR 3:POS 4:A 5:C 6:G 7:T 8:CVRG 9:ALLELES 10:MAJOR |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
39 11:MINOR 12:MAF 13:BIAS, |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
40 unless the --stranded option is used, in which case they are: |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
41 1:SAMPLE 2:CHR 3:POS 4:+A 5:+C 6:+G 7:+T 8:-A 9:-C 10:-G 11:-T 12:CVRG |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
42 13:ALLELES 14:MAJOR 15:MINOR 16:MAF 17:BIAS. |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
43 """ |
1 | 44 EPILOG = """Requirements: |
45 The input VCF must report the variants for each strand. | |
46 The variants should be case-sensitive (e.g. all capital base letters). | |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
47 Strand bias: Both strands must show the same bases passing the frequency |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
48 threshold (but not necessarily in the same order). If the site fails this test, |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
49 the number of alleles is reported as 0.""" |
1 | 50 |
9
6cc488e11544
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
8
diff
changeset
|
51 |
1 | 52 def get_options(defaults, usage, description='', epilog=''): |
53 """Get options, print usage text.""" | |
54 | |
55 parser = OptionParser(usage=usage, description=description, epilog=epilog) | |
56 | |
57 parser.add_option('-i', '--infile', dest='infile', | |
58 default=defaults.get('infile'), | |
59 help='Read input VCF data from this file instead of stdin.') | |
60 parser.add_option('-o', '--outfile', dest='outfile', | |
61 default=defaults.get('outfile'), | |
62 help='Print output data to this file instead of stdout.') | |
63 parser.add_option('-f', '--freq-thres', dest='freq_thres', type='float', | |
64 default=defaults.get('freq_thres'), | |
5 | 65 help=('Frequency threshold for counting alleles, given in percentage: -f 1 ' |
66 +'= 1% frequency. Default is %default%.')) | |
1 | 67 parser.add_option('-c', '--covg-thres', dest='covg_thres', type='int', |
68 default=defaults.get('covg_thres'), | |
5 | 69 help=('Coverage threshold. Each site must be supported by at least this ' |
70 +'many reads on each strand. Otherwise the site will not be printed in ' | |
71 +'the output. The default is %default reads per strand.')) | |
72 parser.add_option('-n', '--no-filter', dest='no_filter', action='store_const', | |
73 const=not(defaults.get('no_filter')), default=defaults.get('no_filter'), | |
74 help=('Operate without a frequency threshold or coverage threshold. ' | |
75 +'Equivalent to "-c 0 -f 0".')) | |
1 | 76 parser.add_option('-H', '--header', dest='print_header', action='store_const', |
77 const=not(defaults.get('print_header')), default=defaults.get('print_header'), | |
5 | 78 help=('Print header line. This is a #-commented line with the column ' |
79 +'labels. Off by default.')) | |
80 parser.add_option('-s', '--stranded', dest='stranded', action='store_const', | |
81 const=not(defaults.get('stranded')), default=defaults.get('stranded'), | |
82 help='Report variant counts by strand, in separate columns. Off by default.') | |
83 parser.add_option('-r', '--rand-seed', dest='seed', | |
84 default=defaults.get('seed'), help=('Seed for random number generator.')) | |
85 parser.add_option('-d', '--debug', dest='debug_loc', | |
86 default=defaults.get('debug_loc'), | |
87 help=('Turn on debug mode and specify a single site to process using UCSC ' | |
88 +'coordinate format. You can also append a sample ID after another ":" ' | |
89 +'to restrict it further.')) | |
1 | 90 |
91 (options, args) = parser.parse_args() | |
92 | |
5 | 93 return (options, args) |
1 | 94 |
95 | |
96 def main(): | |
97 | |
98 (options, args) = get_options(OPT_DEFAULTS, USAGE, DESCRIPTION, EPILOG) | |
99 | |
100 infile = options.infile | |
101 outfile = options.outfile | |
102 print_header = options.print_header | |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
103 freq_thres = options.freq_thres / 100 |
1 | 104 covg_thres = options.covg_thres |
5 | 105 stranded = options.stranded |
106 debug_loc = options.debug_loc | |
107 seed = options.seed | |
108 | |
109 if options.no_filter: | |
110 freq_thres = 0 | |
111 covg_thres = 0 | |
1 | 112 |
5 | 113 if seed: |
114 random.seed(seed) | |
115 | |
116 debug = False | |
117 print_sample = '' | |
118 if debug_loc: | |
119 debug = True | |
120 coords = debug_loc.split(':') | |
121 print_chr = coords[0] | |
122 print_pos = '' | |
123 if len(coords) > 1: print_pos = coords[1] | |
124 if len(coords) > 2: print_sample = coords[2] | |
1 | 125 |
126 # set infile_handle to either stdin or the input file | |
127 if infile == OPT_DEFAULTS.get('infile'): | |
128 infile_handle = sys.stdin | |
129 sys.stderr.write("Reading from standard input..\n") | |
130 else: | |
131 if os.path.exists(infile): | |
132 infile_handle = open(infile, 'r') | |
133 else: | |
134 fail('Error: Input VCF file '+infile+' not found.') | |
135 | |
136 # set outfile_handle to either stdout or the output file | |
137 if outfile == OPT_DEFAULTS.get('outfile'): | |
138 outfile_handle = sys.stdout | |
139 else: | |
2 | 140 try: |
1 | 141 outfile_handle = open(outfile, 'w') |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
142 except IOError: |
2 | 143 fail('Error: The given output filename '+outfile+' could not be opened.') |
1 | 144 |
5 | 145 # Take care of column names, print header |
2 | 146 if len(COLUMNS) != len(COLUMN_LABELS): |
5 | 147 fail('Error: Internal column names list do not match column labels list.') |
148 if stranded: | |
149 COLUMNS[3:7] = ['+A', '+C', '+G', '+T', '-A', '-C', '-G', '-T'] | |
150 COLUMN_LABELS[3:7] = ['+A', '+C', '+G', '+T', '-A', '-C', '-G', '-T'] | |
1 | 151 if print_header: |
2 | 152 outfile_handle.write('#'+'\t'.join(COLUMN_LABELS)+"\n") |
1 | 153 |
5 | 154 # main loop |
155 # each iteration processes one VCF line and prints one or more output lines | |
156 # one VCF line = one site, one or more samples | |
157 # one output line = one site, one sample | |
1 | 158 sample_names = [] |
159 for line in infile_handle: | |
160 line = line.rstrip('\r\n') | |
161 | |
162 # header lines | |
163 if line[0] == '#': | |
164 if line[0:6].upper() == '#CHROM': | |
165 sample_names = line.split('\t')[9:] | |
166 continue | |
167 | |
168 if not sample_names: | |
169 fail("Error in input VCF: Data line encountered before header line. " | |
170 +"Failed on line:\n"+line) | |
171 | |
172 site_data = read_site(line, sample_names, CANONICAL_VARIANTS) | |
173 | |
174 if debug: | |
5 | 175 if print_pos != site_data['pos']: |
176 continue | |
177 if print_chr != site_data['chr'] and print_chr != '': | |
1 | 178 continue |
5 | 179 if print_sample != '': |
180 for sample in site_data['samples'].keys(): | |
181 if sample.lower() != print_sample.lower(): | |
182 site_data['samples'].pop(sample, None) | |
183 if len(site_data['samples']) == 0: | |
184 sys.stderr.write("Error: Sample '"+print_sample+"' not found.\n") | |
185 sys.exit(1) | |
186 | |
1 | 187 site_summary = summarize_site(site_data, sample_names, CANONICAL_VARIANTS, |
5 | 188 freq_thres, covg_thres, stranded, debug=debug) |
1 | 189 |
190 if debug and site_summary[0]['print']: | |
9
6cc488e11544
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
8
diff
changeset
|
191 print(line.split('\t')[9].split(':')[-1]) |
1 | 192 |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
193 try: |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
194 print_site(outfile_handle, site_summary, COLUMNS) |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
195 except IOError as ioe: |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
196 if ioe.errno == errno.EPIPE: |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
197 sys.exit(0) |
1 | 198 |
199 # keeps Galaxy from giving an error if there were messages on stderr | |
200 sys.exit(0) | |
201 | |
202 | |
203 | |
204 def read_site(line, sample_names, canonical): | |
205 """Read in a line, parse the variants into a data structure, and return it. | |
206 The line should be actual site data, not a header line, so check beforehand. | |
5 | 207 Only the variants in 'canonical' will be read; all others are ignored. |
208 Note: the line is assumed to have been chomped. | |
209 The returned data is stored in a dict, with the following structure: | |
210 { | |
211 'chr': 'chr1', | |
212 'pos': '2617', | |
213 'samples': { | |
214 'THYROID': { | |
215 '+A': 32, | |
216 '-A': 45, | |
217 '-G': 1, | |
218 }, | |
219 'BLOOD': { | |
220 '+A': 2, | |
221 '-C': 1, | |
222 '+G': 37, | |
223 '-G': 42, | |
224 }, | |
225 }, | |
226 } | |
227 """ | |
1 | 228 |
229 site = {} | |
230 fields = line.split('\t') | |
231 | |
232 if len(fields) < 9: | |
233 fail("Error in input VCF: wrong number of fields in data line. " | |
8
411adeff1eec
Handle "." sample columns, update tests to work with BIAS column.
nick
parents:
6
diff
changeset
|
234 "Failed on line:\n"+line) |
1 | 235 |
236 site['chr'] = fields[0] | |
237 site['pos'] = fields[1] | |
238 samples = fields[9:] | |
239 | |
240 if len(samples) < len(sample_names): | |
241 fail("Error in input VCF: missing sample fields in data line. " | |
8
411adeff1eec
Handle "." sample columns, update tests to work with BIAS column.
nick
parents:
6
diff
changeset
|
242 "Failed on line:\n"+line) |
1 | 243 elif len(samples) > len(sample_names): |
244 fail("Error in input VCF: more sample fields in data line than in header. " | |
8
411adeff1eec
Handle "." sample columns, update tests to work with BIAS column.
nick
parents:
6
diff
changeset
|
245 "Failed on line:\n"+line) |
1 | 246 |
247 sample_counts = {} | |
248 for i in range(len(samples)): | |
8
411adeff1eec
Handle "." sample columns, update tests to work with BIAS column.
nick
parents:
6
diff
changeset
|
249 |
1 | 250 variant_counts = {} |
251 counts = samples[i].split(':')[-1] | |
252 counts = counts.split(',') | |
253 | |
254 for count in counts: | |
8
411adeff1eec
Handle "." sample columns, update tests to work with BIAS column.
nick
parents:
6
diff
changeset
|
255 if not count or count == '.': |
1 | 256 continue |
257 fields = count.split('=') | |
258 if len(fields) != 2: | |
259 fail("Error in input VCF: Incorrect variant data format (must contain " | |
8
411adeff1eec
Handle "." sample columns, update tests to work with BIAS column.
nick
parents:
6
diff
changeset
|
260 "a single '='). Failed on data \"{}\" in line:\n{}" |
411adeff1eec
Handle "." sample columns, update tests to work with BIAS column.
nick
parents:
6
diff
changeset
|
261 .format(count, line)) |
1 | 262 (variant, reads) = fields |
263 if variant[1:] not in canonical: | |
264 continue | |
8
411adeff1eec
Handle "." sample columns, update tests to work with BIAS column.
nick
parents:
6
diff
changeset
|
265 if not variant.startswith('-') and not variant.startswith('+'): |
411adeff1eec
Handle "." sample columns, update tests to work with BIAS column.
nick
parents:
6
diff
changeset
|
266 fail("Error in input VCF: variant data not strand-specific. Failed on " |
411adeff1eec
Handle "." sample columns, update tests to work with BIAS column.
nick
parents:
6
diff
changeset
|
267 "data \"{}\" on line:\n{}".format(variant, line)) |
1 | 268 try: |
2 | 269 variant_counts[variant] = int(float(reads)) |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
270 except ValueError: |
8
411adeff1eec
Handle "." sample columns, update tests to work with BIAS column.
nick
parents:
6
diff
changeset
|
271 fail("Error in input VCF: Variant count not a valid number. Failed on " |
411adeff1eec
Handle "." sample columns, update tests to work with BIAS column.
nick
parents:
6
diff
changeset
|
272 "variant count string \"{}\"\nIn the following line:\n{}" |
411adeff1eec
Handle "." sample columns, update tests to work with BIAS column.
nick
parents:
6
diff
changeset
|
273 .format(reads, line)) |
1 | 274 |
275 sample_counts[sample_names[i]] = variant_counts | |
276 | |
277 site['samples'] = sample_counts | |
278 | |
279 return site | |
280 | |
281 | |
282 def summarize_site(site, sample_names, canonical, freq_thres, covg_thres, | |
5 | 283 stranded=False, debug=False): |
1 | 284 """Take the raw data from the VCF line and transform it into the summary data |
285 to be printed in the output format.""" | |
286 | |
287 site_summary = [] | |
288 for sample_name in sample_names: | |
289 | |
290 sample = {'print':False} | |
291 variants = site['samples'].get(sample_name) | |
292 | |
293 sample['sample'] = sample_name | |
294 sample['chr'] = site['chr'] | |
295 sample['pos'] = site['pos'] | |
296 | |
297 coverage = sum(variants.values()) | |
298 | |
299 # get stranded coverage | |
300 covg_plus = 0 | |
301 covg_minus = 0 | |
302 for variant in variants: | |
303 if variant[0] == '+': | |
304 covg_plus += variants[variant] | |
305 elif variant[0] == '-': | |
306 covg_minus += variants[variant] | |
307 # stranded coverage threshold | |
5 | 308 if covg_plus < covg_thres or covg_minus < covg_thres: |
1 | 309 site_summary.append(sample) |
310 continue | |
311 else: | |
312 sample['print'] = True | |
313 | |
5 | 314 # get an ordered list of read counts for all variants (both strands) |
315 bases = get_read_counts(variants, '+-') | |
316 ranked_bases = process_read_counts(bases, sort=True, debug=debug) | |
317 | |
318 # prepare stranded or unstranded lists of base counts | |
319 base_count_lists = [] | |
320 if stranded: | |
321 strands = ['+', '-'] | |
322 base_count_lists.append(get_read_counts(variants, '+')) | |
323 base_count_lists.append(get_read_counts(variants, '-')) | |
324 else: | |
325 strands = [''] | |
326 base_count_lists.append(ranked_bases) | |
1 | 327 |
5 | 328 # record read counts into output dict |
329 # If stranded, this will loop twice, once for each strand, and prepend '+' | |
330 # or '-' to the base name. If not stranded, it will loop once, and prepend | |
331 # nothing (''). | |
332 for (strand, base_count_list) in zip(strands, base_count_lists): | |
333 for base_count in base_count_list: | |
334 sample[strand+base_count[0]] = base_count[1] | |
335 # fill in any zeros | |
336 for base in canonical: | |
9
6cc488e11544
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
8
diff
changeset
|
337 if strand+base not in sample: |
5 | 338 sample[strand+base] = 0 |
1 | 339 |
5 | 340 sample['alleles'] = count_alleles(variants, freq_thres, debug=debug) |
1 | 341 |
5 | 342 # If there's a tie for 2nd, randomly choose one to be 2nd |
1 | 343 if len(ranked_bases) >= 3 and ranked_bases[1][1] == ranked_bases[2][1]: |
5 | 344 swap = random.choice([True,False]) |
345 if swap: | |
346 tmp_base = ranked_bases[1] | |
347 ranked_bases[1] = ranked_bases[2] | |
348 ranked_bases[2] = tmp_base | |
1 | 349 |
9
6cc488e11544
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
8
diff
changeset
|
350 if debug: print("ranked +-: "+str(ranked_bases)) |
1 | 351 |
352 sample['coverage'] = coverage | |
353 try: | |
354 sample['major'] = ranked_bases[0][0] | |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
355 except IndexError: |
1 | 356 sample['major'] = '.' |
357 try: | |
358 sample['minor'] = ranked_bases[1][0] | |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
359 sample['freq'] = round(ranked_bases[1][1]/coverage, 5) |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
360 except IndexError: |
1 | 361 sample['minor'] = '.' |
362 sample['freq'] = 0.0 | |
363 | |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
364 # Now that we've decided major and minor, we can calculate strand bias |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
365 bias = get_strand_bias(sample, variants) |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
366 if bias is None: |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
367 sample['bias'] = '.' |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
368 else: |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
369 sample['bias'] = round(bias, 5) |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
370 |
1 | 371 site_summary.append(sample) |
372 | |
373 return site_summary | |
374 | |
375 | |
5 | 376 def get_read_counts(stranded_counts, strands='+-'): |
377 """Do a simple sum of the read counts per variant, on the specified strands. | |
378 Arguments: | |
379 stranded_counts: Dict of the stranded variants (keys) and their read counts | |
380 (values). | |
381 strands: Which strand(s) to count. Can be '+', '-', or '+-' for both (default). | |
382 Return value: | |
383 summed_counts: A list of the alleles and their read counts. The elements are | |
384 tuples (variant, read count).""" | |
385 | |
386 variants = stranded_counts.keys() | |
387 | |
388 summed_counts = {} | |
389 for variant in variants: | |
390 strand = variant[0] | |
391 base = variant[1:] | |
392 if strand in strands: | |
393 summed_counts[base] = stranded_counts[variant] + summed_counts.get(base, 0) | |
394 | |
9
6cc488e11544
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
8
diff
changeset
|
395 return list(summed_counts.items()) |
1 | 396 |
397 | |
5 | 398 def process_read_counts(variant_counts, freq_thres=0, sort=False, debug=False): |
399 """Process a list of read counts by frequency filtering and/or sorting. | |
1 | 400 Arguments: |
5 | 401 variant_counts: List of the non-stranded variants and their read counts. The |
402 elements are tuples (variant, read count). | |
1 | 403 freq_thres: The frequency threshold each allele needs to pass to be included. |
5 | 404 sort: Whether to sort the list in descending order of read counts. |
1 | 405 Return value: |
5 | 406 variant_counts: A list of the processed alleles and their read counts. The |
407 elements are tuples (variant, read count).""" | |
1 | 408 |
409 # get coverage for the specified strands | |
410 coverage = 0 | |
411 for variant in variant_counts: | |
5 | 412 coverage += variant[1] |
1 | 413 |
5 | 414 if coverage <= 0: |
1 | 415 return [] |
416 | |
417 # sort the list of alleles by read count | |
5 | 418 if sort: |
419 variant_counts.sort(reverse=True, key=lambda variant: variant[1]) | |
1 | 420 |
421 if debug: | |
9
6cc488e11544
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
8
diff
changeset
|
422 print('coverage: '+str(coverage)+', freq_thres: '+str(freq_thres)) |
5 | 423 for variant in variant_counts: |
9
6cc488e11544
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
8
diff
changeset
|
424 print((variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '+ |
6cc488e11544
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
8
diff
changeset
|
425 str(variant[1]/coverage))) |
1 | 426 |
427 # remove bases below the frequency threshold | |
5 | 428 if freq_thres > 0: |
429 variant_counts = [variant for variant in variant_counts | |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
430 if variant[1]/coverage >= freq_thres] |
1 | 431 |
5 | 432 return variant_counts |
1 | 433 |
434 | |
435 def count_alleles(variant_counts, freq_thres, debug=False): | |
436 """Determine how many alleles to report, based on filtering rules. | |
437 The current rule determines which bases pass the frequency threshold on each | |
438 strand individually, then compares the two sets of bases. If they are the same | |
439 (regardless of order), the allele count is the number of bases. Otherwise it | |
440 is zero.""" | |
441 allele_count = 0 | |
442 | |
5 | 443 alleles_plus = get_read_counts(variant_counts, '+') |
444 alleles_plus = process_read_counts(alleles_plus, freq_thres=freq_thres, | |
445 sort=False, debug=debug) | |
446 alleles_minus = get_read_counts(variant_counts, '-') | |
447 alleles_minus = process_read_counts(alleles_minus, freq_thres=freq_thres, | |
448 sort=False, debug=debug) | |
1 | 449 |
450 if debug: | |
9
6cc488e11544
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
8
diff
changeset
|
451 print('+ '+str(alleles_plus)) |
6cc488e11544
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
nick
parents:
8
diff
changeset
|
452 print('- '+str(alleles_minus)) |
1 | 453 |
5 | 454 # Check if each strand reports the same set of alleles. |
455 # Sorting by base is to compare lists without regard to order (as sets). | |
1 | 456 alleles_plus_sorted = sorted([base[0] for base in alleles_plus if base[1]]) |
457 alleles_minus_sorted = sorted([base[0] for base in alleles_minus if base[1]]) | |
458 if alleles_plus_sorted == alleles_minus_sorted: | |
459 allele_count = len(alleles_plus) | |
460 | |
461 return allele_count | |
462 | |
463 | |
6
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
464 def get_strand_bias(sample, variants): |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
465 """Based on method 1 (SB) of Guo et al., 2012 |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
466 If there a denominator would be 0, there is no valid result and this will |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
467 return None. This occurs when there are no reads on one of the strands, or |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
468 when there are no minor allele reads.""" |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
469 # using same variable names as in paper |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
470 a = variants.get('+'+sample['major'], 0) # forward major allele |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
471 b = variants.get('+'+sample['minor'], 0) # forward minor allele |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
472 c = variants.get('-'+sample['major'], 0) # reverse major allele |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
473 d = variants.get('-'+sample['minor'], 0) # reverse minor allele |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
474 # no minor allele reads |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
475 try: |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
476 return abs(b/(a+b) - d/(c+d)) / ((b+d) / (a+b+c+d)) |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
477 except ZeroDivisionError: |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
478 return None |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
479 |
df3b28364cd2
allele-counts.{py,xml}: Add strand bias, documentation updates.
nicksto <nmapsy@gmail.com>
parents:
5
diff
changeset
|
480 |
5 | 481 def print_site(filehandle, site, columns): |
482 """Print the output lines for one site (one per sample). | |
483 filehandle must be open.""" | |
484 for sample in site: | |
485 if sample['print']: | |
486 fields = [str(sample.get(column)) for column in columns] | |
487 filehandle.write('\t'.join(fields)+"\n") | |
488 | |
489 | |
1 | 490 def fail(message): |
491 sys.stderr.write(message+'\n') | |
492 sys.exit(1) | |
493 | |
5 | 494 |
1 | 495 if __name__ == "__main__": |
496 main() |