Mercurial > repos > nick > allele_counts
comparison allele-counts.py @ 9:6cc488e11544 draft
"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
author | nick |
---|---|
date | Tue, 31 Mar 2020 05:09:12 -0400 |
parents | 411adeff1eec |
children |
comparison
equal
deleted
inserted
replaced
8:411adeff1eec | 9:6cc488e11544 |
---|---|
1 #!/usr/bin/python | 1 #!/usr/bin/python3 |
2 """ | 2 """ |
3 Run with -h option or see DESCRIPTION for description. | 3 Run with -h option or see DESCRIPTION for description. |
4 This script's functionality is being obsoleted by the new, and much more sanely | 4 This script's functionality is being obsoleted by the new, and much more sanely |
5 written, nvc-filter.py. | 5 written, nvc-filter.py. |
6 | 6 |
9 - Rename MINOR.FREQ.PERC to MAF | 9 - Rename MINOR.FREQ.PERC to MAF |
10 | 10 |
11 Naive Variant Caller variant count parsing one-liner: | 11 Naive Variant Caller variant count parsing one-liner: |
12 $ cat variants.vcf | grep -v '^#' | cut -f 10 | cut -d ':' -f 4 | tr ',=' '\t:' | 12 $ cat variants.vcf | grep -v '^#' | cut -f 10 | cut -d ':' -f 4 | tr ',=' '\t:' |
13 """ | 13 """ |
14 from __future__ import division | |
15 import os | 14 import os |
16 import sys | 15 import sys |
17 import errno | 16 import errno |
18 import random | 17 import random |
19 from optparse import OptionParser | 18 from optparse import OptionParser |
46 The input VCF must report the variants for each strand. | 45 The input VCF must report the variants for each strand. |
47 The variants should be case-sensitive (e.g. all capital base letters). | 46 The variants should be case-sensitive (e.g. all capital base letters). |
48 Strand bias: Both strands must show the same bases passing the frequency | 47 Strand bias: Both strands must show the same bases passing the frequency |
49 threshold (but not necessarily in the same order). If the site fails this test, | 48 threshold (but not necessarily in the same order). If the site fails this test, |
50 the number of alleles is reported as 0.""" | 49 the number of alleles is reported as 0.""" |
50 | |
51 | 51 |
52 def get_options(defaults, usage, description='', epilog=''): | 52 def get_options(defaults, usage, description='', epilog=''): |
53 """Get options, print usage text.""" | 53 """Get options, print usage text.""" |
54 | 54 |
55 parser = OptionParser(usage=usage, description=description, epilog=epilog) | 55 parser = OptionParser(usage=usage, description=description, epilog=epilog) |
122 print_pos = '' | 122 print_pos = '' |
123 if len(coords) > 1: print_pos = coords[1] | 123 if len(coords) > 1: print_pos = coords[1] |
124 if len(coords) > 2: print_sample = coords[2] | 124 if len(coords) > 2: print_sample = coords[2] |
125 | 125 |
126 # set infile_handle to either stdin or the input file | 126 # set infile_handle to either stdin or the input file |
127 global infile_handle | |
128 if infile == OPT_DEFAULTS.get('infile'): | 127 if infile == OPT_DEFAULTS.get('infile'): |
129 infile_handle = sys.stdin | 128 infile_handle = sys.stdin |
130 sys.stderr.write("Reading from standard input..\n") | 129 sys.stderr.write("Reading from standard input..\n") |
131 else: | 130 else: |
132 if os.path.exists(infile): | 131 if os.path.exists(infile): |
133 infile_handle = open(infile, 'r') | 132 infile_handle = open(infile, 'r') |
134 else: | 133 else: |
135 fail('Error: Input VCF file '+infile+' not found.') | 134 fail('Error: Input VCF file '+infile+' not found.') |
136 | 135 |
137 # set outfile_handle to either stdout or the output file | 136 # set outfile_handle to either stdout or the output file |
138 global outfile_handle | |
139 if outfile == OPT_DEFAULTS.get('outfile'): | 137 if outfile == OPT_DEFAULTS.get('outfile'): |
140 outfile_handle = sys.stdout | 138 outfile_handle = sys.stdout |
141 else: | 139 else: |
142 try: | 140 try: |
143 outfile_handle = open(outfile, 'w') | 141 outfile_handle = open(outfile, 'w') |
184 site_data['samples'].pop(sample, None) | 182 site_data['samples'].pop(sample, None) |
185 if len(site_data['samples']) == 0: | 183 if len(site_data['samples']) == 0: |
186 sys.stderr.write("Error: Sample '"+print_sample+"' not found.\n") | 184 sys.stderr.write("Error: Sample '"+print_sample+"' not found.\n") |
187 sys.exit(1) | 185 sys.exit(1) |
188 | 186 |
189 | |
190 site_summary = summarize_site(site_data, sample_names, CANONICAL_VARIANTS, | 187 site_summary = summarize_site(site_data, sample_names, CANONICAL_VARIANTS, |
191 freq_thres, covg_thres, stranded, debug=debug) | 188 freq_thres, covg_thres, stranded, debug=debug) |
192 | 189 |
193 if debug and site_summary[0]['print']: | 190 if debug and site_summary[0]['print']: |
194 print line.split('\t')[9].split(':')[-1] | 191 print(line.split('\t')[9].split(':')[-1]) |
195 | 192 |
196 try: | 193 try: |
197 print_site(outfile_handle, site_summary, COLUMNS) | 194 print_site(outfile_handle, site_summary, COLUMNS) |
198 except IOError as ioe: | 195 except IOError as ioe: |
199 if ioe.errno == errno.EPIPE: | 196 if ioe.errno == errno.EPIPE: |
200 cleanup() | |
201 sys.exit(0) | 197 sys.exit(0) |
202 | |
203 # close any open filehandles | |
204 cleanup() | |
205 | 198 |
206 # keeps Galaxy from giving an error if there were messages on stderr | 199 # keeps Galaxy from giving an error if there were messages on stderr |
207 sys.exit(0) | 200 sys.exit(0) |
208 | 201 |
209 | 202 |
339 for (strand, base_count_list) in zip(strands, base_count_lists): | 332 for (strand, base_count_list) in zip(strands, base_count_lists): |
340 for base_count in base_count_list: | 333 for base_count in base_count_list: |
341 sample[strand+base_count[0]] = base_count[1] | 334 sample[strand+base_count[0]] = base_count[1] |
342 # fill in any zeros | 335 # fill in any zeros |
343 for base in canonical: | 336 for base in canonical: |
344 if not sample.has_key(strand+base): | 337 if strand+base not in sample: |
345 sample[strand+base] = 0 | 338 sample[strand+base] = 0 |
346 | 339 |
347 sample['alleles'] = count_alleles(variants, freq_thres, debug=debug) | 340 sample['alleles'] = count_alleles(variants, freq_thres, debug=debug) |
348 | 341 |
349 # If there's a tie for 2nd, randomly choose one to be 2nd | 342 # If there's a tie for 2nd, randomly choose one to be 2nd |
352 if swap: | 345 if swap: |
353 tmp_base = ranked_bases[1] | 346 tmp_base = ranked_bases[1] |
354 ranked_bases[1] = ranked_bases[2] | 347 ranked_bases[1] = ranked_bases[2] |
355 ranked_bases[2] = tmp_base | 348 ranked_bases[2] = tmp_base |
356 | 349 |
357 if debug: print "ranked +-: "+str(ranked_bases) | 350 if debug: print("ranked +-: "+str(ranked_bases)) |
358 | 351 |
359 sample['coverage'] = coverage | 352 sample['coverage'] = coverage |
360 try: | 353 try: |
361 sample['major'] = ranked_bases[0][0] | 354 sample['major'] = ranked_bases[0][0] |
362 except IndexError: | 355 except IndexError: |
397 strand = variant[0] | 390 strand = variant[0] |
398 base = variant[1:] | 391 base = variant[1:] |
399 if strand in strands: | 392 if strand in strands: |
400 summed_counts[base] = stranded_counts[variant] + summed_counts.get(base, 0) | 393 summed_counts[base] = stranded_counts[variant] + summed_counts.get(base, 0) |
401 | 394 |
402 return summed_counts.items() | 395 return list(summed_counts.items()) |
403 | 396 |
404 | 397 |
405 def process_read_counts(variant_counts, freq_thres=0, sort=False, debug=False): | 398 def process_read_counts(variant_counts, freq_thres=0, sort=False, debug=False): |
406 """Process a list of read counts by frequency filtering and/or sorting. | 399 """Process a list of read counts by frequency filtering and/or sorting. |
407 Arguments: | 400 Arguments: |
424 # sort the list of alleles by read count | 417 # sort the list of alleles by read count |
425 if sort: | 418 if sort: |
426 variant_counts.sort(reverse=True, key=lambda variant: variant[1]) | 419 variant_counts.sort(reverse=True, key=lambda variant: variant[1]) |
427 | 420 |
428 if debug: | 421 if debug: |
429 print 'coverage: '+str(coverage)+', freq_thres: '+str(freq_thres) | 422 print('coverage: '+str(coverage)+', freq_thres: '+str(freq_thres)) |
430 for variant in variant_counts: | 423 for variant in variant_counts: |
431 print (variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '+ | 424 print((variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '+ |
432 str(variant[1]/coverage)) | 425 str(variant[1]/coverage))) |
433 | 426 |
434 # remove bases below the frequency threshold | 427 # remove bases below the frequency threshold |
435 if freq_thres > 0: | 428 if freq_thres > 0: |
436 variant_counts = [variant for variant in variant_counts | 429 variant_counts = [variant for variant in variant_counts |
437 if variant[1]/coverage >= freq_thres] | 430 if variant[1]/coverage >= freq_thres] |
453 alleles_minus = get_read_counts(variant_counts, '-') | 446 alleles_minus = get_read_counts(variant_counts, '-') |
454 alleles_minus = process_read_counts(alleles_minus, freq_thres=freq_thres, | 447 alleles_minus = process_read_counts(alleles_minus, freq_thres=freq_thres, |
455 sort=False, debug=debug) | 448 sort=False, debug=debug) |
456 | 449 |
457 if debug: | 450 if debug: |
458 print '+ '+str(alleles_plus) | 451 print('+ '+str(alleles_plus)) |
459 print '- '+str(alleles_minus) | 452 print('- '+str(alleles_minus)) |
460 | 453 |
461 # Check if each strand reports the same set of alleles. | 454 # Check if each strand reports the same set of alleles. |
462 # Sorting by base is to compare lists without regard to order (as sets). | 455 # Sorting by base is to compare lists without regard to order (as sets). |
463 alleles_plus_sorted = sorted([base[0] for base in alleles_plus if base[1]]) | 456 alleles_plus_sorted = sorted([base[0] for base in alleles_plus if base[1]]) |
464 alleles_minus_sorted = sorted([base[0] for base in alleles_minus if base[1]]) | 457 alleles_minus_sorted = sorted([base[0] for base in alleles_minus if base[1]]) |
493 fields = [str(sample.get(column)) for column in columns] | 486 fields = [str(sample.get(column)) for column in columns] |
494 filehandle.write('\t'.join(fields)+"\n") | 487 filehandle.write('\t'.join(fields)+"\n") |
495 | 488 |
496 | 489 |
497 def fail(message): | 490 def fail(message): |
498 cleanup() | |
499 sys.stderr.write(message+'\n') | 491 sys.stderr.write(message+'\n') |
500 sys.exit(1) | 492 sys.exit(1) |
501 | 493 |
502 | 494 |
503 def cleanup(): | |
504 if isinstance(infile_handle, file): | |
505 infile_handle.close() | |
506 if isinstance(outfile_handle, file): | |
507 outfile_handle.close() | |
508 | |
509 | |
510 if __name__ == "__main__": | 495 if __name__ == "__main__": |
511 main() | 496 main() |