comparison allele-counts.py @ 9:6cc488e11544 draft

"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"
author nick
date Tue, 31 Mar 2020 05:09:12 -0400
parents 411adeff1eec
children
comparison
equal deleted inserted replaced
8:411adeff1eec 9:6cc488e11544
1 #!/usr/bin/python 1 #!/usr/bin/python3
2 """ 2 """
3 Run with -h option or see DESCRIPTION for description. 3 Run with -h option or see DESCRIPTION for description.
4 This script's functionality is being obsoleted by the new, and much more sanely 4 This script's functionality is being obsoleted by the new, and much more sanely
5 written, nvc-filter.py. 5 written, nvc-filter.py.
6 6
9 - Rename MINOR.FREQ.PERC to MAF 9 - Rename MINOR.FREQ.PERC to MAF
10 10
11 Naive Variant Caller variant count parsing one-liner: 11 Naive Variant Caller variant count parsing one-liner:
12 $ cat variants.vcf | grep -v '^#' | cut -f 10 | cut -d ':' -f 4 | tr ',=' '\t:' 12 $ cat variants.vcf | grep -v '^#' | cut -f 10 | cut -d ':' -f 4 | tr ',=' '\t:'
13 """ 13 """
14 from __future__ import division
15 import os 14 import os
16 import sys 15 import sys
17 import errno 16 import errno
18 import random 17 import random
19 from optparse import OptionParser 18 from optparse import OptionParser
46 The input VCF must report the variants for each strand. 45 The input VCF must report the variants for each strand.
47 The variants should be case-sensitive (e.g. all capital base letters). 46 The variants should be case-sensitive (e.g. all capital base letters).
48 Strand bias: Both strands must show the same bases passing the frequency 47 Strand bias: Both strands must show the same bases passing the frequency
49 threshold (but not necessarily in the same order). If the site fails this test, 48 threshold (but not necessarily in the same order). If the site fails this test,
50 the number of alleles is reported as 0.""" 49 the number of alleles is reported as 0."""
50
51 51
52 def get_options(defaults, usage, description='', epilog=''): 52 def get_options(defaults, usage, description='', epilog=''):
53 """Get options, print usage text.""" 53 """Get options, print usage text."""
54 54
55 parser = OptionParser(usage=usage, description=description, epilog=epilog) 55 parser = OptionParser(usage=usage, description=description, epilog=epilog)
122 print_pos = '' 122 print_pos = ''
123 if len(coords) > 1: print_pos = coords[1] 123 if len(coords) > 1: print_pos = coords[1]
124 if len(coords) > 2: print_sample = coords[2] 124 if len(coords) > 2: print_sample = coords[2]
125 125
126 # set infile_handle to either stdin or the input file 126 # set infile_handle to either stdin or the input file
127 global infile_handle
128 if infile == OPT_DEFAULTS.get('infile'): 127 if infile == OPT_DEFAULTS.get('infile'):
129 infile_handle = sys.stdin 128 infile_handle = sys.stdin
130 sys.stderr.write("Reading from standard input..\n") 129 sys.stderr.write("Reading from standard input..\n")
131 else: 130 else:
132 if os.path.exists(infile): 131 if os.path.exists(infile):
133 infile_handle = open(infile, 'r') 132 infile_handle = open(infile, 'r')
134 else: 133 else:
135 fail('Error: Input VCF file '+infile+' not found.') 134 fail('Error: Input VCF file '+infile+' not found.')
136 135
137 # set outfile_handle to either stdout or the output file 136 # set outfile_handle to either stdout or the output file
138 global outfile_handle
139 if outfile == OPT_DEFAULTS.get('outfile'): 137 if outfile == OPT_DEFAULTS.get('outfile'):
140 outfile_handle = sys.stdout 138 outfile_handle = sys.stdout
141 else: 139 else:
142 try: 140 try:
143 outfile_handle = open(outfile, 'w') 141 outfile_handle = open(outfile, 'w')
184 site_data['samples'].pop(sample, None) 182 site_data['samples'].pop(sample, None)
185 if len(site_data['samples']) == 0: 183 if len(site_data['samples']) == 0:
186 sys.stderr.write("Error: Sample '"+print_sample+"' not found.\n") 184 sys.stderr.write("Error: Sample '"+print_sample+"' not found.\n")
187 sys.exit(1) 185 sys.exit(1)
188 186
189
190 site_summary = summarize_site(site_data, sample_names, CANONICAL_VARIANTS, 187 site_summary = summarize_site(site_data, sample_names, CANONICAL_VARIANTS,
191 freq_thres, covg_thres, stranded, debug=debug) 188 freq_thres, covg_thres, stranded, debug=debug)
192 189
193 if debug and site_summary[0]['print']: 190 if debug and site_summary[0]['print']:
194 print line.split('\t')[9].split(':')[-1] 191 print(line.split('\t')[9].split(':')[-1])
195 192
196 try: 193 try:
197 print_site(outfile_handle, site_summary, COLUMNS) 194 print_site(outfile_handle, site_summary, COLUMNS)
198 except IOError as ioe: 195 except IOError as ioe:
199 if ioe.errno == errno.EPIPE: 196 if ioe.errno == errno.EPIPE:
200 cleanup()
201 sys.exit(0) 197 sys.exit(0)
202
203 # close any open filehandles
204 cleanup()
205 198
206 # keeps Galaxy from giving an error if there were messages on stderr 199 # keeps Galaxy from giving an error if there were messages on stderr
207 sys.exit(0) 200 sys.exit(0)
208 201
209 202
339 for (strand, base_count_list) in zip(strands, base_count_lists): 332 for (strand, base_count_list) in zip(strands, base_count_lists):
340 for base_count in base_count_list: 333 for base_count in base_count_list:
341 sample[strand+base_count[0]] = base_count[1] 334 sample[strand+base_count[0]] = base_count[1]
342 # fill in any zeros 335 # fill in any zeros
343 for base in canonical: 336 for base in canonical:
344 if not sample.has_key(strand+base): 337 if strand+base not in sample:
345 sample[strand+base] = 0 338 sample[strand+base] = 0
346 339
347 sample['alleles'] = count_alleles(variants, freq_thres, debug=debug) 340 sample['alleles'] = count_alleles(variants, freq_thres, debug=debug)
348 341
349 # If there's a tie for 2nd, randomly choose one to be 2nd 342 # If there's a tie for 2nd, randomly choose one to be 2nd
352 if swap: 345 if swap:
353 tmp_base = ranked_bases[1] 346 tmp_base = ranked_bases[1]
354 ranked_bases[1] = ranked_bases[2] 347 ranked_bases[1] = ranked_bases[2]
355 ranked_bases[2] = tmp_base 348 ranked_bases[2] = tmp_base
356 349
357 if debug: print "ranked +-: "+str(ranked_bases) 350 if debug: print("ranked +-: "+str(ranked_bases))
358 351
359 sample['coverage'] = coverage 352 sample['coverage'] = coverage
360 try: 353 try:
361 sample['major'] = ranked_bases[0][0] 354 sample['major'] = ranked_bases[0][0]
362 except IndexError: 355 except IndexError:
397 strand = variant[0] 390 strand = variant[0]
398 base = variant[1:] 391 base = variant[1:]
399 if strand in strands: 392 if strand in strands:
400 summed_counts[base] = stranded_counts[variant] + summed_counts.get(base, 0) 393 summed_counts[base] = stranded_counts[variant] + summed_counts.get(base, 0)
401 394
402 return summed_counts.items() 395 return list(summed_counts.items())
403 396
404 397
405 def process_read_counts(variant_counts, freq_thres=0, sort=False, debug=False): 398 def process_read_counts(variant_counts, freq_thres=0, sort=False, debug=False):
406 """Process a list of read counts by frequency filtering and/or sorting. 399 """Process a list of read counts by frequency filtering and/or sorting.
407 Arguments: 400 Arguments:
424 # sort the list of alleles by read count 417 # sort the list of alleles by read count
425 if sort: 418 if sort:
426 variant_counts.sort(reverse=True, key=lambda variant: variant[1]) 419 variant_counts.sort(reverse=True, key=lambda variant: variant[1])
427 420
428 if debug: 421 if debug:
429 print 'coverage: '+str(coverage)+', freq_thres: '+str(freq_thres) 422 print('coverage: '+str(coverage)+', freq_thres: '+str(freq_thres))
430 for variant in variant_counts: 423 for variant in variant_counts:
431 print (variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '+ 424 print((variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '+
432 str(variant[1]/coverage)) 425 str(variant[1]/coverage)))
433 426
434 # remove bases below the frequency threshold 427 # remove bases below the frequency threshold
435 if freq_thres > 0: 428 if freq_thres > 0:
436 variant_counts = [variant for variant in variant_counts 429 variant_counts = [variant for variant in variant_counts
437 if variant[1]/coverage >= freq_thres] 430 if variant[1]/coverage >= freq_thres]
453 alleles_minus = get_read_counts(variant_counts, '-') 446 alleles_minus = get_read_counts(variant_counts, '-')
454 alleles_minus = process_read_counts(alleles_minus, freq_thres=freq_thres, 447 alleles_minus = process_read_counts(alleles_minus, freq_thres=freq_thres,
455 sort=False, debug=debug) 448 sort=False, debug=debug)
456 449
457 if debug: 450 if debug:
458 print '+ '+str(alleles_plus) 451 print('+ '+str(alleles_plus))
459 print '- '+str(alleles_minus) 452 print('- '+str(alleles_minus))
460 453
461 # Check if each strand reports the same set of alleles. 454 # Check if each strand reports the same set of alleles.
462 # Sorting by base is to compare lists without regard to order (as sets). 455 # Sorting by base is to compare lists without regard to order (as sets).
463 alleles_plus_sorted = sorted([base[0] for base in alleles_plus if base[1]]) 456 alleles_plus_sorted = sorted([base[0] for base in alleles_plus if base[1]])
464 alleles_minus_sorted = sorted([base[0] for base in alleles_minus if base[1]]) 457 alleles_minus_sorted = sorted([base[0] for base in alleles_minus if base[1]])
493 fields = [str(sample.get(column)) for column in columns] 486 fields = [str(sample.get(column)) for column in columns]
494 filehandle.write('\t'.join(fields)+"\n") 487 filehandle.write('\t'.join(fields)+"\n")
495 488
496 489
497 def fail(message): 490 def fail(message):
498 cleanup()
499 sys.stderr.write(message+'\n') 491 sys.stderr.write(message+'\n')
500 sys.exit(1) 492 sys.exit(1)
501 493
502 494
503 def cleanup():
504 if isinstance(infile_handle, file):
505 infile_handle.close()
506 if isinstance(outfile_handle, file):
507 outfile_handle.close()
508
509
510 if __name__ == "__main__": 495 if __name__ == "__main__":
511 main() 496 main()