allele_counts: allele-counts.py comparison

comparison allele-counts.py @ 9:6cc488e11544 draft

"planemo upload for repository https://github.com/galaxyproject/dunovo commit 5a2e08bc1213b0437d0adcb45f7f431bd3c735f4"

author	nick
date	Tue, 31 Mar 2020 05:09:12 -0400
parents	411adeff1eec
children

comparison

Legend: equal, deleted, inserted, replaced

-:411adeff1eec
+:6cc488e11544
-#!/usr/bin/python
+#!/usr/bin/python3
 """
 Run with -h option or see DESCRIPTION for description.
 This script's functionality is being obsoleted by the new, and much more sanely
 written, nvc-filter.py.
 - Rename MINOR.FREQ.PERC to MAF
 Naive Variant Caller variant count parsing one-liner:
 $ cat variants.vcf | grep -v '^#' | cut -f 10 | cut -d ':' -f 4 | tr ',=' '\t:'
 """
-from __future__ import division
 import os
 import sys
 import errno
 import random
 from optparse import OptionParser
 The input VCF must report the variants for each strand.
 The variants should be case-sensitive (e.g. all capital base letters).
 Strand bias: Both strands must show the same bases passing the frequency
 threshold (but not necessarily in the same order). If the site fails this test,
 the number of alleles is reported as 0."""
 def get_options(defaults, usage, description='', epilog=''):
 """Get options, print usage text."""
 parser = OptionParser(usage=usage, description=description, epilog=epilog)
 print_pos = ''
 if len(coords) > 1: print_pos = coords[1]
 if len(coords) > 2: print_sample = coords[2]
 # set infile_handle to either stdin or the input file
-global infile_handle
 if infile == OPT_DEFAULTS.get('infile'):
 infile_handle = sys.stdin
 sys.stderr.write("Reading from standard input..\n")
 else:
 if os.path.exists(infile):
 infile_handle = open(infile, 'r')
 else:
 fail('Error: Input VCF file '+infile+' not found.')
 # set outfile_handle to either stdout or the output file
-global outfile_handle
 if outfile == OPT_DEFAULTS.get('outfile'):
 outfile_handle = sys.stdout
 else:
 try:
 outfile_handle = open(outfile, 'w')
 site_data['samples'].pop(sample, None)
 if len(site_data['samples']) == 0:
 sys.stderr.write("Error: Sample '"+print_sample+"' not found.\n")
 sys.exit(1)
 site_summary = summarize_site(site_data, sample_names, CANONICAL_VARIANTS,
 freq_thres, covg_thres, stranded, debug=debug)
 if debug and site_summary[0]['print']:
-print line.split('\t')[9].split(':')[-1]
+print(line.split('\t')[9].split(':')[-1])
 try:
 print_site(outfile_handle, site_summary, COLUMNS)
 except IOError as ioe:
 if ioe.errno == errno.EPIPE:
-cleanup()
 sys.exit(0)
-# close any open filehandles
-cleanup()
 # keeps Galaxy from giving an error if there were messages on stderr
 sys.exit(0)
 for (strand, base_count_list) in zip(strands, base_count_lists):
 for base_count in base_count_list:
 sample[strand+base_count[0]] = base_count[1]
 # fill in any zeros
 for base in canonical:
-if not sample.has_key(strand+base):
+if strand+base not in sample:
 sample[strand+base] = 0
 sample['alleles'] = count_alleles(variants, freq_thres, debug=debug)
 # If there's a tie for 2nd, randomly choose one to be 2nd
 if swap:
 tmp_base = ranked_bases[1]
 ranked_bases[1] = ranked_bases[2]
 ranked_bases[2] = tmp_base
-if debug: print "ranked +-: "+str(ranked_bases)
+if debug: print("ranked +-: "+str(ranked_bases))
 sample['coverage'] = coverage
 try:
 sample['major']  = ranked_bases[0][0]
 except IndexError:
 strand = variant[0]
 base = variant[1:]
 if strand in strands:
 summed_counts[base] = stranded_counts[variant] + summed_counts.get(base, 0)
-return summed_counts.items()
+return list(summed_counts.items())
 def process_read_counts(variant_counts, freq_thres=0, sort=False, debug=False):
 """Process a list of read counts by frequency filtering and/or sorting.
 Arguments:
 # sort the list of alleles by read count
 if sort:
 variant_counts.sort(reverse=True, key=lambda variant: variant[1])
 if debug:
-print 'coverage: '+str(coverage)+', freq_thres: '+str(freq_thres)
+print('coverage: '+str(coverage)+', freq_thres: '+str(freq_thres))
 for variant in variant_counts:
-print (variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '+
+print((variant[0]+': '+str(variant[1])+'/'+str(float(coverage))+' = '+
-str(variant[1]/coverage))
+str(variant[1]/coverage)))
 # remove bases below the frequency threshold
 if freq_thres > 0:
 variant_counts = [variant for variant in variant_counts
 if variant[1]/coverage >= freq_thres]
 alleles_minus = get_read_counts(variant_counts, '-')
 alleles_minus = process_read_counts(alleles_minus, freq_thres=freq_thres,
 sort=False, debug=debug)
 if debug:
-print '+ '+str(alleles_plus)
+print('+ '+str(alleles_plus))
-print '- '+str(alleles_minus)
+print('- '+str(alleles_minus))
 # Check if each strand reports the same set of alleles.
 # Sorting by base is to compare lists without regard to order (as sets).
 alleles_plus_sorted  = sorted([base[0] for base in alleles_plus if base[1]])
 alleles_minus_sorted = sorted([base[0] for base in alleles_minus if base[1]])
 fields = [str(sample.get(column)) for column in columns]
 filehandle.write('\t'.join(fields)+"\n")
 def fail(message):
-cleanup()
 sys.stderr.write(message+'\n')
 sys.exit(1)
-def cleanup():
-if isinstance(infile_handle, file):
-infile_handle.close()
-if isinstance(outfile_handle, file):
-outfile_handle.close()
 if __name__ == "__main__":
 main()

Mercurial > repos > nick > allele_counts

comparison allele-counts.py @ 9:6cc488e11544 draft