Mercurial > repos > nick > allele_counts
comparison allele-counts.py @ 2:318fdf77aa54
Current version, from another toolshed
Includes change in header line
author | nick |
---|---|
date | Fri, 31 May 2013 12:33:48 -0400 |
parents | 49bb46c3a1af |
children | 31361191d2d2 |
comparison
equal
deleted
inserted
replaced
1:49bb46c3a1af | 2:318fdf77aa54 |
---|---|
1 #!/usr/bin/python | 1 #!/usr/bin/python |
2 # This parses the output of Dan's "Naive Variant Detector" (previously, | 2 # This parses the output of Dan's "Naive Variant Detector" (previously, |
3 # "BAM Coverage"). It was forked from the code of "bam-coverage.py". | 3 # "BAM Coverage"). It was forked from the code of "bam-coverage.py". |
4 # | 4 # |
5 # New in this version: default to stdin and stdout, override by using -i and -o | 5 # New in this version: |
6 # to specify filenames | 6 # Made header line customizable |
7 # - separate from internal column labels, which are used as dict keys | |
7 # | 8 # |
8 # TODO: | 9 # TODO: |
9 # - test handling of -c 0 (and -f 0?) | 10 # - test handling of -c 0 (and -f 0?) |
10 # - should it technically handle data lines that start with a '#'? | 11 # - should it technically handle data lines that start with a '#'? |
11 import os | 12 import os |
12 import sys | 13 import sys |
13 from optparse import OptionParser | 14 from optparse import OptionParser |
14 | 15 |
15 COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles', | 16 COLUMNS = ['sample', 'chr', 'pos', 'A', 'C', 'G', 'T', 'coverage', 'alleles', 'major', 'minor', 'freq'] #, 'bias'] |
16 'major', 'minor', 'freq'] #, 'bias'] | 17 COLUMN_LABELS = ['SAMPLE', 'CHR', 'POS', 'A', 'C', 'G', 'T', 'CVRG', 'ALLELES', 'MAJOR', 'MINOR', 'MINOR.FREQ.PERC.'] #, 'STRAND.BIAS'] |
17 CANONICAL_VARIANTS = ['A', 'C', 'G', 'T'] | 18 CANONICAL_VARIANTS = ['A', 'C', 'G', 'T'] |
18 USAGE = """Usage: cat variants.vcf | %prog [options] > alleles.csv | 19 USAGE = """Usage: cat variants.vcf | %prog [options] > alleles.csv |
19 %prog [options] -i variants.vcf -o alleles.csv""" | 20 %prog [options] -i variants.vcf -o alleles.csv""" |
20 OPT_DEFAULTS = {'infile':'-', 'outfile':'-', 'freq_thres':1.0, 'covg_thres':100, | 21 OPT_DEFAULTS = {'infile':'-', 'outfile':'-', 'freq_thres':1.0, 'covg_thres':100, |
21 'print_header':False, 'stdin':False} | 22 'print_header':False, 'stdin':False} |
97 | 98 |
98 # set outfile_handle to either stdout or the output file | 99 # set outfile_handle to either stdout or the output file |
99 if outfile == OPT_DEFAULTS.get('outfile'): | 100 if outfile == OPT_DEFAULTS.get('outfile'): |
100 outfile_handle = sys.stdout | 101 outfile_handle = sys.stdout |
101 else: | 102 else: |
102 if os.path.exists(outfile): | 103 try: |
103 fail('Error: The given output filename '+outfile+' already exists.') | |
104 else: | |
105 outfile_handle = open(outfile, 'w') | 104 outfile_handle = open(outfile, 'w') |
106 | 105 except IOError, e: |
106 fail('Error: The given output filename '+outfile+' could not be opened.') | |
107 | |
108 if len(COLUMNS) != len(COLUMN_LABELS): | |
109 fail('Error: Internal column names do not match column labels.') | |
107 if print_header: | 110 if print_header: |
108 outfile_handle.write('#'+'\t'.join(COLUMNS)+"\n") | 111 outfile_handle.write('#'+'\t'.join(COLUMN_LABELS)+"\n") |
109 | 112 |
110 # main loop: process and print one line at a time | 113 # main loop: process and print one line at a time |
111 sample_names = [] | 114 sample_names = [] |
112 for line in infile_handle: | 115 for line in infile_handle: |
113 line = line.rstrip('\r\n') | 116 line = line.rstrip('\r\n') |
195 continue | 198 continue |
196 if variant[0] != '-' and variant[0] != '+': | 199 if variant[0] != '-' and variant[0] != '+': |
197 fail("Error in input VCF: variant data not strand-specific. " | 200 fail("Error in input VCF: variant data not strand-specific. " |
198 +"Failed on line:\n"+line) | 201 +"Failed on line:\n"+line) |
199 try: | 202 try: |
200 variant_counts[variant] = int(reads) | 203 variant_counts[variant] = int(float(reads)) |
201 except ValueError, e: | 204 except ValueError, e: |
202 continue | 205 fail("Error in input VCF: Variant count not a valid number. Failed on variant count string '"+reads+"'\nIn the following line:\n"+line) |
203 | 206 |
204 sample_counts[sample_names[i]] = variant_counts | 207 sample_counts[sample_names[i]] = variant_counts |
205 | 208 |
206 site['samples'] = sample_counts | 209 site['samples'] = sample_counts |
207 | 210 |
268 sample['major'] = ranked_bases[0][0] | 271 sample['major'] = ranked_bases[0][0] |
269 except IndexError, e: | 272 except IndexError, e: |
270 sample['major'] = '.' | 273 sample['major'] = '.' |
271 try: | 274 try: |
272 sample['minor'] = ranked_bases[1][0] | 275 sample['minor'] = ranked_bases[1][0] |
273 sample['freq'] = ranked_bases[1][1] / float(coverage) | 276 sample['freq'] = round(ranked_bases[1][1]/float(coverage), 5) |
274 except IndexError, e: | 277 except IndexError, e: |
275 sample['minor'] = '.' | 278 sample['minor'] = '.' |
276 sample['freq'] = 0.0 | 279 sample['freq'] = 0.0 |
277 | 280 |
278 site_summary.append(sample) | 281 site_summary.append(sample) |