Mercurial > repos > nick > allele_counts
comparison tests/compute-site.py @ 5:31361191d2d2
Uploaded tarball.
Version 1.1: Stranded output, slightly different handling of minor allele ties and 0 coverage sites, revised help text, added test datasets.
author | nick |
---|---|
date | Thu, 12 Sep 2013 11:34:23 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
4:898eb3daab43 | 5:31361191d2d2 |
---|---|
1 #!/usr/bin/python | |
2 import os | |
3 import sys | |
4 from optparse import OptionParser | |
5 | |
6 OPT_DEFAULTS = {'freq_thres':0, 'covg_thres':0} | |
7 BASES = ['A', 'C', 'G', 'T'] | |
8 USAGE = ("Usage: %prog (options) 'variant_str' ('variant_str2' (etc))\n" | |
9 +" cat variants.txt > %prog (options)") | |
10 | |
11 def main(): | |
12 | |
13 parser = OptionParser(usage=USAGE) | |
14 parser.add_option('-f', '--freq-thres', dest='freq_thres', type='float', | |
15 default=OPT_DEFAULTS.get('freq_thres'), | |
16 help=('Frequency threshold for counting alleles, given in percentage: -f 1 ' | |
17 +'= 1% frequency. Default is %default%.')) | |
18 parser.add_option('-c', '--covg-thres', dest='covg_thres', type='int', | |
19 default=OPT_DEFAULTS.get('covg_thres'), | |
20 help=('Coverage threshold. Each site must be supported by at least this ' | |
21 +'many reads on each strand. Otherwise the site will not be printed in ' | |
22 +'the output. The default is %default reads per strand.')) | |
23 (options, args) = parser.parse_args() | |
24 freq_thres = options.freq_thres | |
25 covg_thres = options.covg_thres | |
26 | |
27 if len(sys.argv) > 1 and '-h' in sys.argv[1][0:3]: | |
28 script_name = os.path.basename(sys.argv[0]) | |
29 print """USAGE: | |
30 $ """+script_name+""" [sample column text] | |
31 $ """+script_name+""" '+A=10,+G=2,-A=20,-G=41,' | |
32 $ """+script_name+""" '0/1:10,1:0.25,0.025:-A=29,-AG=1,-G=10,' | |
33 $ """+script_name+""" '+A=10,+G=2,-A=20,-G=41,' '0/1:10,1:0.25,0.025:-A=29,-AG=1,-G=10,' | |
34 Or invoke with no arguments to use interactively. It will read from stdin, so | |
35 just paste one sample per line.""" | |
36 sys.exit(0) | |
37 | |
38 if len(args) > 0: | |
39 stdin = False | |
40 samples = args | |
41 else: | |
42 stdin = True | |
43 samples = sys.stdin | |
44 print "Reading from standard input.." | |
45 | |
46 for sample in samples: | |
47 print '' | |
48 sample = sample.split(':')[-1] | |
49 if not sample: | |
50 continue | |
51 print sample | |
52 counts_dict = parse_counts(sample) | |
53 compute_stats(counts_dict, freq_thres, covg_thres) | |
54 | |
55 | |
56 def parse_counts(sample_str): | |
57 counts_dict = {} | |
58 counts = sample_str.split(',') | |
59 for count in counts: | |
60 if '=' not in count: | |
61 continue | |
62 (var, reads) = count.split('=') | |
63 if var[1:] in BASES: | |
64 counts_dict[var] = int(reads) | |
65 return counts_dict | |
66 | |
67 | |
68 def compute_stats(counts_dict, freq_thres, covg_thres): | |
69 # totals for A, C, G, T | |
70 counts_unstranded = {} | |
71 for base in BASES: | |
72 counts_unstranded[base] = 0 | |
73 for strand in '+-': | |
74 counts_unstranded[base] += counts_dict.get(strand+base, 0) | |
75 print '+- '+str(counts_unstranded) | |
76 | |
77 # get counts for each strand | |
78 plus_counts = get_stranded_counts(counts_dict, '+') | |
79 minus_counts = get_stranded_counts(counts_dict, '-') | |
80 counts_lists = {'+':plus_counts, '-':minus_counts} | |
81 print ' + '+str(plus_counts) | |
82 print ' - '+str(minus_counts) | |
83 | |
84 # stranded coverage threshold | |
85 coverages = {} | |
86 failing_strands = {} | |
87 for strand in '+-': | |
88 coverages[strand] = 0 | |
89 for count in counts_lists[strand].values(): | |
90 coverages[strand] += count | |
91 if coverages[strand] < covg_thres: | |
92 failing_strands[strand] = coverages[strand] | |
93 sys.stdout.write(strand+'coverage: '+str(coverages[strand])+"\t") | |
94 coverages['+-'] = coverages['+'] + coverages['-'] | |
95 sys.stdout.write("+-coverage: "+str(coverages['+-'])+"\n") | |
96 | |
97 if failing_strands: | |
98 for strand in failing_strands: | |
99 print ('coverage on '+strand+' strand too low (' | |
100 +str(failing_strands[strand])+' < '+str(covg_thres)+")") | |
101 return | |
102 | |
103 # apply frequency threshold | |
104 for strand in counts_lists: | |
105 strand_counts = counts_lists[strand] | |
106 for variant in strand_counts.keys(): | |
107 # print (variant+" freq: "+str(strand_counts[variant])+"/" | |
108 # +str(coverages[strand])+" = " | |
109 # +str(strand_counts[variant]/float(coverages[strand]))) | |
110 if strand_counts[variant]/float(coverages[strand]) < freq_thres: | |
111 strand_counts.pop(variant) | |
112 plus_variants = sorted(plus_counts.keys()) | |
113 minus_variants = sorted(minus_counts.keys()) | |
114 if plus_variants == minus_variants: | |
115 strand_bias = False | |
116 print "no strand bias: +"+str(plus_variants)+" == -"+str(minus_variants) | |
117 sys.stdout.write("alleles: "+str(len(plus_variants))+"\t") | |
118 else: | |
119 strand_bias = True | |
120 print " strand bias: +"+str(plus_variants)+" != -"+str(minus_variants) | |
121 sys.stdout.write("alleles: 0\t") | |
122 | |
123 variants_sorted = sort_variants(counts_unstranded) | |
124 if len(variants_sorted) >= 1: | |
125 sys.stdout.write("major: "+variants_sorted[0]+"\t") | |
126 minor = '.' | |
127 if len(variants_sorted) == 2: | |
128 minor = variants_sorted[1] | |
129 elif len(variants_sorted) > 2: | |
130 if (counts_unstranded.get(variants_sorted[1]) == | |
131 counts_unstranded.get(variants_sorted[2])): | |
132 minor = 'N' | |
133 else: | |
134 minor = variants_sorted[1] | |
135 | |
136 sys.stdout.write("minor: "+minor+"\tfreq: ") | |
137 print round(float(counts_unstranded.get(minor, 0))/coverages['+-'], 5) | |
138 | |
139 | |
140 def get_stranded_counts(unstranded_counts, strand): | |
141 stranded_counts = {} | |
142 for variant in unstranded_counts: | |
143 if variant[0] == strand: | |
144 stranded_counts[variant[1:]] = unstranded_counts[variant] | |
145 return stranded_counts | |
146 | |
147 def sort_variants(variant_counts): | |
148 """Sort the list of variants based on their counts. Returns a list of just | |
149 the variants, no counts.""" | |
150 variants = variant_counts.keys() | |
151 var_del = [] | |
152 for variant in variants: | |
153 if variant_counts.get(variant) == 0: | |
154 var_del.append(variant) | |
155 for variant in var_del: | |
156 variants.remove(variant) | |
157 variants.sort(reverse=True, key=lambda variant: variant_counts.get(variant,0)) | |
158 return variants | |
159 | |
160 if __name__ == "__main__": | |
161 main() |