5
|
1 #!/usr/bin/python
|
|
2 import os
|
|
3 import sys
|
|
4 from optparse import OptionParser
|
|
5
|
|
6 OPT_DEFAULTS = {'freq_thres':0, 'covg_thres':0}
|
|
7 BASES = ['A', 'C', 'G', 'T']
|
|
8 USAGE = ("Usage: %prog (options) 'variant_str' ('variant_str2' (etc))\n"
|
|
9 +" cat variants.txt > %prog (options)")
|
|
10
|
|
11 def main():
|
|
12
|
|
13 parser = OptionParser(usage=USAGE)
|
|
14 parser.add_option('-f', '--freq-thres', dest='freq_thres', type='float',
|
|
15 default=OPT_DEFAULTS.get('freq_thres'),
|
|
16 help=('Frequency threshold for counting alleles, given in percentage: -f 1 '
|
|
17 +'= 1% frequency. Default is %default%.'))
|
|
18 parser.add_option('-c', '--covg-thres', dest='covg_thres', type='int',
|
|
19 default=OPT_DEFAULTS.get('covg_thres'),
|
|
20 help=('Coverage threshold. Each site must be supported by at least this '
|
|
21 +'many reads on each strand. Otherwise the site will not be printed in '
|
|
22 +'the output. The default is %default reads per strand.'))
|
|
23 (options, args) = parser.parse_args()
|
|
24 freq_thres = options.freq_thres
|
|
25 covg_thres = options.covg_thres
|
|
26
|
|
27 if len(sys.argv) > 1 and '-h' in sys.argv[1][0:3]:
|
|
28 script_name = os.path.basename(sys.argv[0])
|
|
29 print """USAGE:
|
|
30 $ """+script_name+""" [sample column text]
|
|
31 $ """+script_name+""" '+A=10,+G=2,-A=20,-G=41,'
|
|
32 $ """+script_name+""" '0/1:10,1:0.25,0.025:-A=29,-AG=1,-G=10,'
|
|
33 $ """+script_name+""" '+A=10,+G=2,-A=20,-G=41,' '0/1:10,1:0.25,0.025:-A=29,-AG=1,-G=10,'
|
|
34 Or invoke with no arguments to use interactively. It will read from stdin, so
|
|
35 just paste one sample per line."""
|
|
36 sys.exit(0)
|
|
37
|
|
38 if len(args) > 0:
|
|
39 stdin = False
|
|
40 samples = args
|
|
41 else:
|
|
42 stdin = True
|
|
43 samples = sys.stdin
|
|
44 print "Reading from standard input.."
|
|
45
|
|
46 for sample in samples:
|
|
47 print ''
|
|
48 sample = sample.split(':')[-1]
|
|
49 if not sample:
|
|
50 continue
|
|
51 print sample
|
|
52 counts_dict = parse_counts(sample)
|
|
53 compute_stats(counts_dict, freq_thres, covg_thres)
|
|
54
|
|
55
|
|
56 def parse_counts(sample_str):
|
|
57 counts_dict = {}
|
|
58 counts = sample_str.split(',')
|
|
59 for count in counts:
|
|
60 if '=' not in count:
|
|
61 continue
|
|
62 (var, reads) = count.split('=')
|
|
63 if var[1:] in BASES:
|
|
64 counts_dict[var] = int(reads)
|
|
65 return counts_dict
|
|
66
|
|
67
|
|
68 def compute_stats(counts_dict, freq_thres, covg_thres):
|
|
69 # totals for A, C, G, T
|
|
70 counts_unstranded = {}
|
|
71 for base in BASES:
|
|
72 counts_unstranded[base] = 0
|
|
73 for strand in '+-':
|
|
74 counts_unstranded[base] += counts_dict.get(strand+base, 0)
|
|
75 print '+- '+str(counts_unstranded)
|
|
76
|
|
77 # get counts for each strand
|
|
78 plus_counts = get_stranded_counts(counts_dict, '+')
|
|
79 minus_counts = get_stranded_counts(counts_dict, '-')
|
|
80 counts_lists = {'+':plus_counts, '-':minus_counts}
|
|
81 print ' + '+str(plus_counts)
|
|
82 print ' - '+str(minus_counts)
|
|
83
|
|
84 # stranded coverage threshold
|
|
85 coverages = {}
|
|
86 failing_strands = {}
|
|
87 for strand in '+-':
|
|
88 coverages[strand] = 0
|
|
89 for count in counts_lists[strand].values():
|
|
90 coverages[strand] += count
|
|
91 if coverages[strand] < covg_thres:
|
|
92 failing_strands[strand] = coverages[strand]
|
|
93 sys.stdout.write(strand+'coverage: '+str(coverages[strand])+"\t")
|
|
94 coverages['+-'] = coverages['+'] + coverages['-']
|
|
95 sys.stdout.write("+-coverage: "+str(coverages['+-'])+"\n")
|
|
96
|
|
97 if failing_strands:
|
|
98 for strand in failing_strands:
|
|
99 print ('coverage on '+strand+' strand too low ('
|
|
100 +str(failing_strands[strand])+' < '+str(covg_thres)+")")
|
|
101 return
|
|
102
|
|
103 # apply frequency threshold
|
|
104 for strand in counts_lists:
|
|
105 strand_counts = counts_lists[strand]
|
|
106 for variant in strand_counts.keys():
|
|
107 # print (variant+" freq: "+str(strand_counts[variant])+"/"
|
|
108 # +str(coverages[strand])+" = "
|
|
109 # +str(strand_counts[variant]/float(coverages[strand])))
|
|
110 if strand_counts[variant]/float(coverages[strand]) < freq_thres:
|
|
111 strand_counts.pop(variant)
|
|
112 plus_variants = sorted(plus_counts.keys())
|
|
113 minus_variants = sorted(minus_counts.keys())
|
|
114 if plus_variants == minus_variants:
|
|
115 strand_bias = False
|
|
116 print "no strand bias: +"+str(plus_variants)+" == -"+str(minus_variants)
|
|
117 sys.stdout.write("alleles: "+str(len(plus_variants))+"\t")
|
|
118 else:
|
|
119 strand_bias = True
|
|
120 print " strand bias: +"+str(plus_variants)+" != -"+str(minus_variants)
|
|
121 sys.stdout.write("alleles: 0\t")
|
|
122
|
|
123 variants_sorted = sort_variants(counts_unstranded)
|
|
124 if len(variants_sorted) >= 1:
|
|
125 sys.stdout.write("major: "+variants_sorted[0]+"\t")
|
|
126 minor = '.'
|
|
127 if len(variants_sorted) == 2:
|
|
128 minor = variants_sorted[1]
|
|
129 elif len(variants_sorted) > 2:
|
|
130 if (counts_unstranded.get(variants_sorted[1]) ==
|
|
131 counts_unstranded.get(variants_sorted[2])):
|
|
132 minor = 'N'
|
|
133 else:
|
|
134 minor = variants_sorted[1]
|
|
135
|
|
136 sys.stdout.write("minor: "+minor+"\tfreq: ")
|
|
137 print round(float(counts_unstranded.get(minor, 0))/coverages['+-'], 5)
|
|
138
|
|
139
|
|
140 def get_stranded_counts(unstranded_counts, strand):
|
|
141 stranded_counts = {}
|
|
142 for variant in unstranded_counts:
|
|
143 if variant[0] == strand:
|
|
144 stranded_counts[variant[1:]] = unstranded_counts[variant]
|
|
145 return stranded_counts
|
|
146
|
|
147 def sort_variants(variant_counts):
|
|
148 """Sort the list of variants based on their counts. Returns a list of just
|
|
149 the variants, no counts."""
|
|
150 variants = variant_counts.keys()
|
|
151 var_del = []
|
|
152 for variant in variants:
|
|
153 if variant_counts.get(variant) == 0:
|
|
154 var_del.append(variant)
|
|
155 for variant in var_del:
|
|
156 variants.remove(variant)
|
|
157 variants.sort(reverse=True, key=lambda variant: variant_counts.get(variant,0))
|
|
158 return variants
|
|
159
|
|
160 if __name__ == "__main__":
|
|
161 main()
|