annotate pyPRADA_1.2/tools/samtools-0.1.16/misc/varfilter.py @ 0:acc2ca1a3ba4

Uploaded
author siyuan
date Thu, 20 Feb 2014 00:44:58 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
1 #!/software/bin/python
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
2
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
3 # Author: lh3, converted to python and modified to add -C option by Aylwyn Scally
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
4 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
5 # About:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
6 # varfilter.py is a port of Heng's samtools.pl varFilter script into
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
7 # python, with an additional -C INT option. This option sets a minimum
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
8 # consensus score, above which the script will output a pileup line
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
9 # wherever it _could have_ called a variant, even if none is actually
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
10 # called (i.e. hom-ref positions). This is important if you want to
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
11 # subsequently merge the calls with those for another individual to get a
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
12 # synoptic view of calls at each site. Without this option, and in all
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
13 # other respects, it behaves like samtools.pl varFilter.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
14 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
15 # Aylwyn Scally as6@sanger.ac.uk
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
16
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
17
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
18 # Filtration code:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
19 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
20 # C low CNS quality (hom-ref only)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
21 # d low depth
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
22 # D high depth
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
23 # W too many SNPs in a window (SNP only)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
24 # G close to a high-quality indel (SNP only)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
25 # Q low RMS mapping quality (SNP only)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
26 # g close to another indel with higher quality (indel only)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
27 # s low SNP quality (SNP only)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
28 # i low indel quality (indel only)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
29
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
30
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
31 import sys
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
32 import getopt
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
33
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
34 def usage():
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
35 print '''usage: varfilter.py [options] [cns-pileup]
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
36
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
37 Options: -Q INT minimum RMS mapping quality for SNPs
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
38 -q INT minimum RMS mapping quality for gaps
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
39 -d INT minimum read depth
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
40 -D INT maximum read depth
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
41 -S INT minimum SNP quality
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
42 -i INT minimum indel quality
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
43 -C INT minimum consensus quality for hom-ref sites
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
44
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
45 -G INT min indel score for nearby SNP filtering
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
46 -w INT SNP within INT bp around a gap to be filtered
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
47
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
48 -W INT window size for filtering dense SNPs
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
49 -N INT max number of SNPs in a window
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
50
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
51 -l INT window size for filtering adjacent gaps
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
52
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
53 -p print filtered variants'''
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
54
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
55 def varFilter_aux(first, is_print):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
56 try:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
57 if first[1] == 0:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
58 sys.stdout.write("\t".join(first[4:]) + "\n")
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
59 elif is_print:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
60 sys.stderr.write("\t".join(["UQdDWGgsiCX"[first[1]]] + first[4:]) + "\n")
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
61 except IOError:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
62 sys.exit()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
63
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
64 mindepth = 3
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
65 maxdepth = 100
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
66 gapgapwin = 30
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
67 minsnpmapq = 25
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
68 mingapmapq = 10
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
69 minindelscore = 25
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
70 scorefactor = 100
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
71 snpgapwin = 10
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
72 densesnpwin = 10
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
73 densesnps = 2
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
74 printfilt = False
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
75 minsnpq = 0
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
76 minindelq = 0
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
77 mincnsq = 0
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
78
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
79 try:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
80 options, args = getopt.gnu_getopt(sys.argv[1:], 'pq:d:D:l:Q:w:W:N:G:S:i:C:', [])
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
81 except getopt.GetoptError:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
82 usage()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
83 sys.exit(2)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
84 for (oflag, oarg) in options:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
85 if oflag == '-d': mindepth = int(oarg)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
86 if oflag == '-D': maxdepth = int(oarg)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
87 if oflag == '-l': gapgapwin = int(oarg)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
88 if oflag == '-Q': minsnpmapq = int(oarg)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
89 if oflag == '-q': mingapmapq = int(oarg)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
90 if oflag == '-G': minindelscore = int(oarg)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
91 if oflag == '-s': scorefactor = int(oarg)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
92 if oflag == '-w': snpgapwin = int(oarg)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
93 if oflag == '-W': densesnpwin = int(oarg)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
94 if oflag == '-C': mincnsq = int(oarg)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
95 if oflag == '-N': densesnps = int(oarg)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
96 if oflag == '-p': printfilt = True
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
97 if oflag == '-S': minsnpq = int(oarg)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
98 if oflag == '-i': minindelq = int(oarg)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
99
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
100 if len(args) < 1:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
101 inp = sys.stdin
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
102 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
103 inp = open(args[0])
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
104
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
105 # calculate the window size
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
106 max_dist = max(gapgapwin, snpgapwin, densesnpwin)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
107
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
108 staging = []
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
109 for t in (line.strip().split() for line in inp):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
110 (flt, score) = (0, -1)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
111 # non-var sites
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
112 if t[3] == '*/*':
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
113 continue
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
114 is_snp = t[2].upper() != t[3].upper()
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
115 if not (is_snp or mincnsq):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
116 continue
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
117 # clear the out-of-range elements
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
118 while staging:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
119 # Still on the same chromosome and the first element's window still affects this position?
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
120 if staging[0][4] == t[0] and int(staging[0][5]) + staging[0][2] + max_dist >= int(t[1]):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
121 break
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
122 varFilter_aux(staging.pop(0), printfilt)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
123
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
124 # first a simple filter
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
125 if int(t[7]) < mindepth:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
126 flt = 2
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
127 elif int(t[7]) > maxdepth:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
128 flt = 3
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
129 if t[2] == '*': # an indel
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
130 if minindelq and minindelq > int(t[5]):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
131 flt = 8
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
132 elif is_snp:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
133 if minsnpq and minsnpq> int(t[5]):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
134 flt = 7
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
135 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
136 if mincnsq and mincnsq > int(t[4]):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
137 flt = 9
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
138
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
139 # site dependent filters
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
140 dlen = 0
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
141 if flt == 0:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
142 if t[2] == '*': # an indel
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
143 # If deletion, remember the length of the deletion
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
144 (a,b) = t[3].split('/')
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
145 alen = len(a) - 1
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
146 blen = len(b) - 1
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
147 if alen>blen:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
148 if a[0] == '-': dlen=alen
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
149 elif b[0] == '-': dlen=blen
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
150
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
151 if int(t[6]) < mingapmapq:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
152 flt = 1
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
153 # filtering SNPs
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
154 if int(t[5]) >= minindelscore:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
155 for x in (y for y in staging if y[3]):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
156 # Is it a SNP and is it outside the SNP filter window?
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
157 if x[0] >= 0 or int(x[5]) + x[2] + snpgapwin < int(t[1]):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
158 continue
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
159 if x[1] == 0:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
160 x[1] = 5
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
161
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
162 # calculate the filtering score (different from indel quality)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
163 score = int(t[5])
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
164 if t[8] != '*':
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
165 score += scorefactor * int(t[10])
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
166 if t[9] != '*':
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
167 score += scorefactor * int(t[11])
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
168 # check the staging list for indel filtering
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
169 for x in (y for y in staging if y[3]):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
170 # Is it a SNP and is it outside the gap filter window
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
171 if x[0] < 0 or int(x[5]) + x[2] + gapgapwin < int(t[1]):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
172 continue
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
173 if x[0] < score:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
174 x[1] = 6
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
175 else:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
176 flt = 6
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
177 break
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
178 else: # a SNP or hom-ref
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
179 if int(t[6]) < minsnpmapq:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
180 flt = 1
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
181 # check adjacent SNPs
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
182 k = 1
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
183 for x in (y for y in staging if y[3]):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
184 if x[0] < 0 and int(x[5]) + x[2] + densesnpwin >= int(t[1]) and (x[1] == 0 or x[1] == 4 or x[1] == 5):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
185 k += 1
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
186
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
187 # filtering is necessary
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
188 if k > densesnps:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
189 flt = 4
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
190 for x in (y for y in staging if y[3]):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
191 if x[0] < 0 and int(x[5]) + x[2] + densesnpwin >= int(t[1]) and x[1] == 0:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
192 x[1] = 4
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
193 else: # then check gap filter
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
194 for x in (y for y in staging if y[3]):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
195 if x[0] < 0 or int(x[5]) + x[2] + snpgapwin < int(t[1]):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
196 continue
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
197 if x[0] >= minindelscore:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
198 flt = 5
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
199 break
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
200
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
201 staging.append([score, flt, dlen, is_snp] + t)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
202
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
203 # output the last few elements in the staging list
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
204 while staging:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
205 varFilter_aux(staging.pop(0), printfilt)