Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/samtools-0.1.19/misc/varfilter.py @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 #!/software/bin/python | |
2 | |
3 # Author: lh3, converted to python and modified to add -C option by Aylwyn Scally | |
4 # | |
5 # About: | |
6 # varfilter.py is a port of Heng's samtools.pl varFilter script into | |
7 # python, with an additional -C INT option. This option sets a minimum | |
8 # consensus score, above which the script will output a pileup line | |
9 # wherever it _could have_ called a variant, even if none is actually | |
10 # called (i.e. hom-ref positions). This is important if you want to | |
11 # subsequently merge the calls with those for another individual to get a | |
12 # synoptic view of calls at each site. Without this option, and in all | |
13 # other respects, it behaves like samtools.pl varFilter. | |
14 # | |
15 # Aylwyn Scally as6@sanger.ac.uk | |
16 | |
17 | |
18 # Filtration code: | |
19 # | |
20 # C low CNS quality (hom-ref only) | |
21 # d low depth | |
22 # D high depth | |
23 # W too many SNPs in a window (SNP only) | |
24 # G close to a high-quality indel (SNP only) | |
25 # Q low RMS mapping quality (SNP only) | |
26 # g close to another indel with higher quality (indel only) | |
27 # s low SNP quality (SNP only) | |
28 # i low indel quality (indel only) | |
29 | |
30 | |
31 import sys | |
32 import getopt | |
33 | |
34 def usage(): | |
35 print '''usage: varfilter.py [options] [cns-pileup] | |
36 | |
37 Options: -Q INT minimum RMS mapping quality for SNPs | |
38 -q INT minimum RMS mapping quality for gaps | |
39 -d INT minimum read depth | |
40 -D INT maximum read depth | |
41 -S INT minimum SNP quality | |
42 -i INT minimum indel quality | |
43 -C INT minimum consensus quality for hom-ref sites | |
44 | |
45 -G INT min indel score for nearby SNP filtering | |
46 -w INT SNP within INT bp around a gap to be filtered | |
47 | |
48 -W INT window size for filtering dense SNPs | |
49 -N INT max number of SNPs in a window | |
50 | |
51 -l INT window size for filtering adjacent gaps | |
52 | |
53 -p print filtered variants''' | |
54 | |
55 def varFilter_aux(first, is_print): | |
56 try: | |
57 if first[1] == 0: | |
58 sys.stdout.write("\t".join(first[4:]) + "\n") | |
59 elif is_print: | |
60 sys.stderr.write("\t".join(["UQdDWGgsiCX"[first[1]]] + first[4:]) + "\n") | |
61 except IOError: | |
62 sys.exit() | |
63 | |
64 mindepth = 3 | |
65 maxdepth = 100 | |
66 gapgapwin = 30 | |
67 minsnpmapq = 25 | |
68 mingapmapq = 10 | |
69 minindelscore = 25 | |
70 scorefactor = 100 | |
71 snpgapwin = 10 | |
72 densesnpwin = 10 | |
73 densesnps = 2 | |
74 printfilt = False | |
75 minsnpq = 0 | |
76 minindelq = 0 | |
77 mincnsq = 0 | |
78 | |
79 try: | |
80 options, args = getopt.gnu_getopt(sys.argv[1:], 'pq:d:D:l:Q:w:W:N:G:S:i:C:', []) | |
81 except getopt.GetoptError: | |
82 usage() | |
83 sys.exit(2) | |
84 for (oflag, oarg) in options: | |
85 if oflag == '-d': mindepth = int(oarg) | |
86 if oflag == '-D': maxdepth = int(oarg) | |
87 if oflag == '-l': gapgapwin = int(oarg) | |
88 if oflag == '-Q': minsnpmapq = int(oarg) | |
89 if oflag == '-q': mingapmapq = int(oarg) | |
90 if oflag == '-G': minindelscore = int(oarg) | |
91 if oflag == '-s': scorefactor = int(oarg) | |
92 if oflag == '-w': snpgapwin = int(oarg) | |
93 if oflag == '-W': densesnpwin = int(oarg) | |
94 if oflag == '-C': mincnsq = int(oarg) | |
95 if oflag == '-N': densesnps = int(oarg) | |
96 if oflag == '-p': printfilt = True | |
97 if oflag == '-S': minsnpq = int(oarg) | |
98 if oflag == '-i': minindelq = int(oarg) | |
99 | |
100 if len(args) < 1: | |
101 inp = sys.stdin | |
102 else: | |
103 inp = open(args[0]) | |
104 | |
105 # calculate the window size | |
106 max_dist = max(gapgapwin, snpgapwin, densesnpwin) | |
107 | |
108 staging = [] | |
109 for t in (line.strip().split() for line in inp): | |
110 (flt, score) = (0, -1) | |
111 # non-var sites | |
112 if t[3] == '*/*': | |
113 continue | |
114 is_snp = t[2].upper() != t[3].upper() | |
115 if not (is_snp or mincnsq): | |
116 continue | |
117 # clear the out-of-range elements | |
118 while staging: | |
119 # Still on the same chromosome and the first element's window still affects this position? | |
120 if staging[0][4] == t[0] and int(staging[0][5]) + staging[0][2] + max_dist >= int(t[1]): | |
121 break | |
122 varFilter_aux(staging.pop(0), printfilt) | |
123 | |
124 # first a simple filter | |
125 if int(t[7]) < mindepth: | |
126 flt = 2 | |
127 elif int(t[7]) > maxdepth: | |
128 flt = 3 | |
129 if t[2] == '*': # an indel | |
130 if minindelq and minindelq > int(t[5]): | |
131 flt = 8 | |
132 elif is_snp: | |
133 if minsnpq and minsnpq> int(t[5]): | |
134 flt = 7 | |
135 else: | |
136 if mincnsq and mincnsq > int(t[4]): | |
137 flt = 9 | |
138 | |
139 # site dependent filters | |
140 dlen = 0 | |
141 if flt == 0: | |
142 if t[2] == '*': # an indel | |
143 # If deletion, remember the length of the deletion | |
144 (a,b) = t[3].split('/') | |
145 alen = len(a) - 1 | |
146 blen = len(b) - 1 | |
147 if alen>blen: | |
148 if a[0] == '-': dlen=alen | |
149 elif b[0] == '-': dlen=blen | |
150 | |
151 if int(t[6]) < mingapmapq: | |
152 flt = 1 | |
153 # filtering SNPs | |
154 if int(t[5]) >= minindelscore: | |
155 for x in (y for y in staging if y[3]): | |
156 # Is it a SNP and is it outside the SNP filter window? | |
157 if x[0] >= 0 or int(x[5]) + x[2] + snpgapwin < int(t[1]): | |
158 continue | |
159 if x[1] == 0: | |
160 x[1] = 5 | |
161 | |
162 # calculate the filtering score (different from indel quality) | |
163 score = int(t[5]) | |
164 if t[8] != '*': | |
165 score += scorefactor * int(t[10]) | |
166 if t[9] != '*': | |
167 score += scorefactor * int(t[11]) | |
168 # check the staging list for indel filtering | |
169 for x in (y for y in staging if y[3]): | |
170 # Is it a SNP and is it outside the gap filter window | |
171 if x[0] < 0 or int(x[5]) + x[2] + gapgapwin < int(t[1]): | |
172 continue | |
173 if x[0] < score: | |
174 x[1] = 6 | |
175 else: | |
176 flt = 6 | |
177 break | |
178 else: # a SNP or hom-ref | |
179 if int(t[6]) < minsnpmapq: | |
180 flt = 1 | |
181 # check adjacent SNPs | |
182 k = 1 | |
183 for x in (y for y in staging if y[3]): | |
184 if x[0] < 0 and int(x[5]) + x[2] + densesnpwin >= int(t[1]) and (x[1] == 0 or x[1] == 4 or x[1] == 5): | |
185 k += 1 | |
186 | |
187 # filtering is necessary | |
188 if k > densesnps: | |
189 flt = 4 | |
190 for x in (y for y in staging if y[3]): | |
191 if x[0] < 0 and int(x[5]) + x[2] + densesnpwin >= int(t[1]) and x[1] == 0: | |
192 x[1] = 4 | |
193 else: # then check gap filter | |
194 for x in (y for y in staging if y[3]): | |
195 if x[0] < 0 or int(x[5]) + x[2] + snpgapwin < int(t[1]): | |
196 continue | |
197 if x[0] >= minindelscore: | |
198 flt = 5 | |
199 break | |
200 | |
201 staging.append([score, flt, dlen, is_snp] + t) | |
202 | |
203 # output the last few elements in the staging list | |
204 while staging: | |
205 varFilter_aux(staging.pop(0), printfilt) |