diff PsiCLASS-1.0.2/samtools-0.1.19/misc/varfilter.py @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/PsiCLASS-1.0.2/samtools-0.1.19/misc/varfilter.py	Fri Mar 26 16:52:45 2021 +0000
@@ -0,0 +1,205 @@
+#!/software/bin/python
+
+# Author: lh3, converted to python and modified to add -C option by Aylwyn Scally
+#
+# About:
+#   varfilter.py is a port of Heng's samtools.pl varFilter script into 
+#   python, with an additional -C INT option. This option sets a minimum 
+#   consensus score, above which the script will output a pileup line 
+#   wherever it _could have_ called a variant, even if none is actually 
+#   called (i.e. hom-ref positions). This is important if you want to
+#   subsequently merge the calls with those for another individual to get a
+#   synoptic view of calls at each site. Without this option, and in all 
+#   other respects, it behaves like samtools.pl varFilter.
+#   
+#   Aylwyn Scally as6@sanger.ac.uk
+
+
+# Filtration code:
+#
+# C low CNS quality (hom-ref only)
+# d low depth
+# D high depth
+# W too many SNPs in a window (SNP only)
+# G close to a high-quality indel (SNP only)
+# Q low RMS mapping quality (SNP only)
+# g close to another indel with higher quality (indel only)
+# s low SNP quality (SNP only)
+# i low indel quality (indel only)
+
+
+import sys
+import getopt
+
+def usage():
+	print '''usage: varfilter.py [options] [cns-pileup]
+
+Options: -Q INT	minimum RMS mapping quality for SNPs
+		 -q INT	minimum RMS mapping quality for gaps
+		 -d INT	minimum read depth 
+		 -D INT	maximum read depth
+		 -S INT	minimum SNP quality
+		 -i INT	minimum indel quality
+		 -C INT	minimum consensus quality for hom-ref sites
+
+		 -G INT	min indel score for nearby SNP filtering
+		 -w INT	SNP within INT bp around a gap to be filtered
+
+		 -W INT	window size for filtering dense SNPs
+		 -N INT	max number of SNPs in a window
+
+		 -l INT	window size for filtering adjacent gaps
+
+		 -p print filtered variants'''
+
+def varFilter_aux(first, is_print):
+	try:
+		if first[1] == 0:
+			sys.stdout.write("\t".join(first[4:]) + "\n")
+		elif is_print:
+			sys.stderr.write("\t".join(["UQdDWGgsiCX"[first[1]]] + first[4:]) + "\n")
+	except IOError:
+		sys.exit()
+ 
+mindepth = 3
+maxdepth = 100
+gapgapwin = 30
+minsnpmapq = 25
+mingapmapq = 10
+minindelscore = 25
+scorefactor = 100
+snpgapwin = 10
+densesnpwin = 10
+densesnps = 2
+printfilt = False
+minsnpq = 0
+minindelq = 0
+mincnsq = 0
+
+try:
+	options, args = getopt.gnu_getopt(sys.argv[1:], 'pq:d:D:l:Q:w:W:N:G:S:i:C:', [])
+except getopt.GetoptError:
+	usage()
+	sys.exit(2)
+for (oflag, oarg) in options:
+	if oflag == '-d': mindepth = int(oarg)
+	if oflag == '-D': maxdepth = int(oarg)
+	if oflag == '-l': gapgapwin = int(oarg)
+	if oflag == '-Q': minsnpmapq = int(oarg)
+	if oflag == '-q': mingapmapq = int(oarg)
+	if oflag == '-G': minindelscore = int(oarg)
+	if oflag == '-s': scorefactor = int(oarg)
+	if oflag == '-w': snpgapwin = int(oarg)
+	if oflag == '-W': densesnpwin = int(oarg)
+	if oflag == '-C': mincnsq = int(oarg)
+	if oflag == '-N': densesnps = int(oarg)
+	if oflag == '-p': printfilt = True
+	if oflag == '-S': minsnpq = int(oarg)
+	if oflag == '-i': minindelq = int(oarg)
+
+if len(args) < 1:
+	inp = sys.stdin
+else:
+	inp = open(args[0])
+
+# calculate the window size
+max_dist = max(gapgapwin, snpgapwin, densesnpwin)
+
+staging = []
+for t in (line.strip().split() for line in inp):
+	(flt, score) = (0, -1)
+	# non-var sites
+	if t[3] == '*/*':
+		continue
+	is_snp = t[2].upper() != t[3].upper()
+	if not (is_snp or mincnsq):
+		continue
+	# clear the out-of-range elements
+	while staging:
+		# Still on the same chromosome and the first element's window still affects this position?  
+		if staging[0][4] == t[0] and int(staging[0][5]) + staging[0][2] + max_dist >= int(t[1]):
+			break
+		varFilter_aux(staging.pop(0), printfilt)
+	
+	# first a simple filter
+	if int(t[7]) < mindepth:
+		flt = 2
+	elif int(t[7]) > maxdepth:
+		flt = 3
+	if t[2] == '*': # an indel
+		if minindelq and minindelq > int(t[5]):
+			flt = 8
+	elif is_snp:
+		if minsnpq and minsnpq> int(t[5]):
+			flt = 7
+	else:
+		if mincnsq and mincnsq > int(t[4]):
+			flt = 9
+
+	# site dependent filters
+	dlen = 0
+	if flt == 0:
+		if t[2] == '*': # an indel
+			# If deletion, remember the length of the deletion
+			(a,b) = t[3].split('/')
+			alen = len(a) - 1
+			blen = len(b) - 1
+			if alen>blen:
+				if a[0] == '-': dlen=alen 
+			elif b[0] == '-': dlen=blen 
+
+			if int(t[6]) < mingapmapq:
+				flt = 1
+			# filtering SNPs
+			if int(t[5]) >= minindelscore:
+				for x in (y for y in staging if y[3]):
+					# Is it a SNP and is it outside the SNP filter window?
+					if x[0] >= 0 or int(x[5]) + x[2] + snpgapwin < int(t[1]):
+						continue
+					if x[1] == 0:
+						x[1] = 5
+			
+			# calculate the filtering score (different from indel quality)
+			score = int(t[5])
+			if t[8] != '*':
+				score += scorefactor * int(t[10])
+			if t[9] != '*':
+				score += scorefactor * int(t[11])
+			# check the staging list for indel filtering
+			for x in (y for y in staging if y[3]):
+			  # Is it a SNP and is it outside the gap filter window
+				if x[0] < 0 or int(x[5]) + x[2] + gapgapwin < int(t[1]):
+					continue
+				if x[0] < score:
+					x[1] = 6
+				else:
+					flt = 6
+					break
+		else: # a SNP or hom-ref
+			if int(t[6]) < minsnpmapq:
+				flt = 1
+			# check adjacent SNPs
+			k = 1
+			for x in (y for y in staging if y[3]):
+				if x[0] < 0 and int(x[5]) + x[2] + densesnpwin >= int(t[1]) and (x[1] == 0 or x[1] == 4 or x[1] == 5):
+					k += 1
+			
+			# filtering is necessary
+			if k > densesnps:
+				flt = 4
+				for x in (y for y in staging if y[3]):
+					if x[0] < 0 and int(x[5]) + x[2] + densesnpwin >= int(t[1]) and x[1] == 0:
+						x[1] = 4
+			else: # then check gap filter
+				for x in (y for y in staging if y[3]):
+					if x[0] < 0 or int(x[5]) + x[2] + snpgapwin < int(t[1]):
+						continue
+					if x[0] >= minindelscore:
+						flt = 5
+						break
+	
+	staging.append([score, flt, dlen, is_snp] + t)
+  
+# output the last few elements in the staging list
+while staging:
+	varFilter_aux(staging.pop(0), printfilt)