1
|
1 #!/usr/bin/env python
|
|
2 # Boris Rebolledo Jaramillo
|
|
3 # berebolledo@gmail.com
|
|
4 # v1.0.2
|
|
5
|
|
6 """Filter mapped reads on MD tag string.
|
|
7
|
|
8 This tool reads the MD tag of mapped reads (see SAM format specification).
|
|
9 The user defines the 5' and 3' windows n and m (in bp), respectively.
|
|
10 The mapped read is discarded if it contains any number of mismatches within n
|
|
11 bases of the read 5' end and within m bases of the read 3' end.
|
|
12 Option: save discarded reads in an additional SAM file.
|
|
13
|
|
14 usage: %prog in.sam n m out.sam saveDiscarded?[yes/no] [discarded.sam]
|
|
15
|
|
16 """
|
|
17
|
|
18 import sys,re
|
|
19
|
|
20 if len(sys.argv) < 6:
|
|
21 sys.exit('ERROR! Usage: %prog <in.sam> <n> <m> <out.sam> '
|
|
22 '<saveDiscarded?[yes/no]> [<discarded.sam>]')
|
|
23 if sys.argv[5] == 'yes' and len(sys.argv) < 7:
|
|
24 sys.exit('ERROR! Usage: %prog <in.sam> <n> <m> <out.sam> '
|
|
25 '<saveDiscarded?[yes/no]> [<discarded.sam>]')
|
|
26
|
|
27 sam_file=list(open(sys.argv[1]))
|
|
28 sam_mdfiltered=open(sys.argv[4],'w+')
|
|
29 if sys.argv[5] == 'yes':
|
|
30 sam_discarded=open(sys.argv[6],'w+')
|
|
31
|
|
32
|
|
33 # The MD tag is generated out of the alignment operations, regardless the strand
|
|
34 # the read was mapped to. It describes the alignment from the leftmost aligned
|
|
35 # base, which might be either the 5' or 3' end of the original read.
|
|
36 # Luckily, the read orientation in the alignment can be extracted from the
|
|
37 # alignment flag.
|
|
38 # The code used here to identify the mapped reads orientation was taken from
|
|
39 # the 'explain_sam_flags.py' script of the Picard-Tools.
|
|
40 # It defines the meaning of the sam flags in plain English, so they can be used
|
|
41 # to identify reads mapped to the reverse strand: flag = 16
|
|
42 # http://picard-tools.sourcearchive.com/documentation/1.25-1/
|
|
43
|
|
44 lstFlags = [
|
|
45 ("read paired", 0x1),
|
|
46 ("read mapped in proper pair", 0x2),
|
|
47 ("read unmapped", 0x4),
|
|
48 ("mate unmapped", 0x8),
|
|
49 ("read reverse strand", 0x10),
|
|
50 ("mate reverse strand", 0x20),
|
|
51 ("first in pair", 0x40),
|
|
52 ("second in pair", 0x80),
|
|
53 ("not primary alignment", 0x100),
|
|
54 ("read fails platform/vendor quality checks", 0x200),
|
|
55 ("read is PCR or optical duplicate", 0x400)
|
|
56 ]
|
|
57
|
|
58 for line in sam_file:
|
|
59 if line.split("\t")[0].startswith('@'):
|
|
60 if sys.argv[5] == 'yes':
|
|
61 sam_discarded.write(line)
|
|
62 sam_mdfiltered.write(line)
|
|
63 else:
|
|
64 sam_mdfiltered.write(line)
|
|
65 else:
|
|
66 if len(line.split("\t")) > 11: # Any SAM optional fields?
|
|
67 for field in line.split("\t")[11:]:
|
|
68 if re.match('MD:Z:[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*', field): # is MD tag field present?
|
|
69 flag = int(line.split("\t")[1])
|
|
70 for strFlagName, iMask in lstFlags:
|
|
71 if flag & iMask:
|
|
72 if iMask == 16:
|
|
73 strand = 'reverse'
|
|
74 break
|
|
75 else:
|
|
76 strand = 'forward'
|
|
77 else: # flag = 0 is not part of the definitions,
|
|
78 # but it corresponds to forward strand mapping
|
|
79 strand = 'forward'
|
|
80 # The position of the optional fields in the SAM format is variable. Finds the location of the MD tag:
|
|
81 mdtag_idx = [i for i, item in enumerate(line.split("\t")[11:]) if re.match('MD:Z:[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*', item)][0]
|
|
82 mdtag=line.split("\t")[mdtag_idx+11].strip().split(":")[2]
|
|
83 md_list = re.split(r'(\D)',mdtag)
|
|
84 if strand == 'forward':
|
|
85 if int(md_list[0]) >= int(sys.argv[2]) and int(md_list[-1]) >= int(sys.argv[3]):
|
|
86 sam_mdfiltered.write(line)
|
|
87 else:
|
|
88 if sys.argv[5] == 'yes':
|
|
89 sam_discarded.write(line)
|
|
90 elif strand == 'reverse':
|
|
91 if int(md_list[0]) >= int(sys.argv[3]) and int(md_list[-1]) >= int(sys.argv[2]):
|
|
92 sam_mdfiltered.write(line)
|
|
93 else:
|
|
94 if sys.argv[5] == 'yes':
|
|
95 sam_discarded.write(line)
|
|
96 break
|
|
97
|
|
98 sam_mdfiltered.close()
|
|
99
|
|
100 if sys.argv[5] == 'yes':
|
|
101 sam_discarded.close() |