comparison MDtag_filter.py @ 1:f3d024fd9bf4 draft

Uploaded python script
author boris
date Tue, 24 Apr 2012 12:14:56 -0400
parents
children
comparison
equal deleted inserted replaced
0:447b7748eb83 1:f3d024fd9bf4
1 #!/usr/bin/env python
2 # Boris Rebolledo Jaramillo
3 # berebolledo@gmail.com
4 # v1.0.2
5
6 """Filter mapped reads on MD tag string.
7
8 This tool reads the MD tag of mapped reads (see SAM format specification).
9 The user defines the 5' and 3' windows n and m (in bp), respectively.
10 The mapped read is discarded if it contains any number of mismatches within n
11 bases of the read 5' end and within m bases of the read 3' end.
12 Option: save discarded reads in an additional SAM file.
13
14 usage: %prog in.sam n m out.sam saveDiscarded?[yes/no] [discarded.sam]
15
16 """
17
18 import sys,re
19
20 if len(sys.argv) < 6:
21 sys.exit('ERROR! Usage: %prog <in.sam> <n> <m> <out.sam> '
22 '<saveDiscarded?[yes/no]> [<discarded.sam>]')
23 if sys.argv[5] == 'yes' and len(sys.argv) < 7:
24 sys.exit('ERROR! Usage: %prog <in.sam> <n> <m> <out.sam> '
25 '<saveDiscarded?[yes/no]> [<discarded.sam>]')
26
27 sam_file=list(open(sys.argv[1]))
28 sam_mdfiltered=open(sys.argv[4],'w+')
29 if sys.argv[5] == 'yes':
30 sam_discarded=open(sys.argv[6],'w+')
31
32
33 # The MD tag is generated out of the alignment operations, regardless the strand
34 # the read was mapped to. It describes the alignment from the leftmost aligned
35 # base, which might be either the 5' or 3' end of the original read.
36 # Luckily, the read orientation in the alignment can be extracted from the
37 # alignment flag.
38 # The code used here to identify the mapped reads orientation was taken from
39 # the 'explain_sam_flags.py' script of the Picard-Tools.
40 # It defines the meaning of the sam flags in plain English, so they can be used
41 # to identify reads mapped to the reverse strand: flag = 16
42 # http://picard-tools.sourcearchive.com/documentation/1.25-1/
43
44 lstFlags = [
45 ("read paired", 0x1),
46 ("read mapped in proper pair", 0x2),
47 ("read unmapped", 0x4),
48 ("mate unmapped", 0x8),
49 ("read reverse strand", 0x10),
50 ("mate reverse strand", 0x20),
51 ("first in pair", 0x40),
52 ("second in pair", 0x80),
53 ("not primary alignment", 0x100),
54 ("read fails platform/vendor quality checks", 0x200),
55 ("read is PCR or optical duplicate", 0x400)
56 ]
57
58 for line in sam_file:
59 if line.split("\t")[0].startswith('@'):
60 if sys.argv[5] == 'yes':
61 sam_discarded.write(line)
62 sam_mdfiltered.write(line)
63 else:
64 sam_mdfiltered.write(line)
65 else:
66 if len(line.split("\t")) > 11: # Any SAM optional fields?
67 for field in line.split("\t")[11:]:
68 if re.match('MD:Z:[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*', field): # is MD tag field present?
69 flag = int(line.split("\t")[1])
70 for strFlagName, iMask in lstFlags:
71 if flag & iMask:
72 if iMask == 16:
73 strand = 'reverse'
74 break
75 else:
76 strand = 'forward'
77 else: # flag = 0 is not part of the definitions,
78 # but it corresponds to forward strand mapping
79 strand = 'forward'
80 # The position of the optional fields in the SAM format is variable. Finds the location of the MD tag:
81 mdtag_idx = [i for i, item in enumerate(line.split("\t")[11:]) if re.match('MD:Z:[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*', item)][0]
82 mdtag=line.split("\t")[mdtag_idx+11].strip().split(":")[2]
83 md_list = re.split(r'(\D)',mdtag)
84 if strand == 'forward':
85 if int(md_list[0]) >= int(sys.argv[2]) and int(md_list[-1]) >= int(sys.argv[3]):
86 sam_mdfiltered.write(line)
87 else:
88 if sys.argv[5] == 'yes':
89 sam_discarded.write(line)
90 elif strand == 'reverse':
91 if int(md_list[0]) >= int(sys.argv[3]) and int(md_list[-1]) >= int(sys.argv[2]):
92 sam_mdfiltered.write(line)
93 else:
94 if sys.argv[5] == 'yes':
95 sam_discarded.write(line)
96 break
97
98 sam_mdfiltered.close()
99
100 if sys.argv[5] == 'yes':
101 sam_discarded.close()