Mercurial > repos > boris > filter_on_md
comparison MDtag_filter.py @ 1:f3d024fd9bf4 draft
Uploaded python script
author | boris |
---|---|
date | Tue, 24 Apr 2012 12:14:56 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:447b7748eb83 | 1:f3d024fd9bf4 |
---|---|
1 #!/usr/bin/env python | |
2 # Boris Rebolledo Jaramillo | |
3 # berebolledo@gmail.com | |
4 # v1.0.2 | |
5 | |
6 """Filter mapped reads on MD tag string. | |
7 | |
8 This tool reads the MD tag of mapped reads (see SAM format specification). | |
9 The user defines the 5' and 3' windows n and m (in bp), respectively. | |
10 The mapped read is discarded if it contains any number of mismatches within n | |
11 bases of the read 5' end and within m bases of the read 3' end. | |
12 Option: save discarded reads in an additional SAM file. | |
13 | |
14 usage: %prog in.sam n m out.sam saveDiscarded?[yes/no] [discarded.sam] | |
15 | |
16 """ | |
17 | |
18 import sys,re | |
19 | |
20 if len(sys.argv) < 6: | |
21 sys.exit('ERROR! Usage: %prog <in.sam> <n> <m> <out.sam> ' | |
22 '<saveDiscarded?[yes/no]> [<discarded.sam>]') | |
23 if sys.argv[5] == 'yes' and len(sys.argv) < 7: | |
24 sys.exit('ERROR! Usage: %prog <in.sam> <n> <m> <out.sam> ' | |
25 '<saveDiscarded?[yes/no]> [<discarded.sam>]') | |
26 | |
27 sam_file=list(open(sys.argv[1])) | |
28 sam_mdfiltered=open(sys.argv[4],'w+') | |
29 if sys.argv[5] == 'yes': | |
30 sam_discarded=open(sys.argv[6],'w+') | |
31 | |
32 | |
33 # The MD tag is generated out of the alignment operations, regardless the strand | |
34 # the read was mapped to. It describes the alignment from the leftmost aligned | |
35 # base, which might be either the 5' or 3' end of the original read. | |
36 # Luckily, the read orientation in the alignment can be extracted from the | |
37 # alignment flag. | |
38 # The code used here to identify the mapped reads orientation was taken from | |
39 # the 'explain_sam_flags.py' script of the Picard-Tools. | |
40 # It defines the meaning of the sam flags in plain English, so they can be used | |
41 # to identify reads mapped to the reverse strand: flag = 16 | |
42 # http://picard-tools.sourcearchive.com/documentation/1.25-1/ | |
43 | |
44 lstFlags = [ | |
45 ("read paired", 0x1), | |
46 ("read mapped in proper pair", 0x2), | |
47 ("read unmapped", 0x4), | |
48 ("mate unmapped", 0x8), | |
49 ("read reverse strand", 0x10), | |
50 ("mate reverse strand", 0x20), | |
51 ("first in pair", 0x40), | |
52 ("second in pair", 0x80), | |
53 ("not primary alignment", 0x100), | |
54 ("read fails platform/vendor quality checks", 0x200), | |
55 ("read is PCR or optical duplicate", 0x400) | |
56 ] | |
57 | |
58 for line in sam_file: | |
59 if line.split("\t")[0].startswith('@'): | |
60 if sys.argv[5] == 'yes': | |
61 sam_discarded.write(line) | |
62 sam_mdfiltered.write(line) | |
63 else: | |
64 sam_mdfiltered.write(line) | |
65 else: | |
66 if len(line.split("\t")) > 11: # Any SAM optional fields? | |
67 for field in line.split("\t")[11:]: | |
68 if re.match('MD:Z:[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*', field): # is MD tag field present? | |
69 flag = int(line.split("\t")[1]) | |
70 for strFlagName, iMask in lstFlags: | |
71 if flag & iMask: | |
72 if iMask == 16: | |
73 strand = 'reverse' | |
74 break | |
75 else: | |
76 strand = 'forward' | |
77 else: # flag = 0 is not part of the definitions, | |
78 # but it corresponds to forward strand mapping | |
79 strand = 'forward' | |
80 # The position of the optional fields in the SAM format is variable. Finds the location of the MD tag: | |
81 mdtag_idx = [i for i, item in enumerate(line.split("\t")[11:]) if re.match('MD:Z:[0-9]+(([A-Z]|\^[A-Z]+)[0-9]+)*', item)][0] | |
82 mdtag=line.split("\t")[mdtag_idx+11].strip().split(":")[2] | |
83 md_list = re.split(r'(\D)',mdtag) | |
84 if strand == 'forward': | |
85 if int(md_list[0]) >= int(sys.argv[2]) and int(md_list[-1]) >= int(sys.argv[3]): | |
86 sam_mdfiltered.write(line) | |
87 else: | |
88 if sys.argv[5] == 'yes': | |
89 sam_discarded.write(line) | |
90 elif strand == 'reverse': | |
91 if int(md_list[0]) >= int(sys.argv[3]) and int(md_list[-1]) >= int(sys.argv[2]): | |
92 sam_mdfiltered.write(line) | |
93 else: | |
94 if sys.argv[5] == 'yes': | |
95 sam_discarded.write(line) | |
96 break | |
97 | |
98 sam_mdfiltered.close() | |
99 | |
100 if sys.argv[5] == 'yes': | |
101 sam_discarded.close() |