Mercurial > repos > artbio > lumpy_sv
comparison extractSplitReads_BwaMem.py @ 1:1ed8619a5611 draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/lumpy-sv commit 0b55a106b1f76e3cc3d89932fef2cc8d3eb24e4f
author | artbio |
---|---|
date | Wed, 26 Jul 2017 18:17:01 -0400 |
parents | 796552c157de |
children | 48e97429a749 |
comparison
equal
deleted
inserted
replaced
0:796552c157de | 1:1ed8619a5611 |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 | 2 |
3 import re | |
3 import sys | 4 import sys |
4 import getopt | |
5 import string | |
6 from optparse import OptionParser | 5 from optparse import OptionParser |
7 import re | 6 |
8 | 7 |
9 def extractSplitsFromBwaMem(inFile,numSplits,includeDups,minNonOverlap): | 8 def extractSplitsFromBwaMem(inFile, numSplits, includeDups, minNonOverlap): |
10 if inFile == "stdin": | 9 if inFile == "stdin": |
11 data = sys.stdin | 10 data = sys.stdin |
12 else: | 11 else: |
13 data = open(inFile, 'r') | 12 data = open(inFile, 'r') |
14 for line in data: | 13 for line in data: |
15 split = 0 | 14 split = 0 |
16 if line[0] == '@': | 15 if line[0] == '@': |
17 print line.strip() | 16 print(line.strip()) |
18 continue | 17 continue |
19 samList = line.strip().split('\t') | 18 samList = line.strip().split('\t') |
20 sam = SAM(samList) | 19 sam = SAM(samList) |
21 if includeDups==0 and (1024 & sam.flag)==1024: | 20 if includeDups == 0 and (1024 & sam.flag) == 1024: |
22 continue | 21 continue |
23 for el in sam.tags: | 22 for el in sam.tags: |
24 if "SA:" in el: | 23 if "SA:" in el: |
25 if(len(el.split(";")))<=numSplits: | 24 if(len(el.split(";"))) <= numSplits: |
26 split = 1 | 25 split = 1 |
27 mate = el.split(",") | 26 mate = el.split(",") |
28 mateCigar = mate[3] | 27 mateCigar = mate[3] |
29 mateFlag = int(0) | 28 mateFlag = int(0) |
30 if mate[2]=="-": mateFlag = int(16) | 29 if mate[2] == "-": |
30 mateFlag = int(16) | |
31 if split: | 31 if split: |
32 read1 = sam.flag & 64 | 32 read1 = sam.flag & 64 |
33 if read1 == 64: tag = "_1" | 33 if read1 == 64: |
34 else: tag="_2" | 34 tag = "_1" |
35 else: | |
36 tag = "_2" | |
35 samList[0] = sam.query + tag | 37 samList[0] = sam.query + tag |
36 readCigar = sam.cigar | 38 readCigar = sam.cigar |
37 readCigarOps = extractCigarOps(readCigar,sam.flag) | 39 readCigarOps = extractCigarOps(readCigar, sam.flag) |
38 readQueryPos = calcQueryPosFromCigar(readCigarOps) | 40 readQueryPos = calcQueryPosFromCigar(readCigarOps) |
39 mateCigarOps = extractCigarOps(mateCigar,mateFlag) | 41 mateCigarOps = extractCigarOps(mateCigar, mateFlag) |
40 mateQueryPos = calcQueryPosFromCigar(mateCigarOps) | 42 mateQueryPos = calcQueryPosFromCigar(mateCigarOps) |
41 overlap = calcQueryOverlap(readQueryPos.qsPos,readQueryPos.qePos,mateQueryPos.qsPos,mateQueryPos.qePos) | 43 overlap = calcQueryOverlap(readQueryPos.qsPos, readQueryPos.qePos, |
44 mateQueryPos.qsPos, mateQueryPos.qePos) | |
42 nonOverlap1 = 1 + readQueryPos.qePos - readQueryPos.qsPos - overlap | 45 nonOverlap1 = 1 + readQueryPos.qePos - readQueryPos.qsPos - overlap |
43 nonOverlap2 = 1 + mateQueryPos.qePos - mateQueryPos.qsPos - overlap | 46 nonOverlap2 = 1 + mateQueryPos.qePos - mateQueryPos.qsPos - overlap |
44 mno = min(nonOverlap1, nonOverlap2) | 47 mno = min(nonOverlap1, nonOverlap2) |
45 if mno >= minNonOverlap: | 48 if mno >= minNonOverlap: |
46 print "\t".join(samList) | 49 print("\t".join(samList)) |
47 | 50 |
48 #-------------------------------------------------------------------------------------------------- | 51 # ----------------------------------------------------------------------- |
49 # functions | 52 # functions |
50 #-------------------------------------------------------------------------------------------------- | 53 # ----------------------------------------------------------------------- |
54 | |
51 | 55 |
52 class SAM (object): | 56 class SAM (object): |
53 """ | 57 """ |
54 __very__ basic class for SAM input. | 58 __very__ basic class for SAM input. |
55 """ | 59 """ |
56 def __init__(self, samList = []): | 60 def __init__(self, samList=[]): |
57 if len(samList) > 0: | 61 if len(samList) > 0: |
58 self.query = samList[0] | 62 self.query = samList[0] |
59 self.flag = int(samList[1]) | 63 self.flag = int(samList[1]) |
60 self.ref = samList[2] | 64 self.ref = samList[2] |
61 self.pos = int(samList[3]) | 65 self.pos = int(samList[3]) |
62 self.mapq = int(samList[4]) | 66 self.mapq = int(samList[4]) |
63 self.cigar = samList[5] | 67 self.cigar = samList[5] |
64 self.matRef = samList[6] | 68 self.matRef = samList[6] |
65 self.matePos = int(samList[7]) | 69 self.matePos = int(samList[7]) |
66 self.iSize = int(samList[8]) | 70 self.iSize = int(samList[8]) |
67 self.seq = samList[9] | 71 self.seq = samList[9] |
68 self.qual = samList[10] | 72 self.qual = samList[10] |
69 self.tags = samList[11:]#tags is a list of each tag:vtype:value sets | 73 self.tags = samList[11:] |
70 self.valid = 1 | 74 # tags is a list of each tag:vtype:value sets |
75 self.valid = 1 | |
71 else: | 76 else: |
72 self.valid = 0 | 77 self.valid = 0 |
73 self.query = 'null' | 78 self.query = 'null' |
74 | 79 |
75 def extractTagValue (self, tagID): | 80 def extractTagValue(self, tagID): |
76 for tag in self.tags: | 81 for tag in self.tags: |
77 tagParts = tag.split(':', 2); | 82 tagParts = tag.split(':', 2) |
78 if (tagParts[0] == tagID): | 83 if (tagParts[0] == tagID): |
79 if (tagParts[1] == 'i'): | 84 if (tagParts[1] == 'i'): |
80 return int(tagParts[2]); | 85 return int(tagParts[2]) |
81 elif (tagParts[1] == 'H'): | 86 elif (tagParts[1] == 'H'): |
82 return int(tagParts[2],16); | 87 return int(tagParts[2], 16) |
83 return tagParts[2]; | 88 return tagParts[2] |
84 return None; | 89 return None |
85 | 90 |
86 #----------------------------------------------- | 91 |
87 cigarPattern = '([0-9]+[MIDNSHP])' | 92 cigarPattern = '([0-9]+[MIDNSHP])' |
88 cigarSearch = re.compile(cigarPattern) | 93 cigarSearch = re.compile(cigarPattern) |
89 atomicCigarPattern = '([0-9]+)([MIDNSHP])' | 94 atomicCigarPattern = '([0-9]+)([MIDNSHP])' |
90 atomicCigarSearch = re.compile(atomicCigarPattern) | 95 atomicCigarSearch = re.compile(atomicCigarPattern) |
91 | 96 |
92 def extractCigarOps(cigar,flag): | 97 |
98 def extractCigarOps(cigar, flag): | |
93 if (cigar == "*"): | 99 if (cigar == "*"): |
94 cigarOps = [] | 100 cigarOps = [] |
95 elif (flag & 0x0010): | 101 elif (flag & 0x0010): |
96 cigarOpStrings = cigarSearch.findall(cigar) | 102 cigarOpStrings = cigarSearch.findall(cigar) |
97 cigarOps = [] | 103 cigarOps = [] |
102 cigar = cigarOp(cigarOpList[0][0], cigarOpList[0][1]) | 108 cigar = cigarOp(cigarOpList[0][0], cigarOpList[0][1]) |
103 # add to the list of cigarOps | 109 # add to the list of cigarOps |
104 cigarOps.append(cigar) | 110 cigarOps.append(cigar) |
105 cigarOps = cigarOps | 111 cigarOps = cigarOps |
106 cigarOps.reverse() | 112 cigarOps.reverse() |
107 ##do in reverse order because negative strand## | 113 # do in reverse order because negative strand |
108 else: | 114 else: |
109 cigarOpStrings = cigarSearch.findall(cigar) | 115 cigarOpStrings = cigarSearch.findall(cigar) |
110 cigarOps = [] | 116 cigarOps = [] |
111 for opString in cigarOpStrings: | 117 for opString in cigarOpStrings: |
112 cigarOpList = atomicCigarSearch.findall(opString) | 118 cigarOpList = atomicCigarSearch.findall(opString) |
115 # add to the list of cigarOps | 121 # add to the list of cigarOps |
116 cigarOps.append(cigar) | 122 cigarOps.append(cigar) |
117 # cigarOps = cigarOps | 123 # cigarOps = cigarOps |
118 return(cigarOps) | 124 return(cigarOps) |
119 | 125 |
126 | |
120 def calcQueryPosFromCigar(cigarOps): | 127 def calcQueryPosFromCigar(cigarOps): |
121 qsPos = 0 | 128 qsPos = 0 |
122 qePos = 0 | 129 qePos = 0 |
123 qLen = 0 | 130 qLen = 0 |
124 # if first op is a H, need to shift start position | 131 # if first op is a H, need to shift start position |
125 # the opPosition counter sees if the for loop is looking at the first index of the cigar object | 132 # the opPosition counter sees if the for loop is looking |
133 # at the first index of the cigar object | |
126 opPosition = 0 | 134 opPosition = 0 |
127 for cigar in cigarOps: | 135 for cigar in cigarOps: |
128 if opPosition == 0 and (cigar.op == 'H' or cigar.op == 'S'): | 136 if opPosition == 0 and (cigar.op == 'H' or cigar.op == 'S'): |
129 qsPos += cigar.length | 137 qsPos += cigar.length |
130 qePos += cigar.length | 138 qePos += cigar.length |
131 qLen += cigar.length | 139 qLen += cigar.length |
132 elif opPosition > 0 and (cigar.op == 'H' or cigar.op == 'S'): | 140 elif opPosition > 0 and (cigar.op == 'H' or cigar.op == 'S'): |
133 qLen += cigar.length | 141 qLen += cigar.length |
134 elif cigar.op == 'M' or cigar.op == 'I': | 142 elif cigar.op == 'M' or cigar.op == 'I': |
135 qePos += cigar.length | 143 qePos += cigar.length |
136 qLen += cigar.length | 144 qLen += cigar.length |
137 opPosition += 1 | 145 opPosition += 1 |
138 d = queryPos(qsPos, qePos, qLen); | 146 d = queryPos(qsPos, qePos, qLen) |
139 return d | 147 return d |
148 | |
140 | 149 |
141 class cigarOp (object): | 150 class cigarOp (object): |
142 """ | 151 """ |
143 sturct to store a discrete CIGAR operations | 152 sturct to store a discrete CIGAR operations |
144 """ | 153 """ |
145 def __init__(self, opLength, op): | 154 def __init__(self, opLength, op): |
146 self.length = int(opLength) | 155 self.length = int(opLength) |
147 self.op = op | 156 self.op = op |
157 | |
148 | 158 |
149 class queryPos (object): | 159 class queryPos (object): |
150 """ | 160 """ |
151 struct to store the start and end positions of query CIGAR operations | 161 struct to store the start and end positions of query CIGAR operations |
152 """ | 162 """ |
153 def __init__(self, qsPos, qePos, qLen): | 163 def __init__(self, qsPos, qePos, qLen): |
154 self.qsPos = int(qsPos) | 164 self.qsPos = int(qsPos) |
155 self.qePos = int(qePos) | 165 self.qePos = int(qePos) |
156 self.qLen = int(qLen) | 166 self.qLen = int(qLen) |
157 | 167 |
158 | 168 |
159 def calcQueryOverlap(s1,e1,s2,e2): | 169 def calcQueryOverlap(s1, e1, s2, e2): |
160 o = 1 + min(e1, e2) - max(s1, s2) | 170 o = 1 + min(e1, e2) - max(s1, s2) |
161 return max(0, o) | 171 return max(0, o) |
162 | 172 |
163 ############################################### | 173 ############################################### |
174 | |
164 | 175 |
165 class Usage(Exception): | 176 class Usage(Exception): |
166 def __init__(self, msg): | 177 def __init__(self, msg): |
167 self.msg = msg | 178 self.msg = msg |
168 | 179 |
180 | |
169 def main(): | 181 def main(): |
170 | |
171 usage = """%prog -i <file> | 182 usage = """%prog -i <file> |
172 | 183 |
173 extractSplitReads_BwaMem v0.1.0 | 184 extractSplitReads_BwaMem v0.1.0 |
174 Author: Ira Hall | 185 Author: Ira Hall |
175 Description: Get split-read alignments from bwa-mem in lumpy compatible format. Ignores reads marked as duplicates. | 186 Description: Get split-read alignments from bwa-mem in lumpy compatible |
187 format. Ignores reads marked as duplicates. | |
176 Works on read or position sorted SAM input. Tested on bwa mem v0.7.5a-r405. | 188 Works on read or position sorted SAM input. Tested on bwa mem v0.7.5a-r405. |
177 """ | 189 """ |
178 parser = OptionParser(usage) | 190 parser = OptionParser(usage) |
179 | 191 |
180 parser.add_option("-i", "--inFile", dest="inFile", | 192 parser.add_option("-i", "--inFile", dest="inFile", |
181 help="A SAM file or standard input (-i stdin).", | 193 help="A SAM file or standard input (-i stdin).", |
182 metavar="FILE") | 194 metavar="FILE") |
183 parser.add_option("-n", "--numSplits", dest="numSplits", default=2, type = "int", | 195 parser.add_option("-n", "--numSplits", dest="numSplits", default=2, |
184 help="The maximum number of split-read mappings to allow per read. Reads with more are excluded. Default=2", | 196 type="int", |
185 metavar="INT") | 197 help='''The maximum number of split-read mappings to |
186 parser.add_option("-d", "--includeDups", dest="includeDups", action="store_true",default=0, | 198 allow per read. Reads with more are excluded. |
187 help="Include alignments marked as duplicates. Default=False") | 199 Default=2''', metavar="INT") |
188 parser.add_option("-m", "--minNonOverlap", dest="minNonOverlap", default=20, type = "int", | 200 parser.add_option("-d", "--includeDups", dest="includeDups", |
189 help="minimum non-overlap between split alignments on the query (default=20)", | 201 action="store_true", default=0, |
190 metavar="INT") | 202 help='''Include alignments marked as duplicates. |
203 Default=False''') | |
204 parser.add_option("-m", "--minNonOverlap", dest="minNonOverlap", | |
205 default=20, type="int", help='''minimum non-overlap between | |
206 split alignments on the query (default=20)''', | |
207 metavar="INT") | |
191 (opts, args) = parser.parse_args() | 208 (opts, args) = parser.parse_args() |
192 if opts.inFile is None: | 209 if opts.inFile is None: |
193 parser.print_help() | 210 parser.print_help() |
194 print | 211 print |
195 else: | 212 else: |
196 try: | 213 try: |
197 extractSplitsFromBwaMem(opts.inFile, opts.numSplits, opts.includeDups, opts.minNonOverlap) | 214 extractSplitsFromBwaMem(opts.inFile, opts.numSplits, |
215 opts.includeDups, opts.minNonOverlap) | |
198 except IOError as err: | 216 except IOError as err: |
199 sys.stderr.write("IOError " + str(err) + "\n"); | 217 sys.stderr.write("IOError " + str(err) + "\n") |
200 return | 218 return |
219 | |
220 | |
201 if __name__ == "__main__": | 221 if __name__ == "__main__": |
202 sys.exit(main()) | 222 sys.exit(main()) |