comparison extractSplitReads_BwaMem.py @ 1:1ed8619a5611 draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/lumpy-sv commit 0b55a106b1f76e3cc3d89932fef2cc8d3eb24e4f
author artbio
date Wed, 26 Jul 2017 18:17:01 -0400
parents 796552c157de
children 48e97429a749
comparison
equal deleted inserted replaced
0:796552c157de 1:1ed8619a5611
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 2
3 import re
3 import sys 4 import sys
4 import getopt
5 import string
6 from optparse import OptionParser 5 from optparse import OptionParser
7 import re 6
8 7
9 def extractSplitsFromBwaMem(inFile,numSplits,includeDups,minNonOverlap): 8 def extractSplitsFromBwaMem(inFile, numSplits, includeDups, minNonOverlap):
10 if inFile == "stdin": 9 if inFile == "stdin":
11 data = sys.stdin 10 data = sys.stdin
12 else: 11 else:
13 data = open(inFile, 'r') 12 data = open(inFile, 'r')
14 for line in data: 13 for line in data:
15 split = 0 14 split = 0
16 if line[0] == '@': 15 if line[0] == '@':
17 print line.strip() 16 print(line.strip())
18 continue 17 continue
19 samList = line.strip().split('\t') 18 samList = line.strip().split('\t')
20 sam = SAM(samList) 19 sam = SAM(samList)
21 if includeDups==0 and (1024 & sam.flag)==1024: 20 if includeDups == 0 and (1024 & sam.flag) == 1024:
22 continue 21 continue
23 for el in sam.tags: 22 for el in sam.tags:
24 if "SA:" in el: 23 if "SA:" in el:
25 if(len(el.split(";")))<=numSplits: 24 if(len(el.split(";"))) <= numSplits:
26 split = 1 25 split = 1
27 mate = el.split(",") 26 mate = el.split(",")
28 mateCigar = mate[3] 27 mateCigar = mate[3]
29 mateFlag = int(0) 28 mateFlag = int(0)
30 if mate[2]=="-": mateFlag = int(16) 29 if mate[2] == "-":
30 mateFlag = int(16)
31 if split: 31 if split:
32 read1 = sam.flag & 64 32 read1 = sam.flag & 64
33 if read1 == 64: tag = "_1" 33 if read1 == 64:
34 else: tag="_2" 34 tag = "_1"
35 else:
36 tag = "_2"
35 samList[0] = sam.query + tag 37 samList[0] = sam.query + tag
36 readCigar = sam.cigar 38 readCigar = sam.cigar
37 readCigarOps = extractCigarOps(readCigar,sam.flag) 39 readCigarOps = extractCigarOps(readCigar, sam.flag)
38 readQueryPos = calcQueryPosFromCigar(readCigarOps) 40 readQueryPos = calcQueryPosFromCigar(readCigarOps)
39 mateCigarOps = extractCigarOps(mateCigar,mateFlag) 41 mateCigarOps = extractCigarOps(mateCigar, mateFlag)
40 mateQueryPos = calcQueryPosFromCigar(mateCigarOps) 42 mateQueryPos = calcQueryPosFromCigar(mateCigarOps)
41 overlap = calcQueryOverlap(readQueryPos.qsPos,readQueryPos.qePos,mateQueryPos.qsPos,mateQueryPos.qePos) 43 overlap = calcQueryOverlap(readQueryPos.qsPos, readQueryPos.qePos,
44 mateQueryPos.qsPos, mateQueryPos.qePos)
42 nonOverlap1 = 1 + readQueryPos.qePos - readQueryPos.qsPos - overlap 45 nonOverlap1 = 1 + readQueryPos.qePos - readQueryPos.qsPos - overlap
43 nonOverlap2 = 1 + mateQueryPos.qePos - mateQueryPos.qsPos - overlap 46 nonOverlap2 = 1 + mateQueryPos.qePos - mateQueryPos.qsPos - overlap
44 mno = min(nonOverlap1, nonOverlap2) 47 mno = min(nonOverlap1, nonOverlap2)
45 if mno >= minNonOverlap: 48 if mno >= minNonOverlap:
46 print "\t".join(samList) 49 print("\t".join(samList))
47 50
48 #-------------------------------------------------------------------------------------------------- 51 # -----------------------------------------------------------------------
49 # functions 52 # functions
50 #-------------------------------------------------------------------------------------------------- 53 # -----------------------------------------------------------------------
54
51 55
52 class SAM (object): 56 class SAM (object):
53 """ 57 """
54 __very__ basic class for SAM input. 58 __very__ basic class for SAM input.
55 """ 59 """
56 def __init__(self, samList = []): 60 def __init__(self, samList=[]):
57 if len(samList) > 0: 61 if len(samList) > 0:
58 self.query = samList[0] 62 self.query = samList[0]
59 self.flag = int(samList[1]) 63 self.flag = int(samList[1])
60 self.ref = samList[2] 64 self.ref = samList[2]
61 self.pos = int(samList[3]) 65 self.pos = int(samList[3])
62 self.mapq = int(samList[4]) 66 self.mapq = int(samList[4])
63 self.cigar = samList[5] 67 self.cigar = samList[5]
64 self.matRef = samList[6] 68 self.matRef = samList[6]
65 self.matePos = int(samList[7]) 69 self.matePos = int(samList[7])
66 self.iSize = int(samList[8]) 70 self.iSize = int(samList[8])
67 self.seq = samList[9] 71 self.seq = samList[9]
68 self.qual = samList[10] 72 self.qual = samList[10]
69 self.tags = samList[11:]#tags is a list of each tag:vtype:value sets 73 self.tags = samList[11:]
70 self.valid = 1 74 # tags is a list of each tag:vtype:value sets
75 self.valid = 1
71 else: 76 else:
72 self.valid = 0 77 self.valid = 0
73 self.query = 'null' 78 self.query = 'null'
74 79
75 def extractTagValue (self, tagID): 80 def extractTagValue(self, tagID):
76 for tag in self.tags: 81 for tag in self.tags:
77 tagParts = tag.split(':', 2); 82 tagParts = tag.split(':', 2)
78 if (tagParts[0] == tagID): 83 if (tagParts[0] == tagID):
79 if (tagParts[1] == 'i'): 84 if (tagParts[1] == 'i'):
80 return int(tagParts[2]); 85 return int(tagParts[2])
81 elif (tagParts[1] == 'H'): 86 elif (tagParts[1] == 'H'):
82 return int(tagParts[2],16); 87 return int(tagParts[2], 16)
83 return tagParts[2]; 88 return tagParts[2]
84 return None; 89 return None
85 90
86 #----------------------------------------------- 91
87 cigarPattern = '([0-9]+[MIDNSHP])' 92 cigarPattern = '([0-9]+[MIDNSHP])'
88 cigarSearch = re.compile(cigarPattern) 93 cigarSearch = re.compile(cigarPattern)
89 atomicCigarPattern = '([0-9]+)([MIDNSHP])' 94 atomicCigarPattern = '([0-9]+)([MIDNSHP])'
90 atomicCigarSearch = re.compile(atomicCigarPattern) 95 atomicCigarSearch = re.compile(atomicCigarPattern)
91 96
92 def extractCigarOps(cigar,flag): 97
98 def extractCigarOps(cigar, flag):
93 if (cigar == "*"): 99 if (cigar == "*"):
94 cigarOps = [] 100 cigarOps = []
95 elif (flag & 0x0010): 101 elif (flag & 0x0010):
96 cigarOpStrings = cigarSearch.findall(cigar) 102 cigarOpStrings = cigarSearch.findall(cigar)
97 cigarOps = [] 103 cigarOps = []
102 cigar = cigarOp(cigarOpList[0][0], cigarOpList[0][1]) 108 cigar = cigarOp(cigarOpList[0][0], cigarOpList[0][1])
103 # add to the list of cigarOps 109 # add to the list of cigarOps
104 cigarOps.append(cigar) 110 cigarOps.append(cigar)
105 cigarOps = cigarOps 111 cigarOps = cigarOps
106 cigarOps.reverse() 112 cigarOps.reverse()
107 ##do in reverse order because negative strand## 113 # do in reverse order because negative strand
108 else: 114 else:
109 cigarOpStrings = cigarSearch.findall(cigar) 115 cigarOpStrings = cigarSearch.findall(cigar)
110 cigarOps = [] 116 cigarOps = []
111 for opString in cigarOpStrings: 117 for opString in cigarOpStrings:
112 cigarOpList = atomicCigarSearch.findall(opString) 118 cigarOpList = atomicCigarSearch.findall(opString)
115 # add to the list of cigarOps 121 # add to the list of cigarOps
116 cigarOps.append(cigar) 122 cigarOps.append(cigar)
117 # cigarOps = cigarOps 123 # cigarOps = cigarOps
118 return(cigarOps) 124 return(cigarOps)
119 125
126
120 def calcQueryPosFromCigar(cigarOps): 127 def calcQueryPosFromCigar(cigarOps):
121 qsPos = 0 128 qsPos = 0
122 qePos = 0 129 qePos = 0
123 qLen = 0 130 qLen = 0
124 # if first op is a H, need to shift start position 131 # if first op is a H, need to shift start position
125 # the opPosition counter sees if the for loop is looking at the first index of the cigar object 132 # the opPosition counter sees if the for loop is looking
133 # at the first index of the cigar object
126 opPosition = 0 134 opPosition = 0
127 for cigar in cigarOps: 135 for cigar in cigarOps:
128 if opPosition == 0 and (cigar.op == 'H' or cigar.op == 'S'): 136 if opPosition == 0 and (cigar.op == 'H' or cigar.op == 'S'):
129 qsPos += cigar.length 137 qsPos += cigar.length
130 qePos += cigar.length 138 qePos += cigar.length
131 qLen += cigar.length 139 qLen += cigar.length
132 elif opPosition > 0 and (cigar.op == 'H' or cigar.op == 'S'): 140 elif opPosition > 0 and (cigar.op == 'H' or cigar.op == 'S'):
133 qLen += cigar.length 141 qLen += cigar.length
134 elif cigar.op == 'M' or cigar.op == 'I': 142 elif cigar.op == 'M' or cigar.op == 'I':
135 qePos += cigar.length 143 qePos += cigar.length
136 qLen += cigar.length 144 qLen += cigar.length
137 opPosition += 1 145 opPosition += 1
138 d = queryPos(qsPos, qePos, qLen); 146 d = queryPos(qsPos, qePos, qLen)
139 return d 147 return d
148
140 149
141 class cigarOp (object): 150 class cigarOp (object):
142 """ 151 """
143 sturct to store a discrete CIGAR operations 152 sturct to store a discrete CIGAR operations
144 """ 153 """
145 def __init__(self, opLength, op): 154 def __init__(self, opLength, op):
146 self.length = int(opLength) 155 self.length = int(opLength)
147 self.op = op 156 self.op = op
157
148 158
149 class queryPos (object): 159 class queryPos (object):
150 """ 160 """
151 struct to store the start and end positions of query CIGAR operations 161 struct to store the start and end positions of query CIGAR operations
152 """ 162 """
153 def __init__(self, qsPos, qePos, qLen): 163 def __init__(self, qsPos, qePos, qLen):
154 self.qsPos = int(qsPos) 164 self.qsPos = int(qsPos)
155 self.qePos = int(qePos) 165 self.qePos = int(qePos)
156 self.qLen = int(qLen) 166 self.qLen = int(qLen)
157 167
158 168
159 def calcQueryOverlap(s1,e1,s2,e2): 169 def calcQueryOverlap(s1, e1, s2, e2):
160 o = 1 + min(e1, e2) - max(s1, s2) 170 o = 1 + min(e1, e2) - max(s1, s2)
161 return max(0, o) 171 return max(0, o)
162 172
163 ############################################### 173 ###############################################
174
164 175
165 class Usage(Exception): 176 class Usage(Exception):
166 def __init__(self, msg): 177 def __init__(self, msg):
167 self.msg = msg 178 self.msg = msg
168 179
180
169 def main(): 181 def main():
170
171 usage = """%prog -i <file> 182 usage = """%prog -i <file>
172 183
173 extractSplitReads_BwaMem v0.1.0 184 extractSplitReads_BwaMem v0.1.0
174 Author: Ira Hall 185 Author: Ira Hall
175 Description: Get split-read alignments from bwa-mem in lumpy compatible format. Ignores reads marked as duplicates. 186 Description: Get split-read alignments from bwa-mem in lumpy compatible
187 format. Ignores reads marked as duplicates.
176 Works on read or position sorted SAM input. Tested on bwa mem v0.7.5a-r405. 188 Works on read or position sorted SAM input. Tested on bwa mem v0.7.5a-r405.
177 """ 189 """
178 parser = OptionParser(usage) 190 parser = OptionParser(usage)
179 191
180 parser.add_option("-i", "--inFile", dest="inFile", 192 parser.add_option("-i", "--inFile", dest="inFile",
181 help="A SAM file or standard input (-i stdin).", 193 help="A SAM file or standard input (-i stdin).",
182 metavar="FILE") 194 metavar="FILE")
183 parser.add_option("-n", "--numSplits", dest="numSplits", default=2, type = "int", 195 parser.add_option("-n", "--numSplits", dest="numSplits", default=2,
184 help="The maximum number of split-read mappings to allow per read. Reads with more are excluded. Default=2", 196 type="int",
185 metavar="INT") 197 help='''The maximum number of split-read mappings to
186 parser.add_option("-d", "--includeDups", dest="includeDups", action="store_true",default=0, 198 allow per read. Reads with more are excluded.
187 help="Include alignments marked as duplicates. Default=False") 199 Default=2''', metavar="INT")
188 parser.add_option("-m", "--minNonOverlap", dest="minNonOverlap", default=20, type = "int", 200 parser.add_option("-d", "--includeDups", dest="includeDups",
189 help="minimum non-overlap between split alignments on the query (default=20)", 201 action="store_true", default=0,
190 metavar="INT") 202 help='''Include alignments marked as duplicates.
203 Default=False''')
204 parser.add_option("-m", "--minNonOverlap", dest="minNonOverlap",
205 default=20, type="int", help='''minimum non-overlap between
206 split alignments on the query (default=20)''',
207 metavar="INT")
191 (opts, args) = parser.parse_args() 208 (opts, args) = parser.parse_args()
192 if opts.inFile is None: 209 if opts.inFile is None:
193 parser.print_help() 210 parser.print_help()
194 print 211 print
195 else: 212 else:
196 try: 213 try:
197 extractSplitsFromBwaMem(opts.inFile, opts.numSplits, opts.includeDups, opts.minNonOverlap) 214 extractSplitsFromBwaMem(opts.inFile, opts.numSplits,
215 opts.includeDups, opts.minNonOverlap)
198 except IOError as err: 216 except IOError as err:
199 sys.stderr.write("IOError " + str(err) + "\n"); 217 sys.stderr.write("IOError " + str(err) + "\n")
200 return 218 return
219
220
201 if __name__ == "__main__": 221 if __name__ == "__main__":
202 sys.exit(main()) 222 sys.exit(main())