comparison extractSplitReads_BwaMem.py @ 0:8b3daa745d9b draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/lumpy commit c0bfc4b2215705e1b5fd1d4e60b1d72e5da13c92
author drosofff
date Tue, 06 Dec 2016 05:46:28 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:8b3daa745d9b
1 #!/usr/bin/env python
2
3 import sys
4 import getopt
5 import string
6 from optparse import OptionParser
7 import re
8
def extractSplitsFromBwaMem(inFile,numSplits,includeDups,minNonOverlap):
    """
    Stream a SAM file and print the split-read alignments found in it.

    Header lines (starting with '@') are echoed unchanged.  An alignment is
    printed when it carries an ``SA`` tag with at most ``numSplits``
    ';'-separated pieces and the smaller of the two non-overlapping query
    spans (this alignment vs. the first alignment listed in the SA tag) is at
    least ``minNonOverlap``.  Printed records get "_1" or "_2" appended to the
    query name depending on whether flag bit 64 is set.

    Parameters:
        inFile        -- path of the SAM file, or the literal string "stdin".
        numSplits     -- maximum number of ';'-separated pieces allowed in
                         the SA tag.
        includeDups   -- when equal to 0, records with flag bit 1024 set are
                         skipped.
        minNonOverlap -- minimum non-overlap required to print a record.

    Returns None; all output goes to standard output.  An IOError raised
    while opening or reading the input propagates to the caller.
    """
    if inFile == "stdin":
        data = sys.stdin
    else:
        data = open(inFile, 'r')
    try:
        for line in data:
            # Blank lines cannot be parsed as SAM records; skip them instead
            # of failing on the missing columns.
            if not line.strip():
                continue
            if line[0] == '@':
                sys.stdout.write(line.strip() + "\n")
                continue
            samList = line.strip().split('\t')
            sam = SAM(samList)
            if includeDups==0 and (1024 & sam.flag)==1024:
                continue
            split = 0
            for el in sam.tags:
                # Match on the tag key only, so that an unrelated tag whose
                # value merely contains "SA:" is not mistaken for the SA tag.
                # NOTE(review): a trailing ';' in the SA value yields an extra
                # empty piece, so this count may be one more than the number
                # of listed alignments -- confirm this is intended.
                if el.startswith("SA:") and len(el.split(";")) <= numSplits:
                    split = 1
                    # Comma-separated SA fields: index 2 is the strand and
                    # index 3 the CIGAR of the first listed alignment.
                    mate = el.split(",")
                    mateCigar = mate[3]
                    mateFlag = 16 if mate[2] == "-" else 0
            if not split:
                continue
            # Flag bit 64 distinguishes the "_1" suffix from "_2".
            if sam.flag & 64:
                tag = "_1"
            else:
                tag = "_2"
            samList[0] = sam.query + tag
            readCigarOps = extractCigarOps(sam.cigar,sam.flag)
            readQueryPos = calcQueryPosFromCigar(readCigarOps)
            mateCigarOps = extractCigarOps(mateCigar,mateFlag)
            mateQueryPos = calcQueryPosFromCigar(mateCigarOps)
            overlap = calcQueryOverlap(readQueryPos.qsPos,readQueryPos.qePos,mateQueryPos.qsPos,mateQueryPos.qePos)
            nonOverlap1 = 1 + readQueryPos.qePos - readQueryPos.qsPos - overlap
            nonOverlap2 = 1 + mateQueryPos.qePos - mateQueryPos.qsPos - overlap
            mno = min(nonOverlap1, nonOverlap2)
            if mno >= minNonOverlap:
                sys.stdout.write("\t".join(samList) + "\n")
    finally:
        # Release the file handle we opened; never close the caller's stdin.
        if data is not sys.stdin:
            data.close()
47
48 #--------------------------------------------------------------------------------------------------
49 # functions
50 #--------------------------------------------------------------------------------------------------
51
class SAM (object):
    """
    __very__ basic class for SAM input.

    Built from one alignment line already split into its columns.  When the
    list is empty (or omitted) the record is marked invalid: ``valid`` is 0,
    ``query`` is 'null' and ``tags`` is an empty list.
    """
    def __init__(self, samList=None):
        """
        Parameters:
            samList -- list of the SAM columns as strings; columns 0-10 are
                       the fixed fields, everything after is kept as tags.
        """
        if samList:
            self.query = samList[0]
            self.flag = int(samList[1])
            self.ref = samList[2]
            self.pos = int(samList[3])
            self.mapq = int(samList[4])
            self.cigar = samList[5]
            self.matRef = samList[6]
            self.matePos = int(samList[7])
            self.iSize = int(samList[8])
            self.seq = samList[9]
            self.qual = samList[10]
            self.tags = samList[11:]#tags is a list of each tag:vtype:value sets
            self.valid = 1
        else:
            self.valid = 0
            self.query = 'null'
            # Keep extractTagValue usable on an invalid record.
            self.tags = []

    def extractTagValue (self, tagID):
        """
        Return the value of the first tag whose key equals ``tagID``.

        Values of type 'i' are returned as int, values of type 'H' are parsed
        as a base-16 int, anything else is returned as the raw string.
        Returns None when no such tag exists.
        """
        for tag in self.tags:
            # Split at most twice so a value containing ':' stays intact.
            tagParts = tag.split(':', 2)
            if len(tagParts) < 3:
                # Not of the form key:type:value; cannot be the wanted tag.
                continue
            if tagParts[0] != tagID:
                continue
            if tagParts[1] == 'i':
                return int(tagParts[2])
            if tagParts[1] == 'H':
                return int(tagParts[2], 16)
            return tagParts[2]
        return None
85
86 #-----------------------------------------------
# Regular expressions for CIGAR strings.  The first pattern captures one
# whole operation (count followed by the operation letter); the second
# captures the count and the letter as two separate groups.
cigarPattern = r'([0-9]+[MIDNSHP])'
atomicCigarPattern = r'([0-9]+)([MIDNSHP])'
cigarSearch = re.compile(cigarPattern)
atomicCigarSearch = re.compile(atomicCigarPattern)
91
def extractCigarOps(cigar,flag):
    """
    Parse a CIGAR string into a list of cigarOp objects.

    Parameters:
        cigar -- CIGAR string; "*" means no CIGAR is available.
        flag  -- SAM flag; only bit 0x0010 is inspected.

    Returns the operations in string order, or in reversed order when flag
    bit 0x0010 is set.  Returns an empty list for "*".
    """
    if cigar == "*":
        return []
    # A single pass yields (count, operation) pairs for every operation.
    cigarOps = [cigarOp(opLength, op)
                for opLength, op in atomicCigarSearch.findall(cigar)]
    if flag & 0x0010:
        ##do in reverse order because negative strand##
        cigarOps.reverse()
    return cigarOps
119
def calcQueryPosFromCigar(cigarOps):
    """
    Derive the query start, query end and total length from CIGAR operations.

    Parameters:
        cigarOps -- list of objects exposing ``op`` and ``length``.

    Returns a queryPos where:
        qsPos -- length of a clip (H or S) in the first position, else 0;
        qePos -- qsPos plus the lengths of all M and I operations;
        qLen  -- summed lengths of all H, S, M and I operations.
    Other operations are ignored.
    """
    start = 0
    end = 0
    total = 0
    for index, step in enumerate(cigarOps):
        span = step.length
        if step.op in ('H', 'S'):
            # Clipping always counts toward the length, but only a clip in
            # the very first position shifts the start (and end) position.
            total += span
            if index == 0:
                start += span
                end += span
        elif step.op in ('M', 'I'):
            end += span
            total += span
    return queryPos(start, end, total)
140
class cigarOp (object):
    """
    Struct to store a single CIGAR operation: its letter and its length.
    """
    def __init__(self, opLength, op):
        # The length is converted with int(), so a numeric string is accepted.
        self.op = op
        self.length = int(opLength)
148
class queryPos (object):
    """
    Struct holding the query start position, query end position and query
    length computed from a list of CIGAR operations.
    """
    def __init__(self, qsPos, qePos, qLen):
        # All three values are converted with int() before being stored.
        self.qsPos, self.qePos, self.qLen = int(qsPos), int(qePos), int(qLen)
157
158
def calcQueryOverlap(s1,e1,s2,e2):
    """
    Return the overlap between the intervals (s1, e1) and (s2, e2).

    The overlap is 1 + (smaller end) - (larger start), clamped so that it is
    never negative.
    """
    sharedStart = s1 if s1 > s2 else s2
    sharedEnd = e1 if e1 < e2 else e2
    span = 1 + sharedEnd - sharedStart
    if span > 0:
        return span
    return 0
162
163 ###############################################
164
class Usage(Exception):
    """
    Exception that carries a message in its ``msg`` attribute.
    """
    def __init__(self, msg):
        # Only the message is recorded on the instance.
        self.msg = msg
168
def main():
    """
    Command-line entry point.

    Parses the options -i/--inFile, -n/--numSplits, -d/--includeDups and
    -m/--minNonOverlap.  Without an input file the help text is printed,
    followed by a blank line.  Otherwise extractSplitsFromBwaMem is run; an
    IOError raised by it is reported on standard error.

    Always returns None.
    """

    usage = """%prog -i <file>

extractSplitReads_BwaMem v0.1.0
Author: Ira Hall
Description: Get split-read alignments from bwa-mem in lumpy compatible format. Ignores reads marked as duplicates.
Works on read or position sorted SAM input. Tested on bwa mem v0.7.5a-r405.
    """
    parser = OptionParser(usage)

    parser.add_option("-i", "--inFile", dest="inFile",
                      help="A SAM file or standard input (-i stdin).",
                      metavar="FILE")
    parser.add_option("-n", "--numSplits", dest="numSplits", default=2, type = "int",
                      help="The maximum number of split-read mappings to allow per read. Reads with more are excluded. Default=2",
                      metavar="INT")
    parser.add_option("-d", "--includeDups", dest="includeDups", action="store_true",default=0,
                      help="Include alignments marked as duplicates. Default=False")
    parser.add_option("-m", "--minNonOverlap", dest="minNonOverlap", default=20, type = "int",
                      help="minimum non-overlap between split alignments on the query (default=20)",
                      metavar="INT")
    (opts, args) = parser.parse_args()
    if opts.inFile is None:
        parser.print_help()
        # Emit the blank line explicitly: a bare `print` is a statement only
        # under Python 2 and does nothing under Python 3.
        sys.stdout.write("\n")
        return
    try:
        extractSplitsFromBwaMem(opts.inFile, opts.numSplits, opts.includeDups, opts.minNonOverlap)
    except IOError as err:
        # Report the failure on stderr; the return value stays None.
        sys.stderr.write("IOError " + str(err) + "\n")
    return
if __name__ == "__main__":
    # Run the command-line entry point and hand its return value to the
    # interpreter as the process exit status.
    exitStatus = main()
    sys.exit(exitStatus)