Mercurial > repos > drosofff > lumpy
comparison extractSplitReads_BwaMem.py @ 0:8b3daa745d9b draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/lumpy commit c0bfc4b2215705e1b5fd1d4e60b1d72e5da13c92
author | drosofff |
---|---|
date | Tue, 06 Dec 2016 05:46:28 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:8b3daa745d9b |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 import sys | |
4 import getopt | |
5 import string | |
6 from optparse import OptionParser | |
7 import re | |
8 | |
9 def extractSplitsFromBwaMem(inFile,numSplits,includeDups,minNonOverlap): | |
10 if inFile == "stdin": | |
11 data = sys.stdin | |
12 else: | |
13 data = open(inFile, 'r') | |
14 for line in data: | |
15 split = 0 | |
16 if line[0] == '@': | |
17 print line.strip() | |
18 continue | |
19 samList = line.strip().split('\t') | |
20 sam = SAM(samList) | |
21 if includeDups==0 and (1024 & sam.flag)==1024: | |
22 continue | |
23 for el in sam.tags: | |
24 if "SA:" in el: | |
25 if(len(el.split(";")))<=numSplits: | |
26 split = 1 | |
27 mate = el.split(",") | |
28 mateCigar = mate[3] | |
29 mateFlag = int(0) | |
30 if mate[2]=="-": mateFlag = int(16) | |
31 if split: | |
32 read1 = sam.flag & 64 | |
33 if read1 == 64: tag = "_1" | |
34 else: tag="_2" | |
35 samList[0] = sam.query + tag | |
36 readCigar = sam.cigar | |
37 readCigarOps = extractCigarOps(readCigar,sam.flag) | |
38 readQueryPos = calcQueryPosFromCigar(readCigarOps) | |
39 mateCigarOps = extractCigarOps(mateCigar,mateFlag) | |
40 mateQueryPos = calcQueryPosFromCigar(mateCigarOps) | |
41 overlap = calcQueryOverlap(readQueryPos.qsPos,readQueryPos.qePos,mateQueryPos.qsPos,mateQueryPos.qePos) | |
42 nonOverlap1 = 1 + readQueryPos.qePos - readQueryPos.qsPos - overlap | |
43 nonOverlap2 = 1 + mateQueryPos.qePos - mateQueryPos.qsPos - overlap | |
44 mno = min(nonOverlap1, nonOverlap2) | |
45 if mno >= minNonOverlap: | |
46 print "\t".join(samList) | |
47 | |
48 #-------------------------------------------------------------------------------------------------- | |
49 # functions | |
50 #-------------------------------------------------------------------------------------------------- | |
51 | |
52 class SAM (object): | |
53 """ | |
54 __very__ basic class for SAM input. | |
55 """ | |
56 def __init__(self, samList = []): | |
57 if len(samList) > 0: | |
58 self.query = samList[0] | |
59 self.flag = int(samList[1]) | |
60 self.ref = samList[2] | |
61 self.pos = int(samList[3]) | |
62 self.mapq = int(samList[4]) | |
63 self.cigar = samList[5] | |
64 self.matRef = samList[6] | |
65 self.matePos = int(samList[7]) | |
66 self.iSize = int(samList[8]) | |
67 self.seq = samList[9] | |
68 self.qual = samList[10] | |
69 self.tags = samList[11:]#tags is a list of each tag:vtype:value sets | |
70 self.valid = 1 | |
71 else: | |
72 self.valid = 0 | |
73 self.query = 'null' | |
74 | |
75 def extractTagValue (self, tagID): | |
76 for tag in self.tags: | |
77 tagParts = tag.split(':', 2); | |
78 if (tagParts[0] == tagID): | |
79 if (tagParts[1] == 'i'): | |
80 return int(tagParts[2]); | |
81 elif (tagParts[1] == 'H'): | |
82 return int(tagParts[2],16); | |
83 return tagParts[2]; | |
84 return None; | |
85 | |
86 #----------------------------------------------- | |
87 cigarPattern = '([0-9]+[MIDNSHP])' | |
88 cigarSearch = re.compile(cigarPattern) | |
89 atomicCigarPattern = '([0-9]+)([MIDNSHP])' | |
90 atomicCigarSearch = re.compile(atomicCigarPattern) | |
91 | |
92 def extractCigarOps(cigar,flag): | |
93 if (cigar == "*"): | |
94 cigarOps = [] | |
95 elif (flag & 0x0010): | |
96 cigarOpStrings = cigarSearch.findall(cigar) | |
97 cigarOps = [] | |
98 for opString in cigarOpStrings: | |
99 cigarOpList = atomicCigarSearch.findall(opString) | |
100 # print cigarOpList | |
101 # "struct" for the op and it's length | |
102 cigar = cigarOp(cigarOpList[0][0], cigarOpList[0][1]) | |
103 # add to the list of cigarOps | |
104 cigarOps.append(cigar) | |
105 cigarOps = cigarOps | |
106 cigarOps.reverse() | |
107 ##do in reverse order because negative strand## | |
108 else: | |
109 cigarOpStrings = cigarSearch.findall(cigar) | |
110 cigarOps = [] | |
111 for opString in cigarOpStrings: | |
112 cigarOpList = atomicCigarSearch.findall(opString) | |
113 # "struct" for the op and it's length | |
114 cigar = cigarOp(cigarOpList[0][0], cigarOpList[0][1]) | |
115 # add to the list of cigarOps | |
116 cigarOps.append(cigar) | |
117 # cigarOps = cigarOps | |
118 return(cigarOps) | |
119 | |
120 def calcQueryPosFromCigar(cigarOps): | |
121 qsPos = 0 | |
122 qePos = 0 | |
123 qLen = 0 | |
124 # if first op is a H, need to shift start position | |
125 # the opPosition counter sees if the for loop is looking at the first index of the cigar object | |
126 opPosition = 0 | |
127 for cigar in cigarOps: | |
128 if opPosition == 0 and (cigar.op == 'H' or cigar.op == 'S'): | |
129 qsPos += cigar.length | |
130 qePos += cigar.length | |
131 qLen += cigar.length | |
132 elif opPosition > 0 and (cigar.op == 'H' or cigar.op == 'S'): | |
133 qLen += cigar.length | |
134 elif cigar.op == 'M' or cigar.op == 'I': | |
135 qePos += cigar.length | |
136 qLen += cigar.length | |
137 opPosition += 1 | |
138 d = queryPos(qsPos, qePos, qLen); | |
139 return d | |
140 | |
141 class cigarOp (object): | |
142 """ | |
143 sturct to store a discrete CIGAR operations | |
144 """ | |
145 def __init__(self, opLength, op): | |
146 self.length = int(opLength) | |
147 self.op = op | |
148 | |
149 class queryPos (object): | |
150 """ | |
151 struct to store the start and end positions of query CIGAR operations | |
152 """ | |
153 def __init__(self, qsPos, qePos, qLen): | |
154 self.qsPos = int(qsPos) | |
155 self.qePos = int(qePos) | |
156 self.qLen = int(qLen) | |
157 | |
158 | |
159 def calcQueryOverlap(s1,e1,s2,e2): | |
160 o = 1 + min(e1, e2) - max(s1, s2) | |
161 return max(0, o) | |
162 | |
163 ############################################### | |
164 | |
165 class Usage(Exception): | |
166 def __init__(self, msg): | |
167 self.msg = msg | |
168 | |
169 def main(): | |
170 | |
171 usage = """%prog -i <file> | |
172 | |
173 extractSplitReads_BwaMem v0.1.0 | |
174 Author: Ira Hall | |
175 Description: Get split-read alignments from bwa-mem in lumpy compatible format. Ignores reads marked as duplicates. | |
176 Works on read or position sorted SAM input. Tested on bwa mem v0.7.5a-r405. | |
177 """ | |
178 parser = OptionParser(usage) | |
179 | |
180 parser.add_option("-i", "--inFile", dest="inFile", | |
181 help="A SAM file or standard input (-i stdin).", | |
182 metavar="FILE") | |
183 parser.add_option("-n", "--numSplits", dest="numSplits", default=2, type = "int", | |
184 help="The maximum number of split-read mappings to allow per read. Reads with more are excluded. Default=2", | |
185 metavar="INT") | |
186 parser.add_option("-d", "--includeDups", dest="includeDups", action="store_true",default=0, | |
187 help="Include alignments marked as duplicates. Default=False") | |
188 parser.add_option("-m", "--minNonOverlap", dest="minNonOverlap", default=20, type = "int", | |
189 help="minimum non-overlap between split alignments on the query (default=20)", | |
190 metavar="INT") | |
191 (opts, args) = parser.parse_args() | |
192 if opts.inFile is None: | |
193 parser.print_help() | |
194 print | |
195 else: | |
196 try: | |
197 extractSplitsFromBwaMem(opts.inFile, opts.numSplits, opts.includeDups, opts.minNonOverlap) | |
198 except IOError as err: | |
199 sys.stderr.write("IOError " + str(err) + "\n"); | |
200 return | |
201 if __name__ == "__main__": | |
202 sys.exit(main()) |