Mercurial > repos > yufei-luo > s_mart
comparison commons/tools/RmvPairAlignInChunkOverlaps.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
17:b0e8584489e6 | 18:94ab73e8a190 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 """ | |
4 Remove hits due to chunk overlaps. | |
5 """ | |
6 | |
7 import os | |
8 import sys | |
9 import getopt | |
10 import exceptions | |
11 import copy | |
12 from commons.core.coord.Align import * | |
13 | |
14 | |
15 class RmvPairAlignInChunkOverlaps( object ): | |
16 """ | |
17 Remove hits due to chunk overlaps. | |
18 """ | |
19 | |
20 | |
21 def __init__( self, inFileName="", chunkLength=200000, chunkOverlap=10000, margin=10, outFileName="", verbose=0 ): | |
22 """ | |
23 Constructor. | |
24 """ | |
25 self._inFileName = inFileName | |
26 self._chunkLength = chunkLength | |
27 self._chunkOverlap = chunkOverlap | |
28 self._margin = margin | |
29 self._outFileName = outFileName | |
30 self._verbose = verbose | |
31 | |
32 def help( self ): | |
33 """ | |
34 Display the help. | |
35 """ | |
36 print | |
37 print "usage: %s [ options ]" % ( sys.argv[0] ) | |
38 print "options:" | |
39 print " -h: this help" | |
40 print " -i: name of the input file (format='align')" | |
41 print " -l: chunk length (in bp)" | |
42 print " -o: chunk overlap (in bp)" | |
43 print " -m: margin to remove match included into a chunk overlap (default=10)" | |
44 print " -O: name of the output file (default=inFileName+'.not_over')" | |
45 print " -v: verbose (default=0/1)" | |
46 print | |
47 | |
48 def setAttributesFromCmdLine( self ): | |
49 """ | |
50 Set attributes from the command-line arguments. | |
51 """ | |
52 try: | |
53 opts, args = getopt.getopt(sys.argv[1:],"h:i:l:o:m:O:v:") | |
54 except getopt.GetoptError, err: | |
55 print str(err); self.help(); sys.exit(1) | |
56 for o,a in opts: | |
57 if o == "-h": | |
58 self.help(); sys.exit(0) | |
59 elif o == "-i": | |
60 self.setInputFileName( a ) | |
61 elif o == "-l": | |
62 self.setChunkLength( a ) | |
63 elif o == "-o": | |
64 self.setChunkOverlap( a ) | |
65 elif o == "-m": | |
66 self.setMargin( a ) | |
67 elif o == "-O": | |
68 self.setOutputFileName( a ) | |
69 elif o == "-v": | |
70 self.setVerbosityLevel( a ) | |
71 | |
72 def setInputFileName( self, inFileName ): | |
73 self._inFileName = inFileName | |
74 | |
75 def setChunkLength( self, chunkLength ): | |
76 self._chunkLength = int(chunkLength) | |
77 | |
78 def setChunkOverlap( self, chunkOverlap ): | |
79 self._chunkOverlap = int(chunkOverlap) | |
80 | |
81 def setMargin( self, margin ): | |
82 self._margin = int(margin) | |
83 | |
84 def setOutputFileName( self, outFileName ): | |
85 self._outFileName = outFileName | |
86 | |
87 def setVerbosityLevel( self, verbose ): | |
88 self._verbose = int(verbose) | |
89 | |
90 def checkAttributes( self ): | |
91 """ | |
92 Before running, check the required attributes are properly filled. | |
93 """ | |
94 if self._inFileName == "": | |
95 print "ERROR: missing input file"; self.help(); sys.exit(1) | |
96 if not os.path.exists(self._inFileName ): | |
97 print "ERROR: input file '%s' doesn't exist" %( self._inFileName ) | |
98 if self._outFileName == "": | |
99 self._outFileName = "%s.not_over" % ( self._inFileName ) | |
100 | |
101 | |
102 def isPairAlignAChunkOverlap( self, a, chunkQuery, chunkSubject ): | |
103 """ | |
104 Return True if the pairwise alignment exactly corresponds to a 2-chunk overlap, False otherwise. | |
105 Take into account cases specific to BLASTER or PALS. | |
106 """ | |
107 | |
108 if a.range_query.isOnDirectStrand() != a.range_subject.isOnDirectStrand(): | |
109 if self._verbose > 1: print "on different strand" | |
110 return False | |
111 | |
112 if chunkQuery == chunkSubject + 1: | |
113 if self._verbose > 1: print "query > subject" | |
114 if a.range_query.start == 1 and a.range_subject.end == self._chunkLength \ | |
115 and ( a.range_query.getLength() == self._chunkOverlap \ | |
116 or a.range_query.getLength() == self._chunkOverlap + 1 ) \ | |
117 and ( a.range_subject.getLength() == self._chunkOverlap \ | |
118 or a.range_subject.getLength() == self._chunkOverlap + 1 ): | |
119 if self._verbose > 1: print "chunk overlap" | |
120 return True | |
121 | |
122 elif chunkQuery == chunkSubject - 1: | |
123 if self._verbose > 1: print "query < subject" | |
124 if a.range_query.end == self._chunkLength and a.range_subject.start == 1 \ | |
125 and ( a.range_query.getLength() == self._chunkOverlap \ | |
126 or a.range_query.getLength() == self._chunkOverlap + 1 ) \ | |
127 and ( a.range_subject.getLength() == self._chunkOverlap \ | |
128 or a.range_subject.getLength() == self._chunkOverlap + 1 ): | |
129 if self._verbose > 1: print "chunk overlap" | |
130 return True | |
131 | |
132 if self._verbose > 1: print "not a chunk overlap" | |
133 return False | |
134 | |
135 | |
136 def isInInterval( self, coord, midpoint ): | |
137 """ | |
138 Check if a given coordinate is inside the interval [midpoint-margin;midpoint+margin]. | |
139 """ | |
140 if coord >= midpoint - self._margin and coord <= midpoint + self._margin: | |
141 return True | |
142 return False | |
143 | |
144 | |
145 def isPairAlignWithinAndDueToAChunkOverlap( self, a, chunkQuery, chunkSubject ): | |
146 """ | |
147 Return True if the pairwise alignment lies within an overlap between two contiguous chunks and is due to it, False otherwise. | |
148 """ | |
149 uniqLength = self._chunkLength - self._chunkOverlap | |
150 | |
151 if a.range_query.isOnDirectStrand() != a.range_subject.isOnDirectStrand(): | |
152 if self._verbose > 1: print "on different strand" | |
153 return False | |
154 | |
155 if chunkQuery == chunkSubject + 1: | |
156 if self._verbose > 1: print "query > subject" | |
157 if a.range_query.getMin() >= 1 and a.range_query.getMax() <= self._chunkOverlap \ | |
158 and a.range_subject.getMin() >= self._chunkLength - self._chunkOverlap + 1 \ | |
159 and a.range_subject.getMax() <= self._chunkLength: | |
160 if self._verbose > 1: print "included" | |
161 if self.isInInterval( a.range_query.getMin(), a.range_subject.getMin() - uniqLength ) \ | |
162 and self.isInInterval( self._chunkOverlap - a.range_query.getMax(), self._chunkLength - a.range_subject.getMax() ): | |
163 if self._verbose > 1: print "due to overlap" | |
164 return True | |
165 else: | |
166 if self._verbose > 1: print "not due to overlap" | |
167 return False | |
168 | |
169 elif chunkQuery == chunkSubject - 1: | |
170 if self._verbose > 1: print "query < subject" | |
171 if a.range_query.getMin() >= self._chunkLength - self._chunkOverlap + 1 \ | |
172 and a.range_query.getMax() <= self._chunkLength \ | |
173 and a.range_subject.getMin() >= 1 \ | |
174 and a.range_subject.getMax() <= self._chunkOverlap: | |
175 if self._verbose > 1: print "included" | |
176 if self.isInInterval( a.range_subject.getMin(), a.range_query.getMin() - uniqLength ) \ | |
177 and self.isInInterval( self._chunkOverlap - a.range_subject.getMax(), self._chunkLength - a.range_query.getMax() ): | |
178 if self._verbose > 1: print "due to overlap" | |
179 return True | |
180 else: | |
181 if self._verbose > 1: print "not due to overlap" | |
182 return False | |
183 | |
184 else: | |
185 if self._verbose > 1: print "not contiguous chunks" | |
186 return False | |
187 | |
188 if self._verbose > 1: print "not included" | |
189 return False | |
190 | |
191 | |
192 def removeChunkOverlaps( self ): | |
193 """ | |
194 Remove pairwise alignments exactly corresponding to chunk overlaps or those included within such overlaps. | |
195 """ | |
196 totalNbPairAlign = 0 | |
197 nbChunkOverlaps = 0 | |
198 d = {} | |
199 nbPairAlignWithinChunkOverlaps = 0 | |
200 | |
201 inF = open( self._inFileName, "r" ) | |
202 outF = open( self._outFileName, "w" ) | |
203 alignInstance = Align() | |
204 | |
205 while True: | |
206 if not alignInstance.read( inF ): break | |
207 totalNbPairAlign += 1 | |
208 if self._verbose > 1: alignInstance.show() | |
209 | |
210 if "chunk" not in alignInstance.range_query.seqname or "chunk" not in alignInstance.range_subject.seqname: | |
211 print "WARNING: no 'chunk' in query or subject name"; return False | |
212 | |
213 chunkQuery = int(alignInstance.range_query.seqname.replace("chunk","")) | |
214 chunkSubject = int(alignInstance.range_subject.seqname.replace("chunk","")) | |
215 | |
216 if abs( chunkSubject - chunkQuery ) > 1: | |
217 if self._verbose > 1: print "non contiguous chunks -> keep" | |
218 alignInstance.write( outF ) | |
219 continue | |
220 | |
221 if alignInstance.range_query.isOnDirectStrand() != alignInstance.range_subject.isOnDirectStrand(): | |
222 if self._verbose > 1: print "on different strand" | |
223 alignInstance.write( outF ) | |
224 continue | |
225 | |
226 if abs( chunkSubject - chunkQuery ) == 0: | |
227 if alignInstance.range_query.start == 1 \ | |
228 and alignInstance.range_query.end == self._chunkLength \ | |
229 and alignInstance.range_subject.start == 1 \ | |
230 and alignInstance.range_subject.end == self._chunkLength: | |
231 if self._verbose > 1: print "self-alignment on whole chunk -> remove" | |
232 continue | |
233 | |
234 if self.isPairAlignAChunkOverlap( alignInstance, chunkQuery, chunkSubject ): | |
235 if self._verbose > 1: print "chunk overlap -> remove" | |
236 nbChunkOverlaps += 1 | |
237 | |
238 elif self.isPairAlignWithinAndDueToAChunkOverlap( alignInstance, chunkQuery, chunkSubject ): | |
239 if self._verbose > 1: print "within chunk overlap -> remove" | |
240 nbPairAlignWithinChunkOverlaps += 1 | |
241 | |
242 else: | |
243 if self._verbose > 1: print "keep" | |
244 alignInstance.write( outF ) | |
245 | |
246 inF.close() | |
247 if self._verbose > 0: print "nb of pairwise alignments in input file: %i" % ( totalNbPairAlign ) | |
248 if self._verbose > 0: print "nb of chunk overlaps: %i" % ( nbChunkOverlaps ) | |
249 if self._verbose > 0: print "nb of pairwise alignments within chunk overlaps: %i" % ( nbPairAlignWithinChunkOverlaps ) | |
250 | |
251 for names,lAligns in d.items(): | |
252 for alignInstance in lAligns: | |
253 alignInstance.write( outF ) | |
254 outF.close() | |
255 | |
256 | |
257 def start( self ): | |
258 """ | |
259 Useful commands before running the program. | |
260 """ | |
261 if self._verbose > 0: | |
262 print "START %s" % ( type(self).__name__ ); sys.stdout.flush() | |
263 self.checkAttributes() | |
264 | |
265 | |
266 def end( self ): | |
267 """ | |
268 Useful commands before ending the program. | |
269 """ | |
270 if self._verbose > 0: | |
271 print "END %s" % ( type(self).__name__ ); sys.stdout.flush() | |
272 | |
273 | |
274 def run( self ): | |
275 """ | |
276 Run the program. | |
277 """ | |
278 self.start() | |
279 self.removeChunkOverlaps() | |
280 self.end() | |
281 | |
282 | |
283 if __name__ == '__main__': | |
284 i = RmvPairAlignInChunkOverlaps() | |
285 i.setAttributesFromCmdLine() | |
286 i.run() |