18
|
1 #!/usr/bin/env python
|
|
2
|
|
3 """
|
|
4 Remove hits due to chunk overlaps.
|
|
5 """
|
|
6
|
|
7 import os
|
|
8 import sys
|
|
9 import getopt
|
|
10 import exceptions
|
|
11 import copy
|
|
12 from commons.core.coord.Align import *
|
|
13
|
|
14
|
|
15 class RmvPairAlignInChunkOverlaps( object ):
|
|
16 """
|
|
17 Remove hits due to chunk overlaps.
|
|
18 """
|
|
19
|
|
20
|
|
21 def __init__( self, inFileName="", chunkLength=200000, chunkOverlap=10000, margin=10, outFileName="", verbose=0 ):
|
|
22 """
|
|
23 Constructor.
|
|
24 """
|
|
25 self._inFileName = inFileName
|
|
26 self._chunkLength = chunkLength
|
|
27 self._chunkOverlap = chunkOverlap
|
|
28 self._margin = margin
|
|
29 self._outFileName = outFileName
|
|
30 self._verbose = verbose
|
|
31
|
|
32 def help( self ):
|
|
33 """
|
|
34 Display the help.
|
|
35 """
|
|
36 print
|
|
37 print "usage: %s [ options ]" % ( sys.argv[0] )
|
|
38 print "options:"
|
|
39 print " -h: this help"
|
|
40 print " -i: name of the input file (format='align')"
|
|
41 print " -l: chunk length (in bp)"
|
|
42 print " -o: chunk overlap (in bp)"
|
|
43 print " -m: margin to remove match included into a chunk overlap (default=10)"
|
|
44 print " -O: name of the output file (default=inFileName+'.not_over')"
|
|
45 print " -v: verbose (default=0/1)"
|
|
46 print
|
|
47
|
|
48 def setAttributesFromCmdLine( self ):
|
|
49 """
|
|
50 Set attributes from the command-line arguments.
|
|
51 """
|
|
52 try:
|
|
53 opts, args = getopt.getopt(sys.argv[1:],"h:i:l:o:m:O:v:")
|
|
54 except getopt.GetoptError, err:
|
|
55 print str(err); self.help(); sys.exit(1)
|
|
56 for o,a in opts:
|
|
57 if o == "-h":
|
|
58 self.help(); sys.exit(0)
|
|
59 elif o == "-i":
|
|
60 self.setInputFileName( a )
|
|
61 elif o == "-l":
|
|
62 self.setChunkLength( a )
|
|
63 elif o == "-o":
|
|
64 self.setChunkOverlap( a )
|
|
65 elif o == "-m":
|
|
66 self.setMargin( a )
|
|
67 elif o == "-O":
|
|
68 self.setOutputFileName( a )
|
|
69 elif o == "-v":
|
|
70 self.setVerbosityLevel( a )
|
|
71
|
|
72 def setInputFileName( self, inFileName ):
|
|
73 self._inFileName = inFileName
|
|
74
|
|
75 def setChunkLength( self, chunkLength ):
|
|
76 self._chunkLength = int(chunkLength)
|
|
77
|
|
78 def setChunkOverlap( self, chunkOverlap ):
|
|
79 self._chunkOverlap = int(chunkOverlap)
|
|
80
|
|
81 def setMargin( self, margin ):
|
|
82 self._margin = int(margin)
|
|
83
|
|
84 def setOutputFileName( self, outFileName ):
|
|
85 self._outFileName = outFileName
|
|
86
|
|
87 def setVerbosityLevel( self, verbose ):
|
|
88 self._verbose = int(verbose)
|
|
89
|
|
90 def checkAttributes( self ):
|
|
91 """
|
|
92 Before running, check the required attributes are properly filled.
|
|
93 """
|
|
94 if self._inFileName == "":
|
|
95 print "ERROR: missing input file"; self.help(); sys.exit(1)
|
|
96 if not os.path.exists(self._inFileName ):
|
|
97 print "ERROR: input file '%s' doesn't exist" %( self._inFileName )
|
|
98 if self._outFileName == "":
|
|
99 self._outFileName = "%s.not_over" % ( self._inFileName )
|
|
100
|
|
101
|
|
102 def isPairAlignAChunkOverlap( self, a, chunkQuery, chunkSubject ):
|
|
103 """
|
|
104 Return True if the pairwise alignment exactly corresponds to a 2-chunk overlap, False otherwise.
|
|
105 Take into account cases specific to BLASTER or PALS.
|
|
106 """
|
|
107
|
|
108 if a.range_query.isOnDirectStrand() != a.range_subject.isOnDirectStrand():
|
|
109 if self._verbose > 1: print "on different strand"
|
|
110 return False
|
|
111
|
|
112 if chunkQuery == chunkSubject + 1:
|
|
113 if self._verbose > 1: print "query > subject"
|
|
114 if a.range_query.start == 1 and a.range_subject.end == self._chunkLength \
|
|
115 and ( a.range_query.getLength() == self._chunkOverlap \
|
|
116 or a.range_query.getLength() == self._chunkOverlap + 1 ) \
|
|
117 and ( a.range_subject.getLength() == self._chunkOverlap \
|
|
118 or a.range_subject.getLength() == self._chunkOverlap + 1 ):
|
|
119 if self._verbose > 1: print "chunk overlap"
|
|
120 return True
|
|
121
|
|
122 elif chunkQuery == chunkSubject - 1:
|
|
123 if self._verbose > 1: print "query < subject"
|
|
124 if a.range_query.end == self._chunkLength and a.range_subject.start == 1 \
|
|
125 and ( a.range_query.getLength() == self._chunkOverlap \
|
|
126 or a.range_query.getLength() == self._chunkOverlap + 1 ) \
|
|
127 and ( a.range_subject.getLength() == self._chunkOverlap \
|
|
128 or a.range_subject.getLength() == self._chunkOverlap + 1 ):
|
|
129 if self._verbose > 1: print "chunk overlap"
|
|
130 return True
|
|
131
|
|
132 if self._verbose > 1: print "not a chunk overlap"
|
|
133 return False
|
|
134
|
|
135
|
|
136 def isInInterval( self, coord, midpoint ):
|
|
137 """
|
|
138 Check if a given coordinate is inside the interval [midpoint-margin;midpoint+margin].
|
|
139 """
|
|
140 if coord >= midpoint - self._margin and coord <= midpoint + self._margin:
|
|
141 return True
|
|
142 return False
|
|
143
|
|
144
|
|
145 def isPairAlignWithinAndDueToAChunkOverlap( self, a, chunkQuery, chunkSubject ):
|
|
146 """
|
|
147 Return True if the pairwise alignment lies within an overlap between two contiguous chunks and is due to it, False otherwise.
|
|
148 """
|
|
149 uniqLength = self._chunkLength - self._chunkOverlap
|
|
150
|
|
151 if a.range_query.isOnDirectStrand() != a.range_subject.isOnDirectStrand():
|
|
152 if self._verbose > 1: print "on different strand"
|
|
153 return False
|
|
154
|
|
155 if chunkQuery == chunkSubject + 1:
|
|
156 if self._verbose > 1: print "query > subject"
|
|
157 if a.range_query.getMin() >= 1 and a.range_query.getMax() <= self._chunkOverlap \
|
|
158 and a.range_subject.getMin() >= self._chunkLength - self._chunkOverlap + 1 \
|
|
159 and a.range_subject.getMax() <= self._chunkLength:
|
|
160 if self._verbose > 1: print "included"
|
|
161 if self.isInInterval( a.range_query.getMin(), a.range_subject.getMin() - uniqLength ) \
|
|
162 and self.isInInterval( self._chunkOverlap - a.range_query.getMax(), self._chunkLength - a.range_subject.getMax() ):
|
|
163 if self._verbose > 1: print "due to overlap"
|
|
164 return True
|
|
165 else:
|
|
166 if self._verbose > 1: print "not due to overlap"
|
|
167 return False
|
|
168
|
|
169 elif chunkQuery == chunkSubject - 1:
|
|
170 if self._verbose > 1: print "query < subject"
|
|
171 if a.range_query.getMin() >= self._chunkLength - self._chunkOverlap + 1 \
|
|
172 and a.range_query.getMax() <= self._chunkLength \
|
|
173 and a.range_subject.getMin() >= 1 \
|
|
174 and a.range_subject.getMax() <= self._chunkOverlap:
|
|
175 if self._verbose > 1: print "included"
|
|
176 if self.isInInterval( a.range_subject.getMin(), a.range_query.getMin() - uniqLength ) \
|
|
177 and self.isInInterval( self._chunkOverlap - a.range_subject.getMax(), self._chunkLength - a.range_query.getMax() ):
|
|
178 if self._verbose > 1: print "due to overlap"
|
|
179 return True
|
|
180 else:
|
|
181 if self._verbose > 1: print "not due to overlap"
|
|
182 return False
|
|
183
|
|
184 else:
|
|
185 if self._verbose > 1: print "not contiguous chunks"
|
|
186 return False
|
|
187
|
|
188 if self._verbose > 1: print "not included"
|
|
189 return False
|
|
190
|
|
191
|
|
192 def removeChunkOverlaps( self ):
|
|
193 """
|
|
194 Remove pairwise alignments exactly corresponding to chunk overlaps or those included within such overlaps.
|
|
195 """
|
|
196 totalNbPairAlign = 0
|
|
197 nbChunkOverlaps = 0
|
|
198 d = {}
|
|
199 nbPairAlignWithinChunkOverlaps = 0
|
|
200
|
|
201 inF = open( self._inFileName, "r" )
|
|
202 outF = open( self._outFileName, "w" )
|
|
203 alignInstance = Align()
|
|
204
|
|
205 while True:
|
|
206 if not alignInstance.read( inF ): break
|
|
207 totalNbPairAlign += 1
|
|
208 if self._verbose > 1: alignInstance.show()
|
|
209
|
|
210 if "chunk" not in alignInstance.range_query.seqname or "chunk" not in alignInstance.range_subject.seqname:
|
|
211 print "WARNING: no 'chunk' in query or subject name"; return False
|
|
212
|
|
213 chunkQuery = int(alignInstance.range_query.seqname.replace("chunk",""))
|
|
214 chunkSubject = int(alignInstance.range_subject.seqname.replace("chunk",""))
|
|
215
|
|
216 if abs( chunkSubject - chunkQuery ) > 1:
|
|
217 if self._verbose > 1: print "non contiguous chunks -> keep"
|
|
218 alignInstance.write( outF )
|
|
219 continue
|
|
220
|
|
221 if alignInstance.range_query.isOnDirectStrand() != alignInstance.range_subject.isOnDirectStrand():
|
|
222 if self._verbose > 1: print "on different strand"
|
|
223 alignInstance.write( outF )
|
|
224 continue
|
|
225
|
|
226 if abs( chunkSubject - chunkQuery ) == 0:
|
|
227 if alignInstance.range_query.start == 1 \
|
|
228 and alignInstance.range_query.end == self._chunkLength \
|
|
229 and alignInstance.range_subject.start == 1 \
|
|
230 and alignInstance.range_subject.end == self._chunkLength:
|
|
231 if self._verbose > 1: print "self-alignment on whole chunk -> remove"
|
|
232 continue
|
|
233
|
|
234 if self.isPairAlignAChunkOverlap( alignInstance, chunkQuery, chunkSubject ):
|
|
235 if self._verbose > 1: print "chunk overlap -> remove"
|
|
236 nbChunkOverlaps += 1
|
|
237
|
|
238 elif self.isPairAlignWithinAndDueToAChunkOverlap( alignInstance, chunkQuery, chunkSubject ):
|
|
239 if self._verbose > 1: print "within chunk overlap -> remove"
|
|
240 nbPairAlignWithinChunkOverlaps += 1
|
|
241
|
|
242 else:
|
|
243 if self._verbose > 1: print "keep"
|
|
244 alignInstance.write( outF )
|
|
245
|
|
246 inF.close()
|
|
247 if self._verbose > 0: print "nb of pairwise alignments in input file: %i" % ( totalNbPairAlign )
|
|
248 if self._verbose > 0: print "nb of chunk overlaps: %i" % ( nbChunkOverlaps )
|
|
249 if self._verbose > 0: print "nb of pairwise alignments within chunk overlaps: %i" % ( nbPairAlignWithinChunkOverlaps )
|
|
250
|
|
251 for names,lAligns in d.items():
|
|
252 for alignInstance in lAligns:
|
|
253 alignInstance.write( outF )
|
|
254 outF.close()
|
|
255
|
|
256
|
|
257 def start( self ):
|
|
258 """
|
|
259 Useful commands before running the program.
|
|
260 """
|
|
261 if self._verbose > 0:
|
|
262 print "START %s" % ( type(self).__name__ ); sys.stdout.flush()
|
|
263 self.checkAttributes()
|
|
264
|
|
265
|
|
266 def end( self ):
|
|
267 """
|
|
268 Useful commands before ending the program.
|
|
269 """
|
|
270 if self._verbose > 0:
|
|
271 print "END %s" % ( type(self).__name__ ); sys.stdout.flush()
|
|
272
|
|
273
|
|
274 def run( self ):
|
|
275 """
|
|
276 Run the program.
|
|
277 """
|
|
278 self.start()
|
|
279 self.removeChunkOverlaps()
|
|
280 self.end()
|
|
281
|
|
282
|
|
283 if __name__ == '__main__':
|
|
284 i = RmvPairAlignInChunkOverlaps()
|
|
285 i.setAttributesFromCmdLine()
|
|
286 i.run()
|