Mercurial > repos > yufei-luo > s_mart
comparison commons/core/coord/PathUtils.py @ 36:44d5973c188c
Uploaded
author | m-zytnicki |
---|---|
date | Tue, 30 Apr 2013 15:02:29 -0400 |
parents | 769e306b7933 |
children |
comparison
equal
deleted
inserted
replaced
35:d94018ca4ada | 36:44d5973c188c |
---|---|
1 # Copyright INRA (Institut National de la Recherche Agronomique) | |
2 # http://www.inra.fr | |
3 # http://urgi.versailles.inra.fr | |
4 # | |
5 # This software is governed by the CeCILL license under French law and | |
6 # abiding by the rules of distribution of free software. You can use, | |
7 # modify and/ or redistribute the software under the terms of the CeCILL | |
8 # license as circulated by CEA, CNRS and INRIA at the following URL | |
9 # "http://www.cecill.info". | |
10 # | |
11 # As a counterpart to the access to the source code and rights to copy, | |
12 # modify and redistribute granted by the license, users are provided only | |
13 # with a limited warranty and the software's author, the holder of the | |
14 # economic rights, and the successive licensors have only limited | |
15 # liability. | |
16 # | |
17 # In this respect, the user's attention is drawn to the risks associated | |
18 # with loading, using, modifying and/or developing or reproducing the | |
19 # software by the user in light of its specific status of free software, | |
20 # that may mean that it is complicated to manipulate, and that also | |
21 # therefore means that it is reserved for developers and experienced | |
22 # professionals having in-depth computer knowledge. Users are therefore | |
23 # encouraged to load and test the software's suitability as regards their | |
24 # requirements in conditions enabling the security of their systems and/or | |
25 # data to be ensured and, more generally, to use and operate it in the | |
26 # same conditions as regards security. | |
27 # | |
28 # The fact that you are presently reading this means that you have had | |
29 # knowledge of the CeCILL license and that you accept its terms. | |
30 | |
31 | |
32 import os | |
33 import sys | |
34 import copy | |
35 from commons.core.coord.Path import Path | |
36 from commons.core.coord.SetUtils import SetUtils | |
37 from commons.core.coord.Map import Map | |
38 from commons.core.coord.AlignUtils import AlignUtils | |
39 from commons.core.checker.RepetException import RepetDataException | |
40 | |
41 ## Static methods for the manipulation of Path instances | |
42 # | |
43 class PathUtils ( object ): | |
44 | |
45 ## Change the identifier of each Set instance in the given list | |
46 # | |
47 # @param lPaths list of Path instances | |
48 # @param newId new identifier | |
49 # | |
50 def changeIdInList(lPaths, newId): | |
51 for iPath in lPaths: | |
52 iPath.id = newId | |
53 | |
54 changeIdInList = staticmethod( changeIdInList ) | |
55 | |
56 | |
57 ## Return a list of Set instances containing the query range from a list of Path instances | |
58 # | |
59 # @param lPaths a list of Path instances | |
60 # | |
61 def getSetListFromQueries(lPaths): | |
62 lSets = [] | |
63 for iPath in lPaths: | |
64 lSets.append( iPath.getSubjectAsSetOfQuery() ) | |
65 return lSets | |
66 | |
67 getSetListFromQueries = staticmethod( getSetListFromQueries ) | |
68 | |
69 #TODO: add tests !!!! | |
70 ## Return a list of Set instances containing the query range from a list of Path instances | |
71 # | |
72 # @param lPaths a list of Path instances | |
73 # | |
74 @staticmethod | |
75 def getSetListFromSubjects(lPaths): | |
76 lSets = [] | |
77 for iPath in lPaths: | |
78 lSets.append( iPath.getQuerySetOfSubject() ) | |
79 return lSets | |
80 | |
81 | |
82 ## Return a sorted list of Range instances containing the subjects from a list of Path instances | |
83 # | |
84 # @param lPaths a list of Path instances | |
85 # @note meaningful only if all Path instances have same identifier | |
86 # | |
87 def getRangeListFromSubjects( lPaths ): | |
88 lRanges = [] | |
89 for iPath in lPaths: | |
90 lRanges.append( iPath.range_subject ) | |
91 if lRanges[0].isOnDirectStrand(): | |
92 return sorted( lRanges, key=lambda iRange: ( iRange.getMin(), iRange.getMax() ) ) | |
93 else: | |
94 return sorted( lRanges, key=lambda iRange: ( iRange.getMax(), iRange.getMin() ) ) | |
95 | |
96 getRangeListFromSubjects = staticmethod( getRangeListFromSubjects ) | |
97 | |
98 | |
99 ## Return a tuple with min and max of query coordinates from Path instances in the given list | |
100 # | |
101 # @param lPaths a list of Path instances | |
102 # | |
103 def getQueryMinMaxFromPathList(lPaths): | |
104 qmin = -1 | |
105 qmax = -1 | |
106 for iPath in lPaths: | |
107 if qmin == -1: | |
108 qmin = iPath.range_query.start | |
109 qmin = min(qmin, iPath.range_query.getMin()) | |
110 qmax = max(qmax, iPath.range_query.getMax()) | |
111 return (qmin, qmax) | |
112 | |
113 getQueryMinMaxFromPathList = staticmethod( getQueryMinMaxFromPathList ) | |
114 | |
115 | |
116 ## Return a tuple with min and max of subject coordinates from Path instances in the given list | |
117 # | |
118 # @param lPaths lists of Path instances | |
119 # | |
120 def getSubjectMinMaxFromPathList(lPaths): | |
121 smin = -1 | |
122 smax = -1 | |
123 for iPath in lPaths: | |
124 if smin == -1: | |
125 smin = iPath.range_subject.start | |
126 smin = min(smin, iPath.range_subject.getMin()) | |
127 smax = max(smax, iPath.range_subject.getMax()) | |
128 return (smin, smax) | |
129 | |
130 getSubjectMinMaxFromPathList = staticmethod( getSubjectMinMaxFromPathList ) | |
131 | |
132 | |
133 ## Return True if the query range of any Path instance from the first list overlaps with the query range of any Path instance from the second list | |
134 # | |
135 # @param lPaths1: list of Path instances | |
136 # @param lPaths2: list of Path instances | |
137 # @return boolean | |
138 # | |
139 def areQueriesOverlappingBetweenPathLists( lPaths1, lPaths2 ): | |
140 lSortedPaths1 = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQuery( lPaths1 ) | |
141 lSortedPaths2 = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQuery( lPaths2 ) | |
142 i = 0 | |
143 j = 0 | |
144 while i != len(lSortedPaths1): | |
145 while j != len(lSortedPaths2): | |
146 if not lSortedPaths1[i].range_query.isOverlapping( lSortedPaths2[j].range_query ): | |
147 j += 1 | |
148 else: | |
149 return True | |
150 i += 1 | |
151 return False | |
152 | |
153 areQueriesOverlappingBetweenPathLists = staticmethod( areQueriesOverlappingBetweenPathLists ) | |
154 | |
155 | |
156 ## Show Path instances contained in the given list | |
157 # | |
158 # @param lPaths a list of Path instances | |
159 # | |
160 def showList(lPaths): | |
161 for iPath in lPaths: | |
162 iPath.show() | |
163 | |
164 showList = staticmethod( showList ) | |
165 | |
166 | |
167 ## Write Path instances contained in the given list | |
168 # | |
169 # @param lPaths a list of Path instances | |
170 # @param fileName name of the file to write the Path instances | |
171 # @param mode the open mode of the file ""w"" or ""a"" | |
172 # | |
173 def writeListInFile(lPaths, fileName, mode="w"): | |
174 AlignUtils.writeListInFile(lPaths, fileName, mode) | |
175 | |
176 writeListInFile = staticmethod( writeListInFile ) | |
177 | |
178 | |
179 ## Return new list of Path instances with no duplicate | |
180 # | |
181 # @param lPaths a list of Path instances | |
182 # @param useOnlyCoord boolean if True, check only coordinates and sequence names | |
183 # @return lUniqPaths a path instances list | |
184 # | |
185 def getPathListWithoutDuplicates(lPaths, useOnlyCoord = False): | |
186 if len(lPaths) < 2: | |
187 return lPaths | |
188 lSortedPaths = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQueryThenIdentifier( lPaths ) | |
189 lUniqPaths = [ lSortedPaths[0] ] | |
190 if useOnlyCoord: | |
191 for iPath in lSortedPaths[1:]: | |
192 if iPath.range_query.start != lUniqPaths[-1].range_query.start \ | |
193 or iPath.range_query.end != lUniqPaths[-1].range_query.end \ | |
194 or iPath.range_query.seqname != lUniqPaths[-1].range_query.seqname \ | |
195 or iPath.range_subject.start != lUniqPaths[-1].range_subject.start \ | |
196 or iPath.range_subject.end != lUniqPaths[-1].range_subject.end \ | |
197 or iPath.range_subject.seqname != lUniqPaths[-1].range_subject.seqname: | |
198 lUniqPaths.append( iPath ) | |
199 else: | |
200 for iPath in lSortedPaths[1:]: | |
201 if iPath != lUniqPaths[-1]: | |
202 lUniqPaths.append( iPath ) | |
203 return lUniqPaths | |
204 | |
205 getPathListWithoutDuplicates = staticmethod( getPathListWithoutDuplicates ) | |
206 | |
207 | |
208 def getPathListWithoutDuplicatesOnQueryCoord(lPaths): | |
209 if len(lPaths) < 2: | |
210 return lPaths | |
211 lSortedPaths = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQueryThenIdentifier( lPaths ) | |
212 lUniqPaths = [ lSortedPaths[0] ] | |
213 for iPath in lSortedPaths[1:]: | |
214 if iPath.range_query.start != lUniqPaths[-1].range_query.start \ | |
215 or iPath.range_query.end != lUniqPaths[-1].range_query.end \ | |
216 or iPath.range_query.seqname != lUniqPaths[-1].range_query.seqname: | |
217 lUniqPaths.append( iPath ) | |
218 return lUniqPaths | |
219 | |
220 getPathListWithoutDuplicatesOnQueryCoord = staticmethod(getPathListWithoutDuplicatesOnQueryCoord) | |
221 | |
222 | |
223 ## Split a Path list in several Path lists according to the identifier | |
224 # | |
225 # @param lPaths a list of Path instances | |
226 # @return a dictionary which keys are identifiers and values Path lists | |
227 # | |
228 def getDictOfListsWithIdAsKey( lPaths ): | |
229 dId2PathList = {} | |
230 for iPath in lPaths: | |
231 if dId2PathList.has_key( iPath.id ): | |
232 dId2PathList[ iPath.id ].append( iPath ) | |
233 else: | |
234 dId2PathList[ iPath.id ] = [ iPath ] | |
235 return dId2PathList | |
236 | |
237 getDictOfListsWithIdAsKey = staticmethod( getDictOfListsWithIdAsKey ) | |
238 | |
239 | |
240 ## Split a Path file in several Path lists according to the identifier | |
241 # | |
242 # @param pathFile name of the input Path file | |
243 # @return a dictionary which keys are identifiers and values Path lists | |
244 # | |
245 def getDictOfListsWithIdAsKeyFromFile( pathFile ): | |
246 dId2PathList = {} | |
247 pathFileHandler = open( pathFile, "r" ) | |
248 while True: | |
249 line = pathFileHandler.readline() | |
250 if line == "": | |
251 break | |
252 iPath = Path() | |
253 iPath.setFromString( line ) | |
254 if dId2PathList.has_key( iPath.id ): | |
255 dId2PathList[ iPath.id ].append( iPath ) | |
256 else: | |
257 dId2PathList[ iPath.id ] = [ iPath ] | |
258 pathFileHandler.close() | |
259 return dId2PathList | |
260 | |
261 getDictOfListsWithIdAsKeyFromFile = staticmethod( getDictOfListsWithIdAsKeyFromFile ) | |
262 | |
263 | |
264 ## Return a list of Path list(s) obtained while splitting a list of connected Path instances according to another based on query coordinates | |
265 # | |
266 # @param lToKeep: a list of Path instances to keep (reference) | |
267 # @param lToUnjoin: a list of Path instances to unjoin | |
268 # @return: list of Path list(s) (can be empty if one of the input lists is empty) | |
269 # @warning: all the path instances in a given list MUST be connected (i.e. same identifier) | |
270 # @warning: all the path instances in a given list MUST NOT overlap neither within each other nor with the Path instances of the other list | |
271 # | |
272 def getPathListUnjoinedBasedOnQuery( lToKeep, lToUnjoin ): | |
273 lSortedToKeep = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQuery( lToKeep ) | |
274 lSortedToUnjoin = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQuery( lToUnjoin ) | |
275 if lToUnjoin == []: | |
276 return [] | |
277 if lToKeep == []: | |
278 return [ lToUnjoin ] | |
279 | |
280 lLists = [] | |
281 k = 0 | |
282 while k < len(lSortedToKeep): | |
283 j1 = 0 | |
284 while j1 < len(lSortedToUnjoin) and lSortedToKeep[k].range_query.getMin() > lSortedToUnjoin[j1].range_query.getMax(): | |
285 j1 += 1 | |
286 if j1 == len(lSortedToUnjoin): | |
287 break | |
288 if j1 != 0: | |
289 lLists.append( lSortedToUnjoin[:j1] ) | |
290 del lSortedToUnjoin[:j1] | |
291 j1 = 0 | |
292 if k+1 == len(lSortedToKeep): | |
293 break | |
294 j2 = j1 | |
295 if j2 < len(lSortedToUnjoin) and lSortedToKeep[k+1].range_query.getMin() > lSortedToUnjoin[j2].range_query.getMax(): | |
296 while j2 < len(lSortedToUnjoin) and lSortedToKeep[k+1].range_query.getMin() > lSortedToUnjoin[j2].range_query.getMax(): | |
297 j2 += 1 | |
298 lLists.append( lSortedToUnjoin[j1:j2] ) | |
299 del lSortedToUnjoin[j1:j2] | |
300 k += 1 | |
301 | |
302 if lLists != [] or k == 0: | |
303 lLists.append( lSortedToUnjoin ) | |
304 return lLists | |
305 | |
306 getPathListUnjoinedBasedOnQuery = staticmethod( getPathListUnjoinedBasedOnQuery ) | |
307 | |
308 | |
309 ## Return the identity of the Path list, the identity of each instance being weighted by the length of each query range | |
310 # All Paths should have the same query and subject. | |
311 # The Paths are merged using query coordinates only. | |
312 # | |
313 # @param lPaths list of Path instances | |
314 # | |
315 def getIdentityFromPathList( lPaths, checkSubjects=True ): | |
316 if len( PathUtils.getListOfDistinctQueryNames( lPaths ) ) > 1: | |
317 msg = "ERROR: try to compute identity from Paths with different queries" | |
318 sys.stderr.write( "%s\n" % msg ) | |
319 sys.stderr.flush() | |
320 raise Exception | |
321 if checkSubjects and len( PathUtils.getListOfDistinctSubjectNames( lPaths ) ) > 1: | |
322 msg = "ERROR: try to compute identity from Paths with different subjects" | |
323 sys.stderr.write( "%s\n" % msg ) | |
324 sys.stderr.flush() | |
325 raise Exception | |
326 identity = 0 | |
327 lMergedPaths = PathUtils.mergePathsInListUsingQueryCoordsOnly( lPaths ) | |
328 lQuerySets = PathUtils.getSetListFromQueries( lMergedPaths ) | |
329 lMergedQuerySets = SetUtils.mergeSetsInList( lQuerySets ) | |
330 totalLengthOnQry = SetUtils.getCumulLength( lMergedQuerySets ) | |
331 for iPath in lMergedPaths: | |
332 identity += iPath.identity * iPath.getLengthOnQuery() | |
333 weightedIdentity = identity / float(totalLengthOnQry) | |
334 if weightedIdentity < 0: | |
335 msg = "ERROR: weighted identity '%.2f' outside range" % weightedIdentity | |
336 sys.stderr.write("%s\n" % msg) | |
337 sys.stderr.flush() | |
338 raise Exception | |
339 elif weightedIdentity > 100: | |
340 msg = "ERROR: weighted identity '%.2f' outside range" % weightedIdentity | |
341 sys.stderr.write("%s\n" % msg) | |
342 sys.stderr.flush() | |
343 raise RepetDataException(msg) | |
344 return weightedIdentity | |
345 | |
346 getIdentityFromPathList = staticmethod( getIdentityFromPathList ) | |
347 | |
348 | |
349 ## Return a list of Path instances sorted in increasing order according to the min of the query, then the max of the query, and finally their initial order. | |
350 # | |
351 # @param lPaths list of Path instances | |
352 # | |
353 def getPathListSortedByIncreasingMinQueryThenMaxQuery(lPaths): | |
354 return sorted( lPaths, key=lambda iPath: ( iPath.getQueryMin(), iPath.getQueryMax() ) ) | |
355 | |
356 getPathListSortedByIncreasingMinQueryThenMaxQuery = staticmethod( getPathListSortedByIncreasingMinQueryThenMaxQuery ) | |
357 | |
358 | |
359 ## Return a list of Path instances sorted in increasing order according to the min of the query, then the max of the query, then their identifier, and finally their initial order. | |
360 # | |
361 # @param lPaths list of Path instances | |
362 # | |
363 def getPathListSortedByIncreasingMinQueryThenMaxQueryThenIdentifier(lPaths): | |
364 return sorted( lPaths, key=lambda iPath: ( iPath.getQueryMin(), iPath.getQueryMax(), iPath.getIdentifier() ) ) | |
365 | |
366 getPathListSortedByIncreasingMinQueryThenMaxQueryThenIdentifier = staticmethod( getPathListSortedByIncreasingMinQueryThenMaxQueryThenIdentifier ) | |
367 | |
368 | |
369 ## Return a list of Path instances sorted in increasing order according to the min of the query, then the max of the query, then the min of the subject, then the max of the subject and finally their initial order. | |
370 # | |
371 # @param lPaths list of Path instances | |
372 # | |
373 @staticmethod | |
374 def getPathListSortedByIncreasingMinQueryThenMaxQueryThenMinSubjectThenMaxSubject(lPaths): | |
375 return sorted(lPaths, key=lambda iPath: (iPath.getQueryMin(), iPath.getQueryMax(), iPath.getSubjectMin(), iPath.getSubjectMax())) | |
376 | |
377 | |
378 ## Return a list of the distinct identifiers | |
379 # | |
380 # @param lPaths list of Path instances | |
381 # | |
382 def getListOfDistinctIdentifiers( lPaths ): | |
383 sDistinctIdentifiers = set() | |
384 for iPath in lPaths: | |
385 sDistinctIdentifiers.add(iPath.id) | |
386 return list(sDistinctIdentifiers) | |
387 | |
388 getListOfDistinctIdentifiers = staticmethod( getListOfDistinctIdentifiers ) | |
389 | |
390 | |
391 ## Return a list of the distinct query names present in the collection | |
392 # | |
393 # @param lPaths list of Path instances | |
394 # | |
395 def getListOfDistinctQueryNames( lPaths ): | |
396 sDistinctQueryNames = set() | |
397 for iPath in lPaths: | |
398 sDistinctQueryNames.add(iPath.range_query.seqname) | |
399 return list(sDistinctQueryNames) | |
400 | |
401 getListOfDistinctQueryNames = staticmethod( getListOfDistinctQueryNames ) | |
402 | |
403 | |
404 ## Return a list of the distinct subject names present in the collection | |
405 # | |
406 # @param lPaths list of Path instances | |
407 # | |
408 def getListOfDistinctSubjectNames( lPaths ): | |
409 sDistinctSubjectNames = set() | |
410 for iPath in lPaths: | |
411 sDistinctSubjectNames.add(iPath.range_subject.seqname) | |
412 return list(sDistinctSubjectNames) | |
413 | |
414 getListOfDistinctSubjectNames = staticmethod( getListOfDistinctSubjectNames ) | |
415 | |
416 | |
417 ## Return a list of lists containing query coordinates of the connections sorted in increasing order. | |
418 # | |
419 # @param lConnectedPaths: list of Path instances having the same identifier | |
420 # @param minLength: threshold below which connections are not reported (default= 0 bp) | |
421 # @note: return only connections longer than threshold | |
422 # @note: if coordinate on query ends at 100, return 101 | |
423 # @warning: Path instances MUST be sorted in increasing order according to query coordinates | |
424 # @warning: Path instances MUST be on direct query strand (and maybe on reverse subject strand) | |
425 # | |
426 def getListOfJoinCoordinatesOnQuery(lConnectedPaths, minLength=0): | |
427 lJoinCoordinates = [] | |
428 for i in xrange(1,len(lConnectedPaths)): | |
429 startJoin = lConnectedPaths[i-1].range_query.end | |
430 endJoin = lConnectedPaths[i].range_query.start | |
431 if endJoin - startJoin + 1 > minLength: | |
432 lJoinCoordinates.append( [ startJoin + 1, endJoin - 1 ] ) | |
433 return lJoinCoordinates | |
434 | |
435 getListOfJoinCoordinatesOnQuery = staticmethod( getListOfJoinCoordinatesOnQuery ) | |
436 | |
437 | |
438 ## Return the length on the query of all Path instance in the given list | |
439 # | |
440 # @param lPaths list of Path instances | |
441 # @note overlapping ranges are not summed but truncated. | |
442 # | |
443 def getLengthOnQueryFromPathList( lPaths ): | |
444 lSets = PathUtils.getSetListFromQueries( lPaths ) | |
445 lMergedSets = SetUtils.mergeSetsInList( lSets ) | |
446 length = SetUtils.getCumulLength( lMergedSets ) | |
447 return length | |
448 | |
449 getLengthOnQueryFromPathList = staticmethod( getLengthOnQueryFromPathList ) | |
450 | |
451 | |
452 ## Convert a Path file into an Align file | |
453 # | |
454 # @param pathFile: name of the input Path file | |
455 # @param alignFile: name of the output Align file | |
456 # | |
457 def convertPathFileIntoAlignFile(pathFile, alignFile): | |
458 pathFileHandler = open( pathFile, "r" ) | |
459 alignFileHandler = open( alignFile, "w" ) | |
460 iPath = Path() | |
461 while True: | |
462 line = pathFileHandler.readline() | |
463 if line == "": | |
464 break | |
465 iPath.setFromString( line ) | |
466 iAlign = iPath.getAlignInstance() | |
467 iAlign.write( alignFileHandler ) | |
468 pathFileHandler.close() | |
469 alignFileHandler.close() | |
470 | |
471 convertPathFileIntoAlignFile = staticmethod( convertPathFileIntoAlignFile ) | |
472 | |
473 #TODO: duplicated method => to rename with the name of the next method (which is called) ? | |
474 ## Convert a Path File into a Map file with query coordinates only | |
475 # | |
476 # @param pathFile: name of the input Path file | |
477 # @param mapFile: name of the output Map file | |
478 # | |
479 def convertPathFileIntoMapFileWithQueryCoordsOnly( pathFile, mapFile ): | |
480 pathFileHandler = open( pathFile, "r" ) | |
481 mapFileHandler = open( mapFile, "w" ) | |
482 p = Path() | |
483 while True: | |
484 line = pathFileHandler.readline() | |
485 if line == "": | |
486 break | |
487 p.reset() | |
488 p.setFromTuple( line.split("\t") ) | |
489 p.writeSubjectAsMapOfQuery( mapFileHandler ) | |
490 pathFileHandler.close() | |
491 mapFileHandler.close() | |
492 | |
493 convertPathFileIntoMapFileWithQueryCoordsOnly = staticmethod( convertPathFileIntoMapFileWithQueryCoordsOnly ) | |
494 | |
495 | |
496 ## for each line of a given Path file, write the coordinates of the subject on the query as one line in a Map file | |
497 # | |
498 # @param pathFile: name of the input Path file | |
499 # @param mapFile: name of the output Map file | |
500 # | |
501 def convertPathFileIntoMapFileWithSubjectsOnQueries( pathFile, mapFile ): | |
502 PathUtils.convertPathFileIntoMapFileWithQueryCoordsOnly( pathFile, mapFile ) | |
503 convertPathFileIntoMapFileWithSubjectsOnQueries = staticmethod( convertPathFileIntoMapFileWithSubjectsOnQueries ) | |
504 | |
505 | |
506 ## Merge matches on queries | |
507 # | |
508 # @param inFile: name of the input Path file | |
509 # @param outFile: name of the output Path file | |
510 # | |
511 def mergeMatchesOnQueries(inFile, outFile): | |
512 mapFile = "%s.map" % ( inFile ) | |
513 PathUtils.convertPathFileIntoMapFileWithQueryCoordsOnly( inFile, mapFile ) | |
514 cmd = "mapOp" | |
515 cmd += " -q %s" % ( mapFile ) | |
516 cmd += " -m" | |
517 cmd += " 2>&1 > /dev/null" | |
518 exitStatus = os.system( cmd ) | |
519 if exitStatus != 0: | |
520 print "ERROR: mapOp returned %i" % ( exitStatus ) | |
521 sys.exit(1) | |
522 os.remove( mapFile ) | |
523 mergeFile = "%s.merge" % ( mapFile ) | |
524 mergeFileHandler = open( mergeFile, "r" ) | |
525 outFileHandler = open( outFile, "w" ) | |
526 m = Map() | |
527 while True: | |
528 line = mergeFileHandler.readline() | |
529 if line == "": | |
530 break | |
531 m.reset() | |
532 m.setFromString( line, "\t" ) | |
533 m.writeAsQueryOfPath( outFileHandler ) | |
534 mergeFileHandler.close() | |
535 os.remove( mergeFile ) | |
536 outFileHandler.close() | |
537 | |
538 mergeMatchesOnQueries = staticmethod( mergeMatchesOnQueries ) | |
539 | |
540 | |
541 ## Filter chains of Path(s) which length is below a given threshold | |
542 # | |
543 # @param lPaths: list of Path instances | |
544 # @param minLengthChain: minimum length of a chain to be kept | |
545 # @note: a chain may contain a single Path instance | |
546 # @return: a list of Path instances | |
547 # | |
548 def filterPathListOnChainLength( lPaths, minLengthChain ): | |
549 lFilteredPaths = [] | |
550 dPathnum2Paths = PathUtils.getDictOfListsWithIdAsKey( lPaths ) | |
551 for pathnum in dPathnum2Paths.keys(): | |
552 length = PathUtils.getLengthOnQueryFromPathList( dPathnum2Paths[ pathnum ] ) | |
553 if length >= minLengthChain: | |
554 lFilteredPaths += dPathnum2Paths[ pathnum ] | |
555 return lFilteredPaths | |
556 | |
557 filterPathListOnChainLength = staticmethod( filterPathListOnChainLength ) | |
558 | |
559 | |
560 ## Return a Path list from a Path file | |
561 # | |
562 # @param pathFile string name of a Path file | |
563 # @return a list of Path instances | |
564 # | |
565 def getPathListFromFile( pathFile ): | |
566 lPaths = [] | |
567 pathFileHandler = open( pathFile, "r" ) | |
568 while True: | |
569 line = pathFileHandler.readline() | |
570 if line == "": | |
571 break | |
572 iPath = Path() | |
573 iPath.setFromString( line ) | |
574 lPaths.append( iPath ) | |
575 pathFileHandler.close() | |
576 return lPaths | |
577 | |
578 getPathListFromFile = staticmethod( getPathListFromFile ) | |
579 | |
580 | |
581 ## Convert a chain into a 'pathrange' | |
582 # | |
583 # @param lPaths a list of Path instances with the same identifier | |
584 # @note: the min and max of each Path is used | |
585 # | |
586 def convertPathListToPathrange( lPaths ): | |
587 if len(lPaths) == 0: | |
588 return | |
589 if len(lPaths) == 1: | |
590 return lPaths[0] | |
591 iPathrange = copy.deepcopy( lPaths[0] ) | |
592 iPathrange.identity = lPaths[0].identity * lPaths[0].getLengthOnQuery() | |
593 cumulQueryLength = iPathrange.getLengthOnQuery() | |
594 for iPath in lPaths[1:]: | |
595 if iPath.id != iPathrange.id: | |
596 msg = "ERROR: two Path instances in the chain have different identifiers" | |
597 sys.stderr.write( "%s\n" % ( msg ) ) | |
598 sys.exit(1) | |
599 if iPathrange.range_subject.isOnDirectStrand() != iPath.range_subject.isOnDirectStrand(): | |
600 msg = "ERROR: two Path instances in the chain are on different strands" | |
601 sys.stderr.write( "%s\n" % ( msg ) ) | |
602 sys.exit(1) | |
603 iPathrange.range_query.start = min( iPathrange.range_query.start, iPath.range_query.start ) | |
604 iPathrange.range_query.end = max( iPathrange.range_query.end, iPath.range_query.end ) | |
605 if iPathrange.range_subject.isOnDirectStrand(): | |
606 iPathrange.range_subject.start = min( iPathrange.range_subject.start, iPath.range_subject.start ) | |
607 iPathrange.range_subject.end = max( iPathrange.range_subject.end, iPath.range_subject.end ) | |
608 else: | |
609 iPathrange.range_subject.start = max( iPathrange.range_subject.start, iPath.range_subject.start ) | |
610 iPathrange.range_subject.end = min( iPathrange.range_subject.end, iPath.range_subject.end ) | |
611 iPathrange.e_value = min( iPathrange.e_value, iPath.e_value ) | |
612 iPathrange.score += iPath.score | |
613 iPathrange.identity += iPath.identity * iPath.getLengthOnQuery() | |
614 cumulQueryLength += iPath.getLengthOnQuery() | |
615 iPathrange.identity = iPathrange.identity / float(cumulQueryLength) | |
616 return iPathrange | |
617 | |
618 convertPathListToPathrange = staticmethod( convertPathListToPathrange ) | |
619 | |
620 | |
621 ## Convert a Path file into an Align file via 'pathrange' | |
622 # | |
623 # @param pathFile: name of the input Path file | |
624 # @param alignFile: name of the output Align file | |
625 # @param verbose integer verbosity level | |
626 # @note: the min and max of each Path is used | |
627 # | |
628 def convertPathFileIntoAlignFileViaPathrange( pathFile, alignFile, verbose=0 ): | |
629 lPaths = PathUtils.getPathListFromFile( pathFile ) | |
630 dId2PathList = PathUtils.getDictOfListsWithIdAsKey( lPaths ) | |
631 lIds = dId2PathList.keys() | |
632 lIds.sort() | |
633 if verbose > 0: | |
634 msg = "number of chains: %i" % ( len(lIds) ) | |
635 sys.stdout.write( "%s\n" % ( msg ) ) | |
636 sys.stdout.flush() | |
637 alignFileHandler = open( alignFile, "w" ) | |
638 for identifier in lIds: | |
639 iPath = PathUtils.convertPathListToPathrange( dId2PathList[ identifier ] ) | |
640 iAlign = iPath.getAlignInstance() | |
641 iAlign.write( alignFileHandler ) | |
642 alignFileHandler.close() | |
643 | |
644 convertPathFileIntoAlignFileViaPathrange = staticmethod( convertPathFileIntoAlignFileViaPathrange ) | |
645 | |
646 | |
647 ## Split a list of Path instances according to the name of the query | |
648 # | |
649 # @param lInPaths list of align instances | |
650 # @return lOutPathLists list of align instances lists | |
651 # | |
652 def splitPathListByQueryName( lInPaths ): | |
653 lInSortedPaths = sorted( lInPaths, key=lambda o: o.range_query.seqname ) | |
654 lOutPathLists = [] | |
655 if len(lInSortedPaths) != 0 : | |
656 lPathsForCurrentQuery = [] | |
657 previousQuery = lInSortedPaths[0].range_query.seqname | |
658 for iPath in lInSortedPaths : | |
659 currentQuery = iPath.range_query.seqname | |
660 if previousQuery != currentQuery : | |
661 lOutPathLists.append( lPathsForCurrentQuery ) | |
662 previousQuery = currentQuery | |
663 lPathsForCurrentQuery = [] | |
664 lPathsForCurrentQuery.append( iPath ) | |
665 | |
666 lOutPathLists.append(lPathsForCurrentQuery) | |
667 | |
668 return lOutPathLists | |
669 | |
670 splitPathListByQueryName = staticmethod( splitPathListByQueryName ) | |
671 | |
672 | |
673 ## Create an Path file from each list of Path instances in the input list | |
674 # | |
675 # @param lPathList list of lists with Path instances | |
676 # @param pattern string | |
677 # @param dirName string | |
678 # | |
679 def createPathFiles( lPathList, pattern, dirName="" ): | |
680 nbFiles = len(lPathList) | |
681 countFile = 1 | |
682 if dirName != "" : | |
683 if dirName[-1] != "/": | |
684 dirName = dirName + '/' | |
685 os.mkdir( dirName ) | |
686 | |
687 for lPath in lPathList: | |
688 fileName = dirName + pattern + "_%s.path" % ( str(countFile).zfill( len(str(nbFiles)) ) ) | |
689 PathUtils.writeListInFile( lPath, fileName ) | |
690 countFile += 1 | |
691 | |
692 createPathFiles = staticmethod( createPathFiles ) | |
693 | |
694 | |
695 ## Return a list of Path instances sorted in increasing order according to the min, then the inverse of the query length, and finally their initial order | |
696 # | |
697 # @param lPaths: list of Path instances | |
698 # | |
699 def getPathListSortedByIncreasingQueryMinThenInvQueryLength( lPaths ): | |
700 return sorted( lPaths, key=lambda iPath: ( iPath.getQueryMin(), 1 / float(iPath.getLengthOnQuery()) ) ) | |
701 | |
702 getPathListSortedByIncreasingQueryMinThenInvQueryLength = staticmethod( getPathListSortedByIncreasingQueryMinThenInvQueryLength ) | |
703 | |
704 | |
705 ## Merge all overlapping Path instances in a list without considering the identifiers | |
706 # Start by sorting the Path instances by their increasing min coordinate | |
707 # | |
708 # @return: a new list with the merged Path instances | |
709 # | |
710 def mergePathsInList( lPaths ): | |
711 lMergedPaths = [] | |
712 if len(lPaths)==0: | |
713 return lMergedPaths | |
714 | |
715 lSortedPaths = PathUtils.getPathListSortedByIncreasingQueryMinThenInvQueryLength( lPaths ) | |
716 | |
717 prev_count = 0 | |
718 for iPath in lSortedPaths[0:]: | |
719 if prev_count != len(lSortedPaths): | |
720 for i in lSortedPaths[ prev_count + 1: ]: | |
721 if iPath.isOverlapping( i ): | |
722 iPath.merge( i ) | |
723 isAlreadyInList = False | |
724 for newPath in lMergedPaths: | |
725 if newPath.isOverlapping( iPath ): | |
726 isAlreadyInList = True | |
727 newPath.merge( iPath ) | |
728 lMergedPaths [ lMergedPaths.index( newPath ) ] = newPath | |
729 if not isAlreadyInList: | |
730 lMergedPaths.append( iPath ) | |
731 prev_count += 1 | |
732 return lMergedPaths | |
733 | |
734 mergePathsInList = staticmethod( mergePathsInList ) | |
735 | |
736 | |
737 ## Merge all overlapping Path instances in a list without considering if subjects are overlapping. | |
738 # Start by sorting the Path instances by their increasing min coordinate. | |
739 # | |
740 # @return: a new list with the merged Path instances | |
741 # | |
742 def mergePathsInListUsingQueryCoordsOnly( lPaths ): | |
743 lMergedPaths = [] | |
744 if len(lPaths)==0: | |
745 return lMergedPaths | |
746 | |
747 lSortedPaths = PathUtils.getPathListSortedByIncreasingQueryMinThenInvQueryLength( lPaths ) | |
748 | |
749 prev_count = 0 | |
750 for iPath in lSortedPaths[0:]: | |
751 if prev_count != len(lSortedPaths): | |
752 for i in lSortedPaths[ prev_count + 1: ]: | |
753 if iPath.isQueryOverlapping( i ): | |
754 iPath.merge( i ) | |
755 isAlreadyInList = False | |
756 for newPath in lMergedPaths: | |
757 if newPath.isQueryOverlapping( iPath ): | |
758 isAlreadyInList = True | |
759 newPath.merge( iPath ) | |
760 lMergedPaths [ lMergedPaths.index( newPath ) ] = newPath | |
761 if not isAlreadyInList: | |
762 lMergedPaths.append( iPath ) | |
763 prev_count += 1 | |
764 return lMergedPaths | |
765 | |
766 mergePathsInListUsingQueryCoordsOnly = staticmethod( mergePathsInListUsingQueryCoordsOnly ) | |
767 | |
768 | |
769 ## Convert a Path file into a GFF file | |
770 # | |
771 # @param pathFile: name of the input Path file | |
772 # @param gffFile: name of the output GFF file | |
773 # @param source: source to write in the GFF file (column 2) | |
774 # | |
775 # @note the 'path' query is supposed to correspond to the 'gff' first column | |
776 # | |
777 def convertPathFileIntoGffFile( pathFile, gffFile, source="REPET", verbose=0 ): | |
778 dId2PathList = PathUtils.getDictOfListsWithIdAsKeyFromFile( pathFile ) | |
779 if verbose > 0: | |
780 msg = "number of chains: %i" % ( len(dId2PathList.keys()) ) | |
781 sys.stdout.write( "%s\n" % msg ) | |
782 sys.stdout.flush() | |
783 gffFileHandler = open( gffFile, "w" ) | |
784 for id in dId2PathList.keys(): | |
785 if len( dId2PathList[ id ] ) == 1: | |
786 iPath = dId2PathList[ id ][0] | |
787 string = iPath.toStringAsGff( ID="%i" % iPath.getIdentifier(), | |
788 source=source ) | |
789 gffFileHandler.write( "%s\n" % string ) | |
790 else: | |
791 iPathrange = PathUtils.convertPathListToPathrange( dId2PathList[ id ] ) | |
792 string = iPathrange.toStringAsGff( ID="ms%i" % iPathrange.getIdentifier(), | |
793 source=source ) | |
794 gffFileHandler.write( "%s\n" % string ) | |
795 count = 0 | |
796 for iPath in dId2PathList[ id ]: | |
797 count += 1 | |
798 string = iPath.toStringAsGff( type="match_part", | |
799 ID="mp%i-%i" % ( iPath.getIdentifier(), count ), | |
800 Parent="ms%i" % iPathrange.getIdentifier(), | |
801 source=source ) | |
802 gffFileHandler.write( "%s\n" % string ) | |
803 gffFileHandler.close() | |
804 | |
805 convertPathFileIntoGffFile = staticmethod( convertPathFileIntoGffFile ) | |
806 | |
807 | |
808 ## Convert a Path file into a Set file | |
809 # replace old parser.pathrange2set | |
810 # @param pathFile: name of the input Path file | |
811 # @param setFile: name of the output Set file | |
812 # | |
813 def convertPathFileIntoSetFile( pathFile, setFile ): | |
814 pathFileHandler = open( pathFile, "r" ) | |
815 setFileHandler = open( setFile, "w" ) | |
816 iPath = Path() | |
817 while True: | |
818 line = pathFileHandler.readline() | |
819 if line == "": | |
820 break | |
821 iPath.setFromString( line ) | |
822 iSet = iPath.getSubjectAsSetOfQuery() | |
823 iSet.write( setFileHandler ) | |
824 pathFileHandler.close() | |
825 setFileHandler.close() | |
826 | |
827 convertPathFileIntoSetFile = staticmethod( convertPathFileIntoSetFile ) | |
828 | |
829 ## Write Path File without duplicated Path (same query, same subject and same coordinate) | |
830 # | |
831 # @param inputFile: name of the input Path file | |
832 # @param outputFile: name of the output Path file | |
833 # | |
834 def removeInPathFileDuplicatedPathOnQueryNameQueryCoordAndSubjectName(inputFile, outputFile): | |
835 f = open(inputFile, "r") | |
836 line = f.readline() | |
837 previousQuery = "" | |
838 previousSubject = "" | |
839 lPaths = [] | |
840 while line: | |
841 iPath = Path() | |
842 iPath.setFromString(line) | |
843 query = iPath.getQueryName() | |
844 subject = iPath.getSubjectName() | |
845 if (query != previousQuery or subject != previousSubject) and lPaths != []: | |
846 lPathsWithoutDuplicate = PathUtils.getPathListWithoutDuplicatesOnQueryCoord(lPaths) | |
847 PathUtils.writeListInFile(lPathsWithoutDuplicate, outputFile, "a") | |
848 lPaths = [] | |
849 lPaths.append(iPath) | |
850 previousQuery = query | |
851 previousSubject = subject | |
852 line = f.readline() | |
853 lPathsWithoutDuplicate = PathUtils.getPathListWithoutDuplicatesOnQueryCoord(lPaths) | |
854 PathUtils.writeListInFile(lPathsWithoutDuplicate, outputFile, "a") | |
855 f.close() | |
856 removeInPathFileDuplicatedPathOnQueryNameQueryCoordAndSubjectName = staticmethod(removeInPathFileDuplicatedPathOnQueryNameQueryCoordAndSubjectName) | |
857 | |
858 |