comparison smart_toolShed/commons/core/coord/PathUtils.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e0f8dcca02ed
1 # Copyright INRA (Institut National de la Recherche Agronomique)
2 # http://www.inra.fr
3 # http://urgi.versailles.inra.fr
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30
31
32 import os
33 import sys
34 import copy
35 from commons.core.coord.Path import Path
36 from commons.core.coord.SetUtils import SetUtils
37 from commons.core.coord.Map import Map
38 from commons.core.coord.AlignUtils import AlignUtils
39 from commons.core.checker.RepetException import RepetDataException
40
41 ## Static methods for the manipulation of Path instances
42 #
43 class PathUtils ( object ):
44
45 ## Change the identifier of each Set instance in the given list
46 #
47 # @param lPaths list of Path instances
48 # @param newId new identifier
49 #
50 def changeIdInList(lPaths, newId):
51 for iPath in lPaths:
52 iPath.id = newId
53
54 changeIdInList = staticmethod( changeIdInList )
55
56
57 ## Return a list of Set instances containing the query range from a list of Path instances
58 #
59 # @param lPaths a list of Path instances
60 #
61 def getSetListFromQueries(lPaths):
62 lSets = []
63 for iPath in lPaths:
64 lSets.append( iPath.getSubjectAsSetOfQuery() )
65 return lSets
66
67 getSetListFromQueries = staticmethod( getSetListFromQueries )
68
69 #TODO: add tests !!!!
70 ## Return a list of Set instances containing the query range from a list of Path instances
71 #
72 # @param lPaths a list of Path instances
73 #
74 @staticmethod
75 def getSetListFromSubjects(lPaths):
76 lSets = []
77 for iPath in lPaths:
78 lSets.append( iPath.getQuerySetOfSubject() )
79 return lSets
80
81
82 ## Return a sorted list of Range instances containing the subjects from a list of Path instances
83 #
84 # @param lPaths a list of Path instances
85 # @note meaningful only if all Path instances have same identifier
86 #
87 def getRangeListFromSubjects( lPaths ):
88 lRanges = []
89 for iPath in lPaths:
90 lRanges.append( iPath.range_subject )
91 if lRanges[0].isOnDirectStrand():
92 return sorted( lRanges, key=lambda iRange: ( iRange.getMin(), iRange.getMax() ) )
93 else:
94 return sorted( lRanges, key=lambda iRange: ( iRange.getMax(), iRange.getMin() ) )
95
96 getRangeListFromSubjects = staticmethod( getRangeListFromSubjects )
97
98
99 ## Return a tuple with min and max of query coordinates from Path instances in the given list
100 #
101 # @param lPaths a list of Path instances
102 #
103 def getQueryMinMaxFromPathList(lPaths):
104 qmin = -1
105 qmax = -1
106 for iPath in lPaths:
107 if qmin == -1:
108 qmin = iPath.range_query.start
109 qmin = min(qmin, iPath.range_query.getMin())
110 qmax = max(qmax, iPath.range_query.getMax())
111 return (qmin, qmax)
112
113 getQueryMinMaxFromPathList = staticmethod( getQueryMinMaxFromPathList )
114
115
116 ## Return a tuple with min and max of subject coordinates from Path instances in the given list
117 #
118 # @param lPaths lists of Path instances
119 #
120 def getSubjectMinMaxFromPathList(lPaths):
121 smin = -1
122 smax = -1
123 for iPath in lPaths:
124 if smin == -1:
125 smin = iPath.range_subject.start
126 smin = min(smin, iPath.range_subject.getMin())
127 smax = max(smax, iPath.range_subject.getMax())
128 return (smin, smax)
129
130 getSubjectMinMaxFromPathList = staticmethod( getSubjectMinMaxFromPathList )
131
132
133 ## Return True if the query range of any Path instance from the first list overlaps with the query range of any Path instance from the second list
134 #
135 # @param lPaths1: list of Path instances
136 # @param lPaths2: list of Path instances
137 # @return boolean
138 #
139 def areQueriesOverlappingBetweenPathLists( lPaths1, lPaths2 ):
140 lSortedPaths1 = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQuery( lPaths1 )
141 lSortedPaths2 = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQuery( lPaths2 )
142 i = 0
143 j = 0
144 while i != len(lSortedPaths1):
145 while j != len(lSortedPaths2):
146 if not lSortedPaths1[i].range_query.isOverlapping( lSortedPaths2[j].range_query ):
147 j += 1
148 else:
149 return True
150 i += 1
151 return False
152
153 areQueriesOverlappingBetweenPathLists = staticmethod( areQueriesOverlappingBetweenPathLists )
154
155
156 ## Show Path instances contained in the given list
157 #
158 # @param lPaths a list of Path instances
159 #
160 def showList(lPaths):
161 for iPath in lPaths:
162 iPath.show()
163
164 showList = staticmethod( showList )
165
166
167 ## Write Path instances contained in the given list
168 #
169 # @param lPaths a list of Path instances
170 # @param fileName name of the file to write the Path instances
171 # @param mode the open mode of the file ""w"" or ""a""
172 #
173 def writeListInFile(lPaths, fileName, mode="w"):
174 AlignUtils.writeListInFile(lPaths, fileName, mode)
175
176 writeListInFile = staticmethod( writeListInFile )
177
178
179 ## Return new list of Path instances with no duplicate
180 #
181 # @param lPaths a list of Path instances
182 # @param useOnlyCoord boolean if True, check only coordinates and sequence names
183 # @return lUniqPaths a path instances list
184 #
185 def getPathListWithoutDuplicates(lPaths, useOnlyCoord = False):
186 if len(lPaths) < 2:
187 return lPaths
188 lSortedPaths = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQueryThenIdentifier( lPaths )
189 lUniqPaths = [ lSortedPaths[0] ]
190 if useOnlyCoord:
191 for iPath in lSortedPaths[1:]:
192 if iPath.range_query.start != lUniqPaths[-1].range_query.start \
193 or iPath.range_query.end != lUniqPaths[-1].range_query.end \
194 or iPath.range_query.seqname != lUniqPaths[-1].range_query.seqname \
195 or iPath.range_subject.start != lUniqPaths[-1].range_subject.start \
196 or iPath.range_subject.end != lUniqPaths[-1].range_subject.end \
197 or iPath.range_subject.seqname != lUniqPaths[-1].range_subject.seqname:
198 lUniqPaths.append( iPath )
199 else:
200 for iPath in lSortedPaths[1:]:
201 if iPath != lUniqPaths[-1]:
202 lUniqPaths.append( iPath )
203 return lUniqPaths
204
205 getPathListWithoutDuplicates = staticmethod( getPathListWithoutDuplicates )
206
207
208 def getPathListWithoutDuplicatesOnQueryCoord(lPaths):
209 if len(lPaths) < 2:
210 return lPaths
211 lSortedPaths = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQueryThenIdentifier( lPaths )
212 lUniqPaths = [ lSortedPaths[0] ]
213 for iPath in lSortedPaths[1:]:
214 if iPath.range_query.start != lUniqPaths[-1].range_query.start \
215 or iPath.range_query.end != lUniqPaths[-1].range_query.end \
216 or iPath.range_query.seqname != lUniqPaths[-1].range_query.seqname:
217 lUniqPaths.append( iPath )
218 return lUniqPaths
219
220 getPathListWithoutDuplicatesOnQueryCoord = staticmethod(getPathListWithoutDuplicatesOnQueryCoord)
221
222
223 ## Split a Path list in several Path lists according to the identifier
224 #
225 # @param lPaths a list of Path instances
226 # @return a dictionary which keys are identifiers and values Path lists
227 #
228 def getDictOfListsWithIdAsKey( lPaths ):
229 dId2PathList = {}
230 for iPath in lPaths:
231 if dId2PathList.has_key( iPath.id ):
232 dId2PathList[ iPath.id ].append( iPath )
233 else:
234 dId2PathList[ iPath.id ] = [ iPath ]
235 return dId2PathList
236
237 getDictOfListsWithIdAsKey = staticmethod( getDictOfListsWithIdAsKey )
238
239
240 ## Split a Path file in several Path lists according to the identifier
241 #
242 # @param pathFile name of the input Path file
243 # @return a dictionary which keys are identifiers and values Path lists
244 #
245 def getDictOfListsWithIdAsKeyFromFile( pathFile ):
246 dId2PathList = {}
247 pathFileHandler = open( pathFile, "r" )
248 while True:
249 line = pathFileHandler.readline()
250 if line == "":
251 break
252 iPath = Path()
253 iPath.setFromString( line )
254 if dId2PathList.has_key( iPath.id ):
255 dId2PathList[ iPath.id ].append( iPath )
256 else:
257 dId2PathList[ iPath.id ] = [ iPath ]
258 pathFileHandler.close()
259 return dId2PathList
260
261 getDictOfListsWithIdAsKeyFromFile = staticmethod( getDictOfListsWithIdAsKeyFromFile )
262
263
264 ## Return a list of Path list(s) obtained while splitting a list of connected Path instances according to another based on query coordinates
265 #
266 # @param lToKeep: a list of Path instances to keep (reference)
267 # @param lToUnjoin: a list of Path instances to unjoin
268 # @return: list of Path list(s) (can be empty if one of the input lists is empty)
269 # @warning: all the path instances in a given list MUST be connected (i.e. same identifier)
270 # @warning: all the path instances in a given list MUST NOT overlap neither within each other nor with the Path instances of the other list
271 #
272 def getPathListUnjoinedBasedOnQuery( lToKeep, lToUnjoin ):
273 lSortedToKeep = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQuery( lToKeep )
274 lSortedToUnjoin = PathUtils.getPathListSortedByIncreasingMinQueryThenMaxQuery( lToUnjoin )
275 if lToUnjoin == []:
276 return []
277 if lToKeep == []:
278 return [ lToUnjoin ]
279
280 lLists = []
281 k = 0
282 while k < len(lSortedToKeep):
283 j1 = 0
284 while j1 < len(lSortedToUnjoin) and lSortedToKeep[k].range_query.getMin() > lSortedToUnjoin[j1].range_query.getMax():
285 j1 += 1
286 if j1 == len(lSortedToUnjoin):
287 break
288 if j1 != 0:
289 lLists.append( lSortedToUnjoin[:j1] )
290 del lSortedToUnjoin[:j1]
291 j1 = 0
292 if k+1 == len(lSortedToKeep):
293 break
294 j2 = j1
295 if j2 < len(lSortedToUnjoin) and lSortedToKeep[k+1].range_query.getMin() > lSortedToUnjoin[j2].range_query.getMax():
296 while j2 < len(lSortedToUnjoin) and lSortedToKeep[k+1].range_query.getMin() > lSortedToUnjoin[j2].range_query.getMax():
297 j2 += 1
298 lLists.append( lSortedToUnjoin[j1:j2] )
299 del lSortedToUnjoin[j1:j2]
300 k += 1
301
302 if lLists != [] or k == 0:
303 lLists.append( lSortedToUnjoin )
304 return lLists
305
306 getPathListUnjoinedBasedOnQuery = staticmethod( getPathListUnjoinedBasedOnQuery )
307
308
309 ## Return the identity of the Path list, the identity of each instance being weighted by the length of each query range
310 # All Paths should have the same query and subject.
311 # The Paths are merged using query coordinates only.
312 #
313 # @param lPaths list of Path instances
314 #
315 def getIdentityFromPathList( lPaths, checkSubjects=True ):
316 if len( PathUtils.getListOfDistinctQueryNames( lPaths ) ) > 1:
317 msg = "ERROR: try to compute identity from Paths with different queries"
318 sys.stderr.write( "%s\n" % msg )
319 sys.stderr.flush()
320 raise Exception
321 if checkSubjects and len( PathUtils.getListOfDistinctSubjectNames( lPaths ) ) > 1:
322 msg = "ERROR: try to compute identity from Paths with different subjects"
323 sys.stderr.write( "%s\n" % msg )
324 sys.stderr.flush()
325 raise Exception
326 identity = 0
327 lMergedPaths = PathUtils.mergePathsInListUsingQueryCoordsOnly( lPaths )
328 lQuerySets = PathUtils.getSetListFromQueries( lMergedPaths )
329 lMergedQuerySets = SetUtils.mergeSetsInList( lQuerySets )
330 totalLengthOnQry = SetUtils.getCumulLength( lMergedQuerySets )
331 for iPath in lMergedPaths:
332 identity += iPath.identity * iPath.getLengthOnQuery()
333 weightedIdentity = identity / float(totalLengthOnQry)
334 if weightedIdentity < 0:
335 msg = "ERROR: weighted identity '%.2f' outside range" % weightedIdentity
336 sys.stderr.write("%s\n" % msg)
337 sys.stderr.flush()
338 raise Exception
339 elif weightedIdentity > 100:
340 msg = "ERROR: weighted identity '%.2f' outside range" % weightedIdentity
341 sys.stderr.write("%s\n" % msg)
342 sys.stderr.flush()
343 raise RepetDataException(msg)
344 return weightedIdentity
345
346 getIdentityFromPathList = staticmethod( getIdentityFromPathList )
347
348
349 ## Return a list of Path instances sorted in increasing order according to the min of the query, then the max of the query, and finally their initial order.
350 #
351 # @param lPaths list of Path instances
352 #
353 def getPathListSortedByIncreasingMinQueryThenMaxQuery(lPaths):
354 return sorted( lPaths, key=lambda iPath: ( iPath.getQueryMin(), iPath.getQueryMax() ) )
355
356 getPathListSortedByIncreasingMinQueryThenMaxQuery = staticmethod( getPathListSortedByIncreasingMinQueryThenMaxQuery )
357
358
359 ## Return a list of Path instances sorted in increasing order according to the min of the query, then the max of the query, then their identifier, and finally their initial order.
360 #
361 # @param lPaths list of Path instances
362 #
363 def getPathListSortedByIncreasingMinQueryThenMaxQueryThenIdentifier(lPaths):
364 return sorted( lPaths, key=lambda iPath: ( iPath.getQueryMin(), iPath.getQueryMax(), iPath.getIdentifier() ) )
365
366 getPathListSortedByIncreasingMinQueryThenMaxQueryThenIdentifier = staticmethod( getPathListSortedByIncreasingMinQueryThenMaxQueryThenIdentifier )
367
368
369 ## Return a list of Path instances sorted in increasing order according to the min of the query, then the max of the query, then the min of the subject, then the max of the subject and finally their initial order.
370 #
371 # @param lPaths list of Path instances
372 #
373 @staticmethod
374 def getPathListSortedByIncreasingMinQueryThenMaxQueryThenMinSubjectThenMaxSubject(lPaths):
375 return sorted(lPaths, key=lambda iPath: (iPath.getQueryMin(), iPath.getQueryMax(), iPath.getSubjectMin(), iPath.getSubjectMax()))
376
377
378 ## Return a list of the distinct identifiers
379 #
380 # @param lPaths list of Path instances
381 #
382 def getListOfDistinctIdentifiers( lPaths ):
383 sDistinctIdentifiers = set()
384 for iPath in lPaths:
385 sDistinctIdentifiers.add(iPath.id)
386 return list(sDistinctIdentifiers)
387
388 getListOfDistinctIdentifiers = staticmethod( getListOfDistinctIdentifiers )
389
390
391 ## Return a list of the distinct query names present in the collection
392 #
393 # @param lPaths list of Path instances
394 #
395 def getListOfDistinctQueryNames( lPaths ):
396 sDistinctQueryNames = set()
397 for iPath in lPaths:
398 sDistinctQueryNames.add(iPath.range_query.seqname)
399 return list(sDistinctQueryNames)
400
401 getListOfDistinctQueryNames = staticmethod( getListOfDistinctQueryNames )
402
403
404 ## Return a list of the distinct subject names present in the collection
405 #
406 # @param lPaths list of Path instances
407 #
408 def getListOfDistinctSubjectNames( lPaths ):
409 sDistinctSubjectNames = set()
410 for iPath in lPaths:
411 sDistinctSubjectNames.add(iPath.range_subject.seqname)
412 return list(sDistinctSubjectNames)
413
414 getListOfDistinctSubjectNames = staticmethod( getListOfDistinctSubjectNames )
415
416
417 ## Return a list of lists containing query coordinates of the connections sorted in increasing order.
418 #
419 # @param lConnectedPaths: list of Path instances having the same identifier
420 # @param minLength: threshold below which connections are not reported (default= 0 bp)
421 # @note: return only connections longer than threshold
422 # @note: if coordinate on query ends at 100, return 101
423 # @warning: Path instances MUST be sorted in increasing order according to query coordinates
424 # @warning: Path instances MUST be on direct query strand (and maybe on reverse subject strand)
425 #
426 def getListOfJoinCoordinatesOnQuery(lConnectedPaths, minLength=0):
427 lJoinCoordinates = []
428 for i in xrange(1,len(lConnectedPaths)):
429 startJoin = lConnectedPaths[i-1].range_query.end
430 endJoin = lConnectedPaths[i].range_query.start
431 if endJoin - startJoin + 1 > minLength:
432 lJoinCoordinates.append( [ startJoin + 1, endJoin - 1 ] )
433 return lJoinCoordinates
434
435 getListOfJoinCoordinatesOnQuery = staticmethod( getListOfJoinCoordinatesOnQuery )
436
437
438 ## Return the length on the query of all Path instance in the given list
439 #
440 # @param lPaths list of Path instances
441 # @note overlapping ranges are not summed but truncated.
442 #
443 def getLengthOnQueryFromPathList( lPaths ):
444 lSets = PathUtils.getSetListFromQueries( lPaths )
445 lMergedSets = SetUtils.mergeSetsInList( lSets )
446 length = SetUtils.getCumulLength( lMergedSets )
447 return length
448
449 getLengthOnQueryFromPathList = staticmethod( getLengthOnQueryFromPathList )
450
451
452 ## Convert a Path file into an Align file
453 #
454 # @param pathFile: name of the input Path file
455 # @param alignFile: name of the output Align file
456 #
457 def convertPathFileIntoAlignFile(pathFile, alignFile):
458 pathFileHandler = open( pathFile, "r" )
459 alignFileHandler = open( alignFile, "w" )
460 iPath = Path()
461 while True:
462 line = pathFileHandler.readline()
463 if line == "":
464 break
465 iPath.setFromString( line )
466 iAlign = iPath.getAlignInstance()
467 iAlign.write( alignFileHandler )
468 pathFileHandler.close()
469 alignFileHandler.close()
470
471 convertPathFileIntoAlignFile = staticmethod( convertPathFileIntoAlignFile )
472
473 #TODO: duplicated method => to rename with the name of the next method (which is called) ?
474 ## Convert a Path File into a Map file with query coordinates only
475 #
476 # @param pathFile: name of the input Path file
477 # @param mapFile: name of the output Map file
478 #
479 def convertPathFileIntoMapFileWithQueryCoordsOnly( pathFile, mapFile ):
480 pathFileHandler = open( pathFile, "r" )
481 mapFileHandler = open( mapFile, "w" )
482 p = Path()
483 while True:
484 line = pathFileHandler.readline()
485 if line == "":
486 break
487 p.reset()
488 p.setFromTuple( line.split("\t") )
489 p.writeSubjectAsMapOfQuery( mapFileHandler )
490 pathFileHandler.close()
491 mapFileHandler.close()
492
493 convertPathFileIntoMapFileWithQueryCoordsOnly = staticmethod( convertPathFileIntoMapFileWithQueryCoordsOnly )
494
495
496 ## for each line of a given Path file, write the coordinates of the subject on the query as one line in a Map file
497 #
498 # @param pathFile: name of the input Path file
499 # @param mapFile: name of the output Map file
500 #
501 def convertPathFileIntoMapFileWithSubjectsOnQueries( pathFile, mapFile ):
502 PathUtils.convertPathFileIntoMapFileWithQueryCoordsOnly( pathFile, mapFile )
503 convertPathFileIntoMapFileWithSubjectsOnQueries = staticmethod( convertPathFileIntoMapFileWithSubjectsOnQueries )
504
505
506 ## Merge matches on queries
507 #
508 # @param inFile: name of the input Path file
509 # @param outFile: name of the output Path file
510 #
511 def mergeMatchesOnQueries(inFile, outFile):
512 mapFile = "%s.map" % ( inFile )
513 PathUtils.convertPathFileIntoMapFileWithQueryCoordsOnly( inFile, mapFile )
514 cmd = "mapOp"
515 cmd += " -q %s" % ( mapFile )
516 cmd += " -m"
517 cmd += " 2>&1 > /dev/null"
518 exitStatus = os.system( cmd )
519 if exitStatus != 0:
520 print "ERROR: mapOp returned %i" % ( exitStatus )
521 sys.exit(1)
522 os.remove( mapFile )
523 mergeFile = "%s.merge" % ( mapFile )
524 mergeFileHandler = open( mergeFile, "r" )
525 outFileHandler = open( outFile, "w" )
526 m = Map()
527 while True:
528 line = mergeFileHandler.readline()
529 if line == "":
530 break
531 m.reset()
532 m.setFromString( line, "\t" )
533 m.writeAsQueryOfPath( outFileHandler )
534 mergeFileHandler.close()
535 os.remove( mergeFile )
536 outFileHandler.close()
537
538 mergeMatchesOnQueries = staticmethod( mergeMatchesOnQueries )
539
540
541 ## Filter chains of Path(s) which length is below a given threshold
542 #
543 # @param lPaths: list of Path instances
544 # @param minLengthChain: minimum length of a chain to be kept
545 # @note: a chain may contain a single Path instance
546 # @return: a list of Path instances
547 #
548 def filterPathListOnChainLength( lPaths, minLengthChain ):
549 lFilteredPaths = []
550 dPathnum2Paths = PathUtils.getDictOfListsWithIdAsKey( lPaths )
551 for pathnum in dPathnum2Paths.keys():
552 length = PathUtils.getLengthOnQueryFromPathList( dPathnum2Paths[ pathnum ] )
553 if length >= minLengthChain:
554 lFilteredPaths += dPathnum2Paths[ pathnum ]
555 return lFilteredPaths
556
557 filterPathListOnChainLength = staticmethod( filterPathListOnChainLength )
558
559
560 ## Return a Path list from a Path file
561 #
562 # @param pathFile string name of a Path file
563 # @return a list of Path instances
564 #
565 def getPathListFromFile( pathFile ):
566 lPaths = []
567 pathFileHandler = open( pathFile, "r" )
568 while True:
569 line = pathFileHandler.readline()
570 if line == "":
571 break
572 iPath = Path()
573 iPath.setFromString( line )
574 lPaths.append( iPath )
575 pathFileHandler.close()
576 return lPaths
577
578 getPathListFromFile = staticmethod( getPathListFromFile )
579
580
581 ## Convert a chain into a 'pathrange'
582 #
583 # @param lPaths a list of Path instances with the same identifier
584 # @note: the min and max of each Path is used
585 #
586 def convertPathListToPathrange( lPaths ):
587 if len(lPaths) == 0:
588 return
589 if len(lPaths) == 1:
590 return lPaths[0]
591 iPathrange = copy.deepcopy( lPaths[0] )
592 iPathrange.identity = lPaths[0].identity * lPaths[0].getLengthOnQuery()
593 cumulQueryLength = iPathrange.getLengthOnQuery()
594 for iPath in lPaths[1:]:
595 if iPath.id != iPathrange.id:
596 msg = "ERROR: two Path instances in the chain have different identifiers"
597 sys.stderr.write( "%s\n" % ( msg ) )
598 sys.exit(1)
599 if iPathrange.range_subject.isOnDirectStrand() != iPath.range_subject.isOnDirectStrand():
600 msg = "ERROR: two Path instances in the chain are on different strands"
601 sys.stderr.write( "%s\n" % ( msg ) )
602 sys.exit(1)
603 iPathrange.range_query.start = min( iPathrange.range_query.start, iPath.range_query.start )
604 iPathrange.range_query.end = max( iPathrange.range_query.end, iPath.range_query.end )
605 if iPathrange.range_subject.isOnDirectStrand():
606 iPathrange.range_subject.start = min( iPathrange.range_subject.start, iPath.range_subject.start )
607 iPathrange.range_subject.end = max( iPathrange.range_subject.end, iPath.range_subject.end )
608 else:
609 iPathrange.range_subject.start = max( iPathrange.range_subject.start, iPath.range_subject.start )
610 iPathrange.range_subject.end = min( iPathrange.range_subject.end, iPath.range_subject.end )
611 iPathrange.e_value = min( iPathrange.e_value, iPath.e_value )
612 iPathrange.score += iPath.score
613 iPathrange.identity += iPath.identity * iPath.getLengthOnQuery()
614 cumulQueryLength += iPath.getLengthOnQuery()
615 iPathrange.identity = iPathrange.identity / float(cumulQueryLength)
616 return iPathrange
617
618 convertPathListToPathrange = staticmethod( convertPathListToPathrange )
619
620
621 ## Convert a Path file into an Align file via 'pathrange'
622 #
623 # @param pathFile: name of the input Path file
624 # @param alignFile: name of the output Align file
625 # @param verbose integer verbosity level
626 # @note: the min and max of each Path is used
627 #
628 def convertPathFileIntoAlignFileViaPathrange( pathFile, alignFile, verbose=0 ):
629 lPaths = PathUtils.getPathListFromFile( pathFile )
630 dId2PathList = PathUtils.getDictOfListsWithIdAsKey( lPaths )
631 lIds = dId2PathList.keys()
632 lIds.sort()
633 if verbose > 0:
634 msg = "number of chains: %i" % ( len(lIds) )
635 sys.stdout.write( "%s\n" % ( msg ) )
636 sys.stdout.flush()
637 alignFileHandler = open( alignFile, "w" )
638 for identifier in lIds:
639 iPath = PathUtils.convertPathListToPathrange( dId2PathList[ identifier ] )
640 iAlign = iPath.getAlignInstance()
641 iAlign.write( alignFileHandler )
642 alignFileHandler.close()
643
644 convertPathFileIntoAlignFileViaPathrange = staticmethod( convertPathFileIntoAlignFileViaPathrange )
645
646
647 ## Split a list of Path instances according to the name of the query
648 #
649 # @param lInPaths list of align instances
650 # @return lOutPathLists list of align instances lists
651 #
652 def splitPathListByQueryName( lInPaths ):
653 lInSortedPaths = sorted( lInPaths, key=lambda o: o.range_query.seqname )
654 lOutPathLists = []
655 if len(lInSortedPaths) != 0 :
656 lPathsForCurrentQuery = []
657 previousQuery = lInSortedPaths[0].range_query.seqname
658 for iPath in lInSortedPaths :
659 currentQuery = iPath.range_query.seqname
660 if previousQuery != currentQuery :
661 lOutPathLists.append( lPathsForCurrentQuery )
662 previousQuery = currentQuery
663 lPathsForCurrentQuery = []
664 lPathsForCurrentQuery.append( iPath )
665
666 lOutPathLists.append(lPathsForCurrentQuery)
667
668 return lOutPathLists
669
670 splitPathListByQueryName = staticmethod( splitPathListByQueryName )
671
672
673 ## Create an Path file from each list of Path instances in the input list
674 #
675 # @param lPathList list of lists with Path instances
676 # @param pattern string
677 # @param dirName string
678 #
679 def createPathFiles( lPathList, pattern, dirName="" ):
680 nbFiles = len(lPathList)
681 countFile = 1
682 if dirName != "" :
683 if dirName[-1] != "/":
684 dirName = dirName + '/'
685 os.mkdir( dirName )
686
687 for lPath in lPathList:
688 fileName = dirName + pattern + "_%s.path" % ( str(countFile).zfill( len(str(nbFiles)) ) )
689 PathUtils.writeListInFile( lPath, fileName )
690 countFile += 1
691
692 createPathFiles = staticmethod( createPathFiles )
693
694
695 ## Return a list of Path instances sorted in increasing order according to the min, then the inverse of the query length, and finally their initial order
696 #
697 # @param lPaths: list of Path instances
698 #
699 def getPathListSortedByIncreasingQueryMinThenInvQueryLength( lPaths ):
700 return sorted( lPaths, key=lambda iPath: ( iPath.getQueryMin(), 1 / float(iPath.getLengthOnQuery()) ) )
701
702 getPathListSortedByIncreasingQueryMinThenInvQueryLength = staticmethod( getPathListSortedByIncreasingQueryMinThenInvQueryLength )
703
704
705 ## Merge all overlapping Path instances in a list without considering the identifiers
706 # Start by sorting the Path instances by their increasing min coordinate
707 #
708 # @return: a new list with the merged Path instances
709 #
710 def mergePathsInList( lPaths ):
711 lMergedPaths = []
712 if len(lPaths)==0:
713 return lMergedPaths
714
715 lSortedPaths = PathUtils.getPathListSortedByIncreasingQueryMinThenInvQueryLength( lPaths )
716
717 prev_count = 0
718 for iPath in lSortedPaths[0:]:
719 if prev_count != len(lSortedPaths):
720 for i in lSortedPaths[ prev_count + 1: ]:
721 if iPath.isOverlapping( i ):
722 iPath.merge( i )
723 isAlreadyInList = False
724 for newPath in lMergedPaths:
725 if newPath.isOverlapping( iPath ):
726 isAlreadyInList = True
727 newPath.merge( iPath )
728 lMergedPaths [ lMergedPaths.index( newPath ) ] = newPath
729 if not isAlreadyInList:
730 lMergedPaths.append( iPath )
731 prev_count += 1
732 return lMergedPaths
733
734 mergePathsInList = staticmethod( mergePathsInList )
735
736
737 ## Merge all overlapping Path instances in a list without considering if subjects are overlapping.
738 # Start by sorting the Path instances by their increasing min coordinate.
739 #
740 # @return: a new list with the merged Path instances
741 #
742 def mergePathsInListUsingQueryCoordsOnly( lPaths ):
743 lMergedPaths = []
744 if len(lPaths)==0:
745 return lMergedPaths
746
747 lSortedPaths = PathUtils.getPathListSortedByIncreasingQueryMinThenInvQueryLength( lPaths )
748
749 prev_count = 0
750 for iPath in lSortedPaths[0:]:
751 if prev_count != len(lSortedPaths):
752 for i in lSortedPaths[ prev_count + 1: ]:
753 if iPath.isQueryOverlapping( i ):
754 iPath.merge( i )
755 isAlreadyInList = False
756 for newPath in lMergedPaths:
757 if newPath.isQueryOverlapping( iPath ):
758 isAlreadyInList = True
759 newPath.merge( iPath )
760 lMergedPaths [ lMergedPaths.index( newPath ) ] = newPath
761 if not isAlreadyInList:
762 lMergedPaths.append( iPath )
763 prev_count += 1
764 return lMergedPaths
765
766 mergePathsInListUsingQueryCoordsOnly = staticmethod( mergePathsInListUsingQueryCoordsOnly )
767
768
769 ## Convert a Path file into a GFF file
770 #
771 # @param pathFile: name of the input Path file
772 # @param gffFile: name of the output GFF file
773 # @param source: source to write in the GFF file (column 2)
774 #
775 # @note the 'path' query is supposed to correspond to the 'gff' first column
776 #
777 def convertPathFileIntoGffFile( pathFile, gffFile, source="REPET", verbose=0 ):
778 dId2PathList = PathUtils.getDictOfListsWithIdAsKeyFromFile( pathFile )
779 if verbose > 0:
780 msg = "number of chains: %i" % ( len(dId2PathList.keys()) )
781 sys.stdout.write( "%s\n" % msg )
782 sys.stdout.flush()
783 gffFileHandler = open( gffFile, "w" )
784 for id in dId2PathList.keys():
785 if len( dId2PathList[ id ] ) == 1:
786 iPath = dId2PathList[ id ][0]
787 string = iPath.toStringAsGff( ID="%i" % iPath.getIdentifier(),
788 source=source )
789 gffFileHandler.write( "%s\n" % string )
790 else:
791 iPathrange = PathUtils.convertPathListToPathrange( dId2PathList[ id ] )
792 string = iPathrange.toStringAsGff( ID="ms%i" % iPathrange.getIdentifier(),
793 source=source )
794 gffFileHandler.write( "%s\n" % string )
795 count = 0
796 for iPath in dId2PathList[ id ]:
797 count += 1
798 string = iPath.toStringAsGff( type="match_part",
799 ID="mp%i-%i" % ( iPath.getIdentifier(), count ),
800 Parent="ms%i" % iPathrange.getIdentifier(),
801 source=source )
802 gffFileHandler.write( "%s\n" % string )
803 gffFileHandler.close()
804
805 convertPathFileIntoGffFile = staticmethod( convertPathFileIntoGffFile )
806
807
808 ## Convert a Path file into a Set file
809 # replace old parser.pathrange2set
810 # @param pathFile: name of the input Path file
811 # @param setFile: name of the output Set file
812 #
813 def convertPathFileIntoSetFile( pathFile, setFile ):
814 pathFileHandler = open( pathFile, "r" )
815 setFileHandler = open( setFile, "w" )
816 iPath = Path()
817 while True:
818 line = pathFileHandler.readline()
819 if line == "":
820 break
821 iPath.setFromString( line )
822 iSet = iPath.getSubjectAsSetOfQuery()
823 iSet.write( setFileHandler )
824 pathFileHandler.close()
825 setFileHandler.close()
826
827 convertPathFileIntoSetFile = staticmethod( convertPathFileIntoSetFile )
828
829 ## Write Path File without duplicated Path (same query, same subject and same coordinate)
830 #
831 # @param inputFile: name of the input Path file
832 # @param outputFile: name of the output Path file
833 #
834 def removeInPathFileDuplicatedPathOnQueryNameQueryCoordAndSubjectName(inputFile, outputFile):
835 f = open(inputFile, "r")
836 line = f.readline()
837 previousQuery = ""
838 previousSubject = ""
839 lPaths = []
840 while line:
841 iPath = Path()
842 iPath.setFromString(line)
843 query = iPath.getQueryName()
844 subject = iPath.getSubjectName()
845 if (query != previousQuery or subject != previousSubject) and lPaths != []:
846 lPathsWithoutDuplicate = PathUtils.getPathListWithoutDuplicatesOnQueryCoord(lPaths)
847 PathUtils.writeListInFile(lPathsWithoutDuplicate, outputFile, "a")
848 lPaths = []
849 lPaths.append(iPath)
850 previousQuery = query
851 previousSubject = subject
852 line = f.readline()
853 lPathsWithoutDuplicate = PathUtils.getPathListWithoutDuplicatesOnQueryCoord(lPaths)
854 PathUtils.writeListInFile(lPathsWithoutDuplicate, outputFile, "a")
855 f.close()
856 removeInPathFileDuplicatedPathOnQueryNameQueryCoordAndSubjectName = staticmethod(removeInPathFileDuplicatedPathOnQueryNameQueryCoordAndSubjectName)
857
858