Mercurial > repos > yufei-luo > s_mart
view commons/core/coord/SetUtils.py @ 59:2a4884ba3e5c
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 10 Feb 2014 03:39:09 -0500 |
parents | 769e306b7933 |
children |
line wrap: on
line source
# Copyright INRA (Institut National de la Recherche Agronomique) # http://www.inra.fr # http://urgi.versailles.inra.fr # # This software is governed by the CeCILL license under French law and # abiding by the rules of distribution of free software. You can use, # modify and/ or redistribute the software under the terms of the CeCILL # license as circulated by CEA, CNRS and INRIA at the following URL # "http://www.cecill.info". # # As a counterpart to the access to the source code and rights to copy, # modify and redistribute granted by the license, users are provided only # with a limited warranty and the software's author, the holder of the # economic rights, and the successive licensors have only limited # liability. # # In this respect, the user's attention is drawn to the risks associated # with loading, using, modifying and/or developing or reproducing the # software by the user in light of its specific status of free software, # that may mean that it is complicated to manipulate, and that also # therefore means that it is reserved for developers and experienced # professionals having in-depth computer knowledge. Users are therefore # encouraged to load and test the software's suitability as regards their # requirements in conditions enabling the security of their systems and/or # data to be ensured and, more generally, to use and operate it in the # same conditions as regards security. # # The fact that you are presently reading this means that you have had # knowledge of the CeCILL license and that you accept its terms. from commons.core.coord.Set import Set ## Static methods for the manipulation of Path instances # class SetUtils( object ): ## Change the identifier of each Set instance in the given list # # @param lSets list of Set instances # @param newId new identifier # def changeIdInList(lSets, newId): for iSet in lSets: iSet.id = newId changeIdInList = staticmethod( changeIdInList ) ## Return the length of the overlap between two lists of Set instances # # @param lSets1 list of Set instances # @param lSets2 list of Set instances # @return length of overlap # @warning sequence names are supposed to be identical # def getOverlapLengthBetweenLists(lSets1, lSets2): lSet1Sorted = SetUtils.getSetListSortedByIncreasingMinThenMax(lSets1) lSet2Sorted = SetUtils.getSetListSortedByIncreasingMinThenMax(lSets2) osize = 0 i = 0 j = 0 while i!= len(lSet1Sorted): while j!= len(lSet2Sorted) and lSet1Sorted[i].getMin()>lSet2Sorted[j].getMax()\ and not(lSet1Sorted[i].isOverlapping(lSet2Sorted[j])): j+=1 jj=j while jj!= len(lSet2Sorted) and lSet1Sorted[i].isOverlapping(lSet2Sorted[jj]): osize+=lSet1Sorted[i].getOverlapLength(lSet2Sorted[jj]) jj+=1 i+=1 return osize getOverlapLengthBetweenLists = staticmethod( getOverlapLengthBetweenLists ) ## Return True if the two lists of Set instances overlap, False otherwise # # @param lSets1 list of Set instances # @param lSets2 list of Set instances # def areSetsOverlappingBetweenLists( lSets1, lSets2 ): lSet1Sorted = SetUtils.getSetListSortedByIncreasingMinThenMax(lSets1) lSet2Sorted = SetUtils.getSetListSortedByIncreasingMinThenMax(lSets2) i=0 j=0 while i!= len(lSet1Sorted): while j!= len(lSet2Sorted) and lSet1Sorted[i].getMin()>lSet2Sorted[j].getMax()\ and not(lSet1Sorted[i].isOverlapping(lSet2Sorted[j])): j+=1 if j!= len(lSet2Sorted) and lSet1Sorted[i].isOverlapping(lSet2Sorted[j]): return True i+=1 return False areSetsOverlappingBetweenLists = staticmethod( areSetsOverlappingBetweenLists ) ## Merge all overlapping Set instances between two lists of Set and give the next identifier # # @param lSets1 list of Set instances # @param lSets2 list of Set instances # @param max_id start id value for inserting new Set # @return a new list of the merged Set instances and the next identifier # def getListOfMergedSetsAndNextId(lSets1, lSets2, max_id=0): lSets_merged = [] list2merge = SetUtils.getListOfIdListOfOverlappingSets ( lSets1,lSets2 ) idlist1 = SetUtils.getDictOfListsWithIdAsKey(lSets1) idlist2 = SetUtils.getDictOfListsWithIdAsKey(lSets2) if max_id == 0: max_id = max(idlist1.keys()) + 1 for i in list2merge: if i == []: continue l = [] min_id = max(i) for j in i: if j>0: if min_id>j: min_id=j l.extend(idlist1[j]) del idlist1[j] else: l.extend(idlist2[j*-1]) del idlist2[j*-1] l = SetUtils.mergeSetsInList(l) SetUtils.changeIdInList(l, min_id) lSets_merged.extend(l) for id, alist in idlist1.items(): lSets_merged.extend(alist) for id,alist in idlist2.items(): SetUtils.changeIdInList(alist,max_id) lSets_merged.extend(alist) max_id+=1 return lSets_merged, max_id getListOfMergedSetsAndNextId = staticmethod ( getListOfMergedSetsAndNextId ) # ## Concatenate two Set instance lists and give the next identifier # # # # @param lSets1 list of Set instances # # @param lSets2 list of Set instances # # @param maxId start id value for inserting new Set # # @return a new list of Set instances and the next identifier # # # @staticmethod # def getSetsListOfTwoConcatenatedSetsListAndNextId(lSets1, lSets2, maxId = 0): # lOutSets = lSets1 # dId2SetsList2 = SetUtils.getDictOfListsWithIdAsKey(lSets2) # if maxId == 0: # dId2SetsList1 = SetUtils.getDictOfListsWithIdAsKey(lSets1) # maxId = max(dId2SetsList1.keys()) # for lSets in dId2SetsList2.values(): # SetUtils.changeIdInList(lSets, maxId) # lOutSets.extend(lSets) # maxId += 1 # return lOutSets, maxId ## Return the sum of the length of each Set instance in the given list # # @param lSets: list of Set instances # def getCumulLength(lSets): length = 0 for i in lSets: length += i.getLength() return length getCumulLength = staticmethod( getCumulLength ) ## Return a tuple with min and max coordinates of Set instances in the given list # # @param lSets list of Set instances # def getListBoundaries(lSets): qmin = -1 qmax = -1 for iSet in lSets: if qmin == -1: qmin = iSet.start qmin = min(qmin, iSet.getMin()) qmax = max(qmax, iSet.getMax()) return (qmin, qmax) getListBoundaries = staticmethod( getListBoundaries ) ## Show Set instances contained in the given list # # @param lSets list of Set instances # def showList(lSets): for iSet in lSets: iSet.show() showList = staticmethod( showList ) ## Write Set instances contained in the given list # # @param lSets list of Set instances # @param fileName a file name # @param mode the open mode of the file '"w"' or '"a"' # def writeListInFile(lSets, fileName, mode="w"): fileHandler = open(fileName, mode) for iSet in lSets: iSet.write(fileHandler) fileHandler.close() writeListInFile = staticmethod( writeListInFile ) ## Split a Set list in several Set lists according to the identifier # # @param lSets list of Set instances # @return a dictionary which keys are identifiers and values Set lists # def getDictOfListsWithIdAsKey(lSets): dId2SetList = {} for iSet in lSets: if dId2SetList.has_key(iSet.id): dId2SetList[iSet.id].append(iSet) else: dId2SetList[iSet.id] = [iSet] return dId2SetList getDictOfListsWithIdAsKey = staticmethod( getDictOfListsWithIdAsKey ) ## Split a Set list in several Set lists according to the identifier # # @param lSets list of Set instances # @return a dictionary which keys are identifiers and values Set lists # def getDictOfListsWithIdAsKeyFromFile( setFile ): dId2SetList = {} setFileHandler = open( setFile, "r" ) while True: line = setFileHandler.readline() if line == "": break iSet = Set() iSet.setFromTuple( line[:-1].split("\t") ) if not dId2SetList.has_key( iSet.id ): dId2SetList[ iSet.id ] = [] dId2SetList[ iSet.id ].append( iSet ) setFileHandler.close() return dId2SetList getDictOfListsWithIdAsKeyFromFile = staticmethod( getDictOfListsWithIdAsKeyFromFile ) ## Return a Map list from the given Set List # # @param lSets list of Set instances # def getMapListFromSetList(lSets): lMaps = [] for iSet in lSets: lMaps.append(iSet.set2map()) return lMaps getMapListFromSetList = staticmethod( getMapListFromSetList ) ## Construct a Set list from a Map list # # @param lMaps list of Map instances # def getSetListFromMapList(lMaps): lSets = [] c = 0 for iMap in lMaps: c += 1 lSets.append( Set(c, iMap.name, iMap.seqname, iMap.start, iMap.end) ) return lSets getSetListFromMapList = staticmethod( getSetListFromMapList ) ## Merge all overlapping Set instances in a list without considering the identifiers. # Start by sorting Set instances by their increasing Min coordinate. # # @return: a new list of the merged Set instances # def mergeSetsInList(lSets): l=[] if len(lSets)==0: return l lSortedSets = SetUtils.getSetListSortedByIncreasingMinThenInvLength( lSets ) prev_count = 0 for iSet in lSortedSets[0:]: if prev_count != len(lSortedSets): for i in lSortedSets[ prev_count + 1: ]: if iSet.isOverlapping( i ): iSet.merge( i ) IsAlreadyInList = False for newSet in l: if newSet.isOverlapping( iSet ): IsAlreadyInList = True newSet.merge( iSet ) l [ l.index( newSet ) ] = newSet if not IsAlreadyInList: l.append( iSet ) prev_count += 1 return l mergeSetsInList = staticmethod( mergeSetsInList ) ## Unjoin a Set list according to another # # @param lToKeep: a list of Set instances to keep # @param lToUnjoin: a list of Set instances to unjoin # @return: lToUnjoin split in several list # def getSetListUnjoined(lToKeep, lToUnjoin): lSortedToKeep = SetUtils.getSetListSortedByIncreasingMinThenMax( lToKeep ) lSortedToUnjoin = SetUtils.getSetListSortedByIncreasingMinThenMax( lToUnjoin ) if lSortedToUnjoin == []: return [] if lSortedToKeep == []: return [ lSortedToUnjoin ] i=0 resultListSet=[] while i<len(lSortedToKeep): j1=0 while j1<len(lSortedToUnjoin) and lSortedToKeep[i].getMin() > lSortedToUnjoin[j1].getMax(): j1+=1 if j1==len(lSortedToUnjoin): break if j1!=0: resultListSet.append(lSortedToUnjoin[:j1]) del lSortedToUnjoin[:j1] j1=0 if i+1==len(lSortedToKeep): break j2=j1 if j2<len(lSortedToUnjoin) and lSortedToKeep[i+1].getMin() > lSortedToUnjoin[j2].getMax(): while j2<len(lSortedToUnjoin) and lSortedToKeep[i+1].getMin() > lSortedToUnjoin[j2].getMax(): j2+=1 resultListSet.append(lSortedToUnjoin[j1:j2]) del lSortedToUnjoin[j1:j2] i+=1 if resultListSet!=[] or i == 0: resultListSet.append(lSortedToUnjoin) return resultListSet getSetListUnjoined = staticmethod(getSetListUnjoined) ## Return new list of Set instances with no duplicate # # @param lSets list of Set instances # def getSetListWithoutDuplicates( lSets ): if len(lSets) < 2: return lSets lSortedSet = SetUtils.getSetListSortedByIncreasingMinThenMax( lSets ) lUniqSet = [ lSortedSet[0] ] for iSet in lSortedSet[1:]: if iSet != lUniqSet[-1]: lUniqSet.append( iSet ) return lUniqSet getSetListWithoutDuplicates = staticmethod( getSetListWithoutDuplicates ) ## Return a list of Set instances sorted in increasing order according to the Min, then the Max, and finally their initial order # # @param lSets: list of Set instances # def getSetListSortedByIncreasingMinThenMax( lSets ): return sorted( lSets, key=lambda iSet: ( iSet.getMin(), iSet.getMax() ) ) getSetListSortedByIncreasingMinThenMax = staticmethod( getSetListSortedByIncreasingMinThenMax ) ## Return a list of Set instances sorted in increasing order according to the min, then the inverse of the length, and finally their initial order # # @param lSets: list of Set instances # def getSetListSortedByIncreasingMinThenInvLength( lSets ): return sorted( lSets, key=lambda iSet: ( iSet.getMin(), 1 / float(iSet.getLength()) ) ) getSetListSortedByIncreasingMinThenInvLength = staticmethod( getSetListSortedByIncreasingMinThenInvLength ) ## Return a list of Set instances sorted in increasing order according to the SeqName, then the Name, then the Min, then the Max and finally their initial order # # @param lSets: list of Set instances # def getSetListSortedBySeqThenRegionThenMinThenMax(lSets): return sorted(lSets, key=lambda iSet: (iSet.getSeqname(), iSet.getName(), iSet.getMin(), iSet.getMax())) getSetListSortedBySeqThenRegionThenMinThenMax = staticmethod(getSetListSortedBySeqThenRegionThenMinThenMax) ## Return a list of identifier lists of overlapping Sets from the subject list, according to the reference list # # @param lRef list of Set instances # @param lSubject list of Set instances # def getListOfIdListOfOverlappingSets(lRef,lSubject): lSortedRef = SetUtils.getSetListSortedByIncreasingMinThenMax( lRef ) lSortedSubject = SetUtils.getSetListSortedByIncreasingMinThenMax( lSubject ) lOverlappingSet = [] lOverlappingSetCounter = 0 id2LOverlappingSet_pos = {} i = 0 j = 0 while i!= len(lSortedRef): while j!= len(lSortedSubject) and lSortedRef[i].getMin()>lSortedSubject[j].getMax()\ and not(lSortedRef[i].isOverlapping(lSortedSubject[j])\ and lSortedRef[i].isOnDirectStrand()==lSortedSubject[j].isOnDirectStrand()): j+=1 jj=j while jj!= len(lSortedSubject) and lSortedRef[i].isOverlapping(lSortedSubject[jj])\ and lSortedRef[i].isOnDirectStrand()==lSortedSubject[jj].isOnDirectStrand(): id1=lSortedRef[i].id id2=lSortedSubject[jj].id*-1 if id2LOverlappingSet_pos.has_key(id1) \ and not id2LOverlappingSet_pos.has_key(id2): lOverlappingSet[id2LOverlappingSet_pos[id1]].append(id2) id2LOverlappingSet_pos[id2]=id2LOverlappingSet_pos[id1] if id2LOverlappingSet_pos.has_key(id2) \ and not id2LOverlappingSet_pos.has_key(id1): lOverlappingSet[id2LOverlappingSet_pos[id2]].append(id1) id2LOverlappingSet_pos[id1]=id2LOverlappingSet_pos[id2] if not id2LOverlappingSet_pos.has_key(id2) \ and not id2LOverlappingSet_pos.has_key(id1): lOverlappingSet.append([id1,id2]) id2LOverlappingSet_pos[id1]=lOverlappingSetCounter id2LOverlappingSet_pos[id2]=lOverlappingSetCounter lOverlappingSetCounter+=1 jj+=1 i+=1 return lOverlappingSet getListOfIdListOfOverlappingSets = staticmethod (getListOfIdListOfOverlappingSets) ## Return a list of sets without overlapping between two lists of sets # # @param lSet1 and lSet2 # def getListOfSetWithoutOverlappingBetweenTwoListOfSet(lSet1, lSet2): for i in lSet1: for idx,j in enumerate(lSet2): n=j.diff(i) if not n.isEmpty() and n.getLength()>=20: lSet2.append(n) lSet2WithoutOverlaps=[] for i in lSet2: if not i.isEmpty() and i.getLength()>=20: lSet2WithoutOverlaps.append(i) return lSet2WithoutOverlaps getListOfSetWithoutOverlappingBetweenTwoListOfSet = staticmethod (getListOfSetWithoutOverlappingBetweenTwoListOfSet) ## Return a Set list from a Set file # # @param setFile string name of a Set file # @return a list of Set instances # def getSetListFromFile( setFile ): lSets = [] setFileHandler = open( setFile, "r" ) while True: line = setFileHandler.readline() if line == "": break iSet = Set() iSet.setFromString( line ) lSets.append( iSet ) setFileHandler.close() return lSets getSetListFromFile = staticmethod( getSetListFromFile ) def convertSetFileIntoMapFile( setFile, mapFile ): setFileHandler = open( setFile, "r" ) mapFileHandler = open( mapFile, "w" ) iSet = Set() while True: line = setFileHandler.readline() if line == "": break iSet.setFromString( line ) iMap = iSet.getMapInstance() iMap.write( mapFileHandler ) setFileHandler.close() mapFileHandler.close() convertSetFileIntoMapFile = staticmethod( convertSetFileIntoMapFile ) def getDictOfListsWithSeqnameAsKey( lSets ): dSeqnamesToSetList = {} for iSet in lSets: if not dSeqnamesToSetList.has_key( iSet.seqname ): dSeqnamesToSetList[ iSet.seqname ] = [] dSeqnamesToSetList[ iSet.seqname ].append( iSet ) return dSeqnamesToSetList getDictOfListsWithSeqnameAsKey = staticmethod( getDictOfListsWithSeqnameAsKey ) def filterOnLength( lSets, minLength=0, maxLength=10000000000 ): if minLength == 0 and maxLength == 0: return lSets lFiltered = [] for iSet in lSets: if minLength <= iSet.getLength() <= maxLength: lFiltered.append( iSet ) return lFiltered filterOnLength = staticmethod( filterOnLength ) def getListOfNames( setFile ): lNames = [] setFileHandler = open( setFile, "r" ) iSet = Set() while True: line = setFileHandler.readline() if line == "": break iSet.setFromTuple( line[:-1].split("\t") ) if iSet.name not in lNames: lNames.append( iSet.name ) setFileHandler.close() return lNames getListOfNames = staticmethod( getListOfNames ) def getDictOfDictsWithNamesThenIdAsKeyFromFile( setFile ): dNames2DictsId = {} setFileHandler = open( setFile, "r" ) while True: line = setFileHandler.readline() if line == "": break iSet = Set() iSet.setFromTuple( line[:-1].split("\t") ) if not dNames2DictsId.has_key( iSet.name ): dNames2DictsId[ iSet.name ] = { iSet.id: [ iSet ] } else: if not dNames2DictsId[ iSet.name ].has_key( iSet.id ): dNames2DictsId[ iSet.name ][ iSet.id ] = [ iSet ] else: dNames2DictsId[ iSet.name ][ iSet.id ].append( iSet ) setFileHandler.close() return dNames2DictsId getDictOfDictsWithNamesThenIdAsKeyFromFile = staticmethod( getDictOfDictsWithNamesThenIdAsKeyFromFile )