Mercurial > repos > yufei-luo > s_mart
diff commons/tools/RmvPairAlignInChunkOverlaps.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/tools/RmvPairAlignInChunkOverlaps.py Mon Apr 29 03:20:15 2013 -0400 @@ -0,0 +1,286 @@ +#!/usr/bin/env python + +""" +Remove hits due to chunk overlaps. +""" + +import os +import sys +import getopt +import exceptions +import copy +from commons.core.coord.Align import * + + +class RmvPairAlignInChunkOverlaps( object ): + """ + Remove hits due to chunk overlaps. + """ + + + def __init__( self, inFileName="", chunkLength=200000, chunkOverlap=10000, margin=10, outFileName="", verbose=0 ): + """ + Constructor. + """ + self._inFileName = inFileName + self._chunkLength = chunkLength + self._chunkOverlap = chunkOverlap + self._margin = margin + self._outFileName = outFileName + self._verbose = verbose + + def help( self ): + """ + Display the help. + """ + print + print "usage: %s [ options ]" % ( sys.argv[0] ) + print "options:" + print " -h: this help" + print " -i: name of the input file (format='align')" + print " -l: chunk length (in bp)" + print " -o: chunk overlap (in bp)" + print " -m: margin to remove match included into a chunk overlap (default=10)" + print " -O: name of the output file (default=inFileName+'.not_over')" + print " -v: verbose (default=0/1)" + print + + def setAttributesFromCmdLine( self ): + """ + Set attributes from the command-line arguments. + """ + try: + opts, args = getopt.getopt(sys.argv[1:],"h:i:l:o:m:O:v:") + except getopt.GetoptError, err: + print str(err); self.help(); sys.exit(1) + for o,a in opts: + if o == "-h": + self.help(); sys.exit(0) + elif o == "-i": + self.setInputFileName( a ) + elif o == "-l": + self.setChunkLength( a ) + elif o == "-o": + self.setChunkOverlap( a ) + elif o == "-m": + self.setMargin( a ) + elif o == "-O": + self.setOutputFileName( a ) + elif o == "-v": + self.setVerbosityLevel( a ) + + def setInputFileName( self, inFileName ): + self._inFileName = inFileName + + def setChunkLength( self, chunkLength ): + self._chunkLength = int(chunkLength) + + def setChunkOverlap( self, chunkOverlap ): + self._chunkOverlap = int(chunkOverlap) + + def setMargin( self, margin ): + self._margin = int(margin) + + def setOutputFileName( self, outFileName ): + self._outFileName = outFileName + + def setVerbosityLevel( self, verbose ): + self._verbose = int(verbose) + + def checkAttributes( self ): + """ + Before running, check the required attributes are properly filled. + """ + if self._inFileName == "": + print "ERROR: missing input file"; self.help(); sys.exit(1) + if not os.path.exists(self._inFileName ): + print "ERROR: input file '%s' doesn't exist" %( self._inFileName ) + if self._outFileName == "": + self._outFileName = "%s.not_over" % ( self._inFileName ) + + + def isPairAlignAChunkOverlap( self, a, chunkQuery, chunkSubject ): + """ + Return True if the pairwise alignment exactly corresponds to a 2-chunk overlap, False otherwise. + Take into account cases specific to BLASTER or PALS. + """ + + if a.range_query.isOnDirectStrand() != a.range_subject.isOnDirectStrand(): + if self._verbose > 1: print "on different strand" + return False + + if chunkQuery == chunkSubject + 1: + if self._verbose > 1: print "query > subject" + if a.range_query.start == 1 and a.range_subject.end == self._chunkLength \ + and ( a.range_query.getLength() == self._chunkOverlap \ + or a.range_query.getLength() == self._chunkOverlap + 1 ) \ + and ( a.range_subject.getLength() == self._chunkOverlap \ + or a.range_subject.getLength() == self._chunkOverlap + 1 ): + if self._verbose > 1: print "chunk overlap" + return True + + elif chunkQuery == chunkSubject - 1: + if self._verbose > 1: print "query < subject" + if a.range_query.end == self._chunkLength and a.range_subject.start == 1 \ + and ( a.range_query.getLength() == self._chunkOverlap \ + or a.range_query.getLength() == self._chunkOverlap + 1 ) \ + and ( a.range_subject.getLength() == self._chunkOverlap \ + or a.range_subject.getLength() == self._chunkOverlap + 1 ): + if self._verbose > 1: print "chunk overlap" + return True + + if self._verbose > 1: print "not a chunk overlap" + return False + + + def isInInterval( self, coord, midpoint ): + """ + Check if a given coordinate is inside the interval [midpoint-margin;midpoint+margin]. + """ + if coord >= midpoint - self._margin and coord <= midpoint + self._margin: + return True + return False + + + def isPairAlignWithinAndDueToAChunkOverlap( self, a, chunkQuery, chunkSubject ): + """ + Return True if the pairwise alignment lies within an overlap between two contiguous chunks and is due to it, False otherwise. + """ + uniqLength = self._chunkLength - self._chunkOverlap + + if a.range_query.isOnDirectStrand() != a.range_subject.isOnDirectStrand(): + if self._verbose > 1: print "on different strand" + return False + + if chunkQuery == chunkSubject + 1: + if self._verbose > 1: print "query > subject" + if a.range_query.getMin() >= 1 and a.range_query.getMax() <= self._chunkOverlap \ + and a.range_subject.getMin() >= self._chunkLength - self._chunkOverlap + 1 \ + and a.range_subject.getMax() <= self._chunkLength: + if self._verbose > 1: print "included" + if self.isInInterval( a.range_query.getMin(), a.range_subject.getMin() - uniqLength ) \ + and self.isInInterval( self._chunkOverlap - a.range_query.getMax(), self._chunkLength - a.range_subject.getMax() ): + if self._verbose > 1: print "due to overlap" + return True + else: + if self._verbose > 1: print "not due to overlap" + return False + + elif chunkQuery == chunkSubject - 1: + if self._verbose > 1: print "query < subject" + if a.range_query.getMin() >= self._chunkLength - self._chunkOverlap + 1 \ + and a.range_query.getMax() <= self._chunkLength \ + and a.range_subject.getMin() >= 1 \ + and a.range_subject.getMax() <= self._chunkOverlap: + if self._verbose > 1: print "included" + if self.isInInterval( a.range_subject.getMin(), a.range_query.getMin() - uniqLength ) \ + and self.isInInterval( self._chunkOverlap - a.range_subject.getMax(), self._chunkLength - a.range_query.getMax() ): + if self._verbose > 1: print "due to overlap" + return True + else: + if self._verbose > 1: print "not due to overlap" + return False + + else: + if self._verbose > 1: print "not contiguous chunks" + return False + + if self._verbose > 1: print "not included" + return False + + + def removeChunkOverlaps( self ): + """ + Remove pairwise alignments exactly corresponding to chunk overlaps or those included within such overlaps. + """ + totalNbPairAlign = 0 + nbChunkOverlaps = 0 + d = {} + nbPairAlignWithinChunkOverlaps = 0 + + inF = open( self._inFileName, "r" ) + outF = open( self._outFileName, "w" ) + alignInstance = Align() + + while True: + if not alignInstance.read( inF ): break + totalNbPairAlign += 1 + if self._verbose > 1: alignInstance.show() + + if "chunk" not in alignInstance.range_query.seqname or "chunk" not in alignInstance.range_subject.seqname: + print "WARNING: no 'chunk' in query or subject name"; return False + + chunkQuery = int(alignInstance.range_query.seqname.replace("chunk","")) + chunkSubject = int(alignInstance.range_subject.seqname.replace("chunk","")) + + if abs( chunkSubject - chunkQuery ) > 1: + if self._verbose > 1: print "non contiguous chunks -> keep" + alignInstance.write( outF ) + continue + + if alignInstance.range_query.isOnDirectStrand() != alignInstance.range_subject.isOnDirectStrand(): + if self._verbose > 1: print "on different strand" + alignInstance.write( outF ) + continue + + if abs( chunkSubject - chunkQuery ) == 0: + if alignInstance.range_query.start == 1 \ + and alignInstance.range_query.end == self._chunkLength \ + and alignInstance.range_subject.start == 1 \ + and alignInstance.range_subject.end == self._chunkLength: + if self._verbose > 1: print "self-alignment on whole chunk -> remove" + continue + + if self.isPairAlignAChunkOverlap( alignInstance, chunkQuery, chunkSubject ): + if self._verbose > 1: print "chunk overlap -> remove" + nbChunkOverlaps += 1 + + elif self.isPairAlignWithinAndDueToAChunkOverlap( alignInstance, chunkQuery, chunkSubject ): + if self._verbose > 1: print "within chunk overlap -> remove" + nbPairAlignWithinChunkOverlaps += 1 + + else: + if self._verbose > 1: print "keep" + alignInstance.write( outF ) + + inF.close() + if self._verbose > 0: print "nb of pairwise alignments in input file: %i" % ( totalNbPairAlign ) + if self._verbose > 0: print "nb of chunk overlaps: %i" % ( nbChunkOverlaps ) + if self._verbose > 0: print "nb of pairwise alignments within chunk overlaps: %i" % ( nbPairAlignWithinChunkOverlaps ) + + for names,lAligns in d.items(): + for alignInstance in lAligns: + alignInstance.write( outF ) + outF.close() + + + def start( self ): + """ + Useful commands before running the program. + """ + if self._verbose > 0: + print "START %s" % ( type(self).__name__ ); sys.stdout.flush() + self.checkAttributes() + + + def end( self ): + """ + Useful commands before ending the program. + """ + if self._verbose > 0: + print "END %s" % ( type(self).__name__ ); sys.stdout.flush() + + + def run( self ): + """ + Run the program. + """ + self.start() + self.removeChunkOverlaps() + self.end() + + +if __name__ == '__main__': + i = RmvPairAlignInChunkOverlaps() + i.setAttributesFromCmdLine() + i.run()