Mercurial > repos > yufei-luo > s_mart
view SMART/Java/Python/CompareOverlapping.py @ 59:2a4884ba3e5c
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 10 Feb 2014 03:39:09 -0500 |
parents | 169d364ddd91 |
children |
line wrap: on
line source
#! /usr/bin/env python # # Copyright INRA-URGI 2009-2010 # # This software is governed by the CeCILL license under French law and # abiding by the rules of distribution of free software. You can use, # modify and/ or redistribute the software under the terms of the CeCILL # license as circulated by CEA, CNRS and INRIA at the following URL # "http://www.cecill.info". # # As a counterpart to the access to the source code and rights to copy, # modify and redistribute granted by the license, users are provided only # with a limited warranty and the software's author, the holder of the # economic rights, and the successive licensors have only limited # liability. # # In this respect, the user's attention is drawn to the risks associated # with loading, using, modifying and/or developing or reproducing the # software by the user in light of its specific status of free software, # that may mean that it is complicated to manipulate, and that also # therefore means that it is reserved for developers and experienced # professionals having in-depth computer knowledge. Users are therefore # encouraged to load and test the software's suitability as regards their # requirements in conditions enabling the security of their systems and/or # data to be ensured and, more generally, to use and operate it in the # same conditions as regards security. # # The fact that you are presently reading this means that you have had # knowledge of the CeCILL license and that you accept its terms. # import os, struct, time, random from optparse import OptionParser from commons.core.parsing.ParserChooser import ParserChooser from commons.core.writer.Gff3Writer import Gff3Writer from SMART.Java.Python.structure.Transcript import Transcript from SMART.Java.Python.structure.Interval import Interval from SMART.Java.Python.ncList.NCList import NCList from SMART.Java.Python.ncList.NCListCursor import NCListCursor from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle from SMART.Java.Python.ncList.NCListHandler import NCListHandler from SMART.Java.Python.ncList.ConvertToNCList import ConvertToNCList from SMART.Java.Python.misc.Progress import Progress from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress from SMART.Java.Python.misc import Utils try: import cPickle as pickle except: import pickle REFERENCE = 0 QUERY = 1 TYPES = (REFERENCE, QUERY) TYPETOSTRING = {0: "reference", 1: "query"} class CompareOverlapping(object): def __init__(self, verbosity = 1): self._outputFileName = "outputOverlaps.gff3" self._iWriter = None self._nbOverlappingQueries = 0 self._nbOverlaps = 0 self._nbLines = {REFERENCE: 0, QUERY: 0} self._verbosity = verbosity self._ncLists = {} self._cursors = {} self._splittedFileNames = {} self._nbElements = {} self._nbElementsPerChromosome = {} self._inputFileNames = {REFERENCE: None, QUERY: None} self._inputFileFormats = {REFERENCE: None, QUERY: None} self._starts = {REFERENCE: None, QUERY: None} self._ends = {REFERENCE: None, QUERY: None} self._fivePrimes = {REFERENCE: None, QUERY: None} self._threePrimes = {REFERENCE: None, QUERY: None} self._ncListHandlers = {REFERENCE: None, QUERY: None} self._convertedFileNames = {REFERENCE: False, QUERY: False} self._sorted = False self._index = False self._introns = False self._antisense = False self._colinear = False self._invert = False self._distance = 0 self._minOverlap = 1 self._pcOverlap = None self._included = False self._including = False self._outputNotOverlapping = False self._tmpRefFileName = None self._currentQueryTranscript = None self._currentOrQueryTranscript = None self._currentExQueryTranscript = None self._randInt = random.randint(0, 100000) def __del__(self): for fileName in [self._tmpRefFileName] + self._convertedFileNames.values(): if fileName != None and os.path.exists(fileName): os.remove(fileName) def close(self): self._iWriter.close() def setInput(self, fileName, format, type): chooser = ParserChooser(self._verbosity) chooser.findFormat(format) self._inputFileNames[type] = fileName self._inputFileFormats[type] = format def setOutput(self, outputFileName): if outputFileName != '': self._outputFileName = outputFileName self._iWriter = Gff3Writer(self._outputFileName) def setSorted(self, sorted): self._sorted = sorted def setIndex(self, index): self._index = index def restrictToStart(self, distance, type): self._starts[type] = distance def restrictToEnd(self, distance, type): self._ends[type] = distance def extendFivePrime(self, distance, type): self._fivePrimes[type] = distance def extendThreePrime(self, distance, type): self._threePrimes[type] = distance def acceptIntrons(self, boolean): self._introns = boolean def getAntisenseOnly(self, boolean): self._antisense = boolean def getColinearOnly(self, boolean): self._colinear = boolean def getInvert(self, boolean): self._invert = boolean def setMaxDistance(self, distance): self._distance = distance def setMinOverlap(self, overlap): self._minOverlap = overlap def setPcOverlap(self, overlap): self._pcOverlap = overlap def setIncludedOnly(self, boolean): self._included = boolean def setIncludingOnly(self, boolean): self._including = boolean def includeNotOverlapping(self, boolean): self._outputNotOverlapping = boolean def transformTranscript(self, transcript, type): if self._starts[type] != None: transcript.restrictStart(self._starts[type]) if self._ends[type] != None: transcript.restrictEnd(self._ends[type]) if self._fivePrimes[type] != None: transcript.extendStart(self._fivePrimes[type]) if self._threePrimes[type] != None: transcript.extendEnd(self._threePrimes[type]) if self._introns: transcript.exons = [] if type == REFERENCE and self._distance > 0: transcript.extendExons(self._distance) return transcript def extendQueryTranscript(self, transcript): self._currentExQueryTranscript = Transcript() self._currentExQueryTranscript.copy(transcript) if self._fivePrimes[QUERY] != None: self._currentExQueryTranscript.extendStart(self._fivePrimes[QUERY]) if self._threePrimes[QUERY] != None: self._currentExQueryTranscript.extendEnd(self._threePrimes[QUERY]) transcript.exons = [] def createTmpRefFile(self): self._tmpRefFileName = "tmp_ref_%d.pkl" % (self._randInt) if "SMARTTMPPATH" in os.environ: self._tmpRefFileName = os.path.join(os.environ["SMARTTMPPATH"], self._tmpRefFileName) chooser = ParserChooser(self._verbosity) chooser.findFormat(self._inputFileFormats[REFERENCE]) parser = chooser.getParser(self._inputFileNames[REFERENCE]) writer = NCListFilePickle(self._tmpRefFileName, self._verbosity) for transcript in parser.getIterator(): transcript = self.transformTranscript(transcript, REFERENCE) writer.addTranscript(transcript) writer.close() self._inputFileNames[REFERENCE] = self._tmpRefFileName self._inputFileFormats[REFERENCE] = "pkl" def createNCLists(self): self._ncLists = dict([type, {}] for type in TYPES) self._indices = dict([type, {}] for type in TYPES) self._cursors = dict([type, {}] for type in TYPES) for type in TYPES: if self._verbosity > 2: print "Creating %s NC-list..." % (TYPETOSTRING[type]) self._convertedFileNames[type] = "%s_%d_%d.ncl" % (self._inputFileNames[type], self._randInt, type) if "SMARTTMPPATH" in os.environ: self._convertedFileNames[type] = os.path.join(os.environ["SMARTTMPPATH"], self._convertedFileNames[type]) ncLists = ConvertToNCList(self._verbosity) ncLists.setInputFileName(self._inputFileNames[type], self._inputFileFormats[type]) ncLists.setOutputFileName(self._convertedFileNames[type]) ncLists.setSorted(self._sorted) if type == REFERENCE and self._index: ncLists.setIndex(True) ncLists.run() self._ncListHandlers[type] = NCListHandler(self._verbosity) self._ncListHandlers[type].setFileName(self._convertedFileNames[type]) self._ncListHandlers[type].loadData() self._nbLines[type] = self._ncListHandlers[type].getNbElements() self._nbElementsPerChromosome[type] = self._ncListHandlers[type].getNbElementsPerChromosome() self._ncLists[type] = self._ncListHandlers[type].getNCLists() for chromosome, ncList in self._ncLists[type].iteritems(): self._cursors[type][chromosome] = NCListCursor(None, ncList, 0, self._verbosity) if type == REFERENCE and self._index: self._indices[REFERENCE][chromosome] = ncList.getIndex() if self._verbosity > 2: print " ...done" def compare(self): nbSkips, nbMoves = 0, 0 previousChromosome = None done = False refNCList = None queryNCList = None startTime = time.time() progress = Progress(len(self._ncLists[QUERY].keys()), "Checking overlap", self._verbosity) for chromosome, queryNCList in self._ncLists[QUERY].iteritems(): queryParser = self._ncListHandlers[QUERY].getParser(chromosome) queryNCList = self._ncLists[QUERY][chromosome] queryCursor = self._cursors[QUERY][chromosome] if chromosome != previousChromosome: skipChromosome = False previousChromosome = chromosome if chromosome not in self._ncLists[REFERENCE]: if self._outputNotOverlapping: while not queryCursor.isOut(): self._currentQueryTranscript = queryCursor.getTranscript() self._writeIntervalInNewGFF3({}) if queryCursor.hasChildren(): queryCursor.moveDown() else: queryCursor.moveNext() progress.inc() continue refNCList = self._ncLists[REFERENCE][chromosome] refCursor = self._cursors[REFERENCE][chromosome] while True: self._currentOrQueryTranscript = queryCursor.getTranscript() self._currentQueryTranscript = Transcript() self._currentQueryTranscript.copy(self._currentOrQueryTranscript) self._currentQueryTranscript = self.transformTranscript(self._currentQueryTranscript, QUERY) self.extendQueryTranscript(self._currentOrQueryTranscript) newRefLaddr = self.checkIndex(refCursor) if newRefLaddr != None: nbMoves += 1 refCursor.setLIndex(newRefLaddr) done = False refCursor, done, unmatched = self.findOverlapIter(refCursor, done) if refCursor.isOut(): if not self._invert and not self._outputNotOverlapping: break if (unmatched and not self._invert and not self._outputNotOverlapping) or not queryCursor.hasChildren(): queryCursor.moveNext() nbSkips += 1 else: queryCursor.moveDown() if queryCursor.isOut(): break progress.inc() progress.done() endTime = time.time() self._timeSpent = endTime - startTime if self._verbosity >= 10: print "# skips: %d" % (nbSkips) print "# moves: %d" % (nbMoves) def findOverlapIter(self, cursor, done): chromosome = self._currentQueryTranscript.getChromosome() matched = False if chromosome not in self._ncLists[REFERENCE]: return None, False, True ncList = self._ncLists[REFERENCE][chromosome] overlappingNames = {} nextDone = False firstOverlapLAddr = NCListCursor(cursor) firstOverlapLAddr.setLIndex(-1) if cursor.isOut(): self._writeIntervalInNewGFF3(overlappingNames) return firstOverlapLAddr, False, True parentCursor = NCListCursor(cursor) parentCursor.moveUp() firstParentAfter = False while not parentCursor.isOut(): if self.isOverlapping(parentCursor) == 0: matched = True if self._checkOverlap(parentCursor.getTranscript()): overlappingNames.update(self._extractID(parentCursor.getTranscript())) if firstOverlapLAddr.isOut(): firstOverlapLAddr.copy(parentCursor) nextDone = True elif self.isOverlapping(parentCursor) == 1: firstParentAfter = NCListCursor(parentCursor) parentCursor.moveUp() if firstParentAfter: written = self._writeIntervalInNewGFF3(overlappingNames) return firstParentAfter, False, not written if self._invert else not matched #This loop finds the overlaps with currentRefLAddr.# while True: parentCursor = NCListCursor(cursor) parentCursor.moveUp() #In case: Query is on the right of the RefInterval and does not overlap. overlap = self.isOverlapping(cursor) if overlap == -1: cursor.moveNext() #In case: Query overlaps with RefInterval. elif overlap == 0: matched = True if self._checkOverlap(cursor.getTranscript()): overlappingNames.update(self._extractID(cursor.getTranscript())) if firstOverlapLAddr.compare(parentCursor): firstOverlapLAddr.copy(cursor) nextDone = True if done: cursor.moveNext() else: if not cursor.hasChildren(): cursor.moveNext() if cursor.isOut(): break else: cursor.moveDown() #In case: Query is on the left of the RefInterval and does not overlap. else: if firstOverlapLAddr.isOut() or firstOverlapLAddr.compare(parentCursor): firstOverlapLAddr.copy(cursor) nextDone = False # new break done = False if cursor.isOut(): break written = self._writeIntervalInNewGFF3(overlappingNames) return firstOverlapLAddr, nextDone, not written if self._invert else not matched def isOverlapping(self, refTranscript): if (self._currentExQueryTranscript.getStart() <= refTranscript.getEnd() and self._currentExQueryTranscript.getEnd() >= refTranscript.getStart()): return 0 if self._currentExQueryTranscript.getEnd() < refTranscript.getStart(): return 1 return -1 def checkIndex(self, cursor): if not self._index: return None if cursor.isOut(): return None chromosome = self._currentExQueryTranscript.getChromosome() nextLIndex = self._indices[REFERENCE][chromosome].getIndex(self._currentExQueryTranscript) if nextLIndex == None: return None ncList = self._ncLists[REFERENCE][chromosome] nextGffAddress = ncList.getRefGffAddr(nextLIndex) thisGffAddress = cursor.getGffAddress() if nextGffAddress > thisGffAddress: return nextLIndex return None def _writeIntervalInNewGFF3(self, names): nbOverlaps = 0 for cpt in names.values(): nbOverlaps += cpt self._nbOverlappingQueries += 1 if Utils.xor(names, self._invert) else 0 self._nbOverlaps += nbOverlaps if Utils.xor(names, self._invert) else 0 if names: self._currentQueryTranscript.setTagValue("overlapWith", ",".join(names)) self._currentQueryTranscript.setTagValue("nbOverlaps", nbOverlaps) if self._invert: return False else: if self._outputNotOverlapping: self._currentQueryTranscript.setTagValue("nbOverlaps", 0) elif not self._invert: return False self._iWriter.addTranscript(self._currentQueryTranscript) self._iWriter.write() return True def _extractID(self, transcript): id = transcript.getTagValue("ID") if "ID" in transcript.getTagNames() else transcript.getUniqueName() nbElements = transcript.getTagValue("nbElements") if "nbElements" in transcript.getTagNames() else 1 return {id: float(nbElements)} def _checkOverlap(self, refTranscript): if self._currentQueryTranscript.getDistance(refTranscript) > self._distance: return False minOverlap = self._minOverlap if self._pcOverlap != None: minOverlap = max(self._minOverlap, self._currentQueryTranscript.getSize() / 100.0 * self._pcOverlap) if not self._currentQueryTranscript.overlapWith(refTranscript, minOverlap): return False if self._antisense and self._currentQueryTranscript.getDirection() == refTranscript.getDirection(): return False if self._colinear and self._currentQueryTranscript.getDirection() != refTranscript.getDirection(): return False if self._included and not refTranscript.include(self._currentQueryTranscript): return False if self._including and not self._currentQueryTranscript.include(refTranscript): return False if self._introns: return True return self._currentQueryTranscript.overlapWithExon(refTranscript, minOverlap) def run(self): self.createTmpRefFile() self.createNCLists() self.compare() self.close() if self._verbosity > 0: print "# queries: %d" % (self._nbLines[QUERY]) print "# refs: %d" % (self._nbLines[REFERENCE]) print "# written: %d (%d overlaps)" % (self._nbOverlappingQueries, self._nbOverlaps) print "time: %ds" % (self._timeSpent) if __name__ == "__main__": description = "Compare Overlapping v1.0.4: Get the data which overlap with a reference set. [Category: Data Comparison]" parser = OptionParser(description = description) parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]") parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of file 1 [compulsory] [format: transcript file format]") parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]") parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of file 2 [compulsory] [format: transcript file format]") parser.add_option("-o", "--output", dest="output", action="store", default=None, type="string", help="output file [compulsory] [format: output file in GFF3 format]") parser.add_option("-D", "--index", dest="index", action="store_true", default=False, help="add an index to the reference file (faster but more memory) [format: boolean] [default: False]") parser.add_option("-r", "--sorted", dest="sorted", action="store_true", default=False, help="input files are already sorted [format: boolean] [default: False]") parser.add_option("-S", "--start1", dest="start1", action="store", default=None, type="int", help="only consider the n first nucleotides of the transcripts in file 1 (do not use it with -U) [format: int]") parser.add_option("-s", "--start2", dest="start2", action="store", default=None, type="int", help="only consider the n first nucleotides of the transcripts in file 2 (do not use it with -u) [format: int]") parser.add_option("-U", "--end1", dest="end1", action="store", default=None, type="int", help="only consider the n last nucleotides of the transcripts in file 1 (do not use it with -S) [format: int]") parser.add_option("-u", "--end2", dest="end2", action="store", default=None, type="int", help="only consider the n last nucleotides of the transcripts in file 2 (do not use it with -s) [format: int]") parser.add_option("-t", "--intron", dest="introns", action="store_true", default=False, help="also report introns [format: bool] [default: false]") parser.add_option("-E", "--5primeExtension1", dest="fivePrime1", action="store", default=None, type="int", help="extension towards 5' in file 1 [format: int]") parser.add_option("-e", "--5primeExtension2", dest="fivePrime2", action="store", default=None, type="int", help="extension towards 5' in file 2 [format: int]") parser.add_option("-N", "--3primeExtension1", dest="threePrime1", action="store", default=None, type="int", help="extension towards 3' in file 1 [format: int]") parser.add_option("-n", "--3primeExtension2", dest="threePrime2", action="store", default=None, type="int", help="extension towards 3' in file 2 [format: int]") parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="colinear only [format: bool] [default: false]") parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="antisense only [format: bool] [default: false]") parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="accept some distance between query and reference [format: int]") parser.add_option("-k", "--included", dest="included", action="store_true", default=False, help="keep only elements from file 1 which are included in an element of file 2 [format: bool] [default: false]") parser.add_option("-K", "--including", dest="including", action="store_true", default=False, help="keep only elements from file 2 which are included in an element of file 1 [format: bool] [default: false]") parser.add_option("-m", "--minOverlap", dest="minOverlap", action="store", default=1, type="int", help="minimum number of nucleotides overlapping to declare an overlap [format: int] [default: 1]") parser.add_option("-p", "--pcOverlap", dest="pcOverlap", action="store", default=None, type="int", help="minimum percentage of nucleotides to overlap to declare an overlap [format: int]") parser.add_option("-O", "--notOverlapping", dest="notOverlapping", action="store_true", default=False, help="also output not overlapping data [format: bool] [default: false]") parser.add_option("-x", "--exclude", dest="exclude", action="store_true", default=False, help="invert the match [format: bool] [default: false]") parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") (options, args) = parser.parse_args() co = CompareOverlapping(options.verbosity) co.setInput(options.inputFileName1, options.format1, QUERY) co.setInput(options.inputFileName2, options.format2, REFERENCE) co.setOutput(options.output) co.setSorted(options.sorted) co.setIndex(options.index) co.restrictToStart(options.start1, QUERY) co.restrictToStart(options.start2, REFERENCE) co.restrictToEnd(options.end1, QUERY) co.restrictToEnd(options.end2, REFERENCE) co.extendFivePrime(options.fivePrime1, QUERY) co.extendFivePrime(options.fivePrime2, REFERENCE) co.extendThreePrime(options.threePrime1, QUERY) co.extendThreePrime(options.threePrime2, REFERENCE) co.acceptIntrons(options.introns) co.getAntisenseOnly(options.antisense) co.getColinearOnly(options.colinear) co.getInvert(options.exclude) co.setMaxDistance(options.distance) co.setMinOverlap(options.minOverlap) co.setPcOverlap(options.pcOverlap) co.setIncludedOnly(options.included) co.setIncludingOnly(options.including) co.includeNotOverlapping(options.notOverlapping) co.run()