Mercurial > repos > yufei-luo > s_mart
diff smart_toolShed/SMART/Java/Python/CompareOverlapping.py @ 0:e0f8dcca02ed
Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author | yufei-luo |
---|---|
date | Thu, 17 Jan 2013 10:52:14 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/smart_toolShed/SMART/Java/Python/CompareOverlapping.py Thu Jan 17 10:52:14 2013 -0500 @@ -0,0 +1,491 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +import os, struct, time, random +from optparse import OptionParser +from commons.core.parsing.ParserChooser import ParserChooser +from commons.core.writer.Gff3Writer import Gff3Writer +from SMART.Java.Python.structure.Transcript import Transcript +from SMART.Java.Python.structure.Interval import Interval +from SMART.Java.Python.ncList.NCList import NCList +from SMART.Java.Python.ncList.NCListCursor import NCListCursor +from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle +from SMART.Java.Python.ncList.NCListHandler import NCListHandler +from SMART.Java.Python.ncList.ConvertToNCList import ConvertToNCList +from SMART.Java.Python.misc.Progress import Progress +from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress +from SMART.Java.Python.misc import Utils +try: + import cPickle as pickle +except: + import pickle + +REFERENCE = 0 +QUERY = 1 +TYPES = (REFERENCE, QUERY) +TYPETOSTRING = {0: "reference", 1: "query"} + +class CompareOverlapping(object): + + def __init__(self, verbosity = 1): + self._outputFileName = "outputOverlaps.gff3" + self._iWriter = None + self._nbOverlappingQueries = 0 + self._nbOverlaps = 0 + self._nbLines = {REFERENCE: 0, QUERY: 0} + self._verbosity = verbosity + self._ncLists = {} + self._cursors = {} + self._splittedFileNames = {} + self._nbElements = {} + self._nbElementsPerChromosome = {} + self._inputFileNames = {REFERENCE: None, QUERY: None} + self._inputFileFormats = {REFERENCE: None, QUERY: None} + self._starts = {REFERENCE: None, QUERY: None} + self._ends = {REFERENCE: None, QUERY: None} + self._fivePrimes = {REFERENCE: None, QUERY: None} + self._threePrimes = {REFERENCE: None, QUERY: None} + self._ncListHandlers = {REFERENCE: None, QUERY: None} + self._convertedFileNames = {REFERENCE: False, QUERY: False} + self._sorted = False + self._index = False + self._introns = False + self._antisense = False + self._colinear = False + self._invert = False + self._distance = 0 + self._minOverlap = 1 + self._pcOverlap = None + self._included = False + self._including = False + self._outputNotOverlapping = False + self._tmpRefFileName = None + self._currentQueryTranscript = None + self._currentOrQueryTranscript = None + self._currentExQueryTranscript = None + self._randInt = random.randint(0, 100000) + + def __del__(self): + for fileName in [self._tmpRefFileName] + self._convertedFileNames.values(): + if fileName != None and os.path.exists(fileName): + os.remove(fileName) + + def close(self): + self._iWriter.close() + + def setInput(self, fileName, format, type): + chooser = ParserChooser(self._verbosity) + chooser.findFormat(format) + self._inputFileNames[type] = fileName + self._inputFileFormats[type] = format + + def setOutput(self, outputFileName): + if outputFileName != '': + self._outputFileName = outputFileName + self._iWriter = Gff3Writer(self._outputFileName) + + def setSorted(self, sorted): + self._sorted = sorted + + def setIndex(self, index): + self._index = index + + def restrictToStart(self, distance, type): + self._starts[type] = distance + + def restrictToEnd(self, distance, type): + self._ends[type] = distance + + def extendFivePrime(self, distance, type): + self._fivePrimes[type] = distance + + def extendThreePrime(self, distance, type): + self._threePrimes[type] = distance + + def acceptIntrons(self, boolean): + self._introns = boolean + + def getAntisenseOnly(self, boolean): + self._antisense = boolean + + def getColinearOnly(self, boolean): + self._colinear = boolean + + def getInvert(self, boolean): + self._invert = boolean + + def setMaxDistance(self, distance): + self._distance = distance + + def setMinOverlap(self, overlap): + self._minOverlap = overlap + + def setPcOverlap(self, overlap): + self._pcOverlap = overlap + + def setIncludedOnly(self, boolean): + self._included = boolean + + def setIncludingOnly(self, boolean): + self._including = boolean + + def includeNotOverlapping(self, boolean): + self._outputNotOverlapping = boolean + + def transformTranscript(self, transcript, type): + if self._starts[type] != None: + transcript.restrictStart(self._starts[type]) + if self._ends[type] != None: + transcript.restrictEnd(self._ends[type]) + if self._fivePrimes[type] != None: + transcript.extendStart(self._fivePrimes[type]) + if self._threePrimes[type] != None: + transcript.extendEnd(self._threePrimes[type]) + if self._introns: + transcript.exons = [] + if type == REFERENCE and self._distance > 0: + transcript.extendExons(self._distance) + return transcript + + def extendQueryTranscript(self, transcript): + self._currentExQueryTranscript = Transcript() + self._currentExQueryTranscript.copy(transcript) + if self._fivePrimes[QUERY] != None: + self._currentExQueryTranscript.extendStart(self._fivePrimes[QUERY]) + if self._threePrimes[QUERY] != None: + self._currentExQueryTranscript.extendEnd(self._threePrimes[QUERY]) + transcript.exons = [] + + def createTmpRefFile(self): + self._tmpRefFileName = "tmp_ref_%d.pkl" % (self._randInt) + if "SMARTTMPPATH" in os.environ: + self._tmpRefFileName = os.path.join(os.environ["SMARTTMPPATH"], self._tmpRefFileName) + chooser = ParserChooser(self._verbosity) + chooser.findFormat(self._inputFileFormats[REFERENCE]) + parser = chooser.getParser(self._inputFileNames[REFERENCE]) + writer = NCListFilePickle(self._tmpRefFileName, self._verbosity) + for transcript in parser.getIterator(): + transcript = self.transformTranscript(transcript, REFERENCE) + writer.addTranscript(transcript) + writer.close() + self._inputFileNames[REFERENCE] = self._tmpRefFileName + self._inputFileFormats[REFERENCE] = "pkl" + + def createNCLists(self): + self._ncLists = dict([type, {}] for type in TYPES) + self._indices = dict([type, {}] for type in TYPES) + self._cursors = dict([type, {}] for type in TYPES) + for type in TYPES: + if self._verbosity > 2: + print "Creating %s NC-list..." % (TYPETOSTRING[type]) + self._convertedFileNames[type] = "%s_%d_%d.ncl" % (self._inputFileNames[type], self._randInt, type) + ncLists = ConvertToNCList(self._verbosity) + ncLists.setInputFileName(self._inputFileNames[type], self._inputFileFormats[type]) + ncLists.setOutputFileName(self._convertedFileNames[type]) + ncLists.setSorted(self._sorted) + if type == REFERENCE and self._index: + ncLists.setIndex(True) + ncLists.run() + self._ncListHandlers[type] = NCListHandler(self._verbosity) + self._ncListHandlers[type].setFileName(self._convertedFileNames[type]) + self._ncListHandlers[type].loadData() + self._nbLines[type] = self._ncListHandlers[type].getNbElements() + self._nbElementsPerChromosome[type] = self._ncListHandlers[type].getNbElementsPerChromosome() + self._ncLists[type] = self._ncListHandlers[type].getNCLists() + for chromosome, ncList in self._ncLists[type].iteritems(): + self._cursors[type][chromosome] = NCListCursor(None, ncList, 0, self._verbosity) + if type == REFERENCE and self._index: + self._indices[REFERENCE][chromosome] = ncList.getIndex() + if self._verbosity > 2: + print " ...done" + + def compare(self): + nbSkips, nbMoves = 0, 0 + previousChromosome = None + done = False + refNCList = None + queryNCList = None + startTime = time.time() + progress = Progress(len(self._ncLists[QUERY].keys()), "Checking overlap", self._verbosity) + for chromosome, queryNCList in self._ncLists[QUERY].iteritems(): + queryParser = self._ncListHandlers[QUERY].getParser(chromosome) + queryNCList = self._ncLists[QUERY][chromosome] + queryCursor = self._cursors[QUERY][chromosome] + if chromosome != previousChromosome: + skipChromosome = False + previousChromosome = chromosome + if chromosome not in self._ncLists[REFERENCE]: + if self._outputNotOverlapping: + while not queryCursor.isOut(): + self._currentQueryTranscript = queryCursor.getTranscript() + self._writeIntervalInNewGFF3({}) + if queryCursor.hasChildren(): + queryCursor.moveDown() + else: + queryCursor.moveNext() + progress.inc() + continue + refNCList = self._ncLists[REFERENCE][chromosome] + refCursor = self._cursors[REFERENCE][chromosome] + while True: + self._currentOrQueryTranscript = queryCursor.getTranscript() + self._currentQueryTranscript = Transcript() + self._currentQueryTranscript.copy(self._currentOrQueryTranscript) + self._currentQueryTranscript = self.transformTranscript(self._currentQueryTranscript, QUERY) + self.extendQueryTranscript(self._currentOrQueryTranscript) + newRefLaddr = self.checkIndex(refCursor) + if newRefLaddr != None: + nbMoves += 1 + refCursor.setLIndex(newRefLaddr) + done = False + refCursor, done, unmatched = self.findOverlapIter(refCursor, done) + if refCursor.isOut(): + if not self._invert and not self._outputNotOverlapping: + break + if (unmatched and not self._invert and not self._outputNotOverlapping) or not queryCursor.hasChildren(): + queryCursor.moveNext() + nbSkips += 1 + else: + queryCursor.moveDown() + if queryCursor.isOut(): + break + progress.inc() + progress.done() + endTime = time.time() + self._timeSpent = endTime - startTime + if self._verbosity >= 10: + print "# skips: %d" % (nbSkips) + print "# moves: %d" % (nbMoves) + + def findOverlapIter(self, cursor, done): + chromosome = self._currentQueryTranscript.getChromosome() + matched = False + if chromosome not in self._ncLists[REFERENCE]: + return None, False, True + ncList = self._ncLists[REFERENCE][chromosome] + overlappingNames = {} + nextDone = False + firstOverlapLAddr = NCListCursor(cursor) + firstOverlapLAddr.setLIndex(-1) + if cursor.isOut(): + self._writeIntervalInNewGFF3(overlappingNames) + return firstOverlapLAddr, False, True + parentCursor = NCListCursor(cursor) + parentCursor.moveUp() + firstParentAfter = False + while not parentCursor.isOut(): + if self.isOverlapping(parentCursor) == 0: + matched = True + if self._checkOverlap(parentCursor.getTranscript()): + overlappingNames.update(self._extractID(parentCursor.getTranscript())) + if firstOverlapLAddr.isOut(): + firstOverlapLAddr.copy(parentCursor) + nextDone = True + elif self.isOverlapping(parentCursor) == 1: + firstParentAfter = NCListCursor(parentCursor) + parentCursor.moveUp() + if firstParentAfter: + written = self._writeIntervalInNewGFF3(overlappingNames) + return firstParentAfter, False, not written if self._invert else not matched + #This loop finds the overlaps with currentRefLAddr.# + while True: + parentCursor = NCListCursor(cursor) + parentCursor.moveUp() + #In case: Query is on the right of the RefInterval and does not overlap. + overlap = self.isOverlapping(cursor) + if overlap == -1: + cursor.moveNext() + #In case: Query overlaps with RefInterval. + elif overlap == 0: + matched = True + if self._checkOverlap(cursor.getTranscript()): + overlappingNames.update(self._extractID(cursor.getTranscript())) + if firstOverlapLAddr.compare(parentCursor): + firstOverlapLAddr.copy(cursor) + nextDone = True + if done: + cursor.moveNext() + else: + if not cursor.hasChildren(): + cursor.moveNext() + if cursor.isOut(): + break + else: + cursor.moveDown() + #In case: Query is on the left of the RefInterval and does not overlap. + else: + if firstOverlapLAddr.isOut() or firstOverlapLAddr.compare(parentCursor): + firstOverlapLAddr.copy(cursor) + nextDone = False # new + break + + done = False + if cursor.isOut(): + break + written = self._writeIntervalInNewGFF3(overlappingNames) + return firstOverlapLAddr, nextDone, not written if self._invert else not matched + + def isOverlapping(self, refTranscript): + if (self._currentExQueryTranscript.getStart() <= refTranscript.getEnd() and self._currentExQueryTranscript.getEnd() >= refTranscript.getStart()): + return 0 + if self._currentExQueryTranscript.getEnd() < refTranscript.getStart(): + return 1 + return -1 + + def checkIndex(self, cursor): + if not self._index: + return None + if cursor.isOut(): + return None + chromosome = self._currentExQueryTranscript.getChromosome() + nextLIndex = self._indices[REFERENCE][chromosome].getIndex(self._currentExQueryTranscript) + if nextLIndex == None: + return None + ncList = self._ncLists[REFERENCE][chromosome] + nextGffAddress = ncList.getRefGffAddr(nextLIndex) + thisGffAddress = cursor.getGffAddress() + if nextGffAddress > thisGffAddress: + return nextLIndex + return None + + def _writeIntervalInNewGFF3(self, names): + nbOverlaps = 0 + for cpt in names.values(): + nbOverlaps += cpt + self._nbOverlappingQueries += 1 if Utils.xor(names, self._invert) else 0 + self._nbOverlaps += nbOverlaps if Utils.xor(names, self._invert) else 0 + if names: + self._currentQueryTranscript.setTagValue("overlapWith", ",".join(names)) + self._currentQueryTranscript.setTagValue("nbOverlaps", nbOverlaps) + if self._invert: + return False + else: + if self._outputNotOverlapping: + self._currentQueryTranscript.setTagValue("nbOverlaps", 0) + elif not self._invert: + return False + self._iWriter.addTranscript(self._currentQueryTranscript) + self._iWriter.write() + return True + + def _extractID(self, transcript): + id = transcript.getTagValue("ID") if "ID" in transcript.getTagNames() else transcript.getUniqueName() + nbElements = transcript.getTagValue("nbElements") if "nbElements" in transcript.getTagNames() else 1 + return {id: float(nbElements)} + + def _checkOverlap(self, refTranscript): + if self._currentQueryTranscript.getDistance(refTranscript) > self._distance: + return False + minOverlap = self._minOverlap + if self._pcOverlap != None: + minOverlap = max(self._minOverlap, self._currentQueryTranscript.getSize() / 100.0 * self._pcOverlap) + if not self._currentQueryTranscript.overlapWith(refTranscript, minOverlap): + return False + if self._antisense and self._currentQueryTranscript.getDirection() == refTranscript.getDirection(): + return False + if self._colinear and self._currentQueryTranscript.getDirection() != refTranscript.getDirection(): + return False + if self._included and not refTranscript.include(self._currentQueryTranscript): + return False + if self._including and not self._currentQueryTranscript.include(refTranscript): + return False + if self._introns: + return True + return self._currentQueryTranscript.overlapWithExon(refTranscript, minOverlap) + + def run(self): + self.createTmpRefFile() + self.createNCLists() + self.compare() + self.close() + if self._verbosity > 0: + print "# queries: %d" % (self._nbLines[QUERY]) + print "# refs: %d" % (self._nbLines[REFERENCE]) + print "# written: %d (%d overlaps)" % (self._nbOverlappingQueries, self._nbOverlaps) + print "time: %ds" % (self._timeSpent) + + +if __name__ == "__main__": + description = "Compare Overlapping v1.0.4: Get the data which overlap with a reference set. [Category: Data Comparison]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input1", dest="inputFileName1", action="store", type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]") + parser.add_option("-f", "--format1", dest="format1", action="store", type="string", help="format of file 1 [compulsory] [format: transcript file format]") + parser.add_option("-j", "--input2", dest="inputFileName2", action="store", type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]") + parser.add_option("-g", "--format2", dest="format2", action="store", type="string", help="format of file 2 [compulsory] [format: transcript file format]") + parser.add_option("-o", "--output", dest="output", action="store", default=None, type="string", help="output file [compulsory] [format: output file in GFF3 format]") + parser.add_option("-D", "--index", dest="index", action="store_true", default=False, help="add an index to the reference file (faster but more memory) [format: boolean] [default: False]") + parser.add_option("-r", "--sorted", dest="sorted", action="store_true", default=False, help="input files are already sorted [format: boolean] [default: False]") + parser.add_option("-S", "--start1", dest="start1", action="store", default=None, type="int", help="only consider the n first nucleotides of the transcripts in file 1 (do not use it with -U) [format: int]") + parser.add_option("-s", "--start2", dest="start2", action="store", default=None, type="int", help="only consider the n first nucleotides of the transcripts in file 2 (do not use it with -u) [format: int]") + parser.add_option("-U", "--end1", dest="end1", action="store", default=None, type="int", help="only consider the n last nucleotides of the transcripts in file 1 (do not use it with -S) [format: int]") + parser.add_option("-u", "--end2", dest="end2", action="store", default=None, type="int", help="only consider the n last nucleotides of the transcripts in file 2 (do not use it with -s) [format: int]") + parser.add_option("-t", "--intron", dest="introns", action="store_true", default=False, help="also report introns [format: bool] [default: false]") + parser.add_option("-E", "--5primeExtension1", dest="fivePrime1", action="store", default=None, type="int", help="extension towards 5' in file 1 [format: int]") + parser.add_option("-e", "--5primeExtension2", dest="fivePrime2", action="store", default=None, type="int", help="extension towards 5' in file 2 [format: int]") + parser.add_option("-N", "--3primeExtension1", dest="threePrime1", action="store", default=None, type="int", help="extension towards 3' in file 1 [format: int]") + parser.add_option("-n", "--3primeExtension2", dest="threePrime2", action="store", default=None, type="int", help="extension towards 3' in file 2 [format: int]") + parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="colinear only [format: bool] [default: false]") + parser.add_option("-a", "--antisense", dest="antisense", action="store_true", default=False, help="antisense only [format: bool] [default: false]") + parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="accept some distance between query and reference [format: int]") + parser.add_option("-k", "--included", dest="included", action="store_true", default=False, help="keep only elements from file 1 which are included in an element of file 2 [format: bool] [default: false]") + parser.add_option("-K", "--including", dest="including", action="store_true", default=False, help="keep only elements from file 2 which are included in an element of file 1 [format: bool] [default: false]") + parser.add_option("-m", "--minOverlap", dest="minOverlap", action="store", default=1, type="int", help="minimum number of nucleotides overlapping to declare an overlap [format: int] [default: 1]") + parser.add_option("-p", "--pcOverlap", dest="pcOverlap", action="store", default=None, type="int", help="minimum percentage of nucleotides to overlap to declare an overlap [format: int]") + parser.add_option("-O", "--notOverlapping", dest="notOverlapping", action="store_true", default=False, help="also output not overlapping data [format: bool] [default: false]") + parser.add_option("-x", "--exclude", dest="exclude", action="store_true", default=False, help="invert the match [format: bool] [default: false]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int]") + (options, args) = parser.parse_args() + + co = CompareOverlapping(options.verbosity) + co.setInput(options.inputFileName1, options.format1, QUERY) + co.setInput(options.inputFileName2, options.format2, REFERENCE) + co.setOutput(options.output) + co.setSorted(options.sorted) + co.setIndex(options.index) + co.restrictToStart(options.start1, QUERY) + co.restrictToStart(options.start2, REFERENCE) + co.restrictToEnd(options.end1, QUERY) + co.restrictToEnd(options.end2, REFERENCE) + co.extendFivePrime(options.fivePrime1, QUERY) + co.extendFivePrime(options.fivePrime2, REFERENCE) + co.extendThreePrime(options.threePrime1, QUERY) + co.extendThreePrime(options.threePrime2, REFERENCE) + co.acceptIntrons(options.introns) + co.getAntisenseOnly(options.antisense) + co.getColinearOnly(options.colinear) + co.getInvert(options.exclude) + co.setMaxDistance(options.distance) + co.setMinOverlap(options.minOverlap) + co.setPcOverlap(options.pcOverlap) + co.setIncludedOnly(options.included) + co.setIncludingOnly(options.including) + co.includeNotOverlapping(options.notOverlapping) + co.run()