view SMART/Java/Python/CompareOverlapping.py @ 69:1473ab954708 draft

Corrected bug in "CollapsedReads" XML file.
author m-zytnicki
date Wed, 18 Nov 2015 10:59:02 -0500
parents 169d364ddd91
children
line wrap: on
line source

#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
# 
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
# 
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
# 
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
# 
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
import os, struct, time, random
from optparse import OptionParser
from commons.core.parsing.ParserChooser import ParserChooser
from commons.core.writer.Gff3Writer import Gff3Writer
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.structure.Interval import Interval
from SMART.Java.Python.ncList.NCList import NCList
from SMART.Java.Python.ncList.NCListCursor import NCListCursor
from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle
from SMART.Java.Python.ncList.NCListHandler import NCListHandler
from SMART.Java.Python.ncList.ConvertToNCList import ConvertToNCList
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
from SMART.Java.Python.misc import Utils
try:
	import cPickle as pickle
except:
	import pickle

REFERENCE = 0
QUERY = 1
TYPES = (REFERENCE, QUERY)
TYPETOSTRING = {0: "reference", 1: "query"}

class CompareOverlapping(object):

	def __init__(self, verbosity = 1):
		self._outputFileName		   = "outputOverlaps.gff3"
		self._iWriter				   = None
		self._nbOverlappingQueries	   = 0
		self._nbOverlaps			   = 0
		self._nbLines				   = {REFERENCE: 0, QUERY: 0}
		self._verbosity				   = verbosity
		self._ncLists				   = {}
		self._cursors				   = {}
		self._splittedFileNames		   = {}
		self._nbElements			   = {}
		self._nbElementsPerChromosome  = {}
		self._inputFileNames		   = {REFERENCE: None,  QUERY: None}
		self._inputFileFormats		   = {REFERENCE: None,  QUERY: None}
		self._starts				   = {REFERENCE: None, QUERY: None}
		self._ends					   = {REFERENCE: None, QUERY: None}
		self._fivePrimes			   = {REFERENCE: None, QUERY: None}
		self._threePrimes			   = {REFERENCE: None, QUERY: None}
		self._ncListHandlers		   = {REFERENCE: None,  QUERY: None}
		self._convertedFileNames	   = {REFERENCE: False, QUERY: False}
		self._sorted                   = False
		self._index                    = False
		self._introns				   = False
		self._antisense				   = False
		self._colinear				   = False
		self._invert				   = False
		self._distance				   = 0
		self._minOverlap			   = 1
		self._pcOverlap				   = None
		self._included				   = False
		self._including				   = False
		self._outputNotOverlapping	   = False
		self._tmpRefFileName		   = None
		self._currentQueryTranscript   = None
		self._currentOrQueryTranscript = None
		self._currentExQueryTranscript = None
		self._randInt				   = random.randint(0, 100000)
		
	def __del__(self):
		for fileName in [self._tmpRefFileName] + self._convertedFileNames.values():
			if fileName != None and os.path.exists(fileName):
				os.remove(fileName)

	def close(self):
		self._iWriter.close()
		
	def setInput(self, fileName, format, type):
		chooser = ParserChooser(self._verbosity)
		chooser.findFormat(format)
		self._inputFileNames[type]   = fileName
		self._inputFileFormats[type] = format
		
	def setOutput(self, outputFileName):
		if outputFileName != '':
			self._outputFileName = outputFileName
		self._iWriter = Gff3Writer(self._outputFileName)

	def setSorted(self, sorted):
		self._sorted = sorted

	def setIndex(self, index):
		self._index = index

	def restrictToStart(self, distance, type):
		self._starts[type] = distance
		
	def restrictToEnd(self, distance, type):
		self._ends[type] = distance
		
	def extendFivePrime(self, distance, type):
		self._fivePrimes[type] = distance
		
	def extendThreePrime(self, distance, type):
		self._threePrimes[type] = distance

	def acceptIntrons(self, boolean):
		self._introns = boolean

	def getAntisenseOnly(self, boolean):
		self._antisense = boolean
		
	def getColinearOnly(self, boolean):
		self._colinear = boolean

	def getInvert(self, boolean):
		self._invert = boolean

	def setMaxDistance(self, distance):
		self._distance = distance

	def setMinOverlap(self, overlap):
		self._minOverlap = overlap

	def setPcOverlap(self, overlap):
		self._pcOverlap = overlap

	def setIncludedOnly(self, boolean):
		self._included = boolean
		
	def setIncludingOnly(self, boolean):
		self._including = boolean

	def includeNotOverlapping(self, boolean):
		self._outputNotOverlapping = boolean
		
	def transformTranscript(self, transcript, type):
		if self._starts[type] != None:
			transcript.restrictStart(self._starts[type])
		if self._ends[type] != None:
			transcript.restrictEnd(self._ends[type])
		if self._fivePrimes[type] != None:
			transcript.extendStart(self._fivePrimes[type])
		if self._threePrimes[type] != None:
			transcript.extendEnd(self._threePrimes[type])
		if self._introns:
			transcript.exons = []
		if type == REFERENCE and self._distance > 0:
			transcript.extendExons(self._distance)
		return transcript

	def extendQueryTranscript(self, transcript):
		self._currentExQueryTranscript = Transcript()
		self._currentExQueryTranscript.copy(transcript)
		if self._fivePrimes[QUERY] != None:
			self._currentExQueryTranscript.extendStart(self._fivePrimes[QUERY])
		if self._threePrimes[QUERY] != None:
			self._currentExQueryTranscript.extendEnd(self._threePrimes[QUERY])
		transcript.exons = []

	def createTmpRefFile(self):
		self._tmpRefFileName = "tmp_ref_%d.pkl" % (self._randInt)
		if "SMARTTMPPATH" in os.environ:
			self._tmpRefFileName = os.path.join(os.environ["SMARTTMPPATH"], self._tmpRefFileName)
		chooser = ParserChooser(self._verbosity)
		chooser.findFormat(self._inputFileFormats[REFERENCE])
		parser = chooser.getParser(self._inputFileNames[REFERENCE])
		writer = NCListFilePickle(self._tmpRefFileName, self._verbosity)
		for transcript in parser.getIterator():
			transcript = self.transformTranscript(transcript, REFERENCE)
			writer.addTranscript(transcript)
		writer.close()
		self._inputFileNames[REFERENCE]   = self._tmpRefFileName
		self._inputFileFormats[REFERENCE] = "pkl"

	def createNCLists(self):
		self._ncLists = dict([type, {}] for type in TYPES)
		self._indices = dict([type, {}] for type in TYPES)
		self._cursors = dict([type, {}] for type in TYPES)
		for type in TYPES:
			if self._verbosity > 2:
				print "Creating %s NC-list..." % (TYPETOSTRING[type])
			self._convertedFileNames[type] = "%s_%d_%d.ncl" % (self._inputFileNames[type], self._randInt, type)
			if "SMARTTMPPATH" in os.environ:
				self._convertedFileNames[type] = os.path.join(os.environ["SMARTTMPPATH"], self._convertedFileNames[type])
			ncLists = ConvertToNCList(self._verbosity)
			ncLists.setInputFileName(self._inputFileNames[type], self._inputFileFormats[type])
			ncLists.setOutputFileName(self._convertedFileNames[type])
			ncLists.setSorted(self._sorted)
			if type == REFERENCE and self._index:
				ncLists.setIndex(True)
			ncLists.run()
			self._ncListHandlers[type] = NCListHandler(self._verbosity)
			self._ncListHandlers[type].setFileName(self._convertedFileNames[type])
			self._ncListHandlers[type].loadData()
			self._nbLines[type]					= self._ncListHandlers[type].getNbElements()
			self._nbElementsPerChromosome[type] = self._ncListHandlers[type].getNbElementsPerChromosome()
			self._ncLists[type]					= self._ncListHandlers[type].getNCLists()
			for chromosome, ncList in self._ncLists[type].iteritems():
				self._cursors[type][chromosome] = NCListCursor(None, ncList, 0, self._verbosity)
				if type == REFERENCE and self._index:
					self._indices[REFERENCE][chromosome] = ncList.getIndex()
			if self._verbosity > 2:
				print "	...done"

	def compare(self):
		nbSkips, nbMoves   = 0, 0
		previousChromosome = None
		done			   = False
		refNCList		   = None
		queryNCList		   = None
		startTime		   = time.time()
		progress		   = Progress(len(self._ncLists[QUERY].keys()), "Checking overlap", self._verbosity)
		for chromosome, queryNCList in self._ncLists[QUERY].iteritems():
			queryParser = self._ncListHandlers[QUERY].getParser(chromosome)
			queryNCList = self._ncLists[QUERY][chromosome]
			queryCursor = self._cursors[QUERY][chromosome]
			if chromosome != previousChromosome:
				skipChromosome		= False
				previousChromosome  = chromosome
				if chromosome not in self._ncLists[REFERENCE]:
					if self._outputNotOverlapping:
						while not queryCursor.isOut():
							self._currentQueryTranscript = queryCursor.getTranscript()
							self._writeIntervalInNewGFF3({})
							if queryCursor.hasChildren():
								queryCursor.moveDown()
							else:
								queryCursor.moveNext()
					progress.inc()
					continue
				refNCList = self._ncLists[REFERENCE][chromosome]
				refCursor = self._cursors[REFERENCE][chromosome]
			while True:
				self._currentOrQueryTranscript = queryCursor.getTranscript()
				self._currentQueryTranscript = Transcript()
				self._currentQueryTranscript.copy(self._currentOrQueryTranscript)
				self._currentQueryTranscript = self.transformTranscript(self._currentQueryTranscript, QUERY)
				self.extendQueryTranscript(self._currentOrQueryTranscript)
				newRefLaddr = self.checkIndex(refCursor)
				if newRefLaddr != None:
					nbMoves += 1
					refCursor.setLIndex(newRefLaddr)
					done = False
				refCursor, done, unmatched = self.findOverlapIter(refCursor, done)
				if refCursor.isOut():
					if not self._invert and not self._outputNotOverlapping:
						break
				if (unmatched and not self._invert and not self._outputNotOverlapping) or not queryCursor.hasChildren():
					queryCursor.moveNext()
					nbSkips += 1
				else:
					queryCursor.moveDown()
				if queryCursor.isOut():
					break
			progress.inc()
		progress.done()
		endTime = time.time()
		self._timeSpent = endTime - startTime
		if self._verbosity >= 10:
			print "# skips:   %d" % (nbSkips)
			print "# moves:   %d" % (nbMoves)

	def findOverlapIter(self, cursor, done):
		chromosome = self._currentQueryTranscript.getChromosome()
		matched	= False
		if chromosome not in self._ncLists[REFERENCE]:
			return None, False, True
		ncList = self._ncLists[REFERENCE][chromosome]
		overlappingNames = {}
		nextDone = False
		firstOverlapLAddr = NCListCursor(cursor)
		firstOverlapLAddr.setLIndex(-1)
		if cursor.isOut():
			self._writeIntervalInNewGFF3(overlappingNames)
			return firstOverlapLAddr, False, True
		parentCursor = NCListCursor(cursor)
		parentCursor.moveUp()
		firstParentAfter = False
		while not parentCursor.isOut(): 
			if self.isOverlapping(parentCursor) == 0:
				matched = True
				if self._checkOverlap(parentCursor.getTranscript()):
					overlappingNames.update(self._extractID(parentCursor.getTranscript()))
				if firstOverlapLAddr.isOut():
					firstOverlapLAddr.copy(parentCursor)
					nextDone = True 
			elif self.isOverlapping(parentCursor) == 1:
				firstParentAfter = NCListCursor(parentCursor)
			parentCursor.moveUp()
		if firstParentAfter:
			written = self._writeIntervalInNewGFF3(overlappingNames)
			return firstParentAfter, False, not written if self._invert else not matched
		#This loop finds the overlaps with currentRefLAddr.#
		while True:
			parentCursor = NCListCursor(cursor)
			parentCursor.moveUp()
			#In case: Query is on the right of the RefInterval and does not overlap.
			overlap = self.isOverlapping(cursor)
			if overlap == -1:
				cursor.moveNext()
			#In case: Query overlaps with RefInterval.	
			elif overlap == 0:
				matched = True
				if self._checkOverlap(cursor.getTranscript()):
					overlappingNames.update(self._extractID(cursor.getTranscript()))
				if firstOverlapLAddr.compare(parentCursor):
					firstOverlapLAddr.copy(cursor)
					nextDone = True
				if done:
					cursor.moveNext()
				else:
					if not cursor.hasChildren():
						cursor.moveNext()
						if cursor.isOut():
							break
					else:
						cursor.moveDown()
			#In case: Query is on the left of the RefInterval and does not overlap.		
			else:
				if firstOverlapLAddr.isOut() or firstOverlapLAddr.compare(parentCursor):
					firstOverlapLAddr.copy(cursor)
					nextDone = False # new
				break
			
			done = False
			if cursor.isOut():
				break
		written = self._writeIntervalInNewGFF3(overlappingNames)
		return firstOverlapLAddr, nextDone, not written if self._invert else not matched
	
	def isOverlapping(self, refTranscript):
		if (self._currentExQueryTranscript.getStart() <= refTranscript.getEnd() and self._currentExQueryTranscript.getEnd() >= refTranscript.getStart()):
			return 0   
		if self._currentExQueryTranscript.getEnd() < refTranscript.getStart():
			return 1
		return -1

	def checkIndex(self, cursor):
		if not self._index:
			return None
		if cursor.isOut():
			return None
		chromosome = self._currentExQueryTranscript.getChromosome()
		nextLIndex = self._indices[REFERENCE][chromosome].getIndex(self._currentExQueryTranscript)
		if nextLIndex == None:
			return None
		ncList		 = self._ncLists[REFERENCE][chromosome]
		nextGffAddress = ncList.getRefGffAddr(nextLIndex)
		thisGffAddress = cursor.getGffAddress()
		if nextGffAddress > thisGffAddress:
			return nextLIndex
		return None
		
	def _writeIntervalInNewGFF3(self, names):
		nbOverlaps = 0
		for cpt in names.values():
			nbOverlaps += cpt
		self._nbOverlappingQueries += 1		      if Utils.xor(names, self._invert) else 0
		self._nbOverlaps		   += nbOverlaps  if Utils.xor(names, self._invert) else 0
		if names:
			self._currentQueryTranscript.setTagValue("overlapWith", ",".join(names))
			self._currentQueryTranscript.setTagValue("nbOverlaps", nbOverlaps)
			if self._invert:
				return False
		else:
			if self._outputNotOverlapping:
				self._currentQueryTranscript.setTagValue("nbOverlaps", 0)
			elif not self._invert:
				return False
		self._iWriter.addTranscript(self._currentQueryTranscript)
		self._iWriter.write()
		return True
		
	def _extractID(self, transcript):
		id		 = transcript.getTagValue("ID")		      if "ID"		  in transcript.getTagNames() else transcript.getUniqueName()
		nbElements = transcript.getTagValue("nbElements") if "nbElements" in transcript.getTagNames() else 1
		return {id: float(nbElements)}

	def _checkOverlap(self, refTranscript):
		if self._currentQueryTranscript.getDistance(refTranscript) > self._distance:
			return False
		minOverlap = self._minOverlap
		if self._pcOverlap != None:
			minOverlap = max(self._minOverlap, self._currentQueryTranscript.getSize() / 100.0 * self._pcOverlap)
		if not self._currentQueryTranscript.overlapWith(refTranscript, minOverlap):
			return False
		if self._antisense and self._currentQueryTranscript.getDirection() == refTranscript.getDirection():
			return False
		if self._colinear and self._currentQueryTranscript.getDirection() != refTranscript.getDirection():
			return False
		if self._included and not refTranscript.include(self._currentQueryTranscript):
			return False
		if self._including and not self._currentQueryTranscript.include(refTranscript):
			return False
		if self._introns:
			return True
		return self._currentQueryTranscript.overlapWithExon(refTranscript, minOverlap)
		
	def run(self):
		self.createTmpRefFile()
		self.createNCLists()
		self.compare()
		self.close()
		if self._verbosity > 0:
			print "# queries: %d" % (self._nbLines[QUERY])
			print "# refs:	  %d" % (self._nbLines[REFERENCE])
			print "# written: %d (%d overlaps)" % (self._nbOverlappingQueries, self._nbOverlaps)
			print "time: 	  %ds" % (self._timeSpent)


if __name__ == "__main__":
	description = "Compare Overlapping v1.0.4: Get the data which overlap with a reference set. [Category: Data Comparison]"

	parser = OptionParser(description = description)
	parser.add_option("-i", "--input1",		      dest="inputFileName1", action="store",					 type="string", help="input file 1 [compulsory] [format: file in transcript format given by -f]")
	parser.add_option("-f", "--format1",		  dest="format1",		 action="store",					 type="string", help="format of file 1 [compulsory] [format: transcript file format]")
	parser.add_option("-j", "--input2",		      dest="inputFileName2", action="store",					 type="string", help="input file 2 [compulsory] [format: file in transcript format given by -g]")
	parser.add_option("-g", "--format2",		  dest="format2",		 action="store",					 type="string", help="format of file 2 [compulsory] [format: transcript file format]")
	parser.add_option("-o", "--output",		      dest="output",		 action="store",	  default=None,  type="string", help="output file [compulsory] [format: output file in GFF3 format]")
	parser.add_option("-D", "--index",	          dest="index",	         action="store_true", default=False,	            help="add an index to the reference file (faster but more memory) [format: boolean] [default: False]")
	parser.add_option("-r", "--sorted",	          dest="sorted",	     action="store_true", default=False,	            help="input files are already sorted [format: boolean] [default: False]")
	parser.add_option("-S", "--start1",		      dest="start1",		 action="store",	  default=None,  type="int",	help="only consider the n first nucleotides of the transcripts in file 1 (do not use it with -U) [format: int]")
	parser.add_option("-s", "--start2",		      dest="start2",		 action="store",	  default=None,  type="int",	help="only consider the n first nucleotides of the transcripts in file 2 (do not use it with -u) [format: int]")
	parser.add_option("-U", "--end1",			  dest="end1",		     action="store",	  default=None,  type="int",	help="only consider the n last nucleotides of the transcripts in file 1 (do not use it with -S) [format: int]")
	parser.add_option("-u", "--end2",			  dest="end2",		     action="store",	  default=None,  type="int",	help="only consider the n last nucleotides of the transcripts in file 2 (do not use it with -s) [format: int]")
	parser.add_option("-t", "--intron",		      dest="introns",		 action="store_true", default=False,				help="also report introns [format: bool] [default: false]")
	parser.add_option("-E", "--5primeExtension1", dest="fivePrime1",	 action="store",	  default=None,  type="int",	help="extension towards 5' in file 1 [format: int]")
	parser.add_option("-e", "--5primeExtension2", dest="fivePrime2",	 action="store",	  default=None,  type="int",	help="extension towards 5' in file 2 [format: int]")
	parser.add_option("-N", "--3primeExtension1", dest="threePrime1",	 action="store",	  default=None,  type="int",	help="extension towards 3' in file 1 [format: int]")
	parser.add_option("-n", "--3primeExtension2", dest="threePrime2",	 action="store",	  default=None,  type="int",	help="extension towards 3' in file 2 [format: int]")
	parser.add_option("-c", "--colinear",		  dest="colinear",		 action="store_true", default=False,				help="colinear only [format: bool] [default: false]")
	parser.add_option("-a", "--antisense",		  dest="antisense",		 action="store_true", default=False,				help="antisense only [format: bool] [default: false]")
	parser.add_option("-d", "--distance",		  dest="distance",	     action="store",	  default=0,	 type="int",	help="accept some distance between query and reference [format: int]")
	parser.add_option("-k", "--included",		  dest="included",	     action="store_true", default=False,				help="keep only elements from file 1 which are included in an element of file 2 [format: bool] [default: false]")
	parser.add_option("-K", "--including",		  dest="including",	     action="store_true", default=False,				help="keep only elements from file 2 which are included in an element of file 1 [format: bool] [default: false]")
	parser.add_option("-m", "--minOverlap",		  dest="minOverlap",	 action="store",	  default=1,	 type="int",	help="minimum number of nucleotides overlapping to declare an overlap [format: int] [default: 1]")
	parser.add_option("-p", "--pcOverlap",		  dest="pcOverlap",	     action="store",	  default=None,  type="int",	help="minimum percentage of nucleotides to overlap to declare an overlap [format: int]")
	parser.add_option("-O", "--notOverlapping",   dest="notOverlapping", action="store_true", default=False,				help="also output not overlapping data [format: bool] [default: false]")
	parser.add_option("-x", "--exclude",		  dest="exclude",		 action="store_true", default=False,				help="invert the match [format: bool] [default: false]")
	parser.add_option("-v", "--verbosity",		  dest="verbosity",		 action="store",	  default=1,	 type="int",	help="trace level [format: int]")
	(options, args) = parser.parse_args()

	co = CompareOverlapping(options.verbosity)
	co.setInput(options.inputFileName1, options.format1, QUERY)
	co.setInput(options.inputFileName2, options.format2, REFERENCE)
	co.setOutput(options.output)
	co.setSorted(options.sorted)
	co.setIndex(options.index)
	co.restrictToStart(options.start1, QUERY)
	co.restrictToStart(options.start2, REFERENCE)
	co.restrictToEnd(options.end1, QUERY)
	co.restrictToEnd(options.end2, REFERENCE)
	co.extendFivePrime(options.fivePrime1, QUERY)
	co.extendFivePrime(options.fivePrime2, REFERENCE)
	co.extendThreePrime(options.threePrime1, QUERY)
	co.extendThreePrime(options.threePrime2, REFERENCE)
	co.acceptIntrons(options.introns)
	co.getAntisenseOnly(options.antisense)
	co.getColinearOnly(options.colinear)
	co.getInvert(options.exclude)
	co.setMaxDistance(options.distance)
	co.setMinOverlap(options.minOverlap)
	co.setPcOverlap(options.pcOverlap)
	co.setIncludedOnly(options.included)
	co.setIncludingOnly(options.including)
	co.includeNotOverlapping(options.notOverlapping)
	co.run()