view smart_toolShed/SMART/Java/Python/FindOverlapsOptim.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
line wrap: on
line source

#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2012
# 
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
# 
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
# 
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
# 
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#

import os, struct, time, shutil
from optparse import OptionParser
from commons.core.parsing.ParserChooser import ParserChooser
from commons.core.writer.Gff3Writer import Gff3Writer
from SMART.Java.Python.structure.Transcript import Transcript
from SMART.Java.Python.structure.Interval import Interval
from SMART.Java.Python.ncList.NCList import NCList
from SMART.Java.Python.ncList.ConvertToNCList import ConvertToNCList
from SMART.Java.Python.ncList.NCListParser import NCListParser
from SMART.Java.Python.ncList.NCListCursor import NCListCursor
from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle
from SMART.Java.Python.ncList.NCListHandler import NCListHandler
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
try:
   import cPickle as pickle
except:
   import pickle

REFERENCE = 0
QUERY = 1
TYPES = (REFERENCE, QUERY)
TYPETOSTRING = {0: "reference", 1: "query"}

class FindOverlapsOptim(object):
	
	def __init__(self, verbosity = 1):
		self._parsers				  = {}
		self._sortedFileNames		  = {}
		self._outputFileName		  = "outputOverlaps.gff3"
		self._iWriter				  = None
		self._inputFileNames		  = {REFERENCE: None,  QUERY: None}
		self._convertedFileNames      = {REFERENCE: False, QUERY: False}
		self._inputFileFormats		  = {REFERENCE: None,  QUERY: None}
		self._converted			      = {REFERENCE: False, QUERY: False}
		self._ncListHandlers          = {REFERENCE: None,  QUERY: None}
		self._splittedFileNames	      = {REFERENCE: {},	QUERY: {}}
		self._nbOverlappingQueries	  = 0
		self._nbOverlaps			  = 0
		self._nbLines				  = {REFERENCE: 0, QUERY: 0}
		self._sorted                  = False
		self._index                   = False
		self._verbosity			      = verbosity
		self._ncLists				  = {}
		self._cursors				  = {}
		self._nbElementsPerChromosome = {}
		self._tmpDirectories		  = {REFERENCE: False, QUERY: False}
		
	def close(self):
		self._iWriter.close()
		for fileName in (self._sortedFileNames.values()):
			if os.path.exists(fileName):
				os.remove(fileName)
		for fileName in self._convertedFileNames.values():
			if fileName:
				os.remove(fileName)
		
	def setRefFileName(self, fileName, format):
		self.setFileName(fileName, format, REFERENCE)
		
	def setQueryFileName(self, fileName, format):
		self.setFileName(fileName, format, QUERY)

	def setFileName(self, fileName, format, type):
		self._inputFileNames[type] = fileName
		self._inputFileFormats[type] = format
		if format.lower() != "nclist":
			self._converted[type] = True
		
	def setOutputFileName(self, outputFileName):
		self._outputFileName = outputFileName
		self._iWriter = Gff3Writer(self._outputFileName)
	
	def setSorted(self, sorted):
		self._sorted = sorted

	def setIndex(self, index):
		self._index = index

	def createNCLists(self):
		startTime = time.time()
		if self._verbosity > 1:
			print "Building database"
		self._ncLists = dict([type, {}] for type in TYPES)
		self._indices = dict([type, {}] for type in TYPES)
		self._cursors = dict([type, {}] for type in TYPES)
		for type in TYPES:
			self._ncListHandlers[type] = NCListHandler(self._verbosity-3)
			if self._converted[type]:
				self._convertedFileNames[type] = "%s_%d.ncl" % (os.path.splitext(self._inputFileNames[type])[0], type)
				ncLists = ConvertToNCList(self._verbosity-3)
				ncLists.setInputFileName(self._inputFileNames[type], self._inputFileFormats[type])
				ncLists.setSorted(self._sorted)
				ncLists.setOutputFileName(self._convertedFileNames[type])
				if type == REFERENCE and self._index:
					ncLists.setIndex(True)
				ncLists.run()
				self._ncListHandlers[type].setFileName(self._convertedFileNames[type])
			else:
				self._ncListHandlers[type].setFileName(self._inputFileNames[type])
			self._ncListHandlers[type].loadData()
			self._nbLines[type]				    = self._ncListHandlers[type].getNbElements()
			self._nbElementsPerChromosome[type] = self._ncListHandlers[type].getNbElementsPerChromosome()
			self._ncLists[type]				    = self._ncListHandlers[type].getNCLists()
			for chromosome, ncList in self._ncLists[type].iteritems():
				self._cursors[type][chromosome] = NCListCursor(None, ncList, 0, self._verbosity)
				if type == REFERENCE and self._index:
					self._indices[REFERENCE][chromosome] = ncList.getIndex()
		endTime = time.time()
		if self._verbosity > 1:
			print "done (%.2gs)" % (endTime - startTime)

	def compare(self):
		nbSkips, nbMoves   = 0, 0
		previousChromosome = None
		done			   = False
		startTime		   = time.time()
		progress		   = Progress(len(self._ncLists[QUERY].keys()), "Checking overlap", self._verbosity)
		#print "query:", self._ncLists[QUERY].keys()
		#print "reference:", self._ncLists[REFERENCE].keys()
		for chromosome, queryNCList in self._ncLists[QUERY].iteritems():
			queryParser = self._ncListHandlers[QUERY].getParser(chromosome)
			queryCursor = self._cursors[QUERY][chromosome]
			if chromosome != previousChromosome:
				skipChromosome	  = False
				previousChromosome  = chromosome
				if chromosome not in self._ncLists[REFERENCE]:
					#print "out ", chromosome
					continue
				refNCList = self._ncLists[REFERENCE][chromosome]
				refCursor = self._cursors[REFERENCE][chromosome]
			#print "starting", chromosome
			while True:
				queryTranscript = queryCursor.getTranscript()
				newRefLaddr = self.checkIndex(queryTranscript, refCursor)
				#print "query is", queryTranscript
				if newRefLaddr != None:
					nbMoves += 1
					refCursor.setLIndex(newRefLaddr)
					#print "skipping to", refCursor
					done = False
				refCursor, done, unmatched = self.findOverlapIter(queryTranscript, refCursor, done)
				#print "completed with", refCursor, done, unmatched
				if refCursor.isOut():
					#print "exiting 1", chromosome
					break
				if unmatched or not queryCursor.hasChildren():
					queryCursor.moveNext()
					#print "moving next to", queryCursor
					nbSkips += 1
				else:
					queryCursor.moveDown()
					#print "moving down to", queryCursor
				if queryCursor.isOut():
					#print "exiting 2", chromosome
					break
			progress.inc()
		progress.done()
		endTime = time.time()
		self._timeSpent = endTime - startTime
		if self._verbosity >= 10:
			print "# skips:   %d" % (nbSkips)
			print "# moves:   %d" % (nbMoves)

	def findOverlapIter(self, queryTranscript, cursor, done):
		chromosome = queryTranscript.getChromosome()
		if chromosome not in self._ncLists[REFERENCE]:
			return False, None
		ncList = self._ncLists[REFERENCE][chromosome]
		overlappingNames = {}
		nextDone = False
		firstOverlapLAddr = NCListCursor(cursor)
		firstOverlapLAddr.setLIndex(-1)
		if cursor.isOut():
			return firstOverlapLAddr, False
		parentCursor = NCListCursor(cursor)
		parentCursor.moveUp()
		firstParentAfter = False
		#print "query transcript 1", queryTranscript
		#print "cursor 1", cursor
		#print "parent 1", parentCursor
		while not parentCursor.isOut(): 
			if self.isOverlapping(queryTranscript, parentCursor) == 0:
				#print "overlap parent choice 0"
				overlappingNames.update(self._extractID(parentCursor.getTranscript()))
				if firstOverlapLAddr.isOut():
					#print "overlap parent 2"
					firstOverlapLAddr.copy(parentCursor)
					nextDone = True # new
			elif self.isOverlapping(queryTranscript, parentCursor) == 1:
				#print "overlap parent choice 1"
				firstParentAfter = NCListCursor(parentCursor)
			parentCursor.moveUp()
			#print "parent 2", parentCursor
		if firstParentAfter:
			#print "exit parent", firstParentAfter, overlappingNames
			self._writeIntervalInNewGFF3(queryTranscript, overlappingNames)
			return firstParentAfter, False, not overlappingNames
		#This loop finds the overlaps with currentRefLAddr.#
		while True:
			#print "ref cursor now is", cursor
			parentCursor = NCListCursor(cursor)
			parentCursor.moveUp()
			#In case: Query is on the right of the RefInterval and does not overlap.
			overlap = self.isOverlapping(queryTranscript, cursor)
			if overlap == -1:
				cursor.moveNext()
			#In case: Query overlaps with RefInterval.	
			elif overlap == 0:
				#print "choice 2"
				overlappingNames.update(self._extractID(cursor.getTranscript()))
				if firstOverlapLAddr.compare(parentCursor):
					firstOverlapLAddr.copy(cursor)
					nextDone = True # new
				if done:
					cursor.moveNext()
				else:
					if not cursor.hasChildren():
						cursor.moveNext()
						if cursor.isOut():
							#print "break 1"
							break
					else:
						cursor.moveDown()
			#In case: Query is on the left of the RefInterval and does not overlap.		
			else:
				#print "choice 3"
				if firstOverlapLAddr.isOut() or firstOverlapLAddr.compare(parentCursor):
					#print "changing nfo 2"
					firstOverlapLAddr.copy(cursor)
					nextDone = False # new
				#print "break 2"
				break
			
			done = False
			if cursor.isOut():
				#print "break 3"
				break
		self._writeIntervalInNewGFF3(queryTranscript, overlappingNames)
		return firstOverlapLAddr, nextDone, not overlappingNames
	
	def isOverlapping(self, queryTranscript, refTranscript):
		if (queryTranscript.getStart() <= refTranscript.getEnd() and queryTranscript.getEnd() >= refTranscript.getStart()):
			return 0   
		if queryTranscript.getEnd() < refTranscript.getStart():
			return 1
		return -1

	def checkIndex(self, transcript, cursor):
		if not self._index:
			return None
		chromosome = transcript.getChromosome()
		nextLIndex = self._indices[REFERENCE][chromosome].getIndex(transcript)
		if nextLIndex == None:
			return None
		ncList		 = self._ncLists[REFERENCE][chromosome]
		nextGffAddress = ncList.getRefGffAddr(nextLIndex)
		thisGffAddress = cursor.getGffAddress()
		if nextGffAddress > thisGffAddress:
			return nextLIndex
		return None
		
	def _writeIntervalInNewGFF3(self, transcript, names):
		nbOverlaps = 0
		for cpt in names.values():
			nbOverlaps += cpt
		if not names:
			return
		transcript.setTagValue("overlapsWith", "--".join(sorted(names.keys())))
		transcript.setTagValue("nbOverlaps", nbOverlaps)
		self._iWriter.addTranscript(transcript)
		self._iWriter.write()
		self._nbOverlappingQueries += 1
		self._nbOverlaps		   += nbOverlaps
		
	def _extractID(self, transcript):
		nbElements = float(transcript.getTagValue("nbElements")) if "nbElements" in transcript.getTagNames() else 1
		id		   = transcript.getTagValue("ID")				 if "ID"		 in transcript.getTagNames() else transcript.getUniqueName()
		return {id: nbElements}
		
	def run(self):
		self.createNCLists()
		self.compare()
		self.close()
		if self._verbosity > 0:
			print "# queries: %d" % (self._nbLines[QUERY])
			print "# refs:    %d" % (self._nbLines[REFERENCE])
			print "# written: %d (%d overlaps)" % (self._nbOverlappingQueries, self._nbOverlaps)
			print "time:      %.2gs" % (self._timeSpent)


if __name__ == "__main__":
	description = "Find Overlaps Optim v1.0.0: Finds overlaps with several query intervals. [Category: Data Comparison]"

	parser = OptionParser(description = description)
	parser.add_option("-i", "--query",	     dest="inputQueryFileName", action="store",			            type="string", help="query input file [compulsory] [format: file in transcript or other format given by -f]")
	parser.add_option("-f", "--queryFormat", dest="queryFormat",		action="store",			            type="string", help="format of previous file (possibly in NCL format) [compulsory] [format: transcript or other file format]")
	parser.add_option("-j", "--ref",		 dest="inputRefFileName",   action="store",			            type="string", help="reference input file [compulsory] [format: file in transcript or other format given by -g]")
	parser.add_option("-g", "--refFormat",   dest="refFormat",		    action="store",			            type="string", help="format of previous file (possibly in NCL format) [compulsory] [format: transcript or other file format]")
	parser.add_option("-o", "--output",	     dest="outputFileName",	    action="store",			            type="string", help="output file [compulsory] [format: output file in GFF3 format]")
	parser.add_option("-d", "--index",	     dest="index",	            action="store_true", default=False,	               help="add an index to the reference file (faster but more memory) [format: boolean] [default: False]")
	parser.add_option("-s", "--sorted",	     dest="sorted",	            action="store_true", default=False,	               help="input files are already sorted [format: boolean] [default: False]")
	parser.add_option("-v", "--verbosity",   dest="verbosity",		    action="store",      default=1,     type="int",	   help="Trace level [format: int] [default: 1]")
	(options, args) = parser.parse_args()
	
	iFOO = FindOverlapsOptim(options.verbosity)
	iFOO.setRefFileName(options.inputRefFileName, options.refFormat)
	iFOO.setQueryFileName(options.inputQueryFileName, options.queryFormat)
	iFOO.setOutputFileName(options.outputFileName)
	iFOO.setIndex(options.index)
	iFOO.setSorted(options.sorted)
	iFOO.run()