| 6 | 1 #! /usr/bin/env python | 
|  | 2 # | 
|  | 3 # Copyright INRA-URGI 2009-2010 | 
|  | 4 # | 
|  | 5 # This software is governed by the CeCILL license under French law and | 
|  | 6 # abiding by the rules of distribution of free software. You can use, | 
|  | 7 # modify and/ or redistribute the software under the terms of the CeCILL | 
|  | 8 # license as circulated by CEA, CNRS and INRIA at the following URL | 
|  | 9 # "http://www.cecill.info". | 
|  | 10 # | 
|  | 11 # As a counterpart to the access to the source code and rights to copy, | 
|  | 12 # modify and redistribute granted by the license, users are provided only | 
|  | 13 # with a limited warranty and the software's author, the holder of the | 
|  | 14 # economic rights, and the successive licensors have only limited | 
|  | 15 # liability. | 
|  | 16 # | 
|  | 17 # In this respect, the user's attention is drawn to the risks associated | 
|  | 18 # with loading, using, modifying and/or developing or reproducing the | 
|  | 19 # software by the user in light of its specific status of free software, | 
|  | 20 # that may mean that it is complicated to manipulate, and that also | 
|  | 21 # therefore means that it is reserved for developers and experienced | 
|  | 22 # professionals having in-depth computer knowledge. Users are therefore | 
|  | 23 # encouraged to load and test the software's suitability as regards their | 
|  | 24 # requirements in conditions enabling the security of their systems and/or | 
|  | 25 # data to be ensured and, more generally, to use and operate it in the | 
|  | 26 # same conditions as regards security. | 
|  | 27 # | 
|  | 28 # The fact that you are presently reading this means that you have had | 
|  | 29 # knowledge of the CeCILL license and that you accept its terms. | 
|  | 30 # | 
|  | 31 import os, os.path | 
|  | 32 import struct | 
|  | 33 import shelve | 
|  | 34 import sys | 
|  | 35 from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle | 
|  | 36 from SMART.Java.Python.ncList.NCIndex import NCIndex | 
|  | 37 from SMART.Java.Python.misc.Progress import Progress | 
|  | 38 | 
|  | 39 LONG_SIZE = struct.calcsize('l') | 
|  | 40 | 
|  | 41 H = 0 | 
|  | 42 L = 1 | 
|  | 43 T = 2 | 
|  | 44 G = 3 | 
|  | 45 | 
|  | 46 H_CELL_SIZE = 2 | 
|  | 47 L_CELL_SIZE = 5 | 
|  | 48 T_CELL_SIZE = 6 | 
|  | 49 | 
|  | 50 START   = 0 | 
|  | 51 END	 = 1 | 
|  | 52 ADDRESS = 2 | 
|  | 53 LIST	= 3 | 
|  | 54 PARENT  = 4 | 
|  | 55 NEW	 = 5 | 
|  | 56 LENGTH  = 1 | 
|  | 57 | 
|  | 58 def pack(input): | 
|  | 59 	return struct.pack("l", long(input)) | 
|  | 60 def unpack(input): | 
|  | 61 	return struct.unpack("l", input)[0] | 
|  | 62 | 
|  | 63 | 
|  | 64 class NCList(object): | 
|  | 65 | 
|  | 66 	def __init__(self, verbosity): | 
|  | 67 		self._verbosity		         = verbosity | 
|  | 68 		self._subPos			     = 0 | 
|  | 69 		self._parentPos		         = 0 | 
|  | 70 		self._nbLines			     = 0 | 
|  | 71 		self._nbLists			     = 0 | 
|  | 72 		self._chromosome		     = None | 
|  | 73 		self._transcriptFileName     = None | 
|  | 74 		self._lHandle			     = None | 
|  | 75 		self._hHandle			     = None | 
|  | 76 		self._tHandle			     = None | 
|  | 77 		self._parser			     = None | 
|  | 78 		self._sizeDict		         = {H: H_CELL_SIZE, L: L_CELL_SIZE, T: T_CELL_SIZE} | 
|  | 79 		self._offsets			     = {H: 0, L: 0, G: 0} | 
|  | 80 		self._fileNameDict	         = {} | 
|  | 81 		self._handleDict		     = {} | 
|  | 82 		self._createIndex		     = False | 
|  | 83 		self._missingValues	         = dict([table, {}] for table in self._sizeDict) | 
|  | 84 		self._missingValues[T][LIST] = -1 | 
|  | 85 		self._missingValues[L][LIST] =  0 | 
|  | 86 		self._missingValues[T][NEW]  = -1 | 
|  | 87 | 
|  | 88 	def __del__(self): | 
|  | 89 		for handle in (self._lHandle, self._hHandle): | 
|  | 90 			if handle != None: | 
|  | 91 				handle.close() | 
|  | 92 | 
|  | 93 	def createIndex(self, boolean): | 
|  | 94 		self._createIndex = boolean | 
|  | 95 | 
|  | 96 	def setChromosome(self, chromosome): | 
|  | 97 		self._chromosome = chromosome | 
|  | 98 | 
|  | 99 	def setFileName(self, fileName): | 
|  | 100 		self._transcriptFileName = fileName | 
|  | 101 		self._parser = NCListFileUnpickle(fileName, self._verbosity) | 
|  | 102 		self._setFileNames(fileName) | 
|  | 103 | 
|  | 104 	def setNbElements(self, nbElements): | 
|  | 105 		self._nbLines = nbElements | 
|  | 106 | 
|  | 107 	def setOffset(self, fileType, offset): | 
|  | 108 		self._offsets[fileType] = offset | 
|  | 109 | 
|  | 110 	def _setFileNames(self, fileName): | 
| 46 | 111 		print "Got file name", fileName | 
| 6 | 112 		if self._chromosome != None and fileName != None: | 
|  | 113 			coreName = os.path.splitext(fileName)[0] | 
|  | 114 			if "SMARTTMPPATH" in os.environ: | 
|  | 115 				coreName = os.path.join(os.environ["SMARTTMPPATH"], coreName) | 
| 46 | 116 			print "Used core name", coreName | 
| 6 | 117 			self._hFileName = "%s_H.bin" % (coreName) | 
|  | 118 			self._lFileName = "%s_L.bin" % (coreName) | 
|  | 119 			self._tFileName = "%s_T.bin" % (coreName) | 
|  | 120 			self._fileNameDict = {H: self._hFileName, L: self._lFileName, T: self._tFileName} | 
|  | 121 | 
|  | 122 	def getSizeFirstList(self): | 
|  | 123 		return self._sizeFirstList | 
|  | 124 | 
|  | 125 	def _writeSubListIntoH(self, SubListAddr, SubListLength): | 
|  | 126 		self._hHandle.write(pack(SubListAddr)) | 
|  | 127 		self._hHandle.write(pack(SubListLength)) | 
|  | 128 		self._subPos += H_CELL_SIZE | 
|  | 129 | 
|  | 130 	def _writeParentIntoL(self, readAddr, subListAddr, parentAddr, start, end): | 
|  | 131 		self._lHandle.write(pack(start)) | 
|  | 132 		self._lHandle.write(pack(end)) | 
|  | 133 		self._lHandle.write(pack(readAddr)) | 
|  | 134 		self._lHandle.write(pack(subListAddr)) | 
|  | 135 		self._lHandle.write(pack(parentAddr)) | 
|  | 136 		self._parentPos += L_CELL_SIZE | 
|  | 137 | 
|  | 138 	def getLLineElements(self, subListLAddr): | 
|  | 139 		if subListLAddr == -1 or subListLAddr == None: | 
|  | 140 			#print "reading bad from L", subListLAddr | 
|  | 141 			return -1, -1, -1, -1, -1 | 
|  | 142 		else: | 
|  | 143 			self._lHandle.seek(subListLAddr * L_CELL_SIZE * LONG_SIZE + self._offsets[L]) | 
|  | 144 			start = self._lHandle.read(LONG_SIZE) | 
|  | 145 			if len(start) < LONG_SIZE: | 
|  | 146 				#print "reading very bad from L", subListLAddr | 
|  | 147 				return -1, -1, -1, -1, -1 | 
|  | 148 			start		= unpack(start) | 
|  | 149 			end		  = unpack(self._lHandle.read(LONG_SIZE)) | 
|  | 150 			gff3Addr	 = unpack(self._lHandle.read(LONG_SIZE)) | 
|  | 151 			subListHAddr = unpack(self._lHandle.read(LONG_SIZE)) | 
|  | 152 			parentLAddr  = unpack(self._lHandle.read(LONG_SIZE)) | 
|  | 153 			#print "reading from L", subListLAddr, "-->", gff3Addr, subListHAddr, parentLAddr, start, end | 
|  | 154 			return gff3Addr, subListHAddr, parentLAddr, start, end | 
|  | 155 | 
|  | 156 	def getHLineElements(self, subListHAddr): | 
|  | 157 		self._hHandle.seek(subListHAddr * H_CELL_SIZE * LONG_SIZE + self._offsets[H]) | 
|  | 158 		subListStartBin = self._hHandle.read(LONG_SIZE) | 
|  | 159 		if len(subListStartBin) < 8 : | 
|  | 160 			#print "reading bad from H" | 
|  | 161 			return -1, -1 | 
|  | 162 		subListStart		 = unpack(subListStartBin) | 
|  | 163 		subListElementsNb	= unpack(self._hHandle.read(LONG_SIZE)) | 
|  | 164 		#print "reading from H", subListHAddr, "-->", subListStart, subListElementsNb | 
|  | 165 		return subListStart, subListElementsNb | 
|  | 166 | 
|  | 167 	def getRefGffAddr(self, currentRefLAddr): | 
|  | 168 		RefGff3Addr, subListHAddr, parentLAddr, start, end = self.getLLineElements(currentRefLAddr) | 
|  | 169 		return RefGff3Addr | 
|  | 170 | 
|  | 171 	def getIntervalFromAdress(self, address): | 
|  | 172 		self._parser.gotoAddress(int(address) + self._offsets[G]) | 
|  | 173 		iTranscrit = self._parser.getNextTranscript() | 
|  | 174 		return iTranscrit | 
|  | 175 | 
|  | 176 	def removeFiles(self): | 
|  | 177 		return | 
|  | 178 | 
|  | 179 	def buildLists(self): | 
|  | 180 		if self._createIndex: | 
|  | 181 			self._index = NCIndex(self._verbosity) | 
|  | 182 		self._createTables() | 
|  | 183 		self._labelLists() | 
|  | 184 		self._computeSubStart() | 
|  | 185 		self._computeAbsPosition() | 
|  | 186 		self._cleanFiles() | 
|  | 187 | 
|  | 188 	def _createTables(self): | 
|  | 189 		self._initLists() | 
|  | 190 		self._createTable(H, self._nbLists) | 
|  | 191 		self._createTable(T, self._nbLines) | 
|  | 192 		self._createTable(L, self._nbLines) | 
|  | 193 		self._fillTables() | 
|  | 194 | 
|  | 195 	def _initLists(self): | 
|  | 196 		previousTranscript = None | 
|  | 197 		self._nbLists	  = 1 | 
|  | 198 		progress = Progress(self._nbLines, "Initializing lists", self._verbosity-5) | 
|  | 199 		for transcript in self._parser.getIterator(): | 
|  | 200 			if self._isIncluded(transcript, previousTranscript): | 
|  | 201 				self._nbLists += 1 | 
|  | 202 			previousTranscript = transcript | 
|  | 203 			progress.inc() | 
|  | 204 		progress.done() | 
|  | 205 | 
|  | 206 	def _isIncluded(self, transcript1, transcript2): | 
|  | 207 		return transcript1 != None and transcript2 != None and transcript1.getStart() >= transcript2.getStart() and transcript1.getEnd() <= transcript2.getEnd() | 
|  | 208 | 
|  | 209 	def _createTable(self, name, size): | 
|  | 210 		handle = open(self._fileNameDict[name], "w+b") | 
|  | 211 		progress = Progress(self._sizeDict[name] * size, "Initializing table %d" % (name), self._verbosity-5) | 
|  | 212 		for i in xrange(self._sizeDict[name] * size): | 
|  | 213 			handle.write(pack(-1)) | 
|  | 214 			progress.inc() | 
|  | 215 		progress.done() | 
|  | 216 		self._handleDict[name] = handle | 
|  | 217 | 
|  | 218 	def _fillTables(self): | 
|  | 219 		progress = Progress(self._nbLines, "Filling table T", self._verbosity-5) | 
|  | 220 		for i, transcript in enumerate(self._parser.getIterator()): | 
|  | 221 			self._writeValue(T, i, START,   transcript.getStart()) | 
|  | 222 			self._writeValue(T, i, END,	 transcript.getEnd()) | 
|  | 223 			self._writeValue(T, i, ADDRESS, self._parser.getCurrentTranscriptAddress()) | 
|  | 224 			self._writeValue(T, i, PARENT,  -1) | 
|  | 225 			self._writeValue(T, i, LIST,	-1) | 
|  | 226 			progress.inc() | 
|  | 227 		progress.done() | 
|  | 228 		progress = Progress(self._nbLists, "Filling table H", self._verbosity-5) | 
|  | 229 		for i in xrange(self._nbLists): | 
|  | 230 			self._writeValue(H, i, LENGTH, 0) | 
|  | 231 			progress.inc() | 
|  | 232 		progress.done() | 
|  | 233 | 
|  | 234 	def _labelLists(self): | 
|  | 235 		progress = Progress(self._nbLines, "Getting table structure", self._verbosity-5) | 
|  | 236 		nextL = 0 | 
|  | 237 		for i in xrange(self._nbLines): | 
|  | 238 			p	 = i - 1 | 
|  | 239 			start = self._readValue(T, i, START) | 
|  | 240 			end   = self._readValue(T, i, END) | 
|  | 241 			while p != -1 and (start < self._readValue(T, p, START) or end > self._readValue(T, p, END)): | 
|  | 242 				p = self._readValue(T, p, PARENT) | 
|  | 243 			thisL = self._readValue(T, p, LIST) | 
|  | 244 			if thisL == -1: | 
|  | 245 				#print "entering" | 
|  | 246 				thisL  = nextL | 
|  | 247 				nextL += 1 | 
|  | 248 				length = 0 | 
|  | 249 				self._writeValue(T, p, LIST, thisL) | 
|  | 250 			else: | 
|  | 251 				length = self._readValue(H, thisL, LENGTH) | 
|  | 252 			self._writeValue(T, i,	 PARENT, p) | 
|  | 253 			self._writeValue(H, thisL, LENGTH, length + 1) | 
|  | 254 			progress.inc() | 
|  | 255 		progress.done() | 
|  | 256 | 
|  | 257 	def _computeSubStart(self): | 
|  | 258 		progress = Progress(self._nbLines, "Getting table sub-lists", self._verbosity-5) | 
|  | 259 		total = 0 | 
|  | 260 		for i in xrange(self._nbLists): | 
|  | 261 			self._writeValue(H, i, START, total) | 
|  | 262 			total += self._readValue(H, i, LENGTH) | 
|  | 263 			self._writeValue(H, i, LENGTH, 0) | 
|  | 264 			progress.inc() | 
|  | 265 		progress.done() | 
|  | 266 | 
|  | 267 	def _computeAbsPosition(self): | 
|  | 268 		progress = Progress(self._nbLines, "Writing table", self._verbosity-5) | 
|  | 269 		self._sizeFirstList = 0 | 
|  | 270 		for i in xrange(self._nbLines): | 
|  | 271 			s  = self._readValue(T, i,  START) | 
|  | 272 			e  = self._readValue(T, i,  END) | 
|  | 273 			a  = self._readValue(T, i,  ADDRESS) | 
|  | 274 			pt = self._readValue(T, i,  PARENT) | 
|  | 275 			h  = self._readValue(T, pt, LIST) | 
|  | 276 			pl = self._readValue(T, pt, NEW) | 
|  | 277 			nb = self._readValue(H, h,  LENGTH) | 
|  | 278 			l  = self._readValue(H, h,  START) + nb | 
|  | 279 			self._writeValue(T, i, NEW,	 l) | 
|  | 280 			self._writeValue(L, l, START,   s) | 
|  | 281 			self._writeValue(L, l, END,	 e) | 
|  | 282 			self._writeValue(L, l, ADDRESS, a) | 
|  | 283 			self._writeValue(L, l, LIST,	-1) | 
|  | 284 			self._writeValue(L, l, PARENT,  pl) | 
|  | 285 			self._writeValue(H, h, LENGTH,  nb+1) | 
|  | 286 			if nb == 0: | 
|  | 287 				#print "adding it" | 
|  | 288 				self._writeValue(L, pl, LIST, h) | 
|  | 289 			if pl == -1: | 
|  | 290 				self._sizeFirstList += 1 | 
|  | 291 				if self._createIndex: | 
|  | 292 					self._index.addTranscript(e, l) | 
|  | 293 			progress.inc() | 
|  | 294 		progress.done() | 
|  | 295 | 
|  | 296 	def closeFiles(self): | 
|  | 297 		for handle in self._handleDict.values(): | 
|  | 298 			handle.close() | 
|  | 299 		del self._handleDict | 
|  | 300 		self._lHandle = None | 
|  | 301 		self._hHandle = None | 
|  | 302 		self._tHandle = None | 
|  | 303 		self._parser = None | 
|  | 304 | 
|  | 305 	def openFiles(self): | 
|  | 306 		self._lHandle = open(self._fileNameDict[L], "rb") | 
|  | 307 		self._hHandle = open(self._fileNameDict[H], "rb") | 
|  | 308 		self._handleDict = {H: self._hHandle, L: self._lHandle} | 
|  | 309 		self._parser  = NCListFileUnpickle(self._transcriptFileName, self._verbosity) | 
|  | 310 | 
|  | 311 	def _cleanFiles(self): | 
|  | 312 		self.closeFiles() | 
|  | 313 		os.remove(self._fileNameDict[T]) | 
|  | 314 | 
|  | 315 	def _getPosition(self, table, line, key): | 
|  | 316 		handle = self._handleDict[table] | 
|  | 317 		handle.seek(self._sizeDict[table] * line * LONG_SIZE + key * LONG_SIZE) | 
|  | 318 		return handle | 
|  | 319 | 
|  | 320 	def _writeValue(self, table, line, key, value): | 
|  | 321 		#print "writing", table, line, key, "<-", value | 
|  | 322 		if line == -1: | 
|  | 323 			self._missingValues[table][key] = value | 
|  | 324 			return | 
|  | 325 		handle = self._getPosition(table, line, key) | 
|  | 326 		handle.write(pack(value)) | 
|  | 327 | 
|  | 328 	def _readValue(self, table, line, key): | 
|  | 329 		#print "reading", table, line, key, "->", | 
|  | 330 		if line == -1: | 
|  | 331 			#print self._missingValues[table][key] | 
|  | 332 			return self._missingValues[table][key] | 
|  | 333 		handle = self._getPosition(table, line, key) | 
|  | 334 		r = unpack(handle.read(LONG_SIZE)) | 
|  | 335 		#print r | 
|  | 336 		return r | 
|  | 337 | 
|  | 338 	def getIndex(self): | 
|  | 339 		return self._index |