6
|
1 #! /usr/bin/env python
|
|
2 #
|
|
3 # Copyright INRA-URGI 2009-2010
|
|
4 #
|
|
5 # This software is governed by the CeCILL license under French law and
|
|
6 # abiding by the rules of distribution of free software. You can use,
|
|
7 # modify and/ or redistribute the software under the terms of the CeCILL
|
|
8 # license as circulated by CEA, CNRS and INRIA at the following URL
|
|
9 # "http://www.cecill.info".
|
|
10 #
|
|
11 # As a counterpart to the access to the source code and rights to copy,
|
|
12 # modify and redistribute granted by the license, users are provided only
|
|
13 # with a limited warranty and the software's author, the holder of the
|
|
14 # economic rights, and the successive licensors have only limited
|
|
15 # liability.
|
|
16 #
|
|
17 # In this respect, the user's attention is drawn to the risks associated
|
|
18 # with loading, using, modifying and/or developing or reproducing the
|
|
19 # software by the user in light of its specific status of free software,
|
|
20 # that may mean that it is complicated to manipulate, and that also
|
|
21 # therefore means that it is reserved for developers and experienced
|
|
22 # professionals having in-depth computer knowledge. Users are therefore
|
|
23 # encouraged to load and test the software's suitability as regards their
|
|
24 # requirements in conditions enabling the security of their systems and/or
|
|
25 # data to be ensured and, more generally, to use and operate it in the
|
|
26 # same conditions as regards security.
|
|
27 #
|
|
28 # The fact that you are presently reading this means that you have had
|
|
29 # knowledge of the CeCILL license and that you accept its terms.
|
|
30 #
|
|
31 import os, os.path
|
|
32 import struct
|
|
33 import shelve
|
|
34 import sys
|
|
35 from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle
|
|
36 from SMART.Java.Python.ncList.NCIndex import NCIndex
|
|
37 from SMART.Java.Python.misc.Progress import Progress
|
|
38
|
|
39 LONG_SIZE = struct.calcsize('l')
|
|
40
|
|
41 H = 0
|
|
42 L = 1
|
|
43 T = 2
|
|
44 G = 3
|
|
45
|
|
46 H_CELL_SIZE = 2
|
|
47 L_CELL_SIZE = 5
|
|
48 T_CELL_SIZE = 6
|
|
49
|
|
50 START = 0
|
|
51 END = 1
|
|
52 ADDRESS = 2
|
|
53 LIST = 3
|
|
54 PARENT = 4
|
|
55 NEW = 5
|
|
56 LENGTH = 1
|
|
57
|
|
58 def pack(input):
|
|
59 return struct.pack("l", long(input))
|
|
60 def unpack(input):
|
|
61 return struct.unpack("l", input)[0]
|
|
62
|
|
63
|
|
64 class NCList(object):
|
|
65
|
|
66 def __init__(self, verbosity):
|
|
67 self._verbosity = verbosity
|
|
68 self._subPos = 0
|
|
69 self._parentPos = 0
|
|
70 self._nbLines = 0
|
|
71 self._nbLists = 0
|
|
72 self._chromosome = None
|
|
73 self._transcriptFileName = None
|
|
74 self._lHandle = None
|
|
75 self._hHandle = None
|
|
76 self._tHandle = None
|
|
77 self._parser = None
|
|
78 self._sizeDict = {H: H_CELL_SIZE, L: L_CELL_SIZE, T: T_CELL_SIZE}
|
|
79 self._offsets = {H: 0, L: 0, G: 0}
|
|
80 self._fileNameDict = {}
|
|
81 self._handleDict = {}
|
|
82 self._createIndex = False
|
|
83 self._missingValues = dict([table, {}] for table in self._sizeDict)
|
|
84 self._missingValues[T][LIST] = -1
|
|
85 self._missingValues[L][LIST] = 0
|
|
86 self._missingValues[T][NEW] = -1
|
|
87
|
|
88 def __del__(self):
|
|
89 for handle in (self._lHandle, self._hHandle):
|
|
90 if handle != None:
|
|
91 handle.close()
|
|
92
|
|
93 def createIndex(self, boolean):
|
|
94 self._createIndex = boolean
|
|
95
|
|
96 def setChromosome(self, chromosome):
|
|
97 self._chromosome = chromosome
|
|
98
|
|
99 def setFileName(self, fileName):
|
|
100 self._transcriptFileName = fileName
|
|
101 self._parser = NCListFileUnpickle(fileName, self._verbosity)
|
|
102 self._setFileNames(fileName)
|
|
103
|
|
104 def setNbElements(self, nbElements):
|
|
105 self._nbLines = nbElements
|
|
106
|
|
107 def setOffset(self, fileType, offset):
|
|
108 self._offsets[fileType] = offset
|
|
109
|
|
110 def _setFileNames(self, fileName):
|
46
|
111 print "Got file name", fileName
|
6
|
112 if self._chromosome != None and fileName != None:
|
|
113 coreName = os.path.splitext(fileName)[0]
|
|
114 if "SMARTTMPPATH" in os.environ:
|
|
115 coreName = os.path.join(os.environ["SMARTTMPPATH"], coreName)
|
46
|
116 print "Used core name", coreName
|
6
|
117 self._hFileName = "%s_H.bin" % (coreName)
|
|
118 self._lFileName = "%s_L.bin" % (coreName)
|
|
119 self._tFileName = "%s_T.bin" % (coreName)
|
|
120 self._fileNameDict = {H: self._hFileName, L: self._lFileName, T: self._tFileName}
|
|
121
|
|
122 def getSizeFirstList(self):
|
|
123 return self._sizeFirstList
|
|
124
|
|
125 def _writeSubListIntoH(self, SubListAddr, SubListLength):
|
|
126 self._hHandle.write(pack(SubListAddr))
|
|
127 self._hHandle.write(pack(SubListLength))
|
|
128 self._subPos += H_CELL_SIZE
|
|
129
|
|
130 def _writeParentIntoL(self, readAddr, subListAddr, parentAddr, start, end):
|
|
131 self._lHandle.write(pack(start))
|
|
132 self._lHandle.write(pack(end))
|
|
133 self._lHandle.write(pack(readAddr))
|
|
134 self._lHandle.write(pack(subListAddr))
|
|
135 self._lHandle.write(pack(parentAddr))
|
|
136 self._parentPos += L_CELL_SIZE
|
|
137
|
|
138 def getLLineElements(self, subListLAddr):
|
|
139 if subListLAddr == -1 or subListLAddr == None:
|
|
140 #print "reading bad from L", subListLAddr
|
|
141 return -1, -1, -1, -1, -1
|
|
142 else:
|
|
143 self._lHandle.seek(subListLAddr * L_CELL_SIZE * LONG_SIZE + self._offsets[L])
|
|
144 start = self._lHandle.read(LONG_SIZE)
|
|
145 if len(start) < LONG_SIZE:
|
|
146 #print "reading very bad from L", subListLAddr
|
|
147 return -1, -1, -1, -1, -1
|
|
148 start = unpack(start)
|
|
149 end = unpack(self._lHandle.read(LONG_SIZE))
|
|
150 gff3Addr = unpack(self._lHandle.read(LONG_SIZE))
|
|
151 subListHAddr = unpack(self._lHandle.read(LONG_SIZE))
|
|
152 parentLAddr = unpack(self._lHandle.read(LONG_SIZE))
|
|
153 #print "reading from L", subListLAddr, "-->", gff3Addr, subListHAddr, parentLAddr, start, end
|
|
154 return gff3Addr, subListHAddr, parentLAddr, start, end
|
|
155
|
|
156 def getHLineElements(self, subListHAddr):
|
|
157 self._hHandle.seek(subListHAddr * H_CELL_SIZE * LONG_SIZE + self._offsets[H])
|
|
158 subListStartBin = self._hHandle.read(LONG_SIZE)
|
|
159 if len(subListStartBin) < 8 :
|
|
160 #print "reading bad from H"
|
|
161 return -1, -1
|
|
162 subListStart = unpack(subListStartBin)
|
|
163 subListElementsNb = unpack(self._hHandle.read(LONG_SIZE))
|
|
164 #print "reading from H", subListHAddr, "-->", subListStart, subListElementsNb
|
|
165 return subListStart, subListElementsNb
|
|
166
|
|
167 def getRefGffAddr(self, currentRefLAddr):
|
|
168 RefGff3Addr, subListHAddr, parentLAddr, start, end = self.getLLineElements(currentRefLAddr)
|
|
169 return RefGff3Addr
|
|
170
|
|
171 def getIntervalFromAdress(self, address):
|
|
172 self._parser.gotoAddress(int(address) + self._offsets[G])
|
|
173 iTranscrit = self._parser.getNextTranscript()
|
|
174 return iTranscrit
|
|
175
|
|
176 def removeFiles(self):
|
|
177 return
|
|
178
|
|
179 def buildLists(self):
|
|
180 if self._createIndex:
|
|
181 self._index = NCIndex(self._verbosity)
|
|
182 self._createTables()
|
|
183 self._labelLists()
|
|
184 self._computeSubStart()
|
|
185 self._computeAbsPosition()
|
|
186 self._cleanFiles()
|
|
187
|
|
188 def _createTables(self):
|
|
189 self._initLists()
|
|
190 self._createTable(H, self._nbLists)
|
|
191 self._createTable(T, self._nbLines)
|
|
192 self._createTable(L, self._nbLines)
|
|
193 self._fillTables()
|
|
194
|
|
195 def _initLists(self):
|
|
196 previousTranscript = None
|
|
197 self._nbLists = 1
|
|
198 progress = Progress(self._nbLines, "Initializing lists", self._verbosity-5)
|
|
199 for transcript in self._parser.getIterator():
|
|
200 if self._isIncluded(transcript, previousTranscript):
|
|
201 self._nbLists += 1
|
|
202 previousTranscript = transcript
|
|
203 progress.inc()
|
|
204 progress.done()
|
|
205
|
|
206 def _isIncluded(self, transcript1, transcript2):
|
|
207 return transcript1 != None and transcript2 != None and transcript1.getStart() >= transcript2.getStart() and transcript1.getEnd() <= transcript2.getEnd()
|
|
208
|
|
209 def _createTable(self, name, size):
|
|
210 handle = open(self._fileNameDict[name], "w+b")
|
|
211 progress = Progress(self._sizeDict[name] * size, "Initializing table %d" % (name), self._verbosity-5)
|
|
212 for i in xrange(self._sizeDict[name] * size):
|
|
213 handle.write(pack(-1))
|
|
214 progress.inc()
|
|
215 progress.done()
|
|
216 self._handleDict[name] = handle
|
|
217
|
|
218 def _fillTables(self):
|
|
219 progress = Progress(self._nbLines, "Filling table T", self._verbosity-5)
|
|
220 for i, transcript in enumerate(self._parser.getIterator()):
|
|
221 self._writeValue(T, i, START, transcript.getStart())
|
|
222 self._writeValue(T, i, END, transcript.getEnd())
|
|
223 self._writeValue(T, i, ADDRESS, self._parser.getCurrentTranscriptAddress())
|
|
224 self._writeValue(T, i, PARENT, -1)
|
|
225 self._writeValue(T, i, LIST, -1)
|
|
226 progress.inc()
|
|
227 progress.done()
|
|
228 progress = Progress(self._nbLists, "Filling table H", self._verbosity-5)
|
|
229 for i in xrange(self._nbLists):
|
|
230 self._writeValue(H, i, LENGTH, 0)
|
|
231 progress.inc()
|
|
232 progress.done()
|
|
233
|
|
234 def _labelLists(self):
|
|
235 progress = Progress(self._nbLines, "Getting table structure", self._verbosity-5)
|
|
236 nextL = 0
|
|
237 for i in xrange(self._nbLines):
|
|
238 p = i - 1
|
|
239 start = self._readValue(T, i, START)
|
|
240 end = self._readValue(T, i, END)
|
|
241 while p != -1 and (start < self._readValue(T, p, START) or end > self._readValue(T, p, END)):
|
|
242 p = self._readValue(T, p, PARENT)
|
|
243 thisL = self._readValue(T, p, LIST)
|
|
244 if thisL == -1:
|
|
245 #print "entering"
|
|
246 thisL = nextL
|
|
247 nextL += 1
|
|
248 length = 0
|
|
249 self._writeValue(T, p, LIST, thisL)
|
|
250 else:
|
|
251 length = self._readValue(H, thisL, LENGTH)
|
|
252 self._writeValue(T, i, PARENT, p)
|
|
253 self._writeValue(H, thisL, LENGTH, length + 1)
|
|
254 progress.inc()
|
|
255 progress.done()
|
|
256
|
|
257 def _computeSubStart(self):
|
|
258 progress = Progress(self._nbLines, "Getting table sub-lists", self._verbosity-5)
|
|
259 total = 0
|
|
260 for i in xrange(self._nbLists):
|
|
261 self._writeValue(H, i, START, total)
|
|
262 total += self._readValue(H, i, LENGTH)
|
|
263 self._writeValue(H, i, LENGTH, 0)
|
|
264 progress.inc()
|
|
265 progress.done()
|
|
266
|
|
267 def _computeAbsPosition(self):
|
|
268 progress = Progress(self._nbLines, "Writing table", self._verbosity-5)
|
|
269 self._sizeFirstList = 0
|
|
270 for i in xrange(self._nbLines):
|
|
271 s = self._readValue(T, i, START)
|
|
272 e = self._readValue(T, i, END)
|
|
273 a = self._readValue(T, i, ADDRESS)
|
|
274 pt = self._readValue(T, i, PARENT)
|
|
275 h = self._readValue(T, pt, LIST)
|
|
276 pl = self._readValue(T, pt, NEW)
|
|
277 nb = self._readValue(H, h, LENGTH)
|
|
278 l = self._readValue(H, h, START) + nb
|
|
279 self._writeValue(T, i, NEW, l)
|
|
280 self._writeValue(L, l, START, s)
|
|
281 self._writeValue(L, l, END, e)
|
|
282 self._writeValue(L, l, ADDRESS, a)
|
|
283 self._writeValue(L, l, LIST, -1)
|
|
284 self._writeValue(L, l, PARENT, pl)
|
|
285 self._writeValue(H, h, LENGTH, nb+1)
|
|
286 if nb == 0:
|
|
287 #print "adding it"
|
|
288 self._writeValue(L, pl, LIST, h)
|
|
289 if pl == -1:
|
|
290 self._sizeFirstList += 1
|
|
291 if self._createIndex:
|
|
292 self._index.addTranscript(e, l)
|
|
293 progress.inc()
|
|
294 progress.done()
|
|
295
|
|
296 def closeFiles(self):
|
|
297 for handle in self._handleDict.values():
|
|
298 handle.close()
|
|
299 del self._handleDict
|
|
300 self._lHandle = None
|
|
301 self._hHandle = None
|
|
302 self._tHandle = None
|
|
303 self._parser = None
|
|
304
|
|
305 def openFiles(self):
|
|
306 self._lHandle = open(self._fileNameDict[L], "rb")
|
|
307 self._hHandle = open(self._fileNameDict[H], "rb")
|
|
308 self._handleDict = {H: self._hHandle, L: self._lHandle}
|
|
309 self._parser = NCListFileUnpickle(self._transcriptFileName, self._verbosity)
|
|
310
|
|
311 def _cleanFiles(self):
|
|
312 self.closeFiles()
|
|
313 os.remove(self._fileNameDict[T])
|
|
314
|
|
315 def _getPosition(self, table, line, key):
|
|
316 handle = self._handleDict[table]
|
|
317 handle.seek(self._sizeDict[table] * line * LONG_SIZE + key * LONG_SIZE)
|
|
318 return handle
|
|
319
|
|
320 def _writeValue(self, table, line, key, value):
|
|
321 #print "writing", table, line, key, "<-", value
|
|
322 if line == -1:
|
|
323 self._missingValues[table][key] = value
|
|
324 return
|
|
325 handle = self._getPosition(table, line, key)
|
|
326 handle.write(pack(value))
|
|
327
|
|
328 def _readValue(self, table, line, key):
|
|
329 #print "reading", table, line, key, "->",
|
|
330 if line == -1:
|
|
331 #print self._missingValues[table][key]
|
|
332 return self._missingValues[table][key]
|
|
333 handle = self._getPosition(table, line, key)
|
|
334 r = unpack(handle.read(LONG_SIZE))
|
|
335 #print r
|
|
336 return r
|
|
337
|
|
338 def getIndex(self):
|
|
339 return self._index
|