comparison smart_toolShed/SMART/Java/Python/FindOverlapsOptim.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e0f8dcca02ed
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2012
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31
32 import os, struct, time, shutil
33 from optparse import OptionParser
34 from commons.core.parsing.ParserChooser import ParserChooser
35 from commons.core.writer.Gff3Writer import Gff3Writer
36 from SMART.Java.Python.structure.Transcript import Transcript
37 from SMART.Java.Python.structure.Interval import Interval
38 from SMART.Java.Python.ncList.NCList import NCList
39 from SMART.Java.Python.ncList.ConvertToNCList import ConvertToNCList
40 from SMART.Java.Python.ncList.NCListParser import NCListParser
41 from SMART.Java.Python.ncList.NCListCursor import NCListCursor
42 from SMART.Java.Python.ncList.NCListFilePickle import NCListFilePickle, NCListFileUnpickle
43 from SMART.Java.Python.ncList.NCListHandler import NCListHandler
44 from SMART.Java.Python.misc.Progress import Progress
45 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
46 try:
47 import cPickle as pickle
48 except:
49 import pickle
50
51 REFERENCE = 0
52 QUERY = 1
53 TYPES = (REFERENCE, QUERY)
54 TYPETOSTRING = {0: "reference", 1: "query"}
55
56 class FindOverlapsOptim(object):
57
58 def __init__(self, verbosity = 1):
59 self._parsers = {}
60 self._sortedFileNames = {}
61 self._outputFileName = "outputOverlaps.gff3"
62 self._iWriter = None
63 self._inputFileNames = {REFERENCE: None, QUERY: None}
64 self._convertedFileNames = {REFERENCE: False, QUERY: False}
65 self._inputFileFormats = {REFERENCE: None, QUERY: None}
66 self._converted = {REFERENCE: False, QUERY: False}
67 self._ncListHandlers = {REFERENCE: None, QUERY: None}
68 self._splittedFileNames = {REFERENCE: {}, QUERY: {}}
69 self._nbOverlappingQueries = 0
70 self._nbOverlaps = 0
71 self._nbLines = {REFERENCE: 0, QUERY: 0}
72 self._sorted = False
73 self._index = False
74 self._verbosity = verbosity
75 self._ncLists = {}
76 self._cursors = {}
77 self._nbElementsPerChromosome = {}
78 self._tmpDirectories = {REFERENCE: False, QUERY: False}
79
80 def close(self):
81 self._iWriter.close()
82 for fileName in (self._sortedFileNames.values()):
83 if os.path.exists(fileName):
84 os.remove(fileName)
85 for fileName in self._convertedFileNames.values():
86 if fileName:
87 os.remove(fileName)
88
89 def setRefFileName(self, fileName, format):
90 self.setFileName(fileName, format, REFERENCE)
91
92 def setQueryFileName(self, fileName, format):
93 self.setFileName(fileName, format, QUERY)
94
95 def setFileName(self, fileName, format, type):
96 self._inputFileNames[type] = fileName
97 self._inputFileFormats[type] = format
98 if format.lower() != "nclist":
99 self._converted[type] = True
100
101 def setOutputFileName(self, outputFileName):
102 self._outputFileName = outputFileName
103 self._iWriter = Gff3Writer(self._outputFileName)
104
105 def setSorted(self, sorted):
106 self._sorted = sorted
107
108 def setIndex(self, index):
109 self._index = index
110
111 def createNCLists(self):
112 startTime = time.time()
113 if self._verbosity > 1:
114 print "Building database"
115 self._ncLists = dict([type, {}] for type in TYPES)
116 self._indices = dict([type, {}] for type in TYPES)
117 self._cursors = dict([type, {}] for type in TYPES)
118 for type in TYPES:
119 self._ncListHandlers[type] = NCListHandler(self._verbosity-3)
120 if self._converted[type]:
121 self._convertedFileNames[type] = "%s_%d.ncl" % (os.path.splitext(self._inputFileNames[type])[0], type)
122 ncLists = ConvertToNCList(self._verbosity-3)
123 ncLists.setInputFileName(self._inputFileNames[type], self._inputFileFormats[type])
124 ncLists.setSorted(self._sorted)
125 ncLists.setOutputFileName(self._convertedFileNames[type])
126 if type == REFERENCE and self._index:
127 ncLists.setIndex(True)
128 ncLists.run()
129 self._ncListHandlers[type].setFileName(self._convertedFileNames[type])
130 else:
131 self._ncListHandlers[type].setFileName(self._inputFileNames[type])
132 self._ncListHandlers[type].loadData()
133 self._nbLines[type] = self._ncListHandlers[type].getNbElements()
134 self._nbElementsPerChromosome[type] = self._ncListHandlers[type].getNbElementsPerChromosome()
135 self._ncLists[type] = self._ncListHandlers[type].getNCLists()
136 for chromosome, ncList in self._ncLists[type].iteritems():
137 self._cursors[type][chromosome] = NCListCursor(None, ncList, 0, self._verbosity)
138 if type == REFERENCE and self._index:
139 self._indices[REFERENCE][chromosome] = ncList.getIndex()
140 endTime = time.time()
141 if self._verbosity > 1:
142 print "done (%.2gs)" % (endTime - startTime)
143
144 def compare(self):
145 nbSkips, nbMoves = 0, 0
146 previousChromosome = None
147 done = False
148 startTime = time.time()
149 progress = Progress(len(self._ncLists[QUERY].keys()), "Checking overlap", self._verbosity)
150 #print "query:", self._ncLists[QUERY].keys()
151 #print "reference:", self._ncLists[REFERENCE].keys()
152 for chromosome, queryNCList in self._ncLists[QUERY].iteritems():
153 queryParser = self._ncListHandlers[QUERY].getParser(chromosome)
154 queryCursor = self._cursors[QUERY][chromosome]
155 if chromosome != previousChromosome:
156 skipChromosome = False
157 previousChromosome = chromosome
158 if chromosome not in self._ncLists[REFERENCE]:
159 #print "out ", chromosome
160 continue
161 refNCList = self._ncLists[REFERENCE][chromosome]
162 refCursor = self._cursors[REFERENCE][chromosome]
163 #print "starting", chromosome
164 while True:
165 queryTranscript = queryCursor.getTranscript()
166 newRefLaddr = self.checkIndex(queryTranscript, refCursor)
167 #print "query is", queryTranscript
168 if newRefLaddr != None:
169 nbMoves += 1
170 refCursor.setLIndex(newRefLaddr)
171 #print "skipping to", refCursor
172 done = False
173 refCursor, done, unmatched = self.findOverlapIter(queryTranscript, refCursor, done)
174 #print "completed with", refCursor, done, unmatched
175 if refCursor.isOut():
176 #print "exiting 1", chromosome
177 break
178 if unmatched or not queryCursor.hasChildren():
179 queryCursor.moveNext()
180 #print "moving next to", queryCursor
181 nbSkips += 1
182 else:
183 queryCursor.moveDown()
184 #print "moving down to", queryCursor
185 if queryCursor.isOut():
186 #print "exiting 2", chromosome
187 break
188 progress.inc()
189 progress.done()
190 endTime = time.time()
191 self._timeSpent = endTime - startTime
192 if self._verbosity >= 10:
193 print "# skips: %d" % (nbSkips)
194 print "# moves: %d" % (nbMoves)
195
196 def findOverlapIter(self, queryTranscript, cursor, done):
197 chromosome = queryTranscript.getChromosome()
198 if chromosome not in self._ncLists[REFERENCE]:
199 return False, None
200 ncList = self._ncLists[REFERENCE][chromosome]
201 overlappingNames = {}
202 nextDone = False
203 firstOverlapLAddr = NCListCursor(cursor)
204 firstOverlapLAddr.setLIndex(-1)
205 if cursor.isOut():
206 return firstOverlapLAddr, False
207 parentCursor = NCListCursor(cursor)
208 parentCursor.moveUp()
209 firstParentAfter = False
210 #print "query transcript 1", queryTranscript
211 #print "cursor 1", cursor
212 #print "parent 1", parentCursor
213 while not parentCursor.isOut():
214 if self.isOverlapping(queryTranscript, parentCursor) == 0:
215 #print "overlap parent choice 0"
216 overlappingNames.update(self._extractID(parentCursor.getTranscript()))
217 if firstOverlapLAddr.isOut():
218 #print "overlap parent 2"
219 firstOverlapLAddr.copy(parentCursor)
220 nextDone = True # new
221 elif self.isOverlapping(queryTranscript, parentCursor) == 1:
222 #print "overlap parent choice 1"
223 firstParentAfter = NCListCursor(parentCursor)
224 parentCursor.moveUp()
225 #print "parent 2", parentCursor
226 if firstParentAfter:
227 #print "exit parent", firstParentAfter, overlappingNames
228 self._writeIntervalInNewGFF3(queryTranscript, overlappingNames)
229 return firstParentAfter, False, not overlappingNames
230 #This loop finds the overlaps with currentRefLAddr.#
231 while True:
232 #print "ref cursor now is", cursor
233 parentCursor = NCListCursor(cursor)
234 parentCursor.moveUp()
235 #In case: Query is on the right of the RefInterval and does not overlap.
236 overlap = self.isOverlapping(queryTranscript, cursor)
237 if overlap == -1:
238 cursor.moveNext()
239 #In case: Query overlaps with RefInterval.
240 elif overlap == 0:
241 #print "choice 2"
242 overlappingNames.update(self._extractID(cursor.getTranscript()))
243 if firstOverlapLAddr.compare(parentCursor):
244 firstOverlapLAddr.copy(cursor)
245 nextDone = True # new
246 if done:
247 cursor.moveNext()
248 else:
249 if not cursor.hasChildren():
250 cursor.moveNext()
251 if cursor.isOut():
252 #print "break 1"
253 break
254 else:
255 cursor.moveDown()
256 #In case: Query is on the left of the RefInterval and does not overlap.
257 else:
258 #print "choice 3"
259 if firstOverlapLAddr.isOut() or firstOverlapLAddr.compare(parentCursor):
260 #print "changing nfo 2"
261 firstOverlapLAddr.copy(cursor)
262 nextDone = False # new
263 #print "break 2"
264 break
265
266 done = False
267 if cursor.isOut():
268 #print "break 3"
269 break
270 self._writeIntervalInNewGFF3(queryTranscript, overlappingNames)
271 return firstOverlapLAddr, nextDone, not overlappingNames
272
273 def isOverlapping(self, queryTranscript, refTranscript):
274 if (queryTranscript.getStart() <= refTranscript.getEnd() and queryTranscript.getEnd() >= refTranscript.getStart()):
275 return 0
276 if queryTranscript.getEnd() < refTranscript.getStart():
277 return 1
278 return -1
279
280 def checkIndex(self, transcript, cursor):
281 if not self._index:
282 return None
283 chromosome = transcript.getChromosome()
284 nextLIndex = self._indices[REFERENCE][chromosome].getIndex(transcript)
285 if nextLIndex == None:
286 return None
287 ncList = self._ncLists[REFERENCE][chromosome]
288 nextGffAddress = ncList.getRefGffAddr(nextLIndex)
289 thisGffAddress = cursor.getGffAddress()
290 if nextGffAddress > thisGffAddress:
291 return nextLIndex
292 return None
293
294 def _writeIntervalInNewGFF3(self, transcript, names):
295 nbOverlaps = 0
296 for cpt in names.values():
297 nbOverlaps += cpt
298 if not names:
299 return
300 transcript.setTagValue("overlapsWith", "--".join(sorted(names.keys())))
301 transcript.setTagValue("nbOverlaps", nbOverlaps)
302 self._iWriter.addTranscript(transcript)
303 self._iWriter.write()
304 self._nbOverlappingQueries += 1
305 self._nbOverlaps += nbOverlaps
306
307 def _extractID(self, transcript):
308 nbElements = float(transcript.getTagValue("nbElements")) if "nbElements" in transcript.getTagNames() else 1
309 id = transcript.getTagValue("ID") if "ID" in transcript.getTagNames() else transcript.getUniqueName()
310 return {id: nbElements}
311
312 def run(self):
313 self.createNCLists()
314 self.compare()
315 self.close()
316 if self._verbosity > 0:
317 print "# queries: %d" % (self._nbLines[QUERY])
318 print "# refs: %d" % (self._nbLines[REFERENCE])
319 print "# written: %d (%d overlaps)" % (self._nbOverlappingQueries, self._nbOverlaps)
320 print "time: %.2gs" % (self._timeSpent)
321
322
323 if __name__ == "__main__":
324 description = "Find Overlaps Optim v1.0.0: Finds overlaps with several query intervals. [Category: Data Comparison]"
325
326 parser = OptionParser(description = description)
327 parser.add_option("-i", "--query", dest="inputQueryFileName", action="store", type="string", help="query input file [compulsory] [format: file in transcript or other format given by -f]")
328 parser.add_option("-f", "--queryFormat", dest="queryFormat", action="store", type="string", help="format of previous file (possibly in NCL format) [compulsory] [format: transcript or other file format]")
329 parser.add_option("-j", "--ref", dest="inputRefFileName", action="store", type="string", help="reference input file [compulsory] [format: file in transcript or other format given by -g]")
330 parser.add_option("-g", "--refFormat", dest="refFormat", action="store", type="string", help="format of previous file (possibly in NCL format) [compulsory] [format: transcript or other file format]")
331 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
332 parser.add_option("-d", "--index", dest="index", action="store_true", default=False, help="add an index to the reference file (faster but more memory) [format: boolean] [default: False]")
333 parser.add_option("-s", "--sorted", dest="sorted", action="store_true", default=False, help="input files are already sorted [format: boolean] [default: False]")
334 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="Trace level [format: int] [default: 1]")
335 (options, args) = parser.parse_args()
336
337 iFOO = FindOverlapsOptim(options.verbosity)
338 iFOO.setRefFileName(options.inputRefFileName, options.refFormat)
339 iFOO.setQueryFileName(options.inputQueryFileName, options.queryFormat)
340 iFOO.setOutputFileName(options.outputFileName)
341 iFOO.setIndex(options.index)
342 iFOO.setSorted(options.sorted)
343 iFOO.run()