Mercurial > repos > yufei-luo > s_mart
comparison SMART/Java/Python/structure/TranscriptListsComparator.py @ 36:44d5973c188c
Uploaded
author | m-zytnicki |
---|---|
date | Tue, 30 Apr 2013 15:02:29 -0400 |
parents | 769e306b7933 |
children |
comparison
equal
deleted
inserted
replaced
35:d94018ca4ada | 36:44d5973c188c |
---|---|
1 # | |
2 # Copyright INRA-URGI 2009-2010 | |
3 # | |
4 # This software is governed by the CeCILL license under French law and | |
5 # abiding by the rules of distribution of free software. You can use, | |
6 # modify and/ or redistribute the software under the terms of the CeCILL | |
7 # license as circulated by CEA, CNRS and INRIA at the following URL | |
8 # "http://www.cecill.info". | |
9 # | |
10 # As a counterpart to the access to the source code and rights to copy, | |
11 # modify and redistribute granted by the license, users are provided only | |
12 # with a limited warranty and the software's author, the holder of the | |
13 # economic rights, and the successive licensors have only limited | |
14 # liability. | |
15 # | |
16 # In this respect, the user's attention is drawn to the risks associated | |
17 # with loading, using, modifying and/or developing or reproducing the | |
18 # software by the user in light of its specific status of free software, | |
19 # that may mean that it is complicated to manipulate, and that also | |
20 # therefore means that it is reserved for developers and experienced | |
21 # professionals having in-depth computer knowledge. Users are therefore | |
22 # encouraged to load and test the software's suitability as regards their | |
23 # requirements in conditions enabling the security of their systems and/or | |
24 # data to be ensured and, more generally, to use and operate it in the | |
25 # same conditions as regards security. | |
26 # | |
27 # The fact that you are presently reading this means that you have had | |
28 # knowledge of the CeCILL license and that you accept its terms. | |
29 # | |
30 import sys | |
31 import random | |
32 from SMART.Java.Python.misc import Utils | |
33 from SMART.Java.Python.structure.Transcript import Transcript | |
34 from SMART.Java.Python.structure.TranscriptList import TranscriptList | |
35 from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer | |
36 from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection | |
37 from SMART.Java.Python.mySql.MySqlTranscriptTable import MySqlTranscriptTable | |
38 from SMART.Java.Python.misc.Progress import Progress | |
39 from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter | |
40 | |
41 | |
42 | |
43 class TranscriptListsComparator(object): | |
44 """ | |
45 Compare two transcript lists, using a database for one of the list | |
46 Uses one TranscriptContainer for query data, | |
47 one TranscriptContainer exported to MySqlTranscriptTable for reference data, | |
48 one MySqlTranscriptTable for transformed reference data | |
49 @ivar inputTranscriptContainers: parsers to the list of query transcripts | |
50 @type inputTranscriptContainers: list of 2 L{TranscriptContainer<TranscriptContainer>} | |
51 @ivar writer: transcript list writer | |
52 @type writer: class L{TranscriptListWriter<TranscriptListWriter>} | |
53 @ivar mySqlConnection: connection to a MySQL database (to compute the ovelapping efficiently) | |
54 @type mySqlConnection: class L{MySqlConnection<MySqlConnection>} | |
55 @ivar introns: compare transcripts or exons only | |
56 @type introns: list of 2 boolean | |
57 @ivar starts: restrict the query transcripts to first nucleotides | |
58 @type starts: list of 2 int or None | |
59 @ivar fivePrimes: extend a list of transcripts by their 5' end | |
60 @type fivePrimes: list of 2 int or None | |
61 @ivar threePrimes: extend a list of transcripts by their 3' end | |
62 @type threePrimes: list of 2 int or None | |
63 @ivar minDistance: min distance between two transcripts [default: 0] | |
64 @type minDistance: int | |
65 @ivar maxDistance: max distance between two transcripts [default: 0] | |
66 @type maxDistance: int | |
67 @ivar minOverlap: minimum number of overlapping nucleotides to declare an overlap | |
68 @type minOverlap: int | |
69 @ivar pcOverlap: percentage of overlapping nucleotides to declare an overlap | |
70 @type pcOverlap: int | |
71 @ivar upstreams: consider distances with elements which are upstream of the transcripts | |
72 @type upstreams: boolean | |
73 @ivar downstreams: consider distances with elements which are downstream of the transcripts | |
74 @type downstreams: boolean | |
75 @ivar colinear: whether transcripts should overlap in the same direction | |
76 @type colinear: boolean | |
77 @ivar antisense: whether transcripts should overlap in the opposite direction | |
78 @type antisense: boolean | |
79 @ivar outputDistance: output distance between query and reference instead of query transcript | |
80 @type outputDistance: boolean | |
81 @ivar absolute: do not consider the strand while computing distance | |
82 @type absolute: boolean | |
83 @ivar strandedDistance: return a line per strand while computing distances | |
84 @type strandedDistance: boolean | |
85 @ivar QUERY: constant specifying the query objects | |
86 @type QUERY: int | |
87 @ivar REFERENCE: constant specifying the reference objects | |
88 @type REFERENCE: int | |
89 @ivar INPUTTYPES: set of input types of data (query or reference) objects | |
90 @type INPUTTYPES: list of 2 int | |
91 @ivar typeToString: string representation of the previous types | |
92 @type typeToString: dict | |
93 @ivar tableNames: name of the transcript tables | |
94 @type tableNames: dict of strings | |
95 @ivar nbTranscripts: number of transcript in the query/reference set | |
96 @type nbTranscripts: list of 2 int or None | |
97 @ivar nbNucleotides: number of nucleotides in the query/reference set | |
98 @type nbNucleotides: list of 2 int or None | |
99 @ivar transcriptsToBeStored: transcripts that will be stored into database | |
100 @type transcriptsToBeStored: dict of class L{TranscriptList<TranscriptList>} | |
101 @ivar multiple: in merge mode, aggregate multiple transcripts | |
102 @type multiple: boolean | |
103 @ivar normalization: normalize each element by the number of mappings of this element | |
104 @type normalization: boolean | |
105 @ivar invert: invert the current comparison | |
106 @type invert: boolean | |
107 @ivar splitDifference: split into intervals when computing difference | |
108 @type splitDifference: boolean | |
109 @ivar odds: whether odds about the comparison should be computed | |
110 @type odds: boolean | |
111 @ivar overlapResults: count the number of overlaps | |
112 @type overlapResults: dictionary | |
113 @ivar oddResults: compute the number of times each transcript overlaps (or is merged with) another one | |
114 @type oddResults: dictionary | |
115 @ivar outputContainer: container of the output transcripts | |
116 @type outputContainer: class L{TranscriptContainer<TranscriptContainer>} | |
117 @ivar logHandle: log handle | |
118 @type logHandle: file | |
119 @ivar verbosity: verbosity | |
120 @type verbosity: int | |
121 """ | |
122 | |
123 def __init__(self, logHandle = None, verbosity = 0): | |
124 """ | |
125 Constructor | |
126 @param transcriptListParser2: parser to the list of reference transcripts | |
127 @type transcriptListParser2: class L{TranscriptListParser<TranscriptListParser>} | |
128 @param logHandle: log handle | |
129 @type logHandle: file | |
130 @param verbosity: verbosity | |
131 @type verbosity: int | |
132 """ | |
133 self.QUERY = 0 | |
134 self.REFERENCE = 1 | |
135 self.WORKING = 2 | |
136 self.INPUTTYPES = (self.QUERY, self.REFERENCE) | |
137 self.INPUTWORKINGTYPES = (self.QUERY, self.REFERENCE, self.WORKING) | |
138 self.typeToString = {self.QUERY: "Query", self.REFERENCE: "Reference", self.WORKING: "Working"} | |
139 | |
140 self.logHandle = logHandle | |
141 self.verbosity = verbosity | |
142 self.mySqlConnection = MySqlConnection(self.verbosity-1) | |
143 self.inputTranscriptContainers = [None, None] | |
144 self.tableNames = ["tmpQueryTable_%d" % (random.randint(0, 100000)), "tmpReferenceTable_%d" % (random.randint(0, 100000)), "tmpOutputTable_%d" % (random.randint(0, 100000)), "tmpWorkingTable_%d" % (random.randint(0, 100000))] | |
145 self.mySqlTranscriptWriters = [MySqlTranscriptWriter(self.mySqlConnection, name, verbosity-1) for name in self.tableNames] | |
146 self.writer = None | |
147 self.introns = [False, False] | |
148 self.starts = [None, None] | |
149 self.ends = [None, None] | |
150 self.fivePrimes = [None, None] | |
151 self.threePrimes = [None, None] | |
152 self.minDistance = None | |
153 self.maxDistance = 0 | |
154 self.minOverlap = 1 | |
155 self.pcOverlap = None | |
156 self.colinear = False | |
157 self.antisense = False | |
158 self.downstreams = [False, False] | |
159 self.upstreams = [False, False] | |
160 self.outputDistance = False | |
161 self.absolute = False | |
162 self.strandedDistance = False | |
163 self.nbTranscripts = [None, None] | |
164 self.nbNucleotides = [None, None] | |
165 self.normalization = False | |
166 self.included = False | |
167 self.including = False | |
168 self.invert = False | |
169 self.notOverlapping = False | |
170 self.splitDifference = False | |
171 self.multiple = False | |
172 self.odds = False | |
173 self.overlapResults = None | |
174 self.oddResults = None | |
175 self.outputContainer = None | |
176 self.transcriptsToBeStored = dict([(type, TranscriptList()) for type in self.INPUTWORKINGTYPES]) | |
177 self.nbPrinted = 0 | |
178 | |
179 self.mySqlConnection.createDatabase() | |
180 | |
181 | |
182 def __del__(self): | |
183 """ | |
184 Destructor | |
185 Remove all temporary tables | |
186 """ | |
187 for type in self.INPUTWORKINGTYPES: | |
188 self.mySqlTranscriptWriters[type].removeTables() | |
189 self.mySqlConnection.deleteDatabase() | |
190 | |
191 def acceptIntrons(self, type, bool): | |
192 """ | |
193 Compare transcripts or exons only | |
194 @param type: whether use query/reference data | |
195 @type type: int | |
196 @param bool: include introns or not | |
197 @type bool: boolean | |
198 """ | |
199 self.introns[type] = bool | |
200 | |
201 | |
202 def restrictToStart(self, type, size): | |
203 """ | |
204 Restrict a list of transcripts to first nucleotides | |
205 @param type: whether use query/reference data | |
206 @type type: int | |
207 @param size: the size of the transcript to be considered | |
208 @type size: int | |
209 """ | |
210 self.starts[type] = size | |
211 self.introns[type] = False | |
212 | |
213 | |
214 def restrictToEnd(self, type, size): | |
215 """ | |
216 Restrict a list of transcripts to first nucleotides | |
217 @param type: whether use query/reference data | |
218 @type type: int | |
219 @param size: the size of the transcript to be considered | |
220 @type size: int | |
221 """ | |
222 self.ends[type] = size | |
223 self.introns[type] = False | |
224 | |
225 | |
226 def extendFivePrime(self, type, size): | |
227 """ | |
228 Extend a list of transcripts by their 5' end | |
229 @param type: whether use query/reference data | |
230 @type type: int | |
231 @param size: size of the extension | |
232 @type size: int | |
233 """ | |
234 self.fivePrimes[type] = size | |
235 | |
236 | |
237 def extendThreePrime(self, type, size): | |
238 """ | |
239 Extend the list of query transcripts by their 3' end | |
240 @param type: whether use query/reference data | |
241 @type type: int | |
242 @param size: size of the extension | |
243 @type size: int | |
244 """ | |
245 self.threePrimes[type] = size | |
246 | |
247 | |
248 def setMinDistance(self, distance): | |
249 """ | |
250 Set the min distance between two transcripts | |
251 @param distance: distance | |
252 @type distance: int | |
253 """ | |
254 self.minDistance = distance | |
255 | |
256 | |
257 def setMaxDistance(self, distance): | |
258 """ | |
259 Set the max distance between two transcripts | |
260 @param distance: distance | |
261 @type distance: int | |
262 """ | |
263 self.maxDistance = distance | |
264 | |
265 | |
266 def setMinOverlap(self, overlap): | |
267 """ | |
268 Set the minimum number of nucleotides to declare an overlap | |
269 @param overlap: minimum number of nucleotides | |
270 @type overlap: int | |
271 """ | |
272 self.minOverlap = overlap | |
273 | |
274 | |
275 def setPcOverlap(self, overlap): | |
276 """ | |
277 Set the percentage of nucleotides to declare an overlap | |
278 @param overlap: percentage of nucleotides | |
279 @type overlap: int | |
280 """ | |
281 self.pcOverlap = overlap | |
282 | |
283 | |
284 def setUpstream(self, type, boolean): | |
285 """ | |
286 Consider transcripts which are upstream of some transcripts | |
287 @param type: whether use query/reference data | |
288 @type type: int | |
289 @param boolean: consider only these transcripts or not | |
290 @type boolean: boolean | |
291 """ | |
292 self.upstreams[type] = boolean | |
293 | |
294 | |
295 def setDownstream(self, type, boolean): | |
296 """ | |
297 Consider transcripts which are downstream of some transcripts | |
298 @param type: whether use query/reference data | |
299 @type type: int | |
300 @param boolean: consider only these transcripts or not | |
301 @type boolean: boolean | |
302 """ | |
303 self.downstreams[type] = boolean | |
304 | |
305 | |
306 def setOutputDistance(self, boolean): | |
307 """ | |
308 Output distance between query and reference instead of query transcript | |
309 @param boolean: whether distance should be output | |
310 @type boolean: boolean | |
311 """ | |
312 self.outputDistance = boolean | |
313 | |
314 | |
315 def setAbsolute(self, boolean): | |
316 """ | |
317 Do not consider strand when computing distance (thus, having only non-negative values) | |
318 @param boolean: whether we should consider strands | |
319 @type boolean: boolean | |
320 """ | |
321 self.absolute = boolean | |
322 | |
323 | |
324 def setStrandedDistance(self, boolean): | |
325 """ | |
326 Return two distance distributions, one per strand | |
327 @param boolean: whether we should return 2 distance distance | |
328 @type boolean: boolean | |
329 """ | |
330 self.strandedDistance = boolean | |
331 | |
332 | |
333 def getColinearOnly(self, boolean): | |
334 """ | |
335 Only consider transcripts that overlap in the same direction | |
336 @param boolean: whether transcripts should overlap in the same direction | |
337 @type boolean: boolean | |
338 """ | |
339 self.colinear = boolean | |
340 | |
341 | |
342 def getAntisenseOnly(self, boolean): | |
343 """ | |
344 Only consider transcripts that overlap in the opposite direction | |
345 @param boolean: whether transcripts should overlap in the opposite direction | |
346 @type boolean: boolean | |
347 """ | |
348 self.antisense = boolean | |
349 | |
350 | |
351 def setIncludedOnly(self, boolean): | |
352 """ | |
353 Keep the elements from first set which are included in the second set | |
354 @param boolean: whether to keep included elements only | |
355 @type boolean: boolean | |
356 """ | |
357 self.included = boolean | |
358 | |
359 | |
360 def setIncludingOnly(self, boolean): | |
361 """ | |
362 Keep the elements from second set which are included in the first set | |
363 @param boolean: whether to keep included elements only | |
364 @type boolean: boolean | |
365 """ | |
366 self.including = boolean | |
367 | |
368 | |
369 def setNormalization(self, boolean): | |
370 """ | |
371 Normalize the elements by the number of mappings in the genome | |
372 @param boolean: whether normalize | |
373 @type boolean: boolean | |
374 """ | |
375 self.normalization = boolean | |
376 | |
377 | |
378 def getInvert(self, boolean): | |
379 """ | |
380 Only consider transcripts that do not overlap | |
381 @param boolean: whether invert the selection | |
382 @type boolean: boolean | |
383 """ | |
384 self.invert = boolean | |
385 | |
386 | |
387 def includeNotOverlapping(self, boolean): | |
388 """ | |
389 Also output the elements which do not overlap | |
390 @param boolean: whether output the elements which do not overlap | |
391 @type boolean: boolean | |
392 """ | |
393 self.notOverlapping = boolean | |
394 | |
395 | |
396 def setSplitDifference(self, boolean): | |
397 """ | |
398 Split into intervals when computing difference | |
399 @param boolean: whether to split | |
400 @type boolean: boolean | |
401 """ | |
402 self.splitDifference = boolean | |
403 | |
404 | |
405 def aggregate(self, boolean): | |
406 """ | |
407 In merge mode, aggregate multiple transcripts | |
408 @param boolean: aggregate multiple transcripts | |
409 @type boolean: boolean | |
410 """ | |
411 self.multiple = boolean | |
412 | |
413 | |
414 def getTables(self, type): | |
415 """ | |
416 Get the SQL tables | |
417 @param type: type of the table (query, reference, etc.) | |
418 @type type: int | |
419 """ | |
420 return self.mySqlTranscriptWriters[type].getTables() | |
421 | |
422 | |
423 def computeOdds(self, boolean): | |
424 """ | |
425 Compute odds | |
426 @param boolean: whether odds should be computed | |
427 @type boolean: boolean | |
428 """ | |
429 self.odds = boolean | |
430 if self.odds: | |
431 self.overlapResults = dict() | |
432 | |
433 | |
434 def computeOddsPerTranscript(self, boolean): | |
435 """ | |
436 Compute odds for each transcript | |
437 @param boolean: whether odds for each transcript should be computed | |
438 @type boolean: boolean | |
439 """ | |
440 self.odds = boolean | |
441 if self.odds: | |
442 self.overlapResults = dict() | |
443 | |
444 | |
445 def removeTables(self): | |
446 """ | |
447 Remove the temporary MySQL tables | |
448 """ | |
449 for type in self.INPUTWORKINGTYPES: | |
450 for chromosome in self.getTables(type): | |
451 self.getTables(type)[chromosome].remove() | |
452 | |
453 | |
454 def clearTables(self): | |
455 """ | |
456 Empty the content of the databases | |
457 """ | |
458 for type in self.INPUTWORKINGTYPES: | |
459 if self.transcriptListParsers[type] != None: | |
460 for chromosome in self.getTables(type): | |
461 self.getTables(type)[chromosome].clear() | |
462 | |
463 | |
464 def extendTranscript(self, type, transcript): | |
465 """ | |
466 Extend a transcript corresponding to the parameters of the class | |
467 @param transcript: a transcript | |
468 @type transcript: class L{Transcript<Transcript>} | |
469 @return: the possibly extended transcript | |
470 """ | |
471 extendedTranscript = Transcript() | |
472 extendedTranscript.copy(transcript) | |
473 if self.starts[type] != None: | |
474 extendedTranscript.restrictStart(self.starts[type]) | |
475 if self.ends[type] != None: | |
476 extendedTranscript.restrictEnd(self.ends[type]) | |
477 if self.fivePrimes[type] != None: | |
478 extendedTranscript.extendStart(self.fivePrimes[type]) | |
479 if self.threePrimes[type] != None: | |
480 extendedTranscript.extendEnd(self.threePrimes[type]) | |
481 return extendedTranscript | |
482 | |
483 | |
484 def storeTranscript(self, type, transcript, now = True): | |
485 """ | |
486 Add a transcript to a MySQL database, or postpone the store | |
487 @param type: whether use query/reference table | |
488 @type type: int | |
489 @param transcript: a transcript | |
490 @type transcript: class L{Transcript<Transcript>} | |
491 @param now: whether transcript should be stored now (or stored can be postponed) | |
492 @type now: bool | |
493 """ | |
494 self.mySqlTranscriptWriters[type].addTranscript(transcript) | |
495 if type == self.REFERENCE: | |
496 self.mySqlTranscriptWriters[self.WORKING].addTranscript(transcript) | |
497 if now: | |
498 self.mySqlTranscriptWriters[type].write() | |
499 if type == self.REFERENCE: | |
500 self.mySqlTranscriptWriters[self.WORKING].write() | |
501 | |
502 | |
503 def writeTranscript(self, transcript): | |
504 """ | |
505 Write a transcript in the output file | |
506 @param transcript: a transcript | |
507 @type transcript: class L{Transcript<Transcript>} | |
508 """ | |
509 if self.writer != None: | |
510 self.writer.addTranscript(transcript) | |
511 self.nbPrinted += 1 | |
512 | |
513 | |
514 def flushData(self, type = None): | |
515 """ | |
516 Store the remaining transcripts | |
517 @param type: whether use query/reference table (None for all) | |
518 @type type: int or None | |
519 """ | |
520 if type == None: | |
521 types = self.INPUTWORKINGTYPES | |
522 else: | |
523 types = [type] | |
524 for type in types: | |
525 self.mySqlTranscriptWriters[type].write() | |
526 if self.writer != None: | |
527 self.writer.write() | |
528 | |
529 | |
530 def unstoreTranscript(self, type, transcript): | |
531 """ | |
532 Remove a transcript from a MySQL database | |
533 @param type: whether use query/reference table | |
534 @type type: int | |
535 @param transcript: a transcript | |
536 @type transcript: class L{Transcript<Transcript>} | |
537 """ | |
538 self.getTables(type)[transcript.getChromosome()].removeTranscript(transcript) | |
539 if type == self.REFERENCE: | |
540 self.getTables(self.WORKING)[transcript.getChromosome()].removeTranscript(transcript) | |
541 | |
542 | |
543 def addIndexes(self, tables): | |
544 """ | |
545 Add useful indexes to the tables | |
546 @param tables: which tables should be indexed | |
547 @type tables: list of int | |
548 """ | |
549 for type in tables: | |
550 for chromosome in self.getTables(type): | |
551 self.getTables(type)[chromosome].createIndex("iStart_transcript_%s_%d_%d" % (chromosome, type, random.randint(0, 100000)), ["start"]) | |
552 self.getTables(type)[chromosome].exonsTable.createIndex("iTranscriptId_exon_%s_%d_%d" % (chromosome, type, random.randint(0, 100000)), ["transcriptId"]) | |
553 | |
554 | |
555 def storeTranscriptList(self, type, transcriptListParser, extension): | |
556 """ | |
557 Store a transcript list into database | |
558 @param type: whether use query/reference parser | |
559 @type type: int | |
560 @param parser: a parser of transcript list | |
561 @type parser: class L{TranscriptContainer<TranscriptContainer>} | |
562 @param extension: extend (or not) the transcripts | |
563 @type extension: boolean | |
564 """ | |
565 progress = Progress(transcriptListParser.getNbTranscripts(), "Writing transcripts for %s" % ("query" if type == self.QUERY else "reference"), self.verbosity-1) | |
566 for transcript in transcriptListParser.getIterator(): | |
567 if extension: | |
568 transcript = self.extendTranscript(type, transcript) | |
569 self.mySqlTranscriptWriters[type].addTranscript(transcript) | |
570 progress.inc() | |
571 self.mySqlTranscriptWriters[type].write() | |
572 progress.done() | |
573 if type == self.REFERENCE: | |
574 for chromosome in self.getTables(self.REFERENCE): | |
575 self.getTables(self.WORKING)[chromosome] = MySqlTranscriptTable(self.mySqlConnection, self.tableNames[self.WORKING], chromosome, self.verbosity-1) | |
576 self.getTables(self.WORKING)[chromosome].copy(self.getTables(self.REFERENCE)[chromosome]) | |
577 | |
578 | |
579 def setInputTranscriptContainer(self, type, inputTranscriptContainer): | |
580 """ | |
581 Set an input transcript list container | |
582 @param type: whether use query/reference parser | |
583 @type type: int | |
584 @param inputTranscriptContainer: a container | |
585 @type inputTranscriptContainer: class L{TranscriptContainer<TranscriptContainer>} | |
586 """ | |
587 self.inputTranscriptContainers[type] = inputTranscriptContainer | |
588 self.nbTranscripts[type] = self.inputTranscriptContainers[type].getNbTranscripts() | |
589 self.nbNucleotides[type] = self.inputTranscriptContainers[type].getNbNucleotides() | |
590 | |
591 | |
592 def setOutputWriter(self, writer): | |
593 """ | |
594 Set an output transcript list writer | |
595 @param writer: a writer | |
596 @type writer: class L{TranscriptListWriter<TranscriptListWriter>} | |
597 """ | |
598 self.writer = writer | |
599 | |
600 | |
601 def compareTranscript(self, transcript1, transcript2, includeDistance = False): | |
602 """ | |
603 Compare two transcripts, using user defined parameters | |
604 @param transcript1: a transcript from the query set (already extended) | |
605 @type transcript1: class L{Transcript<Transcript>} | |
606 @param transcript2: a transcript from the reference set (already extended) | |
607 @type transcript2: class L{Transcript<Transcript>} | |
608 @param includeDistance: take into account the distance too | |
609 @type includeDistance: boolean | |
610 @return: true, if they overlap | |
611 """ | |
612 extendedTranscript1 = Transcript() | |
613 extendedTranscript1.copy(transcript1) | |
614 if includeDistance: | |
615 if self.maxDistance > 0: | |
616 extendedTranscript1.extendStart(self.maxDistance) | |
617 extendedTranscript1.extendEnd(self.maxDistance) | |
618 | |
619 minOverlap = self.minOverlap | |
620 if self.pcOverlap != None: | |
621 minOverlap = max(minOverlap, transcript1.getSize() / 100.0 * self.pcOverlap) | |
622 if not extendedTranscript1.overlapWith(transcript2, self.minOverlap): | |
623 return False | |
624 if (self.downstreams[self.QUERY] and transcript2.getStart() > extendedTranscript1.getStart()) or \ | |
625 (self.upstreams[self.QUERY] and transcript2.getEnd() < extendedTranscript1.getEnd()) or \ | |
626 (self.downstreams[self.REFERENCE] and extendedTranscript1.getStart() > transcript2.getStart()) or \ | |
627 (self.upstreams[self.REFERENCE] and extendedTranscript1.getEnd() < transcript2.getEnd()): | |
628 return False | |
629 if (self.antisense and extendedTranscript1.getDirection() == transcript2.getDirection()) or (self.colinear and extendedTranscript1.getDirection() != transcript2.getDirection()): | |
630 return False | |
631 if self.included and not transcript2.include(extendedTranscript1): | |
632 return False | |
633 if self.including and not extendedTranscript1.include(transcript2): | |
634 return False | |
635 if self.introns[self.REFERENCE] and self.introns[self.QUERY]: | |
636 if self.logHandle != None: | |
637 self.logHandle.write("%s overlaps with intron of %s\n" % (str(extendedTranscript1), str(transcript2))) | |
638 return True | |
639 if (not self.introns[self.REFERENCE]) and (not self.introns[self.QUERY]) and extendedTranscript1.overlapWithExon(transcript2, minOverlap): | |
640 if self.logHandle != None: | |
641 self.logHandle.write("%s overlaps with exon of %s\n" % (str(extendedTranscript1), str(transcript2))) | |
642 return True | |
643 return False | |
644 | |
645 | |
646 def compareTranscriptToList(self, transcript1): | |
647 """ | |
648 Compare a transcript to the reference list of transcripts | |
649 (Do not extend the transcripts, except for the distance) | |
650 @param transcript1: a transcript (from the query set) | |
651 @type transcript1: class L{Transcript<Transcript>} | |
652 @return: the reference transcripts overlapping | |
653 """ | |
654 # no transcript in the reference table | |
655 if transcript1.getChromosome() not in self.getTables(self.WORKING): | |
656 return | |
657 | |
658 # retrieve the the transcripts that may overlap in the working tables | |
659 clauses = [] | |
660 extendedTranscript1 = Transcript() | |
661 extendedTranscript1.copy(transcript1) | |
662 if self.maxDistance > 0: | |
663 extendedTranscript1.extendStart(self.maxDistance) | |
664 if self.maxDistance > 0: | |
665 extendedTranscript1.extendEnd(self.maxDistance) | |
666 command = "SELECT * FROM %s WHERE (" % (self.getTables(self.WORKING)[transcript1.getChromosome()].getName()) | |
667 for binPair in extendedTranscript1.getBins(): | |
668 clause = "bin " | |
669 if binPair[0] == binPair[1]: | |
670 clause += "= %i" % (binPair[0]) | |
671 else: | |
672 clause += "BETWEEN %i AND %i" % (binPair[0], binPair[1]) | |
673 clauses.append(clause) | |
674 command += " OR ".join(clauses) | |
675 command += ") AND start <= %d AND end >= %d" % (extendedTranscript1.getEnd(), extendedTranscript1.getStart()) | |
676 | |
677 for index2, transcript2 in self.getTables(self.REFERENCE)[transcript1.getChromosome()].selectTranscripts(command): | |
678 if self.compareTranscript(extendedTranscript1, transcript2): | |
679 yield transcript2 | |
680 | |
681 | |
682 def compareTranscriptList(self): | |
683 """ | |
684 Compare a list of transcript to the reference one | |
685 @return: the transcripts that overlap with the reference set | |
686 """ | |
687 distance = 0 | |
688 nbClustersIn = 0 | |
689 nbClustersOut = 0 | |
690 if self.maxDistance != None: | |
691 distance = self.maxDistance | |
692 | |
693 self.addIndexes([self.QUERY, self.REFERENCE]) | |
694 | |
695 # export the container into tables | |
696 self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True) | |
697 self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True) | |
698 | |
699 # looping | |
700 for chromosome1 in sorted(self.getTables(self.QUERY).keys()): | |
701 # get range of transcripts | |
702 command = "SELECT MIN(start), MAX(end), COUNT(id) FROM %s" % (self.getTables(self.QUERY)[chromosome1].getName()) | |
703 query = self.mySqlConnection.executeQuery(command) | |
704 result = query.getLine() | |
705 first = result[0] | |
706 last = result[1] | |
707 nb = result[2] | |
708 | |
709 transcripts1 = [] | |
710 toBeRemoved1 = [] | |
711 transcripts2 = [] | |
712 toBeRemoved2 = [] | |
713 overlapsWith = [] | |
714 nbOverlaps = [] | |
715 nbChunks = max(1, nb / 100) | |
716 chunkSize = (last - first) / nbChunks | |
717 progress = Progress(nbChunks + 1, "Analyzing chromosome %s" % (chromosome1), self.verbosity-1) | |
718 for chunk in range(nbChunks + 1): | |
719 | |
720 # load transcripts | |
721 start = first + chunk * chunkSize | |
722 end = start + chunkSize - 1 | |
723 command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.QUERY)[chromosome1].getName(), start, end-1) | |
724 for index1, transcript1 in self.getTables(self.QUERY)[chromosome1].selectTranscripts(command): | |
725 transcripts1.append(transcript1) | |
726 overlapsWith.append([]) | |
727 nbOverlaps.append(0) | |
728 nbClustersIn += 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements") | |
729 command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.QUERY)[chromosome1].getName(), end) | |
730 self.mySqlConnection.executeQuery(command) | |
731 if chromosome1 in self.getTables(self.REFERENCE): | |
732 command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), start-distance, end+distance-1) | |
733 if chunk == 0: | |
734 command = "SELECT * FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance) | |
735 for index2, transcript2 in self.getTables(self.REFERENCE)[chromosome1].selectTranscripts(command): | |
736 transcripts2.append(transcript2) | |
737 command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance) | |
738 self.mySqlConnection.executeQuery(command) | |
739 | |
740 # compare sets | |
741 for index1, transcript1 in enumerate(transcripts1): | |
742 overlappingNames = [] | |
743 nbElements1 = 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements") | |
744 for transcript2 in transcripts2: | |
745 if self.compareTranscript(transcript1, transcript2, True): | |
746 id2 = transcript2.getTagValue("ID") if "ID" in transcript2.getTagNames() else transcript2.getName() | |
747 if id2 not in overlapsWith[index1]: | |
748 overlapsWith[index1].append(id2) | |
749 nbOverlaps[index1] += 1 if "nbElements" not in transcript2.getTagNames() else transcript2.getTagValue("nbElements") | |
750 if self.odds: | |
751 if transcript2.getName() not in self.overlapResults: | |
752 self.overlapResults[transcript2.getName()] = 0 | |
753 self.overlapResults[transcript2.getName()] += nbElements1 | |
754 | |
755 # check if query transcript extends bounds of the chunk | |
756 if transcript1.getEnd() < end: | |
757 if Utils.xor(overlapsWith[index1], self.invert) or self.notOverlapping: | |
758 if overlapsWith[index1]: | |
759 transcript1.setTagValue("overlapWith", ",".join(overlapsWith[index1])[:100]) | |
760 transcript1.setTagValue("nbOverlaps", "%d" % (nbOverlaps[index1])) | |
761 elif self.notOverlapping: | |
762 transcript1.setTagValue("nbOverlaps", "0") | |
763 self.writeTranscript(transcript1) | |
764 nbClustersOut += nbElements1 | |
765 toBeRemoved1.append(index1) | |
766 | |
767 # update list of query transcripts | |
768 for index1 in reversed(toBeRemoved1): | |
769 del transcripts1[index1] | |
770 del overlapsWith[index1] | |
771 del nbOverlaps[index1] | |
772 toBeRemoved1 = [] | |
773 | |
774 # check if the reference transcripts extends bounds of the chunk | |
775 for index2, transcript2 in enumerate(transcripts2): | |
776 if transcript2.getEnd() + distance < end: | |
777 toBeRemoved2.append(index2) | |
778 for index2 in reversed(toBeRemoved2): | |
779 del transcripts2[index2] | |
780 toBeRemoved2 = [] | |
781 | |
782 progress.inc() | |
783 | |
784 for index1, transcript1 in enumerate(transcripts1): | |
785 if Utils.xor(overlapsWith[index1], self.invert) or self.notOverlapping: | |
786 if overlapsWith[index1]: | |
787 transcript1.setTagValue("overlapWith", ",".join(overlapsWith[index1])[:100]) | |
788 transcript1.setTagValue("nbOverlaps", "%d" % (nbOverlaps[index1])) | |
789 elif self.notOverlapping: | |
790 transcript1.setTagValue("nbOverlaps", "0") | |
791 self.writeTranscript(transcript1) | |
792 nbClustersOut += 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements") | |
793 progress.done() | |
794 self.getTables(self.QUERY)[chromosome1].remove() | |
795 if chromosome1 in self.getTables(self.REFERENCE): | |
796 self.getTables(self.REFERENCE)[chromosome1].remove() | |
797 self.getTables(self.WORKING)[chromosome1].remove() | |
798 | |
799 self.flushData() | |
800 if self.writer != None: | |
801 self.writer.close() | |
802 self.writer = None | |
803 | |
804 if self.verbosity > 0: | |
805 print "reference: %d elements" % (self.nbTranscripts[self.REFERENCE]) | |
806 print "query: %d elements, %d clustered" % (self.nbTranscripts[self.QUERY], nbClustersIn) | |
807 if self.nbTranscripts[self.QUERY] != 0: | |
808 print "output: %d elements (%.2f%%)"% (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100), | |
809 if nbClustersOut != 0: | |
810 print ", %d clustered (%.2f%%)" % (nbClustersOut, float(nbClustersOut) / nbClustersIn * 100) | |
811 | |
812 | |
813 def compareTranscriptListDistance(self): | |
814 """ | |
815 Compare a list of transcript to the reference one | |
816 @return: the distance distributions in a hash | |
817 """ | |
818 nbDistances = 0 | |
819 distances = {} | |
820 absDistances = {} | |
821 strandedDistances = dict([(strand, {}) for strand in (1, -1)]) | |
822 | |
823 # export the container into tables | |
824 self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True) | |
825 self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True) | |
826 | |
827 progress = Progress(self.nbTranscripts[self.QUERY], "Analyzing chromosomes", self.verbosity-1) | |
828 for transcript1 in self.inputTranscriptContainers[self.QUERY].getIterator(): | |
829 # get the distance | |
830 transcript1 = self.extendTranscript(self.QUERY, transcript1) | |
831 distance = self.maxDistance + 1 | |
832 strand = None | |
833 closestElement = "None" | |
834 for transcript2 in self.compareTranscriptToList(transcript1): | |
835 thisStrand = transcript1.getDirection() * transcript2.getDirection() | |
836 if self.antisense or (not self.colinear and transcript1.getDirection() != transcript2.getDirection()): | |
837 transcript2.reverse() | |
838 if self.absolute: | |
839 transcript2.setDirection(transcript1.getDirection()) | |
840 if transcript2.getDirection() == transcript1.getDirection(): | |
841 if self.starts[self.REFERENCE] != None: | |
842 transcript2.restrictStart(self.starts[self.REFERENCE]) | |
843 if self.ends[self.REFERENCE] != None: | |
844 transcript2.restrictEnd(self.ends[self.REFERENCE]) | |
845 thisDistance = transcript1.getRelativeDistance(transcript2) | |
846 if (self.absolute): | |
847 thisDistance = abs(thisDistance) | |
848 if abs(thisDistance) < abs(distance): | |
849 distance = thisDistance | |
850 strand = thisStrand | |
851 closestElement = transcript2.getTagValue("ID") if "ID" in transcript2.getTagNames() else transcript2.getName() | |
852 if (distance <= self.maxDistance) and (self.minDistance == None or distance >= self.minDistance): | |
853 nbDistances += 1 | |
854 distances[distance] = distances.get(distance, 0) + 1 | |
855 absDistance = abs(distance) | |
856 absDistances[absDistance] = absDistances.get(absDistance, 0) + 1 | |
857 strandedDistances[strand][distance] = strandedDistances[strand].get(distance, 0) | |
858 if distance not in strandedDistances[-strand]: | |
859 strandedDistances[-strand][distance] = 0 | |
860 | |
861 # write transcript | |
862 if distance == self.maxDistance + 1: | |
863 distance = "None" | |
864 tmpTranscript = Transcript() | |
865 tmpTranscript.copy(transcript1) | |
866 tmpTranscript.setTagValue("distance", distance) | |
867 tmpTranscript.setTagValue("closestElement", closestElement) | |
868 self.writeTranscript(tmpTranscript) | |
869 | |
870 progress.inc() | |
871 progress.done() | |
872 | |
873 self.flushData() | |
874 | |
875 if self.verbosity > 0: | |
876 print "reference: %d sequences" % (self.nbTranscripts[self.REFERENCE]) | |
877 print "query: %d sequences" % (self.nbTranscripts[self.QUERY]) | |
878 if nbDistances == 0: | |
879 print "Nothing matches" | |
880 else: | |
881 print "min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(absDistances) | |
882 print "for %d distances (%.2f%%)" % (nbDistances, float(nbDistances) / self.nbTranscripts[self.QUERY] * 100) | |
883 | |
884 if self.strandedDistance: | |
885 return strandedDistances | |
886 return distances | |
887 | |
888 | |
889 def compareTranscriptListMerge(self): | |
890 """ | |
891 Merge the query list of transcript with itself | |
892 @return: the merged transcripts in a transcript list database | |
893 """ | |
894 nbMerges = 0 | |
895 | |
896 for type in (self.QUERY, self.REFERENCE): | |
897 self.storeTranscriptList(type, self.inputTranscriptContainers[type], True) | |
898 self.flushData() | |
899 | |
900 # Loop on the chromosomes | |
901 for chromosome in sorted(self.getTables(self.QUERY).keys()): | |
902 if chromosome not in self.getTables(self.REFERENCE): | |
903 continue | |
904 | |
905 # Get the size of the chromosome | |
906 maxEnd = 0 | |
907 nbChunks = 0 | |
908 for type in (self.QUERY, self.REFERENCE): | |
909 command = "SELECT MAX(end) from %s" % (self.getTables(type)[chromosome].getName()) | |
910 query = self.mySqlConnection.executeQuery(command) | |
911 maxEnd = max(maxEnd, int(query.getLine()[0])) | |
912 nbChunks = max(nbChunks, self.getTables(type)[chromosome].getNbElements()) | |
913 | |
914 mergedTranscripts = {} | |
915 transcripts = {self.QUERY: [], self.REFERENCE: []} | |
916 progress = Progress(nbChunks, "Analyzing %s" % (chromosome), self.verbosity-1) | |
917 for i in range(nbChunks): | |
918 rangeStart = int(i * (float(maxEnd) / nbChunks)) + 1 | |
919 rangeEnd = int((i+1) * (float(maxEnd) / nbChunks)) | |
920 | |
921 # Get all transcripts in query and reference from chunk | |
922 for type in (self.QUERY, self.REFERENCE): | |
923 correction = 0 if self.QUERY else self.maxDistance | |
924 command = "SELECT * FROM %s WHERE start <= %d" % (self.getTables(type)[chromosome].getName(), rangeEnd + correction) | |
925 for index, transcript in self.getTables(type)[chromosome].selectTranscripts(command): | |
926 transcripts[type].append(transcript) | |
927 | |
928 # Merge elements between the two samples | |
929 for iQuery, queryTranscript in enumerate(transcripts[self.QUERY]): | |
930 for iReference, referenceTranscript in enumerate(transcripts[self.REFERENCE]): | |
931 if referenceTranscript == None: continue | |
932 if self.compareTranscript(queryTranscript, referenceTranscript, True): | |
933 if queryTranscript.getDirection() != referenceTranscript.getDirection(): | |
934 referenceTranscript.setDirection(queryTranscript.getDirection()) | |
935 queryTranscript.merge(referenceTranscript, self.normalization) | |
936 nbMerges += 1 | |
937 transcripts[self.REFERENCE][iReference] = None | |
938 if not self.multiple: | |
939 mergedTranscripts[iQuery] = 0 | |
940 | |
941 # Remove transcripts from database | |
942 for type in (self.QUERY, self.REFERENCE): | |
943 correction = 0 if self.QUERY else self.maxDistance | |
944 command = "DELETE FROM %s WHERE start <= %d" % (self.getTables(type)[chromosome].getName(), rangeEnd - correction) | |
945 query = self.mySqlConnection.executeQuery(command) | |
946 | |
947 # Just in case, self-merge the elements in the query (beware of mergedTranscripts!) | |
948 if (self.multiple): | |
949 for iQuery1, queryTranscript1 in enumerate(transcripts[self.QUERY]): | |
950 if queryTranscript1 == None: continue | |
951 for iQuery2, queryTranscript2 in enumerate(transcripts[self.QUERY]): | |
952 if iQuery2 <= iQuery1 or queryTranscript2 == None: continue | |
953 minOverlap = self.minOverlap | |
954 if self.pcOverlap != None: | |
955 minOverlap = max(minOverlap, queryTranscript1.getSize() / 100.0 * self.pcOverlap) | |
956 if queryTranscript2.overlapWith(queryTranscript1, minOverlap) and (queryTranscript1.getDirection() == queryTranscript2.getDirection() or not self.colinear): | |
957 if queryTranscript1.getDirection() != queryTranscript2.getDirection(): | |
958 queryTranscript2.setDirection(queryTranscript1.getDirection()) | |
959 queryTranscript1.merge(queryTranscript2, self.normalization) | |
960 transcripts[self.QUERY][iQuery2] = None | |
961 nbMerges += 1 | |
962 if not self.multiple: | |
963 mergedTranscripts[iQuery1] = 0 | |
964 | |
965 # Update the sets of transcripts and write into database (also update mergedTranscripts) | |
966 newTranscripts = {self.QUERY: [], self.REFERENCE: []} | |
967 newMergedTranscripts = {} | |
968 for type in (self.QUERY, self.REFERENCE): | |
969 for i, transcript in enumerate(transcripts[type]): | |
970 if transcript == None: continue | |
971 correction = 0 if self.QUERY else self.maxDistance | |
972 if transcript.getEnd() < rangeEnd - correction: | |
973 if self.multiple or ((type == self.QUERY) and (i in mergedTranscripts)): | |
974 self.writeTranscript(transcripts[type][i]) | |
975 else: | |
976 if type == self.QUERY and i in mergedTranscripts: | |
977 newMergedTranscripts[len(newTranscripts[type])] = 0 | |
978 newTranscripts[type].append(transcript) | |
979 transcripts = newTranscripts | |
980 mergedTranscripts = newMergedTranscripts | |
981 | |
982 progress.inc() | |
983 progress.done() | |
984 | |
985 for type in (self.QUERY, self.REFERENCE): | |
986 for i, transcript in enumerate(transcripts[type]): | |
987 if transcripts == None: continue | |
988 if self.multiple or ((type == self.QUERY) and (i in mergedTranscripts)): | |
989 self.writeTranscript(transcripts[type][i]) | |
990 | |
991 # Manage chromosomes with no corresponding data | |
992 if self.multiple: | |
993 for type in self.INPUTTYPES: | |
994 for chromosome in self.getTables(type): | |
995 if chromosome in self.getTables(1 - type): | |
996 continue | |
997 for transcript in self.getTables(self.OUTPUT)[chromosome].getIterator(): | |
998 self.writeTranscript(transcript) | |
999 | |
1000 self.flushData() | |
1001 if self.writer != None: | |
1002 self.writer.close() | |
1003 self.writer = None | |
1004 | |
1005 if self.verbosity > 0: | |
1006 print "query: %d sequences" % (self.nbTranscripts[self.QUERY]) | |
1007 print "# merges: %d" % (nbMerges) | |
1008 print "# printed %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100) | |
1009 | |
1010 | |
1011 def compareTranscriptListSelfMerge(self): | |
1012 """ | |
1013 Merge the query list of transcript with itself | |
1014 @return: the merged transcripts in a transcript list database | |
1015 """ | |
1016 nbMerges = 0 | |
1017 distance = self.maxDistance if self.maxDistance != None else 0 | |
1018 | |
1019 self.addIndexes([self.QUERY]) | |
1020 self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True) | |
1021 self.flushData() | |
1022 | |
1023 # looping | |
1024 for chromosome1 in sorted(self.getTables(self.QUERY).keys()): | |
1025 transcripts2 = [] | |
1026 | |
1027 # get range of transcripts | |
1028 progress = Progress(self.getTables(self.QUERY)[chromosome1].getNbElements(), "Analyzing chromosome %s" % (chromosome1), self.verbosity-1) | |
1029 command = "SELECT * FROM %s ORDER BY start" % (self.getTables(self.QUERY)[chromosome1].getName()) | |
1030 for index1, transcript1 in self.getTables(self.QUERY)[chromosome1].selectTranscripts(command): | |
1031 | |
1032 # compare sets | |
1033 toBeRemoved = set() | |
1034 toBePrinted = set() | |
1035 for index2, transcript2 in enumerate(transcripts2): | |
1036 | |
1037 if self.compareTranscript(transcript1, transcript2, True): | |
1038 if transcript1.getDirection() != transcript2.getDirection(): | |
1039 transcript2.setDirection(transcript1.getDirection()) | |
1040 transcript1.merge(transcript2, self.normalization) | |
1041 toBeRemoved.add(index2) | |
1042 nbMerges += 1 | |
1043 elif transcript2.getEnd() + distance < transcript1.getStart(): | |
1044 toBePrinted.add(index2) | |
1045 transcripts2.append(transcript1) | |
1046 | |
1047 for index2 in sorted(toBePrinted): | |
1048 self.writeTranscript(transcripts2[index2]) | |
1049 transcripts2 = [transcripts2[index2] for index2 in range(len(transcripts2)) if index2 not in (toBeRemoved | toBePrinted)] | |
1050 | |
1051 for transcript2 in transcripts2: | |
1052 self.writeTranscript(transcript2) | |
1053 progress.done() | |
1054 self.getTables(self.QUERY)[chromosome1].remove() | |
1055 | |
1056 self.flushData() | |
1057 if self.writer != None: | |
1058 self.writer.close() | |
1059 self.writer = None | |
1060 | |
1061 if self.verbosity > 0: | |
1062 print "query: %d sequences" % (self.nbTranscripts[self.QUERY]) | |
1063 print "# merges: %d" % (nbMerges) | |
1064 print "# printed %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100) | |
1065 | |
1066 | |
1067 def getDifferenceTranscriptList(self): | |
1068 """ | |
1069 Get the elements of the first list which do not overlap the second list (at the nucleotide level) | |
1070 @return: the transcripts that overlap with the reference set | |
1071 """ | |
1072 distance = 0 if self.maxDistance == None else self.maxDistance | |
1073 | |
1074 self.addIndexes([self.QUERY, self.REFERENCE]) | |
1075 | |
1076 # export the container into tables | |
1077 self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True) | |
1078 self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True) | |
1079 | |
1080 # looping | |
1081 for chromosome1 in sorted(self.getTables(self.QUERY).keys()): | |
1082 # get range of transcripts | |
1083 command = "SELECT MIN(start), MAX(end), COUNT(id) FROM %s" % (self.getTables(self.QUERY)[chromosome1].getName()) | |
1084 query = self.mySqlConnection.executeQuery(command) | |
1085 result = query.getLine() | |
1086 first = result[0] | |
1087 last = result[1] | |
1088 nb = result[2] | |
1089 | |
1090 transcripts1 = [] | |
1091 transcripts2 = [] | |
1092 nbChunks = max(1, nb / 100) | |
1093 chunkSize = (last - first) / nbChunks | |
1094 progress = Progress(nbChunks + 1, "Analyzing chromosome %s" % (chromosome1), self.verbosity-1) | |
1095 for chunk in range(nbChunks + 1): | |
1096 | |
1097 # load transcripts | |
1098 start = first + chunk * chunkSize | |
1099 end = start + chunkSize - 1 | |
1100 command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.QUERY)[chromosome1].getName(), start, end-1) | |
1101 for index1, transcript1 in self.getTables(self.QUERY)[chromosome1].selectTranscripts(command): | |
1102 transcripts1.append(transcript1) | |
1103 command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.QUERY)[chromosome1].getName(), end) | |
1104 self.mySqlConnection.executeQuery(command) | |
1105 if chromosome1 in self.getTables(self.REFERENCE): | |
1106 command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), start-distance, end+distance-1) | |
1107 if chunk == 0: | |
1108 command = "SELECT * FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance) | |
1109 for index2, transcript2 in self.getTables(self.REFERENCE)[chromosome1].selectTranscripts(command): | |
1110 transcripts2.append(transcript2) | |
1111 command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance) | |
1112 self.mySqlConnection.executeQuery(command) | |
1113 | |
1114 # compare sets | |
1115 toBeRemoved1 = [] | |
1116 for index1, transcript1 in enumerate(transcripts1): | |
1117 newTranscript1 = Transcript() | |
1118 newTranscript1.copy(transcript1) | |
1119 for transcript2 in transcripts2: | |
1120 newTranscript1 = newTranscript1.getDifference(transcript2) | |
1121 if newTranscript1 == None: | |
1122 toBeRemoved1.append(index1) | |
1123 break | |
1124 transcripts1[index1] = newTranscript1 | |
1125 | |
1126 # check if query transcript extends bounds of the chunk | |
1127 if newTranscript1 != None and newTranscript1.getEnd() < end: | |
1128 if self.splitDifference: | |
1129 for exon in newTranscript1.getExons(): | |
1130 transcript = Transcript() | |
1131 transcript.copy(exon) | |
1132 self.writeTranscript(transcript) | |
1133 else: | |
1134 self.writeTranscript(newTranscript1) | |
1135 toBeRemoved1.append(index1) | |
1136 | |
1137 # update list of query transcripts | |
1138 for index1 in reversed(toBeRemoved1): | |
1139 del transcripts1[index1] | |
1140 | |
1141 # check if the reference transcripts extends bounds of the chunk | |
1142 toBeRemoved2 = [] | |
1143 for index2, transcript2 in enumerate(transcripts2): | |
1144 if transcript2.getEnd() + distance < end: | |
1145 toBeRemoved2.append(index2) | |
1146 for index2 in reversed(toBeRemoved2): | |
1147 del transcripts2[index2] | |
1148 | |
1149 progress.inc() | |
1150 | |
1151 for transcript1 in transcripts1: | |
1152 if self.splitDifference: | |
1153 for exon in transcript1.getExons(): | |
1154 transcript = Transcript() | |
1155 transcript.copy(exon) | |
1156 self.writeTranscript(transcript) | |
1157 else: | |
1158 self.writeTranscript(transcript1) | |
1159 progress.done() | |
1160 self.getTables(self.QUERY)[chromosome1].remove() | |
1161 if chromosome1 in self.getTables(self.REFERENCE): | |
1162 self.getTables(self.REFERENCE)[chromosome1].remove() | |
1163 self.getTables(self.WORKING)[chromosome1].remove() | |
1164 | |
1165 self.flushData() | |
1166 if self.writer != None: | |
1167 self.writer.close() | |
1168 self.writer = None | |
1169 | |
1170 if self.verbosity > 0: | |
1171 print "query: %d elements" % (self.nbTranscripts[self.QUERY]) | |
1172 print "reference: %d elements" % (self.nbTranscripts[self.REFERENCE]) | |
1173 print "# printed: %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100) | |
1174 | |
1175 | |
1176 def getOddsPerTranscript(self): | |
1177 """ | |
1178 Return overlap results | |
1179 @return a dict of data | |
1180 """ | |
1181 if not self.odds: | |
1182 raise Exception("Did not compute odds!") | |
1183 return self.overlapResults | |
1184 | |
1185 | |
1186 def getOdds(self): | |
1187 """ | |
1188 Return odds about the overlap | |
1189 @return a dict of data | |
1190 """ | |
1191 if not self.odds: | |
1192 raise Exception("Did not compute odds!") | |
1193 if self.oddResults != None: | |
1194 return self.oddResults | |
1195 self.oddResults = {} | |
1196 for name, value in self.overlapResults.iteritems(): | |
1197 self.oddResults[value] = self.oddResults.get(value, 0) + 1 | |
1198 return self.oddResults |