comparison smart_toolShed/SMART/Java/Python/structure/Interval.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e0f8dcca02ed
1 #
2 # Copyright INRA-URGI 2009-2010
3 #
4 # This software is governed by the CeCILL license under French law and
5 # abiding by the rules of distribution of free software. You can use,
6 # modify and/ or redistribute the software under the terms of the CeCILL
7 # license as circulated by CEA, CNRS and INRIA at the following URL
8 # "http://www.cecill.info".
9 #
10 # As a counterpart to the access to the source code and rights to copy,
11 # modify and redistribute granted by the license, users are provided only
12 # with a limited warranty and the software's author, the holder of the
13 # economic rights, and the successive licensors have only limited
14 # liability.
15 #
16 # In this respect, the user's attention is drawn to the risks associated
17 # with loading, using, modifying and/or developing or reproducing the
18 # software by the user in light of its specific status of free software,
19 # that may mean that it is complicated to manipulate, and that also
20 # therefore means that it is reserved for developers and experienced
21 # professionals having in-depth computer knowledge. Users are therefore
22 # encouraged to load and test the software's suitability as regards their
23 # requirements in conditions enabling the security of their systems and/or
24 # data to be ensured and, more generally, to use and operate it in the
25 # same conditions as regards security.
26 #
27 # The fact that you are presently reading this means that you have had
28 # knowledge of the CeCILL license and that you accept its terms.
29 #
30
31 from SMART.Java.Python.structure.Bins import *
32 from commons.core.coord.Range import Range
33
34 class Interval(Range):
35 """
36 Store a genomic interval
37 @ivar name: name of the interval [optional]
38 @type name: string
39 @ivar id: id of the interval [optional]
40 @type id: int
41 @ivar bin: bin in which the interval should be if stored in a database [computed]
42 @type bin: int
43 @ival tags: information about the transcript [optional]
44 @type tags: dict
45 @ivar verbosity: verbosity
46 @type verbosity: int [default: 0]
47 """
48
49 def __init__(self, interval = None, verbosity = 0):
50 """
51 Constructor
52 @param interval: interval to be copied
53 @type interval: class L{Interval<Interval>}
54 @param verbosity: verbosity
55 @type verbosity: int
56 """
57 Range.__init__(self)
58 self.name = None
59 self.id = None
60 self.bin = None
61 self.verbosity = verbosity
62 self.tags = {}
63 if interval != None:
64 self.copy(interval)
65
66 #!!!! Warning: two methods getStart() and getEnd() give the information maximum and minimum in interval.!!!!#
67 #In case strand = "+", start < end; strand = "-", start > end
68 def getStart(self):
69 if self.start == -1:
70 return -1
71 if self.end == -1:
72 return self.start
73 return self.getMin()
74
75
76 def getEnd(self):
77 if self.end == -1:
78 return -1
79 if self.start == -1:
80 return self.end
81 return self.getMax()
82
83
84 def getChromosome(self):
85 return self.getSeqname()
86
87
88 def getDirection(self):
89 return 1 if self.getStrand() == "+" else -1
90
91
92 def getName(self):
93 return self.name
94
95
96 def isSet(self):
97 """
98 Check if the interval is set
99 """
100 return self.getStart() == None and self.getEnd() == None
101
102
103 def copy(self, interval):
104 """
105 Copy method
106 @param interval: interval to be copied
107 @type interval: class L{Interval<Interval>}
108 """
109 self.setStart(interval.getStart())
110 self.setEnd(interval.getEnd())
111 self.setChromosome(interval.getChromosome())
112 self.setDirection(interval.getDirection())
113 self.name = interval.name
114 self.id = interval.id
115 self.bin = interval.bin
116 self.tags = {}
117 for tag in interval.tags:
118 self.tags[tag] = interval.tags[tag]
119 self.verbosity = interval.verbosity
120
121
122 def setName(self, name):
123 """
124 Set the name
125 @param name: name of the interval
126 @type name: string
127 """
128 if len(name) > 100:
129 name = name[:100]
130 self.name = name
131
132
133 def setChromosome(self, chromosome=""):
134 """
135 Set the chromosome
136 @param chromosome: chromosome on which the interval is
137 @type chromosome: string
138 """
139 if not chromosome:
140 self.seqname = None
141 else:
142 self.seqname = chromosome.replace(".", "_").replace("|", "_")
143
144
145 def setStart(self, start):
146 """
147 Set the start point
148 Possibly reset bin
149 @param start: start point of the interval
150 @type start: int
151 """
152 self.bin = None
153 direction = self.getDirection()
154 if self.start == -1:
155 self.start = start
156 elif self.end == -1:
157 self.end = start
158 else:
159 if direction == 1:
160 self.start = start
161 else:
162 self.end = start
163 if direction == 1:
164 self.start, self.end = min(self.start, self.end), max(self.start, self.end)
165 else:
166 self.start, self.end = max(self.start, self.end), min(self.start, self.end)
167
168
169 def setEnd(self, end):
170 """
171 Set the end point
172 Possibly reset bin
173 @param end: end point of the interval of the interval
174 @type end: int
175 """
176 self.bin = None
177 direction = self.getDirection()
178 if self.end == -1:
179 self.end = end
180 elif self.start == -1:
181 self.start = end
182 else:
183 if direction == 1:
184 self.end = end
185 else:
186 self.start = end
187 if direction == 1:
188 self.start, self.end = min(self.start, self.end), max(self.start, self.end)
189 else:
190 self.start, self.end = max(self.start, self.end), min(self.start, self.end)
191
192
193 def setSize(self, size):
194 """
195 Possibly modify the end point
196 @param size: size of the transcript
197 @type size: int
198 """
199 if self.end == None and self.start != None:
200 self.setEnd(self.start + self.getSize() - 1)
201 elif self.start == None and self.end != None:
202 self.setStart(self.end - self.getSize() + 1)
203
204
205 def getSize(self):
206 """
207 Get the size
208 """
209 return self.getEnd() - self.getStart() + 1
210
211
212 def _setDirection(self, direction):
213 """
214 Set the direction of the interval (connection to Range)
215 @param direction: direction of the transcript (+ / -)
216 @type direction: int (1 or -1)
217 """
218 if direction * self.getDirection() < 0:
219 self.reverse()
220
221
222 def setDirection(self, direction):
223 """
224 Set the direction of the interval
225 Possibly parse different formats
226 @param direction: direction of the transcript (+ / -)
227 @type direction: int or string
228 """
229 if type(direction).__name__ == 'int':
230 self._setDirection(direction / abs(direction))
231 elif type(direction).__name__ == 'str':
232 if direction == "+":
233 self._setDirection(1)
234 elif direction == "-":
235 self._setDirection(-1)
236 elif direction == "1" or direction == "-1":
237 self._setDirection(int(direction))
238 elif direction.lower() == "plus":
239 self._setDirection(1)
240 elif direction.lower() == "minus":
241 self._setDirection(-1)
242 else:
243 raise Exception("Cannot understand direction %s" % (direction))
244 else:
245 raise Exception("Cannot understand direction %s" % (direction))
246
247
248 def extendStart(self, size):
249 """
250 Extend the interval by the 5' end
251 @param size: the size to be exended
252 @type size: int
253 """
254 if self.getDirection() == 1:
255 self.setStart(max(0, self.getStart() - size))
256 else:
257 self.setEnd(self.getEnd() + size)
258 self.bin = None
259
260
261 def extendEnd(self, size):
262 """
263 Extend the interval by the 3' end
264 @param size: the size to be exended
265 @type size: int
266 """
267 if self.getDirection() == 1:
268 self.setEnd(self.getEnd() + size)
269 else:
270 self.setStart(max(0, self.getStart() - size))
271 self.bin = None
272
273
274 def restrictStart(self, size = 1):
275 """
276 Restrict the interval by some nucleotides, start from its start position
277 Remove the exons
278 @param size: the size to be restricted to
279 @type size: int
280 """
281 if self.getDirection() == 1:
282 self.setEnd(min(self.getEnd(), self.getStart() + size - 1))
283 else:
284 self.setStart(max(self.getStart(), self.getEnd() - size + 1))
285 self.bin = None
286
287
288 def restrictEnd(self, size = 1):
289 """
290 Restrict the interval by some nucleotides, end from its end position
291 Remove the exons
292 @param size: the size to be restricted to
293 @type size: int
294 """
295 if self.getDirection() == 1:
296 self.setStart(max(self.getStart(), self.getEnd() - size + 1))
297 else:
298 self.setEnd(min(self.getEnd(), self.getStart() + size - 1))
299 self.bin = None
300
301
302
303 def setTagValue(self, name, value):
304 """
305 Set a tag
306 @param name: name of the tag
307 @type name: string
308 @param value: value of the tag
309 @type value: int or string
310 """
311 self.tags[name] = value
312
313
314 def getTagNames(self):
315 """
316 Get all the names of the tags
317 """
318 return self.tags.keys()
319
320
321 def getTagValue(self, tag):
322 """
323 Get the value of a tag
324 @param tag: name of a tag
325 @type tag: string
326 """
327 if tag not in self.tags:
328 return None
329 return self.tags[tag]
330
331
332 def getTagValues(self, tagSep = "; ", fieldSep = " ", surrounder = ""):
333 """
334 Get the formatted tag values
335 @param tagSep: separator between tags
336 @type tagSep: string
337 @param fieldSep: separator between tag name and tag value
338 @type fieldSep: string
339 @param surrounder: string which optionally surround values
340 @type surrounder: string
341 """
342 tags = []
343 for name in self.tags:
344 value = self.tags[name]
345 if value == None:
346 continue
347 if isinstance(value, basestring):
348 tags.append("%s%s%s%s%s" % (name, fieldSep, surrounder, self.tags[name], surrounder))
349 elif type(value) is int:
350 tags.append("%s%s%s%i%s" % (name, fieldSep, surrounder, self.tags[name], surrounder))
351 elif type(value) is float:
352 tags.append("%s%s%s%f%s" % (name, fieldSep, surrounder, self.tags[name], surrounder))
353 else:
354 raise Exception("Do not know how to print '" + value + "'.")
355 if self.getName() != None:
356 tags.append("%s%s%s%s%s" % ("Name", fieldSep, surrounder, self.getName(), surrounder))
357 return tagSep.join(tags)
358
359
360 def setTagValues(self, tags, tagSep = "; ", fieldSep = " "):
361 """
362 Set the tag values using given string
363 @param tags: the tags, concatenated
364 @type tags: string
365 @param tagSep: separator between tags
366 @type tagSep: string
367 @param fieldSep: separator between tag name and tag value
368 @type fieldSep: string
369 """
370 if tags == "":
371 self.tags = {}
372 return
373 for splittedTag in tags.split(tagSep):
374 if fieldSep not in splittedTag:
375 raise Exception("Weird field '%s' in tags '%s'" % (splittedTag, tags))
376 tag, value = splittedTag.split(fieldSep, 1)
377 if tag == "Name":
378 self.setName(value)
379 continue
380 try:
381 intValue = int(value)
382 self.tags[tag] = intValue
383 except ValueError:
384 try:
385 floatValue = float(value)
386 self.tags[tag] = floatValue
387 except ValueError:
388 self.tags[tag] = value
389
390
391 def deleteTag(self, tag):
392 """
393 Remove a tag
394 @param tag: the tag to be removed
395 @type tag: string
396 """
397 if tag in self.tags:
398 del self.tags[tag]
399
400
401 def setNbOccurrences(self, nbOccurrences):
402 """
403 Set the number of occurrences of the interval
404 @param nbOccurrences: number of occurrences of the interval
405 @type nbOccurrences: int
406 """
407 self.setTagValue("nbOccurrences", nbOccurrences)
408
409
410 def setOccurrence(self, occurrence):
411 """
412 Set the occurrence of this interval
413 @param occurrence: an occurrence for this transcript
414 @type occurrence: int
415 """
416 self.setTagValue("occurrence", occurrence)
417
418 def __eq__(self, interval):
419 """
420 Whether two intervals are equal (start and end at same position)
421 @param interval: object to be compared to
422 @type interval: class L{Interval<Interval>}
423 """
424 if not interval:
425 return False
426 return self.getChromosome() == interval.getChromosome() and self.getStart() == interval.getStart() and self.getEnd() == interval.getEnd() and self.getDirection() == interval.getDirection()
427
428
429 def overlapWith(self, interval, nbNucleotides = 1):
430 """
431 Whether two intervals overlap
432 @param interval: object to be compared to
433 @type interval: class L{Interval<Interval>}
434 @param nbNucleotides: minimum number of nucleotides to declare and overlap
435 @type nbNucleotides: int
436 """
437 if self.getChromosome() != interval.getChromosome():
438 return False
439 return (min(self.getEnd(), interval.getEnd()) - max(self.getStart(), interval.getStart()) + 1 >= nbNucleotides)
440
441 def isIncludeIn(self, interval):
442 return interval.include(self)
443
444
445 def include(self, interval):
446 """
447 Whether this interval includes the other one
448 @param interval: object to be compared to
449 @type interval: class L{Interval<Interval>}
450 """
451 if self.getChromosome() != interval.getChromosome():
452 return False
453 return ((self.getStart() <= interval.getStart()) and (self.getEnd() >= interval.getEnd()))
454
455
456 def getDifference(self, interval, sameStrand = False):
457 """
458 Get the difference between this cluster and another one
459 @param interval: object to be compared to
460 @type interval: class L{Interval<Interval>}
461 @param sameStrand: do the comparison iff the intervals are on the same strand
462 @type sameStrand: boolean
463 @return: a (possibly empty) list of intervals
464 """
465 newInterval = Interval()
466 newInterval.copy(self)
467 if self.getChromosome() != interval.getChromosome():
468 return [newInterval]
469 if not self.overlapWith(interval):
470 return [newInterval]
471 if sameStrand and self.getDirection() != interval.getDirection():
472 return [newInterval]
473 intervals = []
474 if self.getStart() < interval.getStart():
475 newInterval = Interval()
476 newInterval.copy(self)
477 newInterval.setEnd(min(self.getEnd(), interval.getStart() - 1))
478 intervals.append(newInterval)
479 if self.getEnd() > interval.getEnd():
480 newInterval = Interval()
481 newInterval.copy(self)
482 newInterval.setStart(max(self.getStart(), interval.getEnd() + 1))
483 intervals.append(newInterval)
484 return intervals
485
486
487 def getIntersection(self, interval):
488 """
489 Get the intersection between this interval and another one
490 @param interval: object to be compared to
491 @type interval: class L{Interval<Interval>}
492 @return: an other interval
493 """
494 if not self.overlapWith(interval):
495 return None
496 newInterval = Interval()
497 newInterval.setChromosome(self.getChromosome())
498 newInterval.setDirection(self.getDirection())
499 newInterval.setName("%s_intersect_%s" % (self.getName(), interval.getName()))
500 newInterval.setStart(max(self.getStart(), interval.getStart()))
501 newInterval.setEnd(min(self.getEnd(), interval.getEnd()))
502 return newInterval
503
504
505 def getDistance(self, interval):
506 """
507 Get the distance between two intervals (a non-negative value)
508 @param interval: another interval
509 @type interval: class L{Interval<Interval>}
510 """
511 if self.overlapWith(interval):
512 return 0
513 if self.getChromosome() != interval.getChromosome():
514 raise Exception("Cannot get the distance between %s and %s" % (str(self), str(interval)))
515 return min(abs(self.getStart() - interval.getEnd()), abs(self.getEnd() - interval.getStart()))
516
517
518 def getRelativeDistance(self, interval):
519 """
520 Get the distance between two intervals (negative if first interval is before)
521 @param interval: another interval
522 @type interval: class L{Interval<Interval>}
523 """
524 if self.overlapWith(interval):
525 return 0
526 if self.getChromosome() != interval.getChromosome():
527 raise Exception("Cannot get the distance between %s and %s" % (str(self), str(interval)))
528 if self.getEnd() < interval.getStart():
529 distance = interval.getStart() - self.getEnd()
530 else:
531 distance = interval.getEnd() - self.getStart()
532 distance *= self.getDirection()
533 return distance
534
535
536 def merge(self, interval, normalization = False):
537 """
538 Merge two intervals
539 @param interval: another interval
540 @type interval: class L{Interval<Interval>}
541 @param normalization: whether the sum of the merge should be normalized wrt the number of mappings of each elements
542 @type normalization: boolean
543 """
544 if self.getChromosome() != interval.getChromosome():
545 raise Exception("Cannot merge '%s' and '%s' for they are on different chromosomes." % (str(self), str(interval)))
546 direction = None
547 if self.getStart() == self.getEnd():
548 direction = interval.getDirection()
549 elif interval.getStart() == interval.getEnd():
550 direction = self.getDirection()
551 elif self.getDirection() != interval.getDirection():
552 raise Exception("Cannot merge '%s' and '%s' for they are on different strands." % (str(self), str(interval)))
553 self.setStart(min(self.getStart(), interval.getStart()))
554 self.setEnd(max(self.getEnd(), interval.getEnd()))
555 if direction != None:
556 self.setDirection(direction)
557 nbElements = 0.0
558 for element in (self, interval):
559 for tagName in ("nbElements", "nbOccurrences"):
560 if tagName not in element.getTagNames():
561 element.setTagValue(tagName, 1)
562 nbElements += float(element.getTagValue("nbElements")) / float(element.getTagValue("nbOccurrences")) if normalization else float(element.getTagValue("nbElements"))
563 self.setTagValue("nbElements", nbElements)
564 self.bin = None
565 for tagName in ("identity", "nbOccurrences", "occurrence", "nbMismatches", "nbGaps", "rank", "evalue", "bestRegion"):
566 if tagName in self.getTagNames():
567 del self.tags[tagName]
568
569
570 def getBin(self):
571 """
572 Get the bin of the interval
573 Computed on the fly
574 """
575 if self.bin == None:
576 self.bin = getBin(self.getStart(), self.getEnd())
577 return self.bin
578
579
580 def getBins(self):
581 """
582 Get all the bin this interval could fall into
583 """
584 return getOverlappingBins(self.getStart(), self.getEnd())
585
586
587 def getSqlVariables(cls):
588 """
589 Get the properties of the object that should be saved in a database
590 """
591 variables = ["name", "chromosome", "start", "end", "direction", "tags", "bin"]
592 return variables
593 getSqlVariables = classmethod(getSqlVariables)
594
595
596 def setSqlValues(self, array):
597 """
598 Set the values of the properties of this object as given by a results line of a SQL query
599 """
600 self.id = array[0]
601 self.name = array[1].strip("'")
602 self.setChromosome(array[2].strip("'"))
603 self.setStart(array[3])
604 self.setEnd(array[4])
605 self.setDirection(array[5])
606 self.setTagValues(array[6].strip("'"), ";", "=")
607 self.bin = array[7]
608
609
610 def getSqlValues(self):
611 """
612 Get the values of the properties that should be saved in a database
613 """
614 values = dict()
615 values["name"] = self.name
616 values["chromosome"] = self.getChromosome()
617 values["start"] = self.getStart()
618 values["end"] = self.getEnd()
619 values["direction"] = self.getDirection()
620 values["tags"] = self.getTagValues(";", "=")
621 values["bin"] = self.getBin()
622 return values
623
624
625 def getSqlTypes(cls):
626 """
627 Get the values of the properties that should be saved in a database
628 """
629 types = dict()
630 types["name"] = "varchar"
631 types["chromosome"] = "varchar"
632 types["start"] = "int"
633 types["end"] = "int"
634 types["direction"] = "tinyint"
635 types["tags"] = "varchar"
636 types["bin"] = "int"
637 return types
638 getSqlTypes = classmethod(getSqlTypes)
639
640
641 def getSqlSizes(cls):
642 """
643 Get the sizes of the properties that should be saved in a database
644 """
645 sizes = dict()
646 sizes["name"] = 255
647 sizes["chromosome"] = 255
648 sizes["start"] = 11
649 sizes["end"] = 11
650 sizes["direction"] = 4
651 sizes["tags"] = 1023
652 sizes["bin"] = 11
653 return sizes
654 getSqlSizes = classmethod(getSqlSizes)
655
656
657 def printCoordinates(self):
658 """
659 Print start and end positions (depending on the direction of the interval)
660 """
661 if self.getDirection() == 1:
662 return "%d-%d" % (self.getStart(), self.getEnd())
663 else:
664 return "%d-%d" % (self.getEnd(), self.getStart())
665
666
667 def extractSequence(self, parser):
668 """
669 Get the sequence corresponding to this interval
670 @param parser: a parser to a FASTA file
671 @type parser: class L{SequenceListParser<SequenceListParser>}
672 @return : a instance of L{Sequence<Sequence>}
673 """
674 return parser.getSubSequence(self.getChromosome(), self.getStart(), self.getEnd(), self.getDirection(), self.name)
675
676
677 def extractWigData(self, parser):
678 """
679 Get the data retrieved from a wig file
680 @param parser: a parser class to a WIG file
681 @type parser: class L{WigParser<WigParser>}
682 """
683 data = parser.getRange(self.getChromosome(), self.getStart(), self.getEnd())
684 if self.getDirection() == -1:
685 if parser.strands:
686 newData = {}
687 for strand in data:
688 data[strand].reverse()
689 newData[-strand] = data[strand]
690 data = newData
691 else:
692 data.reverse()
693 return data
694
695
696 def __str__(self):
697 """
698 Output a simple representation of this interval
699 """
700 direction = "+"
701 if self.getDirection() == -1:
702 direction = "-"
703 string = "%s:%d-%d (%s)" % (self.getChromosome(), self.getStart(), self.getEnd(), direction)
704 if self.name != "":
705 string = "(%s) %s" % (self.name, string)
706 return string
707