6
+ − 1 #
+ − 2 # Copyright INRA-URGI 2009-2010
+ − 3 #
+ − 4 # This software is governed by the CeCILL license under French law and
+ − 5 # abiding by the rules of distribution of free software. You can use,
+ − 6 # modify and/ or redistribute the software under the terms of the CeCILL
+ − 7 # license as circulated by CEA, CNRS and INRIA at the following URL
+ − 8 # "http://www.cecill.info".
+ − 9 #
+ − 10 # As a counterpart to the access to the source code and rights to copy,
+ − 11 # modify and redistribute granted by the license, users are provided only
+ − 12 # with a limited warranty and the software's author, the holder of the
+ − 13 # economic rights, and the successive licensors have only limited
+ − 14 # liability.
+ − 15 #
+ − 16 # In this respect, the user's attention is drawn to the risks associated
+ − 17 # with loading, using, modifying and/or developing or reproducing the
+ − 18 # software by the user in light of its specific status of free software,
+ − 19 # that may mean that it is complicated to manipulate, and that also
+ − 20 # therefore means that it is reserved for developers and experienced
+ − 21 # professionals having in-depth computer knowledge. Users are therefore
+ − 22 # encouraged to load and test the software's suitability as regards their
+ − 23 # requirements in conditions enabling the security of their systems and/or
+ − 24 # data to be ensured and, more generally, to use and operate it in the
+ − 25 # same conditions as regards security.
+ − 26 #
+ − 27 # The fact that you are presently reading this means that you have had
+ − 28 # knowledge of the CeCILL license and that you accept its terms.
+ − 29 #
+ − 30 import sys
+ − 31 from SMART.Java.Python.structure.Interval import Interval
+ − 32 from SMART.Java.Python.structure.Sequence import Sequence
+ − 33
+ − 34
+ − 35 class Transcript(Interval):
+ − 36 """
+ − 37 A class that models an transcript, considered as a specialized interval (the bounds of the transcript) that contains exons (also represented as intervals)
+ − 38 @ivar exons: a list of exons (intervals)
+ − 39 @type exons: list of L{Interval{Interval}}
+ − 40 """
+ − 41
+ − 42 def __init__(self, transcript = None, verbosity = 0):
+ − 43 """
+ − 44 Constructor
+ − 45 @param transcript: transcript to be copied
+ − 46 @type transcript: class L{Transcript<Transcript>}
+ − 47 @param verbosity: verbosity
+ − 48 @type verbosity: int
+ − 49 """
+ − 50 super(Transcript, self).__init__(None, verbosity)
+ − 51 self.exons = []
+ − 52 self.introns = None
+ − 53 if transcript != None:
+ − 54 self.copy(transcript)
+ − 55
+ − 56
+ − 57 def copy(self, transcript):
+ − 58 """
+ − 59 Copy method
+ − 60 @param transcript: transcript to be copied
+ − 61 @type transcript: class L{Transcript<Transcript>} or L{Interval<Interval>}
+ − 62 """
+ − 63 super(Transcript, self).copy(transcript)
+ − 64 if transcript.__class__.__name__ == "Transcript":
+ − 65 exons = transcript.getExons()
+ − 66 if len(exons) > 1:
+ − 67 for exon in exons:
+ − 68 exonCopy = Interval(exon)
+ − 69 self.addExon(exonCopy)
+ − 70
+ − 71
+ − 72 def setDirection(self, direction):
+ − 73 """
+ − 74 Set the direction of the interval
+ − 75 Possibly parse different formats
+ − 76 Impact all exons
+ − 77 @param direction: direction of the transcript (+ / -)
+ − 78 @type direction: int or string
+ − 79 """
+ − 80 super(Transcript, self).setDirection(direction)
+ − 81 for exon in self.exons:
+ − 82 exon.setDirection(direction)
+ − 83
+ − 84
+ − 85 def setChromosome(self, chromosome):
+ − 86 """
+ − 87 Set the chromosome
+ − 88 @param chromosome: chromosome on which the transcript is
+ − 89 @type chromosome: string
+ − 90 """
+ − 91 super(Transcript, self).setChromosome(chromosome)
+ − 92 for exon in self.exons:
+ − 93 exon.setChromosome(chromosome)
+ − 94
+ − 95
+ − 96 def addExon(self, exon):
+ − 97 """
+ − 98 Add an exon to the list of exons
+ − 99 @param exon: a new exon
+ − 100 @type exon: class L{Interval<Interval>}
+ − 101 """
+ − 102 if not self.exons and not exon.overlapWith(self):
+ − 103 firstExon = Interval()
+ − 104 firstExon.setStart(self.getStart())
+ − 105 firstExon.setEnd(self.getEnd())
+ − 106 firstExon.setDirection(self.getDirection())
+ − 107 firstExon.setChromosome(self.getChromosome())
+ − 108 self.exons.append(firstExon)
+ − 109 newExon = Interval(exon)
+ − 110 newExon.setDirection(self.getDirection())
+ − 111 self.exons.append(newExon)
+ − 112 if newExon.getStart() < self.getStart():
+ − 113 self.setStart(newExon.getStart())
+ − 114 if newExon.getEnd() > self.getEnd():
+ − 115 self.setEnd(newExon.getEnd())
+ − 116
+ − 117
+ − 118 def setStart(self, start):
+ − 119 """
+ − 120 Set the new start, move the first exon accordingly (if exists)
+ − 121 @param start: the new start
+ − 122 @type start: int
+ − 123 """
+ − 124 super(Transcript, self).setStart(start)
+ − 125 if self.exons:
+ − 126 self.sortExonsIncreasing()
+ − 127 self.exons[0].setStart(start)
+ − 128
+ − 129
+ − 130 def setEnd(self, end):
+ − 131 """
+ − 132 Set the new end, move the last exon accordingly (if exists)
+ − 133 @param end: the new end
+ − 134 @type end: int
+ − 135 """
+ − 136 super(Transcript, self).setEnd(end)
+ − 137 if self.exons:
+ − 138 self.sortExonsIncreasing()
+ − 139 self.exons[-1].setEnd(end)
+ − 140
+ − 141
+ − 142 def reverse(self):
+ − 143 """
+ − 144 Reverse the strand of the transcript
+ − 145 """
+ − 146 super(Transcript, self).reverse()
+ − 147 for exon in self.exons:
+ − 148 exon.reverse()
+ − 149
+ − 150
+ − 151 def getUniqueName(self):
+ − 152 """
+ − 153 Try to give a unique name by possibly adding occurrence
+ − 154 """
+ − 155 if "nbOccurrences" in self.tags and "occurrence" in self.tags and self.tags["nbOccurrences"] != 1:
+ − 156 return "%s-%d" % (self.name, self.tags["occurrence"])
+ − 157 return self.name
+ − 158
+ − 159
+ − 160 def getNbExons(self):
+ − 161 """
+ − 162 Get the number of exons
+ − 163 """
+ − 164 return max(1, len(self.exons))
+ − 165
+ − 166
+ − 167 def getExon(self, i):
+ − 168 """
+ − 169 Get a specific exon
+ − 170 @param i: the rank of the exon
+ − 171 @type i: int
+ − 172 """
+ − 173 if len(self.exons) == 0:
+ − 174 if i != 0:
+ − 175 raise Exception("Cannot get exon #%i while there is no exon in the transcript" % (i))
+ − 176 return self
+ − 177 return self.exons[i]
+ − 178
+ − 179
+ − 180 def getExons(self):
+ − 181 """
+ − 182 Get all the exons
+ − 183 """
+ − 184 if len(self.exons) == 0:
+ − 185 return [Interval(self)]
+ − 186 return self.exons
+ − 187
+ − 188
+ − 189 def getIntrons(self):
+ − 190 """
+ − 191 Get all the introns
+ − 192 Compute introns on the fly
+ − 193 """
+ − 194 if self.introns != None:
+ − 195 return self.introns
+ − 196 self.sortExons()
+ − 197 self.introns = []
+ − 198 exonStart = self.getExon(0)
+ − 199 for cpt, exonEnd in enumerate(self.exons[1:]):
+ − 200 intron = Interval()
+ − 201 intron.setName("%s_intron%d" % (self.getName(), cpt+1))
+ − 202 intron.setChromosome(self.getChromosome())
+ − 203 intron.setDirection(self.getDirection())
+ − 204 if self.getDirection() == 1:
+ − 205 intron.setEnd(exonEnd.getStart() - 1)
+ − 206 intron.setStart(exonStart.getEnd() + 1)
+ − 207 else:
+ − 208 intron.setStart(exonEnd.getEnd() + 1)
+ − 209 intron.setEnd(exonStart.getStart() - 1)
+ − 210 intron.setDirection(self.getDirection())
+ − 211 if intron.getSize() > 0:
+ − 212 self.introns.append(intron)
+ − 213 exonStart = exonEnd
+ − 214 intron.setSize(intron.getEnd() - intron.getStart() + 1)
+ − 215 return self.introns
+ − 216
+ − 217
+ − 218 def getSize(self):
+ − 219 """
+ − 220 Get the size of the transcript (i.e. the number of nucleotides)
+ − 221 Compute size on the fly
+ − 222 """
+ − 223 if len(self.exons) == 0:
+ − 224 return self.getSizeWithIntrons()
+ − 225 size = 0
+ − 226 for exon in self.exons:
+ − 227 size += exon.getSize()
+ − 228 return size
+ − 229
+ − 230
+ − 231 def getSizeWithIntrons(self):
+ − 232 """
+ − 233 Get the size of the interval (i.e. distance from start to end)
+ − 234 """
+ − 235 return super(Transcript, self).getSize()
+ − 236
+ − 237
+ − 238 def overlapWithExon(self, transcript, nbNucleotides = 1):
+ − 239 """
+ − 240 Check if the exons of this transcript overlap with the exons of another transcript
+ − 241 @param transcript: transcript to be compared to
+ − 242 @type transcript: class L{Transcript<Transcript>}
+ − 243 @param nbNucleotides: minimum number of nucleotides to declare and overlap
+ − 244 @type nbNucleotides: int
+ − 245 """
+ − 246 if not self.overlapWith(transcript, nbNucleotides):
+ − 247 return False
+ − 248 for thisExon in self.getExons():
+ − 249 for thatExon in transcript.getExons():
+ − 250 if thisExon.overlapWith(thatExon, nbNucleotides):
+ − 251 return True
+ − 252 return False
+ − 253
+ − 254
+ − 255 def include(self, transcript):
+ − 256 """
+ − 257 Whether this transcript includes the other one
+ − 258 @param transcript: object to be compared to
+ − 259 @type transcript: class L{Transcript<Transcript>}
+ − 260 """
+ − 261 if not super(Transcript, self).include(transcript):
+ − 262 return False
+ − 263 for thatExon in transcript.getExons():
+ − 264 for thisExon in self.getExons():
+ − 265 if thisExon.include(thatExon):
+ − 266 break
+ − 267 else:
+ − 268 return False
+ − 269 return True
+ − 270
+ − 271
+ − 272 def merge(self, transcript, normalization = False):
+ − 273 """
+ − 274 Merge with another transcript
+ − 275 Merge exons if they overlap, otherwise add exons
+ − 276 @param transcript: transcript to be merged to
+ − 277 @type transcript: class L{Transcript<Transcript>}
+ − 278 @param normalization: whether the sum of the merge should be normalized wrt the number of mappings of each elements
+ − 279 @type normalization: boolean
+ − 280 """
+ − 281 if self.getChromosome() != transcript.getChromosome() or self.getDirection() != transcript.getDirection():
+ − 282 raise Exception("Cannot merge '%s' with '%s'!" % (self, transcript))
+ − 283
+ − 284 theseExons = self.getExons()
+ − 285 thoseExons = transcript.getExons()
+ − 286
+ − 287 for thatExon in thoseExons:
+ − 288 toBeRemoved = []
+ − 289 for thisIndex, thisExon in enumerate(theseExons):
+ − 290 if thisExon.overlapWith(thatExon):
+ − 291 thatExon.merge(thisExon)
+ − 292 toBeRemoved.append(thisIndex)
+ − 293 theseExons.append(thatExon)
+ − 294 for thisIndex in reversed(toBeRemoved):
+ − 295 del theseExons[thisIndex]
+ − 296 self.removeExons()
+ − 297 self.setStart(min(self.getStart(), transcript.getStart()))
+ − 298 self.setEnd(max(self.getEnd(), transcript.getEnd()))
+ − 299 if len(theseExons) > 1:
+ − 300 for thisExon in theseExons:
+ − 301 self.addExon(thisExon)
+ − 302
+ − 303 self.setName("%s--%s" % (self.getUniqueName(), transcript.getUniqueName()))
+ − 304 super(Transcript, self).merge(transcript, normalization)
+ − 305
+ − 306
+ − 307 def getDifference(self, transcript, sameStrand = False):
+ − 308 """
+ − 309 Get the difference between this cluster and another one
+ − 310 @param transcript: object to be compared to
+ − 311 @type transcript: class L{Transcript<Transcript>}
+ − 312 @param sameStrand: do the comparison iff the transcripts are on the same strand
+ − 313 @type sameStrand: boolean
+ − 314 @return: a transcript
+ − 315 """
+ − 316 newTranscript = Transcript()
+ − 317 newTranscript.copy(self)
+ − 318 if self.getChromosome() != transcript.getChromosome():
+ − 319 return newTranscript
+ − 320 if not self.overlapWith(transcript):
+ − 321 return newTranscript
+ − 322 if sameStrand and self.getDirection() != transcript.getDirection():
+ − 323 return newTranscript
+ − 324 newTranscript.removeExons()
+ − 325 if transcript.getEnd() > newTranscript.getStart():
+ − 326 newTranscript.setStart(transcript.getEnd() + 1)
+ − 327 if transcript.getStart() < newTranscript.getEnd():
+ − 328 newTranscript.setEnd(transcript.getStart() + 1)
+ − 329 theseExons = []
+ − 330 for exon in self.getExons():
+ − 331 exonCopy = Interval()
+ − 332 exonCopy.copy(exon)
+ − 333 theseExons.append(exonCopy)
+ − 334 for thatExon in transcript.getExons():
+ − 335 newExons = []
+ − 336 for thisExon in theseExons:
+ − 337 newExons.extend(thisExon.getDifference(thatExon))
+ − 338 theseExons = newExons
+ − 339 if not theseExons:
+ − 340 return None
+ − 341 newStart, newEnd = theseExons[0].getStart(), theseExons[0].getEnd()
+ − 342 for thisExon in theseExons[1:]:
+ − 343 newStart = min(newStart, thisExon.getStart())
+ − 344 newEnd = max(newEnd, thisExon.getEnd())
+ − 345 newTranscript.setEnd(newEnd)
+ − 346 newTranscript.setStart(newStart)
+ − 347 newTranscript.exons = theseExons
+ − 348 return newTranscript
+ − 349
+ − 350
+ − 351 def getSqlVariables(cls):
+ − 352 """
+ − 353 Get the properties of the object that should be saved in a database
+ − 354 """
+ − 355 variables = Interval.getSqlVariables()
+ − 356 variables.append("exons")
+ − 357 return variables
+ − 358 getSqlVariables = classmethod(getSqlVariables)
+ − 359
+ − 360
+ − 361 def setSqlValues(self, array):
+ − 362 """
+ − 363 Set the values of the properties of this object as given by a results line of a SQL query
+ − 364 @param array: the values to be copied
+ − 365 @type array: a list
+ − 366 """
+ − 367 super(Transcript, self).setSqlValues(array)
+ − 368 mergedExons = array[8]
+ − 369 if not mergedExons:
+ − 370 return
+ − 371 for exonCount, splittedExon in enumerate(mergedExons.split(",")):
+ − 372 start, end = splittedExon.split("-")
+ − 373 exon = Interval()
+ − 374 exon.setChromosome(self.getChromosome())
+ − 375 exon.setDirection(self.getDirection())
+ − 376 exon.setName("%s_exon%d" % (self.getName(), exonCount+1))
+ − 377 exon.setStart(int(start))
+ − 378 exon.setEnd(int(end))
+ − 379 self.addExon(exon)
+ − 380
+ − 381
+ − 382 def getSqlValues(self):
+ − 383 """
+ − 384 Get the values of the properties that should be saved in a database
+ − 385 """
+ − 386 values = super(Transcript, self).getSqlValues()
+ − 387 values["size"] = self.getSize()
+ − 388 if self.getNbExons() == 1:
+ − 389 values["exons"] = ""
+ − 390 else:
+ − 391 values["exons"] = ",".join(["%d-%d" % (exon.getStart(), exon.getEnd()) for exon in self.getExons()])
+ − 392 return values
+ − 393
+ − 394
+ − 395 def getSqlTypes(cls):
+ − 396 """
+ − 397 Get the types of the properties that should be saved in a database
+ − 398 """
+ − 399 types = Interval.getSqlTypes()
+ − 400 types["exons"] = "varchar"
+ − 401 return types
+ − 402 getSqlTypes = classmethod(getSqlTypes)
+ − 403
+ − 404
+ − 405 def getSqlSizes(cls):
+ − 406 """
+ − 407 Get the sizes of the properties that should be saved in a database
+ − 408 """
+ − 409 sizes = Interval.getSqlSizes()
+ − 410 sizes["exons"] = 10000
+ − 411 return sizes
+ − 412 getSqlSizes = classmethod(getSqlSizes)
+ − 413
+ − 414
+ − 415 def sortExons(self):
+ − 416 """
+ − 417 Sort the exons
+ − 418 Increasing order if transcript is on strand "+", decreasing otherwise
+ − 419 """
+ − 420 self.sortExonsIncreasing()
+ − 421 if self.getDirection() == -1:
+ − 422 exons = self.getExons()
+ − 423 exons.reverse()
+ − 424 self.exons = exons
+ − 425
+ − 426
+ − 427 def sortExonsIncreasing(self):
+ − 428 """
+ − 429 Sort the exons
+ − 430 Increasing order
+ − 431 """
+ − 432 exons = self.getExons()
+ − 433 sortedExons = []
+ − 434 while len(exons) > 0:
+ − 435 minExon = exons[0]
+ − 436 for index in range(1, len(exons)):
+ − 437 if minExon.getStart() > exons[index].getStart():
+ − 438 minExon = exons[index]
+ − 439 sortedExons.append(minExon)
+ − 440 exons.remove(minExon)
+ − 441 self.exons = sortedExons
+ − 442
+ − 443
+ − 444 def extendStart(self, size):
+ − 445 """
+ − 446 Extend the transcript by the 5' end
+ − 447 @param size: the size to be extended
+ − 448 @type size: int
+ − 449 """
+ − 450 if len(self.exons) != 0:
+ − 451 self.sortExons()
+ − 452 if self.getDirection() == 1:
+ − 453 self.exons[0].setStart(max(0, self.exons[0].getStart() - size))
+ − 454 else:
+ − 455 self.exons[0].setEnd(self.exons[0].getEnd() + size)
+ − 456 super(Transcript, self).extendStart(size)
+ − 457 self.bin = None
+ − 458
+ − 459
+ − 460 def extendEnd(self, size):
+ − 461 """
+ − 462 Extend the transcript by the 3' end
+ − 463 @param size: the size to be extended
+ − 464 @type size: int
+ − 465 """
+ − 466 if len(self.exons) != 0:
+ − 467 self.sortExons()
+ − 468 if self.getDirection() == 1:
+ − 469 self.exons[-1].setEnd(self.exons[-1].getEnd() + size)
+ − 470 else:
+ − 471 self.exons[-1].setStart(max(0, self.exons[-1].getStart() - size))
+ − 472 super(Transcript, self).extendEnd(size)
+ − 473 self.bin = None
+ − 474
+ − 475
+ − 476 def extendExons(self, size):
+ − 477 """
+ − 478 Extend all the exons
+ − 479 @param size: the size to be extended
+ − 480 @type size: int
+ − 481 """
+ − 482 if len(self.exons) != 0:
+ − 483 self.sortExons()
+ − 484 exons = []
+ − 485 previousExon = None
+ − 486 for exon in self.exons:
+ − 487 exon.extendStart(size)
+ − 488 exon.extendEnd(size)
+ − 489 exon.setDirection(self.getDirection())
+ − 490 if previousExon != None and previousExon.overlapWith(exon):
+ − 491 previousExon.merge(exon)
+ − 492 else:
+ − 493 if previousExon != None:
+ − 494 exons.append(previousExon)
+ − 495 previousExon = exon
+ − 496 exons.append(previousExon)
+ − 497 self.exons = exons
+ − 498 super(Transcript, self).extendStart(size)
+ − 499 super(Transcript, self).extendEnd(size)
+ − 500 self.bin = None
+ − 501
+ − 502
+ − 503 def restrictStart(self, size = 1):
+ − 504 """
+ − 505 Restrict the transcript by some nucleotides, start from its start position
+ − 506 Remove the exons
+ − 507 @param size: the size to be restricted to
+ − 508 @type size: int
+ − 509 """
+ − 510 newExons = []
+ − 511 if self.getDirection() == 1:
+ − 512 for exon in self.exons:
+ − 513 if exon.getStart() <= self.getStart() + size - 1:
+ − 514 if exon.getEnd() > self.getStart() + size - 1:
+ − 515 exon.setEnd(self.getStart() + size - 1)
+ − 516 newExons.append(exon)
+ − 517 else:
+ − 518 for exon in self.exons:
+ − 519 if exon.getEnd() >= self.getEnd() - size + 1:
+ − 520 if exon.getStart() < self.getEnd() - size + 1:
+ − 521 exon.setStart(self.getEnd() - size + 1)
+ − 522 newExons.append(exon)
+ − 523 super(Transcript, self).restrictStart(size)
+ − 524 self.exons = newExons
+ − 525
+ − 526
+ − 527 def restrictEnd(self, size = 1):
+ − 528 """
+ − 529 Restrict the transcript by some nucleotides, end from its end position
+ − 530 Remove the exons
+ − 531 @param size: the size to be restricted to
+ − 532 @type size: int
+ − 533 """
+ − 534 newExons = []
+ − 535 if self.getDirection() == 1:
+ − 536 for exon in self.exons:
+ − 537 if exon.getEnd() >= self.getEnd() - size + 1:
+ − 538 if exon.getStart() < self.getEnd() - size + 1:
+ − 539 exon.setStart(self.getEnd() - size + 1)
+ − 540 newExons.append(exon)
+ − 541 else:
+ − 542 for exon in self.exons:
+ − 543 if exon.getEnd() >= self.getEnd() - size + 1:
+ − 544 if exon.getStart() < self.getEnd() - size + 1:
+ − 545 exon.setEnd(self.getEnd() - size + 1)
+ − 546 newExons.append(exon)
+ − 547 super(Transcript, self).restrictEnd(size)
+ − 548 self.exons = newExons
+ − 549
+ − 550
+ − 551 def removeExons(self):
+ − 552 """
+ − 553 Remove the exons and transforms the current transcript into a mere interval
+ − 554 """
+ − 555 self.exons = []
+ − 556 self.bin = None
+ − 557
+ − 558
+ − 559 def printGtf(self, title):
+ − 560 """
+ − 561 Export this transcript using GTF2.2 format
+ − 562 @param title: the title of the transcripts
+ − 563 @type title: string
+ − 564 @return: a string
+ − 565 """
+ − 566 transcriptId = self.getUniqueName()
+ − 567 geneId = "%s_gene" % (transcriptId)
+ − 568 direction = "+"
+ − 569 if self.getDirection() == -1:
+ − 570 direction = "-"
+ − 571 self.sortExonsIncreasing()
+ − 572 string = ""
+ − 573 for i, exon in enumerate(self.getExons()):
+ − 574 exonCopy = Interval()
+ − 575 exonCopy.copy(exon)
+ − 576 if "ID" in exonCopy.getTagValues():
+ − 577 del exonCopy.tags["ID"]
+ − 578 feature = "exon"
+ − 579 if "feature" in exonCopy.getTagNames():
+ − 580 feature = exonCopy.getTagValue("feature")
+ − 581 del exonCopy.tags["feature"]
+ − 582 score = "."
+ − 583 if "score" in exonCopy.getTagNames():
+ − 584 score = "%d" % (int(exonCopy.getTagValue("score")))
+ − 585 del exonCopy.tags["score"]
+ − 586 if "Parent" in exonCopy.getTagNames():
+ − 587 del exonCopy.tags["Parent"]
+ − 588 exonCopy.setName("%s_part%d" % (self.getName(), i+1))
+ − 589 comment = exonCopy.getTagValues("; ", " ", "\"")
+ − 590 string += "%s\t%s\t%s\t%d\t%d\t%s\t%s\t.\ttranscript_id \"%s\"; gene_id \"%s\"; %s\n" % (exonCopy.getChromosome(), title, feature, exonCopy.getStart(), exonCopy.getEnd(), score, direction, transcriptId, geneId, comment)
+ − 591 return string
+ − 592
+ − 593
+ − 594 def printGff2(self, title):
+ − 595 """
+ − 596 Export this transcript using GFF2 format
+ − 597 @param title: the title of the transcripts
+ − 598 @type title: string
+ − 599 @return: a string
+ − 600 """
+ − 601 direction = "+"
+ − 602 if self.getDirection() == -1:
+ − 603 direction = "-"
+ − 604 self.sortExonsIncreasing()
+ − 605 comment = self.getTagValues()
+ − 606 if comment != None:
+ − 607 comment = ";%s" % (comment)
+ − 608 score = "."
+ − 609 if "score" in self.getTagNames():
+ − 610 score = "%d" % (int(self.getTagValue("score")))
+ − 611 feature = "transcript"
+ − 612 if "feature" in self.getTagNames():
+ − 613 feature = self.getTagValue("feature")
+ − 614 string = "%s\t%s\t%s\t%d\t%d\t%s\t%s\t.\tGENE %s%s\n" % (self.getChromosome(), title, feature, self.getStart(), self.getEnd(), score, direction, self.name, comment)
+ − 615 for exon in self.getExons():
+ − 616 if "score" in exon.getTagNames():
+ − 617 score = "%d" % (int(self.getTagValue("score")))
+ − 618 string += "%s\t%s\t_exon\t%d\t%d\t%s\t%s\t.\tGENE %s\n" % (self.getChromosome(), title, exon.getStart(), exon.getEnd(), score, direction, self.name)
+ − 619 return string
+ − 620
+ − 621
+ − 622 def printGff3(self, title):
+ − 623 """
+ − 624 Export this transcript using GFF3 format
+ − 625 @param title: the title of the transcripts
+ − 626 @type title: string
+ − 627 @return: a string
+ − 628 """
+ − 629 direction = "+"
+ − 630 if self.getDirection() == -1:
+ − 631 direction = "-"
+ − 632 self.sortExonsIncreasing()
+ − 633 if "ID" not in self.getTagValues():
+ − 634 self.setTagValue("ID", self.getUniqueName())
+ − 635 feature = "transcript"
+ − 636 tags = self.tags
+ − 637 if "feature" in self.getTagNames():
+ − 638 feature = self.getTagValue("feature")
+ − 639 del self.tags["feature"]
+ − 640 score = "."
+ − 641 if "score" in self.getTagNames():
+ − 642 score = "%d" % (int(self.getTagValue("score")))
+ − 643 del self.tags["score"]
+ − 644 comment = self.getTagValues(";", "=")
+ − 645 string = "%s\t%s\t%s\t%d\t%d\t%s\t%s\t.\t%s\n" % (self.getChromosome(), title, feature, self.getStart(), self.getEnd(), score, direction, comment)
+ − 646 if len(self.exons) > 1:
+ − 647 for i, exon in enumerate(self.getExons()):
+ − 648 if "score" in exon.getTagNames():
+ − 649 score = "%d" % (int(exon.getTagValue("score")))
+ − 650 string += "%s\t%s\texon\t%d\t%d\t%s\t%s\t.\tID=%s-exon%d;Name=%s-exon%d;Parent=%s\n" % (self.getChromosome(), title, exon.getStart(), exon.getEnd(), score, direction, self.getTagValue("ID"), i+1, self.name, i+1, self.getTagValue("ID"))
+ − 651 self.tags = tags
+ − 652 return string
+ − 653
+ − 654
+ − 655 def printEmbl(self):
+ − 656 """
+ − 657 Export this transcript using EMBL format
+ − 658 @return: a string
+ − 659 """
+ − 660 if len(self.exons) <= 1:
+ − 661 position = "%d..%d" % (self.getStart(), self.getEnd())
+ − 662 else:
+ − 663 positions = []
+ − 664 for exon in self.getExons():
+ − 665 positions.append("%d..%d" % (self.getStart(), self.getEnd()))
+ − 666 position = ",".join(positions)
+ − 667 position = "join(%s)" % (position)
+ − 668 if self.getDirection() == -1:
+ − 669 position = "complement(%s)" % (position)
+ − 670 feature = "misc_feature"
+ − 671 if "feature" in self.getTagNames():
+ − 672 if not self.getTagValue("feature").startswith("S-MART"):
+ − 673 feature = self.getTagValue("feature")
+ − 674 string = "FT %s %s\n" % (feature, position)
+ − 675 if "Name" in self.getTagNames():
+ − 676 string += "FT /label=\"%s\"\n" % (self.getTagValue("Name"))
+ − 677 return string
+ − 678
+ − 679
+ − 680 def printBed(self):
+ − 681 """
+ − 682 Export this transcript using BED format
+ − 683 @return: a string
+ − 684 """
+ − 685 name = self.name
+ − 686 if "nbOccurrences" in self.getTagNames() and self.getTagValue("nbOccurrences") != 1 and self.getTagValue("occurrences"):
+ − 687 name = "%s-%d" % (name, self.getTagValue("occurrence"))
+ − 688 comment = self.getTagValues(";", "=")
+ − 689 sizes = []
+ − 690 starts = []
+ − 691 direction = "+"
+ − 692 if self.getDirection() == -1:
+ − 693 direction = "-"
+ − 694 self.sortExonsIncreasing()
+ − 695 for exon in self.getExons():
+ − 696 sizes.append("%d" % (exon.getSize()))
+ − 697 starts.append("%d" % (exon.getStart() - self.getStart()))
+ − 698 return "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0\t%d\t%s,\t%s,\n" % (self.getChromosome(), self.getStart(), self.getEnd()+1, name, direction, self.getStart(), self.getEnd()+1, self.getNbExons(), ",".join(sizes), ",".join(starts))
+ − 699
+ − 700
+ − 701 def printSam(self):
+ − 702 """
+ − 703 Export this transcript using SAM format
+ − 704 @return: a string
+ − 705 """
+ − 706 name = self.name
+ − 707 flag = 0 if self.getDirection() == 1 else 0x10
+ − 708 chromosome = self.getChromosome()
+ − 709 genomeStart = self.getStart()
+ − 710 quality = 255
+ − 711 mate = "*"
+ − 712 mateGenomeStart = 0
+ − 713 gapSize = 0
+ − 714 sequence = "*"
+ − 715 qualityString = "*"
+ − 716 tags = "NM:i:0"
+ − 717
+ − 718 lastExonEnd = None
+ − 719 self.sortExonsIncreasing()
+ − 720 exon = self.getExons()[0]
+ − 721 cigar = "%dM" % (self.getExons()[0].getSize())
+ − 722 lastExonEnd = exon.getEnd()
+ − 723 for i, exon in enumerate(self.getExons()):
+ − 724 if i == 0:
+ − 725 continue
+ − 726 cigar += "%dN" % (exon.getStart() - lastExonEnd - 1)
+ − 727 cigar += "%dM" % (exon.getSize())
+ − 728
+ − 729 return "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n" % (name, flag, chromosome, genomeStart, quality, cigar, mate, mateGenomeStart, gapSize, sequence, qualityString, tags)
+ − 730
+ − 731
+ − 732 def printUcsc(self):
+ − 733 """
+ − 734 Export this transcript using UCSC BED format
+ − 735 @return: a string
+ − 736 """
+ − 737 if self.getChromosome().find("Het") != -1:
+ − 738 return ""
+ − 739 name = self.name
+ − 740 comment = self.getTagValues(";", "")
+ − 741 sizes = []
+ − 742 starts = []
+ − 743 direction = "+"
+ − 744 if self.getDirection() == -1:
+ − 745 direction = "-"
+ − 746 self.sortExonsIncreasing()
+ − 747 for exon in self.getExons():
+ − 748 sizes.append("%d" % (exon.getSize()))
+ − 749 starts.append("%d" % (exon.getStart() - self.getStart()))
+ − 750 return "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0\t%d\t%s,\t%s,\n" % (self.getChromosome().replace("arm_", "chr"), self.getStart(), self.getEnd()+1, name, direction, self.getStart(), self.getEnd()+1, self.getNbExons(), ",".join(sizes), ",".join(starts))
+ − 751
+ − 752
+ − 753 def printGBrowseReference(self):
+ − 754 """
+ − 755 Export this transcript using GBrowse format (1st line only)
+ − 756 @return: a string
+ − 757 """
+ − 758 return "reference = %s\n" % (self.getChromosome())
+ − 759
+ − 760
+ − 761 def printGBrowseLine(self):
+ − 762 """
+ − 763 Export this transcript using GBrowse format (2nd line only)
+ − 764 @return: a string
+ − 765 """
+ − 766 self.sortExons()
+ − 767 coordinates = []
+ − 768 for exon in self.getExons():
+ − 769 coordinates.append(exon.printCoordinates())
+ − 770 coordinatesString = ",".join(coordinates)
+ − 771 comment = self.getTagValues(";", "=")
+ − 772 if comment:
+ − 773 comment = "\t\"%s\"" % (comment)
+ − 774 return "User_data\t%s\t%s%s\n" % (self.name, coordinatesString, comment)
+ − 775
+ − 776
+ − 777 def printGBrowse(self):
+ − 778 """
+ − 779 Export this transcript using GBrowse format
+ − 780 @return: a string
+ − 781 """
+ − 782 return "%s%s" % (self.printGBrowseReference(), self.printGBrowseLine())
+ − 783
+ − 784
+ − 785 def printCsv(self):
+ − 786 """
+ − 787 Export this transcript using CSV format
+ − 788 @return: a string
+ − 789 """
+ − 790 self.sortExons()
+ − 791 string = "%s,%d,%d,\"%s\"," % (self.getChromosome(), self.getStart(), self.getEnd(), "+" if self.getDirection() == 1 else "-")
+ − 792 if len(self.getExons()) == 1:
+ − 793 string += "None"
+ − 794 else:
+ − 795 for exon in self.getExons():
+ − 796 string += "%d-%d " % (exon.getStart(), exon.getEnd())
+ − 797 for tag in sorted(self.tags.keys()):
+ − 798 string += ",%s=%s" % (tag, str(self.tags[tag]))
+ − 799 string += "\n"
+ − 800 return string
+ − 801
+ − 802
+ − 803 def extractSequence(self, parser):
+ − 804 """
+ − 805 Get the sequence corresponding to this transcript
+ − 806 @param parser: a parser to a FASTA file
+ − 807 @type parser: class L{SequenceListParser<SequenceListParser>}
+ − 808 @return: an instance of L{Sequence<Sequence>}
+ − 809 """
+ − 810 self.sortExons()
+ − 811 name = self.name
+ − 812 if "ID" in self.getTagNames() and self.getTagValue("ID") != self.name:
+ − 813 name += ":%s" % (self.getTagValue("ID"))
+ − 814 sequence = Sequence(name)
+ − 815 for exon in self.getExons():
+ − 816 sequence.concatenate(exon.extractSequence(parser))
+ − 817 return sequence
+ − 818
+ − 819
+ − 820 def extractWigData(self, parser):
+ − 821 """
+ − 822 Get some wig data corresponding to this transcript
+ − 823 @param parser: a parser to a wig file
+ − 824 @type parser: class L{WigParser<WigParser>}
+ − 825 @return: a sequence of float
+ − 826 """
+ − 827 self.sortExons()
+ − 828 if parser.strands:
+ − 829 strands = (-1, 1)
+ − 830 values = dict([(strand, []) for strand in strands])
+ − 831 for exon in self.getExons():
+ − 832 theseValues = exon.extractWigData(parser)
+ − 833 if self.getDirection() == -1:
+ − 834 for strand in strands:
+ − 835 theseValues[strand].reverse()
+ − 836 for strand in strands:
+ − 837 values[strand].extend(theseValues[strand])
+ − 838 if self.getDirection() == -1:
+ − 839 for strand in strands:
+ − 840 values[strand].reverse()
+ − 841 return values
+ − 842 else:
+ − 843 values = []
+ − 844 for exon in self.getExons():
+ − 845 theseValues = exon.extractWigData(parser)
+ − 846 #if self.getDirection() == -1:
+ − 847 # theseValues.reverse()
+ − 848 values.extend(theseValues)
+ − 849 #if self.getDirection() == -1:
+ − 850 # values.reverse()
+ − 851 return values