Mercurial > repos > yufei-luo > s_mart
comparison SMART/Java/Python/structure/Transcript.py @ 36:44d5973c188c
Uploaded
author | m-zytnicki |
---|---|
date | Tue, 30 Apr 2013 15:02:29 -0400 |
parents | |
children | 169d364ddd91 |
comparison
equal
deleted
inserted
replaced
35:d94018ca4ada | 36:44d5973c188c |
---|---|
1 # | |
2 # Copyright INRA-URGI 2009-2010 | |
3 # | |
4 # This software is governed by the CeCILL license under French law and | |
5 # abiding by the rules of distribution of free software. You can use, | |
6 # modify and/ or redistribute the software under the terms of the CeCILL | |
7 # license as circulated by CEA, CNRS and INRIA at the following URL | |
8 # "http://www.cecill.info". | |
9 # | |
10 # As a counterpart to the access to the source code and rights to copy, | |
11 # modify and redistribute granted by the license, users are provided only | |
12 # with a limited warranty and the software's author, the holder of the | |
13 # economic rights, and the successive licensors have only limited | |
14 # liability. | |
15 # | |
16 # In this respect, the user's attention is drawn to the risks associated | |
17 # with loading, using, modifying and/or developing or reproducing the | |
18 # software by the user in light of its specific status of free software, | |
19 # that may mean that it is complicated to manipulate, and that also | |
20 # therefore means that it is reserved for developers and experienced | |
21 # professionals having in-depth computer knowledge. Users are therefore | |
22 # encouraged to load and test the software's suitability as regards their | |
23 # requirements in conditions enabling the security of their systems and/or | |
24 # data to be ensured and, more generally, to use and operate it in the | |
25 # same conditions as regards security. | |
26 # | |
27 # The fact that you are presently reading this means that you have had | |
28 # knowledge of the CeCILL license and that you accept its terms. | |
29 # | |
30 import sys | |
31 from SMART.Java.Python.structure.Interval import Interval | |
32 from SMART.Java.Python.structure.Sequence import Sequence | |
33 | |
34 | |
35 class Transcript(Interval): | |
36 """ | |
37 A class that models an transcript, considered as a specialized interval (the bounds of the transcript) that contains exons (also represented as intervals) | |
38 @ivar exons: a list of exons (intervals) | |
39 @type exons: list of L{Interval{Interval}} | |
40 """ | |
41 | |
42 def __init__(self, transcript = None, verbosity = 0): | |
43 """ | |
44 Constructor | |
45 @param transcript: transcript to be copied | |
46 @type transcript: class L{Transcript<Transcript>} | |
47 @param verbosity: verbosity | |
48 @type verbosity: int | |
49 """ | |
50 super(Transcript, self).__init__(None, verbosity) | |
51 self.exons = [] | |
52 self.introns = None | |
53 if transcript != None: | |
54 self.copy(transcript) | |
55 | |
56 | |
57 def copy(self, transcript): | |
58 """ | |
59 Copy method | |
60 @param transcript: transcript to be copied | |
61 @type transcript: class L{Transcript<Transcript>} or L{Interval<Interval>} | |
62 """ | |
63 super(Transcript, self).copy(transcript) | |
64 if transcript.__class__.__name__ == "Transcript": | |
65 exons = transcript.getExons() | |
66 if len(exons) > 1: | |
67 for exon in exons: | |
68 exonCopy = Interval(exon) | |
69 self.addExon(exonCopy) | |
70 | |
71 | |
72 def setDirection(self, direction): | |
73 """ | |
74 Set the direction of the interval | |
75 Possibly parse different formats | |
76 Impact all exons | |
77 @param direction: direction of the transcript (+ / -) | |
78 @type direction: int or string | |
79 """ | |
80 super(Transcript, self).setDirection(direction) | |
81 for exon in self.exons: | |
82 exon.setDirection(direction) | |
83 | |
84 | |
85 def setChromosome(self, chromosome): | |
86 """ | |
87 Set the chromosome | |
88 @param chromosome: chromosome on which the transcript is | |
89 @type chromosome: string | |
90 """ | |
91 super(Transcript, self).setChromosome(chromosome) | |
92 for exon in self.exons: | |
93 exon.setChromosome(chromosome) | |
94 | |
95 | |
96 def addExon(self, exon): | |
97 """ | |
98 Add an exon to the list of exons | |
99 @param exon: a new exon | |
100 @type exon: class L{Interval<Interval>} | |
101 """ | |
102 if not self.exons and not exon.overlapWith(self): | |
103 firstExon = Interval() | |
104 firstExon.setStart(self.getStart()) | |
105 firstExon.setEnd(self.getEnd()) | |
106 firstExon.setDirection(self.getDirection()) | |
107 firstExon.setChromosome(self.getChromosome()) | |
108 self.exons.append(firstExon) | |
109 newExon = Interval(exon) | |
110 newExon.setDirection(self.getDirection()) | |
111 self.exons.append(newExon) | |
112 if newExon.getStart() < self.getStart(): | |
113 self.setStart(newExon.getStart()) | |
114 if newExon.getEnd() > self.getEnd(): | |
115 self.setEnd(newExon.getEnd()) | |
116 | |
117 | |
118 def setStart(self, start): | |
119 """ | |
120 Set the new start, move the first exon accordingly (if exists) | |
121 @param start: the new start | |
122 @type start: int | |
123 """ | |
124 super(Transcript, self).setStart(start) | |
125 if self.exons: | |
126 self.sortExonsIncreasing() | |
127 self.exons[0].setStart(start) | |
128 | |
129 | |
130 def setEnd(self, end): | |
131 """ | |
132 Set the new end, move the last exon accordingly (if exists) | |
133 @param end: the new end | |
134 @type end: int | |
135 """ | |
136 super(Transcript, self).setEnd(end) | |
137 if self.exons: | |
138 self.sortExonsIncreasing() | |
139 self.exons[-1].setEnd(end) | |
140 | |
141 | |
142 def reverse(self): | |
143 """ | |
144 Reverse the strand of the transcript | |
145 """ | |
146 super(Transcript, self).reverse() | |
147 for exon in self.exons: | |
148 exon.reverse() | |
149 | |
150 | |
151 def getUniqueName(self): | |
152 """ | |
153 Try to give a unique name by possibly adding occurrence | |
154 """ | |
155 if "nbOccurrences" in self.tags and "occurrence" in self.tags and self.tags["nbOccurrences"] != 1: | |
156 return "%s-%d" % (self.name, self.tags["occurrence"]) | |
157 return self.name | |
158 | |
159 | |
160 def getNbExons(self): | |
161 """ | |
162 Get the number of exons | |
163 """ | |
164 return max(1, len(self.exons)) | |
165 | |
166 | |
167 def getExon(self, i): | |
168 """ | |
169 Get a specific exon | |
170 @param i: the rank of the exon | |
171 @type i: int | |
172 """ | |
173 if len(self.exons) == 0: | |
174 if i != 0: | |
175 raise Exception("Cannot get exon #%i while there is no exon in the transcript" % (i)) | |
176 return self | |
177 return self.exons[i] | |
178 | |
179 | |
180 def getExons(self): | |
181 """ | |
182 Get all the exons | |
183 """ | |
184 if len(self.exons) == 0: | |
185 return [Interval(self)] | |
186 return self.exons | |
187 | |
188 | |
189 def getIntrons(self): | |
190 """ | |
191 Get all the introns | |
192 Compute introns on the fly | |
193 """ | |
194 if self.introns != None: | |
195 return self.introns | |
196 self.sortExons() | |
197 self.introns = [] | |
198 exonStart = self.getExon(0) | |
199 for cpt, exonEnd in enumerate(self.exons[1:]): | |
200 intron = Interval() | |
201 intron.setName("%s_intron%d" % (self.getName(), cpt+1)) | |
202 intron.setChromosome(self.getChromosome()) | |
203 intron.setDirection(self.getDirection()) | |
204 if self.getDirection() == 1: | |
205 intron.setEnd(exonEnd.getStart() - 1) | |
206 intron.setStart(exonStart.getEnd() + 1) | |
207 else: | |
208 intron.setStart(exonEnd.getEnd() + 1) | |
209 intron.setEnd(exonStart.getStart() - 1) | |
210 intron.setDirection(self.getDirection()) | |
211 if intron.getSize() > 0: | |
212 self.introns.append(intron) | |
213 exonStart = exonEnd | |
214 intron.setSize(intron.getEnd() - intron.getStart() + 1) | |
215 return self.introns | |
216 | |
217 | |
218 def getSize(self): | |
219 """ | |
220 Get the size of the transcript (i.e. the number of nucleotides) | |
221 Compute size on the fly | |
222 """ | |
223 if len(self.exons) == 0: | |
224 return self.getSizeWithIntrons() | |
225 size = 0 | |
226 for exon in self.exons: | |
227 size += exon.getSize() | |
228 return size | |
229 | |
230 | |
231 def getSizeWithIntrons(self): | |
232 """ | |
233 Get the size of the interval (i.e. distance from start to end) | |
234 """ | |
235 return super(Transcript, self).getSize() | |
236 | |
237 | |
238 def overlapWithExon(self, transcript, nbNucleotides = 1): | |
239 """ | |
240 Check if the exons of this transcript overlap with the exons of another transcript | |
241 @param transcript: transcript to be compared to | |
242 @type transcript: class L{Transcript<Transcript>} | |
243 @param nbNucleotides: minimum number of nucleotides to declare and overlap | |
244 @type nbNucleotides: int | |
245 """ | |
246 if not self.overlapWith(transcript, nbNucleotides): | |
247 return False | |
248 for thisExon in self.getExons(): | |
249 for thatExon in transcript.getExons(): | |
250 if thisExon.overlapWith(thatExon, nbNucleotides): | |
251 return True | |
252 return False | |
253 | |
254 | |
255 def include(self, transcript): | |
256 """ | |
257 Whether this transcript includes the other one | |
258 @param transcript: object to be compared to | |
259 @type transcript: class L{Transcript<Transcript>} | |
260 """ | |
261 if not super(Transcript, self).include(transcript): | |
262 return False | |
263 for thatExon in transcript.getExons(): | |
264 for thisExon in self.getExons(): | |
265 if thisExon.include(thatExon): | |
266 break | |
267 else: | |
268 return False | |
269 return True | |
270 | |
271 | |
272 def merge(self, transcript, normalization = False): | |
273 """ | |
274 Merge with another transcript | |
275 Merge exons if they overlap, otherwise add exons | |
276 @param transcript: transcript to be merged to | |
277 @type transcript: class L{Transcript<Transcript>} | |
278 @param normalization: whether the sum of the merge should be normalized wrt the number of mappings of each elements | |
279 @type normalization: boolean | |
280 """ | |
281 if self.getChromosome() != transcript.getChromosome() or self.getDirection() != transcript.getDirection(): | |
282 raise Exception("Cannot merge '%s' with '%s'!" % (self, transcript)) | |
283 | |
284 theseExons = self.getExons() | |
285 thoseExons = transcript.getExons() | |
286 | |
287 for thatExon in thoseExons: | |
288 toBeRemoved = [] | |
289 for thisIndex, thisExon in enumerate(theseExons): | |
290 if thisExon.overlapWith(thatExon): | |
291 thatExon.merge(thisExon) | |
292 toBeRemoved.append(thisIndex) | |
293 theseExons.append(thatExon) | |
294 for thisIndex in reversed(toBeRemoved): | |
295 del theseExons[thisIndex] | |
296 self.removeExons() | |
297 self.setStart(min(self.getStart(), transcript.getStart())) | |
298 self.setEnd(max(self.getEnd(), transcript.getEnd())) | |
299 if len(theseExons) > 1: | |
300 for thisExon in theseExons: | |
301 self.addExon(thisExon) | |
302 | |
303 self.setName("%s--%s" % (self.getUniqueName(), transcript.getUniqueName())) | |
304 super(Transcript, self).merge(transcript, normalization) | |
305 | |
306 | |
307 def getDifference(self, transcript, sameStrand = False): | |
308 """ | |
309 Get the difference between this cluster and another one | |
310 @param transcript: object to be compared to | |
311 @type transcript: class L{Transcript<Transcript>} | |
312 @param sameStrand: do the comparison iff the transcripts are on the same strand | |
313 @type sameStrand: boolean | |
314 @return: a transcript | |
315 """ | |
316 newTranscript = Transcript() | |
317 newTranscript.copy(self) | |
318 if self.getChromosome() != transcript.getChromosome(): | |
319 return newTranscript | |
320 if not self.overlapWith(transcript): | |
321 return newTranscript | |
322 if sameStrand and self.getDirection() != transcript.getDirection(): | |
323 return newTranscript | |
324 newTranscript.removeExons() | |
325 if transcript.getEnd() > newTranscript.getStart(): | |
326 newTranscript.setStart(transcript.getEnd() + 1) | |
327 if transcript.getStart() < newTranscript.getEnd(): | |
328 newTranscript.setEnd(transcript.getStart() + 1) | |
329 theseExons = [] | |
330 for exon in self.getExons(): | |
331 exonCopy = Interval() | |
332 exonCopy.copy(exon) | |
333 theseExons.append(exonCopy) | |
334 for thatExon in transcript.getExons(): | |
335 newExons = [] | |
336 for thisExon in theseExons: | |
337 newExons.extend(thisExon.getDifference(thatExon)) | |
338 theseExons = newExons | |
339 if not theseExons: | |
340 return None | |
341 newStart, newEnd = theseExons[0].getStart(), theseExons[0].getEnd() | |
342 for thisExon in theseExons[1:]: | |
343 newStart = min(newStart, thisExon.getStart()) | |
344 newEnd = max(newEnd, thisExon.getEnd()) | |
345 newTranscript.setEnd(newEnd) | |
346 newTranscript.setStart(newStart) | |
347 newTranscript.exons = theseExons | |
348 return newTranscript | |
349 | |
350 | |
351 def getIntersection(self, transcript): | |
352 """ | |
353 Get the intersection between this transcript and another one | |
354 @param transcript: object to be compared to | |
355 @type transcript: class L{Transcript<Transcript>} | |
356 @return: an other transcript | |
357 """ | |
358 if self.getChromosome() != transcript.getChromosome() or self.getDirection() != transcript.getDirection(): | |
359 return None | |
360 newTranscript = Transcript() | |
361 newTranscript.setDirection(self.getDirection()) | |
362 newTranscript.setChromosome(self.getChromosome()) | |
363 newTranscript.setName("%s_intersect_%s" % (self.getName(), transcript.getName())) | |
364 newExons = [] | |
365 for thisExon in self.getExons(): | |
366 for thatExon in transcript.getExons(): | |
367 newExon = thisExon.getIntersection(thatExon) | |
368 if newExon != None: | |
369 newExons.append(newExon) | |
370 if not newExons: | |
371 return None | |
372 newTranscript.exons = newExons | |
373 return newTranscript | |
374 | |
375 | |
376 def getSqlVariables(cls): | |
377 """ | |
378 Get the properties of the object that should be saved in a database | |
379 """ | |
380 variables = Interval.getSqlVariables() | |
381 variables.append("exons") | |
382 return variables | |
383 getSqlVariables = classmethod(getSqlVariables) | |
384 | |
385 | |
386 def setSqlValues(self, array): | |
387 """ | |
388 Set the values of the properties of this object as given by a results line of a SQL query | |
389 @param array: the values to be copied | |
390 @type array: a list | |
391 """ | |
392 super(Transcript, self).setSqlValues(array) | |
393 mergedExons = array[8] | |
394 if not mergedExons: | |
395 return | |
396 for exonCount, splittedExon in enumerate(mergedExons.split(",")): | |
397 start, end = splittedExon.split("-") | |
398 exon = Interval() | |
399 exon.setChromosome(self.getChromosome()) | |
400 exon.setDirection(self.getDirection()) | |
401 exon.setName("%s_exon%d" % (self.getName(), exonCount+1)) | |
402 exon.setStart(int(start)) | |
403 exon.setEnd(int(end)) | |
404 self.addExon(exon) | |
405 | |
406 | |
407 def getSqlValues(self): | |
408 """ | |
409 Get the values of the properties that should be saved in a database | |
410 """ | |
411 values = super(Transcript, self).getSqlValues() | |
412 values["size"] = self.getSize() | |
413 if self.getNbExons() == 1: | |
414 values["exons"] = "" | |
415 else: | |
416 values["exons"] = ",".join(["%d-%d" % (exon.getStart(), exon.getEnd()) for exon in self.getExons()]) | |
417 return values | |
418 | |
419 | |
420 def getSqlTypes(cls): | |
421 """ | |
422 Get the types of the properties that should be saved in a database | |
423 """ | |
424 types = Interval.getSqlTypes() | |
425 types["exons"] = "varchar" | |
426 return types | |
427 getSqlTypes = classmethod(getSqlTypes) | |
428 | |
429 | |
430 def getSqlSizes(cls): | |
431 """ | |
432 Get the sizes of the properties that should be saved in a database | |
433 """ | |
434 sizes = Interval.getSqlSizes() | |
435 sizes["exons"] = 10000 | |
436 return sizes | |
437 getSqlSizes = classmethod(getSqlSizes) | |
438 | |
439 | |
440 def sortExons(self): | |
441 """ | |
442 Sort the exons | |
443 Increasing order if transcript is on strand "+", decreasing otherwise | |
444 """ | |
445 self.sortExonsIncreasing() | |
446 if self.getDirection() == -1: | |
447 exons = self.getExons() | |
448 exons.reverse() | |
449 self.exons = exons | |
450 | |
451 | |
452 def sortExonsIncreasing(self): | |
453 """ | |
454 Sort the exons | |
455 Increasing order | |
456 """ | |
457 exons = self.getExons() | |
458 sortedExons = [] | |
459 while len(exons) > 0: | |
460 minExon = exons[0] | |
461 for index in range(1, len(exons)): | |
462 if minExon.getStart() > exons[index].getStart(): | |
463 minExon = exons[index] | |
464 sortedExons.append(minExon) | |
465 exons.remove(minExon) | |
466 self.exons = sortedExons | |
467 | |
468 | |
469 def extendStart(self, size): | |
470 """ | |
471 Extend the transcript by the 5' end | |
472 @param size: the size to be extended | |
473 @type size: int | |
474 """ | |
475 if len(self.exons) != 0: | |
476 self.sortExons() | |
477 if self.getDirection() == 1: | |
478 self.exons[0].setStart(max(0, self.exons[0].getStart() - size)) | |
479 else: | |
480 self.exons[0].setEnd(self.exons[0].getEnd() + size) | |
481 super(Transcript, self).extendStart(size) | |
482 self.bin = None | |
483 | |
484 | |
485 def extendEnd(self, size): | |
486 """ | |
487 Extend the transcript by the 3' end | |
488 @param size: the size to be extended | |
489 @type size: int | |
490 """ | |
491 if len(self.exons) != 0: | |
492 self.sortExons() | |
493 if self.getDirection() == 1: | |
494 self.exons[-1].setEnd(self.exons[-1].getEnd() + size) | |
495 else: | |
496 self.exons[-1].setStart(max(0, self.exons[-1].getStart() - size)) | |
497 super(Transcript, self).extendEnd(size) | |
498 self.bin = None | |
499 | |
500 | |
501 def extendExons(self, size): | |
502 """ | |
503 Extend all the exons | |
504 @param size: the size to be extended | |
505 @type size: int | |
506 """ | |
507 if len(self.exons) != 0: | |
508 self.sortExons() | |
509 exons = [] | |
510 previousExon = None | |
511 for exon in self.exons: | |
512 exon.extendStart(size) | |
513 exon.extendEnd(size) | |
514 exon.setDirection(self.getDirection()) | |
515 if previousExon != None and previousExon.overlapWith(exon): | |
516 previousExon.merge(exon) | |
517 else: | |
518 if previousExon != None: | |
519 exons.append(previousExon) | |
520 previousExon = exon | |
521 exons.append(previousExon) | |
522 self.exons = exons | |
523 super(Transcript, self).extendStart(size) | |
524 super(Transcript, self).extendEnd(size) | |
525 self.bin = None | |
526 | |
527 | |
528 def restrictStart(self, size = 1): | |
529 """ | |
530 Restrict the transcript by some nucleotides, start from its start position | |
531 Remove the exons | |
532 @param size: the size to be restricted to | |
533 @type size: int | |
534 """ | |
535 newExons = [] | |
536 if self.getDirection() == 1: | |
537 for exon in self.exons: | |
538 if exon.getStart() <= self.getStart() + size - 1: | |
539 if exon.getEnd() > self.getStart() + size - 1: | |
540 exon.setEnd(self.getStart() + size - 1) | |
541 newExons.append(exon) | |
542 else: | |
543 for exon in self.exons: | |
544 if exon.getEnd() >= self.getEnd() - size + 1: | |
545 if exon.getStart() < self.getEnd() - size + 1: | |
546 exon.setStart(self.getEnd() - size + 1) | |
547 newExons.append(exon) | |
548 super(Transcript, self).restrictStart(size) | |
549 self.exons = newExons | |
550 | |
551 | |
552 def restrictEnd(self, size = 1): | |
553 """ | |
554 Restrict the transcript by some nucleotides, end from its end position | |
555 Remove the exons | |
556 @param size: the size to be restricted to | |
557 @type size: int | |
558 """ | |
559 newExons = [] | |
560 if self.getDirection() == 1: | |
561 for exon in self.exons: | |
562 if exon.getEnd() >= self.getEnd() - size + 1: | |
563 if exon.getStart() < self.getEnd() - size + 1: | |
564 exon.setStart(self.getEnd() - size + 1) | |
565 newExons.append(exon) | |
566 else: | |
567 for exon in self.exons: | |
568 if exon.getEnd() >= self.getEnd() - size + 1: | |
569 if exon.getStart() < self.getEnd() - size + 1: | |
570 exon.setEnd(self.getEnd() - size + 1) | |
571 newExons.append(exon) | |
572 super(Transcript, self).restrictEnd(size) | |
573 self.exons = newExons | |
574 | |
575 | |
576 def removeExons(self): | |
577 """ | |
578 Remove the exons and transforms the current transcript into a mere interval | |
579 """ | |
580 self.exons = [] | |
581 self.bin = None | |
582 | |
583 | |
584 def printGtf(self, title): | |
585 """ | |
586 Export this transcript using GTF2.2 format | |
587 @param title: the title of the transcripts | |
588 @type title: string | |
589 @return: a string | |
590 """ | |
591 transcriptId = self.getUniqueName() | |
592 geneId = "%s_gene" % (transcriptId) | |
593 direction = "+" | |
594 if self.getDirection() == -1: | |
595 direction = "-" | |
596 self.sortExonsIncreasing() | |
597 string = "" | |
598 for i, exon in enumerate(self.getExons()): | |
599 exonCopy = Interval() | |
600 exonCopy.copy(exon) | |
601 if "ID" in exonCopy.getTagValues(): | |
602 del exonCopy.tags["ID"] | |
603 feature = "exon" | |
604 if "feature" in exonCopy.getTagNames(): | |
605 feature = exonCopy.getTagValue("feature") | |
606 del exonCopy.tags["feature"] | |
607 score = "." | |
608 if "score" in exonCopy.getTagNames(): | |
609 score = "%d" % (int(exonCopy.getTagValue("score"))) | |
610 del exonCopy.tags["score"] | |
611 if "Parent" in exonCopy.getTagNames(): | |
612 del exonCopy.tags["Parent"] | |
613 exonCopy.setName("%s_part%d" % (self.getName(), i+1)) | |
614 comment = exonCopy.getTagValues("; ", " ", "\"") | |
615 string += "%s\t%s\t%s\t%d\t%d\t%s\t%s\t.\ttranscript_id \"%s\"; gene_id \"%s\"; %s\n" % (exonCopy.getChromosome(), title, feature, exonCopy.getStart(), exonCopy.getEnd(), score, direction, transcriptId, geneId, comment) | |
616 return string | |
617 | |
618 | |
619 def printGff2(self, title): | |
620 """ | |
621 Export this transcript using GFF2 format | |
622 @param title: the title of the transcripts | |
623 @type title: string | |
624 @return: a string | |
625 """ | |
626 direction = "+" | |
627 if self.getDirection() == -1: | |
628 direction = "-" | |
629 self.sortExonsIncreasing() | |
630 comment = self.getTagValues() | |
631 if comment != None: | |
632 comment = ";%s" % (comment) | |
633 score = "." | |
634 if "score" in self.getTagNames(): | |
635 score = "%d" % (int(self.getTagValue("score"))) | |
636 feature = "transcript" | |
637 if "feature" in self.getTagNames(): | |
638 feature = self.getTagValue("feature") | |
639 string = "%s\t%s\t%s\t%d\t%d\t%s\t%s\t.\tGENE %s%s\n" % (self.getChromosome(), title, feature, self.getStart(), self.getEnd(), score, direction, self.name, comment) | |
640 for exon in self.getExons(): | |
641 if "score" in exon.getTagNames(): | |
642 score = "%d" % (int(self.getTagValue("score"))) | |
643 string += "%s\t%s\t_exon\t%d\t%d\t%s\t%s\t.\tGENE %s\n" % (self.getChromosome(), title, exon.getStart(), exon.getEnd(), score, direction, self.name) | |
644 return string | |
645 | |
646 | |
647 def printGff3(self, title): | |
648 """ | |
649 Export this transcript using GFF3 format | |
650 @param title: the title of the transcripts | |
651 @type title: string | |
652 @return: a string | |
653 """ | |
654 direction = "+" | |
655 if self.getDirection() == -1: | |
656 direction = "-" | |
657 self.sortExonsIncreasing() | |
658 if "ID" not in self.getTagValues(): | |
659 self.setTagValue("ID", self.getUniqueName()) | |
660 feature = "transcript" | |
661 tags = self.tags | |
662 if "feature" in self.getTagNames(): | |
663 feature = self.getTagValue("feature") | |
664 del self.tags["feature"] | |
665 score = "." | |
666 if "score" in self.getTagNames(): | |
667 score = "%d" % (int(self.getTagValue("score"))) | |
668 del self.tags["score"] | |
669 comment = self.getTagValues(";", "=") | |
670 string = "%s\t%s\t%s\t%d\t%d\t%s\t%s\t.\t%s\n" % (self.getChromosome(), title, feature, self.getStart(), self.getEnd(), score, direction, comment) | |
671 if len(self.exons) > 1: | |
672 for i, exon in enumerate(self.getExons()): | |
673 if "score" in exon.getTagNames(): | |
674 score = "%d" % (int(exon.getTagValue("score"))) | |
675 string += "%s\t%s\texon\t%d\t%d\t%s\t%s\t.\tID=%s-exon%d;Name=%s-exon%d;Parent=%s\n" % (self.getChromosome(), title, exon.getStart(), exon.getEnd(), score, direction, self.getTagValue("ID"), i+1, self.name, i+1, self.getTagValue("ID")) | |
676 self.tags = tags | |
677 return string | |
678 | |
679 | |
680 def printEmbl(self): | |
681 """ | |
682 Export this transcript using EMBL format | |
683 @return: a string | |
684 """ | |
685 if len(self.exons) <= 1: | |
686 position = "%d..%d" % (self.getStart(), self.getEnd()) | |
687 else: | |
688 positions = [] | |
689 for exon in self.getExons(): | |
690 positions.append("%d..%d" % (self.getStart(), self.getEnd())) | |
691 position = ",".join(positions) | |
692 position = "join(%s)" % (position) | |
693 if self.getDirection() == -1: | |
694 position = "complement(%s)" % (position) | |
695 feature = "misc_feature" | |
696 if "feature" in self.getTagNames(): | |
697 if not self.getTagValue("feature").startswith("S-MART"): | |
698 feature = self.getTagValue("feature") | |
699 string = "FT %s %s\n" % (feature, position) | |
700 if "Name" in self.getTagNames(): | |
701 string += "FT /label=\"%s\"\n" % (self.getTagValue("Name")) | |
702 return string | |
703 | |
704 | |
705 def printBed(self): | |
706 """ | |
707 Export this transcript using BED format | |
708 @return: a string | |
709 """ | |
710 name = self.name | |
711 if "nbOccurrences" in self.getTagNames() and self.getTagValue("nbOccurrences") != 1 and self.getTagValue("occurrences"): | |
712 name = "%s-%d" % (name, self.getTagValue("occurrence")) | |
713 comment = self.getTagValues(";", "=") | |
714 sizes = [] | |
715 starts = [] | |
716 direction = "+" | |
717 if self.getDirection() == -1: | |
718 direction = "-" | |
719 self.sortExonsIncreasing() | |
720 for exon in self.getExons(): | |
721 sizes.append("%d" % (exon.getSize())) | |
722 starts.append("%d" % (exon.getStart() - self.getStart())) | |
723 return "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0\t%d\t%s,\t%s,\n" % (self.getChromosome(), self.getStart(), self.getEnd()+1, name, direction, self.getStart(), self.getEnd()+1, self.getNbExons(), ",".join(sizes), ",".join(starts)) | |
724 | |
725 | |
726 def printSam(self): | |
727 """ | |
728 Export this transcript using SAM format | |
729 @return: a string | |
730 """ | |
731 name = self.name | |
732 flag = 0 if self.getDirection() == 1 else 0x10 | |
733 chromosome = self.getChromosome() | |
734 genomeStart = self.getStart() | |
735 quality = 255 | |
736 mate = "*" | |
737 mateGenomeStart = 0 | |
738 gapSize = 0 | |
739 sequence = "*" | |
740 qualityString = "*" | |
741 tags = "NM:i:0" | |
742 | |
743 lastExonEnd = None | |
744 self.sortExonsIncreasing() | |
745 exon = self.getExons()[0] | |
746 cigar = "%dM" % (self.getExons()[0].getSize()) | |
747 lastExonEnd = exon.getEnd() | |
748 for i, exon in enumerate(self.getExons()): | |
749 if i == 0: | |
750 continue | |
751 cigar += "%dN" % (exon.getStart() - lastExonEnd - 1) | |
752 cigar += "%dM" % (exon.getSize()) | |
753 | |
754 return "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s\t%s\n" % (name, flag, chromosome, genomeStart, quality, cigar, mate, mateGenomeStart, gapSize, sequence, qualityString, tags) | |
755 | |
756 | |
757 def printUcsc(self): | |
758 """ | |
759 Export this transcript using UCSC BED format | |
760 @return: a string | |
761 """ | |
762 if self.getChromosome().find("Het") != -1: | |
763 return "" | |
764 name = self.name | |
765 comment = self.getTagValues(";", "") | |
766 sizes = [] | |
767 starts = [] | |
768 direction = "+" | |
769 if self.getDirection() == -1: | |
770 direction = "-" | |
771 self.sortExonsIncreasing() | |
772 for exon in self.getExons(): | |
773 sizes.append("%d" % (exon.getSize())) | |
774 starts.append("%d" % (exon.getStart() - self.getStart())) | |
775 return "%s\t%d\t%d\t%s\t1000\t%s\t%d\t%d\t0\t%d\t%s,\t%s,\n" % (self.getChromosome().replace("arm_", "chr"), self.getStart(), self.getEnd()+1, name, direction, self.getStart(), self.getEnd()+1, self.getNbExons(), ",".join(sizes), ",".join(starts)) | |
776 | |
777 | |
778 def printGBrowseReference(self): | |
779 """ | |
780 Export this transcript using GBrowse format (1st line only) | |
781 @return: a string | |
782 """ | |
783 return "reference = %s\n" % (self.getChromosome()) | |
784 | |
785 | |
786 def printGBrowseLine(self): | |
787 """ | |
788 Export this transcript using GBrowse format (2nd line only) | |
789 @return: a string | |
790 """ | |
791 self.sortExons() | |
792 coordinates = [] | |
793 for exon in self.getExons(): | |
794 coordinates.append(exon.printCoordinates()) | |
795 coordinatesString = ",".join(coordinates) | |
796 comment = self.getTagValues(";", "=") | |
797 if comment: | |
798 comment = "\t\"%s\"" % (comment) | |
799 return "User_data\t%s\t%s%s\n" % (self.name, coordinatesString, comment) | |
800 | |
801 | |
802 def printGBrowse(self): | |
803 """ | |
804 Export this transcript using GBrowse format | |
805 @return: a string | |
806 """ | |
807 return "%s%s" % (self.printGBrowseReference(), self.printGBrowseLine()) | |
808 | |
809 | |
810 def printCsv(self): | |
811 """ | |
812 Export this transcript using CSV format | |
813 @return: a string | |
814 """ | |
815 self.sortExons() | |
816 string = "%s,%d,%d,\"%s\"," % (self.getChromosome(), self.getStart(), self.getEnd(), "+" if self.getDirection() == 1 else "-") | |
817 if len(self.getExons()) == 1: | |
818 string += "None" | |
819 else: | |
820 for exon in self.getExons(): | |
821 string += "%d-%d " % (exon.getStart(), exon.getEnd()) | |
822 for tag in sorted(self.tags.keys()): | |
823 string += ",%s=%s" % (tag, str(self.tags[tag])) | |
824 string += "\n" | |
825 return string | |
826 | |
827 | |
828 def extractSequence(self, parser): | |
829 """ | |
830 Get the sequence corresponding to this transcript | |
831 @param parser: a parser to a FASTA file | |
832 @type parser: class L{SequenceListParser<SequenceListParser>} | |
833 @return: an instance of L{Sequence<Sequence>} | |
834 """ | |
835 self.sortExons() | |
836 name = self.name | |
837 if "ID" in self.getTagNames() and self.getTagValue("ID") != self.name: | |
838 name += ":%s" % (self.getTagValue("ID")) | |
839 sequence = Sequence(name) | |
840 for exon in self.getExons(): | |
841 sequence.concatenate(exon.extractSequence(parser)) | |
842 return sequence | |
843 | |
844 | |
845 def extractWigData(self, parser): | |
846 """ | |
847 Get some wig data corresponding to this transcript | |
848 @param parser: a parser to a wig file | |
849 @type parser: class L{WigParser<WigParser>} | |
850 @return: a sequence of float | |
851 """ | |
852 self.sortExons() | |
853 if parser.strands: | |
854 strands = (-1, 1) | |
855 values = dict([(strand, []) for strand in strands]) | |
856 for exon in self.getExons(): | |
857 theseValues = exon.extractWigData(parser) | |
858 if self.getDirection() == -1: | |
859 for strand in strands: | |
860 theseValues[strand].reverse() | |
861 for strand in strands: | |
862 values[strand].extend(theseValues[strand]) | |
863 if self.getDirection() == -1: | |
864 for strand in strands: | |
865 values[strand].reverse() | |
866 return values | |
867 else: | |
868 values = [] | |
869 for exon in self.getExons(): | |
870 theseValues = exon.extractWigData(parser) | |
871 #if self.getDirection() == -1: | |
872 # theseValues.reverse() | |
873 values.extend(theseValues) | |
874 #if self.getDirection() == -1: | |
875 # values.reverse() | |
876 return values |