5
|
1 #!/usr/bin/env python
|
|
2 """
|
|
3 Program to convert data from GFF to GTF
|
|
4
|
|
5 Usage: python gff_to_gtf.py in.gff > out.gtf
|
|
6
|
|
7 Requirement:
|
|
8 GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py
|
|
9
|
|
10 Copyright (C)
|
|
11 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
|
|
12 2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
|
|
13 """
|
|
14
|
|
15 import re
|
|
16 import sys
|
|
17 import GFFParser
|
|
18
|
|
def _gtf_record(ent, feature, start, end, frame, transcript_id, exon_number):
    """
    format a single tab-separated GTF line

    @args ent: gene record providing 'chr', 'source', 'strand', 'name' and gene_info['Name']
    @args feature: value for the feature column (exon, CDS, start_codon, stop_codon)
    @args start: feature start coordinate
    @args end: feature end coordinate
    @args frame: integer frame, or None to write '.' in the frame column
    @args transcript_id: value for the transcript_id attribute
    @args exon_number: value for the exon_number attribute
    """

    frame_field = '.' if frame is None else '%d' % frame
    return '%s\t%s\t%s\t%d\t%d\t.\t%s\t%s\tgene_id "%s"; transcript_id "%s"; exon_number "%d"; gene_name "%s"; ' % (
        ent['chr'], ent['source'], feature, start, end, ent['strand'], frame_field,
        ent['name'], transcript_id, exon_number, ent['gene_info']['Name'])


def printGTF(tinfo):
    """
    writing result file in GTF format

    For every transcript of every gene record, exon lines are written to
    stdout, together with CDS, start_codon and stop_codon lines when the
    transcript has coding segments. Transcripts whose strand is neither
    '+' nor '-' are reported on stderr and skipped.

    @args tinfo: parsed object from gff file
    @type tinfo: numpy array
    """

    for ent1 in tinfo:
        strand = ent1['strand']

        for idx, tid in enumerate(ent1['transcripts']):

            exons = ent1['exons'][idx]
            cds_exons = ent1['cds_exons'][idx]

            if strand not in ('+', '-'):
                # Diagnostics go to stderr so the GTF stream on stdout stays
                # clean, and the transcript really is skipped: codon
                # positions cannot be derived without a strand.
                sys.stderr.write('STRAND information missing - %s, skip the transcript - %s\n' % (strand, tid[0]))
                continue

            has_cds = cds_exons.any()

            stop_codon = start_codon = ()
            if has_cds:
                # Codons are the 3-base windows at the outer ends of the
                # first and last CDS segments; the strand decides which end
                # is the start and which is the stop.
                low_end = (cds_exons[0][0], cds_exons[0][0] + 2)
                high_end = (cds_exons[-1][1] - 2, cds_exons[-1][1])
                if strand == '+':
                    start_codon, stop_codon = low_end, high_end
                else:
                    start_codon, stop_codon = high_end, low_end

            last_cds_cod = 0
            for idz, ex_cod in enumerate(exons):

                print(_gtf_record(ent1, 'exon', ex_cod[0], ex_cod[1], None, tid[0], idz + 1))

                if not has_cds:
                    continue

                # NOTE(review): CDS segments are paired with exons by
                # position, as before; confirm this matches how the parser
                # orders cds_exons relative to exons.
                if idz < len(cds_exons):
                    # Exons beyond the last CDS row simply have no CDS line.
                    cds_cod = cds_exons[idz]
                    print(_gtf_record(ent1, 'CDS', cds_cod[0], cds_cod[1], cds_cod[2], tid[0], idz + 1))
                    last_cds_cod = idz

                if idz == 0:
                    print(_gtf_record(ent1, 'start_codon', start_codon[0], start_codon[1], cds_exons[idz][2], tid[0], idz + 1))

            if stop_codon:
                # len(exons) equals the last exon number and stays defined
                # even when the transcript has no exons.
                print(_gtf_record(ent1, 'stop_codon', stop_codon[0], stop_codon[1], cds_exons[last_cds_cod][2], tid[0], len(exons)))
|
|
64
|
|
65
|
|
if __name__ == "__main__":

    # The only command-line argument is the path of the GFF file to convert.
    try:
        gff_fname = sys.argv[1]
    except IndexError:
        # No input file given: show the module usage text and exit non-zero.
        print(__doc__)
        sys.exit(-1)

    # Parse the GFF file into gene records, then write them to stdout as GTF.
    Transcriptdb = GFFParser.Parse(gff_fname)

    printGTF(Transcriptdb)
|