annotate gtf_to_gff.py @ 5:6e589f267c14

Uploaded
author devteam
date Tue, 04 Nov 2014 12:15:19 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
6e589f267c14 Uploaded
devteam
parents:
diff changeset
1 #!/usr/bin/env python
6e589f267c14 Uploaded
devteam
parents:
diff changeset
2 """
6e589f267c14 Uploaded
devteam
parents:
diff changeset
3 Convert Gene Transfer Format [GTF] to Generic Feature Format Version 3 [GFF3].
6e589f267c14 Uploaded
devteam
parents:
diff changeset
4
6e589f267c14 Uploaded
devteam
parents:
diff changeset
5 Usage: python gtf_to_gff.py in.gtf > out.gff3
6e589f267c14 Uploaded
devteam
parents:
diff changeset
6
6e589f267c14 Uploaded
devteam
parents:
diff changeset
7 Requirement:
6e589f267c14 Uploaded
devteam
parents:
diff changeset
8 GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py
6e589f267c14 Uploaded
devteam
parents:
diff changeset
9 helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py
6e589f267c14 Uploaded
devteam
parents:
diff changeset
10
6e589f267c14 Uploaded
devteam
parents:
diff changeset
11 Copyright (C)
6e589f267c14 Uploaded
devteam
parents:
diff changeset
12 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
6e589f267c14 Uploaded
devteam
parents:
diff changeset
13 2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
6e589f267c14 Uploaded
devteam
parents:
diff changeset
14 """
6e589f267c14 Uploaded
devteam
parents:
diff changeset
15
6e589f267c14 Uploaded
devteam
parents:
diff changeset
16 import re
6e589f267c14 Uploaded
devteam
parents:
diff changeset
17 import sys
6e589f267c14 Uploaded
devteam
parents:
diff changeset
18 import GFFParser
6e589f267c14 Uploaded
devteam
parents:
diff changeset
19 import helper
6e589f267c14 Uploaded
devteam
parents:
diff changeset
20
6e589f267c14 Uploaded
devteam
parents:
diff changeset
def GFFWriter(gtf_content):
    """
    Write the feature information to standard output in GFF3 format.

    A '##gff-version 3' header is printed first. Then, for every gene
    record, one 'gene' line is printed, followed for each transcript by the
    transcript line and its five_prime_UTR, CDS, three_prime_UTR and exon
    lines. Only GFF3 lines are written to stdout, so the output can be
    redirected straight into a .gff3 file.

    @args gtf_content: Parsed object from gtf file
    @type gtf_content: numpy array
    """

    # Single-argument print(...) behaves identically under Python 2 and 3.
    print('##gff-version 3')

    for ent1 in gtf_content:

        chr_name = ent1['chr']
        strand = ent1['strand']
        start = ent1['start']
        stop = ent1['stop']
        source = ent1['source']
        ID = ent1['name']

        # Fall back to the gene ID when no display name is available.
        Name = ent1['gene_info']['Name'] or ID

        print('%s\t%s\tgene\t%d\t%d\t.\t%s\t.\tID=%s;Name=%s' % (chr_name, source, start, stop, strand, ID, Name))

        for idx, tid in enumerate(ent1['transcripts']):

            exons = ent1['exons'][idx]
            cds_exons = ent1['cds_exons'][idx]
            t_id = tid[0]

            # Transcript span: first coordinate of the first exon to the last
            # coordinate of the last exon.
            # NOTE(review): assumes exons are stored in ascending order - confirm against GFFParser.
            t_start = exons[0][0]
            t_stop = exons[-1][-1]
            t_type = ent1['transcript_type'][idx]

            # UTRs can only be derived when both exon and CDS coordinates exist.
            utr5_exons, utr3_exons = [], []
            if exons.any() and cds_exons.any():
                utr5_exons, utr3_exons = helper.buildUTR(cds_exons, exons, strand)

            print('%s\t%s\t%s\t%d\t%d\t.\t%s\t.\tID=%s;Parent=%s' % (chr_name, source, t_type, t_start, t_stop, strand, t_id, ID))

            for ex_cod in utr5_exons:
                print('%s\t%s\tfive_prime_UTR\t%d\t%d\t.\t%s\t.\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, t_id))

            for ex_cod in cds_exons:
                # The third value of a CDS entry is written to the phase column.
                print('%s\t%s\tCDS\t%d\t%d\t.\t%s\t%d\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, ex_cod[2], t_id))

            for ex_cod in utr3_exons:
                print('%s\t%s\tthree_prime_UTR\t%d\t%d\t.\t%s\t.\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, t_id))

            for ex_cod in exons:
                print('%s\t%s\texon\t%d\t%d\t.\t%s\t.\tParent=%s' % (chr_name, source, ex_cod[0], ex_cod[1], strand, t_id))
6e589f267c14 Uploaded
devteam
parents:
diff changeset
70
6e589f267c14 Uploaded
devteam
parents:
diff changeset
71
6e589f267c14 Uploaded
devteam
parents:
diff changeset
def __main__():
    """
    Command-line entry point.

    Reads the GTF file named by the first command-line argument, parses it
    with GFFParser.Parse and prints the GFF3 conversion to standard output.
    When no file name is given, the module usage text is printed and the
    program exits with status -1.
    """

    try:
        gtf_fname = sys.argv[1]
    except IndexError:
        # Only a missing argument is expected here; a bare except would
        # also swallow KeyboardInterrupt and SystemExit.
        print(__doc__)
        sys.exit(-1)

    gtf_file_content = GFFParser.Parse(gtf_fname)

    GFFWriter(gtf_file_content)


if __name__ == "__main__":
    __main__()