5
|
1 #!/usr/bin/env python
|
|
2 """
|
|
Convert genome annotation data in GFF/GTF to a 12 column BED format.
BED format typically represents the transcript models.

Usage: python gff_to_bed.py in.gff > out.bed

Requirement:
    GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py

Copyright (C)
    2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
    2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA.
|
|
14 """
|
|
15
|
|
16 import re
|
|
17 import sys
|
|
18 import GFFParser
|
|
19
|
|
def _bed_score(gene):
    """
    Return the BED score column for a gene record as a string.

    The first entry of gene['score'] is used; '0' is returned when the
    score container is None or empty.
    """
    scores = gene['score']
    if scores is None or len(scores) == 0:
        return '0'
    # str() so that numeric scores do not break the tab join
    return str(scores[0])


def writeBED(tinfo):
    """
    Write gene models as 12-column BED lines on standard output.

    One line is printed per transcript that has at least one exon. A gene
    record without any transcript is printed as a single-block feature
    covering the gene's own start/stop.

    The block-size arithmetic (stop - start + 1) treats the incoming
    coordinates as 1-based and inclusive, so every start is shifted by one
    to obtain the 0-based, half-open chromStart that BED uses. This keeps
    chromStart + last blockStart + last blockSize equal to chromEnd.

    @args tinfo: gene records indexed by 'chr', 'strand', 'score',
                 'transcripts' and 'exons' (plus 'name', 'start' and
                 'stop' for records without transcripts)
    @type tinfo: iterable of records; the script passes the result of
                 GFFParser.Parse
    """

    for gene in tinfo:
        has_transcript = False

        for idx, tid in enumerate(gene['transcripts']):
            has_transcript = True

            # BED blocks have to be listed in ascending order of start
            exons = sorted(gene['exons'][idx], key=lambda cod: cod[0])
            if not exons:
                # transcript without exons: nothing to report
                continue

            bed_start = int(exons[0][0]) - 1  # 1-based -> 0-based
            bed_stop = int(exons[-1][1])
            block_sizes = ''.join(
                '%d,' % (int(cod[1]) - int(cod[0]) + 1) for cod in exons)
            block_starts = ''.join(
                '%d,' % (int(cod[0]) - 1 - bed_start) for cod in exons)

            fields = [gene['chr'],
                      str(bed_start),
                      str(bed_stop),
                      tid[0],
                      _bed_score(gene),
                      gene['strand'],
                      str(bed_start),
                      str(bed_stop),
                      '0',
                      str(len(exons)),
                      block_sizes,
                      block_starts]
            print('\t'.join(fields))

        if not has_transcript:
            # the record defines only a parent feature (no child features),
            # so report it as one block spanning the whole feature
            bed_start = int(gene['start']) - 1  # 1-based -> 0-based
            bed_stop = int(gene['stop'])

            fields = [gene['chr'],
                      '%d' % bed_start,
                      '%d' % bed_stop,
                      gene['name'],
                      _bed_score(gene),
                      gene['strand'],
                      '%d' % bed_start,
                      '%d' % bed_stop,
                      '0',
                      '1',
                      '%d,' % (bed_stop - bed_start),
                      '0,']
            print('\t'.join(fields))
|
|
83
|
|
84
|
|
def __main__():
    """
    Command-line entry point.

    Reads the input file name from the first command-line argument, parses
    it with GFFParser.Parse and hands the result to writeBED. When no
    argument is given, the module docstring is printed as a usage message
    and the process exits with status -1.
    """
    try:
        query_file = sys.argv[1]
    except IndexError:
        # only a missing argument should trigger the usage message; any
        # other error must not be silently converted into it
        print(__doc__)
        sys.exit(-1)

    Transcriptdb = GFFParser.Parse(query_file)
    writeBED(Transcriptdb)
|
|
94
|
|
# Run the conversion only when the file is executed as a script.
if __name__ == "__main__":
    __main__()
|