Mercurial > repos > vipints > fml_gff3togtf
comparison gff_to_bed.py @ 5:6e589f267c14
Uploaded
author | devteam |
---|---|
date | Tue, 04 Nov 2014 12:15:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
4:619e0fcd9126 | 5:6e589f267c14 |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 Convert genome annotation data in GFF/GTF to a 12 column BED format. | |
4 BED format typically represents the transcript models. | |
5 | |
6 Usage: python gff_to_bed.py in.gff > out.bed | |
7 | |
8 Requirement: | |
9 GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py | |
10 | |
11 Copyright (C) | |
12 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. | |
13 2012-2014 Memorial Sloan Kettering Cancer Center New York City, USA. | |
14 """ | |
15 | |
16 import re | |
17 import sys | |
18 import GFFParser | |
19 | |
def _bed_score(gene):
    """Return the BED score column for a gene record as a string."""
    # First recorded score when one is present, otherwise the '0' default.
    # str() guards against a numeric score, which '\t'.join() would reject.
    return str(gene['score'][0]) if gene['score'] else '0'


def _write_bed_row(chrom, name, score, strand, blocks):
    """
    Print one 12-column BED line to standard output.

    @args blocks: non-empty list of (start, stop) integer pairs given in
        1-based inclusive coordinates; the first pair supplies the feature
        start and the last pair supplies the feature stop
    """
    first_start = blocks[0][0]
    # BED chromStart is 0-based while chromEnd is exclusive, so only the
    # start of a 1-based inclusive interval has to be shifted.
    bed_start = str(first_start - 1)
    bed_stop = str(blocks[-1][1])

    # blockSizes / blockStarts are comma-terminated lists; the block starts
    # are offsets from the feature start, so the first one is always 0.
    block_sizes = ''.join(['%d,' % (stop - start + 1) for start, stop in blocks])
    block_starts = ''.join(['%d,' % (start - first_start) for start, stop in blocks])

    row = [chrom,
           bed_start,
           bed_stop,
           name,
           score,
           strand,
           bed_start,       # thickStart mirrors chromStart
           bed_stop,        # thickEnd mirrors chromEnd
           '0',             # itemRgb
           str(len(blocks)),
           block_sizes,
           block_starts]
    sys.stdout.write('\t'.join(row) + '\n')


def writeBED(tinfo):
    """
    Write gene models to standard output in 12 column BED format.

    One line is written per transcript that has at least one exon. A record
    without any transcript is written as a single-block line built from the
    record's own start/stop coordinates. Transcripts without exons produce
    no output.

    GFF/GTF coordinates are 1-based and inclusive whereas BED uses a 0-based
    start and an exclusive end, so the start columns are shifted down by one.
    NOTE(review): assumes the parser hands over the GFF coordinates unchanged
    and lists the exons of a transcript by ascending start - confirm against
    GFFParser.

    @args tinfo: gene records (as produced by GFFParser.Parse); each record
        is indexed by 'chr', 'strand', 'score', 'name', 'start', 'stop',
        'transcripts' and 'exons'
    @type tinfo: numpy object
    """

    for gene in tinfo:
        has_transcript = False

        for idx, tid in enumerate(gene['transcripts']):
            has_transcript = True
            # exons of the corresponding transcript, as plain integer pairs
            exons = [(int(ex[0]), int(ex[1])) for ex in gene['exons'][idx]]
            if not exons:
                continue

            _write_bed_row(gene['chr'],
                           tid[0],
                           _bed_score(gene),
                           gene['strand'],
                           exons)

        if not has_transcript:
            # file just contains a single parent type, i.e. the gff3 defines
            # only one feature type: report the record itself as one block
            whole_feature = [(int(gene['start']), int(gene['stop']))]
            _write_bed_row(gene['chr'],
                           gene['name'],
                           _bed_score(gene),
                           gene['strand'],
                           whole_feature)
83 | |
84 | |
def __main__():
    """
    Command line entry point.

    Reads the input file name from the first command line argument, parses
    it with GFFParser.Parse and writes the result with writeBED. When no
    argument is given, the module usage text is printed and the program
    exits with status -1.
    """
    try:
        query_file = sys.argv[1]
    except IndexError:
        # Only a missing argument means "show usage"; any other exception
        # (including Ctrl-C) must propagate instead of being swallowed.
        sys.stdout.write('%s\n' % __doc__)
        sys.exit(-1)

    Transcriptdb = GFFParser.Parse(query_file)
    writeBED(Transcriptdb)


if __name__ == "__main__":
    __main__()