Mercurial > repos > vipints > fml_gff3togtf
comparison gff_to_bed.py @ 10:c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
author | vipints <vipin@cbio.mskcc.org> |
---|---|
date | Thu, 23 Apr 2015 18:01:45 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
9:7d67331368f3 | 10:c42c69aa81f8 |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 Convert genome annotation data in GFF/GTF to a 12 column BED format. | |
4 BED format typically represents the transcript models. | |
5 | |
6 Usage: python gff_to_bed.py in.gff > out.bed | |
7 | |
8 Requirement: | |
9 GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py | |
10 | |
11 Copyright (C) | |
12 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. | |
13 2012-2015 Memorial Sloan Kettering Cancer Center New York City, USA. | |
14 """ | |
15 | |
16 import re | |
17 import sys | |
18 import GFFParser | |
19 | |
def limitBEDWrite(tinfo):
    """
    Write a three column BED file (contig, start, stop) to standard output.

    Duplicate intervals on the same contig are written once, and the
    intervals of each contig are written in ascending (start, stop) order.

    @args tinfo: features grouped by contig; each contig id maps to a
                 mapping of feature id -> (start, stop, ...)
    @type tinfo: dict of dict
    """

    for contig_id, feature in tinfo.items():
        # a set collapses features sharing the same interval; the start is
        # shifted by -1 exactly once here (GFF 1-based -> BED 0-based), the
        # stop is kept as is
        intervals = set((int(tloc[0]) - 1, int(tloc[1]))
                        for tloc in feature.values())

        for start, stop in sorted(intervals):
            pline = [contig_id,
                     str(start),
                     str(stop)]

            sys.stdout.write('\t'.join(pline) + "\n")
38 | |
39 | |
def writeBED(tinfo):
    """
    Write the gene models in 12 column BED format to standard output.

    One line is written per transcript that has at least one exon. A gene
    entry without any transcript is written as a single-block line built
    from the gene's own 'start', 'stop' and 'name'.

    @args tinfo: gene entries; each entry is indexed by the keys 'chr',
                 'strand', 'start', 'stop', 'name', 'transcripts', 'exons'
                 and 'transcript_score'
    @type tinfo: iterable of dict-like records; 'transcript_score' must
                 provide .any() (presumably a numpy array from GFFParser)
    """

    for gene in tinfo:
        has_transcript = False

        for idx, tid in enumerate(gene['transcripts']):
            has_transcript = True
            exons = gene['exons'][idx]  # exons of the corresponding transcript
            if len(exons) == 0:
                # nothing to report for a transcript without exons
                continue

            # BED start is 0-based: shift the first exon start by -1; the
            # line ends at the stop of the last listed exon
            # NOTE(review): assumes exons are listed in ascending order - confirm
            rel_start = int(exons[0][0]) - 1
            rel_stop = int(exons[-1][1])

            # BED block fields are comma-terminated lists
            exon_len = ''.join('%d,' % (ex_cod[1] - ex_cod[0] + 1)
                               for ex_cod in exons)
            # block starts are relative to the start of the line
            exon_cod = ''.join('%d,' % (ex_cod[0] - 1 - rel_start)
                               for ex_cod in exons)

            # fall back to 0 when no non-zero transcript score is present
            score = gene['transcript_score'][idx] if gene['transcript_score'].any() else 0

            out_print = [gene['chr'],
                         str(rel_start),
                         str(rel_stop),
                         tid[0],
                         str(score),
                         gene['strand'],
                         str(rel_start),
                         str(rel_stop),
                         '0',
                         str(len(exons)),
                         exon_len,
                         exon_cod]
            sys.stdout.write('\t'.join(out_print) + "\n")

        if not has_transcript:  # file just contains only a single parent type i.e, gff3 defines only one feature type
            score = gene['transcript_score'][0] if gene['transcript_score'].any() else 0

            # the parentheses matter: '%d' % x - 1 would subtract from the
            # formatted string and raise TypeError
            start = '%d' % (int(gene['start']) - 1)
            stop = '%d' % int(gene['stop'])

            out_print = [gene['chr'],
                         start,
                         stop,
                         gene['name'],
                         str(score),
                         gene['strand'],
                         start,  # thickStart uses the same 0-based start
                         stop,
                         '0',
                         '1',
                         '%d,' % (int(gene['stop']) - int(gene['start']) + 1),
                         '0,']

            sys.stdout.write('\t'.join(out_print) + "\n")
103 | |
104 | |
def __main__():
    """
    Command line entry point: parse the GFF/GTF file named by the first
    command line argument and write it as 12 column BED to standard output.

    Prints the module usage text and exits with status -1 when no input
    file name is given.
    """

    # only a missing argument should trigger the usage message; a bare
    # except would also hide unrelated errors
    try:
        query_file = sys.argv[1]
    except IndexError:
        # call form prints the same text on Python 2 and parses on Python 3
        print(__doc__)
        sys.exit(-1)

    Transcriptdb = GFFParser.Parse(query_file)
    writeBED(Transcriptdb)

if __name__ == "__main__":
    __main__()