Mercurial > repos > vipints > fml_gff3togtf
comparison bed_to_gff.py @ 10:c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
author | vipints <vipin@cbio.mskcc.org> |
---|---|
date | Thu, 23 Apr 2015 18:01:45 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
9:7d67331368f3 | 10:c42c69aa81f8 |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 Convert genome annotation data in a 12 column BED format to GFF3. | |
4 | |
5 Usage: | |
6 python bed_to_gff.py in.bed > out.gff | |
7 | |
8 Requirement: | |
9 helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py | |
10 | |
11 Copyright (C) | |
12 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. | |
13 2012-2015 Memorial Sloan Kettering Cancer Center New York City, USA. | |
14 """ | |
15 | |
16 import re | |
17 import sys | |
18 import helper | |
19 | |
def __main__():
    """
    Convert a 12 column BED file to GFF3 and write the result to stdout.

    The BED file name is taken from the first command line argument. When
    the argument is missing, the module usage text is printed and the
    process exits with status -1.

    For every accepted record one gene line, one transcript line and one
    exon line per block are written. Empty lines and lines starting with
    '#' are ignored. Records whose block count, block sizes and block
    starts are inconsistent with each other are skipped silently.

    Raises:
        ValueError: if a data line has fewer than 12 tab separated columns.
    """

    try:
        bed_fname = sys.argv[1]
    except IndexError:
        # no input file given on the command line
        print(__doc__)
        sys.exit(-1)

    bed_fh = helper.open_file(bed_fname)

    # try/finally instead of `with`: nothing visible here guarantees that the
    # handle returned by helper.open_file supports the context manager protocol
    try:
        for line in bed_fh:
            line = line.strip('\n\r')

            if not line or line[0] == '#':
                continue

            parts = line.split('\t')
            if len(parts) < 12:
                # explicit check; an assert would be stripped under `python -O`
                raise ValueError(line)

            # Block fields are addressed by their fixed position (columns
            # 10-12) so that extra trailing columns cannot shift them.
            rstarts = parts[11].split(',')
            if rstarts[-1] == '':
                rstarts.pop()  # drop the empty item left by a trailing comma

            exon_lens = parts[10].split(',')
            if exon_lens[-1] == '':
                exon_lens.pop()  # drop the empty item left by a trailing comma

            if len(rstarts) != len(exon_lens):
                continue  # checking the consistency col 11 and col 12

            if len(rstarts) != int(parts[9]):
                continue  # checking the number of exons and block count are same

            if parts[5] not in ('+', '-'):
                parts[5] = '.'  # replace the unknown strand with '.'

            chrom, name, score, strand = parts[0], parts[3], parts[4], parts[5]
            st = int(parts[1])  # BED start is 0-based, the output is 1-based

            # bed2gff result line
            sys.stdout.write('%s\tbed2gff\tgene\t%d\t%s\t%s\t%s\t.\tID=Gene:%s;Name=Gene:%s\n' % (chrom, st + 1, parts[2], score, strand, name, name))
            sys.stdout.write('%s\tbed2gff\ttranscript\t%d\t%s\t%s\t%s\t.\tID=%s;Name=%s;Parent=Gene:%s\n' % (chrom, st + 1, parts[2], score, strand, name, name, name))

            # both lists have exactly `block count` items at this point
            for rstart, exon_len in zip(rstarts, exon_lens):
                start = st + int(rstart) + 1
                stop = start + int(exon_len) - 1
                sys.stdout.write('%s\tbed2gff\texon\t%d\t%d\t%s\t%s\t.\tParent=%s\n' % (chrom, start, stop, score, strand, name))
    finally:
        # release the input handle even when a malformed line aborts the run
        bed_fh.close()


if __name__ == "__main__":
    __main__()