Mercurial > repos > vipints > fml_gff3togtf
diff bed_to_gff.py @ 10:c42c69aa81f8
fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
author | vipints <vipin@cbio.mskcc.org> |
---|---|
date | Thu, 23 Apr 2015 18:01:45 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bed_to_gff.py Thu Apr 23 18:01:45 2015 -0400 @@ -0,0 +1,71 @@ +#!/usr/bin/env python +""" +Convert genome annotation data in a 12 column BED format to GFF3. + +Usage: + python bed_to_gff.py in.bed > out.gff + +Requirement: + helper.py : https://github.com/vipints/GFFtools-GX/blob/master/helper.py + +Copyright (C) + 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. + 2012-2015 Memorial Sloan Kettering Cancer Center New York City, USA. +""" + +import re +import sys +import helper + +def __main__(): + """ + main function + """ + + try: + bed_fname = sys.argv[1] + except: + print __doc__ + sys.exit(-1) + + bed_fh = helper.open_file(bed_fname) + + for line in bed_fh: + line = line.strip( '\n\r' ) + + if not line or line[0] in ['#']: + continue + + parts = line.split('\t') + assert len(parts) >= 12, line + + rstarts = parts[-1].split(',') + rstarts.pop() if rstarts[-1] == '' else rstarts + + exon_lens = parts[-2].split(',') + exon_lens.pop() if exon_lens[-1] == '' else exon_lens + + if len(rstarts) != len(exon_lens): + continue # checking the consistency col 11 and col 12 + + if len(rstarts) != int(parts[-3]): + continue # checking the number of exons and block count are same + + if not parts[5] in ['+', '-']: + parts[5] = '.' # replace the unknown strand with '.' + + # bed2gff result line + sys.stdout.write('%s\tbed2gff\tgene\t%d\t%s\t%s\t%s\t.\tID=Gene:%s;Name=Gene:%s\n' % (parts[0], int(parts[1])+1, parts[2], parts[4], parts[5], parts[3], parts[3])) + sys.stdout.write('%s\tbed2gff\ttranscript\t%d\t%s\t%s\t%s\t.\tID=%s;Name=%s;Parent=Gene:%s\n' % (parts[0], int(parts[1])+1, parts[2], parts[4], parts[5], parts[3], parts[3], parts[3])) + + st = int(parts[1]) + for ex_cnt in range(int(parts[-3])): + start = st + int(rstarts[ex_cnt]) + 1 + stop = start + int(exon_lens[ex_cnt]) - 1 + sys.stdout.write('%s\tbed2gff\texon\t%d\t%d\t%s\t%s\t.\tParent=%s\n' % (parts[0], start, stop, parts[4], parts[5], parts[3])) + + bed_fh.close() + + +if __name__ == "__main__": + __main__()