Mercurial > repos > vipints > fml_gff3togtf
diff gff_to_bed.py @ 10:c42c69aa81f8
Manually fixed the upload of version 2.1.0: deleted files that were accidentally added to the repo
author | vipints <vipin@cbio.mskcc.org> |
---|---|
date | Thu, 23 Apr 2015 18:01:45 -0400 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env python
# Recovered from the Mercurial diff: --- /dev/null  +++ b/gff_to_bed.py  @@ -0,0 +1,116 @@
"""
Convert genome annotation data in GFF/GTF to a 12 column BED format.
BED format typically represents the transcript models.

Usage: python gff_to_bed.py in.gff > out.bed

Requirement:
    GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py

Copyright (C)
    2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
    2012-2015 Memorial Sloan Kettering Cancer Center New York City, USA.
"""

import re
import sys


def limitBEDWrite(tinfo):
    """
    Write a three column BED file (contig, start, stop) to stdout.

    Identical intervals on the same contig are written once, in sorted order.

    @args tinfo: contig id -> {feature id -> (start, stop)}
    @type tinfo: dict of dicts
    """
    # NOTE(review): starts are assumed to be 1-based (GFF style) and are
    # shifted to 0-based exactly once. The previous code subtracted one both
    # when collecting and when printing, so a start of 1 came out as -1.
    for contig_id, feature in tinfo.items():
        intervals = set()
        for tloc in feature.values():
            intervals.add((int(tloc[0]) - 1, int(tloc[1])))

        for start, stop in sorted(intervals):
            pline = [contig_id, str(start), str(stop)]
            sys.stdout.write('\t'.join(pline) + "\n")


def _transcript_score(entry, idx):
    """Return entry['transcript_score'][idx], or 0 when no score is set."""
    scores = entry['transcript_score']
    return scores[idx] if scores.any() else 0


def _exon_blocks(exons):
    """
    Turn one transcript's exon coordinates into BED block fields.

    Returns (start, stop, sizes, offsets): start is the first exon's start
    minus one, stop is the last exon's end, and sizes/offsets are the
    comma-terminated strings for the BED blockSizes/blockStarts columns.
    All four are None/empty when exons is empty.
    """
    start = None
    stop = None
    sizes = []
    offsets = []
    for pos, exon in enumerate(exons):
        sizes.append('%d,' % (exon[1] - exon[0] + 1))
        if pos == 0:
            # The first exon anchors the record; block offsets are relative to it.
            start = int(exon[0]) - 1
            offsets.append('0,')
        else:
            offsets.append('%d,' % (exon[0] - 1 - start))
        # The last exon seen defines the end of the record.
        stop = int(exon[1])
    return start, stop, ''.join(sizes), ''.join(offsets)


def writeBED(tinfo):
    """
    Write the gene models in 12 column BED format to stdout.

    One line is written per transcript that has exons. An entry without any
    transcript is written as a single-block record spanning start..stop.

    @args tinfo: list of genes; each entry is indexed by 'chr', 'strand',
                 'name', 'start', 'stop', 'transcripts', 'exons' and
                 'transcript_score' (which must provide .any())
    @type tinfo: numpy object
    """
    for ent1 in tinfo:
        child_flag = False

        for idx, tid in enumerate(ent1['transcripts']):
            child_flag = True
            exons = ent1['exons'][idx]
            rel_start, rel_stop, exon_len, exon_cod = _exon_blocks(exons)

            if not exon_len:
                # Transcript without exons: nothing to report.
                continue

            out_print = [ent1['chr'],
                         str(rel_start),
                         str(rel_stop),
                         tid[0],
                         str(_transcript_score(ent1, idx)),
                         ent1['strand'],
                         str(rel_start),
                         str(rel_stop),
                         '0',
                         str(len(exons)),
                         exon_len,
                         exon_cod]
            sys.stdout.write('\t'.join(out_print) + "\n")

        if not child_flag:
            # File contains only a single parent type, i.e. the GFF3 defines
            # only one feature type: emit the parent as a one-block record.
            start = int(ent1['start'])
            stop = int(ent1['stop'])
            # Parenthesised on purpose: "'%d' % start - 1" is (str - int) and
            # raises TypeError. thickStart uses the same 0-based value as
            # chromStart, as in the transcript branch above.
            bed_start = '%d' % (start - 1)
            bed_stop = '%d' % stop

            out_print = [ent1['chr'],
                         bed_start,
                         bed_stop,
                         ent1['name'],
                         str(_transcript_score(ent1, 0)),
                         ent1['strand'],
                         bed_start,
                         bed_stop,
                         '0',
                         '1',
                         '%d,' % (stop - start + 1),
                         '0,']
            sys.stdout.write('\t'.join(out_print) + "\n")


def __main__():
    """Parse the GFF/GTF file named on the command line and print it as BED."""
    try:
        query_file = sys.argv[1]
    except IndexError:
        # No input file given: show the usage text and exit with an error.
        print(__doc__)
        sys.exit(-1)

    # Imported here so the usage message and the writer functions work even
    # when GFFParser.py is not on the import path.
    import GFFParser

    Transcriptdb = GFFParser.Parse(query_file)
    writeBED(Transcriptdb)


if __name__ == "__main__":
    __main__()