diff gff_to_bed.py @ 10:c42c69aa81f8

fixed manually the upload of version 2.1.0 - deleted accidentally added files to the repo
author vipints <vipin@cbio.mskcc.org>
date Thu, 23 Apr 2015 18:01:45 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gff_to_bed.py	Thu Apr 23 18:01:45 2015 -0400
@@ -0,0 +1,116 @@
+#!/usr/bin/env python
+"""
+Convert genome annotation data in GFF/GTF to a 12 column BED format. 
+BED format typically represents the transcript models. 
+
+Usage: python gff_to_bed.py in.gff > out.bed  
+
+Requirement:
+    GFFParser.py: https://github.com/vipints/GFFtools-GX/blob/master/GFFParser.py    
+
+Copyright (C) 
+    2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
+    2012-2015 Memorial Sloan Kettering Cancer Center New York City, USA.
+"""
+
+import re
+import sys
+import GFFParser
+
+def limitBEDWrite(tinfo):
+    """
+    Write a three column BED file 
+    
+    @args tinfo: list of genes 
+    @type tinfo: numpy object  
+    """
+
+    for contig_id, feature in tinfo.items():
+        uns_line = dict()
+        for tid, tloc in feature.items():
+            uns_line[(int(tloc[0])-1, int(tloc[1]))]=1
+        for ele in sorted(uns_line):
+            pline = [contig_id,
+                    str(ele[0]-1),
+                    str(ele[1])]
+
+            sys.stdout.write('\t'.join(pline)+"\n")
+
+
+def writeBED(tinfo):
+    """
+    writing result files in bed format 
+
+    @args tinfo: list of genes 
+    @type tinfo: numpy object  
+    """
+
+    for ent1 in tinfo:
+        child_flag = False  
+
+        for idx, tid in enumerate(ent1['transcripts']):
+            child_flag = True 
+            exon_cnt = len(ent1['exons'][idx])
+            exon_len = ''
+            exon_cod = '' 
+            rel_start = None 
+            rel_stop = None 
+            for idz, ex_cod in enumerate(ent1['exons'][idx]):#check for exons of corresponding transcript  
+                exon_len += '%d,' % (ex_cod[1]-ex_cod[0]+1)
+                if idz == 0: #calculate the relative start position 
+                    exon_cod += '0,'
+                    rel_start = int(ex_cod[0])-1 
+                    rel_stop = int(ex_cod[1])
+                else:
+                    exon_cod += '%d,' % (ex_cod[0]-1-rel_start) ## shifting the coordinates to zero 
+                    rel_stop = int(ex_cod[1])
+            
+            if exon_len:
+                score = 0 
+                score = ent1['transcript_score'][idx] if ent1['transcript_score'].any() else score ## getting the transcript score 
+                out_print = [ent1['chr'],
+                            str(rel_start),
+                            str(rel_stop),
+                            tid[0],
+                            str(score), 
+                            ent1['strand'], 
+                            str(rel_start),
+                            str(rel_stop),
+                            '0',
+                            str(exon_cnt),
+                            exon_len,
+                            exon_cod]
+                sys.stdout.write('\t'.join(out_print)+"\n")
+        
+        if not child_flag: # file just contains only a single parent type i.e, gff3 defines only one feature type 
+            score = 0 
+            score = ent1['transcript_score'][0] if ent1['transcript_score'].any() else score
+
+            out_print = [ent1['chr'], 
+                        '%d' % int(ent1['start'])-1, 
+                        '%d' % int(ent1['stop']),
+                        ent1['name'], 
+                        str(score), 
+                        ent1['strand'],
+                        '%d' % int(ent1['start']), 
+                        '%d' % int(ent1['stop']),
+                        '0',
+                        '1',
+                        '%d,' % (int(ent1['stop'])-int(ent1['start'])+1), 
+                        '0,']
+
+            sys.stdout.write('\t'.join(out_print)+"\n")
+
+    
+def __main__():
+    try:
+        query_file = sys.argv[1]
+    except:
+        print __doc__
+        sys.exit(-1)
+
+    Transcriptdb = GFFParser.Parse(query_file)  
+    writeBED(Transcriptdb)
+
+if __name__ == "__main__": 
+    __main__()