changeset 8:53f1f83e80a1 draft

Uploaded
author brigidar
date Fri, 09 Oct 2015 11:52:31 -0400
parents 7d71f20d949e
children 0c769b205d18
files select_product.py
diffstat 1 files changed, 50 insertions(+), 33 deletions(-) [+]
line wrap: on
line diff
--- a/select_product.py	Fri Oct 09 11:52:18 2015 -0400
+++ b/select_product.py	Fri Oct 09 11:52:31 2015 -0400
@@ -7,65 +7,73 @@
 
 
 #!/usr/bin/env python
-"""
-Convert data from Genbank format to GFF. 
 
-Usage: 
-python gbk_to_gff.py in.gbk > out.gff 
-
-Requirements:
-    BioPython:- http://biopython.org/
-    helper.py:- https://github.com/vipints/GFFtools-GX/blob/master/helper.py
-
-Copyright (C) 
-    2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
-    2012-2015 Memorial Sloan Kettering Cancer Center New York City, USA.
-"""
 
 import os
 import re
 import sys
-import helper 
 import collections
 import pdb
 import argparse
 from Bio import SeqIO
 
-def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, unk, product, keys, n):
+def open_file(fname):
+    """
+        Open the file (supports .gz .bz2) and returns the handler
+        
+        @args fname: input file name for reading
+        @type fname: str
+        """
+    
+    try:
+        if os.path.splitext(fname)[1] == ".gz":
+            FH = gzip.open(fname, 'rb')
+        elif os.path.splitext(fname)[1] == ".bz2":
+            FH = bz2.BZ2File(fname, 'rb')
+        else:
+            FH = open(fname, 'rU')
+    except Exception as error:
+        sys.exit(error)
+    
+    return FH
+
+
+def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, product, keys_1, n):
     """
     Write the feature information
     """
+    
+    lines = []
+    
     for gname, ginfo in genes.items():
         line = [str(chr_id),
                 ginfo[3],
-                str(ginfo[0]-int(n)),
-                str(ginfo[1]+int(n)),
+                str(ginfo[0]-n),
+                str(ginfo[1]+n),
                 ginfo[2],
                 str(product)]
                 
 # to key on specfic products only
-        for k in keys.split(","):
+        for k in keys_1.split(","):
             if k in str(product):
         
-                sys.stdout.write('\t'.join(line)+"\n")
+                lines.append('\t'.join(line)+"\n")
                 break
 
-
-        unk +=1
-    return unk
+    return lines
 
 
 
 
-def gbk_parse(fname,keys,n):
+def gbk_parse(fname,keys_1,n,outfile=""):
     """
     Extract genome annotation recods from genbank format 
 
     @args fname: gbk file name 
     @type fname: str
     """
-    fhand = helper.open_file(gbkfname)
-    unk = 1 
+    fhand = open_file(gbkfname)
+    lines = []
 
     for record in SeqIO.parse(fhand, "genbank"):
         gene_tags = dict()
@@ -131,30 +139,39 @@
                                     transcript_id,
                                     rec.type))
         # record extracted, generate feature table
-        unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk, product,keys,n)
+        lines += feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, product,keys_1,n)
         
     fhand.close()
 
+    return lines
 
 if __name__=='__main__': 
 
     try:
         parser = argparse.ArgumentParser()
-        parser.add_argument('-k', '--keys', help="list of products to key on in a comma delimited list")
+        parser.add_argument('-k', '--keys_1', type=str, help="list of products to key on in a comma delimited list")
         parser.add_argument('-g', '--gbk_file', help="genbank file of reference")
-        parser.add_argument('-n', '--number', help="flanking region to include")
+        parser.add_argument('-n', '--number', type=int, help="flanking region to include")
+        parser.add_argument('-o', '--output', help="output file")
 
         args = parser.parse_args()
         gbkfname = args.gbk_file
         n=args.number
-        keys=args.keys
+        keys_1=args.keys_1
+        output=args.output
     
-    except:
-        print __doc__
+    except Exception,e:
+        print "Error: %s" % e
         sys.exit(-1)
 
-    ## extract gbk records  
-    gbk_parse(gbkfname,keys,n)
+    ## extract gbk records
+
+    print "Filtering on keys: %s" % keys_1
+    lines = gbk_parse(gbkfname,keys_1,n)
+
+    with open(output, 'w') as of:
+        for l in lines:
+            of.write(l)