# HG changeset patch # User brigidar # Date 1444405951 14400 # Node ID 53f1f83e80a1a7fdc7c32344232227c76edb4dd4 # Parent 7d71f20d949ea351830a9ffe48a388bb1152915c Uploaded diff -r 7d71f20d949e -r 53f1f83e80a1 select_product.py --- a/select_product.py Fri Oct 09 11:52:18 2015 -0400 +++ b/select_product.py Fri Oct 09 11:52:31 2015 -0400 @@ -7,65 +7,73 @@ #!/usr/bin/env python -""" -Convert data from Genbank format to GFF. -Usage: -python gbk_to_gff.py in.gbk > out.gff - -Requirements: - BioPython:- http://biopython.org/ - helper.py:- https://github.com/vipints/GFFtools-GX/blob/master/helper.py - -Copyright (C) - 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. - 2012-2015 Memorial Sloan Kettering Cancer Center New York City, USA. -""" import os import re import sys -import helper import collections import pdb import argparse from Bio import SeqIO -def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, unk, product, keys, n): +def open_file(fname): + """ + Open the file (supports .gz .bz2) and returns the handler + + @args fname: input file name for reading + @type fname: str + """ + + try: + if os.path.splitext(fname)[1] == ".gz": + FH = gzip.open(fname, 'rb') + elif os.path.splitext(fname)[1] == ".bz2": + FH = bz2.BZ2File(fname, 'rb') + else: + FH = open(fname, 'rU') + except Exception as error: + sys.exit(error) + + return FH + + +def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, product, keys_1, n): """ Write the feature information """ + + lines = [] + for gname, ginfo in genes.items(): line = [str(chr_id), ginfo[3], - str(ginfo[0]-int(n)), - str(ginfo[1]+int(n)), + str(ginfo[0]-n), + str(ginfo[1]+n), ginfo[2], str(product)] # to key on specfic products only - for k in keys.split(","): + for k in keys_1.split(","): if k in str(product): - sys.stdout.write('\t'.join(line)+"\n") + lines.append('\t'.join(line)+"\n") break - - unk +=1 - return unk + return lines -def gbk_parse(fname,keys,n): +def gbk_parse(fname,keys_1,n,outfile=""): """ Extract genome annotation recods from genbank format @args fname: gbk file name @type fname: str """ - fhand = helper.open_file(gbkfname) - unk = 1 + fhand = open_file(gbkfname) + lines = [] for record in SeqIO.parse(fhand, "genbank"): gene_tags = dict() @@ -131,30 +139,39 @@ transcript_id, rec.type)) # record extracted, generate feature table - unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk, product,keys,n) + lines += feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, product,keys_1,n) fhand.close() + return lines if __name__=='__main__': try: parser = argparse.ArgumentParser() - parser.add_argument('-k', '--keys', help="list of products to key on in a comma delimited list") + parser.add_argument('-k', '--keys_1', type=str, help="list of products to key on in a comma delimited list") parser.add_argument('-g', '--gbk_file', help="genbank file of reference") - parser.add_argument('-n', '--number', help="flanking region to include") + parser.add_argument('-n', '--number', type=int, help="flanking region to include") + parser.add_argument('-o', '--output', help="output file") args = parser.parse_args() gbkfname = args.gbk_file n=args.number - keys=args.keys + keys_1=args.keys_1 + output=args.output - except: - print __doc__ + except Exception,e: + print "Error: %s" % e sys.exit(-1) - ## extract gbk records - gbk_parse(gbkfname,keys,n) + ## extract gbk records + + print "Filtering on keys: %s" % keys_1 + lines = gbk_parse(gbkfname,keys_1,n) + + with open(output, 'w') as of: + for l in lines: + of.write(l)