Mercurial > repos > brigidar > select_product
view select_product.py @ 8:53f1f83e80a1 draft
Uploaded
author | brigidar |
---|---|
date | Fri, 09 Oct 2015 11:52:31 -0400 |
parents | 6497d42015aa |
children | 82186605ba75 |
line wrap: on
line source
#Adapted from gbk_to_gff # Brigida Rusconi, UTSA # October 8 2015 # Finds all CDS that match the key words provided in the comma separated argument -k in the product description # adds a given number of bases on each side of the transcript for the exclusion set #!/usr/bin/env python import os import re import sys import collections import pdb import argparse from Bio import SeqIO def open_file(fname): """ Open the file (supports .gz .bz2) and returns the handler @args fname: input file name for reading @type fname: str """ try: if os.path.splitext(fname)[1] == ".gz": FH = gzip.open(fname, 'rb') elif os.path.splitext(fname)[1] == ".bz2": FH = bz2.BZ2File(fname, 'rb') else: FH = open(fname, 'rU') except Exception as error: sys.exit(error) return FH def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, product, keys_1, n): """ Write the feature information """ lines = [] for gname, ginfo in genes.items(): line = [str(chr_id), ginfo[3], str(ginfo[0]-n), str(ginfo[1]+n), ginfo[2], str(product)] # to key on specfic products only for k in keys_1.split(","): if k in str(product): lines.append('\t'.join(line)+"\n") break return lines def gbk_parse(fname,keys_1,n,outfile=""): """ Extract genome annotation recods from genbank format @args fname: gbk file name @type fname: str """ fhand = open_file(gbkfname) lines = [] for record in SeqIO.parse(fhand, "genbank"): gene_tags = dict() tx_tags = collections.defaultdict(list) exon = collections.defaultdict(list) cds = collections.defaultdict(list) mol_type, chr_id = None, None for rec in record.features: product = "NA" if rec.qualifiers.has_key('product'): product = rec.qualifiers['product'][0] if rec.type == 'source': try: mol_type = rec.qualifiers['mol_type'][0] except: mol_type = '.' pass try: chr_id = rec.qualifiers['chromosome'][0] except: chr_id = record.name continue strand='-' strand='+' if rec.strand>0 else strand fid = None try: fid = rec.qualifiers['gene'][0] except: pass transcript_id = None try: transcript_id = rec.qualifiers['transcript_id'][0] except: pass if re.search(r'gene', rec.type): gene_tags[fid] = (rec.location._start.position+1, rec.location._end.position, strand, rec.type ) elif rec.type == 'exon': exon[fid].append((rec.location._start.position+1, rec.location._end.position)) elif rec.type=='CDS': cds[fid].append((rec.location._start.position+1, rec.location._end.position)) else: # get all transcripts if transcript_id: tx_tags[fid].append((rec.location._start.position+1, rec.location._end.position, transcript_id, rec.type)) # record extracted, generate feature table lines += feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, product,keys_1,n) fhand.close() return lines if __name__=='__main__': try: parser = argparse.ArgumentParser() parser.add_argument('-k', '--keys_1', type=str, help="list of products to key on in a comma delimited list") parser.add_argument('-g', '--gbk_file', help="genbank file of reference") parser.add_argument('-n', '--number', type=int, help="flanking region to include") parser.add_argument('-o', '--output', help="output file") args = parser.parse_args() gbkfname = args.gbk_file n=args.number keys_1=args.keys_1 output=args.output except Exception,e: print "Error: %s" % e sys.exit(-1) ## extract gbk records print "Filtering on keys: %s" % keys_1 lines = gbk_parse(gbkfname,keys_1,n) with open(output, 'w') as of: for l in lines: of.write(l)