Mercurial > repos > brigidar > select_product
view select_product.py @ 0:6497d42015aa draft
Uploaded
author | brigidar |
---|---|
date | Thu, 08 Oct 2015 15:36:12 -0400 |
parents | |
children | 53f1f83e80a1 |
line wrap: on
line source
#Adapted from gbk_to_gff # Brigida Rusconi, UTSA # October 8 2015 # Finds all CDS that match the key words provided in the comma separated argument -k in the product description # adds a given number of bases on each side of the transcript for the exclusion set #!/usr/bin/env python """ Convert data from Genbank format to GFF. Usage: python gbk_to_gff.py in.gbk > out.gff Requirements: BioPython:- http://biopython.org/ helper.py:- https://github.com/vipints/GFFtools-GX/blob/master/helper.py Copyright (C) 2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany. 2012-2015 Memorial Sloan Kettering Cancer Center New York City, USA. """ import os import re import sys import helper import collections import pdb import argparse from Bio import SeqIO def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, unk, product, keys, n): """ Write the feature information """ for gname, ginfo in genes.items(): line = [str(chr_id), ginfo[3], str(ginfo[0]-int(n)), str(ginfo[1]+int(n)), ginfo[2], str(product)] # to key on specfic products only for k in keys.split(","): if k in str(product): sys.stdout.write('\t'.join(line)+"\n") break unk +=1 return unk def gbk_parse(fname,keys,n): """ Extract genome annotation recods from genbank format @args fname: gbk file name @type fname: str """ fhand = helper.open_file(gbkfname) unk = 1 for record in SeqIO.parse(fhand, "genbank"): gene_tags = dict() tx_tags = collections.defaultdict(list) exon = collections.defaultdict(list) cds = collections.defaultdict(list) mol_type, chr_id = None, None for rec in record.features: product = "NA" if rec.qualifiers.has_key('product'): product = rec.qualifiers['product'][0] if rec.type == 'source': try: mol_type = rec.qualifiers['mol_type'][0] except: mol_type = '.' pass try: chr_id = rec.qualifiers['chromosome'][0] except: chr_id = record.name continue strand='-' strand='+' if rec.strand>0 else strand fid = None try: fid = rec.qualifiers['gene'][0] except: pass transcript_id = None try: transcript_id = rec.qualifiers['transcript_id'][0] except: pass if re.search(r'gene', rec.type): gene_tags[fid] = (rec.location._start.position+1, rec.location._end.position, strand, rec.type ) elif rec.type == 'exon': exon[fid].append((rec.location._start.position+1, rec.location._end.position)) elif rec.type=='CDS': cds[fid].append((rec.location._start.position+1, rec.location._end.position)) else: # get all transcripts if transcript_id: tx_tags[fid].append((rec.location._start.position+1, rec.location._end.position, transcript_id, rec.type)) # record extracted, generate feature table unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk, product,keys,n) fhand.close() if __name__=='__main__': try: parser = argparse.ArgumentParser() parser.add_argument('-k', '--keys', help="list of products to key on in a comma delimited list") parser.add_argument('-g', '--gbk_file', help="genbank file of reference") parser.add_argument('-n', '--number', help="flanking region to include") args = parser.parse_args() gbkfname = args.gbk_file n=args.number keys=args.keys except: print __doc__ sys.exit(-1) ## extract gbk records gbk_parse(gbkfname,keys,n)