view select_product.py @ 1:c76d2a6bedee draft

Uploaded
author brigidar
date Thu, 08 Oct 2015 15:36:47 -0400
parents 6497d42015aa
children 53f1f83e80a1
line wrap: on
line source

#Adapted from gbk_to_gff
# Brigida Rusconi, UTSA
# October 8 2015
# Finds all CDS that match the key words provided in the comma separated argument -k in the product description
# adds a given number of bases on each side of the transcript for the exclusion set



#!/usr/bin/env python
"""
Convert data from Genbank format to GFF. 

Usage: 
python gbk_to_gff.py in.gbk > out.gff 

Requirements:
    BioPython:- http://biopython.org/
    helper.py:- https://github.com/vipints/GFFtools-GX/blob/master/helper.py

Copyright (C) 
    2009-2012 Friedrich Miescher Laboratory of the Max Planck Society, Tubingen, Germany.
    2012-2015 Memorial Sloan Kettering Cancer Center New York City, USA.
"""

import os
import re
import sys
import helper 
import collections
import pdb
import argparse
from Bio import SeqIO

def feature_table(chr_id, source, orient, genes, transcripts, cds, exons, unk, product, keys, n):
    """
    Write the feature information
    """
    for gname, ginfo in genes.items():
        line = [str(chr_id),
                ginfo[3],
                str(ginfo[0]-int(n)),
                str(ginfo[1]+int(n)),
                ginfo[2],
                str(product)]
                
# to key on specfic products only
        for k in keys.split(","):
            if k in str(product):
        
                sys.stdout.write('\t'.join(line)+"\n")
                break


        unk +=1
    return unk




def gbk_parse(fname,keys,n):
    """
    Extract genome annotation recods from genbank format 

    @args fname: gbk file name 
    @type fname: str
    """
    fhand = helper.open_file(gbkfname)
    unk = 1 

    for record in SeqIO.parse(fhand, "genbank"):
        gene_tags = dict()
        tx_tags = collections.defaultdict(list) 
        exon = collections.defaultdict(list) 
        cds = collections.defaultdict(list) 
        mol_type, chr_id = None, None 

        for rec in record.features:


            product = "NA"
            
            if rec.qualifiers.has_key('product'):
            
                product = rec.qualifiers['product'][0]
            

            if rec.type == 'source':
                try:
                    mol_type = rec.qualifiers['mol_type'][0]
                except:
                    mol_type = '.'
                    pass 
                try:
                    chr_id = rec.qualifiers['chromosome'][0]
                except:
                    chr_id = record.name 
                continue 

            strand='-'
            strand='+' if rec.strand>0 else strand
            
            fid = None 
            try:
                fid = rec.qualifiers['gene'][0]
            except:
                pass

            transcript_id = None
            try:
                transcript_id = rec.qualifiers['transcript_id'][0]
            except:
                pass 

            if re.search(r'gene', rec.type):
                gene_tags[fid] = (rec.location._start.position+1, 
                                    rec.location._end.position, 
                                    strand,
                                    rec.type
                                    )
            elif rec.type == 'exon':
                exon[fid].append((rec.location._start.position+1, 
                                    rec.location._end.position))
            elif rec.type=='CDS':
                cds[fid].append((rec.location._start.position+1, 
                                    rec.location._end.position))
            else: 
                # get all transcripts 
                if transcript_id: 
                    tx_tags[fid].append((rec.location._start.position+1,
                                    rec.location._end.position, 
                                    transcript_id,
                                    rec.type))
        # record extracted, generate feature table
        unk = feature_table(chr_id, mol_type, strand, gene_tags, tx_tags, cds, exon, unk, product,keys,n)
        
    fhand.close()


if __name__=='__main__': 

    try:
        parser = argparse.ArgumentParser()
        parser.add_argument('-k', '--keys', help="list of products to key on in a comma delimited list")
        parser.add_argument('-g', '--gbk_file', help="genbank file of reference")
        parser.add_argument('-n', '--number', help="flanking region to include")

        args = parser.parse_args()
        gbkfname = args.gbk_file
        n=args.number
        keys=args.keys
    
    except:
        print __doc__
        sys.exit(-1)

    ## extract gbk records  
    gbk_parse(gbkfname,keys,n)