view corebio/seq_io/genbank_io.py @ 12:b819394a2634

Uploaded
author davidmurphy
date Wed, 22 Feb 2012 06:42:17 -0500
parents c55bdc2fb9fa
children
line wrap: on
line source

#!/usr/bin/env python
 

"""Read GenBank flat files. 

Currently only reads sequence data and not annotations.

"""
from corebio.utils import *
from corebio.seq import *

  
# Identifiers for this file format (presumably consumed by the seq_io
# package to select this parser -- confirm against the caller).
names = ('genbank',)
# File name extensions associated with this format (without the dot).
extensions = ('gb', 'genbank', 'gbk')



def read(fin, alphabet=None):
    """Parse every genbank record in a stream and collect the sequences.

    This is the eager counterpart of iterseq(): the whole input is consumed
    before anything is returned.

    Args:
    fin -- A stream or file to read
    alphabet -- The expected alphabet of the data, if given

    Returns:
    SeqList -- The sequences, in the order iterseq() produces them

    Raises:
    ValueError -- If the file is unparsable
    """
    return SeqList(list(iterseq(fin, alphabet)))

    
def iterseq(fin, alphabet=None):
    """Iterate over genbank records, yielding one sequence per record.

    Only sequence data is read. Each record must begin with a 'LOCUS' line;
    everything up to the 'ORIGIN' line is skipped, and the lines between
    'ORIGIN' and the terminating '//' are joined into the sequence.

    Args:
    fin -- A stream or file to read
    alphabet -- The expected alphabet of the data, if given

    Yields:
    Seq -- One alphabetic sequence at a time. A record that reaches '//'
           without an 'ORIGIN' line yields an empty sequence.

    Raises:
    ValueError -- If the file is unparsable (the first non-blank line of
                  a record does not start with 'LOCUS')
    """
    alphabet = Alphabet(alphabet)

    def notblank(string):
        return not isblank(string)

    def starts_sequence_or_ends_record(string):
        return string.startswith(('ORIGIN', '//'))

    lines = Reiterate(iter(fin))

    while True:
        # NOTE(review): Reiterate.filter is defined elsewhere; it is assumed
        # to raise StopIteration once the input is exhausted. That exception
        # must not escape a generator body (PEP 479 turns it into a
        # RuntimeError on Python 3.7+), so end the iteration explicitly.
        try:
            line = lines.filter(notblank)
        except StopIteration:
            return
        if not line.startswith('LOCUS'):
            raise ValueError(
                "Cannot find start of record at line %d" % lines.index())

        try:
            line = lines.filter(starts_sequence_or_ends_record)
        except StopIteration:
            # Input ended inside a record header. Stop quietly, which is
            # what the previous implementation did on Python 2.
            return

        if line.startswith('//'):
            # No sequence data
            yield Seq('', alphabet)
            continue

        # Collected per record so nothing can carry over between records.
        seq = []
        for line in lines:
            if line.startswith('//'):
                yield Seq(''.join(seq), alphabet)
                break
            # Drop the first whitespace-separated field of each sequence
            # line (presumably the base position counter) and keep the rest.
            seq.extend(line.split()[1:])
        # If the input ends before '//' is seen, the partial sequence is
        # discarded and the next pass through the loop ends the iteration
        # (unchanged legacy behaviour).