Mercurial > repos > davidmurphy > codonlogo
view corebio/seq_io/genbank_io.py @ 9:f3462128e87c
Minor alterations to the galaxy interface with some better examples and error messages added.
author | davidmurphy |
---|---|
date | Mon, 30 Jan 2012 08:17:57 -0500 |
parents | c55bdc2fb9fa |
children |
line wrap: on
line source
#!/usr/bin/env python
"""Read GenBank flat files.

Currently only reads sequence data and not annotations.
"""
from corebio.utils import *
from corebio.seq import *

# Format names and file extensions declared by this reader module.
names = ('genbank',)
extensions = ('gb', 'genbank', 'gbk')


def read(fin, alphabet=None):
    """Read and parse a file of genbank records.

    Args:
    fin -- A stream or file to read
    alphabet -- The expected alphabet of the data, if given

    Returns:
    SeqList -- A list of sequences

    Raises:
    ValueError -- If the file is unparsable
    """
    # Drain the record generator completely before building the SeqList.
    return SeqList(list(iterseq(fin, alphabet)))


def iterseq(fin, alphabet=None):
    """Iterate over genbank records.

    Each record must begin (after any blank lines) with a 'LOCUS' line.
    Sequence data is taken from the lines between 'ORIGIN' and the '//'
    record terminator. A record that reaches '//' without an 'ORIGIN'
    line yields an empty sequence.

    Args:
    fin -- A stream or file to read
    alphabet -- The expected alphabet of the data, if given

    Yields:
    Seq -- One alphabetic sequence at a time.

    Raises:
    ValueError -- If the file is unparsable
    """
    alphabet = Alphabet(alphabet)

    def notblank(string):
        return not isblank(string)

    def ends_header(string):
        # The header/annotation part of a record stops at the start of
        # the sequence block or at the record terminator.
        return string.startswith('ORIGIN') or string.startswith('//')

    lines = Reiterate(iter(fin))

    while True:
        # NOTE(review): Reiterate is defined in corebio.utils and is not
        # visible here. The original loop had no exit of its own, so end of
        # input is presumably signalled by StopIteration escaping filter()
        # -- confirm. It is caught explicitly because a StopIteration that
        # leaks out of a generator body is turned into RuntimeError by
        # PEP 479 (Python 3.7+); returning ends the iteration cleanly on
        # both Python 2 and Python 3.
        try:
            line = lines.filter(notblank)
        except StopIteration:
            return

        if not line.startswith('LOCUS'):
            raise ValueError(
                "Cannot find start of record at line %d" % lines.index())

        try:
            line = lines.filter(ends_header)
        except StopIteration:
            # Input ended inside a record header. As before, the partial
            # record is dropped without raising.
            return

        if line.startswith('//'):
            # No sequence data
            yield Seq('', alphabet)
            continue

        # Collect the sequence chunks of this record only.
        seq = []
        for line in lines:
            if line.startswith('//'):
                yield Seq(''.join(seq), alphabet)
                break
            # Drop the first whitespace-separated token of the line (the
            # running base position in a GenBank ORIGIN block) and keep
            # the remaining tokens as sequence data.
            seq.extend(line.split()[1:])
        # If the input ends before '//' the loop above finishes without
        # yielding; the next filter() call then ends the generator, so a
        # truncated final record is silently discarded (unchanged behavior).