Mercurial > repos > davidmurphy > codonlogo
diff corebio/seq_io/nbrf_io.py @ 0:c55bdc2fb9fa
Uploaded
author | davidmurphy |
---|---|
date | Thu, 27 Oct 2011 12:09:09 -0400 |
parents | |
children |
line wrap: on
line diff
# Copyright (c) 2006, The Regents of the University of California, through
# Lawrence Berkeley National Laboratory (subject to receipt of any required
# approvals from the U.S. Dept. of Energy).  All rights reserved.

# This software is distributed under the new BSD Open Source License.
# <http://www.opensource.org/licenses/bsd-license.html>
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# (1) Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# (2) Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and or other materials provided with the distribution.
#
# (3) Neither the name of the University of California, Lawrence Berkeley
# National Laboratory, U.S. Dept. of Energy nor the names of its contributors
# may be used to endorse or promote products derived from this software
# without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

"""Sequence IO for NBRF/PIR format.

The format is similar to fasta. The header line consists of '>', a two-
letter sequence type (P1, F1, DL, DC, RL, RC, N3, N1, or XX), a semicolon,
and a sequence ID. The next line is a textual description of the sequence,
followed by one or more lines containing the sequence data. The end of
the sequence is marked by a "*" (asterisk) character.

type_code -- A map between NBRF two letter type codes and Alphabets.


see: http://www.cmbi.kun.nl/bioinf/tools/crab_pir.html

--- Example NBRF File ---

>P1;CRAB_ANAPL
ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
  MDITIHNPLI RRPLFSWLAP SRIFDQIFGE HLQESELLPA SPSLSPFLMR
  SPIFRMPSWL ETGLSEMRLE KDKFSVNLDV KHFSPEELKV KVLGDMVEIH
  GKHEERQDEH GFIAREFNRK YRIPADVDPL TITSSLSLDG VLTVSAPRKQ
  SDVPERSIPI TREEKPAIAG AQRK*

>P1;CRAB_BOVIN
ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
  MDIAIHHPWI RRPFFPFHSP SRLFDQFFGE HLLESDLFPA STSLSPFYLR
  PPSFLRAPSW IDTGLSEMRL EKDRFSVNLD VKHFSPEELK VKVLGDVIEV
  HGKHEERQDE HGFISREFHR KYRIPADVDP LAITSSLSSD GVLTVNGPRK
  QASGPERTIP ITREEKPAVT AAPKK*

"""

from corebio.utils import *
from corebio.seq import *
from corebio.seq_io import *

# Format names and file extensions under which this parser is known.
names = ("nbrf", "pir",)
extensions = ('nbrf', 'pir', 'ali')


# Maps the two-letter type code found in a record header to the alphabet
# used for that record when the caller does not supply one.
type_code = {
    'P1' : protein_alphabet,    # Protein (complete)
    'F1' : protein_alphabet,    # Protein (fragment)
    'DL' : dna_alphabet,        # DNA (linear)
    'DC' : dna_alphabet,        # DNA (circular)
    'RL' : rna_alphabet,        # RNA (linear)
    'RC' : rna_alphabet,        # RNA (circular)
    'N3' : rna_alphabet,        # tRNA
    'N1' : rna_alphabet,        # other functional RNA
    'XX' : generic_alphabet     # No specific type: use the generic alphabet
    }


def read(fin, alphabet=None):
    """Read and parse an NBRF sequence file.

    Args:
        fin -- A stream or file to read
        alphabet -- The expected alphabet of the data. If not supplied, then
            an appropriate alphabet will be inferred from the type code in
            each record header.
    Returns:
        SeqList -- A list of sequences
    Raises:
        ValueError -- If the file is unparsable
    """
    return SeqList(list(iterseq(fin, alphabet)))


def iterseq(fin, alphabet=None):
    """Generate sequences from an NBRF file.

    Each record is a '>TYPE;ID' header line, one description line, and one
    or more lines of sequence data terminated by a '*' character. Blank
    lines between records and inside the sequence data are ignored, as is
    all whitespace within the sequence data.

    Args:
        fin -- A stream or file to read
        alphabet -- The alphabet to assign to every sequence. If not
            supplied, the alphabet is looked up in type_code using the
            two-letter type code of each header.
    Yields:
        Seq -- One sequence per record, named by the record ID and
            described by the record's description line.
    Raises:
        ValueError -- On a parse error: a non-blank line where a header is
            expected, a malformed header, an unknown type code (when no
            alphabet was supplied), or a final record that is not
            terminated by '*'. Reported line numbers are 1-based.
    """
    body, header, sequence = range(3)   # Internal parser states

    state = body
    seq_id = None
    seq_desc = None
    seq_alpha = None
    chunks = []     # Pieces of sequence data collected for the current record
    lineno = 0

    for index, line in enumerate(fin):
        lineno = index + 1      # Humans count lines from one

        if state == body:
            # Between records: skip blank lines, expect a '>' header.
            if not line or line.isspace():
                continue
            if line[0] != '>':
                raise ValueError("Parse error on line: %d" % lineno)

            fields = line[1:].split(';')
            if len(fields) != 2:
                # Previously surfaced as an opaque tuple-unpacking error.
                raise ValueError(
                    "Parse error on line: %d (expected '>TYPE;ID')" % lineno)
            seq_type, seq_id = fields

            if alphabet:
                seq_alpha = alphabet
            elif seq_type in type_code:
                seq_alpha = type_code[seq_type]
            else:
                # Report as a parse error rather than leaking a KeyError.
                raise ValueError(
                    "Parse error on line: %d (unknown sequence type %r)"
                    % (lineno, seq_type))
            state = header

        elif state == header:
            # The line after the header is always the free-text description.
            seq_desc = line.strip()
            state = sequence

        else:
            # state == sequence: accumulate data until the '*' terminator.
            data = "".join(line.split())    # Strip out white space
            if not data.endswith('*'):
                if data:                    # Ignore blank lines in the data
                    chunks.append(data)
                continue

            # End of sequence data
            chunks.append(data[:-1])
            yield Seq("".join(chunks), name=seq_id.strip(),
                      description=seq_desc, alphabet=seq_alpha)

            state = body
            seq_id = None
            seq_desc = None
            chunks = []

    if state != body:
        # The input ended inside a record; do not silently drop it.
        raise ValueError(
            "Parse error on line: %d (sequence %r is not terminated by '*')"
            % (lineno, seq_id.strip()))