codonlogo: corebio/seq_io/stockholm

comparison corebio/seq_io/stockholm_io.py @ 4:4d47ab2b7bcc

Uploaded

author	davidmurphy
date	Fri, 13 Jan 2012 07:18:19 -0500
parents	c55bdc2fb9fa
children

comparison

equal deleted inserted replaced

-:09d2dac9ef73
+:4d47ab2b7bcc
+#  Copyright (c) 2005 Gavin E. Crooks <gec@threeplusone.com>
+#
+#  This software is distributed under the MIT Open Source License.
+#  <http://www.opensource.org/licenses/mit-license.html>
+#
+#  Permission is hereby granted, free of charge, to any person obtaining a
+#  copy of this software and associated documentation files (the "Software"),
+#  to deal in the Software without restriction, including without limitation
+#  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+#  and/or sell copies of the Software, and to permit persons to whom the
+#  Software is furnished to do so, subject to the following conditions:
+#
+#  The above copyright notice and this permission notice shall be included
+#  in all copies or substantial portions of the Software.
+#
+#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+#  THE SOFTWARE.
+#
+"""Read a multiple sequence alignment in STOCKHOLM format.
+This file format is used by PFAM and HMMER. At present, all annotation
+information is ignored.
+See:
+- http://www.cgb.ki.se/cgb/groups/sonnhammer/Stockholm.html
+- HMMER manual
+"""
+import re
+from corebio.utils import *
+from corebio.seq import *
+from corebio.seq_io import *
+example = """
+# STOCKHOLM 1.0
+#=GF ID CBS
+#=GF AC PF00571
+#=GF DE CBS domain
+#=GF AU Bateman A
+#=GF CC CBS domains are small intracellular modules mostly found
+#=GF CC in 2 or four copies within a protein.
+#=GF SQ 67
+#=GS O31698/18-71 AC O31698
+#=GS O83071/192-246 AC O83071
+#=GS O83071/259-312 AC O83071
+#=GS O31698/88-139 AC O31698
+#=GS O31698/88-139 OS Bacillus subtilis
+O83071/192-246          MTCRAQLIAVPRASSLAE..AIACAQKM....RVSRVPVYERS
+#=GR O83071/192-246 SA  999887756453524252..55152525....36463774777
+O83071/259-312          MQHVSAPVFVFECTRLAY..VQHKLRAH....SRAVAIVLDEY
+#=GR O83071/259-312 SS  CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEE
+O31698/18-71            MIEADKVAHVQVGNNLEH..ALLVLTKT....GYTAIPVLDPS
+#=GR O31698/18-71 SS    CCCHHHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEHHH
+O31698/88-139           EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE
+#=GR O31698/88-139 SS   CCCCCCCHHHHHHHHHHH..HEEEEEEE....EEEEEEEEEEH
+#=GC SS_cons            CCCCCHHHHHHHHHHHHH..EEEEEEEE....EEEEEEEEEEH
+O31699/88-139           EVMLTDIPRLHINDPIMK..GFGMVINN......GFVCVENDE
+#=GR O31699/88-139 AS   ________________*__________________________
+#=GR_O31699/88-139_IN   ____________1______________2__________0____
+//
+"""
+names = ("stockholm", "pfam",)
+extensions = ('sth', 'stockholm', 'align')
+header_line = re.compile(r'#\s+STOCKHOLM\s+1.\d\s+$')
+def iterseq(fin, alphabet=None):
+"""Iterate over the sequences in the file."""
+# Default implementation
+return iter(read(fin, alphabet) )
+def read(fin, alphabet=None) :
+alphabet = Alphabet(alphabet)
+seq_ids = []
+seqs = []
+block_count = 0
+for token in _scan(fin):
+if token.typeof== "begin_block":
+block_count = 0
+elif token.typeof == "seq_id":
+if len(seqs) <= block_count :
+seq_ids.append(token.data)
+seqs.append([])
+elif token.typeof == "seq":
+if not alphabet.alphabetic(token.data) :
+raise ValueError (
+"Character on line: %d not in alphabet: %s : %s" % (
+token.lineno, alphabet, token.data) )
+seqs[block_count].append(token.data)
+block_count +=1
+seqs = [ Seq("".join(s), alphabet, name= i) for s,i in zip(seqs,seq_ids)]
+return SeqList(seqs)
+def _scan( fin ):
+header, body, block = range(3)
+yield Token("begin")
+state = header
+for L, line in enumerate(fin):
+if state==header :
+if line.isspace() : continue
+m = header_line.match(line)
+state = body
+if m is not None :
+# print "header: ", m.group()
+yield Token("header", m.group() )
+continue
+else :
+raise ValueError("Parse error on line: %d" % L)
+if state == body :
+if line.isspace() : continue
+yield Token("begin_block")
+state = block
+# fall through to block
+if state ==  block:
+if line.isspace() :
+yield Token("end_block")
+state = body
+continue
+if line.strip() == '//' :
+yield Token("end_block")
+return
+if line[0] =='#' :  # Comment or annotation line
+continue
+name_seq = line.split(None,1) # Split into two parts at first whitespace
+if len(name_seq) != 2 :
+raise ValueError("Parse error on line: %d" % L)
+yield Token("seq_id", name_seq[0].strip() )
+yield Token("seq", name_seq[1].strip() )
+continue
+# END state blocks. If I ever get here something has gone terrible wrong
+raise RuntimeError()

Mercurial > repos > davidmurphy > codonlogo

comparison corebio/seq_io/stockholm_io.py @ 4:4d47ab2b7bcc