diff corebio/seq_io/clustal_io.py @ 4:4d47ab2b7bcc

Uploaded
author davidmurphy
date Fri, 13 Jan 2012 07:18:19 -0500
parents c55bdc2fb9fa
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/corebio/seq_io/clustal_io.py	Fri Jan 13 07:18:19 2012 -0500
@@ -0,0 +1,222 @@
+ 
+#  Copyright (c) 2005 Gavin E. Crooks <gec@threeplusone.com>
+#
+#  This software is distributed under the MIT Open Source License.
+#  <http://www.opensource.org/licenses/mit-license.html>
+#
+#  Permission is hereby granted, free of charge, to any person obtaining a 
+#  copy of this software and associated documentation files (the "Software"),
+#  to deal in the Software without restriction, including without limitation
+#  the rights to use, copy, modify, merge, publish, distribute, sublicense,
+#  and/or sell copies of the Software, and to permit persons to whom the
+#  Software is furnished to do so, subject to the following conditions:
+#
+#  The above copyright notice and this permission notice shall be included
+#  in all copies or substantial portions of the Software.
+#
+#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
+#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
+#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 
+#  THE SOFTWARE.
+#
+
+""" Read and write the CLUSTAL sequence file format.
+ 
+See :
+- http://www.cmpharm.ucsf.edu/~goh/Treecorr/sampleAlignment.html
+- http://www.bioperl.org/wiki/ClustalW_multiple_alignment_format
+  
+Ref :
+-   Higgins D., Thompson J., Gibson T., Thompson J.D., Higgins D.G., Gibson 
+    T.J. (1994). CLUSTAL W: improving the sensitivity of progressive multiple
+    sequence alignment through sequence weighting, position-specific gap
+    penalties and weight matrix choice. Nucleic Acids Res. 22:4673-4680.
+"""
+
+# TODO: What happens if CLUSTAL is not the first line of the file?
+
+
+import re
+
+from corebio.utils import *
+from corebio.seq import *
+from corebio.seq_io import *
+
+__all__ = ('example', 'names', 'extensions', 'read')
+
+example = """
+CLUSTAL W (1.81) multiple sequence alignment
+
+
+CXCR3_MOUSE       --------------------------LENSTSPYDYGENESD-------FSDSPPCPQDF
+BLR_HUMAN         --------------------------LENLEDLF-WELDRLD------NYNDTSLVENH-
+CXCR1_HUMAN       --------------------------MSNITDPQMWDFDDLN-------FTGMPPADEDY
+CXCR4_MURINE      -----------------------------------YTSDN---------YSGSGDYDSNK
+                                                     :  :          :..     ..
+ 
+CXCR3_MOUSE       -SL-------NFDRTFLPALYSLLFLLGLLGNGAVAAVLLSQRTALSSTDTFLLHLAVAD
+BLR_HUMAN         --LC-PATMASFKAVFVPVAYSLIFLLGVIGNVLVLVILERHRQTRSSTETFLFHLAVAD
+CXCR1_HUMAN       -SPC-MLETETLNKYVVIIAYALVFLLSLLGNSLVMLVILYSRVGRSVTDVYLLNLALAD
+CXCR4_MURINE      -EPC-RDENVHFNRIFLPTIYFIIFLTGIVGNGLVILVMGYQKKLRSMTDKYRLHLSVAD
+                             :.  .:   * ::** .::**  *  ::   :   * *: : ::*::**
+
+CXCR3_MOUSE       VLLVLTLPLWAVDAA-VQWVFGPGLCKVAGALFNINFYAGAFLLACISFDRYLSIVHATQ
+BLR_HUMAN         LLLVFILPFAVAEGS-VGWVLGTFLCKTVIALHKVNFYCSSLLLACIAVDRYLAIVHAVH
+CXCR1_HUMAN       LLFALTLPIWAASKV-NGWIFGTFLCKVVSLLKEVNFYSGILLLACISVDRYLAIVHATR
+CXCR4_MURINE      LLFVITLPFWAVDAM-ADWYFGKFLCKAVHIIYTVNLYSSVLILAFISLDRYLAIVHATN
+                  :*:.: **: ...     * :*  ***..  :  :*:*.. ::** *:.****:****..
+"""
+
+
+
+names = ("clustal", "clustalw",)
+extensions = ('aln',)
+
+
+header_line = re.compile(r'(CLUSTAL.*)$')
+
+# (sequence_id) (Sequence) (Optional sequence number)
+seq_line   = re.compile(r'(\s*\S+\s+)(\S+)\s*(\d*)$')
+
+# Saved group includes variable length leading space.
+# Must consult a seq_line to figure out how long the leading spoace is since
+# the maximum CLUSTAL ids length (normally 10 characters) can be changed.
+match_line = re.compile(r'([\s:\.\*]*)$') 
+
+
+def iterseq(fin, alphabet=None):
+    """Iterate over the sequences in the file."""
+    # Default implementation
+    return iter(read(fin, alphabet) )
+
+
+def read(fin, alphabet=None) :  
+    alphabet = Alphabet(alphabet)      
+    seq_ids = []
+    seqs = []
+    block_count = 0
+    
+    
+    for token in _scan(fin):
+        if token.typeof== "begin_block":
+            block_count = 0
+        elif token.typeof == "seq_id":
+            if len(seqs) <= block_count :
+                seq_ids.append(token.data)
+                seqs.append([])
+        elif token.typeof == "seq":
+            if not alphabet.alphabetic(token.data) :
+                raise ValueError(
+                    "Character on line: %d not in alphabet: %s : %s" % (
+                    token.lineno, alphabet, token.data) )
+            seqs[block_count].append(token.data)
+            block_count +=1
+
+          
+    seqs = [ Seq("".join(s), alphabet, name= i) for s,i in zip(seqs,seq_ids)]
+    return SeqList(seqs)
+
+
+# 1) The word "CLUSTAL" should be the first word on the first line of the file.
+# (But sometimes isn't.)
+# 2) The alignment is displayed in blocks of fixed length.
+# 3) Each line in the block corresponds to one sequence.
+# 4) Each sequence line starts with a sequence name followed by at least one
+#     space and then the sequence.
+
+def _scan( fin ):
+    """Scan a clustal format MSA file and yeild tokens.
+        The basic file structure is
+            begin_document
+                header?     
+               (begin_block
+                   (seq_id seq seq_index?)+
+                   match_line?
+               end_block)*
+            end_document     
+    
+        Usage:
+        for token in scan(clustal_file):
+            do_somthing(token)
+    """
+    header, body, block = range(3)
+    
+    yield Token("begin")
+    leader_width = -1
+    state = header
+    for L, line in enumerate(fin):
+        if state==header :
+            if line.isspace() : continue
+            m = header_line.match(line)
+            state = body
+            if m is not None :
+                yield Token("header", m.group() )
+                continue
+            else :
+                raise ValueError("Cannot find required header")
+
+        
+        if state == body :
+            if line.isspace() : continue
+            yield Token("begin_block")
+            state = block
+            # fall through to block
+        
+        if state ==  block:
+            if line.isspace() :
+                yield Token("end_block")
+                state = body
+                continue
+            
+            m = match_line.match(line)
+            if m is not None :
+                yield Token("match_line", line[leader_width:-1])
+                continue
+     
+            m = seq_line.match(line) 
+            if m is None: 
+                raise ValueError("Parse error on line: %d" % L)
+            leader_width = len(m.group(1))
+            yield Token("seq_id", m.group(1).strip() )
+            yield Token("seq", m.group(2).strip() )
+            if m.group(3)  :
+                yield Token("seq_num", m.group(3)) 
+            continue
+
+        # END state blocks. If I ever get here something has gone terrible wrong
+        raise RuntimeError()
+    
+    if state==block:
+         yield Token("end_block")
+    yield Token("end")     
+    return
+
+def write(fout, seqs) :
+    """Write 'seqs' to 'fout' as text in clustal format"""
+    header = "CLUSTAL W (1.81) multiple sequence alignment"
+    name_width = 17
+    seq_width = 60
+    
+    print >>fout, header
+    print >>fout
+    print >>fout 
+    
+    L = 0
+    for s in seqs: L = max(L, len(s))
+    
+    for block in range(0, L, seq_width):
+        for s  in seqs :
+            start = min(block, len(s))
+            end = min( start+seq_width, len(s))
+            print  >>fout, s.name.ljust(name_width),
+            print  >>fout, s[start:end]
+        print  >>fout 
+        
+        
+        
+
+
+