codonlogo: corebio/resource/scop.py comparison

comparison corebio/resource/scop.py @ 7:8d676bbd1f2d

Uploaded

author	davidmurphy
date	Mon, 16 Jan 2012 07:03:36 -0500
parents	c55bdc2fb9fa
children

comparison

equal deleted inserted replaced

-:4a4aca3d57c9
+:8d676bbd1f2d
+# Copyright 2000 by Jeffrey Chang.  All rights reserved.
+# Copyright 2001 by Gavin E. Crooks.  All rights reserved.
+# Modifications Copyright 2004/2005 James Casbon.
+# Copyright 2005 by Regents of the University of California. All rights Reserved.
+#   (Major rewrite for conformance to corebio. Gavin Crooks)
+#
+# This code is derived from the Biopython distribution and is governed by it's
+# license.  Please see the LICENSE file that should have been included
+# as part of this package.
+""" SCOP: Structural Classification of Proteins.
+The SCOP database aims to provide a manually constructed classification of
+all known protein structures into a hierarchy, the main levels of which
+are family, superfamily and fold.
+* SCOP: http://scop.mrc-lmb.cam.ac.uk/scop/
+* Introduction: http://scop.mrc-lmb.cam.ac.uk/scop/intro.html
+* SCOP parsable files: http://scop.mrc-lmb.cam.ac.uk/scop/parse/
+The Scop object in this module represents the entire SCOP classification. It
+can be built from the three SCOP parsable files (see DesRecord, HieRecord and
+ClaRecord), modified if so desired, and converted back to the same file formats.
+A single SCOP domain (represented by the Domain class) can be obtained from
+Scop using the domain's SCOP identifier (sid).
+Classes:
+- Scop     -- The entire SCOP hierarchy.
+- Node     -- A node in the SCOP hierarchy.
+- Domain   -- A SCOP domain.
+- Residues -- A collection of residues from a PDB structure.
+- HieRecord -- Handle the SCOP HIErarchy files.
+- DesRecord -- Handle the SCOP DEScription file.
+- ClaRecord -- Handle the SCOP CLAssification file.
+nodeCodeDict  -- A mapping between known 2 letter node codes and a longer
+description. The known node types are 'cl' (class), 'cf'
+(fold), 'sf' (superfamily), 'fa' (family), 'dm' (protein),
+'sp' (species), 'px' (domain). Additional node types may
+be added in the future.
+"""
+import os, re
# Known two-letter SCOP node codes mapped to their longer descriptions.
nodeCodeDict = {
    'cl': 'class',
    'cf': 'fold',
    'sf': 'superfamily',
    'fa': 'family',
    'dm': 'protein',
    'sp': 'species',
    'px': 'domain',
}

# Reverse lookup: longer description -> two-letter node code.
_nodetype_to_code = dict((name, code) for code, name in nodeCodeDict.items())

# Node codes listed from the root ('ro') down to the domain level ('px').
nodeCodeOrder = ['ro', 'cl', 'cf', 'sf', 'fa', 'dm', 'sp', 'px']
def cmp_sccs(sccs1, sccs2):
    """Order SCOP concise classification strings (sccs).

    a.4.5.1 < a.4.5.11 < b.1.1.1

    A sccs (e.g. a.4.5.11) compactly represents a domain's classification.
    The letter represents the class, and the numbers are the fold,
    superfamily, and family, respectively.

    Returns -1, 0 or 1 when sccs1 sorts before, equal to, or after sccs2.
    The class letters are compared as strings; the remaining components are
    compared numerically, so that 'a.4.5.2' sorts before 'a.4.5.11'.

    Raises ValueError if a component after the class letter is not an
    integer.
    """
    def _three_way(a, b):
        # Portable stand-in for the cmp() builtin, which Python 3 removed.
        return (a > b) - (a < b)

    parts1 = sccs1.split(".")
    parts2 = sccs2.split(".")
    if parts1[0] != parts2[0]:
        return _three_way(parts1[0], parts2[0])

    # Build real lists: on Python 3 map() returns an iterator, and
    # iterators cannot be ordered against each other.
    numbers1 = [int(part) for part in parts1[1:]]
    numbers2 = [int(part) for part in parts2[1:]]
    return _three_way(numbers1, numbers2)
def _open_scop_file(scop_dir_path, version, filetype):
    """Open one SCOP parsable file for reading and return the file object.

    The file is looked up inside scop_dir_path under the name
    dir.<filetype>.scop.txt_<version>. The caller is responsible for
    closing the returned file.
    """
    path = os.path.join(scop_dir_path,
                        "dir.%s.scop.txt_%s" % (filetype, version))
    return open(path)
class Scop(object):
    """The entire SCOP hierarchy.

    root           -- The root node of the hierarchy
    domains        -- A sequence of all domains (a tuple after parsing)
    nodes_by_sunid -- A dictionary of nodes indexed by SCOP unique
                      identifier (e.g. 14996)
    domains_by_sid -- A dictionary of domains indexed by SCOP identifier
                      (e.g. 'd1hbia_')
    """

    def __init__(self):
        """ An empty Scop object.

        See also Scop.parse() and Scop.parse_files()
        """
        self.root = None
        self.domains = []
        self.nodes_by_sunid = dict()
        self.domains_by_sid = dict()

    @classmethod
    def parse(cls, dir_path, version):
        """Build the SCOP hierarchy from the SCOP parsable files.

        - dir_path -- A directory that contains the SCOP files
        - version  -- The SCOP version (as a string)

        The SCOP files are named dir.XXX.scop.txt_VERSION, where XXX
        is 'cla', 'des' or 'hie'.

        Returns the populated Scop object. Any file that was opened is
        closed again, whether or not parsing succeeds.
        """
        cla_file = None
        des_file = None
        hie_file = None
        try:
            cla_file = _open_scop_file(dir_path, version, 'cla')
            des_file = _open_scop_file(dir_path, version, 'des')
            hie_file = _open_scop_file(dir_path, version, 'hie')
            scop = cls.parse_files(cla_file, des_file, hie_file)
        finally:
            # If we opened the files, we close the files
            for handle in (cla_file, des_file, hie_file):
                if handle:
                    handle.close()
        return scop

    @classmethod
    def parse_files(cls, cla_file, des_file, hie_file):
        """Build the SCOP hierarchy from the SCOP parsable files.

        - cla_file -- the CLA classification file
        - des_file -- the DES description file
        - hie_file -- the HIE hierarchy file

        Each argument is an iterable of lines (e.g. an open file).

        Raises ValueError if the HIE file refers to a sunid that the DES
        file did not define.
        """
        self = cls()

        # The root is not listed in the DES file, so it is created by hand.
        root = Node()
        root.sunid = 0
        root.type = 'ro'
        root.description = 'SCOP Root'
        sunidDict = {root.sunid: root}
        domains = []

        # Build the rest of the nodes using the DES file
        for rec in DesRecord.records(des_file):
            if rec.nodetype == 'px':
                n = Domain()
                n.sid = rec.name
                domains.append(n)
            else:
                n = Node()
            n.sunid = rec.sunid
            n.type = rec.nodetype
            n.sccs = rec.sccs
            n.description = rec.description
            sunidDict[n.sunid] = n

        # Glue all of the Nodes together using the HIE file
        for rec in HieRecord.records(hie_file):
            if rec.sunid not in sunidDict:
                # Same failure as the parent/child checks below: the HIE
                # file mentions a node that the DES file never described.
                raise ValueError("Incomplete data?")
            n = sunidDict[rec.sunid]
            if rec.parent != '':  # Not root node
                if rec.parent not in sunidDict:
                    raise ValueError("Incomplete data?")
                n.parent = sunidDict[rec.parent]
            for c in rec.children:
                if c not in sunidDict:
                    raise ValueError("Incomplete data?")
                n.children.append(sunidDict[c])

        # Fill in the gaps with information from the CLA file
        sidDict = {}
        for rec in ClaRecord.records(cla_file):
            n = sunidDict[rec.sunid]
            assert n.sccs == rec.sccs
            assert n.sid == rec.sid
            n.residues = rec.residues
            sidDict[n.sid] = n

        # Clean up
        self.root = root
        self.nodes_by_sunid = sunidDict
        self.domains_by_sid = sidDict
        self.domains = tuple(domains)
        return self

    def write_hie(self, stream):
        """Build an HIE SCOP parsable file from this object"""
        # We order nodes by sunid to ease comparison with original file
        for n in sorted(self.nodes_by_sunid.values(),
                        key=lambda node: node.sunid):
            stream.write(str(n.to_hie_record()))

    def write_des(self, stream):
        """Build a DES SCOP parsable file from this object"""
        # Original SCOP file is not ordered?
        for n in sorted(self.nodes_by_sunid.values(),
                        key=lambda node: node.sunid):
            # The root node has no DES record of its own.
            if n != self.root:
                stream.write(str(n.to_des_record()))

    def write_cla(self, stream):
        """Build a CLA SCOP parsable file from this object"""
        # We order nodes by sunid to ease comparison with original file
        for n in sorted(self.domains_by_sid.values(),
                        key=lambda node: node.sunid):
            stream.write(str(n.to_cla_record()))
# End Scop
class Node(object):
    """ A node in the Scop hierarchy

    sunid       -- SCOP unique identifier. e.g. 14986
    parent      -- The parent node (None if not attached)
    children    -- A list of child nodes
    sccs        -- SCOP concise classification string. e.g. 'a.1.1.2'
    type        -- A 2 letter node type code. e.g. 'px' for domains
    description -- Free-text description of the node
    """

    def __init__(self):
        """A new, uninitialized SCOP node."""
        self.sunid = ''
        self.parent = None
        self.children = []
        self.sccs = ''
        self.type = ''
        self.description = ''

    def __str__(self):
        return " ".join(
            [str(self.sunid), self.sccs, self.type, self.description])

    def to_hie_record(self):
        """Return an Hie.Record"""
        rec = HieRecord()
        rec.sunid = str(self.sunid)
        if self.parent:  # Not root node
            rec.parent = str(self.parent.sunid)
        else:
            rec.parent = '-'
        for c in self.children:
            rec.children.append(str(c.sunid))
        return rec

    def to_des_record(self):
        """Return a Des.Record"""
        rec = DesRecord()
        rec.sunid = str(self.sunid)
        rec.nodetype = self.type
        rec.sccs = self.sccs
        rec.description = self.description
        return rec

    def descendents(self, node_type):
        """ Return a list of all descendent nodes of the given type. Node type
        can be a two letter code or longer description. e.g. 'fa' or 'family'

        The search goes down one level at a time and inspects only the type
        of the first node on each level. An empty list is returned when the
        domain level ('px') is reached, or when there are no more children,
        without finding the requested type.
        """
        if node_type in _nodetype_to_code:
            node_type = _nodetype_to_code[node_type]

        nodes = [self]
        # 'nodes' becomes empty when a level has no children at all;
        # stop then instead of indexing into an empty list.
        while nodes and nodes[0].type != node_type:
            if nodes[0].type == 'px':
                return []  # Fell off the bottom of the hierarchy
            nodes = [child for n in nodes for child in n.children]
        return nodes

    def ascendent(self, node_type):
        """ Return the ancestor node of the given type, or None. Node type can
        be a two letter code or longer description. e.g. 'fa' or 'family'

        None is also returned when this node itself has the requested type,
        and when the parent chain ends before a matching node is found.
        """
        if node_type in _nodetype_to_code:
            node_type = _nodetype_to_code[node_type]

        n = self
        if n.type == node_type:
            return None
        # A detached node has parent None; treat that like reaching the top
        # rather than dereferencing None.
        while n is not None and n.type != node_type:
            if n.type == 'ro':
                return None  # Fell off the top of the hierarchy
            n = n.parent
        return n
# End Node
class Domain(Node):
    """ A SCOP domain. A leaf node in the Scop hierarchy.

    - sid      -- The SCOP domain identifier. e.g. 'd5hbib_'
    - residues -- A Residue object. It defines the collection
                  of PDB atoms that make up this domain.
    """

    def __init__(self):
        Node.__init__(self)
        self.sid = ''
        self.residues = None

    def __str__(self):
        parts = [self.sid, self.sccs, "(" + str(self.residues) + ")"]
        species = self.parent
        if species:
            # Attached domain: show the grandparent's description followed
            # by the parent's description in braces.
            protein = species.parent
            parts.append(protein.description)
            parts.append("{" + species.description + "}")
        else:
            parts.append(self.description)
        return " ".join(parts)

    def to_des_record(self):
        """Return a des.Record"""
        record = Node.to_des_record(self)
        record.name = self.sid
        return record

    def to_cla_record(self):
        """Return a cla.Record"""
        record = ClaRecord()
        record.sid = self.sid
        record.residues = self.residues
        record.sccs = self.sccs
        record.sunid = self.sunid

        # Collect (type, sunid) pairs walking up from this domain until the
        # root (sunid 0), then flip them so the list reads top-down.
        node = self
        while node.sunid != 0:  # Not root node
            record.hierarchy.append((node.type, str(node.sunid)))
            node = node.parent
        record.hierarchy.reverse()
        return record
# End Domain
class DesRecord(object):
    """ Handle the SCOP DEScription file.

    The file format is described in the scop
    "release notes.":http://scop.berkeley.edu/release-notes-1.55.html
    The latest DES file can be found
    "elsewhere at SCOP.":http://scop.mrc-lmb.cam.ac.uk/scop/parse/

    The DES file consists of one DES record per line. Each record
    holds information for one node in the SCOP hierarchy, and consists
    of 5 tab delimited fields,
    sunid, node type, sccs, node name, node description.
    For example ::

    21953   px      b.1.2.1 d1dan.1 1dan T:,U:91-106
    48724   cl      b       -       All beta proteins
    48725   cf      b.1     -       Immunoglobulin-like beta-sandwich
    49265   sf      b.1.2   -       Fibronectin type III
    49266   fa      b.1.2.1 -       Fibronectin type III

    - sunid       -- SCOP unique identifiers
    - nodetype    -- One of 'cl' (class), 'cf' (fold), 'sf' (superfamily),
                     'fa' (family), 'dm' (protein), 'sp' (species),
                     'px' (domain). Additional node types may be added.
    - sccs        -- SCOP concise classification strings. e.g. b.1.2.1
    - name        -- The SCOP ID (sid) for domains (e.g. d1anu1),
                     currently empty for other node types
    - description --  e.g. "All beta proteins","Fibronectin type III",
    """

    def __init__(self, record=None):
        """Create an empty record, or parse one line of a DES file.

        Raises ValueError if the line does not have exactly 5 tab
        separated fields or if the sunid field is not an integer.
        """
        self.sunid = ''
        self.nodetype = ''
        self.sccs = ''
        self.name = ''
        self.description = ''
        if not record:
            return

        entry = record.rstrip()  # no trailing whitespace
        columns = entry.split("\t")  # separate the tab-delineated cols
        if len(columns) != 5:
            raise ValueError("I don't understand the format of %s" % entry)

        (self.sunid, self.nodetype, self.sccs,
         self.name, self.description) = columns
        # The file writes '-' where a node has no name.
        if self.name == '-':
            self.name = ''
        self.sunid = int(self.sunid)

    def __str__(self):
        """Return the record as one tab separated, newline terminated line."""
        fields = [self.sunid, self.nodetype, self.sccs,
                  self.name or "-", self.description]
        return "\t".join(map(str, fields)) + "\n"

    @staticmethod
    def records(des_file):
        """Iterates over a DES file, generating DesRecords.

        des_file is any iterable of lines. Lines starting with '#' and
        blank lines (including empty strings) are skipped.
        """
        for line in des_file:
            if line.startswith('#'):
                continue  # A comment
            if not line.strip():
                continue  # Blank, or an empty string with no newline
            yield DesRecord(line)
# End DesRecord
class HieRecord(object):
    """Handle the SCOP HIErarchy files, which describe the SCOP hierarchy in
    terms of SCOP unique identifiers (sunid).

    The file format is described in the scop
    "release notes.":http://scop.berkeley.edu/release-notes-1.55.html
    The latest HIE file can be found
    "elsewhere at SCOP.":http://scop.mrc-lmb.cam.ac.uk/scop/parse/
    "Release 1.55":http://scop.berkeley.edu/parse/dir.hie.scop.txt_1.55

    Records consist of 3 tab delimited fields; node's sunid,
    parent's sunid, and a list of children's sunids. For example ::

    0       -       46456,48724,51349,53931,56572,56835,56992,57942
    21953   49268   -
    49267   49266   49268,49269

    Each record holds information for one node in the SCOP hierarchy.

    sunid      -- SCOP unique identifiers of this node
    parent     -- Parent's sunid ('' when the file gives '-')
    children   -- Sequence of children's sunids
    """

    def __init__(self, record=None):
        """Create an empty record, or parse one line of a HIE file.

        Raises ValueError if the line does not have exactly 3 tab
        separated fields or if a sunid is not an integer.
        """
        self.sunid = None
        self.parent = None
        self.children = []
        if not record:
            return

        # Parses HIE records.
        entry = record.rstrip()        # no trailing whitespace
        columns = entry.split('\t')    # separate the tab-delineated cols
        if len(columns) != 3:
            raise ValueError("I don't understand the format of %s" % entry)
        sunid, parent, children = columns

        if parent == '-':
            self.parent = ''
        else:
            self.parent = int(parent)

        if children == '-':
            self.children = ()
        else:
            # A real list, not map(): on Python 3 map() yields a one-shot
            # iterator that is always truthy and can be consumed only once.
            self.children = [int(child) for child in children.split(',')]

        # A '-' in the sunid column is not a valid id; int() rejects it
        # with ValueError.
        self.sunid = int(sunid)

    def __str__(self):
        """Return the record as one tab separated, newline terminated line."""
        s = [str(self.sunid)]

        if self.parent:
            s.append(str(self.parent))
        elif self.sunid != 0:
            # A falsy parent on a node whose sunid is not 0 is written as
            # '0' (an integer parent of 0 is falsy and ends up here).
            s.append('0')
        else:
            s.append('-')

        if self.children:
            s.append(",".join(str(child) for child in self.children))
        else:
            s.append('-')

        return "\t".join(s) + "\n"

    @staticmethod
    def records(hie_file):
        """Iterates over a HIE file, generating HieRecords.

        hie_file is any iterable of lines. Lines starting with '#' and
        blank lines (including empty strings) are skipped.
        """
        for line in hie_file:
            if line.startswith('#'):
                continue  # A comment
            if not line.strip():
                continue  # Blank, or an empty string with no newline
            yield HieRecord(line)
# End HieRecord
class ClaRecord(object):
    """Handle the SCOP CLAssification file, which describes SCOP domains.

    The file format is described in the scop
    "release notes.":http://scop.berkeley.edu/release-notes-1.55.html
    The latest CLA file can be found
    "elsewhere at SCOP.":http://scop.mrc-lmb.cam.ac.uk/scop/parse/

    sid         --  SCOP identifier. e.g. d1danl2
    residues    --  The domain definition as a Residues object
    sccs        --  SCOP concise classification strings.  e.g. b.1.2.1
    sunid       --  SCOP unique identifier for this domain
    hierarchy   --  A sequence of (nodetype, sunid) pairs describing the
                    location of this domain in the SCOP hierarchy.
                    Parsing a record produces two-item lists.
                    See the Scop module for a description of nodetypes.
    """

    def __init__(self, record=None):
        """Create an empty record, or parse one line of a CLA file.

        Raises ValueError if the line does not have exactly 6 tab
        separated fields or if a sunid is not an integer.
        """
        self.sid = ''
        self.residues = None
        self.sccs = ''
        self.sunid = ''
        self.hierarchy = []
        if not record:
            return

        # Parse a tab-delimited CLA record.
        entry = record.rstrip()        # no trailing whitespace
        columns = entry.split('\t')    # separate the tab-delineated cols
        if len(columns) != 6:
            raise ValueError("I don't understand the format of %s" % entry)

        self.sid, pdbid, residues, self.sccs, self.sunid, hierarchy = columns
        self.residues = Residues(residues)
        self.residues.pdbid = pdbid
        self.sunid = int(self.sunid)

        # The last column looks like "cl=48724,cf=48725,...".
        pairs = [item.split('=') for item in hierarchy.split(",")]
        for pair in pairs:
            pair[1] = int(pair[1])
        self.hierarchy = pairs

    def __str__(self):
        """Return the record as one tab separated, newline terminated line."""
        s = [self.sid]
        # str(residues) is space separated (e.g. "pdbid fragments"); each
        # part becomes its own column.
        s += str(self.residues).split(" ")
        s.append(self.sccs)
        s.append(self.sunid)
        s.append(",".join("=".join(map(str, ht)) for ht in self.hierarchy))
        return "\t".join(map(str, s)) + "\n"

    @staticmethod
    def records(cla_file):
        """Iterates over a CLA file, generating ClaRecords.

        cla_file is any iterable of lines. Lines starting with '#' and
        blank lines (including empty strings) are skipped.
        """
        for line in cla_file:
            if line.startswith('#'):
                continue  # A comment
            if not line.strip():
                continue  # Blank, or an empty string with no newline
            yield ClaRecord(line)
# End ClaRecord
class DomRecord(object):
    """Handle the SCOP DOMain file.

    The DOM file has been officially deprecated. For more information see
    the SCOP "release notes.":http://scop.berkeley.edu/release-notes-1.55.html
    The DOM files for older releases can be found
    "elsewhere at SCOP.":http://scop.mrc-lmb.cam.ac.uk/scop/parse/

    DOM records consist of 4 tab delimited fields;
    sid, pdbid, residues, hierarchy
    For example ::

    d1sctg_ 1sct    g:      1.001.001.001.001.001
    d1scth_ 1sct    h:      1.001.001.001.001.001
    d1flp__ 1flp    -       1.001.001.001.001.002
    d1moh__ 1moh    -       1.001.001.001.001.002

    sid -- The SCOP ID of the entry, e.g. d1anu1
    residues -- The domain definition as a Residues object
    hierarchy -- A string specifying where this domain is in the hierarchy.
    """

    def __init__(self, record=None):
        """Create an empty record, or parse one line of a DOM file.

        Raises ValueError if the line does not have exactly 4 tab
        separated fields.
        """
        self.sid = ''
        self.residues = []
        self.hierarchy = ''
        if not record:
            return

        entry = record.rstrip()  # no trailing whitespace
        columns = entry.split("\t")  # separate the tab-delineated cols
        if len(columns) != 4:
            raise ValueError("I don't understand the format of %s" % entry)
        self.sid, pdbid, res, self.hierarchy = columns
        self.residues = Residues(res)
        self.residues.pdbid = pdbid

    def __str__(self):
        """Return the record as one tab separated, newline terminated line."""
        # str(residues) is space separated; spaces become column breaks.
        fields = [
            self.sid,
            str(self.residues).replace(" ", "\t"),
            self.hierarchy,
        ]
        return "\t".join(fields) + "\n"

    @staticmethod
    def records(dom_file):
        """Iterates over a DOM file, generating DomRecords.

        dom_file is any iterable of lines. Lines starting with '#' and
        blank lines (including empty strings) are skipped.
        """
        for line in dom_file:
            if line.startswith('#'):
                continue  # A comment
            if not line.strip():
                continue  # Blank, or an empty string with no newline
            yield DomRecord(line)
# End DomRecord
# A leading PDB id: four word characters followed by end of string,
# whitespace or an underscore. Group 2 captures whatever comes after.
_pdbid_re = re.compile(r"^(\w\w\w\w)(?:$|\s+|_)(.*)")
# One chain fragment: optional "(", optional "X:" chain prefix, start and
# end residue ids, optional ")", then any unparsed remainder.
_fragment_re = re.compile(r"\(?(\w:)?(-?\w*)-?(-?\w*)\)?(.*)")


class Residues(object):
    """A collection of residues from a PDB structure.

    This class provides code to work with SCOP domain definitions. These
    are concisely expressed as one or more chain fragments. For example,
    "(1bba A:10-20,B:)" indicates residue 10 through 20 (inclusive) of
    chain A, and every residue of chain B in the pdb structure 1bba. The pdb
    id and brackets are optional. In addition "-" indicates every residue of
    a pdb structure with one unnamed chain.

    Start and end residue ids consist of the residue sequence number and an
    optional single letter insertion code. e.g. "12", "-1", "1a", "1000"

    pdbid -- An optional PDB id, e.g. "1bba"
    fragments -- A sequence of tuples (chainID, startResID, endResID)
    """

    def __init__(self, str=None):
        self.pdbid = ''
        self.fragments = ()
        if str is not None:
            self._parse(str)

    def _parse(self, string):
        """Fill in pdbid and fragments from a domain definition string.

        Raises ValueError when a fragment cannot be understood.
        """
        text = string.strip()

        # Is there a pdbid at the front? e.g. 1bba A:1-100
        pdb_match = _pdbid_re.match(text)
        if pdb_match is not None:
            self.pdbid = pdb_match.group(1)
            text = pdb_match.group(2)  # Everything else

        if text in ('', '-', '(-)'):  # no fragments, whole sequence
            return

        parsed = []
        for piece in text.split(","):
            frag_match = _fragment_re.match(piece)
            if frag_match is None:
                raise ValueError("I don't understand the format of %s" % piece)
            chain, start, end, postfix = frag_match.groups()

            # Anything left over after the fragment is not understood.
            if postfix != "":
                raise ValueError("I don't understand the format of %s" % piece)

            if not chain:
                chain = ""
            elif chain[-1] != ':':
                raise ValueError("I don't understand the chain in %s" % piece)
            else:
                chain = chain[:-1]  # chop off the ':'

            parsed.append((chain, start, end))
        self.fragments = tuple(parsed)

    def __str__(self):
        prefix = self.pdbid + ' ' if self.pdbid else ""

        if not self.fragments:
            return prefix + '-'

        rendered = []
        for chain, start, end in self.fragments:
            piece = ""
            if chain:
                piece += "%s:" % chain
            if start:
                piece += "%s-%s" % (start, end)
            rendered.append(piece)
        return prefix + ",".join(rendered)
# End Residues

Mercurial > repos > davidmurphy > codonlogo

comparison corebio/resource/scop.py @ 7:8d676bbd1f2d