diff commons/core/parsing/ExoParser.py @ 38:2c0c0a89fad7

Uploaded
author m-zytnicki
date Thu, 02 May 2013 09:56:47 -0400
parents 769e306b7933
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/commons/core/parsing/ExoParser.py	Thu May 02 09:56:47 2013 -0400
@@ -0,0 +1,137 @@
+#
+# Copyright INRA-URGI 2009-2010
+# 
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+# 
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+# 
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+# 
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+#
+import re
+import sys
+from commons.core.parsing.MapperParser import MapperParser
+from SMART.Java.Python.structure.Mapping import Mapping
+from SMART.Java.Python.structure.SubMapping import SubMapping
+
+class ExoParser(MapperParser):
+    """A class that parses the output of Exonerate - roll your own format"""
+
+    def __init__(self, fileName, verbosity = 0):
+        super(ExoParser, self).__init__(fileName, verbosity)
+
+
+    def __del__(self):
+        super(ExoParser, self).__del__()
+
+
+    def getFileFormats():
+        return ["exo", "exonerate"]
+    getFileFormats = staticmethod(getFileFormats)
+
+
+    def skipFirstLines(self):
+        while "Hostname" not in self.handle.readline():
+            self.currentLineNb += 1
+            pass
+
+
+    def parseLine(self, line):
+        
+        if line == "-- completed exonerate analysis\n":
+            return None
+        
+        m = re.search(r"^\s*(\S+)\s+(\d+)\s+(\d+)\s+[+-]\s+(\S+)\s+(\d+)\s+(\d+)\s+([+-])\s+\d+\s+(\d+)\s+(\S.*)$", line)
+        if m == None:
+            sys.exit("\nLine %d '%s' does not have a RYO format" % (self.currentLineNb, line))
+
+        mapping = Mapping()
+        name = m.group(1)
+        queryStart = min(int(m.group(2)), int(m.group(3)))
+        queryEnd = max(int(m.group(2)), int(m.group(3)))-1
+        chromosome = m.group(4)
+        targetStart = min(int(m.group(5)), int(m.group(6)))
+        targetEnd = max(int(m.group(5)), int(m.group(6)))-1
+        direction = m.group(7)
+        nbMismatches = int(m.group(8))
+        rest = m.group(9).strip()
+        
+        nbGaps = 0
+        queryOffset = 0
+        targetOffset = 0
+        
+        subMapping = None
+        m = re.search(r"^(\w)\s+(\d+)\s+(\d+)", rest)
+        while m != None:
+            queryDistance    = int(m.group(2))
+            targetDistance = int(m.group(3))
+            if m.group(1) == "M":
+                if subMapping == None:
+                    subMapping = SubMapping()
+    
+                    subMapping.setSize(queryDistance)
+                    subMapping.setDirection(direction)
+        
+                    subMapping.queryInterval.setName(name)
+                    subMapping.queryInterval.setStart(queryStart + queryOffset)
+                    subMapping.queryInterval.setDirection(direction)
+        
+                    subMapping.targetInterval.setChromosome(chromosome)
+                    subMapping.targetInterval.setStart(targetStart + targetOffset)
+                    subMapping.targetInterval.setDirection(1)
+    
+            elif m.group(1) == "G":
+                nbGaps += max(queryDistance, targetDistance)
+                
+            elif m.group(1) == "I" or m.group(1) == "5" or m.group(1) == "3":
+                if subMapping != None:
+                    subMapping.queryInterval.setEnd(queryStart + queryOffset - 1)
+                    subMapping.targetInterval.setEnd(targetStart + targetOffset - 1)
+                    mapping.addSubMapping(subMapping)
+                    subMapping = None
+            else:
+                sys.exit("Cannot understand sign '%s' in line %s" % (m.group(1), line))
+            
+            queryOffset += queryDistance
+            targetOffset += targetDistance
+            rest = rest[m.end():].strip()
+            m = re.search(r"^(\w)\s+(\d+)\s+(\d+)", rest)
+            
+        if subMapping != None:
+            subMapping.queryInterval.setEnd(queryStart + queryOffset - 1)
+            subMapping.targetInterval.setEnd(targetStart + targetOffset - 1)
+            mapping.addSubMapping(subMapping)
+                        
+        mapping.setNbMismatches(nbMismatches)
+        mapping.setNbGaps(nbGaps)
+        mapping.setDirection(direction)
+
+        mapping.queryInterval.setName(name)
+        mapping.queryInterval.setStart(queryStart)
+        mapping.queryInterval.setEnd(queryEnd)
+
+        mapping.targetInterval.setChromosome(chromosome)
+        mapping.targetInterval.setStart(targetStart)
+        mapping.targetInterval.setEnd(targetEnd)
+
+        return mapping
+