view commons/core/parsing/Soap2Parser.py @ 9:1eb55963fe39

Updated CompareOverlappingSmall*.py
author m-zytnicki
date Thu, 14 Mar 2013 05:23:05 -0400
parents 769e306b7933
children
line wrap: on
line source

#
# Copyright INRA-URGI 2009-2010
# 
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
# 
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
# 
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
# 
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
import re
import sys
from SMART.Java.Python.structure.Mapping import Mapping
from SMART.Java.Python.structure.SubMapping import SubMapping
from commons.core.parsing.MapperParser import MapperParser


def mappingToSubMapping(mapping):
    subMapping = SubMapping()
    subMapping.targetInterval.copy(mapping.targetInterval)
    subMapping.queryInterval.copy(mapping.queryInterval)
    subMapping.setDirection(mapping.getDirection())
    subMapping.size = mapping.size
    subMapping.tags = mapping.tags
    return subMapping



class Soap2Parser(MapperParser):
    """A class that parses the output of SOAP2"""

    def __init__(self, fileName, verbosity = 0):
        super(Soap2Parser, self).__init__(fileName, verbosity)


    def __del__(self):
        super(Soap2Parser, self).__del__()


    def getFileFormats():
        return ["soap2"]
    getFileFormats = staticmethod(getFileFormats)


    def skipFirstLines(self):
        pass


    def getIterator(self):
        self.reset()
        currentName = None
        currentMappings = []
        for line in self.handle:
            mapping = self.parseLine(line)
            name = mapping.queryInterval.name
            if name == currentName:
                if mapping.getTagValue("end") == "a":
                    currentMappings.append(mapping)
                else:
                    otherEndMapping = currentMappings.pop(0)

                    newMapping = Mapping()
                    subMappingA = mappingToSubMapping(otherEndMapping)
                    subMappingB = mappingToSubMapping(mapping)
                    subMappingB.queryInterval.setDirection(subMappingA.queryInterval.getDirection())

                    newMapping.addSubMapping(subMappingA)
                    newMapping.addSubMapping(subMappingB)

                    newMapping.tags = otherEndMapping.tags
                    newMapping.setSize(otherEndMapping.size + mapping.size)
                    newMapping.setNbMismatches(otherEndMapping.getTagValue("nbMismatches") + mapping.getTagValue("nbMismatches"))
                    print otherEndMapping.getTagValue("nbMismatches")
                    print mapping.getTagValue("nbMismatches")
                    print newMapping.getTagValue("nbMismatches")
                    sys.exit()
                    newMapping.setTagValue("qualityString", otherEndMapping.getTagValue("qualityString") + mapping.getTagValue("qualityString"))
                    newMapping.setTagValue("occurrence", "%d" % (newMapping.getTagValue("nbOccurrences") - len(currentMappings)))
                    newMapping.setTagValue("ID", "%s-%s" % (name, newMapping.getTagValue("occurrence")))
                    del newMapping.tags["end"]
                    yield newMapping
            else:
                currentName = mapping.queryInterval.name
                for currentMapping in currentMappings:
                    yield currentMapping
                currentMappings = [mapping]
            self.currentLineNb += 1
                
                
    def parseLine(self, line):
        m = re.search(r"^\s*(\S+)\s+(\w+)\s+(\S+)\s+(\d+)\s+([ab])\s+(\d+)\s+([+-])\s+(\w+)\s+(\d+)\s+(\d+)\s+", line)
        if m == None:
            sys.exit("\nLine %d '%s' does not have a SOAP2 format" % (self.currentLineNb, line))

        name          = m.group(1)
        read          = m.group(2)
        qualityString = m.group(3)
        nbOccurrences = int(m.group(4))
        end           = m.group(5)
        size          = int(m.group(6))
        direction     = m.group(7)
        chromosome    = m.group(8)
        genomeStart   = int(m.group(9))
        nbMismatches  = int(m.group(10))

        mapping = Mapping()
        if name.endswith("/1") or name.endswith("/2"):
            name = name[:-2]

        mapping.queryInterval.name = name
        mapping.queryInterval.setDirection(direction)
        mapping.queryInterval.setStart(1)
        mapping.queryInterval.setEnd(size)

        mapping.targetInterval.setChromosome(chromosome)
        mapping.targetInterval.setStart(genomeStart)
        mapping.targetInterval.setSize(size)

        mapping.setDirection(direction)
        mapping.setSize(size)

        mapping.setNbMismatches(nbMismatches)
        mapping.setNbGaps(0)
        mapping.setTagValue("qualityString", qualityString)
        mapping.setTagValue("nbOccurrences", nbOccurrences)
        mapping.setTagValue("end", end)

        return mapping