view commons/core/parsing/AxtParser.py @ 62:8c42a6d7ffd4

Added simple test BED file.
author m-zytnicki
date Mon, 19 Oct 2015 11:25:11 +0200
parents 44d5973c188c
children
line wrap: on
line source

#
# Copyright INRA-URGI 2009-2010
# 
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
# 
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
# 
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
# 
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
import re
import sys
from SMART.Java.Python.structure.Mapping import Mapping
from SMART.Java.Python.structure.SubMapping import SubMapping
from commons.core.parsing.MapperParser import MapperParser
from SMART.Java.Python.misc import Utils
from SMART.Java.Python.misc.Utils import getHammingDistance


class AxtParser(MapperParser):
    """A class that parses AXT (as given by Mosaik)

    Each record is read as three non-empty lines: a header line holding the
    coordinates of the alignment, followed by two sequence lines.  The parser
    is fed one line at a time through parseLine() and hands back a Mapping
    once the second sequence line of a record has been seen.
    """

    # Header line of a record.  Captured groups, in order: target chromosome,
    # target start, target end, query name, query start, query end, strand.
    # The leading and trailing numbers are matched but not used (in the UCSC
    # AXT layout they are the alignment index and the score).
    _HEADER_PATTERN = re.compile(r"^\s*\d+\s+(\S+)\s+(\d+)\s+(\d+)\s+(\S+)\s+(\d+)\s+(\d+)\s+([+-])\s+\d+\s*$")

    def __init__(self, fileName, verbosity = 0):
        """Create the parser.

        fileName and verbosity are passed unchanged to MapperParser.
        """
        super(AxtParser, self).__init__(fileName, verbosity)
        # First and second sequence lines of the record being read; both are
        # None while no sequence line of the current record has been seen.
        self.queryLine = None
        self.subjectLine = None

    def __del__(self):
        super(AxtParser, self).__del__()


    @staticmethod
    def getFileFormats():
        """Return the list of format names handled by this parser."""
        return ["axt"]


    def skipFirstLines(self):
        """Nothing to skip: this parser does not expect any file header."""
        pass


    def getInfos(self):
        """Scan the whole file once and gather summary statistics.

        After the call:
          - self.chromosomes is the set of the second field of every header
            line,
          - self.nbMappings is the number of header lines,
          - self.size is the sum of the seventh field of every header line.
        self.reset() is called before and after the scan (presumably to
        rewind the handle -- it is defined in the parent class).
        """
        self.chromosomes = set()
        self.nbMappings  = 0
        self.size        = 0
        nbLines          = 0
        self.reset()
        for line in self.handle:
            line = line.strip()
            if line == "":
                continue
            # Only every third non-empty line is a header line.
            if nbLines % 3 == 0:
                # Split on any run of whitespace, as the header pattern used
                # by parseLine() does.
                parts = line.split()
                self.chromosomes.add(parts[1])
                # NOTE(review): field 7 is the value parseLine() uses as a
                # bound of the query interval, so this adds up end
                # coordinates rather than interval lengths -- confirm this
                # is the intended definition of "size".
                self.size       += int(parts[6])
                self.nbMappings += 1
                # Report progress only when the counter has just changed.
                if self.verbosity >= 10 and self.nbMappings % 100000 == 0:
                    sys.stdout.write("    %d mappings read\r" % (self.nbMappings))
                    sys.stdout.flush()
            nbLines += 1
        self.reset()
        if self.verbosity >= 10:
            # Single-argument parenthesised form: prints the same text with
            # the Python 2 print statement and the Python 3 print function.
            print("    %d mappings read" % (self.nbMappings))
            print("Done.")


    def parseLine(self, line):
        """Consume one line and return a mapping once a record is complete.

        A blank line is replaced by the next line read from self.handle (and
        self.currentLineNb is incremented); if that one is blank too, or if
        the handle is exhausted, None is returned.

        A header line creates a new current mapping and returns None.  The
        first sequence line that follows is stored and None is returned.  The
        second sequence line completes the record: the number of mismatches
        of the current mapping is set to the Hamming distance between the two
        sequence lines, its number of gaps is set to 0, and the current
        mapping is returned.
        """
        if line.strip() == "":
            # Step over one blank separator line.
            for line in self.handle:
                self.currentLineNb += 1
                break
        if line.strip() == "":
            return None

        m = self._HEADER_PATTERN.search(line)
        if m is not None:
            self.currentMapping = self._buildMapping(m)
            return None
        if self.queryLine is None:
            self.queryLine = line
            return None
        self.subjectLine = line
        # Compare the sequences without their line terminators, so that a
        # last line lacking its final newline does not differ in length from
        # the line it is compared with.
        nbMismatches = getHammingDistance(self.queryLine.rstrip(), self.subjectLine.rstrip())
        self.currentMapping.setNbMismatches(nbMismatches)
        self.currentMapping.setNbGaps(0)
        self.queryLine = None
        self.subjectLine = None
        return self.currentMapping


    def _buildMapping(self, m):
        """Build a Mapping holding a single SubMapping from a header match.

        m is a match object produced by _HEADER_PATTERN.  Start and end of
        each interval are the smaller and the larger of the two coordinates
        read, whatever their order on the line.
        """
        chromosome  = m.group(1)
        targetStart = min(int(m.group(2)), int(m.group(3)))
        targetEnd   = max(int(m.group(2)), int(m.group(3)))
        queryName   = m.group(4)
        queryStart  = min(int(m.group(5)), int(m.group(6)))
        queryEnd    = max(int(m.group(5)), int(m.group(6)))
        direction   = m.group(7)

        mapping = Mapping()
        subMapping = SubMapping()

        subMapping.queryInterval.setName(queryName)
        subMapping.queryInterval.setStart(queryStart)
        subMapping.queryInterval.setEnd(queryEnd)
        subMapping.queryInterval.setDirection(direction)

        subMapping.targetInterval.setChromosome(chromosome)
        subMapping.targetInterval.setStart(targetStart)
        subMapping.targetInterval.setEnd(targetEnd)
        # The target is always given direction 1; the strand read from the
        # line is applied to the query interval and to the mappings.
        subMapping.targetInterval.setDirection(1)

        subMapping.setSize(min(subMapping.targetInterval.getSize(), subMapping.queryInterval.getSize()))
        subMapping.setDirection(direction)

        mapping.addSubMapping(subMapping)

        mapping.setDirection(direction)
        mapping.targetInterval.setChromosome(chromosome)
        mapping.targetInterval.setStart(targetStart)
        mapping.targetInterval.setEnd(targetEnd)

        mapping.queryInterval.setName(queryName)
        mapping.queryInterval.setStart(queryStart)
        mapping.queryInterval.setEnd(queryEnd)

        mapping.setSize(min(mapping.targetInterval.getSize(), mapping.queryInterval.getSize()))

        return mapping