view smart_toolShed/SMART/Java/Python/mapperAnalyzer.py @ 4:1fc014126d55

Uploaded
author yufei-luo
date Fri, 18 Jan 2013 04:45:50 -0500
parents e0f8dcca02ed
children
line wrap: on
line source

#! /usr/bin/env python
#
# Copyright INRA-URGI 2009-2010
# 
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software. You can use,
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info".
# 
# As a counterpart to the access to the source code and rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty and the software's author, the holder of the
# economic rights, and the successive licensors have only limited
# liability.
# 
# In this respect, the user's attention is drawn to the risks associated
# with loading, using, modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean that it is complicated to manipulate, and that also
# therefore means that it is reserved for developers and experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or
# data to be ensured and, more generally, to use and operate it in the
# same conditions as regards security.
# 
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.
#
"""
Read a mapping file (many formats supported) and select some of the mappings.
Mappings should be sorted by read names.
"""
import os, random, shelve
from optparse import OptionParser, OptionGroup
from commons.core.parsing.ParserChooser import ParserChooser
from commons.core.parsing.FastaParser import FastaParser
from commons.core.parsing.FastqParser import FastqParser
from commons.core.parsing.GffParser import GffParser
from commons.core.writer.BedWriter import BedWriter
from commons.core.writer.UcscWriter import UcscWriter
from commons.core.writer.GbWriter import GbWriter
from commons.core.writer.Gff2Writer import Gff2Writer
from commons.core.writer.Gff3Writer import Gff3Writer
from commons.core.writer.FastaWriter import FastaWriter
from commons.core.writer.FastqWriter import FastqWriter
from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter
from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection
from SMART.Java.Python.mySql.MySqlTable import MySqlTable
from SMART.Java.Python.misc.RPlotter import RPlotter
from SMART.Java.Python.misc.Progress import Progress
from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress


# Value passed to mergeExons() of each mapping when exon merging is enabled
# (presumably a maximum distance in nucleotides -- TODO confirm against mergeExons).
distanceExons = 20
# Minimum target-interval size of a sub-mapping: when short exons are checked,
# a multi-exon mapping with any sub-mapping smaller than this is rejected.
exonSize      = 20


class MapperAnalyzer(object):
    """
    Analyse the output of a parser

    Mappings are read from a mapping file, grouped by read name, filtered
    (size, identity, mismatches, gaps, exon size, number of hits), ranked by
    error score and written to a GFF3 file.  Optionally, reads which were
    already mapped in a previous GFF3 file are skipped, and reads without any
    accepted mapping are written to a separate sequence file.

    The mapping file must be sorted by read name: all the mappings of a read
    must be contiguous.
    """

    def __init__(self, verbosity = 0):
        """
        Create an analyzer with no input/output set and no filter.
        @param verbosity: trace level (messages are printed when > 0)
        """
        self.verbosity                = verbosity
        self.mySqlConnection          = MySqlConnection(verbosity)
        # counters of rejected mappings / sequences
        self.tooShort                 = 0
        self.tooManyMismatches        = 0
        self.tooManyGaps              = 0
        self.tooShortExons            = 0
        self.tooManyMappings          = 0
        # counters of read / written elements
        self.nbMappings               = 0
        self.nbSequences              = 0
        self.nbAlreadyMapped          = 0
        self.nbAlreadyMappedSequences = 0
        self.nbWrittenMappings        = 0
        self.nbWrittenSequences       = 0
        self.parser                   = None
        self.logHandle                = None
        # used to build unique names for temporary files and tables
        self.randomNumber             = random.randint(0, 100000)
        self.gff3Writer               = None
        self.alreadyMappedReader      = None
        self.unmatchedWriter          = None
        self.sequenceListParser       = None
        # shelve: read name -> read size
        self.sequences                = None
        # shelve: names of the reads already mapped in a previous run
        self.alreadyMapped            = None
        self.mappedNamesTable         = None
        # filters (None means: filter disabled)
        self.minSize                  = None
        self.minId                    = None
        self.maxMismatches            = None
        self.maxGaps                  = None
        self.maxMappings              = None
        self.merge                    = False
        self.checkExons               = False
        # None: not known yet; True: sequence names carry a "/1" suffix; False: they do not
        self.suffix                   = None
        self.tmpDirectory             = "%s%s" % (os.environ["SMARTMPPATH"], os.sep) if "SMARTMPPATH" in os.environ else ""


    def __del__(self):
        """
        Release the resources held by the analyzer (shelves, SQL table, writer, log).
        """
        # getattr() is used because __init__ may have failed before every
        # attribute was set; __del__ is still called in that case.
        sequences = getattr(self, "sequences", None)
        if sequences is not None:
            sequences.close()
        alreadyMapped = getattr(self, "alreadyMapped", None)
        if alreadyMapped is not None:
            alreadyMapped.close()
        mappedNamesTable = getattr(self, "mappedNamesTable", None)
        if mappedNamesTable is not None:
            mappedNamesTable.remove()
        gff3Writer = getattr(self, "gff3Writer", None)
        if gff3Writer is not None:
            gff3Writer.close()
        logHandle = getattr(self, "logHandle", None)
        if logHandle is not None:
            logHandle.close()


    def setMappingFile(self, fileName, format):
        """
        Set the input mapping file.
        @param fileName: name of the mapping file
        @param format:   format of the mapping file, as understood by ParserChooser
        """
        parserChooser = ParserChooser(self.verbosity)
        parserChooser.findFormat(format, "mapping")
        self.parser = parserChooser.getParser(fileName)


    def setSequenceFile(self, fileName, format):
        """
        Set the file which contains the reads.
        @param fileName: name of the sequence file
        @param format:   "fasta" or "fastq"
        @raise Exception: if the format is neither "fasta" nor "fastq"
        """
        if format == "fasta":
            self.sequenceListParser = FastaParser(fileName, self.verbosity)
        elif format == "fastq":
            self.sequenceListParser = FastqParser(fileName, self.verbosity)
        else:
            raise Exception("Do not understand sequence format %s" % (format))


    def setOutputFile(self, fileName, title):
        """
        Set the GFF3 output file.
        @param fileName: name of the output file
        @param title:    title given to the GFF3 writer
        """
        self.gff3Writer = Gff3Writer(fileName, self.verbosity)
        self.gff3Writer.setTitle(title)


    def setAlreadyMatched(self, fileName):
        """
        Set a GFF file of previously mapped reads; these reads will be skipped.
        @param fileName: name of the GFF file
        """
        self.alreadyMappedReader = GffParser(fileName, self.verbosity)


    def setRemainingFile(self, fileName, format):
        """
        Ask for the unmatched reads to be written to "<fileName>_unmatched.<format>".
        Also creates the SQL table which stores the names of the mapped reads.
        @param fileName: prefix of the output file
        @param format:   "fasta" or "fastq"
        @raise Exception: if the format is neither "fasta" nor "fastq"
        """
        if format == "fasta":
            self.unmatchedWriter = FastaWriter("%s_unmatched.fasta" % (fileName), self.verbosity)
        elif format == "fastq":
            self.unmatchedWriter = FastqWriter("%s_unmatched.fastq" % (fileName), self.verbosity)
        else:
            raise Exception("Do not understand %s format." % (format))
        self.mappedNamesTable = MySqlTable(self.mySqlConnection, "mappedNames_%d" % (self.randomNumber), self.verbosity)
        self.mappedNamesTable.create(["name"], {"name": "char"}, {"name": 50})
        self.mappedNamesTable.createIndex("iNameMapped", ["name", ], True)


    def setLog(self, fileName):
        """
        Open a log file where the rejected mappings are reported.
        @param fileName: name of the log file (overwritten if it exists)
        """
        self.logHandle = open(fileName, "w")


    def setMinSize(self, size):
        """
        Set the minimum size of a mapping, as a percentage of the read size.
        """
        self.minSize = size


    def setMinId(self, id):
        """
        Set the minimum value of the "identity" tag of a mapping.
        """
        self.minId = id


    def setMaxMismatches(self, mismatches):
        """
        Set the maximum value of the "nbMismatches" tag of a mapping.
        """
        self.maxMismatches = mismatches


    def setMaxGaps(self, gaps):
        """
        Set the maximum value of the "nbGaps" tag of a mapping.
        """
        self.maxGaps = gaps


    def setMaxMappings(self, mappings):
        """
        Set the maximum number of occurrences of a read.
        """
        self.maxMappings = mappings


    def mergeExons(self, b):
        """
        Enable or disable the merging of exons of each mapping.
        """
        self.merge = b


    def acceptShortExons(self, b):
        """
        Accept (True) or reject (False) the mappings which contain short exons.
        """
        self.checkExons = not b


    def countMappings(self):
        """
        Ask the parser for the number of mappings and store it.
        """
        self.nbMappings = self.parser.getNbMappings()
        if self.verbosity > 0:
            print("%i matches found" % (self.nbMappings))


    def storeAlreadyMapped(self):
        """
        Store the names of the previously mapped reads in a temporary shelve.
        """
        self.alreadyMapped            = shelve.open("%stmpAlreadyMapped_%d" % (self.tmpDirectory, self.randomNumber))
        progress                      = Progress(self.alreadyMappedReader.getNbTranscripts(), "Reading already mapped reads", self.verbosity)
        self.nbAlreadyMappedSequences = 0
        for transcript in self.alreadyMappedReader.getIterator():
            transcriptName = transcript.getName()
            # a read may have several mappings: count each read only once
            if transcriptName not in self.alreadyMapped:
                self.alreadyMapped[transcriptName] = 1
                self.nbAlreadyMappedSequences     += 1
            progress.inc()
        progress.done()
        self.nbAlreadyMapped = self.alreadyMappedReader.getNbTranscripts()


    def storeSequences(self):
        """
        Store the size of each read in a temporary shelve.
        The key is the read name, truncated at the first space.
        """
        self.sequences = shelve.open("%stmpSequences_%d" % (self.tmpDirectory, self.randomNumber))
        progress       = Progress(self.sequenceListParser.getNbSequences(), "Reading sequences", self.verbosity)
        for sequence in self.sequenceListParser.getIterator():
            self.sequences[sequence.getName().split(" ")[0]] = len(sequence.getSequence())
            self.nbSequences += 1
            progress.inc()
        progress.done()
        if self.verbosity > 0:
            print("%i sequences read" % (self.nbSequences))


    def checkOrder(self):
        """
        Check that all the mappings of a read are contiguous in the mapping file.
        Read names are truncated at the first space, which is also the key
        used to group the mappings when they are read.
        @raise Exception: if a read name re-appears after another read name
        """
        names        = shelve.open("%stmpNames_%d" % (self.tmpDirectory, self.randomNumber))
        previousName = None
        progress     = Progress(self.nbMappings, "Checking mapping file", self.verbosity)
        try:
            for mapping in self.parser.getIterator():
                name = mapping.queryInterval.getName().split(" ")[0]
                if name != previousName:
                    # the block of the previous read is over: remember its name
                    if previousName is not None:
                        names[previousName] = 1
                    # a name seen in a previous block means that the file is not ordered
                    if name in names:
                        raise Exception("Error! Input mapping file is not ordered! (Name '%s' occurs at least twice)" % (name))
                    previousName = name
                progress.inc()
            progress.done()
        finally:
            # always release the temporary shelve, even when the check fails
            names.close()


    def checkPreviouslyMapped(self, name):
        """
        Tell whether a read was mapped in the "already mapped" file.
        @param name: name of the read
        @return: False if no "already mapped" file is set, else whether the name is known
        """
        if self.alreadyMappedReader is None:
            return False
        return name in self.alreadyMapped


    def findOriginalSize(self, name):
        """
        Get the size of a read from its name.
        The name is first searched as is, then with a "/1" suffix.  The first
        successful (or failed) plain look-up decides, once and for all,
        whether the suffix should be used.
        @param name: name of the read
        @return: the size stored for this read
        @raise Exception: if the read cannot be found
        """
        if not self.suffix:
            # suffix is None (unknown yet) or False (plain names)
            if name in self.sequences:
                self.suffix = False
                return self.sequences[name]
            if self.suffix is None:
                self.suffix = True
            else:
                raise Exception("Cannot find name %s" % (name))
        if self.suffix:
            alternate = "%s/1" % (name)
            if alternate in self.sequences:
                return self.sequences[alternate]
        raise Exception("Cannot find name %s" % (name))


    def checkErrors(self, mapping):
        """
        Check a mapping against every enabled filter.
        Every failed filter increments its counter and is reported in the log
        file (if any); a mapping may thus be counted several times.
        @param mapping: the mapping to check
        @return: True if the mapping passes all the filters
        """
        accepted = True
        log      = self.logHandle
        # short size (minSize is a percentage of the read size)
        if self.minSize is not None and mapping.size * 100 < self.minSize * mapping.queryInterval.size:
            self.tooShort += 1
            accepted       = False
            if log is not None:
                # observed mapping size first, then the size of the read
                log.write("size of mapping %s is too short (%i instead of %i)\n" % (str(mapping), mapping.size, mapping.queryInterval.size))
        # low identity (counted with the mismatches: there is no dedicated counter)
        if self.minId is not None and mapping.getTagValue("identity") < self.minId:
            self.tooManyMismatches += 1
            accepted                = False
            if log is not None:
                log.write("mapping %s has a low identity rate\n" % (str(mapping)))
        # too many mismatches
        if self.maxMismatches is not None and mapping.getTagValue("nbMismatches") > self.maxMismatches:
            self.tooManyMismatches += 1
            accepted                = False
            if log is not None:
                log.write("mapping %s has more mismatches than %i\n" % (str(mapping), self.maxMismatches))
        # too many gaps
        if self.maxGaps is not None and mapping.getTagValue("nbGaps") > self.maxGaps:
            self.tooManyGaps += 1
            accepted          = False
            if log is not None:
                log.write("mapping %s has more gaps than %i\n" % (str(mapping), self.maxGaps))
        # short exons (only for mappings with several sub-mappings)
        if self.checkExons and len(mapping.subMappings) > 1 and min([subMapping.targetInterval.getSize() for subMapping in mapping.subMappings]) < exonSize:
            self.tooShortExons += 1
            accepted            = False
            if log is not None:
                log.write("sequence %s maps as too short exons\n" % (mapping))
        return accepted


    def _countOccurrences(self, mappings):
        """
        Sum the occurrences of a list of mappings: the "nbOccurrences" tag of
        each mapping if present, else 1.
        """
        nbOccurrences = 0
        for mapping in mappings:
            if "nbOccurrences" in mapping.getTagNames():
                nbOccurrences += mapping.getTagValue("nbOccurrences")
            else:
                nbOccurrences += 1
        return nbOccurrences


    def checkNbMappings(self, mappings):
        """
        Check that a read has at least one mapping, and not too many.
        @param mappings: the accepted mappings of a read
        @return: True if the mappings of the read should be written
        """
        nbOccurrences = self._countOccurrences(mappings)
        if self.maxMappings is not None and nbOccurrences > self.maxMappings:
            self.tooManyMappings += 1
            if self.logHandle is not None:
                self.logHandle.write("sequence %s maps %i times\n" % (mappings[0].queryInterval.getName(), nbOccurrences))
            return False
        return (nbOccurrences > 0)


    def sortMappings(self, mappings):
        """
        Sort the mappings of a read by increasing error score and annotate
        them: number of occurrences, occurrence index, rank, and (except for
        the first one) the region of the best mapping.
        @param mappings: non-empty list of the accepted mappings of a read
        @return: the annotated mappings, best first
        """
        nbOccurrences   = self._countOccurrences(mappings)
        orderedMappings = sorted(mappings, key = lambda mapping: mapping.getErrorScore())
        bestInterval    = orderedMappings[0].targetInterval
        bestRegion      = "%s:%d-%d" % (bestInterval.getChromosome(), bestInterval.getStart(), bestInterval.getEnd())
        cpt             = 1
        rank            = 1
        previousMapping = None
        previousScore   = None
        wasLastTie      = False
        rankedMappings  = []
        for mapping in orderedMappings:
            mapping.setNbOccurrences(nbOccurrences)
            mapping.setOccurrence(cpt)

            score = mapping.getErrorScore()
            if previousScore is not None and previousScore == score:
                # same score as the previous mapping: both share the rank of the first of the tie
                # NOTE(review): the tie is only flagged when the previous mapping has a
                # "Rank" tag -- confirm that this is the tag name written by setRank()
                if "Rank" in previousMapping.getTagNames():
                    if not wasLastTie:
                        previousMapping.setRank("%sTie" % (rank))
                    mapping.setRank("%sTie" % (rank))
                    wasLastTie = True
            else:
                rank = cpt
                mapping.setRank(rank)
                wasLastTie = False
            if cpt != 1:
                mapping.setBestRegion(bestRegion)

            rankedMappings.append(mapping)
            previousMapping = mapping
            previousScore   = score
            cpt            += 1
        return rankedMappings


    def processMappings(self, mappings):
        """
        Filter, rank and write all the mappings of one read.
        @param mappings: the mappings of a read (may be empty)
        """
        if not mappings:
            return
        # truncate the name at the first space, like the keys of the sequence
        # shelve and the names used when mappings are grouped
        name = mappings[0].queryInterval.getName().split(" ")[0]
        size = self.findOriginalSize(name)
        selectedMappings = []
        for mapping in mappings:
            if self.merge:
                mapping.mergeExons(distanceExons)
            mapping.queryInterval.size = size
            if self.checkErrors(mapping):
                selectedMappings.append(mapping)

        if not self.checkNbMappings(selectedMappings):
            return
        if self.unmatchedWriter is not None:
            # remember the read as mapped, under the name used in the sequence file
            # NOTE(review): the SQL statement is built by string formatting; a read
            # name containing a quote would break it -- escape or parameterize
            # once the executeQuery API is confirmed
            self.mySqlConnection.executeQuery("INSERT INTO %s (name) VALUES ('%s')" % (self.mappedNamesTable.name, name if not self.suffix else "%s/1" % (name)))
        self.nbWrittenSequences += 1
        for mapping in self.sortMappings(selectedMappings):
            self.nbWrittenMappings += 1
            self.gff3Writer.addTranscript(mapping.getTranscript())


    def readMappings(self):
        """
        Read the mapping file, group the consecutive mappings of each read,
        process each group, then write and close the GFF3 output.
        Reads found in the "already mapped" file are skipped.
        """
        previousQueryName = None
        mappings          = []
        self.parser.reset()
        progress = Progress(self.nbMappings, "Reading mappings", self.verbosity)
        for mapping in self.parser.getIterator():
            queryName = mapping.queryInterval.getName().split(" ")[0]
            if self.checkPreviouslyMapped(queryName):
                if self.logHandle is not None:
                    self.logHandle.write("Mapping %s has already been mapped.\n" % (queryName))
            elif previousQueryName == queryName:
                mappings.append(mapping)
            else:
                # a new read starts: flush the mappings of the previous one
                if previousQueryName is not None:
                    self.processMappings(mappings)
                previousQueryName = queryName
                mappings          = [mapping, ]
            progress.inc()
        # mappings of the last read
        self.processMappings(mappings)
        self.gff3Writer.write()
        self.gff3Writer.close()
        progress.done()


    def writeUnmatched(self):
        """
        Write the reads which are not stored in the table of mapped names.
        """
        progress = Progress(self.nbSequences, "Reading unmatched sequences", self.verbosity)
        for sequence in self.sequenceListParser.getIterator():
            name = sequence.getName().split(" ")[0]
            # NOTE(review): string-built SQL, see the matching INSERT in processMappings
            query = self.mySqlConnection.executeQuery("SELECT * FROM %s WHERE name = '%s' LIMIT 1" % (self.mappedNamesTable.name, name))
            if query.isEmpty():
                self.unmatchedWriter.addSequence(sequence)
            progress.inc()
        progress.done()


    def analyze(self):
        """
        Run the whole analysis: count and check the mappings, load the reads,
        load the already mapped reads (if any), filter and write the mappings,
        then write the unmatched reads (if requested).
        """
        self.countMappings()
        self.checkOrder()
        self.storeSequences()
        if self.alreadyMappedReader is not None:
            self.storeAlreadyMapped()
        self.readMappings()
        if self.unmatchedWriter is not None:
            self.writeUnmatched()




def computePercentage(part, total):
    """
    Compute which percentage of total is part.
    @param part:  the counted elements
    @param total: the total number of elements
    @return: the percentage as a float, or 0.0 when total is zero (empty input)
    """
    if not total:
        return 0.0
    return float(part) / total * 100


if __name__ == "__main__":

    # parse command line
    description = "Mapper Analyzer v1.0.1: Read the output of an aligner, print statistics and possibly translate into BED or GBrowse formats. [Category: Conversion]"

    parser = OptionParser(description = description)
    compGroup = OptionGroup(parser, "Compulsory options")
    filtGroup = OptionGroup(parser, "Filtering options")
    tranGroup = OptionGroup(parser, "Transformation options")
    outpGroup = OptionGroup(parser, "Output options")
    otheGroup = OptionGroup(parser, "Other options")
    compGroup.add_option("-i", "--input",            dest="inputFileName",     action="store",                        type="string", help="input file (output of the tool) [compulsory] [format: file in mapping format given by -f]")
    compGroup.add_option("-f", "--format",           dest="format",            action="store",      default="seqmap", type="string", help="format of the file [compulsory] [format: mapping file format]")
    compGroup.add_option("-q", "--sequences",        dest="sequencesFileName", action="store",                        type="string", help="file of the sequences [compulsory] [format: file in sequence format given by -k]")
    compGroup.add_option("-k", "--seqFormat",        dest="sequenceFormat",    action="store",      default="fasta",  type="string", help="format of the sequences: fasta or fastq [default: fasta] [format: sequence file format]")
    compGroup.add_option("-o", "--output",           dest="outputFileName",    action="store",                        type="string", help="output file [compulsory] [format: output file in GFF3 format]")
    filtGroup.add_option("-n", "--number",           dest="number",            action="store",      default=None,     type="int",    help="max. number of occurrences of a sequence [format: int]")
    filtGroup.add_option("-s", "--size",             dest="size",              action="store",      default=None,     type="int",    help="minimum pourcentage of size [format: int]")
    filtGroup.add_option("-d", "--identity",         dest="identity",          action="store",      default=None,     type="int",    help="minimum pourcentage of identity [format: int]")
    filtGroup.add_option("-m", "--mismatch",         dest="mismatch",          action="store",      default=None,     type="int",    help="maximum number of mismatches [format: int]")
    filtGroup.add_option("-p", "--gap",              dest="gap",               action="store",      default=None,     type="int",    help="maximum number of gaps [format: int]")
    tranGroup.add_option("-e", "--mergeExons",       dest="mergeExons",        action="store_true", default=False,                   help="merge exons when introns are short [format: bool] [default: false]")
    tranGroup.add_option("-x", "--removeExons",      dest="removeExons",       action="store_true", default=False,                   help="remove transcripts when exons are short [format: bool] [default: false]")
    outpGroup.add_option("-t", "--title",            dest="title",             action="store",      default="SMART",  type="string", help="title of the UCSC track [format: string] [default: SMART]")
    outpGroup.add_option("-r", "--remaining",        dest="remaining",         action="store_true", default=False,                   help="print the unmatched sequences [format: bool] [default: false]")
    otheGroup.add_option("-a", "--append",           dest="appendFileName",    action="store",      default=None,     type="string", help="append to GFF3 file [format: file in GFF3 format]")
    otheGroup.add_option("-v", "--verbosity",        dest="verbosity",         action="store",      default=1,        type="int",    help="trace level [default: 1] [format: int]")
    otheGroup.add_option("-l", "--log",              dest="log",               action="store_true", default=False,                   help="write a log file [format: bool] [default: false]")
    parser.add_option_group(compGroup)
    parser.add_option_group(filtGroup)
    parser.add_option_group(tranGroup)
    parser.add_option_group(outpGroup)
    parser.add_option_group(otheGroup)
    (options, args) = parser.parse_args()

    # compulsory options without a default value: fail early with a usage message
    if options.inputFileName is None:
        parser.error("missing compulsory option -i/--input")
    if options.sequencesFileName is None:
        parser.error("missing compulsory option -q/--sequences")
    if options.outputFileName is None:
        parser.error("missing compulsory option -o/--output")

    # configure the analyzer
    analyzer = MapperAnalyzer(options.verbosity)
    analyzer.setMappingFile(options.inputFileName, options.format)
    analyzer.setSequenceFile(options.sequencesFileName, options.sequenceFormat)
    analyzer.setOutputFile(options.outputFileName, options.title)
    if options.appendFileName is not None:
        analyzer.setAlreadyMatched(options.appendFileName)
    if options.remaining:
        analyzer.setRemainingFile(options.outputFileName, options.sequenceFormat)
    if options.number is not None:
        analyzer.setMaxMappings(options.number)
    if options.size is not None:
        analyzer.setMinSize(options.size)
    if options.identity is not None:
        analyzer.setMinId(options.identity)
    if options.mismatch is not None:
        analyzer.setMaxMismatches(options.mismatch)
    if options.gap is not None:
        analyzer.setMaxGaps(options.gap)
    if options.mergeExons:
        analyzer.mergeExons(True)
    if options.removeExons:
        analyzer.acceptShortExons(False)
    if options.log:
        analyzer.setLog("%s.log" % (options.outputFileName))
    analyzer.analyze()

    # print statistics (percentages are 0 when the input is empty)
    if options.verbosity > 0:
        appended    = options.appendFileName is not None
        nbSequences = analyzer.nbSequences
        nbMappings  = analyzer.nbMappings
        print("kept %i sequences over %s (%f%%)" % (analyzer.nbWrittenSequences, nbSequences, computePercentage(analyzer.nbWrittenSequences, nbSequences)))
        if appended:
            nbKeptSequences = analyzer.nbWrittenSequences + analyzer.nbAlreadyMappedSequences
            print("kept %i sequences over %s (%f%%) including already mapped sequences" % (nbKeptSequences, nbSequences, computePercentage(nbKeptSequences, nbSequences)))
        print("kept %i mappings over %i (%f%%)" % (analyzer.nbWrittenMappings, nbMappings, computePercentage(analyzer.nbWrittenMappings, nbMappings)))
        if appended:
            nbKeptMappings = analyzer.nbWrittenMappings + analyzer.nbAlreadyMapped
            print("kept %i mappings over %i (%f%%) including already mapped" % (nbKeptMappings, nbMappings, computePercentage(nbKeptMappings, nbMappings)))
        print("removed %i too short mappings (%f%%)" % (analyzer.tooShort, computePercentage(analyzer.tooShort, nbMappings)))
        print("removed %i mappings with too many mismatches (%f%%)" % (analyzer.tooManyMismatches, computePercentage(analyzer.tooManyMismatches, nbMappings)))
        print("removed %i mappings with too many gaps (%f%%)" % (analyzer.tooManyGaps, computePercentage(analyzer.tooManyGaps, nbMappings)))
        print("removed %i mappings with too short exons (%f%%)" % (analyzer.tooShortExons, computePercentage(analyzer.tooShortExons, nbMappings)))
        print("removed %i sequences with too many hits (%f%%)" % (analyzer.tooManyMappings, computePercentage(analyzer.tooManyMappings, nbSequences)))
        nbUnmapped = nbSequences - analyzer.nbWrittenSequences
        print("%i sequences have no mapping (%f%%)" % (nbUnmapped, computePercentage(nbUnmapped, nbSequences)))
        if appended:
            nbNeverMapped = nbUnmapped - analyzer.nbAlreadyMappedSequences
            print("%i sequences have no mapping (%f%%) excluding already mapped sequences" % (nbNeverMapped, computePercentage(nbNeverMapped, nbSequences)))