diff smart_toolShed/SMART/Java/Python/mapperAnalyzer.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/smart_toolShed/SMART/Java/Python/mapperAnalyzer.py	Thu Jan 17 10:52:14 2013 -0500
@@ -0,0 +1,486 @@
+#! /usr/bin/env python
+#
+# Copyright INRA-URGI 2009-2010
+# 
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software. You can use,
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info".
+# 
+# As a counterpart to the access to the source code and rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty and the software's author, the holder of the
+# economic rights, and the successive licensors have only limited
+# liability.
+# 
+# In this respect, the user's attention is drawn to the risks associated
+# with loading, using, modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean that it is complicated to manipulate, and that also
+# therefore means that it is reserved for developers and experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or
+# data to be ensured and, more generally, to use and operate it in the
+# same conditions as regards security.
+# 
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+#
+"""
+Read a mapping file (many formats supported) and select some of them
+Mappings should be sorted by read names
+"""
+import os, random, shelve
+from optparse import OptionParser, OptionGroup
+from commons.core.parsing.ParserChooser import ParserChooser
+from commons.core.parsing.FastaParser import FastaParser
+from commons.core.parsing.FastqParser import FastqParser
+from commons.core.parsing.GffParser import GffParser
+from commons.core.writer.BedWriter import BedWriter
+from commons.core.writer.UcscWriter import UcscWriter
+from commons.core.writer.GbWriter import GbWriter
+from commons.core.writer.Gff2Writer import Gff2Writer
+from commons.core.writer.Gff3Writer import Gff3Writer
+from commons.core.writer.FastaWriter import FastaWriter
+from commons.core.writer.FastqWriter import FastqWriter
+from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter
+from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection
+from SMART.Java.Python.mySql.MySqlTable import MySqlTable
+from SMART.Java.Python.misc.RPlotter import RPlotter
+from SMART.Java.Python.misc.Progress import Progress
+from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
+
+
+distanceExons = 20
+exonSize      = 20
+
+
+class MapperAnalyzer(object):
+    """
+    Analyse the output of a parser
+    """
+
+    def __init__(self, verbosity = 0):
+        self.verbosity                = verbosity
+        self.mySqlConnection          = MySqlConnection(verbosity)
+        self.tooShort                 = 0
+        self.tooManyMismatches        = 0
+        self.tooManyGaps              = 0
+        self.tooShortExons            = 0
+        self.tooManyMappings          = 0
+        self.nbMappings               = 0
+        self.nbSequences              = 0
+        self.nbAlreadyMapped          = 0
+        self.nbAlreadyMappedSequences = 0
+        self.nbWrittenMappings        = 0
+        self.nbWrittenSequences       = 0
+        self.parser                   = None
+        self.logHandle                = None
+        self.randomNumber             = random.randint(0, 100000)
+        self.gff3Writer               = None
+        self.alreadyMappedReader      = None
+        self.unmatchedWriter          = None
+        self.sequenceListParser       = None
+        self.sequences                = None
+        self.alreadyMapped            = None
+        self.mappedNamesTable         = None
+        self.minSize                  = None
+        self.minId                    = None
+        self.maxMismatches            = None 
+        self.maxGaps                  = None 
+        self.maxMappings              = None 
+        self.merge                    = False
+        self.checkExons               = False
+        self.suffix                   = None
+        self.tmpDirectory             = "%s%s" % (os.environ["SMARTMPPATH"], os.sep) if "SMARTMPPATH" in os.environ else ""
+
+
+    def __del__(self):
+        if self.sequences != None:
+            self.sequences.close()
+        if self.alreadyMapped != None:
+            self.alreadyMapped.close()
+        if self.mappedNamesTable != None:
+            self.mappedNamesTable.remove()
+        if self.gff3Writer != None:
+            self.gff3Writer.close()
+
+        if self.logHandle != None:
+            self.logHandle.close()
+        
+
+    def setMappingFile(self, fileName, format):
+        parserChooser = ParserChooser(self.verbosity)
+        parserChooser.findFormat(format, "mapping")
+        self.parser = parserChooser.getParser(fileName)
+
+
+    def setSequenceFile(self, fileName, format):
+        if format == "fasta":
+            self.sequenceListParser = FastaParser(fileName, self.verbosity)
+        elif format == "fastq":
+            self.sequenceListParser = FastqParser(fileName, self.verbosity)
+        else:
+            raise Exception("Do not understand sequence format %s" % (format))
+
+    
+    def setOutputFile(self, fileName, title):
+        self.gff3Writer = Gff3Writer(fileName, self.verbosity)
+        self.gff3Writer.setTitle(title)
+
+    
+    def setAlreadyMatched(self, fileName):
+        self.alreadyMappedReader = GffParser(fileName, self.verbosity)
+
+
+    def setRemainingFile(self, fileName, format):
+        if format == "fasta":
+            self.unmatchedWriter = FastaWriter("%s_unmatched.fasta" % (fileName), self.verbosity)
+        elif format == "fastq":
+            self.unmatchedWriter = FastqWriter("%s_unmatched.fastq" % (fileName), self.verbosity)
+        else:
+            raise Exception("Do not understand %s format." % (format))
+        self.mappedNamesTable = MySqlTable(self.mySqlConnection, "mappedNames_%d" % (self.randomNumber), self.verbosity)
+        self.mappedNamesTable.create(["name"], {"name": "char"}, {"name": 50})
+        self.mappedNamesTable.createIndex("iNameMapped", ["name", ], True)
+
+
+    def setLog(self, fileName):
+        self.logHandle = open(fileName, "w")
+
+
+    def setMinSize(self, size):
+        self.minSize = size
+
+
+    def setMinId(self, id):
+        self.minId = id
+
+
+    def setMaxMismatches(self, mismatches):
+        self.maxMismatches = mismatches
+
+
+    def setMaxGaps(self, gaps):
+        self.maxGaps = gaps
+
+
+    def setMaxMappings(self, mappings):
+        self.maxMappings = mappings
+
+
+    def mergeExons(self, b):
+        self.merge = b
+
+
+    def acceptShortExons(self, b):
+        self.checkExons = not b
+
+
+    def countMappings(self):
+        self.nbMappings = self.parser.getNbMappings()
+        if self.verbosity > 0:
+            print "%i matches found" % (self.nbMappings)
+
+
+    def storeAlreadyMapped(self):
+        self.alreadyMapped            = shelve.open("%stmpAlreadyMapped_%d" % (self.tmpDirectory, self.randomNumber))
+        progress                      = Progress(self.alreadyMappedReader.getNbTranscripts(), "Reading already mapped reads", self.verbosity)
+        self.nbAlreadyMappedSequences = 0
+        for transcript in self.alreadyMappedReader.getIterator():
+            if not self.alreadyMapped.has_key(transcript.getName()):
+                self.alreadyMapped[transcript.getName()] = 1
+                self.nbAlreadyMappedSequences           += 1
+            progress.inc()
+        progress.done()
+        self.nbAlreadyMapped = self.alreadyMappedReader.getNbTranscripts()
+
+
+    def storeSequences(self):
+        self.sequences = shelve.open("%stmpSequences_%d" % (self.tmpDirectory, self.randomNumber))
+        progress       = Progress(self.sequenceListParser.getNbSequences(), "Reading sequences", self.verbosity)
+        for sequence in self.sequenceListParser.getIterator():
+            self.sequences[sequence.getName().split(" ")[0]] = len(sequence.getSequence())
+            self.nbSequences += 1
+            progress.inc()
+        progress.done()
+        if self.verbosity > 0:
+            print "%i sequences read" % (self.nbSequences)
+
+
+    def checkOrder(self):
+        names        = shelve.open("%stmpNames_%d" % (self.tmpDirectory, self.randomNumber))
+        previousName = None
+        progress = Progress(self.nbMappings, "Checking mapping file", self.verbosity)
+        for mapping in self.parser.getIterator():
+            name = mapping.queryInterval.getName()
+            if name != previousName and previousName != None:
+                if names.has_key(previousName):
+                    raise Exception("Error! Input mapping file is not ordered! (Name '%s' occurs at least twice)" % (previousName))
+                names[previousName] = 1
+                previousName        = name
+            progress.inc()
+        progress.done()
+        names.close()
+
+
+    def checkPreviouslyMapped(self, name):
+        if self.alreadyMappedReader == None:
+            return False
+        return self.alreadyMapped.has_key(name)
+
+
+    def findOriginalSize(self, name):
+        alternate    = "%s/1" % (name)
+        if (self.suffix == None) or (not self.suffix):
+            if self.sequences.has_key(name):
+                self.suffix = False
+                return self.sequences[name]
+            if self.suffix == None:
+                self.suffix = True
+            else:
+                raise Exception("Cannot find name %n" % (name))
+        if (self.suffix):
+            if self.sequences.has_key(alternate):
+                return self.sequences[alternate]        
+        raise Exception("Cannot find name %s" % (name))
+        
+
+    def checkErrors(self, mapping):
+        accepted = True
+        # short size
+        if self.minSize != None and mapping.size * 100 < self.minSize * mapping.queryInterval.size:
+            self.tooShort += 1
+            accepted    = False
+            if self.logHandle != None:
+                self.logHandle.write("size of mapping %s is too short (%i instead of %i)\n" % (str(mapping), mapping.queryInterval.size, mapping.size))
+        # low identity
+        if self.minId != None and mapping.getTagValue("identity") < self.minId:
+            self.tooManyMismatches += 1
+            accepted                = False
+            if self.logHandle != None:
+                self.logHandle.write("mapping %s has a low identity rate\n" % (str(mapping)))
+        # too many mismatches
+        if self.maxMismatches != None and mapping.getTagValue("nbMismatches") > self.maxMismatches:
+            self.tooManyMismatches += 1
+            accepted                = False
+            if self.logHandle != None:
+                self.logHandle.write("mapping %s has more mismatches than %i\n" % (str(mapping), self.maxMismatches))
+        # too many gaps
+        if self.maxGaps != None and mapping.getTagValue("nbGaps") > self.maxGaps:
+            self.tooManyGaps += 1
+            accepted         = False
+            if self.logHandle != None:
+                self.logHandle.write("mapping %s has more gaps than %i\n" % (str(mapping), self.maxGaps))
+        # short exons
+        if self.checkExons and len(mapping.subMappings) > 1 and min([subMapping.targetInterval.getSize() for subMapping in mapping.subMappings]) < exonSize:
+            self.tooShortExons += 1
+            accepted            = False
+            if self.logHandle != None:
+                self.logHandle.write("sequence %s maps as too short exons\n" % (mapping))
+        return accepted
+
+    
+    def checkNbMappings(self, mappings):
+        nbOccurrences = 0
+        for mapping in mappings:
+            nbOccurrences += 1 if "nbOccurrences" not in mapping.getTagNames() else mapping.getTagValue("nbOccurrences")
+        if (self.maxMappings != None and nbOccurrences > self.maxMappings):
+            self.tooManyMappings += 1
+            if self.logHandle != None:
+                self.logHandle.write("sequence %s maps %i times\n" % (mappings[0].queryInterval.getName(), nbOccurrences))
+            return False
+        return (nbOccurrences > 0)
+
+
+    def sortMappings(self, mappings):
+        nbOccurrences = 0
+        for mapping in mappings:
+            nbOccurrences += 1 if "nbOccurrences" not in mapping.getTagNames() else mapping.getTagValue("nbOccurrences")
+
+        orderedMappings = sorted(mappings, key = lambda mapping: mapping.getErrorScore())
+        cpt             = 1
+        rank            = 1
+        previousMapping = None
+        previousScore   = None
+        wasLastTie      = False
+        rankedMappings  = []
+        bestRegion      = "%s:%d-%d" % (orderedMappings[0].targetInterval.getChromosome(), orderedMappings[0].targetInterval.getStart(), orderedMappings[0].targetInterval.getEnd())
+        for mapping in orderedMappings:
+            mapping.setNbOccurrences(nbOccurrences)
+            mapping.setOccurrence(cpt)
+
+            score = mapping.getErrorScore()
+            if previousScore != None and previousScore == score:
+                if "Rank" in previousMapping.getTagNames():
+                    if not wasLastTie:
+                        previousMapping.setRank("%sTie" % (rank))
+                    mapping.setRank("%sTie" % (rank))
+                    wasLastTie = True
+            else:
+                rank = cpt
+                mapping.setRank(rank)
+                wasLastTie = False
+            if cpt != 1:
+                mapping.setBestRegion(bestRegion)
+
+            rankedMappings.append(mapping)
+            previousMapping = mapping
+            previousScore   = score
+            cpt            += 1
+        return rankedMappings
+
+
+    def processMappings(self, mappings):
+        if not mappings:
+            return
+        selectedMappings = []
+        name             = mappings[0].queryInterval.getName()
+        size             = self.findOriginalSize(name)
+        for mapping in mappings:
+            if self.merge:
+                mapping.mergeExons(distanceExons)
+            mapping.queryInterval.size = size
+            if self.checkErrors(mapping):
+                selectedMappings.append(mapping)
+
+        if self.checkNbMappings(selectedMappings):
+            if self.unmatchedWriter != None:
+                query = self.mySqlConnection.executeQuery("INSERT INTO %s (name) VALUES ('%s')" % (self.mappedNamesTable.name, name if not self.suffix else "%s/1" % (name)))
+            self.nbWrittenSequences += 1
+            mappings = self.sortMappings(selectedMappings)
+            for mapping in mappings:
+                self.nbWrittenMappings += 1
+                self.gff3Writer.addTranscript(mapping.getTranscript())
+
+
+    def readMappings(self):
+        previousQueryName = None
+        mappings          = []
+        self.parser.reset()
+        progress = Progress(self.nbMappings, "Reading mappings", self.verbosity)
+        for mapping in self.parser.getIterator():
+            queryName = mapping.queryInterval.getName().split(" ")[0]
+            if self.checkPreviouslyMapped(queryName):
+                if self.logHandle != None:
+                    self.logHandle.write("Mapping %s has already been mapped.\n" % (queryName))
+            else:
+                if previousQueryName == queryName:
+                    mappings.append(mapping)
+                else:
+                    if previousQueryName != None:
+                        self.processMappings(mappings)
+                    previousQueryName = queryName
+                    mappings          = [mapping, ]
+            progress.inc()
+        self.processMappings(mappings)
+        self.gff3Writer.write()
+        self.gff3Writer.close()
+        progress.done()
+        
+
+    def writeUnmatched(self):
+        progress = Progress(self.nbSequences, "Reading unmatched sequences", self.verbosity)
+        for sequence in self.sequenceListParser.getIterator():
+            name = sequence.getName().split(" ")[0]
+            query = self.mySqlConnection.executeQuery("SELECT * FROM %s WHERE name = '%s' LIMIT 1" % (self.mappedNamesTable.name, name))
+            if query.isEmpty():
+                self.unmatchedWriter.addSequence(sequence)
+            progress.inc()
+        progress.done() 
+
+
+    def analyze(self):
+        self.countMappings()
+        self.checkOrder()
+        self.storeSequences()
+        if self.alreadyMappedReader != None:
+            self.storeAlreadyMapped()
+        self.readMappings()
+        if self.unmatchedWriter != None:
+            self.writeUnmatched()
+
+
+
+
+if __name__ == "__main__":
+    
+    # parse command line
+    description = "Mapper Analyzer v1.0.1: Read the output of an aligner, print statistics and possibly translate into BED or GBrowse formats. [Category: Conversion]"
+
+    parser = OptionParser(description = description)
+    compGroup = OptionGroup(parser, "Compulsory options")
+    filtGroup = OptionGroup(parser, "Filtering options")
+    tranGroup = OptionGroup(parser, "Transformation options")
+    outpGroup = OptionGroup(parser, "Output options")
+    otheGroup = OptionGroup(parser, "Other options")
+    compGroup.add_option("-i", "--input",            dest="inputFileName",     action="store",                        type="string", help="input file (output of the tool) [compulsory] [format: file in mapping format given by -f]")
+    compGroup.add_option("-f", "--format",           dest="format",            action="store",      default="seqmap", type="string", help="format of the file [compulsory] [format: mapping file format]")
+    compGroup.add_option("-q", "--sequences",        dest="sequencesFileName", action="store",                        type="string", help="file of the sequences [compulsory] [format: file in sequence format given by -k]")
+    compGroup.add_option("-k", "--seqFormat",        dest="sequenceFormat",    action="store",      default="fasta",  type="string", help="format of the sequences: fasta or fastq [default: fasta] [format: sequence file format]")
+    compGroup.add_option("-o", "--output",           dest="outputFileName",    action="store",                        type="string", help="output file [compulsory] [format: output file in GFF3 format]")
+    filtGroup.add_option("-n", "--number",           dest="number",            action="store",      default=None,     type="int",    help="max. number of occurrences of a sequence [format: int]")
+    filtGroup.add_option("-s", "--size",             dest="size",              action="store",      default=None,     type="int",    help="minimum pourcentage of size [format: int]")
+    filtGroup.add_option("-d", "--identity",         dest="identity",          action="store",      default=None,     type="int",    help="minimum pourcentage of identity [format: int]")
+    filtGroup.add_option("-m", "--mismatch",         dest="mismatch",          action="store",      default=None,     type="int",    help="maximum number of mismatches [format: int]")
+    filtGroup.add_option("-p", "--gap",              dest="gap",               action="store",      default=None,     type="int",    help="maximum number of gaps [format: int]")
+    tranGroup.add_option("-e", "--mergeExons",       dest="mergeExons",        action="store_true", default=False,                   help="merge exons when introns are short [format: bool] [default: false]")
+    tranGroup.add_option("-x", "--removeExons",      dest="removeExons",       action="store_true", default=False,                   help="remove transcripts when exons are short [format: bool] [default: false]")
+    outpGroup.add_option("-t", "--title",            dest="title",             action="store",      default="SMART",  type="string", help="title of the UCSC track [format: string] [default: SMART]")
+    outpGroup.add_option("-r", "--remaining",        dest="remaining",         action="store_true", default=False,                   help="print the unmatched sequences [format: bool] [default: false]")
+    otheGroup.add_option("-a", "--append",           dest="appendFileName",    action="store",      default=None,     type="string", help="append to GFF3 file [format: file in GFF3 format]")    
+    otheGroup.add_option("-v", "--verbosity",        dest="verbosity",         action="store",      default=1,        type="int",    help="trace level [default: 1] [format: int]")
+    otheGroup.add_option("-l", "--log",              dest="log",               action="store_true", default=False,                   help="write a log file [format: bool] [default: false]")
+    parser.add_option_group(compGroup)
+    parser.add_option_group(filtGroup)
+    parser.add_option_group(tranGroup)
+    parser.add_option_group(outpGroup)
+    parser.add_option_group(otheGroup)
+    (options, args) = parser.parse_args()
+
+    
+    analyzer = MapperAnalyzer(options.verbosity)
+    analyzer.setMappingFile(options.inputFileName, options.format)
+    analyzer.setSequenceFile(options.sequencesFileName, options.sequenceFormat)
+    analyzer.setOutputFile(options.outputFileName, options.title)
+    if options.appendFileName != None:
+        analyzer.setAlreadyMatched(options.appendFileName)
+    if options.remaining:
+        analyzer.setRemainingFile(options.outputFileName, options.sequenceFormat)
+    if options.number != None:
+        analyzer.setMaxMappings(options.number)
+    if options.size != None:
+        analyzer.setMinSize(options.size)
+    if options.identity != None:
+        analyzer.setMinId(options.identity)
+    if options.mismatch != None:
+        analyzer.setMaxMismatches(options.mismatch)
+    if options.gap != None:
+        analyzer.setMaxGaps(options.gap)
+    if options.mergeExons:
+        analyzer.mergeExons(True)
+    if options.removeExons:
+        analyzer.acceptShortExons(False)
+    if options.log:
+        analyzer.setLog("%s.log" % (options.outputFileName))
+    analyzer.analyze()
+    
+    if options.verbosity > 0:
+        print "kept %i sequences over %s (%f%%)" % (analyzer.nbWrittenSequences, analyzer.nbSequences, float(analyzer.nbWrittenSequences) / analyzer.nbSequences * 100)
+        if options.appendFileName != None:
+            print "kept %i sequences over %s (%f%%) including already mapped sequences" % (analyzer.nbWrittenSequences + analyzer.nbAlreadyMappedSequences, analyzer.nbSequences, float(analyzer.nbWrittenSequences + analyzer.nbAlreadyMappedSequences) / analyzer.nbSequences * 100)
+        print "kept %i mappings over %i (%f%%)" % (analyzer.nbWrittenMappings, analyzer.nbMappings, float(analyzer.nbWrittenMappings) / analyzer.nbMappings * 100)
+        if options.appendFileName != None:
+            print "kept %i mappings over %i (%f%%) including already mapped" % (analyzer.nbWrittenMappings + analyzer.nbAlreadyMapped, analyzer.nbMappings, float(analyzer.nbWrittenMappings + analyzer.nbAlreadyMapped) / analyzer.nbMappings * 100)
+        print "removed %i too short mappings (%f%%)" % (analyzer.tooShort, float(analyzer.tooShort) / analyzer.nbMappings * 100)
+        print "removed %i mappings with too many mismatches (%f%%)" % (analyzer.tooManyMismatches, float(analyzer.tooManyMismatches) / analyzer.nbMappings * 100)
+        print "removed %i mappings with too many gaps (%f%%)" % (analyzer.tooManyGaps, float(analyzer.tooManyGaps) / analyzer.nbMappings * 100)
+        print "removed %i mappings with too short exons (%f%%)" % (analyzer.tooShortExons, float(analyzer.tooShortExons) / analyzer.nbMappings * 100)
+        print "removed %i sequences with too many hits (%f%%)" % (analyzer.tooManyMappings, float(analyzer.tooManyMappings) / analyzer.nbSequences * 100)
+        print "%i sequences have no mapping (%f%%)" % (analyzer.nbSequences - analyzer.nbWrittenSequences, float(analyzer.nbSequences - analyzer.nbWrittenSequences) / analyzer.nbSequences * 100)
+        if options.appendFileName != None:
+            print "%i sequences have no mapping (%f%%) excluding already mapped sequences" % (analyzer.nbSequences - analyzer.nbWrittenSequences - analyzer.nbAlreadyMappedSequences, float(analyzer.nbSequences - analyzer.nbWrittenSequences - analyzer.nbAlreadyMappedSequences) / analyzer.nbSequences * 100)
+
+