Mercurial > repos > yufei-luo > s_mart
diff SMART/Java/Python/mapperAnalyzer.py @ 38:2c0c0a89fad7
Uploaded
author | m-zytnicki |
---|---|
date | Thu, 02 May 2013 09:56:47 -0400 |
parents | 769e306b7933 |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/mapperAnalyzer.py Thu May 02 09:56:47 2013 -0400 @@ -0,0 +1,486 @@ +#! /usr/bin/env python +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +""" +Read a mapping file (many formats supported) and select some of them +Mappings should be sorted by read names +""" +import os, random, shelve +from optparse import OptionParser, OptionGroup +from commons.core.parsing.ParserChooser import ParserChooser +from commons.core.parsing.FastaParser import FastaParser +from commons.core.parsing.FastqParser import FastqParser +from commons.core.parsing.GffParser import GffParser +from commons.core.writer.BedWriter import BedWriter +from commons.core.writer.UcscWriter import UcscWriter +from commons.core.writer.GbWriter import GbWriter +from commons.core.writer.Gff2Writer import Gff2Writer +from commons.core.writer.Gff3Writer import Gff3Writer +from commons.core.writer.FastaWriter import FastaWriter +from commons.core.writer.FastqWriter import FastqWriter +from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter +from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection +from SMART.Java.Python.mySql.MySqlTable import MySqlTable +from SMART.Java.Python.misc.RPlotter import RPlotter +from SMART.Java.Python.misc.Progress import Progress +from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress + + +distanceExons = 20 +exonSize = 20 + + +class MapperAnalyzer(object): + """ + Analyse the output of a parser + """ + + def __init__(self, verbosity = 0): + self.verbosity = verbosity + self.mySqlConnection = MySqlConnection(verbosity) + self.tooShort = 0 + self.tooManyMismatches = 0 + self.tooManyGaps = 0 + self.tooShortExons = 0 + self.tooManyMappings = 0 + self.nbMappings = 0 + self.nbSequences = 0 + self.nbAlreadyMapped = 0 + self.nbAlreadyMappedSequences = 0 + self.nbWrittenMappings = 0 + self.nbWrittenSequences = 0 + self.parser = None + self.logHandle = None + self.randomNumber = random.randint(0, 100000) + self.gff3Writer = None + self.alreadyMappedReader = None + self.unmatchedWriter = None + self.sequenceListParser = None + self.sequences = None + self.alreadyMapped = None + self.mappedNamesTable = None + self.minSize = None + self.minId = None + self.maxMismatches = None + self.maxGaps = None + self.maxMappings = None + self.merge = False + self.checkExons = False + self.suffix = None + self.tmpDirectory = "%s%s" % (os.environ["SMARTMPPATH"], os.sep) if "SMARTMPPATH" in os.environ else "" + + + def __del__(self): + if self.sequences != None: + self.sequences.close() + if self.alreadyMapped != None: + self.alreadyMapped.close() + if self.mappedNamesTable != None: + self.mappedNamesTable.remove() + if self.gff3Writer != None: + self.gff3Writer.close() + + if self.logHandle != None: + self.logHandle.close() + + + def setMappingFile(self, fileName, format): + parserChooser = ParserChooser(self.verbosity) + parserChooser.findFormat(format, "mapping") + self.parser = parserChooser.getParser(fileName) + + + def setSequenceFile(self, fileName, format): + if format == "fasta": + self.sequenceListParser = FastaParser(fileName, self.verbosity) + elif format == "fastq": + self.sequenceListParser = FastqParser(fileName, self.verbosity) + else: + raise Exception("Do not understand sequence format %s" % (format)) + + + def setOutputFile(self, fileName, title): + self.gff3Writer = Gff3Writer(fileName, self.verbosity) + self.gff3Writer.setTitle(title) + + + def setAlreadyMatched(self, fileName): + self.alreadyMappedReader = GffParser(fileName, self.verbosity) + + + def setRemainingFile(self, fileName, format): + if format == "fasta": + self.unmatchedWriter = FastaWriter("%s_unmatched.fasta" % (fileName), self.verbosity) + elif format == "fastq": + self.unmatchedWriter = FastqWriter("%s_unmatched.fastq" % (fileName), self.verbosity) + else: + raise Exception("Do not understand %s format." % (format)) + self.mappedNamesTable = MySqlTable(self.mySqlConnection, "mappedNames_%d" % (self.randomNumber), self.verbosity) + self.mappedNamesTable.create(["name"], {"name": "char"}, {"name": 50}) + self.mappedNamesTable.createIndex("iNameMapped", ["name", ], True) + + + def setLog(self, fileName): + self.logHandle = open(fileName, "w") + + + def setMinSize(self, size): + self.minSize = size + + + def setMinId(self, id): + self.minId = id + + + def setMaxMismatches(self, mismatches): + self.maxMismatches = mismatches + + + def setMaxGaps(self, gaps): + self.maxGaps = gaps + + + def setMaxMappings(self, mappings): + self.maxMappings = mappings + + + def mergeExons(self, b): + self.merge = b + + + def acceptShortExons(self, b): + self.checkExons = not b + + + def countMappings(self): + self.nbMappings = self.parser.getNbMappings() + if self.verbosity > 0: + print "%i matches found" % (self.nbMappings) + + + def storeAlreadyMapped(self): + self.alreadyMapped = shelve.open("%stmpAlreadyMapped_%d" % (self.tmpDirectory, self.randomNumber)) + progress = Progress(self.alreadyMappedReader.getNbTranscripts(), "Reading already mapped reads", self.verbosity) + self.nbAlreadyMappedSequences = 0 + for transcript in self.alreadyMappedReader.getIterator(): + if not self.alreadyMapped.has_key(transcript.getName()): + self.alreadyMapped[transcript.getName()] = 1 + self.nbAlreadyMappedSequences += 1 + progress.inc() + progress.done() + self.nbAlreadyMapped = self.alreadyMappedReader.getNbTranscripts() + + + def storeSequences(self): + self.sequences = shelve.open("%stmpSequences_%d" % (self.tmpDirectory, self.randomNumber)) + progress = Progress(self.sequenceListParser.getNbSequences(), "Reading sequences", self.verbosity) + for sequence in self.sequenceListParser.getIterator(): + self.sequences[sequence.getName().split(" ")[0]] = len(sequence.getSequence()) + self.nbSequences += 1 + progress.inc() + progress.done() + if self.verbosity > 0: + print "%i sequences read" % (self.nbSequences) + + + def checkOrder(self): + names = shelve.open("%stmpNames_%d" % (self.tmpDirectory, self.randomNumber)) + previousName = None + progress = Progress(self.nbMappings, "Checking mapping file", self.verbosity) + for mapping in self.parser.getIterator(): + name = mapping.queryInterval.getName() + if name != previousName and previousName != None: + if names.has_key(previousName): + raise Exception("Error! Input mapping file is not ordered! (Name '%s' occurs at least twice)" % (previousName)) + names[previousName] = 1 + previousName = name + progress.inc() + progress.done() + names.close() + + + def checkPreviouslyMapped(self, name): + if self.alreadyMappedReader == None: + return False + return self.alreadyMapped.has_key(name) + + + def findOriginalSize(self, name): + alternate = "%s/1" % (name) + if (self.suffix == None) or (not self.suffix): + if self.sequences.has_key(name): + self.suffix = False + return self.sequences[name] + if self.suffix == None: + self.suffix = True + else: + raise Exception("Cannot find name %n" % (name)) + if (self.suffix): + if self.sequences.has_key(alternate): + return self.sequences[alternate] + raise Exception("Cannot find name %s" % (name)) + + + def checkErrors(self, mapping): + accepted = True + # short size + if self.minSize != None and mapping.size * 100 < self.minSize * mapping.queryInterval.size: + self.tooShort += 1 + accepted = False + if self.logHandle != None: + self.logHandle.write("size of mapping %s is too short (%i instead of %i)\n" % (str(mapping), mapping.queryInterval.size, mapping.size)) + # low identity + if self.minId != None and mapping.getTagValue("identity") < self.minId: + self.tooManyMismatches += 1 + accepted = False + if self.logHandle != None: + self.logHandle.write("mapping %s has a low identity rate\n" % (str(mapping))) + # too many mismatches + if self.maxMismatches != None and mapping.getTagValue("nbMismatches") > self.maxMismatches: + self.tooManyMismatches += 1 + accepted = False + if self.logHandle != None: + self.logHandle.write("mapping %s has more mismatches than %i\n" % (str(mapping), self.maxMismatches)) + # too many gaps + if self.maxGaps != None and mapping.getTagValue("nbGaps") > self.maxGaps: + self.tooManyGaps += 1 + accepted = False + if self.logHandle != None: + self.logHandle.write("mapping %s has more gaps than %i\n" % (str(mapping), self.maxGaps)) + # short exons + if self.checkExons and len(mapping.subMappings) > 1 and min([subMapping.targetInterval.getSize() for subMapping in mapping.subMappings]) < exonSize: + self.tooShortExons += 1 + accepted = False + if self.logHandle != None: + self.logHandle.write("sequence %s maps as too short exons\n" % (mapping)) + return accepted + + + def checkNbMappings(self, mappings): + nbOccurrences = 0 + for mapping in mappings: + nbOccurrences += 1 if "nbOccurrences" not in mapping.getTagNames() else mapping.getTagValue("nbOccurrences") + if (self.maxMappings != None and nbOccurrences > self.maxMappings): + self.tooManyMappings += 1 + if self.logHandle != None: + self.logHandle.write("sequence %s maps %i times\n" % (mappings[0].queryInterval.getName(), nbOccurrences)) + return False + return (nbOccurrences > 0) + + + def sortMappings(self, mappings): + nbOccurrences = 0 + for mapping in mappings: + nbOccurrences += 1 if "nbOccurrences" not in mapping.getTagNames() else mapping.getTagValue("nbOccurrences") + + orderedMappings = sorted(mappings, key = lambda mapping: mapping.getErrorScore()) + cpt = 1 + rank = 1 + previousMapping = None + previousScore = None + wasLastTie = False + rankedMappings = [] + bestRegion = "%s:%d-%d" % (orderedMappings[0].targetInterval.getChromosome(), orderedMappings[0].targetInterval.getStart(), orderedMappings[0].targetInterval.getEnd()) + for mapping in orderedMappings: + mapping.setNbOccurrences(nbOccurrences) + mapping.setOccurrence(cpt) + + score = mapping.getErrorScore() + if previousScore != None and previousScore == score: + if "Rank" in previousMapping.getTagNames(): + if not wasLastTie: + previousMapping.setRank("%sTie" % (rank)) + mapping.setRank("%sTie" % (rank)) + wasLastTie = True + else: + rank = cpt + mapping.setRank(rank) + wasLastTie = False + if cpt != 1: + mapping.setBestRegion(bestRegion) + + rankedMappings.append(mapping) + previousMapping = mapping + previousScore = score + cpt += 1 + return rankedMappings + + + def processMappings(self, mappings): + if not mappings: + return + selectedMappings = [] + name = mappings[0].queryInterval.getName() + size = self.findOriginalSize(name) + for mapping in mappings: + if self.merge: + mapping.mergeExons(distanceExons) + mapping.queryInterval.size = size + if self.checkErrors(mapping): + selectedMappings.append(mapping) + + if self.checkNbMappings(selectedMappings): + if self.unmatchedWriter != None: + query = self.mySqlConnection.executeQuery("INSERT INTO %s (name) VALUES ('%s')" % (self.mappedNamesTable.name, name if not self.suffix else "%s/1" % (name))) + self.nbWrittenSequences += 1 + mappings = self.sortMappings(selectedMappings) + for mapping in mappings: + self.nbWrittenMappings += 1 + self.gff3Writer.addTranscript(mapping.getTranscript()) + + + def readMappings(self): + previousQueryName = None + mappings = [] + self.parser.reset() + progress = Progress(self.nbMappings, "Reading mappings", self.verbosity) + for mapping in self.parser.getIterator(): + queryName = mapping.queryInterval.getName().split(" ")[0] + if self.checkPreviouslyMapped(queryName): + if self.logHandle != None: + self.logHandle.write("Mapping %s has already been mapped.\n" % (queryName)) + else: + if previousQueryName == queryName: + mappings.append(mapping) + else: + if previousQueryName != None: + self.processMappings(mappings) + previousQueryName = queryName + mappings = [mapping, ] + progress.inc() + self.processMappings(mappings) + self.gff3Writer.write() + self.gff3Writer.close() + progress.done() + + + def writeUnmatched(self): + progress = Progress(self.nbSequences, "Reading unmatched sequences", self.verbosity) + for sequence in self.sequenceListParser.getIterator(): + name = sequence.getName().split(" ")[0] + query = self.mySqlConnection.executeQuery("SELECT * FROM %s WHERE name = '%s' LIMIT 1" % (self.mappedNamesTable.name, name)) + if query.isEmpty(): + self.unmatchedWriter.addSequence(sequence) + progress.inc() + progress.done() + + + def analyze(self): + self.countMappings() + self.checkOrder() + self.storeSequences() + if self.alreadyMappedReader != None: + self.storeAlreadyMapped() + self.readMappings() + if self.unmatchedWriter != None: + self.writeUnmatched() + + + + +if __name__ == "__main__": + + # parse command line + description = "Mapper Analyzer v1.0.1: Read the output of an aligner, print statistics and possibly translate into BED or GBrowse formats. [Category: Conversion]" + + parser = OptionParser(description = description) + compGroup = OptionGroup(parser, "Compulsory options") + filtGroup = OptionGroup(parser, "Filtering options") + tranGroup = OptionGroup(parser, "Transformation options") + outpGroup = OptionGroup(parser, "Output options") + otheGroup = OptionGroup(parser, "Other options") + compGroup.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file (output of the tool) [compulsory] [format: file in mapping format given by -f]") + compGroup.add_option("-f", "--format", dest="format", action="store", default="seqmap", type="string", help="format of the file [compulsory] [format: mapping file format]") + compGroup.add_option("-q", "--sequences", dest="sequencesFileName", action="store", type="string", help="file of the sequences [compulsory] [format: file in sequence format given by -k]") + compGroup.add_option("-k", "--seqFormat", dest="sequenceFormat", action="store", default="fasta", type="string", help="format of the sequences: fasta or fastq [default: fasta] [format: sequence file format]") + compGroup.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]") + filtGroup.add_option("-n", "--number", dest="number", action="store", default=None, type="int", help="max. number of occurrences of a sequence [format: int]") + filtGroup.add_option("-s", "--size", dest="size", action="store", default=None, type="int", help="minimum pourcentage of size [format: int]") + filtGroup.add_option("-d", "--identity", dest="identity", action="store", default=None, type="int", help="minimum pourcentage of identity [format: int]") + filtGroup.add_option("-m", "--mismatch", dest="mismatch", action="store", default=None, type="int", help="maximum number of mismatches [format: int]") + filtGroup.add_option("-p", "--gap", dest="gap", action="store", default=None, type="int", help="maximum number of gaps [format: int]") + tranGroup.add_option("-e", "--mergeExons", dest="mergeExons", action="store_true", default=False, help="merge exons when introns are short [format: bool] [default: false]") + tranGroup.add_option("-x", "--removeExons", dest="removeExons", action="store_true", default=False, help="remove transcripts when exons are short [format: bool] [default: false]") + outpGroup.add_option("-t", "--title", dest="title", action="store", default="SMART", type="string", help="title of the UCSC track [format: string] [default: SMART]") + outpGroup.add_option("-r", "--remaining", dest="remaining", action="store_true", default=False, help="print the unmatched sequences [format: bool] [default: false]") + otheGroup.add_option("-a", "--append", dest="appendFileName", action="store", default=None, type="string", help="append to GFF3 file [format: file in GFF3 format]") + otheGroup.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]") + otheGroup.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]") + parser.add_option_group(compGroup) + parser.add_option_group(filtGroup) + parser.add_option_group(tranGroup) + parser.add_option_group(outpGroup) + parser.add_option_group(otheGroup) + (options, args) = parser.parse_args() + + + analyzer = MapperAnalyzer(options.verbosity) + analyzer.setMappingFile(options.inputFileName, options.format) + analyzer.setSequenceFile(options.sequencesFileName, options.sequenceFormat) + analyzer.setOutputFile(options.outputFileName, options.title) + if options.appendFileName != None: + analyzer.setAlreadyMatched(options.appendFileName) + if options.remaining: + analyzer.setRemainingFile(options.outputFileName, options.sequenceFormat) + if options.number != None: + analyzer.setMaxMappings(options.number) + if options.size != None: + analyzer.setMinSize(options.size) + if options.identity != None: + analyzer.setMinId(options.identity) + if options.mismatch != None: + analyzer.setMaxMismatches(options.mismatch) + if options.gap != None: + analyzer.setMaxGaps(options.gap) + if options.mergeExons: + analyzer.mergeExons(True) + if options.removeExons: + analyzer.acceptShortExons(False) + if options.log: + analyzer.setLog("%s.log" % (options.outputFileName)) + analyzer.analyze() + + if options.verbosity > 0: + print "kept %i sequences over %s (%f%%)" % (analyzer.nbWrittenSequences, analyzer.nbSequences, float(analyzer.nbWrittenSequences) / analyzer.nbSequences * 100) + if options.appendFileName != None: + print "kept %i sequences over %s (%f%%) including already mapped sequences" % (analyzer.nbWrittenSequences + analyzer.nbAlreadyMappedSequences, analyzer.nbSequences, float(analyzer.nbWrittenSequences + analyzer.nbAlreadyMappedSequences) / analyzer.nbSequences * 100) + print "kept %i mappings over %i (%f%%)" % (analyzer.nbWrittenMappings, analyzer.nbMappings, float(analyzer.nbWrittenMappings) / analyzer.nbMappings * 100) + if options.appendFileName != None: + print "kept %i mappings over %i (%f%%) including already mapped" % (analyzer.nbWrittenMappings + analyzer.nbAlreadyMapped, analyzer.nbMappings, float(analyzer.nbWrittenMappings + analyzer.nbAlreadyMapped) / analyzer.nbMappings * 100) + print "removed %i too short mappings (%f%%)" % (analyzer.tooShort, float(analyzer.tooShort) / analyzer.nbMappings * 100) + print "removed %i mappings with too many mismatches (%f%%)" % (analyzer.tooManyMismatches, float(analyzer.tooManyMismatches) / analyzer.nbMappings * 100) + print "removed %i mappings with too many gaps (%f%%)" % (analyzer.tooManyGaps, float(analyzer.tooManyGaps) / analyzer.nbMappings * 100) + print "removed %i mappings with too short exons (%f%%)" % (analyzer.tooShortExons, float(analyzer.tooShortExons) / analyzer.nbMappings * 100) + print "removed %i sequences with too many hits (%f%%)" % (analyzer.tooManyMappings, float(analyzer.tooManyMappings) / analyzer.nbSequences * 100) + print "%i sequences have no mapping (%f%%)" % (analyzer.nbSequences - analyzer.nbWrittenSequences, float(analyzer.nbSequences - analyzer.nbWrittenSequences) / analyzer.nbSequences * 100) + if options.appendFileName != None: + print "%i sequences have no mapping (%f%%) excluding already mapped sequences" % (analyzer.nbSequences - analyzer.nbWrittenSequences - analyzer.nbAlreadyMappedSequences, float(analyzer.nbSequences - analyzer.nbWrittenSequences - analyzer.nbAlreadyMappedSequences) / analyzer.nbSequences * 100) + +