Mercurial > repos > yufei-luo > s_mart
view smart_toolShed/SMART/Java/Python/mapperAnalyzer.py @ 4:1fc014126d55
Uploaded
author | yufei-luo |
---|---|
date | Fri, 18 Jan 2013 04:45:50 -0500 |
parents | e0f8dcca02ed |
children |
line wrap: on
line source
#! /usr/bin/env python # # Copyright INRA-URGI 2009-2010 # # This software is governed by the CeCILL license under French law and # abiding by the rules of distribution of free software. You can use, # modify and/ or redistribute the software under the terms of the CeCILL # license as circulated by CEA, CNRS and INRIA at the following URL # "http://www.cecill.info". # # As a counterpart to the access to the source code and rights to copy, # modify and redistribute granted by the license, users are provided only # with a limited warranty and the software's author, the holder of the # economic rights, and the successive licensors have only limited # liability. # # In this respect, the user's attention is drawn to the risks associated # with loading, using, modifying and/or developing or reproducing the # software by the user in light of its specific status of free software, # that may mean that it is complicated to manipulate, and that also # therefore means that it is reserved for developers and experienced # professionals having in-depth computer knowledge. Users are therefore # encouraged to load and test the software's suitability as regards their # requirements in conditions enabling the security of their systems and/or # data to be ensured and, more generally, to use and operate it in the # same conditions as regards security. # # The fact that you are presently reading this means that you have had # knowledge of the CeCILL license and that you accept its terms. # """ Read a mapping file (many formats supported) and select some of them Mappings should be sorted by read names """ import os, random, shelve from optparse import OptionParser, OptionGroup from commons.core.parsing.ParserChooser import ParserChooser from commons.core.parsing.FastaParser import FastaParser from commons.core.parsing.FastqParser import FastqParser from commons.core.parsing.GffParser import GffParser from commons.core.writer.BedWriter import BedWriter from commons.core.writer.UcscWriter import UcscWriter from commons.core.writer.GbWriter import GbWriter from commons.core.writer.Gff2Writer import Gff2Writer from commons.core.writer.Gff3Writer import Gff3Writer from commons.core.writer.FastaWriter import FastaWriter from commons.core.writer.FastqWriter import FastqWriter from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection from SMART.Java.Python.mySql.MySqlTable import MySqlTable from SMART.Java.Python.misc.RPlotter import RPlotter from SMART.Java.Python.misc.Progress import Progress from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress distanceExons = 20 exonSize = 20 class MapperAnalyzer(object): """ Analyse the output of a parser """ def __init__(self, verbosity = 0): self.verbosity = verbosity self.mySqlConnection = MySqlConnection(verbosity) self.tooShort = 0 self.tooManyMismatches = 0 self.tooManyGaps = 0 self.tooShortExons = 0 self.tooManyMappings = 0 self.nbMappings = 0 self.nbSequences = 0 self.nbAlreadyMapped = 0 self.nbAlreadyMappedSequences = 0 self.nbWrittenMappings = 0 self.nbWrittenSequences = 0 self.parser = None self.logHandle = None self.randomNumber = random.randint(0, 100000) self.gff3Writer = None self.alreadyMappedReader = None self.unmatchedWriter = None self.sequenceListParser = None self.sequences = None self.alreadyMapped = None self.mappedNamesTable = None self.minSize = None self.minId = None self.maxMismatches = None self.maxGaps = None self.maxMappings = None self.merge = False self.checkExons = False self.suffix = None self.tmpDirectory = "%s%s" % (os.environ["SMARTMPPATH"], os.sep) if "SMARTMPPATH" in os.environ else "" def __del__(self): if self.sequences != None: self.sequences.close() if self.alreadyMapped != None: self.alreadyMapped.close() if self.mappedNamesTable != None: self.mappedNamesTable.remove() if self.gff3Writer != None: self.gff3Writer.close() if self.logHandle != None: self.logHandle.close() def setMappingFile(self, fileName, format): parserChooser = ParserChooser(self.verbosity) parserChooser.findFormat(format, "mapping") self.parser = parserChooser.getParser(fileName) def setSequenceFile(self, fileName, format): if format == "fasta": self.sequenceListParser = FastaParser(fileName, self.verbosity) elif format == "fastq": self.sequenceListParser = FastqParser(fileName, self.verbosity) else: raise Exception("Do not understand sequence format %s" % (format)) def setOutputFile(self, fileName, title): self.gff3Writer = Gff3Writer(fileName, self.verbosity) self.gff3Writer.setTitle(title) def setAlreadyMatched(self, fileName): self.alreadyMappedReader = GffParser(fileName, self.verbosity) def setRemainingFile(self, fileName, format): if format == "fasta": self.unmatchedWriter = FastaWriter("%s_unmatched.fasta" % (fileName), self.verbosity) elif format == "fastq": self.unmatchedWriter = FastqWriter("%s_unmatched.fastq" % (fileName), self.verbosity) else: raise Exception("Do not understand %s format." % (format)) self.mappedNamesTable = MySqlTable(self.mySqlConnection, "mappedNames_%d" % (self.randomNumber), self.verbosity) self.mappedNamesTable.create(["name"], {"name": "char"}, {"name": 50}) self.mappedNamesTable.createIndex("iNameMapped", ["name", ], True) def setLog(self, fileName): self.logHandle = open(fileName, "w") def setMinSize(self, size): self.minSize = size def setMinId(self, id): self.minId = id def setMaxMismatches(self, mismatches): self.maxMismatches = mismatches def setMaxGaps(self, gaps): self.maxGaps = gaps def setMaxMappings(self, mappings): self.maxMappings = mappings def mergeExons(self, b): self.merge = b def acceptShortExons(self, b): self.checkExons = not b def countMappings(self): self.nbMappings = self.parser.getNbMappings() if self.verbosity > 0: print "%i matches found" % (self.nbMappings) def storeAlreadyMapped(self): self.alreadyMapped = shelve.open("%stmpAlreadyMapped_%d" % (self.tmpDirectory, self.randomNumber)) progress = Progress(self.alreadyMappedReader.getNbTranscripts(), "Reading already mapped reads", self.verbosity) self.nbAlreadyMappedSequences = 0 for transcript in self.alreadyMappedReader.getIterator(): if not self.alreadyMapped.has_key(transcript.getName()): self.alreadyMapped[transcript.getName()] = 1 self.nbAlreadyMappedSequences += 1 progress.inc() progress.done() self.nbAlreadyMapped = self.alreadyMappedReader.getNbTranscripts() def storeSequences(self): self.sequences = shelve.open("%stmpSequences_%d" % (self.tmpDirectory, self.randomNumber)) progress = Progress(self.sequenceListParser.getNbSequences(), "Reading sequences", self.verbosity) for sequence in self.sequenceListParser.getIterator(): self.sequences[sequence.getName().split(" ")[0]] = len(sequence.getSequence()) self.nbSequences += 1 progress.inc() progress.done() if self.verbosity > 0: print "%i sequences read" % (self.nbSequences) def checkOrder(self): names = shelve.open("%stmpNames_%d" % (self.tmpDirectory, self.randomNumber)) previousName = None progress = Progress(self.nbMappings, "Checking mapping file", self.verbosity) for mapping in self.parser.getIterator(): name = mapping.queryInterval.getName() if name != previousName and previousName != None: if names.has_key(previousName): raise Exception("Error! Input mapping file is not ordered! (Name '%s' occurs at least twice)" % (previousName)) names[previousName] = 1 previousName = name progress.inc() progress.done() names.close() def checkPreviouslyMapped(self, name): if self.alreadyMappedReader == None: return False return self.alreadyMapped.has_key(name) def findOriginalSize(self, name): alternate = "%s/1" % (name) if (self.suffix == None) or (not self.suffix): if self.sequences.has_key(name): self.suffix = False return self.sequences[name] if self.suffix == None: self.suffix = True else: raise Exception("Cannot find name %n" % (name)) if (self.suffix): if self.sequences.has_key(alternate): return self.sequences[alternate] raise Exception("Cannot find name %s" % (name)) def checkErrors(self, mapping): accepted = True # short size if self.minSize != None and mapping.size * 100 < self.minSize * mapping.queryInterval.size: self.tooShort += 1 accepted = False if self.logHandle != None: self.logHandle.write("size of mapping %s is too short (%i instead of %i)\n" % (str(mapping), mapping.queryInterval.size, mapping.size)) # low identity if self.minId != None and mapping.getTagValue("identity") < self.minId: self.tooManyMismatches += 1 accepted = False if self.logHandle != None: self.logHandle.write("mapping %s has a low identity rate\n" % (str(mapping))) # too many mismatches if self.maxMismatches != None and mapping.getTagValue("nbMismatches") > self.maxMismatches: self.tooManyMismatches += 1 accepted = False if self.logHandle != None: self.logHandle.write("mapping %s has more mismatches than %i\n" % (str(mapping), self.maxMismatches)) # too many gaps if self.maxGaps != None and mapping.getTagValue("nbGaps") > self.maxGaps: self.tooManyGaps += 1 accepted = False if self.logHandle != None: self.logHandle.write("mapping %s has more gaps than %i\n" % (str(mapping), self.maxGaps)) # short exons if self.checkExons and len(mapping.subMappings) > 1 and min([subMapping.targetInterval.getSize() for subMapping in mapping.subMappings]) < exonSize: self.tooShortExons += 1 accepted = False if self.logHandle != None: self.logHandle.write("sequence %s maps as too short exons\n" % (mapping)) return accepted def checkNbMappings(self, mappings): nbOccurrences = 0 for mapping in mappings: nbOccurrences += 1 if "nbOccurrences" not in mapping.getTagNames() else mapping.getTagValue("nbOccurrences") if (self.maxMappings != None and nbOccurrences > self.maxMappings): self.tooManyMappings += 1 if self.logHandle != None: self.logHandle.write("sequence %s maps %i times\n" % (mappings[0].queryInterval.getName(), nbOccurrences)) return False return (nbOccurrences > 0) def sortMappings(self, mappings): nbOccurrences = 0 for mapping in mappings: nbOccurrences += 1 if "nbOccurrences" not in mapping.getTagNames() else mapping.getTagValue("nbOccurrences") orderedMappings = sorted(mappings, key = lambda mapping: mapping.getErrorScore()) cpt = 1 rank = 1 previousMapping = None previousScore = None wasLastTie = False rankedMappings = [] bestRegion = "%s:%d-%d" % (orderedMappings[0].targetInterval.getChromosome(), orderedMappings[0].targetInterval.getStart(), orderedMappings[0].targetInterval.getEnd()) for mapping in orderedMappings: mapping.setNbOccurrences(nbOccurrences) mapping.setOccurrence(cpt) score = mapping.getErrorScore() if previousScore != None and previousScore == score: if "Rank" in previousMapping.getTagNames(): if not wasLastTie: previousMapping.setRank("%sTie" % (rank)) mapping.setRank("%sTie" % (rank)) wasLastTie = True else: rank = cpt mapping.setRank(rank) wasLastTie = False if cpt != 1: mapping.setBestRegion(bestRegion) rankedMappings.append(mapping) previousMapping = mapping previousScore = score cpt += 1 return rankedMappings def processMappings(self, mappings): if not mappings: return selectedMappings = [] name = mappings[0].queryInterval.getName() size = self.findOriginalSize(name) for mapping in mappings: if self.merge: mapping.mergeExons(distanceExons) mapping.queryInterval.size = size if self.checkErrors(mapping): selectedMappings.append(mapping) if self.checkNbMappings(selectedMappings): if self.unmatchedWriter != None: query = self.mySqlConnection.executeQuery("INSERT INTO %s (name) VALUES ('%s')" % (self.mappedNamesTable.name, name if not self.suffix else "%s/1" % (name))) self.nbWrittenSequences += 1 mappings = self.sortMappings(selectedMappings) for mapping in mappings: self.nbWrittenMappings += 1 self.gff3Writer.addTranscript(mapping.getTranscript()) def readMappings(self): previousQueryName = None mappings = [] self.parser.reset() progress = Progress(self.nbMappings, "Reading mappings", self.verbosity) for mapping in self.parser.getIterator(): queryName = mapping.queryInterval.getName().split(" ")[0] if self.checkPreviouslyMapped(queryName): if self.logHandle != None: self.logHandle.write("Mapping %s has already been mapped.\n" % (queryName)) else: if previousQueryName == queryName: mappings.append(mapping) else: if previousQueryName != None: self.processMappings(mappings) previousQueryName = queryName mappings = [mapping, ] progress.inc() self.processMappings(mappings) self.gff3Writer.write() self.gff3Writer.close() progress.done() def writeUnmatched(self): progress = Progress(self.nbSequences, "Reading unmatched sequences", self.verbosity) for sequence in self.sequenceListParser.getIterator(): name = sequence.getName().split(" ")[0] query = self.mySqlConnection.executeQuery("SELECT * FROM %s WHERE name = '%s' LIMIT 1" % (self.mappedNamesTable.name, name)) if query.isEmpty(): self.unmatchedWriter.addSequence(sequence) progress.inc() progress.done() def analyze(self): self.countMappings() self.checkOrder() self.storeSequences() if self.alreadyMappedReader != None: self.storeAlreadyMapped() self.readMappings() if self.unmatchedWriter != None: self.writeUnmatched() if __name__ == "__main__": # parse command line description = "Mapper Analyzer v1.0.1: Read the output of an aligner, print statistics and possibly translate into BED or GBrowse formats. [Category: Conversion]" parser = OptionParser(description = description) compGroup = OptionGroup(parser, "Compulsory options") filtGroup = OptionGroup(parser, "Filtering options") tranGroup = OptionGroup(parser, "Transformation options") outpGroup = OptionGroup(parser, "Output options") otheGroup = OptionGroup(parser, "Other options") compGroup.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file (output of the tool) [compulsory] [format: file in mapping format given by -f]") compGroup.add_option("-f", "--format", dest="format", action="store", default="seqmap", type="string", help="format of the file [compulsory] [format: mapping file format]") compGroup.add_option("-q", "--sequences", dest="sequencesFileName", action="store", type="string", help="file of the sequences [compulsory] [format: file in sequence format given by -k]") compGroup.add_option("-k", "--seqFormat", dest="sequenceFormat", action="store", default="fasta", type="string", help="format of the sequences: fasta or fastq [default: fasta] [format: sequence file format]") compGroup.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]") filtGroup.add_option("-n", "--number", dest="number", action="store", default=None, type="int", help="max. number of occurrences of a sequence [format: int]") filtGroup.add_option("-s", "--size", dest="size", action="store", default=None, type="int", help="minimum pourcentage of size [format: int]") filtGroup.add_option("-d", "--identity", dest="identity", action="store", default=None, type="int", help="minimum pourcentage of identity [format: int]") filtGroup.add_option("-m", "--mismatch", dest="mismatch", action="store", default=None, type="int", help="maximum number of mismatches [format: int]") filtGroup.add_option("-p", "--gap", dest="gap", action="store", default=None, type="int", help="maximum number of gaps [format: int]") tranGroup.add_option("-e", "--mergeExons", dest="mergeExons", action="store_true", default=False, help="merge exons when introns are short [format: bool] [default: false]") tranGroup.add_option("-x", "--removeExons", dest="removeExons", action="store_true", default=False, help="remove transcripts when exons are short [format: bool] [default: false]") outpGroup.add_option("-t", "--title", dest="title", action="store", default="SMART", type="string", help="title of the UCSC track [format: string] [default: SMART]") outpGroup.add_option("-r", "--remaining", dest="remaining", action="store_true", default=False, help="print the unmatched sequences [format: bool] [default: false]") otheGroup.add_option("-a", "--append", dest="appendFileName", action="store", default=None, type="string", help="append to GFF3 file [format: file in GFF3 format]") otheGroup.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]") otheGroup.add_option("-l", "--log", dest="log", action="store_true", default=False, help="write a log file [format: bool] [default: false]") parser.add_option_group(compGroup) parser.add_option_group(filtGroup) parser.add_option_group(tranGroup) parser.add_option_group(outpGroup) parser.add_option_group(otheGroup) (options, args) = parser.parse_args() analyzer = MapperAnalyzer(options.verbosity) analyzer.setMappingFile(options.inputFileName, options.format) analyzer.setSequenceFile(options.sequencesFileName, options.sequenceFormat) analyzer.setOutputFile(options.outputFileName, options.title) if options.appendFileName != None: analyzer.setAlreadyMatched(options.appendFileName) if options.remaining: analyzer.setRemainingFile(options.outputFileName, options.sequenceFormat) if options.number != None: analyzer.setMaxMappings(options.number) if options.size != None: analyzer.setMinSize(options.size) if options.identity != None: analyzer.setMinId(options.identity) if options.mismatch != None: analyzer.setMaxMismatches(options.mismatch) if options.gap != None: analyzer.setMaxGaps(options.gap) if options.mergeExons: analyzer.mergeExons(True) if options.removeExons: analyzer.acceptShortExons(False) if options.log: analyzer.setLog("%s.log" % (options.outputFileName)) analyzer.analyze() if options.verbosity > 0: print "kept %i sequences over %s (%f%%)" % (analyzer.nbWrittenSequences, analyzer.nbSequences, float(analyzer.nbWrittenSequences) / analyzer.nbSequences * 100) if options.appendFileName != None: print "kept %i sequences over %s (%f%%) including already mapped sequences" % (analyzer.nbWrittenSequences + analyzer.nbAlreadyMappedSequences, analyzer.nbSequences, float(analyzer.nbWrittenSequences + analyzer.nbAlreadyMappedSequences) / analyzer.nbSequences * 100) print "kept %i mappings over %i (%f%%)" % (analyzer.nbWrittenMappings, analyzer.nbMappings, float(analyzer.nbWrittenMappings) / analyzer.nbMappings * 100) if options.appendFileName != None: print "kept %i mappings over %i (%f%%) including already mapped" % (analyzer.nbWrittenMappings + analyzer.nbAlreadyMapped, analyzer.nbMappings, float(analyzer.nbWrittenMappings + analyzer.nbAlreadyMapped) / analyzer.nbMappings * 100) print "removed %i too short mappings (%f%%)" % (analyzer.tooShort, float(analyzer.tooShort) / analyzer.nbMappings * 100) print "removed %i mappings with too many mismatches (%f%%)" % (analyzer.tooManyMismatches, float(analyzer.tooManyMismatches) / analyzer.nbMappings * 100) print "removed %i mappings with too many gaps (%f%%)" % (analyzer.tooManyGaps, float(analyzer.tooManyGaps) / analyzer.nbMappings * 100) print "removed %i mappings with too short exons (%f%%)" % (analyzer.tooShortExons, float(analyzer.tooShortExons) / analyzer.nbMappings * 100) print "removed %i sequences with too many hits (%f%%)" % (analyzer.tooManyMappings, float(analyzer.tooManyMappings) / analyzer.nbSequences * 100) print "%i sequences have no mapping (%f%%)" % (analyzer.nbSequences - analyzer.nbWrittenSequences, float(analyzer.nbSequences - analyzer.nbWrittenSequences) / analyzer.nbSequences * 100) if options.appendFileName != None: print "%i sequences have no mapping (%f%%) excluding already mapped sequences" % (analyzer.nbSequences - analyzer.nbWrittenSequences - analyzer.nbAlreadyMappedSequences, float(analyzer.nbSequences - analyzer.nbWrittenSequences - analyzer.nbAlreadyMappedSequences) / analyzer.nbSequences * 100)