Mercurial > repos > yufei-luo > s_mart
view SMART/Java/Python/structure/TranscriptListsComparator.py @ 44:5f796c5c579f
Uploaded
author | m-zytnicki |
---|---|
date | Wed, 18 Sep 2013 08:32:38 -0400 |
parents | 769e306b7933 |
children |
line wrap: on
line source
# # Copyright INRA-URGI 2009-2010 # # This software is governed by the CeCILL license under French law and # abiding by the rules of distribution of free software. You can use, # modify and/ or redistribute the software under the terms of the CeCILL # license as circulated by CEA, CNRS and INRIA at the following URL # "http://www.cecill.info". # # As a counterpart to the access to the source code and rights to copy, # modify and redistribute granted by the license, users are provided only # with a limited warranty and the software's author, the holder of the # economic rights, and the successive licensors have only limited # liability. # # In this respect, the user's attention is drawn to the risks associated # with loading, using, modifying and/or developing or reproducing the # software by the user in light of its specific status of free software, # that may mean that it is complicated to manipulate, and that also # therefore means that it is reserved for developers and experienced # professionals having in-depth computer knowledge. Users are therefore # encouraged to load and test the software's suitability as regards their # requirements in conditions enabling the security of their systems and/or # data to be ensured and, more generally, to use and operate it in the # same conditions as regards security. # # The fact that you are presently reading this means that you have had # knowledge of the CeCILL license and that you accept its terms. 
# import sys import random from SMART.Java.Python.misc import Utils from SMART.Java.Python.structure.Transcript import Transcript from SMART.Java.Python.structure.TranscriptList import TranscriptList from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection from SMART.Java.Python.mySql.MySqlTranscriptTable import MySqlTranscriptTable from SMART.Java.Python.misc.Progress import Progress from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter class TranscriptListsComparator(object): """ Compare two transcript lists, using a database for one of the list Uses one TranscriptContainer for query data, one TranscriptContainer exported to MySqlTranscriptTable for reference data, one MySqlTranscriptTable for transformed reference data @ivar inputTranscriptContainers: parsers to the list of query transcripts @type inputTranscriptContainers: list of 2 L{TranscriptContainer<TranscriptContainer>} @ivar writer: transcript list writer @type writer: class L{TranscriptListWriter<TranscriptListWriter>} @ivar mySqlConnection: connection to a MySQL database (to compute the ovelapping efficiently) @type mySqlConnection: class L{MySqlConnection<MySqlConnection>} @ivar introns: compare transcripts or exons only @type introns: list of 2 boolean @ivar starts: restrict the query transcripts to first nucleotides @type starts: list of 2 int or None @ivar fivePrimes: extend a list of transcripts by their 5' end @type fivePrimes: list of 2 int or None @ivar threePrimes: extend a list of transcripts by their 3' end @type threePrimes: list of 2 int or None @ivar minDistance: min distance between two transcripts [default: 0] @type minDistance: int @ivar maxDistance: max distance between two transcripts [default: 0] @type maxDistance: int @ivar minOverlap: minimum number of overlapping nucleotides to declare an overlap @type minOverlap: int @ivar pcOverlap: percentage of overlapping 
nucleotides to declare an overlap @type pcOverlap: int @ivar upstreams: consider distances with elements which are upstream of the transcripts @type upstreams: boolean @ivar downstreams: consider distances with elements which are downstream of the transcripts @type downstreams: boolean @ivar colinear: whether transcripts should overlap in the same direction @type colinear: boolean @ivar antisense: whether transcripts should overlap in the opposite direction @type antisense: boolean @ivar outputDistance: output distance between query and reference instead of query transcript @type outputDistance: boolean @ivar absolute: do not consider the strand while computing distance @type absolute: boolean @ivar strandedDistance: return a line per strand while computing distances @type strandedDistance: boolean @ivar QUERY: constant specifying the query objects @type QUERY: int @ivar REFERENCE: constant specifying the reference objects @type REFERENCE: int @ivar INPUTTYPES: set of input types of data (query or reference) objects @type INPUTTYPES: list of 2 int @ivar typeToString: string representation of the previous types @type typeToString: dict @ivar tableNames: name of the transcript tables @type tableNames: dict of strings @ivar nbTranscripts: number of transcript in the query/reference set @type nbTranscripts: list of 2 int or None @ivar nbNucleotides: number of nucleotides in the query/reference set @type nbNucleotides: list of 2 int or None @ivar transcriptsToBeStored: transcripts that will be stored into database @type transcriptsToBeStored: dict of class L{TranscriptList<TranscriptList>} @ivar multiple: in merge mode, aggregate multiple transcripts @type multiple: boolean @ivar normalization: normalize each element by the number of mappings of this element @type normalization: boolean @ivar invert: invert the current comparison @type invert: boolean @ivar splitDifference: split into intervals when computing difference @type splitDifference: boolean @ivar odds: whether 
def __init__(self, logHandle = None, verbosity = 0):
    """
    Constructor: set every option to its default value, create the MySQL
    connection, the table writers and the database
    @param logHandle: log handle
    @type  logHandle: file
    @param verbosity: verbosity
    @type  verbosity: int
    """
    # identifiers of the data sets; they double as indexes into the per-set lists
    self.QUERY             = 0
    self.REFERENCE         = 1
    self.WORKING           = 2
    self.INPUTTYPES        = (self.QUERY, self.REFERENCE)
    self.INPUTWORKINGTYPES = (self.QUERY, self.REFERENCE, self.WORKING)
    self.typeToString      = {self.QUERY: "Query", self.REFERENCE: "Reference", self.WORKING: "Working"}

    self.logHandle = logHandle
    self.verbosity = verbosity

    # database objects
    self.mySqlConnection           = MySqlConnection(self.verbosity-1)
    self.inputTranscriptContainers = [None, None]
    # four randomly suffixed names; note that index 2 (the value of
    # self.WORKING) holds the "tmpOutputTable_..." name, not the
    # "tmpWorkingTable_..." one
    tableNamePatterns = ("tmpQueryTable_%d", "tmpReferenceTable_%d", "tmpOutputTable_%d", "tmpWorkingTable_%d")
    self.tableNames   = []
    for tableNamePattern in tableNamePatterns:
        self.tableNames.append(tableNamePattern % (random.randint(0, 100000)))
    self.mySqlTranscriptWriters = []
    for tableName in self.tableNames:
        self.mySqlTranscriptWriters.append(MySqlTranscriptWriter(self.mySqlConnection, tableName, verbosity-1))
    self.writer = None

    # per data set options (index 0: query, index 1: reference)
    self.introns     = [False, False]
    self.starts      = [None, None]
    self.ends        = [None, None]
    self.fivePrimes  = [None, None]
    self.threePrimes = [None, None]
    self.downstreams = [False, False]
    self.upstreams   = [False, False]

    # comparison options
    self.minDistance      = None
    self.maxDistance      = 0
    self.minOverlap       = 1
    self.pcOverlap        = None
    self.colinear         = False
    self.antisense        = False
    self.outputDistance   = False
    self.absolute         = False
    self.strandedDistance = False
    self.normalization    = False
    self.included         = False
    self.including        = False
    self.invert           = False
    self.notOverlapping   = False
    self.splitDifference  = False
    self.multiple         = False
    self.odds             = False

    # statistics and results
    self.nbTranscripts   = [None, None]
    self.nbNucleotides   = [None, None]
    self.overlapResults  = None
    self.oddResults      = None
    self.outputContainer = None
    self.transcriptsToBeStored = {}
    for inputType in self.INPUTWORKINGTYPES:
        self.transcriptsToBeStored[inputType] = TranscriptList()
    self.nbPrinted = 0

    self.mySqlConnection.createDatabase()
def __del__(self):
    """
    Destructor
    Ask the query, reference and working writers to remove their tables,
    then delete the database
    """
    for inputType in self.INPUTWORKINGTYPES:
        tableWriter = self.mySqlTranscriptWriters[inputType]
        tableWriter.removeTables()
    self.mySqlConnection.deleteDatabase()


def acceptIntrons(self, type, bool):
    """
    Compare whole transcripts (introns included) or exons only
    @param type: whether use query/reference data
    @type  type: int
    @param bool: include introns or not
    @type  bool: boolean
    """
    self.introns[type] = bool


def restrictToStart(self, type, size):
    """
    Restrict a list of transcripts to their first nucleotides
    (this also switches off the intron mode for this data set)
    @param type: whether use query/reference data
    @type  type: int
    @param size: the size of the transcript to be considered
    @type  size: int
    """
    self.introns[type] = False
    self.starts[type]  = size


def restrictToEnd(self, type, size):
    """
    Restrict a list of transcripts to their end: the size is stored in
    self.ends (presumably the last nucleotides are kept -- the previous
    description said "first", like restrictToStart)
    (this also switches off the intron mode for this data set)
    @param type: whether use query/reference data
    @type  type: int
    @param size: the size of the transcript to be considered
    @type  size: int
    """
    self.introns[type] = False
    self.ends[type]    = size


def extendFivePrime(self, type, size):
    """
    Extend a list of transcripts by their 5' end
    @param type: whether use query/reference data
    @type  type: int
    @param size: size of the extension
    @type  size: int
    """
    self.fivePrimes[type] = size


def extendThreePrime(self, type, size):
    """
    Extend a list of transcripts by their 3' end
    @param type: whether use query/reference data
    @type  type: int
    @param size: size of the extension
    @type  size: int
    """
    self.threePrimes[type] = size
def setMinDistance(self, distance):
    """
    Record the min distance between two transcripts
    @param distance: distance
    @type  distance: int
    """
    self.minDistance = distance


def setMaxDistance(self, distance):
    """
    Record the max distance between two transcripts
    @param distance: distance
    @type  distance: int
    """
    self.maxDistance = distance


def setMinOverlap(self, overlap):
    """
    Record the minimum number of nucleotides to declare an overlap
    @param overlap: minimum number of nucleotides
    @type  overlap: int
    """
    self.minOverlap = overlap


def setPcOverlap(self, overlap):
    """
    Record the percentage of nucleotides to declare an overlap
    @param overlap: percentage of nucleotides
    @type  overlap: int
    """
    self.pcOverlap = overlap


def setUpstream(self, type, boolean):
    """
    Consider transcripts which are upstream of some transcripts
    @param type:    whether use query/reference data
    @type  type:    int
    @param boolean: consider only these transcripts or not
    @type  boolean: boolean
    """
    self.upstreams[type] = boolean


def setDownstream(self, type, boolean):
    """
    Consider transcripts which are downstream of some transcripts
    @param type:    whether use query/reference data
    @type  type:    int
    @param boolean: consider only these transcripts or not
    @type  boolean: boolean
    """
    self.downstreams[type] = boolean


def setOutputDistance(self, boolean):
    """
    Output the distance between query and reference instead of the query transcript
    @param boolean: whether distance should be output
    @type  boolean: boolean
    """
    self.outputDistance = boolean


def setAbsolute(self, boolean):
    """
    Do not consider strand when computing distance (thus, having only non-negative values)
    @param boolean: whether strands should be ignored
    @type  boolean: boolean
    """
    self.absolute = boolean


def setStrandedDistance(self, boolean):
    """
    Return two distance distributions, one per strand
    @param boolean: whether 2 distance distributions should be returned
    @type  boolean: boolean
    """
    self.strandedDistance = boolean


def getColinearOnly(self, boolean):
    """
    Only consider transcripts that overlap in the same direction
    (despite its name, this method is a setter: it stores the flag)
    @param boolean: whether transcripts should overlap in the same direction
    @type  boolean: boolean
    """
    self.colinear = boolean
def getAntisenseOnly(self, boolean):
    """
    Only consider transcripts that overlap in the opposite direction
    (despite its name, this method is a setter: it stores the flag)
    @param boolean: whether transcripts should overlap in the opposite direction
    @type  boolean: boolean
    """
    self.antisense = boolean


def setIncludedOnly(self, boolean):
    """
    Keep the elements from first set which are included in the second set
    @param boolean: whether to keep included elements only
    @type  boolean: boolean
    """
    self.included = boolean


def setIncludingOnly(self, boolean):
    """
    Counterpart of setIncludedOnly for the opposite inclusion; the flag is
    stored in self.including
    @param boolean: whether to keep including elements only
    @type  boolean: boolean
    """
    self.including = boolean


def setNormalization(self, boolean):
    """
    Normalize the elements by the number of mappings in the genome
    @param boolean: whether normalize
    @type  boolean: boolean
    """
    self.normalization = boolean


def getInvert(self, boolean):
    """
    Only consider transcripts that do not overlap
    (despite its name, this method is a setter: it stores the flag)
    @param boolean: whether invert the selection
    @type  boolean: boolean
    """
    self.invert = boolean


def includeNotOverlapping(self, boolean):
    """
    Also output the elements which do not overlap
    @param boolean: whether output the elements which do not overlap
    @type  boolean: boolean
    """
    self.notOverlapping = boolean


def setSplitDifference(self, boolean):
    """
    Split into intervals when computing difference
    @param boolean: whether to split
    @type  boolean: boolean
    """
    self.splitDifference = boolean


def aggregate(self, boolean):
    """
    In merge mode, aggregate multiple transcripts
    @param boolean: aggregate multiple transcripts
    @type  boolean: boolean
    """
    self.multiple = boolean


def getTables(self, type):
    """
    Get the SQL tables held by the writer of the given data set
    @param type: type of the table (query, reference, etc.)
    @type  type: int
    @return:     whatever the writer's getTables() returns (used elsewhere
                 in this class as a mapping from chromosome to table)
    """
    tableWriter = self.mySqlTranscriptWriters[type]
    return tableWriter.getTables()
def computeOdds(self, boolean):
    """
    Compute odds
    @param boolean: whether odds should be computed
    @type  boolean: boolean
    """
    self.odds = boolean
    if self.odds:
        self.overlapResults = dict()


def computeOddsPerTranscript(self, boolean):
    """
    Compute odds for each transcript
    @param boolean: whether odds for each transcript should be computed
    @type  boolean: boolean
    """
    # NOTE(review): this body is the same as computeOdds and never touches
    # self.oddResults -- confirm whether oddResults was meant here
    self.odds = boolean
    if self.odds:
        self.overlapResults = dict()


def removeTables(self):
    """
    Remove the temporary MySQL tables (query, reference and working)
    """
    for inputType in self.INPUTWORKINGTYPES:
        tables = self.getTables(inputType)
        for chromosome in tables:
            tables[chromosome].remove()


def clearTables(self):
    """
    Empty the content of the tables of every data set which has an input
    container

    The previous code tested self.transcriptListParsers, an attribute that
    the constructor never creates; the containers live in
    self.inputTranscriptContainers.  That list only has a query and a
    reference slot, so the working tables (which are filled as copies of
    the reference tables) follow the reference container.
    """
    for inputType in self.INPUTWORKINGTYPES:
        if inputType == self.WORKING:
            containerType = self.REFERENCE
        else:
            containerType = inputType
        if self.inputTranscriptContainers[containerType] is None:
            continue
        tables = self.getTables(inputType)
        for chromosome in tables:
            tables[chromosome].clear()


def extendTranscript(self, type, transcript):
    """
    Extend a transcript corresponding to the parameters of the class
    The input transcript is left untouched: a copy is modified and returned
    @param type:       whether use query/reference parameters
    @type  type:       int
    @param transcript: a transcript
    @type  transcript: class L{Transcript<Transcript>}
    @return:           the possibly extended transcript
    """
    extendedTranscript = Transcript()
    extendedTranscript.copy(transcript)
    # restrictions are applied before extensions
    if self.starts[type] is not None:
        extendedTranscript.restrictStart(self.starts[type])
    if self.ends[type] is not None:
        extendedTranscript.restrictEnd(self.ends[type])
    if self.fivePrimes[type] is not None:
        extendedTranscript.extendStart(self.fivePrimes[type])
    if self.threePrimes[type] is not None:
        extendedTranscript.extendEnd(self.threePrimes[type])
    return extendedTranscript


def storeTranscript(self, type, transcript, now = True):
    """
    Add a transcript to a MySQL database, or postpone the store
    A reference transcript is also added to the working tables
    @param type:       whether use query/reference table
    @type  type:       int
    @param transcript: a transcript
    @type  transcript: class L{Transcript<Transcript>}
    @param now:        whether transcript should be stored now (or stored can be postponed)
    @type  now:        bool
    """
    writers = [self.mySqlTranscriptWriters[type]]
    if type == self.REFERENCE:
        writers.append(self.mySqlTranscriptWriters[self.WORKING])
    for tableWriter in writers:
        tableWriter.addTranscript(transcript)
    if now:
        for tableWriter in writers:
            tableWriter.write()
def writeTranscript(self, transcript):
    """
    Write a transcript in the output file (nothing is done without writer)
    @param transcript: a transcript
    @type  transcript: class L{Transcript<Transcript>}
    """
    if self.writer is None:
        return
    self.writer.addTranscript(transcript)
    # counter reported as "output" by the comparison statistics
    self.nbPrinted += 1


def flushData(self, type = None):
    """
    Store the remaining transcripts
    @param type: whether use query/reference table (None for all)
    @type  type: int or None
    """
    types = self.INPUTWORKINGTYPES if type is None else [type]
    for inputType in types:
        self.mySqlTranscriptWriters[inputType].write()
    if self.writer is not None:
        self.writer.write()


def unstoreTranscript(self, type, transcript):
    """
    Remove a transcript from a MySQL database
    A reference transcript is also removed from the working table
    @param type:       whether use query/reference table
    @type  type:       int
    @param transcript: a transcript
    @type  transcript: class L{Transcript<Transcript>}
    """
    types = [type]
    if type == self.REFERENCE:
        types.append(self.WORKING)
    for inputType in types:
        self.getTables(inputType)[transcript.getChromosome()].removeTranscript(transcript)


def addIndexes(self, tables):
    """
    Add useful indexes to the tables: one on "start" for each transcript
    table, one on "transcriptId" for its exon table
    Index names get a random suffix
    @param tables: which tables should be indexed
    @type  tables: list of int
    """
    for inputType in tables:
        for chromosome in self.getTables(inputType):
            transcriptTable = self.getTables(inputType)[chromosome]
            transcriptTable.createIndex("iStart_transcript_%s_%d_%d" % (chromosome, inputType, random.randint(0, 100000)), ["start"])
            exonTable = self.getTables(inputType)[chromosome].exonsTable
            exonTable.createIndex("iTranscriptId_exon_%s_%d_%d" % (chromosome, inputType, random.randint(0, 100000)), ["transcriptId"])


def storeTranscriptList(self, type, transcriptListParser, extension):
    """
    Store a transcript list into database
    When the reference list is stored, each of its tables is also copied
    into a working table
    @param type:                 whether use query/reference parser
    @type  type:                 int
    @param transcriptListParser: a parser of transcript list
    @type  transcriptListParser: class L{TranscriptContainer<TranscriptContainer>}
    @param extension:            extend (or not) the transcripts
    @type  extension:            boolean
    """
    if type == self.QUERY:
        setName = "query"
    else:
        setName = "reference"
    tableWriter = self.mySqlTranscriptWriters[type]
    progress    = Progress(transcriptListParser.getNbTranscripts(), "Writing transcripts for %s" % (setName), self.verbosity-1)
    for transcript in transcriptListParser.getIterator():
        if extension:
            transcript = self.extendTranscript(type, transcript)
        tableWriter.addTranscript(transcript)
        progress.inc()
    tableWriter.write()
    progress.done()
    if type != self.REFERENCE:
        return
    # duplicate every reference table into the working set
    for chromosome in self.getTables(self.REFERENCE):
        self.getTables(self.WORKING)[chromosome] = MySqlTranscriptTable(self.mySqlConnection, self.tableNames[self.WORKING], chromosome, self.verbosity-1)
        self.getTables(self.WORKING)[chromosome].copy(self.getTables(self.REFERENCE)[chromosome])
def setInputTranscriptContainer(self, type, inputTranscriptContainer):
    """
    Set an input transcript list container, and record its number of
    transcripts and of nucleotides
    @param type:                     whether use query/reference parser
    @type  type:                     int
    @param inputTranscriptContainer: a container
    @type  inputTranscriptContainer: class L{TranscriptContainer<TranscriptContainer>}
    """
    self.inputTranscriptContainers[type] = inputTranscriptContainer
    self.nbTranscripts[type]             = inputTranscriptContainer.getNbTranscripts()
    self.nbNucleotides[type]             = inputTranscriptContainer.getNbNucleotides()


def setOutputWriter(self, writer):
    """
    Set an output transcript list writer
    @param writer: a writer
    @type  writer: class L{TranscriptListWriter<TranscriptListWriter>}
    """
    self.writer = writer


def compareTranscript(self, transcript1, transcript2, includeDistance = False):
    """
    Compare two transcripts, using user defined parameters
    @param transcript1:     a transcript from the query set (already extended)
    @type  transcript1:     class L{Transcript<Transcript>}
    @param transcript2:     a transcript from the reference set (already extended)
    @type  transcript2:     class L{Transcript<Transcript>}
    @param includeDistance: take into account the distance too
    @type  includeDistance: boolean
    @return:                true, if they overlap
    """
    # work on a copy, possibly enlarged by the max distance on both sides
    extendedTranscript1 = Transcript()
    extendedTranscript1.copy(transcript1)
    if includeDistance and self.maxDistance > 0:
        extendedTranscript1.extendStart(self.maxDistance)
        extendedTranscript1.extendEnd(self.maxDistance)

    # the overlap threshold is the largest of the absolute value and of the
    # percentage of the (not enlarged) query size
    minOverlap = self.minOverlap
    if self.pcOverlap is not None:
        minOverlap = max(minOverlap, transcript1.getSize() / 100.0 * self.pcOverlap)

    # The computed threshold is used here (the previous code passed
    # self.minOverlap, which silently dropped the percentage when introns
    # are accepted on both sides); the self-merge step of
    # compareTranscriptListMerge calls overlapWith with the same threshold.
    if not extendedTranscript1.overlapWith(transcript2, minOverlap):
        return False

    # relative position constraints
    if (self.downstreams[self.QUERY]     and transcript2.getStart() > extendedTranscript1.getStart()) or \
       (self.upstreams[self.QUERY]       and transcript2.getEnd()   < extendedTranscript1.getEnd())   or \
       (self.downstreams[self.REFERENCE] and extendedTranscript1.getStart() > transcript2.getStart()) or \
       (self.upstreams[self.REFERENCE]   and extendedTranscript1.getEnd()   < transcript2.getEnd()):
        return False

    # strand constraints
    sameDirection = (extendedTranscript1.getDirection() == transcript2.getDirection())
    if (self.antisense and sameDirection) or (self.colinear and not sameDirection):
        return False

    # inclusion constraints
    if self.included and not transcript2.include(extendedTranscript1):
        return False
    if self.including and not extendedTranscript1.include(transcript2):
        return False

    # introns accepted on both sides: the overlap of the whole spans is enough
    if self.introns[self.REFERENCE] and self.introns[self.QUERY]:
        if self.logHandle is not None:
            self.logHandle.write("%s overlaps with intron of %s\n" % (str(extendedTranscript1), str(transcript2)))
        return True
    # introns rejected on both sides: exons must overlap
    if (not self.introns[self.REFERENCE]) and (not self.introns[self.QUERY]) and extendedTranscript1.overlapWithExon(transcript2, minOverlap):
        if self.logHandle is not None:
            self.logHandle.write("%s overlaps with exon of %s\n" % (str(extendedTranscript1), str(transcript2)))
        return True
    # NOTE(review): when introns are accepted for exactly one of the two
    # sets, no branch above can succeed and the answer is always False --
    # confirm this is intended
    return False
def compareTranscriptToList(self, transcript1):
    """
    Compare a transcript to the reference list of transcripts
    (Do not extend the transcripts, except for the distance)
    This is a generator: overlapping reference transcripts are yielded one
    by one, and nothing is yielded if the chromosome has no working table
    @param transcript1: a transcript (from the query set)
    @type  transcript1: class L{Transcript<Transcript>}
    @return:            the reference transcripts overlapping
    """
    chromosome = transcript1.getChromosome()
    # no transcript in the reference table
    if chromosome not in self.getTables(self.WORKING):
        return

    # enlarge a copy of the query by the max distance on both sides
    extendedTranscript1 = Transcript()
    extendedTranscript1.copy(transcript1)
    if self.maxDistance > 0:
        extendedTranscript1.extendStart(self.maxDistance)
        extendedTranscript1.extendEnd(self.maxDistance)

    # retrieve the transcripts that may overlap in the working tables:
    # one clause per bin interval, then a filter on the coordinates
    workingTableName = self.getTables(self.WORKING)[chromosome].getName()
    binClauses       = []
    for binPair in extendedTranscript1.getBins():
        if binPair[0] == binPair[1]:
            binClauses.append("bin " + "= %i" % (binPair[0]))
        else:
            binClauses.append("bin " + "BETWEEN %i AND %i" % (binPair[0], binPair[1]))
    command  = "SELECT * FROM %s WHERE (" % (workingTableName)
    command += " OR ".join(binClauses)
    command += ") AND start <= %d AND end >= %d" % (extendedTranscript1.getEnd(), extendedTranscript1.getStart())

    # the command names the working table but is run through the reference table object
    referenceTable = self.getTables(self.REFERENCE)[chromosome]
    for index2, transcript2 in referenceTable.selectTranscripts(command):
        if self.compareTranscript(extendedTranscript1, transcript2):
            yield transcript2
def compareTranscriptList(self):
    """
    Compare a list of transcript to the reference one

    Both containers are stored into tables, then each chromosome of the
    query is scanned by chunks of start coordinates.  Query transcripts
    are written (through writeTranscript) with the tags "overlapWith" and
    "nbOverlaps" when they pass the overlap / invert / notOverlapping
    filters.  Tables are emptied while they are read, and the writer is
    closed at the end.  Nothing is returned; statistics are printed if
    the verbosity is positive.
    """
    distance      = 0
    nbClustersIn  = 0
    nbClustersOut = 0
    if self.maxDistance != None:
        distance = self.maxDistance

    # NOTE(review): the indexes are requested before the containers are
    # stored; if the writers hold no table yet, this call does nothing --
    # confirm whether it should follow the two storeTranscriptList calls
    self.addIndexes([self.QUERY, self.REFERENCE])

    # export the container into tables
    self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True)
    self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True)

    # looping
    for chromosome1 in sorted(self.getTables(self.QUERY).keys()):
        # get range of transcripts
        command = "SELECT MIN(start), MAX(end), COUNT(id) FROM %s" % (self.getTables(self.QUERY)[chromosome1].getName())
        query   = self.mySqlConnection.executeQuery(command)
        result  = query.getLine()
        first   = result[0]
        last    = result[1]
        nb      = result[2]

        # transcripts loaded so far and not yet discarded, with, for each
        # query transcript, the ids and the number of its overlaps
        transcripts1 = []
        toBeRemoved1 = []
        transcripts2 = []
        toBeRemoved2 = []
        overlapsWith = []
        nbOverlaps   = []
        # about 100 query transcripts per chunk (Python 2 integer divisions)
        nbChunks  = max(1, nb / 100)
        chunkSize = (last - first) / nbChunks
        progress  = Progress(nbChunks + 1, "Analyzing chromosome %s" % (chromosome1), self.verbosity-1)
        for chunk in range(nbChunks + 1):

            # load transcripts
            # 'end' is the exclusive upper bound of the chunk.  It used to
            # be "start + chunkSize - 1" while the query below already
            # stops at "end-1": a query transcript starting exactly at
            # that coordinate was never loaded, then deleted by the next
            # chunk.
            start = first + chunk * chunkSize
            end   = start + chunkSize
            if chunk == nbChunks:
                # the rounding of chunkSize may leave a tail after the
                # last chunk: make sure it reaches the last coordinate
                end = max(end, last + 1)
            command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.QUERY)[chromosome1].getName(), start, end-1)
            for index1, transcript1 in self.getTables(self.QUERY)[chromosome1].selectTranscripts(command):
                transcripts1.append(transcript1)
                overlapsWith.append([])
                nbOverlaps.append(0)
                nbClustersIn += 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements")
            command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.QUERY)[chromosome1].getName(), end)
            self.mySqlConnection.executeQuery(command)
            if chromosome1 in self.getTables(self.REFERENCE):
                # the reference window is enlarged by the distance; the
                # first chunk also takes everything before the window
                command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), start-distance, end+distance-1)
                if chunk == 0:
                    command = "SELECT * FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance)
                for index2, transcript2 in self.getTables(self.REFERENCE)[chromosome1].selectTranscripts(command):
                    transcripts2.append(transcript2)
                command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance)
                self.mySqlConnection.executeQuery(command)

            # compare sets
            for index1, transcript1 in enumerate(transcripts1):
                nbElements1 = 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements")
                for transcript2 in transcripts2:
                    if self.compareTranscript(transcript1, transcript2, True):
                        id2 = transcript2.getTagValue("ID") if "ID" in transcript2.getTagNames() else transcript2.getName()
                        # a pair may be compared again in a later chunk: count it once
                        # NOTE(review): the nesting of the odds update under
                        # this test is reconstructed from a flattened
                        # listing -- confirm against the original file
                        if id2 not in overlapsWith[index1]:
                            overlapsWith[index1].append(id2)
                            nbOverlaps[index1] += 1 if "nbElements" not in transcript2.getTagNames() else transcript2.getTagValue("nbElements")
                            if self.odds:
                                if transcript2.getName() not in self.overlapResults:
                                    self.overlapResults[transcript2.getName()] = 0
                                self.overlapResults[transcript2.getName()] += nbElements1

                # check if query transcript extends bounds of the chunk
                # (if it does, it is kept for the next chunk)
                if transcript1.getEnd() < end:
                    if Utils.xor(overlapsWith[index1], self.invert) or self.notOverlapping:
                        if overlapsWith[index1]:
                            transcript1.setTagValue("overlapWith", ",".join(overlapsWith[index1])[:100])
                            transcript1.setTagValue("nbOverlaps", "%d" % (nbOverlaps[index1]))
                        elif self.notOverlapping:
                            transcript1.setTagValue("nbOverlaps", "0")
                        self.writeTranscript(transcript1)
                        nbClustersOut += nbElements1
                    toBeRemoved1.append(index1)

            # update list of query transcripts
            # (from the last index to the first, so that indexes stay valid)
            for index1 in reversed(toBeRemoved1):
                del transcripts1[index1]
                del overlapsWith[index1]
                del nbOverlaps[index1]
            toBeRemoved1 = []

            # check if the reference transcripts extends bounds of the chunk
            for index2, transcript2 in enumerate(transcripts2):
                if transcript2.getEnd() + distance < end:
                    toBeRemoved2.append(index2)
            for index2 in reversed(toBeRemoved2):
                del transcripts2[index2]
            toBeRemoved2 = []

            progress.inc()

        # query transcripts which extend after the last chunk
        for index1, transcript1 in enumerate(transcripts1):
            if Utils.xor(overlapsWith[index1], self.invert) or self.notOverlapping:
                if overlapsWith[index1]:
                    transcript1.setTagValue("overlapWith", ",".join(overlapsWith[index1])[:100])
                    transcript1.setTagValue("nbOverlaps", "%d" % (nbOverlaps[index1]))
                elif self.notOverlapping:
                    transcript1.setTagValue("nbOverlaps", "0")
                self.writeTranscript(transcript1)
                nbClustersOut += 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements")
        progress.done()
        self.getTables(self.QUERY)[chromosome1].remove()
        if chromosome1 in self.getTables(self.REFERENCE):
            self.getTables(self.REFERENCE)[chromosome1].remove()
            self.getTables(self.WORKING)[chromosome1].remove()

    self.flushData()
    if self.writer != None:
        self.writer.close()
        self.writer = None

    if self.verbosity > 0:
        # a parenthesised single argument prints the same text with the
        # Python 2 print statement
        print("reference: %d elements" % (self.nbTranscripts[self.REFERENCE]))
        print("query:     %d elements, %d clustered" % (self.nbTranscripts[self.QUERY], nbClustersIn))
        if self.nbTranscripts[self.QUERY] != 0:
            # the trailing comma keeps the Python 2 "no newline" behavior
            print("output:    %d elements (%.2f%%)"% (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100)),
            if nbClustersOut != 0:
                print(", %d clustered (%.2f%%)" % (nbClustersOut, float(nbClustersOut) / nbClustersIn * 100))
def compareTranscriptListDistance(self):
    """
    Compare a list of transcript to the reference one, and compute, for
    each query transcript, the distance to the closest reference element

    Each query transcript is written (through writeTranscript) with the
    tags "distance" and "closestElement"; both are "None" when no
    reference element was retained.
    @return: the distance distributions in a hash (distance -> count), or,
             if the stranded distance is set, one such hash per strand
             (keys 1 and -1)
    """
    nbDistances       = 0
    distances         = {}
    absDistances      = {}
    strandedDistances = {1: {}, -1: {}}

    # export the container into tables
    self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True)
    self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True)

    progress = Progress(self.nbTranscripts[self.QUERY], "Analyzing chromosomes", self.verbosity-1)
    for transcript1 in self.inputTranscriptContainers[self.QUERY].getIterator():
        # get the distance
        transcript1    = self.extendTranscript(self.QUERY, transcript1)
        # "max distance + 1" stands for "nothing found yet"
        distance       = self.maxDistance + 1
        strand         = None
        closestElement = "None"
        for transcript2 in self.compareTranscriptToList(transcript1):
            # 1 if both transcripts are on the same strand, -1 otherwise
            thisStrand = transcript1.getDirection() * transcript2.getDirection()
            if self.antisense or (not self.colinear and transcript1.getDirection() != transcript2.getDirection()):
                transcript2.reverse()
            if self.absolute:
                transcript2.setDirection(transcript1.getDirection())
            if transcript2.getDirection() == transcript1.getDirection():
                if self.starts[self.REFERENCE] != None:
                    transcript2.restrictStart(self.starts[self.REFERENCE])
                if self.ends[self.REFERENCE] != None:
                    transcript2.restrictEnd(self.ends[self.REFERENCE])
                thisDistance = transcript1.getRelativeDistance(transcript2)
                if (self.absolute):
                    thisDistance = abs(thisDistance)
                # keep the closest element
                if abs(thisDistance) < abs(distance):
                    distance       = thisDistance
                    strand         = thisStrand
                    closestElement = transcript2.getTagValue("ID") if "ID" in transcript2.getTagNames() else transcript2.getName()
        if (distance <= self.maxDistance) and (self.minDistance == None or distance >= self.minDistance):
            nbDistances             += 1
            distances[distance]      = distances.get(distance, 0) + 1
            absDistance              = abs(distance)
            absDistances[absDistance] = absDistances.get(absDistance, 0) + 1
            # the "+ 1" was missing here: the per strand counts never left 0
            strandedDistances[strand][distance] = strandedDistances[strand].get(distance, 0) + 1
            # both strands share the same set of keys
            if distance not in strandedDistances[-strand]:
                strandedDistances[-strand][distance] = 0

        # write transcript
        if distance == self.maxDistance + 1:
            distance = "None"
        tmpTranscript = Transcript()
        tmpTranscript.copy(transcript1)
        tmpTranscript.setTagValue("distance", distance)
        tmpTranscript.setTagValue("closestElement", closestElement)
        self.writeTranscript(tmpTranscript)

        progress.inc()
    progress.done()

    self.flushData()

    if self.verbosity > 0:
        # a parenthesised single argument prints the same text with the
        # Python 2 print statement
        print("reference: %d sequences" % (self.nbTranscripts[self.REFERENCE]))
        print("query:     %d sequences" % (self.nbTranscripts[self.QUERY]))
        if nbDistances == 0:
            print("Nothing matches")
        else:
            print("min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(absDistances))
            print("for %d distances (%.2f%%)" % (nbDistances, float(nbDistances) / self.nbTranscripts[self.QUERY] * 100))

    if self.strandedDistance:
        return strandedDistances
    return distances
def compareTranscriptListMerge(self):
    """
    Merge the query transcripts with the reference transcripts they match

    Both lists are stored into per-chromosome tables.  Every chromosome
    found in both lists is then read chunk by chunk: a reference transcript
    accepted by compareTranscript is merged into the query transcript and
    discarded.  Query transcripts which received a merge are written with
    writeTranscript; when self.multiple is set, every remaining transcript
    of both lists is written too, and the query transcripts are also merged
    with each other.  The writer, if any, is closed at the end.
    @return: nothing, the transcripts are sent to writeTranscript
    """
    nbMerges = 0
    inputTypes = (self.QUERY, self.REFERENCE)
    # maxDistance may be unset (the sibling methods guard against None too)
    distance = self.maxDistance if self.maxDistance is not None else 0
    # Only the reference list is extended by the distance.  The original
    # expression "0 if self.QUERY else ..." tested the constant, not the
    # list being processed, so both lists got the same correction.
    corrections = {self.QUERY: 0, self.REFERENCE: distance}

    for inputType in inputTypes:
        self.storeTranscriptList(inputType, self.inputTranscriptContainers[inputType], True)
    self.flushData()

    # Loop on the chromosomes
    for chromosome in sorted(self.getTables(self.QUERY).keys()):
        if chromosome not in self.getTables(self.REFERENCE):
            continue

        # Get the size of the chromosome
        maxEnd = 0
        nbChunks = 0
        for inputType in inputTypes:
            table = self.getTables(inputType)[chromosome]
            command = "SELECT MAX(end) from %s" % (table.getName())
            query = self.mySqlConnection.executeQuery(command)
            maxEnd = max(maxEnd, int(query.getLine()[0]))
            nbChunks = max(nbChunks, table.getNbElements())

        # Indexes (in the query list) of the transcripts which received a merge;
        # only consulted when self.multiple is not set
        mergedTranscripts = set()
        transcripts = {self.QUERY: [], self.REFERENCE: []}
        progress = Progress(nbChunks, "Analyzing %s" % (chromosome), self.verbosity-1)
        for chunk in range(nbChunks):
            rangeEnd = int((chunk + 1) * (float(maxEnd) / nbChunks))

            # Get all transcripts in query and reference from chunk
            for inputType in inputTypes:
                table = self.getTables(inputType)[chromosome]
                command = "SELECT * FROM %s WHERE start <= %d" % (table.getName(), rangeEnd + corrections[inputType])
                for index, transcript in table.selectTranscripts(command):
                    transcripts[inputType].append(transcript)

            # Merge elements between the two samples
            for iQuery, queryTranscript in enumerate(transcripts[self.QUERY]):
                for iReference, referenceTranscript in enumerate(transcripts[self.REFERENCE]):
                    if referenceTranscript is None:
                        continue
                    if self.compareTranscript(queryTranscript, referenceTranscript, True):
                        if queryTranscript.getDirection() != referenceTranscript.getDirection():
                            referenceTranscript.setDirection(queryTranscript.getDirection())
                        queryTranscript.merge(referenceTranscript, self.normalization)
                        nbMerges += 1
                        transcripts[self.REFERENCE][iReference] = None
                        if not self.multiple:
                            mergedTranscripts.add(iQuery)

            # Remove transcripts from database.  The bound must be the one
            # used by the SELECT above: the SELECT has no lower bound, so any
            # row left behind would be loaded a second time by the next chunk.
            for inputType in inputTypes:
                table = self.getTables(inputType)[chromosome]
                command = "DELETE FROM %s WHERE start <= %d" % (table.getName(), rangeEnd + corrections[inputType])
                self.mySqlConnection.executeQuery(command)

            # Just in case, self-merge the elements in the query
            # (no mergedTranscripts bookkeeping: it is unused when self.multiple is set)
            if self.multiple:
                for iQuery1, queryTranscript1 in enumerate(transcripts[self.QUERY]):
                    if queryTranscript1 is None:
                        continue
                    for iQuery2, queryTranscript2 in enumerate(transcripts[self.QUERY]):
                        if iQuery2 <= iQuery1 or queryTranscript2 is None:
                            continue
                        minOverlap = self.minOverlap
                        if self.pcOverlap is not None:
                            minOverlap = max(minOverlap, queryTranscript1.getSize() / 100.0 * self.pcOverlap)
                        sameDirection = queryTranscript1.getDirection() == queryTranscript2.getDirection()
                        if queryTranscript2.overlapWith(queryTranscript1, minOverlap) and (sameDirection or not self.colinear):
                            if not sameDirection:
                                queryTranscript2.setDirection(queryTranscript1.getDirection())
                            queryTranscript1.merge(queryTranscript2, self.normalization)
                            transcripts[self.QUERY][iQuery2] = None
                            nbMerges += 1

            # Update the sets of transcripts and write into database (also update mergedTranscripts)
            newTranscripts = {self.QUERY: [], self.REFERENCE: []}
            newMergedTranscripts = set()
            for inputType in inputTypes:
                # A transcript ending before this position cannot match anything loaded later
                flushBefore = rangeEnd - corrections[inputType]
                for index, transcript in enumerate(transcripts[inputType]):
                    if transcript is None:
                        continue
                    wasMerged = (inputType == self.QUERY) and (index in mergedTranscripts)
                    if transcript.getEnd() < flushBefore:
                        if self.multiple or wasMerged:
                            self.writeTranscript(transcript)
                    else:
                        if wasMerged:
                            # The transcript moves to a new position in the kept list
                            newMergedTranscripts.add(len(newTranscripts[inputType]))
                        newTranscripts[inputType].append(transcript)
            transcripts = newTranscripts
            mergedTranscripts = newMergedTranscripts

            progress.inc()
        progress.done()

        # Write what is still pending at the end of the chromosome
        for inputType in inputTypes:
            for index, transcript in enumerate(transcripts[inputType]):
                if transcript is None:
                    continue
                if self.multiple or ((inputType == self.QUERY) and (index in mergedTranscripts)):
                    self.writeTranscript(transcript)

    # Manage chromosomes with no corresponding data
    # NOTE(review): the transcripts are read from the OUTPUT tables although the
    # chromosome was found in the tables of the current input list -- confirm
    # that this is the intended source.
    if self.multiple:
        for inputType in self.INPUTTYPES:
            for chromosome in self.getTables(inputType):
                if chromosome in self.getTables(1 - inputType):
                    continue
                for transcript in self.getTables(self.OUTPUT)[chromosome].getIterator():
                    self.writeTranscript(transcript)

    self.flushData()
    if self.writer is not None:
        self.writer.close()
        self.writer = None

    if self.verbosity > 0:
        nbQueries = self.nbTranscripts[self.QUERY]
        # Avoid a division by zero when the query list is empty
        percentage = self.nbPrinted / float(nbQueries) * 100 if nbQueries else 0.0
        print("query: %d sequences" % (nbQueries))
        print("# merges: %d" % (nbMerges))
        print("# printed %d (%.2f%%)" % (self.nbPrinted, percentage))
def compareTranscriptListSelfMerge(self):
    """
    Merge the query list of transcripts with itself

    The query list is stored into per-chromosome tables and each table is
    read ordered by start position.  Every new transcript is compared (with
    compareTranscript) to the transcripts still pending: matching ones are
    merged into the new transcript, and the pending ones which end more than
    the maximum distance before the new transcript starts are written with
    writeTranscript.  The writer, if any, is closed at the end.
    @return: nothing, the merged transcripts are sent to writeTranscript
    """
    nbMerges = 0
    distance = self.maxDistance if self.maxDistance is not None else 0

    self.addIndexes([self.QUERY])
    self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True)
    self.flushData()

    # looping
    for chromosome1 in sorted(self.getTables(self.QUERY).keys()):
        table = self.getTables(self.QUERY)[chromosome1]
        # transcripts already read which may still be merged with a later one
        pendingTranscripts = []

        # get range of transcripts
        progress = Progress(table.getNbElements(), "Analyzing chromosome %s" % (chromosome1), self.verbosity-1)
        command = "SELECT * FROM %s ORDER BY start" % (table.getName())
        for index1, transcript1 in table.selectTranscripts(command):

            # compare sets
            keptTranscripts = []
            finishedTranscripts = []
            for transcript2 in pendingTranscripts:
                if self.compareTranscript(transcript1, transcript2, True):
                    if transcript1.getDirection() != transcript2.getDirection():
                        transcript2.setDirection(transcript1.getDirection())
                    # transcript2 is absorbed by transcript1 and dropped
                    transcript1.merge(transcript2, self.normalization)
                    nbMerges += 1
                elif transcript2.getEnd() + distance < transcript1.getStart():
                    # too far behind the current transcript: it can be written
                    finishedTranscripts.append(transcript2)
                else:
                    keptTranscripts.append(transcript2)
            keptTranscripts.append(transcript1)

            # written only once all the comparisons are done, as before
            for transcript2 in finishedTranscripts:
                self.writeTranscript(transcript2)
            pendingTranscripts = keptTranscripts
            # the counter was created but never incremented
            progress.inc()

        for transcript2 in pendingTranscripts:
            self.writeTranscript(transcript2)
        progress.done()
        table.remove()

    self.flushData()
    if self.writer is not None:
        self.writer.close()
        self.writer = None

    if self.verbosity > 0:
        nbQueries = self.nbTranscripts[self.QUERY]
        # Avoid a division by zero when the query list is empty
        percentage = self.nbPrinted / float(nbQueries) * 100 if nbQueries else 0.0
        print("query: %d sequences" % (nbQueries))
        print("# merges: %d" % (nbMerges))
        print("# printed %d (%.2f%%)" % (self.nbPrinted, percentage))
def getDifferenceTranscriptList(self):
    """
    Get the elements of the first list which do not overlap the second list (at the nucleotide level)

    Both lists are stored into per-chromosome tables and each query
    chromosome is read chunk by chunk.  Each query transcript is replaced by
    its difference (getDifference) with every loaded reference transcript;
    what remains is written with writeTranscript, exon by exon when
    self.splitDifference is set.  The writer, if any, is closed at the end.
    @return: nothing, the remaining parts are sent to writeTranscript
    """
    distance = 0 if self.maxDistance is None else self.maxDistance

    def writeDifference(transcript):
        # Write a remaining transcript, as separate exons if requested
        if self.splitDifference:
            for exon in transcript.getExons():
                exonTranscript = Transcript()
                exonTranscript.copy(exon)
                self.writeTranscript(exonTranscript)
        else:
            self.writeTranscript(transcript)

    self.addIndexes([self.QUERY, self.REFERENCE])

    # export the container into tables
    self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True)
    self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True)

    # looping
    for chromosome1 in sorted(self.getTables(self.QUERY).keys()):
        queryTable = self.getTables(self.QUERY)[chromosome1]
        hasReference = chromosome1 in self.getTables(self.REFERENCE)

        # get range of transcripts
        command = "SELECT MIN(start), MAX(end), COUNT(id) FROM %s" % (queryTable.getName())
        query = self.mySqlConnection.executeQuery(command)
        result = query.getLine()
        first = int(result[0])
        last = int(result[1])
        nb = int(result[2])

        transcripts1 = []
        transcripts2 = []
        nbChunks = max(1, nb // 100)
        # Rounded up and never zero, so that the nbChunks + 1 chunks always
        # reach the last position of the chromosome
        chunkSize = max(1, (last - first + nbChunks - 1) // nbChunks)
        progress = Progress(nbChunks + 1, "Analyzing chromosome %s" % (chromosome1), self.verbosity-1)
        for chunk in range(nbChunks + 1):

            # load transcripts
            start = first + chunk * chunkSize
            # "end" is the first position after the chunk.  It used to be the
            # last position of the chunk while being used as an exclusive
            # bound, so transcripts starting there were never loaded.
            end = start + chunkSize
            command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (queryTable.getName(), start, end-1)
            for index1, transcript1 in queryTable.selectTranscripts(command):
                transcripts1.append(transcript1)
            command = "DELETE FROM %s WHERE start < %d" % (queryTable.getName(), end)
            self.mySqlConnection.executeQuery(command)
            if hasReference:
                referenceTable = self.getTables(self.REFERENCE)[chromosome1]
                command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (referenceTable.getName(), start-distance, end+distance-1)
                if chunk == 0:
                    # the first chunk also takes everything located before the query
                    command = "SELECT * FROM %s WHERE start < %d" % (referenceTable.getName(), end + distance)
                for index2, transcript2 in referenceTable.selectTranscripts(command):
                    transcripts2.append(transcript2)
                command = "DELETE FROM %s WHERE start < %d" % (referenceTable.getName(), end + distance)
                self.mySqlConnection.executeQuery(command)

            # compare sets
            pendingTranscripts1 = []
            for transcript1 in transcripts1:
                newTranscript1 = Transcript()
                newTranscript1.copy(transcript1)
                for transcript2 in transcripts2:
                    newTranscript1 = newTranscript1.getDifference(transcript2)
                    if newTranscript1 is None:
                        break
                if newTranscript1 is None:
                    # nothing remains of this query transcript
                    continue

                # check if query transcript extends bounds of the chunk
                if newTranscript1.getEnd() < end:
                    writeDifference(newTranscript1)
                else:
                    pendingTranscripts1.append(newTranscript1)

            # update list of query transcripts
            transcripts1 = pendingTranscripts1

            # check if the reference transcripts extends bounds of the chunk
            transcripts2 = [transcript2 for transcript2 in transcripts2 if transcript2.getEnd() + distance >= end]

            progress.inc()

        for transcript1 in transcripts1:
            writeDifference(transcript1)
        progress.done()
        queryTable.remove()
        if hasReference:
            self.getTables(self.REFERENCE)[chromosome1].remove()
            # NOTE(review): the original nesting of this statement is not
            # recoverable from the whitespace-collapsed listing; it is kept
            # under the reference guard -- confirm against the repository.
            self.getTables(self.WORKING)[chromosome1].remove()

    self.flushData()
    if self.writer is not None:
        self.writer.close()
        self.writer = None

    if self.verbosity > 0:
        nbQueries = self.nbTranscripts[self.QUERY]
        # Avoid a division by zero when the query list is empty
        percentage = self.nbPrinted / float(nbQueries) * 100 if nbQueries else 0.0
        print("query: %d elements" % (nbQueries))
        print("reference: %d elements" % (self.nbTranscripts[self.REFERENCE]))
        print("# printed: %d (%.2f%%)" % (self.nbPrinted, percentage))
def getOddsPerTranscript(self):
    """
    Return overlap results
    @return: the content of overlapResults, unchanged
    @raise Exception: if self.odds is not set
    """
    if not self.odds:
        raise Exception("Did not compute odds!")
    return self.overlapResults


def getOdds(self):
    """
    Return odds about the overlap
    The result is computed on the first call and stored in oddResults;
    the following calls return the stored dict.
    @return: a dict giving, for each value found in overlapResults, the number of entries with this value
    @raise Exception: if self.odds is not set
    """
    if not self.odds:
        raise Exception("Did not compute odds!")
    if self.oddResults is not None:
        return self.oddResults
    # Built locally and stored only once complete, so that a failure while
    # counting cannot leave a partial result in the cache
    oddResults = {}
    for value in self.overlapResults.values():
        oddResults[value] = oddResults.get(value, 0) + 1
    self.oddResults = oddResults
    return self.oddResults