Mercurial > repos > yufei-luo > s_mart
diff SMART/Java/Python/structure/TranscriptListsComparator.py @ 6:769e306b7933
Change the repository level.
author | yufei-luo |
---|---|
date | Fri, 18 Jan 2013 04:54:14 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SMART/Java/Python/structure/TranscriptListsComparator.py Fri Jan 18 04:54:14 2013 -0500 @@ -0,0 +1,1198 @@ +# +# Copyright INRA-URGI 2009-2010 +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. +# +import sys +import random +from SMART.Java.Python.misc import Utils +from SMART.Java.Python.structure.Transcript import Transcript +from SMART.Java.Python.structure.TranscriptList import TranscriptList +from SMART.Java.Python.structure.TranscriptContainer import TranscriptContainer +from SMART.Java.Python.mySql.MySqlConnection import MySqlConnection +from SMART.Java.Python.mySql.MySqlTranscriptTable import MySqlTranscriptTable +from SMART.Java.Python.misc.Progress import Progress +from commons.core.writer.MySqlTranscriptWriter import MySqlTranscriptWriter + + + +class TranscriptListsComparator(object): + """ + Compare two transcript lists, using a database for one of the list + Uses one TranscriptContainer for query data, + one TranscriptContainer exported to MySqlTranscriptTable for reference data, + one MySqlTranscriptTable for transformed reference data + @ivar inputTranscriptContainers: parsers to the list of query transcripts + @type inputTranscriptContainers: list of 2 L{TranscriptContainer<TranscriptContainer>} + @ivar writer: transcript list writer + @type writer: class L{TranscriptListWriter<TranscriptListWriter>} + @ivar mySqlConnection: connection to a MySQL database (to compute the ovelapping efficiently) + @type mySqlConnection: class L{MySqlConnection<MySqlConnection>} + @ivar introns: compare transcripts or exons only + @type introns: list of 2 boolean + @ivar starts: restrict the query transcripts to first nucleotides + @type starts: list of 2 int or None + @ivar fivePrimes: extend a list of transcripts by their 5' end + @type fivePrimes: list of 2 int or None + @ivar threePrimes: extend a list of transcripts by their 3' end + @type threePrimes: list of 2 int or None + @ivar minDistance: min distance between two transcripts [default: 0] + @type minDistance: int + @ivar maxDistance: max distance between two transcripts [default: 0] + @type maxDistance: int + @ivar minOverlap: minimum number of overlapping nucleotides to declare an overlap + @type minOverlap: int + @ivar pcOverlap: percentage of overlapping nucleotides to declare an overlap + @type pcOverlap: int + @ivar upstreams: consider distances with elements which are upstream of the transcripts + @type upstreams: boolean + @ivar downstreams: consider distances with elements which are downstream of the transcripts + @type downstreams: boolean + @ivar colinear: whether transcripts should overlap in the same direction + @type colinear: boolean + @ivar antisense: whether transcripts should overlap in the opposite direction + @type antisense: boolean + @ivar outputDistance: output distance between query and reference instead of query transcript + @type outputDistance: boolean + @ivar absolute: do not consider the strand while computing distance + @type absolute: boolean + @ivar strandedDistance: return a line per strand while computing distances + @type strandedDistance: boolean + @ivar QUERY: constant specifying the query objects + @type QUERY: int + @ivar REFERENCE: constant specifying the reference objects + @type REFERENCE: int + @ivar INPUTTYPES: set of input types of data (query or reference) objects + @type INPUTTYPES: list of 2 int + @ivar typeToString: string representation of the previous types + @type typeToString: dict + @ivar tableNames: name of the transcript tables + @type tableNames: dict of strings + @ivar nbTranscripts: number of transcript in the query/reference set + @type nbTranscripts: list of 2 int or None + @ivar nbNucleotides: number of nucleotides in the query/reference set + @type nbNucleotides: list of 2 int or None + @ivar transcriptsToBeStored: transcripts that will be stored into database + @type transcriptsToBeStored: dict of class L{TranscriptList<TranscriptList>} + @ivar multiple: in merge mode, aggregate multiple transcripts + @type multiple: boolean + @ivar normalization: normalize each element by the number of mappings of this element + @type normalization: boolean + @ivar invert: invert the current comparison + @type invert: boolean + @ivar splitDifference: split into intervals when computing difference + @type splitDifference: boolean + @ivar odds: whether odds about the comparison should be computed + @type odds: boolean + @ivar overlapResults: count the number of overlaps + @type overlapResults: dictionary + @ivar oddResults: compute the number of times each transcript overlaps (or is merged with) another one + @type oddResults: dictionary + @ivar outputContainer: container of the output transcripts + @type outputContainer: class L{TranscriptContainer<TranscriptContainer>} + @ivar logHandle: log handle + @type logHandle: file + @ivar verbosity: verbosity + @type verbosity: int + """ + + def __init__(self, logHandle = None, verbosity = 0): + """ + Constructor + @param transcriptListParser2: parser to the list of reference transcripts + @type transcriptListParser2: class L{TranscriptListParser<TranscriptListParser>} + @param logHandle: log handle + @type logHandle: file + @param verbosity: verbosity + @type verbosity: int + """ + self.QUERY = 0 + self.REFERENCE = 1 + self.WORKING = 2 + self.INPUTTYPES = (self.QUERY, self.REFERENCE) + self.INPUTWORKINGTYPES = (self.QUERY, self.REFERENCE, self.WORKING) + self.typeToString = {self.QUERY: "Query", self.REFERENCE: "Reference", self.WORKING: "Working"} + + self.logHandle = logHandle + self.verbosity = verbosity + self.mySqlConnection = MySqlConnection(self.verbosity-1) + self.inputTranscriptContainers = [None, None] + self.tableNames = ["tmpQueryTable_%d" % (random.randint(0, 100000)), "tmpReferenceTable_%d" % (random.randint(0, 100000)), "tmpOutputTable_%d" % (random.randint(0, 100000)), "tmpWorkingTable_%d" % (random.randint(0, 100000))] + self.mySqlTranscriptWriters = [MySqlTranscriptWriter(self.mySqlConnection, name, verbosity-1) for name in self.tableNames] + self.writer = None + self.introns = [False, False] + self.starts = [None, None] + self.ends = [None, None] + self.fivePrimes = [None, None] + self.threePrimes = [None, None] + self.minDistance = None + self.maxDistance = 0 + self.minOverlap = 1 + self.pcOverlap = None + self.colinear = False + self.antisense = False + self.downstreams = [False, False] + self.upstreams = [False, False] + self.outputDistance = False + self.absolute = False + self.strandedDistance = False + self.nbTranscripts = [None, None] + self.nbNucleotides = [None, None] + self.normalization = False + self.included = False + self.including = False + self.invert = False + self.notOverlapping = False + self.splitDifference = False + self.multiple = False + self.odds = False + self.overlapResults = None + self.oddResults = None + self.outputContainer = None + self.transcriptsToBeStored = dict([(type, TranscriptList()) for type in self.INPUTWORKINGTYPES]) + self.nbPrinted = 0 + + self.mySqlConnection.createDatabase() + + + def __del__(self): + """ + Destructor + Remove all temporary tables + """ + for type in self.INPUTWORKINGTYPES: + self.mySqlTranscriptWriters[type].removeTables() + self.mySqlConnection.deleteDatabase() + + def acceptIntrons(self, type, bool): + """ + Compare transcripts or exons only + @param type: whether use query/reference data + @type type: int + @param bool: include introns or not + @type bool: boolean + """ + self.introns[type] = bool + + + def restrictToStart(self, type, size): + """ + Restrict a list of transcripts to first nucleotides + @param type: whether use query/reference data + @type type: int + @param size: the size of the transcript to be considered + @type size: int + """ + self.starts[type] = size + self.introns[type] = False + + + def restrictToEnd(self, type, size): + """ + Restrict a list of transcripts to first nucleotides + @param type: whether use query/reference data + @type type: int + @param size: the size of the transcript to be considered + @type size: int + """ + self.ends[type] = size + self.introns[type] = False + + + def extendFivePrime(self, type, size): + """ + Extend a list of transcripts by their 5' end + @param type: whether use query/reference data + @type type: int + @param size: size of the extension + @type size: int + """ + self.fivePrimes[type] = size + + + def extendThreePrime(self, type, size): + """ + Extend the list of query transcripts by their 3' end + @param type: whether use query/reference data + @type type: int + @param size: size of the extension + @type size: int + """ + self.threePrimes[type] = size + + + def setMinDistance(self, distance): + """ + Set the min distance between two transcripts + @param distance: distance + @type distance: int + """ + self.minDistance = distance + + + def setMaxDistance(self, distance): + """ + Set the max distance between two transcripts + @param distance: distance + @type distance: int + """ + self.maxDistance = distance + + + def setMinOverlap(self, overlap): + """ + Set the minimum number of nucleotides to declare an overlap + @param overlap: minimum number of nucleotides + @type overlap: int + """ + self.minOverlap = overlap + + + def setPcOverlap(self, overlap): + """ + Set the percentage of nucleotides to declare an overlap + @param overlap: percentage of nucleotides + @type overlap: int + """ + self.pcOverlap = overlap + + + def setUpstream(self, type, boolean): + """ + Consider transcripts which are upstream of some transcripts + @param type: whether use query/reference data + @type type: int + @param boolean: consider only these transcripts or not + @type boolean: boolean + """ + self.upstreams[type] = boolean + + + def setDownstream(self, type, boolean): + """ + Consider transcripts which are downstream of some transcripts + @param type: whether use query/reference data + @type type: int + @param boolean: consider only these transcripts or not + @type boolean: boolean + """ + self.downstreams[type] = boolean + + + def setOutputDistance(self, boolean): + """ + Output distance between query and reference instead of query transcript + @param boolean: whether distance should be output + @type boolean: boolean + """ + self.outputDistance = boolean + + + def setAbsolute(self, boolean): + """ + Do not consider strand when computing distance (thus, having only non-negative values) + @param boolean: whether we should consider strands + @type boolean: boolean + """ + self.absolute = boolean + + + def setStrandedDistance(self, boolean): + """ + Return two distance distributions, one per strand + @param boolean: whether we should return 2 distance distance + @type boolean: boolean + """ + self.strandedDistance = boolean + + + def getColinearOnly(self, boolean): + """ + Only consider transcripts that overlap in the same direction + @param boolean: whether transcripts should overlap in the same direction + @type boolean: boolean + """ + self.colinear = boolean + + + def getAntisenseOnly(self, boolean): + """ + Only consider transcripts that overlap in the opposite direction + @param boolean: whether transcripts should overlap in the opposite direction + @type boolean: boolean + """ + self.antisense = boolean + + + def setIncludedOnly(self, boolean): + """ + Keep the elements from first set which are included in the second set + @param boolean: whether to keep included elements only + @type boolean: boolean + """ + self.included = boolean + + + def setIncludingOnly(self, boolean): + """ + Keep the elements from second set which are included in the first set + @param boolean: whether to keep included elements only + @type boolean: boolean + """ + self.including = boolean + + + def setNormalization(self, boolean): + """ + Normalize the elements by the number of mappings in the genome + @param boolean: whether normalize + @type boolean: boolean + """ + self.normalization = boolean + + + def getInvert(self, boolean): + """ + Only consider transcripts that do not overlap + @param boolean: whether invert the selection + @type boolean: boolean + """ + self.invert = boolean + + + def includeNotOverlapping(self, boolean): + """ + Also output the elements which do not overlap + @param boolean: whether output the elements which do not overlap + @type boolean: boolean + """ + self.notOverlapping = boolean + + + def setSplitDifference(self, boolean): + """ + Split into intervals when computing difference + @param boolean: whether to split + @type boolean: boolean + """ + self.splitDifference = boolean + + + def aggregate(self, boolean): + """ + In merge mode, aggregate multiple transcripts + @param boolean: aggregate multiple transcripts + @type boolean: boolean + """ + self.multiple = boolean + + + def getTables(self, type): + """ + Get the SQL tables + @param type: type of the table (query, reference, etc.) + @type type: int + """ + return self.mySqlTranscriptWriters[type].getTables() + + + def computeOdds(self, boolean): + """ + Compute odds + @param boolean: whether odds should be computed + @type boolean: boolean + """ + self.odds = boolean + if self.odds: + self.overlapResults = dict() + + + def computeOddsPerTranscript(self, boolean): + """ + Compute odds for each transcript + @param boolean: whether odds for each transcript should be computed + @type boolean: boolean + """ + self.odds = boolean + if self.odds: + self.overlapResults = dict() + + + def removeTables(self): + """ + Remove the temporary MySQL tables + """ + for type in self.INPUTWORKINGTYPES: + for chromosome in self.getTables(type): + self.getTables(type)[chromosome].remove() + + + def clearTables(self): + """ + Empty the content of the databases + """ + for type in self.INPUTWORKINGTYPES: + if self.transcriptListParsers[type] != None: + for chromosome in self.getTables(type): + self.getTables(type)[chromosome].clear() + + + def extendTranscript(self, type, transcript): + """ + Extend a transcript corresponding to the parameters of the class + @param transcript: a transcript + @type transcript: class L{Transcript<Transcript>} + @return: the possibly extended transcript + """ + extendedTranscript = Transcript() + extendedTranscript.copy(transcript) + if self.starts[type] != None: + extendedTranscript.restrictStart(self.starts[type]) + if self.ends[type] != None: + extendedTranscript.restrictEnd(self.ends[type]) + if self.fivePrimes[type] != None: + extendedTranscript.extendStart(self.fivePrimes[type]) + if self.threePrimes[type] != None: + extendedTranscript.extendEnd(self.threePrimes[type]) + return extendedTranscript + + + def storeTranscript(self, type, transcript, now = True): + """ + Add a transcript to a MySQL database, or postpone the store + @param type: whether use query/reference table + @type type: int + @param transcript: a transcript + @type transcript: class L{Transcript<Transcript>} + @param now: whether transcript should be stored now (or stored can be postponed) + @type now: bool + """ + self.mySqlTranscriptWriters[type].addTranscript(transcript) + if type == self.REFERENCE: + self.mySqlTranscriptWriters[self.WORKING].addTranscript(transcript) + if now: + self.mySqlTranscriptWriters[type].write() + if type == self.REFERENCE: + self.mySqlTranscriptWriters[self.WORKING].write() + + + def writeTranscript(self, transcript): + """ + Write a transcript in the output file + @param transcript: a transcript + @type transcript: class L{Transcript<Transcript>} + """ + if self.writer != None: + self.writer.addTranscript(transcript) + self.nbPrinted += 1 + + + def flushData(self, type = None): + """ + Store the remaining transcripts + @param type: whether use query/reference table (None for all) + @type type: int or None + """ + if type == None: + types = self.INPUTWORKINGTYPES + else: + types = [type] + for type in types: + self.mySqlTranscriptWriters[type].write() + if self.writer != None: + self.writer.write() + + + def unstoreTranscript(self, type, transcript): + """ + Remove a transcript from a MySQL database + @param type: whether use query/reference table + @type type: int + @param transcript: a transcript + @type transcript: class L{Transcript<Transcript>} + """ + self.getTables(type)[transcript.getChromosome()].removeTranscript(transcript) + if type == self.REFERENCE: + self.getTables(self.WORKING)[transcript.getChromosome()].removeTranscript(transcript) + + + def addIndexes(self, tables): + """ + Add useful indexes to the tables + @param tables: which tables should be indexed + @type tables: list of int + """ + for type in tables: + for chromosome in self.getTables(type): + self.getTables(type)[chromosome].createIndex("iStart_transcript_%s_%d_%d" % (chromosome, type, random.randint(0, 100000)), ["start"]) + self.getTables(type)[chromosome].exonsTable.createIndex("iTranscriptId_exon_%s_%d_%d" % (chromosome, type, random.randint(0, 100000)), ["transcriptId"]) + + + def storeTranscriptList(self, type, transcriptListParser, extension): + """ + Store a transcript list into database + @param type: whether use query/reference parser + @type type: int + @param parser: a parser of transcript list + @type parser: class L{TranscriptContainer<TranscriptContainer>} + @param extension: extend (or not) the transcripts + @type extension: boolean + """ + progress = Progress(transcriptListParser.getNbTranscripts(), "Writing transcripts for %s" % ("query" if type == self.QUERY else "reference"), self.verbosity-1) + for transcript in transcriptListParser.getIterator(): + if extension: + transcript = self.extendTranscript(type, transcript) + self.mySqlTranscriptWriters[type].addTranscript(transcript) + progress.inc() + self.mySqlTranscriptWriters[type].write() + progress.done() + if type == self.REFERENCE: + for chromosome in self.getTables(self.REFERENCE): + self.getTables(self.WORKING)[chromosome] = MySqlTranscriptTable(self.mySqlConnection, self.tableNames[self.WORKING], chromosome, self.verbosity-1) + self.getTables(self.WORKING)[chromosome].copy(self.getTables(self.REFERENCE)[chromosome]) + + + def setInputTranscriptContainer(self, type, inputTranscriptContainer): + """ + Set an input transcript list container + @param type: whether use query/reference parser + @type type: int + @param inputTranscriptContainer: a container + @type inputTranscriptContainer: class L{TranscriptContainer<TranscriptContainer>} + """ + self.inputTranscriptContainers[type] = inputTranscriptContainer + self.nbTranscripts[type] = self.inputTranscriptContainers[type].getNbTranscripts() + self.nbNucleotides[type] = self.inputTranscriptContainers[type].getNbNucleotides() + + + def setOutputWriter(self, writer): + """ + Set an output transcript list writer + @param writer: a writer + @type writer: class L{TranscriptListWriter<TranscriptListWriter>} + """ + self.writer = writer + + + def compareTranscript(self, transcript1, transcript2, includeDistance = False): + """ + Compare two transcripts, using user defined parameters + @param transcript1: a transcript from the query set (already extended) + @type transcript1: class L{Transcript<Transcript>} + @param transcript2: a transcript from the reference set (already extended) + @type transcript2: class L{Transcript<Transcript>} + @param includeDistance: take into account the distance too + @type includeDistance: boolean + @return: true, if they overlap + """ + extendedTranscript1 = Transcript() + extendedTranscript1.copy(transcript1) + if includeDistance: + if self.maxDistance > 0: + extendedTranscript1.extendStart(self.maxDistance) + extendedTranscript1.extendEnd(self.maxDistance) + + minOverlap = self.minOverlap + if self.pcOverlap != None: + minOverlap = max(minOverlap, transcript1.getSize() / 100.0 * self.pcOverlap) + if not extendedTranscript1.overlapWith(transcript2, self.minOverlap): + return False + if (self.downstreams[self.QUERY] and transcript2.getStart() > extendedTranscript1.getStart()) or \ + (self.upstreams[self.QUERY] and transcript2.getEnd() < extendedTranscript1.getEnd()) or \ + (self.downstreams[self.REFERENCE] and extendedTranscript1.getStart() > transcript2.getStart()) or \ + (self.upstreams[self.REFERENCE] and extendedTranscript1.getEnd() < transcript2.getEnd()): + return False + if (self.antisense and extendedTranscript1.getDirection() == transcript2.getDirection()) or (self.colinear and extendedTranscript1.getDirection() != transcript2.getDirection()): + return False + if self.included and not transcript2.include(extendedTranscript1): + return False + if self.including and not extendedTranscript1.include(transcript2): + return False + if self.introns[self.REFERENCE] and self.introns[self.QUERY]: + if self.logHandle != None: + self.logHandle.write("%s overlaps with intron of %s\n" % (str(extendedTranscript1), str(transcript2))) + return True + if (not self.introns[self.REFERENCE]) and (not self.introns[self.QUERY]) and extendedTranscript1.overlapWithExon(transcript2, minOverlap): + if self.logHandle != None: + self.logHandle.write("%s overlaps with exon of %s\n" % (str(extendedTranscript1), str(transcript2))) + return True + return False + + + def compareTranscriptToList(self, transcript1): + """ + Compare a transcript to the reference list of transcripts + (Do not extend the transcripts, except for the distance) + @param transcript1: a transcript (from the query set) + @type transcript1: class L{Transcript<Transcript>} + @return: the reference transcripts overlapping + """ + # no transcript in the reference table + if transcript1.getChromosome() not in self.getTables(self.WORKING): + return + + # retrieve the the transcripts that may overlap in the working tables + clauses = [] + extendedTranscript1 = Transcript() + extendedTranscript1.copy(transcript1) + if self.maxDistance > 0: + extendedTranscript1.extendStart(self.maxDistance) + if self.maxDistance > 0: + extendedTranscript1.extendEnd(self.maxDistance) + command = "SELECT * FROM %s WHERE (" % (self.getTables(self.WORKING)[transcript1.getChromosome()].getName()) + for binPair in extendedTranscript1.getBins(): + clause = "bin " + if binPair[0] == binPair[1]: + clause += "= %i" % (binPair[0]) + else: + clause += "BETWEEN %i AND %i" % (binPair[0], binPair[1]) + clauses.append(clause) + command += " OR ".join(clauses) + command += ") AND start <= %d AND end >= %d" % (extendedTranscript1.getEnd(), extendedTranscript1.getStart()) + + for index2, transcript2 in self.getTables(self.REFERENCE)[transcript1.getChromosome()].selectTranscripts(command): + if self.compareTranscript(extendedTranscript1, transcript2): + yield transcript2 + + + def compareTranscriptList(self): + """ + Compare a list of transcript to the reference one + @return: the transcripts that overlap with the reference set + """ + distance = 0 + nbClustersIn = 0 + nbClustersOut = 0 + if self.maxDistance != None: + distance = self.maxDistance + + self.addIndexes([self.QUERY, self.REFERENCE]) + + # export the container into tables + self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True) + self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True) + + # looping + for chromosome1 in sorted(self.getTables(self.QUERY).keys()): + # get range of transcripts + command = "SELECT MIN(start), MAX(end), COUNT(id) FROM %s" % (self.getTables(self.QUERY)[chromosome1].getName()) + query = self.mySqlConnection.executeQuery(command) + result = query.getLine() + first = result[0] + last = result[1] + nb = result[2] + + transcripts1 = [] + toBeRemoved1 = [] + transcripts2 = [] + toBeRemoved2 = [] + overlapsWith = [] + nbOverlaps = [] + nbChunks = max(1, nb / 100) + chunkSize = (last - first) / nbChunks + progress = Progress(nbChunks + 1, "Analyzing chromosome %s" % (chromosome1), self.verbosity-1) + for chunk in range(nbChunks + 1): + + # load transcripts + start = first + chunk * chunkSize + end = start + chunkSize - 1 + command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.QUERY)[chromosome1].getName(), start, end-1) + for index1, transcript1 in self.getTables(self.QUERY)[chromosome1].selectTranscripts(command): + transcripts1.append(transcript1) + overlapsWith.append([]) + nbOverlaps.append(0) + nbClustersIn += 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements") + command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.QUERY)[chromosome1].getName(), end) + self.mySqlConnection.executeQuery(command) + if chromosome1 in self.getTables(self.REFERENCE): + command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), start-distance, end+distance-1) + if chunk == 0: + command = "SELECT * FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance) + for index2, transcript2 in self.getTables(self.REFERENCE)[chromosome1].selectTranscripts(command): + transcripts2.append(transcript2) + command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance) + self.mySqlConnection.executeQuery(command) + + # compare sets + for index1, transcript1 in enumerate(transcripts1): + overlappingNames = [] + nbElements1 = 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements") + for transcript2 in transcripts2: + if self.compareTranscript(transcript1, transcript2, True): + id2 = transcript2.getTagValue("ID") if "ID" in transcript2.getTagNames() else transcript2.getName() + if id2 not in overlapsWith[index1]: + overlapsWith[index1].append(id2) + nbOverlaps[index1] += 1 if "nbElements" not in transcript2.getTagNames() else transcript2.getTagValue("nbElements") + if self.odds: + if transcript2.getName() not in self.overlapResults: + self.overlapResults[transcript2.getName()] = 0 + self.overlapResults[transcript2.getName()] += nbElements1 + + # check if query transcript extends bounds of the chunk + if transcript1.getEnd() < end: + if Utils.xor(overlapsWith[index1], self.invert) or self.notOverlapping: + if overlapsWith[index1]: + transcript1.setTagValue("overlapWith", ",".join(overlapsWith[index1])[:100]) + transcript1.setTagValue("nbOverlaps", "%d" % (nbOverlaps[index1])) + elif self.notOverlapping: + transcript1.setTagValue("nbOverlaps", "0") + self.writeTranscript(transcript1) + nbClustersOut += nbElements1 + toBeRemoved1.append(index1) + + # update list of query transcripts + for index1 in reversed(toBeRemoved1): + del transcripts1[index1] + del overlapsWith[index1] + del nbOverlaps[index1] + toBeRemoved1 = [] + + # check if the reference transcripts extends bounds of the chunk + for index2, transcript2 in enumerate(transcripts2): + if transcript2.getEnd() + distance < end: + toBeRemoved2.append(index2) + for index2 in reversed(toBeRemoved2): + del transcripts2[index2] + toBeRemoved2 = [] + + progress.inc() + + for index1, transcript1 in enumerate(transcripts1): + if Utils.xor(overlapsWith[index1], self.invert) or self.notOverlapping: + if overlapsWith[index1]: + transcript1.setTagValue("overlapWith", ",".join(overlapsWith[index1])[:100]) + transcript1.setTagValue("nbOverlaps", "%d" % (nbOverlaps[index1])) + elif self.notOverlapping: + transcript1.setTagValue("nbOverlaps", "0") + self.writeTranscript(transcript1) + nbClustersOut += 1 if "nbElements" not in transcript1.getTagNames() else transcript1.getTagValue("nbElements") + progress.done() + self.getTables(self.QUERY)[chromosome1].remove() + if chromosome1 in self.getTables(self.REFERENCE): + self.getTables(self.REFERENCE)[chromosome1].remove() + self.getTables(self.WORKING)[chromosome1].remove() + + self.flushData() + if self.writer != None: + self.writer.close() + self.writer = None + + if self.verbosity > 0: + print "reference: %d elements" % (self.nbTranscripts[self.REFERENCE]) + print "query: %d elements, %d clustered" % (self.nbTranscripts[self.QUERY], nbClustersIn) + if self.nbTranscripts[self.QUERY] != 0: + print "output: %d elements (%.2f%%)"% (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100), + if nbClustersOut != 0: + print ", %d clustered (%.2f%%)" % (nbClustersOut, float(nbClustersOut) / nbClustersIn * 100) + + + def compareTranscriptListDistance(self): + """ + Compare a list of transcript to the reference one + @return: the distance distributions in a hash + """ + nbDistances = 0 + distances = {} + absDistances = {} + strandedDistances = dict([(strand, {}) for strand in (1, -1)]) + + # export the container into tables + self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True) + self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True) + + progress = Progress(self.nbTranscripts[self.QUERY], "Analyzing chromosomes", self.verbosity-1) + for transcript1 in self.inputTranscriptContainers[self.QUERY].getIterator(): + # get the distance + transcript1 = self.extendTranscript(self.QUERY, transcript1) + distance = self.maxDistance + 1 + strand = None + closestElement = "None" + for transcript2 in self.compareTranscriptToList(transcript1): + thisStrand = transcript1.getDirection() * transcript2.getDirection() + if self.antisense or (not self.colinear and transcript1.getDirection() != transcript2.getDirection()): + transcript2.reverse() + if self.absolute: + transcript2.setDirection(transcript1.getDirection()) + if transcript2.getDirection() == transcript1.getDirection(): + if self.starts[self.REFERENCE] != None: + transcript2.restrictStart(self.starts[self.REFERENCE]) + if self.ends[self.REFERENCE] != None: + transcript2.restrictEnd(self.ends[self.REFERENCE]) + thisDistance = transcript1.getRelativeDistance(transcript2) + if (self.absolute): + thisDistance = abs(thisDistance) + if abs(thisDistance) < abs(distance): + distance = thisDistance + strand = thisStrand + closestElement = transcript2.getTagValue("ID") if "ID" in transcript2.getTagNames() else transcript2.getName() + if (distance <= self.maxDistance) and (self.minDistance == None or distance >= self.minDistance): + nbDistances += 1 + distances[distance] = distances.get(distance, 0) + 1 + absDistance = abs(distance) + absDistances[absDistance] = absDistances.get(absDistance, 0) + 1 + strandedDistances[strand][distance] = strandedDistances[strand].get(distance, 0) + if distance not in strandedDistances[-strand]: + strandedDistances[-strand][distance] = 0 + + # write transcript + if distance == self.maxDistance + 1: + distance = "None" + tmpTranscript = Transcript() + tmpTranscript.copy(transcript1) + tmpTranscript.setTagValue("distance", distance) + tmpTranscript.setTagValue("closestElement", closestElement) + self.writeTranscript(tmpTranscript) + + progress.inc() + progress.done() + + self.flushData() + + if self.verbosity > 0: + print "reference: %d sequences" % (self.nbTranscripts[self.REFERENCE]) + print "query: %d sequences" % (self.nbTranscripts[self.QUERY]) + if nbDistances == 0: + print "Nothing matches" + else: + print "min/avg/med/max transcripts: %d/%.2f/%.1f/%d" % Utils.getMinAvgMedMax(absDistances) + print "for %d distances (%.2f%%)" % (nbDistances, float(nbDistances) / self.nbTranscripts[self.QUERY] * 100) + + if self.strandedDistance: + return strandedDistances + return distances + + + def compareTranscriptListMerge(self): + """ + Merge the query list of transcript with itself + @return: the merged transcripts in a transcript list database + """ + nbMerges = 0 + + for type in (self.QUERY, self.REFERENCE): + self.storeTranscriptList(type, self.inputTranscriptContainers[type], True) + self.flushData() + + # Loop on the chromosomes + for chromosome in sorted(self.getTables(self.QUERY).keys()): + if chromosome not in self.getTables(self.REFERENCE): + continue + + # Get the size of the chromosome + maxEnd = 0 + nbChunks = 0 + for type in (self.QUERY, self.REFERENCE): + command = "SELECT MAX(end) from %s" % (self.getTables(type)[chromosome].getName()) + query = self.mySqlConnection.executeQuery(command) + maxEnd = max(maxEnd, int(query.getLine()[0])) + nbChunks = max(nbChunks, self.getTables(type)[chromosome].getNbElements()) + + mergedTranscripts = {} + transcripts = {self.QUERY: [], self.REFERENCE: []} + progress = Progress(nbChunks, "Analyzing %s" % (chromosome), self.verbosity-1) + for i in range(nbChunks): + rangeStart = int(i * (float(maxEnd) / nbChunks)) + 1 + rangeEnd = int((i+1) * (float(maxEnd) / nbChunks)) + + # Get all transcripts in query and reference from chunk + for type in (self.QUERY, self.REFERENCE): + correction = 0 if self.QUERY else self.maxDistance + command = "SELECT * FROM %s WHERE start <= %d" % (self.getTables(type)[chromosome].getName(), rangeEnd + correction) + for index, transcript in self.getTables(type)[chromosome].selectTranscripts(command): + transcripts[type].append(transcript) + + # Merge elements between the two samples + for iQuery, queryTranscript in enumerate(transcripts[self.QUERY]): + for iReference, referenceTranscript in enumerate(transcripts[self.REFERENCE]): + if referenceTranscript == None: continue + if self.compareTranscript(queryTranscript, referenceTranscript, True): + if queryTranscript.getDirection() != referenceTranscript.getDirection(): + referenceTranscript.setDirection(queryTranscript.getDirection()) + queryTranscript.merge(referenceTranscript, self.normalization) + nbMerges += 1 + transcripts[self.REFERENCE][iReference] = None + if not self.multiple: + mergedTranscripts[iQuery] = 0 + + # Remove transcripts from database + for type in (self.QUERY, self.REFERENCE): + correction = 0 if self.QUERY else self.maxDistance + command = "DELETE FROM %s WHERE start <= %d" % (self.getTables(type)[chromosome].getName(), rangeEnd - correction) + query = self.mySqlConnection.executeQuery(command) + + # Just in case, self-merge the elements in the query (beware of mergedTranscripts!) + if (self.multiple): + for iQuery1, queryTranscript1 in enumerate(transcripts[self.QUERY]): + if queryTranscript1 == None: continue + for iQuery2, queryTranscript2 in enumerate(transcripts[self.QUERY]): + if iQuery2 <= iQuery1 or queryTranscript2 == None: continue + minOverlap = self.minOverlap + if self.pcOverlap != None: + minOverlap = max(minOverlap, queryTranscript1.getSize() / 100.0 * self.pcOverlap) + if queryTranscript2.overlapWith(queryTranscript1, minOverlap) and (queryTranscript1.getDirection() == queryTranscript2.getDirection() or not self.colinear): + if queryTranscript1.getDirection() != queryTranscript2.getDirection(): + queryTranscript2.setDirection(queryTranscript1.getDirection()) + queryTranscript1.merge(queryTranscript2, self.normalization) + transcripts[self.QUERY][iQuery2] = None + nbMerges += 1 + if not self.multiple: + mergedTranscripts[iQuery1] = 0 + + # Update the sets of transcripts and write into database (also update mergedTranscripts) + newTranscripts = {self.QUERY: [], self.REFERENCE: []} + newMergedTranscripts = {} + for type in (self.QUERY, self.REFERENCE): + for i, transcript in enumerate(transcripts[type]): + if transcript == None: continue + correction = 0 if self.QUERY else self.maxDistance + if transcript.getEnd() < rangeEnd - correction: + if self.multiple or ((type == self.QUERY) and (i in mergedTranscripts)): + self.writeTranscript(transcripts[type][i]) + else: + if type == self.QUERY and i in mergedTranscripts: + newMergedTranscripts[len(newTranscripts[type])] = 0 + newTranscripts[type].append(transcript) + transcripts = newTranscripts + mergedTranscripts = newMergedTranscripts + + progress.inc() + progress.done() + + for type in (self.QUERY, self.REFERENCE): + for i, transcript in enumerate(transcripts[type]): + if transcripts == None: continue + if self.multiple or ((type == self.QUERY) and (i in mergedTranscripts)): + self.writeTranscript(transcripts[type][i]) + + # Manage chromosomes with no corresponding data + if self.multiple: + for type in self.INPUTTYPES: + for chromosome in self.getTables(type): + if chromosome in self.getTables(1 - type): + continue + for transcript in self.getTables(self.OUTPUT)[chromosome].getIterator(): + self.writeTranscript(transcript) + + self.flushData() + if self.writer != None: + self.writer.close() + self.writer = None + + if self.verbosity > 0: + print "query: %d sequences" % (self.nbTranscripts[self.QUERY]) + print "# merges: %d" % (nbMerges) + print "# printed %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100) + + + def compareTranscriptListSelfMerge(self): + """ + Merge the query list of transcript with itself + @return: the merged transcripts in a transcript list database + """ + nbMerges = 0 + distance = self.maxDistance if self.maxDistance != None else 0 + + self.addIndexes([self.QUERY]) + self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True) + self.flushData() + + # looping + for chromosome1 in sorted(self.getTables(self.QUERY).keys()): + transcripts2 = [] + + # get range of transcripts + progress = Progress(self.getTables(self.QUERY)[chromosome1].getNbElements(), "Analyzing chromosome %s" % (chromosome1), self.verbosity-1) + command = "SELECT * FROM %s ORDER BY start" % (self.getTables(self.QUERY)[chromosome1].getName()) + for index1, transcript1 in self.getTables(self.QUERY)[chromosome1].selectTranscripts(command): + + # compare sets + toBeRemoved = set() + toBePrinted = set() + for index2, transcript2 in enumerate(transcripts2): + + if self.compareTranscript(transcript1, transcript2, True): + if transcript1.getDirection() != transcript2.getDirection(): + transcript2.setDirection(transcript1.getDirection()) + transcript1.merge(transcript2, self.normalization) + toBeRemoved.add(index2) + nbMerges += 1 + elif transcript2.getEnd() + distance < transcript1.getStart(): + toBePrinted.add(index2) + transcripts2.append(transcript1) + + for index2 in sorted(toBePrinted): + self.writeTranscript(transcripts2[index2]) + transcripts2 = [transcripts2[index2] for index2 in range(len(transcripts2)) if index2 not in (toBeRemoved | toBePrinted)] + + for transcript2 in transcripts2: + self.writeTranscript(transcript2) + progress.done() + self.getTables(self.QUERY)[chromosome1].remove() + + self.flushData() + if self.writer != None: + self.writer.close() + self.writer = None + + if self.verbosity > 0: + print "query: %d sequences" % (self.nbTranscripts[self.QUERY]) + print "# merges: %d" % (nbMerges) + print "# printed %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100) + + + def getDifferenceTranscriptList(self): + """ + Get the elements of the first list which do not overlap the second list (at the nucleotide level) + @return: the transcripts that overlap with the reference set + """ + distance = 0 if self.maxDistance == None else self.maxDistance + + self.addIndexes([self.QUERY, self.REFERENCE]) + + # export the container into tables + self.storeTranscriptList(self.QUERY, self.inputTranscriptContainers[self.QUERY], True) + self.storeTranscriptList(self.REFERENCE, self.inputTranscriptContainers[self.REFERENCE], True) + + # looping + for chromosome1 in sorted(self.getTables(self.QUERY).keys()): + # get range of transcripts + command = "SELECT MIN(start), MAX(end), COUNT(id) FROM %s" % (self.getTables(self.QUERY)[chromosome1].getName()) + query = self.mySqlConnection.executeQuery(command) + result = query.getLine() + first = result[0] + last = result[1] + nb = result[2] + + transcripts1 = [] + transcripts2 = [] + nbChunks = max(1, nb / 100) + chunkSize = (last - first) / nbChunks + progress = Progress(nbChunks + 1, "Analyzing chromosome %s" % (chromosome1), self.verbosity-1) + for chunk in range(nbChunks + 1): + + # load transcripts + start = first + chunk * chunkSize + end = start + chunkSize - 1 + command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.QUERY)[chromosome1].getName(), start, end-1) + for index1, transcript1 in self.getTables(self.QUERY)[chromosome1].selectTranscripts(command): + transcripts1.append(transcript1) + command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.QUERY)[chromosome1].getName(), end) + self.mySqlConnection.executeQuery(command) + if chromosome1 in self.getTables(self.REFERENCE): + command = "SELECT * FROM %s WHERE start BETWEEN %d AND %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), start-distance, end+distance-1) + if chunk == 0: + command = "SELECT * FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance) + for index2, transcript2 in self.getTables(self.REFERENCE)[chromosome1].selectTranscripts(command): + transcripts2.append(transcript2) + command = "DELETE FROM %s WHERE start < %d" % (self.getTables(self.REFERENCE)[chromosome1].getName(), end + distance) + self.mySqlConnection.executeQuery(command) + + # compare sets + toBeRemoved1 = [] + for index1, transcript1 in enumerate(transcripts1): + newTranscript1 = Transcript() + newTranscript1.copy(transcript1) + for transcript2 in transcripts2: + newTranscript1 = newTranscript1.getDifference(transcript2) + if newTranscript1 == None: + toBeRemoved1.append(index1) + break + transcripts1[index1] = newTranscript1 + + # check if query transcript extends bounds of the chunk + if newTranscript1 != None and newTranscript1.getEnd() < end: + if self.splitDifference: + for exon in newTranscript1.getExons(): + transcript = Transcript() + transcript.copy(exon) + self.writeTranscript(transcript) + else: + self.writeTranscript(newTranscript1) + toBeRemoved1.append(index1) + + # update list of query transcripts + for index1 in reversed(toBeRemoved1): + del transcripts1[index1] + + # check if the reference transcripts extends bounds of the chunk + toBeRemoved2 = [] + for index2, transcript2 in enumerate(transcripts2): + if transcript2.getEnd() + distance < end: + toBeRemoved2.append(index2) + for index2 in reversed(toBeRemoved2): + del transcripts2[index2] + + progress.inc() + + for transcript1 in transcripts1: + if self.splitDifference: + for exon in transcript1.getExons(): + transcript = Transcript() + transcript.copy(exon) + self.writeTranscript(transcript) + else: + self.writeTranscript(transcript1) + progress.done() + self.getTables(self.QUERY)[chromosome1].remove() + if chromosome1 in self.getTables(self.REFERENCE): + self.getTables(self.REFERENCE)[chromosome1].remove() + self.getTables(self.WORKING)[chromosome1].remove() + + self.flushData() + if self.writer != None: + self.writer.close() + self.writer = None + + if self.verbosity > 0: + print "query: %d elements" % (self.nbTranscripts[self.QUERY]) + print "reference: %d elements" % (self.nbTranscripts[self.REFERENCE]) + print "# printed: %d (%.2f%%)" % (self.nbPrinted, self.nbPrinted / float(self.nbTranscripts[self.QUERY]) * 100) + + + def getOddsPerTranscript(self): + """ + Return overlap results + @return a dict of data + """ + if not self.odds: + raise Exception("Did not compute odds!") + return self.overlapResults + + + def getOdds(self): + """ + Return odds about the overlap + @return a dict of data + """ + if not self.odds: + raise Exception("Did not compute odds!") + if self.oddResults != None: + return self.oddResults + self.oddResults = {} + for name, value in self.overlapResults.iteritems(): + self.oddResults[value] = self.oddResults.get(value, 0) + 1 + return self.oddResults