Mercurial > repos > yufei-luo > s_mart
diff SMART/Java/Python/clusterize.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | 769e306b7933 |
children |
line wrap: on
line diff
--- a/SMART/Java/Python/clusterize.py Mon Apr 22 11:11:10 2013 -0400 +++ b/SMART/Java/Python/clusterize.py Mon Apr 29 03:20:15 2013 -0400 @@ -31,7 +31,7 @@ from commons.core.writer.WriterChooser import WriterChooser """Clusterize a set of transcripts""" -import os +import os, os.path, random from optparse import OptionParser from commons.core.parsing.ParserChooser import ParserChooser from commons.core.writer.Gff3Writer import Gff3Writer @@ -39,127 +39,147 @@ from SMART.Java.Python.ncList.NCListFilePickle import NCListFileUnpickle from SMART.Java.Python.ncList.FileSorter import FileSorter from SMART.Java.Python.misc.Progress import Progress +from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress class Clusterize(object): - - def __init__(self, verbosity): - self.normalize = False - self.presorted = False - self.distance = 1 - self.colinear = False - self.nbWritten = 0 - self.nbMerges = 0 - self.verbosity = verbosity - self.splittedFileNames = {} - def __del__(self): - for fileName in self.splittedFileNames.values(): - os.remove(fileName) + def __init__(self, verbosity): + self.normalize = False + self.presorted = False + self.distance = 1 + self.colinear = False + self.nbWritten = 0 + self.nbMerges = 0 + self.verbosity = verbosity + self.splittedFileNames = {} + + def __del__(self): + for fileName in self.splittedFileNames.values(): + os.remove(fileName) - def setInputFile(self, fileName, format): - parserChooser = ParserChooser(self.verbosity) - parserChooser.findFormat(format) - self.parser = parserChooser.getParser(fileName) - self.sortedFileName = "%s_sorted.pkl" % (os.path.splitext(fileName)[0]) + def setInputFile(self, fileName, format): + parserChooser = ParserChooser(self.verbosity) + parserChooser.findFormat(format) + self.parser = parserChooser.getParser(fileName) + self.sortedFileName = "%s_sorted_%d.pkl" % (os.path.splitext(fileName)[0], random.randint(1, 100000)) + if "SMARTTMPPATH" in os.environ: + self.sortedFileName = os.path.join(os.environ["SMARTTMPPATH"], os.path.basename(self.sortedFileName)) - def setOutputFileName(self, fileName, format="gff3", title="S-MART", feature="transcript", featurePart="exon"): - writerChooser = WriterChooser() - writerChooser.findFormat(format) - self.writer = writerChooser.getWriter(fileName) - self.writer.setTitle(title) - self.writer.setFeature(feature) - self.writer.setFeaturePart(featurePart) + def setOutputFileName(self, fileName, format="gff3", title="S-MART", feature="transcript", featurePart="exon"): + writerChooser = WriterChooser() + writerChooser.findFormat(format) + self.writer = writerChooser.getWriter(fileName) + self.writer.setTitle(title) + self.writer.setFeature(feature) + self.writer.setFeaturePart(featurePart) - def setDistance(self, distance): - self.distance = distance + def setDistance(self, distance): + self.distance = distance - def setColinear(self, colinear): - self.colinear = colinear + def setColinear(self, colinear): + self.colinear = colinear - def setNormalize(self, normalize): - self.normalize = normalize - - def setPresorted(self, presorted): - self.presorted = presorted + def setNormalize(self, normalize): + self.normalize = normalize + + def setPresorted(self, presorted): + self.presorted = presorted - def _sortFile(self): - fs = FileSorter(self.parser, self.verbosity-4) - fs.perChromosome(True) - fs.setPresorted(self.presorted) - fs.setOutputFileName(self.sortedFileName) - fs.sort() - self.splittedFileNames = fs.getOutputFileNames() - self.nbElementsPerChromosome = fs.getNbElementsPerChromosome() - self.nbElements = fs.getNbElements() - - def _iterate(self, chromosome): - progress = Progress(self.nbElementsPerChromosome[chromosome], "Checking chromosome %s" % (chromosome), self.verbosity) - transcripts = [] - parser = NCListFileUnpickle(self.splittedFileNames[chromosome], self.verbosity) - for newTranscript in parser.getIterator(): - newTranscripts = [] - for oldTranscript in transcripts: - if self._checkOverlap(newTranscript, oldTranscript): - self._merge(newTranscript, oldTranscript) - elif self._checkPassed(newTranscript, oldTranscript): - self._write(oldTranscript) - else: - newTranscripts.append(oldTranscript) - newTranscripts.append(newTranscript) - transcripts = newTranscripts - progress.inc() - for transcript in transcripts: - self._write(transcript) - progress.done() + def _sortFile(self): + if self.presorted: + return + fs = FileSorter(self.parser, self.verbosity-4) + fs.perChromosome(True) + fs.setPresorted(self.presorted) + fs.setOutputFileName(self.sortedFileName) + fs.sort() + self.splittedFileNames = fs.getOutputFileNames() + self.nbElementsPerChromosome = fs.getNbElementsPerChromosome() + self.nbElements = fs.getNbElements() + + def _iterate(self, chromosome): + if chromosome == None: + progress = UnlimitedProgress(10000, "Reading input file", self.verbosity) + parser = self.parser + else: + progress = Progress(self.nbElementsPerChromosome[chromosome], "Checking chromosome %s" % (chromosome), self.verbosity) + parser = NCListFileUnpickle(self.splittedFileNames[chromosome], self.verbosity) + transcripts = [] + self.nbElements = 0 + for newTranscript in parser.getIterator(): + newTranscripts = [] + if newTranscript.__class__.__name__ == "Mapping": + newTranscript = newTranscript.getTranscript() + for oldTranscript in transcripts: + if self._checkOverlap(newTranscript, oldTranscript): + self._merge(newTranscript, oldTranscript) + elif self._checkPassed(newTranscript, oldTranscript): + self._write(oldTranscript) + else: + newTranscripts.append(oldTranscript) + newTranscripts.append(newTranscript) + transcripts = newTranscripts + self.nbElements += 1 + progress.inc() + for transcript in transcripts: + self._write(transcript) + progress.done() - def _merge(self, transcript1, transcript2): - self.nbMerges += 1 - transcript2.setDirection(transcript1.getDirection()) - transcript1.merge(transcript2) + def _merge(self, transcript1, transcript2): + self.nbMerges += 1 + transcript2.setDirection(transcript1.getDirection()) + transcript1.merge(transcript2) - def _write(self, transcript): - self.nbWritten += 1 - self.writer.addTranscript(transcript) + def _write(self, transcript): + self.nbWritten += 1 + self.writer.addTranscript(transcript) - def _checkOverlap(self, transcript1, transcript2): - if self.colinear and transcript1.getDirection() != transcript2.getDirection(): - return False - if transcript1.getDistance(transcript2) > self.distance: - return False - return True + def _checkOverlap(self, transcript1, transcript2): + if transcript1.getChromosome() != transcript2.getChromosome(): + return False + if self.colinear and transcript1.getDirection() != transcript2.getDirection(): + return False + if transcript1.getDistance(transcript2) > self.distance: + return False + return True - def _checkPassed(self, transcript1, transcript2): - return (transcript1.getDistance(transcript2) > self.distance) + def _checkPassed(self, transcript1, transcript2): + return ((transcript1.getChromosome() != transcript2.getChromosome()) or (transcript1.getDistance(transcript2) > self.distance)) - def run(self): - self._sortFile() - for chromosome in sorted(self.splittedFileNames.keys()): - self._iterate(chromosome) - self.writer.close() - if self.verbosity > 0: - print "# input: %d" % (self.nbElements) - print "# written: %d (%d%% overlaps)" % (self.nbWritten, 0 if (self.nbElements == 0) else ((float(self.nbWritten) / self.nbElements) * 100)) - print "# merges: %d" % (self.nbMerges) - + def run(self): + self._sortFile() + if self.presorted: + self._iterate(None) + else: + for chromosome in sorted(self.splittedFileNames.keys()): + self._iterate(chromosome) + self.writer.close() + if self.verbosity > 0: + print "# input: %d" % (self.nbElements) + print "# written: %d (%d%% overlaps)" % (self.nbWritten, 0 if (self.nbElements == 0) else ((float(self.nbWritten) / self.nbElements) * 100)) + print "# merges: %d" % (self.nbMerges) + if __name__ == "__main__": - description = "Clusterize v1.0.3: clusterize the data which overlap. [Category: Merge]" - - parser = OptionParser(description = description) - parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]") - parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [format: transcript file format]") - parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in transcript format given by -u]") - parser.add_option("-u", "--outputFormat", dest="outputFormat", action="store", default="gff", type="string", help="output file format [format: transcript file format]") - parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="merge colinear transcripts only [format: bool] [default: false]") - parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="max. distance between two transcripts to be merged [format: int] [default: 0]") - parser.add_option("-n", "--normalize", dest="normalize", action="store_true", default=False, help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]") - parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]") - (options, args) = parser.parse_args() - - c = Clusterize(options.verbosity) - c.setInputFile(options.inputFileName, options.format) - c.setOutputFileName(options.outputFileName, options.outputFormat) - c.setColinear(options.colinear) - c.setDistance(options.distance) - c.setNormalize(options.normalize) - c.run() + description = "Clusterize v1.0.3: clusterize the data which overlap. [Category: Merge]" + + parser = OptionParser(description = description) + parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]") + parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [format: transcript file format]") + parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in transcript format given by -u]") + parser.add_option("-u", "--outputFormat", dest="outputFormat", action="store", default="gff", type="string", help="output file format [format: transcript file format]") + parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="merge colinear transcripts only [format: bool] [default: false]") + parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="max. distance between two transcripts to be merged [format: int] [default: 0]") + parser.add_option("-n", "--normalize", dest="normalize", action="store_true", default=False, help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]") + parser.add_option("-s", "--sorted", dest="sorted", action="store_true", default=False, help="input is already sorted [format: bool] [default: false]") + parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]") + (options, args) = parser.parse_args() + + c = Clusterize(options.verbosity) + c.setInputFile(options.inputFileName, options.format) + c.setOutputFileName(options.outputFileName, options.outputFormat) + c.setColinear(options.colinear) + c.setDistance(options.distance) + c.setNormalize(options.normalize) + c.setPresorted(options.sorted) + c.run()