Mercurial > repos > yufei-luo > s_mart
comparison SMART/Java/Python/clusterize.py @ 60:90f4b29d884f
Uploaded
| author | m-zytnicki |
|---|---|
| date | Fri, 21 Feb 2014 08:32:36 -0500 |
| parents | 169d364ddd91 |
| children | f4de72c80eac |
comparison
equal
deleted
inserted
replaced
| 59:2a4884ba3e5c | 60:90f4b29d884f |
|---|---|
| 31 from commons.core.writer.WriterChooser import WriterChooser | 31 from commons.core.writer.WriterChooser import WriterChooser |
| 32 """Clusterize a set of transcripts""" | 32 """Clusterize a set of transcripts""" |
| 33 | 33 |
| 34 import os, os.path, random | 34 import os, os.path, random |
| 35 from optparse import OptionParser | 35 from optparse import OptionParser |
| 36 from heapq import heappush, heappop | |
| 36 from commons.core.parsing.ParserChooser import ParserChooser | 37 from commons.core.parsing.ParserChooser import ParserChooser |
| 37 from commons.core.writer.Gff3Writer import Gff3Writer | 38 from commons.core.writer.Gff3Writer import Gff3Writer |
| 38 from SMART.Java.Python.structure.Transcript import Transcript | 39 from SMART.Java.Python.structure.Transcript import Transcript |
| 39 from SMART.Java.Python.ncList.NCListFilePickle import NCListFileUnpickle | 40 from SMART.Java.Python.ncList.NCListFilePickle import NCListFileUnpickle |
| 40 from SMART.Java.Python.ncList.FileSorter import FileSorter | 41 from SMART.Java.Python.ncList.FileSorter import FileSorter |
| 42 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress | 43 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress |
| 43 | 44 |
| 44 class Clusterize(object): | 45 class Clusterize(object): |
| 45 | 46 |
| 46 def __init__(self, verbosity): | 47 def __init__(self, verbosity): |
| 47 self.normalize = False | 48 self.parsers = {} |
| 48 self.presorted = False | 49 self.sortedFileNames = {} |
| 49 self.distance = 1 | 50 self.normalize = False |
| 50 self.colinear = False | 51 self.presorted = False |
| 51 self.nbWritten = 0 | 52 self.distance = 1 |
| 52 self.nbMerges = 0 | 53 self.collinear = False |
| 53 self.verbosity = verbosity | 54 self.nbWritten = 0 |
| 55 self.nbMerges = 0 | |
| 56 self.verbosity = verbosity | |
| 54 self.splittedFileNames = {} | 57 self.splittedFileNames = {} |
| 58 self.chromosomes = set() | |
| 55 | 59 |
| 56 def __del__(self): | 60 def __del__(self): |
| 57 for fileName in self.splittedFileNames.values(): | 61 for fileName1 in self.splittedFileNames: |
| 58 os.remove(fileName) | 62 for fileName2 in self.splittedFileNames[fileName1].values(): |
| 59 | 63 os.remove(fileName2) |
| 60 def setInputFile(self, fileName, format): | 64 |
| 65 def setInputFiles(self, fileNames, format): | |
| 61 parserChooser = ParserChooser(self.verbosity) | 66 parserChooser = ParserChooser(self.verbosity) |
| 62 parserChooser.findFormat(format) | 67 parserChooser.findFormat(format) |
| 63 self.parser = parserChooser.getParser(fileName) | 68 for fileName in fileNames: |
| 64 self.sortedFileName = "%s_sorted_%d.pkl" % (os.path.splitext(fileName)[0], random.randint(1, 100000)) | 69 self.parsers[fileName] = parserChooser.getParser(fileName) |
| 65 if "SMARTTMPPATH" in os.environ: | 70 self.sortedFileNames[fileName] = "%s_sorted_%d.pkl" % (os.path.splitext(fileName)[0], random.randint(1, 100000)) |
| 66 self.sortedFileName = os.path.join(os.environ["SMARTTMPPATH"], os.path.basename(self.sortedFileName)) | 71 if "SMARTTMPPATH" in os.environ: |
| 72 self.sortedFileNames[fileName] = os.path.join(os.environ["SMARTTMPPATH"], os.path.basename(self.sortedFileNames[fileName])) | |
| 67 | 73 |
| 68 def setOutputFileName(self, fileName, format="gff3", title="S-MART", feature="transcript", featurePart="exon"): | 74 def setOutputFileName(self, fileName, format="gff3", title="S-MART", feature="transcript", featurePart="exon"): |
| 69 writerChooser = WriterChooser() | 75 writerChooser = WriterChooser() |
| 70 writerChooser.findFormat(format) | 76 writerChooser.findFormat(format) |
| 71 self.writer = writerChooser.getWriter(fileName) | 77 self.writer = writerChooser.getWriter(fileName) |
| 74 self.writer.setFeaturePart(featurePart) | 80 self.writer.setFeaturePart(featurePart) |
| 75 | 81 |
| 76 def setDistance(self, distance): | 82 def setDistance(self, distance): |
| 77 self.distance = distance | 83 self.distance = distance |
| 78 | 84 |
| 79 def setColinear(self, colinear): | 85 def setColinear(self, collinear): |
| 80 self.colinear = colinear | 86 self.collinear = collinear |
| 81 | 87 |
| 82 def setNormalize(self, normalize): | 88 def setNormalize(self, normalize): |
| 83 self.normalize = normalize | 89 self.normalize = normalize |
| 84 | 90 |
| 85 def setPresorted(self, presorted): | 91 def setPresorted(self, presorted): |
| 86 self.presorted = presorted | 92 self.presorted = presorted |
| 87 | 93 |
| 88 def _sortFile(self): | 94 def _sortFiles(self): |
| 89 if self.presorted: | 95 if self.presorted: |
| 90 return | 96 return |
| 91 fs = FileSorter(self.parser, self.verbosity-4) | 97 for fileName, parser in self.parsers.iteritems(): |
| 92 fs.perChromosome(True) | 98 fs = FileSorter(parser, self.verbosity-4) |
| 93 fs.setPresorted(self.presorted) | 99 fs.perChromosome(True) |
| 94 fs.setOutputFileName(self.sortedFileName) | 100 fs.setPresorted(self.presorted) |
| 95 fs.sort() | 101 fs.setOutputFileName(self.sortedFileNames[fileName]) |
| 96 self.splittedFileNames = fs.getOutputFileNames() | 102 fs.sort() |
| 97 self.nbElementsPerChromosome = fs.getNbElementsPerChromosome() | 103 self.splittedFileNames[fileName] = fs.getOutputFileNames() |
| 98 self.nbElements = fs.getNbElements() | 104 self.nbElementsPerChromosome = fs.getNbElementsPerChromosome() |
| 105 self.nbElements = fs.getNbElements() | |
| 106 self.chromosomes.update(self.splittedFileNames[fileName].keys()) | |
| 99 | 107 |
| 100 def _iterate(self, chromosome): | 108 def _iterate(self): |
| 101 if chromosome == None: | 109 progress = UnlimitedProgress(10000, "Reading input file", self.verbosity) |
| 102 progress = UnlimitedProgress(10000, "Reading input file", self.verbosity) | 110 transcripts = [] |
| 103 parser = self.parser | 111 heap = [] |
| 112 parsersSets = [] | |
| 113 if self.chromosomes: | |
| 114 parsersSets.append(self.parsers.values()) | |
| 104 else: | 115 else: |
| 105 progress = Progress(self.nbElementsPerChromosome[chromosome], "Checking chromosome %s" % (chromosome), self.verbosity) | 116 for chromosome in self.chromosomes: |
| 106 parser = NCListFileUnpickle(self.splittedFileNames[chromosome], self.verbosity) | 117 parsersSets.append([self.splittedFileNames[fileName][chromosome] for fileName in self.splittedFileNames if chromosome in self.splittedFileNames[fileName]]) |
| 107 transcripts = [] | 118 for parsers in parsersSets: |
| 108 for newTranscript in parser.getIterator(): | 119 for parser in parsers: |
| 109 newTranscripts = [] | 120 iterator = parser.getIterator() |
| 110 if newTranscript.__class__.__name__ == "Mapping": | 121 for transcript in iterator: |
| 111 newTranscript = newTranscript.getTranscript() | 122 if transcript.__class__.__name__ == "Mapping": |
| 112 for oldTranscript in transcripts: | 123 transcript = transcript.getTranscript() |
| 113 if self._checkOverlap(newTranscript, oldTranscript): | 124 heappush(heap, (transcript.getChromosome(), transcript.getStart(), -transcript.getEnd(), transcript, iterator)) |
| 114 self._merge(newTranscript, oldTranscript) | 125 break |
| 115 elif self._checkPassed(newTranscript, oldTranscript): | 126 while heap: |
| 116 self._write(oldTranscript) | 127 chromosome, start, end, newTranscript, iterator = heappop(heap) |
| 117 else: | 128 for transcript in iterator: |
| 118 newTranscripts.append(oldTranscript) | 129 if transcript.__class__.__name__ == "Mapping": |
| 119 newTranscripts.append(newTranscript) | 130 transcript = transcript.getTranscript() |
| 120 transcripts = newTranscripts | 131 heappush(heap, (transcript.getChromosome(), transcript.getStart(), -transcript.getEnd(), transcript, iterator)) |
| 121 progress.inc() | 132 break |
| 122 for transcript in transcripts: | 133 newTranscripts = [] |
| 123 self._write(transcript) | 134 if newTranscript.__class__.__name__ == "Mapping": |
| 135 newTranscript = newTranscript.getTranscript() | |
| 136 for oldTranscript in transcripts: | |
| 137 if self._checkOverlap(newTranscript, oldTranscript): | |
| 138 self._merge(newTranscript, oldTranscript) | |
| 139 elif self._checkPassed(newTranscript, oldTranscript): | |
| 140 self._write(oldTranscript) | |
| 141 else: | |
| 142 newTranscripts.append(oldTranscript) | |
| 143 newTranscripts.append(newTranscript) | |
| 144 transcripts = newTranscripts | |
| 145 progress.inc() | |
| 146 for transcript in transcripts: | |
| 147 self._write(transcript) | |
| 124 progress.done() | 148 progress.done() |
| 125 | 149 |
| 126 def _merge(self, transcript1, transcript2): | 150 def _merge(self, transcript1, transcript2): |
| 127 self.nbMerges += 1 | 151 self.nbMerges += 1 |
| 128 transcript2.setDirection(transcript1.getDirection()) | 152 transcript2.setDirection(transcript1.getDirection()) |
| 133 self.writer.addTranscript(transcript) | 157 self.writer.addTranscript(transcript) |
| 134 | 158 |
| 135 def _checkOverlap(self, transcript1, transcript2): | 159 def _checkOverlap(self, transcript1, transcript2): |
| 136 if transcript1.getChromosome() != transcript2.getChromosome(): | 160 if transcript1.getChromosome() != transcript2.getChromosome(): |
| 137 return False | 161 return False |
| 138 if self.colinear and transcript1.getDirection() != transcript2.getDirection(): | 162 if self.collinear and transcript1.getDirection() != transcript2.getDirection(): |
| 139 return False | 163 return False |
| 140 if transcript1.getDistance(transcript2) > self.distance: | 164 if transcript1.getDistance(transcript2) > self.distance: |
| 141 return False | 165 return False |
| 142 return True | 166 return True |
| 143 | 167 |
| 144 def _checkPassed(self, transcript1, transcript2): | 168 def _checkPassed(self, transcript1, transcript2): |
| 145 return ((transcript1.getChromosome() != transcript2.getChromosome()) or (transcript1.getDistance(transcript2) > self.distance)) | 169 return ((transcript1.getChromosome() != transcript2.getChromosome()) or (transcript1.getDistance(transcript2) > self.distance)) |
| 146 | 170 |
| 147 def run(self): | 171 def run(self): |
| 148 self._sortFile() | 172 self._sortFiles() |
| 149 if self.presorted: | 173 self._iterate() |
| 150 self._iterate(None) | |
| 151 else: | |
| 152 for chromosome in sorted(self.splittedFileNames.keys()): | |
| 153 self._iterate(chromosome) | |
| 154 self.writer.close() | 174 self.writer.close() |
| 155 if self.verbosity > 0: | 175 if self.verbosity > 0: |
| 156 print "# input: %d" % (self.nbElements) | 176 print "# input: %d" % (self.nbElements) |
| 157 print "# written: %d (%d%% overlaps)" % (self.nbWritten, 0 if (self.nbElements == 0) else ((float(self.nbWritten) / self.nbElements) * 100)) | 177 print "# written: %d (%d%% overlaps)" % (self.nbWritten, 0 if (self.nbElements == 0) else ((float(self.nbWritten) / self.nbElements) * 100)) |
| 158 print "# merges: %d" % (self.nbMerges) | 178 print "# merges: %d" % (self.nbMerges) |
| 160 | 180 |
| 161 if __name__ == "__main__": | 181 if __name__ == "__main__": |
| 162 description = "Clusterize v1.0.3: clusterize the data which overlap. [Category: Merge]" | 182 description = "Clusterize v1.0.3: clusterize the data which overlap. [Category: Merge]" |
| 163 | 183 |
| 164 parser = OptionParser(description = description) | 184 parser = OptionParser(description = description) |
| 165 parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]") | 185 parser.add_option("-i", "--inputs", dest="inputFileNames", action="store", type="string", help="input files (separated by commas) [compulsory] [format: string]") |
| 166 parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [format: transcript file format]") | 186 parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [format: transcript file format]") |
| 167 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in transcript format given by -u]") | 187 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in transcript format given by -u]") |
| 168 parser.add_option("-u", "--outputFormat", dest="outputFormat", action="store", default="gff", type="string", help="output file format [format: transcript file format]") | 188 parser.add_option("-u", "--outputFormat", dest="outputFormat", action="store", default="gff", type="string", help="output file format [format: transcript file format]") |
| 169 parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="merge colinear transcripts only [format: bool] [default: false]") | 189 parser.add_option("-c", "--collinear", dest="collinear", action="store_true", default=False, help="merge collinear transcripts only [format: bool] [default: false]") |
| 170 parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="max. distance between two transcripts to be merged [format: int] [default: 0]") | 190 parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="max. distance between two transcripts to be merged [format: int] [default: 0]") |
| 171 parser.add_option("-n", "--normalize", dest="normalize", action="store_true", default=False, help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]") | 191 parser.add_option("-n", "--normalize", dest="normalize", action="store_true", default=False, help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]") |
| 172 parser.add_option("-s", "--sorted", dest="sorted", action="store_true", default=False, help="input is already sorted [format: bool] [default: false]") | 192 parser.add_option("-s", "--sorted", dest="sorted", action="store_true", default=False, help="input is already sorted [format: bool] [default: false]") |
| 173 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]") | 193 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]") |
| 174 (options, args) = parser.parse_args() | 194 (options, args) = parser.parse_args() |
| 175 | 195 |
| 176 c = Clusterize(options.verbosity) | 196 c = Clusterize(options.verbosity) |
| 177 c.setInputFile(options.inputFileName, options.format) | 197 c.setInputFiles(options.inputFileNames.split(","), options.format) |
| 178 c.setOutputFileName(options.outputFileName, options.outputFormat) | 198 c.setOutputFileName(options.outputFileName, options.outputFormat) |
| 179 c.setColinear(options.colinear) | 199 c.setColinear(options.collinear) |
| 180 c.setDistance(options.distance) | 200 c.setDistance(options.distance) |
| 181 c.setNormalize(options.normalize) | 201 c.setNormalize(options.normalize) |
| 182 c.setPresorted(options.sorted) | 202 c.setPresorted(options.sorted) |
| 183 c.run() | 203 c.run() |
