Mercurial > repos > yufei-luo > s_mart
comparison SMART/Java/Python/clusterize.py @ 60:90f4b29d884f
Uploaded
author | m-zytnicki |
---|---|
date | Fri, 21 Feb 2014 08:32:36 -0500 |
parents | 169d364ddd91 |
children | f4de72c80eac |
comparison
equal
deleted
inserted
replaced
59:2a4884ba3e5c | 60:90f4b29d884f |
---|---|
31 from commons.core.writer.WriterChooser import WriterChooser | 31 from commons.core.writer.WriterChooser import WriterChooser |
32 """Clusterize a set of transcripts""" | 32 """Clusterize a set of transcripts""" |
33 | 33 |
34 import os, os.path, random | 34 import os, os.path, random |
35 from optparse import OptionParser | 35 from optparse import OptionParser |
36 from heapq import heappush, heappop | |
36 from commons.core.parsing.ParserChooser import ParserChooser | 37 from commons.core.parsing.ParserChooser import ParserChooser |
37 from commons.core.writer.Gff3Writer import Gff3Writer | 38 from commons.core.writer.Gff3Writer import Gff3Writer |
38 from SMART.Java.Python.structure.Transcript import Transcript | 39 from SMART.Java.Python.structure.Transcript import Transcript |
39 from SMART.Java.Python.ncList.NCListFilePickle import NCListFileUnpickle | 40 from SMART.Java.Python.ncList.NCListFilePickle import NCListFileUnpickle |
40 from SMART.Java.Python.ncList.FileSorter import FileSorter | 41 from SMART.Java.Python.ncList.FileSorter import FileSorter |
42 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress | 43 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress |
43 | 44 |
44 class Clusterize(object): | 45 class Clusterize(object): |
45 | 46 |
46 def __init__(self, verbosity): | 47 def __init__(self, verbosity): |
47 self.normalize = False | 48 self.parsers = {} |
48 self.presorted = False | 49 self.sortedFileNames = {} |
49 self.distance = 1 | 50 self.normalize = False |
50 self.colinear = False | 51 self.presorted = False |
51 self.nbWritten = 0 | 52 self.distance = 1 |
52 self.nbMerges = 0 | 53 self.collinear = False |
53 self.verbosity = verbosity | 54 self.nbWritten = 0 |
55 self.nbMerges = 0 | |
56 self.verbosity = verbosity | |
54 self.splittedFileNames = {} | 57 self.splittedFileNames = {} |
58 self.chromosomes = set() | |
55 | 59 |
56 def __del__(self): | 60 def __del__(self): |
57 for fileName in self.splittedFileNames.values(): | 61 for fileName1 in self.splittedFileNames: |
58 os.remove(fileName) | 62 for fileName2 in self.splittedFileNames[fileName1].values(): |
59 | 63 os.remove(fileName2) |
60 def setInputFile(self, fileName, format): | 64 |
65 def setInputFiles(self, fileNames, format): | |
61 parserChooser = ParserChooser(self.verbosity) | 66 parserChooser = ParserChooser(self.verbosity) |
62 parserChooser.findFormat(format) | 67 parserChooser.findFormat(format) |
63 self.parser = parserChooser.getParser(fileName) | 68 for fileName in fileNames: |
64 self.sortedFileName = "%s_sorted_%d.pkl" % (os.path.splitext(fileName)[0], random.randint(1, 100000)) | 69 self.parsers[fileName] = parserChooser.getParser(fileName) |
65 if "SMARTTMPPATH" in os.environ: | 70 self.sortedFileNames[fileName] = "%s_sorted_%d.pkl" % (os.path.splitext(fileName)[0], random.randint(1, 100000)) |
66 self.sortedFileName = os.path.join(os.environ["SMARTTMPPATH"], os.path.basename(self.sortedFileName)) | 71 if "SMARTTMPPATH" in os.environ: |
72 self.sortedFileNames[fileName] = os.path.join(os.environ["SMARTTMPPATH"], os.path.basename(self.sortedFileNames[fileName])) | |
67 | 73 |
68 def setOutputFileName(self, fileName, format="gff3", title="S-MART", feature="transcript", featurePart="exon"): | 74 def setOutputFileName(self, fileName, format="gff3", title="S-MART", feature="transcript", featurePart="exon"): |
69 writerChooser = WriterChooser() | 75 writerChooser = WriterChooser() |
70 writerChooser.findFormat(format) | 76 writerChooser.findFormat(format) |
71 self.writer = writerChooser.getWriter(fileName) | 77 self.writer = writerChooser.getWriter(fileName) |
74 self.writer.setFeaturePart(featurePart) | 80 self.writer.setFeaturePart(featurePart) |
75 | 81 |
76 def setDistance(self, distance): | 82 def setDistance(self, distance): |
77 self.distance = distance | 83 self.distance = distance |
78 | 84 |
79 def setColinear(self, colinear): | 85 def setColinear(self, collinear): |
80 self.colinear = colinear | 86 self.collinear = collinear |
81 | 87 |
82 def setNormalize(self, normalize): | 88 def setNormalize(self, normalize): |
83 self.normalize = normalize | 89 self.normalize = normalize |
84 | 90 |
85 def setPresorted(self, presorted): | 91 def setPresorted(self, presorted): |
86 self.presorted = presorted | 92 self.presorted = presorted |
87 | 93 |
88 def _sortFile(self): | 94 def _sortFiles(self): |
89 if self.presorted: | 95 if self.presorted: |
90 return | 96 return |
91 fs = FileSorter(self.parser, self.verbosity-4) | 97 for fileName, parser in self.parsers.iteritems(): |
92 fs.perChromosome(True) | 98 fs = FileSorter(parser, self.verbosity-4) |
93 fs.setPresorted(self.presorted) | 99 fs.perChromosome(True) |
94 fs.setOutputFileName(self.sortedFileName) | 100 fs.setPresorted(self.presorted) |
95 fs.sort() | 101 fs.setOutputFileName(self.sortedFileNames[fileName]) |
96 self.splittedFileNames = fs.getOutputFileNames() | 102 fs.sort() |
97 self.nbElementsPerChromosome = fs.getNbElementsPerChromosome() | 103 self.splittedFileNames[fileName] = fs.getOutputFileNames() |
98 self.nbElements = fs.getNbElements() | 104 self.nbElementsPerChromosome = fs.getNbElementsPerChromosome() |
105 self.nbElements = fs.getNbElements() | |
106 self.chromosomes.update(self.splittedFileNames[fileName].keys()) | |
99 | 107 |
100 def _iterate(self, chromosome): | 108 def _iterate(self): |
101 if chromosome == None: | 109 progress = UnlimitedProgress(10000, "Reading input file", self.verbosity) |
102 progress = UnlimitedProgress(10000, "Reading input file", self.verbosity) | 110 transcripts = [] |
103 parser = self.parser | 111 heap = [] |
112 parsersSets = [] | |
113 if self.chromosomes: | |
114 parsersSets.append(self.parsers.values()) | |
104 else: | 115 else: |
105 progress = Progress(self.nbElementsPerChromosome[chromosome], "Checking chromosome %s" % (chromosome), self.verbosity) | 116 for chromosome in self.chromosomes: |
106 parser = NCListFileUnpickle(self.splittedFileNames[chromosome], self.verbosity) | 117 parsersSets.append([self.splittedFileNames[fileName][chromosome] for fileName in self.splittedFileNames if chromosome in self.splittedFileNames[fileName]]) |
107 transcripts = [] | 118 for parsers in parsersSets: |
108 for newTranscript in parser.getIterator(): | 119 for parser in parsers: |
109 newTranscripts = [] | 120 iterator = parser.getIterator() |
110 if newTranscript.__class__.__name__ == "Mapping": | 121 for transcript in iterator: |
111 newTranscript = newTranscript.getTranscript() | 122 if transcript.__class__.__name__ == "Mapping": |
112 for oldTranscript in transcripts: | 123 transcript = transcript.getTranscript() |
113 if self._checkOverlap(newTranscript, oldTranscript): | 124 heappush(heap, (transcript.getChromosome(), transcript.getStart(), -transcript.getEnd(), transcript, iterator)) |
114 self._merge(newTranscript, oldTranscript) | 125 break |
115 elif self._checkPassed(newTranscript, oldTranscript): | 126 while heap: |
116 self._write(oldTranscript) | 127 chromosome, start, end, newTranscript, iterator = heappop(heap) |
117 else: | 128 for transcript in iterator: |
118 newTranscripts.append(oldTranscript) | 129 if transcript.__class__.__name__ == "Mapping": |
119 newTranscripts.append(newTranscript) | 130 transcript = transcript.getTranscript() |
120 transcripts = newTranscripts | 131 heappush(heap, (transcript.getChromosome(), transcript.getStart(), -transcript.getEnd(), transcript, iterator)) |
121 progress.inc() | 132 break |
122 for transcript in transcripts: | 133 newTranscripts = [] |
123 self._write(transcript) | 134 if newTranscript.__class__.__name__ == "Mapping": |
135 newTranscript = newTranscript.getTranscript() | |
136 for oldTranscript in transcripts: | |
137 if self._checkOverlap(newTranscript, oldTranscript): | |
138 self._merge(newTranscript, oldTranscript) | |
139 elif self._checkPassed(newTranscript, oldTranscript): | |
140 self._write(oldTranscript) | |
141 else: | |
142 newTranscripts.append(oldTranscript) | |
143 newTranscripts.append(newTranscript) | |
144 transcripts = newTranscripts | |
145 progress.inc() | |
146 for transcript in transcripts: | |
147 self._write(transcript) | |
124 progress.done() | 148 progress.done() |
125 | 149 |
126 def _merge(self, transcript1, transcript2): | 150 def _merge(self, transcript1, transcript2): |
127 self.nbMerges += 1 | 151 self.nbMerges += 1 |
128 transcript2.setDirection(transcript1.getDirection()) | 152 transcript2.setDirection(transcript1.getDirection()) |
133 self.writer.addTranscript(transcript) | 157 self.writer.addTranscript(transcript) |
134 | 158 |
135 def _checkOverlap(self, transcript1, transcript2): | 159 def _checkOverlap(self, transcript1, transcript2): |
136 if transcript1.getChromosome() != transcript2.getChromosome(): | 160 if transcript1.getChromosome() != transcript2.getChromosome(): |
137 return False | 161 return False |
138 if self.colinear and transcript1.getDirection() != transcript2.getDirection(): | 162 if self.collinear and transcript1.getDirection() != transcript2.getDirection(): |
139 return False | 163 return False |
140 if transcript1.getDistance(transcript2) > self.distance: | 164 if transcript1.getDistance(transcript2) > self.distance: |
141 return False | 165 return False |
142 return True | 166 return True |
143 | 167 |
144 def _checkPassed(self, transcript1, transcript2): | 168 def _checkPassed(self, transcript1, transcript2): |
145 return ((transcript1.getChromosome() != transcript2.getChromosome()) or (transcript1.getDistance(transcript2) > self.distance)) | 169 return ((transcript1.getChromosome() != transcript2.getChromosome()) or (transcript1.getDistance(transcript2) > self.distance)) |
146 | 170 |
147 def run(self): | 171 def run(self): |
148 self._sortFile() | 172 self._sortFiles() |
149 if self.presorted: | 173 self._iterate() |
150 self._iterate(None) | |
151 else: | |
152 for chromosome in sorted(self.splittedFileNames.keys()): | |
153 self._iterate(chromosome) | |
154 self.writer.close() | 174 self.writer.close() |
155 if self.verbosity > 0: | 175 if self.verbosity > 0: |
156 print "# input: %d" % (self.nbElements) | 176 print "# input: %d" % (self.nbElements) |
157 print "# written: %d (%d%% overlaps)" % (self.nbWritten, 0 if (self.nbElements == 0) else ((float(self.nbWritten) / self.nbElements) * 100)) | 177 print "# written: %d (%d%% overlaps)" % (self.nbWritten, 0 if (self.nbElements == 0) else ((float(self.nbWritten) / self.nbElements) * 100)) |
158 print "# merges: %d" % (self.nbMerges) | 178 print "# merges: %d" % (self.nbMerges) |
160 | 180 |
161 if __name__ == "__main__": | 181 if __name__ == "__main__": |
162 description = "Clusterize v1.0.3: clusterize the data which overlap. [Category: Merge]" | 182 description = "Clusterize v1.0.3: clusterize the data which overlap. [Category: Merge]" |
163 | 183 |
164 parser = OptionParser(description = description) | 184 parser = OptionParser(description = description) |
165 parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]") | 185 parser.add_option("-i", "--inputs", dest="inputFileNames", action="store", type="string", help="input files (separated by commas) [compulsory] [format: string]") |
166 parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [format: transcript file format]") | 186 parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [format: transcript file format]") |
167 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in transcript format given by -u]") | 187 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in transcript format given by -u]") |
168 parser.add_option("-u", "--outputFormat", dest="outputFormat", action="store", default="gff", type="string", help="output file format [format: transcript file format]") | 188 parser.add_option("-u", "--outputFormat", dest="outputFormat", action="store", default="gff", type="string", help="output file format [format: transcript file format]") |
169 parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="merge colinear transcripts only [format: bool] [default: false]") | 189 parser.add_option("-c", "--collinear", dest="collinear", action="store_true", default=False, help="merge collinear transcripts only [format: bool] [default: false]") |
170 parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="max. distance between two transcripts to be merged [format: int] [default: 0]") | 190 parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="max. distance between two transcripts to be merged [format: int] [default: 0]") |
171 parser.add_option("-n", "--normalize", dest="normalize", action="store_true", default=False, help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]") | 191 parser.add_option("-n", "--normalize", dest="normalize", action="store_true", default=False, help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]") |
172 parser.add_option("-s", "--sorted", dest="sorted", action="store_true", default=False, help="input is already sorted [format: bool] [default: false]") | 192 parser.add_option("-s", "--sorted", dest="sorted", action="store_true", default=False, help="input is already sorted [format: bool] [default: false]") |
173 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]") | 193 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]") |
174 (options, args) = parser.parse_args() | 194 (options, args) = parser.parse_args() |
175 | 195 |
176 c = Clusterize(options.verbosity) | 196 c = Clusterize(options.verbosity) |
177 c.setInputFile(options.inputFileName, options.format) | 197 c.setInputFiles(options.inputFileNames.split(","), options.format) |
178 c.setOutputFileName(options.outputFileName, options.outputFormat) | 198 c.setOutputFileName(options.outputFileName, options.outputFormat) |
179 c.setColinear(options.colinear) | 199 c.setColinear(options.collinear) |
180 c.setDistance(options.distance) | 200 c.setDistance(options.distance) |
181 c.setNormalize(options.normalize) | 201 c.setNormalize(options.normalize) |
182 c.setPresorted(options.sorted) | 202 c.setPresorted(options.sorted) |
183 c.run() | 203 c.run() |