comparison SMART/Java/Python/clusterize.py @ 60:90f4b29d884f

Uploaded
author m-zytnicki
date Fri, 21 Feb 2014 08:32:36 -0500
parents 169d364ddd91
children f4de72c80eac
comparison
equal deleted inserted replaced
59:2a4884ba3e5c 60:90f4b29d884f
31 from commons.core.writer.WriterChooser import WriterChooser 31 from commons.core.writer.WriterChooser import WriterChooser
32 """Clusterize a set of transcripts""" 32 """Clusterize a set of transcripts"""
33 33
34 import os, os.path, random 34 import os, os.path, random
35 from optparse import OptionParser 35 from optparse import OptionParser
36 from heapq import heappush, heappop
36 from commons.core.parsing.ParserChooser import ParserChooser 37 from commons.core.parsing.ParserChooser import ParserChooser
37 from commons.core.writer.Gff3Writer import Gff3Writer 38 from commons.core.writer.Gff3Writer import Gff3Writer
38 from SMART.Java.Python.structure.Transcript import Transcript 39 from SMART.Java.Python.structure.Transcript import Transcript
39 from SMART.Java.Python.ncList.NCListFilePickle import NCListFileUnpickle 40 from SMART.Java.Python.ncList.NCListFilePickle import NCListFileUnpickle
40 from SMART.Java.Python.ncList.FileSorter import FileSorter 41 from SMART.Java.Python.ncList.FileSorter import FileSorter
42 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress 43 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
43 44
44 class Clusterize(object): 45 class Clusterize(object):
45 46
46 def __init__(self, verbosity): 47 def __init__(self, verbosity):
47 self.normalize = False 48 self.parsers = {}
48 self.presorted = False 49 self.sortedFileNames = {}
49 self.distance = 1 50 self.normalize = False
50 self.colinear = False 51 self.presorted = False
51 self.nbWritten = 0 52 self.distance = 1
52 self.nbMerges = 0 53 self.collinear = False
53 self.verbosity = verbosity 54 self.nbWritten = 0
55 self.nbMerges = 0
56 self.verbosity = verbosity
54 self.splittedFileNames = {} 57 self.splittedFileNames = {}
58 self.chromosomes = set()
55 59
56 def __del__(self): 60 def __del__(self):
57 for fileName in self.splittedFileNames.values(): 61 for fileName1 in self.splittedFileNames:
58 os.remove(fileName) 62 for fileName2 in self.splittedFileNames[fileName1].values():
59 63 os.remove(fileName2)
60 def setInputFile(self, fileName, format): 64
65 def setInputFiles(self, fileNames, format):
61 parserChooser = ParserChooser(self.verbosity) 66 parserChooser = ParserChooser(self.verbosity)
62 parserChooser.findFormat(format) 67 parserChooser.findFormat(format)
63 self.parser = parserChooser.getParser(fileName) 68 for fileName in fileNames:
64 self.sortedFileName = "%s_sorted_%d.pkl" % (os.path.splitext(fileName)[0], random.randint(1, 100000)) 69 self.parsers[fileName] = parserChooser.getParser(fileName)
65 if "SMARTTMPPATH" in os.environ: 70 self.sortedFileNames[fileName] = "%s_sorted_%d.pkl" % (os.path.splitext(fileName)[0], random.randint(1, 100000))
66 self.sortedFileName = os.path.join(os.environ["SMARTTMPPATH"], os.path.basename(self.sortedFileName)) 71 if "SMARTTMPPATH" in os.environ:
72 self.sortedFileNames[fileName] = os.path.join(os.environ["SMARTTMPPATH"], os.path.basename(self.sortedFileNames[fileName]))
67 73
68 def setOutputFileName(self, fileName, format="gff3", title="S-MART", feature="transcript", featurePart="exon"): 74 def setOutputFileName(self, fileName, format="gff3", title="S-MART", feature="transcript", featurePart="exon"):
69 writerChooser = WriterChooser() 75 writerChooser = WriterChooser()
70 writerChooser.findFormat(format) 76 writerChooser.findFormat(format)
71 self.writer = writerChooser.getWriter(fileName) 77 self.writer = writerChooser.getWriter(fileName)
74 self.writer.setFeaturePart(featurePart) 80 self.writer.setFeaturePart(featurePart)
75 81
76 def setDistance(self, distance): 82 def setDistance(self, distance):
77 self.distance = distance 83 self.distance = distance
78 84
79 def setColinear(self, colinear): 85 def setColinear(self, collinear):
80 self.colinear = colinear 86 self.collinear = collinear
81 87
82 def setNormalize(self, normalize): 88 def setNormalize(self, normalize):
83 self.normalize = normalize 89 self.normalize = normalize
84 90
85 def setPresorted(self, presorted): 91 def setPresorted(self, presorted):
86 self.presorted = presorted 92 self.presorted = presorted
87 93
88 def _sortFile(self): 94 def _sortFiles(self):
89 if self.presorted: 95 if self.presorted:
90 return 96 return
91 fs = FileSorter(self.parser, self.verbosity-4) 97 for fileName, parser in self.parsers.iteritems():
92 fs.perChromosome(True) 98 fs = FileSorter(parser, self.verbosity-4)
93 fs.setPresorted(self.presorted) 99 fs.perChromosome(True)
94 fs.setOutputFileName(self.sortedFileName) 100 fs.setPresorted(self.presorted)
95 fs.sort() 101 fs.setOutputFileName(self.sortedFileNames[fileName])
96 self.splittedFileNames = fs.getOutputFileNames() 102 fs.sort()
97 self.nbElementsPerChromosome = fs.getNbElementsPerChromosome() 103 self.splittedFileNames[fileName] = fs.getOutputFileNames()
98 self.nbElements = fs.getNbElements() 104 self.nbElementsPerChromosome = fs.getNbElementsPerChromosome()
105 self.nbElements = fs.getNbElements()
106 self.chromosomes.update(self.splittedFileNames[fileName].keys())
99 107
100 def _iterate(self, chromosome): 108 def _iterate(self):
101 if chromosome == None: 109 progress = UnlimitedProgress(10000, "Reading input file", self.verbosity)
102 progress = UnlimitedProgress(10000, "Reading input file", self.verbosity) 110 transcripts = []
103 parser = self.parser 111 heap = []
112 parsersSets = []
113 if self.chromosomes:
114 parsersSets.append(self.parsers.values())
104 else: 115 else:
105 progress = Progress(self.nbElementsPerChromosome[chromosome], "Checking chromosome %s" % (chromosome), self.verbosity) 116 for chromosome in self.chromosomes:
106 parser = NCListFileUnpickle(self.splittedFileNames[chromosome], self.verbosity) 117 parsersSets.append([self.splittedFileNames[fileName][chromosome] for fileName in self.splittedFileNames if chromosome in self.splittedFileNames[fileName]])
107 transcripts = [] 118 for parsers in parsersSets:
108 for newTranscript in parser.getIterator(): 119 for parser in parsers:
109 newTranscripts = [] 120 iterator = parser.getIterator()
110 if newTranscript.__class__.__name__ == "Mapping": 121 for transcript in iterator:
111 newTranscript = newTranscript.getTranscript() 122 if transcript.__class__.__name__ == "Mapping":
112 for oldTranscript in transcripts: 123 transcript = transcript.getTranscript()
113 if self._checkOverlap(newTranscript, oldTranscript): 124 heappush(heap, (transcript.getChromosome(), transcript.getStart(), -transcript.getEnd(), transcript, iterator))
114 self._merge(newTranscript, oldTranscript) 125 break
115 elif self._checkPassed(newTranscript, oldTranscript): 126 while heap:
116 self._write(oldTranscript) 127 chromosome, start, end, newTranscript, iterator = heappop(heap)
117 else: 128 for transcript in iterator:
118 newTranscripts.append(oldTranscript) 129 if transcript.__class__.__name__ == "Mapping":
119 newTranscripts.append(newTranscript) 130 transcript = transcript.getTranscript()
120 transcripts = newTranscripts 131 heappush(heap, (transcript.getChromosome(), transcript.getStart(), -transcript.getEnd(), transcript, iterator))
121 progress.inc() 132 break
122 for transcript in transcripts: 133 newTranscripts = []
123 self._write(transcript) 134 if newTranscript.__class__.__name__ == "Mapping":
135 newTranscript = newTranscript.getTranscript()
136 for oldTranscript in transcripts:
137 if self._checkOverlap(newTranscript, oldTranscript):
138 self._merge(newTranscript, oldTranscript)
139 elif self._checkPassed(newTranscript, oldTranscript):
140 self._write(oldTranscript)
141 else:
142 newTranscripts.append(oldTranscript)
143 newTranscripts.append(newTranscript)
144 transcripts = newTranscripts
145 progress.inc()
146 for transcript in transcripts:
147 self._write(transcript)
124 progress.done() 148 progress.done()
125 149
126 def _merge(self, transcript1, transcript2): 150 def _merge(self, transcript1, transcript2):
127 self.nbMerges += 1 151 self.nbMerges += 1
128 transcript2.setDirection(transcript1.getDirection()) 152 transcript2.setDirection(transcript1.getDirection())
133 self.writer.addTranscript(transcript) 157 self.writer.addTranscript(transcript)
134 158
135 def _checkOverlap(self, transcript1, transcript2): 159 def _checkOverlap(self, transcript1, transcript2):
136 if transcript1.getChromosome() != transcript2.getChromosome(): 160 if transcript1.getChromosome() != transcript2.getChromosome():
137 return False 161 return False
138 if self.colinear and transcript1.getDirection() != transcript2.getDirection(): 162 if self.collinear and transcript1.getDirection() != transcript2.getDirection():
139 return False 163 return False
140 if transcript1.getDistance(transcript2) > self.distance: 164 if transcript1.getDistance(transcript2) > self.distance:
141 return False 165 return False
142 return True 166 return True
143 167
144 def _checkPassed(self, transcript1, transcript2): 168 def _checkPassed(self, transcript1, transcript2):
145 return ((transcript1.getChromosome() != transcript2.getChromosome()) or (transcript1.getDistance(transcript2) > self.distance)) 169 return ((transcript1.getChromosome() != transcript2.getChromosome()) or (transcript1.getDistance(transcript2) > self.distance))
146 170
147 def run(self): 171 def run(self):
148 self._sortFile() 172 self._sortFiles()
149 if self.presorted: 173 self._iterate()
150 self._iterate(None)
151 else:
152 for chromosome in sorted(self.splittedFileNames.keys()):
153 self._iterate(chromosome)
154 self.writer.close() 174 self.writer.close()
155 if self.verbosity > 0: 175 if self.verbosity > 0:
156 print "# input: %d" % (self.nbElements) 176 print "# input: %d" % (self.nbElements)
157 print "# written: %d (%d%% overlaps)" % (self.nbWritten, 0 if (self.nbElements == 0) else ((float(self.nbWritten) / self.nbElements) * 100)) 177 print "# written: %d (%d%% overlaps)" % (self.nbWritten, 0 if (self.nbElements == 0) else ((float(self.nbWritten) / self.nbElements) * 100))
158 print "# merges: %d" % (self.nbMerges) 178 print "# merges: %d" % (self.nbMerges)
160 180
161 if __name__ == "__main__": 181 if __name__ == "__main__":
162 description = "Clusterize v1.0.3: clusterize the data which overlap. [Category: Merge]" 182 description = "Clusterize v1.0.3: clusterize the data which overlap. [Category: Merge]"
163 183
164 parser = OptionParser(description = description) 184 parser = OptionParser(description = description)
165 parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]") 185 parser.add_option("-i", "--inputs", dest="inputFileNames", action="store", type="string", help="input files (separated by commas) [compulsory] [format: string]")
166 parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [format: transcript file format]") 186 parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [format: transcript file format]")
167 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in transcript format given by -u]") 187 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in transcript format given by -u]")
168 parser.add_option("-u", "--outputFormat", dest="outputFormat", action="store", default="gff", type="string", help="output file format [format: transcript file format]") 188 parser.add_option("-u", "--outputFormat", dest="outputFormat", action="store", default="gff", type="string", help="output file format [format: transcript file format]")
169 parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="merge colinear transcripts only [format: bool] [default: false]") 189 parser.add_option("-c", "--collinear", dest="collinear", action="store_true", default=False, help="merge collinear transcripts only [format: bool] [default: false]")
170 parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="max. distance between two transcripts to be merged [format: int] [default: 0]") 190 parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="max. distance between two transcripts to be merged [format: int] [default: 0]")
171 parser.add_option("-n", "--normalize", dest="normalize", action="store_true", default=False, help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]") 191 parser.add_option("-n", "--normalize", dest="normalize", action="store_true", default=False, help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]")
172 parser.add_option("-s", "--sorted", dest="sorted", action="store_true", default=False, help="input is already sorted [format: bool] [default: false]") 192 parser.add_option("-s", "--sorted", dest="sorted", action="store_true", default=False, help="input is already sorted [format: bool] [default: false]")
173 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]") 193 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]")
174 (options, args) = parser.parse_args() 194 (options, args) = parser.parse_args()
175 195
176 c = Clusterize(options.verbosity) 196 c = Clusterize(options.verbosity)
177 c.setInputFile(options.inputFileName, options.format) 197 c.setInputFiles(options.inputFileNames.split(","), options.format)
178 c.setOutputFileName(options.outputFileName, options.outputFormat) 198 c.setOutputFileName(options.outputFileName, options.outputFormat)
179 c.setColinear(options.colinear) 199 c.setColinear(options.collinear)
180 c.setDistance(options.distance) 200 c.setDistance(options.distance)
181 c.setNormalize(options.normalize) 201 c.setNormalize(options.normalize)
182 c.setPresorted(options.sorted) 202 c.setPresorted(options.sorted)
183 c.run() 203 c.run()