comparison SMART/Java/Python/clusterize.py @ 36:44d5973c188c

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 15:02:29 -0400
parents
children 169d364ddd91
comparison
equal deleted inserted replaced
35:d94018ca4ada 36:44d5973c188c
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2010
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31 from commons.core.writer.WriterChooser import WriterChooser
32 """Clusterize a set of transcripts"""
33
34 import os, os.path, random
35 from optparse import OptionParser
36 from commons.core.parsing.ParserChooser import ParserChooser
37 from commons.core.writer.Gff3Writer import Gff3Writer
38 from SMART.Java.Python.structure.Transcript import Transcript
39 from SMART.Java.Python.ncList.NCListFilePickle import NCListFileUnpickle
40 from SMART.Java.Python.ncList.FileSorter import FileSorter
41 from SMART.Java.Python.misc.Progress import Progress
42 from SMART.Java.Python.misc.UnlimitedProgress import UnlimitedProgress
43
44 class Clusterize(object):
45
46 def __init__(self, verbosity):
47 self.normalize = False
48 self.presorted = False
49 self.distance = 1
50 self.colinear = False
51 self.nbWritten = 0
52 self.nbMerges = 0
53 self.verbosity = verbosity
54 self.splittedFileNames = {}
55
56 def __del__(self):
57 for fileName in self.splittedFileNames.values():
58 os.remove(fileName)
59
60 def setInputFile(self, fileName, format):
61 parserChooser = ParserChooser(self.verbosity)
62 parserChooser.findFormat(format)
63 self.parser = parserChooser.getParser(fileName)
64 self.sortedFileName = "%s_sorted_%d.pkl" % (os.path.splitext(fileName)[0], random.randint(1, 100000))
65 if "SMARTTMPPATH" in os.environ:
66 self.sortedFileName = os.path.join(os.environ["SMARTTMPPATH"], os.path.basename(self.sortedFileName))
67
68 def setOutputFileName(self, fileName, format="gff3", title="S-MART", feature="transcript", featurePart="exon"):
69 writerChooser = WriterChooser()
70 writerChooser.findFormat(format)
71 self.writer = writerChooser.getWriter(fileName)
72 self.writer.setTitle(title)
73 self.writer.setFeature(feature)
74 self.writer.setFeaturePart(featurePart)
75
76 def setDistance(self, distance):
77 self.distance = distance
78
79 def setColinear(self, colinear):
80 self.colinear = colinear
81
82 def setNormalize(self, normalize):
83 self.normalize = normalize
84
85 def setPresorted(self, presorted):
86 self.presorted = presorted
87
88 def _sortFile(self):
89 if self.presorted:
90 return
91 fs = FileSorter(self.parser, self.verbosity-4)
92 fs.perChromosome(True)
93 fs.setPresorted(self.presorted)
94 fs.setOutputFileName(self.sortedFileName)
95 fs.sort()
96 self.splittedFileNames = fs.getOutputFileNames()
97 self.nbElementsPerChromosome = fs.getNbElementsPerChromosome()
98 self.nbElements = fs.getNbElements()
99
100 def _iterate(self, chromosome):
101 if chromosome == None:
102 progress = UnlimitedProgress(10000, "Reading input file", self.verbosity)
103 parser = self.parser
104 else:
105 progress = Progress(self.nbElementsPerChromosome[chromosome], "Checking chromosome %s" % (chromosome), self.verbosity)
106 parser = NCListFileUnpickle(self.splittedFileNames[chromosome], self.verbosity)
107 transcripts = []
108 self.nbElements = 0
109 for newTranscript in parser.getIterator():
110 newTranscripts = []
111 if newTranscript.__class__.__name__ == "Mapping":
112 newTranscript = newTranscript.getTranscript()
113 for oldTranscript in transcripts:
114 if self._checkOverlap(newTranscript, oldTranscript):
115 self._merge(newTranscript, oldTranscript)
116 elif self._checkPassed(newTranscript, oldTranscript):
117 self._write(oldTranscript)
118 else:
119 newTranscripts.append(oldTranscript)
120 newTranscripts.append(newTranscript)
121 transcripts = newTranscripts
122 self.nbElements += 1
123 progress.inc()
124 for transcript in transcripts:
125 self._write(transcript)
126 progress.done()
127
128 def _merge(self, transcript1, transcript2):
129 self.nbMerges += 1
130 transcript2.setDirection(transcript1.getDirection())
131 transcript1.merge(transcript2)
132
133 def _write(self, transcript):
134 self.nbWritten += 1
135 self.writer.addTranscript(transcript)
136
137 def _checkOverlap(self, transcript1, transcript2):
138 if transcript1.getChromosome() != transcript2.getChromosome():
139 return False
140 if self.colinear and transcript1.getDirection() != transcript2.getDirection():
141 return False
142 if transcript1.getDistance(transcript2) > self.distance:
143 return False
144 return True
145
146 def _checkPassed(self, transcript1, transcript2):
147 return ((transcript1.getChromosome() != transcript2.getChromosome()) or (transcript1.getDistance(transcript2) > self.distance))
148
149 def run(self):
150 self._sortFile()
151 if self.presorted:
152 self._iterate(None)
153 else:
154 for chromosome in sorted(self.splittedFileNames.keys()):
155 self._iterate(chromosome)
156 self.writer.close()
157 if self.verbosity > 0:
158 print "# input: %d" % (self.nbElements)
159 print "# written: %d (%d%% overlaps)" % (self.nbWritten, 0 if (self.nbElements == 0) else ((float(self.nbWritten) / self.nbElements) * 100))
160 print "# merges: %d" % (self.nbMerges)
161
162
163 if __name__ == "__main__":
164 description = "Clusterize v1.0.3: clusterize the data which overlap. [Category: Merge]"
165
166 parser = OptionParser(description = description)
167 parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
168 parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [format: transcript file format]")
169 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in transcript format given by -u]")
170 parser.add_option("-u", "--outputFormat", dest="outputFormat", action="store", default="gff", type="string", help="output file format [format: transcript file format]")
171 parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="merge colinear transcripts only [format: bool] [default: false]")
172 parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="max. distance between two transcripts to be merged [format: int] [default: 0]")
173 parser.add_option("-n", "--normalize", dest="normalize", action="store_true", default=False, help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]")
174 parser.add_option("-s", "--sorted", dest="sorted", action="store_true", default=False, help="input is already sorted [format: bool] [default: false]")
175 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]")
176 (options, args) = parser.parse_args()
177
178 c = Clusterize(options.verbosity)
179 c.setInputFile(options.inputFileName, options.format)
180 c.setOutputFileName(options.outputFileName, options.outputFormat)
181 c.setColinear(options.colinear)
182 c.setDistance(options.distance)
183 c.setNormalize(options.normalize)
184 c.setPresorted(options.sorted)
185 c.run()