comparison smart_toolShed/SMART/Java/Python/clusterize.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e0f8dcca02ed
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2010
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31 from commons.core.writer.WriterChooser import WriterChooser
32 """Clusterize a set of transcripts"""
33
34 import os
35 from optparse import OptionParser
36 from commons.core.parsing.ParserChooser import ParserChooser
37 from commons.core.writer.Gff3Writer import Gff3Writer
38 from SMART.Java.Python.structure.Transcript import Transcript
39 from SMART.Java.Python.ncList.NCListFilePickle import NCListFileUnpickle
40 from SMART.Java.Python.ncList.FileSorter import FileSorter
41 from SMART.Java.Python.misc.Progress import Progress
42
43 class Clusterize(object):
44
45 def __init__(self, verbosity):
46 self.normalize = False
47 self.presorted = False
48 self.distance = 1
49 self.colinear = False
50 self.nbWritten = 0
51 self.nbMerges = 0
52 self.verbosity = verbosity
53 self.splittedFileNames = {}
54
55 def __del__(self):
56 for fileName in self.splittedFileNames.values():
57 os.remove(fileName)
58
59 def setInputFile(self, fileName, format):
60 parserChooser = ParserChooser(self.verbosity)
61 parserChooser.findFormat(format)
62 self.parser = parserChooser.getParser(fileName)
63 self.sortedFileName = "%s_sorted.pkl" % (os.path.splitext(fileName)[0])
64
65 def setOutputFileName(self, fileName, format="gff3", title="S-MART", feature="transcript", featurePart="exon"):
66 writerChooser = WriterChooser()
67 writerChooser.findFormat(format)
68 self.writer = writerChooser.getWriter(fileName)
69 self.writer.setTitle(title)
70 self.writer.setFeature(feature)
71 self.writer.setFeaturePart(featurePart)
72
73 def setDistance(self, distance):
74 self.distance = distance
75
76 def setColinear(self, colinear):
77 self.colinear = colinear
78
79 def setNormalize(self, normalize):
80 self.normalize = normalize
81
82 def setPresorted(self, presorted):
83 self.presorted = presorted
84
85 def _sortFile(self):
86 fs = FileSorter(self.parser, self.verbosity-4)
87 fs.perChromosome(True)
88 fs.setPresorted(self.presorted)
89 fs.setOutputFileName(self.sortedFileName)
90 fs.sort()
91 self.splittedFileNames = fs.getOutputFileNames()
92 self.nbElementsPerChromosome = fs.getNbElementsPerChromosome()
93 self.nbElements = fs.getNbElements()
94
95 def _iterate(self, chromosome):
96 progress = Progress(self.nbElementsPerChromosome[chromosome], "Checking chromosome %s" % (chromosome), self.verbosity)
97 transcripts = []
98 parser = NCListFileUnpickle(self.splittedFileNames[chromosome], self.verbosity)
99 for newTranscript in parser.getIterator():
100 newTranscripts = []
101 for oldTranscript in transcripts:
102 if self._checkOverlap(newTranscript, oldTranscript):
103 self._merge(newTranscript, oldTranscript)
104 elif self._checkPassed(newTranscript, oldTranscript):
105 self._write(oldTranscript)
106 else:
107 newTranscripts.append(oldTranscript)
108 newTranscripts.append(newTranscript)
109 transcripts = newTranscripts
110 progress.inc()
111 for transcript in transcripts:
112 self._write(transcript)
113 progress.done()
114
115 def _merge(self, transcript1, transcript2):
116 self.nbMerges += 1
117 transcript2.setDirection(transcript1.getDirection())
118 transcript1.merge(transcript2)
119
120 def _write(self, transcript):
121 self.nbWritten += 1
122 self.writer.addTranscript(transcript)
123
124 def _checkOverlap(self, transcript1, transcript2):
125 if self.colinear and transcript1.getDirection() != transcript2.getDirection():
126 return False
127 if transcript1.getDistance(transcript2) > self.distance:
128 return False
129 return True
130
131 def _checkPassed(self, transcript1, transcript2):
132 return (transcript1.getDistance(transcript2) > self.distance)
133
134 def run(self):
135 self._sortFile()
136 for chromosome in sorted(self.splittedFileNames.keys()):
137 self._iterate(chromosome)
138 self.writer.close()
139 if self.verbosity > 0:
140 print "# input: %d" % (self.nbElements)
141 print "# written: %d (%d%% overlaps)" % (self.nbWritten, 0 if (self.nbElements == 0) else ((float(self.nbWritten) / self.nbElements) * 100))
142 print "# merges: %d" % (self.nbMerges)
143
144
145 if __name__ == "__main__":
146 description = "Clusterize v1.0.3: clusterize the data which overlap. [Category: Merge]"
147
148 parser = OptionParser(description = description)
149 parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in transcript format given by -f]")
150 parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of file [format: transcript file format]")
151 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in transcript format given by -u]")
152 parser.add_option("-u", "--outputFormat", dest="outputFormat", action="store", default="gff", type="string", help="output file format [format: transcript file format]")
153 parser.add_option("-c", "--colinear", dest="colinear", action="store_true", default=False, help="merge colinear transcripts only [format: bool] [default: false]")
154 parser.add_option("-d", "--distance", dest="distance", action="store", default=0, type="int", help="max. distance between two transcripts to be merged [format: int] [default: 0]")
155 parser.add_option("-n", "--normalize", dest="normalize", action="store_true", default=False, help="normalize the number of reads per cluster by the number of mappings per read [format: bool] [default: false]")
156 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [format: int] [default: 1]")
157 (options, args) = parser.parse_args()
158
159 c = Clusterize(options.verbosity)
160 c.setInputFile(options.inputFileName, options.format)
161 c.setOutputFileName(options.outputFileName, options.outputFormat)
162 c.setColinear(options.colinear)
163 c.setDistance(options.distance)
164 c.setNormalize(options.normalize)
165 c.run()