comparison smart_toolShed/SMART/Java/Python/CollapseReads.py @ 0:e0f8dcca02ed

Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author yufei-luo
date Thu, 17 Jan 2013 10:52:14 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:e0f8dcca02ed
1 #! /usr/bin/env python
2 #
3 # Copyright INRA-URGI 2009-2010
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30 #
31 import os
32 from optparse import OptionParser, OptionGroup
33 from commons.core.parsing.ParserChooser import ParserChooser
34 from commons.core.writer.Gff3Writer import Gff3Writer
35 from SMART.Java.Python.structure.Transcript import Transcript
36 from SMART.Java.Python.ncList.NCListFilePickle import NCListFileUnpickle
37 from SMART.Java.Python.ncList.FileSorter import FileSorter
38 from SMART.Java.Python.misc.Progress import Progress
39
40
41 class CollapseReads(object):
42 """
43 Merge two reads if they have exactly the same genomic coordinates
44 """
45
46 def __init__(self, verbosity = 0):
47 self.verbosity = verbosity
48 self.inputReader = None
49 self.outputWriter = None
50 self.strands = True
51 self.nbRead = 0
52 self.nbWritten = 0
53 self.nbMerges = 0
54 self.splittedFileNames = {}
55
56 def __del__(self):
57 for fileName in self.splittedFileNames.values():
58 os.remove(fileName)
59
60 def close(self):
61 self.outputWriter.close()
62
63 def setInputFile(self, fileName, format):
64 parserChooser = ParserChooser(self.verbosity)
65 parserChooser.findFormat(format, "transcript")
66 self.parser = parserChooser.getParser(fileName)
67 self.sortedFileName = "%s_sorted.pkl" % (os.path.splitext(fileName)[0])
68
69 def setOutputFile(self, fileName):
70 self.outputWriter = Gff3Writer(fileName, self.verbosity)
71
72 def getNbElements(self):
73 return self.parser.getNbTranscripts()
74
75 def _sortFile(self):
76 fs = FileSorter(self.parser, self.verbosity-4)
77 fs.perChromosome(True)
78 fs.setOutputFileName(self.sortedFileName)
79 fs.sort()
80 self.splittedFileNames = fs.getOutputFileNames()
81 self.nbElementsPerChromosome = fs.getNbElementsPerChromosome()
82 self.nbRead = fs.getNbElements()
83
84 def _iterate(self, chromosome):
85 progress = Progress(self.nbElementsPerChromosome[chromosome], "Checking chromosome %s" % (chromosome), self.verbosity)
86 transcripts = []
87 parser = NCListFileUnpickle(self.splittedFileNames[chromosome], self.verbosity)
88 for newTranscript in parser.getIterator():
89 newTranscripts = []
90 for oldTranscript in transcripts:
91 if self._checkOverlap(newTranscript, oldTranscript):
92 self._merge(newTranscript, oldTranscript)
93 elif self._checkPassed(newTranscript, oldTranscript):
94 self._write(oldTranscript)
95 else:
96 newTranscripts.append(oldTranscript)
97 newTranscripts.append(newTranscript)
98 transcripts = newTranscripts
99 progress.inc()
100 for transcript in transcripts:
101 self._write(transcript)
102 progress.done()
103
104 def _merge(self, transcript1, transcript2):
105 self.nbMerges += 1
106 transcript2.setDirection(transcript1.getDirection())
107 transcript1.merge(transcript2)
108
109 def _write(self, transcript):
110 self.nbWritten += 1
111 self.outputWriter.addTranscript(transcript)
112
113 def _checkOverlap(self, transcript1, transcript2):
114 if transcript1.getStart() != transcript2.getStart() or transcript1.getEnd() != transcript2.getEnd():
115 return False
116 return (not self.strands or transcript1.getDirection() == transcript2.getDirection())
117
118 def _checkPassed(self, transcript1, transcript2):
119 return (transcript2.getStart() < transcript1.getStart())
120
121 def collapseChromosome(self, chromosome):
122 progress = Progress(table.getNbElements(), "Analysing chromosome %s" % (chromosome), self.verbosity)
123 command = "SELECT * FROM %s ORDER BY start ASC, end DESC" % (table.name)
124 transcriptStart = None
125 transcriptEnd = None
126 transcriptDirection = None
127 currentTranscript = None
128 if self.strands:
129 command += ", direction"
130 for index, transcript in table.selectTranscripts(command, True):
131 self.nbRead += 1
132 if not self.strands:
133 transcript.setDirection("+")
134 if transcriptStart != transcript.getStart() or transcriptEnd != transcript.getEnd() or transcriptDirection != transcript.getDirection():
135 self.writeTranscript(currentTranscript)
136 transcriptStart = transcript.getStart()
137 transcriptEnd = transcript.getEnd()
138 transcriptDirection = transcript.getDirection()
139 currentTranscript = transcript
140 else:
141 currentTranscript.setTagValue("nbElements", (currentTranscript.getTagValue("nbElements") + 1) if "nbElements" in currentTranscript.getTagNames() else 1)
142 progress.inc()
143 self.writeTranscript(currentTranscript)
144 progress.done()
145
146 def collapse(self):
147 self._sortFile()
148 for chromosome in sorted(self.nbElementsPerChromosome.keys()):
149 self._iterate(chromosome)
150 self.outputWriter.close()
151 if self.verbosity > 1:
152 print "# reads read: %d" % (self.nbRead)
153 print "# reads written: %d (%.2f%%)" % (self.nbWritten, float(self.nbWritten) / self.nbRead * 100)
154 print "# reads merges: %d" % (self.nbMerges)
155
156 if __name__ == "__main__":
157
158 # parse command line
159 description = "Collapse Reads v1.0.3: Merge two reads if they have exactly the same genomic coordinates. [Category: Merge]"
160
161 parser = OptionParser(description = description)
162 parser.add_option("-i", "--input", dest="inputFileName", action="store", type="string", help="input file [compulsory] [format: file in mapping format given by -f]")
163 parser.add_option("-f", "--format", dest="format", action="store", type="string", help="format of the file [compulsory] [format: mapping file format]")
164 parser.add_option("-o", "--output", dest="outputFileName", action="store", type="string", help="output file [compulsory] [format: output file in GFF3 format]")
165 parser.add_option("-s", "--strands", dest="strands", action="store_true", default=False, help="merge elements on 2 different strands [format: bool] [default: false]")
166 parser.add_option("-v", "--verbosity", dest="verbosity", action="store", default=1, type="int", help="trace level [default: 1] [format: int]")
167 (options, args) = parser.parse_args()
168
169 collapser = CollapseReads(options.verbosity)
170 collapser.setInputFile(options.inputFileName, options.format)
171 collapser.setOutputFile(options.outputFileName)
172 collapser.strands = not options.strands
173 collapser.collapse()
174 collapser.close()