Mercurial > repos > yufei-luo > s_mart
comparison SMART/Java/Python/toolLauncher/RnaFoldLauncher.py @ 38:2c0c0a89fad7
Uploaded
author | m-zytnicki |
---|---|
date | Thu, 02 May 2013 09:56:47 -0400 |
parents | 769e306b7933 |
children |
comparison
equal
deleted
inserted
replaced
37:d22fadc825e3 | 38:2c0c0a89fad7 |
---|---|
1 # | |
2 # Copyright INRA-URGI 2009-2010 | |
3 # | |
4 # This software is governed by the CeCILL license under French law and | |
5 # abiding by the rules of distribution of free software. You can use, | |
6 # modify and/ or redistribute the software under the terms of the CeCILL | |
7 # license as circulated by CEA, CNRS and INRIA at the following URL | |
8 # "http://www.cecill.info". | |
9 # | |
10 # As a counterpart to the access to the source code and rights to copy, | |
11 # modify and redistribute granted by the license, users are provided only | |
12 # with a limited warranty and the software's author, the holder of the | |
13 # economic rights, and the successive licensors have only limited | |
14 # liability. | |
15 # | |
16 # In this respect, the user's attention is drawn to the risks associated | |
17 # with loading, using, modifying and/or developing or reproducing the | |
18 # software by the user in light of its specific status of free software, | |
19 # that may mean that it is complicated to manipulate, and that also | |
20 # therefore means that it is reserved for developers and experienced | |
21 # professionals having in-depth computer knowledge. Users are therefore | |
22 # encouraged to load and test the software's suitability as regards their | |
23 # requirements in conditions enabling the security of their systems and/or | |
24 # data to be ensured and, more generally, to use and operate it in the | |
25 # same conditions as regards security. | |
26 # | |
27 # The fact that you are presently reading this means that you have had | |
28 # knowledge of the CeCILL license and that you accept its terms. | |
29 # | |
30 import os | |
31 import sys | |
32 import random | |
33 import subprocess | |
34 from SMART.Java.Python.structure.TranscriptList import TranscriptList | |
35 from SMART.Java.Python.structure.Transcript import Transcript | |
36 from SMART.Java.Python.misc.Progress import Progress | |
37 from commons.core.parsing.FastaParser import FastaParser | |
38 | |
39 | |
40 class RnaFoldStructure(object): | |
41 """ | |
42 A structure to store the output of RNAFold | |
43 @ivar name: the name of the sequence | |
44 @type name: string | |
45 @ivar sequence: the sequence (with gaps) | |
46 @type sequence: string | |
47 @ivar structure: the bracket structure | |
48 @type structure: string | |
49 @ivar energy: the energy of the fold | |
50 @type energy: float | |
51 @ivar interactions: the interactions inside the structure | |
52 @ivar interactions: the interactions inside the structure | |
53 """ | |
54 | |
55 def __init__(self, name, sequence, structure, energy): | |
56 """ | |
57 Initialize the structure | |
58 @param name the name of the sequence | |
59 @type name: string | |
60 @param sequence: the sequence (with gaps) | |
61 @type sequence: string | |
62 @param structure: the bracket structure | |
63 @type structure: string | |
64 @param energy: the energy of the fold | |
65 @type energy: float | |
66 """ | |
67 self.name = name | |
68 self.sequence = sequence | |
69 self.structure = structure | |
70 self.energy = energy | |
71 self.interactions = None | |
72 | |
73 | |
74 def analyze(self): | |
75 """ | |
76 Analyze the output, assign the interactions | |
77 """ | |
78 if len(self.sequence) != len(self.structure): | |
79 sys.exit("Sizes of sequence and structure differ ('%s' and '%s')!\n" % (self.sequence, self.structure)) | |
80 stack = [] | |
81 self.interactions = [None for i in range(len(self.sequence))] | |
82 for i in range(len(self.sequence)): | |
83 if self.structure[i] == "(": | |
84 stack.append(i) | |
85 elif self.structure[i] == ")": | |
86 if not stack: | |
87 sys.exit("Something wrong in the interaction line '%s'!\n" % (self.structure)) | |
88 otherI = stack.pop() | |
89 self.interactions[i] = otherI | |
90 self.interactions[otherI] = i | |
91 if stack: | |
92 sys.exit("Something wrong in the interaction line '%s'!\n" % (self.structure)) | |
93 | |
94 | |
95 def getNbBulges(self, start = None, end = None): | |
96 """ | |
97 Get the number of insertions in a given range (in number of letters) | |
98 @param start: where to start the count | |
99 @type start: int | |
100 @param end: where to end the co | |
101 @type end: int | |
102 """ | |
103 if start == None: | |
104 start = 0 | |
105 if end == None: | |
106 end = len(self.sequence) | |
107 previousInt = None | |
108 nbBulges = 0 | |
109 inRegion = False | |
110 for i in range(len(self.sequence)): | |
111 if i == start: | |
112 inRegion = True | |
113 if i > end: | |
114 return nbBulges | |
115 if inRegion: | |
116 if self.interactions[i] == None: | |
117 nbBulges += 1 | |
118 elif previousInt != None and abs(self.interactions[i] - previousInt) != 1: | |
119 nbBulges += 1 | |
120 previousInt = self.interactions[i] | |
121 return nbBulges | |
122 | |
123 | |
124 def getStar(self, start = None, end = None): | |
125 """ | |
126 Get the supposed miRNA* | |
127 @param start: where to start the count | |
128 @type start: int | |
129 @param end: where to end the co | |
130 @type end: int | |
131 """ | |
132 if start == None: | |
133 start = 0 | |
134 if end == None: | |
135 end = len(self.sequence) | |
136 minStar = 1000000 | |
137 maxStar = 0 | |
138 inRegion = False | |
139 for i in range(len(self.sequence)): | |
140 if i == start: | |
141 inRegion = True | |
142 if i > end: | |
143 return (minStar, maxStar) | |
144 if inRegion: | |
145 if self.interactions[i] != None: | |
146 minStar = min(minStar, self.interactions[i]) | |
147 maxStar = max(maxStar, self.interactions[i]) | |
148 return (minStar, maxStar) | |
149 | |
150 | |
151 | |
152 class RnaFoldLauncher(object): | |
153 """ | |
154 Start and parse a RNAFold instance | |
155 @ivar inputTranscriptList: a list of transcripts | |
156 @type inputTranscriptList: class L{TranscriptList<TranscriptList>} | |
157 @ivar genomeFileParser: a parser to the genome file | |
158 @type genomeFileParser: class L{SequenceListParser<SequenceListParser>} | |
159 @ivar bothStrands: whether folding is done on both strands | |
160 @type bothStrands: bool | |
161 @ivar fivePrimeExtension: extension towards the 5' end | |
162 @type fivePrimeExtension: int | |
163 @ivar threePrimeExtension: extension towards the 3' end | |
164 @type threePrimeExtension: int | |
165 @ivar inputTranscriptList: the input list of transcripts | |
166 @type inputTranscriptList: class L{TranscriptList<TranscriptList>} | |
167 @ivar outputTranscriptList: the output list of transcripts | |
168 @type outputTranscriptList: class L{TranscriptList<TranscriptList>} | |
169 @ivar tmpInputFileName: the name of the temporary input file for RNAFold | |
170 @type tmpInputFileName: string | |
171 @ivar tmpOutputFileName: the name of the temporary output file for RNAFold | |
172 @type tmpOutputFileName: string | |
173 @ivar verbosity: verbosity | |
174 @type verbosity: int [default: 0] | |
175 """ | |
176 | |
177 def __init__(self, verbosity = 0): | |
178 """ | |
179 Constructor | |
180 @param verbosity: verbosity | |
181 @type verbosity: int | |
182 """ | |
183 self.verbosity = verbosity | |
184 self.transcriptNames = [] | |
185 randomNumber = random.randint(0, 100000) | |
186 self.bothStrands = True | |
187 self.tmpInputFileName = "tmpInput_%d.fas" % (randomNumber) | |
188 self.tmpOutputFileName = "tmpOutput_%d.fas" % (randomNumber) | |
189 self.outputTranscriptList = None | |
190 self.fivePrimeExtension = 0 | |
191 self.threePrimeExtension = 0 | |
192 | |
193 | |
194 def __del__(self): | |
195 for file in (self.tmpInputFileName, self.tmpOutputFileName): | |
196 if os.path.isfile(file): | |
197 os.remove(file) | |
198 | |
199 | |
200 def setTranscriptList(self, inputTranscriptList): | |
201 """ | |
202 Set the list of transcripts | |
203 @ivar inputTranscriptList: a list of transcripts | |
204 @type inputTranscriptList: class L{TranscriptList<TranscriptList>} | |
205 """ | |
206 self.inputTranscriptList = inputTranscriptList | |
207 | |
208 | |
209 def setExtensions(self, fivePrime, threePrime): | |
210 """ | |
211 Set extension sizes | |
212 @ivar fivePrime: extension towards the 5' end | |
213 @type fivePrime: int | |
214 @ivar threePrime: extension towards the 3' end | |
215 @type threePrime: int | |
216 """ | |
217 self.fivePrimeExtension = fivePrime | |
218 self.threePrimeExtension = threePrime | |
219 | |
220 | |
221 def setNbStrands(self, nbStrands): | |
222 """ | |
223 Set number of strands | |
224 @ivar nbStrands: if 2, the work is done on both strands | |
225 @type nbStrands: int | |
226 """ | |
227 self.nbStrands = nbStrands | |
228 | |
229 | |
230 def setGenomeFile(self, fileName): | |
231 """ | |
232 Set the genome file | |
233 @ivar genomeFileName: the multi-FASTA file containing the genome | |
234 @type genomeFileName: a string | |
235 """ | |
236 self.genomeFileParser = FastaParser(fileName, self.verbosity) | |
237 | |
238 | |
239 def writeInputFile(self, transcript, reverse, fivePrimeExtension, threePrimeExtension): | |
240 """ | |
241 Write the RNAFold input file, containing the sequence corresponding to the transcript | |
242 @ivar transcript: a transcript | |
243 @type transcript: class L{Transcript<Transcript>} | |
244 @ivar reverse: invert the extensions | |
245 @type reverse: bool | |
246 """ | |
247 transcriptCopy = Transcript(transcript) | |
248 transcriptCopy.removeExons() | |
249 if not reverse: | |
250 transcriptCopy.extendStart(fivePrimeExtension) | |
251 transcriptCopy.extendEnd(threePrimeExtension) | |
252 else: | |
253 transcriptCopy.extendStart(threePrimeExtension) | |
254 transcriptCopy.extendEnd(fivePrimeExtension) | |
255 sequence = transcriptCopy.extractSequence(self.genomeFileParser) | |
256 handle = open(self.tmpInputFileName, "w") | |
257 handle.write(">%s\n%s\n" % (sequence.getName().replace(":", "_").replace(".", "_"), sequence.getSequence())) | |
258 handle.close() | |
259 | |
260 | |
261 def startRnaFold(self): | |
262 """ | |
263 Start RNAFold | |
264 """ | |
265 command = "RNAfold < %s > %s" % (self.tmpInputFileName, self.tmpOutputFileName) | |
266 if self.verbosity > 100: | |
267 print "Launching command '%s'" % (command) | |
268 status = subprocess.call(command, shell=True) | |
269 if status != 0: | |
270 raise Exception("Problem with RNAFold! Input file is %s, status is: %s" % (self.tmpInputFileName, status)) | |
271 | |
272 | |
273 def parseRnaFoldOutput(self): | |
274 """ | |
275 Parse an output file of RNAFold | |
276 @return: an RnaFoldStructure | |
277 """ | |
278 lines = open(self.tmpOutputFileName).readlines() | |
279 if len(lines) != 3: | |
280 raise Exception("Problem in RNAfold output! '%s'" % (lines)) | |
281 name = lines[0].strip()[1:].split()[0] | |
282 sequence = lines[1].strip() | |
283 structure = lines[2].strip().split()[0] | |
284 energy = float(lines[2].strip().split(" ", 1)[1][1:-1].strip()) | |
285 if self.verbosity > 100: | |
286 print "Getting sequence %s, structure %s" % (sequence, structure) | |
287 return RnaFoldStructure(name, sequence, structure, energy) | |
288 | |
289 | |
290 def analyzeRnaFoldOutput(self, transcript, rnaFoldOutput, reverse, fivePrimeExtension, threePrimeExtension): | |
291 """ | |
292 Analyze the RNAFold | |
293 @ivar transcript: a transcript | |
294 @type transcript: class L{Transcript<Transcript>} | |
295 @ivar rnaFoldOutput: the output of RNAFold | |
296 @type rnaFoldOutput: class L{RnaFoldStructure<RnaFoldStructure>} | |
297 @ivar reverse: invert the extensions | |
298 @type reverse: bool | |
299 @return: a t-uple of energy, number of insertions, number of bulges, strand | |
300 """ | |
301 rnaFoldOutput.analyze() | |
302 transcriptSize = transcript.end - transcript.start + 1 | |
303 start = fivePrimeExtension if not reverse else threePrimeExtension | |
304 end = start + transcriptSize | |
305 energy = rnaFoldOutput.energy | |
306 nbBulges = rnaFoldOutput.getNbBulges(start, end) | |
307 (minStar, maxStar) = rnaFoldOutput.getStar(start, end) | |
308 minStar += transcript.start - start | |
309 maxStar += transcript.start - start | |
310 if self.verbosity > 100: | |
311 print "Getting structure with energy %d, nbBulges %d, miRna* %d-%d, strand %s" % (energy, nbBulges, minStar, maxStar, "-" if reverse else "+") | |
312 return (energy, nbBulges, minStar, maxStar, reverse) | |
313 | |
314 | |
315 def fold(self, transcript): | |
316 """ | |
317 Fold a transcript (in each strand) | |
318 @ivar transcript: a transcript | |
319 @type transcript: class L{Transcript<Transcript>} | |
320 @return: a t-uple of energy, number of insertions, number of bulges, strand | |
321 """ | |
322 results = [None] * self.nbStrands | |
323 strands = [False, True] if self.nbStrands == 2 else [False] | |
324 minNbBulges = 1000000 | |
325 for i, reverse in enumerate(strands): | |
326 self.writeInputFile(transcript, reverse, self.fivePrimeExtension, self.threePrimeExtension) | |
327 self.startRnaFold() | |
328 output = self.parseRnaFoldOutput() | |
329 results[i] = self.analyzeRnaFoldOutput(transcript, output, reverse, self.fivePrimeExtension, self.threePrimeExtension) | |
330 minNbBulges = min(minNbBulges, results[i][1]) | |
331 for result in results: | |
332 if result[1] == minNbBulges: | |
333 return result | |
334 return None | |
335 | |
336 | |
337 def refold(self, transcript): | |
338 """ | |
339 Fold a transcript, knowing where the miRNA starts and end | |
340 @ivar transcript: a transcript | |
341 @type transcript: class L{Transcript<Transcript>} | |
342 @return: the energy | |
343 """ | |
344 miStar = transcript.getTagValue("miRnaStar") | |
345 startMiStar = int(miStar.split("-")[0]) | |
346 endMiStart = int(miStar.split("-")[1]) | |
347 fivePrimeExtension = max(0, transcript.start - startMiStar) + 5 | |
348 threePrimeExtension = max(0, endMiStart - transcript.end) + 5 | |
349 self.writeInputFile(transcript, False, fivePrimeExtension, threePrimeExtension) | |
350 self.startRnaFold() | |
351 output = self.parseRnaFoldOutput() | |
352 result = self.analyzeRnaFoldOutput(transcript, output, False, fivePrimeExtension, threePrimeExtension) | |
353 return result[0] | |
354 | |
355 | |
356 def computeResults(self): | |
357 """ | |
358 Fold all and fill an output transcript list with the values | |
359 """ | |
360 progress = Progress(self.inputTranscriptList.getNbTranscripts(), "Handling transcripts", self.verbosity) | |
361 self.outputTranscriptList = TranscriptList() | |
362 for transcript in self.inputTranscriptList.getIterator(): | |
363 result = self.fold(transcript) | |
364 transcript.setTagValue("nbBulges", result[1]) | |
365 transcript.setTagValue("miRnaStar", "%d-%d" % (result[2], result[3])) | |
366 transcript.setTagValue("miRNAstrand", result[4]) | |
367 transcript.setTagValue("energy", self.refold(transcript)) | |
368 self.outputTranscriptList.addTranscript(transcript) | |
369 progress.inc() | |
370 progress.done() | |
371 | |
372 | |
373 def getResults(self): | |
374 """ | |
375 Get an output transcript list with the values | |
376 """ | |
377 if self.outputTranscriptList == None: | |
378 self.computeResults() | |
379 return self.outputTranscriptList |