comparison SMART/Java/Python/toolLauncher/RnaFoldLauncher.py @ 38:2c0c0a89fad7

Uploaded
author m-zytnicki
date Thu, 02 May 2013 09:56:47 -0400
parents 769e306b7933
children
comparison
equal deleted inserted replaced
37:d22fadc825e3 38:2c0c0a89fad7
1 #
2 # Copyright INRA-URGI 2009-2010
3 #
4 # This software is governed by the CeCILL license under French law and
5 # abiding by the rules of distribution of free software. You can use,
6 # modify and/ or redistribute the software under the terms of the CeCILL
7 # license as circulated by CEA, CNRS and INRIA at the following URL
8 # "http://www.cecill.info".
9 #
10 # As a counterpart to the access to the source code and rights to copy,
11 # modify and redistribute granted by the license, users are provided only
12 # with a limited warranty and the software's author, the holder of the
13 # economic rights, and the successive licensors have only limited
14 # liability.
15 #
16 # In this respect, the user's attention is drawn to the risks associated
17 # with loading, using, modifying and/or developing or reproducing the
18 # software by the user in light of its specific status of free software,
19 # that may mean that it is complicated to manipulate, and that also
20 # therefore means that it is reserved for developers and experienced
21 # professionals having in-depth computer knowledge. Users are therefore
22 # encouraged to load and test the software's suitability as regards their
23 # requirements in conditions enabling the security of their systems and/or
24 # data to be ensured and, more generally, to use and operate it in the
25 # same conditions as regards security.
26 #
27 # The fact that you are presently reading this means that you have had
28 # knowledge of the CeCILL license and that you accept its terms.
29 #
30 import os
31 import sys
32 import random
33 import subprocess
34 from SMART.Java.Python.structure.TranscriptList import TranscriptList
35 from SMART.Java.Python.structure.Transcript import Transcript
36 from SMART.Java.Python.misc.Progress import Progress
37 from commons.core.parsing.FastaParser import FastaParser
38
39
40 class RnaFoldStructure(object):
41 """
42 A structure to store the output of RNAFold
43 @ivar name: the name of the sequence
44 @type name: string
45 @ivar sequence: the sequence (with gaps)
46 @type sequence: string
47 @ivar structure: the bracket structure
48 @type structure: string
49 @ivar energy: the energy of the fold
50 @type energy: float
51 @ivar interactions: the interactions inside the structure
52 @ivar interactions: the interactions inside the structure
53 """
54
55 def __init__(self, name, sequence, structure, energy):
56 """
57 Initialize the structure
58 @param name the name of the sequence
59 @type name: string
60 @param sequence: the sequence (with gaps)
61 @type sequence: string
62 @param structure: the bracket structure
63 @type structure: string
64 @param energy: the energy of the fold
65 @type energy: float
66 """
67 self.name = name
68 self.sequence = sequence
69 self.structure = structure
70 self.energy = energy
71 self.interactions = None
72
73
74 def analyze(self):
75 """
76 Analyze the output, assign the interactions
77 """
78 if len(self.sequence) != len(self.structure):
79 sys.exit("Sizes of sequence and structure differ ('%s' and '%s')!\n" % (self.sequence, self.structure))
80 stack = []
81 self.interactions = [None for i in range(len(self.sequence))]
82 for i in range(len(self.sequence)):
83 if self.structure[i] == "(":
84 stack.append(i)
85 elif self.structure[i] == ")":
86 if not stack:
87 sys.exit("Something wrong in the interaction line '%s'!\n" % (self.structure))
88 otherI = stack.pop()
89 self.interactions[i] = otherI
90 self.interactions[otherI] = i
91 if stack:
92 sys.exit("Something wrong in the interaction line '%s'!\n" % (self.structure))
93
94
95 def getNbBulges(self, start = None, end = None):
96 """
97 Get the number of insertions in a given range (in number of letters)
98 @param start: where to start the count
99 @type start: int
100 @param end: where to end the co
101 @type end: int
102 """
103 if start == None:
104 start = 0
105 if end == None:
106 end = len(self.sequence)
107 previousInt = None
108 nbBulges = 0
109 inRegion = False
110 for i in range(len(self.sequence)):
111 if i == start:
112 inRegion = True
113 if i > end:
114 return nbBulges
115 if inRegion:
116 if self.interactions[i] == None:
117 nbBulges += 1
118 elif previousInt != None and abs(self.interactions[i] - previousInt) != 1:
119 nbBulges += 1
120 previousInt = self.interactions[i]
121 return nbBulges
122
123
124 def getStar(self, start = None, end = None):
125 """
126 Get the supposed miRNA*
127 @param start: where to start the count
128 @type start: int
129 @param end: where to end the co
130 @type end: int
131 """
132 if start == None:
133 start = 0
134 if end == None:
135 end = len(self.sequence)
136 minStar = 1000000
137 maxStar = 0
138 inRegion = False
139 for i in range(len(self.sequence)):
140 if i == start:
141 inRegion = True
142 if i > end:
143 return (minStar, maxStar)
144 if inRegion:
145 if self.interactions[i] != None:
146 minStar = min(minStar, self.interactions[i])
147 maxStar = max(maxStar, self.interactions[i])
148 return (minStar, maxStar)
149
150
151
152 class RnaFoldLauncher(object):
153 """
154 Start and parse a RNAFold instance
155 @ivar inputTranscriptList: a list of transcripts
156 @type inputTranscriptList: class L{TranscriptList<TranscriptList>}
157 @ivar genomeFileParser: a parser to the genome file
158 @type genomeFileParser: class L{SequenceListParser<SequenceListParser>}
159 @ivar bothStrands: whether folding is done on both strands
160 @type bothStrands: bool
161 @ivar fivePrimeExtension: extension towards the 5' end
162 @type fivePrimeExtension: int
163 @ivar threePrimeExtension: extension towards the 3' end
164 @type threePrimeExtension: int
165 @ivar inputTranscriptList: the input list of transcripts
166 @type inputTranscriptList: class L{TranscriptList<TranscriptList>}
167 @ivar outputTranscriptList: the output list of transcripts
168 @type outputTranscriptList: class L{TranscriptList<TranscriptList>}
169 @ivar tmpInputFileName: the name of the temporary input file for RNAFold
170 @type tmpInputFileName: string
171 @ivar tmpOutputFileName: the name of the temporary output file for RNAFold
172 @type tmpOutputFileName: string
173 @ivar verbosity: verbosity
174 @type verbosity: int [default: 0]
175 """
176
177 def __init__(self, verbosity = 0):
178 """
179 Constructor
180 @param verbosity: verbosity
181 @type verbosity: int
182 """
183 self.verbosity = verbosity
184 self.transcriptNames = []
185 randomNumber = random.randint(0, 100000)
186 self.bothStrands = True
187 self.tmpInputFileName = "tmpInput_%d.fas" % (randomNumber)
188 self.tmpOutputFileName = "tmpOutput_%d.fas" % (randomNumber)
189 self.outputTranscriptList = None
190 self.fivePrimeExtension = 0
191 self.threePrimeExtension = 0
192
193
194 def __del__(self):
195 for file in (self.tmpInputFileName, self.tmpOutputFileName):
196 if os.path.isfile(file):
197 os.remove(file)
198
199
200 def setTranscriptList(self, inputTranscriptList):
201 """
202 Set the list of transcripts
203 @ivar inputTranscriptList: a list of transcripts
204 @type inputTranscriptList: class L{TranscriptList<TranscriptList>}
205 """
206 self.inputTranscriptList = inputTranscriptList
207
208
209 def setExtensions(self, fivePrime, threePrime):
210 """
211 Set extension sizes
212 @ivar fivePrime: extension towards the 5' end
213 @type fivePrime: int
214 @ivar threePrime: extension towards the 3' end
215 @type threePrime: int
216 """
217 self.fivePrimeExtension = fivePrime
218 self.threePrimeExtension = threePrime
219
220
221 def setNbStrands(self, nbStrands):
222 """
223 Set number of strands
224 @ivar nbStrands: if 2, the work is done on both strands
225 @type nbStrands: int
226 """
227 self.nbStrands = nbStrands
228
229
230 def setGenomeFile(self, fileName):
231 """
232 Set the genome file
233 @ivar genomeFileName: the multi-FASTA file containing the genome
234 @type genomeFileName: a string
235 """
236 self.genomeFileParser = FastaParser(fileName, self.verbosity)
237
238
239 def writeInputFile(self, transcript, reverse, fivePrimeExtension, threePrimeExtension):
240 """
241 Write the RNAFold input file, containing the sequence corresponding to the transcript
242 @ivar transcript: a transcript
243 @type transcript: class L{Transcript<Transcript>}
244 @ivar reverse: invert the extensions
245 @type reverse: bool
246 """
247 transcriptCopy = Transcript(transcript)
248 transcriptCopy.removeExons()
249 if not reverse:
250 transcriptCopy.extendStart(fivePrimeExtension)
251 transcriptCopy.extendEnd(threePrimeExtension)
252 else:
253 transcriptCopy.extendStart(threePrimeExtension)
254 transcriptCopy.extendEnd(fivePrimeExtension)
255 sequence = transcriptCopy.extractSequence(self.genomeFileParser)
256 handle = open(self.tmpInputFileName, "w")
257 handle.write(">%s\n%s\n" % (sequence.getName().replace(":", "_").replace(".", "_"), sequence.getSequence()))
258 handle.close()
259
260
261 def startRnaFold(self):
262 """
263 Start RNAFold
264 """
265 command = "RNAfold < %s > %s" % (self.tmpInputFileName, self.tmpOutputFileName)
266 if self.verbosity > 100:
267 print "Launching command '%s'" % (command)
268 status = subprocess.call(command, shell=True)
269 if status != 0:
270 raise Exception("Problem with RNAFold! Input file is %s, status is: %s" % (self.tmpInputFileName, status))
271
272
273 def parseRnaFoldOutput(self):
274 """
275 Parse an output file of RNAFold
276 @return: an RnaFoldStructure
277 """
278 lines = open(self.tmpOutputFileName).readlines()
279 if len(lines) != 3:
280 raise Exception("Problem in RNAfold output! '%s'" % (lines))
281 name = lines[0].strip()[1:].split()[0]
282 sequence = lines[1].strip()
283 structure = lines[2].strip().split()[0]
284 energy = float(lines[2].strip().split(" ", 1)[1][1:-1].strip())
285 if self.verbosity > 100:
286 print "Getting sequence %s, structure %s" % (sequence, structure)
287 return RnaFoldStructure(name, sequence, structure, energy)
288
289
290 def analyzeRnaFoldOutput(self, transcript, rnaFoldOutput, reverse, fivePrimeExtension, threePrimeExtension):
291 """
292 Analyze the RNAFold
293 @ivar transcript: a transcript
294 @type transcript: class L{Transcript<Transcript>}
295 @ivar rnaFoldOutput: the output of RNAFold
296 @type rnaFoldOutput: class L{RnaFoldStructure<RnaFoldStructure>}
297 @ivar reverse: invert the extensions
298 @type reverse: bool
299 @return: a t-uple of energy, number of insertions, number of bulges, strand
300 """
301 rnaFoldOutput.analyze()
302 transcriptSize = transcript.end - transcript.start + 1
303 start = fivePrimeExtension if not reverse else threePrimeExtension
304 end = start + transcriptSize
305 energy = rnaFoldOutput.energy
306 nbBulges = rnaFoldOutput.getNbBulges(start, end)
307 (minStar, maxStar) = rnaFoldOutput.getStar(start, end)
308 minStar += transcript.start - start
309 maxStar += transcript.start - start
310 if self.verbosity > 100:
311 print "Getting structure with energy %d, nbBulges %d, miRna* %d-%d, strand %s" % (energy, nbBulges, minStar, maxStar, "-" if reverse else "+")
312 return (energy, nbBulges, minStar, maxStar, reverse)
313
314
315 def fold(self, transcript):
316 """
317 Fold a transcript (in each strand)
318 @ivar transcript: a transcript
319 @type transcript: class L{Transcript<Transcript>}
320 @return: a t-uple of energy, number of insertions, number of bulges, strand
321 """
322 results = [None] * self.nbStrands
323 strands = [False, True] if self.nbStrands == 2 else [False]
324 minNbBulges = 1000000
325 for i, reverse in enumerate(strands):
326 self.writeInputFile(transcript, reverse, self.fivePrimeExtension, self.threePrimeExtension)
327 self.startRnaFold()
328 output = self.parseRnaFoldOutput()
329 results[i] = self.analyzeRnaFoldOutput(transcript, output, reverse, self.fivePrimeExtension, self.threePrimeExtension)
330 minNbBulges = min(minNbBulges, results[i][1])
331 for result in results:
332 if result[1] == minNbBulges:
333 return result
334 return None
335
336
337 def refold(self, transcript):
338 """
339 Fold a transcript, knowing where the miRNA starts and end
340 @ivar transcript: a transcript
341 @type transcript: class L{Transcript<Transcript>}
342 @return: the energy
343 """
344 miStar = transcript.getTagValue("miRnaStar")
345 startMiStar = int(miStar.split("-")[0])
346 endMiStart = int(miStar.split("-")[1])
347 fivePrimeExtension = max(0, transcript.start - startMiStar) + 5
348 threePrimeExtension = max(0, endMiStart - transcript.end) + 5
349 self.writeInputFile(transcript, False, fivePrimeExtension, threePrimeExtension)
350 self.startRnaFold()
351 output = self.parseRnaFoldOutput()
352 result = self.analyzeRnaFoldOutput(transcript, output, False, fivePrimeExtension, threePrimeExtension)
353 return result[0]
354
355
356 def computeResults(self):
357 """
358 Fold all and fill an output transcript list with the values
359 """
360 progress = Progress(self.inputTranscriptList.getNbTranscripts(), "Handling transcripts", self.verbosity)
361 self.outputTranscriptList = TranscriptList()
362 for transcript in self.inputTranscriptList.getIterator():
363 result = self.fold(transcript)
364 transcript.setTagValue("nbBulges", result[1])
365 transcript.setTagValue("miRnaStar", "%d-%d" % (result[2], result[3]))
366 transcript.setTagValue("miRNAstrand", result[4])
367 transcript.setTagValue("energy", self.refold(transcript))
368 self.outputTranscriptList.addTranscript(transcript)
369 progress.inc()
370 progress.done()
371
372
373 def getResults(self):
374 """
375 Get an output transcript list with the values
376 """
377 if self.outputTranscriptList == None:
378 self.computeResults()
379 return self.outputTranscriptList