Mercurial > repos > yufei-luo > s_mart
comparison SMART/Java/Python/toolLauncher/RnaFoldLauncher.py @ 18:94ab73e8a190
Uploaded
| author | m-zytnicki |
|---|---|
| date | Mon, 29 Apr 2013 03:20:15 -0400 |
| parents | 769e306b7933 |
| children |
comparison
equal
deleted
inserted
replaced
| 17:b0e8584489e6 | 18:94ab73e8a190 |
|---|---|
| 1 # | |
| 2 # Copyright INRA-URGI 2009-2010 | |
| 3 # | |
| 4 # This software is governed by the CeCILL license under French law and | |
| 5 # abiding by the rules of distribution of free software. You can use, | |
| 6 # modify and/ or redistribute the software under the terms of the CeCILL | |
| 7 # license as circulated by CEA, CNRS and INRIA at the following URL | |
| 8 # "http://www.cecill.info". | |
| 9 # | |
| 10 # As a counterpart to the access to the source code and rights to copy, | |
| 11 # modify and redistribute granted by the license, users are provided only | |
| 12 # with a limited warranty and the software's author, the holder of the | |
| 13 # economic rights, and the successive licensors have only limited | |
| 14 # liability. | |
| 15 # | |
| 16 # In this respect, the user's attention is drawn to the risks associated | |
| 17 # with loading, using, modifying and/or developing or reproducing the | |
| 18 # software by the user in light of its specific status of free software, | |
| 19 # that may mean that it is complicated to manipulate, and that also | |
| 20 # therefore means that it is reserved for developers and experienced | |
| 21 # professionals having in-depth computer knowledge. Users are therefore | |
| 22 # encouraged to load and test the software's suitability as regards their | |
| 23 # requirements in conditions enabling the security of their systems and/or | |
| 24 # data to be ensured and, more generally, to use and operate it in the | |
| 25 # same conditions as regards security. | |
| 26 # | |
| 27 # The fact that you are presently reading this means that you have had | |
| 28 # knowledge of the CeCILL license and that you accept its terms. | |
| 29 # | |
| 30 import os | |
| 31 import sys | |
| 32 import random | |
| 33 import subprocess | |
| 34 from SMART.Java.Python.structure.TranscriptList import TranscriptList | |
| 35 from SMART.Java.Python.structure.Transcript import Transcript | |
| 36 from SMART.Java.Python.misc.Progress import Progress | |
| 37 from commons.core.parsing.FastaParser import FastaParser | |
| 38 | |
| 39 | |
| 40 class RnaFoldStructure(object): | |
| 41 """ | |
| 42 A structure to store the output of RNAFold | |
| 43 @ivar name: the name of the sequence | |
| 44 @type name: string | |
| 45 @ivar sequence: the sequence (with gaps) | |
| 46 @type sequence: string | |
| 47 @ivar structure: the bracket structure | |
| 48 @type structure: string | |
| 49 @ivar energy: the energy of the fold | |
| 50 @type energy: float | |
| 51 @ivar interactions: the interactions inside the structure | |
| 52 @ivar interactions: the interactions inside the structure | |
| 53 """ | |
| 54 | |
| 55 def __init__(self, name, sequence, structure, energy): | |
| 56 """ | |
| 57 Initialize the structure | |
| 58 @param name the name of the sequence | |
| 59 @type name: string | |
| 60 @param sequence: the sequence (with gaps) | |
| 61 @type sequence: string | |
| 62 @param structure: the bracket structure | |
| 63 @type structure: string | |
| 64 @param energy: the energy of the fold | |
| 65 @type energy: float | |
| 66 """ | |
| 67 self.name = name | |
| 68 self.sequence = sequence | |
| 69 self.structure = structure | |
| 70 self.energy = energy | |
| 71 self.interactions = None | |
| 72 | |
| 73 | |
| 74 def analyze(self): | |
| 75 """ | |
| 76 Analyze the output, assign the interactions | |
| 77 """ | |
| 78 if len(self.sequence) != len(self.structure): | |
| 79 sys.exit("Sizes of sequence and structure differ ('%s' and '%s')!\n" % (self.sequence, self.structure)) | |
| 80 stack = [] | |
| 81 self.interactions = [None for i in range(len(self.sequence))] | |
| 82 for i in range(len(self.sequence)): | |
| 83 if self.structure[i] == "(": | |
| 84 stack.append(i) | |
| 85 elif self.structure[i] == ")": | |
| 86 if not stack: | |
| 87 sys.exit("Something wrong in the interaction line '%s'!\n" % (self.structure)) | |
| 88 otherI = stack.pop() | |
| 89 self.interactions[i] = otherI | |
| 90 self.interactions[otherI] = i | |
| 91 if stack: | |
| 92 sys.exit("Something wrong in the interaction line '%s'!\n" % (self.structure)) | |
| 93 | |
| 94 | |
| 95 def getNbBulges(self, start = None, end = None): | |
| 96 """ | |
| 97 Get the number of insertions in a given range (in number of letters) | |
| 98 @param start: where to start the count | |
| 99 @type start: int | |
| 100 @param end: where to end the co | |
| 101 @type end: int | |
| 102 """ | |
| 103 if start == None: | |
| 104 start = 0 | |
| 105 if end == None: | |
| 106 end = len(self.sequence) | |
| 107 previousInt = None | |
| 108 nbBulges = 0 | |
| 109 inRegion = False | |
| 110 for i in range(len(self.sequence)): | |
| 111 if i == start: | |
| 112 inRegion = True | |
| 113 if i > end: | |
| 114 return nbBulges | |
| 115 if inRegion: | |
| 116 if self.interactions[i] == None: | |
| 117 nbBulges += 1 | |
| 118 elif previousInt != None and abs(self.interactions[i] - previousInt) != 1: | |
| 119 nbBulges += 1 | |
| 120 previousInt = self.interactions[i] | |
| 121 return nbBulges | |
| 122 | |
| 123 | |
| 124 def getStar(self, start = None, end = None): | |
| 125 """ | |
| 126 Get the supposed miRNA* | |
| 127 @param start: where to start the count | |
| 128 @type start: int | |
| 129 @param end: where to end the co | |
| 130 @type end: int | |
| 131 """ | |
| 132 if start == None: | |
| 133 start = 0 | |
| 134 if end == None: | |
| 135 end = len(self.sequence) | |
| 136 minStar = 1000000 | |
| 137 maxStar = 0 | |
| 138 inRegion = False | |
| 139 for i in range(len(self.sequence)): | |
| 140 if i == start: | |
| 141 inRegion = True | |
| 142 if i > end: | |
| 143 return (minStar, maxStar) | |
| 144 if inRegion: | |
| 145 if self.interactions[i] != None: | |
| 146 minStar = min(minStar, self.interactions[i]) | |
| 147 maxStar = max(maxStar, self.interactions[i]) | |
| 148 return (minStar, maxStar) | |
| 149 | |
| 150 | |
| 151 | |
| 152 class RnaFoldLauncher(object): | |
| 153 """ | |
| 154 Start and parse a RNAFold instance | |
| 155 @ivar inputTranscriptList: a list of transcripts | |
| 156 @type inputTranscriptList: class L{TranscriptList<TranscriptList>} | |
| 157 @ivar genomeFileParser: a parser to the genome file | |
| 158 @type genomeFileParser: class L{SequenceListParser<SequenceListParser>} | |
| 159 @ivar bothStrands: whether folding is done on both strands | |
| 160 @type bothStrands: bool | |
| 161 @ivar fivePrimeExtension: extension towards the 5' end | |
| 162 @type fivePrimeExtension: int | |
| 163 @ivar threePrimeExtension: extension towards the 3' end | |
| 164 @type threePrimeExtension: int | |
| 165 @ivar inputTranscriptList: the input list of transcripts | |
| 166 @type inputTranscriptList: class L{TranscriptList<TranscriptList>} | |
| 167 @ivar outputTranscriptList: the output list of transcripts | |
| 168 @type outputTranscriptList: class L{TranscriptList<TranscriptList>} | |
| 169 @ivar tmpInputFileName: the name of the temporary input file for RNAFold | |
| 170 @type tmpInputFileName: string | |
| 171 @ivar tmpOutputFileName: the name of the temporary output file for RNAFold | |
| 172 @type tmpOutputFileName: string | |
| 173 @ivar verbosity: verbosity | |
| 174 @type verbosity: int [default: 0] | |
| 175 """ | |
| 176 | |
| 177 def __init__(self, verbosity = 0): | |
| 178 """ | |
| 179 Constructor | |
| 180 @param verbosity: verbosity | |
| 181 @type verbosity: int | |
| 182 """ | |
| 183 self.verbosity = verbosity | |
| 184 self.transcriptNames = [] | |
| 185 randomNumber = random.randint(0, 100000) | |
| 186 self.bothStrands = True | |
| 187 self.tmpInputFileName = "tmpInput_%d.fas" % (randomNumber) | |
| 188 self.tmpOutputFileName = "tmpOutput_%d.fas" % (randomNumber) | |
| 189 self.outputTranscriptList = None | |
| 190 self.fivePrimeExtension = 0 | |
| 191 self.threePrimeExtension = 0 | |
| 192 | |
| 193 | |
| 194 def __del__(self): | |
| 195 for file in (self.tmpInputFileName, self.tmpOutputFileName): | |
| 196 if os.path.isfile(file): | |
| 197 os.remove(file) | |
| 198 | |
| 199 | |
| 200 def setTranscriptList(self, inputTranscriptList): | |
| 201 """ | |
| 202 Set the list of transcripts | |
| 203 @ivar inputTranscriptList: a list of transcripts | |
| 204 @type inputTranscriptList: class L{TranscriptList<TranscriptList>} | |
| 205 """ | |
| 206 self.inputTranscriptList = inputTranscriptList | |
| 207 | |
| 208 | |
| 209 def setExtensions(self, fivePrime, threePrime): | |
| 210 """ | |
| 211 Set extension sizes | |
| 212 @ivar fivePrime: extension towards the 5' end | |
| 213 @type fivePrime: int | |
| 214 @ivar threePrime: extension towards the 3' end | |
| 215 @type threePrime: int | |
| 216 """ | |
| 217 self.fivePrimeExtension = fivePrime | |
| 218 self.threePrimeExtension = threePrime | |
| 219 | |
| 220 | |
| 221 def setNbStrands(self, nbStrands): | |
| 222 """ | |
| 223 Set number of strands | |
| 224 @ivar nbStrands: if 2, the work is done on both strands | |
| 225 @type nbStrands: int | |
| 226 """ | |
| 227 self.nbStrands = nbStrands | |
| 228 | |
| 229 | |
| 230 def setGenomeFile(self, fileName): | |
| 231 """ | |
| 232 Set the genome file | |
| 233 @ivar genomeFileName: the multi-FASTA file containing the genome | |
| 234 @type genomeFileName: a string | |
| 235 """ | |
| 236 self.genomeFileParser = FastaParser(fileName, self.verbosity) | |
| 237 | |
| 238 | |
| 239 def writeInputFile(self, transcript, reverse, fivePrimeExtension, threePrimeExtension): | |
| 240 """ | |
| 241 Write the RNAFold input file, containing the sequence corresponding to the transcript | |
| 242 @ivar transcript: a transcript | |
| 243 @type transcript: class L{Transcript<Transcript>} | |
| 244 @ivar reverse: invert the extensions | |
| 245 @type reverse: bool | |
| 246 """ | |
| 247 transcriptCopy = Transcript(transcript) | |
| 248 transcriptCopy.removeExons() | |
| 249 if not reverse: | |
| 250 transcriptCopy.extendStart(fivePrimeExtension) | |
| 251 transcriptCopy.extendEnd(threePrimeExtension) | |
| 252 else: | |
| 253 transcriptCopy.extendStart(threePrimeExtension) | |
| 254 transcriptCopy.extendEnd(fivePrimeExtension) | |
| 255 sequence = transcriptCopy.extractSequence(self.genomeFileParser) | |
| 256 handle = open(self.tmpInputFileName, "w") | |
| 257 handle.write(">%s\n%s\n" % (sequence.getName().replace(":", "_").replace(".", "_"), sequence.getSequence())) | |
| 258 handle.close() | |
| 259 | |
| 260 | |
| 261 def startRnaFold(self): | |
| 262 """ | |
| 263 Start RNAFold | |
| 264 """ | |
| 265 command = "RNAfold < %s > %s" % (self.tmpInputFileName, self.tmpOutputFileName) | |
| 266 if self.verbosity > 100: | |
| 267 print "Launching command '%s'" % (command) | |
| 268 status = subprocess.call(command, shell=True) | |
| 269 if status != 0: | |
| 270 raise Exception("Problem with RNAFold! Input file is %s, status is: %s" % (self.tmpInputFileName, status)) | |
| 271 | |
| 272 | |
| 273 def parseRnaFoldOutput(self): | |
| 274 """ | |
| 275 Parse an output file of RNAFold | |
| 276 @return: an RnaFoldStructure | |
| 277 """ | |
| 278 lines = open(self.tmpOutputFileName).readlines() | |
| 279 if len(lines) != 3: | |
| 280 raise Exception("Problem in RNAfold output! '%s'" % (lines)) | |
| 281 name = lines[0].strip()[1:].split()[0] | |
| 282 sequence = lines[1].strip() | |
| 283 structure = lines[2].strip().split()[0] | |
| 284 energy = float(lines[2].strip().split(" ", 1)[1][1:-1].strip()) | |
| 285 if self.verbosity > 100: | |
| 286 print "Getting sequence %s, structure %s" % (sequence, structure) | |
| 287 return RnaFoldStructure(name, sequence, structure, energy) | |
| 288 | |
| 289 | |
| 290 def analyzeRnaFoldOutput(self, transcript, rnaFoldOutput, reverse, fivePrimeExtension, threePrimeExtension): | |
| 291 """ | |
| 292 Analyze the RNAFold | |
| 293 @ivar transcript: a transcript | |
| 294 @type transcript: class L{Transcript<Transcript>} | |
| 295 @ivar rnaFoldOutput: the output of RNAFold | |
| 296 @type rnaFoldOutput: class L{RnaFoldStructure<RnaFoldStructure>} | |
| 297 @ivar reverse: invert the extensions | |
| 298 @type reverse: bool | |
| 299 @return: a t-uple of energy, number of insertions, number of bulges, strand | |
| 300 """ | |
| 301 rnaFoldOutput.analyze() | |
| 302 transcriptSize = transcript.end - transcript.start + 1 | |
| 303 start = fivePrimeExtension if not reverse else threePrimeExtension | |
| 304 end = start + transcriptSize | |
| 305 energy = rnaFoldOutput.energy | |
| 306 nbBulges = rnaFoldOutput.getNbBulges(start, end) | |
| 307 (minStar, maxStar) = rnaFoldOutput.getStar(start, end) | |
| 308 minStar += transcript.start - start | |
| 309 maxStar += transcript.start - start | |
| 310 if self.verbosity > 100: | |
| 311 print "Getting structure with energy %d, nbBulges %d, miRna* %d-%d, strand %s" % (energy, nbBulges, minStar, maxStar, "-" if reverse else "+") | |
| 312 return (energy, nbBulges, minStar, maxStar, reverse) | |
| 313 | |
| 314 | |
| 315 def fold(self, transcript): | |
| 316 """ | |
| 317 Fold a transcript (in each strand) | |
| 318 @ivar transcript: a transcript | |
| 319 @type transcript: class L{Transcript<Transcript>} | |
| 320 @return: a t-uple of energy, number of insertions, number of bulges, strand | |
| 321 """ | |
| 322 results = [None] * self.nbStrands | |
| 323 strands = [False, True] if self.nbStrands == 2 else [False] | |
| 324 minNbBulges = 1000000 | |
| 325 for i, reverse in enumerate(strands): | |
| 326 self.writeInputFile(transcript, reverse, self.fivePrimeExtension, self.threePrimeExtension) | |
| 327 self.startRnaFold() | |
| 328 output = self.parseRnaFoldOutput() | |
| 329 results[i] = self.analyzeRnaFoldOutput(transcript, output, reverse, self.fivePrimeExtension, self.threePrimeExtension) | |
| 330 minNbBulges = min(minNbBulges, results[i][1]) | |
| 331 for result in results: | |
| 332 if result[1] == minNbBulges: | |
| 333 return result | |
| 334 return None | |
| 335 | |
| 336 | |
| 337 def refold(self, transcript): | |
| 338 """ | |
| 339 Fold a transcript, knowing where the miRNA starts and end | |
| 340 @ivar transcript: a transcript | |
| 341 @type transcript: class L{Transcript<Transcript>} | |
| 342 @return: the energy | |
| 343 """ | |
| 344 miStar = transcript.getTagValue("miRnaStar") | |
| 345 startMiStar = int(miStar.split("-")[0]) | |
| 346 endMiStart = int(miStar.split("-")[1]) | |
| 347 fivePrimeExtension = max(0, transcript.start - startMiStar) + 5 | |
| 348 threePrimeExtension = max(0, endMiStart - transcript.end) + 5 | |
| 349 self.writeInputFile(transcript, False, fivePrimeExtension, threePrimeExtension) | |
| 350 self.startRnaFold() | |
| 351 output = self.parseRnaFoldOutput() | |
| 352 result = self.analyzeRnaFoldOutput(transcript, output, False, fivePrimeExtension, threePrimeExtension) | |
| 353 return result[0] | |
| 354 | |
| 355 | |
| 356 def computeResults(self): | |
| 357 """ | |
| 358 Fold all and fill an output transcript list with the values | |
| 359 """ | |
| 360 progress = Progress(self.inputTranscriptList.getNbTranscripts(), "Handling transcripts", self.verbosity) | |
| 361 self.outputTranscriptList = TranscriptList() | |
| 362 for transcript in self.inputTranscriptList.getIterator(): | |
| 363 result = self.fold(transcript) | |
| 364 transcript.setTagValue("nbBulges", result[1]) | |
| 365 transcript.setTagValue("miRnaStar", "%d-%d" % (result[2], result[3])) | |
| 366 transcript.setTagValue("miRNAstrand", result[4]) | |
| 367 transcript.setTagValue("energy", self.refold(transcript)) | |
| 368 self.outputTranscriptList.addTranscript(transcript) | |
| 369 progress.inc() | |
| 370 progress.done() | |
| 371 | |
| 372 | |
| 373 def getResults(self): | |
| 374 """ | |
| 375 Get an output transcript list with the values | |
| 376 """ | |
| 377 if self.outputTranscriptList == None: | |
| 378 self.computeResults() | |
| 379 return self.outputTranscriptList |
