| 6 | 1 # | 
|  | 2 # Copyright INRA-URGI 2009-2010 | 
|  | 3 # | 
|  | 4 # This software is governed by the CeCILL license under French law and | 
|  | 5 # abiding by the rules of distribution of free software. You can use, | 
|  | 6 # modify and/ or redistribute the software under the terms of the CeCILL | 
|  | 7 # license as circulated by CEA, CNRS and INRIA at the following URL | 
|  | 8 # "http://www.cecill.info". | 
|  | 9 # | 
|  | 10 # As a counterpart to the access to the source code and rights to copy, | 
|  | 11 # modify and redistribute granted by the license, users are provided only | 
|  | 12 # with a limited warranty and the software's author, the holder of the | 
|  | 13 # economic rights, and the successive licensors have only limited | 
|  | 14 # liability. | 
|  | 15 # | 
|  | 16 # In this respect, the user's attention is drawn to the risks associated | 
|  | 17 # with loading, using, modifying and/or developing or reproducing the | 
|  | 18 # software by the user in light of its specific status of free software, | 
|  | 19 # that may mean that it is complicated to manipulate, and that also | 
|  | 20 # therefore means that it is reserved for developers and experienced | 
|  | 21 # professionals having in-depth computer knowledge. Users are therefore | 
|  | 22 # encouraged to load and test the software's suitability as regards their | 
|  | 23 # requirements in conditions enabling the security of their systems and/or | 
|  | 24 # data to be ensured and, more generally, to use and operate it in the | 
|  | 25 # same conditions as regards security. | 
|  | 26 # | 
|  | 27 # The fact that you are presently reading this means that you have had | 
|  | 28 # knowledge of the CeCILL license and that you accept its terms. | 
|  | 29 # | 
|  | 30 import os | 
|  | 31 import sys | 
|  | 32 import random | 
|  | 33 import subprocess | 
|  | 34 from SMART.Java.Python.structure.TranscriptList import TranscriptList | 
|  | 35 from SMART.Java.Python.structure.Transcript import Transcript | 
|  | 36 from SMART.Java.Python.misc.Progress import Progress | 
|  | 37 from commons.core.parsing.FastaParser import FastaParser | 
|  | 38 | 
|  | 39 | 
|  | 40 class RnaFoldStructure(object): | 
|  | 41     """ | 
|  | 42     A structure to store the output of RNAFold | 
|  | 43     @ivar name:         the name of the sequence | 
|  | 44     @type name:         string | 
|  | 45     @ivar sequence:     the sequence (with gaps) | 
|  | 46     @type sequence:     string | 
|  | 47     @ivar structure:    the bracket structure | 
|  | 48     @type structure:    string | 
|  | 49     @ivar energy:       the energy of the fold | 
|  | 50     @type energy:       float | 
|  | 51     @ivar interactions: the interactions inside the structure | 
|  | 52     @ivar interactions: the interactions inside the structure | 
|  | 53     """ | 
|  | 54 | 
|  | 55     def __init__(self, name, sequence, structure, energy): | 
|  | 56         """ | 
|  | 57         Initialize the structure | 
|  | 58         @param name       the name of the sequence | 
|  | 59         @type  name:      string | 
|  | 60         @param sequence:  the sequence (with gaps) | 
|  | 61         @type  sequence:  string | 
|  | 62         @param structure: the bracket structure | 
|  | 63         @type  structure: string | 
|  | 64         @param energy:    the energy of the fold | 
|  | 65         @type  energy:    float | 
|  | 66         """ | 
|  | 67         self.name         = name | 
|  | 68         self.sequence     = sequence | 
|  | 69         self.structure    = structure | 
|  | 70         self.energy       = energy | 
|  | 71         self.interactions = None | 
|  | 72 | 
|  | 73 | 
|  | 74     def analyze(self): | 
|  | 75         """ | 
|  | 76         Analyze the output, assign the interactions | 
|  | 77         """ | 
|  | 78         if len(self.sequence) != len(self.structure): | 
|  | 79             sys.exit("Sizes of sequence and structure differ ('%s' and '%s')!\n" % (self.sequence, self.structure)) | 
|  | 80         stack                         = [] | 
|  | 81         self.interactions = [None for i in range(len(self.sequence))] | 
|  | 82         for i in range(len(self.sequence)): | 
|  | 83             if self.structure[i] == "(": | 
|  | 84                 stack.append(i) | 
|  | 85             elif self.structure[i] == ")": | 
|  | 86                 if not stack: | 
|  | 87                     sys.exit("Something wrong in the interaction line '%s'!\n" % (self.structure)) | 
|  | 88                 otherI = stack.pop() | 
|  | 89                 self.interactions[i]      = otherI | 
|  | 90                 self.interactions[otherI] = i | 
|  | 91         if stack: | 
|  | 92             sys.exit("Something wrong in the interaction line '%s'!\n" % (self.structure)) | 
|  | 93 | 
|  | 94 | 
|  | 95     def getNbBulges(self, start = None, end = None): | 
|  | 96         """ | 
|  | 97         Get the number of insertions in a given range (in number of letters) | 
|  | 98         @param start: where to start the count | 
|  | 99         @type    start: int | 
|  | 100         @param end:     where to end the co | 
|  | 101         @type    end:     int | 
|  | 102         """ | 
|  | 103         if start == None: | 
|  | 104             start = 0 | 
|  | 105         if end == None: | 
|  | 106             end = len(self.sequence) | 
|  | 107         previousInt = None | 
|  | 108         nbBulges    = 0 | 
|  | 109         inRegion    = False | 
|  | 110         for i in range(len(self.sequence)): | 
|  | 111             if i == start: | 
|  | 112                 inRegion = True | 
|  | 113             if i > end: | 
|  | 114                 return nbBulges | 
|  | 115             if inRegion: | 
|  | 116                 if self.interactions[i] == None: | 
|  | 117                     nbBulges += 1 | 
|  | 118                 elif previousInt != None and abs(self.interactions[i] - previousInt) != 1: | 
|  | 119                     nbBulges += 1 | 
|  | 120             previousInt = self.interactions[i] | 
|  | 121         return nbBulges | 
|  | 122 | 
|  | 123 | 
|  | 124     def getStar(self, start = None, end = None): | 
|  | 125         """ | 
|  | 126         Get the supposed miRNA* | 
|  | 127         @param start: where to start the count | 
|  | 128         @type    start: int | 
|  | 129         @param end:     where to end the co | 
|  | 130         @type    end:     int | 
|  | 131         """ | 
|  | 132         if start == None: | 
|  | 133             start = 0 | 
|  | 134         if end == None: | 
|  | 135             end = len(self.sequence) | 
|  | 136         minStar  = 1000000 | 
|  | 137         maxStar  = 0 | 
|  | 138         inRegion = False | 
|  | 139         for i in range(len(self.sequence)): | 
|  | 140             if i == start: | 
|  | 141                 inRegion = True | 
|  | 142             if i > end: | 
|  | 143                 return (minStar, maxStar) | 
|  | 144             if inRegion: | 
|  | 145                 if self.interactions[i] != None: | 
|  | 146                     minStar = min(minStar, self.interactions[i]) | 
|  | 147                     maxStar = max(maxStar, self.interactions[i]) | 
|  | 148         return (minStar, maxStar) | 
|  | 149 | 
|  | 150 | 
|  | 151 | 
|  | 152 class RnaFoldLauncher(object): | 
|  | 153     """ | 
|  | 154     Start and parse a RNAFold instance | 
|  | 155     @ivar inputTranscriptList:  a list of transcripts | 
|  | 156     @type inputTranscriptList:  class L{TranscriptList<TranscriptList>} | 
|  | 157     @ivar genomeFileParser:     a parser to the genome file | 
|  | 158     @type genomeFileParser:     class L{SequenceListParser<SequenceListParser>} | 
|  | 159     @ivar bothStrands:          whether folding is done on both strands | 
|  | 160     @type bothStrands:          bool | 
|  | 161     @ivar fivePrimeExtension:   extension towards the 5' end | 
|  | 162     @type fivePrimeExtension:   int | 
|  | 163     @ivar threePrimeExtension:  extension towards the 3' end | 
|  | 164     @type threePrimeExtension:  int | 
|  | 165     @ivar inputTranscriptList:  the input list of transcripts | 
|  | 166     @type inputTranscriptList:  class L{TranscriptList<TranscriptList>} | 
|  | 167     @ivar outputTranscriptList: the output list of transcripts | 
|  | 168     @type outputTranscriptList: class L{TranscriptList<TranscriptList>} | 
|  | 169     @ivar tmpInputFileName:     the name of the temporary input file for RNAFold | 
|  | 170     @type tmpInputFileName:     string | 
|  | 171     @ivar tmpOutputFileName:    the name of the temporary output file for RNAFold | 
|  | 172     @type tmpOutputFileName:    string | 
|  | 173     @ivar verbosity:            verbosity | 
|  | 174     @type verbosity:            int [default: 0] | 
|  | 175     """ | 
|  | 176 | 
|  | 177     def __init__(self, verbosity = 0): | 
|  | 178         """ | 
|  | 179         Constructor | 
|  | 180         @param verbosity: verbosity | 
|  | 181         @type  verbosity: int | 
|  | 182         """ | 
|  | 183         self.verbosity            = verbosity | 
|  | 184         self.transcriptNames      = [] | 
|  | 185         randomNumber              = random.randint(0, 100000) | 
|  | 186         self.bothStrands          = True | 
|  | 187         self.tmpInputFileName     = "tmpInput_%d.fas" % (randomNumber) | 
|  | 188         self.tmpOutputFileName    = "tmpOutput_%d.fas" % (randomNumber) | 
|  | 189         self.outputTranscriptList = None | 
|  | 190         self.fivePrimeExtension   = 0 | 
|  | 191         self.threePrimeExtension  = 0 | 
|  | 192 | 
|  | 193 | 
|  | 194     def __del__(self): | 
|  | 195         for file in (self.tmpInputFileName, self.tmpOutputFileName): | 
|  | 196             if os.path.isfile(file): | 
|  | 197                 os.remove(file) | 
|  | 198 | 
|  | 199 | 
|  | 200     def setTranscriptList(self, inputTranscriptList): | 
|  | 201         """ | 
|  | 202         Set the list of transcripts | 
|  | 203         @ivar inputTranscriptList: a list of transcripts | 
|  | 204         @type inputTranscriptList: class L{TranscriptList<TranscriptList>} | 
|  | 205         """ | 
|  | 206         self.inputTranscriptList = inputTranscriptList | 
|  | 207 | 
|  | 208 | 
|  | 209     def setExtensions(self, fivePrime, threePrime): | 
|  | 210         """ | 
|  | 211         Set extension sizes | 
|  | 212         @ivar fivePrime:  extension towards the 5' end | 
|  | 213         @type fivePrime:  int | 
|  | 214         @ivar threePrime: extension towards the 3' end | 
|  | 215         @type threePrime: int | 
|  | 216         """ | 
|  | 217         self.fivePrimeExtension  = fivePrime | 
|  | 218         self.threePrimeExtension = threePrime | 
|  | 219 | 
|  | 220 | 
|  | 221     def setNbStrands(self, nbStrands): | 
|  | 222         """ | 
|  | 223         Set number of strands | 
|  | 224         @ivar nbStrands: if 2, the work is done on both strands | 
|  | 225         @type nbStrands: int | 
|  | 226         """ | 
|  | 227         self.nbStrands = nbStrands | 
|  | 228 | 
|  | 229 | 
|  | 230     def setGenomeFile(self, fileName): | 
|  | 231         """ | 
|  | 232         Set the genome file | 
|  | 233         @ivar genomeFileName: the multi-FASTA file containing the genome | 
|  | 234         @type genomeFileName: a string | 
|  | 235         """ | 
|  | 236         self.genomeFileParser = FastaParser(fileName, self.verbosity) | 
|  | 237 | 
|  | 238 | 
|  | 239     def writeInputFile(self, transcript, reverse, fivePrimeExtension, threePrimeExtension): | 
|  | 240         """ | 
|  | 241         Write the RNAFold input file, containing the sequence corresponding to the transcript | 
|  | 242         @ivar transcript: a transcript | 
|  | 243         @type transcript: class L{Transcript<Transcript>} | 
|  | 244         @ivar reverse:    invert the extensions | 
|  | 245         @type reverse:    bool | 
|  | 246         """ | 
|  | 247         transcriptCopy = Transcript(transcript) | 
|  | 248         transcriptCopy.removeExons() | 
|  | 249         if not reverse: | 
|  | 250             transcriptCopy.extendStart(fivePrimeExtension) | 
|  | 251             transcriptCopy.extendEnd(threePrimeExtension) | 
|  | 252         else: | 
|  | 253             transcriptCopy.extendStart(threePrimeExtension) | 
|  | 254             transcriptCopy.extendEnd(fivePrimeExtension) | 
|  | 255         sequence = transcriptCopy.extractSequence(self.genomeFileParser) | 
|  | 256         handle   = open(self.tmpInputFileName, "w") | 
|  | 257         handle.write(">%s\n%s\n" % (sequence.getName().replace(":", "_").replace(".", "_"), sequence.getSequence())) | 
|  | 258         handle.close() | 
|  | 259 | 
|  | 260 | 
|  | 261     def startRnaFold(self): | 
|  | 262         """ | 
|  | 263         Start RNAFold | 
|  | 264         """ | 
|  | 265         command = "RNAfold < %s > %s" % (self.tmpInputFileName, self.tmpOutputFileName) | 
|  | 266         if self.verbosity > 100: | 
|  | 267             print "Launching command '%s'" % (command) | 
|  | 268         status = subprocess.call(command, shell=True) | 
|  | 269         if status != 0: | 
|  | 270             raise Exception("Problem with RNAFold! Input file is %s, status is: %s" % (self.tmpInputFileName, status)) | 
|  | 271 | 
|  | 272 | 
|  | 273     def parseRnaFoldOutput(self): | 
|  | 274         """ | 
|  | 275         Parse an output file of RNAFold | 
|  | 276         @return: an RnaFoldStructure | 
|  | 277         """ | 
|  | 278         lines = open(self.tmpOutputFileName).readlines() | 
|  | 279         if len(lines) != 3: | 
|  | 280             raise Exception("Problem in RNAfold output! '%s'" % (lines)) | 
|  | 281         name      = lines[0].strip()[1:].split()[0] | 
|  | 282         sequence  = lines[1].strip() | 
|  | 283         structure = lines[2].strip().split()[0] | 
|  | 284         energy    = float(lines[2].strip().split(" ", 1)[1][1:-1].strip()) | 
|  | 285         if self.verbosity > 100: | 
|  | 286             print "Getting sequence %s, structure %s" % (sequence, structure) | 
|  | 287         return RnaFoldStructure(name, sequence, structure, energy) | 
|  | 288 | 
|  | 289 | 
|  | 290     def analyzeRnaFoldOutput(self, transcript, rnaFoldOutput, reverse, fivePrimeExtension, threePrimeExtension): | 
|  | 291         """ | 
|  | 292         Analyze the RNAFold | 
|  | 293         @ivar transcript:    a transcript | 
|  | 294         @type transcript:    class L{Transcript<Transcript>} | 
|  | 295         @ivar rnaFoldOutput: the output of RNAFold | 
|  | 296         @type rnaFoldOutput: class L{RnaFoldStructure<RnaFoldStructure>} | 
|  | 297         @ivar reverse:       invert the extensions | 
|  | 298         @type reverse:       bool | 
|  | 299         @return:             a t-uple of energy, number of insertions, number of bulges, strand | 
|  | 300         """ | 
|  | 301         rnaFoldOutput.analyze() | 
|  | 302         transcriptSize     = transcript.end - transcript.start + 1 | 
|  | 303         start              = fivePrimeExtension if not reverse else threePrimeExtension | 
|  | 304         end                = start + transcriptSize | 
|  | 305         energy             = rnaFoldOutput.energy | 
|  | 306         nbBulges           = rnaFoldOutput.getNbBulges(start, end) | 
|  | 307         (minStar, maxStar) = rnaFoldOutput.getStar(start, end) | 
|  | 308         minStar           += transcript.start - start | 
|  | 309         maxStar           += transcript.start - start | 
|  | 310         if self.verbosity > 100: | 
|  | 311             print "Getting structure with energy %d, nbBulges %d, miRna* %d-%d, strand %s" % (energy, nbBulges, minStar, maxStar, "-" if reverse else "+") | 
|  | 312         return (energy, nbBulges, minStar, maxStar, reverse) | 
|  | 313 | 
|  | 314 | 
|  | 315     def fold(self, transcript): | 
|  | 316         """ | 
|  | 317         Fold a transcript (in each strand) | 
|  | 318         @ivar transcript: a transcript | 
|  | 319         @type transcript: class L{Transcript<Transcript>} | 
|  | 320         @return:          a t-uple of energy, number of insertions, number of bulges, strand | 
|  | 321         """ | 
|  | 322         results     = [None] * self.nbStrands | 
|  | 323         strands     = [False, True] if self.nbStrands == 2 else [False] | 
|  | 324         minNbBulges = 1000000 | 
|  | 325         for i, reverse in enumerate(strands): | 
|  | 326             self.writeInputFile(transcript, reverse, self.fivePrimeExtension, self.threePrimeExtension) | 
|  | 327             self.startRnaFold() | 
|  | 328             output = self.parseRnaFoldOutput() | 
|  | 329             results[i]  = self.analyzeRnaFoldOutput(transcript, output, reverse, self.fivePrimeExtension, self.threePrimeExtension) | 
|  | 330             minNbBulges = min(minNbBulges, results[i][1]) | 
|  | 331         for result in results: | 
|  | 332             if result[1] == minNbBulges: | 
|  | 333                 return result | 
|  | 334         return None | 
|  | 335 | 
|  | 336 | 
|  | 337     def refold(self, transcript): | 
|  | 338         """ | 
|  | 339         Fold a transcript, knowing where the miRNA starts and end | 
|  | 340         @ivar transcript: a transcript | 
|  | 341         @type transcript: class L{Transcript<Transcript>} | 
|  | 342         @return:          the energy | 
|  | 343         """ | 
|  | 344         miStar              = transcript.getTagValue("miRnaStar") | 
|  | 345         startMiStar         = int(miStar.split("-")[0]) | 
|  | 346         endMiStart          = int(miStar.split("-")[1]) | 
|  | 347         fivePrimeExtension  = max(0, transcript.start - startMiStar) + 5 | 
|  | 348         threePrimeExtension = max(0, endMiStart - transcript.end) + 5 | 
|  | 349         self.writeInputFile(transcript, False, fivePrimeExtension, threePrimeExtension) | 
|  | 350         self.startRnaFold() | 
|  | 351         output = self.parseRnaFoldOutput() | 
|  | 352         result = self.analyzeRnaFoldOutput(transcript, output, False, fivePrimeExtension, threePrimeExtension) | 
|  | 353         return result[0] | 
|  | 354 | 
|  | 355 | 
|  | 356     def computeResults(self): | 
|  | 357         """ | 
|  | 358         Fold all and fill an output transcript list with the values | 
|  | 359         """ | 
|  | 360         progress = Progress(self.inputTranscriptList.getNbTranscripts(), "Handling transcripts", self.verbosity) | 
|  | 361         self.outputTranscriptList = TranscriptList() | 
|  | 362         for transcript in self.inputTranscriptList.getIterator(): | 
|  | 363             result = self.fold(transcript) | 
|  | 364             transcript.setTagValue("nbBulges", result[1]) | 
|  | 365             transcript.setTagValue("miRnaStar", "%d-%d" % (result[2], result[3])) | 
|  | 366             transcript.setTagValue("miRNAstrand", result[4]) | 
|  | 367             transcript.setTagValue("energy", self.refold(transcript)) | 
|  | 368             self.outputTranscriptList.addTranscript(transcript) | 
|  | 369             progress.inc() | 
|  | 370         progress.done() | 
|  | 371 | 
|  | 372 | 
|  | 373     def getResults(self): | 
|  | 374         """ | 
|  | 375         Get an output transcript list with the values | 
|  | 376         """ | 
|  | 377         if self.outputTranscriptList == None: | 
|  | 378             self.computeResults() | 
|  | 379         return self.outputTranscriptList |