Mercurial > repos > yufei-luo > s_mart
view smart_toolShed/SMART/Java/Python/toolLauncher/RnaFoldLauncher.py @ 0:e0f8dcca02ed
Uploaded S-MART tool. A toolbox manages RNA-Seq and ChIP-Seq data.
author | yufei-luo |
---|---|
date | Thu, 17 Jan 2013 10:52:14 -0500 |
parents | |
children |
line wrap: on
line source
# # Copyright INRA-URGI 2009-2010 # # This software is governed by the CeCILL license under French law and # abiding by the rules of distribution of free software. You can use, # modify and/ or redistribute the software under the terms of the CeCILL # license as circulated by CEA, CNRS and INRIA at the following URL # "http://www.cecill.info". # # As a counterpart to the access to the source code and rights to copy, # modify and redistribute granted by the license, users are provided only # with a limited warranty and the software's author, the holder of the # economic rights, and the successive licensors have only limited # liability. # # In this respect, the user's attention is drawn to the risks associated # with loading, using, modifying and/or developing or reproducing the # software by the user in light of its specific status of free software, # that may mean that it is complicated to manipulate, and that also # therefore means that it is reserved for developers and experienced # professionals having in-depth computer knowledge. Users are therefore # encouraged to load and test the software's suitability as regards their # requirements in conditions enabling the security of their systems and/or # data to be ensured and, more generally, to use and operate it in the # same conditions as regards security. # # The fact that you are presently reading this means that you have had # knowledge of the CeCILL license and that you accept its terms. # import os import sys import random import subprocess from SMART.Java.Python.structure.TranscriptList import TranscriptList from SMART.Java.Python.structure.Transcript import Transcript from SMART.Java.Python.misc.Progress import Progress from commons.core.parsing.FastaParser import FastaParser class RnaFoldStructure(object): """ A structure to store the output of RNAFold @ivar name: the name of the sequence @type name: string @ivar sequence: the sequence (with gaps) @type sequence: string @ivar structure: the bracket structure @type structure: string @ivar energy: the energy of the fold @type energy: float @ivar interactions: the interactions inside the structure @ivar interactions: the interactions inside the structure """ def __init__(self, name, sequence, structure, energy): """ Initialize the structure @param name the name of the sequence @type name: string @param sequence: the sequence (with gaps) @type sequence: string @param structure: the bracket structure @type structure: string @param energy: the energy of the fold @type energy: float """ self.name = name self.sequence = sequence self.structure = structure self.energy = energy self.interactions = None def analyze(self): """ Analyze the output, assign the interactions """ if len(self.sequence) != len(self.structure): sys.exit("Sizes of sequence and structure differ ('%s' and '%s')!\n" % (self.sequence, self.structure)) stack = [] self.interactions = [None for i in range(len(self.sequence))] for i in range(len(self.sequence)): if self.structure[i] == "(": stack.append(i) elif self.structure[i] == ")": if not stack: sys.exit("Something wrong in the interaction line '%s'!\n" % (self.structure)) otherI = stack.pop() self.interactions[i] = otherI self.interactions[otherI] = i if stack: sys.exit("Something wrong in the interaction line '%s'!\n" % (self.structure)) def getNbBulges(self, start = None, end = None): """ Get the number of insertions in a given range (in number of letters) @param start: where to start the count @type start: int @param end: where to end the co @type end: int """ if start == None: start = 0 if end == None: end = len(self.sequence) previousInt = None nbBulges = 0 inRegion = False for i in range(len(self.sequence)): if i == start: inRegion = True if i > end: return nbBulges if inRegion: if self.interactions[i] == None: nbBulges += 1 elif previousInt != None and abs(self.interactions[i] - previousInt) != 1: nbBulges += 1 previousInt = self.interactions[i] return nbBulges def getStar(self, start = None, end = None): """ Get the supposed miRNA* @param start: where to start the count @type start: int @param end: where to end the co @type end: int """ if start == None: start = 0 if end == None: end = len(self.sequence) minStar = 1000000 maxStar = 0 inRegion = False for i in range(len(self.sequence)): if i == start: inRegion = True if i > end: return (minStar, maxStar) if inRegion: if self.interactions[i] != None: minStar = min(minStar, self.interactions[i]) maxStar = max(maxStar, self.interactions[i]) return (minStar, maxStar) class RnaFoldLauncher(object): """ Start and parse a RNAFold instance @ivar inputTranscriptList: a list of transcripts @type inputTranscriptList: class L{TranscriptList<TranscriptList>} @ivar genomeFileParser: a parser to the genome file @type genomeFileParser: class L{SequenceListParser<SequenceListParser>} @ivar bothStrands: whether folding is done on both strands @type bothStrands: bool @ivar fivePrimeExtension: extension towards the 5' end @type fivePrimeExtension: int @ivar threePrimeExtension: extension towards the 3' end @type threePrimeExtension: int @ivar inputTranscriptList: the input list of transcripts @type inputTranscriptList: class L{TranscriptList<TranscriptList>} @ivar outputTranscriptList: the output list of transcripts @type outputTranscriptList: class L{TranscriptList<TranscriptList>} @ivar tmpInputFileName: the name of the temporary input file for RNAFold @type tmpInputFileName: string @ivar tmpOutputFileName: the name of the temporary output file for RNAFold @type tmpOutputFileName: string @ivar verbosity: verbosity @type verbosity: int [default: 0] """ def __init__(self, verbosity = 0): """ Constructor @param verbosity: verbosity @type verbosity: int """ self.verbosity = verbosity self.transcriptNames = [] randomNumber = random.randint(0, 100000) self.bothStrands = True self.tmpInputFileName = "tmpInput_%d.fas" % (randomNumber) self.tmpOutputFileName = "tmpOutput_%d.fas" % (randomNumber) self.outputTranscriptList = None self.fivePrimeExtension = 0 self.threePrimeExtension = 0 def __del__(self): for file in (self.tmpInputFileName, self.tmpOutputFileName): if os.path.isfile(file): os.remove(file) def setTranscriptList(self, inputTranscriptList): """ Set the list of transcripts @ivar inputTranscriptList: a list of transcripts @type inputTranscriptList: class L{TranscriptList<TranscriptList>} """ self.inputTranscriptList = inputTranscriptList def setExtensions(self, fivePrime, threePrime): """ Set extension sizes @ivar fivePrime: extension towards the 5' end @type fivePrime: int @ivar threePrime: extension towards the 3' end @type threePrime: int """ self.fivePrimeExtension = fivePrime self.threePrimeExtension = threePrime def setNbStrands(self, nbStrands): """ Set number of strands @ivar nbStrands: if 2, the work is done on both strands @type nbStrands: int """ self.nbStrands = nbStrands def setGenomeFile(self, fileName): """ Set the genome file @ivar genomeFileName: the multi-FASTA file containing the genome @type genomeFileName: a string """ self.genomeFileParser = FastaParser(fileName, self.verbosity) def writeInputFile(self, transcript, reverse, fivePrimeExtension, threePrimeExtension): """ Write the RNAFold input file, containing the sequence corresponding to the transcript @ivar transcript: a transcript @type transcript: class L{Transcript<Transcript>} @ivar reverse: invert the extensions @type reverse: bool """ transcriptCopy = Transcript(transcript) transcriptCopy.removeExons() if not reverse: transcriptCopy.extendStart(fivePrimeExtension) transcriptCopy.extendEnd(threePrimeExtension) else: transcriptCopy.extendStart(threePrimeExtension) transcriptCopy.extendEnd(fivePrimeExtension) sequence = transcriptCopy.extractSequence(self.genomeFileParser) handle = open(self.tmpInputFileName, "w") handle.write(">%s\n%s\n" % (sequence.getName().replace(":", "_").replace(".", "_"), sequence.getSequence())) handle.close() def startRnaFold(self): """ Start RNAFold """ command = "RNAfold < %s > %s" % (self.tmpInputFileName, self.tmpOutputFileName) if self.verbosity > 100: print "Launching command '%s'" % (command) status = subprocess.call(command, shell=True) if status != 0: raise Exception("Problem with RNAFold! Input file is %s, status is: %s" % (self.tmpInputFileName, status)) def parseRnaFoldOutput(self): """ Parse an output file of RNAFold @return: an RnaFoldStructure """ lines = open(self.tmpOutputFileName).readlines() if len(lines) != 3: raise Exception("Problem in RNAfold output! '%s'" % (lines)) name = lines[0].strip()[1:].split()[0] sequence = lines[1].strip() structure = lines[2].strip().split()[0] energy = float(lines[2].strip().split(" ", 1)[1][1:-1].strip()) if self.verbosity > 100: print "Getting sequence %s, structure %s" % (sequence, structure) return RnaFoldStructure(name, sequence, structure, energy) def analyzeRnaFoldOutput(self, transcript, rnaFoldOutput, reverse, fivePrimeExtension, threePrimeExtension): """ Analyze the RNAFold @ivar transcript: a transcript @type transcript: class L{Transcript<Transcript>} @ivar rnaFoldOutput: the output of RNAFold @type rnaFoldOutput: class L{RnaFoldStructure<RnaFoldStructure>} @ivar reverse: invert the extensions @type reverse: bool @return: a t-uple of energy, number of insertions, number of bulges, strand """ rnaFoldOutput.analyze() transcriptSize = transcript.end - transcript.start + 1 start = fivePrimeExtension if not reverse else threePrimeExtension end = start + transcriptSize energy = rnaFoldOutput.energy nbBulges = rnaFoldOutput.getNbBulges(start, end) (minStar, maxStar) = rnaFoldOutput.getStar(start, end) minStar += transcript.start - start maxStar += transcript.start - start if self.verbosity > 100: print "Getting structure with energy %d, nbBulges %d, miRna* %d-%d, strand %s" % (energy, nbBulges, minStar, maxStar, "-" if reverse else "+") return (energy, nbBulges, minStar, maxStar, reverse) def fold(self, transcript): """ Fold a transcript (in each strand) @ivar transcript: a transcript @type transcript: class L{Transcript<Transcript>} @return: a t-uple of energy, number of insertions, number of bulges, strand """ results = [None] * self.nbStrands strands = [False, True] if self.nbStrands == 2 else [False] minNbBulges = 1000000 for i, reverse in enumerate(strands): self.writeInputFile(transcript, reverse, self.fivePrimeExtension, self.threePrimeExtension) self.startRnaFold() output = self.parseRnaFoldOutput() results[i] = self.analyzeRnaFoldOutput(transcript, output, reverse, self.fivePrimeExtension, self.threePrimeExtension) minNbBulges = min(minNbBulges, results[i][1]) for result in results: if result[1] == minNbBulges: return result return None def refold(self, transcript): """ Fold a transcript, knowing where the miRNA starts and end @ivar transcript: a transcript @type transcript: class L{Transcript<Transcript>} @return: the energy """ miStar = transcript.getTagValue("miRnaStar") startMiStar = int(miStar.split("-")[0]) endMiStart = int(miStar.split("-")[1]) fivePrimeExtension = max(0, transcript.start - startMiStar) + 5 threePrimeExtension = max(0, endMiStart - transcript.end) + 5 self.writeInputFile(transcript, False, fivePrimeExtension, threePrimeExtension) self.startRnaFold() output = self.parseRnaFoldOutput() result = self.analyzeRnaFoldOutput(transcript, output, False, fivePrimeExtension, threePrimeExtension) return result[0] def computeResults(self): """ Fold all and fill an output transcript list with the values """ progress = Progress(self.inputTranscriptList.getNbTranscripts(), "Handling transcripts", self.verbosity) self.outputTranscriptList = TranscriptList() for transcript in self.inputTranscriptList.getIterator(): result = self.fold(transcript) transcript.setTagValue("nbBulges", result[1]) transcript.setTagValue("miRnaStar", "%d-%d" % (result[2], result[3])) transcript.setTagValue("miRNAstrand", result[4]) transcript.setTagValue("energy", self.refold(transcript)) self.outputTranscriptList.addTranscript(transcript) progress.inc() progress.done() def getResults(self): """ Get an output transcript list with the values """ if self.outputTranscriptList == None: self.computeResults() return self.outputTranscriptList