Mercurial > repos > yufei-luo > s_mart
diff commons/launcher/LaunchRefAlign.py @ 31:0ab839023fe4
Uploaded
author | m-zytnicki |
---|---|
date | Tue, 30 Apr 2013 14:33:21 -0400 |
parents | 94ab73e8a190 |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/launcher/LaunchRefAlign.py Tue Apr 30 14:33:21 2013 -0400 @@ -0,0 +1,208 @@ +#!/usr/bin/env python + +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + +from commons.core.LoggerFactory import LoggerFactory +from commons.core.utils.RepetOptionParser import RepetOptionParser +from commons.core.checker.ConfigChecker import ConfigRules +from commons.core.checker.ConfigChecker import ConfigChecker +import subprocess +import os +from commons.core.seq.Bioseq import Bioseq + +LOG_DEPTH = "repet.core.launchers" + +from commons.core.seq.BioseqDB import BioseqDB +from commons.tools.ChangeSequenceHeaders import ChangeSequenceHeaders + + +class LaunchRefAlign(object): + """ + Launch 'refalign' to build a master-slave multiple sequence alignment. + """ + def __init__(self, inputFileName="", outFileName="", gapSize=10, match=10, mismatch=8, gapOpen=16, gapExtend=4, refseqName="", keepRefseq =False, verbosity=3 ): + self.inputFileName = inputFileName + self.outFileName=outFileName + self.gapSize = gapSize + self.match = match + self.mismatch = mismatch + self.gapOpen = gapOpen + self.gapExtend = gapExtend + self.gapExtend = gapExtend + self.refseqName = refseqName + self.keepRefseq = keepRefseq + self._verbosity = verbosity + self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity) + + def setAttributesFromCmdLine(self): + description = "usage: LaunchRefalign.py [ options ]" + epilog = "\n -h: this help\n" + epilog += "\t -i: name of the input file (refseq is first, format='fasta')" + epilog += "\t -r: keep the reference sequence" + epilog += "\t -o: name of the output file (default=inFileName+'.fa_aln')" + epilog += "\t -v: verbosity (default=0)" + epilog += "\n\t" + parser = RepetOptionParser(description = description, epilog = epilog) + parser.add_option("-i", "--fasta", dest = "inputFileName", action = "store", type = "string", help = "input fasta file name [compulsory] [format: fasta]", default = "") + parser.add_option("-o", "--out", dest = "outFileName", action = "store", type = "string", help = "output file name [default: <input>.out]", default = "") + parser.add_option("-r", "--keepRefseq", dest = "keepRefseq", action = "store_true", help = "keep reference sequence [optional] [default: False]", default = False) + parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", help = "verbosity [optional] [default: 1]", default = 1) + options = parser.parse_args()[0] + self._setAttributesFromOptions(options) + + def _setAttributesFromOptions(self, options): + self.inputFileName = options.inputFileName + self.setOutFileName = options.outFileName + self.keepRefseq = options.keepRefseq + self._verbosity = options.verbosity + + def _checkOptions(self): + if self.inputFileName == "": + self._logAndRaise("ERROR: Missing input file name") + + if self.outFileName == "": + self.outFileName = "%s.fa_aln" % (self.inputFileName) + + def _logAndRaise(self, errorMsg): + self._log.error(errorMsg) + raise Exception(errorMsg) + + def _prepareRefAlign(self): + self.shortInputFileName = self.inputFileName+".shortH" + self.refFileName= self.shortInputFileName + ".ref" + self.cpyFileName=self.shortInputFileName + ".cpy" + + file_db = open(self.shortInputFileName) + file_ref = open(self.refFileName,"w") + file_cpy = open(self.cpyFileName,"w") + + self._numseq=0 + while 1: + seq=Bioseq() + seq.read(file_db) + if seq.sequence==None: + break + self._numseq+=1 + if self._numseq==1: + seq.write(file_ref) + else: + seq.write(file_cpy) + file_db.close() + file_ref.close() + file_cpy.close() + + def _shortenHeaders(self): + self.csh = ChangeSequenceHeaders() + self.csh.setInputFile(self.inputFileName) + self.csh.setFormat("fasta") + self.csh.setStep(1) + self.csh.setPrefix("seq") + self.csh.setLinkFile(self.inputFileName+".shortHlink") + self.csh.setOutputFile(self.inputFileName+".shortH") + self.csh.setVerbosityLevel(self._verbosity-1) + self.csh.run() + + bsDB = BioseqDB(self.inputFileName+".shortH") + bsDB.upCase() + bsDB.save(self.inputFileName+".shortHtmp") + del bsDB + os.rename(self.inputFileName+".shortHtmp", self.inputFileName+".shortH") + + def _renameHeaders(self): + self.csh.setInputFile(self.inputFileName+".shortH.fa_aln") + self.csh.setFormat("fasta") + self.csh.setStep(2) + self.csh.setLinkFile(self.inputFileName+".shortHlink" ) + self.csh.setOutputFile(self.outFileName) + self.csh.setVerbosityLevel(self._verbosity-1) + self.csh.run() + + def run(self): + LoggerFactory.setLevel(self._log, self._verbosity) + self._checkOptions() + self._log.info("START LaunchRefAlign") + self._log.debug("building a multiple alignment from '%s'..." % ( self.inputFileName)) + + inputFileName = "%s/%s" % (os.getcwd(), os.path.basename(self.inputFileName)) + if not os.path.exists(inputFileName): + os.symlink(self.inputFileName, inputFileName) + self.inputFileName = inputFileName + + self._shortenHeaders() + if self.keepRefseq: + self.refseqName="seq1" + self._prepareRefAlign() + + if self._numseq > 1: + cmd = "refalign %s %s -m %d -l %d -d %d -g %d -e %d" % (self.refFileName, self.cpyFileName, self.match, self.gapSize, self.mismatch, self.gapOpen, self.gapExtend) + + process = subprocess.Popen(cmd.split(' '), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + self._log.debug("Running : %s" % cmd) + output = process.communicate() + self._log.debug("Output:\n%s" % output[0]) + if process.returncode != 0: + self._logAndRaise("ERROR when launching '%s'" % cmd) + refseqNameParam = "" + if self.refseqName != "": + refseqNameParam = "-r %s " % (self.refseqName) + outFileName = self.inputFileName+".shortH.fa_aln" + #self.cpyFileName = os.path.join(os.getcwd(),os.path.basename(self.cpyFileName)) + + self._log.info("Copy file path %s " % self.cpyFileName) + print("Copy file path %s " % self.cpyFileName) + cmd = "refalign2fasta.py -i %s.aligner %s-g d -o %s -v 1" % (self.cpyFileName, refseqNameParam, outFileName) + self._log.debug("Running : %s" % cmd) + process = subprocess.Popen(cmd.split(' '), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + output = process.communicate() + self._log.debug("Output:\n%s" % output[0]) + + if process.returncode != 0: + self._logAndRaise("ERROR when launching '%s'" % cmd) + + cmd = "rm -f "+ self.refFileName + " " + self.cpyFileName + " " + self.cpyFileName + ".aligner " + self.cpyFileName + ".oriented " + self.cpyFileName + ".refalign.stat" + os.system(cmd) + + else: + self._logAndRaise("Only one sequence available") + cmd = "echo empty" + + self._renameHeaders() + + for fileName in [self.inputFileName + ".shortH", self.inputFileName + ".shortHlink", self.inputFileName + ".shortH.fa_aln"]: + os.remove(fileName) + self._log.info("END LaunchRefAlign") + return 0 + + +if __name__ == "__main__": + iLaunchRefAlign = LaunchRefAlign() + iLaunchRefAlign.setAttributesFromCmdLine() + iLaunchRefAlign.run()