18
|
1 #!/usr/bin/env python
|
|
2
|
|
3 # Copyright INRA (Institut National de la Recherche Agronomique)
|
|
4 # http://www.inra.fr
|
|
5 # http://urgi.versailles.inra.fr
|
|
6 #
|
|
7 # This software is governed by the CeCILL license under French law and
|
|
8 # abiding by the rules of distribution of free software. You can use,
|
|
9 # modify and/ or redistribute the software under the terms of the CeCILL
|
|
10 # license as circulated by CEA, CNRS and INRIA at the following URL
|
|
11 # "http://www.cecill.info".
|
|
12 #
|
|
13 # As a counterpart to the access to the source code and rights to copy,
|
|
14 # modify and redistribute granted by the license, users are provided only
|
|
15 # with a limited warranty and the software's author, the holder of the
|
|
16 # economic rights, and the successive licensors have only limited
|
|
17 # liability.
|
|
18 #
|
|
19 # In this respect, the user's attention is drawn to the risks associated
|
|
20 # with loading, using, modifying and/or developing or reproducing the
|
|
21 # software by the user in light of its specific status of free software,
|
|
22 # that may mean that it is complicated to manipulate, and that also
|
|
23 # therefore means that it is reserved for developers and experienced
|
|
24 # professionals having in-depth computer knowledge. Users are therefore
|
|
25 # encouraged to load and test the software's suitability as regards their
|
|
26 # requirements in conditions enabling the security of their systems and/or
|
|
27 # data to be ensured and, more generally, to use and operate it in the
|
|
28 # same conditions as regards security.
|
|
29 #
|
|
30 # The fact that you are presently reading this means that you have had
|
|
31 # knowledge of the CeCILL license and that you accept its terms.
|
|
32
|
|
33 from commons.core.LoggerFactory import LoggerFactory
|
|
34 from commons.core.utils.RepetOptionParser import RepetOptionParser
|
|
35 from commons.core.checker.ConfigChecker import ConfigRules
|
|
36 from commons.core.checker.ConfigChecker import ConfigChecker
|
|
37 import subprocess
|
|
38 import os
|
|
39 from commons.core.seq.Bioseq import Bioseq
|
|
40
|
|
41 LOG_DEPTH = "repet.core.launchers"
|
|
42
|
|
43 from commons.core.seq.BioseqDB import BioseqDB
|
|
44 from commons.tools.ChangeSequenceHeaders import ChangeSequenceHeaders
|
|
45
|
|
46
|
|
47 class LaunchRefAlign(object):
|
|
48 """
|
|
49 Launch 'refalign' to build a master-slave multiple sequence alignment.
|
|
50 """
|
|
51 def __init__(self, inputFileName="", outFileName="", gapSize=10, match=10, mismatch=8, gapOpen=16, gapExtend=4, refseqName="", keepRefseq =False, verbosity=3 ):
|
|
52 self.inputFileName = inputFileName
|
|
53 self.outFileName=outFileName
|
|
54 self.gapSize = gapSize
|
|
55 self.match = match
|
|
56 self.mismatch = mismatch
|
|
57 self.gapOpen = gapOpen
|
|
58 self.gapExtend = gapExtend
|
|
59 self.gapExtend = gapExtend
|
|
60 self.refseqName = refseqName
|
|
61 self.keepRefseq = keepRefseq
|
|
62 self._verbosity = verbosity
|
|
63 self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)
|
|
64
|
|
65 def setAttributesFromCmdLine(self):
|
|
66 description = "usage: LaunchRefalign.py [ options ]"
|
|
67 epilog = "\n -h: this help\n"
|
|
68 epilog += "\t -i: name of the input file (refseq is first, format='fasta')"
|
|
69 epilog += "\t -r: keep the reference sequence"
|
|
70 epilog += "\t -o: name of the output file (default=inFileName+'.fa_aln')"
|
|
71 epilog += "\t -v: verbosity (default=0)"
|
|
72 epilog += "\n\t"
|
|
73 parser = RepetOptionParser(description = description, epilog = epilog)
|
|
74 parser.add_option("-i", "--fasta", dest = "inputFileName", action = "store", type = "string", help = "input fasta file name [compulsory] [format: fasta]", default = "")
|
|
75 parser.add_option("-o", "--out", dest = "outFileName", action = "store", type = "string", help = "output file name [default: <input>.out]", default = "")
|
|
76 parser.add_option("-r", "--keepRefseq", dest = "keepRefseq", action = "store_true", help = "keep reference sequence [optional] [default: False]", default = False)
|
|
77 parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", help = "verbosity [optional] [default: 1]", default = 1)
|
|
78 options = parser.parse_args()[0]
|
|
79 self._setAttributesFromOptions(options)
|
|
80
|
|
81 def _setAttributesFromOptions(self, options):
|
|
82 self.inputFileName = options.inputFileName
|
|
83 self.setOutFileName = options.outFileName
|
|
84 self.keepRefseq = options.keepRefseq
|
|
85 self._verbosity = options.verbosity
|
|
86
|
|
87 def _checkOptions(self):
|
|
88 if self.inputFileName == "":
|
|
89 self._logAndRaise("ERROR: Missing input file name")
|
|
90
|
|
91 if self.outFileName == "":
|
|
92 self.outFileName = "%s.fa_aln" % (self.inputFileName)
|
|
93
|
|
94 def _logAndRaise(self, errorMsg):
|
|
95 self._log.error(errorMsg)
|
|
96 raise Exception(errorMsg)
|
|
97
|
|
98 def _prepareRefAlign(self):
|
|
99 self.shortInputFileName = self.inputFileName+".shortH"
|
|
100 self.refFileName= self.shortInputFileName + ".ref"
|
|
101 self.cpyFileName=self.shortInputFileName + ".cpy"
|
|
102
|
|
103 file_db = open(self.shortInputFileName)
|
|
104 file_ref = open(self.refFileName,"w")
|
|
105 file_cpy = open(self.cpyFileName,"w")
|
|
106
|
|
107 self._numseq=0
|
|
108 while 1:
|
|
109 seq=Bioseq()
|
|
110 seq.read(file_db)
|
|
111 if seq.sequence==None:
|
|
112 break
|
|
113 self._numseq+=1
|
|
114 if self._numseq==1:
|
|
115 seq.write(file_ref)
|
|
116 else:
|
|
117 seq.write(file_cpy)
|
|
118 file_db.close()
|
|
119 file_ref.close()
|
|
120 file_cpy.close()
|
|
121
|
|
122 def _shortenHeaders(self):
|
|
123 self.csh = ChangeSequenceHeaders()
|
|
124 self.csh.setInputFile(self.inputFileName)
|
|
125 self.csh.setFormat("fasta")
|
|
126 self.csh.setStep(1)
|
|
127 self.csh.setPrefix("seq")
|
|
128 self.csh.setLinkFile(self.inputFileName+".shortHlink")
|
|
129 self.csh.setOutputFile(self.inputFileName+".shortH")
|
|
130 self.csh.setVerbosityLevel(self._verbosity-1)
|
|
131 self.csh.run()
|
|
132
|
|
133 bsDB = BioseqDB(self.inputFileName+".shortH")
|
|
134 bsDB.upCase()
|
|
135 bsDB.save(self.inputFileName+".shortHtmp")
|
|
136 del bsDB
|
|
137 os.rename(self.inputFileName+".shortHtmp", self.inputFileName+".shortH")
|
|
138
|
|
139 def _renameHeaders(self):
|
|
140 self.csh.setInputFile(self.inputFileName+".shortH.fa_aln")
|
|
141 self.csh.setFormat("fasta")
|
|
142 self.csh.setStep(2)
|
|
143 self.csh.setLinkFile(self.inputFileName+".shortHlink" )
|
|
144 self.csh.setOutputFile(self.outFileName)
|
|
145 self.csh.setVerbosityLevel(self._verbosity-1)
|
|
146 self.csh.run()
|
|
147
|
|
148 def run(self):
|
|
149 LoggerFactory.setLevel(self._log, self._verbosity)
|
|
150 self._checkOptions()
|
|
151 self._log.info("START LaunchRefAlign")
|
|
152 self._log.debug("building a multiple alignment from '%s'..." % ( self.inputFileName))
|
|
153
|
|
154 inputFileName = "%s/%s" % (os.getcwd(), os.path.basename(self.inputFileName))
|
|
155 if not os.path.exists(inputFileName):
|
|
156 os.symlink(self.inputFileName, inputFileName)
|
|
157 self.inputFileName = inputFileName
|
|
158
|
|
159 self._shortenHeaders()
|
|
160 if self.keepRefseq:
|
|
161 self.refseqName="seq1"
|
|
162 self._prepareRefAlign()
|
|
163
|
|
164 if self._numseq > 1:
|
|
165 cmd = "refalign %s %s -m %d -l %d -d %d -g %d -e %d" % (self.refFileName, self.cpyFileName, self.match, self.gapSize, self.mismatch, self.gapOpen, self.gapExtend)
|
|
166
|
|
167 process = subprocess.Popen(cmd.split(' '), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
|
168 self._log.debug("Running : %s" % cmd)
|
|
169 output = process.communicate()
|
|
170 self._log.debug("Output:\n%s" % output[0])
|
|
171 if process.returncode != 0:
|
|
172 self._logAndRaise("ERROR when launching '%s'" % cmd)
|
|
173 refseqNameParam = ""
|
|
174 if self.refseqName != "":
|
|
175 refseqNameParam = "-r %s " % (self.refseqName)
|
|
176 outFileName = self.inputFileName+".shortH.fa_aln"
|
|
177 #self.cpyFileName = os.path.join(os.getcwd(),os.path.basename(self.cpyFileName))
|
|
178
|
|
179 self._log.info("Copy file path %s " % self.cpyFileName)
|
|
180 print("Copy file path %s " % self.cpyFileName)
|
|
181 cmd = "refalign2fasta.py -i %s.aligner %s-g d -o %s -v 1" % (self.cpyFileName, refseqNameParam, outFileName)
|
|
182 self._log.debug("Running : %s" % cmd)
|
|
183 process = subprocess.Popen(cmd.split(' '), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
|
184 output = process.communicate()
|
|
185 self._log.debug("Output:\n%s" % output[0])
|
|
186
|
|
187 if process.returncode != 0:
|
|
188 self._logAndRaise("ERROR when launching '%s'" % cmd)
|
|
189
|
|
190 cmd = "rm -f "+ self.refFileName + " " + self.cpyFileName + " " + self.cpyFileName + ".aligner " + self.cpyFileName + ".oriented " + self.cpyFileName + ".refalign.stat"
|
|
191 os.system(cmd)
|
|
192
|
|
193 else:
|
|
194 self._logAndRaise("Only one sequence available")
|
|
195 cmd = "echo empty"
|
|
196
|
|
197 self._renameHeaders()
|
|
198
|
|
199 for fileName in [self.inputFileName + ".shortH", self.inputFileName + ".shortHlink", self.inputFileName + ".shortH.fa_aln"]:
|
|
200 os.remove(fileName)
|
|
201 self._log.info("END LaunchRefAlign")
|
|
202 return 0
|
|
203
|
|
204
|
|
205 if __name__ == "__main__":
|
|
206 iLaunchRefAlign = LaunchRefAlign()
|
|
207 iLaunchRefAlign.setAttributesFromCmdLine()
|
|
208 iLaunchRefAlign.run()
|