comparison commons/launcher/LaunchMCL.py @ 31:0ab839023fe4

Uploaded
author m-zytnicki
date Tue, 30 Apr 2013 14:33:21 -0400
parents 94ab73e8a190
children
comparison
equal deleted inserted replaced
30:5677346472b5 31:0ab839023fe4
1 #!/usr/bin/env python
2
3 # Copyright INRA (Institut National de la Recherche Agronomique)
4 # http://www.inra.fr
5 # http://urgi.versailles.inra.fr
6 #
7 # This software is governed by the CeCILL license under French law and
8 # abiding by the rules of distribution of free software. You can use,
9 # modify and/ or redistribute the software under the terms of the CeCILL
10 # license as circulated by CEA, CNRS and INRIA at the following URL
11 # "http://www.cecill.info".
12 #
13 # As a counterpart to the access to the source code and rights to copy,
14 # modify and redistribute granted by the license, users are provided only
15 # with a limited warranty and the software's author, the holder of the
16 # economic rights, and the successive licensors have only limited
17 # liability.
18 #
19 # In this respect, the user's attention is drawn to the risks associated
20 # with loading, using, modifying and/or developing or reproducing the
21 # software by the user in light of its specific status of free software,
22 # that may mean that it is complicated to manipulate, and that also
23 # therefore means that it is reserved for developers and experienced
24 # professionals having in-depth computer knowledge. Users are therefore
25 # encouraged to load and test the software's suitability as regards their
26 # requirements in conditions enabling the security of their systems and/or
27 # data to be ensured and, more generally, to use and operate it in the
28 # same conditions as regards security.
29 #
30 # The fact that you are presently reading this means that you have had
31 # knowledge of the CeCILL license and that you accept its terms.
32
33 from commons.core.LoggerFactory import LoggerFactory
34 from commons.core.utils.RepetOptionParser import RepetOptionParser
35 from commons.core.seq.FastaUtils import FastaUtils
36 from commons.core.coord.MatchUtils import MatchUtils
37 import subprocess
38 import os
39 import time
40 import shutil
41 from commons.tools.ChangeSequenceHeaders import ChangeSequenceHeaders
42
43 LOG_DEPTH = "repet.base"
44
45 ##Launch MCL
46 #
47 class LaunchMCL(object):
48
49 def __init__(self, fastaFileName = "", outFilePrefix = "", inflate = 1.5, covThres = 0.0, isJoined = False, isCluster2Map = False, isClusterConsensusHeaders = False, doClean = False, verbosity = 0):
50 self._fastaFileName = fastaFileName
51 self.setOutFilePrefix(outFilePrefix)
52 self._inflate = inflate
53 self._coverageThreshold = covThres
54 self._isJoined = isJoined
55 self._isCluster2Map = isCluster2Map
56 self._isClusterConsensusHeaders = isClusterConsensusHeaders
57 self._doClean = doClean
58 self._verbosity = verbosity
59 self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)
60
61 def setAttributesFromCmdLine(self):
62 description = "Launch MCL clustering program."
63 epilog = "\nExample: launch without verbosity and keep temporary files.\n"
64 epilog += "\t$ python LaunchMCL.py -i file.fa -v 0\n"
65 parser = RepetOptionParser(description = description, epilog = epilog)
66 parser.add_option("-i", "--fasta", dest = "fastaFileName", action = "store", type = "string", help = "input fasta file name [compulsory] [format: fasta]", default = "")
67 parser.add_option("-o", "--out", dest = "outFilePrefix", action = "store", type = "string", help = "prefix of the output files [default=input fasta file name]", default = "")
68 parser.add_option("-I", "--inflate", dest = "inflate", action = "store", type = "float", help = "inflate parameter of MCL [optional] [default: 1.5]", default = 1.5)
69 parser.add_option("-T", "--coverage", dest = "coverageThreshold", action = "store", type = "float", help = "length coverage threshold (default=0.0, 0.0 <= value <= 1.0)", default = 0.0)
70 parser.add_option("-j", "--join", dest = "isJoined", action = "store_true", help = "join hits after alignement [optional] [default: False]" , default = False)
71 parser.add_option("-m", "--map", dest = "isCluster2Map", action = "store_true", help = "generate an additional output file in map format (Warning: only works if MCL's fasta input headers are formated like LTRharvest fasta output)", default = False)
72 parser.add_option("", "--isClusterConsensusHeaders", dest = "isClusterConsensusHeaders", action="store_true", help = "format headers for Cluster Consensus tool", default = False)
73 parser.add_option("-c", "--clean", dest = "doClean", action = "store_true", help = "clean temporary files [optional] [default: False]", default = False)
74 parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", help = "verbosity [optional] [default: 1]", default = 1)
75 options = parser.parse_args()[0]
76 self._setAttributesFromOptions(options)
77
78 def _setAttributesFromOptions(self, options):
79 self.setFastaFileName(options.fastaFileName)
80 self.setOutFilePrefix(options.outFilePrefix)
81 self.setInflate(options.inflate)
82 self.setCoverageThreshold(options.coverageThreshold)
83 self.setIsJoined(options.isJoined)
84 self.setIsCluster2Map(options.isCluster2Map)
85 self.setIsClusterConsensusHeaders(options.isClusterConsensusHeaders)
86 self.setDoClean(options.doClean)
87 self.setVerbosity(options.verbosity)
88
89 def setFastaFileName(self, fastaFileName):
90 self._fastaFileName = fastaFileName
91
92 def setOutFilePrefix(self, outFilePrefix):
93 if outFilePrefix == "":
94 self._outFilePrefix = os.path.splitext(self._fastaFileName)[0]
95 else:
96 self._outFilePrefix = outFilePrefix
97
98 def setInflate(self, inflate):
99 self._inflate = inflate
100
101 def setCoverageThreshold(self, covThres):
102 self._coverageThreshold = float(covThres)
103
104 def setIsJoined(self, isJoined):
105 self._isJoined = isJoined
106
107 def setDoClean(self, doClean):
108 self._doClean = doClean
109
110 def setIsCluster2Map(self, isCluster2Map):
111 self._isCluster2Map = isCluster2Map
112
113 def setIsClusterConsensusHeaders(self, isClusterConsensusHeaders):
114 self._isClusterConsensusHeaders = isClusterConsensusHeaders
115
116 def setVerbosity(self, verbosity):
117 self._verbosity = verbosity
118
119 def _checkOptions(self):
120 if self._fastaFileName == "":
121 self._logAndRaise("ERROR: Missing input fasta file name")
122 if self._isCluster2Map and self._isClusterConsensusHeaders:
123 self._logAndRaise("ERROR: You can't use both '--isClusterConsensusHeaders' and '-m' options")
124 if self._coverageThreshold > 1 or self._coverageThreshold < 0:
125 self._logAndRaise("ERROR: Coverage Threshold must be in [0.0 , 1.0]")
126
127 def _logAndRaise(self, errorMsg):
128 self._log.error(errorMsg)
129 raise Exception(errorMsg)
130
131 def run(self):
132 LoggerFactory.setLevel(self._log, self._verbosity)
133 self._checkOptions()
134 self._log.info("START Launch MCL")
135 self._log.debug("With parameters: -i %s -o %s -I %.2f -T %.2f -j %r -m %r -clusterHeaders %r " % (self._fastaFileName, self._outFilePrefix , self._inflate , self._coverageThreshold, self._isJoined, self._isCluster2Map, self._isClusterConsensusHeaders))
136 #self._log.debug("With parameters: -i %s -o %s -I %.2f -T %.2f" % (self._fastaFileName, self._outFilePrefix , self._inflate , self._coverageThreshold))
137 self._log.debug("Fasta file name: %s" % self._fastaFileName)
138 workingDir = "MCLtmpDirectory"
139 if os.path.exists(workingDir):
140 self._logAndRaise("ERROR: %s already exists." % workingDir)
141 os.mkdir(workingDir)
142 os.chdir(workingDir)
143 linkToFastaFile = "%s2.fa" % os.path.splitext(self._fastaFileName)[0]
144 os.symlink("../%s" % self._fastaFileName, self._fastaFileName)
145 fastaFileNameShorten = "%s.shortH" % self._fastaFileName
146 iChangeSequenceHeaders = ChangeSequenceHeaders(inFile=self._fastaFileName, format="fasta", step=1, outFile=fastaFileNameShorten, verbosity=self._verbosity - 1)
147 iChangeSequenceHeaders.run()
148 os.symlink(fastaFileNameShorten, linkToFastaFile)
149
150 self._log.info("START Blaster-Matcher (%s)" % time.strftime("%Y-%m-%d %H:%M:%S"))
151 cmd = "LaunchBlaster.py"
152 cmd += " -q %s" % fastaFileNameShorten
153 cmd += " -s %s" % linkToFastaFile
154 cmd += " -a"
155 cmd += " 1>&2 >> blasterMatcher.log"
156 process = subprocess.Popen(cmd, shell = True)
157 self._log.debug("Running : %s" % cmd)
158 process.communicate()
159 if process.returncode != 0:
160 self._logAndRaise("ERROR when launching '%s'" % cmd)
161 outBlasterFileName = "%s.align" % fastaFileNameShorten
162
163 cmd = "matcher"
164 cmd += " -m %s" % outBlasterFileName
165 cmd += " -q %s" % fastaFileNameShorten
166 cmd += " -s %s" % linkToFastaFile
167 cmd += " -a"
168 if self._isJoined:
169 cmd += " -j"
170 cmd += " 1>&2 >> blasterMatcher.log"
171 process = subprocess.Popen(cmd, shell=True)
172 self._log.debug("Running : %s" % cmd)
173 process.communicate()
174 if process.returncode != 0:
175 self._logAndRaise("ERROR when launching '%s'" % cmd)
176 self._log.info("END Blaster-Matcher (%s)" % time.strftime("%Y-%m-%d %H:%M:%S"))
177
178 outMatcherFileName = "%s.match.tab" % outBlasterFileName
179 inputABCFileName = "%s.shortH.abc" % os.path.splitext(fastaFileNameShorten)[0]
180 MatchUtils.convertMatchFileIntoABCFileOnQueryCoverage(outMatcherFileName, inputABCFileName, coverage = self._coverageThreshold)
181 outMCLPreprocessFileName = "MCLPreprocess.out"
182
183 self._log.info("START MCL (%s)" % time.strftime("%Y-%m-%d %H:%M:%S"))
184 cmd = "mcxload"
185 cmd += " -abc %s" % inputABCFileName
186 cmd += " --stream-mirror"
187 cmd += " --stream-neg-log10"
188 cmd += " -stream-tf 'ceil(200)'"
189 cmd += " -o %s" % outMCLPreprocessFileName
190 cmd += " -write-tab %s.tab" % outMCLPreprocessFileName
191 cmd += " 1>&2 > MCLpreprocess.log"
192 process = subprocess.Popen(cmd, shell = True)
193 self._log.debug("Running : %s" % cmd)
194 process.communicate()
195 if process.returncode != 0:
196 self._logAndRaise("ERROR when launching '%s'" % cmd)
197
198 outMCLFileName = "out.shortH.mcl"
199 cmd = "mcl"
200 cmd += " %s" % outMCLPreprocessFileName
201 cmd += " -I %s" % self._inflate
202 cmd += " -use-tab %s.tab" % outMCLPreprocessFileName
203 cmd += " -o %s" % outMCLFileName
204 cmd += " 1>&2 > MCL.log"
205 process = subprocess.Popen(cmd, shell = True)
206 self._log.debug("Running : %s" % cmd)
207 process.communicate()
208 if process.returncode != 0:
209 self._logAndRaise("ERROR when launching '%s'" % cmd)
210 self._log.info("END MCL (%s)" % time.strftime("%Y-%m-%d %H:%M:%S"))
211
212 outFastaFileNameShorten = "%s.fa" % os.path.splitext(outMCLFileName)[0]
213
214 FastaUtils.convertClusterFileToFastaFile(outMCLFileName, fastaFileNameShorten, outFastaFileNameShorten, "MCL", verbosity = self._verbosity - 1)
215
216 outFastaFileName = "%s_MCL.fa" % self._outFilePrefix
217 linkFileName = "%s.newHlink" % self._fastaFileName
218 headerStyle = "A"
219 if self._isClusterConsensusHeaders:
220 headerStyle = "B"
221 iChangeSequenceHeaders = ChangeSequenceHeaders(inFile=outFastaFileNameShorten, format="fasta", step=2, outFile=outFastaFileName, linkFile=linkFileName, whichCluster = headerStyle, verbosity=self._verbosity - 1)
222 iChangeSequenceHeaders.run()
223
224 if self._isCluster2Map:
225 outMapFileName = "%s_MCL.map" % self._outFilePrefix
226 FastaUtils.convertClusteredFastaFileToMapFile(outFastaFileName, outMapFileName)
227 shutil.move(outMapFileName, "..")
228
229 shutil.move(outFastaFileName, "..")
230 os.chdir("..")
231 if self._doClean:
232 self._log.warning("Working directory will be cleaned")
233 shutil.rmtree(workingDir)
234 self._log.info("END Launch MCL")
235
236 if __name__ == "__main__":
237 iLaunch = LaunchMCL()
238 iLaunch.setAttributesFromCmdLine()
239 iLaunch.run()