18
|
1 #!/usr/bin/env python
|
|
2
|
|
3 """
|
|
4 Launch Blastclust on nucleotide sequences and return a fasta file.
|
|
5 """
|
|
6
|
|
7 # Copyright INRA (Institut National de la Recherche Agronomique)
|
|
8 # http://www.inra.fr
|
|
9 # http://urgi.versailles.inra.fr
|
|
10 #
|
|
11 # This software is governed by the CeCILL license under French law and
|
|
12 # abiding by the rules of distribution of free software. You can use,
|
|
13 # modify and/ or redistribute the software under the terms of the CeCILL
|
|
14 # license as circulated by CEA, CNRS and INRIA at the following URL
|
|
15 # "http://www.cecill.info".
|
|
16 #
|
|
17 # As a counterpart to the access to the source code and rights to copy,
|
|
18 # modify and redistribute granted by the license, users are provided only
|
|
19 # with a limited warranty and the software's author, the holder of the
|
|
20 # economic rights, and the successive licensors have only limited
|
|
21 # liability.
|
|
22 #
|
|
23 # In this respect, the user's attention is drawn to the risks associated
|
|
24 # with loading, using, modifying and/or developing or reproducing the
|
|
25 # software by the user in light of its specific status of free software,
|
|
26 # that may mean that it is complicated to manipulate, and that also
|
|
27 # therefore means that it is reserved for developers and experienced
|
|
28 # professionals having in-depth computer knowledge. Users are therefore
|
|
29 # encouraged to load and test the software's suitability as regards their
|
|
30 # requirements in conditions enabling the security of their systems and/or
|
|
31 # data to be ensured and, more generally, to use and operate it in the
|
|
32 # same conditions as regards security.
|
|
33 #
|
|
34 # The fact that you are presently reading this means that you have had
|
|
35 # knowledge of the CeCILL license and that you accept its terms.
|
|
36
|
|
37 import os
|
|
38 import sys
|
|
39 import subprocess
|
|
40 from commons.core.seq.BioseqDB import BioseqDB
|
|
41 from commons.core.seq.Bioseq import Bioseq
|
|
42 from commons.core.utils.RepetOptionParser import RepetOptionParser
|
|
43 from commons.tools.ChangeSequenceHeaders import ChangeSequenceHeaders
|
|
44
|
|
45 class LaunchBlastclust(object):
|
|
46 """
|
|
47 Launch Blastclust on nucleotide sequences and return a fasta file.
|
|
48 """
|
|
49
|
|
50 def __init__(self, input = "", outFilePrefix = "", clean = False, verbose = 0):
|
|
51 """
|
|
52 Constructor.
|
|
53 """
|
|
54 self._inFileName = input
|
|
55 self._identityThreshold = 95
|
|
56 self._coverageThreshold = 0.9
|
|
57 self._bothSeq = "T"
|
|
58 self._filterUnclusteredSeq = False
|
|
59 self._outFilePrefix = outFilePrefix
|
|
60 self._isBlastToMap = False
|
|
61 self._isHeaderForTEdenovo = False
|
|
62 self._nbCPUs = 1
|
|
63 self._clean = clean
|
|
64 self._verbose = verbose
|
|
65 self._tmpFileName = ""
|
|
66
|
|
67 def setAttributesFromCmdLine(self):
|
|
68 """
|
|
69 Set the attributes from the command-line.
|
|
70 """
|
|
71
|
|
72 description = "Launch Blastclust on nucleotide sequences and return a fasta file."
|
|
73 usage = "LaunchBlastclust.py -i inputFileName [options]"
|
|
74
|
|
75 examples = "\nExample 1: launch Blastclust with default options, highest verbose and clean temporary files.\n"
|
|
76 examples += "\t$ python ./LaunchBlastclust.py -i MyBank.fa -v 2 -c"
|
|
77 examples += "\n\t"
|
|
78 examples += "\t\nExample 2: launch Blastclust with an identity threshold of 90%, rename output files and generate a map file corresponding to the fasta output.\n"
|
|
79 examples += "\t$ python ./LaunchBlastclust.py -i MyBank.fa -S 90 -o SpecialOutputName -m"
|
|
80 examples += "\n\tWARNING: Please refer to -m option limitations in the description above.\n"
|
|
81
|
|
82 #TODO: check if the optionParser can handle '\' into strings for a better code readability in -m option
|
|
83
|
|
84 parser = RepetOptionParser(description = description, usage = usage, version = "v1.0", epilog = examples)
|
|
85 parser.add_option("-i", "--input", dest = "inFileName", type = "string", help = "name of the input fasta file (nucleotides)", default = "")
|
|
86 parser.add_option("-L", "--length", dest = "coverageThreshold", type = "float", help = "length coverage threshold (default=0.9)", default = 0.9)
|
|
87 parser.add_option("-S", "--ident", dest = "identityThreshold", type = "int", help = "identity threshold (default=95)", default = 95)
|
|
88 parser.add_option("-b", "--both", dest = "bothSeq", type = "string", help = "require coverage on both neighbours (default=T/F)", default = "T")
|
|
89 parser.add_option("-f", "--filter", dest = "filterUnclusteredSeq", help = "filter unclustered sequences", default = False, action="store_true")
|
|
90 parser.add_option("-o", "--out", dest = "outFilePrefix", type = "string", help = "prefix of the output files (default=input fasta file name)", default = "")
|
|
91 parser.add_option("-m", "--map", dest = "isBlast2Map", help = "generate an additional output file in map format (Warning: only works if blastclust's fasta input headers are formated like LTRharvest fasta output)", default = False, action="store_true")
|
|
92 parser.add_option("", "--TEdenovoHeader", dest = "isHeaderForTEdenovo", help = "format headers for TEdenovo pipeline", default = False, action="store_true")
|
|
93 parser.add_option("-n", "--num", dest = "nbCPUs", type = "int", help = "number of CPU's to use (default=1)", default = 1)
|
|
94 parser.add_option("-c", "--clean", dest = "clean", help = "clean temporary files", default = False, action="store_true")
|
|
95 parser.add_option("-v", "--verbose", dest = "verbose", type = "int", help = "verbosity level (default=0/1/2)", default = 0)
|
|
96
|
|
97 options = parser.parse_args()[0]
|
|
98 self._setAttributesFromOptions(options)
|
|
99
|
|
100 def _setAttributesFromOptions(self, options):
|
|
101 self.setInputFileName(options.inFileName)
|
|
102 self.setCoverageThreshold(options.coverageThreshold)
|
|
103 self.setIdentityThreshold(options.identityThreshold)
|
|
104 self.setBothSequences(options.bothSeq)
|
|
105 self.setNbCPUs(options.nbCPUs)
|
|
106 self.setIsHeaderForTEdenovo(options.isHeaderForTEdenovo)
|
|
107 if options.filterUnclusteredSeq:
|
|
108 self.setFilterUnclusteredSequences()
|
|
109 if options.outFilePrefix != "":
|
|
110 self.setOutputFilePrefix(options.outFilePrefix)
|
|
111 else:
|
|
112 self._outFilePrefix = self._inFileName
|
|
113 if options.isBlast2Map:
|
|
114 self.setIsBlastToMap()
|
|
115 if options.clean:
|
|
116 self.setClean()
|
|
117 self.setVerbosityLevel(options.verbose)
|
|
118
|
|
119 def setInputFileName(self , inFileName):
|
|
120 self._inFileName = inFileName
|
|
121
|
|
122 def setCoverageThreshold(self, lengthThresh):
|
|
123 self._coverageThreshold = float(lengthThresh)
|
|
124
|
|
125 def setIdentityThreshold(self, identityThresh):
|
|
126 self._identityThreshold = int(identityThresh)
|
|
127
|
|
128 def setBothSequences(self, bothSeq):
|
|
129 self._bothSeq = bothSeq
|
|
130
|
|
131 def setNbCPUs(self, nbCPUs):
|
|
132 self._nbCPUs = int(nbCPUs)
|
|
133
|
|
134 def setFilterUnclusteredSequences(self):
|
|
135 self._filterUnclusteredSeq = True
|
|
136
|
|
137 def setOutputFilePrefix(self, outFilePrefix):
|
|
138 self._outFilePrefix = outFilePrefix
|
|
139
|
|
140 def setIsBlastToMap(self):
|
|
141 self._isBlastToMap = True
|
|
142
|
|
143 def setIsHeaderForTEdenovo(self, isHeaderForTEdenovo):
|
|
144 self._isHeaderForTEdenovo = isHeaderForTEdenovo
|
|
145
|
|
146 def setClean(self):
|
|
147 self._clean = True
|
|
148
|
|
149 def setVerbosityLevel(self, verbose):
|
|
150 self._verbose = int(verbose)
|
|
151
|
|
152 def setTmpFileName(self, tmpFileName):
|
|
153 self._tmpFileName = tmpFileName
|
|
154
|
|
155
|
|
156 def checkAttributes(self):
|
|
157 """
|
|
158 Check the attributes are valid before running the algorithm.
|
|
159 """
|
|
160 if self._inFileName == "":
|
|
161 print "ERROR: missing input file name (-i)"
|
|
162 sys.exit(1)
|
|
163 if self._outFilePrefix == "":
|
|
164 self._outFilePrefix = self._inFileName
|
|
165 self._tmpFileName = "%s_blastclust.txt" % (self._outFilePrefix)
|
|
166
|
|
167
|
|
168 def launchBlastclust(self, inFile):
|
|
169 """
|
|
170 Launch Blastclust in command-line.
|
|
171 """
|
|
172 if os.path.exists(os.path.basename(inFile)):
|
|
173 inFile = os.path.basename(inFile)
|
|
174 prg = "blastclust"
|
|
175 cmd = prg
|
|
176 cmd += " -i %s" % (inFile)
|
|
177 cmd += " -o %s" % (self._tmpFileName)
|
|
178 cmd += " -S %i" % (self._identityThreshold)
|
|
179 cmd += " -L %f" % (self._coverageThreshold)
|
|
180 cmd += " -b %s" % (self._bothSeq)
|
|
181 cmd += " -p F"
|
|
182 cmd += " -a %i" % (self._nbCPUs)
|
|
183 if self._verbose == 0:
|
|
184 cmd += " -v blastclust.log"
|
|
185 if self._verbose > 0:
|
|
186 print cmd
|
|
187 sys.stdout.flush()
|
|
188 process = subprocess.Popen(cmd, shell = True)
|
|
189 process.communicate()
|
|
190 if process.returncode != 0:
|
|
191 raise Exception("ERROR when launching '%s'" % cmd)
|
|
192 if self._clean and os.path.exists("error.log"):
|
|
193 os.remove("error.log")
|
|
194 if self._clean and os.path.exists("blastclust.log"):
|
|
195 os.remove("blastclust.log")
|
|
196
|
|
197
|
|
198 def getClustersFromTxtFile(self):
|
|
199 """
|
|
200 Return a dictionary with cluster IDs as keys and sequence headers as values.
|
|
201 """
|
|
202 dClusterId2SeqHeaders = {}
|
|
203 inF = open(self._tmpFileName, "r")
|
|
204 line = inF.readline()
|
|
205 clusterId = 1
|
|
206 while True:
|
|
207 if line == "":
|
|
208 break
|
|
209 tokens = line[:-1].split(" ")
|
|
210 dClusterId2SeqHeaders[clusterId] = []
|
|
211 for seqHeader in tokens:
|
|
212 if seqHeader != "":
|
|
213 dClusterId2SeqHeaders[clusterId].append(seqHeader)
|
|
214 line = inF.readline()
|
|
215 clusterId += 1
|
|
216 inF.close()
|
|
217 if self._verbose > 0:
|
|
218 print "nb of clusters: %i" % (len(dClusterId2SeqHeaders.keys()))
|
|
219 sys.stdout.flush()
|
|
220 return dClusterId2SeqHeaders
|
|
221
|
|
222
|
|
223 def filterUnclusteredSequences(self, dClusterId2SeqHeaders):
|
|
224 """
|
|
225 Filter clusters having only one sequence.
|
|
226 """
|
|
227 for clusterId in dClusterId2SeqHeaders.keys():
|
|
228 if len(dClusterId2SeqHeaders[clusterId]) == 1:
|
|
229 del dClusterId2SeqHeaders[clusterId]
|
|
230 if self._verbose > 0:
|
|
231 print "nb of clusters (>1seq): %i" % (len(dClusterId2SeqHeaders.keys()))
|
|
232 sys.stdout.flush()
|
|
233 return dClusterId2SeqHeaders
|
|
234
|
|
235
|
|
236 def getClusteringResultsInFasta(self, inFile):
|
|
237 """
|
|
238 Write a fasta file whose sequence headers contain the clustering IDs.
|
|
239 """
|
|
240 dClusterId2SeqHeaders = self.getClustersFromTxtFile()
|
|
241 if self._filterUnclusteredSeq:
|
|
242 dClusterId2SeqHeaders = self.filterUnclusteredSequences(dClusterId2SeqHeaders)
|
|
243 inDB = BioseqDB(inFile)
|
|
244 outFileName = "%s_Blastclust.fa" % (inFile)
|
|
245 outF = open(outFileName, "w")
|
|
246 for clusterId in dClusterId2SeqHeaders.keys():
|
|
247 memberId = 1
|
|
248 for seqHeader in dClusterId2SeqHeaders[clusterId]:
|
|
249 bs = inDB.fetch(seqHeader)
|
|
250 bs.header = "BlastclustCluster%iMb%i_%s" % (clusterId, memberId, seqHeader)
|
|
251 bs.write(outF)
|
|
252 memberId += 1
|
|
253 outF.close()
|
|
254
|
|
255
|
|
256 def getLinkInitNewHeaders(self):
|
|
257 dNew2Init = {}
|
|
258 linkFileName = "%s.shortHlink" % (self._inFileName)
|
|
259 linkFile = open(linkFileName,"r")
|
|
260 line = linkFile.readline()
|
|
261 while True:
|
|
262 if line == "":
|
|
263 break
|
|
264 data = line.split("\t")
|
|
265 dNew2Init[data[0]] = data[1]
|
|
266 line = linkFile.readline()
|
|
267 linkFile.close()
|
|
268 return dNew2Init
|
|
269
|
|
270
|
|
271 def retrieveInitHeaders(self, dNewH2InitH):
|
|
272 tmpFaFile = "%s.shortH_Blastclust.fa" % (self._inFileName)
|
|
273 tmpFaFileHandler = open(tmpFaFile, "r")
|
|
274 outFaFile = "%s_Blastclust.fa" % (self._outFilePrefix)
|
|
275 outFaFileHandler = open(outFaFile, "w")
|
|
276 while True:
|
|
277 line = tmpFaFileHandler.readline()
|
|
278 if line == "":
|
|
279 break
|
|
280 if line[0] == ">":
|
|
281 tokens = line[1:-1].split("_")
|
|
282 initHeader = dNewH2InitH[tokens[1]]
|
|
283 if self._isHeaderForTEdenovo:
|
|
284 classif = initHeader.split("_")[0]
|
|
285 consensusName = "_".join(initHeader.split("_")[1:])
|
|
286 clusterId = tokens[0].split("Cluster")[1].split("Mb")[0]
|
|
287 newHeader = "%s_Blc%s_%s" % (classif, clusterId, consensusName)
|
|
288 else:
|
|
289 newHeader = "%s_%s" % (tokens[0], initHeader)
|
|
290 outFaFileHandler.write(">%s\n" % (newHeader))
|
|
291 else:
|
|
292 outFaFileHandler.write(line)
|
|
293 tmpFaFileHandler.close()
|
|
294 outFaFileHandler.close()
|
|
295 if self._clean:
|
|
296 os.remove(tmpFaFile)
|
|
297
|
|
298
|
|
299 def blastclustToMap(self, blastclustFastaOut):
|
|
300 """
|
|
301 Write a map file from blastclust fasta output.
|
|
302 Warning: only works if blastclust's fasta input headers are formated like LTRharvest fasta output.
|
|
303 """
|
|
304 fileDb = open(blastclustFastaOut , "r")
|
|
305 mapFilename = "%s.map" % (os.path.splitext(blastclustFastaOut)[0])
|
|
306 fileMap = open(mapFilename, "w")
|
|
307 seq = Bioseq()
|
|
308 numseq = 0
|
|
309 while 1:
|
|
310 seq.read(fileDb)
|
|
311 if seq.sequence == None:
|
|
312 break
|
|
313 numseq = numseq + 1
|
|
314 ID = seq.header.split(' ')[0].split('_')[0]
|
|
315 chunk = seq.header.split(' ')[0].split('_')[1]
|
|
316 start = seq.header.split(' ')[-1].split(',')[0][1:]
|
|
317 end = seq.header.split(' ')[-1].split(',')[1][:-1]
|
|
318 line= '%s\t%s\t%s\t%s' % (ID, chunk, start, end)
|
|
319 fileMap.write(line + "\n")
|
|
320
|
|
321 fileDb.close()
|
|
322 fileMap.close()
|
|
323 print "saved in %s" % mapFilename
|
|
324
|
|
325
|
|
326 def start(self):
|
|
327 """
|
|
328 Useful commands before running the program.
|
|
329 """
|
|
330 self.checkAttributes()
|
|
331 if self._verbose > 0:
|
|
332 print "START %s" % (type(self).__name__)
|
|
333
|
|
334
|
|
335 def end(self):
|
|
336 """
|
|
337 Useful commands before ending the program.
|
|
338 """
|
|
339 if self._verbose > 0:
|
|
340 print "END %s" % (type(self).__name__)
|
|
341
|
|
342
|
|
343 def run(self):
|
|
344 """
|
|
345 Run the program.
|
|
346 """
|
|
347 self.start()
|
|
348
|
|
349 iCSH = ChangeSequenceHeaders(inFile = self._inFileName, format = "fasta", step = 1, outFile = "%s.shortH" % self._inFileName, linkFile = "%s.shortHlink" % self._inFileName)
|
|
350 iCSH.run()
|
|
351
|
|
352 self.launchBlastclust("%s.shortH" % (self._inFileName))
|
|
353
|
|
354 self.getClusteringResultsInFasta("%s.shortH" % (self._inFileName))
|
|
355
|
|
356 dNewH2InitH = self.getLinkInitNewHeaders()
|
|
357 self.retrieveInitHeaders(dNewH2InitH)
|
|
358
|
|
359 if self._isBlastToMap:
|
|
360 blastclustFileName = "%s_Blastclust.fa" % (self._outFilePrefix)
|
|
361 self.blastclustToMap(blastclustFileName)
|
|
362
|
|
363 if self._clean:
|
|
364 os.remove("%s.shortH" % (self._inFileName))
|
|
365 os.remove("%s.shortHlink" % (self._inFileName))
|
|
366
|
|
367 self.end()
|
|
368
|
|
369 if __name__ == "__main__":
|
|
370 i = LaunchBlastclust()
|
|
371 i.setAttributesFromCmdLine()
|
|
372 i.run()
|