annotate commons/launcher/LaunchBlastclust.py @ 19:9bcfa7936eec

Deleted selected files
author m-zytnicki
date Mon, 29 Apr 2013 03:23:29 -0400
parents 94ab73e8a190
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
18
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
1 #!/usr/bin/env python
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
2
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
3 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
4 Launch Blastclust on nucleotide sequences and return a fasta file.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
5 """
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
6
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
7 # Copyright INRA (Institut National de la Recherche Agronomique)
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
8 # http://www.inra.fr
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
9 # http://urgi.versailles.inra.fr
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
10 #
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
11 # This software is governed by the CeCILL license under French law and
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
12 # abiding by the rules of distribution of free software. You can use,
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
13 # modify and/ or redistribute the software under the terms of the CeCILL
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
14 # license as circulated by CEA, CNRS and INRIA at the following URL
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
15 # "http://www.cecill.info".
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
16 #
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
17 # As a counterpart to the access to the source code and rights to copy,
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
18 # modify and redistribute granted by the license, users are provided only
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
19 # with a limited warranty and the software's author, the holder of the
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
20 # economic rights, and the successive licensors have only limited
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
21 # liability.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
22 #
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
23 # In this respect, the user's attention is drawn to the risks associated
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
24 # with loading, using, modifying and/or developing or reproducing the
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
25 # software by the user in light of its specific status of free software,
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
26 # that may mean that it is complicated to manipulate, and that also
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
27 # therefore means that it is reserved for developers and experienced
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
28 # professionals having in-depth computer knowledge. Users are therefore
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
29 # encouraged to load and test the software's suitability as regards their
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
30 # requirements in conditions enabling the security of their systems and/or
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
31 # data to be ensured and, more generally, to use and operate it in the
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
32 # same conditions as regards security.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
33 #
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
34 # The fact that you are presently reading this means that you have had
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
35 # knowledge of the CeCILL license and that you accept its terms.
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
36
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
37 import os
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
38 import sys
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
39 import subprocess
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
40 from commons.core.seq.BioseqDB import BioseqDB
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
41 from commons.core.seq.Bioseq import Bioseq
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
42 from commons.core.utils.RepetOptionParser import RepetOptionParser
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
43 from commons.tools.ChangeSequenceHeaders import ChangeSequenceHeaders
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
44
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
class LaunchBlastclust(object):
    """
    Launch Blastclust on nucleotide sequences and return a fasta file.

    The pipeline implemented by run() is:
      1. shorten the headers of the input fasta file (ChangeSequenceHeaders),
      2. run the external 'blastclust' program on the shortened file,
      3. write a fasta file whose headers carry the cluster and member IDs,
      4. put the initial headers back into that fasta file,
      5. optionally write a map file and remove the temporary files.

    All print() calls take a single, pre-formatted string so that they
    behave identically under Python 2 and Python 3.
    """

    def __init__(self, input = "", outFilePrefix = "", clean = False, verbose = 0):
        """
        Constructor.

        @param input: name of the input fasta file
        @param outFilePrefix: prefix of the output files (when empty,
                              checkAttributes() falls back on the input file name)
        @param clean: if True, temporary files are removed
        @param verbose: verbosity level (messages are printed when > 0)
        """
        self._inFileName = input
        self._identityThreshold = 95
        self._coverageThreshold = 0.9
        self._bothSeq = "T"
        self._filterUnclusteredSeq = False
        self._outFilePrefix = outFilePrefix
        self._isBlastToMap = False
        self._isHeaderForTEdenovo = False
        self._nbCPUs = 1
        self._clean = clean
        self._verbose = verbose
        # name of the raw blastclust output, computed by checkAttributes()
        self._tmpFileName = ""

    def setAttributesFromCmdLine(self):
        """
        Set the attributes from the command-line.
        """

        description = "Launch Blastclust on nucleotide sequences and return a fasta file."
        usage = "LaunchBlastclust.py -i inputFileName [options]"

        examples = "\nExample 1: launch Blastclust with default options, highest verbose and clean temporary files.\n"
        examples += "\t$ python ./LaunchBlastclust.py -i MyBank.fa -v 2 -c"
        examples += "\n\t"
        examples += "\t\nExample 2: launch Blastclust with an identity threshold of 90%, rename output files and generate a map file corresponding to the fasta output.\n"
        examples += "\t$ python ./LaunchBlastclust.py -i MyBank.fa -S 90 -o SpecialOutputName -m"
        examples += "\n\tWARNING: Please refer to -m option limitations in the description above.\n"

        #TODO: check if the optionParser can handle '\' into strings for a better code readability in -m option

        parser = RepetOptionParser(description = description, usage = usage, version = "v1.0", epilog = examples)
        parser.add_option("-i", "--input", dest = "inFileName", type = "string", help = "name of the input fasta file (nucleotides)", default = "")
        parser.add_option("-L", "--length", dest = "coverageThreshold", type = "float", help = "length coverage threshold (default=0.9)", default = 0.9)
        parser.add_option("-S", "--ident", dest = "identityThreshold", type = "int", help = "identity threshold (default=95)", default = 95)
        parser.add_option("-b", "--both", dest = "bothSeq", type = "string", help = "require coverage on both neighbours (default=T/F)", default = "T")
        parser.add_option("-f", "--filter", dest = "filterUnclusteredSeq", help = "filter unclustered sequences", default = False, action="store_true")
        parser.add_option("-o", "--out", dest = "outFilePrefix", type = "string", help = "prefix of the output files (default=input fasta file name)", default = "")
        parser.add_option("-m", "--map", dest = "isBlast2Map", help = "generate an additional output file in map format (Warning: only works if blastclust's fasta input headers are formated like LTRharvest fasta output)", default = False, action="store_true")
        parser.add_option("", "--TEdenovoHeader", dest = "isHeaderForTEdenovo", help = "format headers for TEdenovo pipeline", default = False, action="store_true")
        parser.add_option("-n", "--num", dest = "nbCPUs", type = "int", help = "number of CPU's to use (default=1)", default = 1)
        parser.add_option("-c", "--clean", dest = "clean", help = "clean temporary files", default = False, action="store_true")
        parser.add_option("-v", "--verbose", dest = "verbose", type = "int", help = "verbosity level (default=0/1/2)", default = 0)

        # parse_args() returns (options, positional arguments); only the options are used
        options = parser.parse_args()[0]
        self._setAttributesFromOptions(options)

    def _setAttributesFromOptions(self, options):
        """
        Copy the parsed command-line options into the instance attributes.

        @param options: object returned by the option parser, with one
                        attribute per 'dest' declared in setAttributesFromCmdLine()
        """
        self.setInputFileName(options.inFileName)
        self.setCoverageThreshold(options.coverageThreshold)
        self.setIdentityThreshold(options.identityThreshold)
        self.setBothSequences(options.bothSeq)
        self.setNbCPUs(options.nbCPUs)
        self.setIsHeaderForTEdenovo(options.isHeaderForTEdenovo)
        if options.filterUnclusteredSeq:
            self.setFilterUnclusteredSequences()
        if options.outFilePrefix != "":
            self.setOutputFilePrefix(options.outFilePrefix)
        else:
            # no prefix given: name the outputs after the input file
            self._outFilePrefix = self._inFileName
        if options.isBlast2Map:
            self.setIsBlastToMap()
        if options.clean:
            self.setClean()
        self.setVerbosityLevel(options.verbose)

    def setInputFileName(self , inFileName):
        """Set the name of the input fasta file."""
        self._inFileName = inFileName

    def setCoverageThreshold(self, lengthThresh):
        """Set the length coverage threshold (converted to float)."""
        self._coverageThreshold = float(lengthThresh)

    def setIdentityThreshold(self, identityThresh):
        """Set the identity threshold (converted to int)."""
        self._identityThreshold = int(identityThresh)

    def setBothSequences(self, bothSeq):
        """Set the value passed to blastclust's '-b' option ("T" or "F")."""
        self._bothSeq = bothSeq

    def setNbCPUs(self, nbCPUs):
        """Set the number of CPUs (converted to int)."""
        self._nbCPUs = int(nbCPUs)

    def setFilterUnclusteredSequences(self):
        """Ask for clusters made of a single sequence to be filtered out."""
        self._filterUnclusteredSeq = True

    def setOutputFilePrefix(self, outFilePrefix):
        """Set the prefix of the output files."""
        self._outFilePrefix = outFilePrefix

    def setIsBlastToMap(self):
        """Ask for an additional output file in map format."""
        self._isBlastToMap = True

    def setIsHeaderForTEdenovo(self, isHeaderForTEdenovo):
        """Choose whether output headers are formatted for the TEdenovo pipeline."""
        self._isHeaderForTEdenovo = isHeaderForTEdenovo

    def setClean(self):
        """Ask for temporary files to be removed."""
        self._clean = True

    def setVerbosityLevel(self, verbose):
        """Set the verbosity level (converted to int)."""
        self._verbose = int(verbose)

    def setTmpFileName(self, tmpFileName):
        """
        Set the name of the raw blastclust output file.

        Note that checkAttributes() recomputes this name from the output prefix.
        """
        self._tmpFileName = tmpFileName


    def checkAttributes(self):
        """
        Check the attributes are valid before running the algorithm.

        Exits the program with status 1 when no input file name was given.
        Otherwise, defaults the output prefix to the input file name and
        derives the name of the raw blastclust output file from the prefix.
        """
        if self._inFileName == "":
            print("ERROR: missing input file name (-i)")
            sys.exit(1)
        if self._outFilePrefix == "":
            self._outFilePrefix = self._inFileName
        self._tmpFileName = "%s_blastclust.txt" % (self._outFilePrefix)


    def launchBlastclust(self, inFile):
        """
        Launch Blastclust in command-line.

        The program is started from an argument list rather than through a
        shell, so file names containing spaces or shell metacharacters are
        passed verbatim to blastclust.

        @param inFile: fasta file given to blastclust; if a file with the same
                       base name exists in the current directory, that one is used
        @raise Exception: if blastclust cannot be started or returns a
                          non-zero exit status
        """
        if os.path.exists(os.path.basename(inFile)):
            inFile = os.path.basename(inFile)
        args = ["blastclust",
                "-i", "%s" % (inFile),
                "-o", "%s" % (self._tmpFileName),
                "-S", "%i" % (self._identityThreshold),
                "-L", "%f" % (self._coverageThreshold),
                "-b", "%s" % (self._bothSeq),
                # NOTE(review): '-p F' presumably declares the input as
                # non-protein (nucleotides) -- confirm against blastclust's usage
                "-p", "F",
                "-a", "%i" % (self._nbCPUs)]
        if self._verbose == 0:
            # NOTE(review): presumably sends blastclust's progress messages to
            # a log file instead of the terminal -- confirm against blastclust's usage
            args += ["-v", "blastclust.log"]
        # human-readable form of the command, used for messages only
        cmd = " ".join(args)
        if self._verbose > 0:
            print(cmd)
            sys.stdout.flush()
        try:
            process = subprocess.Popen(args)
            process.communicate()
        except OSError:
            # e.g. the program is not installed: report it like any other failure
            raise Exception("ERROR when launching '%s'" % cmd)
        if process.returncode != 0:
            raise Exception("ERROR when launching '%s'" % cmd)
        if self._clean:
            for logFileName in ("error.log", "blastclust.log"):
                if os.path.exists(logFileName):
                    os.remove(logFileName)


    def getClustersFromTxtFile(self):
        """
        Return a dictionary with cluster IDs as keys and sequence headers as values.

        The raw blastclust output is read line by line: line number N
        (starting at 1) gives cluster N, and the space-separated tokens of the
        line are the headers of its members. Only the end-of-line characters
        are stripped, so the last header is kept intact even when the file
        does not end with a newline.
        """
        dClusterId2SeqHeaders = {}
        clusterId = 1
        with open(self._tmpFileName, "r") as inF:
            for line in inF:
                tokens = line.rstrip("\r\n").split(" ")
                # consecutive or trailing spaces yield empty tokens: ignore them
                dClusterId2SeqHeaders[clusterId] = [seqHeader for seqHeader in tokens if seqHeader != ""]
                clusterId += 1
        if self._verbose > 0:
            print("nb of clusters: %i" % (len(dClusterId2SeqHeaders)))
            sys.stdout.flush()
        return dClusterId2SeqHeaders


    def filterUnclusteredSequences(self, dClusterId2SeqHeaders):
        """
        Filter clusters having only one sequence.

        The dictionary is modified in place and also returned.

        @param dClusterId2SeqHeaders: dictionary as returned by getClustersFromTxtFile()
        """
        # iterate over a snapshot of the keys: deleting entries while iterating
        # over the dictionary itself raises a RuntimeError under Python 3
        for clusterId in list(dClusterId2SeqHeaders.keys()):
            if len(dClusterId2SeqHeaders[clusterId]) == 1:
                del dClusterId2SeqHeaders[clusterId]
        if self._verbose > 0:
            print("nb of clusters (>1seq): %i" % (len(dClusterId2SeqHeaders)))
            sys.stdout.flush()
        return dClusterId2SeqHeaders


    def getClusteringResultsInFasta(self, inFile):
        """
        Write a fasta file whose sequence headers contain the clustering IDs.

        The output file is named '<inFile>_Blastclust.fa'. Each header has the
        form 'BlastclustCluster<clusterId>Mb<memberId>_<header in inFile>'.
        Clusters are written by increasing cluster ID.

        @param inFile: fasta file that was given to blastclust
        """
        dClusterId2SeqHeaders = self.getClustersFromTxtFile()
        if self._filterUnclusteredSeq:
            dClusterId2SeqHeaders = self.filterUnclusteredSequences(dClusterId2SeqHeaders)
        inDB = BioseqDB(inFile)
        outFileName = "%s_Blastclust.fa" % (inFile)
        with open(outFileName, "w") as outF:
            for clusterId in sorted(dClusterId2SeqHeaders):
                memberId = 1
                for seqHeader in dClusterId2SeqHeaders[clusterId]:
                    bs = inDB.fetch(seqHeader)
                    bs.header = "BlastclustCluster%iMb%i_%s" % (clusterId, memberId, seqHeader)
                    bs.write(outF)
                    memberId += 1


    def getLinkInitNewHeaders(self):
        """
        Return a dictionary with shortened headers as keys and initial headers as values.

        The data are read from '<input file>.shortHlink', a tab-separated file
        whose first column is the shortened header and whose second column is
        the initial header. Blank lines are ignored.
        """
        dNew2Init = {}
        linkFileName = "%s.shortHlink" % (self._inFileName)
        with open(linkFileName, "r") as linkFile:
            for line in linkFile:
                # strip the end of line first, otherwise it stays glued to the
                # initial header when the line has exactly two columns
                line = line.rstrip("\r\n")
                if line == "":
                    continue
                data = line.split("\t")
                dNew2Init[data[0]] = data[1]
        return dNew2Init


    def retrieveInitHeaders(self, dNewH2InitH):
        """
        Write the final fasta file, with the initial headers restored.

        Reads '<input file>.shortH_Blastclust.fa' and writes
        '<output prefix>_Blastclust.fa'. Sequence lines are copied unchanged.
        In each header, the part before the first underscore is the cluster
        tag and the next underscore-separated token is the shortened header,
        which is replaced by its initial header.

        With the TEdenovo format, the new header is
        '<classif>_Blc<clusterId>_<consensusName>', where <classif> is what
        precedes the first underscore of the initial header and
        <consensusName> is what follows it. Otherwise the new header is
        '<cluster tag>_<initial header>'.

        The intermediate fasta file is removed when cleaning is requested.

        @param dNewH2InitH: dictionary as returned by getLinkInitNewHeaders()
        """
        tmpFaFile = "%s.shortH_Blastclust.fa" % (self._inFileName)
        outFaFile = "%s_Blastclust.fa" % (self._outFilePrefix)
        with open(tmpFaFile, "r") as tmpFaFileHandler:
            with open(outFaFile, "w") as outFaFileHandler:
                for line in tmpFaFileHandler:
                    if line[0] != ">":
                        outFaFileHandler.write(line)
                        continue
                    tokens = line[1:].rstrip("\r\n").split("_")
                    initHeader = dNewH2InitH[tokens[1]]
                    if self._isHeaderForTEdenovo:
                        initTokens = initHeader.split("_")
                        classif = initTokens[0]
                        consensusName = "_".join(initTokens[1:])
                        clusterId = tokens[0].split("Cluster")[1].split("Mb")[0]
                        newHeader = "%s_Blc%s_%s" % (classif, clusterId, consensusName)
                    else:
                        newHeader = "%s_%s" % (tokens[0], initHeader)
                    outFaFileHandler.write(">%s\n" % (newHeader))
        if self._clean:
            os.remove(tmpFaFile)


    def blastclustToMap(self, blastclustFastaOut):
        """
        Write a map file from blastclust fasta output.
        Warning: only works if blastclust's fasta input headers are formated like LTRharvest fasta output.

        The map file is named after the fasta file, with its extension
        replaced by '.map'. For each sequence, one tab-separated line is
        written with: the two first underscore-separated tokens of the first
        word of the header, then the two comma-separated values of the last
        word of the header, stripped of its first and last characters.

        @param blastclustFastaOut: fasta file written by retrieveInitHeaders()
        """
        mapFilename = "%s.map" % (os.path.splitext(blastclustFastaOut)[0])
        with open(blastclustFastaOut , "r") as fileDb:
            with open(mapFilename, "w") as fileMap:
                seq = Bioseq()
                while True:
                    seq.read(fileDb)
                    # the reader leaves 'sequence' to None once the file is exhausted
                    if seq.sequence is None:
                        break
                    words = seq.header.split(' ')
                    nameTokens = words[0].split('_')
                    coordTokens = words[-1].split(',')
                    ID = nameTokens[0]
                    chunk = nameTokens[1]
                    start = coordTokens[0][1:]
                    end = coordTokens[1][:-1]
                    fileMap.write('%s\t%s\t%s\t%s' % (ID, chunk, start, end) + "\n")
        print("saved in %s" % mapFilename)


    def start(self):
        """
        Useful commands before running the program.
        """
        self.checkAttributes()
        if self._verbose > 0:
            print("START %s" % (type(self).__name__))


    def end(self):
        """
        Useful commands before ending the program.
        """
        if self._verbose > 0:
            print("END %s" % (type(self).__name__))


    def run(self):
        """
        Run the program.

        Shortens the headers of the input file, launches blastclust on the
        result, builds the clustered fasta file, restores the initial headers,
        optionally writes the map file, and finally removes the shortened
        fasta file and its link file when cleaning is requested.
        """
        self.start()

        shortHFileName = "%s.shortH" % (self._inFileName)
        linkFileName = "%s.shortHlink" % (self._inFileName)

        iCSH = ChangeSequenceHeaders(inFile = self._inFileName, format = "fasta", step = 1, outFile = shortHFileName, linkFile = linkFileName)
        iCSH.run()

        self.launchBlastclust(shortHFileName)

        self.getClusteringResultsInFasta(shortHFileName)

        dNewH2InitH = self.getLinkInitNewHeaders()
        self.retrieveInitHeaders(dNewH2InitH)

        if self._isBlastToMap:
            blastclustFileName = "%s_Blastclust.fa" % (self._outFilePrefix)
            self.blastclustToMap(blastclustFileName)

        if self._clean:
            os.remove(shortHFileName)
            os.remove(linkFileName)

        self.end()
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
368
94ab73e8a190 Uploaded
m-zytnicki
parents:
diff changeset
if __name__ == "__main__":
    # Script entry point: configure a launcher from the command-line
    # arguments, then execute the whole clustering pipeline.
    launcher = LaunchBlastclust()
    launcher.setAttributesFromCmdLine()
    launcher.run()