Mercurial > repos > yufei-luo > s_mart
comparison commons/launcher/LaunchTallymer.py @ 31:0ab839023fe4
Uploaded
author | m-zytnicki |
---|---|
date | Tue, 30 Apr 2013 14:33:21 -0400 |
parents | 94ab73e8a190 |
children |
comparison
equal
deleted
inserted
replaced
30:5677346472b5 | 31:0ab839023fe4 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 """ | |
4 Launch Tallymer's sub programs, generate map file, and convert output to gff and wig, as well as visual (RPlot) data | |
5 """ | |
6 | |
7 # Copyright INRA (Institut National de la Recherche Agronomique) | |
8 # http://www.inra.fr | |
9 # http://urgi.versailles.inra.fr | |
10 # | |
11 # This software is governed by the CeCILL license under French law and | |
12 # abiding by the rules of distribution of free software. You can use, | |
13 # modify and/ or redistribute the software under the terms of the CeCILL | |
14 # license as circulated by CEA, CNRS and INRIA at the following URL | |
15 # "http://www.cecill.info". | |
16 # | |
17 # As a counterpart to the access to the source code and rights to copy, | |
18 # modify and redistribute granted by the license, users are provided only | |
19 # with a limited warranty and the software's author, the holder of the | |
20 # economic rights, and the successive licensors have only limited | |
21 # liability. | |
22 # | |
23 # In this respect, the user's attention is drawn to the risks associated | |
24 # with loading, using, modifying and/or developing or reproducing the | |
25 # software by the user in light of its specific status of free software, | |
26 # that may mean that it is complicated to manipulate, and that also | |
27 # therefore means that it is reserved for developers and experienced | |
28 # professionals having in-depth computer knowledge. Users are therefore | |
29 # encouraged to load and test the software's suitability as regards their | |
30 # requirements in conditions enabling the security of their systems and/or | |
31 # data to be ensured and, more generally, to use and operate it in the | |
32 # same conditions as regards security. | |
33 # | |
34 # The fact that you are presently reading this means that you have had | |
35 # knowledge of the CeCILL license and that you accept its terms. | |
36 | |
37 import os | |
38 import shutil | |
39 import subprocess | |
40 import time | |
41 from commons.core.utils.RepetOptionParser import RepetOptionParser | |
42 from commons.core.LoggerFactory import LoggerFactory | |
43 from SMART.Java.Python.convertTranscriptFile import ConvertTranscriptFile | |
44 from commons.core.seq.BioseqUtils import BioseqUtils | |
45 from commons.core.seq.BioseqDB import BioseqDB | |
46 from Tallymer_pipe.PlotBenchMarkGFFFiles import PlotBenchMarkGFFFiles | |
47 | |
48 LOG_DEPTH = "repet.tools" | |
49 | |
50 | |
51 class LaunchTallymer(object): | |
52 """ | |
53 Launch Tallymer's sub programs, generate map file, and convert output to | |
54 gff and wig, as well as visual (RPlot) data | |
55 """ | |
56 | |
57 _lValidFormats = ["gff", "gff3", "wig", "bed", "map"] | |
58 | |
59 def __init__(self, inputFasta="", indexationFasta=None, merSize=20, minOccs=4, outputFormats="gff", nLargestScaffoldsToPlot=0, clean=False, verbosity=0): | |
60 self.inputFasta = inputFasta | |
61 self.indexationFasta = indexationFasta if indexationFasta != None else inputFasta | |
62 self.merSize = merSize | |
63 self.minOccs = minOccs | |
64 self.outputFormats = outputFormats | |
65 self.nLargestScaffoldsToPlot = nLargestScaffoldsToPlot | |
66 self.doClean = clean | |
67 self.verbosity = verbosity | |
68 | |
69 self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self.verbosity) | |
70 self._workdir = os.path.join(os.getcwd(), "launchTallymer_%s" % time.strftime("%Y%m%d%H%M%S")) | |
71 self._tmpSearchFileName = None | |
72 self._tmpMapFileName = None | |
73 self._tmpStatFileName = None | |
74 self._tmpPngFileName = None | |
75 self._plot_data = {} | |
76 self._plot_data2 = {} | |
77 | |
78 def setAttributesFromCmdLine(self): | |
79 description = "Generates stats from the results of the tallymer search ." | |
80 parser = RepetOptionParser(description=description) | |
81 parser.add_option("-i", "--inputFasta", dest="inputFasta", default = "", action="store", type="string", help="input fasta file [compulsory] [format: fasta]") | |
82 parser.add_option("-u", "--indexationFasta", dest="indexationFasta", default = "", action="store", type="string", help="input indexation fasta file used to generate kmer index (defaults to input fasta)[optional] [format: fasta]") | |
83 parser.add_option("-s", "--merSize", dest="merSize", default = 20, action="store", type="int", help="input merSize [optional][default:20]") | |
84 parser.add_option("-m", "--minOccs", dest="minOccs", default = 4, action="store", type="int", help="input minimal kmer occurence count [default:4]") | |
85 parser.add_option("-f", "--outputFormats", dest="outputFormats", default = "gff", action="store", type="string", help="comma separated list of output file formats (can be %s) [optional] [default:gff]" % ", ".join(self._lValidFormats)) | |
86 parser.add_option("-n", "--nLargestScaffoldsToPlot",default = 0, type="int", action="store", dest = "nLargestScaffoldsToPlot", help = "generate graph of Kmer repartition along the input sequence for the n biggest scaffolds") | |
87 parser.add_option("-c", "--clean", dest = "clean", help = "clean temporary files", default = False, action="store_true") | |
88 parser.add_option("-v", "--verbosity", dest="verbosity", default = 1, action="store", type="int", help="verbosity [optional] ") | |
89 options, args = parser.parse_args() | |
90 self._setAttributesFromOptions(options) | |
91 | |
92 def _setAttributesFromOptions(self, options): | |
93 self.inputFasta = options.inputFasta | |
94 self.indexationFasta = options.indexationFasta if options.indexationFasta != "" else options.inputFasta | |
95 self.merSize = options.merSize | |
96 self.minOccs = options.minOccs | |
97 self.outputFormats = options.outputFormats | |
98 self.nLargestScaffoldsToPlot = options.nLargestScaffoldsToPlot | |
99 self.doClean = options.clean | |
100 self.verbosity = options.verbosity | |
101 | |
102 def _checkOptions(self): | |
103 if self.inputFasta == "": | |
104 self._logAndRaise("Error: missing input fasta file") | |
105 if self.merSize < 1: | |
106 self._logAndRaise("Error: invalid kmer size '%i'; must be a positive integer" % self.merSize) | |
107 | |
108 self.outputFormats = self.outputFormats.lower().split(',') | |
109 sOutFormats = set(self.outputFormats) | |
110 sValidFormats = set(self._lValidFormats) | |
111 lInvalidFormats = list(sOutFormats - sValidFormats) | |
112 self.outputFormats = list(sValidFormats.intersection(sOutFormats)) | |
113 if lInvalidFormats: | |
114 self._log.warning("Warning: ignoring invalid formats: <%s>" % " ".join(lInvalidFormats)) | |
115 if not self.outputFormats: | |
116 self._logAndRaise("Error: no valid output formats specified") | |
117 | |
118 def _logAndRaise(self, errorMsg): | |
119 self._log.error(errorMsg) | |
120 raise Exception(errorMsg) | |
121 | |
122 def clean(self): | |
123 try: | |
124 shutil.rmtree(self._workdir) | |
125 except Exception as inst: | |
126 self._log.error(inst) | |
127 | |
128 def run(self): | |
129 LoggerFactory.setLevel(self._log, self.verbosity) | |
130 self._checkOptions() | |
131 self._log.debug("Input fasta file: %s; K-mer size: %s; Output formats: %s; Cleaning: %s" % (self.inputFasta, self.merSize, str(self.outputFormats), self.doClean)) | |
132 try: | |
133 os.makedirs(self._workdir) | |
134 except:pass | |
135 | |
136 srcPath = os.path.abspath(self.inputFasta) | |
137 dstPath = os.path.join(self._workdir,os.path.basename(self.inputFasta)) | |
138 os.symlink(srcPath, dstPath) | |
139 | |
140 if (self.indexationFasta != self.inputFasta): | |
141 srcPath = os.path.abspath(self.indexationFasta) | |
142 dstPath = os.path.join(self._workdir,os.path.basename(self.indexationFasta)) | |
143 try: | |
144 os.symlink(srcPath, dstPath) | |
145 except OSError as inst: | |
146 pass | |
147 | |
148 os.chdir(self._workdir) | |
149 | |
150 if (self.indexationFasta != self.inputFasta): | |
151 self.indexationFasta = os.path.basename(self.indexationFasta) | |
152 else: | |
153 self.indexationFasta = os.path.basename(self.inputFasta) | |
154 self.inputFasta = os.path.basename(self.inputFasta) | |
155 | |
156 self._tmpSearchFileName = "%s.tallymer" % os.path.splitext(os.path.basename(self.inputFasta))[0] | |
157 self._tmpMapFileName = "%s_tmp.map" % self._tmpSearchFileName | |
158 self._tmpStatFileName = "%s.stat" % self._tmpSearchFileName | |
159 self._tmpPngFileName = "%s.png" % self._tmpSearchFileName | |
160 | |
161 | |
162 | |
163 | |
164 | |
165 self._runTallymerSearch() | |
166 self._convertTallymerToMap() | |
167 self._writeOutputFiles() | |
168 | |
169 if self.nLargestScaffoldsToPlot > 0: | |
170 self._doPlot() | |
171 shutil.copy2(self._tmpPngFileName, "../.") | |
172 | |
173 shutil.copy2(self._tmpStatFileName, "../.") | |
174 os.chdir("..") | |
175 | |
176 if self.doClean: | |
177 self.clean() | |
178 | |
179 def _runTallymerSearch(self): | |
180 self._log.info("Starting to run tallymer search of sequence %s " % (self.inputFasta)) | |
181 self._indexInputFastaFile() | |
182 self._countAndIndexKmersForGivenK() | |
183 self._searchKmersListInTallymerIndex() | |
184 self._log.info("Finished running tallymer scan of sequence %s " % (self.inputFasta)) | |
185 | |
186 def _convertTallymerToMap(self): | |
187 self._log.info("Starting to run tallymer search to map conversion") | |
188 totalNbOcc, dKmer2Occ, self._plot_data, self._plot_data2 = ConvertUtils.convertTallymerFormatIntoMapFormatAndGenerateData(self.inputFasta, self._tmpSearchFileName, self._tmpMapFileName) | |
189 self._log.debug("totalNbOcc=%i" % totalNbOcc) | |
190 self._writeOccurencesNbAndFrequencyOfKmers(totalNbOcc, dKmer2Occ) | |
191 self._log.info("Finished tallymer search to map conversion") | |
192 | |
193 def _doPlot(self): | |
194 iBioseqDB = BioseqDB() | |
195 iBioseqDB.load(self.inputFasta) | |
196 largest_seqsDb = iBioseqDB.bestLength(self.nLargestScaffoldsToPlot) | |
197 | |
198 for seq in largest_seqsDb.db: | |
199 headerCleaned = seq.header.replace(" ", "_") | |
200 shortFastaName = "%s_%s" % (os.path.basename(self.inputFasta),headerCleaned) | |
201 data = self._plot_data2[seq.header] | |
202 gffPlotter = PlotBenchMarkGFFFiles(yLabel="Number of Kmer Occurences", maxLength=1, title="Kmer repartition along the input sequence: %s; MerSize: %i" % (shortFastaName, self.merSize) ) | |
203 gffPlotter.setOutFileName("%s_%s.png" % (self._tmpSearchFileName, headerCleaned)) | |
204 gffPlotter._title = "Mers along the input sequence: %s MerSize: %i" % (shortFastaName, self.merSize) | |
205 gffPlotter._xLabel = "Coordinates along the input sequence (%s)" % shortFastaName | |
206 gffPlotter._rawData = data | |
207 gffPlotter.run() | |
208 | |
209 def _indexInputFastaFile(self): | |
210 self._log.debug("index the input fasta file: get an enhanced suffix array.") | |
211 cmd = "gt suffixerator -dna -pl -tis -suf -lcp -v -db %s -indexname %s.reads" % (self.indexationFasta, self.indexationFasta) | |
212 process = subprocess.Popen(cmd.split(' '),stdout=subprocess.PIPE, stderr=subprocess.PIPE) | |
213 self._log.debug("Running suffixerator with following params %s" % cmd) | |
214 output = process.communicate() | |
215 self._log.debug("Suffixerator Output:\n%s" % output[0]) | |
216 if process.returncode != 0: | |
217 self._logAndRaise("Error in generating enhanced suffix array in %s" % self.indexationFasta) | |
218 | |
219 def _countAndIndexKmersForGivenK(self): | |
220 self._log.debug("Counting and indexing k-mers for k = %i " % self.merSize) | |
221 cmd = "gt tallymer mkindex -mersize %i -minocc %i -indexname %s.tyr-reads -counts -pl -esa %s.reads" % (self.merSize, self.minOccs, self.indexationFasta, self.indexationFasta) | |
222 process = subprocess.Popen(cmd.split(' '), stdout=subprocess.PIPE, stderr=subprocess.PIPE) | |
223 self._log.debug("Running tallymer mkindex with following params %s" % cmd) | |
224 output = process.communicate() | |
225 self._log.debug("Tallymer mkindex Output:\n%s" % output[0]) | |
226 if process.returncode != 0: | |
227 self._logAndRaise("Error in indexing kmers in %s.reads" % self.indexationFasta) | |
228 | |
229 def _searchKmersListInTallymerIndex(self): | |
230 self._log.debug("Searching list of kmers in tallymer-index ") | |
231 cmd = "gt tallymer search -output qseqnum qpos counts sequence -tyr %s.tyr-reads -q %s" % (self.indexationFasta, self.inputFasta) | |
232 process = subprocess.Popen(cmd.split(' '),stdout=subprocess.PIPE, stderr=subprocess.PIPE) | |
233 self._log.debug("Running tallymer search with following params %s" % cmd) | |
234 output = process.communicate() | |
235 self._log.debug("Tallymer search Output:\n%s" % output[0]) | |
236 if process.returncode != 0: | |
237 self._logAndRaise("Error in searching for kmers in %s.tyr-reads" % self.indexationFasta) | |
238 tmpOutputFile = open(self._tmpSearchFileName,'w') | |
239 tmpOutputFile.write(output[0]) | |
240 tmpOutputFile.close() | |
241 | |
242 def _writeOccurencesNbAndFrequencyOfKmers(self, totalNbOcc, dKmer2Occ): | |
243 with open(self._tmpStatFileName, "w") as statFile: | |
244 statFile.write("kmer\tocc\tfreq\n") | |
245 for kmer in dKmer2Occ.keys(): | |
246 statFile.write("%s\t%i\t%.10f\n" % (kmer, dKmer2Occ[kmer], dKmer2Occ[kmer] / float(totalNbOcc))) | |
247 | |
248 def _writeOutputFiles(self): | |
249 for format in self.outputFormats: | |
250 self._log.info("Generating %s file" % format) | |
251 outputFileName = "%s.tallymer.%s" % (os.path.splitext(self.inputFasta)[0], format) | |
252 try: | |
253 iConvertTranscriptFile = ConvertTranscriptFile(inputFileName=self._tmpMapFileName, name="Tallymer",\ | |
254 inputFormat="map", outputFileName=outputFileName, outputFormat=format,feature= "Match", featurePart="Match-part", verbosity=0) #self.verbosity | |
255 iConvertTranscriptFile.run() | |
256 except Exception as inst: | |
257 self._log.error("Error: %s - Failed to generate %s format ouput, skipping" % (inst, format)) | |
258 shutil.copy2(outputFileName, "../.") | |
259 | |
260 | |
261 class ConvertUtils(object): | |
262 | |
263 def convertTallymerFormatIntoMapFormatAndGenerateData(fastaFileName, searchFileName, mapFileName): | |
264 dIndex2NameLengthList = ConvertUtils._createDictOfNameLengthAccordingToSeqOrder(fastaFileName) | |
265 plotData = {} | |
266 plotData2 = {} | |
267 with open(searchFileName, "r") as talFile: | |
268 with open(mapFileName, "w") as mapFile: | |
269 totalNbOcc = 0 | |
270 dKmer2Occ = {} | |
271 line = talFile.readline() | |
272 while line: | |
273 data = line[:-1].split("\t") | |
274 name = "%s_%s" % (data[3], data[2]) | |
275 nbOccs = int(data[2]) | |
276 chrname = dIndex2NameLengthList[int(data[0])][0] | |
277 if data[1][0] == "+": | |
278 start = int(data[1][1:]) + 1 | |
279 end = start + len(data[3]) | |
280 elif data[1][0] == "-": | |
281 start_revcomp = int(data[1][1:]) | |
282 start = dIndex2NameLengthList[int(data[0])][1] - start_revcomp - 1 | |
283 end = end - len(data[3]) + 1 | |
284 mapLine = "%s\t%s\t%s\t%s\t%i\n" % (name, chrname, start, end, nbOccs) | |
285 mapFile.write(mapLine) | |
286 | |
287 if plotData2.get(chrname,None) is None: | |
288 plotData2[chrname] = {} | |
289 if plotData2[chrname].get(start, None) is None: | |
290 plotData2[chrname][start]=0 | |
291 plotData2[chrname][start] += nbOccs | |
292 | |
293 totalNbOcc += 1 | |
294 if dKmer2Occ.has_key(data[3]): | |
295 dKmer2Occ[data[3]] += 1 | |
296 else: | |
297 dKmer2Occ[data[3]] = 1 | |
298 plotData[start] = nbOccs | |
299 line = talFile.readline() | |
300 return totalNbOcc, dKmer2Occ, plotData, plotData2 | |
301 | |
302 convertTallymerFormatIntoMapFormatAndGenerateData = staticmethod(convertTallymerFormatIntoMapFormatAndGenerateData) | |
303 | |
304 def _createDictOfNameLengthAccordingToSeqOrder(fastaFileName): | |
305 with open(fastaFileName) as fastaFile: | |
306 line = fastaFile.readline() | |
307 i = 0 | |
308 length = 0 | |
309 dIndex2Name = {} | |
310 while line: | |
311 if line[0] == ">": | |
312 dIndex2Name[i] = [line[1:-1]] | |
313 if i > 0: | |
314 dIndex2Name[i - 1].append(length) | |
315 length = 0 | |
316 i += 1 | |
317 else: | |
318 length += len(line[:-1]) | |
319 line = fastaFile.readline() | |
320 dIndex2Name[i - 1].append(length) | |
321 return dIndex2Name | |
322 | |
323 _createDictOfNameLengthAccordingToSeqOrder = staticmethod(_createDictOfNameLengthAccordingToSeqOrder) | |
324 | |
325 if __name__ == "__main__": | |
326 iLaunchTallymer = LaunchTallymer() | |
327 iLaunchTallymer.setAttributesFromCmdLine() | |
328 iLaunchTallymer.run() |