18
|
1 #!/usr/bin/env python
|
|
2
|
|
3 """
|
|
4 Launch Tallymer's sub programs, generate map file, and convert output to gff and wig, as well as visual (RPlot) data
|
|
5 """
|
|
6
|
|
7 # Copyright INRA (Institut National de la Recherche Agronomique)
|
|
8 # http://www.inra.fr
|
|
9 # http://urgi.versailles.inra.fr
|
|
10 #
|
|
11 # This software is governed by the CeCILL license under French law and
|
|
12 # abiding by the rules of distribution of free software. You can use,
|
|
13 # modify and/ or redistribute the software under the terms of the CeCILL
|
|
14 # license as circulated by CEA, CNRS and INRIA at the following URL
|
|
15 # "http://www.cecill.info".
|
|
16 #
|
|
17 # As a counterpart to the access to the source code and rights to copy,
|
|
18 # modify and redistribute granted by the license, users are provided only
|
|
19 # with a limited warranty and the software's author, the holder of the
|
|
20 # economic rights, and the successive licensors have only limited
|
|
21 # liability.
|
|
22 #
|
|
23 # In this respect, the user's attention is drawn to the risks associated
|
|
24 # with loading, using, modifying and/or developing or reproducing the
|
|
25 # software by the user in light of its specific status of free software,
|
|
26 # that may mean that it is complicated to manipulate, and that also
|
|
27 # therefore means that it is reserved for developers and experienced
|
|
28 # professionals having in-depth computer knowledge. Users are therefore
|
|
29 # encouraged to load and test the software's suitability as regards their
|
|
30 # requirements in conditions enabling the security of their systems and/or
|
|
31 # data to be ensured and, more generally, to use and operate it in the
|
|
32 # same conditions as regards security.
|
|
33 #
|
|
34 # The fact that you are presently reading this means that you have had
|
|
35 # knowledge of the CeCILL license and that you accept its terms.
|
|
36
|
|
37 import os
|
|
38 import shutil
|
|
39 import subprocess
|
|
40 import time
|
|
41 from commons.core.utils.RepetOptionParser import RepetOptionParser
|
|
42 from commons.core.LoggerFactory import LoggerFactory
|
|
43 from SMART.Java.Python.convertTranscriptFile import ConvertTranscriptFile
|
|
44 from commons.core.seq.BioseqUtils import BioseqUtils
|
|
45 from commons.core.seq.BioseqDB import BioseqDB
|
|
46 from Tallymer_pipe.PlotBenchMarkGFFFiles import PlotBenchMarkGFFFiles
|
|
47
|
|
48 LOG_DEPTH = "repet.tools"
|
|
49
|
|
50
|
|
51 class LaunchTallymer(object):
|
|
52 """
|
|
53 Launch Tallymer's sub programs, generate map file, and convert output to
|
|
54 gff and wig, as well as visual (RPlot) data
|
|
55 """
|
|
56
|
|
57 _lValidFormats = ["gff", "gff3", "wig", "bed", "map"]
|
|
58
|
|
59 def __init__(self, inputFasta="", indexationFasta=None, merSize=20, minOccs=4, outputFormats="gff", nLargestScaffoldsToPlot=0, clean=False, verbosity=0):
|
|
60 self.inputFasta = inputFasta
|
|
61 self.indexationFasta = indexationFasta if indexationFasta != None else inputFasta
|
|
62 self.merSize = merSize
|
|
63 self.minOccs = minOccs
|
|
64 self.outputFormats = outputFormats
|
|
65 self.nLargestScaffoldsToPlot = nLargestScaffoldsToPlot
|
|
66 self.doClean = clean
|
|
67 self.verbosity = verbosity
|
|
68
|
|
69 self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self.verbosity)
|
|
70 self._workdir = os.path.join(os.getcwd(), "launchTallymer_%s" % time.strftime("%Y%m%d%H%M%S"))
|
|
71 self._tmpSearchFileName = None
|
|
72 self._tmpMapFileName = None
|
|
73 self._tmpStatFileName = None
|
|
74 self._tmpPngFileName = None
|
|
75 self._plot_data = {}
|
|
76 self._plot_data2 = {}
|
|
77
|
|
78 def setAttributesFromCmdLine(self):
|
|
79 description = "Generates stats from the results of the tallymer search ."
|
|
80 parser = RepetOptionParser(description=description)
|
|
81 parser.add_option("-i", "--inputFasta", dest="inputFasta", default = "", action="store", type="string", help="input fasta file [compulsory] [format: fasta]")
|
|
82 parser.add_option("-u", "--indexationFasta", dest="indexationFasta", default = "", action="store", type="string", help="input indexation fasta file used to generate kmer index (defaults to input fasta)[optional] [format: fasta]")
|
|
83 parser.add_option("-s", "--merSize", dest="merSize", default = 20, action="store", type="int", help="input merSize [optional][default:20]")
|
|
84 parser.add_option("-m", "--minOccs", dest="minOccs", default = 4, action="store", type="int", help="input minimal kmer occurence count [default:4]")
|
|
85 parser.add_option("-f", "--outputFormats", dest="outputFormats", default = "gff", action="store", type="string", help="comma separated list of output file formats (can be %s) [optional] [default:gff]" % ", ".join(self._lValidFormats))
|
|
86 parser.add_option("-n", "--nLargestScaffoldsToPlot",default = 0, type="int", action="store", dest = "nLargestScaffoldsToPlot", help = "generate graph of Kmer repartition along the input sequence for the n biggest scaffolds")
|
|
87 parser.add_option("-c", "--clean", dest = "clean", help = "clean temporary files", default = False, action="store_true")
|
|
88 parser.add_option("-v", "--verbosity", dest="verbosity", default = 1, action="store", type="int", help="verbosity [optional] ")
|
|
89 options, args = parser.parse_args()
|
|
90 self._setAttributesFromOptions(options)
|
|
91
|
|
92 def _setAttributesFromOptions(self, options):
|
|
93 self.inputFasta = options.inputFasta
|
|
94 self.indexationFasta = options.indexationFasta if options.indexationFasta != "" else options.inputFasta
|
|
95 self.merSize = options.merSize
|
|
96 self.minOccs = options.minOccs
|
|
97 self.outputFormats = options.outputFormats
|
|
98 self.nLargestScaffoldsToPlot = options.nLargestScaffoldsToPlot
|
|
99 self.doClean = options.clean
|
|
100 self.verbosity = options.verbosity
|
|
101
|
|
102 def _checkOptions(self):
|
|
103 if self.inputFasta == "":
|
|
104 self._logAndRaise("Error: missing input fasta file")
|
|
105 if self.merSize < 1:
|
|
106 self._logAndRaise("Error: invalid kmer size '%i'; must be a positive integer" % self.merSize)
|
|
107
|
|
108 self.outputFormats = self.outputFormats.lower().split(',')
|
|
109 sOutFormats = set(self.outputFormats)
|
|
110 sValidFormats = set(self._lValidFormats)
|
|
111 lInvalidFormats = list(sOutFormats - sValidFormats)
|
|
112 self.outputFormats = list(sValidFormats.intersection(sOutFormats))
|
|
113 if lInvalidFormats:
|
|
114 self._log.warning("Warning: ignoring invalid formats: <%s>" % " ".join(lInvalidFormats))
|
|
115 if not self.outputFormats:
|
|
116 self._logAndRaise("Error: no valid output formats specified")
|
|
117
|
|
118 def _logAndRaise(self, errorMsg):
|
|
119 self._log.error(errorMsg)
|
|
120 raise Exception(errorMsg)
|
|
121
|
|
122 def clean(self):
|
|
123 try:
|
|
124 shutil.rmtree(self._workdir)
|
|
125 except Exception as inst:
|
|
126 self._log.error(inst)
|
|
127
|
|
128 def run(self):
|
|
129 LoggerFactory.setLevel(self._log, self.verbosity)
|
|
130 self._checkOptions()
|
|
131 self._log.debug("Input fasta file: %s; K-mer size: %s; Output formats: %s; Cleaning: %s" % (self.inputFasta, self.merSize, str(self.outputFormats), self.doClean))
|
|
132 try:
|
|
133 os.makedirs(self._workdir)
|
|
134 except:pass
|
|
135
|
|
136 srcPath = os.path.abspath(self.inputFasta)
|
|
137 dstPath = os.path.join(self._workdir,os.path.basename(self.inputFasta))
|
|
138 os.symlink(srcPath, dstPath)
|
|
139
|
|
140 if (self.indexationFasta != self.inputFasta):
|
|
141 srcPath = os.path.abspath(self.indexationFasta)
|
|
142 dstPath = os.path.join(self._workdir,os.path.basename(self.indexationFasta))
|
|
143 try:
|
|
144 os.symlink(srcPath, dstPath)
|
|
145 except OSError as inst:
|
|
146 pass
|
|
147
|
|
148 os.chdir(self._workdir)
|
|
149
|
|
150 if (self.indexationFasta != self.inputFasta):
|
|
151 self.indexationFasta = os.path.basename(self.indexationFasta)
|
|
152 else:
|
|
153 self.indexationFasta = os.path.basename(self.inputFasta)
|
|
154 self.inputFasta = os.path.basename(self.inputFasta)
|
|
155
|
|
156 self._tmpSearchFileName = "%s.tallymer" % os.path.splitext(os.path.basename(self.inputFasta))[0]
|
|
157 self._tmpMapFileName = "%s_tmp.map" % self._tmpSearchFileName
|
|
158 self._tmpStatFileName = "%s.stat" % self._tmpSearchFileName
|
|
159 self._tmpPngFileName = "%s.png" % self._tmpSearchFileName
|
|
160
|
|
161
|
|
162
|
|
163
|
|
164
|
|
165 self._runTallymerSearch()
|
|
166 self._convertTallymerToMap()
|
|
167 self._writeOutputFiles()
|
|
168
|
|
169 if self.nLargestScaffoldsToPlot > 0:
|
|
170 self._doPlot()
|
|
171 shutil.copy2(self._tmpPngFileName, "../.")
|
|
172
|
|
173 shutil.copy2(self._tmpStatFileName, "../.")
|
|
174 os.chdir("..")
|
|
175
|
|
176 if self.doClean:
|
|
177 self.clean()
|
|
178
|
|
179 def _runTallymerSearch(self):
|
|
180 self._log.info("Starting to run tallymer search of sequence %s " % (self.inputFasta))
|
|
181 self._indexInputFastaFile()
|
|
182 self._countAndIndexKmersForGivenK()
|
|
183 self._searchKmersListInTallymerIndex()
|
|
184 self._log.info("Finished running tallymer scan of sequence %s " % (self.inputFasta))
|
|
185
|
|
186 def _convertTallymerToMap(self):
|
|
187 self._log.info("Starting to run tallymer search to map conversion")
|
|
188 totalNbOcc, dKmer2Occ, self._plot_data, self._plot_data2 = ConvertUtils.convertTallymerFormatIntoMapFormatAndGenerateData(self.inputFasta, self._tmpSearchFileName, self._tmpMapFileName)
|
|
189 self._log.debug("totalNbOcc=%i" % totalNbOcc)
|
|
190 self._writeOccurencesNbAndFrequencyOfKmers(totalNbOcc, dKmer2Occ)
|
|
191 self._log.info("Finished tallymer search to map conversion")
|
|
192
|
|
193 def _doPlot(self):
|
|
194 iBioseqDB = BioseqDB()
|
|
195 iBioseqDB.load(self.inputFasta)
|
|
196 largest_seqsDb = iBioseqDB.bestLength(self.nLargestScaffoldsToPlot)
|
|
197
|
|
198 for seq in largest_seqsDb.db:
|
|
199 headerCleaned = seq.header.replace(" ", "_")
|
|
200 shortFastaName = "%s_%s" % (os.path.basename(self.inputFasta),headerCleaned)
|
|
201 data = self._plot_data2[seq.header]
|
|
202 gffPlotter = PlotBenchMarkGFFFiles(yLabel="Number of Kmer Occurences", maxLength=1, title="Kmer repartition along the input sequence: %s; MerSize: %i" % (shortFastaName, self.merSize) )
|
|
203 gffPlotter.setOutFileName("%s_%s.png" % (self._tmpSearchFileName, headerCleaned))
|
|
204 gffPlotter._title = "Mers along the input sequence: %s MerSize: %i" % (shortFastaName, self.merSize)
|
|
205 gffPlotter._xLabel = "Coordinates along the input sequence (%s)" % shortFastaName
|
|
206 gffPlotter._rawData = data
|
|
207 gffPlotter.run()
|
|
208
|
|
209 def _indexInputFastaFile(self):
|
|
210 self._log.debug("index the input fasta file: get an enhanced suffix array.")
|
|
211 cmd = "gt suffixerator -dna -pl -tis -suf -lcp -v -db %s -indexname %s.reads" % (self.indexationFasta, self.indexationFasta)
|
|
212 process = subprocess.Popen(cmd.split(' '),stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
|
213 self._log.debug("Running suffixerator with following params %s" % cmd)
|
|
214 output = process.communicate()
|
|
215 self._log.debug("Suffixerator Output:\n%s" % output[0])
|
|
216 if process.returncode != 0:
|
|
217 self._logAndRaise("Error in generating enhanced suffix array in %s" % self.indexationFasta)
|
|
218
|
|
219 def _countAndIndexKmersForGivenK(self):
|
|
220 self._log.debug("Counting and indexing k-mers for k = %i " % self.merSize)
|
|
221 cmd = "gt tallymer mkindex -mersize %i -minocc %i -indexname %s.tyr-reads -counts -pl -esa %s.reads" % (self.merSize, self.minOccs, self.indexationFasta, self.indexationFasta)
|
|
222 process = subprocess.Popen(cmd.split(' '), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
|
223 self._log.debug("Running tallymer mkindex with following params %s" % cmd)
|
|
224 output = process.communicate()
|
|
225 self._log.debug("Tallymer mkindex Output:\n%s" % output[0])
|
|
226 if process.returncode != 0:
|
|
227 self._logAndRaise("Error in indexing kmers in %s.reads" % self.indexationFasta)
|
|
228
|
|
229 def _searchKmersListInTallymerIndex(self):
|
|
230 self._log.debug("Searching list of kmers in tallymer-index ")
|
|
231 cmd = "gt tallymer search -output qseqnum qpos counts sequence -tyr %s.tyr-reads -q %s" % (self.indexationFasta, self.inputFasta)
|
|
232 process = subprocess.Popen(cmd.split(' '),stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
|
233 self._log.debug("Running tallymer search with following params %s" % cmd)
|
|
234 output = process.communicate()
|
|
235 self._log.debug("Tallymer search Output:\n%s" % output[0])
|
|
236 if process.returncode != 0:
|
|
237 self._logAndRaise("Error in searching for kmers in %s.tyr-reads" % self.indexationFasta)
|
|
238 tmpOutputFile = open(self._tmpSearchFileName,'w')
|
|
239 tmpOutputFile.write(output[0])
|
|
240 tmpOutputFile.close()
|
|
241
|
|
242 def _writeOccurencesNbAndFrequencyOfKmers(self, totalNbOcc, dKmer2Occ):
|
|
243 with open(self._tmpStatFileName, "w") as statFile:
|
|
244 statFile.write("kmer\tocc\tfreq\n")
|
|
245 for kmer in dKmer2Occ.keys():
|
|
246 statFile.write("%s\t%i\t%.10f\n" % (kmer, dKmer2Occ[kmer], dKmer2Occ[kmer] / float(totalNbOcc)))
|
|
247
|
|
248 def _writeOutputFiles(self):
|
|
249 for format in self.outputFormats:
|
|
250 self._log.info("Generating %s file" % format)
|
|
251 outputFileName = "%s.tallymer.%s" % (os.path.splitext(self.inputFasta)[0], format)
|
|
252 try:
|
|
253 iConvertTranscriptFile = ConvertTranscriptFile(inputFileName=self._tmpMapFileName, name="Tallymer",\
|
|
254 inputFormat="map", outputFileName=outputFileName, outputFormat=format,feature= "Match", featurePart="Match-part", verbosity=0) #self.verbosity
|
|
255 iConvertTranscriptFile.run()
|
|
256 except Exception as inst:
|
|
257 self._log.error("Error: %s - Failed to generate %s format ouput, skipping" % (inst, format))
|
|
258 shutil.copy2(outputFileName, "../.")
|
|
259
|
|
260
|
|
261 class ConvertUtils(object):
|
|
262
|
|
263 def convertTallymerFormatIntoMapFormatAndGenerateData(fastaFileName, searchFileName, mapFileName):
|
|
264 dIndex2NameLengthList = ConvertUtils._createDictOfNameLengthAccordingToSeqOrder(fastaFileName)
|
|
265 plotData = {}
|
|
266 plotData2 = {}
|
|
267 with open(searchFileName, "r") as talFile:
|
|
268 with open(mapFileName, "w") as mapFile:
|
|
269 totalNbOcc = 0
|
|
270 dKmer2Occ = {}
|
|
271 line = talFile.readline()
|
|
272 while line:
|
|
273 data = line[:-1].split("\t")
|
|
274 name = "%s_%s" % (data[3], data[2])
|
|
275 nbOccs = int(data[2])
|
|
276 chrname = dIndex2NameLengthList[int(data[0])][0]
|
|
277 if data[1][0] == "+":
|
|
278 start = int(data[1][1:]) + 1
|
|
279 end = start + len(data[3])
|
|
280 elif data[1][0] == "-":
|
|
281 start_revcomp = int(data[1][1:])
|
|
282 start = dIndex2NameLengthList[int(data[0])][1] - start_revcomp - 1
|
|
283 end = end - len(data[3]) + 1
|
|
284 mapLine = "%s\t%s\t%s\t%s\t%i\n" % (name, chrname, start, end, nbOccs)
|
|
285 mapFile.write(mapLine)
|
|
286
|
|
287 if plotData2.get(chrname,None) is None:
|
|
288 plotData2[chrname] = {}
|
|
289 if plotData2[chrname].get(start, None) is None:
|
|
290 plotData2[chrname][start]=0
|
|
291 plotData2[chrname][start] += nbOccs
|
|
292
|
|
293 totalNbOcc += 1
|
|
294 if dKmer2Occ.has_key(data[3]):
|
|
295 dKmer2Occ[data[3]] += 1
|
|
296 else:
|
|
297 dKmer2Occ[data[3]] = 1
|
|
298 plotData[start] = nbOccs
|
|
299 line = talFile.readline()
|
|
300 return totalNbOcc, dKmer2Occ, plotData, plotData2
|
|
301
|
|
302 convertTallymerFormatIntoMapFormatAndGenerateData = staticmethod(convertTallymerFormatIntoMapFormatAndGenerateData)
|
|
303
|
|
304 def _createDictOfNameLengthAccordingToSeqOrder(fastaFileName):
|
|
305 with open(fastaFileName) as fastaFile:
|
|
306 line = fastaFile.readline()
|
|
307 i = 0
|
|
308 length = 0
|
|
309 dIndex2Name = {}
|
|
310 while line:
|
|
311 if line[0] == ">":
|
|
312 dIndex2Name[i] = [line[1:-1]]
|
|
313 if i > 0:
|
|
314 dIndex2Name[i - 1].append(length)
|
|
315 length = 0
|
|
316 i += 1
|
|
317 else:
|
|
318 length += len(line[:-1])
|
|
319 line = fastaFile.readline()
|
|
320 dIndex2Name[i - 1].append(length)
|
|
321 return dIndex2Name
|
|
322
|
|
323 _createDictOfNameLengthAccordingToSeqOrder = staticmethod(_createDictOfNameLengthAccordingToSeqOrder)
|
|
324
|
|
325 if __name__ == "__main__":
|
|
326 iLaunchTallymer = LaunchTallymer()
|
|
327 iLaunchTallymer.setAttributesFromCmdLine()
|
|
328 iLaunchTallymer.run() |