18
|
1 #!/usr/bin/env python
|
|
2
|
|
3 # Copyright INRA (Institut National de la Recherche Agronomique)
|
|
4 # http://www.inra.fr
|
|
5 # http://urgi.versailles.inra.fr
|
|
6 #
|
|
7 # This software is governed by the CeCILL license under French law and
|
|
8 # abiding by the rules of distribution of free software. You can use,
|
|
9 # modify and/ or redistribute the software under the terms of the CeCILL
|
|
10 # license as circulated by CEA, CNRS and INRIA at the following URL
|
|
11 # "http://www.cecill.info".
|
|
12 #
|
|
13 # As a counterpart to the access to the source code and rights to copy,
|
|
14 # modify and redistribute granted by the license, users are provided only
|
|
15 # with a limited warranty and the software's author, the holder of the
|
|
16 # economic rights, and the successive licensors have only limited
|
|
17 # liability.
|
|
18 #
|
|
19 # In this respect, the user's attention is drawn to the risks associated
|
|
20 # with loading, using, modifying and/or developing or reproducing the
|
|
21 # software by the user in light of its specific status of free software,
|
|
22 # that may mean that it is complicated to manipulate, and that also
|
|
23 # therefore means that it is reserved for developers and experienced
|
|
24 # professionals having in-depth computer knowledge. Users are therefore
|
|
25 # encouraged to load and test the software's suitability as regards their
|
|
26 # requirements in conditions enabling the security of their systems and/or
|
|
27 # data to be ensured and, more generally, to use and operate it in the
|
|
28 # same conditions as regards security.
|
|
29 #
|
|
30 # The fact that you are presently reading this means that you have had
|
|
31 # knowledge of the CeCILL license and that you accept its terms.
|
|
32
|
|
33 import re
|
|
34 import os
|
|
35 from commons.core.LoggerFactory import LoggerFactory
|
|
36 from commons.core.utils.RepetOptionParser import RepetOptionParser
|
|
37 from commons.core.checker.ConfigChecker import ConfigRules, ConfigChecker
|
|
38 import subprocess
|
|
39 from commons.core.seq.AlignedBioseqDB import AlignedBioseqDB
|
|
40 from commons.core.sql.TableSeqAdaptator import TableSeqAdaptator
|
|
41 from commons.core.coord.PathUtils import PathUtils
|
|
42 from commons.core.coord.SetUtils import SetUtils
|
|
43 from commons.core.sql.TablePathAdaptator import TablePathAdaptator
|
|
44 from commons.core.coord.Set import Set
|
|
45 from commons.core.sql.DbFactory import DbFactory
|
|
46
|
|
47 ## Align a TE on genome according to annotation
|
|
48
|
|
49 LOG_DEPTH = "repet.tools"
|
|
50
|
|
51 class AlignTEOnGenomeAccordingToAnnotation(object):
|
|
52
|
|
53 def __init__(self, pathTableName = "", queryTableName = "", subjectTableName = "", mergeSamePathId = False, outTableName = "", matchPenality=10, mism=8, gapo=16, gape=4, gapl=20, configFileName = "", doClean = False, verbosity = 0):
|
|
54 self._pathTableName = pathTableName
|
|
55 self._queryTableName = queryTableName
|
|
56 self._subjectTableName = subjectTableName
|
|
57 self._mergeSamePathId = mergeSamePathId
|
|
58 self.setOutTableName(outTableName)
|
|
59 self._matchPenality = matchPenality
|
|
60 self._mismatch = mism
|
|
61 self._gapOpening = gapo
|
|
62 self._gapExtend = gape
|
|
63 self._gapLength = gapl
|
|
64 self._configFileName = configFileName
|
|
65 self._doClean = doClean
|
|
66 self._verbosity = verbosity
|
|
67 self._iDb = None
|
|
68 self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity)
|
|
69
|
|
70 def setAttributesFromCmdLine(self):
|
|
71 description = "Align a TE on genome according to annotation."
|
|
72 epilog = "\nExample 1: launch without verbosity and keep temporary files.\n"
|
|
73 epilog += "\t$ python AlignTEOnGenomeAccordingToAnnotation.py -p DmelChr4_chr_allTEs_nr_noSSR_join_path -q DmelChr4_chr_seq -s DmelChr4_refTEs_seq -v 0"
|
|
74 epilog += "\n"
|
|
75 parser = RepetOptionParser(description = description, epilog = epilog)
|
|
76 parser.add_option("-p", "--path", dest = "pathTableName", action = "store", type = "string", help = "path table name [compulsory] [format: path]", default = "")
|
|
77 parser.add_option("-q", "--query", dest = "queryTableName", action = "store", type = "string", help = "query table name [compulsory] [format: seq]", default = "")
|
|
78 parser.add_option("-s", "--subject", dest = "subjectTableName", action = "store", type = "string", help = "subject table name [compulsory] [format: seq]", default = "")
|
|
79 parser.add_option("-m", "--merge", dest = "mergeSamePathId", action = "store_true", help = "merge joined matchs [optional] [default: False]", default = False)
|
|
80 parser.add_option("-o", "--out", dest = "outTableName", action = "store", type = "string", help = "output table name [default: <pathTableName>_align]", default = "")
|
|
81 #TODO: add options for : matchPenality=10, mism=8, gapo=16, gape=4, gapl=20
|
|
82 parser.add_option("-C", "--config", dest = "configFileName", action = "store", type = "string", help = "configuration file name (e.g. TEannot.cfg)", default = "")
|
|
83 #NOTE: doClean unused
|
|
84 # parser.add_option("-c", "--clean", dest = "doClean", action = "store_true", help = "clean temporary files [optional] [default: False]", default = False)
|
|
85 parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", help = "verbosity [optional] [default: 1]", default = 1)
|
|
86 options = parser.parse_args()[0]
|
|
87 self._setAttributesFromOptions(options)
|
|
88
|
|
89 def _setAttributesFromOptions(self, options):
|
|
90 self.setPathTableName(options.pathTableName)
|
|
91 self.setQueryTableName(options.queryTableName)
|
|
92 self.setSubjectTableName(options.subjectTableName)
|
|
93 self.setMergeSamePathId(options.mergeSamePathId)
|
|
94 self.setOutTableName(options.outTableName)
|
|
95 self.setConfigFileName(options.configFileName)
|
|
96 # self.setDoClean(options.doClean)
|
|
97 self.setVerbosity(options.verbosity)
|
|
98
|
|
99 def _checkConfig(self):
|
|
100 iConfigRules = ConfigRules()
|
|
101 iConfigRules.addRuleSection(section="", mandatory=True)
|
|
102 iConfigRules.addRuleOption(section="", option ="fasta_name", mandatory=True, type="string")
|
|
103 iConfigRules.addRuleOption(section="", option ="clean", mandatory=True, type="bool")
|
|
104 iConfigChecker = ConfigChecker(self._configFileName, iConfigRules)
|
|
105 iConfig = iConfigChecker.getConfig()
|
|
106 self._setAttributesFromConfig(iConfig)
|
|
107
|
|
108 def _setAttributesFromConfig(self, iConfig):
|
|
109 self.setOutTableName(self._outTableName)
|
|
110 self.setDoClean(iConfig.get("", "clean"))
|
|
111
|
|
112 def setPathTableName(self, pathTableName):
|
|
113 self._pathTableName = pathTableName
|
|
114
|
|
115 def setQueryTableName(self, queryTableName):
|
|
116 self._queryTableName = queryTableName
|
|
117
|
|
118 def setSubjectTableName(self, subjectTableName):
|
|
119 self._subjectTableName = subjectTableName
|
|
120
|
|
121 def setMergeSamePathId(self, mergeSamePathId):
|
|
122 self._mergeSamePathId = mergeSamePathId
|
|
123
|
|
124 def setOutTableName(self, outTableName):
|
|
125 if outTableName == "":
|
|
126 self._outTableName = "%s_align" % self._pathTableName
|
|
127 else:
|
|
128 self._outTableName = outTableName
|
|
129
|
|
130 def setConfigFileName(self, configFileName):
|
|
131 self._configFileName = configFileName
|
|
132
|
|
133 def setDoClean(self, doClean):
|
|
134 self._doClean = doClean
|
|
135
|
|
136 def setVerbosity(self, verbosity):
|
|
137 self._verbosity = verbosity
|
|
138
|
|
139 def setDbInstance(self, iDb):
|
|
140 self._iDb = iDb
|
|
141
|
|
142 def _checkOptions(self):
|
|
143 if self._pathTableName == "" or not self._iDb.doesTableExist(self._pathTableName):
|
|
144 self._logAndRaise("ERROR: Missing path table")
|
|
145 if self._queryTableName == "" or not self._iDb.doesTableExist(self._queryTableName):
|
|
146 self._logAndRaise("ERROR: Missing query table")
|
|
147 if self._subjectTableName == "" or not self._iDb.doesTableExist(self._subjectTableName):
|
|
148 self._logAndRaise("ERROR: Missing subject table")
|
|
149
|
|
150 def _logAndRaise(self, errorMsg):
|
|
151 self._log.error(errorMsg)
|
|
152 raise Exception(errorMsg)
|
|
153
|
|
154 def alignBioseqWithNWalign(self, iBioseq1, iBioseq2):
|
|
155 fastaFileName1 = "seqtoalign1.tmp"
|
|
156 iBioseq1.save(fastaFileName1)
|
|
157 fastaFileName2 = "seqtoalign2.tmp"
|
|
158 iBioseq2.save(fastaFileName2)
|
|
159 alignBioseqDBFileName = "aligned.tmp"
|
|
160 cmd = "NWalign"
|
|
161 cmd += " %s" % fastaFileName1
|
|
162 cmd += " %s" % fastaFileName2
|
|
163 cmd += " -m %s" % self._matchPenality
|
|
164 cmd += " -d %s" % self._mismatch
|
|
165 cmd += " -g %s" % self._gapOpening
|
|
166 cmd += " -e %s" % self._gapExtend
|
|
167 cmd += " -l %s" % self._gapLength
|
|
168 cmd += " -D"
|
|
169 cmd += " -o %s" % alignBioseqDBFileName
|
|
170 process = subprocess.Popen(cmd, shell = True)
|
|
171 self._log.debug("Running : %s" % cmd)
|
|
172 process.communicate()
|
|
173 if process.returncode != 0:
|
|
174 self._logAndRaise("ERROR when launching '%s'" % cmd)
|
|
175 iAlignedBioseqDB = AlignedBioseqDB()
|
|
176 iAlignedBioseqDB.load(alignBioseqDBFileName)
|
|
177 os.remove(fastaFileName1)
|
|
178 os.remove(fastaFileName2)
|
|
179 os.remove(alignBioseqDBFileName)
|
|
180 return iAlignedBioseqDB
|
|
181
|
|
182 def alignSeqAccordingToPathAndBuildAlignedSeqTable(self):
|
|
183 if self._iDb.doesTableExist(self._outTableName):
|
|
184 self._logAndRaise("ERROR: out table %s already exists" % self._outTableName)
|
|
185
|
|
186 #TODO: create alignedSeq table in DbMySql...
|
|
187 sqlCmd="CREATE TABLE %s (path int unsigned, query_aligned_seq longtext, subject_aligned_seq longtext, score int unsigned, identity float unsigned)" % self._outTableName
|
|
188 self._iDb.execute(sqlCmd)
|
|
189
|
|
190 iQueryTSA = TableSeqAdaptator(self._iDb, self._queryTableName)
|
|
191 iSubjectTSA = TableSeqAdaptator(self._iDb, self._subjectTableName)
|
|
192 iTPA = TablePathAdaptator(self._iDb, self._pathTableName)
|
|
193
|
|
194 if self._mergeSamePathId:
|
|
195 lPathId = iTPA.getIdList()
|
|
196 pathNb = len(lPathId)
|
|
197 count = 0
|
|
198 for pathNum in lPathId:
|
|
199 self._log.debug(count,"/",pathNb,"=>path",pathNum,"...")
|
|
200 lPaths = iTPA.getPathListFromId(pathNum)
|
|
201 #TODO: getSetListFromQueries() call getSubjectAsSetOfQuery() => "reverse complement" the query (coordinates are inversed, so getBioseq() will take the reverse-comp.) : is it correct ?
|
|
202 lQuerySets = PathUtils.getSetListFromQueries(lPaths)
|
|
203 #NOTE: merge sets if overlap
|
|
204 lQueryMergedSets = SetUtils.mergeSetsInList(lQuerySets)
|
|
205 #TODO: getBioseqFromSetList() build a sequence that does not exist : is it correct ?
|
|
206 iQueryBioseq = iQueryTSA.getBioseqFromSetList(lQueryMergedSets)
|
|
207 lSubjectSets = PathUtils.getSetListFromSubjects(lPaths)
|
|
208 #TODO: no merge for subjects : is it correct ? matcher allow overlap on query and not on subject ?
|
|
209 iSubjectBioseq = iSubjectTSA.getBioseqFromSetList(lSubjectSets)
|
|
210 iAlignedBioseqDB = self.alignBioseqWithNWalign(iQueryBioseq, iSubjectBioseq)
|
|
211 self._insertAlignedBioseqDBWithScoreAndIdentityInTable(pathNum, iAlignedBioseqDB)
|
|
212 else:
|
|
213 lPathId = iTPA.getIdList()
|
|
214 pathNb = len(lPathId)
|
|
215 count = 0
|
|
216 for pathNum in lPathId:
|
|
217 self._log.debug(count,"/",pathNb,"=>path",pathNum,"...")
|
|
218 lPaths = iTPA.getPathListFromId(pathNum)
|
|
219 queryName = lPaths[0].getQueryName()
|
|
220 subjectName = lPaths[0].getSubjectName()
|
|
221 lQueryStart = []
|
|
222 lQueryEnd = []
|
|
223 lSubjectStart = []
|
|
224 lSubjectEnd = []
|
|
225 isReversed = not lPaths[0].isSubjectOnDirectStrand()
|
|
226 for iPath in lPaths:
|
|
227 lQueryStart.append(iPath.getQueryStart())
|
|
228 lQueryEnd.append(iPath.getQueryEnd())
|
|
229 lSubjectStart.append(iPath.getSubjectStart())
|
|
230 lSubjectEnd.append(iPath.getSubjectEnd())
|
|
231 queryStart = min(lQueryStart)
|
|
232 queryEnd = max(lQueryEnd)
|
|
233 if isReversed:
|
|
234 subjectStart = max(lSubjectStart)
|
|
235 subjectEnd = min(lSubjectEnd)
|
|
236 else:
|
|
237 subjectStart = min(lSubjectStart)
|
|
238 subjectEnd = max(lSubjectEnd)
|
|
239 lQuerySets = [Set(pathNum,subjectName, queryName,queryStart,queryEnd)]
|
|
240 lSubjectSets = [Set(pathNum,queryName, subjectName,subjectStart,subjectEnd)]
|
|
241
|
|
242 iQueryBioseq = iQueryTSA.getBioseqFromSetList(lQuerySets)
|
|
243 iSubjectBioseq = iSubjectTSA.getBioseqFromSetList(lSubjectSets)
|
|
244 iAlignedBioseqDB = self.alignBioseqWithNWalign(iQueryBioseq, iSubjectBioseq)
|
|
245 self._insertAlignedBioseqDBWithScoreAndIdentityInTable(pathNum, iAlignedBioseqDB)
|
|
246
|
|
247 def run(self):
|
|
248 LoggerFactory.setLevel(self._log, self._verbosity)
|
|
249 if self._configFileName:
|
|
250 self._checkConfig()
|
|
251 self._iDb = DbFactory.createInstance()
|
|
252 self._checkOptions()
|
|
253 self._log.info("START AlignTEOnGenomeAccordingToAnnotation")
|
|
254 self._log.debug("path table name: %s" % self._pathTableName)
|
|
255 self._log.debug("query table name: %s" % self._queryTableName)
|
|
256 self._log.debug("subject table name: %s" % self._subjectTableName)
|
|
257 self.alignSeqAccordingToPathAndBuildAlignedSeqTable()
|
|
258 self._iDb.close()
|
|
259 self._log.info("END AlignTEOnGenomeAccordingToAnnotation")
|
|
260
|
|
261 def _insertAlignedBioseqDBWithScoreAndIdentityInTable(self, pathNum, iAlignedBioseqDB):
|
|
262 scoreWithEndLine = re.split("Score=", iAlignedBioseqDB.db[0].header)[1]
|
|
263 score = int(scoreWithEndLine.split()[0])
|
|
264
|
|
265 identity = re.split("Identity=", scoreWithEndLine)[1]
|
|
266 if identity == "nan":
|
|
267 identity = "0.0"
|
|
268 identity = float(identity)*100.0
|
|
269
|
|
270 #TODO: create TableAlignedSeqAdaptator (to use insert...)
|
|
271 sqlCmd = 'INSERT INTO %s VALUES (%d,"%s","%s", %d,%f);' % (self._outTableName, pathNum, iAlignedBioseqDB.db[0].sequence, iAlignedBioseqDB.db[1].sequence, score, identity)
|
|
272 self._iDb.execute(sqlCmd)
|
|
273
|
|
274 self._log.debug("header:", iAlignedBioseqDB.db[0].header)
|
|
275 self._log.debug("path", pathNum, "Score=", score, "Identity=", identity, "ok")
|
|
276 self._log.debug(iAlignedBioseqDB.db[0].sequence[:80])
|
|
277 self._log.debug(iAlignedBioseqDB.db[1].sequence[:80])
|
|
278
|
|
279 if __name__ == "__main__":
|
|
280 iLaunch = AlignTEOnGenomeAccordingToAnnotation()
|
|
281 iLaunch.setAttributesFromCmdLine()
|
|
282 iLaunch.run() |