Mercurial > repos > yufei-luo > s_mart
comparison commons/tools/GetMultAlignAndPhylogenyPerTErefSeq.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
17:b0e8584489e6 | 18:94ab73e8a190 |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 ##@file | |
4 # For each TE reference sequence, it computes a multiple alignment and a phylogeny of all its copies. | |
5 # usage: GetMultAlignAndPhylogenyPerTErefSeq.py [ options ] | |
6 # options: | |
7 # -h: this help | |
8 # -S: step (0: all steps [default], 1:file generation, 2:multiple alignements, 3:phylogenies) | |
9 # -p: table with the annotations (format=path) | |
10 # -s: table with the TE reference sequences (format=seq) | |
11 # -g: table with the genome sequence (format=seq) | |
12 # -r: name or file with TE reference sequence(s) (all by default) | |
13 # -m: MSA method (default=Refalign/Map) | |
14 # -l: minimum length of copies (default=100) | |
15 # -n: number of longest copies to use (default=20) | |
16 # -y: minimum copy proportion compare to references (default=0.5) | |
17 # -R: keep the reference sequence (only with Refalign) | |
18 # -C: configuration file | |
19 # -q: queue name | |
20 # -c: clean | |
21 # -d: temporary directory | |
22 # -v: verbosity level (default=0/1) | |
23 | |
24 import os | |
25 import sys | |
26 import glob | |
27 import ConfigParser | |
28 | |
29 import pyRepet.launcher.programLauncher | |
30 | |
31 from commons.core.coord.PathUtils import PathUtils | |
32 from commons.core.seq.FastaUtils import FastaUtils | |
33 from commons.core.coord.SetUtils import SetUtils | |
34 from commons.core.sql.TablePathAdaptator import TablePathAdaptator | |
35 from commons.core.sql.TableSeqAdaptator import TableSeqAdaptator | |
36 from commons.tools.OrientSequences import OrientSequences | |
37 from ConfigParser import MissingSectionHeaderError | |
38 from commons.core.utils.RepetOptionParser import RepetOptionParser | |
39 from commons.core.LoggerFactory import LoggerFactory | |
40 from commons.core.seq.AlignedBioseqDB import AlignedBioseqDB | |
41 from commons.launcher import LaunchMap | |
42 from commons.core.sql.DbFactory import DbFactory | |
43 from commons.core.sql.TableJobAdaptatorFactory import TableJobAdaptatorFactory | |
44 from commons.core.launcher.Launcher import Launcher | |
45 from commons.core.utils.FileUtils import FileUtils | |
46 | |
47 | |
48 LOG_DEPTH = "repet.tools" | |
49 | |
50 ## For each TE reference sequence, it computes a multiple alignment and a phylogeny of all its copies. | |
51 # | |
52 class GetMultAlignAndPhylogenyPerTErefSeq(object): | |
53 | |
54 def __init__(self, pathTableName="",refSeqTableName="", genomeSeqTableName="", step=0, mSAmethod="RefAlign",keepRefseq=False, configFileName= "", clean = True, verbosity=3): | |
55 """ | |
56 Constructor. | |
57 """ | |
58 self.step = step | |
59 self._pathTable = pathTableName | |
60 self._refSeqTable = refSeqTableName | |
61 self._genomeSeqTable = genomeSeqTableName | |
62 self._TErefseq = "" | |
63 self._MSAmethod = mSAmethod | |
64 self._minCopyLength = 100 | |
65 self._nbLongestCopies = 20 | |
66 self._minPropCopy = 0.5 | |
67 self._keepRefseq = keepRefseq | |
68 self.setConfigFileName(configFileName) | |
69 self._queue = "" | |
70 self._tmpDir = "" | |
71 self._clean = clean | |
72 self._verbosity = verbosity | |
73 self._db = None | |
74 self._tpaAnnot = None | |
75 self._tsaRef = None | |
76 self._pL = pyRepet.launcher.programLauncher.programLauncher() | |
77 self._log = LoggerFactory.createLogger("%s.%s" % (LOG_DEPTH, self.__class__.__name__), self._verbosity) | |
78 | |
79 def _logAndRaise(self, errorMsg): | |
80 self._log.error(errorMsg) | |
81 raise Exception(errorMsg) | |
82 | |
83 def setAttributesFromCmdLine(self): | |
84 desc = "For each TE reference sequence, it computes a multiple alignment and a phylogeny of all its copies.\n" | |
85 #Commented: it's not true, Config File is mandatory! | |
86 # desc += "Connection to the database parameters are retrieved from the environment" | |
87 | |
88 #TODO: format options as other scripts (have a look at LaunchTemplate) | |
89 parser = RepetOptionParser(description = desc, epilog = "") | |
90 parser.add_option("-S", "--step", dest = "step" , action = "store", type = "int", help = "step (0: all steps [default], 1:file generation, 2:multiple alignments, 3:phylogenies)", default = 0 ) | |
91 parser.add_option("-p", "--pathTable", dest = "path", action= "store", type = "string", help = "(mandatory) table with the annotations (format=path)", default = "") | |
92 parser.add_option("-s", "--refSeqTable",dest = "refSeqTable", action= "store", type = "string", help = "(mandatory) table with the TE reference sequences (format=seq)", default = "") | |
93 parser.add_option("-g", "--genomeSeqTable",dest = "genomeSeqTable",action= "store", type = "string", help = "(mandatory) table with the genome sequence (format=seq)", default = "") | |
94 parser.add_option("-r", "--TErefseq",dest = "TErefseq", action= "store", type = "string", help = "name or file with TE reference sequence(s) (all by default)", default = "") | |
95 parser.add_option("-m", "--MSAmethod",dest = "MSAmethod", action= "store", type = "string", help = "MSA method (default=RefAlign/Map)", default = "RefAlign") | |
96 parser.add_option("-l", "--minCopyLength",dest = "minCopyLength", action= "store", type = "int", help = "minimum length of copies (default=100)", default = 100) | |
97 parser.add_option("-n", "--nbLongestCopies",dest = "nbLongestCopies", action= "store", type = "int", help = "number of longest copies to use (default=20)", default = 20) | |
98 parser.add_option("-y", "--minPropCopy",dest = "minPropCopy", action= "store", type = "float", help = "minimum copy proportion compare to references (default=0.5)", default = 0.5) | |
99 parser.add_option("-R", "--keepRefseq",dest = "keepRefseq", action="store_true", help = "keep the reference sequence (only with Refalign)", default = False) | |
100 parser.add_option("-C", "--config", dest = "configFileName", action = "store", type = "string", help = "(mandatory) config file name to set database connection", default = "") | |
101 parser.add_option("-q", "--queue",dest = "queue", action= "store", type = "string", help = "queue name", default = "") | |
102 parser.add_option("-c", "--clean", action="store_false", dest = "clean", help = "don't clean", default = True) | |
103 parser.add_option("-d", "--tmpDir",dest = "tmpDir", action= "store", type = "string", help = "temporary directory", default = "") | |
104 parser.add_option("-v", "--verbosity", dest = "verbosity", action = "store", type = "int", help = "verbosity level (default=0)", default = 0) | |
105 options = parser.parse_args()[0] | |
106 self._setAttributesFromOptions(options) | |
107 | |
108 def _setAttributesFromOptions(self, options): | |
109 self.setConfigFileName(options.configFileName) | |
110 self.setStep(options.step) | |
111 self.setPathTable(options.path) | |
112 self.setRefSeqTable(options.refSeqTable) | |
113 self.setGenomeSeqTable(options.genomeSeqTable) | |
114 self.setTErefseq(options.TErefseq) | |
115 self.setMSAmethod(options.MSAmethod) | |
116 self.setMinCopyLength(options.minCopyLength) | |
117 self.setNbLongestCopies(options.nbLongestCopies) | |
118 self.setMinPropCopy(options.minPropCopy) | |
119 self.setKeepRefseq(options.keepRefseq) | |
120 self.setQueue(options.queue) | |
121 self.setClean(options.clean) | |
122 self.setTmpDir(options.tmpDir) | |
123 self.setVerbosity(options.verbosity) | |
124 | |
125 def setStep(self, step): | |
126 self.step = step | |
127 | |
128 def setPathTable(self, path): | |
129 self._pathTable = path | |
130 | |
131 def setRefSeqTable(self, refSeqTable): | |
132 self._refSeqTable = refSeqTable | |
133 | |
134 def setGenomeSeqTable(self, genomeSeqTable): | |
135 self._genomeSeqTable = genomeSeqTable | |
136 | |
137 def setTErefseq(self, TErefseq): | |
138 self._TErefseq = TErefseq | |
139 | |
140 def setMSAmethod(self, MSAmethod): | |
141 self._MSAmethod = MSAmethod | |
142 | |
143 def setMinCopyLength(self, minCopyLength): | |
144 self._minCopyLength = minCopyLength | |
145 | |
146 def setNbLongestCopies(self, nbLongestCopies): | |
147 self._nbLongestCopies = nbLongestCopies | |
148 | |
149 def setMinPropCopy(self, minPropCopy): | |
150 self._minPropCopy = minPropCopy | |
151 | |
152 def setKeepRefseq(self, keepRefseq): | |
153 self._keepRefseq = keepRefseq | |
154 | |
155 def setQueue(self, queue): | |
156 self._queue = queue | |
157 | |
158 def setClean(self, clean): | |
159 self._clean = clean | |
160 | |
161 def setTmpDir(self, tmpDir): | |
162 self._tmpDir = tmpDir | |
163 | |
164 def setVerbosity(self, verbosity): | |
165 self._verbosity = verbosity | |
166 | |
167 def setup_env(self): | |
168 configFileHandle = open(self._configFileName) | |
169 # Use RepetConfigParser? | |
170 config = ConfigParser.ConfigParser() | |
171 try : | |
172 config.readfp(configFileHandle) | |
173 except MissingSectionHeaderError: | |
174 self._logAndRaise("Configuration file %s must begin with a section header" % self._configFileName) | |
175 | |
176 os.environ["REPET_HOST"] = config.get("repet_env", "repet_host") | |
177 os.environ["REPET_USER"] = config.get("repet_env", "repet_user") | |
178 os.environ["REPET_PW"] = config.get("repet_env", "repet_pw") | |
179 os.environ["REPET_DB"] = config.get("repet_env", "repet_db") | |
180 os.environ["REPET_PORT"] = config.get("repet_env", "repet_port") | |
181 os.environ["REPET_JOB_MANAGER"] = config.get("repet_env", "repet_job_manager") | |
182 | |
183 def setConfigFileName(self, configFileName): | |
184 self._configFileName = configFileName | |
185 | |
186 def checkAttributes( self ): | |
187 """ | |
188 Check the attributes are valid before running the algorithm. | |
189 """ | |
190 if self._pathTable == "": | |
191 self._logAndRaise("PathTable is mandatory") | |
192 if self._refSeqTable == "": | |
193 self._logAndRaise("RefSeqTable is mandatory") | |
194 if self._genomeSeqTable == "": | |
195 self._logAndRaise("GenomeSeqTable is mandatory") | |
196 if self._configFileName == "": | |
197 self._logAndRaise("Configuration file is mandatory") | |
198 else: | |
199 if FileUtils.isRessourceExists(self._configFileName): | |
200 self.setup_env() | |
201 else: | |
202 self._logAndRaise("Configuration file '%s' does not exist!" % self._configFileName) | |
203 | |
204 if ( self.step == 2 or self.step == 3 ) and self._MSAmethod not in ["RefAlign","Map"]: | |
205 if self._MSAmethod == None or self._MSAmethod == "": | |
206 self._logAndRaise("Missing method option") | |
207 self._logAndRaise("Method '%s' not yet available" % ( self._MSAmethod )) | |
208 | |
209 if self._tmpDir == "": | |
210 self._tmpDir = os.getcwd() | |
211 | |
212 def connectSql(self): | |
213 self._db = DbFactory().createInstance(configFileName = self._configFileName, verbosity = 1) | |
214 self._tpaAnnot = TablePathAdaptator(self._db, self._pathTable) | |
215 self._tsaRef = TableSeqAdaptator(self._db,self._refSeqTable) | |
216 | |
217 def getNamesOfTErefSeq( self ): | |
218 """ | |
219 Return a list with the names of reference TEs. | |
220 """ | |
221 lNamesTErefSeq = [] | |
222 if self._TErefseq == "": | |
223 lNamesTErefSeq = self._tsaRef.getAccessionsList() | |
224 else: | |
225 if os.path.isfile( self._TErefseq ): | |
226 refseqFileHandler = open( self._TErefseq, "r" ) | |
227 while True: | |
228 line = refseqFileHandler.readline() | |
229 if line == "": | |
230 break | |
231 lNamesTErefSeq.append( line[:-1].split("\t")[0] ) | |
232 refseqFileHandler.close() | |
233 else: | |
234 lNamesTErefSeq = [ self._TErefseq ] | |
235 for name in lNamesTErefSeq: | |
236 if not self._tsaRef.isAccessionInTable( name ): | |
237 self._log.warning("'%s' not in table '%s'" % (name, self._refSeqTable)) | |
238 lNamesTErefSeq.remove( name ) | |
239 lNamesTErefSeq.sort() | |
240 self._log.info("nb of TE reference sequences: %d" % (len(lNamesTErefSeq))) | |
241 return lNamesTErefSeq | |
242 | |
243 | |
244 def getTErefSeqInFastaFiles( self, lNamesTErefSeq ): | |
245 """ | |
246 Save sequences of reference TEs in fasta files. | |
247 """ | |
248 for name in lNamesTErefSeq: | |
249 self._log.debug("save sequence of '%s'..." % ( name )) | |
250 self._tsaRef.saveAccessionsListInFastaFile( [name], name+".fa" ) | |
251 | |
252 def getCopiesInFastaFilesPerTErefSeq( self, lNamesTErefSeq ): | |
253 """ | |
254 Save sequences of TE copies in fasta files. | |
255 """ | |
256 self._log.info("retrieve the copies...") | |
257 tsaChr = TableSeqAdaptator( self._db, self._genomeSeqTable ) | |
258 totalNbCopies = 0 | |
259 totalNbSavedCopies = 0 | |
260 for name in lNamesTErefSeq: | |
261 nbCopies = 0 | |
262 if os.path.exists(name+"_copies.fa"): | |
263 continue | |
264 outFile = open( name+"_copies.fa", "w" ) | |
265 self._log.debug("Fetching path nums for subject: %s " % name) | |
266 lPathNums = self._tpaAnnot.getIdListSortedByDecreasingChainLengthFromSubject(name) | |
267 nbCopies = len(lPathNums) | |
268 totalNbCopies += nbCopies | |
269 self._log.debug("refseq '%s': %d copies" % ( name, nbCopies )) | |
270 i = 0 | |
271 nbSavedCopies = 0 | |
272 nbSavedFragments = 0 | |
273 lengthRefseq = self._tsaRef.getSeqLengthFromAccession( name ) | |
274 while i < len(lPathNums) and nbSavedCopies < self._nbLongestCopies: | |
275 lPaths = self._tpaAnnot.getPathListFromId( lPathNums[i] ) | |
276 lSets = PathUtils.getSetListFromQueries( lPaths ) | |
277 copyLength = SetUtils.getCumulLength( lSets ) | |
278 if copyLength >= self._minCopyLength \ | |
279 and copyLength >= self._minPropCopy * lengthRefseq: | |
280 bs = tsaChr.getBioseqFromSetList( lSets ) | |
281 bs.write(outFile) | |
282 nbSavedCopies += 1 | |
283 nbSavedFragments += len(lPaths) | |
284 i += 1 | |
285 | |
286 self._log.debug(" (saved: %d copies, %d fragments)\n" % ( nbSavedCopies, nbSavedFragments ) ) | |
287 outFile.close() | |
288 totalNbSavedCopies += nbSavedCopies | |
289 if nbSavedCopies == 0 and nbCopies != 0: | |
290 self._log.warning("No copy >= %d" % ( self._minCopyLength )) | |
291 self._log.info("nb of copies: %d (%d saved)" % ( totalNbCopies, totalNbSavedCopies )) | |
292 | |
293 def filter4Alignments( self, lNamesTErefSeq ): | |
294 """ | |
295 Filter TE copy sequences according to their length. | |
296 """ | |
297 self._log.info("filtering copies...") | |
298 if len(lNamesTErefSeq) == 1: | |
299 if not os.path.exists( "%s_copies.fa" % ( lNamesTErefSeq[0] ) ): | |
300 self._logAndRaise("first run step 1") | |
301 lInFiles = [ "%s_copies.fa" % ( lNamesTErefSeq[0] ) ] | |
302 else: | |
303 lInFiles = glob.glob( "*_copies.fa" ) | |
304 if len(lInFiles) == 0: | |
305 self._logAndRaise("no file '*_copies.fa'") | |
306 count = 0 | |
307 for inFileName in lInFiles: | |
308 if os.path.exists( "%s.filtered" % ( inFileName ) ): | |
309 continue | |
310 count += 1 | |
311 self._log.debug("TE %d --> %s" %(count,inFileName)) | |
312 FastaUtils.dbLengthFilter( self._minCopyLength, inFileName, verbose=self._verbosity ) | |
313 FastaUtils.dbLongestSequences( self._nbLongestCopies, inFileName+".Sup"+str(self._minCopyLength), verbose=self._verbosity ) | |
314 os.rename( inFileName+".Sup"+str(self._minCopyLength)+".best"+str(self._nbLongestCopies), inFileName+".filtered" ) | |
315 os.remove( inFileName+".Sup"+str(self._minCopyLength) ) | |
316 os.remove( inFileName+".Inf"+str(self._minCopyLength) ) | |
317 self._log.info("files filtered: %d" % (count)) | |
318 | |
319 def buildInFiles4Launcher( self, lNamesTErefSeq ): | |
320 """ | |
321 Save sequences of TE copies with reference in fasta files for launcher usage. | |
322 """ | |
323 self._log.info("building input files for launcher...") | |
324 for name in lNamesTErefSeq: | |
325 if os.path.exists( "%s_all.fa.oriented" % ( name ) ): | |
326 continue | |
327 if FastaUtils.dbSize( "%s_copies.fa.filtered" % ( name ) ) > 0: | |
328 os.system( "cat "+name+".fa "+ name+"_copies.fa.filtered > " + name+"_all.fa" ) | |
329 ors = OrientSequences(inFileName= "%s_all.fa" %(name), prgToOrient="mummer", clean=True, verbosity =self._verbosity - 1) | |
330 ors.run() | |
331 ors.clean() | |
332 if len( glob.glob("*_all.fa.oriented") ) == 0: | |
333 self._logAndRaise("no copies") | |
334 self._log.info("done building input files for launcher...") | |
335 | |
336 def renameFile( self, fromName, toName): | |
337 lFiles = glob.glob( "*%s" %fromName ) | |
338 for f in lFiles: | |
339 os.rename( f, f.replace(fromName,toName)) | |
340 | |
341 def _preparejobs(self, iLauncher, cDir): | |
342 self._log.info("Preparing Align jobs") | |
343 lCmdsTuples = [] | |
344 print(self.queriesDir,self.alignFileSuffix) | |
345 queryFiles = glob.glob("%s/%s" % (self.queriesDir,self.alignFileSuffix)) | |
346 print("queryFiles",queryFiles) | |
347 for queryFile in queryFiles:#os.listdir(self.queriesDir): | |
348 lCmds = [] | |
349 lCmdStart = [] | |
350 lCmdFinish = [] #['shutil.move("%s/*" %newDir, "../" )'] | |
351 queryFilePath = os.path.join(self.queriesDir,queryFile) | |
352 lCmds.append(self._createLaunchAlignCommands(iLauncher, queryFilePath)) | |
353 #lCmdFinish.append("shutil.move(\"%s.%s\", \"%s/%s.%s\")" % (benchName,self.outputformat,self.resultsDir,benchName,self.outputformat)) | |
354 lCmdsTuples.append(iLauncher.prepareCommands_withoutIndentation(lCmds, lCmdStart, lCmdFinish)) | |
355 self._log.info("Finished preparing Align jobs") | |
356 return lCmdsTuples | |
357 | |
358 def _preparePhyMljobs(self, iLauncher, cDir): | |
359 self._log.info("Preparing PhyMl jobs") | |
360 lCmdsTuples = [] | |
361 queryFiles = glob.glob("%s/%s" % (self.queriesDir,self.phyloFileSuffix)) | |
362 print("queryFiles",queryFiles) | |
363 for queryFile in queryFiles:#os.listdir(self.queriesDir): | |
364 lCmds = [] | |
365 lCmdStart = [] | |
366 lCmdFinish = [] #['shutil.move("%s/*" %newDir, "../" )'] | |
367 queryFilePath = os.path.join(self.queriesDir,queryFile) | |
368 lCmds.append(self._createLaunchPhyMLCommands(iLauncher, queryFilePath)) | |
369 lCmdsTuples.append(iLauncher.prepareCommands_withoutIndentation(lCmds, lCmdStart, lCmdFinish)) | |
370 self._log.info("Finished preparing PhyMl jobs") | |
371 return lCmdsTuples | |
372 | |
373 def _createLaunchAlignCommands(self, iLauncher, query): | |
374 if self._MSAmethod == "Map": | |
375 prg = "LaunchMap.py" | |
376 elif self._MSAmethod == "RefAlign": | |
377 prg = "LaunchRefAlign.py" | |
378 | |
379 lArgs = [] | |
380 lArgs.append("-i %s" % query) | |
381 lArgs.append(" -o %s.fa_aln" % query) | |
382 lArgs.append("-v 1" ) | |
383 | |
384 if self._MSAmethod == "RefAlign" and self._keepRefseq: | |
385 lArgs.append("-r " ) | |
386 | |
387 self._log.debug("Prepared Align commands : %s %s" % (prg, " ".join(lArgs))) | |
388 | |
389 | |
390 return iLauncher.getSystemCommand("%s" % prg, lArgs) | |
391 | |
392 def launchMultAlignments(self, lNamesTErefSeq): | |
393 """ | |
394 Make multiple alignments via Map for each TE family | |
395 """ | |
396 self.queriesDir = os.getcwd() | |
397 | |
398 if len(lNamesTErefSeq) == 1: | |
399 self.alignFileSuffix = "%s_all.fa.oriented" % (lNamesTErefSeq[0]) | |
400 else: | |
401 self.alignFileSuffix = "*_all.fa.oriented" | |
402 | |
403 queue = self._queue | |
404 cDir = os.getcwd() | |
405 tmpDir = self._tmpDir | |
406 groupid = "%s_%s" % ( self._refSeqTable, self._MSAmethod ) | |
407 acronym = "Align" | |
408 iDb = DbFactory.createInstance(configFileName=self._configFileName) | |
409 iTJA = TableJobAdaptatorFactory.createInstance(iDb, "jobs") | |
410 iLauncher = Launcher(iTJA, os.getcwd(), "", "", cDir, tmpDir, "jobs", queue, groupid) | |
411 lCmdsTuples = self._preparejobs(iLauncher, cDir) | |
412 iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, self._clean) | |
413 | |
414 self.renameFile("_all.fa.oriented.fa_aln", "_all.fa.oriented_%s.fa_aln" % (self._MSAmethod.lower()) ) | |
415 | |
416 # def __makeMultAlignments( self, lNamesTErefSeq ): | |
417 # """ | |
418 # Make multiple alignments via Map or Refalign for each TE family | |
419 # """ | |
420 # self._pL.reset("") | |
421 # if self._MSAmethod == "Map": | |
422 # | |
423 # prg = os.environ["REPET_PATH"] + "/bin/srptMAP.py" | |
424 # elif self._MSAmethod == "RefAlign": | |
425 # prg = os.environ["REPET_PATH"] + "/bin/srptRefalign.py" | |
426 # cmd = prg | |
427 # cmd += " -g %s_%s" % ( self._refSeqTable, self._MSAmethod ) | |
428 # cmd += " -q %s" % ( os.getcwd() ) | |
429 # if len(lNamesTErefSeq) == 1: | |
430 # cmd += " -S %s_all.fa.oriented" % ( lNamesTErefSeq[0] ) | |
431 # else: | |
432 # cmd += " -S '*_all.fa.oriented'" | |
433 # cmd += " -Q %s" % ( self._queue ) | |
434 # cmd += " -C %s" % ( self._configFileName ) | |
435 # if self._MSAmethod == "Refalign" and self._keepRefseq: | |
436 # cmd += " -r" | |
437 # if not self._clean : | |
438 # cmd += " -c" | |
439 # if self._tmpDir != "": | |
440 # cmd += " -d %s" % ( self._tmpDir ) | |
441 # cmd += " -v %d" % ( self._verbosity ) | |
442 # self._pL.launch( prg, cmd ) | |
443 # lFiles = glob.glob( "*_all.fa.oriented.fa_aln" ) | |
444 # for f in lFiles: | |
445 # os.rename( f, f.replace("_all.fa.oriented.fa_aln","_all.fa.oriented_%s.fa_aln" % (self._MSAmethod.lower()) ) ) | |
446 | |
447 | |
448 def filter4phylogenies( self, verbosity=0 ): | |
449 """ | |
450 Filter TE copy alignment for better phylogenies. | |
451 """ | |
452 self._log.info("Filtering MSA") | |
453 lInFiles = glob.glob( "*_all.fa.oriented_%s.fa_aln" % ( self._MSAmethod.lower() ) ) | |
454 count = 0 | |
455 for inFileName in lInFiles: | |
456 count += 1 | |
457 self._log.debug("clean MSA %d --> %s" % (count,inFileName)) | |
458 alignDB = AlignedBioseqDB() | |
459 alignDB.load(inFileName) | |
460 alignDB.cleanMSA() | |
461 if alignDB.getSize() > 2: | |
462 alignDB.save( inFileName + ".clean" ) | |
463 self._log.debug("clean!") | |
464 else: | |
465 self._log.debug("skip!") | |
466 self._log.info("MSA cleaned: %d" % count) | |
467 | |
468 | |
469 def _createLaunchPhyMLCommands(self, iLauncher, query): | |
470 # prg = os.environ["REPET_PATH"] + "/bin/srptPhyML.py" | |
471 # cmd = prg | |
472 # cmd += " -g %s_PHY_%s" % ( self._refSeqTable, os.getpid() ) | |
473 # cmd += " -q %s" % ( os.getcwd() ) | |
474 # cmd += " -S '*_all.fa.oriented_%s.fa_aln.clean'" % ( self._MSAmethod.lower() ) | |
475 # cmd += " -Q %s" % ( self._queue ) | |
476 # cmd += " -C %s" % ( self._configFileName ) | |
477 | |
478 prg = "LaunchPhyML.py" | |
479 lArgs = [] | |
480 lArgs.append("-i %s" % query) | |
481 lArgs.append("-o %s.fa_phylo" % query) | |
482 lArgs.append("-v %d" % (self._verbosity-1)) | |
483 | |
484 self._log.debug("Prepared Phyml commands : %s %s" % (prg, " ".join(lArgs))) | |
485 return iLauncher.getSystemCommand("%s" % prg, lArgs) | |
486 | |
487 def makePhylogenies( self ): | |
488 """ | |
489 Launch PhyML on each TE family. | |
490 """ | |
491 self.phyloFileSuffix = "*_all.fa.oriented_%s.fa_aln.clean" % ( self._MSAmethod.lower() ) | |
492 | |
493 queue = self._queue | |
494 cDir = os.getcwd() | |
495 tmpDir = self._tmpDir | |
496 groupid = "%s_PHY_%s" % ( self._refSeqTable, os.getpid() ) | |
497 acronym = "Phylo" | |
498 iDb = DbFactory.createInstance(configFileName=self._configFileName) | |
499 iTJA = TableJobAdaptatorFactory.createInstance(iDb, "jobs") | |
500 iLauncher = Launcher(iTJA, os.getcwd(), "", "", cDir, tmpDir, "jobs", queue, groupid) | |
501 lCmdsTuples = self._preparePhyMljobs(iLauncher, cDir) | |
502 iLauncher.runLauncherForMultipleJobs(acronym, lCmdsTuples, self._clean) | |
503 | |
504 | |
505 def start( self ): | |
506 """ | |
507 Useful commands before running the program. | |
508 """ | |
509 self.checkAttributes() | |
510 self._log.info("START GetMultAlignAndPhylogenyPerTErefSeq.py STEP %d" % self.step) | |
511 self.connectSql() | |
512 | |
513 def end( self ): | |
514 """ | |
515 Useful commands before ending the program. | |
516 """ | |
517 self._db.close() | |
518 self._log.info("END GetMultAlignAndPhylogenyPerTErefSeq.py STEP %d" % self.step) | |
519 | |
520 def run( self ): | |
521 """ | |
522 Run the program. | |
523 """ | |
524 LoggerFactory.setLevel(self._log, self._verbosity) | |
525 self.start() | |
526 lNamesTErefSeq = self.getNamesOfTErefSeq() | |
527 self._log.debug("lNamesTErefSeq: %s" % " ".join(lNamesTErefSeq)) | |
528 | |
529 if self.step in [0, 1]: | |
530 self.getTErefSeqInFastaFiles( lNamesTErefSeq ) | |
531 self.getCopiesInFastaFilesPerTErefSeq( lNamesTErefSeq ) | |
532 | |
533 if self.step in [0, 2]: | |
534 self.filter4Alignments(lNamesTErefSeq) | |
535 self.buildInFiles4Launcher(lNamesTErefSeq) | |
536 self.launchMultAlignments(lNamesTErefSeq) | |
537 | |
538 if self.step in [0, 3]: | |
539 self.filter4phylogenies(verbosity=self._verbosity) | |
540 self.makePhylogenies() | |
541 self.end() | |
542 | |
543 if __name__ == "__main__": | |
544 iGetMultAlignAndPhylogenyPerTErefSeq = GetMultAlignAndPhylogenyPerTErefSeq() | |
545 iGetMultAlignAndPhylogenyPerTErefSeq.setAttributesFromCmdLine() | |
546 iGetMultAlignAndPhylogenyPerTErefSeq.run() | |
547 |