18
|
1 #!/usr/bin/env python
|
|
2
|
|
3 """
|
|
4 This program takes a query directory as input,
|
|
5 then launches Blaster and/or Matcher on each file in it,
|
|
6 finally results are optionally gathered in a single file.
|
|
7 """
|
|
8
|
|
9 import os
|
|
10 import sys
|
|
11 import getopt
|
|
12 import logging
|
|
13 import glob
|
|
14 import ConfigParser
|
|
15
|
|
16 from pyRepet.launcher.programLauncher import programLauncher
|
|
17 from pyRepet.launcher import Launcher
|
|
18 from pyRepet.sql.RepetJobMySQL import RepetJob
|
|
19 from commons.core.coord.Align import Align
|
|
20
|
|
21
|
|
def help():
    """
    Print the command-line usage of this program on the standard output.

    The program name shown in the usage line is the last '/'-separated
    component of sys.argv[0]. The text starts and ends with an empty line.
    """
    lUsage = [
        "",
        "usage: %s [ options ]" % ( sys.argv[0].split("/")[-1] ),
        "options:",
        " -h: this help",
        " -g: name of the group identifier (same for all the jobs)",
        " -q: name of the query directory",
        " -S: suffix in the query directory (default='*.fa' for Blaster, '*.align' for Matcher)",
        " -s: absolute path to the subject databank",
        # the original line ended with a stray, unbalanced ')'
        " -Q: resources needed on the cluster",
        " -d: absolute path to the temporary directory",
        " -m: mix of Blaster and/or Matcher",
        " 1: launch Blaster only",
        " 2: launch Matcher only (on '*.align' query files)",
        " 3: launch Blaster+Matcher in the same job",
        " -B: parameters for Blaster (e.g. \"-a -n tblastx\")",
        " -M: parameters for Matcher (e.g. \"-j\")",
        " -Z: collect all the results into a single file",
        " align (after Blaster)",
        " path/tab (after Matcher)",
        " -C: configuration file from TEdenovo or TEannot pipeline",
        " -t: name of the table recording the jobs (default=jobs)",
        " -p: absolute path to project directory (if jobs management via files)",
        " -c: clean (remove job launch files and job stdout)",
        " -v: verbose (default=0/1/2)",
        "",
    ]
    # a single write produces the same bytes as one 'print' per line
    sys.stdout.write( "\n".join( lUsage ) + "\n" )
|
|
48
|
|
49
|
|
def filterRedundantMatches( inFile, outFile ):
    """
    Filter the redundant matches of an ~ all-by-all pairwise alignment.

    When a pairwise alignment is launched ~ all-by-all (ie one batch against
    all chunks), each match is reported twice. For instance we keep
    'chunk3-1-100-chunk7-11-110-...' and we discard
    'chunk7-11-110-chunk3-1-100-...'.
    Also we keep 'chunk5-1-100-chunk5-11-110-...' and we discard
    'chunk5-11-110-chunk5-1-100-...'.
    For this of course the results need to be sorted by query, on plus strand,
    and in ascending coordinates (always the case with Blaster).

    @param inFile: name of the input file, one match per line (parsed with
                   Align.setFromString)
    @param outFile: name of the output file (overwritten), receiving the
                    matches that are kept

    Exits with status 1 if a query or subject name does not contain 'chunk'.
    Raises ValueError if the text following 'chunk' is not an integer.
    """
    tick = 100000   # flush the output every 'tick' input lines
    iAlign = Align()
    inFileHandler = open( inFile, "r" )
    try:
        outFileHandler = open( outFile, "w" )
        try:
            countMatches = 0
            for line in inFileHandler:
                countMatches += 1
                iAlign.setFromString( line )
                queryName = iAlign.range_query.seqname
                subjectName = iAlign.range_subject.seqname
                if "chunk" not in queryName or "chunk" not in subjectName:
                    print( "ERROR: 'chunk' not in seqname" )
                    sys.exit(1)
                # parse each chunk number only once per match
                queryChunk = int( queryName.split("chunk")[1] )
                subjectChunk = int( subjectName.split("chunk")[1] )
                if queryChunk < subjectChunk:
                    iAlign.write( outFileHandler )
                elif queryChunk == subjectChunk \
                        and iAlign.range_query.getMin() < iAlign.range_subject.getMin():
                    # same chunk: keep only the copy whose query starts first
                    iAlign.write( outFileHandler )
                if countMatches % tick == 0:   # need to free buffer frequently as file can be big
                    outFileHandler.flush()
                    os.fsync( outFileHandler.fileno() )
        finally:
            # close the handles even when leaving through sys.exit() or an exception
            outFileHandler.close()
    finally:
        inFileHandler.close()
|
|
85
|
|
86
|
|
def runCollect( groupid, collect, allByAll ):
    """
    Gather the results of each job in a single file and adapt path ID if necessary.

    All the files '*.<collect>' of the current directory are concatenated
    into '<groupid>.<collect>_tmp', from which '<groupid>.<collect>' is made:
    - 'align': the temporary file is renamed, or filtered with
      filterRedundantMatches() when 'allByAll' is true;
    - 'path' / 'tab': the program 'pathnum2id' / 'tabnum2id' (or its '.py'
      version if the former is absent from $REPET_PATH/bin) is launched.

    Relies on the module-level names 'verbose', 'clean' and 'pL', which are
    set by main().

    @param groupid: group identifier, used as prefix of the output file
    @param collect: suffix of the files to gather ('align', 'path' or 'tab')
    @param allByAll: if true, 'align' results are filtered for redundancy
    """
    if verbose > 0:
        print( "concatenate the results of each job" )
        sys.stdout.flush()

    resultFile = "%s.%s" % ( groupid, collect )
    tmpFile = "%s_tmp" % ( resultFile )

    # retrieve the list of the files; a result file left by a previous run
    # matches the same pattern and must not be folded into the new result
    lFiles = [ f for f in glob.glob( "*.%s" % ( collect ) ) if f != resultFile ]
    lFiles.sort()

    # concatenate all the individual files
    if os.path.exists( tmpFile ):
        os.remove( tmpFile )
    for resFile in lFiles:
        prg = "cat"
        cmd = "%s %s >> %s" % ( prg, resFile, tmpFile )
        pL.launch( prg, cmd )
        if clean:
            prg = "rm"
            cmd = "%s -f %s" % ( prg, resFile )
            pL.launch( prg, cmd )

    if os.path.exists( resultFile ):
        os.remove( resultFile )

    if collect == "align":
        if not allByAll:
            os.system( "mv %s %s" % ( tmpFile, resultFile ) )
        else:
            filterRedundantMatches( tmpFile, resultFile )

    # adapt the path IDs if necessary
    elif collect in ( "path", "tab" ):
        prg = os.environ["REPET_PATH"] + "/bin/" + "%snum2id" % ( collect )
        if os.path.exists( prg ):
            cmd = "%s -i %s -o %s -v %i" % ( prg, tmpFile, resultFile, verbose - 1 )
        else:
            # fall back on the '.py' program, which is launched without '-v'
            prg += ".py"
            cmd = "%s -i %s -o %s" % ( prg, tmpFile, resultFile )
        pL.launch( prg, cmd )

    # the temporary file no longer exists when it has been renamed above
    if clean and os.path.exists( tmpFile ):
        os.remove( tmpFile )
|
|
157
|
|
158
|
|
def _jobRemoveIfExists( fileName ):
    """
    Return the job-script lines removing 'fileName' if it exists.
    """
    return "if os.path.exists( \"%s\" ):\n\tos.remove( \"%s\" )\n" % ( fileName, fileName )


def _jobMoveIfAbsent( fileName, destDir ):
    """
    Return the job-script lines moving 'fileName' into 'destDir' unless a file of that name is already there.
    """
    return "if not os.path.exists( \"%s/%s\" ):\n\tos.system( \"mv %s %s\" )\n" \
        % ( destDir, fileName, fileName, destDir )


def _buildBlasterMatcherJob( launcher, inFaName, subjectBank, paramBlaster, paramMatcher,
                             collect, currentDir, tmpDir, cleanAlign ):
    """
    Return the Python source of the job running Blaster then Matcher on one query file.

    @param launcher: launcher whose cmd_test() provides the lines reporting a job error
    @param inFaName: path to the query file (copied into the job directory if absent)
    @param subjectBank: path to the subject databank
    @param paramBlaster: extra parameters for Blaster (can be empty)
    @param paramMatcher: extra parameters for Matcher (can be empty)
    @param collect: 'path' or 'tab' to keep that Matcher output and delete the other one
    @param currentDir: directory in which the kept results are moved
    @param tmpDir: directory of the jobs; the local copy of the query is removed when it differs from 'currentDir'
    @param cleanAlign: if true the '.align' file is removed, otherwise it is moved into 'currentDir'
    """
    prefix = os.path.basename( inFaName )

    launchB = "blaster -q %s -s %s -B %s" % ( prefix, subjectBank, prefix )
    if paramBlaster != "":
        launchB += " %s" % ( paramBlaster )

    launchM = os.environ["REPET_PATH"] + "/bin/matcher"
    launchM += " -m %s.align -q %s -s %s" % ( prefix, prefix, subjectBank )
    if paramMatcher != "":
        launchM += " %s" % ( paramMatcher )

    # infix of the Matcher output file names, selected by the presence of '-a'
    if "-a" in paramMatcher:
        s = "match"
    else:
        s = "clean_match"
    matcherOut = "%s.align.%s" % ( prefix, s )

    # clean-up after Blaster
    cleanB = _jobMoveIfAbsent( prefix + ".param", currentDir )
    cleanB += _jobRemoveIfExists( prefix + ".Nstretch.map" )
    cleanB += "if os.path.exists( \"" + prefix + "_cut\" ):\n"
    cleanB += "\tos.system( \"rm -f " + prefix + "_cut*\" )\n"
    cleanB += _jobRemoveIfExists( prefix + ".raw" )
    cleanB += _jobRemoveIfExists( prefix + ".seq_treated" )

    # clean-up after Matcher
    cleanM = ""
    if collect == "path":
        cleanM += _jobMoveIfAbsent( matcherOut + ".path", currentDir )
        cleanM += _jobRemoveIfExists( matcherOut + ".tab" )
    elif collect == "tab":
        cleanM += _jobMoveIfAbsent( matcherOut + ".tab", currentDir )
        cleanM += _jobRemoveIfExists( matcherOut + ".path" )
    # NOTE(review): the steps below are emitted for every job, whatever the
    # '-Z' value -- confirm this nesting against the reference version
    cleanM += _jobMoveIfAbsent( matcherOut + ".param", currentDir )
    if tmpDir != currentDir:
        cleanM += _jobRemoveIfExists( prefix )
    if cleanAlign:
        cleanM += _jobRemoveIfExists( prefix + ".align" )
    else:
        cleanM += _jobMoveIfAbsent( prefix + ".align", currentDir )
    cleanM += _jobRemoveIfExists( matcherOut + ".fa" )
    cleanM += _jobRemoveIfExists( matcherOut + ".map" )

    # assemble the job: copy the query, run Blaster, clean, run Matcher, clean
    cmd_start = "if not os.path.exists( \"" + prefix + "\" ):\n"
    cmd_start += "\tos.system( \"cp " + inFaName + " .\" )\n"
    cmd_start += "print \"" + launchB + "\"; sys.stdout.flush()\n"
    cmd_start += "log = os.system( \"" + launchB + "\" )\n"
    cmd_start += "if log != 0:\n"
    cmd_start += launcher.cmd_test( launcher.job, "error", loop=1 )
    cmd_start += "\tsys.exit(1)\n"
    cmd_start += cleanB
    cmd_start += "print \"" + launchM + "\"; sys.stdout.flush()\n"
    # NOTE(review): unlike for Blaster, the exit status of Matcher ('log')
    # is not tested in the job -- confirm whether this is intended
    cmd_start += "log = os.system( \"" + launchM + "\" )\n"
    cmd_start += cleanM
    return cmd_start


def main():
    """
    This program takes a query directory as input,
    then launches Blaster and/or Matcher on each file in it,
    finally results are optionally gathered in a single file.

    Options are read from sys.argv (see help()). The module-level names
    'clean', 'verbose' and 'pL' are set here because runCollect() reads them.
    The options '-m' and '-Z' and the environment variable REPET_JOBS are
    checked before anything is created or launched.

    Exits with status 1 on any usage or environment error, returns 0 otherwise.
    """
    global clean, verbose, pL

    groupid = ""
    queryDir = ""
    patternSuffix = ""
    subjectBank = ""
    queue = ""
    tmpDir = ""
    mix = ""
    paramBlaster = ""
    paramMatcher = ""
    collect = ""
    configFileName = ""
    jobTable = "jobs"
    projectDir = ""
    clean = False
    verbose = 0

    try:
        opts = getopt.getopt( sys.argv[1:], "hg:q:S:s:Q:d:m:B:M:Z:C:t:p:cv:" )[0]
    except getopt.GetoptError:
        # sys.exc_info() keeps this handler valid on every Python version
        err = sys.exc_info()[1]
        print( str(err) )
        help()
        sys.exit(1)
    for o, a in opts:
        if o == "-h":
            help()
            sys.exit(0)
        elif o == "-g":
            groupid = a
        elif o == "-q":
            queryDir = a
            # endswith() is safe on an empty argument, unlike queryDir[-1]
            if queryDir.endswith( "/" ):
                queryDir = queryDir[:-1]
        elif o == "-S":
            patternSuffix = a
        elif o == "-s":
            subjectBank = a
        elif o == "-Q":
            queue = a
        elif o == "-d":
            tmpDir = a
            if tmpDir.endswith( "/" ):
                tmpDir = tmpDir[:-1]
        elif o == "-m":
            mix = a
        elif o == "-B":
            paramBlaster = a
        elif o == "-M":
            paramMatcher = a
        elif o == "-C":
            configFileName = a
        elif o == "-Z":
            collect = a
        elif o == "-t":
            jobTable = a
        elif o == "-p":
            projectDir = a
        elif o == "-c":
            clean = True
        elif o == "-v":
            try:
                verbose = int(a)
            except ValueError:
                print( "ERROR: verbosity level (-v) should be an integer, got '%s'" % ( a ) )
                help()
                sys.exit(1)

    if groupid == "":
        print( "ERROR: missing group identifier (-g)" )
        help()
        sys.exit(1)

    if queryDir == "":
        print( "ERROR: missing query directory (-q)" )
        help()
        sys.exit(1)

    # check '-m' and '-Z' now, before a log file is created, the jobs
    # database is opened or any program is launched
    if mix not in ( "1", "2", "3" ):
        print( "ERROR: option '-m %s' not recognized" % ( mix ) )
        sys.exit(1)

    if collect not in ( "", "align", "path", "tab" ):
        print( "ERROR: collect '%s' not implemented" % ( collect ) )
        sys.exit(1)

    if mix in ( "1", "3" ) and subjectBank == "":
        print( "ERROR: missing subject bank for Blaster" )
        sys.exit(1)

    # an unset REPET_JOBS is reported like any other unsupported value
    jobsManager = os.environ.get( "REPET_JOBS", "" )
    if jobsManager not in ( "files", "MySQL" ):
        print( "ERROR: REPET_JOBS is '%s'" % ( jobsManager ) )
        sys.exit(1)

    if jobsManager == "files" and projectDir == "":
        print( "ERROR: missing compulsory options for jobs management via files" )
        help()
        sys.exit(1)

    if not os.path.exists( queryDir ):
        print( "ERROR: can't find query directory '%s'" % ( queryDir ) )
        sys.exit(1)

    if verbose > 0:
        print( "START %s" % ( sys.argv[0].split("/")[-1] ) )
        sys.stdout.flush()

    # log into '<groupid>_pid<pid>.log' in the current directory
    logFileName = "%s_pid%s.log" % ( groupid, os.getpid() )
    handler = logging.FileHandler( logFileName )
    formatter = logging.Formatter( "%(asctime)s %(levelname)s: %(message)s" )
    handler.setFormatter( formatter )
    logging.getLogger('').addHandler( handler )
    logging.getLogger('').setLevel( logging.DEBUG )
    logging.info( "started" )

    # connection parameters: from the configuration file if given, else from the environment
    if configFileName != "":
        if not os.path.exists( configFileName ):
            print( "ERROR: configuration file '%s' doesn't exist" % ( configFileName ) )
            sys.exit(1)
        config = ConfigParser.ConfigParser()
        configFile = open( configFileName )
        try:
            config.readfp( configFile )
        finally:
            configFile.close()
        host = config.get( "repet_env", "repet_host" )
        user = config.get( "repet_env", "repet_user" )
        passwd = config.get( "repet_env", "repet_pw" )
        dbname = config.get( "repet_env", "repet_db" )
    else:
        host = os.environ["REPET_HOST"]
        user = os.environ["REPET_USER"]
        passwd = os.environ["REPET_PW"]
        dbname = os.environ["REPET_DB"]

    if jobsManager == "files":
        jobdb = RepetJob( dbname = projectDir + "/" + os.environ["REPET_DB"] )
    else:
        jobdb = RepetJob( user, host, passwd, dbname )

    currentDir = os.getcwd()
    if tmpDir == "":
        tmpDir = currentDir

    pL = programLauncher()

    allByAll = "-a" in paramBlaster

    # if Blaster will be launched, prepare the subject data if necessary
    if mix != "2" and not os.path.exists( "%s_cut" % ( subjectBank ) ):
        if verbose > 0:
            print( "prepare subject bank '%s'..." % ( subjectBank.split("/")[-1] ) )
            sys.stdout.flush()
        prg = "blaster"
        cmd = "%s -q %s %s -P" % ( prg, subjectBank, paramBlaster )
        pL.launch( prg, cmd )

    # launch Blaster only (on '*.fa' in queryDir)
    if mix == "1":
        launcher = Launcher.BlasterLauncher( jobdb, queryDir, subjectBank, paramBlaster, currentDir, tmpDir, jobTable, queue, groupid, "Blaster" )
        if patternSuffix == "":
            patternSuffix = "*.fa"
        launcher.run( patternSuffix, verbose )
        if clean:
            launcher.clean()

    # launch Matcher only (on '*.align' in queryDir; don't use '-q' or '-s', only '-m')
    elif mix == "2":
        launcher = Launcher.MatcherLauncher( jobdb, queryDir, subjectBank, paramMatcher, currentDir, tmpDir, jobTable, queue, groupid, "Matcher" )
        if patternSuffix == "":
            patternSuffix = "*.align"
        launcher.run( patternSuffix, verbose )
        if clean:
            launcher.clean()

    # launch Blaster+Matcher (on '*.fa' in queryDir); 'mix' can only be "3" here
    else:
        launcher = Launcher.Launcher( jobdb, queryDir, subjectBank, paramBlaster, currentDir, tmpDir, jobTable, queue, groupid, "BlasterMatcher" )
        launcher.beginRun()
        if patternSuffix == "":
            patternSuffix = "*.fa"
        lFaFiles = glob.glob( queryDir + "/" + patternSuffix )
        if len(lFaFiles) == 0:
            print( "ERROR: query directory '%s' is empty of suffix '%s'" % ( queryDir, patternSuffix ) )
            sys.exit(1)
        count = 0
        for inFaName in lFaFiles:
            count += 1
            # one job per query file, each with its own numbered name
            launcher.acronyme = "BlasterMatcher_%i" % ( count )
            launcher.job.jobname = launcher.acronyme
            cmd_start = _buildBlasterMatcherJob( launcher, inFaName, subjectBank, paramBlaster, paramMatcher,
                                                 collect, currentDir, tmpDir, clean )
            launcher.runSingleJob( cmd_start )
        launcher.acronyme = "BlasterMatcher"
        launcher._nbJobs = count
        launcher.endRun()
        if clean:
            launcher.clean( "BlasterMatcher_*" )

    if collect != "":
        runCollect( groupid, collect, allByAll )

    logging.info( "finished" )

    if verbose > 0:
        print( "END %s" % ( sys.argv[0].split("/")[-1] ) )
        sys.stdout.flush()

    return 0
|
|
445
|
|
446
|
|
if __name__ == "__main__":
    # use the status returned by main() as the exit code of the process
    sys.exit( main() )
|