Mercurial > repos > yufei-luo > s_mart
comparison commons/tools/srptBlasterMatcher.py @ 18:94ab73e8a190
Uploaded
| author | m-zytnicki |
|---|---|
| date | Mon, 29 Apr 2013 03:20:15 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 17:b0e8584489e6 | 18:94ab73e8a190 |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 | |
| 3 """ | |
| 4 This program takes a query directory as input, | |
| 5 then launches Blaster and/or Matcher on each file in it, | |
| 6 finally results are optionally gathered in a single file. | |
| 7 """ | |
| 8 | |
| 9 import os | |
| 10 import sys | |
| 11 import getopt | |
| 12 import logging | |
| 13 import glob | |
| 14 import ConfigParser | |
| 15 | |
| 16 from pyRepet.launcher.programLauncher import programLauncher | |
| 17 from pyRepet.launcher import Launcher | |
| 18 from pyRepet.sql.RepetJobMySQL import RepetJob | |
| 19 from commons.core.coord.Align import Align | |
| 20 | |
| 21 | |
| 22 def help(): | |
| 23 print | |
| 24 print "usage: %s [ options ]" % ( sys.argv[0].split("/")[-1] ) | |
| 25 print "options:" | |
| 26 print " -h: this help" | |
| 27 print " -g: name of the group identifier (same for all the jobs)" | |
| 28 print " -q: name of the query directory" | |
| 29 print " -S: suffix in the query directory (default='*.fa' for Blaster, '*.align' for Matcher)" | |
| 30 print " -s: absolute path to the subject databank" | |
| 31 print " -Q: resources needed on the cluster)" | |
| 32 print " -d: absolute path to the temporary directory" | |
| 33 print " -m: mix of Blaster and/or Matcher" | |
| 34 print " 1: launch Blaster only" | |
| 35 print " 2: launch Matcher only (on '*.align' query files)" | |
| 36 print " 3: launch Blaster+Matcher in the same job" | |
| 37 print " -B: parameters for Blaster (e.g. \"-a -n tblastx\")" | |
| 38 print " -M: parameters for Matcher (e.g. \"-j\")" | |
| 39 print " -Z: collect all the results into a single file" | |
| 40 print " align (after Blaster)" | |
| 41 print " path/tab (after Matcher)" | |
| 42 print " -C: configuration file from TEdenovo or TEannot pipeline" | |
| 43 print " -t: name of the table recording the jobs (default=jobs)" | |
| 44 print " -p: absolute path to project directory (if jobs management via files)" | |
| 45 print " -c: clean (remove job launch files and job stdout)" | |
| 46 print " -v: verbose (default=0/1/2)" | |
| 47 print | |
| 48 | |
| 49 | |
| 50 def filterRedundantMatches( inFile, outFile ): | |
| 51 """ | |
| 52 When a pairwise alignment is launched ~ all-by-all (ie one batch against all chunks), | |
| 53 one filters the redundant matches. For instance we keep 'chunk3-1-100-chunk7-11-110-...' | |
| 54 and we discards 'chunk7-11-110-chunk3-1-100-...'. | |
| 55 Also we keep 'chunk5-1-100-chunk5-11-110-...' and we discards | |
| 56 'chunk5-11-110-chunk5-1-100-...'. | |
| 57 For this of course the results need to be sorted by query, on plus strand, | |
| 58 and in ascending coordinates (always the case with Blaster). | |
| 59 """ | |
| 60 inFileHandler = open( inFile, "r" ) | |
| 61 outFileHandler = open( outFile, "w" ) | |
| 62 iAlign = Align() | |
| 63 countMatches = 0 | |
| 64 tick = 100000 | |
| 65 while True: | |
| 66 line = inFileHandler.readline() | |
| 67 if line == "": | |
| 68 break | |
| 69 countMatches += 1 | |
| 70 iAlign.setFromString( line ) | |
| 71 if "chunk" not in iAlign.range_query.seqname \ | |
| 72 or "chunk" not in iAlign.range_subject.seqname: | |
| 73 print "ERROR: 'chunk' not in seqname" | |
| 74 sys.exit(1) | |
| 75 if int(iAlign.range_query.seqname.split("chunk")[1]) < int(iAlign.range_subject.seqname.split("chunk")[1]): | |
| 76 iAlign.write( outFileHandler ) | |
| 77 elif int(iAlign.range_query.seqname.split("chunk")[1]) == int(iAlign.range_subject.seqname.split("chunk")[1]): | |
| 78 if iAlign.range_query.getMin() < iAlign.range_subject.getMin(): | |
| 79 iAlign.write( outFileHandler ) | |
| 80 if countMatches % tick == 0: # need to free buffer frequently as file can be big | |
| 81 outFileHandler.flush() | |
| 82 os.fsync( outFileHandler.fileno() ) | |
| 83 inFileHandler.close() | |
| 84 outFileHandler.close() | |
| 85 | |
| 86 | |
| 87 def runCollect( groupid, collect, allByAll ): | |
| 88 """ | |
| 89 Gather the results of each job in a single job and adapt path ID if necessary. | |
| 90 """ | |
| 91 if verbose > 0: | |
| 92 print "concatenate the results of each job"; sys.stdout.flush() | |
| 93 | |
| 94 # retrieve the list of the files | |
| 95 lFiles = glob.glob( "*.%s" % ( collect ) ) | |
| 96 lFiles.sort() | |
| 97 | |
| 98 # concatenate all the individual files | |
| 99 if os.path.exists( "%s.%s_tmp" % ( groupid, collect ) ): | |
| 100 os.remove( "%s.%s_tmp" % ( groupid, collect ) ) | |
| 101 for resFile in lFiles: | |
| 102 prg = "cat" | |
| 103 cmd = prg | |
| 104 cmd += " %s >> %s.%s_tmp" % ( resFile, groupid, collect ) | |
| 105 pL.launch( prg, cmd ) | |
| 106 if clean == True: | |
| 107 prg = "rm" | |
| 108 cmd = prg | |
| 109 cmd += " -f %s" % ( resFile ) | |
| 110 pL.launch( prg, cmd ) | |
| 111 | |
| 112 if os.path.exists( "%s.%s" % ( groupid, collect ) ): | |
| 113 os.remove( "%s.%s" % ( groupid, collect ) ) | |
| 114 | |
| 115 if collect == "align": | |
| 116 if not allByAll: | |
| 117 os.system( "mv %s.%s_tmp %s.%s" % ( groupid, collect, groupid, collect ) ) | |
| 118 else: | |
| 119 filterRedundantMatches( "%s.%s_tmp" % ( groupid, collect ), | |
| 120 "%s.%s" % ( groupid, collect ) ) | |
| 121 | |
| 122 # adapt the path IDs if necessary | |
| 123 elif collect == "path": | |
| 124 prg = os.environ["REPET_PATH"] + "/bin/" | |
| 125 if os.path.exists( prg + "pathnum2id" ): | |
| 126 prg += "pathnum2id" | |
| 127 cmd = prg | |
| 128 cmd += " -i %s.path_tmp" % ( groupid ) | |
| 129 cmd += " -o %s.path" % ( groupid ) | |
| 130 cmd += " -v %i" % ( verbose - 1 ) | |
| 131 pL.launch( prg, cmd ) | |
| 132 else: | |
| 133 prg += "pathnum2id.py" | |
| 134 cmd = prg | |
| 135 cmd += " -i %s.path_tmp" % ( groupid ) | |
| 136 cmd += " -o %s.path" % ( groupid ) | |
| 137 pL.launch( prg, cmd ) | |
| 138 | |
| 139 elif collect == "tab": | |
| 140 prg = os.environ["REPET_PATH"] + "/bin/" | |
| 141 if os.path.exists( prg + "tabnum2id" ): | |
| 142 prg += "tabnum2id" | |
| 143 cmd = prg | |
| 144 cmd += " -i %s.tab_tmp" % ( groupid ) | |
| 145 cmd += " -o %s.tab" % ( groupid ) | |
| 146 cmd += " -v %i" % ( verbose - 1 ) | |
| 147 pL.launch( prg, cmd ) | |
| 148 else: | |
| 149 prg += "tabnum2id.py" | |
| 150 cmd = prg | |
| 151 cmd += " -i %s.tab_tmp" % ( groupid ) | |
| 152 cmd += " -o %s.tab" % ( groupid ) | |
| 153 pL.launch( prg, cmd ) | |
| 154 | |
| 155 if clean == True: | |
| 156 os.remove( "%s.%s_tmp" % ( groupid, collect ) ) | |
| 157 | |
| 158 | |
| 159 def main(): | |
| 160 """ | |
| 161 This program takes a query directory as input, | |
| 162 then launches Blaster and/or Matcher on each file in it, | |
| 163 finally results are optionally gathered in a single file. | |
| 164 """ | |
| 165 | |
| 166 groupid = "" | |
| 167 queryDir = "" | |
| 168 patternSuffix = "" | |
| 169 subjectBank = "" | |
| 170 queue = "" | |
| 171 tmpDir = "" | |
| 172 mix = "" | |
| 173 paramBlaster = "" | |
| 174 paramMatcher = "" | |
| 175 collect = "" | |
| 176 configFileName = "" | |
| 177 jobTable = "jobs" | |
| 178 projectDir = "" | |
| 179 global clean | |
| 180 clean = False | |
| 181 global verbose | |
| 182 verbose = 0 | |
| 183 allByAll = False | |
| 184 | |
| 185 try: | |
| 186 opts, args = getopt.getopt(sys.argv[1:],"hg:q:S:s:Q:d:m:B:M:Z:C:t:p:cv:") | |
| 187 except getopt.GetoptError, err: | |
| 188 print str(err) | |
| 189 help() | |
| 190 sys.exit(1) | |
| 191 for o,a in opts: | |
| 192 if o == "-h": | |
| 193 help() | |
| 194 sys.exit(0) | |
| 195 elif o == "-g": | |
| 196 groupid = a | |
| 197 elif o == "-q": | |
| 198 queryDir = a | |
| 199 if queryDir[-1] == "/": | |
| 200 queryDir = queryDir[:-1] | |
| 201 elif o == "-S": | |
| 202 patternSuffix = a | |
| 203 elif o == "-s": | |
| 204 subjectBank = a | |
| 205 elif o == "-Q": | |
| 206 queue = a | |
| 207 elif o == "-d": | |
| 208 tmpDir = a | |
| 209 if tmpDir[-1] == "/": | |
| 210 tmpDir = tmpDir[:-1] | |
| 211 elif o == "-m": | |
| 212 mix = a | |
| 213 elif o == "-B": | |
| 214 paramBlaster = a | |
| 215 elif o == "-M": | |
| 216 paramMatcher = a | |
| 217 elif o == "-C": | |
| 218 configFileName = a | |
| 219 elif o == "-Z": | |
| 220 collect = a | |
| 221 elif o == "-t": | |
| 222 jobTable = a | |
| 223 elif o == "-p": | |
| 224 projectDir = a | |
| 225 elif o == "-c": | |
| 226 clean = True | |
| 227 elif o == "-v": | |
| 228 verbose = int(a) | |
| 229 | |
| 230 if groupid == "": | |
| 231 print "ERROR: missing group identifier (-g)" | |
| 232 help() | |
| 233 sys.exit(1) | |
| 234 | |
| 235 if queryDir == "": | |
| 236 print "ERROR: missing query directory (-q)" | |
| 237 help() | |
| 238 sys.exit(1) | |
| 239 | |
| 240 if mix in [ "1", "3" ] and subjectBank == "": | |
| 241 print "ERROR: missing subject bank for Blaster" | |
| 242 sys.exit(1) | |
| 243 | |
| 244 if os.environ["REPET_JOBS"] == "files" and projectDir == "": | |
| 245 print "ERROR: missing compulsory options for jobs management via files" | |
| 246 help() | |
| 247 sys.exit(1) | |
| 248 | |
| 249 if not os.path.exists( queryDir ): | |
| 250 print "ERROR: can't find query directory '%s'" % ( queryDir ) | |
| 251 sys.exit(1) | |
| 252 | |
| 253 if verbose > 0: | |
| 254 print "START %s" % ( sys.argv[0].split("/")[-1] ) | |
| 255 sys.stdout.flush() | |
| 256 | |
| 257 | |
| 258 logFileName = "%s_pid%s.log" % ( groupid, os.getpid() ) | |
| 259 handler = logging.FileHandler( logFileName ) | |
| 260 formatter = logging.Formatter( "%(asctime)s %(levelname)s: %(message)s" ) | |
| 261 handler.setFormatter( formatter ) | |
| 262 logging.getLogger('').addHandler( handler ) | |
| 263 logging.getLogger('').setLevel( logging.DEBUG ) | |
| 264 logging.info( "started" ) | |
| 265 | |
| 266 | |
| 267 if configFileName != "": | |
| 268 if not os.path.exists( configFileName ): | |
| 269 print "ERROR: configuration file '%s' doesn't exist" % ( configFileName ) | |
| 270 sys.exit(1) | |
| 271 config = ConfigParser.ConfigParser() | |
| 272 config.readfp( open(configFileName) ) | |
| 273 host = config.get("repet_env","repet_host") | |
| 274 user = config.get("repet_env","repet_user") | |
| 275 passwd = config.get("repet_env","repet_pw") | |
| 276 dbname = config.get("repet_env","repet_db") | |
| 277 else: | |
| 278 host = os.environ["REPET_HOST"] | |
| 279 user = os.environ["REPET_USER"] | |
| 280 passwd = os.environ["REPET_PW"] | |
| 281 dbname = os.environ["REPET_DB"] | |
| 282 | |
| 283 if os.environ["REPET_JOBS"] == "files": | |
| 284 jobdb = RepetJob( dbname = projectDir + "/" + os.environ["REPET_DB"] ) | |
| 285 elif os.environ["REPET_JOBS"] == "MySQL": | |
| 286 jobdb = RepetJob( user, host, passwd, dbname ) | |
| 287 else: | |
| 288 print "ERROR: REPET_JOBS is '%s'" % ( os.environ["REPET_JOBS"] ) | |
| 289 sys.exit(1) | |
| 290 | |
| 291 | |
| 292 currentDir = os.getcwd() | |
| 293 if tmpDir == "": | |
| 294 tmpDir = currentDir | |
| 295 | |
| 296 global pL | |
| 297 pL = programLauncher() | |
| 298 | |
| 299 if "-a" in paramBlaster: | |
| 300 allByAll = True | |
| 301 | |
| 302 | |
| 303 # if Blaster will be launched, prepare the subject data if necessary | |
| 304 if mix != "2" and not os.path.exists( "%s_cut" % ( subjectBank ) ): | |
| 305 if verbose > 0: | |
| 306 print "prepare subject bank '%s'..." % ( subjectBank.split("/")[-1] ) | |
| 307 sys.stdout.flush() | |
| 308 prg = "blaster" | |
| 309 cmd = prg | |
| 310 cmd += " -q %s" % ( subjectBank ) | |
| 311 cmd += " %s" % ( paramBlaster ) | |
| 312 cmd += " -P" | |
| 313 pL.launch( prg, cmd ) | |
| 314 | |
| 315 | |
| 316 # launch Blaster only (on '*.fa' in queryDir) | |
| 317 if mix == "1": | |
| 318 launcher = Launcher.BlasterLauncher( jobdb, queryDir, subjectBank, paramBlaster, currentDir, tmpDir, jobTable, queue, groupid, "Blaster" ) | |
| 319 if patternSuffix == "": | |
| 320 patternSuffix = "*.fa" | |
| 321 launcher.run( patternSuffix, verbose ) | |
| 322 if clean == True: | |
| 323 launcher.clean() | |
| 324 | |
| 325 # launch Matcher only (on '*.align' in queryDir; don't use '-q' or '-s', only '-m') | |
| 326 elif mix == "2": | |
| 327 launcher = Launcher.MatcherLauncher( jobdb, queryDir, subjectBank, paramMatcher, currentDir, tmpDir, jobTable, queue, groupid, "Matcher" ) | |
| 328 if patternSuffix == "": | |
| 329 patternSuffix = "*.align" | |
| 330 launcher.run( patternSuffix, verbose ) | |
| 331 if clean == True: | |
| 332 launcher.clean() | |
| 333 | |
| 334 # launch Blaster+Matcher (on '*.fa' in queryDir) | |
| 335 elif mix == "3": | |
| 336 launcher = Launcher.Launcher( jobdb, queryDir, subjectBank, paramBlaster, currentDir, tmpDir, jobTable, queue, groupid, "BlasterMatcher" ) | |
| 337 launcher.beginRun() | |
| 338 if patternSuffix == "": | |
| 339 patternSuffix = "*.fa" | |
| 340 lFaFiles = glob.glob( queryDir + "/" + patternSuffix ) | |
| 341 if len(lFaFiles) == 0: | |
| 342 print "ERROR: query directory '%s' is empty of suffix '%s'" % ( queryDir, patternSuffix ) | |
| 343 sys.exit(1) | |
| 344 count = 0 | |
| 345 for inFaName in lFaFiles: | |
| 346 count += 1 | |
| 347 launcher.acronyme = "BlasterMatcher_%i" % ( count ) | |
| 348 launcher.job.jobname = launcher.acronyme | |
| 349 prefix = os.path.basename( inFaName ) | |
| 350 cmd_start = "" | |
| 351 cmd_start += "if not os.path.exists( \"" + prefix + "\" ):\n" | |
| 352 cmd_start += "\tos.system( \"cp " + inFaName + " .\" )\n" | |
| 353 launchB = "" | |
| 354 launchB += "blaster" | |
| 355 launchB += " -q %s" % ( prefix ) | |
| 356 launchB += " -s %s" % ( subjectBank ) | |
| 357 launchB += " -B %s" % ( prefix ) | |
| 358 if paramBlaster != "": | |
| 359 launchB += " %s" % ( paramBlaster ) | |
| 360 cleanB = "" | |
| 361 cleanB += "if not os.path.exists( \"%s/%s.param\" ):\n" % ( currentDir, prefix ) | |
| 362 cleanB += "\tos.system( \"mv %s.param %s\" )\n" % ( prefix, currentDir ) | |
| 363 cleanB += "if os.path.exists( \"" + prefix + ".Nstretch.map\" ):\n" | |
| 364 cleanB += "\tos.remove( \"" + prefix + ".Nstretch.map\" )\n" | |
| 365 cleanB += "if os.path.exists( \"" + prefix + "_cut\" ):\n" | |
| 366 cleanB += "\tos.system( \"rm -f " + prefix + "_cut*\" )\n" | |
| 367 cleanB += "if os.path.exists( \"" + prefix + ".raw\" ):\n" | |
| 368 cleanB += "\tos.remove( \"" + prefix + ".raw\" )\n" | |
| 369 cleanB += "if os.path.exists( \"" + prefix + ".seq_treated\" ):\n" | |
| 370 cleanB += "\tos.remove( \"" + prefix + ".seq_treated\" )\n" | |
| 371 launchM = "" | |
| 372 launchM += os.environ["REPET_PATH"] + "/bin/matcher" | |
| 373 launchM += " -m %s.align" % ( prefix ) | |
| 374 launchM += " -q %s" % ( prefix ) | |
| 375 launchM += " -s %s" % ( subjectBank ) | |
| 376 if paramMatcher != "": | |
| 377 launchM += " %s" % ( paramMatcher ) | |
| 378 cleanM = "" | |
| 379 s = "" | |
| 380 if "-a" in paramMatcher: | |
| 381 s = "match" | |
| 382 else: | |
| 383 s = "clean_match" | |
| 384 if collect == "path": | |
| 385 cleanM += "if not os.path.exists( \"%s/%s.align.%s.path\" ):\n" % ( currentDir, prefix, s ) | |
| 386 cleanM += "\tos.system( \"mv %s.align.%s.path %s\" )\n" % ( prefix, s, currentDir ) | |
| 387 cleanM += "if os.path.exists( \"" + prefix + ".align."+s+".tab\" ):\n" | |
| 388 cleanM += "\tos.remove( \"" + prefix + ".align."+s+".tab\" )\n" | |
| 389 elif collect == "tab": | |
| 390 cleanM += "if not os.path.exists( \"%s/%s.align.%s.tab\" ):\n" % ( currentDir, prefix, s ) | |
| 391 cleanM += "\tos.system( \"mv %s.align.%s.tab %s\" )\n" % ( prefix, s, currentDir ) | |
| 392 cleanM += "if os.path.exists( \"" + prefix + ".align."+s+".path\" ):\n" | |
| 393 cleanM += "\tos.remove( \"" + prefix + ".align."+s+".path\" )\n" | |
| 394 cleanM += "if not os.path.exists( \"%s/%s.align.%s.param\" ):\n" % ( currentDir, prefix, s ) | |
| 395 cleanM += "\tos.system( \"mv %s.align.%s.param %s\" )\n" % ( prefix, s, currentDir ) | |
| 396 if tmpDir != currentDir: | |
| 397 cleanM += "if os.path.exists( \"%s\" ):\n" % ( prefix ) | |
| 398 cleanM += "\tos.remove( \"%s\" )\n" % ( prefix ) | |
| 399 if clean == True: | |
| 400 cleanM += "if os.path.exists( \"" + prefix + ".align\" ):\n" | |
| 401 cleanM += "\tos.remove( \"" + prefix + ".align\" )\n" | |
| 402 else: | |
| 403 cleanM += "if not os.path.exists( \"%s/%s.align\" ):\n" % ( currentDir, prefix ) | |
| 404 cleanM += "\tos.system( \"mv %s.align %s\" )\n" % ( prefix, currentDir ) | |
| 405 cleanM += "if os.path.exists( \"" + prefix + ".align."+s+".fa\" ):\n" | |
| 406 cleanM += "\tos.remove( \"" + prefix + ".align."+s+".fa\" )\n" | |
| 407 cleanM += "if os.path.exists( \"" + prefix + ".align."+s+".map\" ):\n" | |
| 408 cleanM += "\tos.remove( \"" + prefix + ".align."+s+".map\" )\n" | |
| 409 cmd_start += "print \"" + launchB + "\"; sys.stdout.flush()\n" | |
| 410 cmd_start += "log = os.system( \"" + launchB + "\" )\n" | |
| 411 cmd_start += "if log != 0:\n" | |
| 412 cmd_start += launcher.cmd_test( launcher.job, "error", loop=1 ) | |
| 413 cmd_start += "\tsys.exit(1)\n" | |
| 414 cmd_start += cleanB | |
| 415 cmd_start += "print \"" + launchM + "\"; sys.stdout.flush()\n" | |
| 416 cmd_start += "log = os.system( \"" + launchM + "\" )\n" | |
| 417 cmd_start += cleanM | |
| 418 launcher.runSingleJob( cmd_start ) | |
| 419 launcher.acronyme = "BlasterMatcher" | |
| 420 launcher._nbJobs = count | |
| 421 launcher.endRun() | |
| 422 if clean == True: | |
| 423 launcher.clean( "BlasterMatcher_*" ) | |
| 424 | |
| 425 else: | |
| 426 print "ERROR: option '-m %s' not recognized" % ( mix ) | |
| 427 sys.exit(1) | |
| 428 | |
| 429 | |
| 430 if collect != "": | |
| 431 if collect in [ "align", "path", "tab" ]: | |
| 432 runCollect( groupid, collect, allByAll ) | |
| 433 else: | |
| 434 print "ERROR: collect '%s' not implemented" % ( collect ) | |
| 435 sys.exit(1) | |
| 436 | |
| 437 | |
| 438 logging.info( "finished" ) | |
| 439 | |
| 440 if verbose > 0: | |
| 441 print "END %s" % ( sys.argv[0].split("/")[-1] ) | |
| 442 sys.stdout.flush() | |
| 443 | |
| 444 return 0 | |
| 445 | |
| 446 | |
| 447 if __name__ == "__main__": | |
| 448 main() |
