Mercurial > repos > yufei-luo > s_mart
diff commons/launcher/launchBlasterMatcherPerQuery.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env python

"""
Split the input fasta file into a given number of files, launch Blaster
and/or Matcher on them in parallel, and collect the results afterwards.
"""

import os
import sys
import getopt
import exceptions
import logging
import ConfigParser

# The pyRepet package is looked up under $REPET_PATH, so that variable has to
# be set before the project imports below can succeed.
if "REPET_PATH" not in os.environ:
    print("*** Error: no environment variable REPET_PATH")
    sys.exit(1)
sys.path.append( os.environ["REPET_PATH"] )

import pyRepet.launcher.programLauncher
import pyRepet.seq.fastaDB

#-----------------------------------------------------------------------------

def help():

    """
    Print the list of the command-line options on standard output.
    """

    print("")
    print("usage: %s  [ options ]" % ( sys.argv[0], ))
    print("options:")
    print(" -h: this help")
    print(" -q: fasta filename of the queries")
    print(" -s: fasta filename of the subjects (same as queries if not specified)")
    print(" -Q: queue name on the cluster")
    print(" -d: absolute path to the temporary directory")
    print(" -C: configuration file")
    print(" -n: max. number of jobs (default=10,given a min. of 1 query per job)")
    print(" -m: mix of Blaster and/or Matcher")
    print(" 1: launch Blaster only")
    print(" 2: launch Matcher only (on '*.align' query files)")
    print(" 3: launch Blaster+Matcher in the same job (default)")
    print(" -B: parameters for Blaster (e.g. \"-a -n tblastx\")")
    print(" -M: parameters for Matcher (e.g. \"-j\")")
    print(" -Z: collect all the results into a single file (format 'align', 'path' or 'tab')")
    print(" -c: clean")
    print(" -v: verbose (default=0/1/2)")
    print("")

#-----------------------------------------------------------------------------

def _parseIntOption( option, value ):

    """
    Convert the argument of a numeric command-line option into an integer.

    @param option: name of the option (e.g. "-n"), only used in the error message
    @param value: raw string given on the command line
    @return: the value as an int; prints an error and the usage, then exits
             with status 1, if the value is not an integer
    """

    try:
        return int( value )
    except ValueError:
        print("*** Error: option '%s' expects an integer (got '%s')" % ( option, value ))
        help()
        sys.exit(1)

#-----------------------------------------------------------------------------

def main():

    """
    Parse the command line, then run the whole pipeline:
      1. split the query fasta file into batches with 'dbSplit.py',
      2. prepare the subject databank with 'blaster' when the subjects differ
         from the queries,
      3. launch 'srptBlasterMatcher.py' on the batches,
      4. rename the collected result file when Matcher was run,
      5. remove the temporary files if '-c' was given.

    Options '-q', '-C' and '-Z' are compulsory. The program exits with status 1
    on any invalid or missing option, or when an input file doesn't exist.

    @return: 0 when all the steps have been launched
    """

    qryFileName = ""
    sbjFileName = ""
    queue = ""
    tmpDir = ""
    configFileName = ""
    maxNbJobs = 10
    minQryPerJob = 1
    mix = "3"
    paramBlaster = ""
    paramMatcher = ""
    collectFormat = ""
    clean = False
    verbose = 0

    try:
        opts, _ = getopt.getopt( sys.argv[1:], "hq:s:Q:d:C:n:m:B:M:Z:cv:" )
    except getopt.GetoptError as err:
        print(str(err))
        help()
        sys.exit(1)
    for o, a in opts:
        if o == "-h":
            help()
            sys.exit(0)
        elif o == "-q":
            qryFileName = a
        elif o == "-s":
            sbjFileName = a
        elif o == "-Q":
            queue = a
        elif o == "-d":
            tmpDir = a
        elif o == "-C":
            configFileName = a
        elif o == "-n":
            maxNbJobs = _parseIntOption( o, a )
        elif o == "-m":
            mix = a
        elif o == "-B":
            paramBlaster = a
        elif o == "-M":
            paramMatcher = a
        elif o == "-Z":
            collectFormat = a
        elif o == "-c":
            clean = True
        elif o == "-v":
            verbose = _parseIntOption( o, a )

    if qryFileName == "" or configFileName == "" or collectFormat == "":
        print("*** Error: missing compulsory options")
        help()
        sys.exit(1)

    # maxNbJobs is used as a divisor below: zero would raise ZeroDivisionError
    # and a negative value would give a meaningless batch size.
    if maxNbJobs < 1:
        print("*** Error: max. number of jobs (-n) should be at least 1")
        help()
        sys.exit(1)

    if verbose > 0:
        print("\nbeginning of %s" % ( sys.argv[0].split("/")[-1] ))
        sys.stdout.flush()

    if not os.path.exists( qryFileName ):
        print("*** Error: query file '%s' doesn't exist" % ( qryFileName ))
        sys.exit(1)
    if sbjFileName != "":
        if not os.path.exists( sbjFileName ):
            print("*** Error: subject file '%s' doesn't exist" % ( sbjFileName ))
            sys.exit(1)
    else:
        # no subject given: compare the queries against themselves
        sbjFileName = qryFileName

    pL = pyRepet.launcher.programLauncher.programLauncher()

    nbSeqQry = pyRepet.seq.fastaDB.dbSize( qryFileName )
    qryPerJob = nbSeqQry / float(maxNbJobs)

    # split the input query file in single files into a new directory
    prg = os.environ["REPET_PATH"] + "/bin/dbSplit.py"
    cmd = prg
    cmd += " -i %s" % ( qryFileName )
    if qryPerJob <= 1.0:
        cmd += " -n %i" % ( minQryPerJob )
    else:
        # '%i' truncates the float, hence the '+ 1' to stay within maxNbJobs
        cmd += " -n %i" % ( qryPerJob + 1 )
    cmd += " -d"
    pL.launch( prg, cmd )

    # prepare the subject databank
    if sbjFileName != qryFileName:
        prg = "blaster"
        cmd = prg
        cmd += " -q %s" % ( sbjFileName )
        cmd += " -P"
        pL.launch( prg, cmd )

    # launch Blaster+Matcher in parallel
    prg = "srptBlasterMatcher.py"
    cmd = prg
    cmd += " -g %s_vs_%s" % ( qryFileName, sbjFileName )
    cmd += " -q %s/batches" % ( os.getcwd() )
    cmd += " -s %s/%s" % ( os.getcwd(), sbjFileName )
    cmd += " -Q '%s'" % ( queue )
    if tmpDir != "":
        cmd += " -d %s" % ( tmpDir )
    cmd += " -m %s" % ( mix )
    if paramBlaster != "":
        cmd += " -B \"%s\"" % ( paramBlaster )
    if paramMatcher != "":
        cmd += " -M \"%s\"" % ( paramMatcher )
    cmd += " -Z %s" % ( collectFormat )
    cmd += " -C %s" % ( configFileName )
    if clean:
        cmd += " -c"
    # the sub-program is run one verbosity level below this one
    cmd += " -v %i" % ( verbose - 1 )
    pL.launch( prg, cmd )

    # When Matcher was run, rename the collected file so that its name shows
    # which kind of matches it holds.
    # NOTE(review): the rename is assumed to belong to this branch (it would
    # produce a name ending with '.align.' for mix "1" otherwise) -- confirm.
    suffix = ""
    if mix in ["2","3"]:
        if "-a" in paramMatcher:
            suffix = "match.%s" % ( collectFormat )
        else:
            suffix = "clean_match.%s" % ( collectFormat )
        os.system( "mv %s_vs_%s.%s %s_vs_%s.align.%s" % ( qryFileName, sbjFileName, collectFormat, qryFileName, sbjFileName, suffix ) )

    # clean
    if clean:
        prg = "rm"
        cmd = prg
        cmd += " -rf batches formatdb.log %s_cut* %s.Nstretch.map" % ( sbjFileName, sbjFileName )
        pL.launch( prg, cmd )

    if verbose > 0:
        print("%s finished successfully\n" % ( sys.argv[0].split("/")[-1] ))
        sys.stdout.flush()

    return 0

#----------------------------------------------------------------------------

if __name__ == '__main__':
    main()