Mercurial > repos > yufei-luo > s_mart
diff commons/tools/BenchmarkTEconsensus.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/tools/BenchmarkTEconsensus.py Mon Apr 29 03:20:15 2013 -0400 @@ -0,0 +1,567 @@ +#!/usr/bin/env python + +##@file +# Compare two fasta files of TEs to assess how reference sequences are recovered by de novo consensus. + + +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + + +import os +import sys +import getopt +import shutil +import glob + + +import pyRepet.launcher.programLauncher +from commons.core.coord.AlignUtils import AlignUtils +from commons.core.coord.MatchUtils import MatchUtils +from commons.core.utils.FileUtils import FileUtils +from commons.core.seq.AlignedBioseqDB import AlignedBioseqDB +from commons.core.seq.FastaUtils import FastaUtils +from commons.tools.ChangeSequenceHeaders import ChangeSequenceHeaders + + +class BenchmarkTEconsensus( object ): + + def __init__( self ): + self._qryFile = "" + self._sbjFile = "" + self._method = 1 + self._keepConflictSbj = False + self._thresholdCoverage = 95 + self._thresholdIdentity = 80 + self._thresholdEvalue = 1e-10 + self._thresholdCoverageMatch = 90 + self._useCluster = False + self._queue = "" + self._configFileName = "" + self._clean = False + self._verbose = 0 + self._pL = pyRepet.launcher.programLauncher.programLauncher() + + + def help( self ): + print + print "usage: BenchmarkTEconsensus.py [ options ]" + print "options:" + print " -h: this help" + print " -q: name of the query file (de novo consensus, format='fasta')" + print " -s: name of the subject file (reference sequences, format='fasta')" + print " -m: method" + print " 1: Blaster + Matcher (default)" + print " 2: Blaster + merge + Matcher (not with '-Q')" + print " 3: Orienter + Mafft + Matcher" + print " 4: Yass + Matcher" + print " -a: keep all conflicting subjects in Matcher" + print " -t: coverage threshold over which the match is 'complete' (in %% of the seq length, default=%i)" % self._thresholdCoverage + print " -I: identity threshold for 'CC' matches (default=%i)" % self._thresholdIdentity + print " -E: E-value threshold for 'CC' matches (default=1e-10)" + print " -T: coverage threshold for match length on query compare to subject length (default=%i)" % self._thresholdCoverageMatch + print " -Q: queue name to run in parallel" + print " -C: name of the configuration file (compulsory with '-Q')" + print " -c: clean" + print " -v: verbosity level (default=0/1/2)" + print + + + def setAttributesFromCmdLine( self ): + try: + opts, args = getopt.getopt( sys.argv[1:], "hq:s:m:at:I:E:T:Q:C:cv:" ) + except getopt.GetoptError, err: + sys.stderr.write( "%s\n" % ( str(err) ) ) + self.help() + sys.exit(1) + for o,a in opts: + if o == "-h": + self.help(); + sys.exit(0) + elif o == "-q": + self._qryFile = a + elif o == "-s": + self._sbjFile = a + elif o == "-m": + self._method = int(a) + elif o == "-a": + self._keepConflictSbj = True + elif o == "-t": + self._thresholdCoverage = int(a) + elif o == "-I": + self._thresholdIdentity = int(a) + elif o == "-E": + self._thresholdEvalue = float(a) + elif o == "-T": + self._thresholdCoverageMatch = int(a) + elif o == "-Q": + self._useCluster = True + self._queue = a + elif o == "-C": + self._configFile = a + elif o == "-c": + self._clean = True + elif o == "-v": + self._verbose = int(a) + + + def checkAttributes( self ): + if self._qryFile == "": + msg = "ERROR: missing query file (-q)" + sys.stderr.write( "%s\n" % ( msg ) ) + self.help() + sys.exit(1) + if not os.path.exists( self._qryFile ): + msg = "ERROR: can't find file '%s'" % ( self._qryFile ) + sys.stderr.write( "%s\n" % ( msg ) ) + self.help() + sys.exit(1) + if self._sbjFile == "": + msg = "ERROR: missing subject file (-s)" + sys.stderr.write( "%s\n" % ( msg ) ) + self.help() + sys.exit(1) + if not os.path.exists( self._sbjFile ): + msg = "ERROR: can't find file '%s'" % ( self._sbjFile ) + sys.stderr.write( "%s\n" % ( msg ) ) + self.help() + sys.exit(1) + if self._useCluster: + if self._configFile == "": + msg = "ERROR: missing configuration file (-C)" + sys.stderr.write( "%s\n" % ( msg ) ) + self.help() + sys.exit(1) + if not os.path.exists( self._configFile ): + msg = "ERROR: can't find file '%s'" % ( self._configFile ) + sys.stderr.write( "%s\n" % ( msg ) ) + self.help() + sys.exit(1) + if self._method == 2: + msg = "ERROR: can't launch method 2 in parallel" + sys.stderr.write( "%s\n" % ( msg ) ) + self.help() + sys.exit(1) + nbSeqQry = FastaUtils.dbSize( self._qryFile ) + if nbSeqQry == 0: + print "WARNING: query file is empty" + sys.exit(0) + nbSeqSbj = FastaUtils.dbSize( self._sbjFile ) + if nbSeqSbj == 0: + print "WARNING: subject file is empty" + sys.exit(0) + + + def preprocess( self ): + tmpDir = "tmp%s_t%i_m%i_I%i" % ( os.getpid(), + self._thresholdCoverage, + self._method, + self._thresholdIdentity ) + if os.path.exists( tmpDir ): + shutil.rmtree( tmpDir ) + os.mkdir( tmpDir ) + os.chdir( tmpDir ) + + os.symlink( "../%s" % self._qryFile, self._qryFile ) + csh = ChangeSequenceHeaders() + csh.setInputFile( self._qryFile ) + csh.setFormat( "fasta" ) + csh.setStep( 1 ) + csh.setPrefix( "query" ) + csh.setOutputFile( "%s.newH" % ( self._qryFile ) ) + csh.setVerbosityLevel( self._verbose ) + csh.run() + self._qryFile += ".newH" + + if not os.path.exists( self._sbjFile ): + os.symlink( "../%s" % self._sbjFile, self._sbjFile ) + + + def compareFastaViaBlasterMatcher( self, merged=False ): + """ + Blaster (+ merged) + Matcher + """ + if self._keepConflictSbj: + s = "match" + else: + s = "clean_match" + matchFile = "%s_vs_%s.m%i.align" % ( self._qryFile, + self._sbjFile, + self._method ) + pathFile = "%s_vs_%s.m%i.align" % ( self._qryFile, + self._sbjFile, + self._method ) + if merged: + matchFile += ".merged" + pathFile += ".merged" + matchFile += ".%s.tab" % ( s ) + pathFile += ".%s.path" % ( s ) + + if not self._useCluster: + prg = "blaster" + cmd = prg + cmd += " -q %s" % ( self._qryFile ) + cmd += " -s %s" % ( self._sbjFile ) + cmd += " -B %s_vs_%s.m%i" % ( self._qryFile, + self._sbjFile, + self._method ) + cmd += " -v %i" % ( self._verbose ) + self._pL.launch( prg, cmd ) + + if merged: + tmpFile = "%s_vs_%s.m%i.align.merged" % ( self._qryFile, + self._sbjFile, + self._method ) + AlignUtils.mergeFile( "%s_vs_%s.m%i.align" % ( self._qryFile, + self._sbjFile, + self._method ), + tmpFile ) + else: + tmpFile = "%s_vs_%s.m%i.align" % ( self._qryFile, + self._sbjFile, + self._method ) + + prg = "matcher" + cmd = prg + cmd += " -m %s" % ( tmpFile ) + cmd += " -q %s" % ( self._qryFile ) + cmd += " -s %s" % ( self._sbjFile ) + cmd += " -j" + if self._keepConflictSbj: + cmd += " -a" + cmd += " -v %i" % ( self._verbose ) + self._pL.launch( prg, cmd ) + + else: + os.symlink( "../%s" % self._configFile, self._configFileName ) + + prg = os.environ["REPET_PATH"] + "/bin/launchBlasterMatcherPerQuery.py" + cmd = prg + cmd += " -q %s" % ( self._qryFile ) + cmd += " -s %s" % ( self._sbjFile ) + cmd += " -Q %s" % ( self._queue ) + cmd += " -C %s" % ( self._configFile ) + cmd += " -n %i" % ( 10 ) + if self._keepConflictSbj: + cmd += " -M \"%s\"" % ( "-j -a" ) + else: + cmd += " -M \"%s\"" % ( "-j" ) + cmd += " -Z tab" + if self._clean: + cmd += " -c" + cmd += " -v %i" % ( self._verbose - 1 ) + self._pL.launch( prg, cmd ) + + csh = ChangeSequenceHeaders() + csh.setInputFile( matchFile ) + csh.setFormat( "tab" ) + csh.setStep( 2 ) + csh.setLinkFile( "%slink" % ( self._qryFile ) ) + csh.setOutputFile( matchFile.replace(".newH","") ) + csh.run() + + csh.setInputFile( pathFile ) + csh.setFormat( "path" ) + csh.setStep( 2 ) + csh.setOutputFile( pathFile.replace(".newH","") ) + csh.run() + + return matchFile.replace(".newH",""), pathFile.replace(".newH","") + + + def compareFastaViaMafft( self ): + """ + Orienter, Mafft, Matcher + """ + FastaUtils.dbSplit( self._qryFile, 1, False, False, "query" ) + FastaUtils.dbSplit( self._sbjFile, 1, False, False, "subject" ) + lQueries = glob.glob( "query_*.fa" ) + lSubjects = glob.glob( "subject_*.fa" ) + + if self._keepConflictSbj: + s = "match" + else: + s = "clean_match" + matchFile = "%s_vs_%s.m%i.align.%s.tab" % ( self._qryFile, + self._sbjFile, + self._method, + s ) + os.system( "touch %s" % matchFile ) + pathFile = "%s_vs_%s.m%i.align.%s.path" % ( self._qryFile, + self._sbjFile, + self._method, + s ) + os.system( "touch %s" % pathFile ) + + countQueries = 0 + for query in lQueries: + countQueries += 1 + queryHeader = FastaUtils.dbHeaders( query )[0] + queryLength = FastaUtils.dbLengths( query )[0] + countSubjects = 0 + for subject in lSubjects: + countSubjects += 1 + subjectHeader = FastaUtils.dbHeaders( subject )[0] + subjectLength = FastaUtils.dbLengths( subject )[0] + if self._verbose > 0: + print "compare '%s' (%i bp, %i/%i) and '%s' (%i bp, %i/%i)" % ( queryHeader, queryLength, + countQueries, len(lQueries), + subjectHeader, subjectLength, + countSubjects, len(lSubjects) ) + sys.stdout.flush() + qsLengthRatio = 100 * queryLength / float(subjectLength) + if qsLengthRatio < self._thresholdCoverage - 2 or qsLengthRatio > 100 + (100-self._thresholdCoverage) + 2: + if self._verbose > 0: + print "skip (q/s=%.2f%%)" % ( qsLengthRatio ) + continue + + tmpFile = "%s_vs_%s" % ( query, subject ) + FileUtils.catFilesFromList( [ query, subject ], + tmpFile ) + prg = "OrientSequences.py" + cmd = prg + cmd += " -i %s" % ( tmpFile ) + cmd += " -p mummer" + cmd += " -c" + cmd += " -v %i" % ( self._verbose - 1 ) + self._pL.launch( prg, cmd ) + prg = "MafftProgramLauncher.py" + cmd = prg + cmd += " -i %s.oriented" % ( tmpFile ) + cmd += " -c" + cmd += " -v %i" % ( self._verbose - 1 ) + self._pL.launch( prg, cmd ) + + absDB = AlignedBioseqDB( "%s.oriented.fa_aln" % tmpFile ) + lHeaders = absDB.getHeaderList() + lAligns = absDB.getAlignList( lHeaders[0], lHeaders[1] ) + for i in lAligns: + if "re-oriented" in i.getQueryName(): + i.setQueryName( queryHeader ) + start = i.getQueryStart() + end = i.getQueryEnd() + i.setQueryStart( queryLength - end + 1 ) + i.setQueryEnd( queryLength - start + 1 ) + if "re-oriented" in i.getSubjectName(): + i.setSubjectName( subjectHeader ) + start = i.getSubjectStart() + end = i.getSubjectEnd() + i.setSubjectEnd( subjectLength - end + 1 ) + i.setSubjectStart( subjectLength - start + 1 ) + if not i.isQueryOnDirectStrand(): + i.reverse() + AlignUtils.writeListInFile( lAligns, + "%s.oriented.fa_aln.align" % tmpFile ) + + prg = os.environ["REPET_PATH"] + "/bin/matcher" + cmd = prg + cmd += " -m %s.oriented.fa_aln.align" % ( tmpFile ) + cmd += " -q %s" % ( query ) + cmd += " -s %s" % ( subject ) + cmd += " -j" + if self._keepConflictSbj: + cmd += " -a" + cmd += " -v %i" % ( self._verbose - 1 ) + self._pL.launch( prg, cmd ) + + FileUtils.appendFileContent( "%s.oriented.fa_aln.align.%s.path" % ( tmpFile, s ), pathFile ) + lMatches = MatchUtils.getMatchListFromFile( "%s.oriented.fa_aln.align.%s.tab" % ( tmpFile, s ) ) + MatchUtils.writeListInFile( lMatches, matchFile, "a" ) + + for f in [ tmpFile, + "%s.oriented" % ( tmpFile ), +# "%s.oriented.fa_aln" % ( tmpFile ), + "%s.oriented.fa_aln.align" % ( tmpFile ), + "%s.oriented.fa_aln.align.match.fa" % ( tmpFile ), + "%s.oriented.fa_aln.align.match.map" % ( tmpFile ), + "%s.oriented.fa_aln.align.match.param" % ( tmpFile ), + "%s.oriented.fa_aln.align.match.path" % ( tmpFile ), + "%s.oriented.fa_aln.align.match.tab" % ( tmpFile ), + ]: + os.remove( f ) + + if not FileUtils.isEmpty( matchFile ): + csh = ChangeSequenceHeaders() + csh.setInputFile( matchFile ) + csh.setFormat( "tab" ) + csh.setStep( 2 ) + csh.setLinkFile( "%slink" % ( self._qryFile ) ) + csh.setOutputFile( matchFile.replace(".newH","") ) + csh.run() + csh.setInputFile( pathFile ) + csh.setFormat( "path" ) + csh.setStep( 2 ) + csh.setOutputFile( pathFile.replace(".newH","") ) + csh.run() + return matchFile.replace(".newH",""), pathFile.replace(".newH","") + else: + return "", "" + + + def compareFastaViaYassMatcher( self, merged=False ): + """ + Yass + Matcher + """ + if self._keepConflictSbj: + s = "match" + else: + s = "clean_match" + matchFile = "%s_vs_%s.m%i.align" % ( self._qryFile, + self._sbjFile, + self._method ) + pathFile = "%s_vs_%s.m%i.align" % ( self._qryFile, + self._sbjFile, + self._method ) + if merged: + matchFile += ".merged" + pathFile += ".merged" + matchFile += ".%s.tab" % ( s ) + pathFile += ".%s.path" % ( s ) + + if not self._useCluster: + prg = os.environ["REPET_PATH"] + "/bin/YassProgramLauncher.py" + cmd = prg + cmd += " -i %s" % ( self._qryFile ) + cmd += " -s %s" % ( self._sbjFile ) + cmd += " -c" +# cmd += " -p '-i 12'" + cmd += " -o %s_vs_%s.m%i.align" % ( self._qryFile, + self._sbjFile, + self._method ) + cmd += " -v %i" % ( self._verbose ) + self._pL.launch( prg, cmd ) + + if merged: + tmpFile = "%s_vs_%s.m%i.align.merged" % ( self._qryFile, + self._sbjFile, + self._method ) + AlignUtils.mergeFile( "%s_vs_%s.m%i.align" % ( self._qryFile, + self._sbjFile, + self._method ), + tmpFile ) + else: + tmpFile = "%s_vs_%s.m%i.align" % ( self._qryFile, + self._sbjFile, + self._method ) + + prg = os.environ["REPET_PATH"] + "/bin/matcher" + cmd = prg + cmd += " -m %s" % ( tmpFile ) + cmd += " -q %s" % ( self._qryFile ) + cmd += " -s %s" % ( self._sbjFile ) + cmd += " -j" + if self._keepConflictSbj: + cmd += " -a" + cmd += " -v %i" % ( self._verbose ) + self._pL.launch( prg, cmd ) + + csh = ChangeSequenceHeaders() + csh.setInputFile( matchFile ) + csh.setFormat( "tab" ) + csh.setStep( 2 ) + csh.setLinkFile( "%slink" % ( self._qryFile ) ) + csh.setOutputFile( matchFile.replace(".newH","") ) + csh.run() + + csh.setInputFile( pathFile ) + csh.setFormat( "path" ) + csh.setStep( 2 ) + csh.setOutputFile( pathFile.replace(".newH","") ) + csh.run() + + return matchFile.replace(".newH",""), pathFile.replace(".newH","") + + + def analyzeMatchFile( self, matchFile, pathFile ): + if matchFile != "": + if self._verbose > 0: + print "analyze the 'tab' file..." + sys.stdout.flush() + prg = os.environ["REPET_PATH"] + "/bin/tabFileReader.py" + cmd = prg + cmd += " -m %s" % ( matchFile ) + cmd += " -q %s" % ( self._qryFile.replace(".newH","") ) + cmd += " -s %s" % ( self._sbjFile.replace(".newH","") ) + cmd += " -t %i" % ( self._thresholdCoverage ) + cmd += " -I %i" % ( self._thresholdIdentity ) + cmd += " -E %g" % ( self._thresholdEvalue ) + cmd += " -T %i" % ( self._thresholdCoverageMatch ) + cmd += " -v %i" % ( self._verbose - 1 ) + self._pL.launch( prg, cmd ) + for f in [ matchFile, pathFile, + "%s_tabFileReader.txt" % matchFile, + "%s_qryCategories.txt" % matchFile, + "%s_sbjCategories.txt" % matchFile ]: + shutil.copy( f, ".." ) + os.chdir( ".." ) + + + def start( self ): + self.checkAttributes() + if self._verbose > 0: + print "START BenchmarkTEconsensus.py" + sys.stdout.flush() + + + def end( self ): + if self._clean: + tmpDir = "tmp%s_t%i_m%i_I%i" % ( os.getpid(), + self._thresholdCoverage, + self._method, + self._thresholdIdentity ) + shutil.rmtree( tmpDir ) + if self._verbose > 0: + print "END BenchmarkTEconsensus.py" + sys.stdout.flush() + + + def run( self ): + self.start() + + self.preprocess() + + if self._method == 1: + matchFile, pathFile = self.compareFastaViaBlasterMatcher() + elif self._method == 2: + matchFile, pathFile = self.compareFastaViaBlasterMatcher( merged=True ) + elif self._method == 3: + matchFile, pathFile = self.compareFastaViaMafft() + elif self._method == 4: + matchFile, pathFile = self.compareFastaViaYassMatcher() + + self.analyzeMatchFile( matchFile, pathFile ) + + self.end() + + +if __name__ == "__main__": + i = BenchmarkTEconsensus() + i.setAttributesFromCmdLine() + i.run()