diff commons/tools/BenchmarkTEconsensus.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/commons/tools/BenchmarkTEconsensus.py	Mon Apr 29 03:20:15 2013 -0400
@@ -0,0 +1,567 @@
+#!/usr/bin/env python
+
+##@file
+# Compare two fasta files of TEs to assess how reference sequences are recovered by de novo consensus.
+
+
+# Copyright INRA (Institut National de la Recherche Agronomique)
+# http://www.inra.fr
+# http://urgi.versailles.inra.fr
+#
+# This software is governed by the CeCILL license under French law and
+# abiding by the rules of distribution of free software.  You can  use, 
+# modify and/ or redistribute the software under the terms of the CeCILL
+# license as circulated by CEA, CNRS and INRIA at the following URL
+# "http://www.cecill.info". 
+#
+# As a counterpart to the access to the source code and  rights to copy,
+# modify and redistribute granted by the license, users are provided only
+# with a limited warranty  and the software's author,  the holder of the
+# economic rights,  and the successive licensors  have only  limited
+# liability. 
+#
+# In this respect, the user's attention is drawn to the risks associated
+# with loading,  using,  modifying and/or developing or reproducing the
+# software by the user in light of its specific status of free software,
+# that may mean  that it is complicated to manipulate,  and  that  also
+# therefore means  that it is reserved for developers  and  experienced
+# professionals having in-depth computer knowledge. Users are therefore
+# encouraged to load and test the software's suitability as regards their
+# requirements in conditions enabling the security of their systems and/or 
+# data to be ensured and,  more generally, to use and operate it in the 
+# same conditions as regards security. 
+#
+# The fact that you are presently reading this means that you have had
+# knowledge of the CeCILL license and that you accept its terms.
+
+
+import os
+import sys
+import getopt
+import shutil
+import glob
+
+
+import pyRepet.launcher.programLauncher
+from commons.core.coord.AlignUtils import AlignUtils
+from commons.core.coord.MatchUtils import MatchUtils
+from commons.core.utils.FileUtils import FileUtils
+from commons.core.seq.AlignedBioseqDB import AlignedBioseqDB
+from commons.core.seq.FastaUtils import FastaUtils
+from commons.tools.ChangeSequenceHeaders import ChangeSequenceHeaders
+
+
+class BenchmarkTEconsensus( object ):
+    
+    def __init__( self ):
+        self._qryFile = ""
+        self._sbjFile = ""
+        self._method = 1
+        self._keepConflictSbj = False
+        self._thresholdCoverage = 95
+        self._thresholdIdentity = 80
+        self._thresholdEvalue = 1e-10
+        self._thresholdCoverageMatch = 90
+        self._useCluster = False
+        self._queue = ""
+        self._configFileName = ""
+        self._clean = False
+        self._verbose = 0
+        self._pL = pyRepet.launcher.programLauncher.programLauncher()
+        
+        
+    def help( self ):
+        print
+        print "usage: BenchmarkTEconsensus.py [ options ]"
+        print "options:"
+        print "     -h: this help"
+        print "     -q: name of the query file (de novo consensus, format='fasta')"
+        print "     -s: name of the subject file (reference sequences, format='fasta')"
+        print "     -m: method"
+        print "         1: Blaster + Matcher (default)"
+        print "         2: Blaster + merge + Matcher (not with '-Q')"
+        print "         3: Orienter + Mafft + Matcher"
+        print "         4: Yass + Matcher"
+        print "     -a: keep all conflicting subjects in Matcher"
+        print "     -t: coverage threshold over which the match is 'complete' (in %% of the seq length, default=%i)" % self._thresholdCoverage
+        print "     -I: identity threshold for 'CC' matches (default=%i)" % self._thresholdIdentity
+        print "     -E: E-value threshold for 'CC' matches (default=1e-10)"
+        print "     -T: coverage threshold for match length on query compare to subject length (default=%i)" % self._thresholdCoverageMatch
+        print "     -Q: queue name to run in parallel"
+        print "     -C: name of the configuration file (compulsory with '-Q')"
+        print "     -c: clean"
+        print "     -v: verbosity level (default=0/1/2)"
+        print
+        
+        
+    def setAttributesFromCmdLine( self ):
+        try:
+            opts, args = getopt.getopt( sys.argv[1:], "hq:s:m:at:I:E:T:Q:C:cv:" )
+        except getopt.GetoptError, err:
+            sys.stderr.write( "%s\n" % ( str(err) ) )
+            self.help()
+            sys.exit(1)
+        for o,a in opts:
+            if o == "-h":
+                self.help();
+                sys.exit(0)
+            elif o == "-q":
+                self._qryFile = a
+            elif o == "-s":
+                self._sbjFile = a
+            elif o == "-m":
+                self._method = int(a)
+            elif o == "-a":
+                self._keepConflictSbj = True
+            elif o == "-t":
+                self._thresholdCoverage = int(a)
+            elif o == "-I":
+                self._thresholdIdentity = int(a)
+            elif o == "-E":
+                self._thresholdEvalue = float(a)
+            elif o == "-T":
+                self._thresholdCoverageMatch = int(a)
+            elif o == "-Q":
+                self._useCluster = True
+                self._queue = a
+            elif o == "-C":
+                self._configFile = a
+            elif o == "-c":
+                self._clean = True
+            elif o == "-v":
+                self._verbose = int(a)
+                
+                
+    def checkAttributes( self ):
+        if self._qryFile == "":
+            msg = "ERROR: missing query file (-q)"
+            sys.stderr.write( "%s\n" % ( msg ) )
+            self.help()
+            sys.exit(1)
+        if not os.path.exists( self._qryFile ):
+            msg = "ERROR: can't find file '%s'" % ( self._qryFile )
+            sys.stderr.write( "%s\n" % ( msg ) )
+            self.help()
+            sys.exit(1)
+        if self._sbjFile == "":
+            msg = "ERROR: missing subject file (-s)"
+            sys.stderr.write( "%s\n" % ( msg ) )
+            self.help()
+            sys.exit(1)
+        if not os.path.exists( self._sbjFile ):
+            msg = "ERROR: can't find file '%s'" % ( self._sbjFile )
+            sys.stderr.write( "%s\n" % ( msg ) )
+            self.help()
+            sys.exit(1)
+        if self._useCluster:
+            if self._configFile == "":
+                msg = "ERROR: missing configuration file (-C)"
+                sys.stderr.write( "%s\n" % ( msg ) )
+                self.help()
+                sys.exit(1)
+            if not os.path.exists( self._configFile ):
+                msg = "ERROR: can't find file '%s'" % ( self._configFile )
+                sys.stderr.write( "%s\n" % ( msg ) )
+                self.help()
+                sys.exit(1)
+            if self._method == 2:
+                msg = "ERROR: can't launch method 2 in parallel"
+                sys.stderr.write( "%s\n" % ( msg ) )
+                self.help()
+                sys.exit(1)
+        nbSeqQry = FastaUtils.dbSize( self._qryFile )
+        if nbSeqQry == 0:
+            print "WARNING: query file is empty"
+            sys.exit(0)
+        nbSeqSbj = FastaUtils.dbSize( self._sbjFile )
+        if nbSeqSbj == 0:
+            print "WARNING: subject file is empty"
+            sys.exit(0)
+            
+            
+    def preprocess( self ):
+        tmpDir = "tmp%s_t%i_m%i_I%i" % ( os.getpid(),
+                                         self._thresholdCoverage,
+                                         self._method,
+                                         self._thresholdIdentity )
+        if os.path.exists( tmpDir ):
+            shutil.rmtree( tmpDir )
+        os.mkdir( tmpDir )
+        os.chdir( tmpDir )
+        
+        os.symlink( "../%s" % self._qryFile, self._qryFile )
+        csh = ChangeSequenceHeaders()
+        csh.setInputFile( self._qryFile )
+        csh.setFormat( "fasta" )
+        csh.setStep( 1 )
+        csh.setPrefix( "query" )
+        csh.setOutputFile( "%s.newH" % ( self._qryFile ) )
+        csh.setVerbosityLevel( self._verbose )
+        csh.run()
+        self._qryFile += ".newH"
+        
+        if not os.path.exists( self._sbjFile ):
+            os.symlink( "../%s" % self._sbjFile, self._sbjFile )
+            
+            
+    def compareFastaViaBlasterMatcher( self, merged=False ):
+        """
+        Blaster (+ merged) +  Matcher
+        """
+        if self._keepConflictSbj:
+            s = "match"
+        else:
+            s = "clean_match"
+        matchFile = "%s_vs_%s.m%i.align" % ( self._qryFile,
+                                             self._sbjFile,
+                                             self._method )
+        pathFile = "%s_vs_%s.m%i.align" % ( self._qryFile,
+                                            self._sbjFile,
+                                            self._method )
+        if merged:
+            matchFile += ".merged"
+            pathFile += ".merged"
+        matchFile += ".%s.tab" % ( s )
+        pathFile += ".%s.path" % ( s )
+        
+        if not self._useCluster:
+            prg = "blaster"
+            cmd = prg
+            cmd += " -q %s" % ( self._qryFile )
+            cmd += " -s %s" % ( self._sbjFile )
+            cmd += " -B %s_vs_%s.m%i" % ( self._qryFile,
+                                          self._sbjFile,
+                                          self._method )
+            cmd += " -v %i" % ( self._verbose )
+            self._pL.launch( prg, cmd )
+            
+            if merged:
+                tmpFile = "%s_vs_%s.m%i.align.merged" % ( self._qryFile,
+                                                          self._sbjFile,
+                                                          self._method )
+                AlignUtils.mergeFile( "%s_vs_%s.m%i.align" % ( self._qryFile,
+                                                               self._sbjFile,
+                                                               self._method ),
+                                                               tmpFile )
+            else:
+                tmpFile = "%s_vs_%s.m%i.align" % ( self._qryFile,
+                                                   self._sbjFile,
+                                                   self._method )
+                
+            prg = "matcher"
+            cmd = prg
+            cmd += " -m %s" % ( tmpFile )
+            cmd += " -q %s" % ( self._qryFile )
+            cmd += " -s %s" % ( self._sbjFile )
+            cmd += " -j"
+            if self._keepConflictSbj:
+                cmd += " -a"
+            cmd += " -v %i" % ( self._verbose )
+            self._pL.launch( prg, cmd )
+            
+        else:
+            os.symlink( "../%s" % self._configFile, self._configFileName )
+            
+            prg = os.environ["REPET_PATH"] + "/bin/launchBlasterMatcherPerQuery.py"
+            cmd = prg
+            cmd += " -q %s" % ( self._qryFile )
+            cmd += " -s %s" % ( self._sbjFile )
+            cmd += " -Q %s" % ( self._queue )
+            cmd += " -C %s" % ( self._configFile )
+            cmd += " -n %i" % ( 10 )
+            if self._keepConflictSbj:
+                cmd += " -M \"%s\"" % ( "-j -a" )
+            else:
+                cmd += " -M \"%s\"" % ( "-j" )
+            cmd += " -Z tab"
+            if self._clean:
+                cmd += " -c"
+            cmd += " -v %i" % ( self._verbose - 1 )
+            self._pL.launch( prg, cmd )
+            
+        csh = ChangeSequenceHeaders()
+        csh.setInputFile( matchFile )
+        csh.setFormat( "tab" )
+        csh.setStep( 2 )
+        csh.setLinkFile( "%slink" % ( self._qryFile ) )
+        csh.setOutputFile( matchFile.replace(".newH","") )
+        csh.run()
+        
+        csh.setInputFile( pathFile )
+        csh.setFormat( "path" )
+        csh.setStep( 2 )
+        csh.setOutputFile( pathFile.replace(".newH","") )
+        csh.run()
+        
+        return matchFile.replace(".newH",""), pathFile.replace(".newH","")
+    
+    
+    def compareFastaViaMafft( self ):
+        """
+        Orienter, Mafft, Matcher
+        """
+        FastaUtils.dbSplit( self._qryFile, 1, False, False, "query" )
+        FastaUtils.dbSplit( self._sbjFile, 1, False, False, "subject" )
+        lQueries = glob.glob( "query_*.fa" )
+        lSubjects = glob.glob( "subject_*.fa" )
+        
+        if self._keepConflictSbj:
+            s = "match"
+        else:
+            s = "clean_match"
+        matchFile = "%s_vs_%s.m%i.align.%s.tab" % ( self._qryFile,
+                                                    self._sbjFile,
+                                                    self._method,
+                                                    s )
+        os.system( "touch %s" %  matchFile )
+        pathFile = "%s_vs_%s.m%i.align.%s.path" % ( self._qryFile,
+                                                    self._sbjFile,
+                                                    self._method,
+                                                    s )
+        os.system( "touch %s" %  pathFile )
+        
+        countQueries = 0
+        for query in lQueries:
+            countQueries += 1
+            queryHeader = FastaUtils.dbHeaders( query )[0]
+            queryLength = FastaUtils.dbLengths( query )[0]
+            countSubjects = 0
+            for subject in lSubjects:
+                countSubjects += 1
+                subjectHeader = FastaUtils.dbHeaders( subject )[0]
+                subjectLength = FastaUtils.dbLengths( subject )[0]
+                if self._verbose > 0:
+                    print "compare '%s' (%i bp, %i/%i) and '%s' (%i bp, %i/%i)" % ( queryHeader, queryLength,
+                                                                                    countQueries, len(lQueries),
+                                                                                    subjectHeader, subjectLength,
+                                                                                    countSubjects, len(lSubjects) )
+                    sys.stdout.flush()
+                qsLengthRatio = 100 * queryLength / float(subjectLength)
+                if qsLengthRatio < self._thresholdCoverage - 2 or qsLengthRatio > 100 + (100-self._thresholdCoverage) + 2:
+                    if self._verbose > 0:
+                        print "skip (q/s=%.2f%%)" % ( qsLengthRatio )
+                    continue
+                
+                tmpFile = "%s_vs_%s" % ( query, subject )
+                FileUtils.catFilesFromList( [ query, subject ],
+                                            tmpFile )
+                prg = "OrientSequences.py"
+                cmd = prg
+                cmd += " -i %s" % ( tmpFile )
+                cmd += " -p mummer"
+                cmd += " -c"
+                cmd += " -v %i" % ( self._verbose - 1 )
+                self._pL.launch( prg, cmd )
+                prg = "MafftProgramLauncher.py"
+                cmd = prg
+                cmd += " -i %s.oriented" % ( tmpFile )
+                cmd += " -c"
+                cmd += " -v %i" % ( self._verbose - 1 )
+                self._pL.launch( prg, cmd )
+                
+                absDB = AlignedBioseqDB( "%s.oriented.fa_aln" % tmpFile )
+                lHeaders = absDB.getHeaderList()
+                lAligns = absDB.getAlignList( lHeaders[0], lHeaders[1] )
+                for i in lAligns:
+                    if "re-oriented" in i.getQueryName():
+                        i.setQueryName( queryHeader )
+                        start = i.getQueryStart()
+                        end = i.getQueryEnd()
+                        i.setQueryStart( queryLength - end + 1 )
+                        i.setQueryEnd( queryLength - start + 1 )
+                    if "re-oriented" in i.getSubjectName():
+                        i.setSubjectName( subjectHeader )
+                        start = i.getSubjectStart()
+                        end = i.getSubjectEnd()
+                        i.setSubjectEnd( subjectLength - end + 1 )
+                        i.setSubjectStart( subjectLength - start + 1 )
+                    if not i.isQueryOnDirectStrand():
+                        i.reverse()
+                AlignUtils.writeListInFile( lAligns,
+                                            "%s.oriented.fa_aln.align" % tmpFile )
+                
+                prg = os.environ["REPET_PATH"] + "/bin/matcher"
+                cmd = prg
+                cmd += " -m %s.oriented.fa_aln.align" % ( tmpFile )
+                cmd += " -q %s" % ( query )
+                cmd += " -s %s" % ( subject )
+                cmd += " -j"
+                if self._keepConflictSbj:
+                    cmd += " -a"
+                cmd += " -v %i" % ( self._verbose - 1 )
+                self._pL.launch( prg, cmd )
+                
+                FileUtils.appendFileContent( "%s.oriented.fa_aln.align.%s.path" % ( tmpFile, s ), pathFile )
+                lMatches = MatchUtils.getMatchListFromFile( "%s.oriented.fa_aln.align.%s.tab" % ( tmpFile, s ) )
+                MatchUtils.writeListInFile( lMatches, matchFile, "a" )
+                
+                for f in [ tmpFile,
+                          "%s.oriented" % ( tmpFile ),
+#                          "%s.oriented.fa_aln" % ( tmpFile ),
+                          "%s.oriented.fa_aln.align" % ( tmpFile ),
+                          "%s.oriented.fa_aln.align.match.fa" % ( tmpFile ),
+                          "%s.oriented.fa_aln.align.match.map" % ( tmpFile ),
+                          "%s.oriented.fa_aln.align.match.param" % ( tmpFile ),
+                          "%s.oriented.fa_aln.align.match.path" % ( tmpFile ),
+                          "%s.oriented.fa_aln.align.match.tab" % ( tmpFile ),
+                          ]:
+                    os.remove( f )
+                    
+        if not FileUtils.isEmpty( matchFile ):
+            csh = ChangeSequenceHeaders()
+            csh.setInputFile( matchFile )
+            csh.setFormat( "tab" )
+            csh.setStep( 2 )
+            csh.setLinkFile( "%slink" % ( self._qryFile ) )
+            csh.setOutputFile( matchFile.replace(".newH","") )
+            csh.run()
+            csh.setInputFile( pathFile )
+            csh.setFormat( "path" )
+            csh.setStep( 2 )
+            csh.setOutputFile( pathFile.replace(".newH","") )
+            csh.run()
+            return matchFile.replace(".newH",""), pathFile.replace(".newH","")
+        else:
+            return "", ""
+        
+        
+    def compareFastaViaYassMatcher( self, merged=False ):
+        """
+        Yass +  Matcher
+        """
+        if self._keepConflictSbj:
+            s = "match"
+        else:
+            s = "clean_match"
+        matchFile = "%s_vs_%s.m%i.align" % ( self._qryFile,
+                                             self._sbjFile,
+                                             self._method )
+        pathFile = "%s_vs_%s.m%i.align" % ( self._qryFile,
+                                            self._sbjFile,
+                                            self._method )
+        if merged:
+            matchFile += ".merged"
+            pathFile += ".merged"
+        matchFile += ".%s.tab" % ( s )
+        pathFile += ".%s.path" % ( s )
+        
+        if not self._useCluster:
+            prg = os.environ["REPET_PATH"] + "/bin/YassProgramLauncher.py"
+            cmd = prg
+            cmd += " -i %s" % ( self._qryFile )
+            cmd += " -s %s" % ( self._sbjFile )
+            cmd += " -c"
+#            cmd += " -p '-i 12'"
+            cmd += " -o %s_vs_%s.m%i.align" % ( self._qryFile,
+                                                self._sbjFile,
+                                                self._method )
+            cmd += " -v %i" % ( self._verbose )
+            self._pL.launch( prg, cmd )
+            
+            if merged:
+                tmpFile = "%s_vs_%s.m%i.align.merged" % ( self._qryFile,
+                                                          self._sbjFile,
+                                                          self._method )
+                AlignUtils.mergeFile( "%s_vs_%s.m%i.align" % ( self._qryFile,
+                                                               self._sbjFile,
+                                                               self._method ),
+                                                               tmpFile )
+            else:
+                tmpFile = "%s_vs_%s.m%i.align" % ( self._qryFile,
+                                                   self._sbjFile,
+                                                   self._method )
+                
+            prg = os.environ["REPET_PATH"] + "/bin/matcher"
+            cmd = prg
+            cmd += " -m %s" % ( tmpFile )
+            cmd += " -q %s" % ( self._qryFile )
+            cmd += " -s %s" % ( self._sbjFile )
+            cmd += " -j"
+            if self._keepConflictSbj:
+                cmd += " -a"
+            cmd += " -v %i" % ( self._verbose )
+            self._pL.launch( prg, cmd )
+            
+        csh = ChangeSequenceHeaders()
+        csh.setInputFile( matchFile )
+        csh.setFormat( "tab" )
+        csh.setStep( 2 )
+        csh.setLinkFile( "%slink" % ( self._qryFile ) )
+        csh.setOutputFile( matchFile.replace(".newH","") )
+        csh.run()
+        
+        csh.setInputFile( pathFile )
+        csh.setFormat( "path" )
+        csh.setStep( 2 )
+        csh.setOutputFile( pathFile.replace(".newH","") )
+        csh.run()
+        
+        return matchFile.replace(".newH",""), pathFile.replace(".newH","")
+    
+    
+    def analyzeMatchFile( self, matchFile, pathFile ):
+        if matchFile != "":
+            if self._verbose > 0:
+                print "analyze the 'tab' file..."
+                sys.stdout.flush()
+            prg = os.environ["REPET_PATH"] + "/bin/tabFileReader.py"
+            cmd = prg
+            cmd += " -m %s" % ( matchFile )
+            cmd += " -q %s" % ( self._qryFile.replace(".newH","") )
+            cmd += " -s %s" % ( self._sbjFile.replace(".newH","") )
+            cmd += " -t %i" % ( self._thresholdCoverage )
+            cmd += " -I %i" % ( self._thresholdIdentity )
+            cmd += " -E %g" % ( self._thresholdEvalue )
+            cmd += " -T %i" % ( self._thresholdCoverageMatch )
+            cmd += " -v %i" % ( self._verbose - 1 )
+            self._pL.launch( prg, cmd )
+            for f in [ matchFile, pathFile,
+                      "%s_tabFileReader.txt" %  matchFile,
+                      "%s_qryCategories.txt" %  matchFile,
+                      "%s_sbjCategories.txt" %  matchFile ]:
+                shutil.copy( f, ".." )
+        os.chdir( ".." )
+        
+        
+    def start( self ):
+        self.checkAttributes()
+        if self._verbose > 0:
+            print "START BenchmarkTEconsensus.py"
+            sys.stdout.flush()
+            
+            
+    def end( self ):
+        if self._clean:
+            tmpDir = "tmp%s_t%i_m%i_I%i" % ( os.getpid(),
+                                             self._thresholdCoverage,
+                                             self._method,
+                                             self._thresholdIdentity )
+            shutil.rmtree( tmpDir )
+        if self._verbose > 0:
+            print "END BenchmarkTEconsensus.py"
+            sys.stdout.flush()
+            
+            
+    def run( self ):
+        self.start()
+        
+        self.preprocess()
+        
+        if self._method == 1:
+            matchFile, pathFile = self.compareFastaViaBlasterMatcher()
+        elif self._method == 2:
+            matchFile, pathFile = self.compareFastaViaBlasterMatcher( merged=True )
+        elif self._method == 3:
+            matchFile, pathFile = self.compareFastaViaMafft()
+        elif self._method == 4:
+            matchFile, pathFile = self.compareFastaViaYassMatcher()
+            
+        self.analyzeMatchFile( matchFile, pathFile )
+        
+        self.end()
+        
+        
+if __name__ == "__main__":
+    i = BenchmarkTEconsensus()
+    i.setAttributesFromCmdLine()
+    i.run()