diff commons/tools/CorrelateTEageWithGCcontent.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/commons/tools/CorrelateTEageWithGCcontent.py	Mon Apr 29 03:20:15 2013 -0400
@@ -0,0 +1,204 @@
+#!/usr/bin/env python
+
+import sys
+import os
+import getopt
+import math
+from commons.core.sql.DbMySql import DbMySql
+from commons.core.sql.TablePathAdaptator import TablePathAdaptator
+from commons.core.sql.TableSeqAdaptator import TableSeqAdaptator
+from commons.core.coord.PathUtils import PathUtils
+from commons.core.coord.SetUtils import SetUtils
+from commons.core.seq.BioseqUtils import BioseqUtils
+
+
+class CorrelateTEageWithGCcontent( object ):
+    
+    def __init__( self ):
+        self._inputCoord = ""
+        self._inputGenome = ""
+        self._inputTErefseq = ""
+        self._configFile = ""
+        self._outFile = ""
+        self._verbose = 0
+        self._db = None
+        self._tableCoord = ""
+        self._pathA = TablePathAdaptator()
+        self._tableGenome = ""
+        self._seqA = TableSeqAdaptator()
+        
+        
+    def help( self ):
+        print
+        print "usage: CorrelateTEageWithGCcontent.py [ options ]"
+        print "options:"
+        print "     -h: this help"
+        print "     -i: input TE coordinates (can be file or table)"
+        print "         TEs as subjects in 'path' format"
+        print "     -g: input genome sequences (can be fasta file or table)"
+        print "     -r: input TE reference sequences (can be fasta file or table)"
+        print "     -C: configuration file (if table as input)"
+        print "     -o: output fasta file (default=inputCoord+'.gc')"
+        print "     -v: verbosity level (default=0/1)"
+        print
+        
+        
+    def setAttributesFromCmdLine( self ):
+        try:
+            opts, args = getopt.getopt(sys.argv[1:],"hi:g:r:C:o:v:")
+        except getopt.GetoptError, err:
+            msg = "%s" % str(err)
+            sys.stderr.write( "%s\n" % msg )
+            self.help(); sys.exit(1)
+        for o,a in opts:
+            if o == "-h":
+                self.help(); sys.exit(0)
+            elif o == "-i":
+                self._inputCoord = a
+            elif o == "-g":
+                self._inputGenome = a
+            elif o == "-r":
+                self._inputTErefseq = a
+            elif o == "-C":
+                self._configFile = a
+            elif o =="-o":
+                self._outFile = a
+            elif o == "-v":
+                self._verbose = int(a)
+                
+                
+    def checkAttributes( self ):
+        if self._inputCoord == "":
+            msg = "ERROR: missing input TE coordinates (-i)"
+            sys.stderr.write( "%s\n" % msg )
+            self.help()
+            sys.exit(1)
+        if not os.path.exists( self._inputCoord ):
+            if not os.path.exists( self._configFile ):
+                msg = "ERROR: neither input file '%s' nor configuration file '%s'" % ( self._inputCoord, self._configFile )
+                sys.stderr.write( "%s\n" % msg )
+                self.help()
+                sys.exit(1)
+            if not os.path.exists( self._configFile ):
+                msg = "ERROR: can't find configuration file '%s'" % ( self._configFile )
+                sys.stderr.write( "%s\n" % msg )
+                sys.exit(1)
+            self._db = DbMySql( cfgFileName=self._configFile )
+            if not self._db.exist( self._inputCoord ):
+                msg = "ERROR: can't find table '%s'" % ( self._inputCoord )
+                sys.stderr.write( "%s\n" % msg )
+                self.help()
+                sys.exit(1)
+            self._tableCoord = self._inputCoord
+        else:
+            self._tableCoord = self._inputCoord.replace(".","_")
+        if self._inputGenome == "":
+            msg = "ERROR: missing input genome sequences (-g)"
+            sys.stderr.write( "%s\n" % msg )
+            self.help()
+            sys.exit(1)
+        if not os.path.exists( self._inputGenome ):
+            if not self._db.doesTableExist( self._inputGenome ):
+                msg = "ERROR: can't find table '%s'" % ( self._inputGenome )
+                sys.stderr.write( "%s\n" % msg )
+                self.help()
+                sys.exit(1)
+            self._tableGenome = self._inputGenome
+        else:
+            self._tableGenome = self._inputGenome.replace(".","_")
+        if self._inputTErefseq == "":
+            msg = "ERROR: missing input TE reference sequences (-r)"
+            sys.stderr.write( "%s\n" % msg )
+            self.help()
+            sys.exit(1)
+        if not os.path.exists( self._inputTErefseq ):
+            if not self._db.doesTableExist( self._inputTErefseq ):
+                msg = "ERROR: can't find table '%s'" % ( self._inputTErefseq )
+                sys.stderr.write( "%s\n" % msg )
+                self.help()
+                sys.exit(1)
+        if self._outFile == "":
+            self._outFile = "%s.gc" % ( self._inputCoord )
+            
+            
+    def getLengthOfTErefseqs( self ):
+        if os.path.exists( self._inputTErefseq ):
+            return BioseqUtils.getLengthPerSeqFromFile( self._inputTErefseq )
+        else:
+            dTErefseq2Length = {}
+            refseqA = TableSeqAdaptator( self._db, self._inputTErefseq )
+            lAccessions = refseqA.getAccessionsList()
+            for acc in lAccessions:
+                dTErefseq2Length[ acc ] = refseqA.getSeqLengthFromAccession( acc )
+            return dTErefseq2Length
+        
+        
+    def start( self ):
+        self.checkAttributes()
+        if self._verbose > 0:
+            print "START CorrelateTEageWithGCcontent.py"
+            sys.stdout.flush()
+        if os.path.exists( self._inputCoord ):
+            self._db = DbMySql( cfgFileName=self._configFile )
+            self._db.createTable( self._tableCoord, "path", self._inputCoord, True )
+        self._pathA = TablePathAdaptator( self._db, self._tableCoord )
+        if os.path.exists( self._inputGenome ):
+            self._db.createTable( self._tableGenome, "seq", self._inputGenome, True )
+        self._seqA = TableSeqAdaptator( self._db, self._tableGenome )
+        if self._verbose > 0:
+            print "output fasta file: %s" % self._outFile
+            
+            
+    def end( self ):
+        if os.path.exists( self._inputCoord ):
+            self._db.dropTable( self._tableCoord )
+        if os.path.exists( self._inputGenome ):
+            self._db.dropTable( self._tableGenome )
+        self._db.close()
+        if self._verbose > 0:
+            print "END CorrelateTEageWithGCcontent.py"
+            sys.stdout.flush()
+            
+            
+    def run( self ):
+        self.start()
+        
+        dTErefseq2Length = self.getLengthOfTErefseqs()
+        
+        outFileHandler = open( self._outFile, "w" )
+        outFileHandler.write( "copy\tTE\tchr\tlength\tid\tGC\tlengthPerc\n" )
+        
+        lIdentifiers = self._pathA.getIdList()
+        nbTEcopies = len(lIdentifiers)
+        if self._verbose > 0:
+            print "nb of TE copies: %i" % ( nbTEcopies )
+            sys.stdout.flush()
+        count = 0
+        power10 = int( math.floor( math.log10( nbTEcopies ) ) ) - 1
+        for id in lIdentifiers:
+            count += 1
+            if self._verbose > 0 and power10 > 0 and count % math.pow(10,power10) == 0:
+                print "%s / %s" % ( str(count).zfill(power10+2), str(nbTEcopies).zfill(power10+2) )
+                sys.stdout.flush()
+            lPaths = self._pathA.getPathListFromId( id )
+            lSets = PathUtils.getSetListFromQueries( lPaths )
+            lMergedSets = SetUtils.mergeSetsInList( lSets )
+            bs = self._seqA.getBioseqFromSetList( lMergedSets )
+            data = "%i" % id
+            data += "\t%s" % ( bs.header.split("::")[0] )
+            data += "\t%s" % ( lPaths[0].getQueryName() )
+            data += "\t%i" % ( bs.getLength() )
+            data += "\t%.2f" % ( PathUtils.getIdentityFromPathList( lPaths ) )
+            data += "\t%.2f" % ( bs.getGCpercentage() )
+            data += "\t%.2f" % ( 100 * bs.getLength() / float( dTErefseq2Length[ bs.header.split("::")[0] ] ) )
+            outFileHandler.write( "%s\n" % data )
+            
+        outFileHandler.close()
+        
+        self.end()
+        
+        
+if __name__ == "__main__":
+    i = CorrelateTEageWithGCcontent()
+    i.setAttributesFromCmdLine()
+    i.run()