diff commons/tools/SpliceTEsFromGenome.py @ 18:94ab73e8a190

Uploaded
author m-zytnicki
date Mon, 29 Apr 2013 03:20:15 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/commons/tools/SpliceTEsFromGenome.py	Mon Apr 29 03:20:15 2013 -0400
@@ -0,0 +1,193 @@
+#!/usr/bin/env python
+
+import sys
+import os
+import getopt
+
+from commons.core.sql.DbMySql import DbMySql
+from commons.core.seq.FastaUtils import FastaUtils
+from commons.core.coord.MapUtils import MapUtils
+from commons.core.coord.AlignUtils import AlignUtils
+from commons.core.coord.PathUtils import PathUtils
+
+
+class SpliceTEsFromGenome( object ):
+    
+    def __init__( self ):
+        self._inputData = ""
+        self._formatData = ""
+        self._genomeFile = ""
+        self._configFile = ""
+        self._outFile = ""
+        self._verbose = 0
+        self._db = None
+        
+        
+    def help( self ):
+        print
+        print "usage: SpliceTEsFromGenome.py [ options ]"
+        print "options:"
+        print "     -h: this help"
+        print "     -i: input TE coordinates (can be file or table)"
+        print "         TEs as subjects if align or path format"
+        print "     -f: format of the data (map/align/path)"
+        print "     -g: genome file (format=fasta)"
+        print "     -C: configuration file (if table as input)"
+        print "     -o: output fasta file (default=genomeFile+'.splice')"
+        print "     -v: verbosity level (default=0/1)"
+        print
+        
+        
+    def setAttributesFromCmdLine( self ):
+        try:
+            opts, args = getopt.getopt(sys.argv[1:],"hi:f:g:C:o:v:")
+        except getopt.GetoptError, err:
+            msg = "%s" % str(err)
+            sys.stderr.write( "%s\n" % msg )
+            self.help(); sys.exit(1)
+        for o,a in opts:
+            if o == "-h":
+                self.help(); sys.exit(0)
+            elif o == "-i":
+                self._inputData = a
+            elif o == "-f":
+                self._formatData = a
+            elif o == "-g":
+                self._genomeFile = a
+            elif o == "-C":
+                self._configFile = a
+            elif o =="-o":
+                self._outFile = a
+            elif o == "-v":
+                self._verbose = int(a)
+                
+                
+    def checkAttributes( self ):
+        if self._inputData == "":
+            msg = "ERROR: missing input data (-i)"
+            sys.stderr.write( "%s\n" % msg )
+            self.help()
+            sys.exit(1)
+        if not os.path.exists( self._inputData ):
+            if not os.path.exists( self._configFile ):
+                msg = "ERROR: neither input file '%s' nor configuration file '%s'" % ( self._inputData, self._configFile )
+                sys.stderr.write( "%s\n" % msg )
+                self.help()
+                sys.exit(1)
+            if not os.path.exists( self._configFile ):
+                msg = "ERROR: can't find config file '%s'" % ( self._configFile )
+                sys.stderr.write( "%s\n" % msg )
+                sys.exit(1)
+            self._db = DbMySql( cfgFileName=self._configFile )
+            if not self._db.doesTableExist( self._inputData ):
+                msg = "ERROR: can't find table '%s'" % ( self._inputData )
+                sys.stderr.write( "%s\n" % msg )
+                self.help()
+                sys.exit(1)
+        if self._formatData == "":
+            msg = "ERROR: need to precise format (-f)"
+            sys.stderr.write( "%s\n" % msg )
+            self.help()
+            sys.exit(1)
+        if self._formatData not in [ "map", "align", "path" ]:
+            msg = "ERROR: format '%s' not yet supported" % ( self._formatData )
+            sys.stderr.write( "%s\n" % msg )
+            self.help()
+            sys.exit(1)
+        if self._genomeFile == "":
+            msg = "ERROR: missing genome file (-g)"
+            sys.stderr.write( "%s\n" % msg )
+            self.help()
+            sys.exit(1)
+        if not os.path.exists( self._genomeFile ):
+            msg = "ERROR: can't find genome file '%s'" % ( self._genomeFile )
+            sys.stderr.write( "%s\n" % msg )
+            self.help()
+            sys.exit(1)
+        if self._outFile == "":
+            self._outFile = "%s.splice" % ( self._genomeFile )
+            if self._verbose > 0:
+                print "output fasta file: %s" % self._outFile
+                
+                
+    def getCoordsAsMapFile( self ):
+        if self._verbose > 0:
+            print "get TE coordinates as 'Map' file"
+            sys.stdout.flush()
+        if self._db != None:
+            cmd = "srptExportTable.py"
+            cmd += " -i %s" % ( self._inputData )
+            cmd += " -C %s" % ( self._configFile )
+            cmd += " -o %s.%s" % ( self._inputData, self._formatData )
+            returnStatus = os.system( cmd )
+            if returnStatus != 0:
+                msg = "ERROR while exporting data from table"
+                sys.stderr.write( "%s\n" % msg )
+                sys.exit(1)
+            self._inputData += ".%s" % ( self._formatData )
+           
+        if self._formatData == "map":
+            return self._inputData
+        elif self._formatData == "align":
+            mapFile = "%s.map" % ( self._inputData )
+            AlignUtils.convertAlignFileIntoMapFileWithSubjectsOnQueries( self._inputData, mapFile )
+            return mapFile
+        elif self._formatData == "path":
+            mapFile = "%s.map" % ( self._inputData )
+            PathUtils.convertPathFileIntoMapFileWithSubjectsOnQueries( self._inputData, mapFile )
+            return mapFile
+        
+        
+    def mergeCoordsInMapFile( self, mapFile ):
+        if self._verbose > 0:
+            print "merge TE coordinates"
+            sys.stdout.flush()
+        mergeFile = "%s.merge" % ( mapFile )
+        MapUtils.mergeCoordsInFile( mapFile, mergeFile )
+        if self._formatData != "map" or self._db != None:
+            os.remove( mapFile )
+        return mergeFile
+    
+    
+    def spliceFastaFromCoords( self, mergeFile ):
+        if self._verbose > 0:
+            print "splice TE copies from the genome"
+            sys.stdout.flush()
+        FastaUtils.spliceFromCoords( self._genomeFile,
+                                     mergeFile,
+                                     self._outFile )
+    
+        os.remove( mergeFile )
+        
+        
+    def start( self ):
+        self.checkAttributes()
+        if self._verbose > 0:
+            print "START SpliceTEsFromGenome.py"
+            sys.stdout.flush()
+            
+            
+    def end( self ):
+        if self._db != None:
+            self._db.close()
+        if self._verbose > 0:
+            print "END SpliceTEsFromGenome.py"
+            sys.stdout.flush()
+            
+            
+    def run( self ):
+        self.start()
+        
+        mapFile = self.getCoordsAsMapFile()
+
+        mergeFile = self.mergeCoordsInMapFile( mapFile )
+        
+        self.spliceFastaFromCoords( mergeFile )
+        
+        self.end()
+        
+        
+if __name__ == "__main__":
+    i = SpliceTEsFromGenome()
+    i.setAttributesFromCmdLine()
+    i.run()