Mercurial > repos > yufei-luo > s_mart
diff commons/tools/SpliceTEsFromGenome.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env python
"""Command-line tool that splices TE copies out of a genome fasta file.

The TE coordinates are read either from a file or from a MySQL table, in
'map', 'align' or 'path' format. They are converted to a 'map' file when
needed, merged, and then handed to FastaUtils.spliceFromCoords together
with the genome file to produce the output fasta file.

The syntax used here (parenthesised single-argument ``print``,
``except ... as``) is valid on both Python 2.6+ and Python 3.
"""

import sys
import os
import getopt
import subprocess

from commons.core.sql.DbMySql import DbMySql
from commons.core.seq.FastaUtils import FastaUtils
from commons.core.coord.MapUtils import MapUtils
from commons.core.coord.AlignUtils import AlignUtils
from commons.core.coord.PathUtils import PathUtils


class SpliceTEsFromGenome(object):
    """Driver object: parse options, validate them, then run the pipeline."""

    # Formats accepted by the -f option.
    _SUPPORTED_FORMATS = ("map", "align", "path")

    def __init__(self):
        self._inputData = ""    # -i: coordinates file, or table name
        self._formatData = ""   # -f: one of _SUPPORTED_FORMATS
        self._genomeFile = ""   # -g: genome fasta file
        self._configFile = ""   # -C: configuration file (table input only)
        self._outFile = ""      # -o: output fasta file
        self._verbose = 0       # -v: verbosity level
        # Set by checkAttributes() only when the input is a table; the other
        # methods also use "is not None" as the "input came from a table" flag.
        self._db = None

    def help(self):
        """Print the usage message on standard output."""
        usageLines = (
            "",
            "usage: SpliceTEsFromGenome.py [ options ]",
            "options:",
            " -h: this help",
            " -i: input TE coordinates (can be file or table)",
            " TEs as subjects if align or path format",
            " -f: format of the data (map/align/path)",
            " -g: genome file (format=fasta)",
            " -C: configuration file (if table as input)",
            " -o: output fasta file (default=genomeFile+'.splice')",
            " -v: verbosity level (default=0/1)",
            "",
        )
        for line in usageLines:
            # print("") emits a bare newline on Python 2 and 3 alike.
            print(line)

    def _fail(self, msg, showHelp=True):
        """Write msg to stderr, optionally print the usage, and exit.

        Raises:
            SystemExit: always, with exit status 1.
        """
        sys.stderr.write("%s\n" % msg)
        if showHelp:
            self.help()
        sys.exit(1)

    def setAttributesFromCmdLine(self):
        """Fill the attributes from the options found in sys.argv.

        Raises:
            SystemExit: status 0 after -h; status 1 on an unknown option,
                a missing option argument, or a non-integer -v value.
        """
        try:
            opts, _ = getopt.getopt(sys.argv[1:], "hi:f:g:C:o:v:")
        except getopt.GetoptError as err:
            self._fail("%s" % str(err))
        for o, a in opts:
            if o == "-h":
                self.help()
                sys.exit(0)
            elif o == "-i":
                self._inputData = a
            elif o == "-f":
                self._formatData = a
            elif o == "-g":
                self._genomeFile = a
            elif o == "-C":
                self._configFile = a
            elif o == "-o":
                self._outFile = a
            elif o == "-v":
                try:
                    self._verbose = int(a)
                except ValueError:
                    # Report a usage error instead of an uncaught traceback.
                    self._fail(
                        "ERROR: verbosity level (-v) should be an integer,"
                        " not '%s'" % a
                    )

    def checkAttributes(self):
        """Validate the options and open the database if the input is a table.

        When the -i value is not an existing file it is treated as a table
        name: the configuration file must then exist and the table must be
        found through DbMySql. The output file name defaults to the genome
        file name followed by '.splice'.

        Raises:
            SystemExit: status 1 on the first invalid or missing option.
        """
        if self._inputData == "":
            self._fail("ERROR: missing input data (-i)")
        if not os.path.exists(self._inputData):
            # Not a file, so the input has to be a table.
            if not os.path.exists(self._configFile):
                self._fail(
                    "ERROR: neither input file '%s' nor configuration file '%s'"
                    % (self._inputData, self._configFile)
                )
            self._db = DbMySql(cfgFileName=self._configFile)
            if not self._db.doesTableExist(self._inputData):
                self._fail("ERROR: can't find table '%s'" % (self._inputData))
        if self._formatData == "":
            self._fail("ERROR: need to precise format (-f)")
        if self._formatData not in self._SUPPORTED_FORMATS:
            self._fail(
                "ERROR: format '%s' not yet supported" % (self._formatData)
            )
        if self._genomeFile == "":
            self._fail("ERROR: missing genome file (-g)")
        if not os.path.exists(self._genomeFile):
            self._fail(
                "ERROR: can't find genome file '%s'" % (self._genomeFile)
            )
        if self._outFile == "":
            self._outFile = "%s.splice" % (self._genomeFile)
            # NOTE(review): the source this was restored from had lost its
            # indentation; this message is assumed to belong to the
            # default-name branch -- confirm against the repository.
            if self._verbose > 0:
                print("output fasta file: %s" % self._outFile)

    def getCoordsAsMapFile(self):
        """Return the name of a 'map' file holding the TE coordinates.

        Table input is first exported with srptExportTable.py to
        '<table>.<format>', which then becomes the input file. 'align' and
        'path' inputs are converted into '<input>.map'; 'map' input is
        returned unchanged.

        Raises:
            SystemExit: status 1 if the table export fails.
        """
        if self._verbose > 0:
            print("get TE coordinates as 'Map' file")
            sys.stdout.flush()
        if self._db is not None:
            exportFile = "%s.%s" % (self._inputData, self._formatData)
            # An argument list (no shell) keeps table names and paths with
            # spaces or shell metacharacters intact.
            cmd = [
                "srptExportTable.py",
                "-i", self._inputData,
                "-C", self._configFile,
                "-o", exportFile,
            ]
            try:
                returnStatus = subprocess.call(cmd)
            except OSError:
                # The exporter could not be launched at all; report it the
                # same way as a failing export.
                returnStatus = 1
            if returnStatus != 0:
                self._fail(
                    "ERROR while exporting data from table", showHelp=False
                )
            self._inputData = exportFile

        if self._formatData == "map":
            return self._inputData
        mapFile = "%s.map" % (self._inputData)
        if self._formatData == "align":
            AlignUtils.convertAlignFileIntoMapFileWithSubjectsOnQueries(
                self._inputData, mapFile
            )
            return mapFile
        if self._formatData == "path":
            PathUtils.convertPathFileIntoMapFileWithSubjectsOnQueries(
                self._inputData, mapFile
            )
            return mapFile

    def mergeCoordsInMapFile(self, mapFile):
        """Merge the coordinates of mapFile into '<mapFile>.merge'.

        mapFile is deleted afterwards when this script produced it, i.e.
        when it was converted from another format or exported from a table;
        a 'map' file supplied directly by the user is kept.

        Returns:
            The name of the merged file.
        """
        if self._verbose > 0:
            print("merge TE coordinates")
            sys.stdout.flush()
        mergeFile = "%s.merge" % (mapFile)
        MapUtils.mergeCoordsInFile(mapFile, mergeFile)
        if self._formatData != "map" or self._db is not None:
            os.remove(mapFile)
        return mergeFile

    def spliceFastaFromCoords(self, mergeFile):
        """Run FastaUtils.spliceFromCoords on the genome, then delete mergeFile."""
        if self._verbose > 0:
            print("splice TE copies from the genome")
            sys.stdout.flush()
        FastaUtils.spliceFromCoords(self._genomeFile, mergeFile, self._outFile)
        os.remove(mergeFile)

    def start(self):
        """Validate the attributes and announce the start when verbose."""
        self.checkAttributes()
        if self._verbose > 0:
            print("START SpliceTEsFromGenome.py")
            sys.stdout.flush()

    def end(self):
        """Close the database if one was opened and announce the end when verbose."""
        if self._db is not None:
            self._db.close()
        if self._verbose > 0:
            print("END SpliceTEsFromGenome.py")
            sys.stdout.flush()

    def run(self):
        """Run the pipeline: check, get the map file, merge, splice, clean up."""
        self.start()

        mapFile = self.getCoordsAsMapFile()

        mergeFile = self.mergeCoordsInMapFile(mapFile)

        self.spliceFastaFromCoords(mergeFile)

        self.end()


if __name__ == "__main__":
    i = SpliceTEsFromGenome()
    i.setAttributesFromCmdLine()
    i.run()