Mercurial > repos > yufei-luo > s_mart
diff commons/tools/CorrelateTEageWithGCcontent.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/tools/CorrelateTEageWithGCcontent.py Mon Apr 29 03:20:15 2013 -0400 @@ -0,0 +1,204 @@ +#!/usr/bin/env python + +import sys +import os +import getopt +import math +from commons.core.sql.DbMySql import DbMySql +from commons.core.sql.TablePathAdaptator import TablePathAdaptator +from commons.core.sql.TableSeqAdaptator import TableSeqAdaptator +from commons.core.coord.PathUtils import PathUtils +from commons.core.coord.SetUtils import SetUtils +from commons.core.seq.BioseqUtils import BioseqUtils + + +class CorrelateTEageWithGCcontent( object ): + + def __init__( self ): + self._inputCoord = "" + self._inputGenome = "" + self._inputTErefseq = "" + self._configFile = "" + self._outFile = "" + self._verbose = 0 + self._db = None + self._tableCoord = "" + self._pathA = TablePathAdaptator() + self._tableGenome = "" + self._seqA = TableSeqAdaptator() + + + def help( self ): + print + print "usage: CorrelateTEageWithGCcontent.py [ options ]" + print "options:" + print " -h: this help" + print " -i: input TE coordinates (can be file or table)" + print " TEs as subjects in 'path' format" + print " -g: input genome sequences (can be fasta file or table)" + print " -r: input TE reference sequences (can be fasta file or table)" + print " -C: configuration file (if table as input)" + print " -o: output fasta file (default=inputCoord+'.gc')" + print " -v: verbosity level (default=0/1)" + print + + + def setAttributesFromCmdLine( self ): + try: + opts, args = getopt.getopt(sys.argv[1:],"hi:g:r:C:o:v:") + except getopt.GetoptError, err: + msg = "%s" % str(err) + sys.stderr.write( "%s\n" % msg ) + self.help(); sys.exit(1) + for o,a in opts: + if o == "-h": + self.help(); sys.exit(0) + elif o == "-i": + self._inputCoord = a + elif o == "-g": + self._inputGenome = a + elif o == "-r": + self._inputTErefseq = a + elif o == "-C": + self._configFile = a + elif o =="-o": + self._outFile = a + elif o == "-v": + self._verbose = int(a) + + + def checkAttributes( self ): + if self._inputCoord == "": + msg = "ERROR: missing input TE coordinates (-i)" + sys.stderr.write( "%s\n" % msg ) + self.help() + sys.exit(1) + if not os.path.exists( self._inputCoord ): + if not os.path.exists( self._configFile ): + msg = "ERROR: neither input file '%s' nor configuration file '%s'" % ( self._inputCoord, self._configFile ) + sys.stderr.write( "%s\n" % msg ) + self.help() + sys.exit(1) + if not os.path.exists( self._configFile ): + msg = "ERROR: can't find configuration file '%s'" % ( self._configFile ) + sys.stderr.write( "%s\n" % msg ) + sys.exit(1) + self._db = DbMySql( cfgFileName=self._configFile ) + if not self._db.exist( self._inputCoord ): + msg = "ERROR: can't find table '%s'" % ( self._inputCoord ) + sys.stderr.write( "%s\n" % msg ) + self.help() + sys.exit(1) + self._tableCoord = self._inputCoord + else: + self._tableCoord = self._inputCoord.replace(".","_") + if self._inputGenome == "": + msg = "ERROR: missing input genome sequences (-g)" + sys.stderr.write( "%s\n" % msg ) + self.help() + sys.exit(1) + if not os.path.exists( self._inputGenome ): + if not self._db.doesTableExist( self._inputGenome ): + msg = "ERROR: can't find table '%s'" % ( self._inputGenome ) + sys.stderr.write( "%s\n" % msg ) + self.help() + sys.exit(1) + self._tableGenome = self._inputGenome + else: + self._tableGenome = self._inputGenome.replace(".","_") + if self._inputTErefseq == "": + msg = "ERROR: missing input TE reference sequences (-r)" + sys.stderr.write( "%s\n" % msg ) + self.help() + sys.exit(1) + if not os.path.exists( self._inputTErefseq ): + if not self._db.doesTableExist( self._inputTErefseq ): + msg = "ERROR: can't find table '%s'" % ( self._inputTErefseq ) + sys.stderr.write( "%s\n" % msg ) + self.help() + sys.exit(1) + if self._outFile == "": + self._outFile = "%s.gc" % ( self._inputCoord ) + + + def getLengthOfTErefseqs( self ): + if os.path.exists( self._inputTErefseq ): + return BioseqUtils.getLengthPerSeqFromFile( self._inputTErefseq ) + else: + dTErefseq2Length = {} + refseqA = TableSeqAdaptator( self._db, self._inputTErefseq ) + lAccessions = refseqA.getAccessionsList() + for acc in lAccessions: + dTErefseq2Length[ acc ] = refseqA.getSeqLengthFromAccession( acc ) + return dTErefseq2Length + + + def start( self ): + self.checkAttributes() + if self._verbose > 0: + print "START CorrelateTEageWithGCcontent.py" + sys.stdout.flush() + if os.path.exists( self._inputCoord ): + self._db = DbMySql( cfgFileName=self._configFile ) + self._db.createTable( self._tableCoord, "path", self._inputCoord, True ) + self._pathA = TablePathAdaptator( self._db, self._tableCoord ) + if os.path.exists( self._inputGenome ): + self._db.createTable( self._tableGenome, "seq", self._inputGenome, True ) + self._seqA = TableSeqAdaptator( self._db, self._tableGenome ) + if self._verbose > 0: + print "output fasta file: %s" % self._outFile + + + def end( self ): + if os.path.exists( self._inputCoord ): + self._db.dropTable( self._tableCoord ) + if os.path.exists( self._inputGenome ): + self._db.dropTable( self._tableGenome ) + self._db.close() + if self._verbose > 0: + print "END CorrelateTEageWithGCcontent.py" + sys.stdout.flush() + + + def run( self ): + self.start() + + dTErefseq2Length = self.getLengthOfTErefseqs() + + outFileHandler = open( self._outFile, "w" ) + outFileHandler.write( "copy\tTE\tchr\tlength\tid\tGC\tlengthPerc\n" ) + + lIdentifiers = self._pathA.getIdList() + nbTEcopies = len(lIdentifiers) + if self._verbose > 0: + print "nb of TE copies: %i" % ( nbTEcopies ) + sys.stdout.flush() + count = 0 + power10 = int( math.floor( math.log10( nbTEcopies ) ) ) - 1 + for id in lIdentifiers: + count += 1 + if self._verbose > 0 and power10 > 0 and count % math.pow(10,power10) == 0: + print "%s / %s" % ( str(count).zfill(power10+2), str(nbTEcopies).zfill(power10+2) ) + sys.stdout.flush() + lPaths = self._pathA.getPathListFromId( id ) + lSets = PathUtils.getSetListFromQueries( lPaths ) + lMergedSets = SetUtils.mergeSetsInList( lSets ) + bs = self._seqA.getBioseqFromSetList( lMergedSets ) + data = "%i" % id + data += "\t%s" % ( bs.header.split("::")[0] ) + data += "\t%s" % ( lPaths[0].getQueryName() ) + data += "\t%i" % ( bs.getLength() ) + data += "\t%.2f" % ( PathUtils.getIdentityFromPathList( lPaths ) ) + data += "\t%.2f" % ( bs.getGCpercentage() ) + data += "\t%.2f" % ( 100 * bs.getLength() / float( dTErefseq2Length[ bs.header.split("::")[0] ] ) ) + outFileHandler.write( "%s\n" % data ) + + outFileHandler.close() + + self.end() + + +if __name__ == "__main__": + i = CorrelateTEageWithGCcontent() + i.setAttributesFromCmdLine() + i.run()