Mercurial > repos > yufei-luo > s_mart
view commons/tools/CorrelateTEageWithGCcontent.py @ 19:9bcfa7936eec
Deleted selected files
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:23:29 -0400 |
parents | 94ab73e8a190 |
children |
line wrap: on
line source
#!/usr/bin/env python import sys import os import getopt import math from commons.core.sql.DbMySql import DbMySql from commons.core.sql.TablePathAdaptator import TablePathAdaptator from commons.core.sql.TableSeqAdaptator import TableSeqAdaptator from commons.core.coord.PathUtils import PathUtils from commons.core.coord.SetUtils import SetUtils from commons.core.seq.BioseqUtils import BioseqUtils class CorrelateTEageWithGCcontent( object ): def __init__( self ): self._inputCoord = "" self._inputGenome = "" self._inputTErefseq = "" self._configFile = "" self._outFile = "" self._verbose = 0 self._db = None self._tableCoord = "" self._pathA = TablePathAdaptator() self._tableGenome = "" self._seqA = TableSeqAdaptator() def help( self ): print print "usage: CorrelateTEageWithGCcontent.py [ options ]" print "options:" print " -h: this help" print " -i: input TE coordinates (can be file or table)" print " TEs as subjects in 'path' format" print " -g: input genome sequences (can be fasta file or table)" print " -r: input TE reference sequences (can be fasta file or table)" print " -C: configuration file (if table as input)" print " -o: output fasta file (default=inputCoord+'.gc')" print " -v: verbosity level (default=0/1)" print def setAttributesFromCmdLine( self ): try: opts, args = getopt.getopt(sys.argv[1:],"hi:g:r:C:o:v:") except getopt.GetoptError, err: msg = "%s" % str(err) sys.stderr.write( "%s\n" % msg ) self.help(); sys.exit(1) for o,a in opts: if o == "-h": self.help(); sys.exit(0) elif o == "-i": self._inputCoord = a elif o == "-g": self._inputGenome = a elif o == "-r": self._inputTErefseq = a elif o == "-C": self._configFile = a elif o =="-o": self._outFile = a elif o == "-v": self._verbose = int(a) def checkAttributes( self ): if self._inputCoord == "": msg = "ERROR: missing input TE coordinates (-i)" sys.stderr.write( "%s\n" % msg ) self.help() sys.exit(1) if not os.path.exists( self._inputCoord ): if not os.path.exists( self._configFile ): msg = "ERROR: neither input file '%s' nor configuration file '%s'" % ( self._inputCoord, self._configFile ) sys.stderr.write( "%s\n" % msg ) self.help() sys.exit(1) if not os.path.exists( self._configFile ): msg = "ERROR: can't find configuration file '%s'" % ( self._configFile ) sys.stderr.write( "%s\n" % msg ) sys.exit(1) self._db = DbMySql( cfgFileName=self._configFile ) if not self._db.exist( self._inputCoord ): msg = "ERROR: can't find table '%s'" % ( self._inputCoord ) sys.stderr.write( "%s\n" % msg ) self.help() sys.exit(1) self._tableCoord = self._inputCoord else: self._tableCoord = self._inputCoord.replace(".","_") if self._inputGenome == "": msg = "ERROR: missing input genome sequences (-g)" sys.stderr.write( "%s\n" % msg ) self.help() sys.exit(1) if not os.path.exists( self._inputGenome ): if not self._db.doesTableExist( self._inputGenome ): msg = "ERROR: can't find table '%s'" % ( self._inputGenome ) sys.stderr.write( "%s\n" % msg ) self.help() sys.exit(1) self._tableGenome = self._inputGenome else: self._tableGenome = self._inputGenome.replace(".","_") if self._inputTErefseq == "": msg = "ERROR: missing input TE reference sequences (-r)" sys.stderr.write( "%s\n" % msg ) self.help() sys.exit(1) if not os.path.exists( self._inputTErefseq ): if not self._db.doesTableExist( self._inputTErefseq ): msg = "ERROR: can't find table '%s'" % ( self._inputTErefseq ) sys.stderr.write( "%s\n" % msg ) self.help() sys.exit(1) if self._outFile == "": self._outFile = "%s.gc" % ( self._inputCoord ) def getLengthOfTErefseqs( self ): if os.path.exists( self._inputTErefseq ): return BioseqUtils.getLengthPerSeqFromFile( self._inputTErefseq ) else: dTErefseq2Length = {} refseqA = TableSeqAdaptator( self._db, self._inputTErefseq ) lAccessions = refseqA.getAccessionsList() for acc in lAccessions: dTErefseq2Length[ acc ] = refseqA.getSeqLengthFromAccession( acc ) return dTErefseq2Length def start( self ): self.checkAttributes() if self._verbose > 0: print "START CorrelateTEageWithGCcontent.py" sys.stdout.flush() if os.path.exists( self._inputCoord ): self._db = DbMySql( cfgFileName=self._configFile ) self._db.createTable( self._tableCoord, "path", self._inputCoord, True ) self._pathA = TablePathAdaptator( self._db, self._tableCoord ) if os.path.exists( self._inputGenome ): self._db.createTable( self._tableGenome, "seq", self._inputGenome, True ) self._seqA = TableSeqAdaptator( self._db, self._tableGenome ) if self._verbose > 0: print "output fasta file: %s" % self._outFile def end( self ): if os.path.exists( self._inputCoord ): self._db.dropTable( self._tableCoord ) if os.path.exists( self._inputGenome ): self._db.dropTable( self._tableGenome ) self._db.close() if self._verbose > 0: print "END CorrelateTEageWithGCcontent.py" sys.stdout.flush() def run( self ): self.start() dTErefseq2Length = self.getLengthOfTErefseqs() outFileHandler = open( self._outFile, "w" ) outFileHandler.write( "copy\tTE\tchr\tlength\tid\tGC\tlengthPerc\n" ) lIdentifiers = self._pathA.getIdList() nbTEcopies = len(lIdentifiers) if self._verbose > 0: print "nb of TE copies: %i" % ( nbTEcopies ) sys.stdout.flush() count = 0 power10 = int( math.floor( math.log10( nbTEcopies ) ) ) - 1 for id in lIdentifiers: count += 1 if self._verbose > 0 and power10 > 0 and count % math.pow(10,power10) == 0: print "%s / %s" % ( str(count).zfill(power10+2), str(nbTEcopies).zfill(power10+2) ) sys.stdout.flush() lPaths = self._pathA.getPathListFromId( id ) lSets = PathUtils.getSetListFromQueries( lPaths ) lMergedSets = SetUtils.mergeSetsInList( lSets ) bs = self._seqA.getBioseqFromSetList( lMergedSets ) data = "%i" % id data += "\t%s" % ( bs.header.split("::")[0] ) data += "\t%s" % ( lPaths[0].getQueryName() ) data += "\t%i" % ( bs.getLength() ) data += "\t%.2f" % ( PathUtils.getIdentityFromPathList( lPaths ) ) data += "\t%.2f" % ( bs.getGCpercentage() ) data += "\t%.2f" % ( 100 * bs.getLength() / float( dTErefseq2Length[ bs.header.split("::")[0] ] ) ) outFileHandler.write( "%s\n" % data ) outFileHandler.close() self.end() if __name__ == "__main__": i = CorrelateTEageWithGCcontent() i.setAttributesFromCmdLine() i.run()