Mercurial > repos > yufei-luo > s_mart
diff commons/tools/CalcCoordCumulLength.py @ 18:94ab73e8a190
Uploaded
author | m-zytnicki |
---|---|
date | Mon, 29 Apr 2013 03:20:15 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/tools/CalcCoordCumulLength.py Mon Apr 29 03:20:15 2013 -0400 @@ -0,0 +1,186 @@ +#!/usr/bin/env python + +""" +Calculate the cumulative length of coordinates data in the L{Map<commons.coreMap>} format. +""" + +import os +import sys +import getopt +from pyRepet.launcher.programLauncher import programLauncher +from pyRepet.util.Stat import Stat +from commons.core.checker.CheckerUtils import CheckerUtils + + +class CalcCoordCumulLength( object ): + """ + Compute the coverage of coordinates data in the L{Map<commons.core.ccommons.core """ + + + def __init__( self ): + """ + Constructor. + """ + self._inFileName = "" + self._outFileName = "" + self._verbose = 0 + + + def help( self ): + """ + Display the help on stdout. + """ + print + print "usage:",sys.argv[0]," [ options ]" + print "options:" + print " -h: this help" + print " -i: name of the input file (format='map')" + print " -o: name of the output file (default=inFileName+'.coverage')" + print + + + def setAttributesFromCmdLine( self ): + """ + Set the attributes from the command-line. + """ + try: + opts, args = getopt.getopt(sys.argv[1:],"hi:o:v:") + except getopt.GetoptError, err: + print str(err); self.help(); sys.exit(1) + for o,a in opts: + if o == "-h": + self.help(); sys.exit(0) + elif o == "-i": + self.setInputFileName( a ) + elif o == "-o": + self._outFileName = a + elif o == "-v": + self._verbose = int(a) + + + def setInputFileName( self, inFileName ): + self._inFileName = inFileName + + def setVerbose( self, verbose ): + self._verbose = int(verbose) + + def checkAttributes( self ): + """ + Check the attributes are valid before running the algorithm. + """ + if self._inFileName == "": + print "ERROR: missing input file" + self.help(); sys.exit(1) + if not os.path.exists( self._inFileName ): + print "ERROR: can't find file '%s'" % ( self._inFileName ) + self.help(); sys.exit(1) + if self._outFileName == "": + self._outFileName = "%s.coverage" % ( self._inFileName ) + + + def mergeCoordinates( self ): + """ + Merge the coordinates with 'mapOp'. + """ + if self._verbose > 0: + print "merge the coordinates with mapOp..."; sys.stdout.flush() + if not CheckerUtils.isExecutableInUserPath( "mapOp" ): + msg = "ERROR: 'mapOp' is not in your PATH" + sys.stderr.write( "%s\n" % msg ) + sys.exit(1) + pL = programLauncher() + prg = os.environ["REPET_PATH"] + "/bin/mapOp" + cmd = prg + cmd += " -q %s" % ( self._inFileName ) + cmd += " -m" + cmd += " > /dev/null" + pL.launch( prg, cmd, self._verbose - 1 ) + if self._verbose > 0: + print "coordinates merged !"; sys.stdout.flush() + mergeFileName = "%s.merge" % ( self._inFileName ) + inPath, inName = os.path.split( self._inFileName ) + if inPath != "": + os.system( "mv %s.merge %s" % ( inName, inPath ) ) + return mergeFileName + + + def getStatsPerChr( self, mergeFileName ): + """ + Return summary statistics on the coordinates, per chromosome. + @param mergeFileName: L{Map<commons.core.coord.Macommons.coreype mergeFileName: string + @return: dictionary whose keys are the chromosomes of the 'map file and values are L{Stat<pyRepet.util.Stat>} instances + """ + dChr2Stats = {} + if self._verbose > 0: + print "compute the coverage of the coordinates..."; sys.stdout.flush() + mergeF = open( mergeFileName, "r" ) + line = mergeF.readline() + while True: + if line == "": break + tokens = line[:-1].split("\t") + if int(tokens[2]) < int(tokens[3]): + matchLength = int(tokens[3]) - int(tokens[2]) + 1 + elif int(tokens[2]) > int(tokens[3]): + matchLength = int(tokens[2]) - int(tokens[3]) + 1 + if not dChr2Stats.has_key( tokens[1] ): + dChr2Stats[ tokens[1] ] = Stat() + dChr2Stats[ tokens[1] ].add( matchLength ) + line = mergeF.readline() + mergeF.close() + os.remove( mergeFileName ) + return dChr2Stats + + + def saveCumulLength( self, dChr2Stats ): + """ + Write the stats in the output file. + """ + outF = open( self._outFileName, "w" ) + totalLength = 0 + for i in dChr2Stats.keys(): + totalLength += dChr2Stats[i].sum + string = "cumulative length (in bp) on '%s': %i" % ( i, dChr2Stats[i].sum ) + outF.write( "%s\n" % ( string ) ) + if self._verbose > 0: print string + string = "total cumulative length (in bp): %i" % ( totalLength ) + outF.write( "%s\n" % ( string ) ) + if self._verbose > 0: print string + outF.close() + sys.stdout.flush() + + + def start( self ): + """ + Useful commands before running the program. + """ + if self._verbose > 0: + print "beginning of %s" % (sys.argv[0].split("/")[-1]); sys.stdout.flush() + self.checkAttributes() + if self._verbose > 0: + print "input file : '%s'" % ( self._inFileName ) + sys.stdout.flush() + + + def end( self ): + """ + Useful commands before ending the program. + """ + if self._verbose > 0: + print "%s finished successfully" % (sys.argv[0].split("/")[-1]); sys.stdout.flush() + + + def run( self ): + """ + Run the program. + """ + self.start() + mergeFileName = self.mergeCoordinates() + dChr2Stats = self.getStatsPerChr( mergeFileName ) + self.saveCumulLength( dChr2Stats ) + self.end() + + +if __name__ == '__main__': + i = CalcCoordCumulLength() + i.setAttributesFromCmdLine() + i.run()