Mercurial > repos > yufei-luo > s_mart
diff commons/core/utils/FileUtils.py @ 36:44d5973c188c
Uploaded
author | m-zytnicki |
---|---|
date | Tue, 30 Apr 2013 15:02:29 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/commons/core/utils/FileUtils.py Tue Apr 30 15:02:29 2013 -0400 @@ -0,0 +1,445 @@ +# Copyright INRA (Institut National de la Recherche Agronomique) +# http://www.inra.fr +# http://urgi.versailles.inra.fr +# +# This software is governed by the CeCILL license under French law and +# abiding by the rules of distribution of free software. You can use, +# modify and/ or redistribute the software under the terms of the CeCILL +# license as circulated by CEA, CNRS and INRIA at the following URL +# "http://www.cecill.info". +# +# As a counterpart to the access to the source code and rights to copy, +# modify and redistribute granted by the license, users are provided only +# with a limited warranty and the software's author, the holder of the +# economic rights, and the successive licensors have only limited +# liability. +# +# In this respect, the user's attention is drawn to the risks associated +# with loading, using, modifying and/or developing or reproducing the +# software by the user in light of its specific status of free software, +# that may mean that it is complicated to manipulate, and that also +# therefore means that it is reserved for developers and experienced +# professionals having in-depth computer knowledge. Users are therefore +# encouraged to load and test the software's suitability as regards their +# requirements in conditions enabling the security of their systems and/or +# data to be ensured and, more generally, to use and operate it in the +# same conditions as regards security. +# +# The fact that you are presently reading this means that you have had +# knowledge of the CeCILL license and that you accept its terms. + + +import os +import glob +import shutil +import sys +import re +import math +try: + import hashlib +except: + pass + + +class FileUtils( object ): + + ## Return the number of lines in the given file + # + def getNbLinesInSingleFile( fileName ): + fileHandler = open( fileName, "r" ) + lines = fileHandler.readlines() + fileHandler.close() + if (len(lines)>0 and lines[-1]== "\n"): + return (len(lines)-1) + else : + return len(lines) + + getNbLinesInSingleFile = staticmethod( getNbLinesInSingleFile ) + + ## Return the number of lines in the files in the given list + # + def getNbLinesInFileList( lFileNames ): + count = 0 + for fileName in lFileNames: + count += FileUtils.getNbLinesInSingleFile( fileName ) + return count + + getNbLinesInFileList = staticmethod( getNbLinesInFileList ) + + ## Return True if the given file exists, False otherwise + # + def isRessourceExists( fileName ): + return os.path.exists( fileName ) + + isRessourceExists = staticmethod( isRessourceExists ) + + ## Return True if the given file is empty, False otherwise + # + def isEmpty( fileName ): + return 0 == FileUtils.getNbLinesInSingleFile( fileName ) + + isEmpty = staticmethod( isEmpty ) + + ## Return True if both files are identical, False otherwise + # + def are2FilesIdentical( file1, file2 ): + tmpFile = "diff_%s_%s" % ( os.path.basename(file1), os.path.basename(file2) ) + cmd = "diff %s %s >> %s" % ( file1, file2, tmpFile ) + returnStatus = os.system( cmd ) + if returnStatus != 0: + print "WARNING: 'diff' returned '%i'" % returnStatus + os.remove( tmpFile ) + return False + if FileUtils.isEmpty( tmpFile ): + os.remove( tmpFile ) + return True + else: + os.remove( tmpFile ) + return False + + are2FilesIdentical = staticmethod( are2FilesIdentical ) + + ## Return a string with all the content of the files in the given list + # + def getFileContent( lFiles ): + content = "" + lFiles.sort() + for fileName in lFiles: + currentFile = open( fileName, "r" ) + content += currentFile.read() + currentFile.close() + return content + + getFileContent = staticmethod( getFileContent ) + + ## Save content of the given file after having sorted it + # + def sortFileContent( inFile, outFile="" ): + inFileHandler = open(inFile, "r" ) + lines = inFileHandler.readlines() + inFileHandler.close() + lines.sort() + if outFile == "": + outFile = inFile + outFileHandler = open( outFile, "w" ) + outFileHandler.writelines( lines ) + outFileHandler.close() + + sortFileContent = staticmethod( sortFileContent ) + + ## Add end-of-line symbol to the given file content if necessary + # + def addNewLineAtTheEndOfFileContent( fileContent ): + if not fileContent.endswith('\n') and len(fileContent) != 0: + fileContent += '\n' + return fileContent + + addNewLineAtTheEndOfFileContent = staticmethod( addNewLineAtTheEndOfFileContent ) + + ## Concatenate files in the given list + # + def catFilesFromList( lFiles, outFile, sort=True, skipHeaders = False, separator = "" ): + if sort: + lFiles.sort() + outFileHandler = open( outFile, "a" ) + isFirstFile = True + for singleFile in lFiles: + if not isFirstFile: + outFileHandler.write(separator) + isFirstFile = False + singleFileHandler = open( singleFile, "r" ) + if skipHeaders: + singleFileHandler.readline() + line = singleFileHandler.readline() + while line: + outFileHandler.write(line) + line = singleFileHandler.readline() + singleFileHandler.close() + outFileHandler.close() + + catFilesFromList = staticmethod( catFilesFromList ) + + ## Concatenate files according to the given pattern + # + def catFilesByPattern( pattern, outFile, skipHeaders = False, separator = "" ): + lFiles = glob.glob( pattern ) + FileUtils.catFilesFromList( lFiles, outFile, skipHeaders = skipHeaders, separator = separator ) + + catFilesByPattern = staticmethod( catFilesByPattern ) + + ## Remove files listed according to the given pattern + # + # @example prefix="/home/tmp/dummy*.txt" + # + def removeFilesByPattern( prefix ): + lFiles = glob.glob( prefix ) + for f in lFiles: + os.remove( f ) + + removeFilesByPattern = staticmethod( removeFilesByPattern ) + + ## Remove files listed according to the suffixes in the given list + # + def removeFilesBySuffixList( targetPath, lSuffixes ): + if targetPath[-1] == "/": + targetPath = targetPath[:-1] + for suffix in lSuffixes: + pattern = "%s/*%s" % ( targetPath, suffix ) + FileUtils.removeFilesByPattern( pattern ) + + removeFilesBySuffixList = staticmethod( removeFilesBySuffixList ) + + ## Remove repeated blanks in the given file + # + def removeRepeatedBlanks( inFile, outFile="" ): + if outFile == "": + outFile = inFile + tmpFile = "tr_%s_%s" % ( inFile, outFile ) + cmd = "tr -s ' ' < %s > %s" % ( inFile, tmpFile ) + os.system( cmd ) + os.rename( tmpFile, outFile ) + + removeRepeatedBlanks = staticmethod( removeRepeatedBlanks ) + + ## Remove files in the given list + # + @staticmethod + def removeFilesFromList(lFiles): + for f in lFiles: + os.remove(f) + + ## Remove files in the given list if exist + # + @staticmethod + def removeFilesFromListIfExist(lFiles): + for fileName in lFiles: + if FileUtils.isRessourceExists(fileName): + os.remove(fileName) + + ## Append the content of a file to another file + # + # @param inFile string name of the input file + # @param outFile string name of the output file + # + def appendFileContent( inFile, outFile ): + outFileHandler = open( outFile, "a" ) + inFileHandler = open( inFile, "r" ) + shutil.copyfileobj( inFileHandler, outFileHandler ) + inFileHandler.close() + outFileHandler.close() + + appendFileContent = staticmethod( appendFileContent ) + + + ## Replace Windows end-of-line by Unix end-of-line + # + def fromWindowsToUnixEof( inFile ): + tmpFile = "%s.tmp" % ( inFile ) + shutil.copyfile( inFile, tmpFile ) + os.remove( inFile ) + tmpFileHandler = open( tmpFile, "r" ) + inFileHandler = open( inFile, "w" ) + while True: + line = tmpFileHandler.readline() + if line == "": + break + inFileHandler.write( line.replace("\r\n","\n") ) + tmpFileHandler.close() + inFileHandler.close() + os.remove( tmpFile ) + + fromWindowsToUnixEof = staticmethod( fromWindowsToUnixEof ) + + + ## Remove duplicated lines in a file + # + # @note it preserves the initial order and handles blank lines + # + def removeDuplicatedLines( inFile ): + tmpFile = "%s.tmp" % ( inFile ) + shutil.copyfile( inFile, tmpFile ) + os.remove( inFile ) + + tmpFileHandler = open( tmpFile, "r" ) + lLines = list( tmpFileHandler.read().split("\n") ) + if lLines[-1] == "": + del lLines[-1] + sLines = set( lLines ) + tmpFileHandler.close() + os.remove( tmpFile ) + + inFileHandler = open( inFile, "w" ) + for line in lLines: + if line in sLines: + inFileHandler.write( "%s\n" % ( line ) ) + sLines.remove( line ) + inFileHandler.close() + + removeDuplicatedLines = staticmethod( removeDuplicatedLines ) + + + ## Write a list of lines in a given file + # + def writeLineListInFile( inFile, lLines ): + inFileHandler = open( inFile, "w" ) + for line in lLines: + inFileHandler.write( line ) + inFileHandler.close() + + writeLineListInFile = staticmethod( writeLineListInFile ) + + + ## Give the list of absolute path of each directory in the given directory + # + # @param rootPath string absolute path of the given directory + # + # @return lDirPath list of absolute directory path + # + def getAbsoluteDirectoryPathList(rootPath): + lDirPath = [] + lPaths = glob.glob(rootPath + "/*") + for ressource in lPaths: + if os.path.isdir(ressource) : + lDirPath.append(ressource) + return lDirPath + + getAbsoluteDirectoryPathList = staticmethod(getAbsoluteDirectoryPathList) + + + ## Get a sublist of which each element matches/doesn't match a pattern + # + # @param lPath string list of paths + # + # @param pattern string pattern + # + # @param match bool + # + # @return lPathMatching list of path matching pattern + # + def getSubListAccordingToPattern(lPath, pattern, match = True): + lPathMatching = [] + for path in lPath: + if match: + if re.match(".*%s.*" % pattern, path): + lPathMatching.append(path) + else: + if not re.match(".*%s.*" % pattern, path): + lPathMatching.append(path) + return lPathMatching + + getSubListAccordingToPattern = staticmethod(getSubListAccordingToPattern) + + + ## Give the list of file names found in the given directory + # + # @param dirPath string absolute path of the given directory + # + # @return lFilesInDir list of file names + # + def getFileNamesList( dirPath, patternFileFilter = ".*" ): + lFilesInDir = [] + lPaths = glob.glob( dirPath + "/*" ) + for ressource in lPaths: + if os.path.isfile( ressource ): + fileName = os.path.basename( ressource ) + if re.match(patternFileFilter, fileName): + lFilesInDir.append( fileName ) + return lFilesInDir + + getFileNamesList = staticmethod( getFileNamesList ) + + ## Return the MD5 sum of a file + # + def getMd5SecureHash( inFile ): + if "hashlib" in sys.modules: + md5 = hashlib.md5() + inFileHandler = open( inFile, "r" ) + while True: + line = inFileHandler.readline() + if line == "": + break + md5.update( line ) + inFileHandler.close() + return md5.hexdigest() + else: + return "" + + getMd5SecureHash = staticmethod( getMd5SecureHash ) + + ## Cat all files of a given directory + # + # @param dir string directory name + # @param outFileName string output file name + # + def catFilesOfDir(dir, outFileName): + lFiles = FileUtils.getFileNamesList(dir) + lFile2 = [] + for file in lFiles: + lFile2.append(dir + "/" + file) + FileUtils.catFilesFromList(lFile2, outFileName) + + catFilesOfDir = staticmethod(catFilesOfDir) + + ## Return True if size file > 0 octet + # + # @param fileName string file name + # + def isSizeNotNull(fileName): + size = os.path.getsize(fileName) + if size > 0: + return True + return False + + isSizeNotNull = staticmethod(isSizeNotNull) + + ## Split one file into N Files by lines + # + # @param fileName string file name + # @param N int number of files to create + # + @staticmethod + def splitFileIntoNFiles(fileName, N): + nbLine = FileUtils.getNbLinesInSingleFile(fileName) + nbLinesInEachFile = nbLine + if N > nbLine: + N = nbLine + if N != 0: + nbLinesInEachFile = math.ceil(float(nbLine) / N) + else: + N = 1 + filePrefix, fileExt = os.path.splitext(os.path.basename(fileName)) + fileHandler = open(fileName, "r") + for i in range(1,N+1): + with open("%s-%s%s" %(filePrefix, i, fileExt), "w") as f: + j = 0 + while j < nbLinesInEachFile: + j += 1 + f.write(fileHandler.readline()) + fileHandler.close() + + ## Split one file into files of N lines + # + # @param fileName string input file name + # @param N int lines number per files + # + @staticmethod + def splitFileAccordingToLineNumber(fileName, N): + filePrefix, fileExt = os.path.splitext(os.path.basename(fileName)) + with open(fileName) as inF: + fileNb = 1 + line = inF.readline() + if not line or N == 0: + outFileName = "%s-%s%s" %(filePrefix, fileNb, fileExt) + f = open(outFileName, "wb") + shutil.copyfileobj(open(fileName, "rb"), f) + f.close() + else: + while line: + outFileName = "%s-%s%s" %(filePrefix, fileNb, fileExt) + with open(outFileName, "w") as outF: + lineNb = 1 + while lineNb <= N and line: + outF.write(line) + line = inF.readline() + lineNb += 1 + fileNb += 1 \ No newline at end of file