view commons/core/utils/FileUtils.py @ 69:1473ab954708 draft

Corrected bug in "CollapsedReads" XML file.
author m-zytnicki
date Wed, 18 Nov 2015 10:59:02 -0500
parents 44d5973c188c
children
line wrap: on
line source

# Copyright INRA (Institut National de la Recherche Agronomique)
# http://www.inra.fr
# http://urgi.versailles.inra.fr
#
# This software is governed by the CeCILL license under French law and
# abiding by the rules of distribution of free software.  You can  use, 
# modify and/ or redistribute the software under the terms of the CeCILL
# license as circulated by CEA, CNRS and INRIA at the following URL
# "http://www.cecill.info". 
#
# As a counterpart to the access to the source code and  rights to copy,
# modify and redistribute granted by the license, users are provided only
# with a limited warranty  and the software's author,  the holder of the
# economic rights,  and the successive licensors  have only  limited
# liability. 
#
# In this respect, the user's attention is drawn to the risks associated
# with loading,  using,  modifying and/or developing or reproducing the
# software by the user in light of its specific status of free software,
# that may mean  that it is complicated to manipulate,  and  that  also
# therefore means  that it is reserved for developers  and  experienced
# professionals having in-depth computer knowledge. Users are therefore
# encouraged to load and test the software's suitability as regards their
# requirements in conditions enabling the security of their systems and/or 
# data to be ensured and,  more generally, to use and operate it in the 
# same conditions as regards security. 
#
# The fact that you are presently reading this means that you have had
# knowledge of the CeCILL license and that you accept its terms.


import os
import glob
import shutil
import sys
import re
import math
try:
    import hashlib
except:
    pass


class FileUtils( object ):
    
    ## Return the number of lines in the given file
    #
    def getNbLinesInSingleFile( fileName ):
        fileHandler = open( fileName, "r" )
        lines = fileHandler.readlines()
        fileHandler.close()
        if (len(lines)>0 and lines[-1]== "\n"):
            return (len(lines)-1)
        else :
            return len(lines)
    
    getNbLinesInSingleFile = staticmethod( getNbLinesInSingleFile )
    
    ## Return the number of lines in the files in the given list
    #
    def getNbLinesInFileList( lFileNames ):
        count = 0
        for fileName in lFileNames:
            count += FileUtils.getNbLinesInSingleFile( fileName )
        return count
    
    getNbLinesInFileList = staticmethod( getNbLinesInFileList )
    
    ## Return True if the given file exists, False otherwise
    #
    def isRessourceExists( fileName ):
        return os.path.exists( fileName )
    
    isRessourceExists = staticmethod( isRessourceExists )
    
    ## Return True if the given file is empty, False otherwise
    #
    def isEmpty( fileName ):
        return 0 == FileUtils.getNbLinesInSingleFile( fileName )
    
    isEmpty = staticmethod( isEmpty )
    
    ## Return True if both files are identical, False otherwise
    #
    def are2FilesIdentical( file1, file2 ):
        tmpFile = "diff_%s_%s" % ( os.path.basename(file1), os.path.basename(file2) )
        cmd = "diff %s %s >> %s" % ( file1, file2, tmpFile )
        returnStatus = os.system( cmd )
        if returnStatus != 0:
            print "WARNING: 'diff' returned '%i'" % returnStatus
            os.remove( tmpFile )
            return False
        if FileUtils.isEmpty( tmpFile ):
            os.remove( tmpFile )
            return True
        else:
            os.remove( tmpFile )
            return False
        
    are2FilesIdentical = staticmethod( are2FilesIdentical )
    
    ## Return a string with all the content of the files in the given list
    #
    def getFileContent( lFiles ):
        content = ""
        lFiles.sort()
        for fileName in lFiles:
            currentFile = open( fileName, "r" )
            content += currentFile.read()
            currentFile.close()
        return content
    
    getFileContent = staticmethod( getFileContent )
    
    ## Save content of the given file after having sorted it
    #
    def sortFileContent( inFile, outFile="" ):
        inFileHandler = open(inFile, "r" )
        lines = inFileHandler.readlines()
        inFileHandler.close()
        lines.sort()
        if outFile == "":
            outFile = inFile
        outFileHandler = open( outFile, "w" )
        outFileHandler.writelines( lines )
        outFileHandler.close()
        
    sortFileContent = staticmethod( sortFileContent )
    
    ## Add end-of-line symbol to the given file content if necessary
    #
    def addNewLineAtTheEndOfFileContent( fileContent ):
        if not fileContent.endswith('\n')  and  len(fileContent) != 0:
            fileContent += '\n'
        return fileContent
    
    addNewLineAtTheEndOfFileContent = staticmethod( addNewLineAtTheEndOfFileContent )
    
    ## Concatenate files in the given list
    #
    def catFilesFromList( lFiles, outFile, sort=True, skipHeaders = False, separator = "" ):
        if sort:
            lFiles.sort()
        outFileHandler = open( outFile, "a" )
        isFirstFile = True
        for singleFile in lFiles:
            if not isFirstFile:
                outFileHandler.write(separator)
            isFirstFile = False
            singleFileHandler = open( singleFile, "r" )
            if skipHeaders:
                singleFileHandler.readline()
            line = singleFileHandler.readline()
            while line:
                outFileHandler.write(line)
                line = singleFileHandler.readline()
            singleFileHandler.close()
        outFileHandler.close()
        
    catFilesFromList = staticmethod( catFilesFromList )
    
    ## Concatenate files according to the given pattern
    #
    def catFilesByPattern( pattern, outFile, skipHeaders = False, separator = "" ):
        lFiles = glob.glob( pattern )
        FileUtils.catFilesFromList( lFiles, outFile, skipHeaders = skipHeaders, separator = separator )
        
    catFilesByPattern = staticmethod( catFilesByPattern )
    
    ## Remove files listed according to the given pattern
    #
    # @example prefix="/home/tmp/dummy*.txt"
    #
    def removeFilesByPattern( prefix ):
        lFiles = glob.glob( prefix )
        for f in lFiles:
            os.remove( f )
            
    removeFilesByPattern = staticmethod( removeFilesByPattern )
    
    ## Remove files listed according to the suffixes in the given list
    #
    def removeFilesBySuffixList( targetPath, lSuffixes ):
        if targetPath[-1] == "/":
            targetPath = targetPath[:-1]
        for suffix in lSuffixes:
            pattern = "%s/*%s" % ( targetPath, suffix )
            FileUtils.removeFilesByPattern( pattern )
            
    removeFilesBySuffixList = staticmethod( removeFilesBySuffixList )
    
    ## Remove repeated blanks in the given file
    #
    def removeRepeatedBlanks( inFile, outFile="" ):
        if outFile == "":
            outFile = inFile
        tmpFile = "tr_%s_%s" % ( inFile, outFile )
        cmd = "tr -s ' ' < %s > %s" % ( inFile, tmpFile )
        os.system( cmd )
        os.rename( tmpFile, outFile )
        
    removeRepeatedBlanks = staticmethod( removeRepeatedBlanks )
    
    ## Remove files in the given list
    #
    @staticmethod
    def removeFilesFromList(lFiles):
        for f in lFiles:
            os.remove(f)
    
    ## Remove files in the given list if exist
    #
    @staticmethod
    def removeFilesFromListIfExist(lFiles):
        for fileName in lFiles:
            if FileUtils.isRessourceExists(fileName):
                os.remove(fileName)
    
    ## Append the content of a file to another file
    #
    # @param inFile string name of the input file
    # @param outFile string name of the output file
    #
    def appendFileContent( inFile, outFile ):
        outFileHandler = open( outFile, "a" )
        inFileHandler = open( inFile, "r" )
        shutil.copyfileobj( inFileHandler, outFileHandler )
        inFileHandler.close()
        outFileHandler.close()
        
    appendFileContent = staticmethod( appendFileContent )
    
    
    ## Replace Windows end-of-line by Unix end-of-line
    #
    def fromWindowsToUnixEof( inFile ):
        tmpFile = "%s.tmp" % ( inFile )
        shutil.copyfile( inFile, tmpFile )
        os.remove( inFile )
        tmpFileHandler = open( tmpFile, "r" )
        inFileHandler = open( inFile, "w" )
        while True:
            line = tmpFileHandler.readline()
            if line == "":
                break
            inFileHandler.write( line.replace("\r\n","\n") )
        tmpFileHandler.close()
        inFileHandler.close()
        os.remove( tmpFile )
        
    fromWindowsToUnixEof = staticmethod( fromWindowsToUnixEof )


    ## Remove duplicated lines in a file
    #
    # @note it preserves the initial order and handles blank lines
    #
    def removeDuplicatedLines( inFile ):
        tmpFile = "%s.tmp" % ( inFile )
        shutil.copyfile( inFile, tmpFile )
        os.remove( inFile )
        
        tmpFileHandler = open( tmpFile, "r" )
        lLines = list( tmpFileHandler.read().split("\n") )
        if lLines[-1] == "":
            del lLines[-1]
        sLines = set( lLines )
        tmpFileHandler.close()
        os.remove( tmpFile )
        
        inFileHandler = open( inFile, "w" )
        for line in lLines:
            if line in sLines:
                inFileHandler.write( "%s\n" % ( line ) )
                sLines.remove( line )
        inFileHandler.close()
        
    removeDuplicatedLines = staticmethod( removeDuplicatedLines )
    
    
    ## Write a list of lines in a given file
    #
    def writeLineListInFile( inFile, lLines ):
        inFileHandler = open( inFile, "w" )
        for line in lLines:
            inFileHandler.write( line )
        inFileHandler.close()
        
    writeLineListInFile = staticmethod( writeLineListInFile )
    
    
    ## Give the list of absolute path of each directory in the given directory
    #
    # @param rootPath string absolute path of the given directory
    #
    # @return lDirPath list of absolute directory path
    #
    def getAbsoluteDirectoryPathList(rootPath):
        lDirPath = []
        lPaths = glob.glob(rootPath + "/*")
        for ressource in lPaths:
            if os.path.isdir(ressource) :
                lDirPath.append(ressource)
        return lDirPath
    
    getAbsoluteDirectoryPathList = staticmethod(getAbsoluteDirectoryPathList)
    
    
    ## Get a sublist of which each element matches/doesn't match a pattern
    #
    # @param lPath string list of paths
    #
    # @param pattern string pattern
    #
    # @param match bool 
    #
    # @return lPathMatching list of path matching pattern
    #
    def getSubListAccordingToPattern(lPath, pattern, match = True):
        lPathMatching = []
        for path in lPath:
            if match:
                if re.match(".*%s.*" % pattern, path):
                    lPathMatching.append(path)
            else:
                if not re.match(".*%s.*" % pattern, path):
                    lPathMatching.append(path)
        return lPathMatching
    
    getSubListAccordingToPattern = staticmethod(getSubListAccordingToPattern)
    
    
    ## Give the list of file names found in the given directory
    #
    # @param dirPath string absolute path of the given directory
    #
    # @return lFilesInDir list of file names
    #
    def getFileNamesList( dirPath, patternFileFilter = ".*" ):
        lFilesInDir = []
        lPaths = glob.glob( dirPath + "/*" )
        for ressource in lPaths:
            if os.path.isfile( ressource ):
                fileName = os.path.basename( ressource )
                if re.match(patternFileFilter, fileName):
                    lFilesInDir.append( fileName )
        return lFilesInDir
    
    getFileNamesList = staticmethod( getFileNamesList )
    
    ## Return the MD5 sum of a file
    #
    def getMd5SecureHash( inFile ):
        if "hashlib" in sys.modules:
            md5 = hashlib.md5()
            inFileHandler = open( inFile, "r" )
            while True:
                line = inFileHandler.readline()
                if line == "":
                    break
                md5.update( line )
            inFileHandler.close()
            return md5.hexdigest()
        else:
            return ""
        
    getMd5SecureHash = staticmethod( getMd5SecureHash )
    
    ## Cat all files of a given directory
    #
    # @param dir string directory name
    # @param outFileName string output file name
    #
    def catFilesOfDir(dir, outFileName):
        lFiles = FileUtils.getFileNamesList(dir)
        lFile2 = []
        for file in lFiles:
            lFile2.append(dir + "/" + file)
        FileUtils.catFilesFromList(lFile2, outFileName)
        
    catFilesOfDir = staticmethod(catFilesOfDir)
    
    ## Return True if size file > 0 octet
    #
    # @param fileName string file name
    #
    def isSizeNotNull(fileName):
        size = os.path.getsize(fileName)
        if size > 0:
            return True
        return False
        
    isSizeNotNull = staticmethod(isSizeNotNull)
    
    ## Split one file into N Files by lines
    #
    # @param fileName string file name
    # @param N int number of files to create
    # 
    @staticmethod
    def splitFileIntoNFiles(fileName, N):
        nbLine = FileUtils.getNbLinesInSingleFile(fileName)
        nbLinesInEachFile = nbLine
        if N > nbLine:
            N = nbLine
        if N != 0:
            nbLinesInEachFile = math.ceil(float(nbLine) / N)
        else:
            N = 1
        filePrefix, fileExt = os.path.splitext(os.path.basename(fileName))
        fileHandler = open(fileName, "r")
        for i in range(1,N+1):
            with open("%s-%s%s" %(filePrefix, i, fileExt), "w") as f:
                j = 0
                while j < nbLinesInEachFile:
                    j += 1
                    f.write(fileHandler.readline())
        fileHandler.close()            
            
    ## Split one file into files of N lines
    #
    # @param fileName string input file name
    # @param N int lines number per files
    # 
    @staticmethod
    def splitFileAccordingToLineNumber(fileName, N):
        filePrefix, fileExt = os.path.splitext(os.path.basename(fileName))
        with open(fileName) as inF:
            fileNb = 1
            line = inF.readline()
            if not line or N == 0:
                outFileName = "%s-%s%s" %(filePrefix, fileNb, fileExt)
                f = open(outFileName, "wb")
                shutil.copyfileobj(open(fileName, "rb"), f)
                f.close()
            else:
                while line:
                    outFileName = "%s-%s%s" %(filePrefix, fileNb, fileExt)
                    with open(outFileName, "w") as outF:
                        lineNb = 1
                        while lineNb <= N and line:
                            outF.write(line)
                            line = inF.readline()
                            lineNb += 1
                    fileNb += 1