Mercurial > repos > urgi-team > teiso
comparison TEisotools-1.1.a/commons/core/utils/FileUtils.py @ 13:feef9a0db09d draft
Uploaded
| author | urgi-team |
|---|---|
| date | Wed, 20 Jul 2016 09:04:42 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| 12:22b0494ec883 | 13:feef9a0db09d |
|---|---|
| 1 # Copyright INRA (Institut National de la Recherche Agronomique) | |
| 2 # http://www.inra.fr | |
| 3 # http://urgi.versailles.inra.fr | |
| 4 # | |
| 5 # This software is governed by the CeCILL license under French law and | |
| 6 # abiding by the rules of distribution of free software. You can use, | |
| 7 # modify and/ or redistribute the software under the terms of the CeCILL | |
| 8 # license as circulated by CEA, CNRS and INRIA at the following URL | |
| 9 # "http://www.cecill.info". | |
| 10 # | |
| 11 # As a counterpart to the access to the source code and rights to copy, | |
| 12 # modify and redistribute granted by the license, users are provided only | |
| 13 # with a limited warranty and the software's author, the holder of the | |
| 14 # economic rights, and the successive licensors have only limited | |
| 15 # liability. | |
| 16 # | |
| 17 # In this respect, the user's attention is drawn to the risks associated | |
| 18 # with loading, using, modifying and/or developing or reproducing the | |
| 19 # software by the user in light of its specific status of free software, | |
| 20 # that may mean that it is complicated to manipulate, and that also | |
| 21 # therefore means that it is reserved for developers and experienced | |
| 22 # professionals having in-depth computer knowledge. Users are therefore | |
| 23 # encouraged to load and test the software's suitability as regards their | |
| 24 # requirements in conditions enabling the security of their systems and/or | |
| 25 # data to be ensured and, more generally, to use and operate it in the | |
| 26 # same conditions as regards security. | |
| 27 # | |
| 28 # The fact that you are presently reading this means that you have had | |
| 29 # knowledge of the CeCILL license and that you accept its terms. | |
| 30 | |
| 31 | |
| 32 import os | |
| 33 import re | |
| 34 import sys | |
| 35 import math | |
| 36 import glob | |
| 37 import shutil | |
| 38 import subprocess | |
| 39 from operator import itemgetter | |
| 40 try: | |
| 41 import hashlib | |
| 42 except: | |
| 43 pass | |
| 44 | |
| 45 | |
class FileUtils( object ):

    ## Number of bytes read at once when a file is streamed
    _CHUNK_SIZE = 1 << 16

    ## Return the number of lines in the given file
    #
    # A last line lacking its end-of-line symbol is counted as a line.
    # A single blank line closing the file is not counted, hence a file
    # holding only one end-of-line symbol has zero line.
    #
    # @param fileName string name of the file (may contain blanks)
    #
    # @return int number of lines
    #
    # @note the file is read by chunks in pure Python, no external command is run;
    #       a missing file raises IOError
    #
    @staticmethod
    def getNbLinesInSingleFile( fileName ):
        nbNewLines = 0
        lastBytes = b""
        with open( fileName, "rb" ) as fileHandler:
            while True:
                chunk = fileHandler.read( FileUtils._CHUNK_SIZE )
                if not chunk:
                    break
                nbNewLines += chunk.count( b"\n" )
                # only the last two bytes are needed to classify the end of the file
                lastBytes = ( lastBytes + chunk )[-2:]

        if not lastBytes:
            return 0
        if not lastBytes.endswith( b"\n" ):
            # the last line has no end-of-line symbol but is still a line
            return nbNewLines + 1
        if lastBytes in ( b"\n", b"\n\n" ):
            # the file ends with a blank line, which is ignored
            return nbNewLines - 1
        return nbNewLines

    ## Return the number of lines in the files in the given list
    #
    # @param lFileNames list of file names
    #
    # @return int sum of the number of lines of each file
    #
    @staticmethod
    def getNbLinesInFileList( lFileNames ):
        return sum( FileUtils.getNbLinesInSingleFile( fileName ) for fileName in lFileNames )

    ## Return True if the given file exists, False otherwise
    #
    # @param fileName string path to test (file or directory)
    #
    @staticmethod
    def isRessourceExists( fileName ):
        return os.path.exists( fileName )

    ## Return True if the given file is empty, False otherwise
    #
    # @param fileName string name of the file
    #
    # @note "empty" means that getNbLinesInSingleFile() returns 0
    #
    @staticmethod
    def isEmpty( fileName ):
        return FileUtils.getNbLinesInSingleFile( fileName ) == 0

    ## Return True if both files are identical, False otherwise
    #
    # @param file1 string name of the first file
    # @param file2 string name of the second file
    #
    # @note the 'diff' program is run without any shell nor temporary file, so
    #       file names may contain blanks; a warning with the exit status of
    #       'diff' is printed whenever this status is not 0
    #
    @staticmethod
    def are2FilesIdentical( file1, file2 ):
        # the differences themselves are not needed, only the exit status is
        with open( os.devnull, "w" ) as devNull:
            returnStatus = subprocess.call( ["diff", file1, file2], stdout=devNull )
        if returnStatus != 0:
            print( "WARNING: 'diff' returned '%i'" % returnStatus )
            return False
        return True

    ## Return a string with all the content of the files in the given list
    #
    # @param lFiles list of file names
    #
    # @warning the given list is sorted in place
    #
    @staticmethod
    def getFileContent( lFiles ):
        lFiles.sort()
        lContents = []
        for fileName in lFiles:
            with open( fileName, "r" ) as currentFile:
                lContents.append( currentFile.read() )
        return "".join( lContents )

    ## Save content of the given file after having sorted it
    #
    # @param inFile string name of the input file
    # @param outFile string name of the output file (default: overwrite the input file)
    #
    @staticmethod
    def sortFileContent( inFile, outFile="" ):
        with open( inFile, "r" ) as inFileHandler:
            lines = sorted( inFileHandler )
        if outFile == "":
            outFile = inFile
        with open( outFile, "w" ) as outFileHandler:
            outFileHandler.writelines( lines )

    ## Add end-of-line symbol to the given file content if necessary
    #
    # @param fileContent string content to complete
    #
    # @return string the content, ending with an end-of-line symbol unless it is empty
    #
    @staticmethod
    def addNewLineAtTheEndOfFileContent( fileContent ):
        if fileContent and not fileContent.endswith( '\n' ):
            fileContent += '\n'
        return fileContent

    ## Concatenate files in the given list
    #
    # @param lFiles list of file names
    # @param outFile string name of the output file (opened in append mode)
    # @param sort bool sort the given list, in place, before concatenating
    # @param skipHeaders bool skip the first line of each file
    # @param separator string written between the content of two successive files
    #
    @staticmethod
    def catFilesFromList( lFiles, outFile, sort=True, skipHeaders = False, separator = "" ):
        if sort:
            lFiles.sort()
        with open( outFile, "a" ) as outFileHandler:
            for index, singleFile in enumerate( lFiles ):
                if index > 0:
                    outFileHandler.write( separator )
                with open( singleFile, "r" ) as singleFileHandler:
                    if skipHeaders:
                        singleFileHandler.readline()
                    shutil.copyfileobj( singleFileHandler, outFileHandler )

    ## Concatenate files according to the given pattern
    #
    # @param pattern string pattern given to glob
    # @param outFile string name of the output file (opened in append mode)
    # @param skipHeaders bool skip the first line of each file
    # @param separator string written between the content of two successive files
    #
    @staticmethod
    def catFilesByPattern( pattern, outFile, skipHeaders = False, separator = "" ):
        lFiles = glob.glob( pattern )
        FileUtils.catFilesFromList( lFiles, outFile, skipHeaders = skipHeaders, separator = separator )

    ## Cat all files of a given directory
    #
    # @param directory string directory name
    # @param outFileName string output file name
    #
    @staticmethod
    def catFilesOfDir( directory, outFileName ):
        FileUtils.catFilesByPattern( "%s/*" % directory, outFileName )

    ## Remove files listed according to the given pattern
    #
    # @example prefix="/home/tmp/dummy*.txt"
    #
    @staticmethod
    def removeFilesByPattern( prefix ):
        for f in glob.glob( prefix ):
            os.remove( f )

    ## Remove files listed according to the suffixes in the given list
    #
    # @param targetPath string directory to clean, with or without trailing "/"
    #                   (an empty string means the current directory)
    # @param lSuffixes list of suffixes
    #
    @staticmethod
    def removeFilesBySuffixList( targetPath, lSuffixes ):
        for suffix in lSuffixes:
            pattern = os.path.join( targetPath, "*%s" % suffix )
            FileUtils.removeFilesByPattern( pattern )

    ## Remove repeated blanks in the given file
    #
    # @param inFile string name of the input file
    # @param outFile string name of the output file (default: overwrite the input file)
    #
    # @note each run of space characters is squeezed into a single space; the
    #       result is first written in "<outFile>.tmp", then renamed
    #
    @staticmethod
    def removeRepeatedBlanks( inFile, outFile="" ):
        if outFile == "":
            outFile = inFile
        # the temporary file sits next to the output file, so that paths with
        # directories are handled and the final renaming stays on one file system
        tmpFile = "%s.tmp" % ( outFile )
        repeatedBlanks = re.compile( b" {2,}" )
        with open( inFile, "rb" ) as inFileHandler:
            with open( tmpFile, "wb" ) as tmpFileHandler:
                for line in inFileHandler:
                    tmpFileHandler.write( repeatedBlanks.sub( b" ", line ) )
        os.rename( tmpFile, outFile )

    ## Remove files in the given list
    #
    # @param lFiles list of file names
    #
    @staticmethod
    def removeFilesFromList( lFiles ):
        for f in lFiles:
            os.remove( f )

    ## Remove files in the given list if exist
    #
    # @param lFiles list of file names
    #
    @staticmethod
    def removeFilesFromListIfExist( lFiles ):
        for fileName in lFiles:
            if FileUtils.isRessourceExists( fileName ):
                os.remove( fileName )

    ## Append the content of a file to another file
    #
    # @param inFile string name of the input file
    # @param outFile string name of the output file
    #
    @staticmethod
    def appendFileContent( inFile, outFile ):
        with open( outFile, "a" ) as outFileHandler:
            with open( inFile, "r" ) as inFileHandler:
                shutil.copyfileobj( inFileHandler, outFileHandler )

    ## Replace Windows end-of-line by Unix end-of-line
    #
    # @param inFile string name of the file to convert in place
    #
    # @note the file is handled as bytes so that "\r\n" is seen whatever the
    #       Python version; "<inFile>.tmp" is used as temporary file
    #
    @staticmethod
    def fromWindowsToUnixEof( inFile ):
        tmpFile = "%s.tmp" % ( inFile )
        os.rename( inFile, tmpFile )
        with open( tmpFile, "rb" ) as tmpFileHandler:
            with open( inFile, "wb" ) as inFileHandler:
                for line in tmpFileHandler:
                    inFileHandler.write( line.replace( b"\r\n", b"\n" ) )
        os.remove( tmpFile )

    ## Remove duplicated lines in a file
    #
    # @param inFile string name of the file to clean in place
    #
    # @note it preserves the initial order and handles blank lines;
    #       every line written ends with an end-of-line symbol
    #
    @staticmethod
    def removeDuplicatedLines( inFile ):
        with open( inFile, "rb" ) as inFileHandler:
            lLines = inFileHandler.read().split( b"\n" )
        # a content ending with an end-of-line symbol gives a last empty item
        if lLines[-1] == b"":
            del lLines[-1]

        sSeenLines = set()
        with open( inFile, "wb" ) as inFileHandler:
            for line in lLines:
                if line not in sSeenLines:
                    sSeenLines.add( line )
                    inFileHandler.write( line + b"\n" )

    ## Write a list of lines in a given file
    #
    # @param inFile string name of the file to write (overwritten)
    # @param lLines list of strings, written as they are (no end-of-line symbol is added)
    #
    @staticmethod
    def writeLineListInFile( inFile, lLines ):
        with open( inFile, "w" ) as inFileHandler:
            inFileHandler.writelines( lLines )

    ## Give the list of absolute path of each directory in the given directory
    #
    # @param rootPath string absolute path of the given directory
    #
    # @return lDirPath list of absolute directory path
    #
    @staticmethod
    def getAbsoluteDirectoryPathList( rootPath ):
        return [ ressource for ressource in glob.glob( rootPath + "/*" ) if os.path.isdir( ressource ) ]

    ## Get a sublist of which each element matches/doesn't match a pattern
    #
    # @param lPath string list of paths
    #
    # @param pattern string pattern (regular expression, searched as ".*pattern.*")
    #
    # @param match bool keep the matching paths if True, the other ones if False
    #
    # @return lPathMatching list of path matching pattern
    #
    @staticmethod
    def getSubListAccordingToPattern( lPath, pattern, match = True ):
        regExp = re.compile( ".*%s.*" % pattern )
        keepMatching = bool( match )
        lPathMatching = []
        for path in lPath:
            isMatching = regExp.match( path ) is not None
            if isMatching == keepMatching:
                lPathMatching.append( path )
        return lPathMatching

    ## Give the list of file names found in the given directory
    #
    # @param dirPath string absolute path of the given directory
    # @param patternFileFilter string regular expression matched against each file name
    #
    # @return lFilesInDir list of file names
    #
    @staticmethod
    def getFileNamesList( dirPath, patternFileFilter = ".*" ):
        regExp = re.compile( patternFileFilter )
        lFilesInDir = []
        for ressource in glob.glob( dirPath + "/*" ):
            if os.path.isfile( ressource ):
                fileName = os.path.basename( ressource )
                if regExp.match( fileName ):
                    lFilesInDir.append( fileName )
        return lFilesInDir

    ## Return the MD5 sum of a file
    #
    # @param inFile string name of the file
    #
    # @return string hexadecimal digest, or "" if the hashlib module is not loaded
    #
    @staticmethod
    def getMd5SecureHash( inFile ):
        if "hashlib" not in sys.modules:
            return ""
        md5 = hashlib.md5()
        # bytes are required by md5.update() and chunks bound the memory used
        with open( inFile, "rb" ) as inFileHandler:
            while True:
                chunk = inFileHandler.read( FileUtils._CHUNK_SIZE )
                if not chunk:
                    break
                md5.update( chunk )
        return md5.hexdigest()

    ## Return True if size file > 0 octet
    #
    # @param fileName string file name
    #
    @staticmethod
    def isSizeNotNull( fileName ):
        return os.path.getsize( fileName ) > 0

    ## Split one file into N Files by lines
    #
    # @param fileName string file name
    # @param N int number of files to create
    #
    # @note output files are written in the current directory and named
    #       "<prefix>-<i><extension>"; N is lowered to the number of lines when
    #       greater, and a single file is written when it ends up being 0
    #
    @staticmethod
    def splitFileIntoNFiles( fileName, N ):
        nbLine = FileUtils.getNbLinesInSingleFile( fileName )
        nbLinesInEachFile = nbLine
        if N > nbLine:
            N = nbLine
        if N != 0:
            nbLinesInEachFile = int( math.ceil( float( nbLine ) / N ) )
        else:
            N = 1
        filePrefix, fileExt = os.path.splitext( os.path.basename( fileName ) )
        with open( fileName, "rb" ) as fileHandler:
            for i in range( 1, N + 1 ):
                with open( "%s-%s%s" % ( filePrefix, i, fileExt ), "wb" ) as f:
                    for _ in range( nbLinesInEachFile ):
                        # readline() returns an empty string once the end is reached
                        f.write( fileHandler.readline() )

    ## Split one file into files of N lines
    #
    # @param fileName string input file name
    # @param N int lines number per files
    #
    # @note output files are written in the current directory and named
    #       "<prefix>-<i><extension>"; an empty input file, or N lower than or
    #       equal to 0, gives a single output file holding the whole input
    #
    @staticmethod
    def splitFileAccordingToLineNumber( fileName, N ):
        filePrefix, fileExt = os.path.splitext( os.path.basename( fileName ) )
        with open( fileName, "rb" ) as inF:
            fileNb = 1
            line = inF.readline()
            if not line or N <= 0:
                # nothing to split: copy the line already read, then the remainder
                with open( "%s-%s%s" % ( filePrefix, fileNb, fileExt ), "wb" ) as outF:
                    outF.write( line )
                    shutil.copyfileobj( inF, outF )
            else:
                while line:
                    with open( "%s-%s%s" % ( filePrefix, fileNb, fileExt ), "wb" ) as outF:
                        lineNb = 1
                        while lineNb <= N and line:
                            outF.write( line )
                            line = inF.readline()
                            lineNb += 1
                    fileNb += 1

    ## Concatenates names from a list, using a given separator and a given extension.
    #
    # @param lNames list of file names (sorted in place)
    # @param sep separator used to join names
    # @param ext extension of the return file name. If None, the most represented extension in lNames is used.
    # If there are several, the first of them in alphabetical order is used
    #
    # @return concatName name concatenated
    #
    @staticmethod
    def concatenateFileNamesFromList( lNames, sep = "_", ext = None ):
        concatName = ""
        if lNames:
            lNames.sort()
            lSplitNames = [ os.path.splitext( os.path.basename( name ) ) for name in lNames ]
            lBaseNames = [ splitName[0] for splitName in lSplitNames ]

            if ext is None:
                dExtToNb = {}
                for splitName in lSplitNames:
                    dExtToNb[splitName[1]] = dExtToNb.get( splitName[1], 0 ) + 1
                # highest count first, alphabetical order to break ties
                ext = min( dExtToNb, key = lambda extension: ( -dExtToNb[extension], extension ) )

            if ext and ext[0] != '.':
                ext = ".%s" % ext

            concatName = "%s%s" % ( sep.join( lBaseNames ), ext )
        return concatName

    ## Concatenates names from a string, using a given separator and a given extension. Names are split from the string using splitSep
    #
    # @param filesNames string of file names
    # @param splitSep separator used to split names from the input string
    # @param joinSep separator used to join names
    # @param ext extension of the return file name. If None, the most represented extension in the names is used.
    # If there are several, the first of them in alphabetical order is used
    #
    # @return concatName,lFilesNames name concatenated and split files list sorted alphabetically. Return original name if splitSep is empty.
    #
    @staticmethod
    def concatenateFileNamesFromString( filesNames, splitSep = ",", joinSep = "_", ext = None ):
        if splitSep:
            lFilesNames = sorted( filesNames.split( splitSep ) )
            return FileUtils.concatenateFileNamesFromList( lFilesNames, joinSep, ext ), lFilesNames
        print( "WARNING: no split separator provided, returning input string" )
        return filesNames, [filesNames]
| 479 |
