env/lib/python3.7/site-packages/bagit.py @ 0:26e78fe6e8c4 (draft)
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
| author | shellac |
|---|---|
| date | Sat, 02 May 2020 07:14:21 -0400 |
| parents | (none) |
| children | (none) |
| parent revision | this revision |
|---|---|
| -1:000000000000 (new file) | 0:26e78fe6e8c4 |
#!/usr/bin/env python
# encoding: utf-8

from __future__ import absolute_import, division, print_function, unicode_literals

import argparse
import codecs
import gettext
import hashlib
import logging
import multiprocessing
import os
import re
import signal
import sys
import tempfile
import unicodedata
import warnings
from collections import defaultdict
from datetime import date
from functools import partial
from os.path import abspath, isdir, isfile, join

from pkg_resources import DistributionNotFound, get_distribution

try:
    from urllib.parse import urlparse
except ImportError:
    from urlparse import urlparse


def find_locale_dir():
    for prefix in (os.path.dirname(__file__), sys.prefix):
        locale_dir = os.path.join(prefix, "locale")
        if os.path.isdir(locale_dir):
            return locale_dir


TRANSLATION_CATALOG = gettext.translation(
    "bagit-python", localedir=find_locale_dir(), fallback=True
)
if sys.version_info < (3,):
    _ = TRANSLATION_CATALOG.ugettext
else:
    _ = TRANSLATION_CATALOG.gettext

MODULE_NAME = "bagit" if __name__ == "__main__" else __name__

LOGGER = logging.getLogger(MODULE_NAME)

try:
    VERSION = get_distribution(MODULE_NAME).version
except DistributionNotFound:
    VERSION = "0.0.dev0"

PROJECT_URL = "https://github.com/LibraryOfCongress/bagit-python"

__doc__ = (
    _(
        """
BagIt is a directory and filename convention for bundling an arbitrary set of
files with a manifest, checksums, and additional metadata. More about BagIt
can be found at:

    http://purl.org/net/bagit

bagit.py is a pure Python drop-in library and command-line tool for creating
and working with BagIt directories.


Command-Line Usage:

Basic usage is to give bagit.py a directory to bag up:

    $ bagit.py my_directory

This does a bag-in-place operation where the current contents will be moved
into the appropriate BagIt structure and the metadata files will be created.

You can bag multiple directories if you wish:

    $ bagit.py directory1 directory2

Optionally you can provide metadata which will be stored in bag-info.txt:

    $ bagit.py --source-organization "Library of Congress" directory

You can also select which manifest algorithms will be used:

    $ bagit.py --sha1 --md5 --sha256 --sha512 directory


Using BagIt from your Python code:

    import bagit
    bag = bagit.make_bag('example-directory', {'Contact-Name': 'Ed Summers'})
    print(bag.entries)

For more information or to contribute to bagit-python's development, please
visit %(PROJECT_URL)s
"""
    )
    % globals()
)

# standard bag-info.txt metadata
STANDARD_BAG_INFO_HEADERS = [
    "Source-Organization",
    "Organization-Address",
    "Contact-Name",
    "Contact-Phone",
    "Contact-Email",
    "External-Description",
    "External-Identifier",
    "Bag-Size",
    "Bag-Group-Identifier",
    "Bag-Count",
    "Internal-Sender-Identifier",
    "Internal-Sender-Description",
    "BagIt-Profile-Identifier",
    # Bagging-Date is autogenerated
    # Payload-Oxum is autogenerated
]

CHECKSUM_ALGOS = hashlib.algorithms_guaranteed
DEFAULT_CHECKSUMS = ["sha256", "sha512"]

#: Block size used when reading files for hashing:
HASH_BLOCK_SIZE = 512 * 1024

#: Convenience function used everywhere we want to open a file to read text
#: rather than undecoded bytes:
open_text_file = partial(codecs.open, encoding="utf-8", errors="strict")

# This is the same as decoding the byte values in codecs.BOM:
UNICODE_BYTE_ORDER_MARK = "\uFEFF"


def make_bag(
    bag_dir, bag_info=None, processes=1, checksums=None, checksum=None, encoding="utf-8"
):
    """
    Convert a given directory into a bag. You can pass in arbitrary
    key/value pairs to put into the bag-info.txt metadata file as
    the bag_info dictionary.
    """

    if checksum is not None:
        warnings.warn(
            _(
                "The `checksum` argument for `make_bag` should be replaced with `checksums`"
            ),
            DeprecationWarning,
        )
        checksums = checksum

    if checksums is None:
        checksums = DEFAULT_CHECKSUMS

    bag_dir = os.path.abspath(bag_dir)
    cwd = os.path.abspath(os.path.curdir)

    if cwd.startswith(bag_dir) and cwd != bag_dir:
        raise RuntimeError(
            _("Bagging a parent of the current directory is not supported")
        )

    LOGGER.info(_("Creating bag for directory %s"), bag_dir)

    if not os.path.isdir(bag_dir):
        LOGGER.error(_("Bag directory %s does not exist"), bag_dir)
        raise RuntimeError(_("Bag directory %s does not exist") % bag_dir)

    # FIXME: we should do the permissions checks before changing directories
    old_dir = os.path.abspath(os.path.curdir)

    try:
        # TODO: These two checks are currently redundant since an unreadable directory will also
        #       often be unwritable, and this code will require review when we add the option to
        #       bag to a destination other than the source. It would be nice if we could avoid
        #       walking the directory tree more than once even if most filesystems will cache it

        unbaggable = _can_bag(bag_dir)

        if unbaggable:
            LOGGER.error(
                _("Unable to write to the following directories and files:\n%s"),
                unbaggable,
            )
            raise BagError(_("Missing permissions to move all files and directories"))

        unreadable_dirs, unreadable_files = _can_read(bag_dir)

        if unreadable_dirs or unreadable_files:
            if unreadable_dirs:
                LOGGER.error(
                    _("The following directories do not have read permissions:\n%s"),
                    unreadable_dirs,
                )
            if unreadable_files:
                LOGGER.error(
                    _("The following files do not have read permissions:\n%s"),
                    unreadable_files,
                )
            raise BagError(
                _("Read permissions are required to calculate file fixities")
            )
        else:
            LOGGER.info(_("Creating data directory"))

            # FIXME: if we calculate full paths we won't need to deal with changing directories
            os.chdir(bag_dir)
            cwd = os.getcwd()
            temp_data = tempfile.mkdtemp(dir=cwd)

            for f in os.listdir("."):
                if os.path.abspath(f) == temp_data:
                    continue
                new_f = os.path.join(temp_data, f)
                LOGGER.info(
                    _("Moving %(source)s to %(destination)s"),
                    {"source": f, "destination": new_f},
                )
                os.rename(f, new_f)

            LOGGER.info(
                _("Moving %(source)s to %(destination)s"),
                {"source": temp_data, "destination": "data"},
            )
            os.rename(temp_data, "data")

            # permissions for the payload directory should match those of the
            # original directory
            os.chmod("data", os.stat(cwd).st_mode)

            total_bytes, total_files = make_manifests(
                "data", processes, algorithms=checksums, encoding=encoding
            )

            LOGGER.info(_("Creating bagit.txt"))
            txt = """BagIt-Version: 0.97\nTag-File-Character-Encoding: UTF-8\n"""
            with open_text_file("bagit.txt", "w") as bagit_file:
                bagit_file.write(txt)

            LOGGER.info(_("Creating bag-info.txt"))
            if bag_info is None:
                bag_info = {}

            # allow 'Bagging-Date' and 'Bag-Software-Agent' to be overridden
            if "Bagging-Date" not in bag_info:
                bag_info["Bagging-Date"] = date.strftime(date.today(), "%Y-%m-%d")
            if "Bag-Software-Agent" not in bag_info:
                bag_info["Bag-Software-Agent"] = "bagit.py v%s <%s>" % (
                    VERSION,
                    PROJECT_URL,
                )

            bag_info["Payload-Oxum"] = "%s.%s" % (total_bytes, total_files)
            _make_tag_file("bag-info.txt", bag_info)

            for c in checksums:
                _make_tagmanifest_file(c, bag_dir, encoding="utf-8")
    except Exception:
        LOGGER.exception(_("An error occurred creating a bag in %s"), bag_dir)
        raise
    finally:
        os.chdir(old_dir)

    return Bag(bag_dir)

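# Usage sketch for make_bag(), mirroring the module docstring example above;
# "my_directory" stands in for any existing, writable directory:
#
#     import bagit
#     bag = bagit.make_bag("my_directory", {"Contact-Name": "Ed Summers"})
#     print(bag.entries)   # payload paths mapped to their checksum values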

class Bag(object):
    """A representation of a bag."""

    valid_files = ["bagit.txt", "fetch.txt"]
    valid_directories = ["data"]

    def __init__(self, path=None):
        super(Bag, self).__init__()
        self.tags = {}
        self.info = {}
        #: Dictionary of manifest entries and the checksum values for each
        #: algorithm:
        self.entries = {}

        # To reliably handle Unicode normalization differences, we maintain
        # lookup dictionaries in both directions for the filenames read from
        # the filesystem and the manifests so we can handle cases where the
        # normalization form changed between the bag being created and read.
        # See https://github.com/LibraryOfCongress/bagit-python/issues/51.

        #: maps Unicode-normalized values to the raw value from the filesystem
        self.normalized_filesystem_names = {}

        #: maps Unicode-normalized values to the raw value in the manifest
        self.normalized_manifest_names = {}

        self.algorithms = []
        self.tag_file_name = None
        self.path = abspath(path)
        if path:
            # if path ends in a path separator, strip it off
            if path[-1] == os.sep:
                self.path = path[:-1]
            self._open()

    def __str__(self):
        # FIXME: develop a more informative string representation for a Bag
        return self.path

    @property
    def algs(self):
        warnings.warn(_("Use Bag.algorithms instead of Bag.algs"), DeprecationWarning)
        return self.algorithms

    @property
    def version(self):
        warnings.warn(
            _("Use the Bag.version_info tuple instead of Bag.version"),
            DeprecationWarning,
        )
        return self._version

    def _open(self):
        # Open the bagit.txt file, and load any tags from it, including
        # the required version and encoding.
        bagit_file_path = os.path.join(self.path, "bagit.txt")

        if not isfile(bagit_file_path):
            raise BagError(_("Expected bagit.txt does not exist: %s") % bagit_file_path)

        self.tags = tags = _load_tag_file(bagit_file_path)

        required_tags = ("BagIt-Version", "Tag-File-Character-Encoding")
        missing_tags = [i for i in required_tags if i not in tags]
        if missing_tags:
            raise BagError(
                _("Missing required tag in bagit.txt: %s") % ", ".join(missing_tags)
            )

        # To avoid breaking existing code we'll leave self.version as the string
        # and parse it into a numeric version_info tuple. In version 2.0 we can
        # break that.

        self._version = tags["BagIt-Version"]

        try:
            self.version_info = tuple(int(i) for i in self._version.split(".", 1))
        except ValueError:
            raise BagError(
                _("Bag version numbers must be MAJOR.MINOR numbers, not %s")
                % self._version
            )

        if (0, 93) <= self.version_info <= (0, 95):
            self.tag_file_name = "package-info.txt"
        elif (0, 96) <= self.version_info < (2,):
            self.tag_file_name = "bag-info.txt"
        else:
            raise BagError(_("Unsupported bag version: %s") % self._version)

        self.encoding = tags["Tag-File-Character-Encoding"]

        try:
            codecs.lookup(self.encoding)
        except LookupError:
            raise BagValidationError(_("Unsupported encoding: %s") % self.encoding)

        info_file_path = os.path.join(self.path, self.tag_file_name)
        if os.path.exists(info_file_path):
            self.info = _load_tag_file(info_file_path, encoding=self.encoding)

        self._load_manifests()

    def manifest_files(self):
        for filename in ["manifest-%s.txt" % a for a in CHECKSUM_ALGOS]:
            f = os.path.join(self.path, filename)
            if isfile(f):
                yield f

    def tagmanifest_files(self):
        for filename in ["tagmanifest-%s.txt" % a for a in CHECKSUM_ALGOS]:
            f = os.path.join(self.path, filename)
            if isfile(f):
                yield f

    def compare_manifests_with_fs(self):
        """
        Compare the filenames in the manifests to the filenames present on the
        local filesystem and return two lists of the files which are only
        present in the manifests and the files which are only present on the
        local filesystem, respectively.
        """

        # We compare the filenames after Unicode normalization so we can
        # reliably detect normalization changes after bag creation:
        files_on_fs = set(normalize_unicode(i) for i in self.payload_files())
        files_in_manifest = set(
            normalize_unicode(i) for i in self.payload_entries().keys()
        )

        if self.version_info >= (0, 97):
            files_in_manifest.update(self.missing_optional_tagfiles())

        only_on_fs = list()
        only_in_manifest = list()

        for i in files_on_fs.difference(files_in_manifest):
            only_on_fs.append(self.normalized_filesystem_names[i])

        for i in files_in_manifest.difference(files_on_fs):
            only_in_manifest.append(self.normalized_manifest_names[i])

        return only_in_manifest, only_on_fs

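# Usage sketch for compare_manifests_with_fs(); "my-bag" stands in for an
# existing bag directory:
#
#     bag = Bag("my-bag")
#     only_in_manifest, only_on_fs = bag.compare_manifests_with_fs()
#     # only_in_manifest: listed in a manifest but missing from the filesystem
#     # only_on_fs: present under data/ but absent from every manifest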
    def compare_fetch_with_fs(self):
        """Compares the fetch entries with the files actually
        in the payload, and returns a list of all the files
        that still need to be fetched.
        """

        files_on_fs = set(self.payload_files())
        files_in_fetch = set(self.files_to_be_fetched())

        return list(files_in_fetch - files_on_fs)

    def payload_files(self):
        """Yields the filenames of payload files present on the local filesystem"""
        payload_dir = os.path.join(self.path, "data")

        for dirpath, _, filenames in os.walk(payload_dir):
            for f in filenames:
                # Jump through some hoops here so the payload files are
                # returned with their paths relative to the bag's base
                # directory rather than the payload directory:
                normalized_f = os.path.normpath(f)
                rel_path = os.path.relpath(
                    os.path.join(dirpath, normalized_f), start=self.path
                )

                self.normalized_filesystem_names[normalize_unicode(rel_path)] = rel_path
                yield rel_path

    def payload_entries(self):
        """Return a dictionary of manifest entries for payload files (paths under data/)"""
        # Don't use dict comprehension (compatibility with Python < 2.7)
        return dict(
            (key, value)
            for (key, value) in self.entries.items()
            if key.startswith("data" + os.sep)
        )

    def save(self, processes=1, manifests=False):
        """
        save will persist any changes that have been made to the bag
        metadata (self.info).

        If you have modified the payload of the bag (added, modified,
        removed files in the data directory) and want to regenerate manifests
        set the manifests parameter to True. The default is False since you
        wouldn't want a save to accidentally create a new manifest for
        a corrupted bag.

        If you want to control the number of processes that are used when
        recalculating checksums use the processes parameter.
        """
        # Error checking
        if not self.path:
            raise BagError(_("Bag.save() called before setting the path!"))

        if not os.access(self.path, os.R_OK | os.W_OK | os.X_OK):
            raise BagError(
                _("Cannot save bag to non-existent or inaccessible directory %s")
                % self.path
            )

        unbaggable = _can_bag(self.path)
        if unbaggable:
            LOGGER.error(
                _(
                    "Missing write permissions for the following directories and files:\n%s"
                ),
                unbaggable,
            )
            raise BagError(_("Missing permissions to move all files and directories"))

        unreadable_dirs, unreadable_files = _can_read(self.path)
        if unreadable_dirs or unreadable_files:
            if unreadable_dirs:
                LOGGER.error(
                    _("The following directories do not have read permissions:\n%s"),
                    unreadable_dirs,
                )
            if unreadable_files:
                LOGGER.error(
                    _("The following files do not have read permissions:\n%s"),
                    unreadable_files,
                )
            raise BagError(
                _("Read permissions are required to calculate file fixities")
            )

        # Change working directory to bag directory so helper functions work
        old_dir = os.path.abspath(os.path.curdir)
        os.chdir(self.path)

        # Generate new manifest files
        if manifests:
            total_bytes, total_files = make_manifests(
                "data", processes, algorithms=self.algorithms, encoding=self.encoding
            )

            # Update Payload-Oxum
            LOGGER.info(_("Updating Payload-Oxum in %s"), self.tag_file_name)
            self.info["Payload-Oxum"] = "%s.%s" % (total_bytes, total_files)

        _make_tag_file(self.tag_file_name, self.info)

        # Update tag-manifest for changes to manifest & bag-info files
        for alg in self.algorithms:
            _make_tagmanifest_file(alg, self.path, encoding=self.encoding)

        # Reload the manifests
        self._load_manifests()

        os.chdir(old_dir)

    def tagfile_entries(self):
        return dict(
            (key, value)
            for (key, value) in self.entries.items()
            if not key.startswith("data" + os.sep)
        )

    def missing_optional_tagfiles(self):
        """
        From v0.97 we need to validate any tagfiles listed
        in the optional tagmanifest(s). As there is no mandatory
        directory structure for additional tagfiles we can
        only check for entries with missing files (not missing
        entries for existing files).
        """
        for tagfilepath in self.tagfile_entries().keys():
            if not os.path.isfile(os.path.join(self.path, tagfilepath)):
                yield tagfilepath

    def fetch_entries(self):
        """Load fetch.txt if present and iterate over its contents

        yields (url, size, filename) tuples

        raises BagError for errors such as an unsafe filename referencing
        data outside of the bag directory
        """

        fetch_file_path = os.path.join(self.path, "fetch.txt")

        if isfile(fetch_file_path):
            with open_text_file(
                fetch_file_path, "r", encoding=self.encoding
            ) as fetch_file:
                for line in fetch_file:
                    url, file_size, filename = line.strip().split(None, 2)

                    if self._path_is_dangerous(filename):
                        raise BagError(
                            _('Path "%(payload_file)s" in "%(source_file)s" is unsafe')
                            % {
                                "payload_file": filename,
                                "source_file": os.path.join(self.path, "fetch.txt"),
                            }
                        )

                    yield url, file_size, filename

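# Sketch of the fetch.txt format parsed above: one entry per line with three
# whitespace-separated fields -- URL, size in bytes (the BagIt spec also
# permits "-" when the size is unknown), and a bag-relative path, e.g.:
#
#     http://example.org/big.tiff 104857600 data/big.tiff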
    def files_to_be_fetched(self):
        """
        Convenience wrapper for fetch_entries which returns only the
        local filename
        """

        for url, file_size, filename in self.fetch_entries():
            yield filename

    def has_oxum(self):
        return "Payload-Oxum" in self.info

    def validate(self, processes=1, fast=False, completeness_only=False):
        """Checks the structure and contents are valid.

        If you supply the parameter fast=True the Payload-Oxum (if present) will
        be used to check that the payload files are present and accounted for,
        instead of re-calculating fixities and comparing them against the
        manifest. By default validate() will re-calculate fixities (fast=False).
        """

        self._validate_structure()
        self._validate_bagittxt()

        self.validate_fetch()

        self._validate_contents(
            processes=processes, fast=fast, completeness_only=completeness_only
        )

        return True

    def is_valid(self, fast=False, completeness_only=False):
        """Returns validation success or failure as boolean.
        Optional fast parameter passed directly to validate().
        """

        try:
            self.validate(fast=fast, completeness_only=completeness_only)
        except BagError:
            return False

        return True

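# Sketch of the two validation entry points; "my-bag" stands in for an
# existing bag directory:
#
#     bag = Bag("my-bag")
#     bag.is_valid()            # returns False instead of raising BagError
#     bag.validate(fast=True)   # Payload-Oxum count/size check only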
    def _load_manifests(self):
        self.entries = {}
        manifests = list(self.manifest_files())

        if self.version_info >= (0, 97):
            # v0.97+ requires that optional tagfiles are verified.
            manifests += list(self.tagmanifest_files())

        for manifest_filename in manifests:
            if "tagmanifest-" in manifest_filename:
                search = "tagmanifest-"
            else:
                search = "manifest-"
            alg = (
                os.path.basename(manifest_filename)
                .replace(search, "")
                .replace(".txt", "")
            )
            if alg not in self.algorithms:
                self.algorithms.append(alg)

            with open_text_file(
                manifest_filename, "r", encoding=self.encoding
            ) as manifest_file:
                if manifest_file.encoding.startswith("UTF"):
                    # We'll check the first character to see if it's a BOM:
                    if manifest_file.read(1) == UNICODE_BYTE_ORDER_MARK:
                        # We'll skip it either way by letting line decoding
                        # happen at the new offset but we will issue a warning
                        # for UTF-8 since the presence of a BOM is contrary to
                        # the BagIt specification:
                        if manifest_file.encoding == "UTF-8":
                            LOGGER.warning(
                                _(
                                    "%s is encoded using UTF-8 but contains an unnecessary"
                                    " byte-order mark, which is not in compliance with the"
                                    " BagIt RFC"
                                ),
                                manifest_file.name,
                            )
                    else:
                        manifest_file.seek(0)  # Pretend the first read never happened

                for line in manifest_file:
                    line = line.strip()

                    # Ignore blank lines and comments.
                    if line == "" or line.startswith("#"):
                        continue

                    entry = line.split(None, 1)

                    # Format is CHECKSUM FILENAME, with an optional "*" prefix
                    # on the filename (binary-mode marker):
                    if len(entry) != 2:
                        LOGGER.error(
                            _(
                                "%(bag)s: Invalid %(algorithm)s manifest entry: %(line)s"
                            ),
                            {"bag": self, "algorithm": alg, "line": line},
                        )
                        continue

                    entry_hash = entry[0]
                    entry_path = os.path.normpath(entry[1].lstrip("*"))
                    entry_path = _decode_filename(entry_path)

                    if self._path_is_dangerous(entry_path):
                        raise BagError(
                            _(
                                'Path "%(payload_file)s" in manifest "%(manifest_file)s" is unsafe'
                            )
                            % {
                                "payload_file": entry_path,
                                "manifest_file": manifest_file.name,
                            }
                        )

                    entry_hashes = self.entries.setdefault(entry_path, {})

                    if alg in entry_hashes:
                        warning_ctx = {
                            "bag": self,
                            "algorithm": alg,
                            "filename": entry_path,
                        }
                        if entry_hashes[alg] == entry_hash:
                            msg = _(
                                "%(bag)s: %(algorithm)s manifest lists %(filename)s"
                                " multiple times with the same value"
                            )
                            if self.version_info >= (1,):
                                raise BagError(msg % warning_ctx)
                            else:
                                LOGGER.warning(msg, warning_ctx)
                        else:
                            raise BagError(
                                _(
                                    "%(bag)s: %(algorithm)s manifest lists %(filename)s"
                                    " multiple times with conflicting values"
                                )
                                % warning_ctx
                            )

                    entry_hashes[alg] = entry_hash

        self.normalized_manifest_names.update(
            (normalize_unicode(i), i) for i in self.entries.keys()
        )

    def _validate_structure(self):
        """
        Checks the structure of the bag to determine whether it conforms to the
        BagIt spec. Returns true on success, otherwise it will raise a
        BagValidationError exception.
        """

        self._validate_structure_payload_directory()
        self._validate_structure_tag_files()

    def _validate_structure_payload_directory(self):
        data_dir_path = os.path.join(self.path, "data")

        if not isdir(data_dir_path):
            raise BagValidationError(
                _("Expected data directory %s does not exist") % data_dir_path
            )

    def _validate_structure_tag_files(self):
        # Note: we deviate somewhat from v0.96 of the spec in that it allows
        # other files and directories to be present in the base directory

        if not list(self.manifest_files()):
            raise BagValidationError(_("No manifest files found"))
        if "bagit.txt" not in os.listdir(self.path):
            raise BagValidationError(
                _('Expected %s to contain "bagit.txt"') % self.path
            )

    def validate_fetch(self):
        """Validate the fetch.txt file

        Raises `BagError` for errors and otherwise returns no value
        """

        for url, file_size, filename in self.fetch_entries():
            # fetch_entries will raise a BagError for unsafe filenames
            # so at this point we will check only that the URL is minimally
            # well formed:
            parsed_url = urlparse(url)

            if not all((parsed_url.scheme, parsed_url.netloc)):
                raise BagError(_("Malformed URL in fetch.txt: %s") % url)

    def _validate_contents(self, processes=1, fast=False, completeness_only=False):
        if fast and not self.has_oxum():
            raise BagValidationError(
                _("Fast validation requires bag-info.txt to include Payload-Oxum")
            )

        # Perform the fast file count + size check so we can fail early:
        self._validate_oxum()

        if fast:
            return

        self._validate_completeness()

        if completeness_only:
            return

        self._validate_entries(processes)

    def _validate_oxum(self):
        oxum = self.info.get("Payload-Oxum")

        if oxum is None:
            return

        # If multiple Payload-Oxum tags were present (a bad idea),
        # use the first one listed in bag-info.txt:
        if isinstance(oxum, list):
            LOGGER.warning(_("bag-info.txt defines multiple Payload-Oxum values!"))
            oxum = oxum[0]

        oxum_byte_count, oxum_file_count = oxum.split(".", 1)

        if not oxum_byte_count.isdigit() or not oxum_file_count.isdigit():
            raise BagError(_("Malformed Payload-Oxum value: %s") % oxum)

        oxum_byte_count = int(oxum_byte_count)
        oxum_file_count = int(oxum_file_count)
        total_bytes = 0
        total_files = 0

        for payload_file in self.payload_files():
            payload_file = os.path.join(self.path, payload_file)
            total_bytes += os.stat(payload_file).st_size
            total_files += 1

        if oxum_file_count != total_files or oxum_byte_count != total_bytes:
            raise BagValidationError(
                _(
                    "Payload-Oxum validation failed."
                    " Expected %(oxum_file_count)d files and %(oxum_byte_count)d bytes"
                    " but found %(found_file_count)d files and %(found_byte_count)d bytes"
                )
                % {
                    "found_file_count": total_files,
                    "found_byte_count": total_bytes,
                    "oxum_file_count": oxum_file_count,
                    "oxum_byte_count": oxum_byte_count,
                }
            )

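# Payload-Oxum is written as "<total byte count>.<file count>", so a
# bag-info.txt line such as
#
#     Payload-Oxum: 279164409.1198
#
# asserts 1198 payload files totalling 279164409 bytes; the "." is a field
# separator, not a decimal point.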
    def _validate_completeness(self):
        """
        Verify that the actual file manifests match the files in the data directory
        """
        errors = list()

        # First we'll make sure there's no mismatch between the filesystem
        # and the list of files in the manifest(s)
        only_in_manifests, only_on_fs = self.compare_manifests_with_fs()
        for path in only_in_manifests:
            e = FileMissing(path)
            LOGGER.warning(force_unicode(e))
            errors.append(e)
        for path in only_on_fs:
            e = UnexpectedFile(path)
            LOGGER.warning(force_unicode(e))
            errors.append(e)

        if errors:
            raise BagValidationError(_("Bag validation failed"), errors)

    def _validate_entries(self, processes):
        """
        Verify that the actual file contents match the recorded hashes stored in the manifest files
        """
        errors = list()

        if os.name == "posix":
            worker_init = posix_multiprocessing_worker_initializer
        else:
            worker_init = None

        args = (
            (
                self.path,
                self.normalized_filesystem_names.get(rel_path, rel_path),
                hashes,
                self.algorithms,
            )
            for rel_path, hashes in self.entries.items()
        )

        try:
            if processes == 1:
                hash_results = [_calc_hashes(i) for i in args]
            else:
                try:
                    pool = multiprocessing.Pool(
                        processes if processes else None, initializer=worker_init
                    )
                    hash_results = pool.map(_calc_hashes, args)
                finally:
                    pool.terminate()

        # Any unhandled exceptions are probably fatal
        except:
            LOGGER.exception(_("Unable to calculate file hashes for %s"), self)
            raise

        for rel_path, f_hashes, hashes in hash_results:
            for alg, computed_hash in f_hashes.items():
                stored_hash = hashes[alg]
                if stored_hash.lower() != computed_hash:
                    e = ChecksumMismatch(
                        rel_path, alg, stored_hash.lower(), computed_hash
                    )
                    LOGGER.warning(force_unicode(e))
                    errors.append(e)

        if errors:
            raise BagValidationError(_("Bag validation failed"), errors)

    def _validate_bagittxt(self):
        """
        Verify that bagit.txt conforms to specification
        """
        bagit_file_path = os.path.join(self.path, "bagit.txt")

        # Note that we are intentionally opening this file in binary mode so we can confirm
        # that it does not start with the UTF-8 byte-order-mark
        with open(bagit_file_path, "rb") as bagit_file:
            first_line = bagit_file.read(4)
            if first_line.startswith(codecs.BOM_UTF8):
                raise BagValidationError(
                    _("bagit.txt must not contain a byte-order mark")
                )

    def _path_is_dangerous(self, path):
        """
        Return true if path looks dangerous, i.e. potentially operates
        outside the bagging directory structure, e.g. ~/.bashrc, ../../../secrets.json,
        \\?\c:\, D:\sys32\cmd.exe
        """
        if os.path.isabs(path):
            return True
        if os.path.expanduser(path) != path:
            return True
        if os.path.expandvars(path) != path:
            return True
        real_path = os.path.realpath(os.path.join(self.path, path))
        real_path = os.path.normpath(real_path)
        bag_path = os.path.realpath(self.path)
        bag_path = os.path.normpath(bag_path)
        common = os.path.commonprefix((bag_path, real_path))
        return not (common == bag_path)


class BagError(Exception):
    pass


class BagValidationError(BagError):
    def __init__(self, message, details=None):
        super(BagValidationError, self).__init__()

        if details is None:
            details = []

        self.message = message
        self.details = details

    def __str__(self):
        if len(self.details) > 0:
            details = "; ".join([force_unicode(e) for e in self.details])
            return "%s: %s" % (self.message, details)
        return self.message


class ManifestErrorDetail(BagError):
    def __init__(self, path):
        super(ManifestErrorDetail, self).__init__()

        self.path = path


class ChecksumMismatch(ManifestErrorDetail):
    def __init__(self, path, algorithm=None, expected=None, found=None):
        super(ChecksumMismatch, self).__init__(path)

        self.path = path
        self.algorithm = algorithm
        self.expected = expected
        self.found = found

    def __str__(self):
        return _(
            '%(path)s %(algorithm)s validation failed: expected="%(expected)s" found="%(found)s"'
        ) % {
            "path": force_unicode(self.path),
            "algorithm": self.algorithm,
            "expected": self.expected,
            "found": self.found,
        }


class FileMissing(ManifestErrorDetail):
    def __str__(self):
        return _(
            "%s exists in manifest but was not found on filesystem"
        ) % force_unicode(self.path)


class UnexpectedFile(ManifestErrorDetail):
    def __str__(self):
        return _("%s exists on filesystem but is not in the manifest") % self.path


class FileNormalizationConflict(BagError):
    """
    Exception raised when two files differ only in normalization and thus
    are not safely portable
    """

    def __init__(self, file_a, file_b):
        super(FileNormalizationConflict, self).__init__()

        self.file_a = file_a
        self.file_b = file_b

    def __str__(self):
        return _(
            'Unicode normalization conflict for file "%(file_a)s" and "%(file_b)s"'
        ) % {"file_a": self.file_a, "file_b": self.file_b}


def posix_multiprocessing_worker_initializer():
    """Ignore SIGINT in multiprocessing workers on POSIX systems"""
    signal.signal(signal.SIGINT, signal.SIG_IGN)


# The Unicode normalization form used here doesn't matter – all we care about
# is consistency since the input value will be preserved:


def normalize_unicode_py3(s):
    return unicodedata.normalize("NFC", s)


def normalize_unicode_py2(s):
    if isinstance(s, str):
        s = s.decode("utf-8")
    return unicodedata.normalize("NFC", s)


if sys.version_info > (3, 0):
    normalize_unicode = normalize_unicode_py3
else:
    normalize_unicode = normalize_unicode_py2

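# Example of the equivalence normalize_unicode() provides -- NFD and NFC
# spellings of the same visible name compare equal after normalization:
#
#     normalize_unicode("Cafe\u0301")   # "e" + combining acute (NFD-style)
#     normalize_unicode("Caf\u00e9")    # precomposed "é" (already NFC)
#     # both return the same NFC string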

def build_unicode_normalized_lookup_dict(filenames):
    """
    Return a dictionary mapping unicode-normalized filenames to as-encoded
    values to efficiently detect conflicts between the filesystem and manifests.

    This is necessary because some filesystems and utilities may automatically
    apply a different Unicode normalization form to filenames than was applied
    when the bag was originally created.

    The best known example of this is when a bag is created using a
    normalization form other than NFD and then transferred to a Mac where the
    HFS+ filesystem will transparently normalize filenames to a variant of NFD
    for every call:

    https://developer.apple.com/legacy/library/technotes/tn/tn1150.html#UnicodeSubtleties

    Windows is documented as storing filenames exactly as provided:

    https://msdn.microsoft.com/en-us/library/windows/desktop/aa365247%28v=vs.85%29.aspx

    Linux performs no normalization in the kernel but it is technically
    valid for a filesystem to perform normalization, such as when an HFS+
    volume is mounted.

    See http://www.unicode.org/reports/tr15/ for a full discussion of
    equivalence and normalization in Unicode.
    """

    output = dict()

    for filename in filenames:
        normalized_filename = normalize_unicode(filename)
        if normalized_filename in output:
            raise FileNormalizationConflict(filename, output[normalized_filename])
        else:
            output[normalized_filename] = filename

    return output


def get_hashers(algorithms):
    """
    Given a list of algorithm names, return a dictionary of hasher instances

    This avoids redundant code between the creation and validation code where in
    both cases we want to avoid reading the same file more than once. The
    intended use is a simple for loop:

        for block in file:
            for hasher in hashers.values():
                hasher.update(block)
    """

    hashers = {}

    for alg in algorithms:
        try:
            hasher = hashlib.new(alg)
        except ValueError:
            LOGGER.warning(
                _("Disabling requested hash algorithm %s: hashlib does not support it"),
                alg,
            )
            continue

        hashers[alg] = hasher

    if not hashers:
        raise ValueError(
            _(
                "Unable to continue: hashlib does not support any of the requested algorithms!"
            )
        )

    return hashers

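# Usage sketch for get_hashers(): feed each block to every hasher so the
# data is read only once:
#
#     hashers = get_hashers(["md5", "sha256"])
#     for hasher in hashers.values():
#         hasher.update(b"example block")
#     digests = dict((alg, h.hexdigest()) for alg, h in hashers.items())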

def _calc_hashes(args):
    # auto unpacking of sequences illegal in Python3
    (base_path, rel_path, hashes, algorithms) = args
    full_path = os.path.join(base_path, rel_path)

    # Create a clone of the default empty hash objects:
    f_hashers = dict((alg, hashlib.new(alg)) for alg in hashes if alg in algorithms)

    try:
        f_hashes = _calculate_file_hashes(full_path, f_hashers)
    except BagValidationError as e:
        f_hashes = dict((alg, force_unicode(e)) for alg in f_hashers.keys())

    return rel_path, f_hashes, hashes


def _calculate_file_hashes(full_path, f_hashers):
    """
    Returns a dictionary of (algorithm, hexdigest) values for the provided
    filename
    """
    LOGGER.info(_("Verifying checksum for file %s"), full_path)

    try:
        with open(full_path, "rb") as f:
            while True:
                block = f.read(HASH_BLOCK_SIZE)
                if not block:
                    break
                for i in f_hashers.values():
                    i.update(block)
    except (OSError, IOError) as e:
        raise BagValidationError(
            _("Could not read %(filename)s: %(error)s")
            % {"filename": full_path, "error": force_unicode(e)}
        )

    return dict((alg, h.hexdigest()) for alg, h in f_hashers.items())


def _load_tag_file(tag_file_name, encoding="utf-8-sig"):
    with open_text_file(tag_file_name, "r", encoding=encoding) as tag_file:
        # Store duplicate tags as list of vals
        # in order of parsing under the same key.
        tags = {}
        for name, value in _parse_tags(tag_file):
            if name not in tags:
                tags[name] = value
                continue

            if not isinstance(tags[name], list):
                tags[name] = [tags[name], value]
            else:
                tags[name].append(value)

        return tags


def _parse_tags(tag_file):
    """Parses a tag file, according to RFC 2822. This
    includes line folding, permitting extra-long
    field values.

    See http://www.faqs.org/rfcs/rfc2822.html for
    more information.
    """

    tag_name = None
    tag_value = None

    # Line folding is handled by yielding values only after we encounter
    # the start of a new tag, or if we pass the EOF.
    for num, line in enumerate(tag_file):
        # Skip over any empty or blank lines.
        if len(line) == 0 or line.isspace():
            continue
        elif line[0].isspace() and tag_value is not None:  # folded line
            tag_value += line
        else:
            # Starting a new tag; yield the last one.
            if tag_name:
                yield (tag_name, tag_value.strip())

            if ":" not in line:
                raise BagValidationError(
                    _("%(filename)s contains invalid tag: %(line)s")
                    % {
                        "line": line.strip(),
                        "filename": os.path.basename(tag_file.name),
                    }
                )

            parts = line.strip().split(":", 1)
            tag_name = parts[0].strip()
            tag_value = parts[1]

    # Passed the EOF. All done after this.
    if tag_name:
        yield (tag_name, tag_value.strip())

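# Example of the RFC 2822-style folding handled above -- a line beginning
# with whitespace continues the previous value:
#
#     External-Description: a very long description
#       that continues on a second line
#
# yields a single ("External-Description", "...") tuple with the folded
# line appended to the value.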

def _make_tag_file(bag_info_path, bag_info):
    headers = sorted(bag_info.keys())
    with open_text_file(bag_info_path, "w") as f:
        for h in headers:
            values = bag_info[h]
            if not isinstance(values, list):
                values = [values]
            for txt in values:
                # strip CR, LF and CRLF so they don't mess up the tag file
                txt = re.sub(r"\n|\r|(\r\n)", "", force_unicode(txt))
                f.write("%s: %s\n" % (h, txt))


def make_manifests(data_dir, processes, algorithms=DEFAULT_CHECKSUMS, encoding="utf-8"):
    LOGGER.info(
        _("Using %(process_count)d processes to generate manifests: %(algorithms)s"),
        {"process_count": processes, "algorithms": ", ".join(algorithms)},
    )

    manifest_line_generator = partial(generate_manifest_lines, algorithms=algorithms)

    if processes > 1:
        pool = multiprocessing.Pool(processes=processes)
        checksums = pool.map(manifest_line_generator, _walk(data_dir))
        pool.close()
        pool.join()
    else:
        checksums = [manifest_line_generator(i) for i in _walk(data_dir)]

    # At this point we have a list of tuples which start with the algorithm name:
    manifest_data = {}
    for batch in checksums:
        for entry in batch:
            manifest_data.setdefault(entry[0], []).append(entry[1:])

    # These will be keyed on the algorithm name so we can perform sanity checks
    # below to catch failures in the hashing process:
    num_files = defaultdict(lambda: 0)
    total_bytes = defaultdict(lambda: 0)

    for algorithm, values in manifest_data.items():
        manifest_filename = "manifest-%s.txt" % algorithm

        with open_text_file(manifest_filename, "w", encoding=encoding) as manifest:
            for digest, filename, byte_count in values:
                manifest.write("%s %s\n" % (digest, _encode_filename(filename)))
                num_files[algorithm] += 1
                total_bytes[algorithm] += byte_count

    # We'll use sets of the values for the error checks and eventually return the payload oxum values:
    byte_value_set = set(total_bytes.values())
    file_count_set = set(num_files.values())

    # allow a bag with an empty payload
    if not byte_value_set and not file_count_set:
        return 0, 0

    if len(file_count_set) != 1:
        raise RuntimeError(_("Expected the same number of files for each checksum"))

    if len(byte_value_set) != 1:
        raise RuntimeError(_("Expected the same number of bytes for each checksum"))

    return byte_value_set.pop(), file_count_set.pop()

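# Each manifest-<algorithm>.txt line written above pairs a hex digest with a
# "/"-separated payload path, e.g. in manifest-sha256.txt:
#
#     <sha256 hex digest> data/README.txt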

def _make_tagmanifest_file(alg, bag_dir, encoding="utf-8"):
    tagmanifest_file = join(bag_dir, "tagmanifest-%s.txt" % alg)
    LOGGER.info(_("Creating %s"), tagmanifest_file)

    checksums = []
    for f in _find_tag_files(bag_dir):
        if re.match(r"^tagmanifest-.+\.txt$", f):
            continue
        with open(join(bag_dir, f), "rb") as fh:
            m = hashlib.new(alg)
            while True:
                block = fh.read(HASH_BLOCK_SIZE)
                if not block:
                    break
                m.update(block)
            checksums.append((m.hexdigest(), f))

    with open_text_file(
        join(bag_dir, tagmanifest_file), mode="w", encoding=encoding
    ) as tagmanifest:
        for digest, filename in checksums:
            tagmanifest.write("%s %s\n" % (digest, filename))


def _find_tag_files(bag_dir):
    for dir in os.listdir(bag_dir):
        if dir != "data":
            if os.path.isfile(dir) and not dir.startswith("tagmanifest-"):
                yield dir
            for dir_name, _, filenames in os.walk(dir):
                for filename in filenames:
                    if filename.startswith("tagmanifest-"):
                        continue
                    # remove everything up to the bag_dir directory
                    p = join(dir_name, filename)
                    yield os.path.relpath(p, bag_dir)


def _walk(data_dir):
    for dirpath, dirnames, filenames in os.walk(data_dir):
        # if we don't sort here the order of entries is non-deterministic
        # which makes it hard to test the fixity of tagmanifest-md5.txt
        filenames.sort()
        dirnames.sort()
        for fn in filenames:
            path = os.path.join(dirpath, fn)
            # BagIt spec requires manifest to always use '/' as path separator
            if os.path.sep != "/":
                parts = path.split(os.path.sep)
                path = "/".join(parts)
            yield path


def _can_bag(test_dir):
    """Scan the provided directory for files which cannot be bagged due to insufficient permissions"""
    unbaggable = []

    if not os.access(test_dir, os.R_OK):
        # We cannot continue without permission to read the source directory
        unbaggable.append(test_dir)
        return unbaggable

    if not os.access(test_dir, os.W_OK):
        unbaggable.append(test_dir)

    for dirpath, dirnames, filenames in os.walk(test_dir):
        for directory in dirnames:
            full_path = os.path.join(dirpath, directory)
            if not os.access(full_path, os.W_OK):
                unbaggable.append(full_path)

    return unbaggable


def _can_read(test_dir):
    """
    returns ((unreadable_dirs), (unreadable_files))
    """
    unreadable_dirs = []
    unreadable_files = []

    if not os.access(test_dir, os.R_OK):
        unreadable_dirs.append(test_dir)
    else:
        for dirpath, dirnames, filenames in os.walk(test_dir):
            for dn in dirnames:
                full_path = os.path.join(dirpath, dn)
                if not os.access(full_path, os.R_OK):
                    unreadable_dirs.append(full_path)
            for fn in filenames:
                full_path = os.path.join(dirpath, fn)
                if not os.access(full_path, os.R_OK):
                    unreadable_files.append(full_path)
    return (tuple(unreadable_dirs), tuple(unreadable_files))


def generate_manifest_lines(filename, algorithms=DEFAULT_CHECKSUMS):
    LOGGER.info(_("Generating manifest lines for file %s"), filename)

    # For performance we'll read the file only once and pass it block
    # by block to every requested hash algorithm:
    hashers = get_hashers(algorithms)

    total_bytes = 0

    with open(filename, "rb") as f:
        while True:
            block = f.read(HASH_BLOCK_SIZE)

            if not block:
                break

            total_bytes += len(block)
            for hasher in hashers.values():
                hasher.update(block)

    decoded_filename = _decode_filename(filename)

    # We'll generate a list of results in roughly manifest format but prefixed with the algorithm:
    results = [
        (alg, hasher.hexdigest(), decoded_filename, total_bytes)
        for alg, hasher in hashers.items()
    ]

    return results


def _encode_filename(s):
    s = s.replace("\r", "%0D")
    s = s.replace("\n", "%0A")
    return s


def _decode_filename(s):
    s = re.sub(r"%0D", "\r", s, flags=re.IGNORECASE)
    s = re.sub(r"%0A", "\n", s, flags=re.IGNORECASE)
    return s

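# Round-trip performed by the two helpers above, keeping filenames with
# embedded CR/LF on a single manifest line:
#
#     _encode_filename("data/odd\nname")    # -> "data/odd%0Aname"
#     _decode_filename("data/odd%0Aname")   # -> "data/odd\nname"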

def force_unicode_py2(s):
    """Reliably return a Unicode string given a possible unicode or byte string"""
    if isinstance(s, str):
        return s.decode("utf-8")
    else:
        return unicode(s)


if sys.version_info > (3, 0):
    force_unicode = str
else:
    force_unicode = force_unicode_py2

# following code is used for command line program


class BagArgumentParser(argparse.ArgumentParser):
    def __init__(self, *args, **kwargs):
        self.bag_info = {}
        argparse.ArgumentParser.__init__(self, *args, **kwargs)


class BagHeaderAction(argparse.Action):
    def __call__(self, parser, _, values, option_string=None):
        opt = option_string.lstrip("--")
        opt_caps = "-".join([o.capitalize() for o in opt.split("-")])
        parser.bag_info[opt_caps] = values

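# Example of the flag-to-header mapping BagHeaderAction performs:
#
#     --source-organization "Library of Congress"
#
# stores parser.bag_info["Source-Organization"] = "Library of Congress",
# which make_bag() then writes into bag-info.txt.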

def _make_parser():
    parser = BagArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="bagit-python version %s\n\n%s\n" % (VERSION, __doc__.strip()),
    )
    parser.add_argument(
        "--processes",
        type=int,
        dest="processes",
        default=1,
        help=_(
            "Use multiple processes to calculate checksums faster (default: %(default)s)"
        ),
    )
    parser.add_argument("--log", help=_("The name of the log file (default: stdout)"))
    parser.add_argument(
        "--quiet",
        action="store_true",
        help=_("Suppress all progress information other than errors"),
    )
    parser.add_argument(
        "--validate",
        action="store_true",
        help=_(
            "Validate existing bags in the provided directories instead of"
            " creating new ones"
        ),
    )
    parser.add_argument(
        "--fast",
        action="store_true",
        help=_(
            "Modify --validate behaviour to only test whether the bag directory"
            " has the number of files and total size specified in Payload-Oxum"
            " without performing checksum validation to detect corruption."
        ),
    )
    parser.add_argument(
        "--completeness-only",
        action="store_true",
        help=_(
            "Modify --validate behaviour to test whether the bag directory"
            " has the expected payload specified in the checksum manifests"
            " without performing checksum validation to detect corruption."
        ),
    )

    checksum_args = parser.add_argument_group(
        _("Checksum Algorithms"),
        _(
            "Select the manifest algorithms to be used when creating bags"
            " (default=%s)"
        )
        % ", ".join(DEFAULT_CHECKSUMS),
    )

    for i in CHECKSUM_ALGOS:
        alg_name = re.sub(r"^([A-Z]+)(\d+)$", r"\1-\2", i.upper())
        checksum_args.add_argument(
            "--%s" % i,
            action="append_const",
            dest="checksums",
            const=i,
            help=_("Generate %s manifest when creating a bag") % alg_name,
        )

    metadata_args = parser.add_argument_group(_("Optional Bag Metadata"))
    for header in STANDARD_BAG_INFO_HEADERS:
        metadata_args.add_argument(
            "--%s" % header.lower(), type=str, action=BagHeaderAction
        )

    parser.add_argument(
        "directory",
        nargs="+",
        help=_(
            "Directory which will be converted into a bag in place"
            " by moving any existing files into the BagIt structure"
            " and creating the manifests and other metadata."
        ),
    )

    return parser


def _configure_logging(opts):
    log_format = "%(asctime)s - %(levelname)s - %(message)s"
    if opts.quiet:
        level = logging.ERROR
    else:
        level = logging.INFO
    if opts.log:
        logging.basicConfig(filename=opts.log, level=level, format=log_format)
    else:
        logging.basicConfig(level=level, format=log_format)


def main():
    if "--version" in sys.argv:
        print(_("bagit-python version %s") % VERSION)
        sys.exit(0)

    parser = _make_parser()
    args = parser.parse_args()

    if args.processes < 0:
        parser.error(_("The number of processes must be 0 or greater"))

    if args.fast and not args.validate:
        parser.error(_("--fast is only allowed as an option for --validate!"))

    _configure_logging(args)

    rc = 0
    for bag_dir in args.directory:
        # validate the bag
        if args.validate:
            try:
                bag = Bag(bag_dir)
                # validate throws a BagError or BagValidationError
                bag.validate(
                    processes=args.processes,
                    fast=args.fast,
                    completeness_only=args.completeness_only,
                )
                if args.fast:
                    LOGGER.info(_("%s valid according to Payload-Oxum"), bag_dir)
                else:
                    LOGGER.info(_("%s is valid"), bag_dir)
            except BagError as e:
                LOGGER.error(
                    _("%(bag)s is invalid: %(error)s"), {"bag": bag_dir, "error": e}
                )
                rc = 1

        # make the bag
        else:
            try:
                make_bag(
                    bag_dir,
                    bag_info=parser.bag_info,
                    processes=args.processes,
                    checksums=args.checksums,
                )
            except Exception as exc:
                LOGGER.error(
                    _("Failed to create bag in %(bag_directory)s: %(error)s"),
                    {"bag_directory": bag_dir, "error": exc},
                    exc_info=True,
                )
                rc = 1

    sys.exit(rc)


if __name__ == "__main__":
    main()