comparison: env/lib/python3.9/site-packages/bagit.py @ 0:4f3585e2f14b (draft, default, tip)
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
| field | value |
|---|---|
| author | shellac |
| date | Mon, 22 Mar 2021 18:12:50 +0000 |
| parents | |
| children | |
| -1:000000000000 | 0:4f3585e2f14b |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 # -*- coding: utf-8 -*- | |
| 3 | |
| 4 from __future__ import absolute_import, division, print_function, unicode_literals | |
| 5 | |
| 6 import argparse | |
| 7 import codecs | |
| 8 import gettext | |
| 9 import hashlib | |
| 10 import logging | |
| 11 import multiprocessing | |
| 12 import os | |
| 13 import re | |
| 14 import signal | |
| 15 import sys | |
| 16 import tempfile | |
| 17 import unicodedata | |
| 18 import warnings | |
| 19 from collections import defaultdict | |
| 20 from datetime import date | |
| 21 from functools import partial | |
| 22 from os.path import abspath, isdir, isfile, join | |
| 23 | |
| 24 from pkg_resources import DistributionNotFound, get_distribution | |
| 25 | |
| 26 try: | |
| 27 from urllib.parse import urlparse | |
| 28 except ImportError: | |
| 29 from urlparse import urlparse | |
| 30 | |
| 31 | |
| 32 def find_locale_dir(): | |
| 33 for prefix in (os.path.dirname(__file__), sys.prefix): | |
| 34 locale_dir = os.path.join(prefix, "locale") | |
| 35 if os.path.isdir(locale_dir): | |
| 36 return locale_dir | |
| 37 | |
| 38 | |
| 39 TRANSLATION_CATALOG = gettext.translation( | |
| 40 "bagit-python", localedir=find_locale_dir(), fallback=True | |
| 41 ) | |
| 42 if sys.version_info < (3,): | |
| 43 _ = TRANSLATION_CATALOG.ugettext | |
| 44 else: | |
| 45 _ = TRANSLATION_CATALOG.gettext | |
| 46 | |
| 47 MODULE_NAME = "bagit" if __name__ == "__main__" else __name__ | |
| 48 | |
| 49 LOGGER = logging.getLogger(MODULE_NAME) | |
| 50 | |
| 51 try: | |
| 52 VERSION = get_distribution(MODULE_NAME).version | |
| 53 except DistributionNotFound: | |
| 54 VERSION = "0.0.dev0" | |
| 55 | |
| 56 PROJECT_URL = "https://github.com/LibraryOfCongress/bagit-python" | |
| 57 | |
| 58 __doc__ = ( | |
| 59 _( | |
| 60 """ | |
| 61 BagIt is a directory and filename convention for bundling an arbitrary set of | |
| 62 files with a manifest, checksums, and additional metadata. More about BagIt | |
| 63 can be found at: | |
| 64 | |
| 65 http://purl.org/net/bagit | |
| 66 | |
| 67 bagit.py is a pure Python drop-in library and command-line tool for creating | |
| 68 and working with BagIt directories. | |
| 69 | |
| 70 | |
| 71 Command-Line Usage: | |
| 72 | |
| 73 Basic usage is to give bagit.py a directory to bag up: | |
| 74 | |
| 75 $ bagit.py my_directory | |
| 76 | |
| 77 This does a bag-in-place operation where the current contents will be moved | |
| 78 into the appropriate BagIt structure and the metadata files will be created. | |
| 79 | |
| 80 You can bag multiple directories if you wish: | |
| 81 | |
| 82 $ bagit.py directory1 directory2 | |
| 83 | |
| 84 Optionally you can provide metadata which will be stored in bag-info.txt: | |
| 85 | |
| 86 $ bagit.py --source-organization "Library of Congress" directory | |
| 87 | |
| 88 You can also select which manifest algorithms will be used: | |
| 89 | |
| 90 $ bagit.py --sha1 --md5 --sha256 --sha512 directory | |
| 91 | |
| 92 | |
| 93 Using BagIt from your Python code: | |
| 94 | |
| 95 import bagit | |
| 96 bag = bagit.make_bag('example-directory', {'Contact-Name': 'Ed Summers'}) | |
| 97 print(bag.entries) | |
| 98 | |
| 99 For more information or to contribute to bagit-python's development, please | |
| 100 visit %(PROJECT_URL)s | |
| 101 """ | |
| 102 ) | |
| 103 % globals() | |
| 104 ) | |
| 105 | |
| 106 # standard bag-info.txt metadata | |
| 107 STANDARD_BAG_INFO_HEADERS = [ | |
| 108 "Source-Organization", | |
| 109 "Organization-Address", | |
| 110 "Contact-Name", | |
| 111 "Contact-Phone", | |
| 112 "Contact-Email", | |
| 113 "External-Description", | |
| 114 "External-Identifier", | |
| 115 "Bag-Size", | |
| 116 "Bag-Group-Identifier", | |
| 117 "Bag-Count", | |
| 118 "Internal-Sender-Identifier", | |
| 119 "Internal-Sender-Description", | |
| 120 "BagIt-Profile-Identifier", | |
| 121 # Bagging-Date is autogenerated | |
| 122 # Payload-Oxum is autogenerated | |
| 123 ] | |
| 124 | |
| 125 try: | |
| 126 CHECKSUM_ALGOS = hashlib.algorithms_guaranteed | |
| 127 except AttributeError: | |
| 128 # FIXME: remove when we drop Python 2 (https://github.com/LibraryOfCongress/bagit-python/issues/102) | |
| 129 # Python 2.7.0-2.7.8 | |
| 130 CHECKSUM_ALGOS = set(hashlib.algorithms) | |
| 131 DEFAULT_CHECKSUMS = ["sha256", "sha512"] | |
| 132 | |
| 133 #: Block size used when reading files for hashing: | |
| 134 HASH_BLOCK_SIZE = 512 * 1024 | |
| 135 | |
| 136 #: Convenience function used everywhere we want to open a file to read text | |
| 137 #: rather than undecoded bytes: | |
| 138 open_text_file = partial(codecs.open, encoding="utf-8", errors="strict") | |
| 139 | |
| 140 # This is the same as decoding the byte values in codecs.BOM: | |
| 141 UNICODE_BYTE_ORDER_MARK = "\uFEFF" | |
| 142 | |
| 143 | |
| 144 def make_bag( | |
| 145 bag_dir, bag_info=None, processes=1, checksums=None, checksum=None, encoding="utf-8" | |
| 146 ): | |
| 147 """ | |
| 148 Convert a given directory into a bag. You can pass in arbitrary | |
| 149 key/value pairs to put into the bag-info.txt metadata file as | |
| 150 the bag_info dictionary. | |
| 151 """ | |
| 152 | |
| 153 if checksum is not None: | |
| 154 warnings.warn( | |
| 155 _( | |
| 156 "The `checksum` argument for `make_bag` should be replaced with `checksums`" | |
| 157 ), | |
| 158 DeprecationWarning, | |
| 159 ) | |
| 160 checksums = checksum | |
| 161 | |
| 162 if checksums is None: | |
| 163 checksums = DEFAULT_CHECKSUMS | |
| 164 | |
| 165 bag_dir = os.path.abspath(bag_dir) | |
| 166 cwd = os.path.abspath(os.path.curdir) | |
| 167 | |
| 168 if cwd.startswith(bag_dir) and cwd != bag_dir: | |
| 169 raise RuntimeError( | |
| 170 _("Bagging a parent of the current directory is not supported") | |
| 171 ) | |
| 172 | |
| 173 LOGGER.info(_("Creating bag for directory %s"), bag_dir) | |
| 174 | |
| 175 if not os.path.isdir(bag_dir): | |
| 176 LOGGER.error(_("Bag directory %s does not exist"), bag_dir) | |
| 177 raise RuntimeError(_("Bag directory %s does not exist") % bag_dir) | |
| 178 | |
| 179 # FIXME: we should do the permissions checks before changing directories | |
| 180 old_dir = os.path.abspath(os.path.curdir) | |
| 181 | |
| 182 try: | |
| 183 # TODO: These two checks are currently redundant since an unreadable directory will also | |
| 184 # often be unwritable, and this code will require review when we add the option to | |
| 185 # bag to a destination other than the source. It would be nice if we could avoid | |
| 186 # walking the directory tree more than once even if most filesystems will cache it | |
| 187 | |
| 188 unbaggable = _can_bag(bag_dir) | |
| 189 | |
| 190 if unbaggable: | |
| 191 LOGGER.error( | |
| 192 _("Unable to write to the following directories and files:\n%s"), | |
| 193 unbaggable, | |
| 194 ) | |
| 195 raise BagError(_("Missing permissions to move all files and directories")) | |
| 196 | |
| 197 unreadable_dirs, unreadable_files = _can_read(bag_dir) | |
| 198 | |
| 199 if unreadable_dirs or unreadable_files: | |
| 200 if unreadable_dirs: | |
| 201 LOGGER.error( | |
| 202 _("The following directories do not have read permissions:\n%s"), | |
| 203 unreadable_dirs, | |
| 204 ) | |
| 205 if unreadable_files: | |
| 206 LOGGER.error( | |
| 207 _("The following files do not have read permissions:\n%s"), | |
| 208 unreadable_files, | |
| 209 ) | |
| 210 raise BagError( | |
| 211 _("Read permissions are required to calculate file fixities") | |
| 212 ) | |
| 213 else: | |
| 214 LOGGER.info(_("Creating data directory")) | |
| 215 | |
| 216 # FIXME: if we calculate full paths we won't need to deal with changing directories | |
| 217 os.chdir(bag_dir) | |
| 218 cwd = os.getcwd() | |
| 219 temp_data = tempfile.mkdtemp(dir=cwd) | |
| 220 | |
| 221 for f in os.listdir("."): | |
| 222 if os.path.abspath(f) == temp_data: | |
| 223 continue | |
| 224 new_f = os.path.join(temp_data, f) | |
| 225 LOGGER.info( | |
| 226 _("Moving %(source)s to %(destination)s"), | |
| 227 {"source": f, "destination": new_f}, | |
| 228 ) | |
| 229 os.rename(f, new_f) | |
| 230 | |
| 231 LOGGER.info( | |
| 232 _("Moving %(source)s to %(destination)s"), | |
| 233 {"source": temp_data, "destination": "data"}, | |
| 234 ) | |
| 235 os.rename(temp_data, "data") | |
| 236 | |
| 237 # permissions for the payload directory should match those of the | |
| 238 # original directory | |
| 239 os.chmod("data", os.stat(cwd).st_mode) | |
| 240 | |
| 241 total_bytes, total_files = make_manifests( | |
| 242 "data", processes, algorithms=checksums, encoding=encoding | |
| 243 ) | |
| 244 | |
| 245 LOGGER.info(_("Creating bagit.txt")) | |
| 246 txt = """BagIt-Version: 0.97\nTag-File-Character-Encoding: UTF-8\n""" | |
| 247 with open_text_file("bagit.txt", "w") as bagit_file: | |
| 248 bagit_file.write(txt) | |
| 249 | |
| 250 LOGGER.info(_("Creating bag-info.txt")) | |
| 251 if bag_info is None: | |
| 252 bag_info = {} | |
| 253 | |
| 254 # allow 'Bagging-Date' and 'Bag-Software-Agent' to be overridden | |
| 255 if "Bagging-Date" not in bag_info: | |
| 256 bag_info["Bagging-Date"] = date.strftime(date.today(), "%Y-%m-%d") | |
| 257 if "Bag-Software-Agent" not in bag_info: | |
| 258 bag_info["Bag-Software-Agent"] = "bagit.py v%s <%s>" % ( | |
| 259 VERSION, | |
| 260 PROJECT_URL, | |
| 261 ) | |
| 262 | |
| 263 bag_info["Payload-Oxum"] = "%s.%s" % (total_bytes, total_files) | |
| 264 _make_tag_file("bag-info.txt", bag_info) | |
| 265 | |
| 266 for c in checksums: | |
| 267 _make_tagmanifest_file(c, bag_dir, encoding="utf-8") | |
| 268 except Exception: | |
| 269 LOGGER.exception(_("An error occurred creating a bag in %s"), bag_dir) | |
| 270 raise | |
| 271 finally: | |
| 272 os.chdir(old_dir) | |
| 273 | |
| 274 return Bag(bag_dir) | |
| 275 | |
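A minimal usage sketch for `make_bag`, assuming this module is importable as `bagit`; the directory name, metadata key, and checksum list are illustrative:

```python
import bagit

# Bag a directory in place; sha256 and sha512 are also the defaults.
bag = bagit.make_bag(
    "my-directory",
    bag_info={"Contact-Name": "Ed Summers"},
    checksums=["sha256", "sha512"],
)
print(bag.entries)  # payload paths mapped to {algorithm: digest}
```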
| 276 | |
| 277 class Bag(object): | |
| 278 """A representation of a bag.""" | |
| 279 | |
| 280 valid_files = ["bagit.txt", "fetch.txt"] | |
| 281 valid_directories = ["data"] | |
| 282 | |
| 283 def __init__(self, path=None): | |
| 284 super(Bag, self).__init__() | |
| 285 self.tags = {} | |
| 286 self.info = {} | |
| 287 #: Dictionary of manifest entries and the checksum values for each | |
| 288 #: algorithm: | |
| 289 self.entries = {} | |
| 290 | |
| 291 # To reliably handle Unicode normalization differences, we maintain | |
| 292 # lookup dictionaries in both directions for the filenames read from | |
| 293 # the filesystem and the manifests so we can handle cases where the | |
| 294 # normalization form changed between the bag being created and read. | |
| 295 # See https://github.com/LibraryOfCongress/bagit-python/issues/51. | |
| 296 | |
| 297 #: maps Unicode-normalized values to the raw value from the filesystem | |
| 298 self.normalized_filesystem_names = {} | |
| 299 | |
| 300 #: maps Unicode-normalized values to the raw value in the manifest | |
| 301 self.normalized_manifest_names = {} | |
| 302 | |
| 303 self.algorithms = [] | |
| 304 self.tag_file_name = None | |
| 305 self.path = abspath(path) if path else None | |
| 306 if path: | |
| 307 # if path ends in a path separator, strip it off | |
| 308 if path[-1] == os.sep: | |
| 309 self.path = abspath(path[:-1]) | |
| 310 self._open() | |
| 311 | |
| 312 def __str__(self): | |
| 313 # FIXME: develop a more informative string representation for a Bag | |
| 314 return self.path | |
| 315 | |
| 316 @property | |
| 317 def algs(self): | |
| 318 warnings.warn(_("Use Bag.algorithms instead of Bag.algs"), DeprecationWarning) | |
| 319 return self.algorithms | |
| 320 | |
| 321 @property | |
| 322 def version(self): | |
| 323 warnings.warn( | |
| 324 _("Use the Bag.version_info tuple instead of Bag.version"), | |
| 325 DeprecationWarning, | |
| 326 ) | |
| 327 return self._version | |
| 328 | |
| 329 def _open(self): | |
| 330 # Open the bagit.txt file, and load any tags from it, including | |
| 331 # the required version and encoding. | |
| 332 bagit_file_path = os.path.join(self.path, "bagit.txt") | |
| 333 | |
| 334 if not isfile(bagit_file_path): | |
| 335 raise BagError(_("Expected bagit.txt does not exist: %s") % bagit_file_path) | |
| 336 | |
| 337 self.tags = tags = _load_tag_file(bagit_file_path) | |
| 338 | |
| 339 required_tags = ("BagIt-Version", "Tag-File-Character-Encoding") | |
| 340 missing_tags = [i for i in required_tags if i not in tags] | |
| 341 if missing_tags: | |
| 342 raise BagError( | |
| 343 _("Missing required tag in bagit.txt: %s") % ", ".join(missing_tags) | |
| 344 ) | |
| 345 | |
| 346 # To avoid breaking existing code we'll leave self.version as the string | |
| 347 # and parse it into a numeric version_info tuple. In version 2.0 we can | |
| 348 # break that. | |
| 349 | |
| 350 self._version = tags["BagIt-Version"] | |
| 351 | |
| 352 try: | |
| 353 self.version_info = tuple(int(i) for i in self._version.split(".", 1)) | |
| 354 except ValueError: | |
| 355 raise BagError( | |
| 356 _("Bag version numbers must be MAJOR.MINOR numbers, not %s") | |
| 357 % self._version | |
| 358 ) | |
| 359 | |
| 360 if (0, 93) <= self.version_info <= (0, 95): | |
| 361 self.tag_file_name = "package-info.txt" | |
| 362 elif (0, 96) <= self.version_info < (2,): | |
| 363 self.tag_file_name = "bag-info.txt" | |
| 364 else: | |
| 365 raise BagError(_("Unsupported bag version: %s") % self._version) | |
| 366 | |
| 367 self.encoding = tags["Tag-File-Character-Encoding"] | |
| 368 | |
| 369 try: | |
| 370 codecs.lookup(self.encoding) | |
| 371 except LookupError: | |
| 372 raise BagValidationError(_("Unsupported encoding: %s") % self.encoding) | |
| 373 | |
| 374 info_file_path = os.path.join(self.path, self.tag_file_name) | |
| 375 if os.path.exists(info_file_path): | |
| 376 self.info = _load_tag_file(info_file_path, encoding=self.encoding) | |
| 377 | |
| 378 self._load_manifests() | |
| 379 | |
| 380 def manifest_files(self): | |
| 381 for filename in ["manifest-%s.txt" % a for a in CHECKSUM_ALGOS]: | |
| 382 f = os.path.join(self.path, filename) | |
| 383 if isfile(f): | |
| 384 yield f | |
| 385 | |
| 386 def tagmanifest_files(self): | |
| 387 for filename in ["tagmanifest-%s.txt" % a for a in CHECKSUM_ALGOS]: | |
| 388 f = os.path.join(self.path, filename) | |
| 389 if isfile(f): | |
| 390 yield f | |
| 391 | |
| 392 def compare_manifests_with_fs(self): | |
| 393 """ | |
| 394 Compare the filenames in the manifests to the filenames present on the | |
| 395 local filesystem and return two lists of the files which are only | |
| 396 present in the manifests and the files which are only present on the | |
| 397 local filesystem, respectively. | |
| 398 """ | |
| 399 | |
| 400 # We compare the filenames after Unicode normalization so we can | |
| 401 # reliably detect normalization changes after bag creation: | |
| 402 files_on_fs = set(normalize_unicode(i) for i in self.payload_files()) | |
| 403 files_in_manifest = set( | |
| 404 normalize_unicode(i) for i in self.payload_entries().keys() | |
| 405 ) | |
| 406 | |
| 407 if self.version_info >= (0, 97): | |
| 408 files_in_manifest.update(self.missing_optional_tagfiles()) | |
| 409 | |
| 410 only_on_fs = list() | |
| 411 only_in_manifest = list() | |
| 412 | |
| 413 for i in files_on_fs.difference(files_in_manifest): | |
| 414 only_on_fs.append(self.normalized_filesystem_names[i]) | |
| 415 | |
| 416 for i in files_in_manifest.difference(files_on_fs): | |
| 417 only_in_manifest.append(self.normalized_manifest_names[i]) | |
| 418 | |
| 419 return only_in_manifest, only_on_fs | |
| 420 | |
| 421 def compare_fetch_with_fs(self): | |
| 422 """Compares the fetch entries with the files actually | |
| 423 in the payload, and returns a list of all the files | |
| 424 that still need to be fetched. | |
| 425 """ | |
| 426 | |
| 427 files_on_fs = set(self.payload_files()) | |
| 428 files_in_fetch = set(self.files_to_be_fetched()) | |
| 429 | |
| 430 return list(files_in_fetch - files_on_fs) | |
| 431 | |
| 432 def payload_files(self): | |
| 433 """Returns a list of filenames which are present on the local filesystem""" | |
| 434 payload_dir = os.path.join(self.path, "data") | |
| 435 | |
| 436 for dirpath, _, filenames in os.walk(payload_dir): | |
| 437 for f in filenames: | |
| 438 # Jump through some hoops here to make sure the payload files are | |
| 439 # returned with their directory structure relative to the bag's base | |
| 440 # directory rather than as absolute filesystem paths: | |
| 441 normalized_f = os.path.normpath(f) | |
| 442 rel_path = os.path.relpath( | |
| 443 os.path.join(dirpath, normalized_f), start=self.path | |
| 444 ) | |
| 445 | |
| 446 self.normalized_filesystem_names[normalize_unicode(rel_path)] = rel_path | |
| 447 yield rel_path | |
| 448 | |
| 449 def payload_entries(self): | |
| 450 """Return a dictionary of items """ | |
| 451 # Don't use dict comprehension (compatibility with Python < 2.7) | |
| 452 return dict( | |
| 453 (key, value) | |
| 454 for (key, value) in self.entries.items() | |
| 455 if key.startswith("data" + os.sep) | |
| 456 ) | |
| 457 | |
| 458 def save(self, processes=1, manifests=False): | |
| 459 """ | |
| 460 save will persist any changes that have been made to the bag | |
| 461 metadata (self.info). | |
| 462 | |
| 463 If you have modified the payload of the bag (added, modified, | |
| 464 removed files in the data directory) and want to regenerate manifests | |
| 465 set the manifests parameter to True. The default is False since you | |
| 466 wouldn't want a save to accidentally create a new manifest for | |
| 467 a corrupted bag. | |
| 468 | |
| 469 If you want to control the number of processes that are used when | |
| 470 recalculating checksums use the processes parameter. | |
| 471 """ | |
| 472 # Error checking | |
| 473 if not self.path: | |
| 474 raise BagError(_("Bag.save() called before setting the path!")) | |
| 475 | |
| 476 if not os.access(self.path, os.R_OK | os.W_OK | os.X_OK): | |
| 477 raise BagError( | |
| 478 _("Cannot save bag to non-existent or inaccessible directory %s") | |
| 479 % self.path | |
| 480 ) | |
| 481 | |
| 482 unbaggable = _can_bag(self.path) | |
| 483 if unbaggable: | |
| 484 LOGGER.error( | |
| 485 _( | |
| 486 "Missing write permissions for the following directories and files:\n%s" | |
| 487 ), | |
| 488 unbaggable, | |
| 489 ) | |
| 490 raise BagError(_("Missing permissions to move all files and directories")) | |
| 491 | |
| 492 unreadable_dirs, unreadable_files = _can_read(self.path) | |
| 493 if unreadable_dirs or unreadable_files: | |
| 494 if unreadable_dirs: | |
| 495 LOGGER.error( | |
| 496 _("The following directories do not have read permissions:\n%s"), | |
| 497 unreadable_dirs, | |
| 498 ) | |
| 499 if unreadable_files: | |
| 500 LOGGER.error( | |
| 501 _("The following files do not have read permissions:\n%s"), | |
| 502 unreadable_files, | |
| 503 ) | |
| 504 raise BagError( | |
| 505 _("Read permissions are required to calculate file fixities") | |
| 506 ) | |
| 507 | |
| 508 # Change working directory to bag directory so helper functions work | |
| 509 old_dir = os.path.abspath(os.path.curdir) | |
| 510 os.chdir(self.path) | |
| 511 | |
| 512 # Generate new manifest files | |
| 513 if manifests: | |
| 514 total_bytes, total_files = make_manifests( | |
| 515 "data", processes, algorithms=self.algorithms, encoding=self.encoding | |
| 516 ) | |
| 517 | |
| 518 # Update Payload-Oxum | |
| 519 LOGGER.info(_("Updating Payload-Oxum in %s"), self.tag_file_name) | |
| 520 self.info["Payload-Oxum"] = "%s.%s" % (total_bytes, total_files) | |
| 521 | |
| 522 _make_tag_file(self.tag_file_name, self.info) | |
| 523 | |
| 524 # Update tag-manifest for changes to manifest & bag-info files | |
| 525 for alg in self.algorithms: | |
| 526 _make_tagmanifest_file(alg, self.path, encoding=self.encoding) | |
| 527 | |
| 528 # Reload the manifests | |
| 529 self._load_manifests() | |
| 530 | |
| 531 os.chdir(old_dir) | |
| 532 | |
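A short sketch of the save workflow the docstring above describes; the bag path and metadata value are hypothetical:

```python
import bagit

bag = bagit.Bag("my-bag")  # an existing bag directory
bag.info["Internal-Sender-Identifier"] = "example-42"
bag.save()  # persists bag-info.txt changes only

# After adding or removing files under my-bag/data, regenerate manifests too:
bag.save(manifests=True, processes=4)
```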
| 533 def tagfile_entries(self): | |
| 534 return dict( | |
| 535 (key, value) | |
| 536 for (key, value) in self.entries.items() | |
| 537 if not key.startswith("data" + os.sep) | |
| 538 ) | |
| 539 | |
| 540 def missing_optional_tagfiles(self): | |
| 541 """ | |
| 542 From v0.97 we need to validate any tagfiles listed | |
| 543 in the optional tagmanifest(s). As there is no mandatory | |
| 544 directory structure for additional tagfiles we can | |
| 545 only check for entries with missing files (not missing | |
| 546 entries for existing files). | |
| 547 """ | |
| 548 for tagfilepath in self.tagfile_entries().keys(): | |
| 549 if not os.path.isfile(os.path.join(self.path, tagfilepath)): | |
| 550 yield tagfilepath | |
| 551 | |
| 552 def fetch_entries(self): | |
| 553 """Load fetch.txt if present and iterate over its contents | |
| 554 | |
| 555 yields (url, size, filename) tuples | |
| 556 | |
| 557 raises BagError for errors such as an unsafe filename referencing | |
| 558 data outside of the bag directory | |
| 559 """ | |
| 560 | |
| 561 fetch_file_path = os.path.join(self.path, "fetch.txt") | |
| 562 | |
| 563 if isfile(fetch_file_path): | |
| 564 with open_text_file( | |
| 565 fetch_file_path, "r", encoding=self.encoding | |
| 566 ) as fetch_file: | |
| 567 for line in fetch_file: | |
| 568 url, file_size, filename = line.strip().split(None, 2) | |
| 569 | |
| 570 if self._path_is_dangerous(filename): | |
| 571 raise BagError( | |
| 572 _('Path "%(payload_file)s" in "%(source_file)s" is unsafe') | |
| 573 % { | |
| 574 "payload_file": filename, | |
| 575 "source_file": os.path.join(self.path, "fetch.txt"), | |
| 576 } | |
| 577 ) | |
| 578 | |
| 579 yield url, file_size, filename | |
| 580 | |
| 581 def files_to_be_fetched(self): | |
| 582 """ | |
| 583 Convenience wrapper for fetch_entries which returns only the | |
| 584 local filename | |
| 585 """ | |
| 586 | |
| 587 for url, file_size, filename in self.fetch_entries(): | |
| 588 yield filename | |
| 589 | |
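For illustration, iterating over the fetch.txt entries of a hypothetical bag; note that bagit.py reports what remains to be fetched but does not download anything itself:

```python
import bagit

bag = bagit.Bag("my-bag")  # hypothetical bag containing a fetch.txt

for url, size, filename in bag.fetch_entries():
    print("fetch %s (%s bytes) -> %s" % (url, size, filename))

# Filenames listed in fetch.txt but not yet present under data/:
print(bag.compare_fetch_with_fs())
```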
| 590 def has_oxum(self): | |
| 591 return "Payload-Oxum" in self.info | |
| 592 | |
| 593 def validate(self, processes=1, fast=False, completeness_only=False): | |
| 594 """Checks the structure and contents are valid. | |
| 595 | |
| 596 If you supply the parameter fast=True the Payload-Oxum (if present) will | |
| 597 be used to check that the payload files are present and accounted for, | |
| 598 instead of re-calculating fixities and comparing them against the | |
| 599 manifest. By default validate() will re-calculate fixities (fast=False). | |
| 600 """ | |
| 601 | |
| 602 self._validate_structure() | |
| 603 self._validate_bagittxt() | |
| 604 | |
| 605 self.validate_fetch() | |
| 606 | |
| 607 self._validate_contents( | |
| 608 processes=processes, fast=fast, completeness_only=completeness_only | |
| 609 ) | |
| 610 | |
| 611 return True | |
| 612 | |
| 613 def is_valid(self, fast=False, completeness_only=False): | |
| 614 """Returns validation success or failure as boolean. | |
| 615 Optional fast parameter passed directly to validate(). | |
| 616 """ | |
| 617 | |
| 618 try: | |
| 619 self.validate(fast=fast, completeness_only=completeness_only) | |
| 620 except BagError: | |
| 621 return False | |
| 622 | |
| 623 return True | |
| 624 | |
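A hedged sketch of the two validation entry points, assuming a bag at the hypothetical path `my-bag`; `BagValidationError.details` holds the individual error objects defined further below:

```python
import bagit

bag = bagit.Bag("my-bag")

if not bag.is_valid(fast=True):  # quick Payload-Oxum count/size check
    print("fast validation failed")

try:
    bag.validate(processes=2)  # full fixity validation
except bagit.BagValidationError as e:
    for detail in e.details:  # ChecksumMismatch, FileMissing, UnexpectedFile, ...
        print(detail)
```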
| 625 def _load_manifests(self): | |
| 626 self.entries = {} | |
| 627 manifests = list(self.manifest_files()) | |
| 628 | |
| 629 if self.version_info >= (0, 97): | |
| 630 # v0.97+ requires that optional tagfiles are verified. | |
| 631 manifests += list(self.tagmanifest_files()) | |
| 632 | |
| 633 for manifest_filename in manifests: | |
| 634 if manifest_filename.find("tagmanifest-") != -1: | |
| 635 search = "tagmanifest-" | |
| 636 else: | |
| 637 search = "manifest-" | |
| 638 alg = ( | |
| 639 os.path.basename(manifest_filename) | |
| 640 .replace(search, "") | |
| 641 .replace(".txt", "") | |
| 642 ) | |
| 643 if alg not in self.algorithms: | |
| 644 self.algorithms.append(alg) | |
| 645 | |
| 646 with open_text_file( | |
| 647 manifest_filename, "r", encoding=self.encoding | |
| 648 ) as manifest_file: | |
| 649 if manifest_file.encoding.startswith("UTF"): | |
| 650 # We'll check the first character to see if it's a BOM: | |
| 651 if manifest_file.read(1) == UNICODE_BYTE_ORDER_MARK: | |
| 652 # We'll skip it either way by letting line decoding | |
| 653 # happen at the new offset but we will issue a warning | |
| 654 # for UTF-8 since the presence of a BOM is contrary to | |
| 655 # the BagIt specification: | |
| 656 if manifest_file.encoding == "UTF-8": | |
| 657 LOGGER.warning( | |
| 658 _( | |
| 659 "%s is encoded using UTF-8 but contains an unnecessary" | |
| 660 " byte-order mark, which is not in compliance with the" | |
| 661 " BagIt RFC" | |
| 662 ), | |
| 663 manifest_file.name, | |
| 664 ) | |
| 665 else: | |
| 666 manifest_file.seek(0) # Pretend the first read never happened | |
| 667 | |
| 668 for line in manifest_file: | |
| 669 line = line.strip() | |
| 670 | |
| 671 # Ignore blank lines and comments. | |
| 672 if line == "" or line.startswith("#"): | |
| 673 continue | |
| 674 | |
| 675 entry = line.split(None, 1) | |
| 676 | |
| 677 # Format is: CHECKSUM [*]FILENAME | |
| 678 if len(entry) != 2: | |
| 679 LOGGER.error( | |
| 680 _( | |
| 681 "%(bag)s: Invalid %(algorithm)s manifest entry: %(line)s" | |
| 682 ), | |
| 683 {"bag": self, "algorithm": alg, "line": line}, | |
| 684 ) | |
| 685 continue | |
| 686 | |
| 687 entry_hash = entry[0] | |
| 688 entry_path = os.path.normpath(entry[1].lstrip("*")) | |
| 689 entry_path = _decode_filename(entry_path) | |
| 690 | |
| 691 if self._path_is_dangerous(entry_path): | |
| 692 raise BagError( | |
| 693 _( | |
| 694 'Path "%(payload_file)s" in manifest "%(manifest_file)s" is unsafe' | |
| 695 ) | |
| 696 % { | |
| 697 "payload_file": entry_path, | |
| 698 "manifest_file": manifest_file.name, | |
| 699 } | |
| 700 ) | |
| 701 | |
| 702 entry_hashes = self.entries.setdefault(entry_path, {}) | |
| 703 | |
| 704 if alg in entry_hashes: | |
| 705 warning_ctx = { | |
| 706 "bag": self, | |
| 707 "algorithm": alg, | |
| 708 "filename": entry_path, | |
| 709 } | |
| 710 if entry_hashes[alg] == entry_hash: | |
| 711 msg = _( | |
| 712 "%(bag)s: %(algorithm)s manifest lists %(filename)s" | |
| 713 " multiple times with the same value" | |
| 714 ) | |
| 715 if self.version_info >= (1,): | |
| 716 raise BagError(msg % warning_ctx) | |
| 717 else: | |
| 718 LOGGER.warning(msg, warning_ctx) | |
| 719 else: | |
| 720 raise BagError( | |
| 721 _( | |
| 722 "%(bag)s: %(algorithm)s manifest lists %(filename)s" | |
| 723 " multiple times with conflicting values" | |
| 724 ) | |
| 725 % warning_ctx | |
| 726 ) | |
| 727 | |
| 728 entry_hashes[alg] = entry_hash | |
| 729 | |
| 730 self.normalized_manifest_names.update( | |
| 731 (normalize_unicode(i), i) for i in self.entries.keys() | |
| 732 ) | |
| 733 | |
| 734 def _validate_structure(self): | |
| 735 """ | |
| 736 Checks the structure of the bag to determine whether it conforms to the | |
| 737 BagIt spec. Returns true on success, otherwise it will raise a | |
| 738 BagValidationError exception. | |
| 739 """ | |
| 740 | |
| 741 self._validate_structure_payload_directory() | |
| 742 self._validate_structure_tag_files() | |
| 743 | |
| 744 def _validate_structure_payload_directory(self): | |
| 745 data_dir_path = os.path.join(self.path, "data") | |
| 746 | |
| 747 if not isdir(data_dir_path): | |
| 748 raise BagValidationError( | |
| 749 _("Expected data directory %s does not exist") % data_dir_path | |
| 750 ) | |
| 751 | |
| 752 def _validate_structure_tag_files(self): | |
| 753 # Note: we deviate somewhat from v0.96 of the spec in that it allows | |
| 754 # other files and directories to be present in the base directory | |
| 755 | |
| 756 if not list(self.manifest_files()): | |
| 757 raise BagValidationError(_("No manifest files found")) | |
| 758 if "bagit.txt" not in os.listdir(self.path): | |
| 759 raise BagValidationError( | |
| 760 _('Expected %s to contain "bagit.txt"') % self.path | |
| 761 ) | |
| 762 | |
| 763 def validate_fetch(self): | |
| 764 """Validate the fetch.txt file | |
| 765 | |
| 766 Raises `BagError` for errors and otherwise returns no value | |
| 767 """ | |
| 768 | |
| 769 for url, file_size, filename in self.fetch_entries(): | |
| 770 # fetch_entries will raise a BagError for unsafe filenames | |
| 771 # so at this point we will check only that the URL is minimally | |
| 772 # well formed: | |
| 773 parsed_url = urlparse(url) | |
| 774 | |
| 775 if not all((parsed_url.scheme, parsed_url.netloc)): | |
| 776 raise BagError(_("Malformed URL in fetch.txt: %s") % url) | |
| 777 | |
| 778 def _validate_contents(self, processes=1, fast=False, completeness_only=False): | |
| 779 if fast and not self.has_oxum(): | |
| 780 raise BagValidationError( | |
| 781 _("Fast validation requires bag-info.txt to include Payload-Oxum") | |
| 782 ) | |
| 783 | |
| 784 # Perform the fast file count + size check so we can fail early: | |
| 785 self._validate_oxum() | |
| 786 | |
| 787 if fast: | |
| 788 return | |
| 789 | |
| 790 self._validate_completeness() | |
| 791 | |
| 792 if completeness_only: | |
| 793 return | |
| 794 | |
| 795 self._validate_entries(processes) | |
| 796 | |
| 797 def _validate_oxum(self): | |
| 798 oxum = self.info.get("Payload-Oxum") | |
| 799 | |
| 800 if oxum is None: | |
| 801 return | |
| 802 | |
| 803 # If there are multiple Payload-Oxum tags (a bad idea), | |
| 804 # use the first one listed in bag-info.txt | |
| 805 if isinstance(oxum, list): | |
| 806 LOGGER.warning(_("bag-info.txt defines multiple Payload-Oxum values!")) | |
| 807 oxum = oxum[0] | |
| 808 | |
| 809 oxum_byte_count, oxum_file_count = oxum.split(".", 1) | |
| 810 | |
| 811 if not oxum_byte_count.isdigit() or not oxum_file_count.isdigit(): | |
| 812 raise BagError(_("Malformed Payload-Oxum value: %s") % oxum) | |
| 813 | |
| 814 oxum_byte_count = int(oxum_byte_count) | |
| 815 oxum_file_count = int(oxum_file_count) | |
| 816 total_bytes = 0 | |
| 817 total_files = 0 | |
| 818 | |
| 819 for payload_file in self.payload_files(): | |
| 820 payload_file = os.path.join(self.path, payload_file) | |
| 821 total_bytes += os.stat(payload_file).st_size | |
| 822 total_files += 1 | |
| 823 | |
| 824 if oxum_file_count != total_files or oxum_byte_count != total_bytes: | |
| 825 raise BagValidationError( | |
| 826 _( | |
| 827 "Payload-Oxum validation failed." | |
| 828 " Expected %(oxum_file_count)d files and %(oxum_byte_count)d bytes" | |
| 829 " but found %(found_file_count)d files and %(found_byte_count)d bytes" | |
| 830 ) | |
| 831 % { | |
| 832 "found_file_count": total_files, | |
| 833 "found_byte_count": total_bytes, | |
| 834 "oxum_file_count": oxum_file_count, | |
| 835 "oxum_byte_count": oxum_byte_count, | |
| 836 } | |
| 837 ) | |
| 838 | |
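The Payload-Oxum is simply `<total bytes>.<file count>` computed over the payload, which is what the method above re-derives. A standard-library-only sketch of the same calculation, handy for cross-checking a bag-info.txt value:

```python
import os

def payload_oxum(bag_dir):
    """Compute "<total bytes>.<file count>" over bag_dir/data."""
    total_bytes = total_files = 0
    for dirpath, _dirnames, filenames in os.walk(os.path.join(bag_dir, "data")):
        for name in filenames:
            total_bytes += os.stat(os.path.join(dirpath, name)).st_size
            total_files += 1
    return "%d.%d" % (total_bytes, total_files)
```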
| 839 def _validate_completeness(self): | |
| 840 """ | |
| 841 Verify that the actual file manifests match the files in the data directory | |
| 842 """ | |
| 843 errors = list() | |
| 844 | |
| 845 # First we'll make sure there's no mismatch between the filesystem | |
| 846 # and the list of files in the manifest(s) | |
| 847 only_in_manifests, only_on_fs = self.compare_manifests_with_fs() | |
| 848 for path in only_in_manifests: | |
| 849 e = FileMissing(path) | |
| 850 LOGGER.warning(force_unicode(e)) | |
| 851 errors.append(e) | |
| 852 for path in only_on_fs: | |
| 853 e = UnexpectedFile(path) | |
| 854 LOGGER.warning(force_unicode(e)) | |
| 855 errors.append(e) | |
| 856 | |
| 857 if errors: | |
| 858 raise BagValidationError(_("Bag validation failed"), errors) | |
| 859 | |
| 860 def _validate_entries(self, processes): | |
| 861 """ | |
| 862 Verify that the actual file contents match the recorded hashes stored in the manifest files | |
| 863 """ | |
| 864 errors = list() | |
| 865 | |
| 866 if os.name == "posix": | |
| 867 worker_init = posix_multiprocessing_worker_initializer | |
| 868 else: | |
| 869 worker_init = None | |
| 870 | |
| 871 args = ( | |
| 872 ( | |
| 873 self.path, | |
| 874 self.normalized_filesystem_names.get(rel_path, rel_path), | |
| 875 hashes, | |
| 876 self.algorithms, | |
| 877 ) | |
| 878 for rel_path, hashes in self.entries.items() | |
| 879 ) | |
| 880 | |
| 881 try: | |
| 882 if processes == 1: | |
| 883 hash_results = [_calc_hashes(i) for i in args] | |
| 884 else: | |
| 885 pool = multiprocessing.Pool( | |
| 886 processes if processes else None, initializer=worker_init | |
| 887 ) | |
| 888 try: | |
| 889 hash_results = pool.map(_calc_hashes, args) | |
| 890 finally: | |
| 891 pool.terminate() | |
| 892 | |
| 893 # Any unhandled exceptions are probably fatal | |
| 894 except: | |
| 895 LOGGER.exception(_("Unable to calculate file hashes for %s"), self) | |
| 896 raise | |
| 897 | |
| 898 for rel_path, f_hashes, hashes in hash_results: | |
| 899 for alg, computed_hash in f_hashes.items(): | |
| 900 stored_hash = hashes[alg] | |
| 901 if stored_hash.lower() != computed_hash: | |
| 902 e = ChecksumMismatch( | |
| 903 rel_path, alg, stored_hash.lower(), computed_hash | |
| 904 ) | |
| 905 LOGGER.warning(force_unicode(e)) | |
| 906 errors.append(e) | |
| 907 | |
| 908 if errors: | |
| 909 raise BagValidationError(_("Bag validation failed"), errors) | |
| 910 | |
| 911 def _validate_bagittxt(self): | |
| 912 """ | |
| 913 Verify that bagit.txt conforms to specification | |
| 914 """ | |
| 915 bagit_file_path = os.path.join(self.path, "bagit.txt") | |
| 916 | |
| 917 # Note that we are intentionally opening this file in binary mode so we can confirm | |
| 918 # that it does not start with the UTF-8 byte-order-mark | |
| 919 with open(bagit_file_path, "rb") as bagit_file: | |
| 920 first_line = bagit_file.read(4) | |
| 921 if first_line.startswith(codecs.BOM_UTF8): | |
| 922 raise BagValidationError( | |
| 923 _("bagit.txt must not contain a byte-order mark") | |
| 924 ) | |
| 925 | |
| 926 def _path_is_dangerous(self, path): | |
| 927 """ | |
| 928 Return true if path looks dangerous, i.e. potentially operates | |
| 929 outside the bagging directory structure, e.g. ~/.bashrc, ../../../secrets.json, | |
| 930 \\?\c:\, D:\sys32\cmd.exe | |
| 931 """ | |
| 932 if os.path.isabs(path): | |
| 933 return True | |
| 934 if os.path.expanduser(path) != path: | |
| 935 return True | |
| 936 if os.path.expandvars(path) != path: | |
| 937 return True | |
| 938 real_path = os.path.realpath(os.path.join(self.path, path)) | |
| 939 real_path = os.path.normpath(real_path) | |
| 940 bag_path = os.path.realpath(self.path) | |
| 941 bag_path = os.path.normpath(bag_path) | |
| 942 # os.path.commonprefix() is character-based, so test the separator explicitly | |
| 943 return not (real_path == bag_path or real_path.startswith(bag_path + os.sep)) | |
| 944 | |
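Illustrative expectations for the safety checks above, assuming a POSIX system and a hypothetical bag location (symlinks inside the bag could change the realpath outcome):

```python
import bagit

bag = bagit.Bag("my-bag")  # hypothetical valid bag

assert bag._path_is_dangerous("/etc/passwd")            # absolute path
assert bag._path_is_dangerous("~/.bashrc")              # user expansion
assert bag._path_is_dangerous("../../../secrets.json")  # escapes the bag
assert not bag._path_is_dangerous("data/file.txt")      # stays inside
```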
| 945 | |
| 946 class BagError(Exception): | |
| 947 pass | |
| 948 | |
| 949 | |
| 950 class BagValidationError(BagError): | |
| 951 def __init__(self, message, details=None): | |
| 952 super(BagValidationError, self).__init__() | |
| 953 | |
| 954 if details is None: | |
| 955 details = [] | |
| 956 | |
| 957 self.message = message | |
| 958 self.details = details | |
| 959 | |
| 960 def __str__(self): | |
| 961 if len(self.details) > 0: | |
| 962 details = "; ".join([force_unicode(e) for e in self.details]) | |
| 963 return "%s: %s" % (self.message, details) | |
| 964 return self.message | |
| 965 | |
| 966 | |
| 967 class ManifestErrorDetail(BagError): | |
| 968 def __init__(self, path): | |
| 969 super(ManifestErrorDetail, self).__init__() | |
| 970 | |
| 971 self.path = path | |
| 972 | |
| 973 | |
| 974 class ChecksumMismatch(ManifestErrorDetail): | |
| 975 def __init__(self, path, algorithm=None, expected=None, found=None): | |
| 976 super(ChecksumMismatch, self).__init__(path) | |
| 977 | |
| 978 self.path = path | |
| 979 self.algorithm = algorithm | |
| 980 self.expected = expected | |
| 981 self.found = found | |
| 982 | |
| 983 def __str__(self): | |
| 984 return _( | |
| 985 '%(path)s %(algorithm)s validation failed: expected="%(expected)s" found="%(found)s"' | |
| 986 ) % { | |
| 987 "path": force_unicode(self.path), | |
| 988 "algorithm": self.algorithm, | |
| 989 "expected": self.expected, | |
| 990 "found": self.found, | |
| 991 } | |
| 992 | |
| 993 | |
| 994 class FileMissing(ManifestErrorDetail): | |
| 995 def __str__(self): | |
| 996 return _( | |
| 997 "%s exists in manifest but was not found on filesystem" | |
| 998 ) % force_unicode(self.path) | |
| 999 | |
| 1000 | |
| 1001 class UnexpectedFile(ManifestErrorDetail): | |
| 1002 def __str__(self): | |
| 1003 return _("%s exists on filesystem but is not in the manifest") % self.path | |
| 1004 | |
| 1005 | |
| 1006 class FileNormalizationConflict(BagError): | |
| 1007 """ | |
| 1008 Exception raised when two files differ only in normalization and thus | |
| 1009 are not safely portable | |
| 1010 """ | |
| 1011 | |
| 1012 def __init__(self, file_a, file_b): | |
| 1013 super(FileNormalizationConflict, self).__init__() | |
| 1014 | |
| 1015 self.file_a = file_a | |
| 1016 self.file_b = file_b | |
| 1017 | |
| 1018 def __str__(self): | |
| 1019 return _( | |
| 1020 'Unicode normalization conflict for file "%(file_a)s" and "%(file_b)s"' | |
| 1021 ) % {"file_a": self.file_a, "file_b": self.file_b} | |
| 1022 | |
| 1023 | |
| 1024 def posix_multiprocessing_worker_initializer(): | |
| 1025 """Ignore SIGINT in multiprocessing workers on POSIX systems""" | |
| 1026 signal.signal(signal.SIGINT, signal.SIG_IGN) | |
| 1027 | |
| 1028 | |
| 1029 # The Unicode normalization form used here doesn't matter – all we care about | |
| 1030 # is consistency since the input value will be preserved: | |
| 1031 | |
| 1032 | |
| 1033 def normalize_unicode_py3(s): | |
| 1034 return unicodedata.normalize("NFC", s) | |
| 1035 | |
| 1036 | |
| 1037 def normalize_unicode_py2(s): | |
| 1038 if isinstance(s, str): | |
| 1039 s = s.decode("utf-8") | |
| 1040 return unicodedata.normalize("NFC", s) | |
| 1041 | |
| 1042 | |
| 1043 if sys.version_info > (3, 0): | |
| 1044 normalize_unicode = normalize_unicode_py3 | |
| 1045 else: | |
| 1046 normalize_unicode = normalize_unicode_py2 | |
| 1047 | |
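A quick illustration of the normalization issue: NFC and NFD spellings of the same visible filename differ byte-for-byte but compare equal once both are normalized to a single form:

```python
import unicodedata

nfc = "caf\u00e9"   # 'é' as one precomposed code point (NFC)
nfd = "cafe\u0301"  # 'e' followed by a combining acute accent (NFD)

assert nfc != nfd  # different code point sequences, same rendered text
assert unicodedata.normalize("NFC", nfc) == unicodedata.normalize("NFC", nfd)
```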
| 1048 | |
| 1049 def build_unicode_normalized_lookup_dict(filenames): | |
| 1050 """ | |
| 1051 Return a dictionary mapping unicode-normalized filenames to as-encoded | |
| 1052 values to efficiently detect conflicts between the filesystem and manifests. | |
| 1053 | |
| 1054 This is necessary because some filesystems and utilities may automatically | |
| 1055 apply a different Unicode normalization form to filenames than was applied | |
| 1056 when the bag was originally created. | |
| 1057 | |
| 1058 The best known example of this is when a bag is created using a | |
| 1059 normalization form other than NFD and then transferred to a Mac where the | |
| 1060 HFS+ filesystem will transparently normalize filenames to a variant of NFD | |
| 1061 for every call: | |
| 1062 | |
| 1063 https://developer.apple.com/legacy/library/technotes/tn/tn1150.html#UnicodeSubtleties | |
| 1064 | |
| 1065 Windows is documented as storing filenames exactly as provided: | |
| 1066 | |
| 1067 https://msdn.microsoft.com/en-us/library/windows/desktop/aa365247%28v=vs.85%29.aspx | |
| 1068 | |
| 1069 Linux performs no normalization in the kernel but it is technically | |
| 1070 valid for a filesystem to perform normalization, such as when an HFS+ | |
| 1071 volume is mounted. | |
| 1072 | |
| 1073 See http://www.unicode.org/reports/tr15/ for a full discussion of | |
| 1074 equivalence and normalization in Unicode. | |
| 1075 """ | |
| 1076 | |
| 1077 output = dict() | |
| 1078 | |
| 1079 for filename in filenames: | |
| 1080 normalized_filename = normalize_unicode(filename) | |
| 1081 if normalized_filename in output: | |
| 1082 raise FileNormalizationConflict(filename, output[normalized_filename]) | |
| 1083 else: | |
| 1084 output[normalized_filename] = filename | |
| 1085 | |
| 1086 return output | |
| 1087 | |
| 1088 | |
| 1089 def get_hashers(algorithms): | |
| 1090 """ | |
| 1091 Given a list of algorithm names, return a dictionary of hasher instances | |
| 1092 | |
| 1093 This avoids redundant code between the creation and validation code where in | |
| 1094 both cases we want to avoid reading the same file more than once. The | |
| 1095 intended use is a simple for loop: | |
| 1096 | |
| 1097 for block in file: | |
| 1098 for hasher in hashers.values(): | |
| 1099 hasher.update(block) | |
| 1100 """ | |
| 1101 | |
| 1102 hashers = {} | |
| 1103 | |
| 1104 for alg in algorithms: | |
| 1105 try: | |
| 1106 hasher = hashlib.new(alg) | |
| 1107 except ValueError: | |
| 1108 LOGGER.warning( | |
| 1109 _("Disabling requested hash algorithm %s: hashlib does not support it"), | |
| 1110 alg, | |
| 1111 ) | |
| 1112 continue | |
| 1113 | |
| 1114 hashers[alg] = hasher | |
| 1115 | |
| 1116 if not hashers: | |
| 1117 raise ValueError( | |
| 1118 _( | |
| 1119 "Unable to continue: hashlib does not support any of the requested algorithms!" | |
| 1120 ) | |
| 1121 ) | |
| 1122 | |
| 1123 return hashers | |
| 1124 | |
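A sketch of the single-read, multi-hash loop the docstring describes, assuming this module is importable as `bagit`; `some-file.bin` is a hypothetical input:

```python
import bagit

hashers = bagit.get_hashers(["sha256", "sha512"])

with open("some-file.bin", "rb") as f:
    while True:
        block = f.read(bagit.HASH_BLOCK_SIZE)
        if not block:
            break
        for hasher in hashers.values():  # every algorithm sees each block once
            hasher.update(block)

digests = dict((alg, h.hexdigest()) for alg, h in hashers.items())
```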
| 1125 | |
| 1126 def _calc_hashes(args): | |
| 1127 # automatic tuple unpacking in function signatures was removed in Python 3 | |
| 1128 (base_path, rel_path, hashes, algorithms) = args | |
| 1129 full_path = os.path.join(base_path, rel_path) | |
| 1130 | |
| 1131 # Create a clone of the default empty hash objects: | |
| 1132 f_hashers = dict((alg, hashlib.new(alg)) for alg in hashes if alg in algorithms) | |
| 1133 | |
| 1134 try: | |
| 1135 f_hashes = _calculate_file_hashes(full_path, f_hashers) | |
| 1136 except BagValidationError as e: | |
| 1137 f_hashes = dict((alg, force_unicode(e)) for alg in f_hashers.keys()) | |
| 1138 | |
| 1139 return rel_path, f_hashes, hashes | |
| 1140 | |
| 1141 | |
| 1142 def _calculate_file_hashes(full_path, f_hashers): | |
| 1143 """ | |
| 1144 Returns a dictionary of (algorithm, hexdigest) values for the provided | |
| 1145 filename | |
| 1146 """ | |
| 1147 LOGGER.info(_("Verifying checksum for file %s"), full_path) | |
| 1148 | |
| 1149 try: | |
| 1150 with open(full_path, "rb") as f: | |
| 1151 while True: | |
| 1152 block = f.read(HASH_BLOCK_SIZE) | |
| 1153 if not block: | |
| 1154 break | |
| 1155 for i in f_hashers.values(): | |
| 1156 i.update(block) | |
| 1157 except (OSError, IOError) as e: | |
| 1158 raise BagValidationError( | |
| 1159 _("Could not read %(filename)s: %(error)s") | |
| 1160 % {"filename": full_path, "error": force_unicode(e)} | |
| 1161 ) | |
| 1162 | |
| 1163 return dict((alg, h.hexdigest()) for alg, h in f_hashers.items()) | |
| 1164 | |
| 1165 | |
| 1166 def _load_tag_file(tag_file_name, encoding="utf-8-sig"): | |
| 1167 with open_text_file(tag_file_name, "r", encoding=encoding) as tag_file: | |
| 1168 # Store duplicate tags as a list of values, in parsing order, | |
| 1169 # under the same key. | |
| 1170 tags = {} | |
| 1171 for name, value in _parse_tags(tag_file): | |
| 1172 if name not in tags: | |
| 1173 tags[name] = value | |
| 1174 continue | |
| 1175 | |
| 1176 if not isinstance(tags[name], list): | |
| 1177 tags[name] = [tags[name], value] | |
| 1178 else: | |
| 1179 tags[name].append(value) | |
| 1180 | |
| 1181 return tags | |
| 1182 | |
| 1183 | |
| 1184 def _parse_tags(tag_file): | |
| 1185 """Parses a tag file, according to RFC 2822. This | |
| 1186 includes line folding, permitting extra-long | |
| 1187 field values. | |
| 1188 | |
| 1189 See http://www.faqs.org/rfcs/rfc2822.html for | |
| 1190 more information. | |
| 1191 """ | |
| 1192 | |
| 1193 tag_name = None | |
| 1194 tag_value = None | |
| 1195 | |
| 1196 # Line folding is handled by yielding values only after we encounter | |
| 1197 # the start of a new tag, or if we pass the EOF. | |
| 1198 for num, line in enumerate(tag_file): | |
| 1199 # Skip over any empty or blank lines. | |
| 1200 if len(line) == 0 or line.isspace(): | |
| 1201 continue | |
| 1202 elif line[0].isspace() and tag_value is not None: # folded line | |
| 1203 tag_value += line | |
| 1204 else: | |
| 1205 # Starting a new tag; yield the last one. | |
| 1206 if tag_name: | |
| 1207 yield (tag_name, tag_value.strip()) | |
| 1208 | |
| 1209 if ":" not in line: | |
| 1210 raise BagValidationError( | |
| 1211 _("%(filename)s contains invalid tag: %(line)s") | |
| 1212 % { | |
| 1213 "line": line.strip(), | |
| 1214 "filename": os.path.basename(tag_file.name), | |
| 1215 } | |
| 1216 ) | |
| 1217 | |
| 1218 parts = line.strip().split(":", 1) | |
| 1219 tag_name = parts[0].strip() | |
| 1220 tag_value = parts[1] | |
| 1221 | |
| 1222 # Passed the EOF. All done after this. | |
| 1223 if tag_name: | |
| 1224 yield (tag_name, tag_value.strip()) | |
| 1225 | |
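A minimal sketch of the line-folding behaviour, feeding `_parse_tags` an in-memory file; this works for well-formed input because the parser only needs an iterable of lines (the `name` attribute is consulted only when raising errors):

```python
import io
import bagit

tag_text = (
    "Source-Organization: Library of Congress\n"
    "External-Description: a long value that has been\n"
    "  folded onto a second line per RFC 2822\n"
)

for name, value in bagit._parse_tags(io.StringIO(tag_text)):
    print(name, "=", value)
# External-Description comes back as a single folded value
```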
| 1226 | |
| 1227 def _make_tag_file(bag_info_path, bag_info): | |
| 1228 headers = sorted(bag_info.keys()) | |
| 1229 with open_text_file(bag_info_path, "w") as f: | |
| 1230 for h in headers: | |
| 1231 values = bag_info[h] | |
| 1232 if not isinstance(values, list): | |
| 1233 values = [values] | |
| 1234 for txt in values: | |
| 1235 # strip CR, LF and CRLF so they don't mess up the tag file | |
| 1236 txt = re.sub(r"\n|\r|(\r\n)", "", force_unicode(txt)) | |
| 1237 f.write("%s: %s\n" % (h, txt)) | |
| 1238 | |
| 1239 | |
| 1240 def make_manifests(data_dir, processes, algorithms=DEFAULT_CHECKSUMS, encoding="utf-8"): | |
| 1241 LOGGER.info( | |
| 1242 _("Using %(process_count)d processes to generate manifests: %(algorithms)s"), | |
| 1243 {"process_count": processes, "algorithms": ", ".join(algorithms)}, | |
| 1244 ) | |
| 1245 | |
| 1246 manifest_line_generator = partial(generate_manifest_lines, algorithms=algorithms) | |
| 1247 | |
| 1248 if processes > 1: | |
| 1249 pool = multiprocessing.Pool(processes=processes) | |
| 1250 checksums = pool.map(manifest_line_generator, _walk(data_dir)) | |
| 1251 pool.close() | |
| 1252 pool.join() | |
| 1253 else: | |
| 1254 checksums = [manifest_line_generator(i) for i in _walk(data_dir)] | |
| 1255 | |
| 1256 # At this point we have a list of tuples which start with the algorithm name: | |
| 1257 manifest_data = {} | |
| 1258 for batch in checksums: | |
| 1259 for entry in batch: | |
| 1260 manifest_data.setdefault(entry[0], []).append(entry[1:]) | |
| 1261 | |
| 1262 # These will be keyed on the algorithm name so we can perform sanity checks | |
| 1263 # below to catch failures in the hashing process: | |
| 1264 num_files = defaultdict(lambda: 0) | |
| 1265 total_bytes = defaultdict(lambda: 0) | |
| 1266 | |
| 1267 for algorithm, values in manifest_data.items(): | |
| 1268 manifest_filename = "manifest-%s.txt" % algorithm | |
| 1269 | |
| 1270 with open_text_file(manifest_filename, "w", encoding=encoding) as manifest: | |
| 1271 for digest, filename, byte_count in values: | |
| 1272 manifest.write("%s %s\n" % (digest, _encode_filename(filename))) | |
| 1273 num_files[algorithm] += 1 | |
| 1274 total_bytes[algorithm] += byte_count | |
| 1275 | |
| 1276 # We'll use sets of the values for the error checks and eventually return the payload oxum values: | |
| 1277 byte_value_set = set(total_bytes.values()) | |
| 1278 file_count_set = set(num_files.values()) | |
| 1279 | |
| 1280 # allow a bag with an empty payload | |
| 1281 if not byte_value_set and not file_count_set: | |
| 1282 return 0, 0 | |
| 1283 | |
| 1284 if len(file_count_set) != 1: | |
| 1285 raise RuntimeError(_("Expected the same number of files for each checksum")) | |
| 1286 | |
| 1287 if len(byte_value_set) != 1: | |
| 1288 raise RuntimeError(_("Expected the same number of bytes for each checksum")) | |
| 1289 | |
| 1290 return byte_value_set.pop(), file_count_set.pop() | |
| 1291 | |
| 1292 | |
| 1293 def _make_tagmanifest_file(alg, bag_dir, encoding="utf-8"): | |
| 1294 tagmanifest_file = join(bag_dir, "tagmanifest-%s.txt" % alg) | |
| 1295 LOGGER.info(_("Creating %s"), tagmanifest_file) | |
| 1296 | |
| 1297 checksums = [] | |
| 1298 for f in _find_tag_files(bag_dir): | |
| 1299 if re.match(r"^tagmanifest-.+\.txt$", f): | |
| 1300 continue | |
| 1301 with open(join(bag_dir, f), "rb") as fh: | |
| 1302 m = hashlib.new(alg) | |
| 1303 while True: | |
| 1304 block = fh.read(HASH_BLOCK_SIZE) | |
| 1305 if not block: | |
| 1306 break | |
| 1307 m.update(block) | |
| 1308 checksums.append((m.hexdigest(), f)) | |
| 1309 | |
| 1310 with open_text_file( | |
| 1311 tagmanifest_file, mode="w", encoding=encoding # already joined with bag_dir above | |
| 1312 ) as tagmanifest: | |
| 1313 for digest, filename in checksums: | |
| 1314 tagmanifest.write("%s %s\n" % (digest, filename)) | |
| 1315 | |
| 1316 | |
| 1317 def _find_tag_files(bag_dir): | |
| 1318 for dir in os.listdir(bag_dir): | |
| 1319 if dir != "data": | |
| 1320 if os.path.isfile(dir) and not dir.startswith("tagmanifest-"): | |
| 1321 yield dir | |
| 1322 for dir_name, _, filenames in os.walk(dir): | |
| 1323 for filename in filenames: | |
| 1324 if filename.startswith("tagmanifest-"): | |
| 1325 continue | |
| 1326 # remove everything up to the bag_dir directory | |
| 1327 p = join(dir_name, filename) | |
| 1328 yield os.path.relpath(p, bag_dir) | |
| 1329 | |
| 1330 | |
| 1331 def _walk(data_dir): | |
| 1332 for dirpath, dirnames, filenames in os.walk(data_dir): | |
| 1333 # if we don't sort here the order of entries is non-deterministic | |
| 1334 # which makes it hard to test the fixity of tagmanifest-md5.txt | |
| 1335 filenames.sort() | |
| 1336 dirnames.sort() | |
| 1337 for fn in filenames: | |
| 1338 path = os.path.join(dirpath, fn) | |
| 1339 # BagIt spec requires manifest to always use '/' as path separator | |
| 1340 if os.path.sep != "/": | |
| 1341 parts = path.split(os.path.sep) | |
| 1342 path = "/".join(parts) | |
| 1343 yield path | |
| 1344 | |
| 1345 | |
| 1346 def _can_bag(test_dir): | |
| 1347 """Scan the provided directory for files which cannot be bagged due to insufficient permissions""" | |
| 1348 unbaggable = [] | |
| 1349 | |
| 1350 if not os.access(test_dir, os.R_OK): | |
| 1351 # We cannot continue without permission to read the source directory | |
| 1352 unbaggable.append(test_dir) | |
| 1353 return unbaggable | |
| 1354 | |
| 1355 if not os.access(test_dir, os.W_OK): | |
| 1356 unbaggable.append(test_dir) | |
| 1357 | |
| 1358 for dirpath, dirnames, filenames in os.walk(test_dir): | |
| 1359 for directory in dirnames: | |
| 1360 full_path = os.path.join(dirpath, directory) | |
| 1361 if not os.access(full_path, os.W_OK): | |
| 1362 unbaggable.append(full_path) | |
| 1363 | |
| 1364 return unbaggable | |
| 1365 | |
| 1366 | |
| 1367 def _can_read(test_dir): | |
| 1368 """ | |
| 1369 returns ((unreadable_dirs), (unreadable_files)) | |
| 1370 """ | |
| 1371 unreadable_dirs = [] | |
| 1372 unreadable_files = [] | |
| 1373 | |
| 1374 if not os.access(test_dir, os.R_OK): | |
| 1375 unreadable_dirs.append(test_dir) | |
| 1376 else: | |
| 1377 for dirpath, dirnames, filenames in os.walk(test_dir): | |
| 1378 for dn in dirnames: | |
| 1379 full_path = os.path.join(dirpath, dn) | |
| 1380 if not os.access(full_path, os.R_OK): | |
| 1381 unreadable_dirs.append(full_path) | |
| 1382 for fn in filenames: | |
| 1383 full_path = os.path.join(dirpath, fn) | |
| 1384 if not os.access(full_path, os.R_OK): | |
| 1385 unreadable_files.append(full_path) | |
| 1386 return (tuple(unreadable_dirs), tuple(unreadable_files)) | |
| 1387 | |
| 1388 | |
| 1389 def generate_manifest_lines(filename, algorithms=DEFAULT_CHECKSUMS): | |
| 1390 LOGGER.info(_("Generating manifest lines for file %s"), filename) | |
| 1391 | |
| 1392 # For performance we'll read the file only once and pass it block | |
| 1393 # by block to every requested hash algorithm: | |
| 1394 hashers = get_hashers(algorithms) | |
| 1395 | |
| 1396 total_bytes = 0 | |
| 1397 | |
| 1398 with open(filename, "rb") as f: | |
| 1399 while True: | |
| 1400 block = f.read(HASH_BLOCK_SIZE) | |
| 1401 | |
| 1402 if not block: | |
| 1403 break | |
| 1404 | |
| 1405 total_bytes += len(block) | |
| 1406 for hasher in hashers.values(): | |
| 1407 hasher.update(block) | |
| 1408 | |
| 1409 decoded_filename = _decode_filename(filename) | |
| 1410 | |
| 1411 # We'll generate a list of results in roughly manifest format but prefixed with the algorithm: | |
| 1412 results = [ | |
| 1413 (alg, hasher.hexdigest(), decoded_filename, total_bytes) | |
| 1414 for alg, hasher in hashers.items() | |
| 1415 ] | |
| 1416 | |
| 1417 return results | |
| 1418 | |
| 1419 | |
| 1420 def _encode_filename(s): | |
| 1421 s = s.replace("\r", "%0D") | |
| 1422 s = s.replace("\n", "%0A") | |
| 1423 return s | |
| 1424 | |
| 1425 | |
| 1426 def _decode_filename(s): | |
| 1427 s = re.sub(r"%0D", "\r", s, flags=re.IGNORECASE) | |
| 1428 s = re.sub(r"%0A", "\n", s, flags=re.IGNORECASE) | |
| 1429 return s | |
| 1430 | |
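These helpers exist because manifest entries are newline-delimited, so CR/LF characters in filenames must be percent-encoded. A round-trip sketch, assuming this module is importable as `bagit`:

```python
import bagit

name = "data/odd\nname.txt"
encoded = bagit._encode_filename(name)   # 'data/odd%0Aname.txt'
assert bagit._decode_filename(encoded) == name
```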
| 1431 | |
| 1432 def force_unicode_py2(s): | |
| 1433 """Reliably return a Unicode string given a possible unicode or byte string""" | |
| 1434 if isinstance(s, str): | |
| 1435 return s.decode("utf-8") | |
| 1436 else: | |
| 1437 return unicode(s) | |
| 1438 | |
| 1439 | |
| 1440 if sys.version_info > (3, 0): | |
| 1441 force_unicode = str | |
| 1442 else: | |
| 1443 force_unicode = force_unicode_py2 | |
| 1444 | |
| 1445 # The following code implements the command-line program | |
| 1446 | |
| 1447 | |
| 1448 class BagArgumentParser(argparse.ArgumentParser): | |
| 1449 def __init__(self, *args, **kwargs): | |
| 1450 argparse.ArgumentParser.__init__(self, *args, **kwargs) | |
| 1451 self.set_defaults(bag_info={}) | |
| 1452 | |
| 1453 | |
| 1454 class BagHeaderAction(argparse.Action): | |
| 1455 def __call__(self, parser, namespace, values, option_string=None): | |
| 1456 opt = option_string.lstrip("-") | |
| 1457 opt_caps = "-".join([o.capitalize() for o in opt.split("-")]) | |
| 1458 namespace.bag_info[opt_caps] = values | |
| 1459 | |
| 1460 | |
| 1461 def _make_parser(): | |
| 1462 parser = BagArgumentParser( | |
| 1463 formatter_class=argparse.RawDescriptionHelpFormatter, | |
| 1464 description="bagit-python version %s\n\n%s\n" % (VERSION, __doc__.strip()), | |
| 1465 ) | |
| 1466 parser.add_argument( | |
| 1467 "--processes", | |
| 1468 type=int, | |
| 1469 dest="processes", | |
| 1470 default=1, | |
| 1471 help=_( | |
| 1472 "Use multiple processes to calculate checksums faster (default: %(default)s)" | |
| 1473 ), | |
| 1474 ) | |
| 1475 parser.add_argument("--log", help=_("The name of the log file (default: stdout)")) | |
| 1476 parser.add_argument( | |
| 1477 "--quiet", | |
| 1478 action="store_true", | |
| 1479 help=_("Suppress all progress information other than errors"), | |
| 1480 ) | |
| 1481 parser.add_argument( | |
| 1482 "--validate", | |
| 1483 action="store_true", | |
| 1484 help=_( | |
| 1485 "Validate existing bags in the provided directories instead of" | |
| 1486 " creating new ones" | |
| 1487 ), | |
| 1488 ) | |
| 1489 parser.add_argument( | |
| 1490 "--fast", | |
| 1491 action="store_true", | |
| 1492 help=_( | |
| 1493 "Modify --validate behaviour to only test whether the bag directory" | |
| 1494 " has the number of files and total size specified in Payload-Oxum" | |
| 1495 " without performing checksum validation to detect corruption." | |
| 1496 ), | |
| 1497 ) | |
| 1498 parser.add_argument( | |
| 1499 "--completeness-only", | |
| 1500 action="store_true", | |
| 1501 help=_( | |
| 1502 "Modify --validate behaviour to test whether the bag directory" | |
| 1503 " has the expected payload specified in the checksum manifests" | |
| 1504 " without performing checksum validation to detect corruption." | |
| 1505 ), | |
| 1506 ) | |
| 1507 | |
| 1508 checksum_args = parser.add_argument_group( | |
| 1509 _("Checksum Algorithms"), | |
| 1510 _( | |
| 1511 "Select the manifest algorithms to be used when creating bags" | |
| 1512 " (default=%s)" | |
| 1513 ) | |
| 1514 % ", ".join(DEFAULT_CHECKSUMS), | |
| 1515 ) | |
| 1516 | |
| 1517 for i in CHECKSUM_ALGOS: | |
| 1518 alg_name = re.sub(r"^([A-Z]+)(\d+)$", r"\1-\2", i.upper()) | |
| 1519 checksum_args.add_argument( | |
| 1520 "--%s" % i, | |
| 1521 action="append_const", | |
| 1522 dest="checksums", | |
| 1523 const=i, | |
| 1524 help=_("Generate %s manifest when creating a bag") % alg_name, | |
| 1525 ) | |
| 1526 | |
| 1527 metadata_args = parser.add_argument_group(_("Optional Bag Metadata")) | |
| 1528 for header in STANDARD_BAG_INFO_HEADERS: | |
| 1529 metadata_args.add_argument( | |
| 1530 "--%s" % header.lower(), type=str, action=BagHeaderAction, default=argparse.SUPPRESS | |
| 1531 ) | |
| 1532 | |
| 1533 parser.add_argument( | |
| 1534 "directory", | |
| 1535 nargs="+", | |
| 1536 help=_( | |
| 1537 "Directory which will be converted into a bag in place" | |
| 1538 " by moving any existing files into the BagIt structure" | |
| 1539 " and creating the manifests and other metadata." | |
| 1540 ), | |
| 1541 ) | |
| 1542 | |
| 1543 return parser | |
| 1544 | |
| 1545 | |
| 1546 def _configure_logging(opts): | |
| 1547 log_format = "%(asctime)s - %(levelname)s - %(message)s" | |
| 1548 if opts.quiet: | |
| 1549 level = logging.ERROR | |
| 1550 else: | |
| 1551 level = logging.INFO | |
| 1552 if opts.log: | |
| 1553 logging.basicConfig(filename=opts.log, level=level, format=log_format) | |
| 1554 else: | |
| 1555 logging.basicConfig(level=level, format=log_format) | |
| 1556 | |
| 1557 | |
| 1558 def main(): | |
| 1559 if "--version" in sys.argv: | |
| 1560 print(_("bagit-python version %s") % VERSION) | |
| 1561 sys.exit(0) | |
| 1562 | |
| 1563 parser = _make_parser() | |
| 1564 args = parser.parse_args() | |
| 1565 | |
| 1566 if args.processes < 0: | |
| 1567 parser.error(_("The number of processes must be 0 or greater")) | |
| 1568 | |
| 1569 if args.fast and not args.validate: | |
| 1570 parser.error(_("--fast is only allowed as an option for --validate!")) | |
| 1571 | |
| 1572 _configure_logging(args) | |
| 1573 | |
| 1574 rc = 0 | |
| 1575 for bag_dir in args.directory: | |
| 1576 # validate the bag | |
| 1577 if args.validate: | |
| 1578 try: | |
| 1579 bag = Bag(bag_dir) | |
| 1580 # validate throws a BagError or BagValidationError | |
| 1581 bag.validate( | |
| 1582 processes=args.processes, | |
| 1583 fast=args.fast, | |
| 1584 completeness_only=args.completeness_only, | |
| 1585 ) | |
| 1586 if args.fast: | |
| 1587 LOGGER.info(_("%s valid according to Payload-Oxum"), bag_dir) | |
| 1588 else: | |
| 1589 LOGGER.info(_("%s is valid"), bag_dir) | |
| 1590 except BagError as e: | |
| 1591 LOGGER.error( | |
| 1592 _("%(bag)s is invalid: %(error)s"), {"bag": bag_dir, "error": e} | |
| 1593 ) | |
| 1594 rc = 1 | |
| 1595 | |
| 1596 # make the bag | |
| 1597 else: | |
| 1598 try: | |
| 1599 make_bag( | |
| 1600 bag_dir, | |
| 1601 bag_info=args.bag_info, | |
| 1602 processes=args.processes, | |
| 1603 checksums=args.checksums, | |
| 1604 ) | |
| 1605 except Exception as exc: | |
| 1606 LOGGER.error( | |
| 1607 _("Failed to create bag in %(bag_directory)s: %(error)s"), | |
| 1608 {"bag_directory": bag_dir, "error": exc}, | |
| 1609 exc_info=True, | |
| 1610 ) | |
| 1611 rc = 1 | |
| 1612 | |
| 1613 sys.exit(rc) | |
| 1614 | |
| 1615 | |
| 1616 if __name__ == "__main__": | |
| 1617 main() |
