env/bin/bagit.py @ 0:26e78fe6e8c4 (draft)

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author: shellac
date: Sat, 02 May 2020 07:14:21 -0400
1 #!/Users/pldms/Development/Projects/2020/david-matthews-galaxy/guppy_basecaller/env/bin/python3
2 # encoding: utf-8
3
4 from __future__ import absolute_import, division, print_function, unicode_literals
5
6 import argparse
7 import codecs
8 import gettext
9 import hashlib
10 import logging
11 import multiprocessing
12 import os
13 import re
14 import signal
15 import sys
16 import tempfile
17 import unicodedata
18 import warnings
19 from collections import defaultdict
20 from datetime import date
21 from functools import partial
22 from os.path import abspath, isdir, isfile, join
23
24 from pkg_resources import DistributionNotFound, get_distribution
25
26 try:
27 from urllib.parse import urlparse
28 except ImportError:
29 from urlparse import urlparse
30
31
32 def find_locale_dir():
33 for prefix in (os.path.dirname(__file__), sys.prefix):
34 locale_dir = os.path.join(prefix, "locale")
35 if os.path.isdir(locale_dir):
36 return locale_dir
37
38
39 TRANSLATION_CATALOG = gettext.translation(
40 "bagit-python", localedir=find_locale_dir(), fallback=True
41 )
42 if sys.version_info < (3,):
43 _ = TRANSLATION_CATALOG.ugettext
44 else:
45 _ = TRANSLATION_CATALOG.gettext
46
47 MODULE_NAME = "bagit" if __name__ == "__main__" else __name__
48
49 LOGGER = logging.getLogger(MODULE_NAME)
50
51 try:
52 VERSION = get_distribution(MODULE_NAME).version
53 except DistributionNotFound:
54 VERSION = "0.0.dev0"
55
56 PROJECT_URL = "https://github.com/LibraryOfCongress/bagit-python"
57
58 __doc__ = (
59 _(
60 """
61 BagIt is a directory and filename convention for bundling an arbitrary set of
62 files with a manifest, checksums, and additional metadata. More about BagIt
63 can be found at:
64
65 http://purl.org/net/bagit
66
67 bagit.py is a pure Python drop-in library and command-line tool for creating
68 and working with BagIt directories.
69
70
71 Command-Line Usage:
72
73 Basic usage is to give bagit.py a directory to bag up:
74
75 $ bagit.py my_directory
76
77 This does a bag-in-place operation where the current contents will be moved
78 into the appropriate BagIt structure and the metadata files will be created.
79
80 You can bag multiple directories if you wish:
81
82 $ bagit.py directory1 directory2
83
84 Optionally you can provide metadata which will be stored in bag-info.txt:
85
86 $ bagit.py --source-organization "Library of Congress" directory
87
88 You can also select which manifest algorithms will be used:
89
90 $ bagit.py --sha1 --md5 --sha256 --sha512 directory
91
92
93 Using BagIt from your Python code:
94
95 import bagit
96 bag = bagit.make_bag('example-directory', {'Contact-Name': 'Ed Summers'})
97 print(bag.entries)
98
99 For more information or to contribute to bagit-python's development, please
100 visit %(PROJECT_URL)s
101 """
102 )
103 % globals()
104 )
105
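# A short sketch of round-tripping a bag programmatically, complementing the
# command-line examples above (the directory and identifier are hypothetical):
#
#     import bagit
#
#     bag = bagit.make_bag('example-directory', {'Contact-Name': 'Ed Summers'})
#     bag.info['Internal-Sender-Identifier'] = 'example-id'
#     bag.save()              # persist the metadata change to bag-info.txt
#     print(bag.is_valid())   # recomputes fixities and compares to manifests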
106 # standard bag-info.txt metadata
107 STANDARD_BAG_INFO_HEADERS = [
108 "Source-Organization",
109 "Organization-Address",
110 "Contact-Name",
111 "Contact-Phone",
112 "Contact-Email",
113 "External-Description",
114 "External-Identifier",
115 "Bag-Size",
116 "Bag-Group-Identifier",
117 "Bag-Count",
118 "Internal-Sender-Identifier",
119 "Internal-Sender-Description",
120 "BagIt-Profile-Identifier",
121 # Bagging-Date is autogenerated
122 # Payload-Oxum is autogenerated
123 ]
124
125 CHECKSUM_ALGOS = hashlib.algorithms_guaranteed
126 DEFAULT_CHECKSUMS = ["sha256", "sha512"]
127
128 #: Block size used when reading files for hashing:
129 HASH_BLOCK_SIZE = 512 * 1024
130
131 #: Convenience function used everywhere we want to open a file to read text
132 #: rather than undecoded bytes:
133 open_text_file = partial(codecs.open, encoding="utf-8", errors="strict")
134
135 # This is the same as decoding the byte values in codecs.BOM:
136 UNICODE_BYTE_ORDER_MARK = "\uFEFF"
137
138
139 def make_bag(
140 bag_dir, bag_info=None, processes=1, checksums=None, checksum=None, encoding="utf-8"
141 ):
142 """
143 Convert a given directory into a bag. You can pass in arbitrary
144 key/value pairs to put into the bag-info.txt metadata file as
145 the bag_info dictionary.
146 """
147
148 if checksum is not None:
149 warnings.warn(
150 _(
151 "The `checksum` argument for `make_bag` should be replaced with `checksums`"
152 ),
153 DeprecationWarning,
154 )
155 checksums = checksum
156
157 if checksums is None:
158 checksums = DEFAULT_CHECKSUMS
159
160 bag_dir = os.path.abspath(bag_dir)
161 cwd = os.path.abspath(os.path.curdir)
162
163 if cwd.startswith(bag_dir) and cwd != bag_dir:
164 raise RuntimeError(
165 _("Bagging a parent of the current directory is not supported")
166 )
167
168 LOGGER.info(_("Creating bag for directory %s"), bag_dir)
169
170 if not os.path.isdir(bag_dir):
171 LOGGER.error(_("Bag directory %s does not exist"), bag_dir)
172 raise RuntimeError(_("Bag directory %s does not exist") % bag_dir)
173
174 # FIXME: we should do the permissions checks before changing directories
175 old_dir = os.path.abspath(os.path.curdir)
176
177 try:
178 # TODO: These two checks are currently redundant since an unreadable directory will also
179 # often be unwritable, and this code will require review when we add the option to
180 # bag to a destination other than the source. It would be nice if we could avoid
181 # walking the directory tree more than once even if most filesystems will cache it
182
183 unbaggable = _can_bag(bag_dir)
184
185 if unbaggable:
186 LOGGER.error(
187 _("Unable to write to the following directories and files:\n%s"),
188 unbaggable,
189 )
190 raise BagError(_("Missing permissions to move all files and directories"))
191
192 unreadable_dirs, unreadable_files = _can_read(bag_dir)
193
194 if unreadable_dirs or unreadable_files:
195 if unreadable_dirs:
196 LOGGER.error(
197 _("The following directories do not have read permissions:\n%s"),
198 unreadable_dirs,
199 )
200 if unreadable_files:
201 LOGGER.error(
202 _("The following files do not have read permissions:\n%s"),
203 unreadable_files,
204 )
205 raise BagError(
206 _("Read permissions are required to calculate file fixities")
207 )
208 else:
209 LOGGER.info(_("Creating data directory"))
210
211 # FIXME: if we calculate full paths we won't need to deal with changing directories
212 os.chdir(bag_dir)
213 cwd = os.getcwd()
214 temp_data = tempfile.mkdtemp(dir=cwd)
215
216 for f in os.listdir("."):
217 if os.path.abspath(f) == temp_data:
218 continue
219 new_f = os.path.join(temp_data, f)
220 LOGGER.info(
221 _("Moving %(source)s to %(destination)s"),
222 {"source": f, "destination": new_f},
223 )
224 os.rename(f, new_f)
225
226 LOGGER.info(
227 _("Moving %(source)s to %(destination)s"),
228 {"source": temp_data, "destination": "data"},
229 )
230 os.rename(temp_data, "data")
231
232 # permissions for the payload directory should match those of the
233 # original directory
234 os.chmod("data", os.stat(cwd).st_mode)
235
236 total_bytes, total_files = make_manifests(
237 "data", processes, algorithms=checksums, encoding=encoding
238 )
239
240 LOGGER.info(_("Creating bagit.txt"))
241 txt = """BagIt-Version: 0.97\nTag-File-Character-Encoding: UTF-8\n"""
242 with open_text_file("bagit.txt", "w") as bagit_file:
243 bagit_file.write(txt)
244
245 LOGGER.info(_("Creating bag-info.txt"))
246 if bag_info is None:
247 bag_info = {}
248
249 # allow 'Bagging-Date' and 'Bag-Software-Agent' to be overridden
250 if "Bagging-Date" not in bag_info:
251 bag_info["Bagging-Date"] = date.strftime(date.today(), "%Y-%m-%d")
252 if "Bag-Software-Agent" not in bag_info:
253 bag_info["Bag-Software-Agent"] = "bagit.py v%s <%s>" % (
254 VERSION,
255 PROJECT_URL,
256 )
257
258 bag_info["Payload-Oxum"] = "%s.%s" % (total_bytes, total_files)
259 _make_tag_file("bag-info.txt", bag_info)
260
261 for c in checksums:
262 _make_tagmanifest_file(c, bag_dir, encoding="utf-8")
263 except Exception:
264 LOGGER.exception(_("An error occurred creating a bag in %s"), bag_dir)
265 raise
266 finally:
267 os.chdir(old_dir)
268
269 return Bag(bag_dir)
270
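# Example: make_bag() exposes the same knobs as the command line; a minimal
# sketch, assuming "my_directory" exists and is writable:
#
#     bag = make_bag(
#         "my_directory",
#         bag_info={"Source-Organization": "Library of Congress"},
#         processes=4,                     # hash payload files in parallel
#         checksums=["sha256", "sha512"],  # one manifest file per algorithm
#     )
#     print(bag.entries)                   # path -> {algorithm: digest}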
271
272 class Bag(object):
273 """A representation of a bag."""
274
275 valid_files = ["bagit.txt", "fetch.txt"]
276 valid_directories = ["data"]
277
278 def __init__(self, path=None):
279 super(Bag, self).__init__()
280 self.tags = {}
281 self.info = {}
282 #: Dictionary of manifest entries and the checksum values for each
283 #: algorithm:
284 self.entries = {}
285
286 # To reliably handle Unicode normalization differences, we maintain
287 # lookup dictionaries in both directions for the filenames read from
288 # the filesystem and the manifests so we can handle cases where the
289 # normalization form changed between the bag being created and read.
290 # See https://github.com/LibraryOfCongress/bagit-python/issues/51.
291
292 #: maps Unicode-normalized values to the raw value from the filesystem
293 self.normalized_filesystem_names = {}
294
295 #: maps Unicode-normalized values to the raw value in the manifest
296 self.normalized_manifest_names = {}
297
298 self.algorithms = []
299 self.tag_file_name = None
300 self.path = abspath(path) if path else None
301 if path:
302 # os.path.abspath already normalizes away any trailing path
303 # separator, so no extra stripping is needed before the bag
304 # is opened
305 self._open()
306
307 def __str__(self):
308 # FIXME: develop a more informative string representation for a Bag
309 return self.path
310
311 @property
312 def algs(self):
313 warnings.warn(_("Use Bag.algorithms instead of Bag.algs"), DeprecationWarning)
314 return self.algorithms
315
316 @property
317 def version(self):
318 warnings.warn(
319 _("Use the Bag.version_info tuple instead of Bag.version"),
320 DeprecationWarning,
321 )
322 return self._version
323
324 def _open(self):
325 # Open the bagit.txt file, and load any tags from it, including
326 # the required version and encoding.
327 bagit_file_path = os.path.join(self.path, "bagit.txt")
328
329 if not isfile(bagit_file_path):
330 raise BagError(_("Expected bagit.txt does not exist: %s") % bagit_file_path)
331
332 self.tags = tags = _load_tag_file(bagit_file_path)
333
334 required_tags = ("BagIt-Version", "Tag-File-Character-Encoding")
335 missing_tags = [i for i in required_tags if i not in tags]
336 if missing_tags:
337 raise BagError(
338 _("Missing required tag in bagit.txt: %s") % ", ".join(missing_tags)
339 )
340
341 # To avoid breaking existing code we'll leave self.version as the string
342 # and parse it into a numeric version_info tuple. In version 2.0 we can
343 # break that.
344
345 self._version = tags["BagIt-Version"]
346
347 try:
348 self.version_info = tuple(int(i) for i in self._version.split(".", 1))
349 except ValueError:
350 raise BagError(
351 _("Bag version numbers must be MAJOR.MINOR numbers, not %s")
352 % self._version
353 )
354
355 if (0, 93) <= self.version_info <= (0, 95):
356 self.tag_file_name = "package-info.txt"
357 elif (0, 96) <= self.version_info < (2,):
358 self.tag_file_name = "bag-info.txt"
359 else:
360 raise BagError(_("Unsupported bag version: %s") % self._version)
361
362 self.encoding = tags["Tag-File-Character-Encoding"]
363
364 try:
365 codecs.lookup(self.encoding)
366 except LookupError:
367 raise BagValidationError(_("Unsupported encoding: %s") % self.encoding)
368
369 info_file_path = os.path.join(self.path, self.tag_file_name)
370 if os.path.exists(info_file_path):
371 self.info = _load_tag_file(info_file_path, encoding=self.encoding)
372
373 self._load_manifests()
374
375 def manifest_files(self):
376 for filename in ["manifest-%s.txt" % a for a in CHECKSUM_ALGOS]:
377 f = os.path.join(self.path, filename)
378 if isfile(f):
379 yield f
380
381 def tagmanifest_files(self):
382 for filename in ["tagmanifest-%s.txt" % a for a in CHECKSUM_ALGOS]:
383 f = os.path.join(self.path, filename)
384 if isfile(f):
385 yield f
386
387 def compare_manifests_with_fs(self):
388 """
389 Compare the filenames in the manifests to the filenames present on the
390 local filesystem and return two lists of the files which are only
391 present in the manifests and the files which are only present on the
392 local filesystem, respectively.
393 """
394
395 # We compare the filenames after Unicode normalization so we can
396 # reliably detect normalization changes after bag creation:
397 files_on_fs = set(normalize_unicode(i) for i in self.payload_files())
398 files_in_manifest = set(
399 normalize_unicode(i) for i in self.payload_entries().keys()
400 )
401
402 if self.version_info >= (0, 97):
403 files_in_manifest.update(self.missing_optional_tagfiles())
404
405 only_on_fs = list()
406 only_in_manifest = list()
407
408 for i in files_on_fs.difference(files_in_manifest):
409 only_on_fs.append(self.normalized_filesystem_names[i])
410
411 for i in files_in_manifest.difference(files_on_fs):
412 only_in_manifest.append(self.normalized_manifest_names[i])
413
414 return only_in_manifest, only_on_fs
415
416 def compare_fetch_with_fs(self):
417 """Compares the fetch entries with the files actually
418 in the payload, and returns a list of all the files
419 that still need to be fetched.
420 """
421
422 files_on_fs = set(self.payload_files())
423 files_in_fetch = set(self.files_to_be_fetched())
424
425 return list(files_in_fetch - files_on_fs)
426
427 def payload_files(self):
428 """Returns a list of filenames which are present on the local filesystem"""
429 payload_dir = os.path.join(self.path, "data")
430
431 for dirpath, _, filenames in os.walk(payload_dir):
432 for f in filenames:
433 # Jump through some hoops here to make sure the payload files are
434 # returned with the directory structure relative to the base
435 # directory rather than the payload directory
436 normalized_f = os.path.normpath(f)
437 rel_path = os.path.relpath(
438 os.path.join(dirpath, normalized_f), start=self.path
439 )
440
441 self.normalized_filesystem_names[normalize_unicode(rel_path)] = rel_path
442 yield rel_path
443
444 def payload_entries(self):
445 """Return a dictionary of items """
446 # Don't use dict comprehension (compatibility with Python < 2.7)
447 return dict(
448 (key, value)
449 for (key, value) in self.entries.items()
450 if key.startswith("data" + os.sep)
451 )
452
453 def save(self, processes=1, manifests=False):
454 """
455 save will persist any changes that have been made to the bag
456 metadata (self.info).
457
458 If you have modified the payload of the bag (added, modified,
459 removed files in the data directory) and want to regenerate manifests
460 set the manifests parameter to True. The default is False since you
461 wouldn't want a save to accidentally create a new manifest for
462 a corrupted bag.
463
464 If you want to control the number of processes that are used when
465 recalculating checksums use the processes parameter.
466 """
467 # Error checking
468 if not self.path:
469 raise BagError(_("Bag.save() called before setting the path!"))
470
471 if not os.access(self.path, os.R_OK | os.W_OK | os.X_OK):
472 raise BagError(
473 _("Cannot save bag to non-existent or inaccessible directory %s")
474 % self.path
475 )
476
477 unbaggable = _can_bag(self.path)
478 if unbaggable:
479 LOGGER.error(
480 _(
481 "Missing write permissions for the following directories and files:\n%s"
482 ),
483 unbaggable,
484 )
485 raise BagError(_("Missing permissions to move all files and directories"))
486
487 unreadable_dirs, unreadable_files = _can_read(self.path)
488 if unreadable_dirs or unreadable_files:
489 if unreadable_dirs:
490 LOGGER.error(
491 _("The following directories do not have read permissions:\n%s"),
492 unreadable_dirs,
493 )
494 if unreadable_files:
495 LOGGER.error(
496 _("The following files do not have read permissions:\n%s"),
497 unreadable_files,
498 )
499 raise BagError(
500 _("Read permissions are required to calculate file fixities")
501 )
502
503 # Change working directory to bag directory so helper functions work
504 old_dir = os.path.abspath(os.path.curdir)
505 os.chdir(self.path)
506
507 # Generate new manifest files
508 if manifests:
509 total_bytes, total_files = make_manifests(
510 "data", processes, algorithms=self.algorithms, encoding=self.encoding
511 )
512
513 # Update Payload-Oxum
514 LOGGER.info(_("Updating Payload-Oxum in %s"), self.tag_file_name)
515 self.info["Payload-Oxum"] = "%s.%s" % (total_bytes, total_files)
516
517 _make_tag_file(self.tag_file_name, self.info)
518
519 # Update tag-manifest for changes to manifest & bag-info files
520 for alg in self.algorithms:
521 _make_tagmanifest_file(alg, self.path, encoding=self.encoding)
522
523 # Reload the manifests
524 self._load_manifests()
525
526 os.chdir(old_dir)
527
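# A sketch of the two save() modes described in the docstring above (the bag
# path is hypothetical):
#
#     bag = Bag("my-bag")
#     bag.info["Bag-Group-Identifier"] = "group-1"
#     bag.save()  # metadata only: rewrites bag-info.txt and the tagmanifests
#
#     # ...after adding or removing files under my-bag/data...
#     bag.save(manifests=True, processes=4)  # also regenerates payload manifests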
528 def tagfile_entries(self):
529 return dict(
530 (key, value)
531 for (key, value) in self.entries.items()
532 if not key.startswith("data" + os.sep)
533 )
534
535 def missing_optional_tagfiles(self):
536 """
537 From v0.97 we need to validate any tagfiles listed
538 in the optional tagmanifest(s). As there is no mandatory
539 directory structure for additional tagfiles we can
540 only check for entries with missing files (not missing
541 entries for existing files).
542 """
543 for tagfilepath in self.tagfile_entries().keys():
544 if not os.path.isfile(os.path.join(self.path, tagfilepath)):
545 yield tagfilepath
546
547 def fetch_entries(self):
548 """Load fetch.txt if present and iterate over its contents
549
550 yields (url, size, filename) tuples
551
552 raises BagError for errors such as an unsafe filename referencing
553 data outside of the bag directory
554 """
555
556 fetch_file_path = os.path.join(self.path, "fetch.txt")
557
558 if isfile(fetch_file_path):
559 with open_text_file(
560 fetch_file_path, "r", encoding=self.encoding
561 ) as fetch_file:
562 for line in fetch_file:
563 url, file_size, filename = line.strip().split(None, 2)
564
565 if self._path_is_dangerous(filename):
566 raise BagError(
567 _('Path "%(payload_file)s" in "%(source_file)s" is unsafe')
568 % {
569 "payload_file": filename,
570 "source_file": os.path.join(self.path, "fetch.txt"),
571 }
572 )
573
574 yield url, file_size, filename
575
576 def files_to_be_fetched(self):
577 """
578 Convenience wrapper for fetch_entries which returns only the
579 local filename
580 """
581
582 for url, file_size, filename in self.fetch_entries():
583 yield filename
584
585 def has_oxum(self):
586 return "Payload-Oxum" in self.info
587
588 def validate(self, processes=1, fast=False, completeness_only=False):
589 """Checks the structure and contents are valid.
590
591 If you supply the parameter fast=True the Payload-Oxum (if present) will
592 be used to check that the payload files are present and accounted for,
593 instead of re-calculating fixities and comparing them against the
594 manifest. By default validate() will re-calculate fixities (fast=False).
595 """
596
597 self._validate_structure()
598 self._validate_bagittxt()
599
600 self.validate_fetch()
601
602 self._validate_contents(
603 processes=processes, fast=fast, completeness_only=completeness_only
604 )
605
606 return True
607
608 def is_valid(self, fast=False, completeness_only=False):
609 """Returns validation success or failure as boolean.
610 Optional fast parameter passed directly to validate().
611 """
612
613 try:
614 self.validate(fast=fast, completeness_only=completeness_only)
615 except BagError:
616 return False
617
618 return True
619
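# validate() signals failure by raising BagValidationError, whose .details
# list carries one entry per problem (ChecksumMismatch, FileMissing,
# UnexpectedFile); a sketch of inspecting them (bag path hypothetical):
#
#     bag = Bag("my-bag")
#     try:
#         bag.validate(processes=4)
#     except BagValidationError as e:
#         for d in e.details:
#             if isinstance(d, ChecksumMismatch):
#                 print("corrupt file:", d.path, "algorithm:", d.algorithm)
#             else:
#                 print(force_unicode(d))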
620 def _load_manifests(self):
621 self.entries = {}
622 manifests = list(self.manifest_files())
623
624 if self.version_info >= (0, 97):
625 # v0.97+ requires that optional tagfiles are verified.
626 manifests += list(self.tagmanifest_files())
627
628 for manifest_filename in manifests:
629 if manifest_filename.find("tagmanifest-") != -1:
630 search = "tagmanifest-"
631 else:
632 search = "manifest-"
633 alg = (
634 os.path.basename(manifest_filename)
635 .replace(search, "")
636 .replace(".txt", "")
637 )
638 if alg not in self.algorithms:
639 self.algorithms.append(alg)
640
641 with open_text_file(
642 manifest_filename, "r", encoding=self.encoding
643 ) as manifest_file:
644 if manifest_file.encoding.startswith("UTF"):
645 # We'll check the first character to see if it's a BOM:
646 if manifest_file.read(1) == UNICODE_BYTE_ORDER_MARK:
647 # We'll skip it either way by letting line decoding
648 # happen at the new offset but we will issue a warning
649 # for UTF-8 since the presence of a BOM is contrary to
650 # the BagIt specification:
651 if manifest_file.encoding == "UTF-8":
652 LOGGER.warning(
653 _(
654 "%s is encoded using UTF-8 but contains an unnecessary"
655 " byte-order mark, which is not in compliance with the"
656 " BagIt RFC"
657 ),
658 manifest_file.name,
659 )
660 else:
661 manifest_file.seek(0) # Pretend the first read never happened
662
663 for line in manifest_file:
664 line = line.strip()
665
666 # Ignore blank lines and comments.
667 if line == "" or line.startswith("#"):
668 continue
669
670 entry = line.split(None, 1)
671
672 # Format is FILENAME *CHECKSUM
673 if len(entry) != 2:
674 LOGGER.error(
675 _(
676 "%(bag)s: Invalid %(algorithm)s manifest entry: %(line)s"
677 ),
678 {"bag": self, "algorithm": alg, "line": line},
679 )
680 continue
681
682 entry_hash = entry[0]
683 entry_path = os.path.normpath(entry[1].lstrip("*"))
684 entry_path = _decode_filename(entry_path)
685
686 if self._path_is_dangerous(entry_path):
687 raise BagError(
688 _(
689 'Path "%(payload_file)s" in manifest "%(manifest_file)s" is unsafe'
690 )
691 % {
692 "payload_file": entry_path,
693 "manifest_file": manifest_file.name,
694 }
695 )
696
697 entry_hashes = self.entries.setdefault(entry_path, {})
698
699 if alg in entry_hashes:
700 warning_ctx = {
701 "bag": self,
702 "algorithm": alg,
703 "filename": entry_path,
704 }
705 if entry_hashes[alg] == entry_hash:
706 msg = _(
707 "%(bag)s: %(algorithm)s manifest lists %(filename)s"
708 " multiple times with the same value"
709 )
710 if self.version_info >= (1,):
711 raise BagError(msg % warning_ctx)
712 else:
713 LOGGER.warning(msg, warning_ctx)
714 else:
715 raise BagError(
716 _(
717 "%(bag)s: %(algorithm)s manifest lists %(filename)s"
718 " multiple times with conflicting values"
719 )
720 % warning_ctx
721 )
722
723 entry_hashes[alg] = entry_hash
724
725 self.normalized_manifest_names.update(
726 (normalize_unicode(i), i) for i in self.entries.keys()
727 )
728
729 def _validate_structure(self):
730 """
731 Checks the structure of the bag to determine whether it conforms to the
732 BagIt spec. Returns true on success, otherwise it will raise a
733 BagValidationError exception.
734 """
735
736 self._validate_structure_payload_directory()
737 self._validate_structure_tag_files()
738
739 def _validate_structure_payload_directory(self):
740 data_dir_path = os.path.join(self.path, "data")
741
742 if not isdir(data_dir_path):
743 raise BagValidationError(
744 _("Expected data directory %s does not exist") % data_dir_path
745 )
746
747 def _validate_structure_tag_files(self):
748 # Note: we deviate somewhat from v0.96 of the spec in that it allows
749 # other files and directories to be present in the base directory
750
751 if not list(self.manifest_files()):
752 raise BagValidationError(_("No manifest files found"))
753 if "bagit.txt" not in os.listdir(self.path):
754 raise BagValidationError(
755 _('Expected %s to contain "bagit.txt"') % self.path
756 )
757
758 def validate_fetch(self):
759 """Validate the fetch.txt file
760
761 Raises `BagError` for errors and otherwise returns no value
762 """
763
764 for url, file_size, filename in self.fetch_entries():
765 # fetch_entries will raise a BagError for unsafe filenames
766 # so at this point we will check only that the URL is minimally
767 # well formed:
768 parsed_url = urlparse(url)
769
770 if not all((parsed_url.scheme, parsed_url.netloc)):
771 raise BagError(_("Malformed URL in fetch.txt: %s") % url)
772
773 def _validate_contents(self, processes=1, fast=False, completeness_only=False):
774 if fast and not self.has_oxum():
775 raise BagValidationError(
776 _("Fast validation requires bag-info.txt to include Payload-Oxum")
777 )
778
779 # Perform the fast file count + size check so we can fail early:
780 self._validate_oxum()
781
782 if fast:
783 return
784
785 self._validate_completeness()
786
787 if completeness_only:
788 return
789
790 self._validate_entries(processes)
791
792 def _validate_oxum(self):
793 oxum = self.info.get("Payload-Oxum")
794
795 if oxum is None:
796 return
797
798 # If there are multiple Payload-Oxum tags (a bad idea), use the
799 # first one listed in bag-info.txt
800 if isinstance(oxum, list):
801 LOGGER.warning(_("bag-info.txt defines multiple Payload-Oxum values!"))
802 oxum = oxum[0]
803
804 oxum_byte_count, oxum_file_count = oxum.split(".", 1)
805
806 if not oxum_byte_count.isdigit() or not oxum_file_count.isdigit():
807 raise BagError(_("Malformed Payload-Oxum value: %s") % oxum)
808
809 oxum_byte_count = int(oxum_byte_count)
810 oxum_file_count = int(oxum_file_count)
811 total_bytes = 0
812 total_files = 0
813
814 for payload_file in self.payload_files():
815 payload_file = os.path.join(self.path, payload_file)
816 total_bytes += os.stat(payload_file).st_size
817 total_files += 1
818
819 if oxum_file_count != total_files or oxum_byte_count != total_bytes:
820 raise BagValidationError(
821 _(
822 "Payload-Oxum validation failed."
823 " Expected %(oxum_file_count)d files and %(oxum_byte_count)d bytes"
824 " but found %(found_file_count)d files and %(found_byte_count)d bytes"
825 )
826 % {
827 "found_file_count": total_files,
828 "found_byte_count": total_bytes,
829 "oxum_file_count": oxum_file_count,
830 "oxum_byte_count": oxum_byte_count,
831 }
832 )
833
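# Payload-Oxum encodes "<total payload bytes>.<number of payload files>",
# e.g. three files totalling 2620895 bytes are recorded as "2620895.3"; a
# sketch of the same count performed by the check above:
#
#     bag = Bag("my-bag")  # hypothetical path
#     total_bytes = total_files = 0
#     for payload_file in bag.payload_files():
#         total_bytes += os.stat(os.path.join(bag.path, payload_file)).st_size
#         total_files += 1
#     assert "%d.%d" % (total_bytes, total_files) == bag.info["Payload-Oxum"]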
834 def _validate_completeness(self):
835 """
836 Verify that the actual file manifests match the files in the data directory
837 """
838 errors = list()
839
840 # First we'll make sure there's no mismatch between the filesystem
841 # and the list of files in the manifest(s)
842 only_in_manifests, only_on_fs = self.compare_manifests_with_fs()
843 for path in only_in_manifests:
844 e = FileMissing(path)
845 LOGGER.warning(force_unicode(e))
846 errors.append(e)
847 for path in only_on_fs:
848 e = UnexpectedFile(path)
849 LOGGER.warning(force_unicode(e))
850 errors.append(e)
851
852 if errors:
853 raise BagValidationError(_("Bag validation failed"), errors)
854
855 def _validate_entries(self, processes):
856 """
857 Verify that the actual file contents match the recorded hashes stored in the manifest files
858 """
859 errors = list()
860
861 if os.name == "posix":
862 worker_init = posix_multiprocessing_worker_initializer
863 else:
864 worker_init = None
865
866 args = (
867 (
868 self.path,
869 self.normalized_filesystem_names.get(rel_path, rel_path),
870 hashes,
871 self.algorithms,
872 )
873 for rel_path, hashes in self.entries.items()
874 )
875
876 try:
877 if processes == 1:
878 hash_results = [_calc_hashes(i) for i in args]
879 else:
880 try:
881 pool = multiprocessing.Pool(
882 processes if processes else None, initializer=worker_init
883 )
884 hash_results = pool.map(_calc_hashes, args)
885 finally:
886 pool.terminate()
887
888 # Any unhandled exceptions are probably fatal
889 except BaseException:
890 LOGGER.exception(_("Unable to calculate file hashes for %s"), self)
891 raise
892
893 for rel_path, f_hashes, hashes in hash_results:
894 for alg, computed_hash in f_hashes.items():
895 stored_hash = hashes[alg]
896 if stored_hash.lower() != computed_hash:
897 e = ChecksumMismatch(
898 rel_path, alg, stored_hash.lower(), computed_hash
899 )
900 LOGGER.warning(force_unicode(e))
901 errors.append(e)
902
903 if errors:
904 raise BagValidationError(_("Bag validation failed"), errors)
905
906 def _validate_bagittxt(self):
907 """
908 Verify that bagit.txt conforms to specification
909 """
910 bagit_file_path = os.path.join(self.path, "bagit.txt")
911
912 # Note that we are intentionally opening this file in binary mode so we can confirm
913 # that it does not start with the UTF-8 byte-order-mark
914 with open(bagit_file_path, "rb") as bagit_file:
915 first_line = bagit_file.read(4)
916 if first_line.startswith(codecs.BOM_UTF8):
917 raise BagValidationError(
918 _("bagit.txt must not contain a byte-order mark")
919 )
920
921 def _path_is_dangerous(self, path):
922 """
923 Return true if path looks dangerous, i.e. potentially operates
924 outside the bagging directory structure, e.g. ~/.bashrc, ../../../secrets.json,
925 \\?\c:\, D:\sys32\cmd.exe
926 """
927 if os.path.isabs(path):
928 return True
929 if os.path.expanduser(path) != path:
930 return True
931 if os.path.expandvars(path) != path:
932 return True
933 real_path = os.path.realpath(os.path.join(self.path, path))
934 real_path = os.path.normpath(real_path)
935 bag_path = os.path.realpath(self.path)
936 bag_path = os.path.normpath(bag_path)
937 common = os.path.commonprefix((bag_path, real_path))
938 return not (common == bag_path)
939
940
941 class BagError(Exception):
942 pass
943
944
945 class BagValidationError(BagError):
946 def __init__(self, message, details=None):
947 super(BagValidationError, self).__init__()
948
949 if details is None:
950 details = []
951
952 self.message = message
953 self.details = details
954
955 def __str__(self):
956 if len(self.details) > 0:
957 details = "; ".join([force_unicode(e) for e in self.details])
958 return "%s: %s" % (self.message, details)
959 return self.message
960
961
962 class ManifestErrorDetail(BagError):
963 def __init__(self, path):
964 super(ManifestErrorDetail, self).__init__()
965
966 self.path = path
967
968
969 class ChecksumMismatch(ManifestErrorDetail):
970 def __init__(self, path, algorithm=None, expected=None, found=None):
971 super(ChecksumMismatch, self).__init__(path)
972
973 self.path = path
974 self.algorithm = algorithm
975 self.expected = expected
976 self.found = found
977
978 def __str__(self):
979 return _(
980 '%(path)s %(algorithm)s validation failed: expected="%(expected)s" found="%(found)s"'
981 ) % {
982 "path": force_unicode(self.path),
983 "algorithm": self.algorithm,
984 "expected": self.expected,
985 "found": self.found,
986 }
987
988
989 class FileMissing(ManifestErrorDetail):
990 def __str__(self):
991 return _(
992 "%s exists in manifest but was not found on filesystem"
993 ) % force_unicode(self.path)
994
995
996 class UnexpectedFile(ManifestErrorDetail):
997 def __str__(self):
998 return _("%s exists on filesystem but is not in the manifest") % force_unicode(self.path)
999
1000
1001 class FileNormalizationConflict(BagError):
1002 """
1003 Exception raised when two files differ only in normalization and thus
1004 are not safely portable
1005 """
1006
1007 def __init__(self, file_a, file_b):
1008 super(FileNormalizationConflict, self).__init__()
1009
1010 self.file_a = file_a
1011 self.file_b = file_b
1012
1013 def __str__(self):
1014 return _(
1015 'Unicode normalization conflict for file "%(file_a)s" and "%(file_b)s"'
1016 ) % {"file_a": self.file_a, "file_b": self.file_b}
1017
1018
1019 def posix_multiprocessing_worker_initializer():
1020 """Ignore SIGINT in multiprocessing workers on POSIX systems"""
1021 signal.signal(signal.SIGINT, signal.SIG_IGN)
1022
1023
1024 # The Unicode normalization form used here doesn't matter – all we care about
1025 # is consistency since the input value will be preserved:
1026
1027
1028 def normalize_unicode_py3(s):
1029 return unicodedata.normalize("NFC", s)
1030
1031
1032 def normalize_unicode_py2(s):
1033 if isinstance(s, str):
1034 s = s.decode("utf-8")
1035 return unicodedata.normalize("NFC", s)
1036
1037
1038 if sys.version_info > (3, 0):
1039 normalize_unicode = normalize_unicode_py3
1040 else:
1041 normalize_unicode = normalize_unicode_py2
1042
1043
1044 def build_unicode_normalized_lookup_dict(filenames):
1045 """
1046 Return a dictionary mapping unicode-normalized filenames to as-encoded
1047 values to efficiently detect conflicts between the filesystem and manifests.
1048
1049 This is necessary because some filesystems and utilities may automatically
1050 apply a different Unicode normalization form to filenames than was applied
1051 when the bag was originally created.
1052
1053 The best known example of this is when a bag is created using a
1054 normalization form other than NFD and then transferred to a Mac where the
1055 HFS+ filesystem will transparently normalize filenames to a variant of NFD
1056 for every call:
1057
1058 https://developer.apple.com/legacy/library/technotes/tn/tn1150.html#UnicodeSubtleties
1059
1060 Windows is documented as storing filenames exactly as provided:
1061
1062 https://msdn.microsoft.com/en-us/library/windows/desktop/aa365247%28v=vs.85%29.aspx
1063
1064 Linux performs no normalization in the kernel but it is technically
1065 valid for a filesystem to perform normalization, such as when an HFS+
1066 volume is mounted.
1067
1068 See http://www.unicode.org/reports/tr15/ for a full discussion of
1069 equivalence and normalization in Unicode.
1070 """
1071
1072 output = dict()
1073
1074 for filename in filenames:
1075 normalized_filename = normalize_unicode(filename)
1076 if normalized_filename in output:
1077 raise FileNormalizationConflict(filename, output[normalized_filename])
1078 else:
1079 output[normalized_filename] = filename
1080
1081 return output
1082
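# Concretely: the precomposed form of "é" (U+00E9) and the decomposed form
# ("e" followed by combining acute, U+0301) are distinct byte sequences that
# share a single NFC normalization, so such a pair is reported as a conflict:
#
#     build_unicode_normalized_lookup_dict(["caf\u00e9.txt"])
#     # -> {"caf\u00e9.txt": "caf\u00e9.txt"}
#
#     build_unicode_normalized_lookup_dict(["caf\u00e9.txt", "cafe\u0301.txt"])
#     # -> raises FileNormalizationConflict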
1083
1084 def get_hashers(algorithms):
1085 """
1086 Given a list of algorithm names, return a dictionary of hasher instances
1087
1088 This avoids redundant code between the creation and validation code where in
1089 both cases we want to avoid reading the same file more than once. The
1090 intended use is a simple for loop:
1091
1092 for block in file:
1093 for hasher in hashers.values():
1094 hasher.update(block)
1095 """
1096
1097 hashers = {}
1098
1099 for alg in algorithms:
1100 try:
1101 hasher = hashlib.new(alg)
1102 except ValueError:
1103 LOGGER.warning(
1104 _("Disabling requested hash algorithm %s: hashlib does not support it"),
1105 alg,
1106 )
1107 continue
1108
1109 hashers[alg] = hasher
1110
1111 if not hashers:
1112 raise ValueError(
1113 _(
1114 "Unable to continue: hashlib does not support any of the requested algorithms!"
1115 )
1116 )
1117
1118 return hashers
1119
1120
1121 def _calc_hashes(args):
1122 # tuple parameter unpacking in function signatures was removed in Python 3
1123 (base_path, rel_path, hashes, algorithms) = args
1124 full_path = os.path.join(base_path, rel_path)
1125
1126 # Create a clone of the default empty hash objects:
1127 f_hashers = dict((alg, hashlib.new(alg)) for alg in hashes if alg in algorithms)
1128
1129 try:
1130 f_hashes = _calculate_file_hashes(full_path, f_hashers)
1131 except BagValidationError as e:
1132 f_hashes = dict((alg, force_unicode(e)) for alg in f_hashers.keys())
1133
1134 return rel_path, f_hashes, hashes
1135
1136
1137 def _calculate_file_hashes(full_path, f_hashers):
1138 """
1139 Returns a dictionary of (algorithm, hexdigest) values for the provided
1140 filename
1141 """
1142 LOGGER.info(_("Verifying checksum for file %s"), full_path)
1143
1144 try:
1145 with open(full_path, "rb") as f:
1146 while True:
1147 block = f.read(HASH_BLOCK_SIZE)
1148 if not block:
1149 break
1150 for i in f_hashers.values():
1151 i.update(block)
1152 except (OSError, IOError) as e:
1153 raise BagValidationError(
1154 _("Could not read %(filename)s: %(error)s")
1155 % {"filename": full_path, "error": force_unicode(e)}
1156 )
1157
1158 return dict((alg, h.hexdigest()) for alg, h in f_hashers.items())
1159
1160
1161 def _load_tag_file(tag_file_name, encoding="utf-8-sig"):
1162 with open_text_file(tag_file_name, "r", encoding=encoding) as tag_file:
1163 # Store duplicate tags as list of vals
1164 # in order of parsing under the same key.
1165 tags = {}
1166 for name, value in _parse_tags(tag_file):
1167 if name not in tags:
1168 tags[name] = value
1169 continue
1170
1171 if not isinstance(tags[name], list):
1172 tags[name] = [tags[name], value]
1173 else:
1174 tags[name].append(value)
1175
1176 return tags
1177
1178
1179 def _parse_tags(tag_file):
1180 """Parses a tag file, according to RFC 2822. This
1181 includes line folding, permitting extra-long
1182 field values.
1183
1184 See http://www.faqs.org/rfcs/rfc2822.html for
1185 more information.
1186 """
1187
1188 tag_name = None
1189 tag_value = None
1190
1191 # Line folding is handled by yielding values only after we encounter
1192 # the start of a new tag, or if we pass the EOF.
1193 for num, line in enumerate(tag_file):
1194 # Skip over any empty or blank lines.
1195 if len(line) == 0 or line.isspace():
1196 continue
1197 elif line[0].isspace() and tag_value is not None: # folded line
1198 tag_value += line
1199 else:
1200 # Starting a new tag; yield the last one.
1201 if tag_name:
1202 yield (tag_name, tag_value.strip())
1203
1204 if ":" not in line:
1205 raise BagValidationError(
1206 _("%(filename)s contains invalid tag: %(line)s")
1207 % {
1208 "line": line.strip(),
1209 "filename": os.path.basename(tag_file.name),
1210 }
1211 )
1212
1213 parts = line.strip().split(":", 1)
1214 tag_name = parts[0].strip()
1215 tag_value = parts[1]
1216
1217 # Passed the EOF. All done after this.
1218 if tag_name:
1219 yield (tag_name, tag_value.strip())
1220
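# Example of the folding handled above: the two physical lines
#
#     External-Description: A very long description that is
#      continued on an indented second line
#
# parse as one tag whose value keeps the embedded newline:
# ("External-Description",
#  "A very long description that is\n continued on an indented second line")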
1221
1222 def _make_tag_file(bag_info_path, bag_info):
1223 headers = sorted(bag_info.keys())
1224 with open_text_file(bag_info_path, "w") as f:
1225 for h in headers:
1226 values = bag_info[h]
1227 if not isinstance(values, list):
1228 values = [values]
1229 for txt in values:
1230 # strip CR, LF and CRLF so they don't mess up the tag file
1231 txt = re.sub(r"\n|\r|(\r\n)", "", force_unicode(txt))
1232 f.write("%s: %s\n" % (h, txt))
1233
1234
1235 def make_manifests(data_dir, processes, algorithms=DEFAULT_CHECKSUMS, encoding="utf-8"):
1236 LOGGER.info(
1237 _("Using %(process_count)d processes to generate manifests: %(algorithms)s"),
1238 {"process_count": processes, "algorithms": ", ".join(algorithms)},
1239 )
1240
1241 manifest_line_generator = partial(generate_manifest_lines, algorithms=algorithms)
1242
1243 if processes > 1:
1244 pool = multiprocessing.Pool(processes=processes)
1245 checksums = pool.map(manifest_line_generator, _walk(data_dir))
1246 pool.close()
1247 pool.join()
1248 else:
1249 checksums = [manifest_line_generator(i) for i in _walk(data_dir)]
1250
1251 # At this point we have a list of tuples which start with the algorithm name:
1252 manifest_data = {}
1253 for batch in checksums:
1254 for entry in batch:
1255 manifest_data.setdefault(entry[0], []).append(entry[1:])
1256
1257 # These will be keyed on the algorithm name so we can perform sanity checks
1258 # below to catch failures in the hashing process:
1259 num_files = defaultdict(lambda: 0)
1260 total_bytes = defaultdict(lambda: 0)
1261
1262 for algorithm, values in manifest_data.items():
1263 manifest_filename = "manifest-%s.txt" % algorithm
1264
1265 with open_text_file(manifest_filename, "w", encoding=encoding) as manifest:
1266 for digest, filename, byte_count in values:
1267 manifest.write("%s %s\n" % (digest, _encode_filename(filename)))
1268 num_files[algorithm] += 1
1269 total_bytes[algorithm] += byte_count
1270
1271 # We'll use sets of the values for the error checks and eventually return the payload oxum values:
1272 byte_value_set = set(total_bytes.values())
1273 file_count_set = set(num_files.values())
1274
1275 # allow a bag with an empty payload
1276 if not byte_value_set and not file_count_set:
1277 return 0, 0
1278
1279 if len(file_count_set) != 1:
1280 raise RuntimeError(_("Expected the same number of files for each checksum"))
1281
1282 if len(byte_value_set) != 1:
1283 raise RuntimeError(_("Expected the same number of bytes for each checksum"))
1284
1285 return byte_value_set.pop(), file_count_set.pop()
1286
1287
1288 def _make_tagmanifest_file(alg, bag_dir, encoding="utf-8"):
1289 tagmanifest_file = join(bag_dir, "tagmanifest-%s.txt" % alg)
1290 LOGGER.info(_("Creating %s"), tagmanifest_file)
1291
1292 checksums = []
1293 for f in _find_tag_files(bag_dir):
1294 if re.match(r"^tagmanifest-.+\.txt$", f):
1295 continue
1296 with open(join(bag_dir, f), "rb") as fh:
1297 m = hashlib.new(alg)
1298 while True:
1299 block = fh.read(HASH_BLOCK_SIZE)
1300 if not block:
1301 break
1302 m.update(block)
1303 checksums.append((m.hexdigest(), f))
1304
1305 with open_text_file(
1306 tagmanifest_file, mode="w", encoding=encoding
1307 ) as tagmanifest:
1308 for digest, filename in checksums:
1309 tagmanifest.write("%s %s\n" % (digest, filename))
1310
1311
1312 def _find_tag_files(bag_dir):
1313 for dir in os.listdir(bag_dir):
1314 if dir != "data":
1315 if os.path.isfile(dir) and not dir.startswith("tagmanifest-"):
1316 yield dir
1317 for dir_name, _, filenames in os.walk(dir):
1318 for filename in filenames:
1319 if filename.startswith("tagmanifest-"):
1320 continue
1321 # remove everything up to the bag_dir directory
1322 p = join(dir_name, filename)
1323 yield os.path.relpath(p, bag_dir)
1324
1325
1326 def _walk(data_dir):
1327 for dirpath, dirnames, filenames in os.walk(data_dir):
1328 # if we don't sort here the order of entries is non-deterministic
1329 # which makes it hard to test the fixity of tagmanifest-md5.txt
1330 filenames.sort()
1331 dirnames.sort()
1332 for fn in filenames:
1333 path = os.path.join(dirpath, fn)
1334 # BagIt spec requires manifest to always use '/' as path separator
1335 if os.path.sep != "/":
1336 parts = path.split(os.path.sep)
1337 path = "/".join(parts)
1338 yield path
1339
1340
1341 def _can_bag(test_dir):
1342 """Scan the provided directory for files which cannot be bagged due to insufficient permissions"""
1343 unbaggable = []
1344
1345 if not os.access(test_dir, os.R_OK):
1346 # We cannot continue without permission to read the source directory
1347 unbaggable.append(test_dir)
1348 return unbaggable
1349
1350 if not os.access(test_dir, os.W_OK):
1351 unbaggable.append(test_dir)
1352
1353 for dirpath, dirnames, filenames in os.walk(test_dir):
1354 for directory in dirnames:
1355 full_path = os.path.join(dirpath, directory)
1356 if not os.access(full_path, os.W_OK):
1357 unbaggable.append(full_path)
1358
1359 return unbaggable
1360
1361
1362 def _can_read(test_dir):
1363 """
1364 returns ((unreadable_dirs), (unreadable_files))
1365 """
1366 unreadable_dirs = []
1367 unreadable_files = []
1368
1369 if not os.access(test_dir, os.R_OK):
1370 unreadable_dirs.append(test_dir)
1371 else:
1372 for dirpath, dirnames, filenames in os.walk(test_dir):
1373 for dn in dirnames:
1374 full_path = os.path.join(dirpath, dn)
1375 if not os.access(full_path, os.R_OK):
1376 unreadable_dirs.append(full_path)
1377 for fn in filenames:
1378 full_path = os.path.join(dirpath, fn)
1379 if not os.access(full_path, os.R_OK):
1380 unreadable_files.append(full_path)
1381 return (tuple(unreadable_dirs), tuple(unreadable_files))
1382
1383
1384 def generate_manifest_lines(filename, algorithms=DEFAULT_CHECKSUMS):
1385 LOGGER.info(_("Generating manifest lines for file %s"), filename)
1386
1387 # For performance we'll read the file only once and pass it block
1388 # by block to every requested hash algorithm:
1389 hashers = get_hashers(algorithms)
1390
1391 total_bytes = 0
1392
1393 with open(filename, "rb") as f:
1394 while True:
1395 block = f.read(HASH_BLOCK_SIZE)
1396
1397 if not block:
1398 break
1399
1400 total_bytes += len(block)
1401 for hasher in hashers.values():
1402 hasher.update(block)
1403
1404 decoded_filename = _decode_filename(filename)
1405
1406 # We'll generate a list of results in roughly manifest format but prefixed with the algorithm:
1407 results = [
1408 (alg, hasher.hexdigest(), decoded_filename, total_bytes)
1409 for alg, hasher in hashers.items()
1410 ]
1411
1412 return results
1413
1414
1415 def _encode_filename(s):
1416 s = s.replace("\r", "%0D")
1417 s = s.replace("\n", "%0A")
1418 return s
1419
1420
1421 def _decode_filename(s):
1422 s = re.sub(r"%0D", "\r", s, flags=re.IGNORECASE)
1423 s = re.sub(r"%0A", "\n", s, flags=re.IGNORECASE)
1424 return s
1425
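# CR and LF are percent-encoded in manifests so that one payload file always
# occupies exactly one manifest line; the two helpers round-trip:
#
#     _encode_filename("data/odd\nname")   # -> "data/odd%0Aname"
#     _decode_filename("data/odd%0Aname")  # -> "data/odd\nname"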
1426
1427 def force_unicode_py2(s):
1428 """Reliably return a Unicode string given a possible unicode or byte string"""
1429 if isinstance(s, str):
1430 return s.decode("utf-8")
1431 else:
1432 return unicode(s)
1433
1434
1435 if sys.version_info > (3, 0):
1436 force_unicode = str
1437 else:
1438 force_unicode = force_unicode_py2
1439
1440 # following code is used for command line program
1441
1442
1443 class BagArgumentParser(argparse.ArgumentParser):
1444 def __init__(self, *args, **kwargs):
1445 self.bag_info = {}
1446 argparse.ArgumentParser.__init__(self, *args, **kwargs)
1447
1448
1449 class BagHeaderAction(argparse.Action):
1450 def __call__(self, parser, _, values, option_string=None):
1451 opt = option_string.lstrip("-")
1452 opt_caps = "-".join([o.capitalize() for o in opt.split("-")])
1453 parser.bag_info[opt_caps] = values
1454
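# BagHeaderAction maps each metadata option back to its canonical
# bag-info.txt header name, so that, for example,
#
#     $ bagit.py --source-organization "Library of Congress" my_directory
#
# stores parser.bag_info["Source-Organization"] = "Library of Congress",
# which make_bag() then writes into bag-info.txt.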
1455
1456 def _make_parser():
1457 parser = BagArgumentParser(
1458 formatter_class=argparse.RawDescriptionHelpFormatter,
1459 description="bagit-python version %s\n\n%s\n" % (VERSION, __doc__.strip()),
1460 )
1461 parser.add_argument(
1462 "--processes",
1463 type=int,
1464 dest="processes",
1465 default=1,
1466 help=_(
1467 "Use multiple processes to calculate checksums faster (default: %(default)s)"
1468 ),
1469 )
1470 parser.add_argument("--log", help=_("The name of the log file (default: stdout)"))
1471 parser.add_argument(
1472 "--quiet",
1473 action="store_true",
1474 help=_("Suppress all progress information other than errors"),
1475 )
1476 parser.add_argument(
1477 "--validate",
1478 action="store_true",
1479 help=_(
1480 "Validate existing bags in the provided directories instead of"
1481 " creating new ones"
1482 ),
1483 )
1484 parser.add_argument(
1485 "--fast",
1486 action="store_true",
1487 help=_(
1488 "Modify --validate behaviour to only test whether the bag directory"
1489 " has the number of files and total size specified in Payload-Oxum"
1490 " without performing checksum validation to detect corruption."
1491 ),
1492 )
1493 parser.add_argument(
1494 "--completeness-only",
1495 action="store_true",
1496 help=_(
1497 "Modify --validate behaviour to test whether the bag directory"
1498 " has the expected payload specified in the checksum manifests"
1499 " without performing checksum validation to detect corruption."
1500 ),
1501 )
1502
1503 checksum_args = parser.add_argument_group(
1504 _("Checksum Algorithms"),
1505 _(
1506 "Select the manifest algorithms to be used when creating bags"
1507 " (default=%s)"
1508 )
1509 % ", ".join(DEFAULT_CHECKSUMS),
1510 )
1511
1512 for i in CHECKSUM_ALGOS:
1513 alg_name = re.sub(r"^([A-Z]+)(\d+)$", r"\1-\2", i.upper())
1514 checksum_args.add_argument(
1515 "--%s" % i,
1516 action="append_const",
1517 dest="checksums",
1518 const=i,
1519 help=_("Generate %s manifest when creating a bag") % alg_name,
1520 )
1521
1522 metadata_args = parser.add_argument_group(_("Optional Bag Metadata"))
1523 for header in STANDARD_BAG_INFO_HEADERS:
1524 metadata_args.add_argument(
1525 "--%s" % header.lower(), type=str, action=BagHeaderAction
1526 )
1527
1528 parser.add_argument(
1529 "directory",
1530 nargs="+",
1531 help=_(
1532 "Directory which will be converted into a bag in place"
1533 " by moving any existing files into the BagIt structure"
1534 " and creating the manifests and other metadata."
1535 ),
1536 )
1537
1538 return parser
1539
1540
1541 def _configure_logging(opts):
1542 log_format = "%(asctime)s - %(levelname)s - %(message)s"
1543 if opts.quiet:
1544 level = logging.ERROR
1545 else:
1546 level = logging.INFO
1547 if opts.log:
1548 logging.basicConfig(filename=opts.log, level=level, format=log_format)
1549 else:
1550 logging.basicConfig(level=level, format=log_format)
1551
1552
1553 def main():
1554 if "--version" in sys.argv:
1555 print(_("bagit-python version %s") % VERSION)
1556 sys.exit(0)
1557
1558 parser = _make_parser()
1559 args = parser.parse_args()
1560
1561 if args.processes < 0:
1562 parser.error(_("The number of processes must be 0 or greater"))
1563
1564 if args.fast and not args.validate:
1565 parser.error(_("--fast is only allowed as an option for --validate!"))
1566
1567 _configure_logging(args)
1568
1569 rc = 0
1570 for bag_dir in args.directory:
1571 # validate the bag
1572 if args.validate:
1573 try:
1574 bag = Bag(bag_dir)
1575 # validate throws a BagError or BagValidationError
1576 bag.validate(
1577 processes=args.processes,
1578 fast=args.fast,
1579 completeness_only=args.completeness_only,
1580 )
1581 if args.fast:
1582 LOGGER.info(_("%s valid according to Payload-Oxum"), bag_dir)
1583 else:
1584 LOGGER.info(_("%s is valid"), bag_dir)
1585 except BagError as e:
1586 LOGGER.error(
1587 _("%(bag)s is invalid: %(error)s"), {"bag": bag_dir, "error": e}
1588 )
1589 rc = 1
1590
1591 # make the bag
1592 else:
1593 try:
1594 make_bag(
1595 bag_dir,
1596 bag_info=parser.bag_info,
1597 processes=args.processes,
1598 checksums=args.checksums,
1599 )
1600 except Exception as exc:
1601 LOGGER.error(
1602 _("Failed to create bag in %(bag_directory)s: %(error)s"),
1603 {"bag_directory": bag_dir, "error": exc},
1604 exc_info=True,
1605 )
1606 rc = 1
1607
1608 sys.exit(rc)
1609
1610
1611 if __name__ == "__main__":
1612 main()