env/lib/python3.9/site-packages/bagit.py @ 0:4f3585e2f14b (draft, default, tip)

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"

author | shellac
---|---
date | Mon, 22 Mar 2021 18:12:50 +0000
parents | (none)
children | (none)
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import absolute_import, division, print_function, unicode_literals

import argparse
import codecs
import gettext
import hashlib
import logging
import multiprocessing
import os
import re
import signal
import sys
import tempfile
import unicodedata
import warnings
from collections import defaultdict
from datetime import date
from functools import partial
from os.path import abspath, isdir, isfile, join

from pkg_resources import DistributionNotFound, get_distribution

try:
    from urllib.parse import urlparse
except ImportError:
    from urlparse import urlparse


def find_locale_dir():
    for prefix in (os.path.dirname(__file__), sys.prefix):
        locale_dir = os.path.join(prefix, "locale")
        if os.path.isdir(locale_dir):
            return locale_dir


TRANSLATION_CATALOG = gettext.translation(
    "bagit-python", localedir=find_locale_dir(), fallback=True
)
if sys.version_info < (3,):
    _ = TRANSLATION_CATALOG.ugettext
else:
    _ = TRANSLATION_CATALOG.gettext

MODULE_NAME = "bagit" if __name__ == "__main__" else __name__

LOGGER = logging.getLogger(MODULE_NAME)

try:
    VERSION = get_distribution(MODULE_NAME).version
except DistributionNotFound:
    VERSION = "0.0.dev0"

PROJECT_URL = "https://github.com/LibraryOfCongress/bagit-python"

__doc__ = (
    _(
        """
BagIt is a directory, filename convention for bundling an arbitrary set of
files with a manifest, checksums, and additional metadata. More about BagIt
can be found at:

    http://purl.org/net/bagit

bagit.py is a pure python drop in library and command line tool for creating,
and working with BagIt directories.


Command-Line Usage:

Basic usage is to give bagit.py a directory to bag up:

    $ bagit.py my_directory

This does a bag-in-place operation where the current contents will be moved
into the appropriate BagIt structure and the metadata files will be created.

You can bag multiple directories if you wish:

    $ bagit.py directory1 directory2

Optionally you can provide metadata which will be stored in bag-info.txt:

    $ bagit.py --source-organization "Library of Congress" directory

You can also select which manifest algorithms will be used:

    $ bagit.py --sha1 --md5 --sha256 --sha512 directory


Using BagIt from your Python code:

    import bagit
    bag = bagit.make_bag('example-directory', {'Contact-Name': 'Ed Summers'})
    print(bag.entries)

For more information or to contribute to bagit-python's development, please
visit %(PROJECT_URL)s
"""
    )
    % globals()
)

# standard bag-info.txt metadata
STANDARD_BAG_INFO_HEADERS = [
    "Source-Organization",
    "Organization-Address",
    "Contact-Name",
    "Contact-Phone",
    "Contact-Email",
    "External-Description",
    "External-Identifier",
    "Bag-Size",
    "Bag-Group-Identifier",
    "Bag-Count",
    "Internal-Sender-Identifier",
    "Internal-Sender-Description",
    "BagIt-Profile-Identifier",
    # Bagging-Date is autogenerated
    # Payload-Oxum is autogenerated
]

try:
    CHECKSUM_ALGOS = hashlib.algorithms_guaranteed
except AttributeError:
    # FIXME: remove when we drop Python 2 (https://github.com/LibraryOfCongress/bagit-python/issues/102)
    # Python 2.7.0-2.7.8
    CHECKSUM_ALGOS = set(hashlib.algorithms)
DEFAULT_CHECKSUMS = ["sha256", "sha512"]

#: Block size used when reading files for hashing:
HASH_BLOCK_SIZE = 512 * 1024

#: Convenience function used everywhere we want to open a file to read text
#: rather than undecoded bytes:
open_text_file = partial(codecs.open, encoding="utf-8", errors="strict")

# This is the same as decoding the byte values in codecs.BOM:
UNICODE_BYTE_ORDER_MARK = "\uFEFF"


def make_bag(
    bag_dir, bag_info=None, processes=1, checksums=None, checksum=None, encoding="utf-8"
):
    """
    Convert a given directory into a bag. You can pass in arbitrary
    key/value pairs to put into the bag-info.txt metadata file as
    the bag_info dictionary.
    """

    if checksum is not None:
        warnings.warn(
            _(
                "The `checksum` argument for `make_bag` should be replaced with `checksums`"
            ),
            DeprecationWarning,
        )
        checksums = checksum

    if checksums is None:
        checksums = DEFAULT_CHECKSUMS

    bag_dir = os.path.abspath(bag_dir)
    cwd = os.path.abspath(os.path.curdir)

    if cwd.startswith(bag_dir) and cwd != bag_dir:
        raise RuntimeError(
            _("Bagging a parent of the current directory is not supported")
        )

    LOGGER.info(_("Creating bag for directory %s"), bag_dir)

    if not os.path.isdir(bag_dir):
        LOGGER.error(_("Bag directory %s does not exist"), bag_dir)
        raise RuntimeError(_("Bag directory %s does not exist") % bag_dir)

    # FIXME: we should do the permissions checks before changing directories
    old_dir = os.path.abspath(os.path.curdir)

    try:
        # TODO: These two checks are currently redundant since an unreadable directory will also
        #       often be unwritable, and this code will require review when we add the option to
        #       bag to a destination other than the source. It would be nice if we could avoid
        #       walking the directory tree more than once even if most filesystems will cache it

        unbaggable = _can_bag(bag_dir)

        if unbaggable:
            LOGGER.error(
                _("Unable to write to the following directories and files:\n%s"),
                unbaggable,
            )
            raise BagError(_("Missing permissions to move all files and directories"))

        unreadable_dirs, unreadable_files = _can_read(bag_dir)

        if unreadable_dirs or unreadable_files:
            if unreadable_dirs:
                LOGGER.error(
                    _("The following directories do not have read permissions:\n%s"),
                    unreadable_dirs,
                )
            if unreadable_files:
                LOGGER.error(
                    _("The following files do not have read permissions:\n%s"),
                    unreadable_files,
                )
            raise BagError(
                _("Read permissions are required to calculate file fixities")
            )
        else:
            LOGGER.info(_("Creating data directory"))

            # FIXME: if we calculate full paths we won't need to deal with changing directories
            os.chdir(bag_dir)
            cwd = os.getcwd()
            temp_data = tempfile.mkdtemp(dir=cwd)

            for f in os.listdir("."):
                if os.path.abspath(f) == temp_data:
                    continue
                new_f = os.path.join(temp_data, f)
                LOGGER.info(
                    _("Moving %(source)s to %(destination)s"),
                    {"source": f, "destination": new_f},
                )
                os.rename(f, new_f)

            LOGGER.info(
                _("Moving %(source)s to %(destination)s"),
                {"source": temp_data, "destination": "data"},
            )
            os.rename(temp_data, "data")

            # permissions for the payload directory should match those of the
            # original directory
            os.chmod("data", os.stat(cwd).st_mode)

            total_bytes, total_files = make_manifests(
                "data", processes, algorithms=checksums, encoding=encoding
            )

            LOGGER.info(_("Creating bagit.txt"))
            txt = """BagIt-Version: 0.97\nTag-File-Character-Encoding: UTF-8\n"""
            with open_text_file("bagit.txt", "w") as bagit_file:
                bagit_file.write(txt)

            LOGGER.info(_("Creating bag-info.txt"))
            if bag_info is None:
                bag_info = {}

            # allow 'Bagging-Date' and 'Bag-Software-Agent' to be overridden
255 if "Bagging-Date" not in bag_info: | |
256 bag_info["Bagging-Date"] = date.strftime(date.today(), "%Y-%m-%d") | |
257 if "Bag-Software-Agent" not in bag_info: | |
258 bag_info["Bag-Software-Agent"] = "bagit.py v%s <%s>" % ( | |
259 VERSION, | |
260 PROJECT_URL, | |
261 ) | |
262 | |
263 bag_info["Payload-Oxum"] = "%s.%s" % (total_bytes, total_files) | |
264 _make_tag_file("bag-info.txt", bag_info) | |
265 | |
266 for c in checksums: | |
267 _make_tagmanifest_file(c, bag_dir, encoding="utf-8") | |
268 except Exception: | |
269 LOGGER.exception(_("An error occurred creating a bag in %s"), bag_dir) | |
270 raise | |
271 finally: | |
272 os.chdir(old_dir) | |
273 | |
274 return Bag(bag_dir) | |
275 | |
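# A minimal usage sketch (not part of the original module): bag a directory in
# place with custom bag-info metadata and explicit checksum algorithms. The
# directory name and organization are hypothetical.
#
#   bag = make_bag("my_directory",
#                  bag_info={"Source-Organization": "Example Org"},
#                  checksums=["sha256", "sha512"])
#   print(bag.entries)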

class Bag(object):
    """A representation of a bag."""

    valid_files = ["bagit.txt", "fetch.txt"]
    valid_directories = ["data"]

    def __init__(self, path=None):
        super(Bag, self).__init__()
        self.tags = {}
        self.info = {}
        #: Dictionary of manifest entries and the checksum values for each
        #: algorithm:
        self.entries = {}

        # To reliably handle Unicode normalization differences, we maintain
        # lookup dictionaries in both directions for the filenames read from
        # the filesystem and the manifests so we can handle cases where the
        # normalization form changed between the bag being created and read.
        # See https://github.com/LibraryOfCongress/bagit-python/issues/51.

        #: maps Unicode-normalized values to the raw value from the filesystem
        self.normalized_filesystem_names = {}

        #: maps Unicode-normalized values to the raw value in the manifest
        self.normalized_manifest_names = {}

        self.algorithms = []
        self.tag_file_name = None
        self.path = abspath(path)
        if path:
            # if path ends in a path separator, strip it off
            if path[-1] == os.sep:
                self.path = abspath(path[:-1])
            self._open()

    def __str__(self):
        # FIXME: develop a more informative string representation for a Bag
        return self.path

    @property
    def algs(self):
        warnings.warn(_("Use Bag.algorithms instead of Bag.algs"), DeprecationWarning)
        return self.algorithms

    @property
    def version(self):
        warnings.warn(
            _("Use the Bag.version_info tuple instead of Bag.version"),
            DeprecationWarning,
        )
        return self._version

    def _open(self):
        # Open the bagit.txt file, and load any tags from it, including
        # the required version and encoding.
        bagit_file_path = os.path.join(self.path, "bagit.txt")

        if not isfile(bagit_file_path):
            raise BagError(_("Expected bagit.txt does not exist: %s") % bagit_file_path)

        self.tags = tags = _load_tag_file(bagit_file_path)

        required_tags = ("BagIt-Version", "Tag-File-Character-Encoding")
        missing_tags = [i for i in required_tags if i not in tags]
        if missing_tags:
            raise BagError(
                _("Missing required tag in bagit.txt: %s") % ", ".join(missing_tags)
            )

        # To avoid breaking existing code we'll leave self.version as the string
        # and parse it into a numeric version_info tuple. In version 2.0 we can
        # break that.

        self._version = tags["BagIt-Version"]

        try:
            self.version_info = tuple(int(i) for i in self._version.split(".", 1))
        except ValueError:
            raise BagError(
                _("Bag version numbers must be MAJOR.MINOR numbers, not %s")
                % self._version
            )

        if (0, 93) <= self.version_info <= (0, 95):
            self.tag_file_name = "package-info.txt"
        elif (0, 96) <= self.version_info < (2,):
            self.tag_file_name = "bag-info.txt"
        else:
            raise BagError(_("Unsupported bag version: %s") % self._version)

        self.encoding = tags["Tag-File-Character-Encoding"]

        try:
            codecs.lookup(self.encoding)
        except LookupError:
            raise BagValidationError(_("Unsupported encoding: %s") % self.encoding)

        info_file_path = os.path.join(self.path, self.tag_file_name)
        if os.path.exists(info_file_path):
            self.info = _load_tag_file(info_file_path, encoding=self.encoding)

        self._load_manifests()

    def manifest_files(self):
        for filename in ["manifest-%s.txt" % a for a in CHECKSUM_ALGOS]:
            f = os.path.join(self.path, filename)
            if isfile(f):
                yield f

    def tagmanifest_files(self):
        for filename in ["tagmanifest-%s.txt" % a for a in CHECKSUM_ALGOS]:
            f = os.path.join(self.path, filename)
            if isfile(f):
                yield f

    def compare_manifests_with_fs(self):
393 """ | |
394 Compare the filenames in the manifests to the filenames present on the | |
395 local filesystem and returns two lists of the files which are only | |
396 present in the manifests and the files which are only present on the | |
397 local filesystem, respectively. | |
398 """ | |

        # We compare the filenames after Unicode normalization so we can
        # reliably detect normalization changes after bag creation:
        files_on_fs = set(normalize_unicode(i) for i in self.payload_files())
        files_in_manifest = set(
            normalize_unicode(i) for i in self.payload_entries().keys()
        )

        if self.version_info >= (0, 97):
            files_in_manifest.update(self.missing_optional_tagfiles())

        only_on_fs = list()
        only_in_manifest = list()

        for i in files_on_fs.difference(files_in_manifest):
            only_on_fs.append(self.normalized_filesystem_names[i])

        for i in files_in_manifest.difference(files_on_fs):
            only_in_manifest.append(self.normalized_manifest_names[i])

        return only_in_manifest, only_on_fs

    def compare_fetch_with_fs(self):
        """Compares the fetch entries with the files actually
        in the payload, and returns a list of all the files
        that still need to be fetched.
        """

        files_on_fs = set(self.payload_files())
        files_in_fetch = set(self.files_to_be_fetched())

        return list(files_in_fetch - files_on_fs)

    def payload_files(self):
        """Returns a list of filenames which are present on the local filesystem"""
        payload_dir = os.path.join(self.path, "data")

        for dirpath, _, filenames in os.walk(payload_dir):
            for f in filenames:
                # Jump through some hoops here to make sure the payload files
                # are returned with the directory structure relative to the
                # base directory rather than the payload directory
                normalized_f = os.path.normpath(f)
                rel_path = os.path.relpath(
                    os.path.join(dirpath, normalized_f), start=self.path
                )

                self.normalized_filesystem_names[normalize_unicode(rel_path)] = rel_path
                yield rel_path

    def payload_entries(self):
450 """Return a dictionary of items """ | |
        # Don't use dict comprehension (compatibility with Python < 2.7)
        return dict(
            (key, value)
            for (key, value) in self.entries.items()
            if key.startswith("data" + os.sep)
        )

    def save(self, processes=1, manifests=False):
        """
        save will persist any changes that have been made to the bag
        metadata (self.info).

        If you have modified the payload of the bag (added, modified,
        removed files in the data directory) and want to regenerate manifests
        set the manifests parameter to True. The default is False since you
        wouldn't want a save to accidentally create a new manifest for
        a corrupted bag.

        If you want to control the number of processes that are used when
        recalculating checksums use the processes parameter.
        """
        # Error checking
        if not self.path:
            raise BagError(_("Bag.save() called before setting the path!"))

        if not os.access(self.path, os.R_OK | os.W_OK | os.X_OK):
            raise BagError(
                _("Cannot save bag to non-existent or inaccessible directory %s")
                % self.path
            )

        unbaggable = _can_bag(self.path)
        if unbaggable:
            LOGGER.error(
                _(
                    "Missing write permissions for the following directories and files:\n%s"
                ),
                unbaggable,
            )
            raise BagError(_("Missing permissions to move all files and directories"))

        unreadable_dirs, unreadable_files = _can_read(self.path)
        if unreadable_dirs or unreadable_files:
            if unreadable_dirs:
                LOGGER.error(
                    _("The following directories do not have read permissions:\n%s"),
                    unreadable_dirs,
                )
            if unreadable_files:
                LOGGER.error(
                    _("The following files do not have read permissions:\n%s"),
                    unreadable_files,
                )
            raise BagError(
                _("Read permissions are required to calculate file fixities")
            )

        # Change working directory to bag directory so helper functions work
        old_dir = os.path.abspath(os.path.curdir)
        os.chdir(self.path)

        # Generate new manifest files
        if manifests:
            total_bytes, total_files = make_manifests(
                "data", processes, algorithms=self.algorithms, encoding=self.encoding
            )

            # Update Payload-Oxum
            LOGGER.info(_("Updating Payload-Oxum in %s"), self.tag_file_name)
            self.info["Payload-Oxum"] = "%s.%s" % (total_bytes, total_files)

        _make_tag_file(self.tag_file_name, self.info)

        # Update tag-manifest for changes to manifest & bag-info files
        for alg in self.algorithms:
            _make_tagmanifest_file(alg, self.path, encoding=self.encoding)

        # Reload the manifests
        self._load_manifests()

        os.chdir(old_dir)

    def tagfile_entries(self):
        return dict(
            (key, value)
            for (key, value) in self.entries.items()
            if not key.startswith("data" + os.sep)
        )

    def missing_optional_tagfiles(self):
        """
        From v0.97 we need to validate any tagfiles listed
        in the optional tagmanifest(s). As there is no mandatory
        directory structure for additional tagfiles we can
        only check for entries with missing files (not missing
        entries for existing files).
        """
        for tagfilepath in self.tagfile_entries().keys():
            if not os.path.isfile(os.path.join(self.path, tagfilepath)):
                yield tagfilepath

    def fetch_entries(self):
        """Load fetch.txt if present and iterate over its contents

        yields (url, size, filename) tuples

        raises BagError for errors such as an unsafe filename referencing
        data outside of the bag directory
        """

        fetch_file_path = os.path.join(self.path, "fetch.txt")

        if isfile(fetch_file_path):
            with open_text_file(
                fetch_file_path, "r", encoding=self.encoding
            ) as fetch_file:
                for line in fetch_file:
                    url, file_size, filename = line.strip().split(None, 2)

                    if self._path_is_dangerous(filename):
                        raise BagError(
                            _('Path "%(payload_file)s" in "%(source_file)s" is unsafe')
                            % {
                                "payload_file": filename,
                                "source_file": os.path.join(self.path, "fetch.txt"),
                            }
                        )

                    yield url, file_size, filename

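    # A hedged sketch of the fetch.txt line format this parser expects: each
    # line is "URL LENGTH FILENAME", whitespace-separated. For a hypothetical
    # line such as:
    #
    #   https://example.org/big.tiff 104857600 data/big.tiff
    #
    # fetch_entries() would yield the tuple
    # ("https://example.org/big.tiff", "104857600", "data/big.tiff").
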
    def files_to_be_fetched(self):
        """
        Convenience wrapper for fetch_entries which returns only the
        local filename
        """

        for url, file_size, filename in self.fetch_entries():
            yield filename

    def has_oxum(self):
        return "Payload-Oxum" in self.info

    def validate(self, processes=1, fast=False, completeness_only=False):
        """Checks the structure and contents are valid.

        If you supply the parameter fast=True the Payload-Oxum (if present) will
        be used to check that the payload files are present and accounted for,
        instead of re-calculating fixities and comparing them against the
        manifest. By default validate() will re-calculate fixities (fast=False).
        """

        self._validate_structure()
        self._validate_bagittxt()

        self.validate_fetch()

        self._validate_contents(
            processes=processes, fast=fast, completeness_only=completeness_only
        )

        return True

    def is_valid(self, fast=False, completeness_only=False):
        """Returns validation success or failure as boolean.
        Optional fast parameter passed directly to validate().
        """

        try:
            self.validate(fast=fast, completeness_only=completeness_only)
        except BagError:
            return False

        return True

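    # A minimal validation sketch (not part of the original file); the bag
    # path is hypothetical:
    #
    #   bag = Bag("/path/to/some-bag")
    #   if bag.is_valid(fast=True):    # Payload-Oxum count/size check only
    #       print("oxum ok")
    #   bag.validate(processes=4)      # full fixity check, 4 worker processes
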
    def _load_manifests(self):
        self.entries = {}
        manifests = list(self.manifest_files())

        if self.version_info >= (0, 97):
            # v0.97+ requires that optional tagfiles are verified.
            manifests += list(self.tagmanifest_files())

        for manifest_filename in manifests:
            if manifest_filename.find("tagmanifest-") != -1:
                search = "tagmanifest-"
            else:
                search = "manifest-"
            alg = (
                os.path.basename(manifest_filename)
                .replace(search, "")
                .replace(".txt", "")
            )
            if alg not in self.algorithms:
                self.algorithms.append(alg)

            with open_text_file(
                manifest_filename, "r", encoding=self.encoding
            ) as manifest_file:
                if manifest_file.encoding.startswith("UTF"):
                    # We'll check the first character to see if it's a BOM:
                    if manifest_file.read(1) == UNICODE_BYTE_ORDER_MARK:
                        # We'll skip it either way by letting line decoding
                        # happen at the new offset but we will issue a warning
                        # for UTF-8 since the presence of a BOM is contrary to
                        # the BagIt specification:
                        if manifest_file.encoding == "UTF-8":
                            LOGGER.warning(
                                _(
                                    "%s is encoded using UTF-8 but contains an unnecessary"
                                    " byte-order mark, which is not in compliance with the"
                                    " BagIt RFC"
                                ),
                                manifest_file.name,
                            )
                    else:
                        manifest_file.seek(0)  # Pretend the first read never happened

                for line in manifest_file:
                    line = line.strip()

                    # Ignore blank lines and comments.
                    if line == "" or line.startswith("#"):
                        continue

                    entry = line.split(None, 1)

                    # Format is CHECKSUM *FILENAME (the checksum first, then the
                    # filename, optionally prefixed with '*')
                    if len(entry) != 2:
                        LOGGER.error(
                            _(
                                "%(bag)s: Invalid %(algorithm)s manifest entry: %(line)s"
                            ),
                            {"bag": self, "algorithm": alg, "line": line},
                        )
                        continue

                    entry_hash = entry[0]
                    entry_path = os.path.normpath(entry[1].lstrip("*"))
                    entry_path = _decode_filename(entry_path)

                    if self._path_is_dangerous(entry_path):
                        raise BagError(
                            _(
                                'Path "%(payload_file)s" in manifest "%(manifest_file)s" is unsafe'
                            )
                            % {
                                "payload_file": entry_path,
                                "manifest_file": manifest_file.name,
                            }
                        )

                    entry_hashes = self.entries.setdefault(entry_path, {})

                    if alg in entry_hashes:
                        warning_ctx = {
                            "bag": self,
                            "algorithm": alg,
                            "filename": entry_path,
                        }
                        if entry_hashes[alg] == entry_hash:
                            msg = _(
                                "%(bag)s: %(algorithm)s manifest lists %(filename)s"
                                " multiple times with the same value"
                            )
                            if self.version_info >= (1,):
                                raise BagError(msg % warning_ctx)
                            else:
                                LOGGER.warning(msg, warning_ctx)
                        else:
                            raise BagError(
                                _(
                                    "%(bag)s: %(algorithm)s manifest lists %(filename)s"
                                    " multiple times with conflicting values"
                                )
                                % warning_ctx
                            )

                    entry_hashes[alg] = entry_hash

        self.normalized_manifest_names.update(
            (normalize_unicode(i), i) for i in self.entries.keys()
        )

    def _validate_structure(self):
        """
        Checks the structure of the bag to determine whether it conforms to the
        BagIt spec. Returns true on success, otherwise it will raise a
        BagValidationError exception.
        """

        self._validate_structure_payload_directory()
        self._validate_structure_tag_files()

    def _validate_structure_payload_directory(self):
        data_dir_path = os.path.join(self.path, "data")

        if not isdir(data_dir_path):
            raise BagValidationError(
                _("Expected data directory %s does not exist") % data_dir_path
            )

    def _validate_structure_tag_files(self):
        # Note: we deviate somewhat from v0.96 of the spec in that it allows
        # other files and directories to be present in the base directory

        if not list(self.manifest_files()):
            raise BagValidationError(_("No manifest files found"))
        if "bagit.txt" not in os.listdir(self.path):
            raise BagValidationError(
                _('Expected %s to contain "bagit.txt"') % self.path
            )

    def validate_fetch(self):
        """Validate the fetch.txt file

        Raises `BagError` for errors and otherwise returns no value
        """

        for url, file_size, filename in self.fetch_entries():
            # fetch_entries will raise a BagError for unsafe filenames
            # so at this point we will check only that the URL is minimally
            # well formed:
            parsed_url = urlparse(url)

            if not all((parsed_url.scheme, parsed_url.netloc)):
                raise BagError(_("Malformed URL in fetch.txt: %s") % url)

    def _validate_contents(self, processes=1, fast=False, completeness_only=False):
        if fast and not self.has_oxum():
            raise BagValidationError(
                _("Fast validation requires bag-info.txt to include Payload-Oxum")
            )

        # Perform the fast file count + size check so we can fail early:
        self._validate_oxum()

        if fast:
            return

        self._validate_completeness()

        if completeness_only:
            return

        self._validate_entries(processes)

    def _validate_oxum(self):
        oxum = self.info.get("Payload-Oxum")

        if oxum is None:
            return

        # If multiple Payload-Oxum tags (bad idea)
        # use the first listed in bag-info.txt
        if isinstance(oxum, list):
            LOGGER.warning(_("bag-info.txt defines multiple Payload-Oxum values!"))
            oxum = oxum[0]

        oxum_byte_count, oxum_file_count = oxum.split(".", 1)

        if not oxum_byte_count.isdigit() or not oxum_file_count.isdigit():
            raise BagError(_("Malformed Payload-Oxum value: %s") % oxum)

        oxum_byte_count = int(oxum_byte_count)
        oxum_file_count = int(oxum_file_count)
        total_bytes = 0
        total_files = 0

        for payload_file in self.payload_files():
            payload_file = os.path.join(self.path, payload_file)
            total_bytes += os.stat(payload_file).st_size
            total_files += 1

        if oxum_file_count != total_files or oxum_byte_count != total_bytes:
            raise BagValidationError(
                _(
                    "Payload-Oxum validation failed."
                    " Expected %(oxum_file_count)d files and %(oxum_byte_count)d bytes"
                    " but found %(found_file_count)d files and %(found_byte_count)d bytes"
                )
                % {
                    "found_file_count": total_files,
                    "found_byte_count": total_bytes,
                    "oxum_file_count": oxum_file_count,
                    "oxum_byte_count": oxum_byte_count,
                }
            )

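    # Payload-Oxum is "octetcount.streamcount": the total payload size in
    # bytes, a dot, then the number of payload files. A hypothetical tag of
    #
    #   Payload-Oxum: 279164409.1198
    #
    # would only validate against a data/ tree containing exactly 1198 files
    # totalling exactly 279,164,409 bytes.
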
    def _validate_completeness(self):
        """
        Verify that the actual file manifests match the files in the data directory
        """
        errors = list()

        # First we'll make sure there's no mismatch between the filesystem
        # and the list of files in the manifest(s)
        only_in_manifests, only_on_fs = self.compare_manifests_with_fs()
        for path in only_in_manifests:
            e = FileMissing(path)
            LOGGER.warning(force_unicode(e))
            errors.append(e)
        for path in only_on_fs:
            e = UnexpectedFile(path)
            LOGGER.warning(force_unicode(e))
            errors.append(e)

        if errors:
            raise BagValidationError(_("Bag validation failed"), errors)

    def _validate_entries(self, processes):
        """
        Verify that the actual file contents match the recorded hashes stored in the manifest files
        """
        errors = list()

        if os.name == "posix":
            worker_init = posix_multiprocessing_worker_initializer
        else:
            worker_init = None

        args = (
            (
                self.path,
                self.normalized_filesystem_names.get(rel_path, rel_path),
                hashes,
                self.algorithms,
            )
            for rel_path, hashes in self.entries.items()
        )

        try:
            if processes == 1:
                hash_results = [_calc_hashes(i) for i in args]
            else:
                try:
                    pool = multiprocessing.Pool(
                        processes if processes else None, initializer=worker_init
                    )
                    hash_results = pool.map(_calc_hashes, args)
                finally:
                    pool.terminate()

        # Any unhandled exceptions are probably fatal
        except:
            LOGGER.exception(_("Unable to calculate file hashes for %s"), self)
            raise

        for rel_path, f_hashes, hashes in hash_results:
            for alg, computed_hash in f_hashes.items():
                stored_hash = hashes[alg]
                if stored_hash.lower() != computed_hash:
                    e = ChecksumMismatch(
                        rel_path, alg, stored_hash.lower(), computed_hash
                    )
                    LOGGER.warning(force_unicode(e))
                    errors.append(e)

        if errors:
            raise BagValidationError(_("Bag validation failed"), errors)

    def _validate_bagittxt(self):
        """
        Verify that bagit.txt conforms to specification
        """
        bagit_file_path = os.path.join(self.path, "bagit.txt")

        # Note that we are intentionally opening this file in binary mode so we can confirm
        # that it does not start with the UTF-8 byte-order-mark
        with open(bagit_file_path, "rb") as bagit_file:
            first_line = bagit_file.read(4)
            if first_line.startswith(codecs.BOM_UTF8):
                raise BagValidationError(
                    _("bagit.txt must not contain a byte-order mark")
                )

    def _path_is_dangerous(self, path):
        """
        Return true if path looks dangerous, i.e. potentially operates
        outside the bagging directory structure, e.g. ~/.bashrc, ../../../secrets.json,
        \\?\c:\, D:\sys32\cmd.exe
        """
        if os.path.isabs(path):
            return True
        if os.path.expanduser(path) != path:
            return True
        if os.path.expandvars(path) != path:
            return True
        real_path = os.path.realpath(os.path.join(self.path, path))
        real_path = os.path.normpath(real_path)
        bag_path = os.path.realpath(self.path)
        bag_path = os.path.normpath(bag_path)
        common = os.path.commonprefix((bag_path, real_path))
        return not (common == bag_path)


class BagError(Exception):
    pass


class BagValidationError(BagError):
    def __init__(self, message, details=None):
        super(BagValidationError, self).__init__()

        if details is None:
            details = []

        self.message = message
        self.details = details

    def __str__(self):
        if len(self.details) > 0:
            details = "; ".join([force_unicode(e) for e in self.details])
            return "%s: %s" % (self.message, details)
        return self.message


class ManifestErrorDetail(BagError):
    def __init__(self, path):
        super(ManifestErrorDetail, self).__init__()

        self.path = path


class ChecksumMismatch(ManifestErrorDetail):
    def __init__(self, path, algorithm=None, expected=None, found=None):
        super(ChecksumMismatch, self).__init__(path)

        self.path = path
        self.algorithm = algorithm
        self.expected = expected
        self.found = found

    def __str__(self):
        return _(
            '%(path)s %(algorithm)s validation failed: expected="%(expected)s" found="%(found)s"'
        ) % {
            "path": force_unicode(self.path),
            "algorithm": self.algorithm,
            "expected": self.expected,
            "found": self.found,
        }


class FileMissing(ManifestErrorDetail):
    def __str__(self):
        return _(
            "%s exists in manifest but was not found on filesystem"
        ) % force_unicode(self.path)


class UnexpectedFile(ManifestErrorDetail):
    def __str__(self):
        return _("%s exists on filesystem but is not in the manifest") % self.path


class FileNormalizationConflict(BagError):
    """
    Exception raised when two files differ only in normalization and thus
    are not safely portable
    """

    def __init__(self, file_a, file_b):
        super(FileNormalizationConflict, self).__init__()

        self.file_a = file_a
        self.file_b = file_b

    def __str__(self):
        return _(
            'Unicode normalization conflict for file "%(file_a)s" and "%(file_b)s"'
        ) % {"file_a": self.file_a, "file_b": self.file_b}


def posix_multiprocessing_worker_initializer():
    """Ignore SIGINT in multiprocessing workers on POSIX systems"""
    signal.signal(signal.SIGINT, signal.SIG_IGN)


# The Unicode normalization form used here doesn't matter – all we care about
# is consistency since the input value will be preserved:


def normalize_unicode_py3(s):
    return unicodedata.normalize("NFC", s)


def normalize_unicode_py2(s):
    if isinstance(s, str):
        s = s.decode("utf-8")
    return unicodedata.normalize("NFC", s)


if sys.version_info > (3, 0):
    normalize_unicode = normalize_unicode_py3
else:
    normalize_unicode = normalize_unicode_py2


def build_unicode_normalized_lookup_dict(filenames):
    """
    Return a dictionary mapping unicode-normalized filenames to as-encoded
    values to efficiently detect conflicts between the filesystem and manifests.

    This is necessary because some filesystems and utilities may automatically
    apply a different Unicode normalization form to filenames than was applied
    when the bag was originally created.

    The best known example of this is when a bag is created using a
    normalization form other than NFD and then transferred to a Mac where the
    HFS+ filesystem will transparently normalize filenames to a variant of NFD
    for every call:

    https://developer.apple.com/legacy/library/technotes/tn/tn1150.html#UnicodeSubtleties

    Windows is documented as storing filenames exactly as provided:

    https://msdn.microsoft.com/en-us/library/windows/desktop/aa365247%28v=vs.85%29.aspx

    Linux performs no normalization in the kernel but it is technically
    valid for a filesystem to perform normalization, such as when an HFS+
    volume is mounted.

    See http://www.unicode.org/reports/tr15/ for a full discussion of
    equivalence and normalization in Unicode.
    """

    output = dict()

    for filename in filenames:
        normalized_filename = normalize_unicode(filename)
        if normalized_filename in output:
            raise FileNormalizationConflict(filename, output[normalized_filename])
        else:
            output[normalized_filename] = filename

    return output

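# A hedged illustration of the conflict this guards against: "é" may be stored
# as one precomposed code point (NFC) or as "e" plus a combining accent (NFD);
# both normalize to the same NFC key, so two such filenames clash:
#
#   build_unicode_normalized_lookup_dict(["caf\u00e9.txt", "cafe\u0301.txt"])
#   # raises FileNormalizationConflict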

def get_hashers(algorithms):
    """
    Given a list of algorithm names, return a dictionary of hasher instances

    This avoids redundant code between the creation and validation code where in
    both cases we want to avoid reading the same file more than once. The
    intended use is a simple for loop:

        for block in file:
            for hasher in hashers.values():
                hasher.update(block)
    """

    hashers = {}

    for alg in algorithms:
        try:
            hasher = hashlib.new(alg)
        except ValueError:
            LOGGER.warning(
                _("Disabling requested hash algorithm %s: hashlib does not support it"),
                alg,
            )
            continue

        hashers[alg] = hasher

    if not hashers:
        raise ValueError(
            _(
                "Unable to continue: hashlib does not support any of the requested algorithms!"
            )
        )

    return hashers


def _calc_hashes(args):
    # auto unpacking of sequences illegal in Python3
    (base_path, rel_path, hashes, algorithms) = args
    full_path = os.path.join(base_path, rel_path)

    # Create a clone of the default empty hash objects:
    f_hashers = dict((alg, hashlib.new(alg)) for alg in hashes if alg in algorithms)

    try:
        f_hashes = _calculate_file_hashes(full_path, f_hashers)
    except BagValidationError as e:
        f_hashes = dict((alg, force_unicode(e)) for alg in f_hashers.keys())

    return rel_path, f_hashes, hashes


def _calculate_file_hashes(full_path, f_hashers):
    """
    Returns a dictionary of (algorithm, hexdigest) values for the provided
    filename
    """
    LOGGER.info(_("Verifying checksum for file %s"), full_path)

    try:
        with open(full_path, "rb") as f:
            while True:
                block = f.read(HASH_BLOCK_SIZE)
                if not block:
                    break
                for i in f_hashers.values():
                    i.update(block)
    except (OSError, IOError) as e:
        raise BagValidationError(
            _("Could not read %(filename)s: %(error)s")
            % {"filename": full_path, "error": force_unicode(e)}
        )

    return dict((alg, h.hexdigest()) for alg, h in f_hashers.items())


def _load_tag_file(tag_file_name, encoding="utf-8-sig"):
    with open_text_file(tag_file_name, "r", encoding=encoding) as tag_file:
        # Store duplicate tags as list of vals
        # in order of parsing under the same key.
        tags = {}
        for name, value in _parse_tags(tag_file):
            if name not in tags:
                tags[name] = value
                continue

            if not isinstance(tags[name], list):
                tags[name] = [tags[name], value]
            else:
                tags[name].append(value)

        return tags


def _parse_tags(tag_file):
    """Parses a tag file, according to RFC 2822. This
    includes line folding, permitting extra-long
    field values.

    See http://www.faqs.org/rfcs/rfc2822.html for
    more information.
    """

    tag_name = None
    tag_value = None

    # Line folding is handled by yielding values only after we encounter
    # the start of a new tag, or if we pass the EOF.
    for num, line in enumerate(tag_file):
        # Skip over any empty or blank lines.
        if len(line) == 0 or line.isspace():
            continue
        elif line[0].isspace() and tag_value is not None:  # folded line
            tag_value += line
        else:
            # Starting a new tag; yield the last one.
            if tag_name:
                yield (tag_name, tag_value.strip())

            if ":" not in line:
                raise BagValidationError(
                    _("%(filename)s contains invalid tag: %(line)s")
                    % {
                        "line": line.strip(),
                        "filename": os.path.basename(tag_file.name),
                    }
                )

            parts = line.strip().split(":", 1)
            tag_name = parts[0].strip()
            tag_value = parts[1]

    # Passed the EOF. All done after this.
    if tag_name:
        yield (tag_name, tag_value.strip())

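# A hedged sketch of the folding behaviour: given a tag file containing
#
#   External-Description: A very long description
#    that continues on a folded line
#   Contact-Name: Ed Summers
#
# _parse_tags yields ("External-Description", "A very long description\n that
# continues on a folded line") and then ("Contact-Name", "Ed Summers").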

def _make_tag_file(bag_info_path, bag_info):
    headers = sorted(bag_info.keys())
    with open_text_file(bag_info_path, "w") as f:
        for h in headers:
            values = bag_info[h]
            if not isinstance(values, list):
                values = [values]
            for txt in values:
                # strip CR, LF and CRLF so they don't mess up the tag file
                txt = re.sub(r"\n|\r|(\r\n)", "", force_unicode(txt))
                f.write("%s: %s\n" % (h, txt))


def make_manifests(data_dir, processes, algorithms=DEFAULT_CHECKSUMS, encoding="utf-8"):
    LOGGER.info(
        _("Using %(process_count)d processes to generate manifests: %(algorithms)s"),
        {"process_count": processes, "algorithms": ", ".join(algorithms)},
    )

    manifest_line_generator = partial(generate_manifest_lines, algorithms=algorithms)

    if processes > 1:
        pool = multiprocessing.Pool(processes=processes)
        checksums = pool.map(manifest_line_generator, _walk(data_dir))
        pool.close()
        pool.join()
    else:
        checksums = [manifest_line_generator(i) for i in _walk(data_dir)]

    # At this point we have a list of tuples which start with the algorithm name:
    manifest_data = {}
    for batch in checksums:
        for entry in batch:
            manifest_data.setdefault(entry[0], []).append(entry[1:])

    # These will be keyed on the algorithm name so we can perform sanity checks
    # below to catch failures in the hashing process:
    num_files = defaultdict(lambda: 0)
    total_bytes = defaultdict(lambda: 0)

    for algorithm, values in manifest_data.items():
        manifest_filename = "manifest-%s.txt" % algorithm

        with open_text_file(manifest_filename, "w", encoding=encoding) as manifest:
            for digest, filename, byte_count in values:
                manifest.write("%s %s\n" % (digest, _encode_filename(filename)))
                num_files[algorithm] += 1
                total_bytes[algorithm] += byte_count

    # We'll use sets of the values for the error checks and eventually return the payload oxum values:
    byte_value_set = set(total_bytes.values())
    file_count_set = set(num_files.values())

    # allow a bag with an empty payload
    if not byte_value_set and not file_count_set:
        return 0, 0

    if len(file_count_set) != 1:
        raise RuntimeError(_("Expected the same number of files for each checksum"))

    if len(byte_value_set) != 1:
        raise RuntimeError(_("Expected the same number of bytes for each checksum"))

    return byte_value_set.pop(), file_count_set.pop()

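# For reference, each manifest line written above is "DIGEST FILENAME" with a
# '/'-separated path; a hypothetical manifest-sha256.txt entry might read:
#
#   91751cee...0bf9 data/README.txt
#
# (digest truncated here for brevity).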

def _make_tagmanifest_file(alg, bag_dir, encoding="utf-8"):
    tagmanifest_file = join(bag_dir, "tagmanifest-%s.txt" % alg)
    LOGGER.info(_("Creating %s"), tagmanifest_file)

    checksums = []
    for f in _find_tag_files(bag_dir):
        if re.match(r"^tagmanifest-.+\.txt$", f):
            continue
        with open(join(bag_dir, f), "rb") as fh:
            m = hashlib.new(alg)
            while True:
                block = fh.read(HASH_BLOCK_SIZE)
                if not block:
                    break
                m.update(block)
            checksums.append((m.hexdigest(), f))

    with open_text_file(
        join(bag_dir, tagmanifest_file), mode="w", encoding=encoding
    ) as tagmanifest:
        for digest, filename in checksums:
            tagmanifest.write("%s %s\n" % (digest, filename))


def _find_tag_files(bag_dir):
    for dir in os.listdir(bag_dir):
        if dir != "data":
            if os.path.isfile(dir) and not dir.startswith("tagmanifest-"):
                yield dir
            for dir_name, _, filenames in os.walk(dir):
                for filename in filenames:
                    if filename.startswith("tagmanifest-"):
                        continue
                    # remove everything up to the bag_dir directory
                    p = join(dir_name, filename)
                    yield os.path.relpath(p, bag_dir)


def _walk(data_dir):
    for dirpath, dirnames, filenames in os.walk(data_dir):
        # if we don't sort here the order of entries is non-deterministic
        # which makes it hard to test the fixity of tagmanifest-md5.txt
        filenames.sort()
        dirnames.sort()
        for fn in filenames:
            path = os.path.join(dirpath, fn)
            # BagIt spec requires manifest to always use '/' as path separator
            if os.path.sep != "/":
                parts = path.split(os.path.sep)
                path = "/".join(parts)
            yield path


def _can_bag(test_dir):
    """Scan the provided directory for files which cannot be bagged due to insufficient permissions"""
    unbaggable = []

    if not os.access(test_dir, os.R_OK):
        # We cannot continue without permission to read the source directory
        unbaggable.append(test_dir)
        return unbaggable

    if not os.access(test_dir, os.W_OK):
        unbaggable.append(test_dir)

    for dirpath, dirnames, filenames in os.walk(test_dir):
        for directory in dirnames:
            full_path = os.path.join(dirpath, directory)
            if not os.access(full_path, os.W_OK):
                unbaggable.append(full_path)

    return unbaggable


def _can_read(test_dir):
    """
    returns ((unreadable_dirs), (unreadable_files))
    """
    unreadable_dirs = []
    unreadable_files = []

    if not os.access(test_dir, os.R_OK):
        unreadable_dirs.append(test_dir)
    else:
        for dirpath, dirnames, filenames in os.walk(test_dir):
            for dn in dirnames:
                full_path = os.path.join(dirpath, dn)
                if not os.access(full_path, os.R_OK):
                    unreadable_dirs.append(full_path)
            for fn in filenames:
                full_path = os.path.join(dirpath, fn)
                if not os.access(full_path, os.R_OK):
                    unreadable_files.append(full_path)
    return (tuple(unreadable_dirs), tuple(unreadable_files))


def generate_manifest_lines(filename, algorithms=DEFAULT_CHECKSUMS):
    LOGGER.info(_("Generating manifest lines for file %s"), filename)

    # For performance we'll read the file only once and pass it block
    # by block to every requested hash algorithm:
    hashers = get_hashers(algorithms)

    total_bytes = 0

    with open(filename, "rb") as f:
        while True:
            block = f.read(HASH_BLOCK_SIZE)

            if not block:
                break

            total_bytes += len(block)
            for hasher in hashers.values():
                hasher.update(block)

    decoded_filename = _decode_filename(filename)

    # We'll generate a list of results in roughly manifest format but prefixed with the algorithm:
    results = [
        (alg, hasher.hexdigest(), decoded_filename, total_bytes)
        for alg, hasher in hashers.items()
    ]

    return results


def _encode_filename(s):
    s = s.replace("\r", "%0D")
    s = s.replace("\n", "%0A")
    return s

def _decode_filename(s):
    # ``flags`` must be passed by keyword; as a positional argument it would
    # be interpreted as re.sub's ``count`` parameter.
    s = re.sub(r"%0D", "\r", s, flags=re.IGNORECASE)
    s = re.sub(r"%0A", "\n", s, flags=re.IGNORECASE)
    return s
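
# A hedged round-trip sketch of the CR/LF percent-encoding used in manifest
# filenames (the filename is hypothetical):
#
#   _encode_filename("data/odd\nname.txt")   # -> "data/odd%0Aname.txt"
#   _decode_filename("data/odd%0Aname.txt")  # -> "data/odd\nname.txt"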


def force_unicode_py2(s):
    """Reliably return a Unicode string given a possible unicode or byte string"""
    if isinstance(s, str):
        return s.decode("utf-8")
    else:
        return unicode(s)


if sys.version_info > (3, 0):
    force_unicode = str
else:
    force_unicode = force_unicode_py2

# following code is used for command line program


class BagArgumentParser(argparse.ArgumentParser):
    def __init__(self, *args, **kwargs):
        argparse.ArgumentParser.__init__(self, *args, **kwargs)
        self.set_defaults(bag_info={})


class BagHeaderAction(argparse.Action):
    def __call__(self, parser, namespace, values, option_string=None):
        opt = option_string.lstrip("--")
        opt_caps = "-".join([o.capitalize() for o in opt.split("-")])
        namespace.bag_info[opt_caps] = values


def _make_parser():
    parser = BagArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="bagit-python version %s\n\n%s\n" % (VERSION, __doc__.strip()),
    )
    parser.add_argument(
        "--processes",
        type=int,
        dest="processes",
        default=1,
        help=_(
            "Use multiple processes to calculate checksums faster (default: %(default)s)"
        ),
    )
    parser.add_argument("--log", help=_("The name of the log file (default: stdout)"))
    parser.add_argument(
        "--quiet",
        action="store_true",
        help=_("Suppress all progress information other than errors"),
    )
    parser.add_argument(
        "--validate",
        action="store_true",
        help=_(
            "Validate existing bags in the provided directories instead of"
            " creating new ones"
        ),
    )
    parser.add_argument(
        "--fast",
        action="store_true",
        help=_(
            "Modify --validate behaviour to only test whether the bag directory"
            " has the number of files and total size specified in Payload-Oxum"
            " without performing checksum validation to detect corruption."
        ),
    )
    parser.add_argument(
        "--completeness-only",
        action="store_true",
        help=_(
            "Modify --validate behaviour to test whether the bag directory"
            " has the expected payload specified in the checksum manifests"
            " without performing checksum validation to detect corruption."
        ),
    )

    checksum_args = parser.add_argument_group(
        _("Checksum Algorithms"),
        _(
            "Select the manifest algorithms to be used when creating bags"
            " (default=%s)"
        )
        % ", ".join(DEFAULT_CHECKSUMS),
    )

    for i in CHECKSUM_ALGOS:
        alg_name = re.sub(r"^([A-Z]+)(\d+)$", r"\1-\2", i.upper())
        checksum_args.add_argument(
            "--%s" % i,
            action="append_const",
            dest="checksums",
            const=i,
            help=_("Generate %s manifest when creating a bag") % alg_name,
        )

    metadata_args = parser.add_argument_group(_("Optional Bag Metadata"))
    for header in STANDARD_BAG_INFO_HEADERS:
        metadata_args.add_argument(
            "--%s" % header.lower(), type=str, action=BagHeaderAction, default=argparse.SUPPRESS
        )

    parser.add_argument(
        "directory",
        nargs="+",
        help=_(
            "Directory which will be converted into a bag in place"
            " by moving any existing files into the BagIt structure"
            " and creating the manifests and other metadata."
        ),
    )

    return parser


def _configure_logging(opts):
    log_format = "%(asctime)s - %(levelname)s - %(message)s"
    if opts.quiet:
        level = logging.ERROR
    else:
        level = logging.INFO
    if opts.log:
        logging.basicConfig(filename=opts.log, level=level, format=log_format)
    else:
        logging.basicConfig(level=level, format=log_format)


def main():
    if "--version" in sys.argv:
        print(_("bagit-python version %s") % VERSION)
        sys.exit(0)

    parser = _make_parser()
    args = parser.parse_args()

    if args.processes < 0:
        parser.error(_("The number of processes must be 0 or greater"))

    if args.fast and not args.validate:
        parser.error(_("--fast is only allowed as an option for --validate!"))

    _configure_logging(args)

    rc = 0
    for bag_dir in args.directory:
        # validate the bag
        if args.validate:
            try:
                bag = Bag(bag_dir)
                # validate throws a BagError or BagValidationError
                bag.validate(
                    processes=args.processes,
                    fast=args.fast,
                    completeness_only=args.completeness_only,
                )
                if args.fast:
                    LOGGER.info(_("%s valid according to Payload-Oxum"), bag_dir)
                else:
                    LOGGER.info(_("%s is valid"), bag_dir)
            except BagError as e:
                LOGGER.error(
                    _("%(bag)s is invalid: %(error)s"), {"bag": bag_dir, "error": e}
                )
                rc = 1

        # make the bag
        else:
            try:
                make_bag(
                    bag_dir,
                    bag_info=args.bag_info,
                    processes=args.processes,
                    checksums=args.checksums,
                )
            except Exception as exc:
                LOGGER.error(
                    _("Failed to create bag in %(bag_directory)s: %(error)s"),
                    {"bag_directory": bag_dir, "error": exc},
                    exc_info=True,
                )
                rc = 1

    sys.exit(rc)


if __name__ == "__main__":
    main()