env/bin/bagit.py @ 0:26e78fe6e8c4 draft

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"

author: shellac
date:   Sat, 02 May 2020 07:14:21 -0400
#!/Users/pldms/Development/Projects/2020/david-matthews-galaxy/guppy_basecaller/env/bin/python3
# encoding: utf-8

from __future__ import absolute_import, division, print_function, unicode_literals

import argparse
import codecs
import gettext
import hashlib
import logging
import multiprocessing
import os
import re
import signal
import sys
import tempfile
import unicodedata
import warnings
from collections import defaultdict
from datetime import date
from functools import partial
from os.path import abspath, isdir, isfile, join

from pkg_resources import DistributionNotFound, get_distribution

try:
    from urllib.parse import urlparse
except ImportError:
    from urlparse import urlparse


def find_locale_dir():
    for prefix in (os.path.dirname(__file__), sys.prefix):
        locale_dir = os.path.join(prefix, "locale")
        if os.path.isdir(locale_dir):
            return locale_dir


TRANSLATION_CATALOG = gettext.translation(
    "bagit-python", localedir=find_locale_dir(), fallback=True
)
if sys.version_info < (3,):
    _ = TRANSLATION_CATALOG.ugettext
else:
    _ = TRANSLATION_CATALOG.gettext

MODULE_NAME = "bagit" if __name__ == "__main__" else __name__

LOGGER = logging.getLogger(MODULE_NAME)

try:
    VERSION = get_distribution(MODULE_NAME).version
except DistributionNotFound:
    VERSION = "0.0.dev0"

PROJECT_URL = "https://github.com/LibraryOfCongress/bagit-python"

__doc__ = (
    _(
        """
BagIt is a directory and filename convention for bundling an arbitrary set of
files with a manifest, checksums, and additional metadata. More about BagIt
can be found at:

    http://purl.org/net/bagit

bagit.py is a pure Python drop-in library and command-line tool for creating
and working with BagIt directories.


Command-Line Usage:

Basic usage is to give bagit.py a directory to bag up:

    $ bagit.py my_directory

This does a bag-in-place operation where the current contents will be moved
into the appropriate BagIt structure and the metadata files will be created.

You can bag multiple directories if you wish:

    $ bagit.py directory1 directory2

Optionally you can provide metadata which will be stored in bag-info.txt:

    $ bagit.py --source-organization "Library of Congress" directory

You can also select which manifest algorithms will be used:

    $ bagit.py --sha1 --md5 --sha256 --sha512 directory


Using BagIt from your Python code:

    import bagit
    bag = bagit.make_bag('example-directory', {'Contact-Name': 'Ed Summers'})
    print(bag.entries)

For more information or to contribute to bagit-python's development, please
visit %(PROJECT_URL)s
"""
    )
    % globals()
)

# standard bag-info.txt metadata
STANDARD_BAG_INFO_HEADERS = [
    "Source-Organization",
    "Organization-Address",
    "Contact-Name",
    "Contact-Phone",
    "Contact-Email",
    "External-Description",
    "External-Identifier",
    "Bag-Size",
    "Bag-Group-Identifier",
    "Bag-Count",
    "Internal-Sender-Identifier",
    "Internal-Sender-Description",
    "BagIt-Profile-Identifier",
    # Bagging-Date is autogenerated
    # Payload-Oxum is autogenerated
]

CHECKSUM_ALGOS = hashlib.algorithms_guaranteed
DEFAULT_CHECKSUMS = ["sha256", "sha512"]

#: Block size used when reading files for hashing:
HASH_BLOCK_SIZE = 512 * 1024

#: Convenience function used everywhere we want to open a file to read text
#: rather than undecoded bytes:
open_text_file = partial(codecs.open, encoding="utf-8", errors="strict")

# This is the same as decoding the byte values in codecs.BOM:
UNICODE_BYTE_ORDER_MARK = "\uFEFF"


def make_bag(
    bag_dir, bag_info=None, processes=1, checksums=None, checksum=None, encoding="utf-8"
):
    """
    Convert a given directory into a bag. You can pass in arbitrary
    key/value pairs to put into the bag-info.txt metadata file as
    the bag_info dictionary.
    """

    if checksum is not None:
        warnings.warn(
            _(
                "The `checksum` argument for `make_bag` should be replaced with `checksums`"
            ),
            DeprecationWarning,
        )
        checksums = checksum

    if checksums is None:
        checksums = DEFAULT_CHECKSUMS

    bag_dir = os.path.abspath(bag_dir)
    cwd = os.path.abspath(os.path.curdir)

    if cwd.startswith(bag_dir) and cwd != bag_dir:
        raise RuntimeError(
            _("Bagging a parent of the current directory is not supported")
        )

    LOGGER.info(_("Creating bag for directory %s"), bag_dir)

    if not os.path.isdir(bag_dir):
        LOGGER.error(_("Bag directory %s does not exist"), bag_dir)
        raise RuntimeError(_("Bag directory %s does not exist") % bag_dir)

    # FIXME: we should do the permissions checks before changing directories
    old_dir = os.path.abspath(os.path.curdir)

    try:
        # TODO: These two checks are currently redundant since an unreadable directory will also
        #       often be unwritable, and this code will require review when we add the option to
        #       bag to a destination other than the source. It would be nice if we could avoid
        #       walking the directory tree more than once even if most filesystems will cache it

        unbaggable = _can_bag(bag_dir)

        if unbaggable:
            LOGGER.error(
                _("Unable to write to the following directories and files:\n%s"),
                unbaggable,
            )
            raise BagError(_("Missing permissions to move all files and directories"))

        unreadable_dirs, unreadable_files = _can_read(bag_dir)

        if unreadable_dirs or unreadable_files:
            if unreadable_dirs:
                LOGGER.error(
                    _("The following directories do not have read permissions:\n%s"),
                    unreadable_dirs,
                )
            if unreadable_files:
                LOGGER.error(
                    _("The following files do not have read permissions:\n%s"),
                    unreadable_files,
                )
            raise BagError(
                _("Read permissions are required to calculate file fixities")
            )
        else:
            LOGGER.info(_("Creating data directory"))

            # FIXME: if we calculate full paths we won't need to deal with changing directories
            os.chdir(bag_dir)
            cwd = os.getcwd()
            temp_data = tempfile.mkdtemp(dir=cwd)

            for f in os.listdir("."):
                if os.path.abspath(f) == temp_data:
                    continue
                new_f = os.path.join(temp_data, f)
                LOGGER.info(
                    _("Moving %(source)s to %(destination)s"),
                    {"source": f, "destination": new_f},
                )
                os.rename(f, new_f)

            LOGGER.info(
                _("Moving %(source)s to %(destination)s"),
                {"source": temp_data, "destination": "data"},
            )
            os.rename(temp_data, "data")

            # permissions for the payload directory should match those of the
            # original directory
            os.chmod("data", os.stat(cwd).st_mode)

            total_bytes, total_files = make_manifests(
                "data", processes, algorithms=checksums, encoding=encoding
            )

            LOGGER.info(_("Creating bagit.txt"))
            txt = """BagIt-Version: 0.97\nTag-File-Character-Encoding: UTF-8\n"""
            with open_text_file("bagit.txt", "w") as bagit_file:
                bagit_file.write(txt)

            LOGGER.info(_("Creating bag-info.txt"))
            if bag_info is None:
                bag_info = {}

            # allow 'Bagging-Date' and 'Bag-Software-Agent' to be overridden
250 if "Bagging-Date" not in bag_info: | |
251 bag_info["Bagging-Date"] = date.strftime(date.today(), "%Y-%m-%d") | |
252 if "Bag-Software-Agent" not in bag_info: | |
253 bag_info["Bag-Software-Agent"] = "bagit.py v%s <%s>" % ( | |
254 VERSION, | |
255 PROJECT_URL, | |
256 ) | |
257 | |
258 bag_info["Payload-Oxum"] = "%s.%s" % (total_bytes, total_files) | |
259 _make_tag_file("bag-info.txt", bag_info) | |
260 | |
261 for c in checksums: | |
262 _make_tagmanifest_file(c, bag_dir, encoding="utf-8") | |
263 except Exception: | |
264 LOGGER.exception(_("An error occurred creating a bag in %s"), bag_dir) | |
265 raise | |
266 finally: | |
267 os.chdir(old_dir) | |
268 | |
269 return Bag(bag_dir) | |
270 | |
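# Example (illustrative sketch, not part of the upstream library): bagging a
# directory in place with custom metadata and a single checksum algorithm.
# The path and metadata values below are hypothetical.
#
#     import bagit
#     bag = bagit.make_bag(
#         "/tmp/my-archive",
#         {"Contact-Name": "Jane Doe", "Source-Organization": "Example Org"},
#         checksums=["sha256"],
#     )
#     print(bag.info["Payload-Oxum"])       # e.g. "1234567.42"
#     print(sorted(bag.payload_entries()))  # payload paths under data/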

class Bag(object):
    """A representation of a bag."""

    valid_files = ["bagit.txt", "fetch.txt"]
    valid_directories = ["data"]

    def __init__(self, path=None):
        super(Bag, self).__init__()
        self.tags = {}
        self.info = {}
        #: Dictionary of manifest entries and the checksum values for each
        #: algorithm:
        self.entries = {}

        # To reliably handle Unicode normalization differences, we maintain
        # lookup dictionaries in both directions for the filenames read from
        # the filesystem and the manifests so we can handle cases where the
        # normalization form changed between the bag being created and read.
        # See https://github.com/LibraryOfCongress/bagit-python/issues/51.

        #: maps Unicode-normalized values to the raw value from the filesystem
        self.normalized_filesystem_names = {}

        #: maps Unicode-normalized values to the raw value in the manifest
        self.normalized_manifest_names = {}

        self.algorithms = []
        self.tag_file_name = None
        self.path = None
        if path:
            # normalize the path; abspath() also strips any trailing separator
            self.path = abspath(path)
            self._open()

    def __str__(self):
        # FIXME: develop a more informative string representation for a Bag
        return self.path

    @property
    def algs(self):
        warnings.warn(_("Use Bag.algorithms instead of Bag.algs"), DeprecationWarning)
        return self.algorithms

    @property
    def version(self):
        warnings.warn(
            _("Use the Bag.version_info tuple instead of Bag.version"),
            DeprecationWarning,
        )
        return self._version

    def _open(self):
        # Open the bagit.txt file, and load any tags from it, including
        # the required version and encoding.
        bagit_file_path = os.path.join(self.path, "bagit.txt")

        if not isfile(bagit_file_path):
            raise BagError(_("Expected bagit.txt does not exist: %s") % bagit_file_path)

        self.tags = tags = _load_tag_file(bagit_file_path)

        required_tags = ("BagIt-Version", "Tag-File-Character-Encoding")
        missing_tags = [i for i in required_tags if i not in tags]
        if missing_tags:
            raise BagError(
                _("Missing required tag in bagit.txt: %s") % ", ".join(missing_tags)
            )

        # To avoid breaking existing code we'll leave self.version as the string
        # and parse it into a numeric version_info tuple. In version 2.0 we can
        # break that.

        self._version = tags["BagIt-Version"]

        try:
            self.version_info = tuple(int(i) for i in self._version.split(".", 1))
        except ValueError:
            raise BagError(
                _("Bag version numbers must be MAJOR.MINOR numbers, not %s")
                % self._version
            )

        if (0, 93) <= self.version_info <= (0, 95):
            self.tag_file_name = "package-info.txt"
        elif (0, 96) <= self.version_info < (2,):
            self.tag_file_name = "bag-info.txt"
        else:
            raise BagError(_("Unsupported bag version: %s") % self._version)

        self.encoding = tags["Tag-File-Character-Encoding"]

        try:
            codecs.lookup(self.encoding)
        except LookupError:
            raise BagValidationError(_("Unsupported encoding: %s") % self.encoding)

        info_file_path = os.path.join(self.path, self.tag_file_name)
        if os.path.exists(info_file_path):
            self.info = _load_tag_file(info_file_path, encoding=self.encoding)

        self._load_manifests()

    def manifest_files(self):
        for filename in ["manifest-%s.txt" % a for a in CHECKSUM_ALGOS]:
            f = os.path.join(self.path, filename)
            if isfile(f):
                yield f

    def tagmanifest_files(self):
        for filename in ["tagmanifest-%s.txt" % a for a in CHECKSUM_ALGOS]:
            f = os.path.join(self.path, filename)
            if isfile(f):
                yield f

    def compare_manifests_with_fs(self):
        """
        Compare the filenames in the manifests to the filenames present on the
        local filesystem and return two lists: the files which are only
        present in the manifests and the files which are only present on the
        local filesystem, respectively.
        """

        # We compare the filenames after Unicode normalization so we can
        # reliably detect normalization changes after bag creation:
        files_on_fs = set(normalize_unicode(i) for i in self.payload_files())
        files_in_manifest = set(
            normalize_unicode(i) for i in self.payload_entries().keys()
        )

        if self.version_info >= (0, 97):
            files_in_manifest.update(self.missing_optional_tagfiles())

        only_on_fs = list()
        only_in_manifest = list()

        for i in files_on_fs.difference(files_in_manifest):
            only_on_fs.append(self.normalized_filesystem_names[i])

        for i in files_in_manifest.difference(files_on_fs):
            only_in_manifest.append(self.normalized_manifest_names[i])

        return only_in_manifest, only_on_fs

    def compare_fetch_with_fs(self):
        """Compares the fetch entries with the files actually
        in the payload, and returns a list of all the files
        that still need to be fetched.
        """

        files_on_fs = set(self.payload_files())
        files_in_fetch = set(self.files_to_be_fetched())

        return list(files_in_fetch - files_on_fs)

    def payload_files(self):
        """Returns a list of filenames which are present on the local filesystem"""
        payload_dir = os.path.join(self.path, "data")

        for dirpath, _, filenames in os.walk(payload_dir):
            for f in filenames:
                # Jump through some hoops here to make sure the payload files are
                # returned with the directory structure relative to the base
                # directory rather than the payload directory:
                normalized_f = os.path.normpath(f)
                rel_path = os.path.relpath(
                    os.path.join(dirpath, normalized_f), start=self.path
                )

                self.normalized_filesystem_names[normalize_unicode(rel_path)] = rel_path
                yield rel_path

    def payload_entries(self):
445 """Return a dictionary of items """ | |
        # Don't use dict comprehension (compatibility with Python < 2.7)
        return dict(
            (key, value)
            for (key, value) in self.entries.items()
            if key.startswith("data" + os.sep)
        )

    def save(self, processes=1, manifests=False):
        """
        save will persist any changes that have been made to the bag
        metadata (self.info).

        If you have modified the payload of the bag (added, modified,
        removed files in the data directory) and want to regenerate manifests
        set the manifests parameter to True. The default is False since you
        wouldn't want a save to accidentally create a new manifest for
        a corrupted bag.

        If you want to control the number of processes that are used when
        recalculating checksums use the processes parameter.
        """
        # Error checking
        if not self.path:
            raise BagError(_("Bag.save() called before setting the path!"))

        if not os.access(self.path, os.R_OK | os.W_OK | os.X_OK):
            raise BagError(
                _("Cannot save bag to non-existent or inaccessible directory %s")
                % self.path
            )

        unbaggable = _can_bag(self.path)
        if unbaggable:
            LOGGER.error(
                _(
                    "Missing write permissions for the following directories and files:\n%s"
                ),
                unbaggable,
            )
            raise BagError(_("Missing permissions to move all files and directories"))

        unreadable_dirs, unreadable_files = _can_read(self.path)
        if unreadable_dirs or unreadable_files:
            if unreadable_dirs:
                LOGGER.error(
                    _("The following directories do not have read permissions:\n%s"),
                    unreadable_dirs,
                )
            if unreadable_files:
                LOGGER.error(
                    _("The following files do not have read permissions:\n%s"),
                    unreadable_files,
                )
            raise BagError(
                _("Read permissions are required to calculate file fixities")
            )

        # Change working directory to bag directory so helper functions work
        old_dir = os.path.abspath(os.path.curdir)
        os.chdir(self.path)

        # Generate new manifest files
        if manifests:
            total_bytes, total_files = make_manifests(
                "data", processes, algorithms=self.algorithms, encoding=self.encoding
            )

            # Update Payload-Oxum
            LOGGER.info(_("Updating Payload-Oxum in %s"), self.tag_file_name)
            self.info["Payload-Oxum"] = "%s.%s" % (total_bytes, total_files)

        _make_tag_file(self.tag_file_name, self.info)

        # Update tag-manifest for changes to manifest & bag-info files
        for alg in self.algorithms:
            _make_tagmanifest_file(alg, self.path, encoding=self.encoding)

        # Reload the manifests
        self._load_manifests()

        os.chdir(old_dir)

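    # Example (illustrative sketch, not part of the upstream library): updating
    # bag metadata and regenerating manifests after editing the payload. The
    # bag path below is hypothetical.
    #
    #     bag = Bag("/tmp/my-archive")
    #     bag.info["External-Description"] = "Nightly export"
    #     bag.save(manifests=True)  # re-hash payload files, rewrite manifests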
    def tagfile_entries(self):
        return dict(
            (key, value)
            for (key, value) in self.entries.items()
            if not key.startswith("data" + os.sep)
        )

    def missing_optional_tagfiles(self):
        """
        From v0.97 we need to validate any tagfiles listed
        in the optional tagmanifest(s). As there is no mandatory
        directory structure for additional tagfiles we can
        only check for entries with missing files (not missing
        entries for existing files).
        """
        for tagfilepath in self.tagfile_entries().keys():
            if not os.path.isfile(os.path.join(self.path, tagfilepath)):
                yield tagfilepath

    def fetch_entries(self):
        """Load fetch.txt if present and iterate over its contents

        yields (url, size, filename) tuples

        raises BagError for errors such as an unsafe filename referencing
        data outside of the bag directory
        """

        fetch_file_path = os.path.join(self.path, "fetch.txt")

        if isfile(fetch_file_path):
            with open_text_file(
                fetch_file_path, "r", encoding=self.encoding
            ) as fetch_file:
                for line in fetch_file:
                    url, file_size, filename = line.strip().split(None, 2)

                    if self._path_is_dangerous(filename):
                        raise BagError(
                            _('Path "%(payload_file)s" in "%(source_file)s" is unsafe')
                            % {
                                "payload_file": filename,
                                "source_file": os.path.join(self.path, "fetch.txt"),
                            }
                        )

                    yield url, file_size, filename

    def files_to_be_fetched(self):
        """
        Convenience wrapper for fetch_entries which returns only the
        local filename
        """

        for url, file_size, filename in self.fetch_entries():
            yield filename

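    # Example (illustrative sketch, not part of the upstream library): a
    # fetch.txt line has the whitespace-separated form "URL LENGTH FILENAME",
    # e.g.:
    #
    #     http://example.org/big.dat 1048576 data/big.dat
    #
    # so listing the still-missing files might look like:
    #
    #     bag = Bag("/tmp/my-archive")  # hypothetical bag path
    #     for url, size, filename in bag.fetch_entries():
    #         print("to fetch:", url, "->", filename)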
    def has_oxum(self):
        return "Payload-Oxum" in self.info

    def validate(self, processes=1, fast=False, completeness_only=False):
        """Checks the structure and contents are valid.

        If you supply the parameter fast=True the Payload-Oxum (if present) will
        be used to check that the payload files are present and accounted for,
        instead of re-calculating fixities and comparing them against the
        manifest. By default validate() will re-calculate fixities (fast=False).
        """

        self._validate_structure()
        self._validate_bagittxt()

        self.validate_fetch()

        self._validate_contents(
            processes=processes, fast=fast, completeness_only=completeness_only
        )

        return True

    def is_valid(self, fast=False, completeness_only=False):
        """Returns validation success or failure as boolean.
        Optional fast parameter passed directly to validate().
        """

        try:
            self.validate(fast=fast, completeness_only=completeness_only)
        except BagError:
            return False

        return True

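    # Example (illustrative sketch, not part of the upstream library): full
    # and fast validation of an existing bag. The path is hypothetical.
    #
    #     bag = Bag("/tmp/my-archive")
    #     if bag.is_valid():            # re-hashes every payload file
    #         print("bag is valid")
    #     if bag.is_valid(fast=True):   # only checks Payload-Oxum count/size
    #         print("file count and total size match Payload-Oxum")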
    def _load_manifests(self):
        self.entries = {}
        manifests = list(self.manifest_files())

        if self.version_info >= (0, 97):
            # v0.97+ requires that optional tagfiles are verified.
            manifests += list(self.tagmanifest_files())

        for manifest_filename in manifests:
            if manifest_filename.find("tagmanifest-") != -1:
                search = "tagmanifest-"
            else:
                search = "manifest-"
            alg = (
                os.path.basename(manifest_filename)
                .replace(search, "")
                .replace(".txt", "")
            )
            if alg not in self.algorithms:
                self.algorithms.append(alg)

            with open_text_file(
                manifest_filename, "r", encoding=self.encoding
            ) as manifest_file:
                if manifest_file.encoding.startswith("UTF"):
                    # We'll check the first character to see if it's a BOM:
                    if manifest_file.read(1) == UNICODE_BYTE_ORDER_MARK:
                        # We'll skip it either way by letting line decoding
                        # happen at the new offset but we will issue a warning
                        # for UTF-8 since the presence of a BOM is contrary to
                        # the BagIt specification:
                        if manifest_file.encoding == "UTF-8":
                            LOGGER.warning(
                                _(
                                    "%s is encoded using UTF-8 but contains an unnecessary"
                                    " byte-order mark, which is not in compliance with the"
                                    " BagIt RFC"
                                ),
                                manifest_file.name,
                            )
                    else:
                        manifest_file.seek(0)  # Pretend the first read never happened

                for line in manifest_file:
                    line = line.strip()

                    # Ignore blank lines and comments.
                    if line == "" or line.startswith("#"):
                        continue

                    entry = line.split(None, 1)

                    # Format is FILENAME *CHECKSUM
                    if len(entry) != 2:
                        LOGGER.error(
                            _(
                                "%(bag)s: Invalid %(algorithm)s manifest entry: %(line)s"
                            ),
                            {"bag": self, "algorithm": alg, "line": line},
                        )
                        continue

                    entry_hash = entry[0]
                    entry_path = os.path.normpath(entry[1].lstrip("*"))
                    entry_path = _decode_filename(entry_path)

                    if self._path_is_dangerous(entry_path):
                        raise BagError(
                            _(
                                'Path "%(payload_file)s" in manifest "%(manifest_file)s" is unsafe'
                            )
                            % {
                                "payload_file": entry_path,
                                "manifest_file": manifest_file.name,
                            }
                        )

                    entry_hashes = self.entries.setdefault(entry_path, {})

                    if alg in entry_hashes:
                        warning_ctx = {
                            "bag": self,
                            "algorithm": alg,
                            "filename": entry_path,
                        }
                        if entry_hashes[alg] == entry_hash:
                            msg = _(
                                "%(bag)s: %(algorithm)s manifest lists %(filename)s"
                                " multiple times with the same value"
                            )
                            if self.version_info >= (1,):
                                raise BagError(msg % warning_ctx)
                            else:
                                LOGGER.warning(msg, warning_ctx)
                        else:
                            raise BagError(
                                _(
                                    "%(bag)s: %(algorithm)s manifest lists %(filename)s"
                                    " multiple times with conflicting values"
                                )
                                % warning_ctx
                            )

                    entry_hashes[alg] = entry_hash

        self.normalized_manifest_names.update(
            (normalize_unicode(i), i) for i in self.entries.keys()
        )

    def _validate_structure(self):
        """
        Checks the structure of the bag to determine whether it conforms to the
        BagIt spec. Returns true on success, otherwise it will raise a
        BagValidationError exception.
        """

        self._validate_structure_payload_directory()
        self._validate_structure_tag_files()

    def _validate_structure_payload_directory(self):
        data_dir_path = os.path.join(self.path, "data")

        if not isdir(data_dir_path):
            raise BagValidationError(
                _("Expected data directory %s does not exist") % data_dir_path
            )

    def _validate_structure_tag_files(self):
        # Note: we deviate somewhat from v0.96 of the spec in that it allows
        # other files and directories to be present in the base directory

        if not list(self.manifest_files()):
            raise BagValidationError(_("No manifest files found"))
        if "bagit.txt" not in os.listdir(self.path):
            raise BagValidationError(
                _('Expected %s to contain "bagit.txt"') % self.path
            )

    def validate_fetch(self):
        """Validate the fetch.txt file

        Raises `BagError` for errors and otherwise returns no value
        """

        for url, file_size, filename in self.fetch_entries():
            # fetch_entries will raise a BagError for unsafe filenames
            # so at this point we will check only that the URL is minimally
            # well formed:
            parsed_url = urlparse(url)

            if not all((parsed_url.scheme, parsed_url.netloc)):
                raise BagError(_("Malformed URL in fetch.txt: %s") % url)

    def _validate_contents(self, processes=1, fast=False, completeness_only=False):
        if fast and not self.has_oxum():
            raise BagValidationError(
                _("Fast validation requires bag-info.txt to include Payload-Oxum")
            )

        # Perform the fast file count + size check so we can fail early:
        self._validate_oxum()

        if fast:
            return

        self._validate_completeness()

        if completeness_only:
            return

        self._validate_entries(processes)

    def _validate_oxum(self):
        oxum = self.info.get("Payload-Oxum")

        if oxum is None:
            return

        # If multiple Payload-Oxum tags (bad idea)
        # use the first listed in bag-info.txt
        if isinstance(oxum, list):
            LOGGER.warning(_("bag-info.txt defines multiple Payload-Oxum values!"))
            oxum = oxum[0]

        oxum_byte_count, oxum_file_count = oxum.split(".", 1)

        if not oxum_byte_count.isdigit() or not oxum_file_count.isdigit():
            raise BagError(_("Malformed Payload-Oxum value: %s") % oxum)

        oxum_byte_count = int(oxum_byte_count)
        oxum_file_count = int(oxum_file_count)
        total_bytes = 0
        total_files = 0

        for payload_file in self.payload_files():
            payload_file = os.path.join(self.path, payload_file)
            total_bytes += os.stat(payload_file).st_size
            total_files += 1

        if oxum_file_count != total_files or oxum_byte_count != total_bytes:
            raise BagValidationError(
                _(
                    "Payload-Oxum validation failed."
                    " Expected %(oxum_file_count)d files and %(oxum_byte_count)d bytes"
                    " but found %(found_file_count)d files and %(found_byte_count)d bytes"
                )
                % {
                    "found_file_count": total_files,
                    "found_byte_count": total_bytes,
                    "oxum_file_count": oxum_file_count,
                    "oxum_byte_count": oxum_byte_count,
                }
            )

    def _validate_completeness(self):
        """
        Verify that the actual file manifests match the files in the data directory
        """
        errors = list()

        # First we'll make sure there's no mismatch between the filesystem
        # and the list of files in the manifest(s)
        only_in_manifests, only_on_fs = self.compare_manifests_with_fs()
        for path in only_in_manifests:
            e = FileMissing(path)
            LOGGER.warning(force_unicode(e))
            errors.append(e)
        for path in only_on_fs:
            e = UnexpectedFile(path)
            LOGGER.warning(force_unicode(e))
            errors.append(e)

        if errors:
            raise BagValidationError(_("Bag validation failed"), errors)

    def _validate_entries(self, processes):
        """
        Verify that the actual file contents match the recorded hashes stored in the manifest files
        """
        errors = list()

        if os.name == "posix":
            worker_init = posix_multiprocessing_worker_initializer
        else:
            worker_init = None

        args = (
            (
                self.path,
                self.normalized_filesystem_names.get(rel_path, rel_path),
                hashes,
                self.algorithms,
            )
            for rel_path, hashes in self.entries.items()
        )

        try:
            if processes == 1:
                hash_results = [_calc_hashes(i) for i in args]
            else:
                pool = None
                try:
                    pool = multiprocessing.Pool(
                        processes if processes else None, initializer=worker_init
                    )
                    hash_results = pool.map(_calc_hashes, args)
                finally:
                    # Only terminate the pool if it was actually created;
                    # otherwise a failure in Pool() would raise NameError here
                    if pool is not None:
                        pool.terminate()

        # Any unhandled exceptions are probably fatal
        except:
            LOGGER.exception(_("Unable to calculate file hashes for %s"), self)
            raise

        for rel_path, f_hashes, hashes in hash_results:
            for alg, computed_hash in f_hashes.items():
                stored_hash = hashes[alg]
                if stored_hash.lower() != computed_hash:
                    e = ChecksumMismatch(
                        rel_path, alg, stored_hash.lower(), computed_hash
                    )
                    LOGGER.warning(force_unicode(e))
                    errors.append(e)

        if errors:
            raise BagValidationError(_("Bag validation failed"), errors)

    def _validate_bagittxt(self):
        """
        Verify that bagit.txt conforms to specification
        """
        bagit_file_path = os.path.join(self.path, "bagit.txt")

        # Note that we are intentionally opening this file in binary mode so we can confirm
        # that it does not start with the UTF-8 byte-order-mark
        with open(bagit_file_path, "rb") as bagit_file:
            first_line = bagit_file.read(4)
            if first_line.startswith(codecs.BOM_UTF8):
                raise BagValidationError(
                    _("bagit.txt must not contain a byte-order mark")
                )

    def _path_is_dangerous(self, path):
        """
        Return true if path looks dangerous, i.e. potentially operates
        outside the bagging directory structure, e.g. ~/.bashrc, ../../../secrets.json,
        \\?\c:\, D:\sys32\cmd.exe
        """
        if os.path.isabs(path):
            return True
        if os.path.expanduser(path) != path:
            return True
        if os.path.expandvars(path) != path:
            return True
        real_path = os.path.realpath(os.path.join(self.path, path))
        real_path = os.path.normpath(real_path)
        bag_path = os.path.realpath(self.path)
        bag_path = os.path.normpath(bag_path)
        common = os.path.commonprefix((bag_path, real_path))
        return not (common == bag_path)

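    # Example (illustrative sketch, not part of the upstream library): paths
    # _path_is_dangerous() rejects versus accepts, for a bag rooted at
    # a hypothetical /bags/b1:
    #
    #     /etc/passwd        -> dangerous (absolute path)
    #     ~/.bashrc          -> dangerous (expands under the user's home)
    #     ../../../secrets   -> dangerous (resolves outside /bags/b1)
    #     data/sub/file.txt  -> safe (stays inside the bag)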

class BagError(Exception):
    pass


class BagValidationError(BagError):
    def __init__(self, message, details=None):
        super(BagValidationError, self).__init__()

        if details is None:
            details = []

        self.message = message
        self.details = details

    def __str__(self):
        if len(self.details) > 0:
            details = "; ".join([force_unicode(e) for e in self.details])
            return "%s: %s" % (self.message, details)
        return self.message


class ManifestErrorDetail(BagError):
    def __init__(self, path):
        super(ManifestErrorDetail, self).__init__()

        self.path = path


class ChecksumMismatch(ManifestErrorDetail):
    def __init__(self, path, algorithm=None, expected=None, found=None):
        super(ChecksumMismatch, self).__init__(path)

        self.path = path
        self.algorithm = algorithm
        self.expected = expected
        self.found = found

    def __str__(self):
        return _(
            '%(path)s %(algorithm)s validation failed: expected="%(expected)s" found="%(found)s"'
        ) % {
            "path": force_unicode(self.path),
            "algorithm": self.algorithm,
            "expected": self.expected,
            "found": self.found,
        }


class FileMissing(ManifestErrorDetail):
    def __str__(self):
        return _(
            "%s exists in manifest but was not found on filesystem"
        ) % force_unicode(self.path)


class UnexpectedFile(ManifestErrorDetail):
    def __str__(self):
        return _("%s exists on filesystem but is not in the manifest") % self.path


class FileNormalizationConflict(BagError):
    """
    Exception raised when two files differ only in normalization and thus
    are not safely portable
    """

    def __init__(self, file_a, file_b):
        super(FileNormalizationConflict, self).__init__()

        self.file_a = file_a
        self.file_b = file_b

    def __str__(self):
        return _(
            'Unicode normalization conflict for file "%(file_a)s" and "%(file_b)s"'
        ) % {"file_a": self.file_a, "file_b": self.file_b}


def posix_multiprocessing_worker_initializer():
    """Ignore SIGINT in multiprocessing workers on POSIX systems"""
    signal.signal(signal.SIGINT, signal.SIG_IGN)


# The Unicode normalization form used here doesn't matter – all we care about
# is consistency since the input value will be preserved:


def normalize_unicode_py3(s):
    return unicodedata.normalize("NFC", s)


def normalize_unicode_py2(s):
    if isinstance(s, str):
        s = s.decode("utf-8")
    return unicodedata.normalize("NFC", s)


if sys.version_info > (3, 0):
    normalize_unicode = normalize_unicode_py3
else:
    normalize_unicode = normalize_unicode_py2

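# Example (illustrative sketch, not part of the upstream library): the same
# visible filename can have two different byte representations, and NFC
# normalization makes them compare equal:
#
#     a = "re\u0301sume\u0301.txt"  # "résumé.txt", decomposed (NFD-style)
#     b = "r\u00e9sum\u00e9.txt"    # "résumé.txt", precomposed (NFC)
#     assert a != b
#     assert normalize_unicode(a) == normalize_unicode(b)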

def build_unicode_normalized_lookup_dict(filenames):
    """
    Return a dictionary mapping unicode-normalized filenames to as-encoded
    values to efficiently detect conflicts between the filesystem and manifests.

    This is necessary because some filesystems and utilities may automatically
    apply a different Unicode normalization form to filenames than was applied
    when the bag was originally created.

    The best known example of this is when a bag is created using a
    normalization form other than NFD and then transferred to a Mac where the
    HFS+ filesystem will transparently normalize filenames to a variant of NFD
    for every call:

    https://developer.apple.com/legacy/library/technotes/tn/tn1150.html#UnicodeSubtleties

    Windows is documented as storing filenames exactly as provided:

    https://msdn.microsoft.com/en-us/library/windows/desktop/aa365247%28v=vs.85%29.aspx

    Linux performs no normalization in the kernel but it is technically
    valid for a filesystem to perform normalization, such as when an HFS+
    volume is mounted.

    See http://www.unicode.org/reports/tr15/ for a full discussion of
    equivalence and normalization in Unicode.
    """

    output = dict()

    for filename in filenames:
        normalized_filename = normalize_unicode(filename)
        if normalized_filename in output:
            raise FileNormalizationConflict(filename, output[normalized_filename])
        else:
            output[normalized_filename] = filename

    return output


def get_hashers(algorithms):
    """
    Given a list of algorithm names, return a dictionary of hasher instances

    This avoids redundant code between the creation and validation code where in
    both cases we want to avoid reading the same file more than once. The
    intended use is a simple for loop:

        for block in file:
            for hasher in hashers.values():
                hasher.update(block)
    """

    hashers = {}

    for alg in algorithms:
        try:
            hasher = hashlib.new(alg)
        except ValueError:
            LOGGER.warning(
                _("Disabling requested hash algorithm %s: hashlib does not support it"),
                alg,
            )
            continue

        hashers[alg] = hasher

    if not hashers:
        raise ValueError(
            _(
                "Unable to continue: hashlib does not support any of the requested algorithms!"
            )
        )

    return hashers


def _calc_hashes(args):
    # auto unpacking of sequences illegal in Python3
    (base_path, rel_path, hashes, algorithms) = args
    full_path = os.path.join(base_path, rel_path)

    # Create a clone of the default empty hash objects:
    f_hashers = dict((alg, hashlib.new(alg)) for alg in hashes if alg in algorithms)

    try:
        f_hashes = _calculate_file_hashes(full_path, f_hashers)
    except BagValidationError as e:
        f_hashes = dict((alg, force_unicode(e)) for alg in f_hashers.keys())

    return rel_path, f_hashes, hashes


def _calculate_file_hashes(full_path, f_hashers):
    """
    Returns a dictionary of (algorithm, hexdigest) values for the provided
    filename
    """
    LOGGER.info(_("Verifying checksum for file %s"), full_path)

    try:
        with open(full_path, "rb") as f:
            while True:
                block = f.read(HASH_BLOCK_SIZE)
                if not block:
                    break
                for i in f_hashers.values():
                    i.update(block)
    except (OSError, IOError) as e:
        raise BagValidationError(
            _("Could not read %(filename)s: %(error)s")
            % {"filename": full_path, "error": force_unicode(e)}
        )

    return dict((alg, h.hexdigest()) for alg, h in f_hashers.items())


def _load_tag_file(tag_file_name, encoding="utf-8-sig"):
    with open_text_file(tag_file_name, "r", encoding=encoding) as tag_file:
        # Store duplicate tags as list of vals
        # in order of parsing under the same key.
        tags = {}
        for name, value in _parse_tags(tag_file):
            if name not in tags:
                tags[name] = value
                continue

            if not isinstance(tags[name], list):
                tags[name] = [tags[name], value]
            else:
                tags[name].append(value)

        return tags


def _parse_tags(tag_file):
    """Parses a tag file, according to RFC 2822. This
    includes line folding, permitting extra-long
    field values.

    See http://www.faqs.org/rfcs/rfc2822.html for
    more information.
    """

    tag_name = None
    tag_value = None

    # Line folding is handled by yielding values only after we encounter
    # the start of a new tag, or if we pass the EOF.
    for num, line in enumerate(tag_file):
        # Skip over any empty or blank lines.
        if len(line) == 0 or line.isspace():
            continue
        elif line[0].isspace() and tag_value is not None:  # folded line
            tag_value += line
        else:
            # Starting a new tag; yield the last one.
            if tag_name:
                yield (tag_name, tag_value.strip())

            if ":" not in line:
                raise BagValidationError(
                    _("%(filename)s contains invalid tag: %(line)s")
                    % {
                        "line": line.strip(),
                        "filename": os.path.basename(tag_file.name),
                    }
                )

            parts = line.strip().split(":", 1)
            tag_name = parts[0].strip()
            tag_value = parts[1]

    # Passed the EOF. All done after this.
    if tag_name:
        yield (tag_name, tag_value.strip())

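# Example (illustrative sketch, not part of the upstream library): a tag file
# with a folded (RFC 2822 style) value and a repeated tag,
#
#     Contact-Name: Jane Doe
#     External-Description: A long description
#      continued on an indented line
#     Contact-Email: a@example.org
#     Contact-Email: b@example.org
#
# which _load_tag_file() would parse to:
#
#     {"Contact-Name": "Jane Doe",
#      "External-Description": "A long description\n continued on an indented line",
#      "Contact-Email": ["a@example.org", "b@example.org"]}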

def _make_tag_file(bag_info_path, bag_info):
    headers = sorted(bag_info.keys())
    with open_text_file(bag_info_path, "w") as f:
        for h in headers:
            values = bag_info[h]
            if not isinstance(values, list):
                values = [values]
            for txt in values:
                # strip CR, LF and CRLF so they don't mess up the tag file
                txt = re.sub(r"\n|\r|(\r\n)", "", force_unicode(txt))
                f.write("%s: %s\n" % (h, txt))


def make_manifests(data_dir, processes, algorithms=DEFAULT_CHECKSUMS, encoding="utf-8"):
    LOGGER.info(
        _("Using %(process_count)d processes to generate manifests: %(algorithms)s"),
        {"process_count": processes, "algorithms": ", ".join(algorithms)},
    )

    manifest_line_generator = partial(generate_manifest_lines, algorithms=algorithms)

    if processes > 1:
        pool = multiprocessing.Pool(processes=processes)
        checksums = pool.map(manifest_line_generator, _walk(data_dir))
        pool.close()
        pool.join()
    else:
        checksums = [manifest_line_generator(i) for i in _walk(data_dir)]

    # At this point we have a list of tuples which start with the algorithm name:
    manifest_data = {}
    for batch in checksums:
        for entry in batch:
            manifest_data.setdefault(entry[0], []).append(entry[1:])

    # These will be keyed on the algorithm name so we can perform sanity checks
    # below to catch failures in the hashing process:
    num_files = defaultdict(lambda: 0)
    total_bytes = defaultdict(lambda: 0)

    for algorithm, values in manifest_data.items():
        manifest_filename = "manifest-%s.txt" % algorithm

        with open_text_file(manifest_filename, "w", encoding=encoding) as manifest:
            for digest, filename, byte_count in values:
                manifest.write("%s %s\n" % (digest, _encode_filename(filename)))
                num_files[algorithm] += 1
                total_bytes[algorithm] += byte_count

    # We'll use sets of the values for the error checks and eventually return the payload oxum values:
    byte_value_set = set(total_bytes.values())
    file_count_set = set(num_files.values())

    # allow a bag with an empty payload
    if not byte_value_set and not file_count_set:
        return 0, 0

    if len(file_count_set) != 1:
        raise RuntimeError(_("Expected the same number of files for each checksum"))

    if len(byte_value_set) != 1:
        raise RuntimeError(_("Expected the same number of bytes for each checksum"))

    return byte_value_set.pop(), file_count_set.pop()

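# Example (illustrative sketch, not part of the upstream library): each line
# of manifest-<alg>.txt pairs a hex digest with a payload path using '/'
# separators, so a manifest-sha256.txt might contain a line like (digest
# value purely illustrative):
#
#     03ac674216f3e15c761ee1a5e255f067... data/hello.txt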

def _make_tagmanifest_file(alg, bag_dir, encoding="utf-8"):
    tagmanifest_file = join(bag_dir, "tagmanifest-%s.txt" % alg)
    LOGGER.info(_("Creating %s"), tagmanifest_file)

    checksums = []
    for f in _find_tag_files(bag_dir):
        if re.match(r"^tagmanifest-.+\.txt$", f):
            continue
        with open(join(bag_dir, f), "rb") as fh:
            m = hashlib.new(alg)
            while True:
                block = fh.read(HASH_BLOCK_SIZE)
                if not block:
                    break
                m.update(block)
            checksums.append((m.hexdigest(), f))

    # tagmanifest_file already includes bag_dir, so don't join it again
    with open_text_file(
        tagmanifest_file, mode="w", encoding=encoding
    ) as tagmanifest:
        for digest, filename in checksums:
            tagmanifest.write("%s %s\n" % (digest, filename))


def _find_tag_files(bag_dir):
    for dir in os.listdir(bag_dir):
        if dir != "data":
            if os.path.isfile(dir) and not dir.startswith("tagmanifest-"):
                yield dir
            for dir_name, _, filenames in os.walk(dir):
                for filename in filenames:
                    if filename.startswith("tagmanifest-"):
                        continue
                    # remove everything up to the bag_dir directory
                    p = join(dir_name, filename)
                    yield os.path.relpath(p, bag_dir)


def _walk(data_dir):
    for dirpath, dirnames, filenames in os.walk(data_dir):
        # if we don't sort here the order of entries is non-deterministic
        # which makes it hard to test the fixity of tagmanifest-md5.txt
        filenames.sort()
        dirnames.sort()
        for fn in filenames:
            path = os.path.join(dirpath, fn)
            # BagIt spec requires manifest to always use '/' as path separator
            if os.path.sep != "/":
                parts = path.split(os.path.sep)
                path = "/".join(parts)
            yield path


def _can_bag(test_dir):
    """Scan the provided directory for files which cannot be bagged due to insufficient permissions"""
    unbaggable = []

    if not os.access(test_dir, os.R_OK):
        # We cannot continue without permission to read the source directory
        unbaggable.append(test_dir)
        return unbaggable

    if not os.access(test_dir, os.W_OK):
        unbaggable.append(test_dir)

    for dirpath, dirnames, filenames in os.walk(test_dir):
        for directory in dirnames:
            full_path = os.path.join(dirpath, directory)
            if not os.access(full_path, os.W_OK):
                unbaggable.append(full_path)

    return unbaggable


def _can_read(test_dir):
    """
    returns ((unreadable_dirs), (unreadable_files))
    """
    unreadable_dirs = []
    unreadable_files = []

    if not os.access(test_dir, os.R_OK):
        unreadable_dirs.append(test_dir)
    else:
        for dirpath, dirnames, filenames in os.walk(test_dir):
            for dn in dirnames:
                full_path = os.path.join(dirpath, dn)
                if not os.access(full_path, os.R_OK):
                    unreadable_dirs.append(full_path)
            for fn in filenames:
                full_path = os.path.join(dirpath, fn)
                if not os.access(full_path, os.R_OK):
                    unreadable_files.append(full_path)
    return (tuple(unreadable_dirs), tuple(unreadable_files))


def generate_manifest_lines(filename, algorithms=DEFAULT_CHECKSUMS):
    LOGGER.info(_("Generating manifest lines for file %s"), filename)

    # For performance we'll read the file only once and pass it block
    # by block to every requested hash algorithm:
    hashers = get_hashers(algorithms)

    total_bytes = 0

    with open(filename, "rb") as f:
        while True:
            block = f.read(HASH_BLOCK_SIZE)

            if not block:
                break

            total_bytes += len(block)
            for hasher in hashers.values():
                hasher.update(block)

    decoded_filename = _decode_filename(filename)

    # We'll generate a list of results in roughly manifest format but prefixed with the algorithm:
    results = [
        (alg, hasher.hexdigest(), decoded_filename, total_bytes)
        for alg, hasher in hashers.items()
    ]

    return results


def _encode_filename(s):
    s = s.replace("\r", "%0D")
    s = s.replace("\n", "%0A")
    return s


def _decode_filename(s):
    # pass re.IGNORECASE via flags=; as a positional argument it would be
    # silently interpreted as the `count` parameter
    s = re.sub(r"%0D", "\r", s, flags=re.IGNORECASE)
    s = re.sub(r"%0A", "\n", s, flags=re.IGNORECASE)
    return s


def force_unicode_py2(s):
    """Reliably return a Unicode string given a possible unicode or byte string"""
    if isinstance(s, str):
        return s.decode("utf-8")
    else:
        return unicode(s)


if sys.version_info > (3, 0):
    force_unicode = str
else:
    force_unicode = force_unicode_py2

# following code is used for command line program


class BagArgumentParser(argparse.ArgumentParser):
    def __init__(self, *args, **kwargs):
        self.bag_info = {}
        argparse.ArgumentParser.__init__(self, *args, **kwargs)


class BagHeaderAction(argparse.Action):
    def __call__(self, parser, _, values, option_string=None):
        opt = option_string.lstrip("--")
        opt_caps = "-".join([o.capitalize() for o in opt.split("-")])
        parser.bag_info[opt_caps] = values


def _make_parser():
    parser = BagArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="bagit-python version %s\n\n%s\n" % (VERSION, __doc__.strip()),
    )
    parser.add_argument(
        "--processes",
        type=int,
        dest="processes",
        default=1,
        help=_(
            "Use multiple processes to calculate checksums faster (default: %(default)s)"
        ),
    )
    parser.add_argument("--log", help=_("The name of the log file (default: stdout)"))
    parser.add_argument(
        "--quiet",
        action="store_true",
        help=_("Suppress all progress information other than errors"),
    )
    parser.add_argument(
        "--validate",
        action="store_true",
        help=_(
            "Validate existing bags in the provided directories instead of"
            " creating new ones"
        ),
    )
    parser.add_argument(
        "--fast",
        action="store_true",
        help=_(
            "Modify --validate behaviour to only test whether the bag directory"
            " has the number of files and total size specified in Payload-Oxum"
            " without performing checksum validation to detect corruption."
        ),
    )
    parser.add_argument(
        "--completeness-only",
        action="store_true",
        help=_(
            "Modify --validate behaviour to test whether the bag directory"
            " has the expected payload specified in the checksum manifests"
            " without performing checksum validation to detect corruption."
        ),
    )

    checksum_args = parser.add_argument_group(
        _("Checksum Algorithms"),
        _(
            "Select the manifest algorithms to be used when creating bags"
            " (default=%s)"
        )
        % ", ".join(DEFAULT_CHECKSUMS),
    )

    for i in CHECKSUM_ALGOS:
        alg_name = re.sub(r"^([A-Z]+)(\d+)$", r"\1-\2", i.upper())
        checksum_args.add_argument(
            "--%s" % i,
            action="append_const",
            dest="checksums",
            const=i,
            help=_("Generate %s manifest when creating a bag") % alg_name,
        )

    metadata_args = parser.add_argument_group(_("Optional Bag Metadata"))
    for header in STANDARD_BAG_INFO_HEADERS:
        metadata_args.add_argument(
            "--%s" % header.lower(), type=str, action=BagHeaderAction
        )

    parser.add_argument(
        "directory",
        nargs="+",
        help=_(
            "Directory which will be converted into a bag in place"
            " by moving any existing files into the BagIt structure"
            " and creating the manifests and other metadata."
        ),
    )

    return parser


def _configure_logging(opts):
    log_format = "%(asctime)s - %(levelname)s - %(message)s"
    if opts.quiet:
        level = logging.ERROR
    else:
        level = logging.INFO
    if opts.log:
        logging.basicConfig(filename=opts.log, level=level, format=log_format)
    else:
        logging.basicConfig(level=level, format=log_format)


def main():
    if "--version" in sys.argv:
        print(_("bagit-python version %s") % VERSION)
        sys.exit(0)

    parser = _make_parser()
    args = parser.parse_args()

    if args.processes < 0:
        parser.error(_("The number of processes must be 0 or greater"))

    if args.fast and not args.validate:
        parser.error(_("--fast is only allowed as an option for --validate!"))

    _configure_logging(args)

    rc = 0
    for bag_dir in args.directory:
        # validate the bag
        if args.validate:
            try:
                bag = Bag(bag_dir)
                # validate throws a BagError or BagValidationError
                bag.validate(
                    processes=args.processes,
                    fast=args.fast,
                    completeness_only=args.completeness_only,
                )
                if args.fast:
                    LOGGER.info(_("%s valid according to Payload-Oxum"), bag_dir)
                else:
                    LOGGER.info(_("%s is valid"), bag_dir)
            except BagError as e:
                LOGGER.error(
                    _("%(bag)s is invalid: %(error)s"), {"bag": bag_dir, "error": e}
                )
                rc = 1

        # make the bag
        else:
            try:
                make_bag(
                    bag_dir,
                    bag_info=parser.bag_info,
                    processes=args.processes,
                    checksums=args.checksums,
                )
            except Exception as exc:
                LOGGER.error(
                    _("Failed to create bag in %(bag_directory)s: %(error)s"),
                    {"bag_directory": bag_dir, "error": exc},
                    exc_info=True,
                )
                rc = 1

    sys.exit(rc)


if __name__ == "__main__":
    main()