comparison pyramid_upgrade.py @ 0:b3054f3d42b2 draft

"planemo upload for repository https://github.com/ohsu-comp-bio/ashlar commit 27f0c9be58e9e5aecc69067d0e60b5cb945de4b2-dirty"
author perssond
date Fri, 12 Mar 2021 00:14:49 +0000
parents
children f183d9de4622
comparison
equal deleted inserted replaced
-1:000000000000 0:b3054f3d42b2
1 import sys
2 import os
3 import argparse
4 import struct
5 import re
6 import fractions
7 import io
8 import xml.etree.ElementTree
9 import collections
10 import reprlib
11 import dataclasses
12 from typing import List, Any
13
14
15 datatype_formats = {
16 1: "B", # BYTE
17 2: "s", # ASCII
18 3: "H", # SHORT
19 4: "I", # LONG
20 5: "I", # RATIONAL (pairs)
21 6: "b", # SBYTE
22 7: "B", # UNDEFINED
23 8: "h", # SSHORT
24 9: "i", # SLONG
25 10: "i", # SRATIONAL (pairs)
26 11: "f", # FLOAT
27 12: "d", # DOUBLE
28 13: "I", # IFD
29 16: "Q", # LONG8
30 17: "q", # SLONG8
31 18: "Q", # IFD8
32 }
33 rational_datatypes = {5, 10}
34
35
36 class TiffSurgeon:
37 """Read, manipulate and write IFDs in BigTIFF files."""
38
39 def __init__(self, path, *, writeable=False, encoding=None):
40 self.path = path
41 self.writeable = writeable
42 self.encoding = encoding
43 self.endian = ""
44 self.ifds = None
45 self.file = open(self.path, "r+b" if self.writeable else "rb")
46 self._validate()
47
48 def _validate(self):
49 signature = self.read("2s")
50 signature = signature.decode("ascii", errors="ignore")
51 if signature == "II":
52 self.endian = "<"
53 elif signature == "MM":
54 self.endian = ">"
55 else:
56 raise FormatError(f"Not a TIFF file (signature is '{signature}').")
57 version = self.read("H")
58 if version == 42:
59 raise FormatError("Cannot process classic TIFF, only BigTIFF.")
60 offset_size, reserved, first_ifd_offset = self.read("H H Q")
61 if version != 43 or offset_size != 8 or reserved != 0:
62 raise FormatError("Malformed TIFF, giving up!")
63 self.first_ifd_offset = first_ifd_offset
64
65 def read(self, fmt, *, file=None):
66 if file is None:
67 file = self.file
68 endian = self.endian or "="
69 size = struct.calcsize(endian + fmt)
70 raw = file.read(size)
71 value = self.unpack(fmt, raw)
72 return value
73
74 def write(self, fmt, *values):
75 if not self.writeable:
76 raise ValueError("File is opened as read-only.")
77 raw = self.pack(fmt, *values)
78 self.file.write(raw)
79
80 def unpack(self, fmt, raw):
81 assert self.endian or re.match(r"\d+s", fmt), \
82 "can't unpack non-string before endianness is detected"
83 fmt = self.endian + fmt
84 size = struct.calcsize(fmt)
85 values = struct.unpack(fmt, raw[:size])
86 if len(values) == 1:
87 return values[0]
88 else:
89 return values
90
91 def pack(self, fmt, *values):
92 assert self.endian, "can't pack without endian set"
93 fmt = self.endian + fmt
94 raw = struct.pack(fmt, *values)
95 return raw
96
97 def read_ifds(self):
98 ifds = [self.read_ifd(self.first_ifd_offset)]
99 while ifds[-1].offset_next:
100 ifds.append(self.read_ifd(ifds[-1].offset_next))
101 self.ifds = ifds
102
103 def read_ifd(self, offset):
104 self.file.seek(offset)
105 num_tags = self.read("Q")
106 buf = io.BytesIO(self.file.read(num_tags * 20))
107 offset_next = self.read("Q")
108 try:
109 tags = TagSet([self.read_tag(buf) for i in range(num_tags)])
110 except FormatError as e:
111 raise FormatError(f"IFD at offset {offset}, {e}") from None
112 ifd = Ifd(tags, offset, offset_next)
113 return ifd
114
115 def read_tag(self, buf):
116 tag = Tag(*self.read("H H Q 8s", file=buf))
117 value, offset_range = self.tag_value(tag)
118 tag = dataclasses.replace(tag, value=value, offset_range=offset_range)
119 return tag
120
121 def append_ifd_sequence(self, ifds):
122 """Write list of IFDs as a chained sequence at the end of the file.
123
124 Returns a list of new Ifd objects with updated offsets.
125
126 """
127 self.file.seek(0, os.SEEK_END)
128 new_ifds = []
129 for ifd in ifds:
130 offset = self.file.tell()
131 self.write("Q", len(ifd.tags))
132 for tag in ifd.tags:
133 self.write_tag(tag)
134 offset_next = self.file.tell() + 8 if ifd is not ifds[-1] else 0
135 self.write("Q", offset_next)
136 new_ifd = dataclasses.replace(
137 ifd, offset=offset, offset_next=offset_next
138 )
139 new_ifds.append(new_ifd)
140 return new_ifds
141
142 def append_tag_data(self, code, datatype, value):
143 """Build new tag and write data to the end of the file if necessary.
144
145 Returns a Tag object corresponding to the passed parameters. This
146 function only writes any "overflow" data and not the IFD entry itself,
147 so the returned Tag must still be written to an IFD.
148
149 If the value is small enough to fit in the data field within an IFD, no
150 data will actually be written to the file and the returned Tag object
151 will have the value encoded in its data attribute. Otherwise the data
152 will be appended to the file and the returned Tag's data attribute will
153 encode the corresponding offset.
154
155 """
156 fmt = datatype_formats[datatype]
157 # FIXME Should we perform our own check that values match datatype?
158 # struct.pack will do it but the exception won't be as understandable.
159 original_value = value
160 if isinstance(value, str):
161 if not self.encoding:
162 raise ValueError(
163 "ASCII tag values must be bytes if encoding is not set"
164 )
165 value = [value.encode(self.encoding) + b"\x00"]
166 count = len(value[0])
167 elif isinstance(value, bytes):
168 value = [value + b"\x00"]
169 count = len(value[0])
170 else:
171 try:
172 len(value)
173 except TypeError:
174 value = [value]
175 count = len(value)
176 struct_count = count
177 if datatype in rational_datatypes:
178 value = [i for v in value for i in v.as_integer_ratio()]
179 count //= 2
180 byte_count = struct_count * struct.calcsize(fmt)
181 if byte_count <= 8:
182 data = self.pack(str(struct_count) + fmt, *value)
183 data += bytes(8 - byte_count)
184 else:
185 self.file.seek(0, os.SEEK_END)
186 data = self.pack("Q", self.file.tell())
187 self.write(str(count) + fmt, *value)
188 # TODO Compute and set offset_range.
189 tag = Tag(code, datatype, count, data, original_value)
190 return tag
191
192 def write_first_ifd_offset(self, offset):
193 self.file.seek(8)
194 self.write("Q", offset)
195
196 def write_tag(self, tag):
197 self.write("H H Q 8s", tag.code, tag.datatype, tag.count, tag.data)
198
199 def tag_value(self, tag):
200 """Return decoded tag data and the file offset range."""
201 fmt = datatype_formats[tag.datatype]
202 count = tag.count
203 if tag.datatype in rational_datatypes:
204 count *= 2
205 byte_count = count * struct.calcsize(fmt)
206 if byte_count <= 8:
207 value = self.unpack(str(count) + fmt, tag.data)
208 offset_range = range(0, 0)
209 else:
210 offset = self.unpack("Q", tag.data)
211 self.file.seek(offset)
212 value = self.read(str(count) + fmt)
213 offset_range = range(offset, offset + byte_count)
214 if tag.datatype == 2:
215 value = value.rstrip(b"\x00")
216 if self.encoding:
217 try:
218 value = value.decode(self.encoding)
219 except UnicodeDecodeError as e:
220 raise FormatError(f"tag {tag.code}: {e}") from None
221 elif tag.datatype in rational_datatypes:
222 value = [
223 fractions.Fraction(*v) for v in zip(value[::2], value[1::2])
224 ]
225 if len(value) == 1:
226 value = value[0]
227 return value, offset_range
228
229 def close(self):
230 self.file.close()
231
232
233 @dataclasses.dataclass(frozen=True)
234 class Tag:
235 code: int
236 datatype: int
237 count: int
238 data: bytes
239 value: Any = None
240 offset_range: range = None
241
242 _vrepr = reprlib.Repr()
243 _vrepr.maxstring = 60
244 _vrepr.maxother = 60
245 vrepr = _vrepr.repr
246
247 def __repr__(self):
248 return (
249 self.__class__.__qualname__ + "("
250 + f"code={self.code!r}, datatype={self.datatype!r}, "
251 + f"count={self.count!r}, data={self.data!r}, "
252 + f"value={self.vrepr(self.value)}"
253 + ")"
254 )
255
256 @dataclasses.dataclass(frozen=True)
257 class TagSet:
258 """Container for Tag objects as stored in a TIFF IFD.
259
260 Tag objects are maintained in a list that's always sorted in ascending order
261 by the tag code. Only one tag for a given code may be present, which is where
262 the "set" name comes from.
263
264 """
265
266 tags: List[Tag] = dataclasses.field(default_factory=list)
267
268 def __post_init__(self):
269 if len(self.codes) != len(set(self.codes)):
270 raise ValueError("Duplicate tag codes are not allowed.")
271
272 def __repr__(self):
273 ret = type(self).__name__ + "(["
274 if self.tags:
275 ret += "\n"
276 ret += "".join([f" {t},\n" for t in self.tags])
277 ret += "])"
278 return ret
279
280 @property
281 def codes(self):
282 return [t.code for t in self.tags]
283
284 def __getitem__(self, code):
285 for t in self.tags:
286 if code == t.code:
287 return t
288 else:
289 raise KeyError(code)
290
291 def __delitem__(self, code):
292 try:
293 i = self.codes.index(code)
294 except ValueError:
295 raise KeyError(code) from None
296 self.tags[:] = self.tags[:i] + self.tags[i+1:]
297
298 def __contains__(self, code):
299 return code in self.codes
300
301 def __len__(self):
302 return len(self.tags)
303
304 def __iter__(self):
305 return iter(self.tags)
306
307 def get(self, code, default=None):
308 try:
309 return self[code]
310 except KeyError:
311 return default
312
313 def get_value(self, code, default=None):
314 tag = self.get(code)
315 if tag:
316 return tag.value
317 else:
318 return default
319
320 def insert(self, tag):
321 """Add a new tag or replace an existing one."""
322 for i, t in enumerate(self.tags):
323 if tag.code == t.code:
324 self.tags[i] = tag
325 return
326 elif tag.code < t.code:
327 break
328 else:
329 i = len(self.tags)
330 n = len(self.tags)
331 self.tags[i:n+1] = [tag] + self.tags[i:n]
332
333
334 @dataclasses.dataclass(frozen=True)
335 class Ifd:
336 tags: TagSet
337 offset: int
338 offset_next: int
339
340 @property
341 def nbytes(self):
342 return len(self.tags) * 20 + 16
343
344 @property
345 def offset_range(self):
346 return range(self.offset, self.offset + self.nbytes)
347
348
349 class FormatError(Exception):
350 pass
351
352
353 def fix_attrib_namespace(elt):
354 """Prefix un-namespaced XML attributes with the tag's namespace."""
355 # This fixes ElementTree's inability to round-trip XML with a default
356 # namespace ("cannot use non-qualified names with default_namespace option"
357 # error). 7-year-old BPO issue here: https://bugs.python.org/issue17088
358 # Code inspired by https://gist.github.com/provegard/1381912 .
359 if elt.tag[0] == "{":
360 uri, _ = elt.tag[1:].rsplit("}", 1)
361 new_attrib = {}
362 for name, value in elt.attrib.items():
363 if name[0] != "{":
364 # For un-namespaced attributes, copy namespace from element.
365 name = f"{{{uri}}}{name}"
366 new_attrib[name] = value
367 elt.attrib = new_attrib
368 for child in elt:
369 fix_attrib_namespace(child)
370
371
372 def parse_args():
373 parser = argparse.ArgumentParser(
374 description="Convert an OME-TIFF legacy pyramid to the BioFormats 6"
375 " OME-TIFF pyramid format in-place.",
376 )
377 parser.add_argument("image", help="OME-TIFF file to convert")
378 parser.add_argument(
379 "-n",
380 dest="channel_names",
381 nargs="+",
382 default=[],
383 metavar="NAME",
384 help="Channel names to be inserted into OME metadata. Number of names"
385 " must match number of channels in image. Be sure to put quotes"
386 " around names containing spaces or other special shell characters."
387 )
388 args = parser.parse_args()
389 return args
390
391
392 def main():
393
394 args = parse_args()
395
396 image_path = sys.argv[1]
397 try:
398 tiff = TiffSurgeon(image_path, encoding="utf-8", writeable=True)
399 except FormatError as e:
400 print(f"TIFF format error: {e}")
401 sys.exit(1)
402
403 tiff.read_ifds()
404
405 # ElementTree doesn't parse xml declarations so we'll just run some sanity
406 # checks that we do have UTF-8 and give it a decoded string instead of raw
407 # bytes. We need to both ensure that the raw tag bytes decode properly and
408 # that the declaration encoding is UTF-8 if present.
409 try:
410 omexml = tiff.ifds[0].tags.get_value(270, "")
411 except FormatError:
412 print("ImageDescription tag is not a valid UTF-8 string (not an OME-TIFF?)")
413 sys.exit(1)
414 if re.match(r'<\?xml [^>]*encoding="(?!UTF-8)[^"]*"', omexml):
415 print("OME-XML is encoded with something other than UTF-8.")
416 sys.exit(1)
417
418 xml_ns = {"ome": "http://www.openmicroscopy.org/Schemas/OME/2016-06"}
419
420 if xml_ns["ome"] not in omexml:
421 print("Not an OME-TIFF.")
422 sys.exit(1)
423 if (
424 "Faas" not in tiff.ifds[0].tags.get_value(305, "")
425 or 330 in tiff.ifds[0].tags
426 ):
427 print("Not a legacy OME-TIFF pyramid.")
428 sys.exit(1)
429
430 # All XML manipulation assumes the document is valid OME-XML!
431 root = xml.etree.ElementTree.fromstring(omexml)
432 image = root.find("ome:Image", xml_ns)
433 pixels = image.find("ome:Pixels", xml_ns)
434 size_x = int(pixels.get("SizeX"))
435 size_y = int(pixels.get("SizeY"))
436 size_c = int(pixels.get("SizeC"))
437 size_z = int(pixels.get("SizeZ"))
438 size_t = int(pixels.get("SizeT"))
439 num_levels = len(root.findall("ome:Image", xml_ns))
440 page_dims = [(ifd.tags[256].value, ifd.tags[257].value) for ifd in tiff.ifds]
441
442 if len(root) != num_levels:
443 print("Top-level OME-XML elements other than Image are not supported.")
444 if size_z != 1 or size_t != 1:
445 print("Z-stacks and multiple timepoints are not supported.")
446 sys.exit(1)
447 if size_c * num_levels != len(tiff.ifds):
448 print("TIFF page count does not match OME-XML Image elements.")
449 sys.exit(1)
450 if any(dims != (size_x, size_y) for dims in page_dims[:size_c]):
451 print(f"TIFF does not begin with SizeC={size_c} full-size pages.")
452 sys.exit(1)
453 for level in range(1, num_levels):
454 level_dims = page_dims[level * size_c : (level + 1) * size_c]
455 if len(set(level_dims)) != 1:
456 print(
457 f"Pyramid level {level + 1} out of {num_levels} has inconsistent"
458 f" sizes:\n{level_dims}"
459 )
460 sys.exit(1)
461 if args.channel_names and len(args.channel_names) != size_c:
462 print(
463 f"Wrong number of channel names -- image has {size_c} channels but"
464 f" {len(args.channel_names)} names were specified:"
465 )
466 for i, n in enumerate(args.channel_names, 1):
467 print(f"{i:4}: {n}")
468 sys.exit(1)
469
470 print("Input image summary")
471 print("===================")
472 print(f"Dimensions: {size_x} x {size_y}")
473 print(f"Number of channels: {size_c}")
474 print(f"Pyramid sub-resolutions ({num_levels - 1} total):")
475 for dim_x, dim_y in page_dims[size_c::size_c]:
476 print(f" {dim_x} x {dim_y}")
477 software = tiff.ifds[0].tags.get_value(305, "<not set>")
478 print(f"Software: {software}")
479 print()
480
481 print("Updating OME-XML metadata...")
482 # We already verified there is nothing but Image elements under the root.
483 for other_image in root[1:]:
484 root.remove(other_image)
485 for tiffdata in pixels.findall("ome:TiffData", xml_ns):
486 pixels.remove(tiffdata)
487 new_tiffdata = xml.etree.ElementTree.Element(
488 f"{{{xml_ns['ome']}}}TiffData",
489 attrib={"IFD": "0", "PlaneCount": str(size_c)},
490 )
491 # A valid OME-XML Pixels begins with size_c Channels; then comes TiffData.
492 pixels.insert(size_c, new_tiffdata)
493
494 if args.channel_names:
495 print("Renaming channels...")
496 channels = pixels.findall("ome:Channel", xml_ns)
497 for channel, name in zip(channels, args.channel_names):
498 channel.attrib["Name"] = name
499
500 fix_attrib_namespace(root)
501 # ElementTree.tostring would have been simpler but it only supports
502 # xml_declaration and default_namespace starting with Python 3.8.
503 xml_file = io.BytesIO()
504 tree = xml.etree.ElementTree.ElementTree(root)
505 tree.write(
506 xml_file,
507 encoding="utf-8",
508 xml_declaration=True,
509 default_namespace=xml_ns["ome"],
510 )
511 new_omexml = xml_file.getvalue()
512
513 print("Writing new TIFF headers...")
514 stale_ranges = [ifd.offset_range for ifd in tiff.ifds]
515 main_ifds = tiff.ifds[:size_c]
516 channel_sub_ifds = [tiff.ifds[c + size_c : : size_c] for c in range(size_c)]
517 for i, (main_ifd, sub_ifds) in enumerate(zip(main_ifds, channel_sub_ifds)):
518 for ifd in sub_ifds:
519 if 305 in ifd.tags:
520 stale_ranges.append(ifd.tags[305].offset_range)
521 del ifd.tags[305]
522 ifd.tags.insert(tiff.append_tag_data(254, 3, 1))
523 if i == 0:
524 stale_ranges.append(main_ifd.tags[305].offset_range)
525 stale_ranges.append(main_ifd.tags[270].offset_range)
526 old_software = main_ifd.tags[305].value.replace("Faas", "F*a*a*s")
527 new_software = f"pyramid_upgrade.py (was {old_software})"
528 main_ifd.tags.insert(tiff.append_tag_data(305, 2, new_software))
529 main_ifd.tags.insert(tiff.append_tag_data(270, 2, new_omexml))
530 else:
531 if 305 in main_ifd.tags:
532 stale_ranges.append(main_ifd.tags[305].offset_range)
533 del main_ifd.tags[305]
534 sub_ifds[:] = tiff.append_ifd_sequence(sub_ifds)
535 offsets = [ifd.offset for ifd in sub_ifds]
536 main_ifd.tags.insert(tiff.append_tag_data(330, 16, offsets))
537 main_ifds = tiff.append_ifd_sequence(main_ifds)
538 tiff.write_first_ifd_offset(main_ifds[0].offset)
539
540 print("Clearing old headers and tag values...")
541 # We overwrite all the old IFDs and referenced data values with obvious
542 # "filler" as a courtesy to anyone who might need to poke around in the TIFF
543 # structure down the road. A real TIFF parser wouldn't see the stale data,
544 # but a human might just scan for the first thing that looks like a run of
545 # OME-XML and not realize it's been replaced with something else. The filler
546 # content is the repeated string "unused " with square brackets at the
547 # beginning and end of each filled IFD or data value.
548 filler = b"unused "
549 f_len = len(filler)
550 for r in stale_ranges:
551 tiff.file.seek(r.start)
552 tiff.file.write(b"[")
553 f_total = len(r) - 2
554 for i in range(f_total // f_len):
555 tiff.file.write(filler)
556 tiff.file.write(b" " * (f_total % f_len))
557 tiff.file.write(b"]")
558
559 tiff.close()
560
561 print()
562 print("Success!")
563
564
565 if __name__ == "__main__":
566 main()