Mercurial > repos > goeckslab > multimodal_learner
annotate utils.py @ 6:871957823d0c draft default tip
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
| author | goeckslab |
|---|---|
| date | Mon, 26 Jan 2026 18:44:07 +0000 |
| parents | b708d0e210e6 |
| children |
| rev | line source |
|---|---|
|
2
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
1 import errno |
|
0
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
2 import json |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
3 import logging |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
4 import os |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
5 import random |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
6 import sys |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
7 import tempfile |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
8 import zipfile |
|
2
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
9 from collections import OrderedDict |
|
0
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
10 from pathlib import Path |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
11 from typing import List, Optional |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
12 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
13 import numpy as np |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
14 import pandas as pd |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
15 import torch |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
16 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
# Module-level logger shared by the helpers in this file.
LOG = logging.getLogger(__name__)

# File suffixes treated as images. Entries are lower-case and include the dot
# because the index builders compare them against a lower-cased
# ``os.path.splitext`` suffix.
_IMAGE_EXTENSIONS = {
    ".jpg", ".jpeg",
    ".png",
    ".bmp",
    ".gif",
    ".tif", ".tiff",
    ".webp",
    ".svs",
}

# Longest single path component (in characters) accepted without a warning.
_MAX_PATH_COMPONENT = 255

# Limits for the extracted-file index.
# NOTE(review): presumably the max number of cached roots and the max number
# of indexed files -- confirm against the code that consumes them.
_MAX_EXTRACTED_INDEX_CACHE_SIZE = 2
_MAX_EXTRACTED_INDEX_FILES = 100_000

# Insertion-ordered caches, both empty at import time.
_EXTRACTED_INDEX_CACHE = OrderedDict()
_EXTRACTED_PATH_CACHE = OrderedDict()
|
0
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
34 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
35 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
def str2bool(val) -> bool:
    """Interpret *val* as a boolean flag.

    The value is converted with ``str()``, trimmed and lower-cased; the
    result is ``True`` only for "1", "true", "yes" or "y".
    """
    normalized = str(val).strip().lower()
    return normalized in {"1", "true", "yes", "y"}
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
39 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
40 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
def load_user_hparams(hp_arg: Optional[str]) -> dict:
    """Parse the ``--hyperparameters`` argument into a dict.

    The argument is either inline JSON (recognised by a leading ``{`` once
    surrounding whitespace is stripped) or a path to a JSON file.

    Args:
        hp_arg: Raw argument value. ``None`` or an empty string means that
            no hyperparameters were supplied.

    Returns:
        The decoded JSON object. ``{}`` is returned when nothing was
        supplied, when the value cannot be read or parsed, or when the JSON
        document is not an object (e.g. a list); the latter two cases are
        logged as a warning.
    """
    if not hp_arg:
        return {}
    try:
        text = hp_arg.strip()
        if text.startswith("{"):
            parsed = json.loads(text)
        else:
            # Read as UTF-8 explicitly instead of relying on the locale.
            with open(text, "r", encoding="utf-8") as fh:
                parsed = json.load(fh)
        if not isinstance(parsed, dict):
            # A JSON file may hold a list or scalar; callers expect a dict.
            raise ValueError(
                f"expected a JSON object, got {type(parsed).__name__}"
            )
        return parsed
    except Exception as e:
        # Best-effort by design: unusable input is reported and ignored.
        LOG.warning("Could not parse --hyperparameters: %s. Ignoring.", e)
        return {}
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
54 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
55 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
def set_seeds(seed: int = 42):
    """Seed the Python, NumPy and PyTorch random generators with *seed*.

    When CUDA is available, the seed is also applied to all CUDA devices.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
62 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
63 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
def ensure_local_tmp():
    """Create the ``/tmp`` directory if it does not exist yet."""
    Path("/tmp").mkdir(parents=True, exist_ok=True)
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
66 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
67 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
def enable_tensor_cores_if_available():
    """Set float32 matmul precision to "high" when CUDA is available.

    Does nothing when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return
    torch.set_float32_matmul_precision("high")
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
71 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
72 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
def enable_deterministic_mode(seed: Optional[int] = None):
    """Force deterministic algorithms where possible to reduce run-to-run variance.

    Every step is best-effort: a failing step is logged and the remaining
    settings are still applied.

    Args:
        seed: When given, the random generators are seeded through
            ``set_seeds`` and ``PYTHONHASHSEED`` is set unless it is already
            present in the environment.
    """
    if seed is not None:
        set_seeds(seed)
        # NOTE: hash randomization is fixed at interpreter start-up, so this
        # only takes effect in processes launched after this point.
        os.environ.setdefault("PYTHONHASHSEED", str(int(seed)))
    # cuBLAS determinism
    os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8")
    try:
        torch.use_deterministic_algorithms(True)
    except Exception as e:
        LOG.warning("Could not enable torch deterministic algorithms: %s", e)
    try:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    except Exception as e:
        LOG.warning("Could not enforce deterministic cuDNN settings: %s", e)
    # Turn TF32 off for matmul and cuDNN. Failures here stay non-fatal but
    # are recorded at DEBUG level instead of being swallowed silently.
    try:
        torch.backends.cuda.matmul.allow_tf32 = False
    except Exception as e:
        LOG.debug("Could not disable TF32 for CUDA matmul: %s", e)
    try:
        torch.backends.cudnn.allow_tf32 = False
    except Exception as e:
        LOG.debug("Could not disable TF32 for cuDNN: %s", e)
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
99 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
100 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
def load_file(path: str) -> Optional[pd.DataFrame]:
    """Read a delimited text table into a DataFrame.

    The column separator is auto-detected (``sep=None`` with the pandas
    ``python`` engine).

    Args:
        path: Location of the table. A falsy value (``None``, ``""``) means
            that no dataset was supplied.

    Returns:
        The parsed DataFrame, or ``None`` when *path* is falsy.

    Raises:
        FileNotFoundError: If *path* does not exist.
    """
    if not path:
        return None
    dataset = Path(path)
    if not dataset.exists():
        raise FileNotFoundError(f"Dataset not found: {dataset}")
    return pd.read_csv(dataset, sep=None, engine="python")
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
108 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
109 |
|
2
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
def _normalize_path_value(val: object) -> Optional[str]:
    """Turn a raw cell value into a clean path string.

    The value is stringified, then stripped of surrounding whitespace, then
    of surrounding double quotes, then of surrounding single quotes (in that
    order). Returns ``None`` for ``None`` input or when nothing is left.
    """
    if val is None:
        return None
    text = str(val).strip()
    for quote in ('"', "'"):
        text = text.strip(quote)
    return text or None
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
115 |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
116 |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
def _warn_if_long_component(path_str: str) -> None:
    """Log one warning if any component of *path_str* is too long.

    The path is split on both ``/`` and ``\\``. At most one warning is
    emitted per call, no matter how many components are longer than
    ``_MAX_PATH_COMPONENT`` characters.
    """
    components = path_str.replace("\\", "/").split("/")
    if any(len(piece) > _MAX_PATH_COMPONENT for piece in components):
        LOG.warning(
            "Path component exceeds %d chars; resolution may fail: %s",
            _MAX_PATH_COMPONENT,
            path_str,
        )
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
126 |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
127 |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
def _build_extracted_index(extracted_root: Optional[Path]) -> set:
    """Collect lookup keys for every image file below *extracted_root*.

    Each file whose lower-cased suffix is in ``_IMAGE_EXTENSIONS`` adds two
    entries: its path relative to the root (with forward slashes) and its
    bare file name. Returns an empty set when no root is given.
    """
    known: set = set()
    if extracted_root is None:
        return known
    for folder, _subdirs, filenames in os.walk(extracted_root):
        rel_folder = os.path.relpath(folder, extracted_root)
        at_top = rel_folder == "."
        images = (
            name
            for name in filenames
            if os.path.splitext(name)[1].lower() in _IMAGE_EXTENSIONS
        )
        for name in images:
            relative = name if at_top else os.path.join(rel_folder, name)
            known.update((relative.replace("\\", "/"), name))
    return known
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
142 |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
143 |
|
6
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
def _build_extracted_maps(extracted_root: Optional[Path]) -> tuple[dict, dict]:
    """Map image files below *extracted_root* to their on-disk paths.

    Only files whose lower-cased suffix is in ``_IMAGE_EXTENSIONS`` are
    considered.

    Args:
        extracted_root: Directory to walk, or ``None``.

    Returns:
        A ``(rel_map, name_map)`` pair. ``rel_map`` maps each image's path
        relative to the root (forward slashes) to the path produced by the
        walk (``os.path.join(root, fname)``). ``name_map`` maps a bare file
        name to that same path, but omits every name that occurs in more
        than one directory, because such a name is ambiguous. Both dicts are
        empty when *extracted_root* is ``None``.
    """
    if extracted_root is None:
        return {}, {}
    rel_map: dict[str, str] = {}
    name_map: dict[str, str] = {}
    name_collisions = set()
    for root, _dirs, files in os.walk(extracted_root):
        rel_root = os.path.relpath(root, extracted_root)
        for fname in files:
            ext = os.path.splitext(fname)[1].lower()
            if ext not in _IMAGE_EXTENSIONS:
                continue
            rel_path = fname if rel_root == "." else os.path.join(rel_root, fname)
            full_path = os.path.join(root, fname)
            rel_map[rel_path.replace("\\", "/")] = full_path
            if fname in name_map and name_map[fname] != full_path:
                # Same bare name seen elsewhere: remember it, drop it below.
                name_collisions.add(fname)
            else:
                name_map[fname] = full_path
    # Remove ambiguous names only after the walk so that a third or later
    # occurrence cannot re-register a name that already collided.
    for name in name_collisions:
        name_map.pop(name, None)
    return rel_map, name_map
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
169 |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
170 |
|
2
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
171 def _get_cached_extracted_index(extracted_root: Optional[Path]) -> set: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
172 if extracted_root is None: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
173 return set() |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
174 try: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
175 root = extracted_root.resolve() |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
176 except Exception: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
177 root = extracted_root |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
178 cache_key = str(root) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
179 try: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
180 mtime_ns = root.stat().st_mtime_ns |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
181 except OSError: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
182 _EXTRACTED_INDEX_CACHE.pop(cache_key, None) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
183 return _build_extracted_index(root) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
184 cached = _EXTRACTED_INDEX_CACHE.get(cache_key) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
185 if cached: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
186 cached_mtime, cached_index = cached |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
187 if cached_mtime == mtime_ns: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
188 _EXTRACTED_INDEX_CACHE.move_to_end(cache_key) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
189 LOG.debug("Using cached extracted index for %s (%d entries)", root, len(cached_index)) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
190 return cached_index |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
191 _EXTRACTED_INDEX_CACHE.pop(cache_key, None) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
192 LOG.debug("Invalidated extracted index cache for %s (mtime changed)", root) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
193 else: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
194 LOG.debug("No extracted index cache for %s; building", root) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
195 index = _build_extracted_index(root) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
196 if len(index) <= _MAX_EXTRACTED_INDEX_FILES: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
197 _EXTRACTED_INDEX_CACHE[cache_key] = (mtime_ns, index) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
198 _EXTRACTED_INDEX_CACHE.move_to_end(cache_key) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
199 while len(_EXTRACTED_INDEX_CACHE) > _MAX_EXTRACTED_INDEX_CACHE_SIZE: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
200 _EXTRACTED_INDEX_CACHE.popitem(last=False) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
201 else: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
202 LOG.debug("Extracted index has %d entries; skipping cache for %s", len(index), root) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
203 return index |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
204 |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
205 |
|
6
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
206 def _get_cached_extracted_maps(extracted_root: Optional[Path]) -> tuple[dict, dict]: |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
207 if extracted_root is None: |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
208 return {}, {} |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
209 try: |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
210 root = extracted_root.resolve() |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
211 except Exception: |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
212 root = extracted_root |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
213 cache_key = str(root) |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
214 try: |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
215 mtime_ns = root.stat().st_mtime_ns |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
216 except OSError: |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
217 _EXTRACTED_PATH_CACHE.pop(cache_key, None) |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
218 return _build_extracted_maps(root) |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
219 cached = _EXTRACTED_PATH_CACHE.get(cache_key) |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
220 if cached: |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
221 cached_mtime, rel_map, name_map = cached |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
222 if cached_mtime == mtime_ns: |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
223 _EXTRACTED_PATH_CACHE.move_to_end(cache_key) |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
224 LOG.debug("Using cached extracted path map for %s (%d entries)", root, len(rel_map)) |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
225 return rel_map, name_map |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
226 _EXTRACTED_PATH_CACHE.pop(cache_key, None) |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
227 LOG.debug("Invalidated extracted path map cache for %s (mtime changed)", root) |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
228 rel_map, name_map = _build_extracted_maps(root) |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
229 if rel_map and len(rel_map) <= _MAX_EXTRACTED_INDEX_FILES: |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
230 _EXTRACTED_PATH_CACHE[cache_key] = (mtime_ns, rel_map, name_map) |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
231 _EXTRACTED_PATH_CACHE.move_to_end(cache_key) |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
232 while len(_EXTRACTED_PATH_CACHE) > _MAX_EXTRACTED_INDEX_CACHE_SIZE: |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
233 _EXTRACTED_PATH_CACHE.popitem(last=False) |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
234 return rel_map, name_map |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
235 |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
236 |
|
0
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
def prepare_image_search_dirs(args) -> Optional[Path]:
    """
    Extract every archive listed in ``args.images_zip`` into one new
    temporary directory and return that directory.

    Args:
        args: Object exposing ``images_zip``, an iterable of ZIP file paths.

    Returns:
        The temporary directory all archives were extracted into, or None
        when ``args.images_zip`` is empty or unset.

    Raises:
        FileNotFoundError: If any listed ZIP path does not exist. This is
            checked for all paths before anything is created on disk.
        Exception: Any error raised while opening or extracting an archive
            is re-raised after the temporary directory has been removed.
    """
    if not args.images_zip:
        return None

    # Validate every archive up front so a missing file fails fast, before a
    # temporary directory exists or any archive has been partially extracted.
    archives = []
    for zip_path in args.images_zip:
        archive = Path(zip_path)
        if not archive.exists():
            raise FileNotFoundError(f"Image ZIP not found: {zip_path}")
        archives.append(archive)

    import shutil  # local import: only needed for cleanup on failure

    root = Path(tempfile.mkdtemp(prefix="autogluon_images_"))
    LOG.info("Extracting %d image ZIP(s) to %s", len(archives), root)

    try:
        for archive in archives:
            with zipfile.ZipFile(archive, 'r') as z:
                z.extractall(root)
            LOG.info("Extracted %s", archive.name)
    except Exception:
        # The caller never receives `root` on failure, so nobody else could
        # clean it up; remove it here instead of leaking a half-filled dir.
        shutil.rmtree(root, ignore_errors=True)
        raise

    return root
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
253 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
254 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
255 def absolute_path_expander(df: pd.DataFrame, extracted_root: Optional[Path], image_columns: Optional[List[str]]) -> List[str]: |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
256 """ |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
257 Resolve image paths to absolute paths. If no image_columns are provided, |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
258 infers candidate columns whose values resolve to existing files (checking |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
259 absolute paths first, then paths relative to the extracted_root). |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
260 """ |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
261 if df is None or df.empty: |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
262 return [] |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
263 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
264 image_columns = [c for c in (image_columns or []) if c in df.columns] |
|
2
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
265 extracted_index = None |
|
6
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
266 extracted_maps = None |
|
2
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
267 |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
268 def get_extracted_index() -> set: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
269 nonlocal extracted_index |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
270 if extracted_index is None: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
271 extracted_index = _get_cached_extracted_index(extracted_root) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
272 return extracted_index |
|
0
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
273 |
|
6
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
274 def get_extracted_maps() -> tuple[dict, dict]: |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
275 nonlocal extracted_maps |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
276 if extracted_maps is None: |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
277 extracted_maps = _get_cached_extracted_maps(extracted_root) |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
278 return extracted_maps |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
279 |
|
0
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
280 def resolve(p): |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
281 if pd.isna(p): |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
282 return None |
|
2
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
283 raw = _normalize_path_value(p) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
284 if not raw: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
285 return None |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
286 _warn_if_long_component(raw) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
287 orig = Path(raw) |
|
0
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
288 candidates = [] |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
289 if orig.is_absolute(): |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
290 candidates.append(orig) |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
291 if extracted_root is not None: |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
292 candidates.extend([extracted_root / orig, extracted_root / orig.name]) |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
293 for cand in candidates: |
|
2
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
294 try: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
295 if cand.exists(): |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
296 return str(cand.resolve()) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
297 except OSError as e: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
298 if e.errno == errno.ENAMETOOLONG: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
299 LOG.warning("Path too long for filesystem: %s", cand) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
300 continue |
|
6
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
301 if extracted_root is not None: |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
302 rel_map, name_map = get_extracted_maps() |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
303 if rel_map: |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
304 norm = raw.replace("\\", "/").lstrip("./") |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
305 mapped = rel_map.get(norm) |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
306 if mapped: |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
307 return str(Path(mapped).resolve()) |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
308 base = Path(norm).name |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
309 mapped = name_map.get(base) |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
310 if mapped: |
|
871957823d0c
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents:
2
diff
changeset
|
311 return str(Path(mapped).resolve()) |
|
0
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
312 return None |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
313 |
|
2
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
def matches_extracted(p) -> bool:
    """Return True when *p* names an entry of the extracted-file index.

    Args:
        p: A single cell value, checked with ``pd.isna`` first.

    Returns:
        ``False`` when *p* is missing, when ``_normalize_path_value`` yields
        a falsy result, or when ``get_extracted_index()`` yields a falsy
        index; otherwise whether the normalised key is contained in the index.
    """
    if pd.isna(p):
        return False

    candidate = _normalize_path_value(p)
    if candidate:
        # Called before the index lookup, same order as the checks above.
        _warn_if_long_component(candidate)
        known = get_extracted_index()
        if known:
            # Use forward slashes, then drop leading '.' and '/' characters.
            # NOTE(review): lstrip("./") strips a *character set*, so "../a"
            # and ".hidden" lose their leading dots too -- confirm this
            # matches how the index keys are built.
            key = candidate.replace("\\", "/").lstrip("./")
            return key in known
    return False
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
326 |
|
0
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
327 # Infer image columns if none were provided |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
328 if not image_columns: |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
329 obj_cols = [c for c in df.columns if str(df[c].dtype) == "object"] |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
330 inferred = [] |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
331 for col in obj_cols: |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
332 sample = df[col].dropna().head(50) |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
333 if sample.empty: |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
334 continue |
|
2
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
335 if extracted_root is not None: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
336 index = get_extracted_index() |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
337 else: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
338 index = set() |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
339 if index: |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
340 matched = sample.apply(matches_extracted) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
341 if matched.any(): |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
342 inferred.append(col) |
|
b708d0e210e6
planemo upload for repository https://github.com/goeckslab/gleam.git commit ffd47c4881aaa9fc33e7d3993a8fdf4bd82f3792
goeckslab
parents:
0
diff
changeset
|
343 continue |
|
0
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
344 resolved_sample = sample.apply(resolve) |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
345 if resolved_sample.notna().any(): |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
346 inferred.append(col) |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
347 image_columns = inferred |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
348 if image_columns: |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
349 LOG.info(f"Inferred image columns: {image_columns}") |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
350 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
351 for col in image_columns: |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
352 df[col] = df[col].apply(resolve) |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
353 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
354 return image_columns |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
355 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
356 |
|
375c36923da1
planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff
changeset
|
def verify_outputs(paths):
    """Log the status of each expected output and exit if any is missing.

    Args:
        paths: Iterable of ``(path, description)`` pairs.

    For every path that exists, its size is logged and its mode is set to
    ``0o644``. Every path that does not exist is logged as an error. When at
    least one path is missing, a final error is logged and the process exits
    via ``sys.exit(1)``.
    """
    any_missing = False
    for p, desc in paths:
        if not os.path.exists(p):
            LOG.error(f"✗ Output {desc} MISSING: {p}")
            any_missing = True
            continue
        size = os.path.getsize(p)
        LOG.info(f"✓ Output {desc}: {p} ({size:,} bytes)")
        os.chmod(p, 0o644)

    if any_missing:
        LOG.error("Some outputs are missing!")
        sys.exit(1)
