Mercurial > repos > goeckslab > multimodal_learner
comparison utils.py @ 6:871957823d0c draft default tip
planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
| author | goeckslab |
|---|---|
| date | Mon, 26 Jan 2026 18:44:07 +0000 |
| parents | b708d0e210e6 |
| children |
comparison
equal
deleted
inserted
replaced
| 5:975512caae22 | 6:871957823d0c |
|---|---|
| 28 } | 28 } |
| 29 _MAX_PATH_COMPONENT = 255 | 29 _MAX_PATH_COMPONENT = 255 |
| 30 _MAX_EXTRACTED_INDEX_CACHE_SIZE = 2 | 30 _MAX_EXTRACTED_INDEX_CACHE_SIZE = 2 |
| 31 _MAX_EXTRACTED_INDEX_FILES = 100000 | 31 _MAX_EXTRACTED_INDEX_FILES = 100000 |
| 32 _EXTRACTED_INDEX_CACHE = OrderedDict() | 32 _EXTRACTED_INDEX_CACHE = OrderedDict() |
| 33 _EXTRACTED_PATH_CACHE = OrderedDict() | |
| 33 | 34 |
| 34 | 35 |
| 35 def str2bool(val) -> bool: | 36 def str2bool(val) -> bool: |
| 36 """Parse common truthy strings to bool.""" | 37 """Parse common truthy strings to bool.""" |
| 37 return str(val).strip().lower() in ("1", "true", "yes", "y") | 38 return str(val).strip().lower() in ("1", "true", "yes", "y") |
| 136 continue | 137 continue |
| 137 rel_path = fname if rel_root == "." else os.path.join(rel_root, fname) | 138 rel_path = fname if rel_root == "." else os.path.join(rel_root, fname) |
| 138 index.add(rel_path.replace("\\", "/")) | 139 index.add(rel_path.replace("\\", "/")) |
| 139 index.add(fname) | 140 index.add(fname) |
| 140 return index | 141 return index |
| 142 | |
| 143 | |
| 144 def _build_extracted_maps(extracted_root: Optional[Path]) -> tuple[dict, dict]: | |
| 145 if extracted_root is None: | |
| 146 return {}, {} | |
| 147 rel_map: dict[str, str] = {} | |
| 148 name_map: dict[str, str] = {} | |
| 149 name_collisions = set() | |
| 150 count = 0 | |
| 151 for root, _dirs, files in os.walk(extracted_root): | |
| 152 rel_root = os.path.relpath(root, extracted_root) | |
| 153 for fname in files: | |
| 154 ext = os.path.splitext(fname)[1].lower() | |
| 155 if ext not in _IMAGE_EXTENSIONS: | |
| 156 continue | |
| 157 count += 1 | |
| 158 rel_path = fname if rel_root == "." else os.path.join(rel_root, fname) | |
| 159 rel_norm = rel_path.replace("\\", "/") | |
| 160 abs_path = os.path.join(root, fname) | |
| 161 rel_map[rel_norm] = abs_path | |
| 162 if fname in name_map and name_map[fname] != abs_path: | |
| 163 name_collisions.add(fname) | |
| 164 else: | |
| 165 name_map[fname] = abs_path | |
| 166 for name in name_collisions: | |
| 167 name_map.pop(name, None) | |
| 168 return rel_map, name_map | |
| 141 | 169 |
| 142 | 170 |
| 143 def _get_cached_extracted_index(extracted_root: Optional[Path]) -> set: | 171 def _get_cached_extracted_index(extracted_root: Optional[Path]) -> set: |
| 144 if extracted_root is None: | 172 if extracted_root is None: |
| 145 return set() | 173 return set() |
| 173 else: | 201 else: |
| 174 LOG.debug("Extracted index has %d entries; skipping cache for %s", len(index), root) | 202 LOG.debug("Extracted index has %d entries; skipping cache for %s", len(index), root) |
| 175 return index | 203 return index |
| 176 | 204 |
| 177 | 205 |
| 206 def _get_cached_extracted_maps(extracted_root: Optional[Path]) -> tuple[dict, dict]: | |
| 207 if extracted_root is None: | |
| 208 return {}, {} | |
| 209 try: | |
| 210 root = extracted_root.resolve() | |
| 211 except Exception: | |
| 212 root = extracted_root | |
| 213 cache_key = str(root) | |
| 214 try: | |
| 215 mtime_ns = root.stat().st_mtime_ns | |
| 216 except OSError: | |
| 217 _EXTRACTED_PATH_CACHE.pop(cache_key, None) | |
| 218 return _build_extracted_maps(root) | |
| 219 cached = _EXTRACTED_PATH_CACHE.get(cache_key) | |
| 220 if cached: | |
| 221 cached_mtime, rel_map, name_map = cached | |
| 222 if cached_mtime == mtime_ns: | |
| 223 _EXTRACTED_PATH_CACHE.move_to_end(cache_key) | |
| 224 LOG.debug("Using cached extracted path map for %s (%d entries)", root, len(rel_map)) | |
| 225 return rel_map, name_map | |
| 226 _EXTRACTED_PATH_CACHE.pop(cache_key, None) | |
| 227 LOG.debug("Invalidated extracted path map cache for %s (mtime changed)", root) | |
| 228 rel_map, name_map = _build_extracted_maps(root) | |
| 229 if rel_map and len(rel_map) <= _MAX_EXTRACTED_INDEX_FILES: | |
| 230 _EXTRACTED_PATH_CACHE[cache_key] = (mtime_ns, rel_map, name_map) | |
| 231 _EXTRACTED_PATH_CACHE.move_to_end(cache_key) | |
| 232 while len(_EXTRACTED_PATH_CACHE) > _MAX_EXTRACTED_INDEX_CACHE_SIZE: | |
| 233 _EXTRACTED_PATH_CACHE.popitem(last=False) | |
| 234 return rel_map, name_map | |
| 235 | |
| 236 | |
| 178 def prepare_image_search_dirs(args) -> Optional[Path]: | 237 def prepare_image_search_dirs(args) -> Optional[Path]: |
| 179 if not args.images_zip: | 238 if not args.images_zip: |
| 180 return None | 239 return None |
| 181 | 240 |
| 182 root = Path(tempfile.mkdtemp(prefix="autogluon_images_")) | 241 root = Path(tempfile.mkdtemp(prefix="autogluon_images_")) |
| 202 if df is None or df.empty: | 261 if df is None or df.empty: |
| 203 return [] | 262 return [] |
| 204 | 263 |
| 205 image_columns = [c for c in (image_columns or []) if c in df.columns] | 264 image_columns = [c for c in (image_columns or []) if c in df.columns] |
| 206 extracted_index = None | 265 extracted_index = None |
| 266 extracted_maps = None | |
| 207 | 267 |
| 208 def get_extracted_index() -> set: | 268 def get_extracted_index() -> set: |
| 209 nonlocal extracted_index | 269 nonlocal extracted_index |
| 210 if extracted_index is None: | 270 if extracted_index is None: |
| 211 extracted_index = _get_cached_extracted_index(extracted_root) | 271 extracted_index = _get_cached_extracted_index(extracted_root) |
| 212 return extracted_index | 272 return extracted_index |
| 273 | |
| 274 def get_extracted_maps() -> tuple[dict, dict]: | |
| 275 nonlocal extracted_maps | |
| 276 if extracted_maps is None: | |
| 277 extracted_maps = _get_cached_extracted_maps(extracted_root) | |
| 278 return extracted_maps | |
| 213 | 279 |
| 214 def resolve(p): | 280 def resolve(p): |
| 215 if pd.isna(p): | 281 if pd.isna(p): |
| 216 return None | 282 return None |
| 217 raw = _normalize_path_value(p) | 283 raw = _normalize_path_value(p) |
| 230 return str(cand.resolve()) | 296 return str(cand.resolve()) |
| 231 except OSError as e: | 297 except OSError as e: |
| 232 if e.errno == errno.ENAMETOOLONG: | 298 if e.errno == errno.ENAMETOOLONG: |
| 233 LOG.warning("Path too long for filesystem: %s", cand) | 299 LOG.warning("Path too long for filesystem: %s", cand) |
| 234 continue | 300 continue |
| 301 if extracted_root is not None: | |
| 302 rel_map, name_map = get_extracted_maps() | |
| 303 if rel_map: | |
| 304 norm = raw.replace("\\", "/").lstrip("./") | |
| 305 mapped = rel_map.get(norm) | |
| 306 if mapped: | |
| 307 return str(Path(mapped).resolve()) | |
| 308 base = Path(norm).name | |
| 309 mapped = name_map.get(base) | |
| 310 if mapped: | |
| 311 return str(Path(mapped).resolve()) | |
| 235 return None | 312 return None |
| 236 | 313 |
| 237 def matches_extracted(p) -> bool: | 314 def matches_extracted(p) -> bool: |
| 238 if pd.isna(p): | 315 if pd.isna(p): |
| 239 return False | 316 return False |
