comparison utils.py @ 6:871957823d0c draft default tip

planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
author goeckslab
date Mon, 26 Jan 2026 18:44:07 +0000
parents b708d0e210e6
children
comparison
equal deleted inserted replaced
5:975512caae22 6:871957823d0c
28 } 28 }
29 _MAX_PATH_COMPONENT = 255 29 _MAX_PATH_COMPONENT = 255
30 _MAX_EXTRACTED_INDEX_CACHE_SIZE = 2 30 _MAX_EXTRACTED_INDEX_CACHE_SIZE = 2
31 _MAX_EXTRACTED_INDEX_FILES = 100000 31 _MAX_EXTRACTED_INDEX_FILES = 100000
32 _EXTRACTED_INDEX_CACHE = OrderedDict() 32 _EXTRACTED_INDEX_CACHE = OrderedDict()
33 _EXTRACTED_PATH_CACHE = OrderedDict()
33 34
34 35
def str2bool(val) -> bool:
    """Return True when *val*, rendered as text, is a recognised truthy token.

    The value is passed through ``str()``, stripped of surrounding whitespace
    and lower-cased; the result is truthy only if it is one of "1", "true",
    "yes" or "y". Every other input (including ``None`` and "") gives False.
    """
    truthy_tokens = ("1", "true", "yes", "y")
    token = str(val).strip().lower()
    return token in truthy_tokens
136 continue 137 continue
137 rel_path = fname if rel_root == "." else os.path.join(rel_root, fname) 138 rel_path = fname if rel_root == "." else os.path.join(rel_root, fname)
138 index.add(rel_path.replace("\\", "/")) 139 index.add(rel_path.replace("\\", "/"))
139 index.add(fname) 140 index.add(fname)
140 return index 141 return index
142
143
def _build_extracted_maps(extracted_root: Optional[Path]) -> tuple[dict, dict]:
    """Index the image files under *extracted_root* by relative path and by name.

    The tree is traversed with ``os.walk``; only files whose lower-cased
    extension is listed in ``_IMAGE_EXTENSIONS`` are recorded.

    Args:
        extracted_root: Directory to scan, or ``None``.

    Returns:
        A ``(rel_map, name_map)`` pair.

        ``rel_map`` maps each image's path relative to *extracted_root*
        (backslashes converted to forward slashes) to the path obtained by
        joining the walked directory with the file name.

        ``name_map`` maps a bare file name to that same joined path, but only
        for names that occur once in the tree; a name found in more than one
        directory is ambiguous and is left out entirely.

        Both maps are empty when *extracted_root* is ``None``.
    """
    if extracted_root is None:
        return {}, {}
    rel_map: dict[str, str] = {}
    name_map: dict[str, str] = {}
    ambiguous_names: set[str] = set()
    for root, _dirs, files in os.walk(extracted_root):
        rel_root = os.path.relpath(root, extracted_root)
        for fname in files:
            ext = os.path.splitext(fname)[1].lower()
            if ext not in _IMAGE_EXTENSIONS:
                continue
            rel_path = fname if rel_root == "." else os.path.join(rel_root, fname)
            full_path = os.path.join(root, fname)
            rel_map[rel_path.replace("\\", "/")] = full_path
            # The first sighting of a name claims it; a later sighting that
            # points somewhere else marks the name as ambiguous.
            if name_map.setdefault(fname, full_path) != full_path:
                ambiguous_names.add(fname)
    # Ambiguous names are removed only after the walk so that a third or
    # later duplicate is still detected against the first entry.
    for name in ambiguous_names:
        name_map.pop(name, None)
    return rel_map, name_map
141 169
142 170
143 def _get_cached_extracted_index(extracted_root: Optional[Path]) -> set: 171 def _get_cached_extracted_index(extracted_root: Optional[Path]) -> set:
144 if extracted_root is None: 172 if extracted_root is None:
145 return set() 173 return set()
173 else: 201 else:
174 LOG.debug("Extracted index has %d entries; skipping cache for %s", len(index), root) 202 LOG.debug("Extracted index has %d entries; skipping cache for %s", len(index), root)
175 return index 203 return index
176 204
177 205
def _get_cached_extracted_maps(extracted_root: Optional[Path]) -> tuple[dict, dict]:
    """Return the path maps for *extracted_root*, reusing a cached copy if fresh.

    Results are kept in ``_EXTRACTED_PATH_CACHE`` keyed by the string form of
    the root (resolved when resolution succeeds) together with the root
    directory's ``st_mtime_ns``. A cached entry is reused only while that
    value is unchanged; otherwise it is dropped and the maps are rebuilt with
    ``_build_extracted_maps``.

    A rebuilt result is stored only when it is non-empty and has at most
    ``_MAX_EXTRACTED_INDEX_FILES`` relative-path entries. The cache is then
    trimmed, oldest entry first, to ``_MAX_EXTRACTED_INDEX_CACHE_SIZE``.

    Args:
        extracted_root: Directory whose images should be indexed, or ``None``.

    Returns:
        The ``(rel_map, name_map)`` pair from ``_build_extracted_maps``, or
        ``({}, {})`` when *extracted_root* is ``None``. On a cache hit the
        returned dicts are the cached objects themselves, so callers should
        treat them as read-only.
    """
    if extracted_root is None:
        return {}, {}
    try:
        root = extracted_root.resolve()
    except (OSError, RuntimeError):
        # Resolution is best-effort: fall back to the path as given rather
        # than failing the lookup.
        root = extracted_root
    cache_key = str(root)
    try:
        mtime_ns = root.stat().st_mtime_ns
    except OSError:
        # Without an mtime the entry cannot be validated, so forget it and
        # build an uncached result.
        _EXTRACTED_PATH_CACHE.pop(cache_key, None)
        return _build_extracted_maps(root)
    cached = _EXTRACTED_PATH_CACHE.get(cache_key)
    if cached is not None:
        cached_mtime, rel_map, name_map = cached
        # NOTE(review): only the root directory's own mtime is compared;
        # presumably changes deeper in the tree will not invalidate the entry.
        if cached_mtime == mtime_ns:
            _EXTRACTED_PATH_CACHE.move_to_end(cache_key)
            LOG.debug("Using cached extracted path map for %s (%d entries)", root, len(rel_map))
            return rel_map, name_map
        _EXTRACTED_PATH_CACHE.pop(cache_key, None)
        LOG.debug("Invalidated extracted path map cache for %s (mtime changed)", root)
    rel_map, name_map = _build_extracted_maps(root)
    if rel_map and len(rel_map) <= _MAX_EXTRACTED_INDEX_FILES:
        _EXTRACTED_PATH_CACHE[cache_key] = (mtime_ns, rel_map, name_map)
        _EXTRACTED_PATH_CACHE.move_to_end(cache_key)
        while len(_EXTRACTED_PATH_CACHE) > _MAX_EXTRACTED_INDEX_CACHE_SIZE:
            _EXTRACTED_PATH_CACHE.popitem(last=False)
    return rel_map, name_map
235
236
178 def prepare_image_search_dirs(args) -> Optional[Path]: 237 def prepare_image_search_dirs(args) -> Optional[Path]:
179 if not args.images_zip: 238 if not args.images_zip:
180 return None 239 return None
181 240
182 root = Path(tempfile.mkdtemp(prefix="autogluon_images_")) 241 root = Path(tempfile.mkdtemp(prefix="autogluon_images_"))
202 if df is None or df.empty: 261 if df is None or df.empty:
203 return [] 262 return []
204 263
205 image_columns = [c for c in (image_columns or []) if c in df.columns] 264 image_columns = [c for c in (image_columns or []) if c in df.columns]
206 extracted_index = None 265 extracted_index = None
266 extracted_maps = None
207 267
208 def get_extracted_index() -> set: 268 def get_extracted_index() -> set:
209 nonlocal extracted_index 269 nonlocal extracted_index
210 if extracted_index is None: 270 if extracted_index is None:
211 extracted_index = _get_cached_extracted_index(extracted_root) 271 extracted_index = _get_cached_extracted_index(extracted_root)
212 return extracted_index 272 return extracted_index
273
274 def get_extracted_maps() -> tuple[dict, dict]:
275 nonlocal extracted_maps
276 if extracted_maps is None:
277 extracted_maps = _get_cached_extracted_maps(extracted_root)
278 return extracted_maps
213 279
214 def resolve(p): 280 def resolve(p):
215 if pd.isna(p): 281 if pd.isna(p):
216 return None 282 return None
217 raw = _normalize_path_value(p) 283 raw = _normalize_path_value(p)
230 return str(cand.resolve()) 296 return str(cand.resolve())
231 except OSError as e: 297 except OSError as e:
232 if e.errno == errno.ENAMETOOLONG: 298 if e.errno == errno.ENAMETOOLONG:
233 LOG.warning("Path too long for filesystem: %s", cand) 299 LOG.warning("Path too long for filesystem: %s", cand)
234 continue 300 continue
301 if extracted_root is not None:
302 rel_map, name_map = get_extracted_maps()
303 if rel_map:
304 norm = raw.replace("\\", "/").lstrip("./")
305 mapped = rel_map.get(norm)
306 if mapped:
307 return str(Path(mapped).resolve())
308 base = Path(norm).name
309 mapped = name_map.get(base)
310 if mapped:
311 return str(Path(mapped).resolve())
235 return None 312 return None
236 313
237 def matches_extracted(p) -> bool: 314 def matches_extracted(p) -> bool:
238 if pd.isna(p): 315 if pd.isna(p):
239 return False 316 return False