annotate image_workflow.py @ 22:ccbcc012d78d draft

planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
author goeckslab
date Fri, 23 Jan 2026 20:25:27 +0000
parents d5c582cf74bc
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
12
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
1 import argparse
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
2 import logging
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
3 import os
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
4 import shutil
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
5 import tempfile
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
6 import zipfile
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
7 from pathlib import Path
17
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
8 from typing import Any, Dict, List, Optional, Tuple
12
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
9
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
10 import pandas as pd
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
11 import pandas.api.types as ptypes
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
12 from constants import (
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
13 IMAGE_PATH_COLUMN_NAME,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
14 LABEL_COLUMN_NAME,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
15 SPLIT_COLUMN_NAME,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
16 TEMP_CONFIG_FILENAME,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
17 TEMP_CSV_FILENAME,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
18 TEMP_DIR_PREFIX,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
19 )
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
20 from ludwig.globals import PREDICTIONS_PARQUET_FILE_NAME
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
21 from ludwig_backend import Backend
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
22 from split_data import create_stratified_random_split, split_data_0_2
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
23 from utils import load_metadata_table
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
24
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
25 logger = logging.getLogger("ImageLearner")
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
26
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
27
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
28 class ImageLearnerCLI:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
29 """Manages the image-classification workflow."""
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
30
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
31 def __init__(self, args: argparse.Namespace, backend: Backend):
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
32 self.args = args
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
33 self.backend = backend
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
34 self.temp_dir: Optional[Path] = None
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
35 self.image_extract_dir: Optional[Path] = None
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
36 self.label_metadata: Dict[str, Any] = {}
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
37 self.output_type_hint: Optional[str] = None
17
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
38 self.label_split_counts: List[Dict[str, int]] = []
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
39 self.split_counts: Dict[int, int] = {}
12
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
40 logger.info(f"Orchestrator initialized with backend: {type(backend).__name__}")
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
41
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
42 def _create_temp_dirs(self) -> None:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
43 """Create temporary output and image extraction directories."""
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
44 try:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
45 self.temp_dir = Path(
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
46 tempfile.mkdtemp(dir=self.args.output_dir, prefix=TEMP_DIR_PREFIX)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
47 )
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
48 self.image_extract_dir = self.temp_dir / "images"
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
49 self.image_extract_dir.mkdir()
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
50 logger.info(f"Created temp directory: {self.temp_dir}")
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
51 except Exception:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
52 logger.error("Failed to create temporary directories", exc_info=True)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
53 raise
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
54
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
55 def _extract_images(self) -> None:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
56 """Extract images into the temp image directory.
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
57 - If a ZIP file is provided, extract it
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
58 - If a directory is provided, copy its contents
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
59 """
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
60 if self.image_extract_dir is None:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
61 raise RuntimeError("Temp image directory not initialized.")
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
62 src = Path(self.args.image_zip)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
63 logger.info(f"Preparing images from {src} → {self.image_extract_dir}")
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
64 try:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
65 if src.is_dir():
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
66 # copy directory tree
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
67 for root, dirs, files in os.walk(src):
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
68 rel = Path(root).relative_to(src)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
69 target_root = self.image_extract_dir / rel
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
70 target_root.mkdir(parents=True, exist_ok=True)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
71 for fn in files:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
72 shutil.copy2(Path(root) / fn, target_root / fn)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
73 logger.info("Image directory copied.")
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
74 else:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
75 with zipfile.ZipFile(src, "r") as z:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
76 z.extractall(self.image_extract_dir)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
77 logger.info("Image extraction complete.")
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
78 except Exception:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
79 logger.error("Error preparing images", exc_info=True)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
80 raise
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
81
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
82 def _process_fixed_split(
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
83 self, df: pd.DataFrame
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
84 ) -> Tuple[pd.DataFrame, Dict[str, Any], str]:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
85 """Process datasets that already have a split column."""
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
86 unique = set(df[SPLIT_COLUMN_NAME].unique())
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
87 if unique == {0, 2}:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
88 # Split 0/2 detected, create validation set
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
89 df = split_data_0_2(
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
90 df=df,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
91 split_column=SPLIT_COLUMN_NAME,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
92 validation_size=self.args.validation_size,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
93 random_state=self.args.random_seed,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
94 label_column=LABEL_COLUMN_NAME,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
95 )
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
96 split_config = {"type": "fixed", "column": SPLIT_COLUMN_NAME}
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
97 split_info = (
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
98 "Detected a split column (with values 0 and 2) in the input CSV. "
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
99 f"Used this column as a base and reassigned "
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
100 f"{self.args.validation_size * 100:.1f}% "
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
101 "of the training set (originally labeled 0) to validation (labeled 1) using stratified sampling."
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
102 )
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
103 logger.info("Applied custom 0/2 split.")
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
104 elif unique.issubset({0, 1, 2}):
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
105 # Standard 0/1/2 split
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
106 split_config = {"type": "fixed", "column": SPLIT_COLUMN_NAME}
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
107 split_info = (
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
108 "Detected a split column with train(0)/validation(1)/test(2) "
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
109 "values in the input CSV. Used this column as-is."
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
110 )
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
111 logger.info("Fixed split column detected.")
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
112 else:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
113 raise ValueError(
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
114 f"Split column contains unexpected values: {unique}. "
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
115 "Expected: {{0,1,2}} or {{0,2}}"
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
116 )
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
117
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
118 return df, split_config, split_info
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
119
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
120 def _prepare_data(self) -> Tuple[Path, Dict[str, Any], str]:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
121 """Load CSV, update image paths, handle splits, and write prepared CSV."""
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
122 if not self.temp_dir or not self.image_extract_dir:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
123 raise RuntimeError("Temp dirs not initialized before data prep.")
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
124
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
125 try:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
126 df = load_metadata_table(self.args.csv_file)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
127 logger.info(f"Loaded metadata file: {self.args.csv_file}")
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
128 except Exception:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
129 logger.error("Error loading metadata file", exc_info=True)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
130 raise
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
131
22
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
132 def resolve_column_name(value: Optional[str], columns, label: str) -> Optional[str]:
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
133 """Resolve Galaxy data_column index (1-based) to header if needed."""
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
134 if value is None:
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
135 return None
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
136 if str(value).isdigit():
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
137 idx = int(value) - 1
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
138 if 0 <= idx < len(columns):
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
139 resolved = columns[idx]
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
140 if value in columns:
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
141 logger.warning(
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
142 "%s column value '%s' matches a header, but Galaxy data_column "
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
143 "inputs are interpreted as 1-based indices; using column #%s header '%s'.",
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
144 label,
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
145 value,
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
146 idx + 1,
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
147 resolved,
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
148 )
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
149 logger.info(
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
150 "%s column '%s' not found; using column #%s header '%s' instead.",
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
151 label,
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
152 value,
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
153 idx + 1,
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
154 resolved,
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
155 )
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
156 return resolved
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
157 raise ValueError(
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
158 f"{label} column index '{value}' is out of range for dataset with "
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
159 f"{len(columns)} columns. Update the XML selections or rename your columns."
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
160 )
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
161 if value in columns:
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
162 return value
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
163 return value
15
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
164
22
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
165 label_col = resolve_column_name(self.args.target_column, df.columns, "Target") or LABEL_COLUMN_NAME
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
166 image_col = resolve_column_name(self.args.image_column, df.columns, "Image") or IMAGE_PATH_COLUMN_NAME
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
167 self.args.sample_id_column = resolve_column_name(
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
168 self.args.sample_id_column, df.columns, "Sample ID"
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
169 )
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
170
ccbcc012d78d planemo upload for repository https://github.com/goeckslab/gleam.git commit d8308188856a1dd8d40d3ccfe4c4ec03c4fd36de
goeckslab
parents: 21
diff changeset
171 # Remember the resolved columns for reporting
15
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
172 self.args.report_target_column = label_col
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
173 self.args.report_image_column = image_col
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
174
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
175 missing_cols = []
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
176 if label_col not in df.columns:
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
177 missing_cols.append(label_col)
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
178 if image_col not in df.columns:
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
179 missing_cols.append(image_col)
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
180 if missing_cols:
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
181 raise ValueError(
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
182 f"Missing required column(s) in metadata: {', '.join(missing_cols)}. "
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
183 "Update the XML selections or rename your columns."
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
184 )
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
185
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
186 if label_col != LABEL_COLUMN_NAME:
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
187 df = df.rename(columns={label_col: LABEL_COLUMN_NAME})
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
188 if image_col != IMAGE_PATH_COLUMN_NAME:
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
189 df = df.rename(columns={image_col: IMAGE_PATH_COLUMN_NAME})
12
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
190
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
191 try:
15
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
192 df = self._map_image_paths_with_search(df)
12
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
193 except Exception:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
194 logger.error("Error updating image paths", exc_info=True)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
195 raise
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
196
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
197 if SPLIT_COLUMN_NAME in df.columns:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
198 df, split_config, split_info = self._process_fixed_split(df)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
199 else:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
200 logger.info("No split column; creating stratified random split")
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
201 df = create_stratified_random_split(
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
202 df=df,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
203 split_column=SPLIT_COLUMN_NAME,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
204 split_probabilities=self.args.split_probabilities,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
205 random_state=self.args.random_seed,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
206 label_column=LABEL_COLUMN_NAME,
20
64872c48a21f planemo upload for repository https://github.com/goeckslab/gleam.git commit d4b122527a2402e43512f9b4bda00c7bff0ec9e9
goeckslab
parents: 17
diff changeset
207 group_column=self.args.sample_id_column,
12
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
208 )
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
209 split_config = {
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
210 "type": "fixed",
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
211 "column": SPLIT_COLUMN_NAME,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
212 }
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
213 split_info = (
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
214 f"No split column in CSV. Created stratified random split: "
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
215 f"{[int(p * 100) for p in self.args.split_probabilities]}% "
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
216 f"for train/val/test with balanced label distribution."
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
217 )
20
64872c48a21f planemo upload for repository https://github.com/goeckslab/gleam.git commit d4b122527a2402e43512f9b4bda00c7bff0ec9e9
goeckslab
parents: 17
diff changeset
218 if self.args.sample_id_column:
64872c48a21f planemo upload for repository https://github.com/goeckslab/gleam.git commit d4b122527a2402e43512f9b4bda00c7bff0ec9e9
goeckslab
parents: 17
diff changeset
219 split_info += (
64872c48a21f planemo upload for repository https://github.com/goeckslab/gleam.git commit d4b122527a2402e43512f9b4bda00c7bff0ec9e9
goeckslab
parents: 17
diff changeset
220 f" Grouped by sample ID column '{self.args.sample_id_column}' "
64872c48a21f planemo upload for repository https://github.com/goeckslab/gleam.git commit d4b122527a2402e43512f9b4bda00c7bff0ec9e9
goeckslab
parents: 17
diff changeset
221 "to prevent data leakage."
64872c48a21f planemo upload for repository https://github.com/goeckslab/gleam.git commit d4b122527a2402e43512f9b4bda00c7bff0ec9e9
goeckslab
parents: 17
diff changeset
222 )
12
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
223
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
224 final_csv = self.temp_dir / TEMP_CSV_FILENAME
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
225
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
226 try:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
227 df.to_csv(final_csv, index=False)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
228 logger.info(f"Saved prepared data to {final_csv}")
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
229 except Exception:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
230 logger.error("Error saving prepared CSV", exc_info=True)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
231 raise
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
232
17
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
233 # Capture actual split counts for downstream reporting (avoids heuristic 70/10/20 tables)
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
234 try:
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
235 split_series = pd.to_numeric(df[SPLIT_COLUMN_NAME], errors="coerce")
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
236 split_series = split_series.dropna().astype(int)
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
237 self.split_counts = {int(k): int(v) for k, v in split_series.value_counts().to_dict().items()}
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
238 if LABEL_COLUMN_NAME in df.columns:
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
239 counts = (
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
240 df.dropna(subset=[LABEL_COLUMN_NAME])
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
241 .assign(**{SPLIT_COLUMN_NAME: split_series})
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
242 .groupby([LABEL_COLUMN_NAME, SPLIT_COLUMN_NAME])
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
243 .size()
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
244 .unstack(fill_value=0)
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
245 .sort_index()
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
246 )
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
247 self.label_split_counts = [
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
248 {
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
249 "label": str(lbl),
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
250 "train": int(row.get(0, 0)),
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
251 "validation": int(row.get(1, 0)),
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
252 "test": int(row.get(2, 0)),
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
253 }
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
254 for lbl, row in counts.iterrows()
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
255 ]
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
256 except Exception:
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
257 logger.warning("Unable to capture split counts for reporting", exc_info=True)
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
258 self.label_split_counts = []
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
259 self.split_counts = {}
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
260
12
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
261 self._capture_label_metadata(df)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
262
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
263 return final_csv, split_config, split_info
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
264
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
265 def _capture_label_metadata(self, df: pd.DataFrame) -> None:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
266 """Record basic statistics about the label column for downstream hints."""
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
267 metadata: Dict[str, Any] = {}
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
268 try:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
269 series = df[LABEL_COLUMN_NAME]
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
270 non_na = series.dropna()
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
271 unique_values = non_na.unique().tolist()
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
272 num_unique = int(len(unique_values))
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
273 is_numeric = bool(ptypes.is_numeric_dtype(series.dtype))
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
274 metadata = {
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
275 "num_unique": num_unique,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
276 "dtype": str(series.dtype),
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
277 "unique_values_preview": [str(v) for v in unique_values[:10]],
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
278 "is_numeric": is_numeric,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
279 "is_binary": num_unique == 2,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
280 "is_numeric_binary": is_numeric and num_unique == 2,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
281 "likely_regression": bool(is_numeric and num_unique > 10),
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
282 }
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
283 if metadata["is_binary"]:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
284 logger.info(
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
285 "Detected binary label column with unique values: %s",
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
286 metadata["unique_values_preview"],
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
287 )
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
288 except Exception:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
289 logger.warning("Unable to capture label metadata.", exc_info=True)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
290 metadata = {}
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
291
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
292 self.label_metadata = metadata
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
293 self.output_type_hint = "binary" if metadata.get("is_binary") else None
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
294
15
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
295 def _map_image_paths_with_search(self, df: pd.DataFrame) -> pd.DataFrame:
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
296 """Map image identifiers to actual files by searching the extracted directory."""
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
297 if not self.image_extract_dir:
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
298 raise RuntimeError("Image directory is not initialized.")
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
299
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
300 # Build lookup maps for fast resolution by stem or full name
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
301 lookup_by_stem = {}
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
302 lookup_by_name = {}
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
303 for fpath in self.image_extract_dir.rglob("*"):
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
304 if fpath.is_file():
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
305 stem_key = fpath.stem.lower()
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
306 name_key = fpath.name.lower()
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
307 # Prefer first encounter; warn on collisions
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
308 if stem_key in lookup_by_stem and lookup_by_stem[stem_key] != fpath:
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
309 logger.warning(
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
310 "Multiple files share the same stem '%s'. Using '%s'.",
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
311 stem_key,
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
312 lookup_by_stem[stem_key],
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
313 )
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
314 else:
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
315 lookup_by_stem[stem_key] = fpath
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
316 if name_key in lookup_by_name and lookup_by_name[name_key] != fpath:
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
317 logger.warning(
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
318 "Multiple files share the same name '%s'. Using '%s'.",
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
319 name_key,
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
320 lookup_by_name[name_key],
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
321 )
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
322 else:
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
323 lookup_by_name[name_key] = fpath
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
324
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
325 resolved_paths = []
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
326 missing_count = 0
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
327 missing_samples = []
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
328
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
329 for raw in df[IMAGE_PATH_COLUMN_NAME]:
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
330 raw_str = str(raw)
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
331 name_key = Path(raw_str).name.lower()
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
332 stem_key = Path(raw_str).stem.lower()
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
333 resolved = lookup_by_name.get(name_key) or lookup_by_stem.get(stem_key)
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
334
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
335 if resolved is None:
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
336 missing_count += 1
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
337 missing_samples.append(raw_str)
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
338 resolved_paths.append(pd.NA)
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
339 continue
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
340
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
341 try:
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
342 rel_path = resolved.relative_to(self.image_extract_dir)
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
343 except ValueError:
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
344 rel_path = resolved
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
345 resolved_paths.append(str(Path("images") / rel_path))
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
346
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
347 if missing_count:
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
348 logger.warning(
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
349 "Unable to locate %d image(s) from the metadata in the extracted images directory.",
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
350 missing_count,
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
351 )
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
352 preview = ", ".join(missing_samples[:5])
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
353 logger.warning("Missing samples (showing up to 5): %s", preview)
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
354
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
355 df = df.copy()
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
356 df[IMAGE_PATH_COLUMN_NAME] = resolved_paths
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
357 df = df.dropna(subset=[IMAGE_PATH_COLUMN_NAME]).reset_index(drop=True)
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
358 return df
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
359
12
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
360 # Removed duplicate method
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
361
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
362 def _detect_image_dimensions(self) -> Tuple[int, int]:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
363 """Detect image dimensions from the first image in the dataset."""
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
364 try:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
365 import zipfile
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
366 from PIL import Image
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
367 import io
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
368
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
369 # Check if image_zip is provided
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
370 if not self.args.image_zip:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
371 logger.warning("No image zip provided, using default 224x224")
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
372 return 224, 224
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
373
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
374 # Extract first image to detect dimensions
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
375 with zipfile.ZipFile(self.args.image_zip, 'r') as z:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
376 image_files = [f for f in z.namelist() if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
377 if not image_files:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
378 logger.warning("No image files found in zip, using default 224x224")
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
379 return 224, 224
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
380
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
381 # Check first image
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
382 with z.open(image_files[0]) as f:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
383 img = Image.open(io.BytesIO(f.read()))
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
384 width, height = img.size
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
385 logger.info(f"Detected image dimensions: {width}x{height}")
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
386 return height, width # Return as (height, width) to match encoder config
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
387
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
388 except Exception as e:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
389 logger.warning(f"Error detecting image dimensions: {e}, using default 224x224")
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
390 return 224, 224
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
391
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
392 def _cleanup_temp_dirs(self) -> None:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
393 if self.temp_dir and self.temp_dir.exists():
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
394 logger.info(f"Cleaning up temp directory: {self.temp_dir}")
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
395 # Don't clean up for debugging
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
396 shutil.rmtree(self.temp_dir, ignore_errors=True)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
397 self.temp_dir = None
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
398 self.image_extract_dir = None
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
399
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
400 def run(self) -> None:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
401 """Execute the full workflow end-to-end."""
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
402 logger.info("Starting workflow...")
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
403 self.args.output_dir.mkdir(parents=True, exist_ok=True)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
404
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
405 try:
21
d5c582cf74bc planemo upload for repository https://github.com/goeckslab/gleam.git commit eed8c1e1d99a8a0c8f3a6bfdd8af48a5bfa19444
goeckslab
parents: 20
diff changeset
406 if getattr(self.args, "torch_home", None):
d5c582cf74bc planemo upload for repository https://github.com/goeckslab/gleam.git commit eed8c1e1d99a8a0c8f3a6bfdd8af48a5bfa19444
goeckslab
parents: 20
diff changeset
407 torch_home = Path(self.args.torch_home).expanduser().resolve()
d5c582cf74bc planemo upload for repository https://github.com/goeckslab/gleam.git commit eed8c1e1d99a8a0c8f3a6bfdd8af48a5bfa19444
goeckslab
parents: 20
diff changeset
408 torch_home.mkdir(parents=True, exist_ok=True)
d5c582cf74bc planemo upload for repository https://github.com/goeckslab/gleam.git commit eed8c1e1d99a8a0c8f3a6bfdd8af48a5bfa19444
goeckslab
parents: 20
diff changeset
409 os.environ["TORCH_HOME"] = str(torch_home)
d5c582cf74bc planemo upload for repository https://github.com/goeckslab/gleam.git commit eed8c1e1d99a8a0c8f3a6bfdd8af48a5bfa19444
goeckslab
parents: 20
diff changeset
410 try:
d5c582cf74bc planemo upload for repository https://github.com/goeckslab/gleam.git commit eed8c1e1d99a8a0c8f3a6bfdd8af48a5bfa19444
goeckslab
parents: 20
diff changeset
411 import torch
d5c582cf74bc planemo upload for repository https://github.com/goeckslab/gleam.git commit eed8c1e1d99a8a0c8f3a6bfdd8af48a5bfa19444
goeckslab
parents: 20
diff changeset
412
d5c582cf74bc planemo upload for repository https://github.com/goeckslab/gleam.git commit eed8c1e1d99a8a0c8f3a6bfdd8af48a5bfa19444
goeckslab
parents: 20
diff changeset
413 torch.hub.set_dir(str(torch_home))
d5c582cf74bc planemo upload for repository https://github.com/goeckslab/gleam.git commit eed8c1e1d99a8a0c8f3a6bfdd8af48a5bfa19444
goeckslab
parents: 20
diff changeset
414 except Exception as exc:
d5c582cf74bc planemo upload for repository https://github.com/goeckslab/gleam.git commit eed8c1e1d99a8a0c8f3a6bfdd8af48a5bfa19444
goeckslab
parents: 20
diff changeset
415 logger.warning("Unable to set Torch Hub cache dir: %s", exc)
d5c582cf74bc planemo upload for repository https://github.com/goeckslab/gleam.git commit eed8c1e1d99a8a0c8f3a6bfdd8af48a5bfa19444
goeckslab
parents: 20
diff changeset
416
12
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
417 self._create_temp_dirs()
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
418 self._extract_images()
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
419 csv_path, split_cfg, split_info = self._prepare_data()
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
420
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
421 use_pretrained = self.args.use_pretrained or self.args.fine_tune
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
422
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
423 backend_args = {
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
424 "model_name": self.args.model_name,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
425 "fine_tune": self.args.fine_tune,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
426 "use_pretrained": use_pretrained,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
427 "epochs": self.args.epochs,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
428 "batch_size": self.args.batch_size,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
429 "preprocessing_num_processes": self.args.preprocessing_num_processes,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
430 "split_probabilities": self.args.split_probabilities,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
431 "learning_rate": self.args.learning_rate,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
432 "random_seed": self.args.random_seed,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
433 "early_stop": self.args.early_stop,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
434 "label_column_data_path": csv_path,
17
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
435 "label_split_counts": self.label_split_counts,
db9be962dc13 planemo upload for repository https://github.com/goeckslab/gleam.git commit 9db874612b0c3e4f53d639459fe789b762660cd6
goeckslab
parents: 15
diff changeset
436 "split_counts": self.split_counts,
12
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
437 "augmentation": self.args.augmentation,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
438 "image_resize": self.args.image_resize,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
439 "image_zip": self.args.image_zip,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
440 "threshold": self.args.threshold,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
441 "label_metadata": self.label_metadata,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
442 "output_type_hint": self.output_type_hint,
15
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
443 "validation_metric": self.args.validation_metric,
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
444 "target_column": getattr(self.args, "report_target_column", LABEL_COLUMN_NAME),
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
445 "image_column": getattr(self.args, "report_image_column", IMAGE_PATH_COLUMN_NAME),
12
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
446 }
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
447 yaml_str = self.backend.prepare_config(backend_args, split_cfg)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
448
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
449 config_file = self.temp_dir / TEMP_CONFIG_FILENAME
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
450 config_file.write_text(yaml_str)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
451 logger.info(f"Wrote backend config: {config_file}")
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
452
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
453 ran_ok = True
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
454 try:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
455 # Run Ludwig experiment with absolute paths to avoid working directory issues
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
456 self.backend.run_experiment(
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
457 csv_path,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
458 config_file,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
459 self.args.output_dir,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
460 self.args.random_seed,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
461 )
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
462 except Exception:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
463 logger.error("Workflow execution failed", exc_info=True)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
464 ran_ok = False
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
465
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
466 if ran_ok:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
467 logger.info("Workflow completed successfully.")
15
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
468 # Convert predictions parquet → csv
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
469 self.backend.convert_parquet_to_csv(self.args.output_dir)
d17e3a1b8659 planemo upload for repository https://github.com/goeckslab/gleam.git commit bc50fef8acb44aca15d0a1746e6c0c967da5bb17
goeckslab
parents: 12
diff changeset
470 logger.info("Converted Parquet to CSV.")
12
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
471 # Generate a very small set of plots to conserve disk space
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
472 self.backend.generate_plots(self.args.output_dir)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
473 # Build HTML report (robust to missing metrics)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
474 report_file = self.backend.generate_html_report(
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
475 "Image Classification Results",
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
476 self.args.output_dir,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
477 backend_args,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
478 split_info,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
479 )
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
480 logger.info(f"HTML report generated at: {report_file}")
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
481 # Post-process cleanup to reduce disk footprint for subsequent tests
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
482 try:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
483 self._postprocess_cleanup(self.args.output_dir)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
484 except Exception as cleanup_err:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
485 logger.warning(f"Cleanup step failed: {cleanup_err}")
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
486 else:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
487 # Fallback: create minimal outputs so downstream steps can proceed
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
488 logger.warning("Falling back to minimal outputs due to runtime failure.")
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
489 try:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
490 self._reset_output_dir(self.args.output_dir)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
491 except Exception as reset_err:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
492 logger.warning(
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
493 "Unable to clear previous outputs before fallback: %s",
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
494 reset_err,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
495 )
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
496
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
497 try:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
498 self._create_minimal_outputs(self.args.output_dir, csv_path)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
499 # Even in fallback, produce an HTML shell so tests find required text
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
500 report_file = self.backend.generate_html_report(
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
501 "Image Classification Results",
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
502 self.args.output_dir,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
503 backend_args,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
504 split_info,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
505 )
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
506 logger.info(f"HTML report (fallback) generated at: {report_file}")
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
507 except Exception as fb_err:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
508 logger.error(f"Failed to build fallback outputs: {fb_err}")
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
509 raise
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
510
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
511 except Exception:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
512 logger.error("Workflow execution failed", exc_info=True)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
513 raise
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
514 finally:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
515 self._cleanup_temp_dirs()
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
516
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
517 def _postprocess_cleanup(self, output_dir: Path) -> None:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
518 """Remove large intermediates and caches to conserve disk space across tests."""
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
519 output_dir = Path(output_dir)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
520 exp_dirs = sorted(
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
521 output_dir.glob("experiment_run*"),
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
522 key=lambda p: p.stat().st_mtime,
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
523 )
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
524 if exp_dirs:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
525 exp_dir = exp_dirs[-1]
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
526 # Remove training checkpoints directory if present
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
527 ckpt_dir = exp_dir / "model" / "training_checkpoints"
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
528 if ckpt_dir.exists():
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
529 shutil.rmtree(ckpt_dir, ignore_errors=True)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
530 # Remove predictions parquet once CSV is generated
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
531 parquet_path = exp_dir / PREDICTIONS_PARQUET_FILE_NAME
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
532 if parquet_path.exists():
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
533 try:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
534 parquet_path.unlink()
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
535 except Exception:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
536 pass
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
537
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
538 self._clear_model_caches()
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
539
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
540 def _clear_model_caches(self) -> None:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
541 """Delete large framework caches to free up disk space."""
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
542 cache_paths = [
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
543 Path.cwd() / "home" / ".cache" / "torch" / "hub",
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
544 Path.home() / ".cache" / "torch" / "hub",
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
545 Path.cwd() / "home" / ".cache" / "huggingface",
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
546 ]
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
547
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
548 for cache_path in cache_paths:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
549 if cache_path.exists():
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
550 shutil.rmtree(cache_path, ignore_errors=True)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
551
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
552 def _reset_output_dir(self, output_dir: Path) -> None:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
553 """Remove partial experiment outputs and caches before building fallbacks."""
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
554 output_dir = Path(output_dir)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
555 for exp_dir in output_dir.glob("experiment_run*"):
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
556 if exp_dir.is_dir():
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
557 shutil.rmtree(exp_dir, ignore_errors=True)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
558
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
559 self._clear_model_caches()
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
560
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
561 def _create_minimal_outputs(self, output_dir: Path, prepared_csv_path: Path) -> None:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
562 """Create a minimal set of outputs so Galaxy can collect expected artifacts.
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
563
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
564 - experiment_run/
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
565 - predictions.csv (1 column)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
566 - visualizations/train/ (empty)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
567 - visualizations/test/ (empty)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
568 - model/
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
569 - model_weights/ (empty)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
570 - model_hyperparameters.json (stub)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
571 """
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
572 output_dir = Path(output_dir)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
573 exp_dir = output_dir / "experiment_run"
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
574 (exp_dir / "visualizations" / "train").mkdir(parents=True, exist_ok=True)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
575 (exp_dir / "visualizations" / "test").mkdir(parents=True, exist_ok=True)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
576 model_dir = exp_dir / "model"
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
577 (model_dir / "model_weights").mkdir(parents=True, exist_ok=True)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
578
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
579 # Stub JSON so the tool's copy step succeeds
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
580 try:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
581 (model_dir / "model_hyperparameters.json").write_text("{}\n")
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
582 except Exception:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
583 pass
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
584
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
585 # Create a small predictions.csv with exactly 1 column
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
586 try:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
587 df_all = pd.read_csv(prepared_csv_path)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
588 from constants import SPLIT_COLUMN_NAME # local import to avoid cycle at top
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
589 num_rows = int((df_all[SPLIT_COLUMN_NAME] == 2).sum()) if SPLIT_COLUMN_NAME in df_all.columns else 1
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
590 except Exception:
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
591 num_rows = 1
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
592 num_rows = max(1, num_rows)
bcfa2e234a80 planemo upload for repository https://github.com/goeckslab/gleam.git commit 96bab8325992d16fcaad8e0a4dc4c62b00e2abc2
goeckslab
parents:
diff changeset
593 pd.DataFrame({"prediction": [0] * num_rows}).to_csv(exp_dir / "predictions.csv", index=False)