diff image_workflow.py @ 20:64872c48a21f draft

planemo upload for repository https://github.com/goeckslab/gleam.git commit d4b122527a2402e43512f9b4bda00c7bff0ec9e9
author goeckslab
date Tue, 06 Jan 2026 15:35:11 +0000
parents db9be962dc13
children d5c582cf74bc
line wrap: on
line diff
--- a/image_workflow.py	Thu Dec 18 16:59:58 2025 +0000
+++ b/image_workflow.py	Tue Jan 06 15:35:11 2026 +0000
@@ -168,6 +168,7 @@
                 split_probabilities=self.args.split_probabilities,
                 random_state=self.args.random_seed,
                 label_column=LABEL_COLUMN_NAME,
+                group_column=self.args.sample_id_column,
             )
             split_config = {
                 "type": "fixed",
@@ -178,6 +179,11 @@
                 f"{[int(p * 100) for p in self.args.split_probabilities]}% "
                 f"for train/val/test with balanced label distribution."
             )
+            if self.args.sample_id_column:
+                split_info += (
+                    f" Grouped by sample ID column '{self.args.sample_id_column}' "
+                    "to prevent data leakage."
+                )
 
         final_csv = self.temp_dir / TEMP_CSV_FILENAME