diff pycaret_train.py @ 15:01e7c5481f13 draft default tip

planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
author goeckslab
date Mon, 19 Jan 2026 05:54:52 +0000
parents 49f73a3c12f3
children
line wrap: on
line diff
--- a/pycaret_train.py	Mon Dec 29 20:34:38 2025 +0000
+++ b/pycaret_train.py	Mon Jan 19 05:54:52 2026 +0000
@@ -134,6 +134,15 @@
         default=None,
         help="Metric used to select the best model (e.g. AUC, Accuracy, R2, RMSE).",
     )
+    parser.add_argument(
+        "--sample-id-column",
+        type=str,
+        default=None,
+        help=(
+            "Optional column name used to group samples during splitting "
+            "to prevent data leakage (e.g., patient_id or slide_id)."
+        ),
+    )
 
     args = parser.parse_args()
 
@@ -170,6 +179,7 @@
         "n_jobs": n_jobs,
         "probability_threshold": args.probability_threshold,
         "best_model_metric": args.best_model_metric,
+        "sample_id_column": args.sample_id_column,
     }
     LOG.info(f"Model kwargs: {model_kwargs}")