annotate multimodal_learner.py @ 6:871957823d0c draft default tip

planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
author goeckslab
date Mon, 26 Jan 2026 18:44:07 +0000
parents 975512caae22
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
1 #!/usr/bin/env python
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
2 """
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
3 Main entrypoint for AutoGluon multimodal training wrapper.
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
4 """
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
5
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
6 import argparse
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
7 import logging
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
8 import os
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
9 import sys
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
10 from typing import List, Optional
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
11
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
12 import pandas as pd
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
13 from metrics_logic import aggregate_metrics
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
14 from plot_logic import infer_problem_type
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
15 from report_utils import write_outputs
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
16 from sklearn.model_selection import KFold, StratifiedKFold
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
17 from split_logic import split_dataset
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
18 from test_pipeline import run_autogluon_test_experiment
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
19 from training_pipeline import autogluon_hyperparameters, handle_missing_images, run_autogluon_experiment
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
20 # ------------------------------------------------------------------
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
21 # Local imports (shared helpers from the project's utils module)
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
22 # ------------------------------------------------------------------
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
23 from utils import (
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
24 absolute_path_expander,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
25 enable_deterministic_mode,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
26 enable_tensor_cores_if_available,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
27 ensure_local_tmp,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
28 load_file,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
29 prepare_image_search_dirs,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
30 set_seeds,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
31 str2bool,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
32 )
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
33
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
34 # ------------------------------------------------------------------
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
35 # Logger setup
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
36 # ------------------------------------------------------------------
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
37 logger = logging.getLogger(__name__)
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
38
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
39
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
40 # ------------------------------------------------------------------
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
41 # Argument parsing and validation
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
42 # ------------------------------------------------------------------
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
def parse_args(argv=None):
    """Parse and validate the command-line arguments of the training wrapper.

    Args:
        argv: Sequence of CLI tokens to parse. ``None`` lets argparse fall
            back to ``sys.argv[1:]``.

    Returns:
        The populated ``argparse.Namespace``. Tokens the parser does not
        recognise are not an error: they are logged as a warning and dropped.

    Raises:
        SystemExit: via ``parser.error`` when a required option is missing or
            when one of the validation rules below is violated.
    """
    parser = argparse.ArgumentParser(description="Train & report an AutoGluon model")

    # ----------------------------- Inputs / outputs -----------------------------
    parser.add_argument("--input_csv_train", dest="train_dataset", required=True)
    parser.add_argument("--input_csv_test", dest="test_dataset", default=None)
    parser.add_argument("--target_column", required=True)
    parser.add_argument("--output_json", default="results.json")
    parser.add_argument("--output_html", default="report.html")
    parser.add_argument("--output_config", default=None)
    parser.add_argument("--images_zip", nargs="*", default=None,
                        help="One or more ZIP files that contain image assets")
    parser.add_argument("--missing_image_strategy", default="false",
                        help="true/false: remove rows with missing images or use placeholder")

    # ----------------------------- Training controls ----------------------------
    parser.add_argument("--threshold", type=float, default=None)
    parser.add_argument("--time_limit", type=int, default=None)
    parser.add_argument("--deterministic", action="store_true", default=False,
                        help="Enable deterministic algorithms to reduce run-to-run variance")
    parser.add_argument("--random_seed", type=int, default=42)
    # Kept as a string ("true"/"false"); it is compared case-insensitively below.
    parser.add_argument("--cross_validation", type=str, default="false")
    parser.add_argument("--num_folds", type=int, default=5)
    parser.add_argument("--epochs", type=int, default=None)
    parser.add_argument("--learning_rate", type=float, default=None)
    parser.add_argument("--batch_size", type=int, default=None)
    parser.add_argument("--num_workers", type=int, default=None,
                        help="DataLoader worker count (0 disables multiprocessing).")
    parser.add_argument("--num_workers_eval", type=int, default=None,
                        help="DataLoader workers for evaluation; defaults to --num_workers.")
    parser.add_argument("--backbone_image", type=str, default="swin_base_patch4_window7_224")
    parser.add_argument("--backbone_text", type=str, default="microsoft/deberta-v3-base")

    # ----------------------------- Data splitting -------------------------------
    parser.add_argument("--validation_size", type=float, default=0.2)
    parser.add_argument("--split_probabilities", type=float, nargs=3,
                        default=[0.7, 0.1, 0.2], metavar=("train", "val", "test"))
    parser.add_argument("--sample_id_column", default=None)

    # ----------------------------- AutoGluon options ----------------------------
    parser.add_argument("--preset", choices=["medium_quality", "high_quality", "best_quality"],
                        default="medium_quality")
    parser.add_argument("--eval_metric", default="roc_auc")
    parser.add_argument("--hyperparameters", default=None)

    # parse_known_args (rather than parse_args) so unexpected tokens are
    # reported but do not abort the run.
    args, unknown = parser.parse_known_args(argv)
    if unknown:
        logger.warning("Ignoring unknown CLI tokens: %s", unknown)

    # -------------------------- Validation --------------------------
    if not (0.0 <= args.validation_size <= 1.0):
        parser.error("--validation_size must be in [0, 1]")

    # nargs=3 (and the 3-element default) already fix the length, so only the
    # values need checking. Each one must be a real probability: a sum check
    # alone lets through negative entries such as [1.5, -0.3, -0.2] and also
    # NaN, because every comparison against NaN is False.
    probabilities = args.split_probabilities
    each_in_range = all(0.0 <= p <= 1.0 for p in probabilities)
    if not each_in_range or abs(sum(probabilities) - 1.0) > 1e-6:
        parser.error("--split_probabilities must be three numbers summing to 1.0")

    if args.cross_validation.lower() == "true" and (args.num_folds < 2):
        parser.error("--num_folds must be >= 2 when --cross_validation is true")

    return args
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
94
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
95
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
96 def run_cross_validation(
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
97 args,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
98 df_full: pd.DataFrame,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
99 test_dataset: Optional[pd.DataFrame],
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
100 image_cols: List[str],
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
101 ag_config: dict,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
102 ):
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
103 """Cross-validation loop returning aggregated metrics and last predictor."""
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
104 df_full = df_full.drop(columns=["split"], errors="ignore")
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
105 y = df_full[args.target_column]
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
106 try:
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
107 use_stratified = y.dtype == object or y.nunique() <= 20
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
108 except Exception:
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
109 use_stratified = False
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
110
3
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
111 if args.sample_id_column and args.sample_id_column in df_full.columns:
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
112 groups = df_full[args.sample_id_column]
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
113 if use_stratified:
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
114 try:
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
115 from sklearn.model_selection import StratifiedGroupKFold
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
116
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
117 kf = StratifiedGroupKFold(
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
118 n_splits=int(args.num_folds),
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
119 shuffle=True,
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
120 random_state=int(args.random_seed),
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
121 )
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
122 except Exception as exc:
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
123 logger.warning(
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
124 "StratifiedGroupKFold unavailable (%s); falling back to GroupKFold.",
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
125 exc,
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
126 )
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
127 from sklearn.model_selection import GroupKFold
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
128
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
129 kf = GroupKFold(n_splits=int(args.num_folds))
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
130 use_stratified = False
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
131 else:
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
132 from sklearn.model_selection import GroupKFold
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
133
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
134 kf = GroupKFold(n_splits=int(args.num_folds))
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
135 else:
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
136 kf = StratifiedKFold(n_splits=int(args.num_folds), shuffle=True, random_state=int(args.random_seed)) if use_stratified else KFold(n_splits=int(args.num_folds), shuffle=True, random_state=int(args.random_seed))
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
137
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
138 if args.sample_id_column and test_dataset is not None:
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
139 test_dataset = test_dataset.drop(columns=[args.sample_id_column], errors="ignore")
0
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
140
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
141 raw_folds = []
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
142 ag_folds = []
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
143 folds_info = []
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
144 last_predictor = None
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
145 last_data_ctx = None
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
146
3
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
147 if args.sample_id_column and args.sample_id_column in df_full.columns:
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
148 split_iter = kf.split(df_full, y if use_stratified else None, groups)
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
149 else:
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
150 split_iter = kf.split(df_full, y if use_stratified else None)
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
151
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
152 for fold_idx, (train_idx, val_idx) in enumerate(split_iter, start=1):
0
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
153 logger.info(f"CV fold {fold_idx}/{args.num_folds}")
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
154 df_tr = df_full.iloc[train_idx].copy()
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
155 df_va = df_full.iloc[val_idx].copy()
3
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
156 if args.sample_id_column:
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
157 df_tr = df_tr.drop(columns=[args.sample_id_column], errors="ignore")
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
158 df_va = df_va.drop(columns=[args.sample_id_column], errors="ignore")
0
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
159
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
160 df_tr["split"] = "train"
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
161 df_va["split"] = "val"
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
162 fold_dataset = pd.concat([df_tr, df_va], ignore_index=True)
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
163
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
164 predictor_fold, data_ctx = run_autogluon_experiment(
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
165 train_dataset=fold_dataset,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
166 test_dataset=test_dataset,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
167 target_column=args.target_column,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
168 image_columns=image_cols,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
169 ag_config=ag_config,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
170 )
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
171 last_predictor = predictor_fold
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
172 last_data_ctx = data_ctx
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
173 problem_type = infer_problem_type(predictor_fold, df_tr, args.target_column)
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
174 eval_results = run_autogluon_test_experiment(
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
175 predictor=predictor_fold,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
176 data_ctx=data_ctx,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
177 target_column=args.target_column,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
178 eval_metric=args.eval_metric,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
179 ag_config=ag_config,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
180 problem_type=problem_type,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
181 )
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
182
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
183 raw_metrics_fold = eval_results.get("raw_metrics", {})
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
184 ag_by_split_fold = eval_results.get("ag_eval", {})
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
185 raw_folds.append(raw_metrics_fold)
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
186 ag_folds.append(ag_by_split_fold)
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
187 folds_info.append(
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
188 {
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
189 "fold": int(fold_idx),
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
190 "predictor_path": getattr(predictor_fold, "path", None),
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
191 "raw_metrics": raw_metrics_fold,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
192 "ag_eval": ag_by_split_fold,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
193 }
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
194 )
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
195
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
196 raw_metrics_mean, raw_metrics_std = aggregate_metrics(raw_folds)
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
197 ag_by_split_mean, ag_by_split_std = aggregate_metrics(ag_folds)
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
198 return (
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
199 last_predictor,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
200 raw_metrics_mean,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
201 ag_by_split_mean,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
202 raw_folds,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
203 ag_folds,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
204 raw_metrics_std,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
205 ag_by_split_std,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
206 folds_info,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
207 last_data_ctx,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
208 )
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
209
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
210
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
211 # ------------------------------------------------------------------
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
212 # Main execution
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
213 # ------------------------------------------------------------------
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
214 def main():
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
215 args = parse_args()
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
216
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
217 # ------------------------------------------------------------------
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
218 # Debug output
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
219 # ------------------------------------------------------------------
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
220 logger.info("=== AutoGluon Training Wrapper Started ===")
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
221 logger.info(f"Working directory: {os.getcwd()}")
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
222 logger.info(f"Command line: {' '.join(sys.argv)}")
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
223 logger.info(f"Parsed args: {vars(args)}")
4
de753cf07008 planemo upload for repository https://github.com/goeckslab/gleam.git commit 528178a2b2ca39924fd596859fe658aeaf984e6d
goeckslab
parents: 3
diff changeset
224 logger.info("Cache dirs: TORCH_HOME=%s HF_HOME=%s HUGGINGFACE_HUB_CACHE=%s",
de753cf07008 planemo upload for repository https://github.com/goeckslab/gleam.git commit 528178a2b2ca39924fd596859fe658aeaf984e6d
goeckslab
parents: 3
diff changeset
225 os.environ.get("TORCH_HOME"),
de753cf07008 planemo upload for repository https://github.com/goeckslab/gleam.git commit 528178a2b2ca39924fd596859fe658aeaf984e6d
goeckslab
parents: 3
diff changeset
226 os.environ.get("HF_HOME"),
de753cf07008 planemo upload for repository https://github.com/goeckslab/gleam.git commit 528178a2b2ca39924fd596859fe658aeaf984e6d
goeckslab
parents: 3
diff changeset
227 os.environ.get("HUGGINGFACE_HUB_CACHE"))
0
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
228
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
229 # ------------------------------------------------------------------
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
230 # Reproducibility & performance
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
231 # ------------------------------------------------------------------
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
232 set_seeds(args.random_seed)
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
233 if args.deterministic:
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
234 enable_deterministic_mode(args.random_seed)
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
235 logger.info("Deterministic mode enabled (seed=%s)", args.random_seed)
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
236 ensure_local_tmp()
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
237 enable_tensor_cores_if_available()
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
238
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
239 # ------------------------------------------------------------------
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
240 # Load datasets
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
241 # ------------------------------------------------------------------
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
242 train_dataset = load_file(args.train_dataset)
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
243 test_dataset = load_file(args.test_dataset) if args.test_dataset else None
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
244
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
245 logger.info(f"Train dataset loaded: {len(train_dataset)} rows")
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
246 if test_dataset is not None:
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
247 logger.info(f"Test dataset loaded: {len(test_dataset)} rows")
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
248
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
249 # ------------------------------------------------------------------
5
975512caae22 planemo upload for repository https://github.com/goeckslab/gleam.git commit e984bd965d46c5f9ee5c3beb7429f3fd4a91ee35
goeckslab
parents: 4
diff changeset
250 # Resolve columns by name; if Galaxy passed a numeric index,
0
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
251 # translate it to the corresponding header so downstream checks pass.
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
252 # Galaxy's data_column widget is 1-based.
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
253 # ------------------------------------------------------------------
5
975512caae22 planemo upload for repository https://github.com/goeckslab/gleam.git commit e984bd965d46c5f9ee5c3beb7429f3fd4a91ee35
goeckslab
parents: 4
diff changeset
def resolve_column_name(value, columns, label):
    """Translate a column selector into a header name from ``columns``.

    A selector made only of decimal digits is treated as a 1-based column
    index and converted to the header at that position; any other selector
    is returned unchanged so later code can treat it as a header name.

    Args:
        value: The selector as received from the command line (or ``None``).
        columns: Indexable collection of dataset headers supporting ``len()``
            and ``in``.
        label: Human-readable selector name used in log messages
            (for example ``"Target"``).

    Returns:
        ``None`` when ``value`` is ``None``; the header at the 1-based index
        when ``value`` is numeric and in range; otherwise ``value`` itself.

    Raises:
        SystemExit: With exit code 1 when a numeric selector is outside
            ``1..len(columns)``.
    """
    if value is None:
        return None

    text = str(value)
    # isdecimal() only accepts characters that int() can parse; isdigit()
    # also accepts e.g. superscript digits, which make int() raise ValueError.
    if not text.isdecimal():
        return value

    idx = int(text) - 1
    if not 0 <= idx < len(columns):
        logger.error(
            "Numeric %s index '%s' is out of range for dataset with %s columns.",
            label.lower(),
            value,
            len(columns),
        )
        sys.exit(1)

    resolved = columns[idx]
    if value in columns:
        # The selector is also a literal header: the index interpretation
        # wins, so make the ambiguity visible.
        logger.warning(
            "%s column value '%s' matches a header, but Galaxy data_column "
            "inputs are interpreted as 1-based indices; using column #%s header '%s'.",
            label,
            value,
            idx + 1,
            resolved,
        )
    else:
        # Only report "not found" when the selector really is not a header.
        logger.info(
            "%s column '%s' not found; using column #%s header '%s' instead.",
            label,
            value,
            idx + 1,
            resolved,
        )
    return resolved
975512caae22 planemo upload for repository https://github.com/goeckslab/gleam.git commit e984bd965d46c5f9ee5c3beb7429f3fd4a91ee35
goeckslab
parents: 4
diff changeset
286
975512caae22 planemo upload for repository https://github.com/goeckslab/gleam.git commit e984bd965d46c5f9ee5c3beb7429f3fd4a91ee35
goeckslab
parents: 4
diff changeset
287 args.target_column = resolve_column_name(args.target_column, train_dataset.columns, "Target")
975512caae22 planemo upload for repository https://github.com/goeckslab/gleam.git commit e984bd965d46c5f9ee5c3beb7429f3fd4a91ee35
goeckslab
parents: 4
diff changeset
288 args.sample_id_column = resolve_column_name(
975512caae22 planemo upload for repository https://github.com/goeckslab/gleam.git commit e984bd965d46c5f9ee5c3beb7429f3fd4a91ee35
goeckslab
parents: 4
diff changeset
289 args.sample_id_column, train_dataset.columns, "Sample ID"
975512caae22 planemo upload for repository https://github.com/goeckslab/gleam.git commit e984bd965d46c5f9ee5c3beb7429f3fd4a91ee35
goeckslab
parents: 4
diff changeset
290 )
0
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
291
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
292 # ------------------------------------------------------------------
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
293 # Image handling (ZIP extraction + absolute path expansion)
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
294 # ------------------------------------------------------------------
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
295 extracted_imgs_path = prepare_image_search_dirs(args)
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
296
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
297 image_cols = absolute_path_expander(train_dataset, extracted_imgs_path, None)
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
298 if test_dataset is not None:
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
299 absolute_path_expander(test_dataset, extracted_imgs_path, image_cols)
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
300
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
301 # ------------------------------------------------------------------
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
302 # Handle missing images
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
303 # ------------------------------------------------------------------
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
304 train_dataset = handle_missing_images(
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
305 train_dataset,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
306 image_columns=image_cols,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
307 strategy=args.missing_image_strategy,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
308 )
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
309 if test_dataset is not None:
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
310 test_dataset = handle_missing_images(
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
311 test_dataset,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
312 image_columns=image_cols,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
313 strategy=args.missing_image_strategy,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
314 )
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
315
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
316 logger.info(f"After cleanup → train: {len(train_dataset)}, test: {len(test_dataset) if test_dataset is not None else 0}")
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
317
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
318 # ------------------------------------------------------------------
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
319 # Dataset splitting logic (adds 'split' column to train_dataset)
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
320 # ------------------------------------------------------------------
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
321 split_dataset(
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
322 train_dataset=train_dataset,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
323 test_dataset=test_dataset,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
324 target_column=args.target_column,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
325 split_probabilities=args.split_probabilities,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
326 validation_size=args.validation_size,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
327 random_seed=args.random_seed,
3
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
328 sample_id_column=args.sample_id_column,
0
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
329 )
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
330
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
331 logger.info("Preprocessing complete — ready for AutoGluon training!")
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
332 logger.info(f"Final split counts:\n{train_dataset['split'].value_counts().sort_index()}")
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
333
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
334 # Verify target/image/text columns exist
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
335 if args.target_column not in train_dataset.columns:
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
336 logger.error(f"Target column '{args.target_column}' not found in training data.")
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
337 sys.exit(1)
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
338 if test_dataset is not None and args.target_column not in test_dataset.columns:
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
339 logger.error(f"Target column '{args.target_column}' not found in test data.")
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
340 sys.exit(1)
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
341
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
342 # Threshold is only meaningful for binary classification; ignore otherwise.
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
343 threshold_for_run = args.threshold
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
344 unique_labels = None
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
345 target_looks_binary = False
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
346 try:
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
347 unique_labels = train_dataset[args.target_column].nunique(dropna=True)
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
348 target_looks_binary = unique_labels == 2
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
349 except Exception:
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
350 logger.warning("Could not inspect target column '%s' for threshold validation; proceeding without binary check.", args.target_column)
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
351
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
352 if threshold_for_run is not None:
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
353 if target_looks_binary:
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
354 threshold_for_run = float(threshold_for_run)
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
355 logger.info("Applying custom decision threshold %.4f for binary evaluation.", threshold_for_run)
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
356 else:
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
357 logger.warning(
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
358 "Threshold %.3f provided but target '%s' does not appear binary (unique labels=%s); ignoring threshold.",
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
359 threshold_for_run,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
360 args.target_column,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
361 unique_labels if unique_labels is not None else "unknown",
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
362 )
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
363 threshold_for_run = None
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
364 args.threshold = threshold_for_run
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
365 # Image columns are auto-inferred; image_cols already resolved to absolute paths.
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
366 # ------------------------------------------------------------------
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
367 # Build AutoGluon configuration from CLI knobs
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
368 # ------------------------------------------------------------------
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
369 ag_config = autogluon_hyperparameters(
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
370 threshold=args.threshold,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
371 time_limit=args.time_limit,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
372 random_seed=args.random_seed,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
373 epochs=args.epochs,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
374 learning_rate=args.learning_rate,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
375 batch_size=args.batch_size,
6
871957823d0c planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents: 5
diff changeset
376 num_workers=args.num_workers,
871957823d0c planemo upload for repository https://github.com/goeckslab/gleam.git commit 6e49ad44dd8572382ee203926690a30d7e888203
goeckslab
parents: 5
diff changeset
377 num_workers_evaluation=args.num_workers_eval,
0
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
378 backbone_image=args.backbone_image,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
379 backbone_text=args.backbone_text,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
380 preset=args.preset,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
381 eval_metric=args.eval_metric,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
382 hyperparameters=args.hyperparameters,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
383 )
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
384 logger.info(f"AutoGluon config prepared: fit={ag_config.get('fit')}, hyperparameters keys={list(ag_config.get('hyperparameters', {}).keys())}")
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
385
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
386 cv_enabled = str2bool(args.cross_validation)
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
387 if cv_enabled:
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
388 (
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
389 predictor,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
390 raw_metrics,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
391 ag_by_split,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
392 raw_folds,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
393 ag_folds,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
394 raw_metrics_std,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
395 ag_by_split_std,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
396 folds_info,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
397 data_ctx,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
398 ) = run_cross_validation(
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
399 args=args,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
400 df_full=train_dataset,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
401 test_dataset=test_dataset,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
402 image_cols=image_cols,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
403 ag_config=ag_config,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
404 )
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
405 if predictor is None:
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
406 logger.error("All CV folds failed. Exiting.")
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
407 sys.exit(1)
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
408 eval_results = {
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
409 "raw_metrics": raw_metrics,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
410 "ag_eval": ag_by_split,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
411 "fit_summary": None,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
412 }
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
413 else:
3
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
414 # Drop sample-id column before training so it does not leak into modeling.
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
415 if args.sample_id_column:
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
416 train_dataset = train_dataset.drop(columns=[args.sample_id_column], errors="ignore")
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
417 if test_dataset is not None:
25bb80df7c0c planemo upload for repository https://github.com/goeckslab/gleam.git commit f0daa5846b336584c708d88f6d7f1b5ee8dc3093
goeckslab
parents: 0
diff changeset
418 test_dataset = test_dataset.drop(columns=[args.sample_id_column], errors="ignore")
0
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
419 predictor, data_ctx = run_autogluon_experiment(
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
420 train_dataset=train_dataset,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
421 test_dataset=test_dataset,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
422 target_column=args.target_column,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
423 image_columns=image_cols,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
424 ag_config=ag_config,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
425 )
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
426 logger.info("AutoGluon training finished. Model path: %s", getattr(predictor, "path", None))
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
427
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
428 # Evaluate predictor on Train/Val/Test splits
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
429 problem_type = infer_problem_type(predictor, train_dataset, args.target_column)
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
430 eval_results = run_autogluon_test_experiment(
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
431 predictor=predictor,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
432 data_ctx=data_ctx,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
433 target_column=args.target_column,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
434 eval_metric=args.eval_metric,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
435 ag_config=ag_config,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
436 problem_type=problem_type,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
437 )
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
438 raw_metrics = eval_results.get("raw_metrics", {})
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
439 ag_by_split = eval_results.get("ag_eval", {})
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
440 raw_folds = ag_folds = raw_metrics_std = ag_by_split_std = None
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
441
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
442 logger.info("Transparent metrics by split: %s", eval_results["raw_metrics"])
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
443 logger.info("AutoGluon evaluate() by split: %s", eval_results["ag_eval"])
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
444
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
445 if "problem_type" in eval_results:
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
446 problem_type_final = eval_results["problem_type"]
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
447 else:
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
448 problem_type_final = infer_problem_type(predictor, train_dataset, args.target_column)
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
449
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
450 write_outputs(
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
451 args=args,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
452 predictor=predictor,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
453 problem_type=problem_type_final,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
454 eval_results=eval_results,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
455 data_ctx=data_ctx,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
456 raw_folds=raw_folds,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
457 ag_folds=ag_folds,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
458 raw_metrics_std=raw_metrics_std,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
459 ag_by_split_std=ag_by_split_std,
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
460 )
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
461
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
462
375c36923da1 planemo upload for repository https://github.com/goeckslab/gleam.git commit 1c6c1ad7a1b2bd3645aa0eafa2167784820b52e0
goeckslab
parents:
diff changeset
if __name__ == "__main__":
    # Script entry point: set up root logging at INFO with a compact
    # "time | level | message" layout before handing control to main().
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s | %(levelname)s | %(message)s",
        datefmt="%H:%M:%S",
    )
    # Quiet noisy image parsing logs (e.g., PIL.PngImagePlugin debug streams)
    # by raising both the package logger and the PNG plugin logger to WARNING.
    for noisy_logger in ("PIL", "PIL.PngImagePlugin"):
        logging.getLogger(noisy_logger).setLevel(logging.WARNING)
    main()