annotate base_model_trainer.py @ 15:01e7c5481f13 draft default tip

planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
author goeckslab
date Mon, 19 Jan 2026 05:54:52 +0000
parents bf0df21a1ea3
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
1 import base64
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
2 import logging
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
3 import tempfile
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
4 from pathlib import Path
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
5
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
6 import h5py
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
7 import joblib
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
8 import numpy as np
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
9 import pandas as pd
2
77c88226bfde planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents: 0
diff changeset
10 from feature_help_modal import get_feature_metrics_help_modal
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
11 from feature_importance import FeatureImportanceAnalyzer
13
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
12 from sklearn.metrics import (
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
13 accuracy_score,
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
14 average_precision_score,
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
15 confusion_matrix,
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
16 f1_score,
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
17 matthews_corrcoef,
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
18 precision_score,
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
19 recall_score,
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
20 roc_auc_score,
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
21 )
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
22 from utils import (
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
23 add_hr_to_html,
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
24 add_plot_to_html,
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
25 build_tabbed_html,
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
26 encode_image_to_base64,
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
27 get_html_closing,
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
28 get_html_template,
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
29 )
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
30
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
# Module-level logging setup: grab this module's logger, then configure
# logging at DEBUG level as an import-time side effect.
LOG = logging.getLogger(__name__)
logging.basicConfig(level=logging.DEBUG)
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
33
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
34
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
35 class BaseModelTrainer:
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
36 def __init__(
2
77c88226bfde planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents: 0
diff changeset
37 self,
77c88226bfde planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents: 0
diff changeset
38 input_file,
77c88226bfde planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents: 0
diff changeset
39 target_col,
77c88226bfde planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents: 0
diff changeset
40 output_dir,
77c88226bfde planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents: 0
diff changeset
41 task_type,
77c88226bfde planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents: 0
diff changeset
42 random_seed,
77c88226bfde planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents: 0
diff changeset
43 test_file=None,
77c88226bfde planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents: 0
diff changeset
44 **kwargs,
77c88226bfde planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents: 0
diff changeset
45 ):
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
46 self.exp = None
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
47 self.input_file = input_file
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
48 self.target_col = target_col
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
49 self.output_dir = output_dir
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
50 self.task_type = task_type
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
51 self.random_seed = random_seed
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
52 self.data = None
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
53 self.target = None
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
54 self.best_model = None
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
55 self.results = None
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
56 self.tuning_results = None
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
57 self.features_name = None
12
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
58 self.plot_feature_names = None
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
59 self.plots = {}
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
60 self.explainer_plots = {}
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
61 self.plots_explainer_html = None
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
62 self.trees = []
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
63 self.user_kwargs = kwargs.copy()
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
64 for key, value in self.user_kwargs.items():
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
65 setattr(self, key, value)
12
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
66 if not hasattr(self, "plot_feature_limit"):
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
67 self.plot_feature_limit = 30
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
68 self._shap_row_cap = None
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
69 if getattr(self, "polynomial_features", False):
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
70 # Keep feature importance responsive by trimming plots/SHAP rows
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
71 try:
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
72 limit_val = int(self.plot_feature_limit)
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
73 except (TypeError, ValueError):
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
74 limit_val = 30
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
75 self.plot_feature_limit = min(limit_val, 15)
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
76 self._shap_row_cap = 200
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
77 LOG.info(
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
78 "Polynomial features enabled; limiting feature plots to %s and SHAP rows to %s",
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
79 self.plot_feature_limit,
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
80 self._shap_row_cap,
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
81 )
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
82 self.imputed_training_data = None
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
83 self._best_model_metric_used = None
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
84 self.setup_params = {}
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
85 self.test_file = test_file
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
86 self.test_data = None
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
87
2
77c88226bfde planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents: 0
diff changeset
88 if not self.output_dir:
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
89 raise ValueError(
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
90 "output_dir must be specified and not None"
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
91 )
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
92
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
93 # Warn about irrelevant kwargs for the task type
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
94 if self.task_type == "regression" and (
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
95 "probability_threshold" in self.user_kwargs
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
96 ):
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
97 LOG.warning(
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
98 "probability_threshold is ignored for regression tasks."
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
99 )
2
77c88226bfde planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents: 0
diff changeset
100
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
101 LOG.info(f"Model kwargs: {self.__dict__}")
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
102
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
103 def load_data(self):
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
104 LOG.info(f"Loading data from {self.input_file}")
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
105 self.data = pd.read_csv(
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
106 self.input_file, sep=None, engine="python"
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
107 )
2
77c88226bfde planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents: 0
diff changeset
108 self.data.columns = self.data.columns.str.replace(".", "_")
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
109
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
110 names = self.data.columns.to_list()
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
111 LOG.info(f"Original dataset columns: {names}")
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
112
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
113 target_index = int(self.target_col) - 1
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
114 num_cols = len(names)
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
115 if target_index < 0 or target_index >= num_cols:
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
116 raise ValueError(
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
117 f"Target column number {self.target_col} is invalid. "
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
118 f"Please select a number between 1 and {num_cols}."
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
119 )
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
120
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
121 self.target = names[target_index]
15
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
122 sample_id_column = getattr(self, "sample_id_column", None)
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
123 if sample_id_column:
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
124 sample_id_column = sample_id_column.replace(".", "_")
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
125 self.sample_id_column = sample_id_column
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
126 else:
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
127 self.sample_id_column = None
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
128 self.sample_id_series = None
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
129
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
130 # Conditional drop: only if 'prediction_label' exists and is not
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
131 # the target
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
132 if "prediction_label" in self.data.columns and (
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
133 self.data.columns[target_index] != "prediction_label"
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
134 ):
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
135 LOG.info(
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
136 "Dropping 'prediction_label' column as it's not the target."
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
137 )
2
77c88226bfde planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents: 0
diff changeset
138 self.data = self.data.drop(columns=["prediction_label"])
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
139 else:
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
140 if self.target == "prediction_label":
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
141 LOG.warning(
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
142 "Using 'prediction_label' as target column. "
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
143 "This may not be intended if it's a previous prediction."
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
144 )
2
77c88226bfde planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents: 0
diff changeset
145
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
146 numeric_cols = self.data.select_dtypes(
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
147 include=["number"]
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
148 ).columns
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
149 non_numeric_cols = self.data.select_dtypes(
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
150 exclude=["number"]
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
151 ).columns
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
152 self.data[numeric_cols] = self.data[numeric_cols].apply(
2
77c88226bfde planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents: 0
diff changeset
153 pd.to_numeric, errors="coerce"
77c88226bfde planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents: 0
diff changeset
154 )
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
155 if len(non_numeric_cols) > 0:
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
156 LOG.info(
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
157 f"Non-numeric columns found: {non_numeric_cols.tolist()}"
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
158 )
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
159
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
160 # Update names after possible drop
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
161 names = self.data.columns.to_list()
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
162 LOG.info(f"Dataset columns after processing: {names}")
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
163
15
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
164 sample_id_valid = False
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
165 if sample_id_column:
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
166 if sample_id_column not in self.data.columns:
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
167 LOG.warning(
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
168 "Sample ID column '%s' not found; proceeding without group-aware split.",
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
169 sample_id_column,
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
170 )
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
171 sample_id_column = None
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
172 self.sample_id_column = None
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
173 elif sample_id_column == self.target:
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
174 LOG.warning(
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
175 "Sample ID column '%s' matches target column; skipping group-aware split.",
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
176 sample_id_column,
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
177 )
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
178 sample_id_column = None
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
179 self.sample_id_column = None
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
180 else:
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
181 sample_id_valid = True
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
182
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
183 if self.test_file:
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
184 LOG.info(f"Loading test data from {self.test_file}")
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
185 df_test = pd.read_csv(
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
186 self.test_file, sep=None, engine="python"
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
187 )
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
188 df_test.columns = df_test.columns.str.replace(".", "_")
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
189 self.test_data = df_test
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
190
15
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
191 if sample_id_valid and self.test_data is None:
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
192 train_size = getattr(self, "train_size", None)
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
193 if train_size is None:
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
194 train_size = 0.7
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
195 if train_size <= 0 or train_size >= 1:
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
196 LOG.warning(
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
197 "Invalid train_size=%s; skipping group-aware split.",
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
198 train_size,
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
199 )
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
200 else:
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
201 rng = np.random.RandomState(self.random_seed)
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
202
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
203 def _allocate_split_counts(n_total: int, probs: list) -> list:
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
204 if n_total <= 0:
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
205 return [0 for _ in probs]
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
206 counts = [0 for _ in probs]
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
207 active = [i for i, p in enumerate(probs) if p > 0]
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
208 remainder = n_total
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
209 if active and n_total >= len(active):
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
210 for i in active:
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
211 counts[i] = 1
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
212 remainder -= len(active)
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
213 if remainder > 0:
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
214 probs_arr = np.array(probs, dtype=float)
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
215 probs_arr = probs_arr / probs_arr.sum()
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
216 raw = remainder * probs_arr
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
217 floors = np.floor(raw).astype(int)
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
218 for i, value in enumerate(floors.tolist()):
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
219 counts[i] += value
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
220 leftover = remainder - int(floors.sum())
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
221 if leftover > 0 and active:
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
222 frac = raw - floors
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
223 order = sorted(active, key=lambda i: (-frac[i], i))
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
224 for i in range(leftover):
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
225 counts[order[i % len(order)]] += 1
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
226 return counts
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
227
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
228 def _choose_split(counts: list, targets: list, active: list) -> int:
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
229 remaining = [targets[i] - counts[i] for i in range(len(targets))]
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
230 best = max(active, key=lambda i: (remaining[i], -counts[i], -targets[i]))
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
231 if remaining[best] <= 0:
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
232 best = min(active, key=lambda i: counts[i])
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
233 return best
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
234
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
235 probs = [train_size, 1.0 - train_size]
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
236 targets = _allocate_split_counts(len(self.data), probs)
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
237 counts = [0, 0]
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
238 active = [0, 1]
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
239 train_idx = []
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
240 test_idx = []
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
241
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
242 group_series = self.data[sample_id_column].astype(object)
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
243 missing_mask = group_series.isna()
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
244 if missing_mask.any():
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
245 group_series = group_series.copy()
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
246 group_series.loc[missing_mask] = [
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
247 f"__missing__{idx}" for idx in group_series.index[missing_mask]
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
248 ]
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
249 group_to_indices = {}
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
250 for idx, group_id in group_series.items():
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
251 group_to_indices.setdefault(group_id, []).append(idx)
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
252
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
253 group_ids = sorted(group_to_indices.keys(), key=lambda x: str(x))
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
254 rng.shuffle(group_ids)
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
255
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
256 for group_id in group_ids:
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
257 split_idx = _choose_split(counts, targets, active)
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
258 counts[split_idx] += len(group_to_indices[group_id])
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
259 if split_idx == 0:
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
260 train_idx.extend(group_to_indices[group_id])
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
261 else:
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
262 test_idx.extend(group_to_indices[group_id])
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
263
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
264 missing_splits = []
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
265 if not train_idx:
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
266 missing_splits.append("train")
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
267 if not test_idx:
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
268 missing_splits.append("test")
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
269 if missing_splits:
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
270 LOG.warning(
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
271 "Group-aware split using '%s' produced empty %s set; "
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
272 "falling back to default split.",
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
273 sample_id_column,
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
274 " and ".join(missing_splits),
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
275 )
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
276 else:
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
277 self.test_data = self.data.loc[test_idx].reset_index(drop=True)
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
278 self.data = self.data.loc[train_idx].reset_index(drop=True)
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
279 LOG.info(
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
280 "Applied group-aware split using '%s' (train=%s, test=%s).",
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
281 sample_id_column,
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
282 len(train_idx),
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
283 len(test_idx),
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
284 )
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
285
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
286 if sample_id_valid:
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
287 self.sample_id_series = self.data[sample_id_column].copy()
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
288 if sample_id_column in self.data.columns:
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
289 self.data = self.data.drop(columns=[sample_id_column])
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
290 if self.test_data is not None and sample_id_column in self.test_data.columns:
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
291 self.test_data = self.test_data.drop(columns=[sample_id_column])
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
292
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
293 # Refresh feature lists after any sample-id column removal.
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
294 names = self.data.columns.to_list()
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
295 self.features_name = [n for n in names if n != self.target]
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
296 self.plot_feature_names = self._select_plot_features(self.features_name)
01e7c5481f13 planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents: 13
diff changeset
297
12
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
298 def _select_plot_features(self, all_features):
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
299 limit = getattr(self, "plot_feature_limit", 30)
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
300 if not isinstance(limit, int) or limit <= 0:
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
301 LOG.info(
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
302 "Feature plotting limit disabled (plot_feature_limit=%s).", limit
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
303 )
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
304 return all_features
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
305 if len(all_features) <= limit:
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
306 LOG.info(
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
307 "Feature plotting limit not needed (%s features <= limit %s).",
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
308 len(all_features),
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
309 limit,
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
310 )
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
311 return all_features
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
312 df = self.data[all_features].copy()
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
313 numeric_cols = df.select_dtypes(include=["number"]).columns
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
314 ranked = []
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
315 if len(numeric_cols) > 0:
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
316 variances = (
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
317 df[numeric_cols]
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
318 .var()
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
319 .fillna(0)
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
320 .abs()
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
321 .sort_values(ascending=False)
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
322 )
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
323 ranked = variances.index.tolist()
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
324 selected = []
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
325 for col in ranked:
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
326 if len(selected) >= limit:
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
327 break
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
328 selected.append(col)
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
329 if len(selected) < limit:
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
330 for col in all_features:
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
331 if col in selected:
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
332 continue
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
333 selected.append(col)
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
334 if len(selected) >= limit:
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
335 break
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
336 LOG.info(
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
337 "Limiting feature-level plots to %s of %s available features (limit=%s).",
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
338 len(selected),
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
339 len(all_features),
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
340 limit,
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
341 )
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
342 return selected
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
343
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
def setup_pycaret(self):
    """
    Assemble the PyCaret ``setup()`` arguments and initialise the experiment.

    Populates ``self.setup_params`` from the trainer's attributes (only
    attributes that are set, i.e. not ``None``, are forwarded), optionally
    enables group-aware cross-validation when ``sample_id_series`` is
    available, creates ``self.exp`` for the configured ``task_type``, runs
    ``setup()`` on ``self.data`` and caches the processed training data.

    Raises:
        ValueError: if ``task_type`` is neither ``'classification'`` nor
            ``'regression'``.
    """
    LOG.info("Initializing PyCaret")
    self.setup_params = {
        "target": self.target,
        "session_id": self.random_seed,
        "html": True,
        "log_experiment": False,
        "system_log": False,
        "index": False,
    }
    if self.test_data is not None:
        self.setup_params["test_data"] = self.test_data

    # Optional knobs: forwarded under the same name only when explicitly set.
    for attr in (
        "train_size",
        "normalize",
        "feature_selection",
        "remove_outliers",
        "remove_multicollinearity",
        "polynomial_features",
        "feature_interaction",
        "feature_ratio",
        "fix_imbalance",
        "n_jobs",
    ):
        val = getattr(self, attr, None)
        if val is not None:
            self.setup_params[attr] = val

    fold_count = getattr(self, "cross_validation_folds", None)
    if fold_count is not None:
        self.setup_params["fold"] = fold_count
    LOG.info(self.setup_params)

    group_series = getattr(self, "sample_id_series", None)
    if group_series is not None and getattr(self, "cross_validation", None) is not False:
        groups = pd.Series(group_series)
        # NaN is counted as its own group (dropna=False).
        n_groups = groups.nunique(dropna=False)
        # When no fold count is configured PyCaret falls back to its
        # documented default of 10 folds, so the guard must compare the
        # *effective* fold count; otherwise group k-fold is enabled with
        # fewer groups than folds and cross-validation fails later.
        default_fold_count = 10
        effective_folds = default_fold_count if fold_count is None else fold_count
        if effective_folds > n_groups:
            LOG.warning(
                "cross_validation_folds=%s exceeds unique groups=%s; "
                "skipping group-aware CV.",
                effective_folds,
                n_groups,
            )
        else:
            self.setup_params["fold_strategy"] = "groupkfold"
            self.setup_params["fold_groups"] = groups.reset_index(drop=True)
            LOG.info(
                "Enabled group-aware CV with %s unique groups.",
                n_groups,
            )

    # PyCaret is imported lazily so only the needed experiment type is loaded.
    if self.task_type == "classification":
        from pycaret.classification import ClassificationExperiment

        self.exp = ClassificationExperiment()
    elif self.task_type == "regression":
        from pycaret.regression import RegressionExperiment

        self.exp = RegressionExperiment()
    else:
        raise ValueError(
            "task_type must be 'classification' or 'regression'"
        )

    self.exp.setup(self.data, **self.setup_params)
    self._capture_imputed_training_data()
    # user_kwargs are merged only after setup() has run, so they are
    # recorded in setup_params but not passed to setup() here.
    self.setup_params.update(self.user_kwargs)
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
410
12
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
def _capture_imputed_training_data(self):
    """
    Store PyCaret's processed feature matrix together with the target.

    Reads ``X_transformed`` and ``y`` from the experiment config, joins
    them column-wise on a fresh 0..n-1 index and keeps the result in
    ``self.imputed_training_data``. This is best-effort: any failure is
    logged as a warning and the attribute is set to ``None``. Nothing
    happens when no experiment has been created yet.
    """
    if self.exp is None:
        return

    try:
        features = self.exp.get_config("X_transformed").copy()
        raw_target = self.exp.get_config("y")

        # Normalise the target to a Series with a positional index.
        target_series = (
            raw_target.reset_index(drop=True)
            if isinstance(raw_target, pd.Series)
            else pd.Series(raw_target)
        )
        target_series.name = self.target

        features = features.reset_index(drop=True)
        combined = pd.concat([features, target_series], axis=1)
        self.imputed_training_data = combined

        row_count, column_count = combined.shape
        LOG.info(
            "Captured imputed training dataset from PyCaret "
            "(%s rows, %s features).",
            row_count,
            column_count - 1,  # exclude the target column
        )
    except Exception as exc:
        LOG.warning(
            "Unable to capture processed training data from PyCaret: %s",
            exc,
        )
        self.imputed_training_data = None
9
e7dd78077b72 planemo upload for repository https://github.com/goeckslab/gleam commit 84d5cd0b1fa5c1ff0ad892bc39c95dad1ceb4920
goeckslab
parents: 6
diff changeset
443
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
def train_model(self):
    """
    Compare candidate models, optionally tune the winner, and score it.

    Side effects on ``self``:
      * ``best_model``      - result of ``compare_models`` (replaced by
        the tuned model when ``tune_model`` is truthy);
      * ``results``         - comparison table pulled after ``compare_models``;
      * ``tuning_results``  - table pulled after tuning (only when tuning);
      * ``test_result_df``  - table pulled after ``predict_model``;
      * ``_best_model_metric_used`` - the ranking metric, falling back to
        the experiment's ``_fold_metric`` when none was requested.

    For classification an extra ``PR-AUC-Weighted`` metric is registered
    and the ``AUC`` column is renamed to ``ROC-AUC`` in the pulled tables.
    """
    LOG.info("Training and selecting the best model")
    is_classification = self.task_type == "classification"

    if is_classification:
        self.exp.add_metric(
            id="PR-AUC-Weighted",
            name="PR-AUC-Weighted",
            target="pred_proba",
            score_func=average_precision_score,
            average="weighted",
        )

    # Collect only the options that were explicitly configured.
    compare_kwargs = {}
    if getattr(self, "models", None):
        compare_kwargs["include"] = self.models

    use_cv = getattr(self, "cross_validation", None)
    if use_cv is not None:
        compare_kwargs["cross_validation"] = use_cv

    n_folds = getattr(self, "cross_validation_folds", None)
    if n_folds is not None:
        compare_kwargs["fold"] = n_folds

    best_metric = getattr(self, "best_model_metric", None)
    if best_metric:
        compare_kwargs["sort"] = best_metric
        self._best_model_metric_used = best_metric
        LOG.info(f"Ranking models using metric: {best_metric}")

    LOG.info(f"compare_models kwargs: {compare_kwargs}")
    self.best_model = self.exp.compare_models(**compare_kwargs)
    if self._best_model_metric_used is None:
        # No explicit metric requested: record what the experiment exposes.
        self._best_model_metric_used = getattr(self.exp, "_fold_metric", None)
    self.results = self.exp.pull()

    if getattr(self, "tune_model", False):
        LOG.info("Tuning hyperparameters of the best model")
        self.best_model = self.exp.tune_model(self.best_model)
        self.tuning_results = self.exp.pull()

    auc_rename = {"AUC": "ROC-AUC"}
    if is_classification:
        self.results.rename(columns=auc_rename, inplace=True)

    # A probability threshold is only forwarded for classification.
    predict_kwargs = {}
    prob_thresh = getattr(self, "probability_threshold", None)
    if is_classification and prob_thresh is not None:
        predict_kwargs["probability_threshold"] = prob_thresh
    self.exp.predict_model(self.best_model, **predict_kwargs)

    self.test_result_df = self.exp.pull()
    if is_classification:
        self.test_result_df.rename(columns=auc_rename, inplace=True)
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
501
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
def save_model(self):
    """
    Serialize ``self.best_model`` with joblib and store the raw bytes in
    ``<output_dir>/pycaret_model.h5`` under the dataset name ``"model"``.

    The joblib dump goes through a scratch file inside a temporary
    directory that is removed as soon as the bytes have been read back, so
    no stray temp files are left behind. The HDF5 file is only created
    once serialization has succeeded, so a failing dump does not leave a
    truncated output file.
    """
    hdf5_path = Path(self.output_dir) / "pycaret_model.h5"

    # joblib writes to a path; a throwaway directory guarantees cleanup
    # and avoids reopening a file that is still held open by this process.
    with tempfile.TemporaryDirectory() as tmp_dir:
        dump_path = Path(tmp_dir) / "model.joblib"
        joblib.dump(self.best_model, str(dump_path))
        model_bytes = dump_path.read_bytes()

    with h5py.File(hdf5_path, "w") as f:
        # np.void wraps the pickle so it is stored as one opaque binary value.
        f.create_dataset("model", data=np.void(model_bytes))
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
510
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
def generate_plots(self):
    """
    Render the PyCaret diagnostic plots for ``self.best_model`` and save
    each one as ``<output_dir>/plot_<name>.png``.

    The set of plots depends on ``self.task_type``. Every successfully
    saved plot is recorded in ``self.plots`` (plot name -> file path as
    str). Plot generation is best-effort: a failure for one plot is logged
    as a warning and the remaining plots are still attempted.
    """
    LOG.info("Generating PyCaret diagnostic plots")

    # choose the right plots based on task type
    if self.task_type == "classification":
        plot_names = [
            "learning",
            "vc",
            "calibration",
            "dimension",
            "manifold",
            "rfe",
            "threshold",
            "percentage_above_below",
            "class_report",
            "pr_auc",
            "roc_auc",
        ]
    else:
        plot_names = ["residuals", "vc", "parameter", "error", "learning"]

    for name in plot_names:
        try:
            # NOTE(review): this relies on plot_model(save=False) returning
            # an object with get_figure() (a matplotlib Axes). Confirm
            # against the installed PyCaret version; if it returns None,
            # every plot ends up in the warning branch below.
            ax = self.exp.plot_model(
                self.best_model, plot=name, save=False
            )
            out_path = Path(self.output_dir) / f"plot_{name}.png"
            fig = ax.get_figure()
            fig.savefig(out_path, bbox_inches="tight")
            self.plots[name] = str(out_path)
        except Exception as e:
            # Deliberately broad: not every estimator supports every plot.
            LOG.warning("Could not generate %s plot: %s", name, e)
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
543
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
def encode_image_to_base64(self, img_path: str) -> str:
    """
    Read the file at ``img_path`` in binary mode and return its contents
    as a base64-encoded text string.
    """
    with open(img_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
547
13
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
def _build_dataset_overview(self):
    """
    Build an HTML table showing label counts with labels as rows and splits
    (Train / Validation / Test) as columns. Each cell shows count and
    percentage of that split. Returns empty string for regression or when
    no label data is available.

    Rows are ordered by the label values themselves (so numeric labels
    appear in numeric order); labels that cannot be compared with each
    other fall back to text order. A missing-label (NaN) row, if any, is
    listed last. Splits without data show an em dash in every row.
    """
    if self.task_type != "classification":
        return ""

    split_names = ["Train", "Validation", "Test"]

    def _safe_series(obj):
        # Coerce anything series-like to a 0..n-1 indexed Series; None on failure.
        try:
            return pd.Series(obj).reset_index(drop=True)
        except Exception:
            return None

    def _get_from_config(keys):
        # Return the first non-None value found for any of ``keys``, trying
        # exp.get_config first and a plain attribute lookup as fallback.
        if self.exp is None:
            return None
        for key in keys:
            try:
                val = self.exp.get_config(key)
            except Exception:
                val = getattr(self.exp, key, None)
            if val is not None:
                return val
        return None

    # Prefer PyCaret-configured splits; fall back to raw inputs.
    X_train = _get_from_config(["X_train_transformed", "X_train"])
    y_train = _get_from_config(["y_train_transformed", "y_train"])
    y_test_cfg = _get_from_config(["y_test_transformed", "y_test"])

    if y_train is None and self.data is not None and self.target in self.data.columns:
        y_train = self.data[self.target]

    y_train_series = _safe_series(y_train)

    # Build a cross-validation generator to derive a validation subset size.
    cv_gen = self._get_cv_generator(y_train_series)
    y_train_fold = y_train_series
    y_val_fold = None
    # An empty label series cannot be split; skipping it avoids a guaranteed
    # failure and the misleading warning that would follow.
    can_split = (
        cv_gen is not None
        and y_train_series is not None
        and not y_train_series.empty
    )
    if can_split:
        try:
            # Use the first fold to approximate Train/Validation split sizes.
            features = (
                pd.DataFrame(X_train).reset_index(drop=True)
                if X_train is not None
                else y_train_series
            )
            splitter = cv_gen.split(features, y_train_series)
            train_idx, val_idx = next(iter(splitter))
            y_train_fold = y_train_series.iloc[train_idx].reset_index(drop=True)
            y_val_fold = y_train_series.iloc[val_idx].reset_index(drop=True)
        except Exception as exc:
            LOG.warning("Could not derive validation split for dataset overview: %s", exc)

    # Test labels: prefer PyCaret transformed holdout (single file) or external test.
    if self.test_data is not None:
        if y_test_cfg is not None:
            y_test = y_test_cfg
        elif self.target in self.test_data.columns:
            y_test = self.test_data[self.target]
        else:
            y_test = None
    else:
        y_test = y_test_cfg

    split_map = {
        "Train": _safe_series(y_train_fold),
        "Validation": _safe_series(y_val_fold),
        "Test": _safe_series(y_test),
    }
    available = {k: v for k, v in split_map.items() if v is not None and not v.empty}
    if not available:
        return ""

    # Collect all labels across available splits (including NaN)
    label_pool = pd.concat(list(available.values()), ignore_index=True)
    labels = list(pd.unique(label_pool))

    # Order rows by the real label values; sorting the stringified labels
    # would put "10" ahead of "2".
    present = [lbl for lbl in labels if not pd.isna(lbl)]
    has_missing = len(present) != len(labels)
    try:
        ordered_labels = sorted(present)
    except TypeError:
        # Mixed, mutually incomparable label types: fall back to text order.
        ordered_labels = sorted(present, key=str)
    if has_missing:
        ordered_labels.append(float("nan"))

    def _count_for_label(series, label):
        # (count, split size) for ``label`` in ``series``; (None, None) if no data.
        if series is None or series.empty:
            return None, None
        total = len(series)
        if pd.isna(label):
            cnt = series.isna().sum()
        else:
            cnt = (series == label).sum()
        return int(cnt), total

    rows = []
    for label in ordered_labels:
        row = ["NaN" if pd.isna(label) else str(label)]
        for split_name in split_names:
            cnt, total = _count_for_label(split_map.get(split_name), label)
            if cnt is None or total is None:
                cell = "—"
            else:
                pct = (cnt / total * 100) if total else 0
                cell = f"{cnt} ({pct:.1f}%)"
            row.append(cell)
        rows.append(row)

    df = pd.DataFrame(rows, columns=["Label"] + split_names)

    return (
        "<h2>Dataset Overview</h2>"
        + '<div class="table-wrapper">'
        + df.to_html(
            index=False,
            classes=["table", "sortable", "table-dataset-overview"],
        )
        + "</div>"
    )
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
666
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
def _predict_with_thresholds(self, X, y_true):
    """
    Predict labels, and scores when the model exposes them, for one split.

    When ``probability_threshold`` is set on the instance, the experiment
    is not flagged multiclass, and ``predict_proba`` yields a matrix with
    at least two columns, labels are derived by comparing the
    positive-class column against the threshold. In every other case the
    labels come from ``best_model.predict``.

    Returns ``None`` when ``X`` or ``y_true`` is ``None``, or when the
    fallback ``predict`` call also fails. Otherwise returns a dict with
    keys ``y_true`` and ``y_pred`` (index-reset Series), ``y_scores``
    (positive-class probs when available, else ``None``), ``pos_label``
    and ``neg_label``.
    """
    if X is None or y_true is None:
        return None

    def multiclass():
        # Looked up lazily at each point of use rather than cached up front.
        return getattr(self.exp, "is_multiclass", False)

    y_true_series = pd.Series(y_true).reset_index(drop=True)
    model = self.best_model

    # Label set: the model's own classes, else the labels seen in this split.
    classes = list(getattr(model, "classes_", []))
    if not classes:
        try:
            classes = pd.unique(y_true_series).tolist()
        except Exception:
            classes = []

    # Positive class: the label equal to 1 if present, else the second entry.
    if len(classes) > 1:
        try:
            pos_idx = classes.index(1)
        except Exception:
            pos_idx = 1
    else:
        pos_idx = 0

    if classes:
        pos_idx = min(pos_idx, len(classes) - 1)
        pos_label = classes[pos_idx]
    else:
        pos_idx = 0
        pos_label = 1

    # Negative class: first label that differs from the positive one.
    neg_label = None
    if len(classes) >= 2:
        neg_label = next((c for c in classes if c != pos_label), None)

    threshold = getattr(self, "probability_threshold", None)

    # Probabilities are optional; any failure simply means "no scores".
    try:
        raw_proba = model.predict_proba(X)
        y_scores = None if raw_proba is None else np.asarray(raw_proba)
    except Exception:
        y_scores = None

    # True when the scores form a 2-D matrix with two or more columns.
    has_score_matrix = (
        y_scores is not None and y_scores.ndim == 2 and y_scores.shape[1] > 1
    )

    try:
        if threshold is not None and not multiclass() and has_score_matrix:
            pos_idx = min(pos_idx, y_scores.shape[1] - 1)
            neg_idx = 1 - pos_idx
            if neg_label is None and len(classes) > neg_idx:
                neg_label = classes[neg_idx]
            pos_scores = y_scores[:, pos_idx]
            y_pred = np.where(
                pos_scores >= threshold,
                pos_label,
                0 if neg_label is None else neg_label,
            )
            y_scores = pos_scores
        else:
            y_pred = model.predict(X)
            if not multiclass() and has_score_matrix:
                # Keep only the positive-class column for binary tasks.
                y_scores = y_scores[:, min(pos_idx, y_scores.shape[1] - 1)]
    except Exception as exc:
        LOG.warning(
            "Falling back to raw predict while computing performance summary: %s",
            exc,
        )
        try:
            y_pred = model.predict(X)
        except Exception as exc_inner:
            LOG.warning(
                "Unable to score split after fallback prediction: %s",
                exc_inner,
            )
            return None
        y_scores = None

    y_pred_series = pd.Series(y_pred).reset_index(drop=True)

    if y_scores is not None:
        y_scores = np.asarray(y_scores)
        if y_scores.ndim > 1 and y_scores.shape[1] == 1:
            y_scores = y_scores.ravel()
        if multiclass() and y_scores.ndim > 1:
            # Avoid passing multiclass score matrices to ROC/PR utilities
            y_scores = None

    return {
        "y_true": y_true_series,
        "y_pred": y_pred_series,
        "y_scores": y_scores,
        "pos_label": pos_label,
        "neg_label": neg_label,
    }
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
770
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
def _get_cv_generator(self, y_series):
    """
    Build a cross-validation splitter that mirrors the experiment's
    configuration. Returns None when CV is disabled or not applicable.

    Resolution order:
      1. Non-classification tasks, or ``cross_validation`` set to
         ``False``, yield ``None``.
      2. A non-None ``fold_generator`` from ``self.exp.get_config`` is
         returned unchanged.
      3. Otherwise a shuffled ``StratifiedKFold`` seeded with
         ``self.random_seed`` is built. The fold count is the first truthy
         value among ``cross_validation_folds``, ``setup_params["fold"]``,
         ``exp.fold`` and 10, capped at the number of samples.

    ``None`` is also returned when ``y_series`` is empty or cannot be
    turned into a Series, when fewer than two folds remain, or when the
    splitter cannot be constructed.
    """
    # Only classification reaches the splitter below, so a plain KFold
    # alternative would never be selected.
    if self.task_type != "classification":
        return None
    if getattr(self, "cross_validation", None) is False:
        return None

    # Prefer the splitter the experiment was actually configured with.
    try:
        configured = self.exp.get_config("fold_generator")
    except Exception:
        # Best effort: fall through to building a splitter ourselves.
        configured = None
    if configured is not None:
        return configured

    folds = (
        getattr(self, "cross_validation_folds", None)
        or self.setup_params.get("fold")
        or getattr(self.exp, "fold", None)
        or 10
    )
    try:
        folds = int(folds)
    except (TypeError, ValueError, OverflowError):
        folds = 10

    try:
        labels = pd.Series(y_series).reset_index(drop=True)
    except Exception:
        return None
    if labels.empty:
        return None

    # Never request more folds than there are samples.
    folds = min(folds, len(labels))
    if folds < 2:
        return None

    try:
        from sklearn.model_selection import StratifiedKFold

        return StratifiedKFold(
            n_splits=folds,
            shuffle=True,
            random_state=self.random_seed,
        )
    except Exception as exc:
        LOG.warning("Could not build CV generator: %s", exc)
        return None
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
831
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
def _get_cross_validated_predictions(self, X, y):
    """
    Generate out-of-fold (cross-validated) predictions for the given data
    so validation metrics can be reported for the selected best model.

    Args:
        X: Feature data; anything ``pd.DataFrame`` accepts.
        y: Target labels; anything ``pd.Series`` accepts.

    Returns:
        ``None`` when the task is not classification, cross-validation is
        explicitly disabled, ``X``/``y`` is missing or empty, scikit-learn
        cannot be imported, no CV splitter could be built, or the label
        predictions fail. Otherwise a dict with:

        * ``y_true``  - the target as a Series with a fresh 0..n-1 index
        * ``y_pred``  - out-of-fold predicted labels (Series)
        * ``y_scores`` - positive-class probabilities for non-multiclass
          problems when ``predict_proba`` worked, else ``None``
        * ``pos_label`` / ``neg_label`` - labels treated as positive and
          negative (``neg_label`` may be ``None``)
    """
    if self.task_type != "classification":
        return None
    if getattr(self, "cross_validation", None) is False:
        return None
    if X is None or y is None:
        return None

    try:
        from sklearn.model_selection import cross_val_predict
    except Exception as exc:
        LOG.warning("cross_val_predict unavailable: %s", exc)
        return None

    y_series = pd.Series(y).reset_index(drop=True)
    if y_series.empty:
        return None

    cv_gen = self._get_cv_generator(y_series)
    if cv_gen is None:
        return None

    X_df = pd.DataFrame(X).reset_index(drop=True)
    if len(X_df) != len(y_series):
        # Drop surplus feature rows. If X is the shorter one, the
        # cross_val_predict calls below raise and are handled there.
        X_df = X_df.iloc[: len(y_series)].reset_index(drop=True)

    # Class labels in predict_proba column order. Prefer what the fitted
    # model reports; otherwise fall back to the sorted unique labels of y,
    # which is the column order cross_val_predict documents. Without this
    # fallback, column 0 (the lowest label) was reported as the probability
    # of label 1 whenever the model exposed no ``classes_``.
    model_classes = getattr(self.best_model, "classes_", None)
    classes = list(model_classes) if model_classes is not None else []
    if not classes:
        try:
            classes = sorted(y_series.dropna().unique().tolist())
        except TypeError:
            # Labels that cannot be ordered: keep the legacy defaults.
            classes = []

    # Positive class: label ``1`` when present, else the second class.
    if len(classes) > 1:
        try:
            pos_idx = classes.index(1)
        except (ValueError, TypeError):
            pos_idx = 1
    else:
        pos_idx = 0
    pos_label = classes[pos_idx] if classes else 1
    # Negative class: the first label that differs from the positive one.
    neg_label = next((c for c in classes if c != pos_label), None)

    prob_thresh = getattr(self, "probability_threshold", None)
    n_jobs = getattr(self, "n_jobs", None)
    is_multiclass = bool(getattr(self.exp, "is_multiclass", False))

    # Probabilities are only collected for non-multiclass problems; a
    # failure here is non-fatal because labels can still be predicted.
    y_scores = None
    if not is_multiclass:
        try:
            y_scores = np.asarray(
                cross_val_predict(
                    self.best_model,
                    X_df,
                    y_series,
                    cv=cv_gen,
                    method="predict_proba",
                    n_jobs=n_jobs,
                )
            )
        except Exception as exc:
            LOG.debug("Could not compute CV probabilities: %s", exc)

    has_class_columns = (
        y_scores is not None
        and y_scores.ndim == 2
        and y_scores.shape[1] > 1
    )
    if has_class_columns:
        pos_idx = min(pos_idx, y_scores.shape[1] - 1)

    if prob_thresh is not None and has_class_columns:
        # Apply the custom threshold to the positive-class probability
        # instead of relying on the estimator's own decision rule.
        pos_scores = y_scores[:, pos_idx]
        neg_idx = 1 - pos_idx
        if neg_label is None and len(classes) > neg_idx:
            neg_label = classes[neg_idx]
        y_pred = np.where(
            pos_scores >= prob_thresh,
            pos_label,
            neg_label if neg_label is not None else 0,
        )
        y_scores = pos_scores
    else:
        try:
            y_pred = cross_val_predict(
                self.best_model,
                X_df,
                y_series,
                cv=cv_gen,
                method="predict",
                n_jobs=n_jobs,
            )
        except Exception as exc:
            LOG.warning(
                "Could not compute cross-validated predictions: %s",
                exc,
            )
            return None
        if has_class_columns:
            # Keep only the positive-class column for score-based metrics.
            y_scores = y_scores[:, pos_idx]

    return {
        "y_true": y_series,
        "y_pred": pd.Series(y_pred).reset_index(drop=True),
        "y_scores": y_scores,
        "pos_label": pos_label,
        "neg_label": neg_label,
    }
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
951
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
952 def _get_split_predictions_for_report(self):
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
953 """
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
954 Collect predictions/probabilities for Train/Validation/Test splits so the
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
955 performance table can show consistent metrics across splits.
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
956 """
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
957 if self.task_type != "classification":
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
958 return {}
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
959
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
960 def _get_from_config(keys):
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
961 for key in keys:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
962 try:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
963 val = self.exp.get_config(key)
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
964 except Exception:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
965 val = getattr(self.exp, key, None)
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
966 if val is not None:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
967 return val
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
968 return None
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
969
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
970 X_train = _get_from_config(["X_train_transformed", "X_train"])
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
971 y_train = _get_from_config(["y_train_transformed", "y_train"])
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
972 X_holdout = _get_from_config(["X_test_transformed", "X_test"])
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
973 y_holdout = _get_from_config(["y_test_transformed", "y_test"])
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
974
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
975 predictions = {}
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
976
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
977 # Train metrics (best model on training data)
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
978 if X_train is not None and y_train is not None:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
979 try:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
980 train_preds = self._predict_with_thresholds(X_train, y_train)
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
981 if train_preds is not None:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
982 predictions["Train"] = train_preds
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
983 except Exception as exc:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
984 LOG.warning(
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
985 "Could not score Train split for performance summary: %s",
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
986 exc,
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
987 )
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
988
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
989 # Validation metrics via cross-validation on training data
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
990 try:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
991 val_preds = self._get_cross_validated_predictions(X_train, y_train)
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
992 if val_preds is not None:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
993 predictions["Validation"] = val_preds
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
994 except Exception as exc:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
995 LOG.warning(
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
996 "Could not score Validation split for performance summary: %s",
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
997 exc,
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
998 )
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
999
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1000 # Test metrics (holdout from single file, or provided test file)
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1001 X_test = X_holdout
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1002 y_test = y_holdout
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1003 if (X_test is None or y_test is None) and self.test_data is not None:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1004 try:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1005 X_test = self.test_data.drop(columns=[self.target])
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1006 y_test = self.test_data[self.target]
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1007 except Exception as exc:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1008 LOG.warning(
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1009 "Could not prepare external test data for performance summary: %s",
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1010 exc,
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1011 )
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1012
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1013 if X_test is not None and y_test is not None:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1014 try:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1015 test_preds = self._predict_with_thresholds(X_test, y_test)
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1016 if test_preds is not None:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1017 predictions["Test"] = test_preds
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1018 except Exception as exc:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1019 LOG.warning(
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1020 "Could not score Test split for performance summary: %s",
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1021 exc,
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1022 )
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1023 return predictions
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1024
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1025 def _compute_metric_value(self, metric_name, preds, split_name):
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1026 """
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1027 Compute a single metric for a given split prediction bundle.
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1028 """
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1029 if preds is None:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1030 return None
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1031
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1032 y_true = preds["y_true"]
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1033 y_pred = preds["y_pred"]
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1034 y_scores = preds.get("y_scores")
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1035 pos_label = preds.get("pos_label")
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1036 neg_label = preds.get("neg_label")
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1037 is_multiclass = getattr(self.exp, "is_multiclass", False)
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1038
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1039 def _format_binary_labels(series):
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1040 if pos_label is None:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1041 return series
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1042 try:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1043 return (series == pos_label).astype(int)
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1044 except Exception:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1045 return series
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1046
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1047 try:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1048 if metric_name == "Accuracy":
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1049 return accuracy_score(y_true, y_pred)
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1050 if metric_name == "ROC-AUC":
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1051 if y_scores is None:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1052 return None
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1053 y_true_bin = _format_binary_labels(y_true)
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1054 if len(pd.unique(y_true_bin)) < 2:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1055 return None
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1056 return roc_auc_score(y_true_bin, y_scores)
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1057 if metric_name == "Precision":
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1058 if is_multiclass:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1059 return precision_score(
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1060 y_true, y_pred, average="weighted", zero_division=0
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1061 )
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1062 try:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1063 return precision_score(
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1064 y_true, y_pred, pos_label=pos_label, zero_division=0
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1065 )
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1066 except Exception:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1067 return precision_score(
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1068 y_true, y_pred, average="weighted", zero_division=0
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1069 )
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1070 if metric_name == "Recall":
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1071 if is_multiclass:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1072 return recall_score(
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1073 y_true, y_pred, average="weighted", zero_division=0
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1074 )
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1075 try:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1076 return recall_score(
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1077 y_true, y_pred, pos_label=pos_label, zero_division=0
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1078 )
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1079 except Exception:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1080 return recall_score(
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1081 y_true, y_pred, average="weighted", zero_division=0
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1082 )
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1083 if metric_name == "F1-Score":
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1084 if is_multiclass:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1085 return f1_score(
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1086 y_true, y_pred, average="weighted", zero_division=0
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1087 )
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1088 try:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1089 return f1_score(
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1090 y_true, y_pred, pos_label=pos_label, zero_division=0
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1091 )
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1092 except Exception:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1093 return f1_score(
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1094 y_true, y_pred, average="weighted", zero_division=0
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1095 )
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1096 if metric_name == "PR-AUC":
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1097 if y_scores is None:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1098 return None
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1099 y_true_bin = _format_binary_labels(y_true)
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1100 if len(pd.unique(y_true_bin)) < 2:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1101 return None
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1102 return average_precision_score(y_true_bin, y_scores)
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1103 if metric_name == "Specificity":
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1104 labels = pd.unique(pd.concat([y_true, y_pred], ignore_index=True))
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1105 if len(labels) != 2:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1106 return None
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1107 if pos_label is None or pos_label not in labels:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1108 pos_label = labels[1]
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1109 neg_candidates = [lbl for lbl in labels if lbl != pos_label]
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1110 neg_label_final = (
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1111 neg_label if neg_label in labels else (neg_candidates[0] if neg_candidates else None)
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1112 )
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1113 if neg_label_final is None:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1114 return None
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1115 cm = confusion_matrix(
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1116 y_true, y_pred, labels=[neg_label_final, pos_label]
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1117 )
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1118 if cm.shape != (2, 2):
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1119 return None
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1120 tn, fp, fn, tp = cm.ravel()
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1121 denom = tn + fp
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1122 return (tn / denom) if denom else None
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1123 if metric_name == "MCC":
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1124 return matthews_corrcoef(y_true, y_pred)
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1125 except Exception as exc:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1126 LOG.warning(
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1127 "Could not compute %s for %s split: %s",
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1128 metric_name,
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1129 split_name,
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1130 exc,
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1131 )
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1132 return None
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1133 return None
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1134
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1135 def _build_performance_summary_table(self):
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1136 """
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1137 Build a Train/Validation/Test metrics table for classification tasks.
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1138 Returns empty string when metrics are unavailable or not applicable.
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1139 """
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1140 if self.task_type != "classification":
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1141 return ""
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1142
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1143 split_predictions = self._get_split_predictions_for_report()
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1144 validation_best_row = None
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1145 try:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1146 if isinstance(self.results, pd.DataFrame) and not self.results.empty:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1147 validation_best_row = self.results.iloc[0]
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1148 except Exception:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1149 validation_best_row = None
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1150
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1151 if not split_predictions and validation_best_row is None:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1152 return ""
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1153
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1154 metric_names = [
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1155 "Accuracy",
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1156 "ROC-AUC",
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1157 "Precision",
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1158 "Recall",
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1159 "F1-Score",
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1160 "PR-AUC",
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1161 "Specificity",
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1162 "MCC",
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1163 ]
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1164
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1165 validation_column_map = {
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1166 "Accuracy": ["Accuracy"],
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1167 "ROC-AUC": ["ROC-AUC", "AUC"],
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1168 "Precision": ["Precision", "Prec.", "Prec"],
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1169 "Recall": ["Recall"],
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1170 "F1-Score": ["F1-Score", "F1"],
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1171 "PR-AUC": ["PR-AUC", "PR-AUC-Weighted", "PRC"],
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1172 "Specificity": ["Specificity"],
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1173 "MCC": ["MCC"],
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1174 }
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1175
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1176 def _fmt(value):
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1177 if value is None:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1178 return "—"
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1179 try:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1180 if isinstance(value, (float, np.floating)) and (
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1181 np.isnan(value) or np.isinf(value)
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1182 ):
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1183 return "—"
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1184 return f"{value:.3f}"
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1185 except Exception:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1186 return str(value)
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1187
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
def _validation_metric(metric_name):
    """
    Fetch ``metric_name`` from ``validation_best_row`` via its column aliases.

    The aliases come from ``validation_column_map`` and are tried in order;
    the value under the first alias present in the row is returned. Returns
    ``None`` when the row is ``None``, when no alias is present, or when
    reading the matched alias raises.
    """
    if validation_best_row is None:
        return None
    for candidate in validation_column_map.get(metric_name, []):
        if candidate not in validation_best_row:
            continue
        try:
            return validation_best_row[candidate]
        except Exception:
            # A failed read of the first matching alias ends the search.
            return None
    return None
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1199
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1200 rows = []
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1201 for metric in metric_names:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1202 row = [metric]
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1203 # Train
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1204 train_val = self._compute_metric_value(
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1205 metric, split_predictions.get("Train"), "Train"
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1206 )
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1207 row.append(_fmt(train_val))
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1208
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1209 # Validation from Train & Validation Summary first row; fallback to computed CV.
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1210 val_val = _validation_metric(metric)
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1211 if val_val is None:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1212 val_val = self._compute_metric_value(
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1213 metric, split_predictions.get("Validation"), "Validation"
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1214 )
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1215 row.append(_fmt(val_val))
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1216
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1217 # Test
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1218 test_val = self._compute_metric_value(
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1219 metric, split_predictions.get("Test"), "Test"
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1220 )
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1221 row.append(_fmt(test_val))
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1222 rows.append(row)
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1223
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1224 df = pd.DataFrame(rows, columns=["Metric", "Train", "Validation", "Test"])
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1225 return (
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1226 "<h2>Model Performance Summary</h2>"
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1227 + '<div class="table-wrapper">'
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1228 + df.to_html(
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1229 index=False,
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1230 classes=["table", "sortable", "table-perf-summary"],
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1231 )
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1232 + "</div>"
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1233 )
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1234
12
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1235 def _resolve_plot_callable(self, key, fig_or_fn, section):
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1236 """
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1237 Safely execute stored plot callables so a single failure does not
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1238 abort the entire HTML report generation.
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1239 """
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1240 if fig_or_fn is None:
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1241 return None
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1242 try:
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1243 return fig_or_fn() if callable(fig_or_fn) else fig_or_fn
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1244 except Exception as exc:
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1245 extra = ""
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1246 if isinstance(exc, ValueError) and "Input contains NaN" in str(exc):
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1247 extra = (
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1248 " (model returned NaN probabilities; "
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1249 "consider checking data preprocessing)"
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1250 )
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1251 LOG.warning(
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1252 "Skipping %s plot '%s' due to error: %s%s",
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1253 section,
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1254 key,
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1255 exc,
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1256 extra,
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1257 )
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1258 return None
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1259
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
1260 def save_html_report(self):
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
1261 LOG.info("Saving HTML report")
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
1262
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1263 # 1) Determine best model name
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1264 try:
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1265 best_model_name = str(self.results.iloc[0]["Model"])
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1266 except Exception:
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1267 best_model_name = type(self.best_model).__name__
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1268 LOG.info(f"Best model determined as: {best_model_name}")
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1269
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1270 # 2) Compute training sample count
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1271 try:
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1272 n_train = self.exp.X_train.shape[0]
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1273 except Exception:
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1274 n_train = getattr(
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1275 self.exp, "X_train_transformed", pd.DataFrame()
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1276 ).shape[0]
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1277 total_rows = self.data.shape[0]
2
77c88226bfde planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents: 0
diff changeset
1278
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1279 # 3) Build setup parameters table
5
3d42f82b3c7f planemo upload for repository https://github.com/goeckslab/gleam commit 4a11e8a4c4e9daa884bddedfa47090476c517667
goeckslab
parents: 4
diff changeset
1280 all_params = self.setup_params.copy()
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1281 if self.task_type == "classification" and (
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1282 hasattr(self, "probability_threshold")
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1283 ):
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1284 all_params["probability_threshold"] = (
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1285 self.probability_threshold
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1286 )
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1287 display_keys = [
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1288 "Target",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1289 "Session ID",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1290 "Train Size",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1291 "Normalize",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1292 "Feature Selection",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1293 "Cross Validation",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1294 "Cross Validation Folds",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1295 "Remove Outliers",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1296 "Remove Multicollinearity",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1297 "Polynomial Features",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1298 "Fix Imbalance",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1299 "Models",
5
3d42f82b3c7f planemo upload for repository https://github.com/goeckslab/gleam commit 4a11e8a4c4e9daa884bddedfa47090476c517667
goeckslab
parents: 4
diff changeset
1300 "Probability Threshold",
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1301 ]
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1302 setup_rows = []
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1303 for key in display_keys:
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1304 pk = key.lower().replace(" ", "_")
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1305 v = all_params.get(pk)
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1306 if key == "Train Size":
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1307 frac = (
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1308 float(v)
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1309 if v is not None
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1310 else (n_train / total_rows if total_rows else 0)
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1311 )
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1312 dv = f"{frac:.2f} ({n_train} rows)"
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1313 elif key in {
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1314 "Normalize",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1315 "Feature Selection",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1316 "Cross Validation",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1317 "Remove Outliers",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1318 "Remove Multicollinearity",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1319 "Polynomial Features",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1320 "Fix Imbalance",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1321 }:
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1322 dv = bool(v)
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1323 elif key == "Cross Validation Folds":
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1324 dv = v if v is not None else "None"
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1325 elif key == "Models":
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1326 dv = ", ".join(map(str, v)) if isinstance(
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1327 v, (list, tuple)
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1328 ) else "None"
5
3d42f82b3c7f planemo upload for repository https://github.com/goeckslab/gleam commit 4a11e8a4c4e9daa884bddedfa47090476c517667
goeckslab
parents: 4
diff changeset
1329 elif key == "Probability Threshold":
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1330 dv = f"{v:.2f}" if v is not None else "0.5"
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1331 else:
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1332 dv = v if v is not None else "None"
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1333 setup_rows.append([key, dv])
12
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1334 metric_label = self._best_model_metric_used or getattr(
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1335 self.exp, "_fold_metric", None
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1336 )
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1337 if metric_label:
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1338 setup_rows.append(["Best Model Metric", metric_label])
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1339
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1340 df_setup = pd.DataFrame(setup_rows, columns=["Parameter", "Value"])
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1341 df_setup.to_csv(
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1342 Path(self.output_dir) / "setup_params.csv", index=False
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1343 )
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1344
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1345 # 4) Persist CSVs
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1346 self.results.to_csv(
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1347 Path(self.output_dir) / "comparison_results.csv",
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1348 index=False
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1349 )
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1350 self.test_result_df.to_csv(
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1351 Path(self.output_dir) / "test_results.csv", index=False
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1352 )
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1353 pd.DataFrame(
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1354 self.best_model.get_params().items(),
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1355 columns=["Parameter", "Value"]
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1356 ).to_csv(Path(self.output_dir) / "best_model.csv", index=False)
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1357
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1358 if self.tuning_results is not None:
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1359 self.tuning_results.to_csv(
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1360 Path(self.output_dir) / "tuning_results.csv",
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1361 index=False
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1362 )
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1363
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1364 # 5) Header
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1365 header = f"<h2>Best Model: {best_model_name}</h2>"
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1366
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1367 # — Validation Summary & Configuration —
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1368 val_df = self.results.copy()
13
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1369 dataset_overview_html = self._build_dataset_overview()
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1370 performance_summary_html = self._build_performance_summary_table()
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1371 # mapping raw plot keys to user-friendly titles
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1372 plot_title_map = {
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1373 "learning": "Learning Curve",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1374 "vc": "Validation Curve",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1375 "calibration": "Calibration Curve",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1376 "dimension": "Dimensionality Reduction",
13
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1377 "manifold": "t-SNE",
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1378 "rfe": "Recursive Feature Elimination",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1379 "threshold": "Threshold Plot",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1380 "percentage_above_below": "Percentage Above vs. Below Cutoff",
13
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1381 "class_report": "Per-Class Metrics",
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1382 "pr_auc": "Precision-Recall AUC",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1383 "roc_auc": "Receiver Operating Characteristic AUC",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1384 "residuals": "Residuals Distribution",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1385 "error": "Prediction Error Distribution",
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
1386 }
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1387 val_df.drop(
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1388 columns=["TT (Ec)", "TT (Sec)"], errors="ignore", inplace=True
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1389 )
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1390 summary_html = (
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1391 header
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1392 + "<h2>Train & Validation Summary</h2>"
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1393 + '<div class="table-wrapper">'
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1394 + val_df.to_html(index=False, classes="table sortable")
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1395 + "</div>"
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1396 )
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1397
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1398 if self.tuning_results is not None:
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1399 tuning_df = self.tuning_results.copy()
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1400 tuning_df.drop(
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1401 columns=["TT (Sec)"], errors="ignore", inplace=True
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1402 )
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1403 summary_html += (
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1404 f"<h2>{best_model_name}: Tuning Summary</h2>"
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1405 + '<div class="table-wrapper">'
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1406 + tuning_df.to_html(index=False, classes="table sortable")
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1407 + "</div>"
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1408 )
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1409
13
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1410 config_html = (
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1411 header
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1412 + dataset_overview_html
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1413 + performance_summary_html
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1414 + "<h2>Setup Parameters</h2>"
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1415 + '<div class="table-wrapper">'
13
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1416 + df_setup.to_html(
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1417 index=False,
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1418 classes=["table", "sortable", "table-setup-params"],
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1419 )
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1420 + "</div>"
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1421 # — Hyperparameters
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1422 + "<h2>Best Model Hyperparameters</h2>"
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1423 + '<div class="table-wrapper">'
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1424 + pd.DataFrame(
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1425 self.best_model.get_params().items(),
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1426 columns=["Parameter", "Value"]
13
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1427 ).to_html(
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1428 index=False,
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1429 classes=["table", "sortable", "table-hyperparams"],
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1430 )
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1431 + "</div>"
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
1432 )
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
1433
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1434 # choose summary plots based on task type
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1435 if self.task_type == "classification":
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1436 summary_plots = [
13
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1437 "threshold",
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1438 "learning",
13
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1439 "calibration",
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1440 "rfe",
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1441 "vc",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1442 "dimension",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1443 "manifold",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1444 "percentage_above_below",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1445 ]
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1446 else:
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1447 summary_plots = ["learning", "vc", "parameter", "residuals"]
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
1448
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1449 for name in summary_plots:
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1450 if name in self.plots:
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1451 summary_html += "<hr>"
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1452 b64 = encode_image_to_base64(self.plots[name])
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1453 title = plot_title_map.get(
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1454 name, name.replace("_", " ").title()
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1455 )
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1456 summary_html += (
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1457 '<div class="plot">'
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1458 f"<h2>{title}</h2>"
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1459 f'<img src="data:image/png;base64,{b64}" '
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1460 'style="max-width:90%;max-height:600px;'
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1461 'border:1px solid #ddd;"/>'
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1462 "</div>"
2
77c88226bfde planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents: 0
diff changeset
1463 )
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
1464
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1465 # — Test Summary —
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1466 test_html = (
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1467 header
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1468 + '<div class="table-wrapper">'
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1469 + self.test_result_df.to_html(
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1470 index=False, classes="table sortable"
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1471 )
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1472 + "</div>"
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1473 )
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1474 if self.task_type == "regression":
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1475 try:
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1476 y_true = (
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1477 pd.Series(self.exp.y_test_transformed)
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1478 .reset_index(drop=True)
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1479 .rename("True")
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1480 )
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1481 y_pred = pd.Series(
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1482 self.best_model.predict(
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1483 self.exp.X_test_transformed
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1484 )
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1485 ).rename("Predicted")
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1486 df_tp = pd.concat([y_true, y_pred], axis=1)
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1487 test_html += "<h2>True vs Predicted Values</h2>"
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1488 test_html += (
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1489 '<div class="table-wrapper" '
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1490 'style="max-height:400px; overflow-y:auto;">'
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1491 + df_tp.head(50).to_html(
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1492 index=False, classes="table sortable"
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1493 )
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1494 + "</div>"
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1495 + add_hr_to_html()
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1496 )
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1497 except Exception as e:
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1498 LOG.warning(
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1499 f"Could not generate True vs Predicted table: {e}"
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1500 )
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1501
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1502 # 5a) Explainer-substituted plots in order
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1503 if self.task_type == "regression":
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1504 test_order = ["residuals"]
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1505 else:
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1506 test_order = [
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1507 "confusion_matrix",
13
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1508 "class_report",
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1509 "roc_auc",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1510 "pr_auc",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1511 "lift_curve",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1512 "cumulative_precision",
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1513 ]
13
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1514 rendered_test_plots = set()
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1515 for key in test_order:
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1516 fig_or_fn = self.explainer_plots.pop(key, None)
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1517 if fig_or_fn is not None:
12
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1518 fig = self._resolve_plot_callable(
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1519 key, fig_or_fn, section="test/explainer"
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1520 )
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1521 if fig is None:
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1522 continue
13
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1523 rendered_test_plots.add(key)
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1524 title = plot_title_map.get(
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1525 key, key.replace("_", " ").title()
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1526 )
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1527 test_html += (
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1528 f"<h2>{title}</h2>" + add_plot_to_html(fig)
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1529 + add_hr_to_html()
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1530 )
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1531 # 5b) Remaining PyCaret test plots
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1532 for name, path in self.plots.items():
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1533 # classification: include only the small extras, before
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1534 # skipping anything
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1535 if self.task_type == "classification" and (
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1536 name in {
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1537 "pr_auc",
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1538 "class_report",
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1539 }
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1540 ):
13
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1541 if name in rendered_test_plots:
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1542 continue
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1543 title = plot_title_map.get(
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1544 name, name.replace("_", " ").title()
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1545 )
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1546 b64 = encode_image_to_base64(path)
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1547 test_html += (
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1548 f"<h2>{title}</h2>"
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1549 "<div class='plot'>"
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1550 f"<img src='data:image/png;base64,{b64}' "
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1551 "style='max-width:90%;max-height:600px;"
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1552 "border:1px solid #ddd;'/>"
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1553 "</div>" + add_hr_to_html()
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1554 )
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1555 continue
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1556
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1557 # regression: explicitly include the 'error' plot,
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1558 # before skipping
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1559 if self.task_type == "regression" and (
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1560 name == "error"
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1561 ):
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1562 title = plot_title_map.get(
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1563 "error", "Prediction Error Distribution"
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1564 )
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1565 b64 = encode_image_to_base64(path)
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1566 test_html += (
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1567 f"<h2>{title}</h2>"
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1568 "<div class='plot'>"
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1569 f"<img src='data:image/png;base64,{b64}' "
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1570 "style='max-width:90%;max-height:600px;"
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1571 "border:1px solid #ddd;'/>"
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1572 "</div>" + add_hr_to_html()
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1573 )
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1574 continue
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1575
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1576 # now skip any plots already rendered via test_order
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1577 if name in test_order:
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1578 continue
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1579
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1580 # — Feature Importance —
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1581 feature_html = header
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1582
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1583 # 6a) PyCaret’s default feature importances
12
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1584 imputed_data = (
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1585 self.imputed_training_data
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1586 if self.imputed_training_data is not None
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1587 else self.data
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1588 )
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1589 fi_analyzer = FeatureImportanceAnalyzer(
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1590 data=imputed_data,
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
1591 target_col=self.target_col,
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
1592 task_type=self.task_type,
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
1593 output_dir=self.output_dir,
2
77c88226bfde planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents: 0
diff changeset
1594 exp=self.exp,
77c88226bfde planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents: 0
diff changeset
1595 best_model=self.best_model,
12
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1596 max_plot_features=self.plot_feature_limit,
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1597 processed_data=self.imputed_training_data,
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1598 max_shap_rows=self._shap_row_cap,
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1599 )
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1600 fi_html = fi_analyzer.run()
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1601 # Add a small table to show SHAP feature caps near the Best Model header.
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1602 cap_rows = []
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1603 if fi_analyzer.shap_total_features is not None:
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1604 cap_rows.append(
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1605 ("Total transformed features", fi_analyzer.shap_total_features)
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1606 )
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1607 if fi_analyzer.shap_used_features is not None:
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1608 cap_rows.append(
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1609 ("Features used in SHAP", fi_analyzer.shap_used_features)
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1610 )
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1611 if cap_rows:
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1612 cap_table = (
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1613 "<div class='table-wrapper'>"
13
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1614 "<table class='table sortable table-fi-scope'>"
12
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1615 "<thead><tr><th>Feature Importance Scope</th><th>Count</th></tr></thead>"
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1616 "<tbody>"
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1617 + "".join(
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1618 f"<tr><td>{label}</td><td>{value}</td></tr>"
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1619 for label, value in cap_rows
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1620 )
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1621 + "</tbody></table></div>"
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1622 )
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1623 feature_html += cap_table
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1624 feature_html += fi_html
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
1625
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1626 # 6b) Explainer SHAP importances
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1627 for key in ["shap_mean", "shap_perm"]:
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1628 fig_or_fn = self.explainer_plots.pop(key, None)
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1629 if fig_or_fn is not None:
12
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1630 fig = self._resolve_plot_callable(
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1631 key, fig_or_fn, section="feature importance"
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1632 )
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1633 if fig is None:
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1634 continue
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1635 # give SHAP plots explicit titles
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1636 title = (
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1637 "Mean Absolute SHAP Value Impact"
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1638 if key == "shap_mean"
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1639 else "Permutation Feature Importance"
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1640 )
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1641 feature_html += (
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1642 f"<h2>{title}</h2>" + add_plot_to_html(fig)
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1643 + add_hr_to_html()
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1644 )
2
77c88226bfde planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents: 0
diff changeset
1645
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1646 # 6c) PDPs last
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1647 pdp_keys = sorted(
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1648 k for k in self.explainer_plots if k.startswith("pdp__")
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1649 )
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1650 for k in pdp_keys:
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1651 fig_or_fn = self.explainer_plots[k]
12
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1652 fig = self._resolve_plot_callable(
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1653 k, fig_or_fn, section="pdp"
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1654 )
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1655 if fig is None:
15707141e7da planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents: 10
diff changeset
1656 continue
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1657 # extract feature name
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1658 feature = k.split("__", 1)[1]
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1659 title = f"Partial Dependence for {feature}"
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1660 feature_html += (
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1661 f"<h2>{title}</h2>" + add_plot_to_html(fig)
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1662 + add_hr_to_html()
2
77c88226bfde planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents: 0
diff changeset
1663 )
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1664 # 7) Assemble final HTML (tabbed layout: summary, test, feature, and config sections)
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1665 html = get_html_template()
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1666 html += "<h1>Tabular Learner Model Report</h1>"
13
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1667 html += build_tabbed_html(
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1668 summary_html,
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1669 test_html,
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1670 feature_html,
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1671 explainer_html=None,
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1672 config_html=config_html,
bf0df21a1ea3 planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents: 12
diff changeset
1673 )
4
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1674 html += get_feature_metrics_help_modal()
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1675 html += get_html_closing()
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1676
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1677 # 8) Write out
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1678 (Path(self.output_dir) / "comparison_result.html").write_text(
11fdac5affb3 planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents: 3
diff changeset
1679 html, encoding="utf-8"
2
77c88226bfde planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents: 0
diff changeset
1680 )
6
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1681 LOG.info(
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1682 f"HTML report generated at: "
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1683 f"{self.output_dir}/comparison_result.html"
4bd75b45a7a1 planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents: 5
diff changeset
1684 )
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
1685
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
def save_dashboard(self):
    """Placeholder hook for saving a dashboard.

    This base implementation performs no work and always fails; a
    subclass is expected to override it.

    Raises:
        NotImplementedError: Always, when called on this implementation.
    """
    message = "Subclasses should implement this method"
    raise NotImplementedError(message)
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
1688
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
def generate_plots_explainer(self):
    """Placeholder hook for generating explainer plots.

    This base implementation performs no work and always fails; a
    subclass is expected to override it. Note that ``run`` calls this
    method, so the pipeline cannot complete without an override.

    Raises:
        NotImplementedError: Always, when called on this implementation.
    """
    message = "Subclasses should implement this method"
    raise NotImplementedError(message)
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
1691
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
def generate_tree_plots(self):
    """Collect one tree figure per estimator of ``self.best_model``.

    Supported models are scikit-learn random forests (classifier or
    regressor) and XGBoost classifiers/regressors. For any other model
    type a warning is logged and nothing is appended.

    Reads ``self.exp.X_test_transformed``, ``self.exp.y_test_transformed``
    and ``self.best_model``; appends each generated figure to
    ``self.trees`` in tree-index order.
    """
    # Imports stay local to the method and keep their original order.
    from explainerdashboard.explainers import RandomForestExplainer
    from sklearn.ensemble import (
        RandomForestClassifier, RandomForestRegressor
    )
    from xgboost import XGBClassifier, XGBRegressor

    LOG.info("Generating tree plots")
    features = self.exp.X_test_transformed.copy()
    targets = self.exp.y_test_transformed

    forest_types = (RandomForestClassifier, RandomForestRegressor)
    boosted_types = (XGBClassifier, XGBRegressor)

    is_forest = isinstance(self.best_model, forest_types)
    is_boosted = (not is_forest) and isinstance(
        self.best_model, boosted_types
    )
    if not (is_forest or is_boosted):
        LOG.warning("Tree plots not supported for this model type.")
        return

    if is_forest:
        tree_count = self.best_model.n_estimators
    else:
        # One dump entry per boosted tree, so its length is the tree count.
        tree_count = len(self.best_model.get_booster().get_dump())

    # NOTE(review): the same explainer class is used for both forest and
    # XGBoost models — confirm this is supported by the installed
    # explainerdashboard version.
    tree_explainer = RandomForestExplainer(
        self.best_model, features, targets
    )
    for tree_idx in range(tree_count):
        # index=0: every tree is rendered for the first test row.
        figure = tree_explainer.decisiontree_encoded(
            tree_idx=tree_idx, index=0
        )
        self.trees.append(figure)
0
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
1717
209b663a4f62 planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff changeset
def run(self):
    """Execute the training pipeline stages in their fixed order.

    Each stage is a zero-argument method on ``self``. Stages are looked
    up one at a time, immediately before being called, and any exception
    raised by a stage propagates and stops the pipeline.
    """
    pipeline = (
        "load_data",
        "setup_pycaret",
        "train_model",
        "save_model",
        "generate_plots",
        "generate_plots_explainer",
        "generate_tree_plots",
        "save_html_report",
    )
    for stage in pipeline:
        getattr(self, stage)()
    # NOTE: save_dashboard() is currently not invoked here (the call was
    # commented out in the original pipeline).