Mercurial > repos > goeckslab > tabular_learner
annotate base_model_trainer.py @ 16:e82fd7fe796b draft default tip
planemo upload for repository https://github.com/goeckslab/gleam commit 75eddf497315160aa1282ba1db839c4db0aed0c6
| author | goeckslab |
|---|---|
| date | Fri, 23 Jan 2026 21:55:30 +0000 |
| parents | 01e7c5481f13 |
| children |
| rev | line source |
|---|---|
|
0
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
1 import base64 |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
2 import logging |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
3 import tempfile |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
4 from pathlib import Path |
|
0
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
5 |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
6 import h5py |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
7 import joblib |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
8 import numpy as np |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
9 import pandas as pd |
|
2
77c88226bfde
planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents:
0
diff
changeset
|
10 from feature_help_modal import get_feature_metrics_help_modal |
|
0
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
11 from feature_importance import FeatureImportanceAnalyzer |
|
13
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
12 from sklearn.metrics import ( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
13 accuracy_score, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
14 average_precision_score, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
15 confusion_matrix, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
16 f1_score, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
17 matthews_corrcoef, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
18 precision_score, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
19 recall_score, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
20 roc_auc_score, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
21 ) |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
22 from utils import ( |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
23 add_hr_to_html, |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
24 add_plot_to_html, |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
25 build_tabbed_html, |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
26 encode_image_to_base64, |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
27 get_html_closing, |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
28 get_html_template, |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
29 ) |
|
0
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
30 |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
31 logging.basicConfig(level=logging.DEBUG) |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
32 LOG = logging.getLogger(__name__) |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
33 |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
34 |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
35 class BaseModelTrainer: |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
def __init__(
    self,
    input_file,
    target_col,
    output_dir,
    task_type,
    random_seed,
    test_file=None,
    **kwargs,
):
    """Record the run configuration and reset all per-run state.

    NOTE(review): the nesting below was reconstructed from a flattened
    annotate listing -- confirm it against the repository copy.

    Args:
        input_file: Location of the training table; stored as-is.
        target_col: Target column selector; stored as-is.
        output_dir: Output location. Must be truthy.
        task_type: Task name; only the value "regression" is inspected
            here.
        random_seed: Seed value; stored as-is.
        test_file: Optional held-out table location; stored as-is.
        **kwargs: Extra options. A copy is kept in ``user_kwargs`` and
            every entry is also set as an instance attribute. The keys
            this method itself looks at are ``plot_feature_limit``,
            ``polynomial_features`` and ``probability_threshold``.

    Raises:
        ValueError: If ``output_dir`` is falsy.
    """
    # Attribute order is significant: the closing log line prints
    # self.__dict__, which follows insertion order.
    self.exp = None
    self.input_file = input_file
    self.target_col = target_col
    self.output_dir = output_dir
    self.task_type = task_type
    self.random_seed = random_seed
    self.data = None
    self.target = None
    self.best_model = None
    self.results = None
    self.tuning_results = None
    self.features_name = None
    self.plot_feature_names = None
    self.plots = {}
    self.explainer_plots = {}
    self.plots_explainer_html = None
    self.trees = []

    # Caller-supplied options land on the instance; anything assigned
    # above can be overridden by them, anything assigned below cannot.
    self.user_kwargs = kwargs.copy()
    for option_name, option_value in self.user_kwargs.items():
        setattr(self, option_name, option_value)

    if not hasattr(self, "plot_feature_limit"):
        self.plot_feature_limit = 30
    self._shap_row_cap = None

    if getattr(self, "polynomial_features", False):
        # Keep feature importance responsive by trimming plots/SHAP rows
        try:
            requested_limit = int(self.plot_feature_limit)
        except (TypeError, ValueError):
            requested_limit = 30
        self.plot_feature_limit = min(requested_limit, 15)
        self._shap_row_cap = 200
        LOG.info(
            "Polynomial features enabled; limiting feature plots to %s and SHAP rows to %s",
            self.plot_feature_limit,
            self._shap_row_cap,
        )

    self.imputed_training_data = None
    self._best_model_metric_used = None
    self.setup_params = {}
    self.test_file = test_file
    self.test_data = None

    if not self.output_dir:
        raise ValueError("output_dir must be specified and not None")

    # Warn about irrelevant kwargs for the task type
    if self.task_type == "regression":
        if "probability_threshold" in self.user_kwargs:
            LOG.warning("probability_threshold is ignored for regression tasks.")

    LOG.info(f"Model kwargs: {self.__dict__}")
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
102 |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
103 def load_data(self): |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
104 LOG.info(f"Loading data from {self.input_file}") |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
105 self.data = pd.read_csv( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
106 self.input_file, sep=None, engine="python" |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
107 ) |
|
2
77c88226bfde
planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents:
0
diff
changeset
|
108 self.data.columns = self.data.columns.str.replace(".", "_") |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
109 |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
110 names = self.data.columns.to_list() |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
111 LOG.info(f"Original dataset columns: {names}") |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
112 |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
113 target_index = int(self.target_col) - 1 |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
114 num_cols = len(names) |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
115 if target_index < 0 or target_index >= num_cols: |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
116 raise ValueError( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
117 f"Target column number {self.target_col} is invalid. " |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
118 f"Please select a number between 1 and {num_cols}." |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
119 ) |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
120 |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
121 self.target = names[target_index] |
|
15
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
122 sample_id_column = getattr(self, "sample_id_column", None) |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
123 if sample_id_column: |
|
16
e82fd7fe796b
planemo upload for repository https://github.com/goeckslab/gleam commit 75eddf497315160aa1282ba1db839c4db0aed0c6
goeckslab
parents:
15
diff
changeset
|
124 if str(sample_id_column).isdigit(): |
|
e82fd7fe796b
planemo upload for repository https://github.com/goeckslab/gleam commit 75eddf497315160aa1282ba1db839c4db0aed0c6
goeckslab
parents:
15
diff
changeset
|
125 idx = int(sample_id_column) - 1 |
|
e82fd7fe796b
planemo upload for repository https://github.com/goeckslab/gleam commit 75eddf497315160aa1282ba1db839c4db0aed0c6
goeckslab
parents:
15
diff
changeset
|
126 if 0 <= idx < len(names): |
|
e82fd7fe796b
planemo upload for repository https://github.com/goeckslab/gleam commit 75eddf497315160aa1282ba1db839c4db0aed0c6
goeckslab
parents:
15
diff
changeset
|
127 resolved = names[idx] |
|
e82fd7fe796b
planemo upload for repository https://github.com/goeckslab/gleam commit 75eddf497315160aa1282ba1db839c4db0aed0c6
goeckslab
parents:
15
diff
changeset
|
128 if sample_id_column in names: |
|
e82fd7fe796b
planemo upload for repository https://github.com/goeckslab/gleam commit 75eddf497315160aa1282ba1db839c4db0aed0c6
goeckslab
parents:
15
diff
changeset
|
129 LOG.warning( |
|
e82fd7fe796b
planemo upload for repository https://github.com/goeckslab/gleam commit 75eddf497315160aa1282ba1db839c4db0aed0c6
goeckslab
parents:
15
diff
changeset
|
130 "Sample ID column value '%s' matches a header, but Galaxy data_column " |
|
e82fd7fe796b
planemo upload for repository https://github.com/goeckslab/gleam commit 75eddf497315160aa1282ba1db839c4db0aed0c6
goeckslab
parents:
15
diff
changeset
|
131 "inputs are interpreted as 1-based indices; using column #%s header '%s'.", |
|
e82fd7fe796b
planemo upload for repository https://github.com/goeckslab/gleam commit 75eddf497315160aa1282ba1db839c4db0aed0c6
goeckslab
parents:
15
diff
changeset
|
132 sample_id_column, |
|
e82fd7fe796b
planemo upload for repository https://github.com/goeckslab/gleam commit 75eddf497315160aa1282ba1db839c4db0aed0c6
goeckslab
parents:
15
diff
changeset
|
133 idx + 1, |
|
e82fd7fe796b
planemo upload for repository https://github.com/goeckslab/gleam commit 75eddf497315160aa1282ba1db839c4db0aed0c6
goeckslab
parents:
15
diff
changeset
|
134 resolved, |
|
e82fd7fe796b
planemo upload for repository https://github.com/goeckslab/gleam commit 75eddf497315160aa1282ba1db839c4db0aed0c6
goeckslab
parents:
15
diff
changeset
|
135 ) |
|
e82fd7fe796b
planemo upload for repository https://github.com/goeckslab/gleam commit 75eddf497315160aa1282ba1db839c4db0aed0c6
goeckslab
parents:
15
diff
changeset
|
136 LOG.info( |
|
e82fd7fe796b
planemo upload for repository https://github.com/goeckslab/gleam commit 75eddf497315160aa1282ba1db839c4db0aed0c6
goeckslab
parents:
15
diff
changeset
|
137 "Sample ID column '%s' not found; using column #%s header '%s' instead.", |
|
e82fd7fe796b
planemo upload for repository https://github.com/goeckslab/gleam commit 75eddf497315160aa1282ba1db839c4db0aed0c6
goeckslab
parents:
15
diff
changeset
|
138 sample_id_column, |
|
e82fd7fe796b
planemo upload for repository https://github.com/goeckslab/gleam commit 75eddf497315160aa1282ba1db839c4db0aed0c6
goeckslab
parents:
15
diff
changeset
|
139 idx + 1, |
|
e82fd7fe796b
planemo upload for repository https://github.com/goeckslab/gleam commit 75eddf497315160aa1282ba1db839c4db0aed0c6
goeckslab
parents:
15
diff
changeset
|
140 resolved, |
|
e82fd7fe796b
planemo upload for repository https://github.com/goeckslab/gleam commit 75eddf497315160aa1282ba1db839c4db0aed0c6
goeckslab
parents:
15
diff
changeset
|
141 ) |
|
e82fd7fe796b
planemo upload for repository https://github.com/goeckslab/gleam commit 75eddf497315160aa1282ba1db839c4db0aed0c6
goeckslab
parents:
15
diff
changeset
|
142 sample_id_column = resolved |
|
e82fd7fe796b
planemo upload for repository https://github.com/goeckslab/gleam commit 75eddf497315160aa1282ba1db839c4db0aed0c6
goeckslab
parents:
15
diff
changeset
|
143 else: |
|
e82fd7fe796b
planemo upload for repository https://github.com/goeckslab/gleam commit 75eddf497315160aa1282ba1db839c4db0aed0c6
goeckslab
parents:
15
diff
changeset
|
144 raise ValueError( |
|
e82fd7fe796b
planemo upload for repository https://github.com/goeckslab/gleam commit 75eddf497315160aa1282ba1db839c4db0aed0c6
goeckslab
parents:
15
diff
changeset
|
145 f"Sample ID column index {sample_id_column} is invalid. " |
|
e82fd7fe796b
planemo upload for repository https://github.com/goeckslab/gleam commit 75eddf497315160aa1282ba1db839c4db0aed0c6
goeckslab
parents:
15
diff
changeset
|
146 f"Please select a number between 1 and {len(names)}." |
|
e82fd7fe796b
planemo upload for repository https://github.com/goeckslab/gleam commit 75eddf497315160aa1282ba1db839c4db0aed0c6
goeckslab
parents:
15
diff
changeset
|
147 ) |
|
15
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
148 sample_id_column = sample_id_column.replace(".", "_") |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
149 self.sample_id_column = sample_id_column |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
150 else: |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
151 self.sample_id_column = None |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
152 self.sample_id_series = None |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
153 |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
154 # Conditional drop: only if 'prediction_label' exists and is not |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
155 # the target |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
156 if "prediction_label" in self.data.columns and ( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
157 self.data.columns[target_index] != "prediction_label" |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
158 ): |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
159 LOG.info( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
160 "Dropping 'prediction_label' column as it's not the target." |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
161 ) |
|
2
77c88226bfde
planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents:
0
diff
changeset
|
162 self.data = self.data.drop(columns=["prediction_label"]) |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
163 else: |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
164 if self.target == "prediction_label": |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
165 LOG.warning( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
166 "Using 'prediction_label' as target column. " |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
167 "This may not be intended if it's a previous prediction." |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
168 ) |
|
2
77c88226bfde
planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents:
0
diff
changeset
|
169 |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
170 numeric_cols = self.data.select_dtypes( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
171 include=["number"] |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
172 ).columns |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
173 non_numeric_cols = self.data.select_dtypes( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
174 exclude=["number"] |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
175 ).columns |
|
0
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
176 self.data[numeric_cols] = self.data[numeric_cols].apply( |
|
2
77c88226bfde
planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents:
0
diff
changeset
|
177 pd.to_numeric, errors="coerce" |
|
77c88226bfde
planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents:
0
diff
changeset
|
178 ) |
|
0
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
179 if len(non_numeric_cols) > 0: |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
180 LOG.info( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
181 f"Non-numeric columns found: {non_numeric_cols.tolist()}" |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
182 ) |
|
0
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
183 |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
184 # Update names after possible drop |
|
0
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
185 names = self.data.columns.to_list() |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
186 LOG.info(f"Dataset columns after processing: {names}") |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
187 |
|
15
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
188 sample_id_valid = False |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
189 if sample_id_column: |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
190 if sample_id_column not in self.data.columns: |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
191 LOG.warning( |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
192 "Sample ID column '%s' not found; proceeding without group-aware split.", |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
193 sample_id_column, |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
194 ) |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
195 sample_id_column = None |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
196 self.sample_id_column = None |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
197 elif sample_id_column == self.target: |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
198 LOG.warning( |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
199 "Sample ID column '%s' matches target column; skipping group-aware split.", |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
200 sample_id_column, |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
201 ) |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
202 sample_id_column = None |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
203 self.sample_id_column = None |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
204 else: |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
205 sample_id_valid = True |
|
0
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
206 |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
207 if self.test_file: |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
208 LOG.info(f"Loading test data from {self.test_file}") |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
209 df_test = pd.read_csv( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
210 self.test_file, sep=None, engine="python" |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
211 ) |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
212 df_test.columns = df_test.columns.str.replace(".", "_") |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
213 self.test_data = df_test |
|
0
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
214 |
|
15
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
215 if sample_id_valid and self.test_data is None: |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
216 train_size = getattr(self, "train_size", None) |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
217 if train_size is None: |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
218 train_size = 0.7 |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
219 if train_size <= 0 or train_size >= 1: |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
220 LOG.warning( |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
221 "Invalid train_size=%s; skipping group-aware split.", |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
222 train_size, |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
223 ) |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
224 else: |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
225 rng = np.random.RandomState(self.random_seed) |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
226 |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
227 def _allocate_split_counts(n_total: int, probs: list) -> list: |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
228 if n_total <= 0: |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
229 return [0 for _ in probs] |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
230 counts = [0 for _ in probs] |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
231 active = [i for i, p in enumerate(probs) if p > 0] |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
232 remainder = n_total |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
233 if active and n_total >= len(active): |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
234 for i in active: |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
235 counts[i] = 1 |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
236 remainder -= len(active) |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
237 if remainder > 0: |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
238 probs_arr = np.array(probs, dtype=float) |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
239 probs_arr = probs_arr / probs_arr.sum() |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
240 raw = remainder * probs_arr |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
241 floors = np.floor(raw).astype(int) |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
242 for i, value in enumerate(floors.tolist()): |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
243 counts[i] += value |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
244 leftover = remainder - int(floors.sum()) |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
245 if leftover > 0 and active: |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
246 frac = raw - floors |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
247 order = sorted(active, key=lambda i: (-frac[i], i)) |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
248 for i in range(leftover): |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
249 counts[order[i % len(order)]] += 1 |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
250 return counts |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
251 |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
252 def _choose_split(counts: list, targets: list, active: list) -> int: |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
253 remaining = [targets[i] - counts[i] for i in range(len(targets))] |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
254 best = max(active, key=lambda i: (remaining[i], -counts[i], -targets[i])) |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
255 if remaining[best] <= 0: |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
256 best = min(active, key=lambda i: counts[i]) |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
257 return best |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
258 |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
259 probs = [train_size, 1.0 - train_size] |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
260 targets = _allocate_split_counts(len(self.data), probs) |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
261 counts = [0, 0] |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
262 active = [0, 1] |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
263 train_idx = [] |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
264 test_idx = [] |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
265 |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
266 group_series = self.data[sample_id_column].astype(object) |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
267 missing_mask = group_series.isna() |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
268 if missing_mask.any(): |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
269 group_series = group_series.copy() |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
270 group_series.loc[missing_mask] = [ |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
271 f"__missing__{idx}" for idx in group_series.index[missing_mask] |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
272 ] |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
273 group_to_indices = {} |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
274 for idx, group_id in group_series.items(): |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
275 group_to_indices.setdefault(group_id, []).append(idx) |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
276 |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
277 group_ids = sorted(group_to_indices.keys(), key=lambda x: str(x)) |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
278 rng.shuffle(group_ids) |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
279 |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
280 for group_id in group_ids: |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
281 split_idx = _choose_split(counts, targets, active) |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
282 counts[split_idx] += len(group_to_indices[group_id]) |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
283 if split_idx == 0: |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
284 train_idx.extend(group_to_indices[group_id]) |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
285 else: |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
286 test_idx.extend(group_to_indices[group_id]) |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
287 |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
288 missing_splits = [] |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
289 if not train_idx: |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
290 missing_splits.append("train") |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
291 if not test_idx: |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
292 missing_splits.append("test") |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
293 if missing_splits: |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
294 LOG.warning( |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
295 "Group-aware split using '%s' produced empty %s set; " |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
296 "falling back to default split.", |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
297 sample_id_column, |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
298 " and ".join(missing_splits), |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
299 ) |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
300 else: |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
301 self.test_data = self.data.loc[test_idx].reset_index(drop=True) |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
302 self.data = self.data.loc[train_idx].reset_index(drop=True) |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
303 LOG.info( |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
304 "Applied group-aware split using '%s' (train=%s, test=%s).", |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
305 sample_id_column, |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
306 len(train_idx), |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
307 len(test_idx), |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
308 ) |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
309 |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
310 if sample_id_valid: |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
311 self.sample_id_series = self.data[sample_id_column].copy() |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
312 if sample_id_column in self.data.columns: |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
313 self.data = self.data.drop(columns=[sample_id_column]) |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
314 if self.test_data is not None and sample_id_column in self.test_data.columns: |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
315 self.test_data = self.test_data.drop(columns=[sample_id_column]) |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
316 |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
317 # Refresh feature lists after any sample-id column removal. |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
318 names = self.data.columns.to_list() |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
319 self.features_name = [n for n in names if n != self.target] |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
320 self.plot_feature_names = self._select_plot_features(self.features_name) |
|
01e7c5481f13
planemo upload for repository https://github.com/goeckslab/gleam commit f632803cda732005bdcf3ac3e8fe7a807a82c1d9
goeckslab
parents:
13
diff
changeset
|
321 |
|
12
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
def _select_plot_features(self, all_features):
    """Pick the subset of features used for feature-level plots.

    The cap is read from ``self.plot_feature_limit`` (30 when the
    attribute is missing).  ``all_features`` is returned untouched when
    the cap is not a positive ``int`` or when the feature count already
    fits within it.  Otherwise numeric columns of ``self.data`` are taken
    in order of decreasing variance, and any remaining slots are filled
    with the other features in their original order.

    Args:
        all_features: Column names of ``self.data`` to choose from.

    Returns:
        A list of at most ``limit`` feature names, or ``all_features``
        itself when no limiting is applied.
    """
    limit = getattr(self, "plot_feature_limit", 30)
    limit_is_active = isinstance(limit, int) and limit > 0
    if not limit_is_active:
        LOG.info(
            "Feature plotting limit disabled (plot_feature_limit=%s).", limit
        )
        return all_features

    total = len(all_features)
    if total <= limit:
        LOG.info(
            "Feature plotting limit not needed (%s features <= limit %s).",
            total,
            limit,
        )
        return all_features

    frame = self.data[all_features].copy()
    numeric_names = frame.select_dtypes(include=["number"]).columns
    by_variance = []
    if len(numeric_names) > 0:
        # Columns whose variance is undefined (NaN) rank as zero variance.
        spread = frame[numeric_names].var().fillna(0).abs()
        by_variance = spread.sort_values(ascending=False).index.tolist()

    # Highest-variance numeric columns come first, capped at the limit.
    chosen = by_variance[:limit]

    if len(chosen) < limit:
        # Pad with the features not yet chosen, keeping their input order.
        for name in all_features:
            if name not in chosen:
                chosen.append(name)
                if len(chosen) >= limit:
                    break

    LOG.info(
        "Limiting feature-level plots to %s of %s available features (limit=%s).",
        len(chosen),
        total,
        limit,
    )
    return chosen
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
367 |
|
0
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
def setup_pycaret(self):
    """Build ``self.setup_params`` and run PyCaret's ``setup`` on ``self.data``.

    Steps, in order:

    1. Start from fixed defaults (target, session id, logging switches).
    2. Add ``test_data`` and every optional preprocessing attribute that
       is set (not ``None``) on the instance.
    3. Forward ``cross_validation_folds`` as PyCaret's ``fold``.
    4. When ``sample_id_series`` is available and cross-validation is not
       explicitly disabled, enable group-aware CV (``groupkfold``) as
       long as the fold count does not exceed the number of groups.
    5. Create the classification or regression experiment, call
       ``setup`` and cache the processed training data.

    ``self.user_kwargs`` is merged into ``self.setup_params`` only after
    ``setup`` has been called, so those entries are recorded on the
    instance but are not passed to this ``setup`` invocation.

    Raises:
        ValueError: If ``self.task_type`` is neither ``'classification'``
            nor ``'regression'``.
    """
    LOG.info("Initializing PyCaret")
    self.setup_params = {
        "target": self.target,
        "session_id": self.random_seed,
        "html": True,
        "log_experiment": False,
        "system_log": False,
        "index": False,
    }
    if self.test_data is not None:
        self.setup_params["test_data"] = self.test_data
    for attr in [
        "train_size",
        "normalize",
        "feature_selection",
        "remove_outliers",
        "remove_multicollinearity",
        "polynomial_features",
        "feature_interaction",
        "feature_ratio",
        "fix_imbalance",
        "n_jobs",
    ]:
        val = getattr(self, attr, None)
        if val is not None:
            self.setup_params[attr] = val
    if getattr(self, "cross_validation_folds", None) is not None:
        self.setup_params["fold"] = self.cross_validation_folds
    LOG.info(self.setup_params)

    group_series = getattr(self, "sample_id_series", None)
    if group_series is not None and getattr(self, "cross_validation", None) is not False:
        # NaN sample ids are counted as one group of their own.
        n_groups = pd.Series(group_series).nunique(dropna=False)
        fold_count = getattr(self, "cross_validation_folds", None)
        # When no fold count is configured, PyCaret's setup() falls back
        # to its default of 10 folds.  The group check has to use that
        # effective value, otherwise GroupKFold is enabled with more
        # splits than groups and fails later during cross-validation.
        effective_folds = 10 if fold_count is None else fold_count
        if effective_folds > n_groups:
            LOG.warning(
                "cross_validation_folds=%s exceeds unique groups=%s; "
                "skipping group-aware CV.",
                effective_folds,
                n_groups,
            )
        else:
            self.setup_params["fold_strategy"] = "groupkfold"
            self.setup_params["fold_groups"] = pd.Series(group_series).reset_index(drop=True)
            LOG.info(
                "Enabled group-aware CV with %s unique groups.",
                n_groups,
            )

    # PyCaret is imported lazily so only the needed experiment type loads.
    if self.task_type == "classification":
        from pycaret.classification import ClassificationExperiment

        self.exp = ClassificationExperiment()
    elif self.task_type == "regression":
        from pycaret.regression import RegressionExperiment

        self.exp = RegressionExperiment()
    else:
        raise ValueError(
            "task_type must be 'classification' or 'regression'"
        )

    self.exp.setup(self.data, **self.setup_params)
    self._capture_imputed_training_data()
    self.setup_params.update(self.user_kwargs)
|
0
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
434 |
|
12
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
def _capture_imputed_training_data(self):
    """Store PyCaret's processed training data on the instance.

    Reads ``X_transformed`` and ``y`` from the experiment config, joins
    them column-wise on a fresh 0..n-1 index (the target column is named
    ``self.target``) and saves the result as
    ``self.imputed_training_data``.  This is best-effort: if anything
    goes wrong a warning is logged and the attribute is set to ``None``.
    Nothing happens when no experiment has been created yet.
    """
    if self.exp is None:
        return

    try:
        features = self.exp.get_config("X_transformed").copy()
        raw_target = self.exp.get_config("y")

        # Normalise the target to a Series before joining it to the features.
        target = (
            raw_target.reset_index(drop=True)
            if isinstance(raw_target, pd.Series)
            else pd.Series(raw_target)
        )
        target.name = self.target

        features = features.reset_index(drop=True)
        combined = pd.concat([features, target], axis=1)
        self.imputed_training_data = combined

        LOG.info(
            "Captured imputed training dataset from PyCaret "
            "(%s rows, %s features).",
            combined.shape[0],
            combined.shape[1] - 1,  # exclude the target column
        )
    except Exception as exc:
        LOG.warning(
            "Unable to capture processed training data from PyCaret: %s",
            exc,
        )
        self.imputed_training_data = None
|
9
e7dd78077b72
planemo upload for repository https://github.com/goeckslab/gleam commit 84d5cd0b1fa5c1ff0ad892bc39c95dad1ceb4920
goeckslab
parents:
6
diff
changeset
|
467 |
|
0
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
468 def train_model(self): |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
469 LOG.info("Training and selecting the best model") |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
470 if self.task_type == "classification": |
|
2
77c88226bfde
planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents:
0
diff
changeset
|
471 self.exp.add_metric( |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
472 id="PR-AUC-Weighted", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
473 name="PR-AUC-Weighted", |
|
2
77c88226bfde
planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents:
0
diff
changeset
|
474 target="pred_proba", |
|
77c88226bfde
planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents:
0
diff
changeset
|
475 score_func=average_precision_score, |
|
77c88226bfde
planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents:
0
diff
changeset
|
476 average="weighted", |
|
77c88226bfde
planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents:
0
diff
changeset
|
477 ) |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
478 # Build arguments for compare_models() |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
479 compare_kwargs = {} |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
480 if getattr(self, "models", None): |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
481 compare_kwargs["include"] = self.models |
|
0
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
482 |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
483 # Respect explicit cross-validation flag |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
484 if getattr(self, "cross_validation", None) is not None: |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
485 compare_kwargs["cross_validation"] = self.cross_validation |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
486 |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
487 # Respect explicit fold count |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
488 if getattr(self, "cross_validation_folds", None) is not None: |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
489 compare_kwargs["fold"] = self.cross_validation_folds |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
490 |
|
12
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
491 best_metric = getattr(self, "best_model_metric", None) |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
492 if best_metric: |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
493 compare_kwargs["sort"] = best_metric |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
494 self._best_model_metric_used = best_metric |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
495 LOG.info(f"Ranking models using metric: {best_metric}") |
|
9
e7dd78077b72
planemo upload for repository https://github.com/goeckslab/gleam commit 84d5cd0b1fa5c1ff0ad892bc39c95dad1ceb4920
goeckslab
parents:
6
diff
changeset
|
496 |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
497 LOG.info(f"compare_models kwargs: {compare_kwargs}") |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
498 self.best_model = self.exp.compare_models(**compare_kwargs) |
|
12
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
499 if self._best_model_metric_used is None: |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
500 self._best_model_metric_used = getattr(self.exp, "_fold_metric", None) |
|
0
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
501 self.results = self.exp.pull() |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
502 if getattr(self, "tune_model", False): |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
503 LOG.info("Tuning hyperparameters of the best model") |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
504 self.best_model = self.exp.tune_model(self.best_model) |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
505 self.tuning_results = self.exp.pull() |
|
3
f6a65e05d6ec
planemo upload for repository https://github.com/goeckslab/gleam commit b430f8b466655878c3bf63b053655fdbf039ddb0
goeckslab
parents:
2
diff
changeset
|
506 |
|
0
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
507 if self.task_type == "classification": |
|
2
77c88226bfde
planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents:
0
diff
changeset
|
508 self.results.rename(columns={"AUC": "ROC-AUC"}, inplace=True) |
|
5
3d42f82b3c7f
planemo upload for repository https://github.com/goeckslab/gleam commit 4a11e8a4c4e9daa884bddedfa47090476c517667
goeckslab
parents:
4
diff
changeset
|
509 |
|
3d42f82b3c7f
planemo upload for repository https://github.com/goeckslab/gleam commit 4a11e8a4c4e9daa884bddedfa47090476c517667
goeckslab
parents:
4
diff
changeset
|
510 prob_thresh = getattr(self, "probability_threshold", None) |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
511 if self.task_type == "classification" and ( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
512 prob_thresh is not None |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
513 ): |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
514 _ = self.exp.predict_model( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
515 self.best_model, probability_threshold=prob_thresh |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
516 ) |
|
5
3d42f82b3c7f
planemo upload for repository https://github.com/goeckslab/gleam commit 4a11e8a4c4e9daa884bddedfa47090476c517667
goeckslab
parents:
4
diff
changeset
|
517 else: |
|
3d42f82b3c7f
planemo upload for repository https://github.com/goeckslab/gleam commit 4a11e8a4c4e9daa884bddedfa47090476c517667
goeckslab
parents:
4
diff
changeset
|
518 _ = self.exp.predict_model(self.best_model) |
|
3d42f82b3c7f
planemo upload for repository https://github.com/goeckslab/gleam commit 4a11e8a4c4e9daa884bddedfa47090476c517667
goeckslab
parents:
4
diff
changeset
|
519 |
|
0
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
520 self.test_result_df = self.exp.pull() |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
521 if self.task_type == "classification": |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
522 self.test_result_df.rename( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
523 columns={"AUC": "ROC-AUC"}, inplace=True |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
524 ) |
|
0
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
525 |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
def save_model(self):
    """Persist the best model as ``pycaret_model.h5`` inside ``self.output_dir``.

    The model is serialized with joblib to a temporary file, read back as
    raw bytes, and stored as a single opaque ``np.void`` value in an HDF5
    dataset named ``"model"``.

    The HDF5 file is only opened once serialization has succeeded, so a
    failing dump does not leave a truncated output file behind.
    """
    hdf5_path = Path(self.output_dir) / "pycaret_model.h5"

    # Reserve a unique path, then close the handle immediately so joblib can
    # reopen the path itself (an open handle blocks that on some platforms).
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        tmp_path = Path(tmp.name)

    try:
        joblib.dump(self.best_model, str(tmp_path))
        model_bytes = tmp_path.read_bytes()
    finally:
        # delete=False means nothing else removes this file; clean it up
        # here so repeated runs do not accumulate stale temp files.
        tmp_path.unlink()

    with h5py.File(hdf5_path, "w") as f:
        f.create_dataset("model", data=np.void(model_bytes))
|
0
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
534 |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
def generate_plots(self):
    """Render diagnostic plots for the best model and record their paths.

    The set of plot names depends on ``self.task_type``: one list for
    ``"classification"``, another for every other task type. Each plot is
    requested from ``self.exp.plot_model``, written to
    ``<output_dir>/plot_<name>.png``, and its path stored in
    ``self.plots[name]``.

    Plot generation is best-effort: any exception raised for a single plot
    is logged as a warning and the remaining plots are still attempted.
    """
    LOG.info("Generating PyCaret diagnostic plots")

    # choose the right plots based on task type
    if self.task_type == "classification":
        plot_names = [
            "learning",
            "vc",
            "calibration",
            "dimension",
            "manifold",
            "rfe",
            "threshold",
            "percentage_above_below",
            "class_report",
            "pr_auc",
            "roc_auc",
        ]
    else:
        plot_names = ["residuals", "vc", "parameter", "error", "learning"]

    for name in plot_names:
        try:
            # NOTE(review): this assumes plot_model(..., save=False) returns
            # an object exposing get_figure(); if it returns None, every
            # plot ends up in the warning branch below - confirm.
            ax = self.exp.plot_model(self.best_model, plot=name, save=False)
            out_path = Path(self.output_dir) / f"plot_{name}.png"
            fig = ax.get_figure()
            fig.savefig(out_path, bbox_inches="tight")
            self.plots[name] = str(out_path)
        except Exception as e:
            # Deliberately broad: one unsupported plot must not abort the rest.
            LOG.warning(f"Could not generate {name} plot: {e}")
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
567 |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
def encode_image_to_base64(self, img_path: str) -> str:
    """Return the contents of the file at *img_path* as a base64 text string.

    The file is read in binary mode, base64-encoded, and the resulting
    bytes are decoded as UTF-8.
    """
    with open(img_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
|
0
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
571 |
|
13
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
572 def _build_dataset_overview(self): |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
573 """ |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
574 Build an HTML table showing label counts with labels as rows and splits |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
575 (Train / Validation / Test) as columns. Each cell shows count and |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
576 percentage of that split. Returns empty string for regression or when |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
577 no label data is available. |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
578 """ |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
579 if self.task_type != "classification": |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
580 return "" |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
581 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
582 def _safe_series(obj): |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
583 try: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
584 return pd.Series(obj).reset_index(drop=True) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
585 except Exception: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
586 return None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
587 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
588 def _get_from_config(keys): |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
589 if self.exp is None: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
590 return None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
591 for key in keys: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
592 try: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
593 val = self.exp.get_config(key) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
594 except Exception: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
595 val = getattr(self.exp, key, None) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
596 if val is not None: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
597 return val |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
598 return None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
599 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
600 # Prefer PyCaret-configured splits; fall back to raw inputs. |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
601 X_train = _get_from_config(["X_train_transformed", "X_train"]) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
602 y_train = _get_from_config(["y_train_transformed", "y_train"]) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
603 y_test_cfg = _get_from_config(["y_test_transformed", "y_test"]) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
604 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
605 if y_train is None and self.data is not None and self.target in self.data.columns: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
606 y_train = self.data[self.target] |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
607 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
608 y_train_series = _safe_series(y_train) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
609 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
610 # Build a cross-validation generator to derive a validation subset size. |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
611 cv_gen = self._get_cv_generator(y_train_series) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
612 y_train_fold = y_train_series |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
613 y_val_fold = None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
614 if cv_gen is not None and y_train_series is not None: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
615 try: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
616 # Use the first fold to approximate Train/Validation split sizes. |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
617 splitter = cv_gen.split( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
618 pd.DataFrame(X_train).reset_index(drop=True) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
619 if X_train is not None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
620 else y_train_series, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
621 y_train_series, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
622 ) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
623 train_idx, val_idx = next(iter(splitter)) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
624 y_train_fold = y_train_series.iloc[train_idx].reset_index(drop=True) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
625 y_val_fold = y_train_series.iloc[val_idx].reset_index(drop=True) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
626 except Exception as exc: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
627 LOG.warning("Could not derive validation split for dataset overview: %s", exc) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
628 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
629 # Test labels: prefer PyCaret transformed holdout (single file) or external test. |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
630 if self.test_data is not None: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
631 if y_test_cfg is not None: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
632 y_test = y_test_cfg |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
633 elif self.target in self.test_data.columns: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
634 y_test = self.test_data[self.target] |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
635 else: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
636 y_test = None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
637 else: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
638 y_test = y_test_cfg |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
639 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
640 split_map = { |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
641 "Train": _safe_series(y_train_fold), |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
642 "Validation": _safe_series(y_val_fold), |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
643 "Test": _safe_series(y_test), |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
644 } |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
645 available = {k: v for k, v in split_map.items() if v is not None and not v.empty} |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
646 if not available: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
647 return "" |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
648 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
649 # Collect all labels across available splits (including NaN) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
650 label_pool = pd.concat( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
651 available.values(), ignore_index=True |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
652 ) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
653 labels = pd.unique(label_pool) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
654 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
655 def _count_for_label(series, label): |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
656 if series is None or series.empty: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
657 return None, None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
658 total = len(series) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
659 if pd.isna(label): |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
660 cnt = series.isna().sum() |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
661 else: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
662 cnt = (series == label).sum() |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
663 return int(cnt), total |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
664 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
665 rows = [] |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
666 for label in labels: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
667 row = ["NaN" if pd.isna(label) else str(label)] |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
668 for split_name in ["Train", "Validation", "Test"]: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
669 cnt, total = _count_for_label(split_map.get(split_name), label) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
670 if cnt is None or total is None: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
671 cell = "—" |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
672 else: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
673 pct = (cnt / total * 100) if total else 0 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
674 cell = f"{cnt} ({pct:.1f}%)" |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
675 row.append(cell) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
676 rows.append(row) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
677 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
678 df = pd.DataFrame(rows, columns=["Label", "Train", "Validation", "Test"]) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
679 df.sort_values("Label", inplace=True) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
680 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
681 return ( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
682 "<h2>Dataset Overview</h2>" |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
683 + '<div class="table-wrapper">' |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
684 + df.to_html( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
685 index=False, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
686 classes=["table", "sortable", "table-dataset-overview"], |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
687 ) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
688 + "</div>" |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
689 ) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
690 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
691 def _predict_with_thresholds(self, X, y_true): |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
692 """ |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
693 Generate predictions/probabilities for a split, respecting an optional |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
694 probability threshold for binary tasks. Returns a dict with y_true, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
695 y_pred, y_scores (positive-class probs when available), pos_label, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
696 and neg_label. |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
697 """ |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
698 if X is None or y_true is None: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
699 return None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
700 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
701 y_true_series = pd.Series(y_true).reset_index(drop=True) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
702 classes = list(getattr(self.best_model, "classes_", [])) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
703 if not classes: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
704 try: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
705 classes = pd.unique(y_true_series).tolist() |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
706 except Exception: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
707 classes = [] |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
708 if len(classes) > 1: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
709 try: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
710 pos_idx = classes.index(1) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
711 except Exception: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
712 pos_idx = 1 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
713 else: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
714 pos_idx = 0 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
715 pos_idx = min(pos_idx, len(classes) - 1) if classes else 0 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
716 pos_label = ( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
717 classes[pos_idx] |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
718 if len(classes) > pos_idx and pos_idx >= 0 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
719 else (classes[-1] if classes else 1) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
720 ) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
721 neg_label = None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
722 if len(classes) >= 2: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
723 neg_candidates = [c for c in classes if c != pos_label] |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
724 if neg_candidates: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
725 neg_label = neg_candidates[0] |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
726 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
727 prob_thresh = getattr(self, "probability_threshold", None) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
728 y_scores = None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
729 try: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
730 proba = self.best_model.predict_proba(X) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
731 y_scores = np.asarray(proba) if proba is not None else None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
732 except Exception: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
733 y_scores = None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
734 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
735 try: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
736 if ( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
737 prob_thresh is not None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
738 and not getattr(self.exp, "is_multiclass", False) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
739 and y_scores is not None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
740 and y_scores.ndim == 2 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
741 and y_scores.shape[1] > 1 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
742 ): |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
743 pos_idx = min(pos_idx, y_scores.shape[1] - 1) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
744 neg_idx = 1 - pos_idx if y_scores.shape[1] > 1 else 0 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
745 if neg_label is None and len(classes) > neg_idx: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
746 neg_label = classes[neg_idx] |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
747 y_pred = np.where( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
748 y_scores[:, pos_idx] >= prob_thresh, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
749 pos_label, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
750 neg_label if neg_label is not None else 0, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
751 ) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
752 y_scores = y_scores[:, pos_idx] |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
753 else: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
754 y_pred = self.best_model.predict(X) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
755 if ( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
756 not getattr(self.exp, "is_multiclass", False) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
757 and y_scores is not None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
758 and y_scores.ndim == 2 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
759 and y_scores.shape[1] > 1 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
760 ): |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
761 pos_idx = min(pos_idx, y_scores.shape[1] - 1) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
762 y_scores = y_scores[:, pos_idx] |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
763 except Exception as exc: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
764 LOG.warning( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
765 "Falling back to raw predict while computing performance summary: %s", |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
766 exc, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
767 ) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
768 try: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
769 y_pred = self.best_model.predict(X) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
770 except Exception as exc_inner: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
771 LOG.warning( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
772 "Unable to score split after fallback prediction: %s", |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
773 exc_inner, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
774 ) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
775 return None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
776 y_scores = None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
777 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
778 y_pred_series = pd.Series(y_pred).reset_index(drop=True) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
779 if y_scores is not None: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
780 y_scores = np.asarray(y_scores) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
781 if y_scores.ndim > 1 and y_scores.shape[1] == 1: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
782 y_scores = y_scores.ravel() |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
783 if getattr(self.exp, "is_multiclass", False) and y_scores.ndim > 1: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
784 # Avoid passing multiclass score matrices to ROC/PR utilities |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
785 y_scores = None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
786 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
787 return { |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
788 "y_true": y_true_series, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
789 "y_pred": y_pred_series, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
790 "y_scores": y_scores, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
791 "pos_label": pos_label, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
792 "neg_label": neg_label, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
793 } |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
794 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
def _get_cv_generator(self, y_series):
    """
    Build a cross-validation splitter that mirrors the experiment's
    configuration.

    Resolution order:
      1. The experiment's own ``fold_generator`` (via ``self.exp.get_config``)
         is returned unchanged when it is available.
      2. Otherwise a shuffled ``StratifiedKFold`` seeded with
         ``self.random_seed`` is built. The fold count is the first truthy
         value among ``self.cross_validation_folds``,
         ``self.setup_params["fold"]``, ``self.exp.fold`` and ``10``, capped
         at the number of target rows.

    Args:
        y_series: Target values; anything ``pd.Series`` accepts. Only its
            length is used here.

    Returns:
        A splitter object, or ``None`` when the task is not classification,
        cross-validation is explicitly disabled (``cross_validation is
        False``), the target is empty/unusable, fewer than two folds are
        possible, or the splitter cannot be constructed.
    """
    if self.task_type != "classification":
        return None
    if getattr(self, "cross_validation", None) is False:
        return None

    # Prefer the splitter the experiment was set up with, so reported
    # metrics use the same folds as training.
    try:
        cfg_gen = self.exp.get_config("fold_generator")
    except Exception as exc:
        # Best effort: fall back to building our own splitter below.
        LOG.debug("Experiment fold_generator unavailable: %s", exc)
        cfg_gen = None
    if cfg_gen is not None:
        return cfg_gen

    folds = (
        getattr(self, "cross_validation_folds", None)
        or self.setup_params.get("fold")
        or getattr(self.exp, "fold", None)
        or 10
    )
    try:
        folds = int(folds)
    except (TypeError, ValueError, OverflowError):
        folds = 10

    try:
        y_series = pd.Series(y_series).reset_index(drop=True)
    except Exception:
        return None
    if y_series.empty:
        return None

    # Never ask for more folds than there are rows.
    folds = min(folds, len(y_series))
    if folds < 2:
        return None

    try:
        from sklearn.model_selection import StratifiedKFold

        # Only classification reaches this point (see the guard at the top),
        # so a stratified splitter is always the right choice.
        return StratifiedKFold(
            n_splits=folds,
            shuffle=True,
            random_state=self.random_seed,
        )
    except Exception as exc:
        LOG.warning("Could not build CV generator: %s", exc)
        return None
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
855 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
def _get_cross_validated_predictions(self, X, y):
    """
    Generate cross-validated predictions for the validation split so we
    can report validation metrics for the selected best model.

    For non-multiclass experiments the class probabilities are obtained with
    ``cross_val_predict(method="predict_proba")``. When a
    ``probability_threshold`` is set and a two-dimensional probability
    matrix with more than one column is available, labels are derived by
    thresholding the positive-class column; otherwise labels come from
    ``cross_val_predict(method="predict")``.

    Args:
        X: Feature rows; anything ``pd.DataFrame`` accepts.
        y: Target values; anything ``pd.Series`` accepts.

    Returns:
        ``None`` when the task is not classification, cross-validation is
        disabled, inputs are missing/empty, no CV splitter is available, or
        label prediction fails. Otherwise a dict with keys ``y_true``
        (Series), ``y_pred`` (Series), ``y_scores`` (positive-class
        probabilities, or ``None`` when unavailable), ``pos_label`` and
        ``neg_label``.
    """
    if self.task_type != "classification":
        return None
    if getattr(self, "cross_validation", None) is False:
        return None
    if X is None or y is None:
        return None

    try:
        from sklearn.model_selection import cross_val_predict
    except Exception as exc:
        LOG.warning("cross_val_predict unavailable: %s", exc)
        return None

    y_series = pd.Series(y).reset_index(drop=True)
    if y_series.empty:
        return None

    # Align features and target on the common number of rows. Truncating
    # only X would leave a mismatch whenever X is the shorter of the two.
    X_df = pd.DataFrame(X).reset_index(drop=True)
    if len(X_df) != len(y_series):
        n_rows = min(len(X_df), len(y_series))
        LOG.warning(
            "Feature/target length mismatch (%d vs %d); using first %d rows",
            len(X_df),
            len(y_series),
            n_rows,
        )
        X_df = X_df.iloc[:n_rows].reset_index(drop=True)
        y_series = y_series.iloc[:n_rows].reset_index(drop=True)
        if y_series.empty:
            return None

    cv_gen = self._get_cv_generator(y_series)
    if cv_gen is None:
        return None

    # Resolve positive/negative labels from the fitted model's classes.
    # The positive class is the label equal to 1 when present, else the
    # second class; with fewer than two known classes it is index 0.
    classes = list(getattr(self.best_model, "classes_", []))
    if len(classes) > 1:
        try:
            pos_idx = classes.index(1)
        except ValueError:
            pos_idx = 1
    else:
        pos_idx = 0
    pos_label = classes[pos_idx] if classes else 1
    neg_label = None
    if len(classes) >= 2:
        neg_label = next((c for c in classes if c != pos_label), None)

    is_multiclass = bool(getattr(self.exp, "is_multiclass", False))
    prob_thresh = getattr(self, "probability_threshold", None)
    n_jobs = getattr(self, "n_jobs", None)

    # Probabilities are only collected for non-multiclass experiments.
    y_scores = None
    if not is_multiclass:
        try:
            y_scores = np.asarray(
                cross_val_predict(
                    self.best_model,
                    X_df,
                    y_series,
                    cv=cv_gen,
                    method="predict_proba",
                    n_jobs=n_jobs,
                )
            )
        except Exception as exc:
            # Not every estimator exposes predict_proba; labels can still
            # be produced below.
            LOG.debug("Could not compute CV probabilities: %s", exc)

    has_class_columns = (
        y_scores is not None and y_scores.ndim == 2 and y_scores.shape[1] > 1
    )
    if has_class_columns:
        pos_idx = min(pos_idx, y_scores.shape[1] - 1)

    if prob_thresh is not None and has_class_columns:
        # Apply the user-supplied decision threshold to the positive column.
        y_pred = np.where(
            y_scores[:, pos_idx] >= prob_thresh,
            pos_label,
            neg_label if neg_label is not None else 0,
        )
    else:
        try:
            y_pred = cross_val_predict(
                self.best_model,
                X_df,
                y_series,
                cv=cv_gen,
                method="predict",
                n_jobs=n_jobs,
            )
        except Exception as exc:
            LOG.warning(
                "Could not compute cross-validated predictions: %s",
                exc,
            )
            return None

    if has_class_columns:
        # Report only the positive-class probability.
        y_scores = y_scores[:, pos_idx]

    return {
        "y_true": y_series,
        "y_pred": pd.Series(y_pred).reset_index(drop=True),
        "y_scores": y_scores,
        "pos_label": pos_label,
        "neg_label": neg_label,
    }
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
975 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
976 def _get_split_predictions_for_report(self): |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
977 """ |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
978 Collect predictions/probabilities for Train/Validation/Test splits so the |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
979 performance table can show consistent metrics across splits. |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
980 """ |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
981 if self.task_type != "classification": |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
982 return {} |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
983 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
984 def _get_from_config(keys): |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
985 for key in keys: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
986 try: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
987 val = self.exp.get_config(key) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
988 except Exception: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
989 val = getattr(self.exp, key, None) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
990 if val is not None: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
991 return val |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
992 return None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
993 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
994 X_train = _get_from_config(["X_train_transformed", "X_train"]) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
995 y_train = _get_from_config(["y_train_transformed", "y_train"]) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
996 X_holdout = _get_from_config(["X_test_transformed", "X_test"]) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
997 y_holdout = _get_from_config(["y_test_transformed", "y_test"]) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
998 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
999 predictions = {} |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1000 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1001 # Train metrics (best model on training data) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1002 if X_train is not None and y_train is not None: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1003 try: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1004 train_preds = self._predict_with_thresholds(X_train, y_train) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1005 if train_preds is not None: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1006 predictions["Train"] = train_preds |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1007 except Exception as exc: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1008 LOG.warning( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1009 "Could not score Train split for performance summary: %s", |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1010 exc, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1011 ) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1012 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1013 # Validation metrics via cross-validation on training data |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1014 try: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1015 val_preds = self._get_cross_validated_predictions(X_train, y_train) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1016 if val_preds is not None: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1017 predictions["Validation"] = val_preds |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1018 except Exception as exc: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1019 LOG.warning( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1020 "Could not score Validation split for performance summary: %s", |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1021 exc, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1022 ) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1023 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1024 # Test metrics (holdout from single file, or provided test file) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1025 X_test = X_holdout |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1026 y_test = y_holdout |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1027 if (X_test is None or y_test is None) and self.test_data is not None: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1028 try: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1029 X_test = self.test_data.drop(columns=[self.target]) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1030 y_test = self.test_data[self.target] |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1031 except Exception as exc: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1032 LOG.warning( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1033 "Could not prepare external test data for performance summary: %s", |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1034 exc, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1035 ) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1036 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1037 if X_test is not None and y_test is not None: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1038 try: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1039 test_preds = self._predict_with_thresholds(X_test, y_test) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1040 if test_preds is not None: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1041 predictions["Test"] = test_preds |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1042 except Exception as exc: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1043 LOG.warning( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1044 "Could not score Test split for performance summary: %s", |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1045 exc, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1046 ) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1047 return predictions |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1048 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1049 def _compute_metric_value(self, metric_name, preds, split_name): |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1050 """ |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1051 Compute a single metric for a given split prediction bundle. |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1052 """ |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1053 if preds is None: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1054 return None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1055 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1056 y_true = preds["y_true"] |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1057 y_pred = preds["y_pred"] |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1058 y_scores = preds.get("y_scores") |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1059 pos_label = preds.get("pos_label") |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1060 neg_label = preds.get("neg_label") |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1061 is_multiclass = getattr(self.exp, "is_multiclass", False) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1062 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1063 def _format_binary_labels(series): |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1064 if pos_label is None: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1065 return series |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1066 try: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1067 return (series == pos_label).astype(int) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1068 except Exception: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1069 return series |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1070 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1071 try: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1072 if metric_name == "Accuracy": |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1073 return accuracy_score(y_true, y_pred) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1074 if metric_name == "ROC-AUC": |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1075 if y_scores is None: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1076 return None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1077 y_true_bin = _format_binary_labels(y_true) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1078 if len(pd.unique(y_true_bin)) < 2: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1079 return None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1080 return roc_auc_score(y_true_bin, y_scores) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1081 if metric_name == "Precision": |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1082 if is_multiclass: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1083 return precision_score( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1084 y_true, y_pred, average="weighted", zero_division=0 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1085 ) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1086 try: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1087 return precision_score( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1088 y_true, y_pred, pos_label=pos_label, zero_division=0 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1089 ) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1090 except Exception: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1091 return precision_score( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1092 y_true, y_pred, average="weighted", zero_division=0 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1093 ) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1094 if metric_name == "Recall": |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1095 if is_multiclass: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1096 return recall_score( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1097 y_true, y_pred, average="weighted", zero_division=0 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1098 ) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1099 try: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1100 return recall_score( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1101 y_true, y_pred, pos_label=pos_label, zero_division=0 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1102 ) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1103 except Exception: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1104 return recall_score( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1105 y_true, y_pred, average="weighted", zero_division=0 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1106 ) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1107 if metric_name == "F1-Score": |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1108 if is_multiclass: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1109 return f1_score( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1110 y_true, y_pred, average="weighted", zero_division=0 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1111 ) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1112 try: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1113 return f1_score( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1114 y_true, y_pred, pos_label=pos_label, zero_division=0 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1115 ) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1116 except Exception: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1117 return f1_score( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1118 y_true, y_pred, average="weighted", zero_division=0 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1119 ) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1120 if metric_name == "PR-AUC": |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1121 if y_scores is None: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1122 return None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1123 y_true_bin = _format_binary_labels(y_true) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1124 if len(pd.unique(y_true_bin)) < 2: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1125 return None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1126 return average_precision_score(y_true_bin, y_scores) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1127 if metric_name == "Specificity": |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1128 labels = pd.unique(pd.concat([y_true, y_pred], ignore_index=True)) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1129 if len(labels) != 2: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1130 return None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1131 if pos_label is None or pos_label not in labels: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1132 pos_label = labels[1] |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1133 neg_candidates = [lbl for lbl in labels if lbl != pos_label] |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1134 neg_label_final = ( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1135 neg_label if neg_label in labels else (neg_candidates[0] if neg_candidates else None) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1136 ) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1137 if neg_label_final is None: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1138 return None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1139 cm = confusion_matrix( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1140 y_true, y_pred, labels=[neg_label_final, pos_label] |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1141 ) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1142 if cm.shape != (2, 2): |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1143 return None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1144 tn, fp, fn, tp = cm.ravel() |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1145 denom = tn + fp |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1146 return (tn / denom) if denom else None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1147 if metric_name == "MCC": |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1148 return matthews_corrcoef(y_true, y_pred) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1149 except Exception as exc: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1150 LOG.warning( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1151 "Could not compute %s for %s split: %s", |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1152 metric_name, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1153 split_name, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1154 exc, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1155 ) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1156 return None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1157 return None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1158 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1159 def _build_performance_summary_table(self): |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1160 """ |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1161 Build a Train/Validation/Test metrics table for classification tasks. |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1162 Returns empty string when metrics are unavailable or not applicable. |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1163 """ |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1164 if self.task_type != "classification": |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1165 return "" |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1166 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1167 split_predictions = self._get_split_predictions_for_report() |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1168 validation_best_row = None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1169 try: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1170 if isinstance(self.results, pd.DataFrame) and not self.results.empty: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1171 validation_best_row = self.results.iloc[0] |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1172 except Exception: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1173 validation_best_row = None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1174 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1175 if not split_predictions and validation_best_row is None: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1176 return "" |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1177 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1178 metric_names = [ |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1179 "Accuracy", |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1180 "ROC-AUC", |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1181 "Precision", |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1182 "Recall", |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1183 "F1-Score", |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1184 "PR-AUC", |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1185 "Specificity", |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1186 "MCC", |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1187 ] |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1188 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1189 validation_column_map = { |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1190 "Accuracy": ["Accuracy"], |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1191 "ROC-AUC": ["ROC-AUC", "AUC"], |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1192 "Precision": ["Precision", "Prec.", "Prec"], |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1193 "Recall": ["Recall"], |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1194 "F1-Score": ["F1-Score", "F1"], |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1195 "PR-AUC": ["PR-AUC", "PR-AUC-Weighted", "PRC"], |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1196 "Specificity": ["Specificity"], |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1197 "MCC": ["MCC"], |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1198 } |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1199 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1200 def _fmt(value): |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1201 if value is None: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1202 return "—" |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1203 try: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1204 if isinstance(value, (float, np.floating)) and ( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1205 np.isnan(value) or np.isinf(value) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1206 ): |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1207 return "—" |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1208 return f"{value:.3f}" |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1209 except Exception: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1210 return str(value) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1211 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1212 def _validation_metric(metric_name): |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1213 if validation_best_row is None: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1214 return None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1215 cols = validation_column_map.get(metric_name, []) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1216 for col in cols: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1217 if col in validation_best_row: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1218 try: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1219 return validation_best_row[col] |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1220 except Exception: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1221 return None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1222 return None |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1223 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1224 rows = [] |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1225 for metric in metric_names: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1226 row = [metric] |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1227 # Train |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1228 train_val = self._compute_metric_value( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1229 metric, split_predictions.get("Train"), "Train" |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1230 ) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1231 row.append(_fmt(train_val)) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1232 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1233 # Validation from Train & Validation Summary first row; fallback to computed CV. |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1234 val_val = _validation_metric(metric) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1235 if val_val is None: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1236 val_val = self._compute_metric_value( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1237 metric, split_predictions.get("Validation"), "Validation" |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1238 ) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1239 row.append(_fmt(val_val)) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1240 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1241 # Test |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1242 test_val = self._compute_metric_value( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1243 metric, split_predictions.get("Test"), "Test" |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1244 ) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1245 row.append(_fmt(test_val)) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1246 rows.append(row) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1247 |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1248 df = pd.DataFrame(rows, columns=["Metric", "Train", "Validation", "Test"]) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1249 return ( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1250 "<h2>Model Performance Summary</h2>" |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1251 + '<div class="table-wrapper">' |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1252 + df.to_html( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1253 index=False, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1254 classes=["table", "sortable", "table-perf-summary"], |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1255 ) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1256 + "</div>" |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1257 ) |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1258 |
|
12
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1259 def _resolve_plot_callable(self, key, fig_or_fn, section): |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1260 """ |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1261 Safely execute stored plot callables so a single failure does not |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1262 abort the entire HTML report generation. |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1263 """ |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1264 if fig_or_fn is None: |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1265 return None |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1266 try: |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1267 return fig_or_fn() if callable(fig_or_fn) else fig_or_fn |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1268 except Exception as exc: |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1269 extra = "" |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1270 if isinstance(exc, ValueError) and "Input contains NaN" in str(exc): |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1271 extra = ( |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1272 " (model returned NaN probabilities; " |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1273 "consider checking data preprocessing)" |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1274 ) |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1275 LOG.warning( |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1276 "Skipping %s plot '%s' due to error: %s%s", |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1277 section, |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1278 key, |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1279 exc, |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1280 extra, |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1281 ) |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1282 return None |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1283 |
|
0
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
1284 def save_html_report(self): |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
1285 LOG.info("Saving HTML report") |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
1286 |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1287 # 1) Determine best model name |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1288 try: |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1289 best_model_name = str(self.results.iloc[0]["Model"]) |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1290 except Exception: |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1291 best_model_name = type(self.best_model).__name__ |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1292 LOG.info(f"Best model determined as: {best_model_name}") |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1293 |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1294 # 2) Compute training sample count |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1295 try: |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1296 n_train = self.exp.X_train.shape[0] |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1297 except Exception: |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1298 n_train = getattr( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1299 self.exp, "X_train_transformed", pd.DataFrame() |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1300 ).shape[0] |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1301 total_rows = self.data.shape[0] |
|
2
77c88226bfde
planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents:
0
diff
changeset
|
1302 |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1303 # 3) Build setup parameters table |
|
5
3d42f82b3c7f
planemo upload for repository https://github.com/goeckslab/gleam commit 4a11e8a4c4e9daa884bddedfa47090476c517667
goeckslab
parents:
4
diff
changeset
|
1304 all_params = self.setup_params.copy() |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1305 if self.task_type == "classification" and ( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1306 hasattr(self, "probability_threshold") |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1307 ): |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1308 all_params["probability_threshold"] = ( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1309 self.probability_threshold |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1310 ) |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1311 display_keys = [ |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1312 "Target", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1313 "Session ID", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1314 "Train Size", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1315 "Normalize", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1316 "Feature Selection", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1317 "Cross Validation", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1318 "Cross Validation Folds", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1319 "Remove Outliers", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1320 "Remove Multicollinearity", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1321 "Polynomial Features", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1322 "Fix Imbalance", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1323 "Models", |
|
5
3d42f82b3c7f
planemo upload for repository https://github.com/goeckslab/gleam commit 4a11e8a4c4e9daa884bddedfa47090476c517667
goeckslab
parents:
4
diff
changeset
|
1324 "Probability Threshold", |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1325 ] |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1326 setup_rows = [] |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1327 for key in display_keys: |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1328 pk = key.lower().replace(" ", "_") |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1329 v = all_params.get(pk) |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1330 if key == "Train Size": |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1331 frac = ( |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1332 float(v) |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1333 if v is not None |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1334 else (n_train / total_rows if total_rows else 0) |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1335 ) |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1336 dv = f"{frac:.2f} ({n_train} rows)" |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1337 elif key in { |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1338 "Normalize", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1339 "Feature Selection", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1340 "Cross Validation", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1341 "Remove Outliers", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1342 "Remove Multicollinearity", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1343 "Polynomial Features", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1344 "Fix Imbalance", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1345 }: |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1346 dv = bool(v) |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1347 elif key == "Cross Validation Folds": |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1348 dv = v if v is not None else "None" |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1349 elif key == "Models": |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1350 dv = ", ".join(map(str, v)) if isinstance( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1351 v, (list, tuple) |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1352 ) else "None" |
|
5
3d42f82b3c7f
planemo upload for repository https://github.com/goeckslab/gleam commit 4a11e8a4c4e9daa884bddedfa47090476c517667
goeckslab
parents:
4
diff
changeset
|
1353 elif key == "Probability Threshold": |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1354 dv = f"{v:.2f}" if v is not None else "0.5" |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1355 else: |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1356 dv = v if v is not None else "None" |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1357 setup_rows.append([key, dv]) |
|
12
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1358 metric_label = self._best_model_metric_used or getattr( |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1359 self.exp, "_fold_metric", None |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1360 ) |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1361 if metric_label: |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1362 setup_rows.append(["Best Model Metric", metric_label]) |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1363 |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1364 df_setup = pd.DataFrame(setup_rows, columns=["Parameter", "Value"]) |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1365 df_setup.to_csv( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1366 Path(self.output_dir) / "setup_params.csv", index=False |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1367 ) |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1368 |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1369 # 4) Persist CSVs |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1370 self.results.to_csv( |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1371 Path(self.output_dir) / "comparison_results.csv", |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1372 index=False |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1373 ) |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1374 self.test_result_df.to_csv( |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1375 Path(self.output_dir) / "test_results.csv", index=False |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1376 ) |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1377 pd.DataFrame( |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1378 self.best_model.get_params().items(), |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1379 columns=["Parameter", "Value"] |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1380 ).to_csv(Path(self.output_dir) / "best_model.csv", index=False) |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1381 |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1382 if self.tuning_results is not None: |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1383 self.tuning_results.to_csv( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1384 Path(self.output_dir) / "tuning_results.csv", |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1385 index=False |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1386 ) |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1387 |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1388 # 5) Header |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1389 header = f"<h2>Best Model: {best_model_name}</h2>" |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1390 |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1391 # — Validation Summary & Configuration — |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1392 val_df = self.results.copy() |
|
13
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1393 dataset_overview_html = self._build_dataset_overview() |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1394 performance_summary_html = self._build_performance_summary_table() |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1395 # mapping raw plot keys to user-friendly titles |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1396 plot_title_map = { |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1397 "learning": "Learning Curve", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1398 "vc": "Validation Curve", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1399 "calibration": "Calibration Curve", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1400 "dimension": "Dimensionality Reduction", |
|
13
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1401 "manifold": "t-SNE", |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1402 "rfe": "Recursive Feature Elimination", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1403 "threshold": "Threshold Plot", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1404 "percentage_above_below": "Percentage Above vs. Below Cutoff", |
|
13
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1405 "class_report": "Per-Class Metrics", |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1406 "pr_auc": "Precision-Recall AUC", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1407 "roc_auc": "Receiver Operating Characteristic AUC", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1408 "residuals": "Residuals Distribution", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1409 "error": "Prediction Error Distribution", |
|
0
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
1410 } |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1411 val_df.drop( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1412 columns=["TT (Ec)", "TT (Sec)"], errors="ignore", inplace=True |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1413 ) |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1414 summary_html = ( |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1415 header |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1416 + "<h2>Train & Validation Summary</h2>" |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1417 + '<div class="table-wrapper">' |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1418 + val_df.to_html(index=False, classes="table sortable") |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1419 + "</div>" |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1420 ) |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1421 |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1422 if self.tuning_results is not None: |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1423 tuning_df = self.tuning_results.copy() |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1424 tuning_df.drop( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1425 columns=["TT (Sec)"], errors="ignore", inplace=True |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1426 ) |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1427 summary_html += ( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1428 f"<h2>{best_model_name}: Tuning Summary</h2>" |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1429 + '<div class="table-wrapper">' |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1430 + tuning_df.to_html(index=False, classes="table sortable") |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1431 + "</div>" |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1432 ) |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1433 |
|
13
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1434 config_html = ( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1435 header |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1436 + dataset_overview_html |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1437 + performance_summary_html |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1438 + "<h2>Setup Parameters</h2>" |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1439 + '<div class="table-wrapper">' |
|
13
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1440 + df_setup.to_html( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1441 index=False, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1442 classes=["table", "sortable", "table-setup-params"], |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1443 ) |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1444 + "</div>" |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1445 # — Hyperparameters |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1446 + "<h2>Best Model Hyperparameters</h2>" |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1447 + '<div class="table-wrapper">' |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1448 + pd.DataFrame( |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1449 self.best_model.get_params().items(), |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1450 columns=["Parameter", "Value"] |
|
13
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1451 ).to_html( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1452 index=False, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1453 classes=["table", "sortable", "table-hyperparams"], |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1454 ) |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1455 + "</div>" |
|
0
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
1456 ) |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
1457 |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1458 # choose summary plots based on task type |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1459 if self.task_type == "classification": |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1460 summary_plots = [ |
|
13
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1461 "threshold", |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1462 "learning", |
|
13
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1463 "calibration", |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1464 "rfe", |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1465 "vc", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1466 "dimension", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1467 "manifold", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1468 "percentage_above_below", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1469 ] |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1470 else: |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1471 summary_plots = ["learning", "vc", "parameter", "residuals"] |
|
0
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
1472 |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1473 for name in summary_plots: |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1474 if name in self.plots: |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1475 summary_html += "<hr>" |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1476 b64 = encode_image_to_base64(self.plots[name]) |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1477 title = plot_title_map.get( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1478 name, name.replace("_", " ").title() |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1479 ) |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1480 summary_html += ( |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1481 '<div class="plot">' |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1482 f"<h2>{title}</h2>" |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1483 f'<img src="data:image/png;base64,{b64}" ' |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1484 'style="max-width:90%;max-height:600px;' |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1485 'border:1px solid #ddd;"/>' |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1486 "</div>" |
|
2
77c88226bfde
planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents:
0
diff
changeset
|
1487 ) |
|
0
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
1488 |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1489 # — Test Summary — |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1490 test_html = ( |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1491 header |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1492 + '<div class="table-wrapper">' |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1493 + self.test_result_df.to_html( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1494 index=False, classes="table sortable" |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1495 ) |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1496 + "</div>" |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1497 ) |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1498 if self.task_type == "regression": |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1499 try: |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1500 y_true = ( |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1501 pd.Series(self.exp.y_test_transformed) |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1502 .reset_index(drop=True) |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1503 .rename("True") |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1504 ) |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1505 y_pred = pd.Series( |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1506 self.best_model.predict( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1507 self.exp.X_test_transformed |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1508 ) |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1509 ).rename("Predicted") |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1510 df_tp = pd.concat([y_true, y_pred], axis=1) |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1511 test_html += "<h2>True vs Predicted Values</h2>" |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1512 test_html += ( |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1513 '<div class="table-wrapper" ' |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1514 'style="max-height:400px; overflow-y:auto;">' |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1515 + df_tp.head(50).to_html( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1516 index=False, classes="table sortable" |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1517 ) |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1518 + "</div>" |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1519 + add_hr_to_html() |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1520 ) |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1521 except Exception as e: |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1522 LOG.warning( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1523 f"Could not generate True vs Predicted table: {e}" |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1524 ) |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1525 |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1526 # 5a) Explainer-substituted plots in order |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1527 if self.task_type == "regression": |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1528 test_order = ["residuals"] |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1529 else: |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1530 test_order = [ |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1531 "confusion_matrix", |
|
13
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1532 "class_report", |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1533 "roc_auc", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1534 "pr_auc", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1535 "lift_curve", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1536 "cumulative_precision", |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1537 ] |
|
13
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1538 rendered_test_plots = set() |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1539 for key in test_order: |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1540 fig_or_fn = self.explainer_plots.pop(key, None) |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1541 if fig_or_fn is not None: |
|
12
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1542 fig = self._resolve_plot_callable( |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1543 key, fig_or_fn, section="test/explainer" |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1544 ) |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1545 if fig is None: |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1546 continue |
|
13
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1547 rendered_test_plots.add(key) |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1548 title = plot_title_map.get( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1549 key, key.replace("_", " ").title() |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1550 ) |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1551 test_html += ( |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1552 f"<h2>{title}</h2>" + add_plot_to_html(fig) |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1553 + add_hr_to_html() |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1554 ) |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1555 # 5b) Remaining PyCaret test plots |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1556 for name, path in self.plots.items(): |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1557 # classification: include only the small extras, before |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1558 # skipping anything |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1559 if self.task_type == "classification" and ( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1560 name in { |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1561 "pr_auc", |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1562 "class_report", |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1563 } |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1564 ): |
|
13
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1565 if name in rendered_test_plots: |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1566 continue |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1567 title = plot_title_map.get( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1568 name, name.replace("_", " ").title() |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1569 ) |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1570 b64 = encode_image_to_base64(path) |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1571 test_html += ( |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1572 f"<h2>{title}</h2>" |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1573 "<div class='plot'>" |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1574 f"<img src='data:image/png;base64,{b64}' " |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1575 "style='max-width:90%;max-height:600px;" |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1576 "border:1px solid #ddd;'/>" |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1577 "</div>" + add_hr_to_html() |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1578 ) |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1579 continue |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1580 |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1581 # regression: explicitly include the 'error' plot, |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1582 # before skipping |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1583 if self.task_type == "regression" and ( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1584 name == "error" |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1585 ): |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1586 title = plot_title_map.get( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1587 "error", "Prediction Error Distribution" |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1588 ) |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1589 b64 = encode_image_to_base64(path) |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1590 test_html += ( |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1591 f"<h2>{title}</h2>" |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1592 "<div class='plot'>" |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1593 f"<img src='data:image/png;base64,{b64}' " |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1594 "style='max-width:90%;max-height:600px;" |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1595 "border:1px solid #ddd;'/>" |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1596 "</div>" + add_hr_to_html() |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1597 ) |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1598 continue |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1599 |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1600 # now skip any plots already rendered via test_order |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1601 if name in test_order: |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1602 continue |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1603 |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1604 # — Feature Importance — |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1605 feature_html = header |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1606 |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1607 # 6a) PyCaret’s default feature importances |
|
12
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1608 imputed_data = ( |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1609 self.imputed_training_data |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1610 if self.imputed_training_data is not None |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1611 else self.data |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1612 ) |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1613 fi_analyzer = FeatureImportanceAnalyzer( |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1614 data=imputed_data, |
|
0
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
1615 target_col=self.target_col, |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
1616 task_type=self.task_type, |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
1617 output_dir=self.output_dir, |
|
2
77c88226bfde
planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents:
0
diff
changeset
|
1618 exp=self.exp, |
|
77c88226bfde
planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents:
0
diff
changeset
|
1619 best_model=self.best_model, |
|
12
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1620 max_plot_features=self.plot_feature_limit, |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1621 processed_data=self.imputed_training_data, |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1622 max_shap_rows=self._shap_row_cap, |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1623 ) |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1624 fi_html = fi_analyzer.run() |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1625 # Add a small table to show SHAP feature caps near the Best Model header. |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1626 cap_rows = [] |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1627 if fi_analyzer.shap_total_features is not None: |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1628 cap_rows.append( |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1629 ("Total transformed features", fi_analyzer.shap_total_features) |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1630 ) |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1631 if fi_analyzer.shap_used_features is not None: |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1632 cap_rows.append( |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1633 ("Features used in SHAP", fi_analyzer.shap_used_features) |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1634 ) |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1635 if cap_rows: |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1636 cap_table = ( |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1637 "<div class='table-wrapper'>" |
|
13
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1638 "<table class='table sortable table-fi-scope'>" |
|
12
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1639 "<thead><tr><th>Feature Importance Scope</th><th>Count</th></tr></thead>" |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1640 "<tbody>" |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1641 + "".join( |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1642 f"<tr><td>{label}</td><td>{value}</td></tr>" |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1643 for label, value in cap_rows |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1644 ) |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1645 + "</tbody></table></div>" |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1646 ) |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1647 feature_html += cap_table |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1648 feature_html += fi_html |
|
0
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
1649 |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1650 # 6b) Explainer SHAP importances |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1651 for key in ["shap_mean", "shap_perm"]: |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1652 fig_or_fn = self.explainer_plots.pop(key, None) |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1653 if fig_or_fn is not None: |
|
12
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1654 fig = self._resolve_plot_callable( |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1655 key, fig_or_fn, section="feature importance" |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1656 ) |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1657 if fig is None: |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1658 continue |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1659 # give SHAP plots explicit titles |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1660 title = ( |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1661 "Mean Absolute SHAP Value Impact" |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1662 if key == "shap_mean" |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1663 else "Permutation Feature Importance" |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1664 ) |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1665 feature_html += ( |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1666 f"<h2>{title}</h2>" + add_plot_to_html(fig) |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1667 + add_hr_to_html() |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1668 ) |
|
2
77c88226bfde
planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents:
0
diff
changeset
|
1669 |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1670 # 6c) PDPs last |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1671 pdp_keys = sorted( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1672 k for k in self.explainer_plots if k.startswith("pdp__") |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1673 ) |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1674 for k in pdp_keys: |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1675 fig_or_fn = self.explainer_plots[k] |
|
12
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1676 fig = self._resolve_plot_callable( |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1677 k, fig_or_fn, section="pdp" |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1678 ) |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1679 if fig is None: |
|
15707141e7da
planemo upload for repository https://github.com/goeckslab/gleam commit 2b826699ef9518d4610f5cfb6468ce719ec8039d
goeckslab
parents:
10
diff
changeset
|
1680 continue |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1681 # extract feature name |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1682 feature = k.split("__", 1)[1] |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1683 title = f"Partial Dependence for {feature}" |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1684 feature_html += ( |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1685 f"<h2>{title}</h2>" + add_plot_to_html(fig) |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1686 + add_hr_to_html() |
|
2
77c88226bfde
planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents:
0
diff
changeset
|
1687 ) |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1688 # 7) Assemble final HTML (three tabs) |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1689 html = get_html_template() |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1690 html += "<h1>Tabular Learner Model Report</h1>" |
|
13
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1691 html += build_tabbed_html( |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1692 summary_html, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1693 test_html, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1694 feature_html, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1695 explainer_html=None, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1696 config_html=config_html, |
|
bf0df21a1ea3
planemo upload for repository https://github.com/goeckslab/gleam commit 7fc20c9ddc2b641975138c9d67b5da240af0484c
goeckslab
parents:
12
diff
changeset
|
1697 ) |
|
4
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1698 html += get_feature_metrics_help_modal() |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1699 html += get_html_closing() |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1700 |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1701 # 8) Write out |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1702 (Path(self.output_dir) / "comparison_result.html").write_text( |
|
11fdac5affb3
planemo upload for repository https://github.com/goeckslab/gleam commit 8112548ac44b7a4769093d76c722c8fcdeaaef54
goeckslab
parents:
3
diff
changeset
|
1703 html, encoding="utf-8" |
|
2
77c88226bfde
planemo upload for repository https://github.com/goeckslab/gleam commit 06c0da44ac93256dfb616a6b40276b5485a71e8e
goeckslab
parents:
0
diff
changeset
|
1704 ) |
|
6
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1705 LOG.info( |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1706 f"HTML report generated at: " |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1707 f"{self.output_dir}/comparison_result.html" |
|
4bd75b45a7a1
planemo upload for repository https://github.com/goeckslab/gleam commit 47a5977e074223e92e216efa42969a4056516707
goeckslab
parents:
5
diff
changeset
|
1708 ) |
|
0
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
1709 |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
def save_dashboard(self):
    """Persist the dashboard; placeholder that concrete trainers override.

    Raises:
        NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError("Subclasses should implement this method")
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
1712 |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
def generate_plots_explainer(self):
    """Build the explainer plots; placeholder that concrete trainers override.

    Raises:
        NotImplementedError: always, in this base implementation.
    """
    raise NotImplementedError("Subclasses should implement this method")
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
1715 |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
def generate_tree_plots(self):
    """Render one tree plot per estimator of ``self.best_model``.

    Handles scikit-learn random forests and XGBoost estimators. For any
    other model type a warning is logged and the method returns without
    producing plots. Every rendered tree is appended to ``self.trees``.
    """
    # These packages are imported inside the method rather than at module
    # level, so they are only loaded when tree plots are requested.
    from explainerdashboard.explainers import RandomForestExplainer
    from sklearn.ensemble import (
        RandomForestClassifier, RandomForestRegressor
    )
    from xgboost import XGBClassifier, XGBRegressor

    LOG.info("Generating tree plots")

    # Decide whether the model is supported before touching the test data,
    # so unsupported models return without copying the test matrix.
    if isinstance(
        self.best_model, (RandomForestClassifier, RandomForestRegressor)
    ):
        n_trees = self.best_model.n_estimators
    elif isinstance(self.best_model, (XGBClassifier, XGBRegressor)):
        # The booster dump has one entry per tree.
        n_trees = len(self.best_model.get_booster().get_dump())
    else:
        LOG.warning("Tree plots not supported for this model type.")
        return

    X_test = self.exp.X_test_transformed.copy()
    y_test = self.exp.y_test_transformed

    # NOTE(review): RandomForestExplainer is also used for XGBoost models
    # here; confirm against the explainerdashboard docs whether a dedicated
    # XGBoost explainer should be used instead.
    explainer = RandomForestExplainer(self.best_model, X_test, y_test)
    for i in range(n_trees):
        # index=0 presumably selects the first test row -- TODO confirm.
        fig = explainer.decisiontree_encoded(tree_idx=i, index=0)
        self.trees.append(fig)
|
0
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
1741 |
|
209b663a4f62
planemo upload for repository https://github.com/goeckslab/gleam commit 5dd048419fcbd285a327f88267e93996cd279ee6
goeckslab
parents:
diff
changeset
|
def run(self):
    """Execute the full training pipeline in a fixed order.

    Steps: load the data, set up PyCaret, train and save the model, build
    the standard, explainer and tree plots, then write the HTML report.
    Any exception raised by a step propagates and stops the pipeline.
    """
    self.load_data()
    self.setup_pycaret()
    self.train_model()
    self.save_model()
    self.generate_plots()
    self.generate_plots_explainer()
    self.generate_tree_plots()
    self.save_html_report()
    # NOTE: the dashboard step is disabled in the original source.
    # self.save_dashboard()
