comparison utils.py @ 11:c5150cceab47 draft default tip

planemo upload for repository https://github.com/goeckslab/gleam.git commit 0fe927b618cd4dfc87af7baaa827034cc6813225
author goeckslab
date Sat, 18 Oct 2025 03:17:09 +0000
parents b0d893d04d4c
children
comparison
equal deleted inserted replaced
10:b0d893d04d4c 11:c5150cceab47
102 table.performance-summary th.sortable.sorted-desc::after { content: '↓'; color: #ffffff; } 102 table.performance-summary th.sortable.sorted-desc::after { content: '↓'; color: #ffffff; }
103 103
104 /* show ~30 rows with a scrollbar (tweak if you want) */ 104 /* show ~30 rows with a scrollbar (tweak if you want) */
105 .scroll-rows-30 { 105 .scroll-rows-30 {
106 max-height: 900px; /* ~30 rows depending on row height */ 106 max-height: 900px; /* ~30 rows depending on row height */
107 overflow-y: auto; /* vertical scrollbar (“sidebar”) */ 107 overflow-y: auto; /* vertical scrollbar ("sidebar") */
108 overflow-x: auto; 108 overflow-x: auto;
109 } 109 }
110 110
111 /* Tabs + Help button (used by build_tabbed_html) */ 111 /* Tabs + Help button (used by build_tabbed_html) */
112 .tabs { 112 .tabs {
210 if (!isNaN(n1) && !isNaN(n2)) return asc ? n1 - n2 : n2 - n1; // numeric 210 if (!isNaN(n1) && !isNaN(n2)) return asc ? n1 - n2 : n2 - n1; // numeric
211 return asc ? v1.localeCompare(v2) : v2.localeCompare(v1); // lexical 211 return asc ? v1.localeCompare(v2) : v2.localeCompare(v1); // lexical
212 }; 212 };
213 213
214 document.querySelectorAll('table.performance-summary th.sortable').forEach(th => { 214 document.querySelectorAll('table.performance-summary th.sortable').forEach(th => {
215 // initialize to “none” 215 // initialize to "none"
216 th.classList.remove('sorted-asc','sorted-desc'); 216 th.classList.remove('sorted-asc','sorted-desc');
217 th.classList.add('sorted-none'); 217 th.classList.add('sorted-none');
218 218
219 th.addEventListener('click', () => { 219 th.addEventListener('click', () => {
220 const table = th.closest('table'); 220 const table = th.closest('table');
392 '<div id="metricsHelpModal" class="modal">' 392 '<div id="metricsHelpModal" class="modal">'
393 ' <div class="modal-content">' 393 ' <div class="modal-content">'
394 ' <span class="close">×</span>' 394 ' <span class="close">×</span>'
395 " <h2>Model Evaluation Metrics — Help Guide</h2>" 395 " <h2>Model Evaluation Metrics — Help Guide</h2>"
396 ' <div class="metrics-guide">' 396 ' <div class="metrics-guide">'
397 " <h3>1) General Metrics (Regression and Classification)</h3>" 397 ' <h3>1) General Metrics (Regression and Classification)</h3>'
398 " <p><strong>Loss (Regression & Classification):</strong> " 398 ' <p><strong>Loss (Regression & Classification):</strong> '
399 "Measures the difference between predicted and actual values, " 399 'Measures the difference between predicted and actual values, '
400 "optimized during training. Lower is better. " 400 'optimized during training. Lower is better. '
401 "For regression, this is often Mean Squared Error (MSE) or " 401 'For regression, this is often Mean Squared Error (MSE) or '
402 "Mean Absolute Error (MAE). For classification, it’s typically " 402 'Mean Absolute Error (MAE). For classification, it\'s typically '
403 "cross-entropy or log loss.</p>" 403 'cross-entropy or log loss.</p>'
404 " <h3>2) Regression Metrics</h3>" 404 ' <h3>2) Regression Metrics</h3>'
405 " <p><strong>Mean Absolute Error (MAE):</strong> " 405 ' <p><strong>Mean Absolute Error (MAE):</strong> '
406 "Average of absolute differences between predicted and actual values, " 406 'Average of absolute differences between predicted and actual values, '
407 "in the same units as the target. Use for interpretable error measurement " 407 'in the same units as the target. Use for interpretable error measurement '
408 "when all errors are equally important. Less sensitive to outliers than MSE.</p>" 408 'when all errors are equally important. Less sensitive to outliers than MSE.</p>'
409 " <p><strong>Mean Squared Error (MSE):</strong> " 409 ' <p><strong>Mean Squared Error (MSE):</strong> '
410 "Average of squared differences between predicted and actual values. " 410 'Average of squared differences between predicted and actual values. '
411 "Penalizes larger errors more heavily, useful when large deviations are critical. " 411 'Penalizes larger errors more heavily, useful when large deviations are critical. '
412 "Often used as the loss function in regression.</p>" 412 'Often used as the loss function in regression.</p>'
413 " <p><strong>Root Mean Squared Error (RMSE):</strong> " 413 ' <p><strong>Root Mean Squared Error (RMSE):</strong> '
414 "Square root of MSE, in the same units as the target. " 414 'Square root of MSE, in the same units as the target. '
415 "Balances interpretability and sensitivity to large errors. " 415 'Balances interpretability and sensitivity to large errors. '
416 "Widely used for regression evaluation.</p>" 416 'Widely used for regression evaluation.</p>'
417 " <p><strong>Mean Absolute Percentage Error (MAPE):</strong> " 417 ' <p><strong>Mean Absolute Percentage Error (MAPE):</strong> '
418 "Average absolute error as a percentage of actual values. " 418 'Average absolute error as a percentage of actual values. '
419 "Scale-independent, ideal for comparing relative errors across datasets. " 419 'Scale-independent, ideal for comparing relative errors across datasets. '
420 "Avoid when actual values are near zero.</p>" 420 'Avoid when actual values are near zero.</p>'
421 " <p><strong>Root Mean Squared Percentage Error (RMSPE):</strong> " 421 ' <p><strong>Root Mean Squared Percentage Error (RMSPE):</strong> '
422 "Square root of mean squared percentage error. Scale-independent, " 422 'Square root of mean squared percentage error. Scale-independent, '
423 "penalizes larger relative errors more than MAPE. Use for forecasting " 423 'penalizes larger relative errors more than MAPE. Use for forecasting '
424 "or when relative accuracy matters.</p>" 424 'or when relative accuracy matters.</p>'
425 " <p><strong>R² Score:</strong> Proportion of variance in the target " 425 ' <p><strong>R² Score:</strong> Proportion of variance in the target '
426 "explained by the model. Ranges from negative infinity to 1 (perfect prediction). " 426 'explained by the model. Ranges from negative infinity to 1 (perfect prediction). '
427 "Use to assess model fit; negative values indicate poor performance " 427 'Use to assess model fit; negative values indicate poor performance '
428 "compared to predicting the mean.</p>" 428 'compared to predicting the mean.</p>'
429 " <h3>3) Classification Metrics</h3>" 429 ' <h3>3) Classification Metrics</h3>'
430 " <p><strong>Accuracy:</strong> Proportion of correct predictions " 430 ' <p><strong>Accuracy:</strong> Proportion of correct predictions '
431 "among all predictions. Simple but misleading for imbalanced datasets, " 431 'among all predictions. Simple but misleading for imbalanced datasets, '
432 "where high accuracy may hide poor performance on minority classes.</p>" 432 'where high accuracy may hide poor performance on minority classes.</p>'
433 " <p><strong>Micro Accuracy:</strong> Sums true positives and true negatives " 433 ' <p><strong>Micro Accuracy:</strong> Sums true positives and true negatives '
434 "across all classes before computing accuracy. Suitable for multiclass or " 434 'across all classes before computing accuracy. Suitable for multiclass or '
435 "multilabel problems with imbalanced data.</p>" 435 'multilabel problems with imbalanced data.</p>'
436 " <p><strong>Token Accuracy:</strong> Measures how often predicted tokens " 436 ' <p><strong>Token Accuracy:</strong> Measures how often predicted tokens '
437 "(e.g., in sequences) match true tokens. Common in NLP tasks like text generation " 437 '(e.g., in sequences) match true tokens. Common in NLP tasks like text generation '
438 "or token classification.</p>" 438 'or token classification.</p>'
439 " <p><strong>Precision:</strong> Proportion of positive predictions that are " 439 ' <p><strong>Precision:</strong> Proportion of positive predictions that are '
440 "correct (TP / (TP + FP)). Use when false positives are costly, e.g., spam detection.</p>" 440 'correct (TP / (TP + FP)). Use when false positives are costly, e.g., spam detection.</p>'
441 " <p><strong>Recall (Sensitivity):</strong> Proportion of actual positives " 441 ' <p><strong>Recall (Sensitivity):</strong> Proportion of actual positives '
442 "correctly predicted (TP / (TP + FN)). Use when missing positives is risky, " 442 'correctly predicted (TP / (TP + FN)). Use when missing positives is risky, '
443 "e.g., disease detection.</p>" 443 'e.g., disease detection.</p>'
444 " <p><strong>Specificity:</strong> True negative rate (TN / (TN + FP)). " 444 ' <p><strong>Specificity:</strong> True negative rate (TN / (TN + FP)). '
445 "Measures ability to identify negatives. Useful in medical testing to avoid " 445 'Measures ability to identify negatives. Useful in medical testing to avoid '
446 "false alarms.</p>" 446 'false alarms.</p>'
447 " <h3>4) Classification: Macro, Micro, and Weighted Averages</h3>" 447 ' <h3>4) Classification: Macro, Micro, and Weighted Averages</h3>'
448 " <p><strong>Macro Precision / Recall / F1:</strong> Averages the metric " 448 ' <p><strong>Macro Precision / Recall / F1:</strong> Averages the metric '
449 "across all classes, treating each equally. Best for balanced datasets where " 449 'across all classes, treating each equally. Best for balanced datasets where '
450 "all classes are equally important.</p>" 450 'all classes are equally important.</p>'
451 " <p><strong>Micro Precision / Recall / F1:</strong> Aggregates true positives, " 451 ' <p><strong>Micro Precision / Recall / F1:</strong> Aggregates true positives, '
452 "false positives, and false negatives across all classes before computing. " 452 'false positives, and false negatives across all classes before computing. '
453 "Ideal for imbalanced or multilabel classification.</p>" 453 'Ideal for imbalanced or multilabel classification.</p>'
454 " <p><strong>Weighted Precision / Recall / F1:</strong> Averages metrics " 454 ' <p><strong>Weighted Precision / Recall / F1:</strong> Averages metrics '
455 "across classes, weighted by the number of true instances per class. Balances " 455 'across classes, weighted by the number of true instances per class. Balances '
456 "class importance based on frequency.</p>" 456 'class importance based on frequency.</p>'
457 " <h3>5) Classification: Average Precision (PR-AUC Variants)</h3>" 457 ' <h3>5) Classification: Average Precision (PR-AUC Variants)</h3>'
458 " <p><strong>Average Precision Macro:</strong> Precision-Recall AUC averaged " 458 ' <p><strong>Average Precision Macro:</strong> Precision-Recall AUC averaged '
459 "equally across classes. Use for balanced multiclass problems.</p>" 459 'equally across classes. Use for balanced multiclass problems.</p>'
460 " <p><strong>Average Precision Micro:</strong> Global Precision-Recall AUC " 460 ' <p><strong>Average Precision Micro:</strong> Global Precision-Recall AUC '
461 "using all instances. Best for imbalanced or multilabel classification.</p>" 461 'using all instances. Best for imbalanced or multilabel classification.</p>'
462 " <p><strong>Average Precision Samples:</strong> Precision-Recall AUC averaged " 462 ' <p><strong>Average Precision Samples:</strong> Precision-Recall AUC averaged '
463 "across individual samples. Ideal for multilabel tasks where samples have multiple " 463 'across individual samples. Ideal for multilabel tasks where samples have multiple '
464 "labels.</p>" 464 'labels.</p>'
465 " <h3>6) Classification: ROC-AUC Variants</h3>" 465 ' <h3>6) Classification: ROC-AUC Variants</h3>'
466 " <p><strong>ROC-AUC:</strong> Measures ability to distinguish between classes. " 466 ' <p><strong>ROC-AUC:</strong> Measures ability to distinguish between classes. '
467 "AUC = 1 is perfect; 0.5 is random guessing. Use for binary classification.</p>" 467 'AUC = 1 is perfect; 0.5 is random guessing. Use for binary classification.</p>'
468 " <p><strong>Macro ROC-AUC:</strong> Averages AUC across all classes equally. " 468 ' <p><strong>Macro ROC-AUC:</strong> Averages AUC across all classes equally. '
469 "Suitable for balanced multiclass problems.</p>" 469 'Suitable for balanced multiclass problems.</p>'
470 " <p><strong>Micro ROC-AUC:</strong> Computes AUC from aggregated predictions " 470 ' <p><strong>Micro ROC-AUC:</strong> Computes AUC from aggregated predictions '
471 "across all classes. Useful for imbalanced or multilabel settings.</p>" 471 'across all classes. Useful for imbalanced or multilabel settings.</p>'
472 " <h3>7) Classification: Confusion Matrix Stats (Per Class)</h3>" 472 ' <h3>7) Classification: Confusion Matrix Stats (Per Class)</h3>'
473 " <p><strong>True Positives / Negatives (TP / TN):</strong> Correct predictions " 473 ' <p><strong>True Positives / Negatives (TP / TN):</strong> Correct predictions '
474 "for positives and negatives, respectively.</p>" 474 'for positives and negatives, respectively.</p>'
475 " <p><strong>False Positives / Negatives (FP / FN):</strong> Incorrect predictions " 475 ' <p><strong>False Positives / Negatives (FP / FN):</strong> Incorrect predictions '
476 "— false alarms and missed detections.</p>" 476 '— false alarms and missed detections.</p>'
477 " <h3>8) Classification: Ranking Metrics</h3>" 477 ' <h3>8) Classification: Ranking Metrics</h3>'
478 " <p><strong>Hits at K:</strong> Measures whether the true label is among the " 478 ' <p><strong>Hits at K:</strong> Measures whether the true label is among the '
479 "top-K predictions. Common in recommendation systems and retrieval tasks.</p>" 479 'top-K predictions. Common in recommendation systems and retrieval tasks.</p>'
480 " <h3>9) Other Metrics (Classification)</h3>" 480 ' <h3>9) Other Metrics (Classification)</h3>'
481 " <p><strong>Cohen's Kappa:</strong> Measures agreement between predicted and " 481 ' <p><strong>Cohen\'s Kappa:</strong> Measures agreement between predicted and '
482 "actual labels, adjusted for chance. Useful for multiclass classification with " 482 'actual labels, adjusted for chance. Useful for multiclass classification with '
483 "imbalanced data.</p>" 483 'imbalanced data.</p>'
484 " <p><strong>Matthews Correlation Coefficient (MCC):</strong> Balanced measure " 484 ' <p><strong>Matthews Correlation Coefficient (MCC):</strong> Balanced measure '
485 "using TP, TN, FP, and FN. Effective for imbalanced datasets.</p>" 485 'using TP, TN, FP, and FN. Effective for imbalanced datasets.</p>'
486 " <h3>10) Metric Recommendations</h3>" 486 ' <h3>10) Metric Recommendations</h3>'
487 " <ul>" 487 ' <ul>'
488 " <li><strong>Regression:</strong> Use <strong>RMSE</strong> or " 488 ' <li><strong>Regression:</strong> Use <strong>RMSE</strong> or '
489 "<strong>MAE</strong> for general evaluation, <strong>MAPE</strong> for relative " 489 '<strong>MAE</strong> for general evaluation, <strong>MAPE</strong> for relative '
490 "errors, and <strong>R²</strong> to assess model fit. Use <strong>MSE</strong> or " 490 'errors, and <strong>R²</strong> to assess model fit. Use <strong>MSE</strong> or '
491 "<strong>RMSPE</strong> when large errors are critical.</li>" 491 '<strong>RMSPE</strong> when large errors are critical.</li>'
492 " <li><strong>Classification (Balanced Data):</strong> Use <strong>Accuracy</strong> " 492 ' <li><strong>Classification (Balanced Data):</strong> Use <strong>Accuracy</strong> '
493 "and <strong>F1</strong> for overall performance.</li>" 493 'and <strong>F1</strong> for overall performance.</li>'
494 " <li><strong>Classification (Imbalanced Data):</strong> Use <strong>Precision</strong>, " 494 ' <li><strong>Classification (Imbalanced Data):</strong> Use <strong>Precision</strong>, '
495 "<strong>Recall</strong>, and <strong>ROC-AUC</strong> to focus on minority class " 495 '<strong>Recall</strong>, and <strong>ROC-AUC</strong> to focus on minority class '
496 "performance.</li>" 496 'performance.</li>'
497 " <li><strong>Multilabel or Imbalanced Classification:</strong> Use " 497 ' <li><strong>Multilabel or Imbalanced Classification:</strong> Use '
498 "<strong>Micro Precision/Recall/F1</strong> or <strong>Micro ROC-AUC</strong>.</li>" 498 '<strong>Micro Precision/Recall/F1</strong> or <strong>Micro ROC-AUC</strong>.</li>'
499 " <li><strong>Balanced Multiclass:</strong> Use <strong>Macro Precision/Recall/F1</strong> " 499 ' <li><strong>Balanced Multiclass:</strong> Use <strong>Macro Precision/Recall/F1</strong> '
500 "or <strong>Macro ROC-AUC</strong>.</li>" 500 'or <strong>Macro ROC-AUC</strong>.</li>'
501 " <li><strong>Class Frequency Matters:</strong> Use <strong>Weighted Precision/Recall/F1</strong> " 501 ' <li><strong>Class Frequency Matters:</strong> Use <strong>Weighted Precision/Recall/F1</strong> '
502 "to account for class imbalance.</li>" 502 'to account for class imbalance.</li>'
503 " <li><strong>Recommendation/Ranking:</strong> Use <strong>Hits at K</strong> for retrieval tasks.</li>" 503 ' <li><strong>Recommendation/Ranking:</strong> Use <strong>Hits at K</strong> for retrieval tasks.</li>'
504 " <li><strong>Detailed Analysis:</strong> Use <strong>Confusion Matrix stats</strong> " 504 ' <li><strong>Detailed Analysis:</strong> Use <strong>Confusion Matrix stats</strong> '
505 "for class-wise performance in classification.</li>" 505 'for class-wise performance in classification.</li>'
506 " </ul>" 506 ' </ul>'
507 " </div>" 507 ' </div>'
508 " </div>" 508 ' </div>'
509 "</div>" 509 '</div>'
510 ) 510 )
511 511
512 modal_js = ( 512 modal_js = (
513 "<script>" 513 "<script>"
514 "document.addEventListener('DOMContentLoaded', function() {" 514 "document.addEventListener('DOMContentLoaded', function() {"