Mercurial repository goeckslab/image_learner
diff utils.py @ 9:9e912fce264c (draft, default, tip)
planemo upload for repository https://github.com/goeckslab/gleam.git commit eace0d7c2b2939029c052991d238a54947d2e191
author:   goeckslab
date:     Wed, 27 Aug 2025 21:02:48 +0000
parents:  85e6f4b2ad18
children: (none)
--- a/utils.py	Thu Aug 14 14:53:10 2025 +0000
+++ b/utils.py	Wed Aug 27 21:02:48 2025 +0000
@@ -8,8 +8,6 @@
 <head>
     <meta charset="UTF-8">
     <title>Galaxy-Ludwig Report</title>
-
-    <!-- your existing styles -->
     <style>
         body {
             font-family: Arial, sans-serif;
@@ -328,121 +326,121 @@
     '<div id="metricsHelpModal" class="modal">'
     ' <div class="modal-content">'
     ' <span class="close">×</span>'
-    ' <h2>Model Evaluation Metrics — Help Guide</h2>'
+    " <h2>Model Evaluation Metrics — Help Guide</h2>"
     ' <div class="metrics-guide">'
-    ' <h3>1) General Metrics (Regression and Classification)</h3>'
-    ' <p><strong>Loss (Regression & Classification):</strong> '
-    'Measures the difference between predicted and actual values, '
-    'optimized during training. Lower is better. '
-    'For regression, this is often Mean Squared Error (MSE) or '
-    'Mean Absolute Error (MAE). For classification, it’s typically '
-    'cross-entropy or log loss.</p>'
-    ' <h3>2) Regression Metrics</h3>'
-    ' <p><strong>Mean Absolute Error (MAE):</strong> '
-    'Average of absolute differences between predicted and actual values, '
-    'in the same units as the target. Use for interpretable error measurement '
-    'when all errors are equally important. Less sensitive to outliers than MSE.</p>'
-    ' <p><strong>Mean Squared Error (MSE):</strong> '
-    'Average of squared differences between predicted and actual values. '
-    'Penalizes larger errors more heavily, useful when large deviations are critical. '
-    'Often used as the loss function in regression.</p>'
-    ' <p><strong>Root Mean Squared Error (RMSE):</strong> '
-    'Square root of MSE, in the same units as the target. '
-    'Balances interpretability and sensitivity to large errors. '
-    'Widely used for regression evaluation.</p>'
-    ' <p><strong>Mean Absolute Percentage Error (MAPE):</strong> '
-    'Average absolute error as a percentage of actual values. '
-    'Scale-independent, ideal for comparing relative errors across datasets. '
-    'Avoid when actual values are near zero.</p>'
-    ' <p><strong>Root Mean Squared Percentage Error (RMSPE):</strong> '
-    'Square root of mean squared percentage error. Scale-independent, '
-    'penalizes larger relative errors more than MAPE. Use for forecasting '
-    'or when relative accuracy matters.</p>'
-    ' <p><strong>R² Score:</strong> Proportion of variance in the target '
-    'explained by the model. Ranges from negative infinity to 1 (perfect prediction). '
-    'Use to assess model fit; negative values indicate poor performance '
-    'compared to predicting the mean.</p>'
-    ' <h3>3) Classification Metrics</h3>'
-    ' <p><strong>Accuracy:</strong> Proportion of correct predictions '
-    'among all predictions. Simple but misleading for imbalanced datasets, '
-    'where high accuracy may hide poor performance on minority classes.</p>'
-    ' <p><strong>Micro Accuracy:</strong> Sums true positives and true negatives '
-    'across all classes before computing accuracy. Suitable for multiclass or '
-    'multilabel problems with imbalanced data.</p>'
-    ' <p><strong>Token Accuracy:</strong> Measures how often predicted tokens '
-    '(e.g., in sequences) match true tokens. Common in NLP tasks like text generation '
-    'or token classification.</p>'
-    ' <p><strong>Precision:</strong> Proportion of positive predictions that are '
-    'correct (TP / (TP + FP)). Use when false positives are costly, e.g., spam detection.</p>'
-    ' <p><strong>Recall (Sensitivity):</strong> Proportion of actual positives '
-    'correctly predicted (TP / (TP + FN)). Use when missing positives is risky, '
-    'e.g., disease detection.</p>'
-    ' <p><strong>Specificity:</strong> True negative rate (TN / (TN + FP)). '
-    'Measures ability to identify negatives. Useful in medical testing to avoid '
-    'false alarms.</p>'
-    ' <h3>4) Classification: Macro, Micro, and Weighted Averages</h3>'
-    ' <p><strong>Macro Precision / Recall / F1:</strong> Averages the metric '
-    'across all classes, treating each equally. Best for balanced datasets where '
-    'all classes are equally important.</p>'
-    ' <p><strong>Micro Precision / Recall / F1:</strong> Aggregates true positives, '
-    'false positives, and false negatives across all classes before computing. '
-    'Ideal for imbalanced or multilabel classification.</p>'
-    ' <p><strong>Weighted Precision / Recall / F1:</strong> Averages metrics '
-    'across classes, weighted by the number of true instances per class. Balances '
-    'class importance based on frequency.</p>'
-    ' <h3>5) Classification: Average Precision (PR-AUC Variants)</h3>'
-    ' <p><strong>Average Precision Macro:</strong> Precision-Recall AUC averaged '
-    'equally across classes. Use for balanced multiclass problems.</p>'
-    ' <p><strong>Average Precision Micro:</strong> Global Precision-Recall AUC '
-    'using all instances. Best for imbalanced or multilabel classification.</p>'
-    ' <p><strong>Average Precision Samples:</strong> Precision-Recall AUC averaged '
-    'across individual samples. Ideal for multilabel tasks where samples have multiple '
-    'labels.</p>'
-    ' <h3>6) Classification: ROC-AUC Variants</h3>'
-    ' <p><strong>ROC-AUC:</strong> Measures ability to distinguish between classes. '
-    'AUC = 1 is perfect; 0.5 is random guessing. Use for binary classification.</p>'
-    ' <p><strong>Macro ROC-AUC:</strong> Averages AUC across all classes equally. '
-    'Suitable for balanced multiclass problems.</p>'
-    ' <p><strong>Micro ROC-AUC:</strong> Computes AUC from aggregated predictions '
-    'across all classes. Useful for imbalanced or multilabel settings.</p>'
-    ' <h3>7) Classification: Confusion Matrix Stats (Per Class)</h3>'
-    ' <p><strong>True Positives / Negatives (TP / TN):</strong> Correct predictions '
-    'for positives and negatives, respectively.</p>'
-    ' <p><strong>False Positives / Negatives (FP / FN):</strong> Incorrect predictions '
-    '— false alarms and missed detections.</p>'
-    ' <h3>8) Classification: Ranking Metrics</h3>'
-    ' <p><strong>Hits at K:</strong> Measures whether the true label is among the '
-    'top-K predictions. Common in recommendation systems and retrieval tasks.</p>'
-    ' <h3>9) Other Metrics (Classification)</h3>'
-    ' <p><strong>Cohen\'s Kappa:</strong> Measures agreement between predicted and '
-    'actual labels, adjusted for chance. Useful for multiclass classification with '
-    'imbalanced data.</p>'
-    ' <p><strong>Matthews Correlation Coefficient (MCC):</strong> Balanced measure '
-    'using TP, TN, FP, and FN. Effective for imbalanced datasets.</p>'
-    ' <h3>10) Metric Recommendations</h3>'
-    ' <ul>'
-    ' <li><strong>Regression:</strong> Use <strong>RMSE</strong> or '
-    '<strong>MAE</strong> for general evaluation, <strong>MAPE</strong> for relative '
-    'errors, and <strong>R²</strong> to assess model fit. Use <strong>MSE</strong> or '
-    '<strong>RMSPE</strong> when large errors are critical.</li>'
-    ' <li><strong>Classification (Balanced Data):</strong> Use <strong>Accuracy</strong> '
-    'and <strong>F1</strong> for overall performance.</li>'
-    ' <li><strong>Classification (Imbalanced Data):</strong> Use <strong>Precision</strong>, '
-    '<strong>Recall</strong>, and <strong>ROC-AUC</strong> to focus on minority class '
-    'performance.</li>'
-    ' <li><strong>Multilabel or Imbalanced Classification:</strong> Use '
-    '<strong>Micro Precision/Recall/F1</strong> or <strong>Micro ROC-AUC</strong>.</li>'
-    ' <li><strong>Balanced Multiclass:</strong> Use <strong>Macro Precision/Recall/F1</strong> '
-    'or <strong>Macro ROC-AUC</strong>.</li>'
-    ' <li><strong>Class Frequency Matters:</strong> Use <strong>Weighted Precision/Recall/F1</strong> '
-    'to account for class imbalance.</li>'
-    ' <li><strong>Recommendation/Ranking:</strong> Use <strong>Hits at K</strong> for retrieval tasks.</li>'
-    ' <li><strong>Detailed Analysis:</strong> Use <strong>Confusion Matrix stats</strong> '
-    'for class-wise performance in classification.</li>'
-    ' </ul>'
-    ' </div>'
-    ' </div>'
-    '</div>'
+    " <h3>1) General Metrics (Regression and Classification)</h3>"
+    " <p><strong>Loss (Regression & Classification):</strong> "
+    "Measures the difference between predicted and actual values, "
+    "optimized during training. Lower is better. "
+    "For regression, this is often Mean Squared Error (MSE) or "
+    "Mean Absolute Error (MAE). For classification, it’s typically "
+    "cross-entropy or log loss.</p>"
+    " <h3>2) Regression Metrics</h3>"
+    " <p><strong>Mean Absolute Error (MAE):</strong> "
+    "Average of absolute differences between predicted and actual values, "
+    "in the same units as the target. Use for interpretable error measurement "
+    "when all errors are equally important. Less sensitive to outliers than MSE.</p>"
+    " <p><strong>Mean Squared Error (MSE):</strong> "
+    "Average of squared differences between predicted and actual values. "
+    "Penalizes larger errors more heavily, useful when large deviations are critical. "
+    "Often used as the loss function in regression.</p>"
+    " <p><strong>Root Mean Squared Error (RMSE):</strong> "
+    "Square root of MSE, in the same units as the target. "
+    "Balances interpretability and sensitivity to large errors. "
+    "Widely used for regression evaluation.</p>"
+    " <p><strong>Mean Absolute Percentage Error (MAPE):</strong> "
+    "Average absolute error as a percentage of actual values. "
+    "Scale-independent, ideal for comparing relative errors across datasets. "
+    "Avoid when actual values are near zero.</p>"
+    " <p><strong>Root Mean Squared Percentage Error (RMSPE):</strong> "
+    "Square root of mean squared percentage error. Scale-independent, "
+    "penalizes larger relative errors more than MAPE. Use for forecasting "
+    "or when relative accuracy matters.</p>"
+    " <p><strong>R² Score:</strong> Proportion of variance in the target "
+    "explained by the model. Ranges from negative infinity to 1 (perfect prediction). "
+    "Use to assess model fit; negative values indicate poor performance "
+    "compared to predicting the mean.</p>"
+    " <h3>3) Classification Metrics</h3>"
+    " <p><strong>Accuracy:</strong> Proportion of correct predictions "
+    "among all predictions. Simple but misleading for imbalanced datasets, "
+    "where high accuracy may hide poor performance on minority classes.</p>"
+    " <p><strong>Micro Accuracy:</strong> Sums true positives and true negatives "
+    "across all classes before computing accuracy. Suitable for multiclass or "
+    "multilabel problems with imbalanced data.</p>"
+    " <p><strong>Token Accuracy:</strong> Measures how often predicted tokens "
+    "(e.g., in sequences) match true tokens. Common in NLP tasks like text generation "
+    "or token classification.</p>"
+    " <p><strong>Precision:</strong> Proportion of positive predictions that are "
+    "correct (TP / (TP + FP)). Use when false positives are costly, e.g., spam detection.</p>"
+    " <p><strong>Recall (Sensitivity):</strong> Proportion of actual positives "
+    "correctly predicted (TP / (TP + FN)). Use when missing positives is risky, "
+    "e.g., disease detection.</p>"
+    " <p><strong>Specificity:</strong> True negative rate (TN / (TN + FP)). "
+    "Measures ability to identify negatives. Useful in medical testing to avoid "
+    "false alarms.</p>"
+    " <h3>4) Classification: Macro, Micro, and Weighted Averages</h3>"
+    " <p><strong>Macro Precision / Recall / F1:</strong> Averages the metric "
+    "across all classes, treating each equally. Best for balanced datasets where "
+    "all classes are equally important.</p>"
+    " <p><strong>Micro Precision / Recall / F1:</strong> Aggregates true positives, "
+    "false positives, and false negatives across all classes before computing. "
+    "Ideal for imbalanced or multilabel classification.</p>"
+    " <p><strong>Weighted Precision / Recall / F1:</strong> Averages metrics "
+    "across classes, weighted by the number of true instances per class. Balances "
+    "class importance based on frequency.</p>"
+    " <h3>5) Classification: Average Precision (PR-AUC Variants)</h3>"
+    " <p><strong>Average Precision Macro:</strong> Precision-Recall AUC averaged "
+    "equally across classes. Use for balanced multiclass problems.</p>"
+    " <p><strong>Average Precision Micro:</strong> Global Precision-Recall AUC "
+    "using all instances. Best for imbalanced or multilabel classification.</p>"
+    " <p><strong>Average Precision Samples:</strong> Precision-Recall AUC averaged "
+    "across individual samples. Ideal for multilabel tasks where samples have multiple "
+    "labels.</p>"
+    " <h3>6) Classification: ROC-AUC Variants</h3>"
+    " <p><strong>ROC-AUC:</strong> Measures ability to distinguish between classes. "
+    "AUC = 1 is perfect; 0.5 is random guessing. Use for binary classification.</p>"
+    " <p><strong>Macro ROC-AUC:</strong> Averages AUC across all classes equally. "
+    "Suitable for balanced multiclass problems.</p>"
+    " <p><strong>Micro ROC-AUC:</strong> Computes AUC from aggregated predictions "
+    "across all classes. Useful for imbalanced or multilabel settings.</p>"
+    " <h3>7) Classification: Confusion Matrix Stats (Per Class)</h3>"
+    " <p><strong>True Positives / Negatives (TP / TN):</strong> Correct predictions "
+    "for positives and negatives, respectively.</p>"
+    " <p><strong>False Positives / Negatives (FP / FN):</strong> Incorrect predictions "
+    "— false alarms and missed detections.</p>"
+    " <h3>8) Classification: Ranking Metrics</h3>"
+    " <p><strong>Hits at K:</strong> Measures whether the true label is among the "
+    "top-K predictions. Common in recommendation systems and retrieval tasks.</p>"
+    " <h3>9) Other Metrics (Classification)</h3>"
+    " <p><strong>Cohen's Kappa:</strong> Measures agreement between predicted and "
+    "actual labels, adjusted for chance. Useful for multiclass classification with "
+    "imbalanced data.</p>"
+    " <p><strong>Matthews Correlation Coefficient (MCC):</strong> Balanced measure "
+    "using TP, TN, FP, and FN. Effective for imbalanced datasets.</p>"
+    " <h3>10) Metric Recommendations</h3>"
+    " <ul>"
+    " <li><strong>Regression:</strong> Use <strong>RMSE</strong> or "
+    "<strong>MAE</strong> for general evaluation, <strong>MAPE</strong> for relative "
+    "errors, and <strong>R²</strong> to assess model fit. Use <strong>MSE</strong> or "
+    "<strong>RMSPE</strong> when large errors are critical.</li>"
+    " <li><strong>Classification (Balanced Data):</strong> Use <strong>Accuracy</strong> "
+    "and <strong>F1</strong> for overall performance.</li>"
+    " <li><strong>Classification (Imbalanced Data):</strong> Use <strong>Precision</strong>, "
+    "<strong>Recall</strong>, and <strong>ROC-AUC</strong> to focus on minority class "
+    "performance.</li>"
+    " <li><strong>Multilabel or Imbalanced Classification:</strong> Use "
+    "<strong>Micro Precision/Recall/F1</strong> or <strong>Micro ROC-AUC</strong>.</li>"
+    " <li><strong>Balanced Multiclass:</strong> Use <strong>Macro Precision/Recall/F1</strong> "
+    "or <strong>Macro ROC-AUC</strong>.</li>"
+    " <li><strong>Class Frequency Matters:</strong> Use <strong>Weighted Precision/Recall/F1</strong> "
+    "to account for class imbalance.</li>"
+    " <li><strong>Recommendation/Ranking:</strong> Use <strong>Hits at K</strong> for retrieval tasks.</li>"
+    " <li><strong>Detailed Analysis:</strong> Use <strong>Confusion Matrix stats</strong> "
+    "for class-wise performance in classification.</li>"
+    " </ul>"
+    " </div>"
+    " </div>"
+    "</div>"
 )
 modal_css = (
     "<style>"
@@ -497,17 +495,17 @@
     ' var span = document.getElementsByClassName("close")[0];'
     " if (openBtn && modal) {"
     " openBtn.onclick = function() {"
-    " modal.style.display = \"block\";"
+    ' modal.style.display = "block";'
     " };"
     " }"
     " if (span && modal) {"
     " span.onclick = function() {"
-    " modal.style.display = \"none\";"
+    ' modal.style.display = "none";'
     " };"
     " }"
     " window.onclick = function(event) {"
     " if (event.target == modal) {"
-    " modal.style.display = \"none\";"
+    ' modal.style.display = "none";'
     " }"
     " }"
    "});"
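
Note on the help-guide text itself (which this changeset only re-quotes): the regression definitions map directly onto a few lines of NumPy. The sketch below is illustrative only and is not part of utils.py; the sample arrays and variable names are invented for the example.

import numpy as np

y_true = np.array([3.0, 5.0, 2.5, 7.0])
y_pred = np.array([2.5, 5.0, 3.0, 8.0])

mae = np.mean(np.abs(y_true - y_pred))        # same units as the target
mse = np.mean((y_true - y_pred) ** 2)         # squaring penalizes large errors
rmse = np.sqrt(mse)                           # back in target units
mape = 100 * np.mean(np.abs((y_true - y_pred) / y_true))           # unstable near y_true == 0
rmspe = 100 * np.sqrt(np.mean(((y_true - y_pred) / y_true) ** 2))  # penalizes large relative errors
r2 = 1 - np.sum((y_true - y_pred) ** 2) / np.sum((y_true - y_true.mean()) ** 2)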
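The classification entries (accuracy, precision, recall, Cohen's Kappa, MCC, ROC-AUC) likewise correspond to standard scikit-learn calls. Again a sketch with made-up labels and scores, not code from this repository.

from sklearn.metrics import (
    accuracy_score,
    cohen_kappa_score,
    matthews_corrcoef,
    precision_score,
    recall_score,
    roc_auc_score,
)

y_true = [0, 0, 1, 1, 1, 0]
y_pred = [0, 1, 1, 1, 0, 0]
y_score = [0.1, 0.6, 0.8, 0.9, 0.4, 0.2]  # predicted probability of class 1

accuracy_score(y_true, y_pred)     # fraction of correct predictions
precision_score(y_true, y_pred)    # TP / (TP + FP)
recall_score(y_true, y_pred)       # TP / (TP + FN)
cohen_kappa_score(y_true, y_pred)  # agreement corrected for chance
matthews_corrcoef(y_true, y_pred)  # balanced use of TP, TN, FP, FN
roc_auc_score(y_true, y_score)     # 1.0 = perfect, 0.5 = random guessing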
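The macro/micro/weighted distinction in section 4 of the guide is just the average= parameter in scikit-learn; a small multiclass example (invented data) makes the difference concrete.

from sklearn.metrics import f1_score

y_true = [0, 0, 0, 0, 1, 2]
y_pred = [0, 0, 0, 1, 1, 2]

f1_score(y_true, y_pred, average="macro")     # each class counts equally
f1_score(y_true, y_pred, average="micro")     # pool TP/FP/FN across classes
f1_score(y_true, y_pred, average="weighted")  # classes weighted by support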