repository: bgruening / sklearn_numeric_clustering
diff keras_train_and_eval.py @ 40:06d772036a62 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 9981e25b00de29ed881b2229a173a8c812ded9bb
author   | bgruening
date     | Wed, 09 Aug 2023 13:11:48 +0000
parents  | 73e7f1c76ece
children | bb9fc9d46ea4
--- a/keras_train_and_eval.py	Thu Aug 11 08:51:18 2022 +0000
+++ b/keras_train_and_eval.py	Wed Aug 09 13:11:48 2023 +0000
@@ -1,34 +1,43 @@
 import argparse
 import json
 import os
-import pickle
 import warnings
 from itertools import chain
 
 import joblib
 import numpy as np
 import pandas as pd
-from galaxy_ml.externals.selene_sdk.utils import compute_score
-from galaxy_ml.keras_galaxy_models import _predict_generator
+from galaxy_ml.keras_galaxy_models import (
+    _predict_generator,
+    KerasGBatchClassifier,
+)
+from galaxy_ml.model_persist import dump_model_to_h5, load_model_from_h5
 from galaxy_ml.model_validations import train_test_split
-from galaxy_ml.utils import (clean_params, get_main_estimator,
-                             get_module, get_scoring, load_model, read_columns,
-                             SafeEval, try_get_attr)
+from galaxy_ml.utils import (
+    clean_params,
+    gen_compute_scores,
+    get_main_estimator,
+    get_module,
+    get_scoring,
+    read_columns,
+    SafeEval
+)
 from scipy.io import mmread
-from sklearn.metrics.scorer import _check_multimetric_scoring
-from sklearn.model_selection import _search, _validation
+from sklearn.metrics._scorer import _check_multimetric_scoring
 from sklearn.model_selection._validation import _score
-from sklearn.pipeline import Pipeline
-from sklearn.utils import indexable, safe_indexing
-
-_fit_and_score = try_get_attr("galaxy_ml.model_validations", "_fit_and_score")
-setattr(_search, "_fit_and_score", _fit_and_score)
-setattr(_validation, "_fit_and_score", _fit_and_score)
+from sklearn.utils import _safe_indexing, indexable
 
 N_JOBS = int(os.environ.get("GALAXY_SLOTS", 1))
 CACHE_DIR = os.path.join(os.getcwd(), "cached")
-del os
-NON_SEARCHABLE = ("n_jobs", "pre_dispatch", "memory", "_path", "nthread", "callbacks")
+NON_SEARCHABLE = (
+    "n_jobs",
+    "pre_dispatch",
+    "memory",
+    "_path",
+    "_dir",
+    "nthread",
+    "callbacks",
+)
 ALLOWED_CALLBACKS = (
     "EarlyStopping",
     "TerminateOnNaN",
@@ -96,7 +105,7 @@
         train = index_arr[~np.isin(groups, group_names)]
         rval = list(
             chain.from_iterable(
-                (safe_indexing(a, train), safe_indexing(a, test)) for a in new_arrays
+                (_safe_indexing(a, train), _safe_indexing(a, test)) for a in new_arrays
             )
         )
     else:
@@ -108,68 +117,69 @@
     return rval
 
 
-def _evaluate(y_true, pred_probas, scorer, is_multimetric=True):
-    """output scores based on input scorer
+def _evaluate_keras_and_sklearn_scores(
+    estimator,
+    data_generator,
+    X,
+    y=None,
+    sk_scoring=None,
+    steps=None,
+    batch_size=32,
+    return_predictions=False,
+):
+    """output scores for bother keras and sklearn metrics
 
     Parameters
-    ----------
-    y_true : array
-        True label or target values
-    pred_probas : array
-        Prediction values, probability for classification problem
-    scorer : dict
-        dict of `sklearn.metrics.scorer.SCORER`
-    is_multimetric : bool, default is True
+    -----------
+    estimator : object
+        Fitted `galaxy_ml.keras_galaxy_models.KerasGBatchClassifier`.
+    data_generator : object
+        From `galaxy_ml.preprocessors.ImageDataFrameBatchGenerator`.
+    X : 2-D array
+        Contains indecies of images that need to be evaluated.
+    y : None
+        Target value.
+    sk_scoring : dict
+        Galaxy tool input parameters.
+    steps : integer or None
+        Evaluation/prediction steps before stop.
+    batch_size : integer
+        Number of samples in a batch
+    return_predictions : bool, default is False
+        Whether to return predictions and true labels.
     """
-    if y_true.ndim == 1 or y_true.shape[-1] == 1:
-        pred_probas = pred_probas.ravel()
-        pred_labels = (pred_probas > 0.5).astype("int32")
-        targets = y_true.ravel().astype("int32")
-        if not is_multimetric:
-            preds = (
-                pred_labels
-                if scorer.__class__.__name__ == "_PredictScorer"
-                else pred_probas
-            )
-            score = scorer._score_func(targets, preds, **scorer._kwargs)
+    scores = {}
-            return score
-        else:
-            scores = {}
-            for name, one_scorer in scorer.items():
-                preds = (
-                    pred_labels
-                    if one_scorer.__class__.__name__ == "_PredictScorer"
-                    else pred_probas
-                )
-                score = one_scorer._score_func(targets, preds, **one_scorer._kwargs)
-                scores[name] = score
-
-    # TODO: multi-class metrics
-    # multi-label
+    generator = data_generator.flow(X, y=y, batch_size=batch_size)
+    # keras metrics evaluation
+    # handle scorer, convert to scorer dict
+    generator.reset()
+    score_results = estimator.model_.evaluate_generator(generator, steps=steps)
+    metrics_names = estimator.model_.metrics_names
+    if not isinstance(metrics_names, list):
+        scores[metrics_names] = score_results
     else:
-        pred_labels = (pred_probas > 0.5).astype("int32")
-        targets = y_true.astype("int32")
-        if not is_multimetric:
-            preds = (
-                pred_labels
-                if scorer.__class__.__name__ == "_PredictScorer"
-                else pred_probas
-            )
-            score, _ = compute_score(preds, targets, scorer._score_func)
-            return score
-        else:
-            scores = {}
-            for name, one_scorer in scorer.items():
-                preds = (
-                    pred_labels
-                    if one_scorer.__class__.__name__ == "_PredictScorer"
-                    else pred_probas
-                )
-                score, _ = compute_score(preds, targets, one_scorer._score_func)
-                scores[name] = score
+        scores = dict(zip(metrics_names, score_results))
+
+    if sk_scoring["primary_scoring"] == "default" and not return_predictions:
+        return scores
+
+    generator.reset()
+    predictions, y_true = _predict_generator(estimator.model_, generator, steps=steps)
 
-    return scores
+    # for sklearn metrics
+    if sk_scoring["primary_scoring"] != "default":
+        scorer = get_scoring(sk_scoring)
+        if not isinstance(scorer, (dict, list)):
+            scorer = [sk_scoring["primary_scoring"]]
+        scorer = _check_multimetric_scoring(estimator, scoring=scorer)
+        sk_scores = gen_compute_scores(y_true, predictions, scorer)
+        scores.update(sk_scores)
+
+    if return_predictions:
+        return scores, predictions, y_true
+    else:
+        return scores, None, None
 
 
 def main(
@@ -179,7 +189,6 @@
     infile2,
     outfile_result,
     outfile_object=None,
-    outfile_weights=None,
    outfile_y_true=None,
     outfile_y_preds=None,
     groups=None,
@@ -192,46 +201,43 @@
     Parameter
     ---------
     inputs : str
-        File path to galaxy tool parameter
+        File path to galaxy tool parameter.
 
     infile_estimator : str
-        File path to estimator
+        File path to estimator.
 
     infile1 : str
-        File path to dataset containing features
+        File path to dataset containing features.
 
     infile2 : str
-        File path to dataset containing target values
+        File path to dataset containing target values.
 
     outfile_result : str
-        File path to save the results, either cv_results or test result
+        File path to save the results, either cv_results or test result.
 
     outfile_object : str, optional
-        File path to save searchCV object
-
-    outfile_weights : str, optional
-        File path to save deep learning model weights
+        File path to save searchCV object.
 
     outfile_y_true : str, optional
-        File path to target values for prediction
+        File path to target values for prediction.
 
     outfile_y_preds : str, optional
-        File path to save deep learning model weights
+        File path to save predictions.
 
     groups : str
-        File path to dataset containing groups labels
+        File path to dataset containing groups labels.
 
     ref_seq : str
-        File path to dataset containing genome sequence file
+        File path to dataset containing genome sequence file.
 
     intervals : str
-        File path to dataset containing interval file
+        File path to dataset containing interval file.
 
     targets : str
-        File path to dataset compressed target bed file
+        File path to dataset compressed target bed file.
 
     fasta_path : str
-        File path to dataset containing fasta file
+        File path to dataset containing fasta file.
     """
     warnings.simplefilter("ignore")
 
@@ -239,8 +245,7 @@
         params = json.load(param_handler)
 
     # load estimator
-    with open(infile_estimator, "rb") as estimator_handler:
-        estimator = load_model(estimator_handler)
+    estimator = load_model_from_h5(infile_estimator)
 
     estimator = clean_params(estimator)
 
@@ -333,7 +338,12 @@
         loaded_df[df_key] = infile2
 
     y = read_columns(
-        infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True
+        infile2,
+        c=c,
+        c_option=column_option,
+        sep="\t",
+        header=header,
+        parse_dates=True,
     )
     if len(y.shape) == 2 and y.shape[1] == 1:
         y = y.ravel()
@@ -387,16 +397,10 @@
 
     # handle scorer, convert to scorer dict
     scoring = params["experiment_schemes"]["metrics"]["scoring"]
-    if scoring is not None:
-        # get_scoring() expects secondary_scoring to be a comma separated string (not a list)
-        # Check if secondary_scoring is specified
-        secondary_scoring = scoring.get("secondary_scoring", None)
-        if secondary_scoring is not None:
-            # If secondary_scoring is specified, convert the list into comman separated string
-            scoring["secondary_scoring"] = ",".join(scoring["secondary_scoring"])
-
     scorer = get_scoring(scoring)
-    scorer, _ = _check_multimetric_scoring(estimator, scoring=scorer)
+    if not isinstance(scorer, (dict, list)):
+        scorer = [scoring["primary_scoring"]]
+    scorer = _check_multimetric_scoring(estimator, scoring=scorer)
 
     # handle test (first) split
     test_split_options = params["experiment_schemes"]["test_split"]["split_algos"]
@@ -411,14 +415,9 @@
                 "Stratified shuffle split is not "
                 "applicable on empty target values!"
             )
-    (
-        X_train,
-        X_test,
-        y_train,
-        y_test,
-        groups_train,
-        _groups_test,
-    ) = train_test_split_none(X, y, groups, **test_split_options)
+    X_train, X_test, y_train, y_test, groups_train, groups_test = train_test_split_none(
+        X, y, groups, **test_split_options
+    )
 
     exp_scheme = params["experiment_schemes"]["selected_exp_scheme"]
 
@@ -443,11 +442,11 @@
             y_train,
             y_val,
             groups_train,
-            _groups_val,
+            groups_val,
         ) = train_test_split_none(X_train, y_train, groups_train, **val_split_options)
 
     # train and eval
-    if hasattr(estimator, "validation_data"):
+    if hasattr(estimator, "config") and hasattr(estimator, "model_type"):
         if exp_scheme == "train_val_test":
             estimator.fit(X_train, y_train, validation_data=(X_val, y_val))
         else:
@@ -455,25 +454,46 @@
     else:
         estimator.fit(X_train, y_train)
 
-    if hasattr(estimator, "evaluate"):
+    if isinstance(estimator, KerasGBatchClassifier):
+        scores = {}
         steps = estimator.prediction_steps
         batch_size = estimator.batch_size
-        generator = estimator.data_generator_.flow(
-            X_test, y=y_test, batch_size=batch_size
+        data_generator = estimator.data_generator_
+
+        scores, predictions, y_true = _evaluate_keras_and_sklearn_scores(
+            estimator,
+            data_generator,
+            X_test,
+            y=y_test,
+            sk_scoring=scoring,
+            steps=steps,
+            batch_size=batch_size,
+            return_predictions=bool(outfile_y_true),
         )
-        predictions, y_true = _predict_generator(
-            estimator.model_, generator, steps=steps
-        )
-        scores = _evaluate(y_true, predictions, scorer, is_multimetric=True)
 
     else:
+        scores = {}
+        if hasattr(estimator, "model_") and hasattr(estimator.model_, "metrics_names"):
+            batch_size = estimator.batch_size
+            score_results = estimator.model_.evaluate(
+                X_test, y=y_test, batch_size=batch_size, verbose=0
+            )
+            metrics_names = estimator.model_.metrics_names
+            if not isinstance(metrics_names, list):
+                scores[metrics_names] = score_results
+            else:
+                scores = dict(zip(metrics_names, score_results))
+
         if hasattr(estimator, "predict_proba"):
             predictions = estimator.predict_proba(X_test)
         else:
             predictions = estimator.predict(X_test)
 
         y_true = y_test
-        scores = _score(estimator, X_test, y_test, scorer, is_multimetric=True)
+        sk_scores = _score(estimator, X_test, y_test, scorer)
+        scores.update(sk_scores)
+
+    # handle output
 
     if outfile_y_true:
         try:
             pd.DataFrame(y_true).to_csv(outfile_y_true, sep="\t", index=False)
@@ -486,7 +506,6 @@
             )
         except Exception as e:
             print("Error in saving predictions: %s" % e)
-    # handle output
     for name, score in scores.items():
         scores[name] = [score]
 
@@ -497,23 +516,7 @@
         memory.clear(warn=False)
 
     if outfile_object:
-        main_est = estimator
-        if isinstance(estimator, Pipeline):
-            main_est = estimator.steps[-1][-1]
-
-        if hasattr(main_est, "model_") and hasattr(main_est, "save_weights"):
-            if outfile_weights:
-                main_est.save_weights(outfile_weights)
-            del main_est.model_
-            del main_est.fit_params
-            del main_est.model_class_
-            if getattr(main_est, "validation_data", None):
-                del main_est.validation_data
-            if getattr(main_est, "data_generator_", None):
-                del main_est.data_generator_
-
-        with open(outfile_object, "wb") as output_handler:
-            pickle.dump(estimator, output_handler, pickle.HIGHEST_PROTOCOL)
+        dump_model_to_h5(estimator, outfile_object)
 
 
 if __name__ == "__main__":
@@ -524,7 +527,6 @@
     aparser.add_argument("-y", "--infile2", dest="infile2")
     aparser.add_argument("-O", "--outfile_result", dest="outfile_result")
    aparser.add_argument("-o", "--outfile_object", dest="outfile_object")
-    aparser.add_argument("-w", "--outfile_weights", dest="outfile_weights")
     aparser.add_argument("-l", "--outfile_y_true", dest="outfile_y_true")
     aparser.add_argument("-p", "--outfile_y_preds", dest="outfile_y_preds")
     aparser.add_argument("-g", "--groups", dest="groups")
@@ -541,7 +543,6 @@
         args.infile2,
         args.outfile_result,
         outfile_object=args.outfile_object,
-        outfile_weights=args.outfile_weights,
        outfile_y_true=args.outfile_y_true,
        outfile_y_preds=args.outfile_y_preds,
        groups=args.groups,