sklearn_data_preprocess: keras_train_and

comparison keras_train_and_eval.py @ 41:a16f33c6ca64 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 9981e25b00de29ed881b2229a173a8c812ded9bb

author	bgruening
date	Wed, 09 Aug 2023 13:29:02 +0000
parents	1bef885255e0
children	6c030fe29722

comparison

equal deleted inserted replaced

-:80074b842ebd
+:a16f33c6ca64
 import argparse
 import json
 import os
-import pickle
 import warnings
 from itertools import chain
 import joblib
 import numpy as np
 import pandas as pd
-from galaxy_ml.externals.selene_sdk.utils import compute_score
+from galaxy_ml.keras_galaxy_models import (
-from galaxy_ml.keras_galaxy_models import _predict_generator
+_predict_generator,
+KerasGBatchClassifier,
+)
+from galaxy_ml.model_persist import dump_model_to_h5, load_model_from_h5
 from galaxy_ml.model_validations import train_test_split
-from galaxy_ml.utils import (clean_params, get_main_estimator,
+from galaxy_ml.utils import (
-get_module, get_scoring, load_model, read_columns,
+clean_params,
-SafeEval, try_get_attr)
+gen_compute_scores,
+get_main_estimator,
+get_module,
+get_scoring,
+read_columns,
+SafeEval
+)
 from scipy.io import mmread
-from sklearn.metrics.scorer import _check_multimetric_scoring
+from sklearn.metrics._scorer import _check_multimetric_scoring
-from sklearn.model_selection import _search, _validation
 from sklearn.model_selection._validation import _score
-from sklearn.pipeline import Pipeline
+from sklearn.utils import _safe_indexing, indexable
-from sklearn.utils import indexable, safe_indexing
-_fit_and_score = try_get_attr("galaxy_ml.model_validations", "_fit_and_score")
-setattr(_search, "_fit_and_score", _fit_and_score)
-setattr(_validation, "_fit_and_score", _fit_and_score)
 N_JOBS = int(os.environ.get("GALAXY_SLOTS", 1))
 CACHE_DIR = os.path.join(os.getcwd(), "cached")
-del os
+NON_SEARCHABLE = (
-NON_SEARCHABLE = ("n_jobs", "pre_dispatch", "memory", "_path", "nthread", "callbacks")
+"n_jobs",
+"pre_dispatch",
+"memory",
+"_path",
+"_dir",
+"nthread",
+"callbacks",
+)
 ALLOWED_CALLBACKS = (
 "EarlyStopping",
 "TerminateOnNaN",
 "ReduceLROnPlateau",
 "CSVLogger",
 index_arr = np.arange(n_samples)
 test = index_arr[np.isin(groups, group_names)]
 train = index_arr[~np.isin(groups, group_names)]
 rval = list(
 chain.from_iterable(
-(safe_indexing(a, train), safe_indexing(a, test)) for a in new_arrays
+(_safe_indexing(a, train), _safe_indexing(a, test)) for a in new_arrays
 )
 )
 else:
 rval = train_test_split(*new_arrays, **kwargs)
 rval[pos * 2: 2] = [None, None]
 return rval
-def _evaluate(y_true, pred_probas, scorer, is_multimetric=True):
+def _evaluate_keras_and_sklearn_scores(
-"""output scores based on input scorer
+estimator,
+data_generator,
+X,
+y=None,
+sk_scoring=None,
+steps=None,
+batch_size=32,
+return_predictions=False,
+):
+"""output scores for both keras and sklearn metrics
 Parameters
-----------
+-----------
-y_true : array
+estimator : object
-True label or target values
+Fitted `galaxy_ml.keras_galaxy_models.KerasGBatchClassifier`.
-pred_probas : array
+data_generator : object
-Prediction values, probability for classification problem
+From `galaxy_ml.preprocessors.ImageDataFrameBatchGenerator`.
-scorer : dict
+X : 2-D array
-dict of `sklearn.metrics.scorer.SCORER`
+Contains indices of images that need to be evaluated.
-is_multimetric : bool, default is True
+y : None
+Target value.
+sk_scoring : dict
+Galaxy tool input parameters.
+steps : integer or None
+Evaluation/prediction steps before stop.
+batch_size : integer
+Number of samples in a batch
+return_predictions : bool, default is False
+Whether to return predictions and true labels.
 """
-if y_true.ndim == 1 or y_true.shape[-1] == 1:
+scores = {}
-pred_probas = pred_probas.ravel()
-pred_labels = (pred_probas > 0.5).astype("int32")
+generator = data_generator.flow(X, y=y, batch_size=batch_size)
-targets = y_true.ravel().astype("int32")
+# keras metrics evaluation
-if not is_multimetric:
+# handle scorer, convert to scorer dict
-preds = (
+generator.reset()
-pred_labels
+score_results = estimator.model_.evaluate_generator(generator, steps=steps)
-if scorer.__class__.__name__ == "_PredictScorer"
+metrics_names = estimator.model_.metrics_names
-else pred_probas
+if not isinstance(metrics_names, list):
-)
+scores[metrics_names] = score_results
-score = scorer._score_func(targets, preds, **scorer._kwargs)
+else:
+scores = dict(zip(metrics_names, score_results))
-return score
-else:
+if sk_scoring["primary_scoring"] == "default" and not return_predictions:
-scores = {}
+return scores
-for name, one_scorer in scorer.items():
-preds = (
+generator.reset()
-pred_labels
+predictions, y_true = _predict_generator(estimator.model_, generator, steps=steps)
-if one_scorer.__class__.__name__ == "_PredictScorer"
-else pred_probas
+# for sklearn metrics
-)
+if sk_scoring["primary_scoring"] != "default":
-score = one_scorer._score_func(targets, preds, **one_scorer._kwargs)
+scorer = get_scoring(sk_scoring)
-scores[name] = score
+if not isinstance(scorer, (dict, list)):
+scorer = [sk_scoring["primary_scoring"]]
-# TODO: multi-class metrics
+scorer = _check_multimetric_scoring(estimator, scoring=scorer)
-# multi-label
+sk_scores = gen_compute_scores(y_true, predictions, scorer)
-else:
+scores.update(sk_scores)
-pred_labels = (pred_probas > 0.5).astype("int32")
-targets = y_true.astype("int32")
+if return_predictions:
-if not is_multimetric:
+return scores, predictions, y_true
-preds = (
+else:
-pred_labels
+return scores, None, None
-if scorer.__class__.__name__ == "_PredictScorer"
-else pred_probas
-)
-score, _ = compute_score(preds, targets, scorer._score_func)
-return score
-else:
-scores = {}
-for name, one_scorer in scorer.items():
-preds = (
-pred_labels
-if one_scorer.__class__.__name__ == "_PredictScorer"
-else pred_probas
-)
-score, _ = compute_score(preds, targets, one_scorer._score_func)
-scores[name] = score
-return scores
 def main(
 inputs,
 infile_estimator,
 infile1,
 infile2,
 outfile_result,
 outfile_object=None,
-outfile_weights=None,
 outfile_y_true=None,
 outfile_y_preds=None,
 groups=None,
 ref_seq=None,
 intervals=None,
 ):
 """
 Parameters
 ----------
 inputs : str
-File path to galaxy tool parameter
+File path to galaxy tool parameter.
 infile_estimator : str
-File path to estimator
+File path to estimator.
 infile1 : str
-File path to dataset containing features
+File path to dataset containing features.
 infile2 : str
-File path to dataset containing target values
+File path to dataset containing target values.
 outfile_result : str
-File path to save the results, either cv_results or test result
+File path to save the results, either cv_results or test result.
 outfile_object : str, optional
-File path to save searchCV object
+File path to save searchCV object.
-outfile_weights : str, optional
-File path to save deep learning model weights
 outfile_y_true : str, optional
-File path to target values for prediction
+File path to target values for prediction.
 outfile_y_preds : str, optional
-File path to save deep learning model weights
+File path to save predictions.
 groups : str
-File path to dataset containing groups labels
+File path to dataset containing groups labels.
 ref_seq : str
-File path to dataset containing genome sequence file
+File path to dataset containing genome sequence file.
 intervals : str
-File path to dataset containing interval file
+File path to dataset containing interval file.
 targets : str
-File path to dataset compressed target bed file
+File path to dataset compressed target bed file.
 fasta_path : str
-File path to dataset containing fasta file
+File path to dataset containing fasta file.
 """
 warnings.simplefilter("ignore")
 with open(inputs, "r") as param_handler:
 params = json.load(param_handler)
 #  load estimator
-with open(infile_estimator, "rb") as estimator_handler:
+estimator = load_model_from_h5(infile_estimator)
-estimator = load_model(estimator_handler)
 estimator = clean_params(estimator)
 # swap hyperparameter
 swapping = params["experiment_schemes"]["hyperparams_swapping"]
 else:
 infile2 = pd.read_csv(infile2, sep="\t", header=header, parse_dates=True)
 loaded_df[df_key] = infile2
 y = read_columns(
-infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True
+infile2,
+c=c,
+c_option=column_option,
+sep="\t",
+header=header,
+parse_dates=True,
 )
 if len(y.shape) == 2 and y.shape[1] == 1:
 y = y.ravel()
 if input_type == "refseq_and_interval":
 estimator.set_params(data_batch_generator__features=y.ravel().tolist())
 if main_est.__class__.__name__ == "IRAPSClassifier":
 main_est.set_params(memory=memory)
 # handle scorer, convert to scorer dict
 scoring = params["experiment_schemes"]["metrics"]["scoring"]
-if scoring is not None:
-# get_scoring() expects secondary_scoring to be a comma separated string (not a list)
-# Check if secondary_scoring is specified
-secondary_scoring = scoring.get("secondary_scoring", None)
-if secondary_scoring is not None:
-# If secondary_scoring is specified, convert the list into a comma-separated string
-scoring["secondary_scoring"] = ",".join(scoring["secondary_scoring"])
 scorer = get_scoring(scoring)
-scorer, _ = _check_multimetric_scoring(estimator, scoring=scorer)
+if not isinstance(scorer, (dict, list)):
+scorer = [scoring["primary_scoring"]]
+scorer = _check_multimetric_scoring(estimator, scoring=scorer)
 # handle test (first) split
 test_split_options = params["experiment_schemes"]["test_split"]["split_algos"]
 if test_split_options["shuffle"] == "group":
 else:
 raise ValueError(
 "Stratified shuffle split is not " "applicable on empty target values!"
 )
-(
+X_train, X_test, y_train, y_test, groups_train, groups_test = train_test_split_none(
-X_train,
+X, y, groups, **test_split_options
-X_test,
+)
-y_train,
-y_test,
-groups_train,
-_groups_test,
-) = train_test_split_none(X, y, groups, **test_split_options)
 exp_scheme = params["experiment_schemes"]["selected_exp_scheme"]
 # handle validation (second) split
 if exp_scheme == "train_val_test":
 X_train,
 X_val,
 y_train,
 y_val,
 groups_train,
-_groups_val,
+groups_val,
 ) = train_test_split_none(X_train, y_train, groups_train, **val_split_options)
 # train and eval
-if hasattr(estimator, "validation_data"):
+if hasattr(estimator, "config") and hasattr(estimator, "model_type"):
 if exp_scheme == "train_val_test":
 estimator.fit(X_train, y_train, validation_data=(X_val, y_val))
 else:
 estimator.fit(X_train, y_train, validation_data=(X_test, y_test))
 else:
 estimator.fit(X_train, y_train)
-if hasattr(estimator, "evaluate"):
+if isinstance(estimator, KerasGBatchClassifier):
+scores = {}
 steps = estimator.prediction_steps
 batch_size = estimator.batch_size
-generator = estimator.data_generator_.flow(
+data_generator = estimator.data_generator_
-X_test, y=y_test, batch_size=batch_size
+scores, predictions, y_true = _evaluate_keras_and_sklearn_scores(
+estimator,
+data_generator,
+X_test,
+y=y_test,
+sk_scoring=scoring,
+steps=steps,
+batch_size=batch_size,
+return_predictions=bool(outfile_y_true),
 )
-predictions, y_true = _predict_generator(
-estimator.model_, generator, steps=steps
+else:
-)
+scores = {}
-scores = _evaluate(y_true, predictions, scorer, is_multimetric=True)
+if hasattr(estimator, "model_") and hasattr(estimator.model_, "metrics_names"):
+batch_size = estimator.batch_size
-else:
+score_results = estimator.model_.evaluate(
+X_test, y=y_test, batch_size=batch_size, verbose=0
+)
+metrics_names = estimator.model_.metrics_names
+if not isinstance(metrics_names, list):
+scores[metrics_names] = score_results
+else:
+scores = dict(zip(metrics_names, score_results))
 if hasattr(estimator, "predict_proba"):
 predictions = estimator.predict_proba(X_test)
 else:
 predictions = estimator.predict(X_test)
 y_true = y_test
-scores = _score(estimator, X_test, y_test, scorer, is_multimetric=True)
+sk_scores = _score(estimator, X_test, y_test, scorer)
+scores.update(sk_scores)
+# handle output
 if outfile_y_true:
 try:
 pd.DataFrame(y_true).to_csv(outfile_y_true, sep="\t", index=False)
 pd.DataFrame(predictions).astype(np.float32).to_csv(
 outfile_y_preds,
 float_format="%g",
 chunksize=10000,
 )
 except Exception as e:
 print("Error in saving predictions: %s" % e)
 # handle output
 for name, score in scores.items():
 scores[name] = [score]
 df = pd.DataFrame(scores)
 df = df[sorted(df.columns)]
 df.to_csv(path_or_buf=outfile_result, sep="\t", header=True, index=False)
 memory.clear(warn=False)
 if outfile_object:
-main_est = estimator
+dump_model_to_h5(estimator, outfile_object)
-if isinstance(estimator, Pipeline):
-main_est = estimator.steps[-1][-1]
-if hasattr(main_est, "model_") and hasattr(main_est, "save_weights"):
-if outfile_weights:
-main_est.save_weights(outfile_weights)
-del main_est.model_
-del main_est.fit_params
-del main_est.model_class_
-if getattr(main_est, "validation_data", None):
-del main_est.validation_data
-if getattr(main_est, "data_generator_", None):
-del main_est.data_generator_
-with open(outfile_object, "wb") as output_handler:
-pickle.dump(estimator, output_handler, pickle.HIGHEST_PROTOCOL)
 if __name__ == "__main__":
 aparser = argparse.ArgumentParser()
 aparser.add_argument("-i", "--inputs", dest="inputs", required=True)
 aparser.add_argument("-e", "--estimator", dest="infile_estimator")
 aparser.add_argument("-X", "--infile1", dest="infile1")
 aparser.add_argument("-y", "--infile2", dest="infile2")
 aparser.add_argument("-O", "--outfile_result", dest="outfile_result")
 aparser.add_argument("-o", "--outfile_object", dest="outfile_object")
-aparser.add_argument("-w", "--outfile_weights", dest="outfile_weights")
 aparser.add_argument("-l", "--outfile_y_true", dest="outfile_y_true")
 aparser.add_argument("-p", "--outfile_y_preds", dest="outfile_y_preds")
 aparser.add_argument("-g", "--groups", dest="groups")
 aparser.add_argument("-r", "--ref_seq", dest="ref_seq")
 aparser.add_argument("-b", "--intervals", dest="intervals")
 args.infile_estimator,
 args.infile1,
 args.infile2,
 args.outfile_result,
 outfile_object=args.outfile_object,
-outfile_weights=args.outfile_weights,
 outfile_y_true=args.outfile_y_true,
 outfile_y_preds=args.outfile_y_preds,
 groups=args.groups,
 ref_seq=args.ref_seq,
 intervals=args.intervals,

Mercurial > repos > bgruening > sklearn_data_preprocess

comparison keras_train_and_eval.py @ 41:a16f33c6ca64 draft