Mercurial > repos > bgruening > sklearn_data_preprocess
diff keras_train_and_eval.py @ 37:1bef885255e0 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
author | bgruening |
---|---|
date | Sat, 01 May 2021 01:41:14 +0000 |
parents | b75cae00f980 |
children | a16f33c6ca64 |
line wrap: on
line diff
--- a/keras_train_and_eval.py Tue Apr 13 22:16:07 2021 +0000 +++ b/keras_train_and_eval.py Sat May 01 01:41:14 2021 +0000 @@ -11,16 +11,9 @@ from galaxy_ml.externals.selene_sdk.utils import compute_score from galaxy_ml.keras_galaxy_models import _predict_generator from galaxy_ml.model_validations import train_test_split -from galaxy_ml.utils import ( - clean_params, - get_main_estimator, - get_module, - get_scoring, - load_model, - read_columns, - SafeEval, - try_get_attr, -) +from galaxy_ml.utils import (clean_params, get_main_estimator, + get_module, get_scoring, load_model, read_columns, + SafeEval, try_get_attr) from scipy.io import mmread from sklearn.metrics.scorer import _check_multimetric_scoring from sklearn.model_selection import _search, _validation @@ -28,7 +21,6 @@ from sklearn.pipeline import Pipeline from sklearn.utils import indexable, safe_indexing - _fit_and_score = try_get_attr("galaxy_ml.model_validations", "_fit_and_score") setattr(_search, "_fit_and_score", _fit_and_score) setattr(_validation, "_fit_and_score", _fit_and_score) @@ -56,7 +48,10 @@ param_name = p["sp_name"] if param_name.lower().endswith(NON_SEARCHABLE): - warnings.warn("Warning: `%s` is not eligible for search and was " "omitted!" % param_name) + warnings.warn( + "Warning: `%s` is not eligible for search and was " + "omitted!" % param_name + ) continue if not swap_value.startswith(":"): @@ -99,7 +94,11 @@ index_arr = np.arange(n_samples) test = index_arr[np.isin(groups, group_names)] train = index_arr[~np.isin(groups, group_names)] - rval = list(chain.from_iterable((safe_indexing(a, train), safe_indexing(a, test)) for a in new_arrays)) + rval = list( + chain.from_iterable( + (safe_indexing(a, train), safe_indexing(a, test)) for a in new_arrays + ) + ) else: rval = train_test_split(*new_arrays, **kwargs) @@ -127,14 +126,22 @@ pred_labels = (pred_probas > 0.5).astype("int32") targets = y_true.ravel().astype("int32") if not is_multimetric: - preds = pred_labels if scorer.__class__.__name__ == "_PredictScorer" else pred_probas + preds = ( + pred_labels + if scorer.__class__.__name__ == "_PredictScorer" + else pred_probas + ) score = scorer._score_func(targets, preds, **scorer._kwargs) return score else: scores = {} for name, one_scorer in scorer.items(): - preds = pred_labels if one_scorer.__class__.__name__ == "_PredictScorer" else pred_probas + preds = ( + pred_labels + if one_scorer.__class__.__name__ == "_PredictScorer" + else pred_probas + ) score = one_scorer._score_func(targets, preds, **one_scorer._kwargs) scores[name] = score @@ -144,13 +151,21 @@ pred_labels = (pred_probas > 0.5).astype("int32") targets = y_true.astype("int32") if not is_multimetric: - preds = pred_labels if scorer.__class__.__name__ == "_PredictScorer" else pred_probas + preds = ( + pred_labels + if scorer.__class__.__name__ == "_PredictScorer" + else pred_probas + ) score, _ = compute_score(preds, targets, scorer._score_func) return score else: scores = {} for name, one_scorer in scorer.items(): - preds = pred_labels if one_scorer.__class__.__name__ == "_PredictScorer" else pred_probas + preds = ( + pred_labels + if one_scorer.__class__.__name__ == "_PredictScorer" + else pred_probas + ) score, _ = compute_score(preds, targets, one_scorer._score_func) scores[name] = score @@ -243,7 +258,9 @@ # tabular input if input_type == "tabular": header = "infer" if params["input_options"]["header1"] else None - column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"] + column_option = params["input_options"]["column_selector_options_1"][ + "selected_column_selector_option" + ] if column_option in [ "by_index_number", "all_but_by_index_number", @@ -295,7 +312,9 @@ # Get target y header = "infer" if params["input_options"]["header2"] else None - column_option = params["input_options"]["column_selector_options_2"]["selected_column_selector_option2"] + column_option = params["input_options"]["column_selector_options_2"][ + "selected_column_selector_option2" + ] if column_option in [ "by_index_number", "all_but_by_index_number", @@ -313,12 +332,9 @@ infile2 = pd.read_csv(infile2, sep="\t", header=header, parse_dates=True) loaded_df[df_key] = infile2 - y = read_columns(infile2, - c=c, - c_option=column_option, - sep='\t', - header=header, - parse_dates=True) + y = read_columns( + infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True + ) if len(y.shape) == 2 and y.shape[1] == 1: y = y.ravel() if input_type == "refseq_and_interval": @@ -328,10 +344,14 @@ # load groups if groups: - groups_selector = (params["experiment_schemes"]["test_split"]["split_algos"]).pop("groups_selector") + groups_selector = ( + params["experiment_schemes"]["test_split"]["split_algos"] + ).pop("groups_selector") header = "infer" if groups_selector["header_g"] else None - column_option = groups_selector["column_selector_options_g"]["selected_column_selector_option_g"] + column_option = groups_selector["column_selector_options_g"][ + "selected_column_selector_option_g" + ] if column_option in [ "by_index_number", "all_but_by_index_number", @@ -346,12 +366,14 @@ if df_key in loaded_df: groups = loaded_df[df_key] - groups = read_columns(groups, - c=c, - c_option=column_option, - sep='\t', - header=header, - parse_dates=True) + groups = read_columns( + groups, + c=c, + c_option=column_option, + sep="\t", + header=header, + parse_dates=True, + ) groups = groups.ravel() # del loaded_df @@ -364,7 +386,7 @@ main_est.set_params(memory=memory) # handle scorer, convert to scorer dict - scoring = params['experiment_schemes']['metrics']['scoring'] + scoring = params["experiment_schemes"]["metrics"]["scoring"] if scoring is not None: # get_scoring() expects secondary_scoring to be a comma separated string (not a list) # Check if secondary_scoring is specified @@ -385,7 +407,9 @@ if y is not None: test_split_options["labels"] = y else: - raise ValueError("Stratified shuffle split is not " "applicable on empty target values!") + raise ValueError( + "Stratified shuffle split is not " "applicable on empty target values!" + ) ( X_train, @@ -408,7 +432,10 @@ if y_train is not None: val_split_options["labels"] = y_train else: - raise ValueError("Stratified shuffle split is not " "applicable on empty target values!") + raise ValueError( + "Stratified shuffle split is not " + "applicable on empty target values!" + ) ( X_train, @@ -431,8 +458,12 @@ if hasattr(estimator, "evaluate"): steps = estimator.prediction_steps batch_size = estimator.batch_size - generator = estimator.data_generator_.flow(X_test, y=y_test, batch_size=batch_size) - predictions, y_true = _predict_generator(estimator.model_, generator, steps=steps) + generator = estimator.data_generator_.flow( + X_test, y=y_test, batch_size=batch_size + ) + predictions, y_true = _predict_generator( + estimator.model_, generator, steps=steps + ) scores = _evaluate(y_true, predictions, scorer, is_multimetric=True) else: