diff search_model_validation.py @ 11:caf7d2b71a48 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
author:   bgruening
date:     Sat, 01 May 2021 01:47:26 +0000
parents:  a9e0b963b7bb
children: 2eb5c017958d
--- a/search_model_validation.py	Tue Apr 13 22:04:06 2021 +0000
+++ b/search_model_validation.py	Sat May 01 01:47:26 2021 +0000
@@ -11,31 +11,16 @@
 import numpy as np
 import pandas as pd
 import skrebate
-from galaxy_ml.utils import (
-    clean_params,
-    get_cv,
-    get_main_estimator,
-    get_module,
-    get_scoring,
-    load_model,
-    read_columns,
-    SafeEval,
-    try_get_attr
-)
+from galaxy_ml.utils import (clean_params, get_cv,
+                             get_main_estimator, get_module, get_scoring,
+                             load_model, read_columns, SafeEval, try_get_attr)
 from scipy.io import mmread
-from sklearn import (
-    cluster,
-    decomposition,
-    feature_selection,
-    kernel_approximation,
-    model_selection,
-    preprocessing,
-)
+from sklearn import (cluster, decomposition, feature_selection,
+                     kernel_approximation, model_selection, preprocessing)
 from sklearn.exceptions import FitFailedWarning
 from sklearn.model_selection import _search, _validation
 from sklearn.model_selection._validation import _score, cross_validate
-
 _fit_and_score = try_get_attr("galaxy_ml.model_validations", "_fit_and_score")
 setattr(_search, "_fit_and_score", _fit_and_score)
 setattr(_validation, "_fit_and_score", _fit_and_score)
@@ -57,7 +42,10 @@
 
         param_name = p["sp_name"]
         if param_name.lower().endswith(NON_SEARCHABLE):
-            print("Warning: `%s` is not eligible for search and was " "omitted!" % param_name)
+            print(
+                "Warning: `%s` is not eligible for search and was "
+                "omitted!" % param_name
+            )
             continue
 
         if not search_list.startswith(":"):
@@ -90,7 +78,9 @@
         decomposition.IncrementalPCA(),
         decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
         decomposition.LatentDirichletAllocation(random_state=0, n_jobs=N_JOBS),
-        decomposition.MiniBatchDictionaryLearning(random_state=0, n_jobs=N_JOBS),
+        decomposition.MiniBatchDictionaryLearning(
+            random_state=0, n_jobs=N_JOBS
+        ),
         decomposition.MiniBatchSparsePCA(random_state=0, n_jobs=N_JOBS),
         decomposition.NMF(random_state=0),
         decomposition.PCA(random_state=0),
@@ -107,14 +97,26 @@
         skrebate.MultiSURF(n_jobs=N_JOBS),
         skrebate.MultiSURFstar(n_jobs=N_JOBS),
         imblearn.under_sampling.ClusterCentroids(random_state=0, n_jobs=N_JOBS),
-        imblearn.under_sampling.CondensedNearestNeighbour(random_state=0, n_jobs=N_JOBS),
-        imblearn.under_sampling.EditedNearestNeighbours(random_state=0, n_jobs=N_JOBS),
-        imblearn.under_sampling.RepeatedEditedNearestNeighbours(random_state=0, n_jobs=N_JOBS),
+        imblearn.under_sampling.CondensedNearestNeighbour(
+            random_state=0, n_jobs=N_JOBS
+        ),
+        imblearn.under_sampling.EditedNearestNeighbours(
+            random_state=0, n_jobs=N_JOBS
+        ),
+        imblearn.under_sampling.RepeatedEditedNearestNeighbours(
+            random_state=0, n_jobs=N_JOBS
+        ),
         imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
-        imblearn.under_sampling.InstanceHardnessThreshold(random_state=0, n_jobs=N_JOBS),
+        imblearn.under_sampling.InstanceHardnessThreshold(
+            random_state=0, n_jobs=N_JOBS
+        ),
         imblearn.under_sampling.NearMiss(random_state=0, n_jobs=N_JOBS),
-        imblearn.under_sampling.NeighbourhoodCleaningRule(random_state=0, n_jobs=N_JOBS),
-        imblearn.under_sampling.OneSidedSelection(random_state=0, n_jobs=N_JOBS),
+        imblearn.under_sampling.NeighbourhoodCleaningRule(
+            random_state=0, n_jobs=N_JOBS
+        ),
+        imblearn.under_sampling.OneSidedSelection(
+            random_state=0, n_jobs=N_JOBS
+        ),
         imblearn.under_sampling.RandomUnderSampler(random_state=0),
         imblearn.under_sampling.TomekLinks(random_state=0, n_jobs=N_JOBS),
         imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
@@ -122,7 +124,9 @@
         imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
         imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS),
         imblearn.over_sampling.BorderlineSMOTE(random_state=0, n_jobs=N_JOBS),
-        imblearn.over_sampling.SMOTENC(categorical_features=[], random_state=0, n_jobs=N_JOBS),
+        imblearn.over_sampling.SMOTENC(
+            categorical_features=[], random_state=0, n_jobs=N_JOBS
+        ),
        imblearn.combine.SMOTEENN(random_state=0),
        imblearn.combine.SMOTETomek(random_state=0),
    )
@@ -205,7 +209,9 @@
     # tabular input
     if input_type == "tabular":
         header = "infer" if params["input_options"]["header1"] else None
-        column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"]
+        column_option = params["input_options"]["column_selector_options_1"][
+            "selected_column_selector_option"
+        ]
         if column_option in [
             "by_index_number",
             "all_but_by_index_number",
@@ -261,7 +267,9 @@
 
     # Get target y
     header = "infer" if params["input_options"]["header2"] else None
-    column_option = params["input_options"]["column_selector_options_2"]["selected_column_selector_option2"]
+    column_option = params["input_options"]["column_selector_options_2"][
+        "selected_column_selector_option2"
+    ]
     if column_option in [
         "by_index_number",
         "all_but_by_index_number",
@@ -279,7 +287,9 @@
         infile2 = pd.read_csv(infile2, sep="\t", header=header, parse_dates=True)
         loaded_df[df_key] = infile2
 
-    y = read_columns(infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True)
+    y = read_columns(
+        infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True
+    )
     if len(y.shape) == 2 and y.shape[1] == 1:
         y = y.ravel()
     if input_type == "refseq_and_interval":
@@ -378,12 +388,16 @@
         X, X_test, y, y_test = train_test_split(X, y, **split_options)
     elif split_options["shuffle"] == "group":
         if groups is None:
-            raise ValueError("No group based CV option was choosen for " "group shuffle!")
+            raise ValueError(
+                "No group based CV option was choosen for " "group shuffle!"
+            )
         split_options["labels"] = groups
         if y is None:
             X, X_test, groups, _ = train_test_split(X, groups, **split_options)
         else:
-            X, X_test, y, y_test, groups, _ = train_test_split(X, y, groups, **split_options)
+            X, X_test, y, y_test, groups, _ = train_test_split(
+                X, y, groups, **split_options
+            )
     else:
         if split_options["shuffle"] == "None":
             split_options["shuffle"] = None
@@ -411,9 +425,13 @@
 
     # TODO Solve deep learning models in pipeline
     if best_estimator_.__class__.__name__ == "KerasGBatchClassifier":
-        test_score = best_estimator_.evaluate(X_test, scorer=scorer_, is_multimetric=is_multimetric)
+        test_score = best_estimator_.evaluate(
+            X_test, scorer=scorer_, is_multimetric=is_multimetric
+        )
     else:
-        test_score = _score(best_estimator_, X_test, y_test, scorer_, is_multimetric=is_multimetric)
+        test_score = _score(
+            best_estimator_, X_test, y_test, scorer_, is_multimetric=is_multimetric
+        )
 
     if not is_multimetric:
         test_score = {primary_scoring: test_score}
@@ -487,7 +505,9 @@
         params = json.load(param_handler)
 
     # Override the refit parameter
-    params["search_schemes"]["options"]["refit"] = True if params["save"] != "nope" else False
+    params["search_schemes"]["options"]["refit"] = (
+        True if params["save"] != "nope" else False
+    )
 
     with open(infile_estimator, "rb") as estimator_handler:
         estimator = load_model(estimator_handler)
@@ -499,17 +519,21 @@
     options = params["search_schemes"]["options"]
 
     if groups:
-        header = "infer" if (options["cv_selector"]["groups_selector"]["header_g"]) else None
-        column_option = options["cv_selector"]["groups_selector"]["column_selector_options_g"][
-            "selected_column_selector_option_g"
-        ]
+        header = (
+            "infer" if (options["cv_selector"]["groups_selector"]["header_g"]) else None
+        )
+        column_option = options["cv_selector"]["groups_selector"][
+            "column_selector_options_g"
+        ]["selected_column_selector_option_g"]
         if column_option in [
             "by_index_number",
            "all_but_by_index_number",
            "by_header_name",
            "all_but_by_header_name",
        ]:
-            c = options["cv_selector"]["groups_selector"]["column_selector_options_g"]["col_g"]
+            c = options["cv_selector"]["groups_selector"]["column_selector_options_g"][
+                "col_g"
+            ]
         else:
             c = None
 
@@ -537,12 +561,14 @@
     secondary_scoring = options["scoring"].get("secondary_scoring", None)
     if secondary_scoring is not None:
         # If secondary_scoring is specified, convert the list into comman separated string
-        options["scoring"]["secondary_scoring"] = ",".join(options["scoring"]["secondary_scoring"])
+        options["scoring"]["secondary_scoring"] = ",".join(
+            options["scoring"]["secondary_scoring"]
+        )
     options["scoring"] = get_scoring(options["scoring"])
     if options["error_score"]:
         options["error_score"] = "raise"
     else:
-        options["error_score"] = np.NaN
+        options["error_score"] = np.nan
     if options["refit"] and isinstance(options["scoring"], dict):
         options["refit"] = primary_scoring
     if "pre_dispatch" in options and options["pre_dispatch"] == "":
         options["pre_dispatch"] = None
@@ -588,7 +614,9 @@
     # make sure refit is choosen
     # this could be True for sklearn models, but not the case for
     # deep learning models
-    if not options["refit"] and not all(hasattr(estimator, attr) for attr in ("config", "model_type")):
+    if not options["refit"] and not all(
+        hasattr(estimator, attr) for attr in ("config", "model_type")
+    ):
         warnings.warn("Refit is change to `True` for nested validation!")
         setattr(searcher, "refit", True)
@@ -687,7 +715,9 @@
 
     cv_results = pd.DataFrame(searcher.cv_results_)
     cv_results = cv_results[sorted(cv_results.columns)]
-    cv_results.to_csv(path_or_buf=outfile_result, sep="\t", header=True, index=False)
+    cv_results.to_csv(
+        path_or_buf=outfile_result, sep="\t", header=True, index=False
+    )
 
     memory.clear(warn=False)