bgruening/sklearn_train_test_eval: comparison of search_model_validation.py @ 11:caf7d2b71a48 (draft)
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ea12f973df4b97a2691d9e4ce6bf6fae59d57717"
| field | value |
|---|---|
| author | bgruening |
| date | Sat, 01 May 2021 01:47:26 +0000 |
| parents | a9e0b963b7bb |
| children | 2eb5c017958d |
comparing 10:a9e0b963b7bb to 11:caf7d2b71a48
@@ -9,34 +9,19 @@
 import imblearn
 import joblib
 import numpy as np
 import pandas as pd
 import skrebate
-from galaxy_ml.utils import (
-    clean_params,
-    get_cv,
-    get_main_estimator,
-    get_module,
-    get_scoring,
-    load_model,
-    read_columns,
-    SafeEval,
-    try_get_attr
-)
+from galaxy_ml.utils import (clean_params, get_cv,
+                             get_main_estimator, get_module, get_scoring,
+                             load_model, read_columns, SafeEval, try_get_attr)
 from scipy.io import mmread
-from sklearn import (
-    cluster,
-    decomposition,
-    feature_selection,
-    kernel_approximation,
-    model_selection,
-    preprocessing,
-)
+from sklearn import (cluster, decomposition, feature_selection,
+                     kernel_approximation, model_selection, preprocessing)
 from sklearn.exceptions import FitFailedWarning
 from sklearn.model_selection import _search, _validation
 from sklearn.model_selection._validation import _score, cross_validate
-
 
 _fit_and_score = try_get_attr("galaxy_ml.model_validations", "_fit_and_score")
 setattr(_search, "_fit_and_score", _fit_and_score)
 setattr(_validation, "_fit_and_score", _fit_and_score)
 
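Note: the hunk above rebinds sklearn's private `_fit_and_score` helper to galaxy_ml's version by assigning through the `_search` and `_validation` module objects, so any code that resolves the name via those modules at call time picks up the replacement. A minimal sketch of the same patching pattern, with a hypothetical tracing wrapper standing in for galaxy_ml's implementation (note that `_fit_and_score` is private and its location can shift between sklearn releases):

```python
import functools

from sklearn.model_selection import _validation

_original_fit_and_score = _validation._fit_and_score  # keep the real helper


@functools.wraps(_original_fit_and_score)
def _traced_fit_and_score(*args, **kwargs):
    # hypothetical wrapper: trace each CV fit, then delegate unchanged
    print("fit-and-score invoked")
    return _original_fit_and_score(*args, **kwargs)


# same technique as the script: rebind the name on the module object
setattr(_validation, "_fit_and_score", _traced_fit_and_score)
```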
@@ -55,11 +40,14 @@
         if search_list == "":
             continue
 
         param_name = p["sp_name"]
         if param_name.lower().endswith(NON_SEARCHABLE):
-            print("Warning: `%s` is not eligible for search and was " "omitted!" % param_name)
+            print(
+                "Warning: `%s` is not eligible for search and was "
+                "omitted!" % param_name
+            )
             continue
 
         if not search_list.startswith(":"):
             safe_eval = SafeEval(load_scipy=True, load_numpy=True)
             ev = safe_eval(search_list)
 
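Note: `SafeEval` is galaxy_ml's restricted expression evaluator; with `load_scipy`/`load_numpy` enabled it lets a Galaxy user type numpy/scipy expressions for hyperparameter lists without exposing a full `eval`. A hedged usage sketch (requires galaxy_ml; the example expressions are assumptions about what a user might enter in the form):

```python
from galaxy_ml.utils import SafeEval

safe_eval = SafeEval(load_scipy=True, load_numpy=True)

# a search-space expression as it might arrive from the Galaxy form
candidates = safe_eval("np.logspace(-3, 3, 7)")  # e.g. C values for an SVM
simple = safe_eval("[10, 50, 100]")              # plain Python literals work too
```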
@@ -88,11 +76,13 @@
             decomposition.FactorAnalysis(random_state=0),
             decomposition.FastICA(random_state=0),
             decomposition.IncrementalPCA(),
             decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
             decomposition.LatentDirichletAllocation(random_state=0, n_jobs=N_JOBS),
-            decomposition.MiniBatchDictionaryLearning(random_state=0, n_jobs=N_JOBS),
+            decomposition.MiniBatchDictionaryLearning(
+                random_state=0, n_jobs=N_JOBS
+            ),
             decomposition.MiniBatchSparsePCA(random_state=0, n_jobs=N_JOBS),
             decomposition.NMF(random_state=0),
             decomposition.PCA(random_state=0),
             decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
             decomposition.TruncatedSVD(random_state=0),
 
@@ -105,26 +95,40 @@
             skrebate.SURF(n_jobs=N_JOBS),
             skrebate.SURFstar(n_jobs=N_JOBS),
             skrebate.MultiSURF(n_jobs=N_JOBS),
             skrebate.MultiSURFstar(n_jobs=N_JOBS),
             imblearn.under_sampling.ClusterCentroids(random_state=0, n_jobs=N_JOBS),
-            imblearn.under_sampling.CondensedNearestNeighbour(random_state=0, n_jobs=N_JOBS),
-            imblearn.under_sampling.EditedNearestNeighbours(random_state=0, n_jobs=N_JOBS),
-            imblearn.under_sampling.RepeatedEditedNearestNeighbours(random_state=0, n_jobs=N_JOBS),
+            imblearn.under_sampling.CondensedNearestNeighbour(
+                random_state=0, n_jobs=N_JOBS
+            ),
+            imblearn.under_sampling.EditedNearestNeighbours(
+                random_state=0, n_jobs=N_JOBS
+            ),
+            imblearn.under_sampling.RepeatedEditedNearestNeighbours(
+                random_state=0, n_jobs=N_JOBS
+            ),
             imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
-            imblearn.under_sampling.InstanceHardnessThreshold(random_state=0, n_jobs=N_JOBS),
+            imblearn.under_sampling.InstanceHardnessThreshold(
+                random_state=0, n_jobs=N_JOBS
+            ),
             imblearn.under_sampling.NearMiss(random_state=0, n_jobs=N_JOBS),
-            imblearn.under_sampling.NeighbourhoodCleaningRule(random_state=0, n_jobs=N_JOBS),
-            imblearn.under_sampling.OneSidedSelection(random_state=0, n_jobs=N_JOBS),
+            imblearn.under_sampling.NeighbourhoodCleaningRule(
+                random_state=0, n_jobs=N_JOBS
+            ),
+            imblearn.under_sampling.OneSidedSelection(
+                random_state=0, n_jobs=N_JOBS
+            ),
             imblearn.under_sampling.RandomUnderSampler(random_state=0),
             imblearn.under_sampling.TomekLinks(random_state=0, n_jobs=N_JOBS),
             imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
             imblearn.over_sampling.RandomOverSampler(random_state=0),
             imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
             imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS),
             imblearn.over_sampling.BorderlineSMOTE(random_state=0, n_jobs=N_JOBS),
-            imblearn.over_sampling.SMOTENC(categorical_features=[], random_state=0, n_jobs=N_JOBS),
+            imblearn.over_sampling.SMOTENC(
+                categorical_features=[], random_state=0, n_jobs=N_JOBS
+            ),
             imblearn.combine.SMOTEENN(random_state=0),
             imblearn.combine.SMOTETomek(random_state=0),
         )
         newlist = []
         for obj in ev:
 
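Note: the tuple above enumerates the preprocessors, decomposers, feature selectors, and imblearn samplers the tool will consider when a search list names pipeline components; seeding each with `random_state=0` keeps searches reproducible, and `n_jobs=N_JOBS` bounds parallelism. As an illustration of how such a candidate list is typically consumed in a grid search (this pipeline and grid are hypothetical, not the script's own):

```python
from sklearn import decomposition, preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# a placeholder step that the grid search can swap wholesale
pipe = Pipeline([("pre", "passthrough"), ("clf", LogisticRegression())])
param_grid = {
    "pre": [
        preprocessing.StandardScaler(),
        decomposition.PCA(random_state=0),
        decomposition.TruncatedSVD(random_state=0),
    ]
}
search = GridSearchCV(pipe, param_grid, cv=3)
```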
@@ -203,11 +207,13 @@
 
     input_type = params["input_options"]["selected_input"]
     # tabular input
     if input_type == "tabular":
         header = "infer" if params["input_options"]["header1"] else None
-        column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"]
+        column_option = params["input_options"]["column_selector_options_1"][
+            "selected_column_selector_option"
+        ]
         if column_option in [
             "by_index_number",
             "all_but_by_index_number",
             "by_header_name",
             "all_but_by_header_name",
 
@@ -259,11 +265,13 @@
         n_intervals = sum(1 for line in open(intervals))
         X = np.arange(n_intervals)[:, np.newaxis]
 
     # Get target y
     header = "infer" if params["input_options"]["header2"] else None
-    column_option = params["input_options"]["column_selector_options_2"]["selected_column_selector_option2"]
+    column_option = params["input_options"]["column_selector_options_2"][
+        "selected_column_selector_option2"
+    ]
     if column_option in [
         "by_index_number",
         "all_but_by_index_number",
         "by_header_name",
         "all_but_by_header_name",
 
@@ -277,11 +285,13 @@
         infile2 = loaded_df[df_key]
     else:
         infile2 = pd.read_csv(infile2, sep="\t", header=header, parse_dates=True)
         loaded_df[df_key] = infile2
 
-    y = read_columns(infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True)
+    y = read_columns(
+        infile2, c=c, c_option=column_option, sep="\t", header=header, parse_dates=True
+    )
     if len(y.shape) == 2 and y.shape[1] == 1:
         y = y.ravel()
     if input_type == "refseq_and_interval":
         estimator.set_params(data_batch_generator__features=y.ravel().tolist())
         y = None
 
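Note: `read_columns` is galaxy_ml's column-extraction helper; the call above pulls the target column(s) out of the tabular file according to the selector option. A rough, self-contained pandas equivalent for the `by_index_number` case (the 1-based column indexing is an assumption of this sketch):

```python
import io

import pandas as pd

tsv = "label\n1\n0\n1\n"  # stand-in for the uploaded tabular file
infile2 = pd.read_csv(io.StringIO(tsv), sep="\t", header=0, parse_dates=True)

c = [1]  # 1-based column index list as selected in the Galaxy form (assumed)
y = infile2.iloc[:, [i - 1 for i in c]].values
if len(y.shape) == 2 and y.shape[1] == 1:
    y = y.ravel()  # flatten single-column targets, as the script does
```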
@@ -376,16 +386,20 @@
     if split_options["shuffle"] == "stratified":
         split_options["labels"] = y
         X, X_test, y, y_test = train_test_split(X, y, **split_options)
     elif split_options["shuffle"] == "group":
         if groups is None:
-            raise ValueError("No group-based CV option was chosen for " "group shuffle!")
+            raise ValueError(
+                "No group-based CV option was chosen for " "group shuffle!"
+            )
         split_options["labels"] = groups
         if y is None:
             X, X_test, groups, _ = train_test_split(X, groups, **split_options)
         else:
-            X, X_test, y, y_test, groups, _ = train_test_split(X, y, groups, **split_options)
+            X, X_test, y, y_test, groups, _ = train_test_split(
+                X, y, groups, **split_options
+            )
     else:
         if split_options["shuffle"] == "None":
             split_options["shuffle"] = None
         X, X_test, y, y_test = train_test_split(X, y, **split_options)
 
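Note: the `train_test_split` here appears to be galaxy_ml's extended splitter (sklearn's `train_test_split` accepts neither `shuffle="group"` nor a `labels` option). Plain sklearn reproduces the group-aware branch with `GroupShuffleSplit`, roughly:

```python
import numpy as np
from sklearn.model_selection import GroupShuffleSplit

X = np.arange(20).reshape(10, 2)
y = np.arange(10)
groups = np.array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4])

# one shuffled split that keeps every group entirely in train or in test
gss = GroupShuffleSplit(n_splits=1, test_size=0.25, random_state=0)
train_idx, test_idx = next(gss.split(X, y, groups=groups))
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]
```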
@@ -409,13 +423,17 @@
 
     best_estimator_ = getattr(searcher, "best_estimator_")
 
     # TODO Solve deep learning models in pipeline
     if best_estimator_.__class__.__name__ == "KerasGBatchClassifier":
-        test_score = best_estimator_.evaluate(X_test, scorer=scorer_, is_multimetric=is_multimetric)
+        test_score = best_estimator_.evaluate(
+            X_test, scorer=scorer_, is_multimetric=is_multimetric
+        )
     else:
-        test_score = _score(best_estimator_, X_test, y_test, scorer_, is_multimetric=is_multimetric)
+        test_score = _score(
+            best_estimator_, X_test, y_test, scorer_, is_multimetric=is_multimetric
+        )
 
     if not is_multimetric:
         test_score = {primary_scoring: test_score}
     for key, value in test_score.items():
         test_score[key] = [value]
 
@@ -485,11 +503,13 @@
 
    with open(inputs, "r") as param_handler:
        params = json.load(param_handler)
 
    # Override the refit parameter
-    params["search_schemes"]["options"]["refit"] = True if params["save"] != "nope" else False
+    params["search_schemes"]["options"]["refit"] = (
+        True if params["save"] != "nope" else False
+    )
 
    with open(infile_estimator, "rb") as estimator_handler:
        estimator = load_model(estimator_handler)
 
    optimizer = params["search_schemes"]["selected_search_scheme"]
 
@@ -497,21 +517,25 @@
 
    # handle gridsearchcv options
    options = params["search_schemes"]["options"]
 
    if groups:
-        header = "infer" if (options["cv_selector"]["groups_selector"]["header_g"]) else None
-        column_option = options["cv_selector"]["groups_selector"]["column_selector_options_g"][
-            "selected_column_selector_option_g"
-        ]
+        header = (
+            "infer" if (options["cv_selector"]["groups_selector"]["header_g"]) else None
+        )
+        column_option = options["cv_selector"]["groups_selector"][
+            "column_selector_options_g"
+        ]["selected_column_selector_option_g"]
        if column_option in [
            "by_index_number",
            "all_but_by_index_number",
            "by_header_name",
            "all_but_by_header_name",
        ]:
-            c = options["cv_selector"]["groups_selector"]["column_selector_options_g"]["col_g"]
+            c = options["cv_selector"]["groups_selector"]["column_selector_options_g"][
+                "col_g"
+            ]
        else:
            c = None
 
        df_key = groups + repr(header)
 
@@ -535,16 +559,18 @@
    # get_scoring() expects secondary_scoring to be a comma-separated string (not a list)
    # Check if secondary_scoring is specified
    secondary_scoring = options["scoring"].get("secondary_scoring", None)
    if secondary_scoring is not None:
        # If secondary_scoring is specified, convert the list into a comma-separated string
-        options["scoring"]["secondary_scoring"] = ",".join(options["scoring"]["secondary_scoring"])
+        options["scoring"]["secondary_scoring"] = ",".join(
+            options["scoring"]["secondary_scoring"]
+        )
    options["scoring"] = get_scoring(options["scoring"])
    if options["error_score"]:
        options["error_score"] = "raise"
    else:
-        options["error_score"] = np.NaN
+        options["error_score"] = np.nan
    if options["refit"] and isinstance(options["scoring"], dict):
        options["refit"] = primary_scoring
    if "pre_dispatch" in options and options["pre_dispatch"] == "":
        options["pre_dispatch"] = None
 
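Note: `get_scoring` is galaxy_ml's wrapper that turns the form's primary/secondary choices into an sklearn scoring argument. With plain sklearn, the equivalent multi-metric setup looks roughly like this (the scorer names are assumptions for illustration):

```python
from sklearn.metrics import get_scorer

primary_scoring = "balanced_accuracy"  # assumed form selection
secondary = ["accuracy", "f1_macro"]   # assumed form selection
scoring = {name: get_scorer(name) for name in [primary_scoring] + secondary}

# With a multi-metric dict, refit must name exactly one of its keys,
# which is why the script forces options["refit"] = primary_scoring, e.g.:
# GridSearchCV(estimator, param_grid, scoring=scoring, refit=primary_scoring)
```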
@@ -586,11 +612,13 @@
 
    if split_mode == "nested_cv":
        # make sure refit is chosen
        # this could be True for sklearn models, but not the case for
        # deep learning models
-        if not options["refit"] and not all(hasattr(estimator, attr) for attr in ("config", "model_type")):
+        if not options["refit"] and not all(
+            hasattr(estimator, attr) for attr in ("config", "model_type")
+        ):
            warnings.warn("Refit is changed to `True` for nested validation!")
            setattr(searcher, "refit", True)
 
        outer_cv, _ = get_cv(params["outer_split"]["cv_selector"])
        # nested CV, outer cv using cross_validate
 
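Note: nested CV here wraps the (refitted) search object in an outer `cross_validate`, so hyperparameter tuning happens inside every outer fold and the outer score estimates generalization without selection bias. A self-contained sketch of that pattern with stock sklearn pieces (the estimator and grid are placeholders):

```python
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV, KFold, cross_validate
from sklearn.svm import SVC

X, y = load_iris(return_X_y=True)

# inner loop: the search itself, refit so each outer fold yields a fitted model
inner_search = GridSearchCV(SVC(), {"C": [0.1, 1, 10]}, cv=3, refit=True)

# outer loop: cross-validate the whole search procedure
outer_cv = KFold(n_splits=5, shuffle=True, random_state=0)
results = cross_validate(inner_search, X, y, cv=outer_cv)
print(results["test_score"])
```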
@@ -685,11 +713,13 @@
        for warning in w:
            print(repr(warning.message))
 
        cv_results = pd.DataFrame(searcher.cv_results_)
        cv_results = cv_results[sorted(cv_results.columns)]
-        cv_results.to_csv(path_or_buf=outfile_result, sep="\t", header=True, index=False)
+        cv_results.to_csv(
+            path_or_buf=outfile_result, sep="\t", header=True, index=False
+        )
 
    memory.clear(warn=False)
 
    # output best estimator, and weights if applicable
    if outfile_object: