diff model_validation.xml @ 34:1fe00785190d draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 9981e25b00de29ed881b2229a173a8c812ded9bb
author:   bgruening
date:     Wed, 09 Aug 2023 13:44:18 +0000
parents:  4b359039f09f
children:
--- a/model_validation.xml  Thu Aug 11 08:49:05 2022 +0000
+++ b/model_validation.xml  Wed Aug 09 13:44:18 2023 +0000
@@ -1,4 +1,4 @@
-<tool id="sklearn_model_validation" name="Model Validation" version="@VERSION@" profile="20.05">
+<tool id="sklearn_model_validation" name="Model Validation" version="@VERSION@" profile="@PROFILE@">
     <description>includes cross_validate, cross_val_predict, learning_curve, and more</description>
     <macros>
         <import>main_macros.xml</import>
@@ -22,7 +22,6 @@
 import numpy as np
 import os
 import pandas as pd
-import pickle
 import pprint
 import skrebate
 import sys
@@ -35,19 +34,18 @@
                      model_selection, naive_bayes, neighbors, pipeline, preprocessing,
                      svm, linear_model, tree, discriminant_analysis)
 from sklearn.model_selection import _validation
+from sklearn.preprocessing import LabelEncoder
 
-from galaxy_ml.utils import (SafeEval, get_cv, get_scoring, load_model,
-                             read_columns, get_module)
-from galaxy_ml.model_validations import _fit_and_score
+from distutils.version import LooseVersion as Version
+from galaxy_ml import __version__ as galaxy_ml_version
+from galaxy_ml.model_persist import load_model_from_h5
+from galaxy_ml.utils import (SafeEval, get_cv, get_scoring,
+                             read_columns, get_module,
+                             clean_params, get_main_estimator)
 
-setattr(_validation, '_fit_and_score', _fit_and_score)
-
 N_JOBS = int(os.environ.get('GALAXY_SLOTS', 1))
 CACHE_DIR = os.path.join(os.getcwd(), 'cached')
-del os
-ALLOWED_CALLBACKS = ('EarlyStopping', 'TerminateOnNaN', 'ReduceLROnPlateau',
-                     'CSVLogger', 'None')
 
 warnings.filterwarnings('ignore')
@@ -58,8 +56,15 @@
     params = json.load(param_handler)
 
 ## load estimator
-with open('$infile_estimator', 'rb') as estimator_handler:
-    estimator = load_model(estimator_handler)
+estimator = load_model_from_h5('$infile_estimator')
+estimator = clean_params(estimator)
+
+if estimator.__class__.__name__ == 'KerasGBatchClassifier':
+    _fit_and_score = try_get_attr('galaxy_ml.model_validations',
+                                  '_fit_and_score')
+
+    setattr(_search, '_fit_and_score', _fit_and_score)
+    setattr(_validation, '_fit_and_score', _fit_and_score)
 
 estimator_params = estimator.get_params()
 
@@ -71,29 +76,9 @@
 else:
     # For iraps buried in pipeline
     for p, v in estimator_params.items():
-        if p.endswith('memory'):
-            # for case of `__irapsclassifier__memory`
-            if len(p) > 8 and p[:-8].endswith('irapsclassifier'):
-                # cache iraps_core fits could increase search
-                # speed significantly
-                new_params = {p: memory}
-                estimator.set_params(**new_params)
-            # security reason, we don't want memory being
-            # modified unexpectedly
-            elif v:
-                new_params = {p, None}
-                estimator.set_params(**new_params)
-        # For now, 1 CPU is suggested for iprasclassifier
-        elif p.endswith('n_jobs'):
-            new_params = {p: 1}
+        if p.endswith('__irapsclassifier__memory'):
+            new_params = {p: memory}
             estimator.set_params(**new_params)
-        # for security reason, types of callback are limited
-        elif p.endswith('callbacks'):
-            for cb in v:
-                cb_type = cb['callback_selection']['callback_type']
-                if cb_type not in ALLOWED_CALLBACKS:
-                    raise ValueError(
-                        "Prohibited callback type: %s!" % cb_type)
 
 ## store read dataframe object
 loaded_df = {}
@@ -162,18 +147,22 @@
     infile2 = pd.read_csv(infile2, sep='\t', header=header, parse_dates=True)
     loaded_df[df_key] = infile2
 y = read_columns(
-        infile2,
-        c = c,
-        c_option = column_option,
-        sep='\t',
-        header=header,
-        parse_dates=True)
+    infile2,
+    c = c,
+    c_option = column_option,
+    sep='\t',
+    header=header,
+    parse_dates=True)
 if len(y.shape) == 2 and y.shape[1] == 1:
     y = y.ravel()
 #if $input_options.selected_input == 'refseq_and_interval'
 estimator.set_params(
     data_batch_generator__features=y.ravel().tolist())
 y = None
+label_encoder = LabelEncoder()
+if get_main_estimator(estimator).__class__.__name__ == "XGBClassifier":
+    y = label_encoder.fit_transform(y)
+    print(label_encoder.classes_)
 #end if
 
 ## handle options
@@ -202,7 +191,10 @@
 ## del loaded_df
 del loaded_df
 
-splitter, groups = get_cv( options.pop('cv_selector') )
+cv_selector = options.pop('cv_selector')
+if Version(galaxy_ml_version) < Version('0.8.3'):
+    cv_selector.pop('n_stratification_bins', None)
+splitter, groups = get_cv( cv_selector )
 options['cv'] = splitter
 options['groups'] = groups
 options['n_jobs'] = N_JOBS
@@ -238,7 +230,7 @@
 elif selected_function == 'learning_curve':
     try:
         train_sizes = safe_eval(options['train_sizes'])
-    except Exception:
+    except:
        sys.exit("Unsupported train_sizes input! Supports int/float in tuple and array-like structure.")
     if type(train_sizes) is tuple:
         train_sizes = np.linspace(*train_sizes)
@@ -267,7 +259,7 @@
        </configfile>
    </configfiles>
    <inputs>
-        <param name="infile_estimator" type="data" format="zip" label="Choose the dataset containing model/pipeline object" />
+        <param name="infile_estimator" type="data" format="h5mlm" label="Choose the dataset containing model/pipeline object" />
        <conditional name="model_validation_functions">
            <param name="selected_function" type="select" label="Select a model validation function">
                <option value="cross_validate">cross_validate - Evaluate metric(s) by cross-validation and also record fit/score times</option>
@@ -281,8 +273,8 @@
                <expand macro="scoring_selection" />
                <expand macro="model_validation_common_options" />
                <param argument="return_train_score" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="false" help="Whether to include train scores." />
-                <!--param argument="return_estimator" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="false" help="Whether to return the estimators fitted on each split."/> -->
-                <!--param argument="error_score" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Raise fit error:" help="If false, the metric score is assigned to NaN if an error occurs in estimator fitting and FitFailedWarning is raised."/> -->
+                <!--param argument="return_estimator" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="false" help="Whether to return the estimators fitted on each split." /> -->
+                <!--param argument="error_score" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Raise fit error:" help="If false, the metric score is assigned to NaN if an error occurs in estimator fitting and FitFailedWarning is raised." /> -->
                <!--fit_params-->
                <expand macro="pre_dispatch" />
            </section>
@@ -302,7 +294,8 @@
                <section name="options" title="Other Options" expanded="false">
                    <expand macro="scoring_selection" />
                    <expand macro="model_validation_common_options" />
-                    <param argument="train_sizes" type="text" value="(0.1, 1.0, 5)" label="train_sizes" help="Relative or absolute numbers of training examples that will be used to generate the learning curve. Supports 1) tuple, to be evaled by np.linspace, e.g. (0.1, 1.0, 5); 2) array-like, e.g. [0.1 , 0.325, 0.55 , 0.775, 1.]">
+                    <param argument="train_sizes" type="text" value="(0.1, 1.0, 5)" label="train_sizes"
+                           help="Relative or absolute numbers of training examples that will be used to generate the learning curve. Supports 1) tuple, to be evaled by np.linspace, e.g. (0.1, 1.0, 5); 2) array-like, e.g. [0.1 , 0.325, 0.55 , 0.775, 1.]">
                        <sanitizer>
                            <valid initial="default">
                                <add value="[" />
@@ -343,9 +336,9 @@
            <output name="outfile">
                <assert_contents>
                    <has_n_columns n="6" />
-                    <has_text text="0.9999961390418067" />
-                    <has_text text="0.9944541531269271" />
-                    <has_text text="0.9999193322454393" />
+                    <has_text text="0.9998136508657879" />
+                    <has_text text="0.9999980090366614" />
+                    <has_text text="0.9999977541353663" />
                </assert_contents>
            </output>
        </test>
@@ -356,7 +349,16 @@
            <param name="col1" value="1,2,3,4,5" />
            <param name="infile2" value="regression_train.tabular" ftype="tabular" />
            <param name="col2" value="6" />
-            <output name="outfile" file="mv_result02.tabular" lines_diff="14" />
+            <output name="outfile">
+                <assert_contents>
+                    <has_n_columns n="1" />
+                    <has_text text="1.5781414" />
+                    <has_text text="-1.19994559787" />
+                    <has_text text="-0.7187446" />
+                    <has_text text="0.324693926" />
+                    <has_text text="1.25823227" />
+                </assert_contents>
+            </output>
        </test>
        <test>
            <param name="infile_estimator" value="pipeline05" />
@@ -379,7 +381,7 @@
            <output name="outfile">
                <assert_contents>
                    <has_n_columns n="3" />
-                    <has_text text="0.25697059258228816" />
+                    <has_text text="-2.7453395018288753" />
                </assert_contents>
            </output>
        </test>
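
Note on the LabelEncoder change above: recent xgboost releases expect class labels already encoded as consecutive integers starting at 0, which is presumably why the wrapper now passes the target column through sklearn.preprocessing.LabelEncoder when the main estimator is an XGBClassifier. The following standalone sketch is illustrative only (it is not part of model_validation.xml, and the labels are made up); fit_transform, classes_ and inverse_transform are standard scikit-learn API.

# Illustrative only: what the LabelEncoder step above does to a target column.
import numpy as np
from sklearn.preprocessing import LabelEncoder

y = np.array(['cat', 'dog', 'cat', 'bird'])   # made-up labels for illustration

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)    # integers 0..n_classes-1

print(label_encoder.classes_)                       # ['bird' 'cat' 'dog']
print(y_encoded)                                    # [1 2 1 0]
print(label_encoder.inverse_transform(y_encoded))   # ['cat' 'dog' 'cat' 'bird']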
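Note on the cv_selector change above: the 'n_stratification_bins' entry is only understood by newer galaxy_ml releases, so the wrapper drops it before calling get_cv when the installed galaxy_ml is older than 0.8.3. Below is a minimal sketch of that guard with a made-up cv_selector dict (the real keys come from the Galaxy tool form, so treat them as assumptions); note that distutils is removed in Python 3.12, where packaging.version offers an equivalent comparison.

# Sketch of the version-gated cleanup of cv_selector; the dict contents here
# are assumptions for illustration, not the exact keys produced by the tool form.
from distutils.version import LooseVersion as Version

galaxy_ml_version = '0.8.2'          # pretend an older galaxy_ml is installed
cv_selector = {
    'selected_cv': 'StratifiedKFold',
    'n_splits': 5,
    'n_stratification_bins': 10,
}

if Version(galaxy_ml_version) < Version('0.8.3'):
    # pop with a default so nothing breaks if the key is already absent
    cv_selector.pop('n_stratification_bins', None)

print(cv_selector)                   # {'selected_cv': 'StratifiedKFold', 'n_splits': 5}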
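Note on the learning_curve branch above: the train_sizes text parameter is evaluated and, when it is a tuple such as the default (0.1, 1.0, 5), expanded with np.linspace into evenly spaced training-set fractions. A small sketch of that expansion, using ast.literal_eval in place of galaxy_ml's SafeEval (an assumption made purely to keep the example self-contained):

# Sketch of the train_sizes handling; ast.literal_eval stands in for SafeEval.
import ast
import numpy as np

raw = "(0.1, 1.0, 5)"                # default value of the train_sizes parameter
train_sizes = ast.literal_eval(raw)

if isinstance(train_sizes, tuple):
    # (start, stop, num) -> num evenly spaced fractions of the training set
    train_sizes = np.linspace(*train_sizes)

print(train_sizes)                   # [0.1   0.325 0.55  0.775 1.   ]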