diff model_validation.xml @ 34:1fe00785190d draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 9981e25b00de29ed881b2229a173a8c812ded9bb
author bgruening
date Wed, 09 Aug 2023 13:44:18 +0000
parents 4b359039f09f
--- a/model_validation.xml	Thu Aug 11 08:49:05 2022 +0000
+++ b/model_validation.xml	Wed Aug 09 13:44:18 2023 +0000
@@ -1,4 +1,4 @@
-<tool id="sklearn_model_validation" name="Model Validation" version="@VERSION@" profile="20.05">
+<tool id="sklearn_model_validation" name="Model Validation" version="@VERSION@" profile="@PROFILE@">
     <description>includes cross_validate, cross_val_predict, learning_curve, and more</description>
     <macros>
         <import>main_macros.xml</import>
@@ -22,7 +22,6 @@
 import numpy as np
 import os
 import pandas as pd
-import pickle
 import pprint
 import skrebate
 import sys
@@ -35,19 +34,18 @@
     model_selection, naive_bayes, neighbors, pipeline, preprocessing,
     svm, linear_model, tree, discriminant_analysis)
 from sklearn.model_selection import _validation
+from sklearn.model_selection import _search
+from sklearn.preprocessing import LabelEncoder
 
-from galaxy_ml.utils import (SafeEval, get_cv, get_scoring, load_model,
-                   read_columns, get_module)
-from galaxy_ml.model_validations import _fit_and_score
+from distutils.version import LooseVersion as Version
+from galaxy_ml import __version__ as galaxy_ml_version
+from galaxy_ml.model_persist import load_model_from_h5
+from galaxy_ml.utils import (SafeEval, get_cv, get_scoring,
+                             read_columns, get_module, clean_params,
+                             get_main_estimator, try_get_attr)
 
 
-setattr(_validation, '_fit_and_score', _fit_and_score)
-
 N_JOBS = int(os.environ.get('GALAXY_SLOTS', 1))
 CACHE_DIR = os.path.join(os.getcwd(), 'cached')
-del os
-ALLOWED_CALLBACKS = ('EarlyStopping', 'TerminateOnNaN', 'ReduceLROnPlateau',
-                     'CSVLogger', 'None')
 
 warnings.filterwarnings('ignore')
 
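Two module-level settings survive this cleanup: N_JOBS, taken from Galaxy's GALAXY_SLOTS allocation, and a job-local cache directory. The diff never shows how the `memory` object used further down is constructed; a joblib Memory over CACHE_DIR is the usual pattern, sketched here as an assumption:

    import os
    from joblib import Memory

    N_JOBS = int(os.environ.get('GALAXY_SLOTS', 1))  # CPUs granted to the Galaxy job
    CACHE_DIR = os.path.join(os.getcwd(), 'cached')  # scratch cache inside the job dir
    memory = Memory(location=CACHE_DIR, verbose=0)   # assumed source of the `memory`
                                                     # object passed to set_params below
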
@@ -58,8 +56,15 @@
     params = json.load(param_handler)
 
 ## load estimator
-with open('$infile_estimator', 'rb') as estimator_handler:
-    estimator = load_model(estimator_handler)
+estimator = load_model_from_h5('$infile_estimator')
+estimator = clean_params(estimator)
+
+if estimator.__class__.__name__ == 'KerasGBatchClassifier':
+    _fit_and_score = try_get_attr('galaxy_ml.model_validations',
+                                  '_fit_and_score')
+
+    setattr(_search, '_fit_and_score', _fit_and_score)
+    setattr(_validation, '_fit_and_score', _fit_and_score)
 
 estimator_params = estimator.get_params()
 
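The estimator now arrives as an HDF5 artifact rather than a pickle, and clean_params() sanitizes its parameters before use. A hedged sketch of the round trip with galaxy_ml's model_persist module (dump_model_to_h5 as the writer counterpart is an assumption; only load_model_from_h5 appears above):

    from sklearn.linear_model import LinearRegression
    from galaxy_ml.model_persist import dump_model_to_h5, load_model_from_h5

    est = LinearRegression().fit([[0.], [1.], [2.]], [0., 1., 2.])
    dump_model_to_h5(est, 'model.h5mlm')     # write without pickling arbitrary objects
    est = load_model_from_h5('model.h5mlm')  # rebuild the estimator from HDF5
    print(est.predict([[3.]]))               # [3.]
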
@@ -71,29 +76,9 @@
 else:
     # For iraps buried in pipeline
     for p, v in estimator_params.items():
-        if p.endswith('memory'):
-            # for case of `__irapsclassifier__memory`
-            if len(p) > 8 and p[:-8].endswith('irapsclassifier'):
-                # cache iraps_core fits could increase search
-                # speed significantly
-                new_params = {p: memory}
-                estimator.set_params(**new_params)
-            # security reason, we don't want memory being
-            # modified unexpectedly
-            elif v:
-                new_params = {p, None}
-                estimator.set_params(**new_params)
-        # For now, 1 CPU is suggested for iprasclassifier
-        elif p.endswith('n_jobs'):
-            new_params = {p: 1}
+        if p.endswith('__irapsclassifier__memory'):
+            new_params = {p: memory}
             estimator.set_params(**new_params)
-        # for security reason, types of callback are limited
-        elif p.endswith('callbacks'):
-            for cb in v:
-                cb_type = cb['callback_selection']['callback_type']
-                if cb_type not in ALLOWED_CALLBACKS:
-                    raise ValueError(
-                        "Prohibited callback type: %s!" % cb_type)
 
 ## store read dataframe object
 loaded_df = {}
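
The pruned loop now touches only parameters ending in `__irapsclassifier__memory`, relying on scikit-learn's double-underscore convention for addressing parameters of steps nested inside a pipeline. A self-contained illustration of that convention, with a generic classifier standing in for the IRAPS estimator:

    from joblib import Memory
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LogisticRegression

    memory = Memory(location='./cached', verbose=0)
    pipe = Pipeline([('scale', StandardScaler()),
                     ('clf', LogisticRegression())],
                    memory=memory)   # cache fitted transformers between CV folds

    # '<step>__<param>' reaches into a nested step -- the same shape matched
    # by p.endswith('__irapsclassifier__memory') above
    pipe.set_params(clf__C=0.5)
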
@@ -162,18 +147,22 @@
     infile2 = pd.read_csv(infile2, sep='\t', header=header, parse_dates=True)
     loaded_df[df_key] = infile2
 y = read_columns(
-        infile2,
-        c = c,
-        c_option = column_option,
-        sep='\t',
-        header=header,
-        parse_dates=True)
+    infile2,
+    c=c,
+    c_option=column_option,
+    sep='\t',
+    header=header,
+    parse_dates=True)
 if len(y.shape) == 2 and y.shape[1] == 1:
     y = y.ravel()
 #if $input_options.selected_input == 'refseq_and_interval'
 estimator.set_params(
     data_batch_generator__features=y.ravel().tolist())
 y = None
#end if
+
+label_encoder = LabelEncoder()
+if get_main_estimator(estimator).__class__.__name__ == "XGBClassifier":
+    y = label_encoder.fit_transform(y)
+    print(label_encoder.classes_)
 
 ## handle options
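
The LabelEncoder pass exists because recent XGBoost releases require integer-encoded class labels rather than arbitrary strings. A minimal sketch of the encode/decode cycle (standard scikit-learn, sample labels invented):

    import numpy as np
    from sklearn.preprocessing import LabelEncoder

    y = np.array(['cat', 'dog', 'dog', 'bird'])
    label_encoder = LabelEncoder()
    y_enc = label_encoder.fit_transform(y)           # [1 2 2 0], classes in sorted order
    print(label_encoder.classes_)                    # ['bird' 'cat' 'dog']
    y_back = label_encoder.inverse_transform(y_enc)  # map predictions back to labels
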
@@ -202,7 +191,10 @@
 ## del loaded_df
 del loaded_df
 
-splitter, groups = get_cv( options.pop('cv_selector') )
+cv_selector = options.pop('cv_selector')
+if Version(galaxy_ml_version) < Version('0.8.3'):
+    cv_selector.pop('n_stratification_bins', None)
+splitter, groups = get_cv(cv_selector)
 options['cv'] = splitter
 options['groups'] = groups
 options['n_jobs'] = N_JOBS
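
The n_stratification_bins key is only understood by Galaxy-ML >= 0.8.3, so the code pops it from the selector on older installations before get_cv parses it. The same defensive pattern, sketched with the packaging library (distutils' LooseVersion, used above, is deprecated since Python 3.10; prune_cv_selector is a hypothetical helper, not part of the tool):

    from packaging.version import Version

    def prune_cv_selector(cv_selector, lib_version):
        # drop keys the installed library version cannot parse
        if Version(lib_version) < Version('0.8.3'):
            cv_selector.pop('n_stratification_bins', None)
        return cv_selector

    print(prune_cv_selector({'selected_cv': 'KFold',
                             'n_stratification_bins': 5}, '0.8.2'))
    # {'selected_cv': 'KFold'}
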
@@ -238,7 +230,7 @@
 elif selected_function == 'learning_curve':
     try:
         train_sizes = safe_eval(options['train_sizes'])
-    except Exception:
+    except:
         sys.exit("Unsupported train_sizes input! Supports int/float in tuple and array-like structure.")
     if type(train_sizes) is tuple:
         train_sizes = np.linspace(*train_sizes)
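
train_sizes accepts either an explicit array or a (start, stop, num) tuple that is splatted into np.linspace, as done above. A short sketch of the tuple form feeding scikit-learn's learning_curve (iris data stands in for the user's dataset):

    import numpy as np
    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import learning_curve

    X, y = load_iris(return_X_y=True)
    train_sizes = np.linspace(*(0.1, 1.0, 5))   # [0.1, 0.325, 0.55, 0.775, 1.0]
    sizes, train_scores, test_scores = learning_curve(
        LogisticRegression(max_iter=500), X, y,
        train_sizes=train_sizes, cv=5)
    print(sizes)                                # absolute sample counts per step
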
@@ -267,7 +259,7 @@
         </configfile>
     </configfiles>
     <inputs>
-        <param name="infile_estimator" type="data" format="zip" label="Choose the dataset containing model/pipeline object" />
+        <param name="infile_estimator" type="data" format="h5mlm" label="Choose the dataset containing model/pipeline object" />
         <conditional name="model_validation_functions">
             <param name="selected_function" type="select" label="Select a model validation function">
                 <option value="cross_validate">cross_validate - Evaluate metric(s) by cross-validation and also record fit/score times</option>
@@ -281,8 +273,8 @@
                     <expand macro="scoring_selection" />
                     <expand macro="model_validation_common_options" />
                     <param argument="return_train_score" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="false" help="Whether to include train scores." />
-                    <!--param argument="return_estimator" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="false" help="Whether to return the estimators fitted on each split."/> -->
-                    <!--param argument="error_score" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Raise fit error:" help="If false, the metric score is assigned to NaN if an error occurs in estimator fitting and FitFailedWarning is raised."/> -->
+                    <!--param argument="return_estimator" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="false" help="Whether to return the estimators fitted on each split." /> -->
+                    <!--param argument="error_score" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Raise fit error:" help="If false, the metric score is assigned to NaN if an error occurs in estimator fitting and FitFailedWarning is raised." /> -->
                     <!--fit_params-->
                     <expand macro="pre_dispatch" />
                 </section>
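
The return_train_score toggle above maps directly onto scikit-learn's cross_validate keyword. A minimal sketch of the call the tool ultimately issues (estimator and data stand in for the user's pipeline):

    from sklearn.datasets import load_iris
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import cross_validate

    X, y = load_iris(return_X_y=True)
    res = cross_validate(LogisticRegression(max_iter=500), X, y,
                         cv=5, return_train_score=True)
    print(sorted(res))  # ['fit_time', 'score_time', 'test_score', 'train_score']
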
@@ -302,7 +294,8 @@
                 <section name="options" title="Other Options" expanded="false">
                     <expand macro="scoring_selection" />
                     <expand macro="model_validation_common_options" />
-                    <param argument="train_sizes" type="text" value="(0.1, 1.0, 5)" label="train_sizes" help="Relative or absolute numbers of training examples that will be used to generate the learning curve. Supports 1) tuple, to be evaled by np.linspace, e.g. (0.1, 1.0, 5); 2) array-like, e.g. [0.1  , 0.325, 0.55 , 0.775, 1.]">
+                    <param argument="train_sizes" type="text" value="(0.1, 1.0, 5)" label="train_sizes"
+                            help="Relative or absolute numbers of training examples used to generate the learning curve. Supports 1) a tuple, evaluated by np.linspace, e.g. (0.1, 1.0, 5); 2) an array-like, e.g. [0.1, 0.325, 0.55, 0.775, 1.0]">
                         <sanitizer>
                             <valid initial="default">
                                 <add value="[" />
@@ -343,9 +336,9 @@
             <output name="outfile">
                 <assert_contents>
                     <has_n_columns n="6" />
-                    <has_text text="0.9999961390418067" />
-                    <has_text text="0.9944541531269271" />
-                    <has_text text="0.9999193322454393" />
+                    <has_text text="0.9998136508657879" />
+                    <has_text text="0.9999980090366614" />
+                    <has_text text="0.9999977541353663" />
                 </assert_contents>
             </output>
         </test>
@@ -356,7 +349,16 @@
             <param name="col1" value="1,2,3,4,5" />
             <param name="infile2" value="regression_train.tabular" ftype="tabular" />
             <param name="col2" value="6" />
-            <output name="outfile" file="mv_result02.tabular" lines_diff="14" />
+            <output name="outfile">
+                <assert_contents>
+                    <has_n_columns n="1" />
+                    <has_text text="1.5781414" />
+                    <has_text text="-1.19994559787" />
+                    <has_text text="-0.7187446" />
+                    <has_text text="0.324693926" />
+                    <has_text text="1.25823227" />
+                </assert_contents>
+            </output>
         </test>
         <test>
             <param name="infile_estimator" value="pipeline05" />
@@ -379,7 +381,7 @@
             <output name="outfile">
                 <assert_contents>
                     <has_n_columns n="3" />
-                    <has_text text="0.25697059258228816" />
+                    <has_text text="-2.7453395018288753" />
                 </assert_contents>
             </output>
         </test>