diff model_validation.xml @ 19:efbec977a47d draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 60f0fbc0eafd7c11bc60fb6c77f2937782efd8a9-dirty
author bgruening
date Fri, 09 Aug 2019 07:26:09 -0400
parents cf9aa11b91c8
children 5895fe0b8bde
line wrap: on
line diff
--- a/model_validation.xml	Tue Jul 09 19:39:58 2019 -0400
+++ b/model_validation.xml	Fri Aug 09 07:26:09 2019 -0400
@@ -1,5 +1,5 @@
 <tool id="sklearn_model_validation" name="Model Validation" version="@VERSION@">
-    <description>evaluates estimator performance by cross-validation</description>
+    <description>evaluates estimator performances without changing parameters</description>
     <macros>
         <import>main_macros.xml</import>
     </macros>
@@ -16,6 +16,7 @@
         <configfile name="sklearn_model_validation_script">
             <![CDATA[
 import imblearn
+import joblib
 import json
 import numpy as np
 import pandas as pd
@@ -31,11 +32,19 @@
     feature_selection, gaussian_process, kernel_approximation, metrics,
     model_selection, naive_bayes, neighbors, pipeline, preprocessing,
     svm, linear_model, tree, discriminant_analysis)
+from sklearn.model_selection import _validation
 
-sys.path.insert(0, '$__tool_directory__')
-from utils import SafeEval, get_cv, get_scoring, load_model, read_columns
+from galaxy_ml.utils import (SafeEval, get_cv, get_scoring, load_model,
+                   read_columns, get_module)
+from galaxy_ml.model_validations import _fit_and_score
+
+
+setattr(_validation, '_fit_and_score', _fit_and_score)
 
 N_JOBS = int(__import__('os').environ.get('GALAXY_SLOTS', 1))
+CACHE_DIR = './cached'
+ALLOWED_CALLBACKS = ('EarlyStopping', 'TerminateOnNaN', 'ReduceLROnPlateau',
+                     'CSVLogger', 'None')
 
 warnings.filterwarnings('ignore')
 
@@ -45,29 +54,96 @@
 with open(input_json_path, 'r') as param_handler:
     params = json.load(param_handler)
 
-#if $model_validation_functions.options.cv_selector.selected_cv\
-        in ['GroupKFold', 'GroupShuffleSplit', 'LeaveOneGroupOut', 'LeavePGroupsOut']:
-params['model_validation_functions']['options']['cv_selector']['groups_selector']['infile_g'] =\
-        '$model_validation_functions.options.cv_selector.groups_selector.infile_g'
-#end if
+## load estimator
+with open('$infile_estimator', 'rb') as estimator_handler:
+    estimator = load_model(estimator_handler)
+
+estimator_params = estimator.get_params()
+
+## check estimator hyperparameters
+memory = joblib.Memory(location=CACHE_DIR, verbose=0)
+# caching iraps_core fits can speed up the search significantly
+if estimator.__class__.__name__ == 'IRAPSClassifier':
+    estimator.set_params(memory=memory)
+else:
+    # For iraps buried in pipeline
+    for p, v in estimator_params.items():
+        if p.endswith('memory'):
+            # for case of `__irapsclassifier__memory`
+            if len(p) > 8 and p[:-8].endswith('irapsclassifier'):
+                # caching iraps_core fits can speed up the search
+                # significantly
+                new_params = {p: memory}
+                estimator.set_params(**new_params)
+            # for security reasons, any other pre-set memory is
+            # reset so it cannot be modified unexpectedly
+            elif v:
+                new_params = {p: None}
+                estimator.set_params(**new_params)
+        # For now, 1 CPU is suggested for irapsclassifier
+        elif p.endswith('n_jobs'):
+            new_params = {p: 1}
+            estimator.set_params(**new_params)
+        # for security reasons, types of callbacks are limited
+        elif p.endswith('callbacks'):
+            for cb in v:
+                cb_type = cb['callback_selection']['callback_type']
+                if cb_type not in ALLOWED_CALLBACKS:
+                    raise ValueError(
+                        "Prohibited callback type: %s!" % cb_type)
+
+## store read dataframe object
+loaded_df = {}
 
-input_type = params['input_options']['selected_input']
-if input_type == 'tabular':
-    header = 'infer' if params['input_options']['header1'] else None
-    column_option = params['input_options']['column_selector_options_1']['selected_column_selector_option']
-    if column_option in ['by_index_number', 'all_but_by_index_number', 'by_header_name', 'all_but_by_header_name']:
-        c = params['input_options']['column_selector_options_1']['col1']
-    else:
-        c = None
-    X = read_columns(
-            '$input_options.infile1',
-            c = c,
-            c_option = column_option,
-            sep='\t',
-            header=header,
-            parse_dates=True).astype(float)
+#if $input_options.selected_input == 'tabular'
+header = 'infer' if params['input_options']['header1'] else None
+column_option = params['input_options']['column_selector_options_1']['selected_column_selector_option']
+if column_option in ['by_index_number', 'all_but_by_index_number', 'by_header_name', 'all_but_by_header_name']:
+    c = params['input_options']['column_selector_options_1']['col1']
 else:
-    X = mmread('$input_options.infile1')
+    c = None
+infile1 = '$input_options.infile1'
+df_key = infile1 + repr(header)
+df = pd.read_csv(infile1, sep='\t', header=header, parse_dates=True)
+loaded_df[df_key] = df
+X = read_columns(df, c=c, c_option=column_option).astype(float)
+
+#elif $input_options.selected_input == 'sparse':
+X = mmread('$input_options.infile1')
+
+#elif $input_options.selected_input == 'seq_fasta'
+fasta_path = '$input_options.fasta_path'
+pyfaidx = get_module('pyfaidx')
+sequences = pyfaidx.Fasta(fasta_path)
+n_seqs = len(sequences.keys())
+X = np.arange(n_seqs)[:, np.newaxis]
+for param in estimator_params.keys():
+    if param.endswith('fasta_path'):
+        estimator.set_params(
+            **{param: fasta_path})
+        break
+else:
+    raise ValueError(
+        "The selected estimator doesn't support "
+        "fasta file input! Please consider using "
+        "KerasGBatchClassifier with "
+        "FastaDNABatchGenerator/FastaProteinBatchGenerator "
+        "or having GenomeOneHotEncoder/ProteinOneHotEncoder "
+        "in pipeline!")
+#elif $input_options.selected_input == 'refseq_and_interval'
+ref_seq = '$input_options.ref_genome_file'
+intervals = '$input_options.interval_file'
+targets = __import__('os').path.join(__import__('os').getcwd(),
+                                     '${target_file.element_identifier}.gz')
+path_params = {
+    'data_batch_generator__ref_genome_path': ref_seq,
+    'data_batch_generator__intervals_path': intervals,
+    'data_batch_generator__target_path': targets
+}
+estimator.set_params(**path_params)
+n_intervals = sum(1 for line in open(intervals))
+X = np.arange(n_intervals)[:, np.newaxis]
+#end if
 
 header = 'infer' if params['input_options']['header2'] else None
 column_option = params['input_options']['column_selector_options_2']['selected_column_selector_option2']
@@ -75,17 +151,54 @@
     c = params['input_options']['column_selector_options_2']['col2']
 else:
     c = None
+infile2 = '$input_options.infile2'
+df_key = infile2 + repr(header)
+if df_key in loaded_df:
+    infile2 = loaded_df[df_key]
+else:
+    infile2 = pd.read_csv(infile2, sep='\t', header=header, parse_dates=True)
+    loaded_df[df_key] = infile2
 y = read_columns(
-        '$input_options.infile2',
+        infile2,
         c = c,
         c_option = column_option,
         sep='\t',
         header=header,
         parse_dates=True)
-y = y.ravel()
+if len(y.shape) == 2 and y.shape[1] == 1:
+    y = y.ravel()
+#if $input_options.selected_input == 'refseq_and_interval'
+estimator.set_params(
+    data_batch_generator__features=y.ravel().tolist())
+y = None
+#end if
 
 ## handle options
 options = params['model_validation_functions']['options']
+
+#if $model_validation_functions.options.cv_selector.selected_cv\
+        in ['GroupKFold', 'GroupShuffleSplit', 'LeaveOneGroupOut', 'LeavePGroupsOut']:
+infile_g = '$model_validation_functions.options.cv_selector.groups_selector.infile_g'
+header = 'infer' if options['cv_selector']['groups_selector']['header_g'] else None
+column_option = (options['cv_selector']['groups_selector']['column_selector_options_g']
+                        ['selected_column_selector_option_g'])
+if column_option in ['by_index_number', 'all_but_by_index_number',
+                     'by_header_name', 'all_but_by_header_name']:
+    c = (options['cv_selector']['groups_selector']['column_selector_options_g']['col_g'])
+else:
+    c = None
+df_key = infile_g + repr(header)
+if df_key in loaded_df:
+    infile_g = loaded_df[df_key]
+groups = read_columns(infile_g, c=c, c_option=column_option,
+                      sep='\t', header=header, parse_dates=True)
+groups = groups.ravel()
+options['cv_selector']['groups_selector'] = groups
+#end if
+
+## del loaded_df
+del loaded_df
+
 splitter, groups = get_cv( options.pop('cv_selector') )
 options['cv'] = splitter
 options['groups'] = groups
@@ -96,27 +209,25 @@
 if 'pre_dispatch' in options and options['pre_dispatch'] == '':
     options['pre_dispatch'] = None
 
-## load pipeline
-with open('$infile_pipeline', 'rb') as pipeline_handler:
-    pipeline = load_model(pipeline_handler)
-
-## Set up validator, run pipeline through validator and return results.
+## Set up validator, run estimator through validator and return results.
 
 validator = params['model_validation_functions']['selected_function']
-validator = getattr(model_selection, validator)
+validator = getattr(_validation, validator)
 
 selected_function = params['model_validation_functions']['selected_function']
 
 if selected_function == 'cross_validate':
-    res = validator(pipeline, X, y, **options)
+    res = validator(estimator, X, y, **options)
+    stat = {}
+    for k, v in res.items():
+        if k.startswith('test'):
+            stat['mean_' + k] = np.mean(v)
+            stat['std_' + k] = np.std(v)
+    res.update(stat)
     rval = pd.DataFrame(res)
-    col_rename = {}
-    for col in rval.columns:
-        if col.endswith('_primary'):
-            col_rename[col] = col[:-7] + primary_scoring
-    rval.rename(inplace=True, columns=col_rename)
+    rval = rval[sorted(rval.columns)]
 elif selected_function == 'cross_val_predict':
-    predicted = validator(pipeline, X, y, **options)
+    predicted = validator(estimator, X, y, **options)
     if len(predicted.shape) == 1:
         rval = pd.DataFrame(predicted, columns=['Predicted'])
     else:
@@ -129,7 +240,7 @@
     if type(train_sizes) is tuple:
         train_sizes = np.linspace(*train_sizes)
     options['train_sizes'] = train_sizes
-    train_sizes_abs, train_scores, test_scores = validator(pipeline, X, y, **options)
+    train_sizes_abs, train_scores, test_scores = validator(estimator, X, y, **options)
     rval = pd.DataFrame(dict(
                 train_sizes_abs = train_sizes_abs,
                 mean_train_scores = np.mean(train_scores, axis=1),
@@ -139,7 +250,7 @@
     rval = rval[['train_sizes_abs', 'mean_train_scores', 'std_train_scores',
                 'mean_test_scores', 'std_test_scores']]
 elif selected_function == 'permutation_test_score':
-    score, permutation_scores, pvalue = validator(pipeline, X, y, **options)
+    score, permutation_scores, pvalue = validator(estimator, X, y, **options)
     permutation_scores_df = pd.DataFrame(dict(
             permutation_scores = permutation_scores))
     score_df = pd.DataFrame(dict(
@@ -153,7 +264,7 @@
         </configfile>
     </configfiles>
     <inputs>
-        <param name="infile_pipeline" type="data" format="zip" label="Choose the dataset containing model/pipeline object"/>
+        <param name="infile_estimator" type="data" format="zip" label="Choose the dataset containing model/pipeline object"/>
         <conditional name="model_validation_functions">
             <param name="selected_function" type="select" label="Select a model validation function">
                 <option value="cross_validate">cross_validate - Evaluate metric(s) by cross-validation and also record fit/score times</option>
@@ -220,7 +331,7 @@
     </outputs>
     <tests>
         <test>
-            <param name="infile_pipeline" value="pipeline02"/>
+            <param name="infile_estimator" value="pipeline02"/>
             <param name="selected_function" value="cross_validate"/>
             <param name="infile1" value="regression_train.tabular" ftype="tabular"/>
             <param name="col1" value="1,2,3,4,5"/>
@@ -228,7 +339,7 @@
             <param name="col2" value="6"/>
             <output name="outfile">
                 <assert_contents>
-                    <has_n_columns n="4"/>
+                    <has_n_columns n="6"/>
                     <has_text text="0.9999961390418067"/>
                     <has_text text="0.9944541531269271"/>
                     <has_text text="0.9999193322454393"/>
@@ -236,7 +347,7 @@
             </output>
         </test>
         <test>
-            <param name="infile_pipeline" value="pipeline02"/>
+            <param name="infile_estimator" value="pipeline02"/>
             <param name="selected_function" value="cross_val_predict"/>
             <param name="infile1" value="regression_train.tabular" ftype="tabular"/>
             <param name="col1" value="1,2,3,4,5"/>
@@ -245,7 +356,7 @@
             <output name="outfile" file="mv_result02.tabular" lines_diff="4"/>
         </test>
         <test>
-            <param name="infile_pipeline" value="pipeline05"/>
+            <param name="infile_estimator" value="pipeline05"/>
             <param name="selected_function" value="learning_curve"/>
             <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
             <param name="header1" value="true" />
@@ -256,7 +367,7 @@
             <output name="outfile" file="mv_result03.tabular"/>
         </test>
         <test>
-            <param name="infile_pipeline" value="pipeline05"/>
+            <param name="infile_estimator" value="pipeline05"/>
             <param name="selected_function" value="permutation_test_score"/>
             <param name="infile1" value="regression_train.tabular" ftype="tabular"/>
             <param name="col1" value="1,2,3,4,5"/>
@@ -270,7 +381,7 @@
             </output>
         </test>
         <test>
-            <param name="infile_pipeline" value="pipeline05"/>
+            <param name="infile_estimator" value="pipeline05"/>
             <param name="selected_function" value="cross_val_predict"/>
             <section name="groups_selector">
                 <param name="infile_groups" value="regression_y.tabular" ftype="tabular"/>