Mercurial > repos > bgruening > sklearn_model_validation

<tool id="sklearn_model_validation" name="Model Validation" version="@VERSION@">
    <description>includes cross_validate, cross_val_predict, learning_curve, and more</description>
    <macros>
        <import>main_macros.xml</import>
    </macros>
    <expand macro="python_requirements"/>
    <expand macro="macro_stdio"/>
    <version_command>echo "@VERSION@"</version_command>
    <command>
        <![CDATA[
        export HDF5_USE_FILE_LOCKING='FALSE';
        python "$sklearn_model_validation_script" '$inputs'
        ]]>
    </command>
    <configfiles>
        <inputs name="inputs" />
        <configfile name="sklearn_model_validation_script">
            <![CDATA[
import imblearn
import joblib
import json
import numpy as np
import os
import pandas as pd
import pickle
import pprint
import skrebate
import sys
import warnings
import xgboost
from mlxtend import classifier, regressor
from sklearn import (
    cluster, compose, decomposition, ensemble, feature_extraction,
    feature_selection, gaussian_process, kernel_approximation, metrics,
    model_selection, naive_bayes, neighbors, pipeline, preprocessing,
    svm, linear_model, tree, discriminant_analysis)
from sklearn.model_selection import _validation

from galaxy_ml.utils import (SafeEval, get_cv, get_scoring, load_model,
                   read_columns, get_module)
from galaxy_ml.model_validations import _fit_and_score


setattr(_validation, '_fit_and_score', _fit_and_score)

N_JOBS = int(os.environ.get('GALAXY_SLOTS', 1))
CACHE_DIR = os.path.join(os.getcwd(), 'cached')
del os
ALLOWED_CALLBACKS = ('EarlyStopping', 'TerminateOnNaN', 'ReduceLROnPlateau',
                     'CSVLogger', 'None')

warnings.filterwarnings('ignore')

safe_eval = SafeEval()

input_json_path = sys.argv[1]
with open(input_json_path, 'r') as param_handler:
    params = json.load(param_handler)

## load estimator
with open('$infile_estimator', 'rb') as estimator_handler:
    estimator = load_model(estimator_handler)

estimator_params = estimator.get_params()

## check estimator hyperparameters
memory = joblib.Memory(location=CACHE_DIR, verbose=0)
# cache iraps_core fits could increase search speed significantly
if estimator.__class__.__name__ == 'IRAPSClassifier':
    estimator.set_params(memory=memory)
else:
    # For iraps buried in pipeline
    for p, v in estimator_params.items():
        if p.endswith('memory'):
            # for case of `__irapsclassifier__memory`
            if len(p) > 8 and p[:-8].endswith('irapsclassifier'):
                # cache iraps_core fits could increase search
                # speed significantly
                new_params = {p: memory}
                estimator.set_params(**new_params)
            # security reason, we don't want memory being
            # modified unexpectedly
            elif v:
                new_params = {p, None}
                estimator.set_params(**new_params)
        # For now, 1 CPU is suggested for iprasclassifier
        elif p.endswith('n_jobs'):
            new_params = {p: 1}
            estimator.set_params(**new_params)
        # for security reason, types of callback are limited
        elif p.endswith('callbacks'):
            for cb in v:
                cb_type = cb['callback_selection']['callback_type']
                if cb_type not in ALLOWED_CALLBACKS:
                    raise ValueError(
                        "Prohibited callback type: %s!" % cb_type)

## store read dataframe object
loaded_df = {}

#if $input_options.selected_input == 'tabular'
header = 'infer' if params['input_options']['header1'] else None
column_option = params['input_options']['column_selector_options_1']['selected_column_selector_option']
if column_option in ['by_index_number', 'all_but_by_index_number', 'by_header_name', 'all_but_by_header_name']:
    c = params['input_options']['column_selector_options_1']['col1']
else:
    c = None
infile1 = '$input_options.infile1'
df_key = infile1 + repr(header)
df = pd.read_csv(infile1, sep='\t', header=header, parse_dates=True)
loaded_df[df_key] = df
X = read_columns(df, c=c, c_option=column_option).astype(float)

#elif $input_options.selected_input == 'sparse':
X = mmread('$input_options.infile1')

#elif $input_options.selected_input == 'seq_fasta'
fasta_path = '$input_options.fasta_path'
pyfaidx = get_module('pyfaidx')
sequences = pyfaidx.Fasta(fasta_path)
n_seqs = len(sequences.keys())
X = np.arange(n_seqs)[:, np.newaxis]
for param in estimator_params.keys():
    if param.endswith('fasta_path'):
        estimator.set_params(
            **{param: fasta_path})
        break
else:
    raise ValueError(
        "The selected estimator doesn't support "
        "fasta file input! Please consider using "
        "KerasGBatchClassifier with "
        "FastaDNABatchGenerator/FastaProteinBatchGenerator "
        "or having GenomeOneHotEncoder/ProteinOneHotEncoder "
        "in pipeline!")
#elif $input_options.selected_input == 'refseq_and_interval'
ref_seq = '$input_options.ref_genome_file'
intervals = '$input_options.interval_file'
targets = __import__('os').path.join(__import__('os').getcwd(),
                                     '${target_file.element_identifier}.gz')
path_params = {
    'data_batch_generator__ref_genome_path': ref_seq,
    'data_batch_generator__intervals_path': intervals,
    'data_batch_generator__target_path': targets
}
estimator.set_params(**path_params)
n_intervals = sum(1 for line in open(intervals))
X = np.arange(n_intervals)[:, np.newaxis]
#end if

header = 'infer' if params['input_options']['header2'] else None
column_option = params['input_options']['column_selector_options_2']['selected_column_selector_option2']
if column_option in ['by_index_number', 'all_but_by_index_number', 'by_header_name', 'all_but_by_header_name']:
    c = params['input_options']['column_selector_options_2']['col2']
else:
    c = None
infile2 = '$input_options.infile2'
df_key = infile2 + repr(header)
if df_key in loaded_df:
    infile2 = loaded_df[df_key]
else:
    infile2 = pd.read_csv(infile2, sep='\t', header=header, parse_dates=True)
    loaded_df[df_key] = infile2
y = read_columns(
        infile2,
        c = c,
        c_option = column_option,
        sep='\t',
        header=header,
        parse_dates=True)
if len(y.shape) == 2 and y.shape[1] == 1:
    y = y.ravel()
#if $input_options.selected_input == 'refseq_and_interval'
estimator.set_params(
    data_batch_generator__features=y.ravel().tolist())
y = None
#end if

## handle options
options = params['model_validation_functions']['options']

#if $model_validation_functions.options.cv_selector.selected_cv\
        in ['GroupKFold', 'GroupShuffleSplit', 'LeaveOneGroupOut', 'LeavePGroupsOut']:
infile_g = '$model_validation_functions.options.cv_selector.groups_selector.infile_g'
header = 'infer' if options['cv_selector']['groups_selector']['header_g'] else None
column_option = (options['cv_selector']['groups_selector']['column_selector_options_g']
                        ['selected_column_selector_option_g'])
if column_option in ['by_index_number', 'all_but_by_index_number',
                     'by_header_name', 'all_but_by_header_name']:
    c = (options['cv_selector']['groups_selector']['column_selector_options_g']['col_g'])
else:
    c = None
df_key = infile_g + repr(header)
if df_key in loaded_df:
    infile_g = loaded_df[df_key]
groups = read_columns(infile_g, c=c, c_option=column_option,
                      sep='\t', header=header, parse_dates=True)
groups = groups.ravel()
options['cv_selector']['groups_selector'] = groups
#end if

## del loaded_df
del loaded_df

splitter, groups = get_cv( options.pop('cv_selector') )
options['cv'] = splitter
options['groups'] = groups
options['n_jobs'] = N_JOBS
if 'scoring' in options:
    primary_scoring = options['scoring']['primary_scoring']
    options['scoring'] = get_scoring(options['scoring'])
if 'pre_dispatch' in options and options['pre_dispatch'] == '':
    options['pre_dispatch'] = None

## Set up validator, run estimator through validator and return results.

validator = params['model_validation_functions']['selected_function']
validator = getattr(_validation, validator)

selected_function = params['model_validation_functions']['selected_function']

if selected_function == 'cross_validate':
    res = validator(estimator, X, y, **options)
    stat = {}
    for k, v in res.items():
        if k.startswith('test'):
            stat['mean_' + k] = np.mean(v)
            stat['std_' + k] = np.std(v)
    res.update(stat)
    rval = pd.DataFrame(res)
    rval = rval[sorted(rval.columns)]
elif selected_function == 'cross_val_predict':
    predicted = validator(estimator, X, y, **options)
    if len(predicted.shape) == 1:
        rval = pd.DataFrame(predicted, columns=['Predicted'])
    else:
        rval = pd.DataFrame(predicted)
elif selected_function == 'learning_curve':
    try:
        train_sizes = safe_eval(options['train_sizes'])
    except:
        sys.exit("Unsupported train_sizes input! Supports int/float in tuple and array-like structure.")
    if type(train_sizes) is tuple:
        train_sizes = np.linspace(*train_sizes)
    options['train_sizes'] = train_sizes
    train_sizes_abs, train_scores, test_scores = validator(estimator, X, y, **options)
    rval = pd.DataFrame(dict(
                train_sizes_abs = train_sizes_abs,
                mean_train_scores = np.mean(train_scores, axis=1),
                std_train_scores = np.std(train_scores, axis=1),
                mean_test_scores = np.mean(test_scores, axis=1),
                std_test_scores = np.std(test_scores, axis=1)))
    rval = rval[['train_sizes_abs', 'mean_train_scores', 'std_train_scores',
                'mean_test_scores', 'std_test_scores']]
elif selected_function == 'permutation_test_score':
    score, permutation_scores, pvalue = validator(estimator, X, y, **options)
    permutation_scores_df = pd.DataFrame(dict(
            permutation_scores = permutation_scores))
    score_df = pd.DataFrame(dict(
            score = [score],
            pvalue = [pvalue]))
    rval = pd.concat([score_df[['score', 'pvalue']], permutation_scores_df], axis=1)

rval.to_csv(path_or_buf='$outfile', sep='\t', header=True, index=False)

            ]]>
        </configfile>
    </configfiles>
    <inputs>
        <param name="infile_estimator" type="data" format="zip" label="Choose the dataset containing model/pipeline object"/>
        <conditional name="model_validation_functions">
            <param name="selected_function" type="select" label="Select a model validation function">
                <option value="cross_validate">cross_validate - Evaluate metric(s) by cross-validation and also record fit/score times</option>
                <option value="cross_val_predict">cross_val_predict - Generate cross-validated estimates for each input data point</option>
                <option value="learning_curve">learning_curve - Learning curve</option>
                <option value="permutation_test_score">permutation_test_score - Evaluate the significance of a cross-validated score with permutations</option>
                <option value="validation_curve">validation_curve - Use grid search with one parameter instead</option>
            </param>
            <when value="cross_validate">
                <section name="options" title="Other Options" expanded="false">
                    <expand macro="scoring_selection"/>
                    <expand macro="model_validation_common_options"/>
                    <param argument="return_train_score" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="false" help="Whether to include train scores."/>
                    <!--param argument="return_estimator" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="false" help="Whether to return the estimators fitted on each split."/> -->
                    <!--param argument="error_score" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Raise fit error:" help="If false, the metric score is assigned to NaN if an error occurs in estimator fitting and FitFailedWarning is raised."/> -->
                    <!--fit_params-->
                    <expand macro="pre_dispatch"/>
                </section>
            </when>
            <when value="cross_val_predict">
                <section name="options" title="Other Options" expanded="false">
                    <expand macro="model_validation_common_options" />
                    <!--fit_params-->
                    <expand macro="pre_dispatch" value="2*n_jobs’" help="Controls the number of jobs that get dispatched during parallel execution"/>
                    <param argument="method" type="select" label="Invokes the passed method name of the passed estimator">
                        <option value="predict" selected="true">predict</option>
                        <option value="predict_proba">predict_proba</option>
                    </param>
                </section>
            </when>
            <when value="learning_curve">
                <section name="options" title="Other Options" expanded="false">
                    <expand macro="scoring_selection"/>
                    <expand macro="model_validation_common_options"/>
                    <param argument="train_sizes" type="text" value="(0.1, 1.0, 5)" label="train_sizes"
                            help="Relative or absolute numbers of training examples that will be used to generate the learning curve. Supports 1) tuple, to be evaled by np.linspace, e.g. (0.1, 1.0, 5); 2) array-like, e.g. [0.1  , 0.325, 0.55 , 0.775, 1.]">
                        <sanitizer>
                            <valid initial="default">
                                <add value="["/>
                                <add value="]"/>
                            </valid>
                        </sanitizer>
                    </param>
                    <param argument="exploit_incremental_learning" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="false" help="Whether to apply incremental learning to speed up fitting of the estimator if supported"/>
                    <expand macro="pre_dispatch"/>
                    <expand macro="shuffle" checked="false" label="shuffle" help="Whether to shuffle training data before taking prefixes"/>
                    <expand macro="random_state" help_text="If int, the seed used by the random number generator. Used when `shuffle` is True"/>
                </section>
            </when>
            <when value="permutation_test_score">
                <section name="options" title="Other Options" expanded="false">
                    <expand macro="scoring_selection"/>
                    <expand macro="model_validation_common_options"/>
                    <param name="n_permutations" type="integer" value="100" optional="true" label="n_permutations" help="Number of times to permute y"/>
                    <expand macro="random_state"/>
                </section>
            </when>
            <when value="validation_curve"/>
        </conditional>
        <expand macro="sl_mixed_input_plus_sequence"/>
    </inputs>
    <outputs>
        <data format="tabular" name="outfile"/>
    </outputs>
    <tests>
        <test>
            <param name="infile_estimator" value="pipeline02"/>
            <param name="selected_function" value="cross_validate"/>
            <param name="return_train_score" value="True"/>
            <param name="infile1" value="regression_train.tabular" ftype="tabular"/>
            <param name="col1" value="1,2,3,4,5"/>
            <param name="infile2" value="regression_train.tabular" ftype="tabular"/>
            <param name="col2" value="6"/>
            <output name="outfile">
                <assert_contents>
                    <has_n_columns n="6"/>
                    <has_text text="0.9999961390418067"/>
                    <has_text text="0.9944541531269271"/>
                    <has_text text="0.9999193322454393"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="infile_estimator" value="pipeline02"/>
            <param name="selected_function" value="cross_val_predict"/>
            <param name="infile1" value="regression_train.tabular" ftype="tabular"/>
            <param name="col1" value="1,2,3,4,5"/>
            <param name="infile2" value="regression_train.tabular" ftype="tabular"/>
            <param name="col2" value="6"/>
            <output name="outfile" file="mv_result02.tabular" lines_diff="14"/>
        </test>
        <test>
            <param name="infile_estimator" value="pipeline05"/>
            <param name="selected_function" value="learning_curve"/>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="true" />
            <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="header2" value="true" />
            <param name="col2" value="1"/>
            <output name="outfile" file="mv_result03.tabular"/>
        </test>
        <test>
            <param name="infile_estimator" value="pipeline05"/>
            <param name="selected_function" value="permutation_test_score"/>
            <param name="infile1" value="regression_train.tabular" ftype="tabular"/>
            <param name="col1" value="1,2,3,4,5"/>
            <param name="infile2" value="regression_train.tabular" ftype="tabular"/>
            <param name="col2" value="6"/>
            <output name="outfile">
                <assert_contents>
                    <has_n_columns n="3"/>
                    <has_text text="0.25697059258228816"/>
                </assert_contents>
            </output>
        </test>
        <test>
            <param name="infile_estimator" value="pipeline05"/>
            <param name="selected_function" value="cross_val_predict"/>
            <section name="groups_selector">
                <param name="infile_groups" value="regression_y.tabular" ftype="tabular"/>
                <param name="header_g" value="true"/>
                <param name="selected_column_selector_option_g" value="by_index_number"/>
                <param name="col_g" value="1"/>
            </section>
            <param name="selected_cv" value="GroupKFold"/>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="true"/>
            <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="header2" value="true"/>
            <param name="col2" value="1"/>
            <output name="outfile" file="mv_result05.tabular"/>
        </test>
    </tests>
    <help>
        <![CDATA[
**What it does**
This tool includes model validation functions to evaluate estimator performance in the cross-validation approach. This tool is based on
sklearn.model_selection package.
For information about model validation functions and their parameter settings please refer to `Scikit-learn model_selection`_.

.. _`Scikit-learn model_selection`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
        ]]>
    </help>
    <expand macro="sklearn_citation">
        <expand macro="skrebate_citation"/>
        <expand macro="xgboost_citation"/>
    </expand>
</tool>
author	bgruening
date	Wed, 11 Mar 2020 13:43:57 -0400
parents	a5aed87b2cc0
children	9b017b0da56e