<!-- Mercurial > repos > bgruening > sklearn_estimator_attributes -->

<tool id="sklearn_estimator_attributes" name="Estimator attributes" version="@VERSION@" profile="@PROFILE@">
    <!-- Short description of the tool. -->
    <description>get important attributes from an estimator or scikit object</description>
    <!-- Shared macro definitions are imported from main_macros.xml. The macros expanded
         below and the @VERSION@ / @PROFILE@ tokens are presumably defined there; that
         file is not visible here, so confirm before renaming any of them. -->
    <macros>
        <import>main_macros.xml</import>
    </macros>
    <expand macro="python_requirements" />
    <expand macro="macro_stdio" />
    <!-- Reports the tool version by echoing the @VERSION@ token. -->
    <version_command>echo "@VERSION@"</version_command>
    <!-- The job runs the Python script rendered from the "main_script" configfile. -->
    <command><![CDATA[python '$main_script']]></command>
    <configfiles>
        <configfile name="main_script">
            <![CDATA[
import json
import pandas
import skrebate
import sys
import warnings
import xgboost
from mlxtend import regressor, classifier
from sklearn import (
    cluster, compose, decomposition, ensemble, feature_extraction,
    feature_selection, gaussian_process, kernel_approximation, metrics,
    model_selection, naive_bayes, neighbors, pipeline, preprocessing,
    svm, linear_model, tree, discriminant_analysis)
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.pipeline import Pipeline

from galaxy_ml.model_persist import load_model_from_h5, dump_model_to_h5
from galaxy_ml.utils import get_search_params

## Main body of the attribute-extraction script.
## Loads the model stored in the input dataset, reads the attribute selected
## in the tool form and writes it to the output dataset. Depending on the
## attribute the output is a tab-separated table, a repr() text dump or a
## model file written by dump_model_to_h5.

## Silence every warning category for the rest of the run.
warnings.simplefilter('ignore')

infile_object = '$infile_object'
attribute = '$attribute_type'

est_obj = load_model_from_h5(infile_object)

## The form option is spelled 'n_features_in', while scikit-learn exposes the
## fitted value as 'n_features_in_'. Use the option spelling when the object
## has it and fall back to the trailing-underscore name otherwise. The output
## column header keeps using the option value.
attr_name = attribute
if attribute == 'n_features_in' and not hasattr(est_obj, attribute):
    attr_name = 'n_features_in_'

if attribute == 'get_params':
    ## get_params()
    results = get_search_params(est_obj)
    df = pandas.DataFrame(results, columns=['', 'Parameter', 'Value'])
    df.to_csv('$outfile', sep='\t', index=False)
elif attribute == 'final_estimator':
    ## Last element of the last entry in est_obj.steps.
    res = est_obj.steps[-1][-1]
    print(repr(res))
    dump_model_to_h5(res, '$outfile')
elif attribute in ['best_estimator_', 'init_', 'classifier_', 'regressor_']:
    ## Attributes that are written out as a model file.
    res = getattr(est_obj, attribute)
    print(repr(res))
    dump_model_to_h5(res, '$outfile')
elif attribute in ['oob_score_', 'best_score_', 'n_features_in']:
    ## Single values: written as a one-row, one-column table.
    res = getattr(est_obj, attr_name)
    res = pandas.DataFrame([res], columns=[attribute])
    res.to_csv('$outfile', sep='\t', index=False)
elif attribute in ['best_params_', 'named_steps']:
    ## Written as the plain repr() of the attribute.
    res = getattr(est_obj, attribute)
    with open('$outfile', 'w') as f:
        f.write(repr(res))
elif attribute == 'cv_results_':
    ## Columns are sorted by name before writing.
    res = pandas.DataFrame(est_obj.cv_results_)
    res = res[sorted(res.columns)]
    res.to_csv('$outfile', sep='\t', index=False)
else:
    if attribute == 'get_signature':
        res = est_obj.get_signature()
    else:
        res = getattr(est_obj, attribute)
    ## The value must expose ndim and shape (array-like). One column is named
    ## after the attribute; several columns get a numeric suffix.
    if res.ndim == 1 or res.shape[-1] == 1:
        columns = [attribute]
    else:
        columns = [attribute + '_' + str(i) for i in range(res.shape[-1])]
    res = pandas.DataFrame(res, columns=columns)
    res.to_csv('$outfile', sep='\t', index=False)

            ]]>
        </configfile>
    </configfiles>
    <inputs>
        <param name="infile_object" type="data" format="h5mlm" label="Choose the dataset containing estimator/pipeline object" />
        <!-- Each option value is the literal string the embedded script compares
             against when deciding how to read and write the attribute. -->
        <param name="attribute_type" type="select" label="Select an attribute retrieval type">
            <option value="get_params" selected="true">Estimator - get_params()</option>
            <option value="feature_importances_">Fitted estimator - feature_importances_ </option>
            <option value="coef_">Fitted estimator - coef_ </option>
            <option value="train_score_">Fitted estimator - train_score_ </option>
            <option value="oob_score_">Fitted estimator - oob_score_ </option>
            <option value="init_">Fitted estimator - init_ </option>
            <option value="classifier_">Fitted BinarizeTargetClassifier - classifier_</option>
            <option value="regressor_">Fitted BinarizeTargetRegressor - regressor_</option>
            <option value="get_signature">Fitted IRAPSClassifier - get_signature</option>
            <option value="named_steps">Pipeline - named_steps </option>
            <option value="final_estimator">Pipeline - final_estimator </option>
            <option value="cv_results_">SearchCV - cv_results_ </option>
            <option value="best_estimator_">SearchCV - best_estimator_ </option>
            <option value="best_score_">SearchCV - best_score_ </option>
            <option value="best_params_">SearchCV - best_params_ </option>
            <option value="scores_">Feature_selection - scores_ </option>
            <option value="pvalues_">Feature_selection - pvalues_ </option>
            <option value="ranking_">Feature_selection - ranking_ </option>
            <option value="n_features_in">Feature_selection - n_features_in </option>
            <option value="grid_scores_">Feature_selection - grid_scores_ </option>
        </param>
    </inputs>
    <outputs>
        <!-- Single output dataset. It is tabular unless one of the rules below
             matches the selected attribute_type. -->
        <data name="outfile" format="tabular" label="${attribute_type} from ${on_string}">
            <change_format>
                <!-- Attributes the script writes as repr() text. -->
                <when input="attribute_type" value="named_steps" format="txt" />
                <when input="attribute_type" value="best_params_" format="txt" />
                <!-- Attributes the script writes with dump_model_to_h5. -->
                <when input="attribute_type" value="final_estimator" format="h5mlm" />
                <when input="attribute_type" value="best_estimator_" format="h5mlm" />
                <when input="attribute_type" value="init_" format="h5mlm" />
                <when input="attribute_type" value="classifier_" format="h5mlm" />
                <when input="attribute_type" value="regressor_" format="h5mlm" />
            </change_format>
        </data>
    </outputs>
    <tests>
        <!-- Each test loads a stored model, selects one attribute_type and compares
             the output dataset with a reference file. Model outputs (h5mlm) and the
             named_steps dump are compared by size only (compare="sim_size"). -->
        <test>
            <param name="infile_object" value="GridSearchCV01.h5mlm" ftype="h5mlm" />
            <param name="attribute_type" value="best_score_" />
            <output name="outfile" file="best_score_.tabular" />
        </test>
        <test>
            <param name="infile_object" value="GridSearchCV01.h5mlm" ftype="h5mlm" />
            <param name="attribute_type" value="best_params_" />
            <output name="outfile" file="best_params_.txt" />
        </test>
        <test>
            <param name="infile_object" value="GridSearchCV01.h5mlm" ftype="h5mlm" />
            <param name="attribute_type" value="best_estimator_" />
            <output name="outfile" file="best_estimator_.h5mlm" compare="sim_size" delta="10" />
        </test>
        <test>
            <param name="infile_object" value="searchCV01" ftype="h5mlm" />
            <param name="attribute_type" value="final_estimator" />
            <output name="outfile" file="final_estimator.h5mlm" compare="sim_size" delta="10" />
        </test>
        <test>
            <param name="infile_object" value="searchCV01" ftype="h5mlm" />
            <param name="attribute_type" value="named_steps" />
            <output name="outfile" file="named_steps.txt" compare="sim_size" delta="5" />
        </test>
        <test>
            <param name="infile_object" value="best_estimator_.h5mlm" ftype="h5mlm" />
            <param name="attribute_type" value="feature_importances_" />
            <output name="outfile" file="feature_importances_.tabular" />
        </test>
        <test>
            <param name="infile_object" value="RFE.h5mlm" ftype="h5mlm" />
            <param name="attribute_type" value="ranking_" />
            <output name="outfile" file="ranking_.tabular" />
        </test>
        <test>
            <param name="infile_object" value="LinearRegression01.h5mlm" ftype="h5mlm" />
            <param name="attribute_type" value="get_params" />
            <output name="outfile" file="get_params.tabular" />
        </test>
    </tests>
    <help>
        <![CDATA[
**What it does**

Outputs an attribute of an estimator or any other scikit-learn object.

Common attributes are:

  - ``estimator``. *feature_importances_*
  - ``RFE``. *ranking_*
  - ``RFECV``. *grid_scores_*
  - ``GridSearchCV``. *best_estimator_*

        ]]>
    </help>
    <!-- Citation entries come from macros. The three nested expands are passed as
         children of sklearn_citation; the macro definitions are not visible here. -->
    <expand macro="sklearn_citation">
        <expand macro="skrebate_citation" />
        <expand macro="xgboost_citation" />
        <expand macro="imblearn_citation" />
    </expand>
</tool>
<!--
author	bgruening
date	Wed, 09 Aug 2023 14:10:58 +0000
parents	a01fa4e8fe4f
children
-->