Mercurial > repos > bgruening > sklearn_generalized_linear

<tool id="sklearn_generalized_linear" name="Generalized linear models" version="@VERSION@">
    <description>for classification and regression</description>
    <macros>
        <import>main_macros.xml</import>
    </macros>
    <expand macro="python_requirements"/>
    <expand macro="macro_stdio"/>
    <version_command>echo "@VERSION@"</version_command>
    <!-- Runs the rendered configfile script (glm_script) with Python; its only argument is the path of the JSON parameter dump declared as "inputs" in the configfiles section. -->
    <command><![CDATA[
    python "$glm_script" '$inputs'
]]>
    </command>
    <configfiles>
        <inputs name="inputs"/>
        <configfile name="glm_script">
<![CDATA[
import sys
import json
import pickle
import numpy as np
import sklearn.linear_model
import pandas
from scipy.io import mmread

## Template for the script that Galaxy renders and runs for this tool.
## Lines starting with two hash signs are template comments and do not
## appear in the rendered script.
##
## pickle is imported explicitly above because the training branch calls
## pickle.dump; previously the name was only available if the executed
## utils.py happened to import it.

## The whitelist is loaded into a module-level global before utils.py is
## executed into the same globals (presumably load_model consults
## sk_whitelist when unpickling; confirm in utils.py).
with open("$__tool_directory__/sk_whitelist.json", "r") as f:
    sk_whitelist = json.load(f)
## utils.py is read through a context manager so the handle is closed.
with open("$__tool_directory__/utils.py", "r") as utils_handler:
    exec(utils_handler.read(), globals())

## The first command line argument is the path of the JSON parameter dump.
input_json_path = sys.argv[1]
with open(input_json_path, "r") as param_handler:
    params = json.load(param_handler)

#if $selected_tasks.selected_task == "train":

## Training branch: build X and y with get_X_y (from utils.py), fit the
## selected estimator and pickle it to the fit output.
X, y = get_X_y(params, "$selected_tasks.selected_algorithms.input_options.infile1", "$selected_tasks.selected_algorithms.input_options.infile2")

algorithm = params["selected_tasks"]["selected_algorithms"]["selected_algorithm"]
options = params["selected_tasks"]["selected_algorithms"]["options"]

## The estimator class is resolved by name on sklearn.linear_model and
## receives the Advanced Options section as keyword arguments.
my_class = getattr(sklearn.linear_model, algorithm)
estimator = my_class(**options)
estimator.fit(X, y)
with open("$outfile_fit", 'wb') as out_handler:
    pickle.dump(estimator, out_handler, pickle.HIGHEST_PROTOCOL)

#else:
## Prediction branch: load a stored model, predict on the tabular input and
## append the predictions as the last column.
## NOTE(review): the model file is user supplied; load_model lives in
## utils.py (not visible here) and should restrict what may be unpickled.
with open("$selected_tasks.infile_model", 'rb') as model_handler:
    classifier_object = load_model(model_handler)
## tupleize_cols=False was dropped from this call: it was the default, is
## deprecated, and is rejected by newer pandas releases.
data = pandas.read_csv("$selected_tasks.infile_data", sep='\t', header=None, index_col=None, parse_dates=True, encoding=None)
prediction = classifier_object.predict(data)
prediction_df = pandas.DataFrame(prediction, columns=["predicted"])
res = pandas.concat([data, prediction_df], axis=1)
res.to_csv(path_or_buf="$outfile_predict", sep="\t", index=False, header=None)
#end if

]]>
        </configfile>
    </configfiles>
    <inputs>
        <expand macro="sl_Conditional" model="zip">
            <!-- Estimator chooser: each option value is the class name that the script resolves on sklearn.linear_model, and it selects the matching "when" block below. -->
            <param name="selected_algorithm" type="select" label="Select a linear model:">
                <option value="SGDClassifier" selected="true">Stochastic Gradient Descent (SGD) classifier</option>
                <option value="SGDRegressor">Stochastic Gradient Descent (SGD) regressor</option>
                <option value="LinearRegression">Linear Regression model</option>
                <option value="RidgeClassifier">Ridge classifier</option>
                <option value="Ridge">Ridge regressor</option>
                <option value="LogisticRegression">Logistic Regression</option>
                <option value="LogisticRegressionCV">Logistic Regression with Cross Validation</option>
                <option value="Perceptron">Perceptron</option>
            </param>
            <!-- SGDClassifier: training inputs (sl_mixed_input macro) plus an "options" section whose contents are passed by the script as keyword arguments to the estimator. The individual parameters come from macros defined outside this file. -->
            <when value="SGDClassifier">
                <expand macro="sl_mixed_input"/>
                <section name="options" title="Advanced Options" expanded="False">
                    <!-- The loss choices are supplied inline to the loss macro; hinge is preselected. -->
                    <expand macro="loss">
                        <option value="hinge" selected="true">hinge</option>
                        <option value="log">log</option>
                        <option value="modified_huber">modified huber</option>
                        <option value="squared_hinge">squared hinge</option>
                        <option value="perceptron">perceptron</option>
                    </expand>
                    <expand macro="penalty"/>
                    <expand macro="alpha"/>
                    <expand macro="l1_ratio"/>
                    <expand macro="fit_intercept"/>
                    <expand macro="n_iter" />
                    <expand macro="shuffle"/>
                    <expand macro="epsilon"/>
                    <!-- selected1 is a token handed to the learning_rate_s macro; presumably it marks which learning rate option is preselected (confirm in the macro definition). -->
                    <expand macro="learning_rate_s" selected1="true"/>
                    <expand macro="eta0"/>
                    <expand macro="power_t"/>
                    <!--class_weight: not exposed in this form-->
                    <expand macro="warm_start" checked="false"/>
                    <expand macro="random_state"/>
                    <!--average: not exposed in this form-->
                </section>
            </when>
            <!-- SGDRegressor: same layout as the SGD classifier block, with different tokens passed to the loss, learning_rate_s, eta0 and power_t macros. -->
            <when value="SGDRegressor">
                <expand macro="sl_mixed_input"/>
                <section name="options" title="Advanced Options" expanded="False">
                    <!-- NOTE(review): no inline options are given here, so the loss macro must supply its own; the meaning of the select token is defined in the macro and is not visible in this file. -->
                    <expand macro="loss" select="true"/>
                    <expand macro="penalty"/>
                    <expand macro="alpha"/>
                    <expand macro="l1_ratio"/>
                    <expand macro="fit_intercept"/>
                    <expand macro="n_iter" />
                    <expand macro="shuffle"/>
                    <expand macro="epsilon"/>
                    <!-- selected2 is a token handed to the learning_rate_s macro; presumably it marks which learning rate option is preselected (confirm in the macro definition). -->
                    <expand macro="learning_rate_s" selected2="true"/>
                    <expand macro="eta0" default_value="0.01"/>
                    <expand macro="power_t" default_value="0.25"/>
                    <expand macro="warm_start" checked="false"/>
                    <expand macro="random_state"/>
                    <!--average: not exposed in this form-->
                </section>
            </when>
            <!-- LinearRegression: training inputs plus three advanced options (fit_intercept, normalize, copy_X), each defined by a macro outside this file. -->
            <when value="LinearRegression">
                <expand macro="sl_mixed_input"/>
                <section name="options" title="Advanced Options" expanded="False">
                    <expand macro="fit_intercept"/>
                    <expand macro="normalize"/>
                    <expand macro="copy_X"/>
                </section>
            </when>
            <!-- RidgeClassifier: training inputs plus the shared ridge_params macro, which is also used by the Ridge regressor block. -->
            <when value="RidgeClassifier">
                <expand macro="sl_mixed_input"/>
                <section name="options" title="Advanced Options" expanded="False">
                    <expand macro="ridge_params"/>
                </section>
            </when>
            <!-- Ridge regressor: training inputs plus the shared ridge_params macro, which is also used by the RidgeClassifier block. -->
            <when value="Ridge">
                <expand macro="sl_mixed_input"/>
                <section name="options" title="Advanced Options" expanded="False">
                    <expand macro="ridge_params"/>
                </section>
            </when>
            <!-- LogisticRegression: training inputs plus advanced options. The solver, dual and multi_class choices constrain each other, which the help texts below spell out. -->
            <when value="LogisticRegression">
                <expand macro="sl_mixed_input"/>
                <section name="options" title="Advanced Options" expanded="False">
                    <expand macro="penalty"/>
                    <param argument="dual" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="Use dual formulation" help="Dual formulation is only implemented for the l2 penalty with the liblinear solver. "/>
                    <expand macro="tol" default_value="0.0001" help_text="Tolerance for stopping criteria. "/>
                    <expand macro="C"/>
                    <expand macro="fit_intercept"/>
                    <expand macro="max_iter" default_value="100"/>
                    <expand macro="warm_start" checked="false"/>
                    <param argument="solver" type="select" label="Optimization algorithm" help=" ">
                        <option value="liblinear" selected="true">liblinear</option>
                        <option value="sag">sag</option>
                        <option value="lbfgs">lbfgs</option>
                        <option value="newton-cg">newton-cg</option>
                    </param>
                    <param argument="intercept_scaling" type="float" value="1" label="Intercept scaling factor" help="Useful only if solver is liblinear. "/>
                    <!-- The previous help text claimed multinomial needs lbfgs; scikit-learn only rejects it for liblinear, which is the default solver above. -->
                    <param argument="multi_class" type="select" label="Multiclass option" help="The multinomial option is not supported by the liblinear solver; choose sag, lbfgs or newton-cg to use it. ">
                        <option value="ovr" selected="true">ovr</option>
                        <option value="multinomial">multinomial</option>
                    </param>
                    <!--class_weight: not exposed in this form-->
                    <expand macro="random_state"/>
                </section>
            </when>
            <!-- LogisticRegressionCV: training inputs plus advanced options, including the size of the C grid (Cs) and the number of cross validation folds (cv). -->
            <when value="LogisticRegressionCV">
                <expand macro="sl_mixed_input"/>
                <section name="options" title="Advanced Options" expanded="False">
                    <param argument="Cs" type="integer" value="10" label="Inverse of regularization strength" help="A grid of Cs values are chosen in a logarithmic scale between 1e-4 and 1e4. Like in support vector machines, smaller values specify stronger regularization. "/>
                    <param argument="dual" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="Use dual formulation" help="Dual formulation is only implemented for the l2 penalty with the liblinear solver. "/>
                    <param argument="cv" type="integer" optional="true" value="" label="Number of folds used in cross validation" help="If not set, the default cross-validation generator (Stratified K-Folds) is used. "/>
                    <expand macro="penalty"/>
                    <expand macro="tol" default_value="0.0001" help_text="Tolerance for stopping criteria. "/>
                    <expand macro="fit_intercept"/>
                    <expand macro="max_iter" default_value="100"/>
                    <param argument="solver" type="select" label="Optimization algorithm" help=" ">
                        <option value="liblinear" selected="true">liblinear</option>
                        <option value="sag">sag</option>
                        <option value="lbfgs">lbfgs</option>
                        <option value="newton-cg">newton-cg</option>
                    </param>
                    <param argument="intercept_scaling" type="float" value="1" label="Intercept scaling factor" help="Useful only if solver is liblinear. "/>
                    <!-- The previous help text claimed multinomial needs lbfgs; scikit-learn only rejects it for liblinear, which is the default solver above. -->
                    <param argument="multi_class" type="select" label="Multiclass option" help="The multinomial option is not supported by the liblinear solver; choose sag, lbfgs or newton-cg to use it. ">
                        <option value="ovr" selected="true">ovr</option>
                        <option value="multinomial">multinomial</option>
                    </param>
                    <param argument="refit" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Average scores across all folds" help=" "/>
                    <expand macro="random_state"/>
                    <!--scoring=None: not exposed in this form-->
                    <!--class_weight=None: not exposed in this form-->
                </section>
            </when>
            <!-- Perceptron: training inputs plus advanced options; the penalty, eta0 and random_state macros receive block-specific default_value tokens. -->
            <when value="Perceptron">
                <expand macro="sl_mixed_input"/>
                <section name="options" title="Advanced Options" expanded="False">
                    <expand macro="penalty" default_value="none"/>
                    <expand macro="alpha"/>
                    <expand macro="fit_intercept"/>
                    <expand macro="n_iter" />
                    <expand macro="shuffle"/>
                    <expand macro="eta0" default_value="1"/>
                    <expand macro="warm_start" checked="false"/>
                    <expand macro="random_state" default_value="0"/>
                    <!--class_weight=None: not exposed in this form-->
                </section>
            </when>
        </expand>
    </inputs>
    <expand macro="output"/>
    <tests>
        <!-- SGDRegressor, training: features and labels both come from regression_train.tabular using the all_but_by_index_number column selector with index 6; the pickled model is compared by size only (sim_size, delta 500). -->
        <test>
            <param name="infile1" value="regression_train.tabular" ftype="tabular"/>
            <param name="infile2" value="regression_train.tabular" ftype="tabular"/>
            <param name="selected_column_selector_option" value="all_but_by_index_number"/>
            <param name="col1" value="6"/>
            <param name="col2" value="6"/>
            <param name="selected_task" value="train"/>
            <param name="selected_algorithm" value="SGDRegressor"/>
            <param name="random_state" value="10"/>
            <output name="outfile_fit" file="glm_model01" compare="sim_size" delta="500"/>
        </test>
        <!-- SGDRegressor, prediction: loads the stored model glm_model01 and predicts on regression_test.tabular; the result is compared by size only. -->
        <test>
            <param name="infile_model" value="glm_model01" ftype="zip"/>
            <param name="infile_data" value="regression_test.tabular" ftype="tabular"/>
            <param name="selected_task" value="load"/>
            <output name="outfile_predict" file="glm_result01" compare="sim_size" delta="500"/>
        </test>
        <!-- SGDClassifier, training: columns 1 to 4 of train.tabular are the features and column 5 the labels; the pickled model is compared by size only. -->
        <test>
            <param name="infile1" value="train.tabular" ftype="tabular"/>
            <param name="infile2" value="train.tabular" ftype="tabular"/>
            <param name="col1" value="1,2,3,4"/>
            <param name="col2" value="5"/>
            <param name="selected_task" value="train"/>
            <param name="selected_algorithm" value="SGDClassifier"/>
            <param name="random_state" value="10"/>
            <output name="outfile_fit" file="glm_model02" compare="sim_size" delta="500"/>
        </test>
        <!-- SGDClassifier, prediction: loads glm_model02 and predicts on test.tabular; no compare attribute is given, so the default output comparison applies. -->
        <test>
            <param name="infile_model" value="glm_model02" ftype="zip"/>
            <param name="infile_data" value="test.tabular" ftype="tabular"/>
            <param name="selected_task" value="load"/>
            <output name="outfile_predict" file="glm_result02"/>
        </test>
        <!-- RidgeClassifier, training: columns 1 to 4 of train.tabular are the features and column 5 the labels; the pickled model is compared by size only. -->
        <test>
            <param name="infile1" value="train.tabular" ftype="tabular"/>
            <param name="infile2" value="train.tabular" ftype="tabular"/>
            <param name="col1" value="1,2,3,4"/>
            <param name="col2" value="5"/>
            <param name="selected_task" value="train"/>
            <param name="selected_algorithm" value="RidgeClassifier"/>
            <param name="random_state" value="10"/>
            <output name="outfile_fit" file="glm_model03" compare="sim_size" delta="500"/>
        </test>
        <!-- RidgeClassifier, prediction: loads glm_model03 and predicts on test.tabular; no compare attribute is given, so the default output comparison applies. -->
        <test>
            <param name="infile_model" value="glm_model03" ftype="zip"/>
            <param name="infile_data" value="test.tabular" ftype="tabular"/>
            <param name="selected_task" value="load"/>
            <output name="outfile_predict" file="glm_result03"/>
        </test>
        <!-- LinearRegression, training: columns 1 to 5 of regression_train.tabular are the features and column 6 the target; no random_state is set for this estimator. The pickled model is compared by size only. -->
        <test>
            <param name="infile1" value="regression_train.tabular" ftype="tabular"/>
            <param name="infile2" value="regression_train.tabular" ftype="tabular"/>
            <param name="col1" value="1,2,3,4,5"/>
            <param name="col2" value="6"/>
            <param name="selected_task" value="train"/>
            <param name="selected_algorithm" value="LinearRegression"/>
            <output name="outfile_fit" file="glm_model04" compare="sim_size" delta="500"/>
        </test>
        <!-- LinearRegression, prediction: loads glm_model04 and predicts on regression_test.tabular; compared by size (sim_size) with no explicit delta. -->
        <test>
            <param name="infile_model" value="glm_model04" ftype="zip"/>
            <param name="infile_data" value="regression_test.tabular" ftype="tabular"/>
            <param name="selected_task" value="load"/>
            <output name="outfile_predict" file="glm_result04" compare="sim_size"/>
        </test>
        <!-- LogisticRegression, training: columns 1 to 4 of train.tabular are the features and column 5 the labels; the pickled model is compared by size only. -->
        <test>
            <param name="infile1" value="train.tabular" ftype="tabular"/>
            <param name="infile2" value="train.tabular" ftype="tabular"/>
            <param name="col1" value="1,2,3,4"/>
            <param name="col2" value="5"/>
            <param name="selected_task" value="train"/>
            <param name="selected_algorithm" value="LogisticRegression"/>
            <param name="random_state" value="10"/>
            <output name="outfile_fit" file="glm_model05" compare="sim_size" delta="500"/>
        </test>
        <!-- LogisticRegression, prediction: loads glm_model05 and predicts on test.tabular; no compare attribute is given, so the default output comparison applies. -->
        <test>
            <param name="infile_model" value="glm_model05" ftype="zip"/>
            <param name="infile_data" value="test.tabular" ftype="tabular"/>
            <param name="selected_task" value="load"/>
            <output name="outfile_predict" file="glm_result05"/>
        </test>
        <!-- LogisticRegressionCV, training: columns 1 to 4 of train.tabular are the features and column 5 the labels; the pickled model is compared by size only. -->
        <test>
            <param name="infile1" value="train.tabular" ftype="tabular"/>
            <param name="infile2" value="train.tabular" ftype="tabular"/>
            <param name="col1" value="1,2,3,4"/>
            <param name="col2" value="5"/>
            <param name="selected_task" value="train"/>
            <param name="selected_algorithm" value="LogisticRegressionCV"/>
            <param name="random_state" value="10"/>
            <output name="outfile_fit" file="glm_model06" compare="sim_size" delta="500"/>
        </test>
        <!-- LogisticRegressionCV, prediction: loads glm_model06 and predicts on test.tabular; no compare attribute is given, so the default output comparison applies. -->
        <test>
            <param name="infile_model" value="glm_model06" ftype="zip"/>
            <param name="infile_data" value="test.tabular" ftype="tabular"/>
            <param name="selected_task" value="load"/>
            <output name="outfile_predict" file="glm_result06"/>
        </test>
        <!-- Ridge regressor, training: columns 1 to 5 of regression_train.tabular are the features and column 6 the target; the pickled model is compared by size only. -->
        <test>
            <param name="infile1" value="regression_train.tabular" ftype="tabular"/>
            <param name="infile2" value="regression_train.tabular" ftype="tabular"/>
            <param name="col1" value="1,2,3,4,5"/>
            <param name="col2" value="6"/>
            <param name="selected_task" value="train"/>
            <param name="selected_algorithm" value="Ridge"/>
            <param name="random_state" value="10"/>
            <output name="outfile_fit" file="glm_model07" compare="sim_size" delta="500"/>
        </test>
        <!-- Ridge regressor, prediction: loads glm_model07 and predicts on regression_test.tabular; compared by size (sim_size) with no explicit delta. -->
        <test>
            <param name="infile_model" value="glm_model07" ftype="zip"/>
            <param name="infile_data" value="regression_test.tabular" ftype="tabular"/>
            <param name="selected_task" value="load"/>
            <output name="outfile_predict" file="glm_result07" compare="sim_size"/>
        </test>
        <!-- Perceptron, training: columns 1 to 4 of train.tabular are the features and column 5 the labels; the pickled model is compared by size only. -->
        <test>
            <param name="infile1" value="train.tabular" ftype="tabular"/>
            <param name="infile2" value="train.tabular" ftype="tabular"/>
            <param name="col1" value="1,2,3,4"/>
            <param name="col2" value="5"/>
            <param name="selected_task" value="train"/>
            <param name="selected_algorithm" value="Perceptron"/>
            <param name="random_state" value="10"/>
            <output name="outfile_fit" file="glm_model08" compare="sim_size" delta="500"/>
        </test>
        <!-- Perceptron, prediction: loads glm_model08 and predicts on test.tabular; no compare attribute is given, so the default output comparison applies. -->
        <test>
            <param name="infile_model" value="glm_model08" ftype="zip"/>
            <param name="infile_data" value="test.tabular" ftype="tabular"/>
            <param name="selected_task" value="load"/>
            <output name="outfile_predict" file="glm_result08"/>
        </test>
    </tests>
    <help><![CDATA[
**What it does**
This module implements a set of linear models for classification and regression such as: SGD classification and regression, Linear and Ridge regression and classification. This wrapper uses the sklearn.linear_model module at its core. For information about linear models and their parameter settings please refer to `Scikit-learn generalized linear models`_.

.. _`Scikit-learn generalized linear models`: http://scikit-learn.org/stable/modules/linear_model.html

 **1 - Methods**
 There are two groups of operations available:

  1 - Train a model : A training set containing samples and their respective labels (or predicted values) is provided as input. Based on the selected algorithm and options, an estimator object is fit to the data and is returned.

  2 - Load a model and predict : An existing model predicts the class labels (or regression values) for a new dataset.

 **2 - Training input**
 When you choose to train a model, you need a features dataset X and a labels set y. This tool expects tabular or sparse data for X and a single column for y (tabular). You can select a subset of columns in a tabular dataset as your features dataset or labels column. Below you find some examples:

 **Sample tabular features dataset**
 The following training dataset contains 3 feature columns and a column containing class labels. You can simply select the first 3 columns as features and the last column as labels:

 ::

  4.01163365529    -6.10797684314    8.29829894763     1
  10.0788438916    1.59539821454     10.0684278289     0
  -5.17607775503   -0.878286135332   6.92941850665     2
  4.00975406235    -7.11847496542    9.3802423585      1
  4.61204065139    -5.71217537352    9.12509610964     1


 **Sample sparse features dataset**
 In this case you cannot specify a column range.

 ::

  4 1048577 8738
  1 271 0.020833
  1 1038 0.02461
  2 829017 0.016
  2 829437 0.012
  2 830752 0.025
  3 1047487 0.01
  3 1047980 0.02
  3 1048475 0.01
  4 608 0.016629
  4 1651 0.02519
  4 4053 0.04223


 **2 - Training output**
 The trained model is generated and output in the form of a binary file.


 **3 - Prediction input**

 When you choose to load a model and do prediction, the tool expects an already trained estimator and a tabular dataset as input. The dataset contains new samples which you want to classify or predict regression values for.


 .. class:: warningmark

 The number of feature columns must be the same in training and prediction datasets!


 **3 - Prediction output**
 The tool predicts the class labels for new samples and adds them as the last column to the prediction dataset. The new dataset then is output as a tabular file. The prediction output format should look like the training dataset.

    ]]></help>
    <expand macro="sklearn_citation"/>
</tool>
author	bgruening
date	Thu, 11 Oct 2018 03:37:34 -0400
parents	212e7adfe65f
children	e3bc646e63b2