comparison generalized_linear.xml @ 0:32a88b3bea94 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tools/sklearn commit 0e582cf1f3134c777cce3aa57d71b80ed95e6ba9
author bgruening
date Fri, 16 Feb 2018 09:19:45 -0500
parents
children 3326dd4f1e8d
1 <tool id="sklearn_generalized_linear" name="Generalized linear models" version="@VERSION@">
2 <description>for classification and regression</description>
3 <macros>
4 <import>main_macros.xml</import>
5 </macros>
6 <expand macro="python_requirements"/>
7 <expand macro="macro_stdio"/>
8 <version_command>echo "@VERSION@"</version_command>
9 <command><![CDATA[
10 python "$glm_script" '$inputs'
11 ]]>
12 </command>
13 <configfiles>
14 <inputs name="inputs"/>
15 <configfile name="glm_script">
16 <![CDATA[
17 import sys
18 import json
19 import numpy as np
20 import sklearn.linear_model
21 import pandas
22 import pickle
23 from scipy.io import mmread
24
25 @COLUMNS_FUNCTION@
26
27 input_json_path = sys.argv[1]
28 params = json.load(open(input_json_path, "r"))
29
30 #if $selected_tasks.selected_task == "train":
31
32 algorithm = params["selected_tasks"]["selected_algorithms"]["selected_algorithm"]
33 options = params["selected_tasks"]["selected_algorithms"]["options"]
34
35 #if $selected_tasks.selected_algorithms.input_options.selected_input=="tabular":
36 X = columns("$selected_tasks.selected_algorithms.input_options.infile1","$selected_tasks.selected_algorithms.input_options.col1")
37 #else:
38 X = mmread(open("$selected_tasks.selected_algorithms.input_options.infile1", 'r'))
39 #end if
40
41 y = columns("$selected_tasks.selected_algorithms.input_options.infile2","$selected_tasks.selected_algorithms.input_options.col2")
42
43 my_class = getattr(sklearn.linear_model, algorithm)
44 estimator = my_class(**options)
45 estimator.fit(X,y)
46 pickle.dump(estimator, open("$outfile_fit", 'wb'), pickle.HIGHEST_PROTOCOL)
47
48 #else:
49 classifier_object = pickle.load(open("$selected_tasks.infile_model", 'rb'))
50 data = pandas.read_csv("$selected_tasks.infile_data", sep='\t', header=None, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False )
51 prediction = classifier_object.predict(data)
52 prediction_df = pandas.DataFrame(prediction)
53 res = pandas.concat([data, prediction_df], axis=1)
54 res.to_csv(path_or_buf = "$outfile_predict", sep="\t", index=False, header=None)
55 #end if
56
57 ]]>
58 </configfile>
59 </configfiles>
60 <inputs>
61 <expand macro="sl_Conditional" model="zip">
62 <param name="selected_algorithm" type="select" label="Select a linear model:">
63 <option value="SGDClassifier" selected="true">Stochastic Gradient Descent (SGD) classifier</option>
64 <option value="SGDRegressor">Stochastic Gradient Descent (SGD) regressor</option>
65 <option value="LinearRegression">Linear Regression model</option>
66 <option value="RidgeClassifier">Ridge classifier</option>
67 <option value="Ridge">Ridge regressor</option>
68 <option value="LogisticRegression">Logistic Regression</option>
69 <option value="LogisticRegressionCV">Logitic Regression with Cross Validation</option>
70 <option value="Perceptron">Perceptron</option>
71 </param>
72 <when value="SGDClassifier">
73 <expand macro="sl_mixed_input"/>
74 <section name="options" title="Advanced Options" expanded="False">
75 <expand macro="loss">
76 <option value="hinge" selected="true">hinge</option>
77 <option value="log">log</option>
78 <option value="modified_huber">modified huber</option>
79 <option value="squared_hinge">squared hinge</option>
80 <option value="perceptron">perceptron</option>
81 </expand>
82 <expand macro="penalty"/>
83 <expand macro="alpha"/>
84 <expand macro="l1_ratio"/>
85 <expand macro="fit_intercept"/>
86 <expand macro="n_iter" />
87 <expand macro="shuffle"/>
88 <expand macro="epsilon"/>
89 <expand macro="learning_rate_s" selected1="true"/>
90 <expand macro="eta0"/>
91 <expand macro="power_t"/>
92 <!--class_weight-->
93 <expand macro="warm_start" checked="false"/>
94 <expand macro="random_state"/>
95 <!--average-->
96 </section>
97 </when>
98 <when value="SGDRegressor">
99 <expand macro="sl_mixed_input"/>
100 <section name="options" title="Advanced Options" expanded="False">
101 <expand macro="loss" select="true"/>
102 <expand macro="penalty"/>
103 <expand macro="alpha"/>
104 <expand macro="l1_ratio"/>
105 <expand macro="fit_intercept"/>
106 <expand macro="n_iter" />
107 <expand macro="shuffle"/>
108 <expand macro="epsilon"/>
109 <expand macro="learning_rate_s" selected2="true"/>
110 <expand macro="eta0" default_value="0.01"/>
111 <expand macro="power_t" default_value="0.25"/>
112 <expand macro="warm_start" checked="false"/>
113 <expand macro="random_state"/>
114 <!--average-->
115 </section>
116 </when>
117 <when value="LinearRegression">
118 <expand macro="sl_mixed_input"/>
119 <section name="options" title="Advanced Options" expanded="False">
120 <expand macro="fit_intercept"/>
121 <expand macro="normalize"/>
122 <expand macro="copy_X"/>
123 </section>
124 </when>
125 <when value="RidgeClassifier">
126 <expand macro="sl_mixed_input"/>
127 <section name="options" title="Advanced Options" expanded="False">
128 <expand macro="ridge_params"/>
129 </section>
130 </when>
131 <when value="Ridge">
132 <expand macro="sl_mixed_input"/>
133 <section name="options" title="Advanced Options" expanded="False">
134 <expand macro="ridge_params"/>
135 </section>
136 </when>
137 <when value="LogisticRegression">
138 <expand macro="sl_mixed_input"/>
139 <section name="options" title="Advanced Options" expanded="False">
140 <expand macro="penalty"/>
141 <param argument="dual" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="Use dual formulation" help=" "/>
142 <expand macro="tol" default_value="0.0001" help_text="Tolerance for stopping criteria. "/>
143 <expand macro="C"/>
144 <expand macro="fit_intercept"/>
145 <expand macro="max_iter" default_value="100"/>
146 <expand macro="warm_start" checked="false"/>
147 <param argument="solver" type="select" label="Optimization algorithm" help=" ">
148 <option value="liblinear" selected="true">liblinear</option>
149 <option value="sag">sag</option>
150 <option value="lbfgs">lbfgs</option>
151 <option value="newton-cg">newton-cg</option>
152 </param>
153 <param argument="intercept_scaling" type="float" value="1" label="Intercept scaling factor" help="Useful only if solver is liblinear. "/>
154 <param argument="multi_class" type="select" label="Multiclass option" help="Works only for lbfgs solver. ">
155 <option value="ovr" selected="true">ovr</option>
156 <option value="multinomial">multinomial</option>
157 </param>
158 <!--class_weight-->
159 <expand macro="random_state"/>
160 </section>
161 </when>
162 <when value="LogisticRegressionCV">
163 <expand macro="sl_mixed_input"/>
164 <section name="options" title="Advanced Options" expanded="False">
165 <param argument="Cs" type="integer" value="10" label="Inverse of regularization strength" help="A grid of Cs values are chosen in a logarithmic scale between 1e-4 and 1e4. Like in support vector machines, smaller values specify stronger regularization. "/>
166 <param argument="dual" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="Use dual formulation" help=" "/>
167 <param argument="cv" type="integer" optional="true" value="" label="Number of folds used in cross validation" help="If not set, the default cross-validation generator (Stratified K-Folds) is used. "/>
168 <expand macro="penalty"/>
169 <expand macro="tol" default_value="0.0001" help_text="Tolerance for stopping criteria. "/>
170 <expand macro="fit_intercept"/>
171 <expand macro="max_iter" default_value="100"/>
172 <param argument="solver" type="select" label="Optimization algorithm" help=" ">
173 <option value="liblinear" selected="true">liblinear</option>
174 <option value="sag">sag</option>
175 <option value="lbfgs">lbfgs</option>
176 <option value="newton-cg">newton-cg</option>
177 </param>
178 <param argument="intercept_scaling" type="float" value="1" label="Intercept scaling factor" help="Useful only if solver is liblinear. "/>
179 <param argument="multi_class" type="select" label="Multiclass option" help="Works only for lbfgs solver. ">
180 <option value="ovr" selected="true">ovr</option>
181 <option value="multinomial">multinomial</option>
182 </param>
183 <param argument="refit" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Average scores across all folds" help=" "/>
184 <expand macro="random_state"/>
185 <!--scoring=None>
186 <class_weight=None-->
187 </section>
188 </when>
189 <when value="Perceptron">
190 <expand macro="sl_mixed_input"/>
191 <section name="options" title="Advanced Options" expanded="False">
192 <expand macro="penalty" default_value="none"/>
193 <expand macro="alpha"/>
194 <expand macro="fit_intercept"/>
195 <expand macro="n_iter" />
196 <expand macro="shuffle"/>
197 <expand macro="eta0" default_value="1"/>
198 <expand macro="warm_start" checked="false"/>
199 <expand macro="random_state" default_value="0"/>
200 <!--class_weight=None-->
201 </section>
202 </when>
203 </expand>
204 </inputs>
205 <outputs>
206 <data format="tabular" name="outfile_predict">
207 <filter>selected_tasks['selected_task'] == 'load'</filter>
208 </data>
209 <data format="zip" name="outfile_fit">
210 <filter>selected_tasks['selected_task'] == 'train'</filter>
211 </data>
212 </outputs>
213 <tests>
214 <test>
215 <param name="infile1" value="regression_train.tabular" ftype="tabular"/>
216 <param name="infile2" value="regression_train.tabular" ftype="tabular"/>
217 <param name="col1" value="1,2,3,4,5"/>
218 <param name="col2" value="6"/>
219 <param name="selected_task" value="train"/>
220 <param name="selected_algorithm" value="SGDRegressor"/>
221 <param name="random_state" value="10"/>
222 <output name="outfile_fit" file="glm_model01" compare="sim_size" delta="500"/>
223 </test>
224 <test>
225 <param name="infile_model" value="glm_model01" ftype="zip"/>
226 <param name="infile_data" value="regression_test.tabular" ftype="tabular"/>
227 <param name="selected_task" value="load"/>
228 <output name="outfile_predict" file="glm_result01" compare="sim_size" delta="500"/>
229 </test>
230 <test>
231 <param name="infile1" value="train.tabular" ftype="tabular"/>
232 <param name="infile2" value="train.tabular" ftype="tabular"/>
233 <param name="col1" value="1,2,3,4"/>
234 <param name="col2" value="5"/>
235 <param name="selected_task" value="train"/>
236 <param name="selected_algorithm" value="SGDClassifier"/>
237 <param name="random_state" value="10"/>
238 <output name="outfile_fit" file="glm_model02" compare="sim_size" delta="500"/>
239 </test>
240 <test>
241 <param name="infile_model" value="glm_model02" ftype="zip"/>
242 <param name="infile_data" value="test.tabular" ftype="tabular"/>
243 <param name="selected_task" value="load"/>
244 <output name="outfile_predict" file="glm_result02"/>
245 </test>
246 <test>
247 <param name="infile1" value="train.tabular" ftype="tabular"/>
248 <param name="infile2" value="train.tabular" ftype="tabular"/>
249 <param name="col1" value="1,2,3,4"/>
250 <param name="col2" value="5"/>
251 <param name="selected_task" value="train"/>
252 <param name="selected_algorithm" value="RidgeClassifier"/>
253 <param name="random_state" value="10"/>
254 <output name="outfile_fit" file="glm_model03" compare="sim_size" delta="500"/>
255 </test>
256 <test>
257 <param name="infile_model" value="glm_model03" ftype="zip"/>
258 <param name="infile_data" value="test.tabular" ftype="tabular"/>
259 <param name="selected_task" value="load"/>
260 <output name="outfile_predict" file="glm_result03"/>
261 </test>
262 <test>
263 <param name="infile1" value="regression_train.tabular" ftype="tabular"/>
264 <param name="infile2" value="regression_train.tabular" ftype="tabular"/>
265 <param name="col1" value="1,2,3,4,5"/>
266 <param name="col2" value="6"/>
267 <param name="selected_task" value="train"/>
268 <param name="selected_algorithm" value="LinearRegression"/>
269 <param name="random_state" value="10"/>
270 <output name="outfile_fit" file="glm_model04" compare="sim_size" delta="500"/>
271 </test>
272 <test>
273 <param name="infile_model" value="glm_model04" ftype="zip"/>
274 <param name="infile_data" value="regression_test.tabular" ftype="tabular"/>
275 <param name="selected_task" value="load"/>
276 <output name="outfile_predict" file="glm_result04" compare="sim_size"/>
277 </test>
278 <test>
279 <param name="infile1" value="train.tabular" ftype="tabular"/>
280 <param name="infile2" value="train.tabular" ftype="tabular"/>
281 <param name="col1" value="1,2,3,4"/>
282 <param name="col2" value="5"/>
283 <param name="selected_task" value="train"/>
284 <param name="selected_algorithm" value="LogisticRegression"/>
285 <param name="random_state" value="10"/>
286 <output name="outfile_fit" file="glm_model05" compare="sim_size" delta="500"/>
287 </test>
288 <test>
289 <param name="infile_model" value="glm_model05" ftype="zip"/>
290 <param name="infile_data" value="test.tabular" ftype="tabular"/>
291 <param name="selected_task" value="load"/>
292 <output name="outfile_predict" file="glm_result05"/>
293 </test>
294 <test>
295 <param name="infile1" value="train.tabular" ftype="tabular"/>
296 <param name="infile2" value="train.tabular" ftype="tabular"/>
297 <param name="col1" value="1,2,3,4"/>
298 <param name="col2" value="5"/>
299 <param name="selected_task" value="train"/>
300 <param name="selected_algorithm" value="LogisticRegressionCV"/>
301 <param name="random_state" value="10"/>
302 <output name="outfile_fit" file="glm_model06" compare="sim_size" delta="500"/>
303 </test>
304 <test>
305 <param name="infile_model" value="glm_model06" ftype="zip"/>
306 <param name="infile_data" value="test.tabular" ftype="tabular"/>
307 <param name="selected_task" value="load"/>
308 <output name="outfile_predict" file="glm_result06"/>
309 </test>
310 <test>
311 <param name="infile1" value="regression_train.tabular" ftype="tabular"/>
312 <param name="infile2" value="regression_train.tabular" ftype="tabular"/>
313 <param name="col1" value="1,2,3,4,5"/>
314 <param name="col2" value="6"/>
315 <param name="selected_task" value="train"/>
316 <param name="selected_algorithm" value="Ridge"/>
317 <param name="random_state" value="10"/>
318 <output name="outfile_fit" file="glm_model07" compare="sim_size" delta="500"/>
319 </test>
320 <test>
321 <param name="infile_model" value="glm_model07" ftype="zip"/>
322 <param name="infile_data" value="regression_test.tabular" ftype="tabular"/>
323 <param name="selected_task" value="load"/>
324 <output name="outfile_predict" file="glm_result07" compare="sim_size"/>
325 </test>
326 <test>
327 <param name="infile1" value="train.tabular" ftype="tabular"/>
328 <param name="infile2" value="train.tabular" ftype="tabular"/>
329 <param name="col1" value="1,2,3,4"/>
330 <param name="col2" value="5"/>
331 <param name="selected_task" value="train"/>
332 <param name="selected_algorithm" value="LogisticRegressionCV"/>
333 <param name="random_state" value="10"/>
334 <output name="outfile_fit" file="glm_model08" compare="sim_size" delta="500"/>
335 </test>
336 <test>
337 <param name="infile_model" value="glm_model08" ftype="zip"/>
338 <param name="infile_data" value="test.tabular" ftype="tabular"/>
339 <param name="selected_task" value="load"/>
340 <output name="outfile_predict" file="glm_result08"/>
341 </test>
342 </tests>
343 <help><![CDATA[
344 **What it does**
345 This tool implements a set of linear models for classification and regression, such as SGD classification and regression, Linear Regression, and Ridge regression and classification. The wrapper uses the sklearn.linear_model module at its core. For information about the linear models and their parameter settings, please refer to `Scikit-learn generalized linear models`_.
346
347 .. _`Scikit-learn generalized linear models`: http://scikit-learn.org/stable/modules/linear_model.html
348
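At its core the wrapper maps the selected algorithm name onto a class in ``sklearn.linear_model`` and instantiates it with the chosen advanced options, as in this minimal sketch (the algorithm name and options shown here are illustrative)::

    import sklearn.linear_model

    # Values of this kind are collected from the tool form (illustrative here)
    algorithm = "SGDRegressor"
    options = {"alpha": 0.0001, "random_state": 10}

    # Resolve the class by name and build the estimator with the selected options
    my_class = getattr(sklearn.linear_model, algorithm)
    estimator = my_class(**options)
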
349 **1 - Methods**
350 There are two groups of operations available:
351
352 1 - Train a model: A training set containing samples and their respective labels (or target values) is provided as input. Based on the selected algorithm and options, an estimator is fit to the data and returned.
353
354 2 - Load a model and predict: An existing model is used to predict the class labels (or regression values) for a new dataset; a minimal sketch of both operations is shown below.
355
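A minimal sketch of these two operations in plain scikit-learn terms (the toy data and the choice of SGDClassifier are illustrative, not part of the tool)::

    import numpy as np
    from sklearn.linear_model import SGDClassifier

    X = np.array([[0., 0.], [1., 1.], [2., 2.], [3., 3.]])  # training features
    y = np.array([0, 0, 1, 1])                              # training labels

    estimator = SGDClassifier(random_state=10)
    estimator.fit(X, y)                         # operation 1: train a model

    X_new = np.array([[0.5, 0.5], [2.5, 2.5]])  # new samples
    labels = estimator.predict(X_new)           # operation 2: predict with the model
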
356 **2 - Training input**
357 When you choose to train a model, you need a features dataset X and a labels column y. The tool expects tabular or sparse data for X and a single column of a tabular dataset for y. For tabular data you can select a subset of columns as the features dataset and one column as the labels column. Some examples are shown below:
358
359 **Sample tabular features dataset**
360 The following training dataset contains 3 feature columns and a column containing class labels. You can simply select the first 3 columns as features and the last column as labels:
361
362 ::
363
364 4.01163365529 -6.10797684314 8.29829894763 1
365 10.0788438916 1.59539821454 10.0684278289 0
366 -5.17607775503 -0.878286135332 6.92941850665 2
367 4.00975406235 -7.11847496542 9.3802423585 1
368 4.61204065139 -5.71217537352 9.12509610964 1
369
370
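Column selection of this kind is handled by the tool's ``columns`` helper; a rough pandas equivalent, assuming an illustrative file name and the column choices above, is::

    import pandas

    # Read the whole tabular dataset, then slice out the selected columns
    data = pandas.read_csv("train.tabular", sep="\t", header=None)
    X = data.iloc[:, 0:3].values   # columns 1-3 as the features dataset
    y = data.iloc[:, 3].values     # column 4 as the labels column
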
371 **Sample sparse features dataset**
372 In this case you cannot specify a column range; the whole sparse matrix is used as the features dataset X.
373
374 ::
375
376 4 1048577 8738
377 1 271 0.020833
378 1 1038 0.02461
379 2 829017 0.016
380 2 829437 0.012
381 2 830752 0.025
382 3 1047487 0.01
383 3 1047980 0.02
384 3 1048475 0.01
385 4 608 0.016629
386 4 1651 0.02519
387 4 4053 0.04223
388
389
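Sparse input is loaded in one piece, in the same way the wrapper script above does it with ``scipy.io.mmread`` (the file name is illustrative and the file is assumed to be in Matrix Market format)::

    from scipy.io import mmread

    # The whole sparse matrix becomes the features dataset X; no column selection
    X = mmread(open("sparse_features.mtx", "r"))
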
390 **3 - Training output**
391 The trained model is returned as a binary file: a pickled scikit-learn estimator that can later be loaded for prediction.
392
393
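Concretely, the model file is a Python pickle of the fitted estimator, written roughly as follows (the estimator and output name are illustrative stand-ins)::

    import pickle
    from sklearn.linear_model import LinearRegression

    # Stand-in for the estimator fitted during the training step
    estimator = LinearRegression().fit([[0.], [1.], [2.]], [0., 1., 2.])

    # Serialize the fitted estimator to the output dataset
    with open("glm_model", "wb") as fh:
        pickle.dump(estimator, fh, pickle.HIGHEST_PROTOCOL)
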
394 **4 - Prediction input**
395
396 When you choose to load a model and predict, the tool expects an already trained estimator and a tabular dataset as input. The dataset should contain the new samples that you want to classify or for which you want to predict regression values.
397
398
399 .. class:: warningmark
400
401 The number of feature columns must be the same in training and prediction datasets!
402
403
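A sketch of the prediction step, mirroring the script above (file names are illustrative)::

    import pickle
    import pandas

    # Load the previously trained estimator
    with open("glm_model", "rb") as fh:
        estimator = pickle.load(fh)

    # New samples: same number of feature columns as the training data
    data = pandas.read_csv("test.tabular", sep="\t", header=None)

    # Predict, append the predictions as the last column, and write a tabular file
    prediction = pandas.DataFrame(estimator.predict(data))
    result = pandas.concat([data, prediction], axis=1)
    result.to_csv("predicted.tabular", sep="\t", index=False, header=False)
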
404 **5 - Prediction output**
405 The tool predicts the class labels (or regression values) for the new samples and appends them as the last column of the prediction dataset. The resulting dataset is written as a tabular file, so the prediction output has the same layout as the training dataset, with the predictions in the final column.
406
407 ]]></help>
408 <expand macro="sklearn_citation"/>
409 </tool>