comparison pipeline.xml @ 15:3f3c6dc38f3e draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 5b2ac730ec6d3b762faa9034eddd19ad1b347476"
author bgruening
date Mon, 16 Dec 2019 05:39:20 -0500
parents 775b004b7920
children 4de3d598c116
comparison
equal deleted inserted replaced
14:c33145a815ee 15:3f3c6dc38f3e
1 <tool id="sklearn_build_pipeline" name="Pipeline Builder" version="@VERSION@"> 1 <tool id="sklearn_build_pipeline" name="Pipeline Builder" version="@VERSION@">
2 <description>constructs a list of transforms and a final estimator</description> 2 <description>an all-in-one platform to build pipeline, single estimator, preprocessor and custom wrappers</description>
3 <macros> 3 <macros>
4 <import>main_macros.xml</import> 4 <import>main_macros.xml</import>
5 </macros> 5 </macros>
6 <expand macro="python_requirements"/> 6 <expand macro="python_requirements"/>
7 <expand macro="macro_stdio"/> 7 <expand macro="macro_stdio"/>
29 model_selection, naive_bayes, neighbors, pipeline, preprocessing, 29 model_selection, naive_bayes, neighbors, pipeline, preprocessing,
30 svm, linear_model, tree, discriminant_analysis) 30 svm, linear_model, tree, discriminant_analysis)
31 from sklearn.pipeline import make_pipeline 31 from sklearn.pipeline import make_pipeline
32 from imblearn.pipeline import make_pipeline as imb_make_pipeline 32 from imblearn.pipeline import make_pipeline as imb_make_pipeline
33 from galaxy_ml.utils import (SafeEval, feature_selector, get_estimator, 33 from galaxy_ml.utils import (SafeEval, feature_selector, get_estimator,
34 try_get_attr, get_search_params) 34 try_get_attr, get_search_params, load_model)
35
36 ## TODO remove following imports after scikit-learn v0.22
37 from sklearn.experimental import enable_hist_gradient_boosting
35 38
36 39
37 N_JOBS = int(__import__('os').environ.get('GALAXY_SLOTS', 1)) 40 N_JOBS = int(__import__('os').environ.get('GALAXY_SLOTS', 1))
38 41
39 warnings.filterwarnings('ignore') 42 warnings.filterwarnings('ignore')
172 estimator_json = params['final_estimator']['estimator_selector'] 175 estimator_json = params['final_estimator']['estimator_selector']
173 if estimator_json['selected_module'] == 'none': 176 if estimator_json['selected_module'] == 'none':
174 if len(pipeline_steps) == 0: 177 if len(pipeline_steps) == 0:
175 sys.exit("No pipeline steps specified!") 178 sys.exit("No pipeline steps specified!")
176 ## else: turn the last pre-process component to final estimator 179 ## else: turn the last pre-process component to final estimator
180 elif estimator_json['selected_module'] == 'sklearn.compose':
181 #if $final_estimator.estimator_selector.selected_module == 'sklearn.compose':
182 regressor_path = '$final_estimator.estimator_selector.regressor'
183 transformer_path = '$final_estimator.estimator_selector.transformer'
184 #end if
185 with open(regressor_path, 'rb') as f:
186 regressor = load_model(f)
187 with open(transformer_path, 'rb') as f:
188 transformer = load_model(f)
189 estimator = compose.TransformedTargetRegressor(regressor=regressor, transformer=transformer)
190 pipeline_steps.append( estimator )
177 else: 191 else:
178 estimator = get_estimator(estimator_json) 192 estimator = get_estimator(estimator_json)
179 pipeline_steps.append( estimator ) 193 pipeline_steps.append( estimator )
180 194
181 #if $output_type == 'Final_Estimator_Builder': 195 if len(pipeline_steps) == 1:
196 out_obj = pipeline_steps[-1]
197 print(out_obj)
198 else:
199 if has_imblearn:
200 out_obj = imb_make_pipeline(*pipeline_steps)
201 else:
202 out_obj = make_pipeline(*pipeline_steps)
203 pprint.pprint(out_obj.named_steps)
204
182 with open('$outfile', 'wb') as out_handler: 205 with open('$outfile', 'wb') as out_handler:
183 final_est = pipeline_steps[-1] 206 pickle.dump(out_obj, out_handler, pickle.HIGHEST_PROTOCOL)
184 print(final_est)
185 pickle.dump(final_est, out_handler, pickle.HIGHEST_PROTOCOL)
186 out_obj = final_est
187 #else:
188 if has_imblearn:
189 pipeline = imb_make_pipeline(*pipeline_steps)
190 else:
191 pipeline = make_pipeline(*pipeline_steps)
192 pprint.pprint(pipeline.named_steps)
193
194 with open('$outfile', 'wb') as out_handler:
195 pickle.dump(pipeline, out_handler, pickle.HIGHEST_PROTOCOL)
196 out_obj = pipeline
197 #end if
198 207
199 #if $get_params 208 #if $get_params
200 results = get_search_params(out_obj) 209 results = get_search_params(out_obj)
201 df = pd.DataFrame(results, columns=['', 'Parameter', 'Value']) 210 df = pd.DataFrame(results, columns=['', 'Parameter', 'Value'])
202 df.to_csv('$outfile_params', sep='\t', index=False) 211 df.to_csv('$outfile_params', sep='\t', index=False)
260 </repeat> 269 </repeat>
261 <section name="final_estimator" title="Final Estimator" expanded="true"> 270 <section name="final_estimator" title="Final Estimator" expanded="true">
262 <conditional name="estimator_selector"> 271 <conditional name="estimator_selector">
263 <param name="selected_module" type="select" label="Choose the module that contains target estimator:" > 272 <param name="selected_module" type="select" label="Choose the module that contains target estimator:" >
264 <expand macro="estimator_module_options"> 273 <expand macro="estimator_module_options">
274 <option value="sklearn.compose">sklearn.compose</option>
265 <option value="binarize_target">Binarize Target Classifier or Regressor</option> 275 <option value="binarize_target">Binarize Target Classifier or Regressor</option>
266 <option value="custom_estimator">Load a custom estimator</option> 276 <option value="custom_estimator">Load a custom estimator</option>
267 <option value="none">none -- The last component of pre-processing step will turn to a final estimator</option> 277 <option value="none">none -- The last component of pre-processing step will turn to a final estimator</option>
268 </expand> 278 </expand>
269 </param> 279 </param>
270 <expand macro="estimator_suboptions"> 280 <expand macro="estimator_suboptions">
281 <when value="sklearn.compose">
282 <param name="selected_estimator" type="select" label="Choose estimator class:">
283 <option value="TransformedTargetRegressor" selected="true">TransformedTargetRegressor</option>
284 </param>
285 <param name="regressor" type="data" format="zip" label="Choose the dataset containing the wrapped regressor"/>
286 <param name="transformer" type="data" format="zip" label="Choose the dataset containing transformer"/>
287 </when>
271 <when value="binarize_target"> 288 <when value="binarize_target">
272 <param name="clf_or_regr" type="select" label="Classifier or Regressor:"> 289 <param name="clf_or_regr" type="select" label="Classifier or Regressor:">
273 <option value="BinarizeTargetClassifier">BinarizeTargetClassifier</option> 290 <option value="BinarizeTargetClassifier">BinarizeTargetClassifier</option>
274 <option value="BinarizeTargetRegressor">BinarizeTargetRegressor</option> 291 <option value="BinarizeTargetRegressor">BinarizeTargetRegressor</option>
275 </param> 292 </param>
283 </when> 300 </when>
284 <when value="none"/> 301 <when value="none"/>
285 </expand> 302 </expand>
286 </conditional> 303 </conditional>
287 </section> 304 </section>
288 <param name="output_type" type="select" label="Output the final estimator instead?"> 305 <!--param name="output_type" type="select" label="Output the final estimator instead?">
289 <option value="Pipeline_Builder" selected="true">Pipeline</option> 306 <option value="Pipeline_Builder" selected="true">Pipeline</option>
290 <option value="Final_Estimator_Builder">Final Estimator</option> 307 <option value="Final_Estimator_Builder">Final Estimator</option>
291 </param> 308 </param>-->
292 <param name="get_params" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="Output parameters for searchCV?" 309 <param name="get_params" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="Output parameters for searchCV?"
293 help="Optional. Tunable parameters could be obtained through `estimator_attributes` tool."/> 310 help="Optional. Tunable parameters could be obtained through `estimator_attributes` tool."/>
294 </inputs> 311 </inputs>
295 <outputs> 312 <outputs>
296 <data format="zip" name="outfile" label="${output_type}"/> 313 <data format="zip" name="outfile" label="New Pipeline/Estimator"/>
297 <data format="tabular" name="outfile_params" label="get_params for ${output_type}"> 314 <data format="tabular" name="outfile_params" label="get_params for Pipeline/Estimator">
298 <filter>get_params</filter> 315 <filter>get_params</filter>
299 </data> 316 </data>
300 </outputs> 317 </outputs>
301 <tests> 318 <tests>
319 <test>
320 <conditional name="component_selector">
321 <param name="component_type" value="pre_processor"/>
322 <conditional name="pre_processors">
323 <param name="selected_pre_processor" value="QuantileTransformer"/>
324 <section name="options">
325 <param name="random_state" value="10"/>
326 </section>
327 </conditional>
328 </conditional>
329 <section name="final_estimator">
330 <conditional name="estimator_selector">
331 <param name="selected_module" value="none"/>
332 </conditional>
333 </section>
334 <output name="outfile" file="pipeline17" compare="sim_size" delta="5"/>
335 </test>
336 <test>
337 <conditional name="component_selector">
338 <param name="component_type" value="pre_processor"/>
339 <conditional name="pre_processors">
340 <param name="selected_pre_processor" value="PowerTransformer"/>
341 </conditional>
342 </conditional>
343 <section name="final_estimator">
344 <conditional name="estimator_selector">
345 <param name="selected_module" value="sklearn.compose"/>
346 <param name="regressor" value="RandomForestRegressor01.zip" ftype="zip"/>
347 <param name="transformer" value="pipeline17" ftype="zip"/>
348 </conditional>
349 </section>
350 <param name="get_params" value="true"/>
351 <output name="outfile_params" file="pipeline_params18" ftype="tabular"/>
352 </test>
302 <test> 353 <test>
303 <repeat name="pipeline_component"> 354 <repeat name="pipeline_component">
304 <conditional name="component_selector"> 355 <conditional name="component_selector">
305 <param name="component_type" value="pre_processor"/> 356 <param name="component_type" value="pre_processor"/>
306 <conditional name="pre_processors"> 357 <conditional name="pre_processors">
368 <param name="component_type" value="None"/> 419 <param name="component_type" value="None"/>
369 </conditional> 420 </conditional>
370 <param name="selected_module" value="ensemble"/> 421 <param name="selected_module" value="ensemble"/>
371 <param name="selected_estimator" value="RandomForestRegressor"/> 422 <param name="selected_estimator" value="RandomForestRegressor"/>
372 <param name="text_params" value="n_estimators=100, random_state=42"/> 423 <param name="text_params" value="n_estimators=100, random_state=42"/>
424 <param name="get_params" value="true"/>
373 <output name="outfile" file="pipeline05" compare="sim_size" delta="5"/> 425 <output name="outfile" file="pipeline05" compare="sim_size" delta="5"/>
426 <output name="outfile_params" file="pipeline_params05.tabular" ftype="tabular"/>
374 </test> 427 </test>
375 <test> 428 <test>
376 <conditional name="component_selector"> 429 <conditional name="component_selector">
377 <param name="component_type" value="decomposition"/> 430 <param name="component_type" value="decomposition"/>
378 <conditional name="matrix_decomposition_selector"> 431 <conditional name="matrix_decomposition_selector">
416 </conditional> 469 </conditional>
417 </conditional> 470 </conditional>
418 <param name="selected_module" value="ensemble"/> 471 <param name="selected_module" value="ensemble"/>
419 <param name="selected_estimator" value="RandomForestRegressor"/> 472 <param name="selected_estimator" value="RandomForestRegressor"/>
420 <output name="outfile" file="pipeline09" compare="sim_size" delta="5"/> 473 <output name="outfile" file="pipeline09" compare="sim_size" delta="5"/>
421 </test>
422 <test>
423 <conditional name="component_selector">
424 <param name="component_type" value="None"/>
425 </conditional>
426 <param name="selected_module" value="ensemble"/>
427 <param name="selected_estimator" value="AdaBoostRegressor"/>
428 <output name="outfile" file="pipeline10" compare="sim_size" delta="5"/>
429 </test> 474 </test>
430 <test> 475 <test>
431 <conditional name="component_selector"> 476 <conditional name="component_selector">
432 <param name="component_type" value="imblearn"/> 477 <param name="component_type" value="imblearn"/>
433 <conditional name="imblearn_selector"> 478 <conditional name="imblearn_selector">
469 <conditional name="component_selector"> 514 <conditional name="component_selector">
470 <param name="component_type" value="None"/> 515 <param name="component_type" value="None"/>
471 </conditional> 516 </conditional>
472 <param name="selected_module" value="ensemble"/> 517 <param name="selected_module" value="ensemble"/>
473 <param name="selected_estimator" value="RandomForestClassifier"/> 518 <param name="selected_estimator" value="RandomForestClassifier"/>
474 <param name="output_type" value="Final_Estimator_Builder"/>
475 <output name="outfile" file="RandomForestClassifier.zip" compare="sim_size" delta="5"/> 519 <output name="outfile" file="RandomForestClassifier.zip" compare="sim_size" delta="5"/>
476 </test> 520 </test>
477 <test> 521 <test>
478 <conditional name="component_selector"> 522 <conditional name="component_selector">
479 <param name="component_type" value="IRAPS"/> 523 <param name="component_type" value="IRAPS"/>
481 <section name="final_estimator"> 525 <section name="final_estimator">
482 <conditional name="estimator_selector"> 526 <conditional name="estimator_selector">
483 <param name="selected_module" value="none"/> 527 <param name="selected_module" value="none"/>
484 </conditional> 528 </conditional>
485 </section> 529 </section>
486 <param name="output_type" value="Final_Estimator_Builder"/>
487 <output name="outfile" file="pipeline14" compare="sim_size" delta="5"/> 530 <output name="outfile" file="pipeline14" compare="sim_size" delta="5"/>
488 </test> 531 </test>
489 <test> 532 <test>
490 <conditional name="component_selector"> 533 <conditional name="component_selector">
491 <param name="component_type" value="None"/> 534 <param name="component_type" value="None"/>
495 <param name="selected_module" value="binarize_target"/> 538 <param name="selected_module" value="binarize_target"/>
496 <param name="clf_or_regr" value="BinarizeTargetClassifier"/> 539 <param name="clf_or_regr" value="BinarizeTargetClassifier"/>
497 <param name="wrapped_estimator" value="RandomForestClassifier.zip" ftype="zip"/> 540 <param name="wrapped_estimator" value="RandomForestClassifier.zip" ftype="zip"/>
498 </conditional> 541 </conditional>
499 </section> 542 </section>
500 <param name="output_type" value="Final_Estimator_Builder"/>
501 <output name="outfile" file="pipeline15" compare="sim_size" delta="5"/> 543 <output name="outfile" file="pipeline15" compare="sim_size" delta="5"/>
502 </test> 544 </test>
503 <test> 545 <test>
504 <conditional name="component_selector"> 546 <conditional name="component_selector">
505 <param name="component_type" value="preprocessors"/> 547 <param name="component_type" value="preprocessors"/>
519 </test> 561 </test>
520 </tests> 562 </tests>
521 <help> 563 <help>
522 <![CDATA[ 564 <![CDATA[
523 **What it does** 565 **What it does**
524 Constructs a pipeline that contains a list of transfroms and a final estimator. Pipeline assembles several steps 566 This tool builds not only a sklearn pipeline object, but also a single main estimator or a single preprocessing component. The output object type is based on the length of pipeline steps. When there is only one step (choose `None` for others), either a main estimator or a preprocessor, the component is output directly instead of being wrapped in a pipeline object.
525 that can be cross-validated together while setting different parameters. 567
526 please refer to `Scikit-learn pipeline Pipeline`_. 568 A typical pipeline chains one or more preprocessing steps plus a final main estimator, for example, [VarianceThreshold, StandardScaler, SGDClassifier] which is composed of a feature selector, a preprocessing scaler and a main estimator together.
527 569 For more information, please refer to `Scikit-learn pipeline Pipeline`_.
528 **Pre-processing components** allow None, one or a combination of up to 5 transformations from `sklearn.preprocessing`_, `feature_selection`_, `decomposition`_, `kernel_approximation`_, `cluster.FeatureAgglomeration`_ and/or `skrebate`_. 570
529 571 **Pre-processing components** come from `sklearn.preprocessing`_, `feature_selection`_, `decomposition`_, `kernel_approximation`_, `cluster.FeatureAgglomeration`_, `skrebate`_ and more.
530 **Estimator** selector supports estimators from `xgboost`_ and many scikit-learn modules, including `svm`_, `linear_model`_, `ensemble`_, `naive_bayes`_, `tree`_ and `neighbors`_. 572
573 **Final Estimator** supports estimators from `xgboost`_ and many scikit-learn modules, including `svm`_, `linear_model`_, `ensemble`_, `naive_bayes`_, `tree`_, `neighbors`_ and so on.
574
575 **Custom estimators**
576
577 - `GenomeOneHotEncoder`_
578
579 - `ProteinOnehotEncoder`_
580
581 - `IRAPSClassifier`_
582
583 - `BinarizeTargetClassifier`_
584
585 - `BinarizeTargetRegressor`_
586
587 **Output**
588
589 - Pickled pipeline/estimator object
590
591 - Hyperparameters of the object (optional)
531 592
532 593
533 .. _`Scikit-learn pipeline Pipeline`: http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html 594 .. _`Scikit-learn pipeline Pipeline`: http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
534 .. _`svm`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.svm 595 .. _`svm`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.svm
535 .. _`linear_model`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.linear_model 596 .. _`linear_model`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.linear_model
544 .. _`decomposition`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.decomposition 605 .. _`decomposition`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.decomposition
545 .. _`kernel_approximation`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.kernel_approximation 606 .. _`kernel_approximation`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.kernel_approximation
546 .. _`cluster.FeatureAgglomeration`: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html 607 .. _`cluster.FeatureAgglomeration`: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html
547 .. _`skrebate`: https://epistasislab.github.io/scikit-rebate/using/ 608 .. _`skrebate`: https://epistasislab.github.io/scikit-rebate/using/
548 609
610 .. _`GenomeOneHotEncoder`: https://goeckslab.github.io/Galaxy-ML/APIs/preprocessors/#genomeonehotencoder
611 .. _`ProteinOnehotEncoder`: https://goeckslab.github.io/Galaxy-ML/APIs/preprocessors/#proteinonehotencoder
612 .. _`IRAPSClassifier`: https://goeckslab.github.io/Galaxy-ML/APIs/iraps-classifier/#irapsclassifier
613 .. _`BinarizeTargetClassifier`: https://goeckslab.github.io/Galaxy-ML/APIs/binarize-target/#binarizetargetclassifier
614 .. _`BinarizeTargetRegressor`: https://goeckslab.github.io/Galaxy-ML/APIs/binarize-target/#binarizetargetregressor
615
549 ]]> 616 ]]>
550 </help> 617 </help>
551 <expand macro="sklearn_citation"> 618 <expand macro="sklearn_citation">
552 <expand macro="skrebate_citation"/> 619 <expand macro="skrebate_citation"/>
553 <expand macro="xgboost_citation"/> 620 <expand macro="xgboost_citation"/>