Mercurial > repos > bgruening > sklearn_build_pipeline
comparison pipeline.xml @ 15:3f3c6dc38f3e draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 5b2ac730ec6d3b762faa9034eddd19ad1b347476"
author | bgruening |
---|---|
date | Mon, 16 Dec 2019 05:39:20 -0500 |
parents | 775b004b7920 |
children | 4de3d598c116 |
comparison
equal
deleted
inserted
replaced
14:c33145a815ee | 15:3f3c6dc38f3e |
---|---|
1 <tool id="sklearn_build_pipeline" name="Pipeline Builder" version="@VERSION@"> | 1 <tool id="sklearn_build_pipeline" name="Pipeline Builder" version="@VERSION@"> |
2 <description>constructs a list of transforms and a final estimator</description> | 2 <description>an all-in-one platform to build pipeline, single estimator, preprocessor and custom wrappers</description> |
3 <macros> | 3 <macros> |
4 <import>main_macros.xml</import> | 4 <import>main_macros.xml</import> |
5 </macros> | 5 </macros> |
6 <expand macro="python_requirements"/> | 6 <expand macro="python_requirements"/> |
7 <expand macro="macro_stdio"/> | 7 <expand macro="macro_stdio"/> |
29 model_selection, naive_bayes, neighbors, pipeline, preprocessing, | 29 model_selection, naive_bayes, neighbors, pipeline, preprocessing, |
30 svm, linear_model, tree, discriminant_analysis) | 30 svm, linear_model, tree, discriminant_analysis) |
31 from sklearn.pipeline import make_pipeline | 31 from sklearn.pipeline import make_pipeline |
32 from imblearn.pipeline import make_pipeline as imb_make_pipeline | 32 from imblearn.pipeline import make_pipeline as imb_make_pipeline |
33 from galaxy_ml.utils import (SafeEval, feature_selector, get_estimator, | 33 from galaxy_ml.utils import (SafeEval, feature_selector, get_estimator, |
34 try_get_attr, get_search_params) | 34 try_get_attr, get_search_params, load_model) |
35 | |
36 ## TODO remove following imports after scikit-learn v0.22 | |
37 from sklearn.experimental import enable_hist_gradient_boosting | |
35 | 38 |
36 | 39 |
37 N_JOBS = int(__import__('os').environ.get('GALAXY_SLOTS', 1)) | 40 N_JOBS = int(__import__('os').environ.get('GALAXY_SLOTS', 1)) |
38 | 41 |
39 warnings.filterwarnings('ignore') | 42 warnings.filterwarnings('ignore') |
172 estimator_json = params['final_estimator']['estimator_selector'] | 175 estimator_json = params['final_estimator']['estimator_selector'] |
173 if estimator_json['selected_module'] == 'none': | 176 if estimator_json['selected_module'] == 'none': |
174 if len(pipeline_steps) == 0: | 177 if len(pipeline_steps) == 0: |
175 sys.exit("No pipeline steps specified!") | 178 sys.exit("No pipeline steps specified!") |
176 ## else: turn the last pre-process component to final estimator | 179 ## else: turn the last pre-process component to final estimator |
180 elif estimator_json['selected_module'] == 'sklearn.compose': | |
181 #if $final_estimator.estimator_selector.selected_module == 'sklearn.compose': | |
182 regressor_path = '$final_estimator.estimator_selector.regressor' | |
183 transformer_path = '$final_estimator.estimator_selector.transformer' | |
184 #end if | |
185 with open(regressor_path, 'rb') as f: | |
186 regressor = load_model(f) | |
187 with open(transformer_path, 'rb') as f: | |
188 transformer = load_model(f) | |
189 estimator = compose.TransformedTargetRegressor(regressor=regressor, transformer=transformer) | |
190 pipeline_steps.append( estimator ) | |
177 else: | 191 else: |
178 estimator = get_estimator(estimator_json) | 192 estimator = get_estimator(estimator_json) |
179 pipeline_steps.append( estimator ) | 193 pipeline_steps.append( estimator ) |
180 | 194 |
181 #if $output_type == 'Final_Estimator_Builder': | 195 if len(pipeline_steps) == 1: |
196 out_obj = pipeline_steps[-1] | |
197 print(out_obj) | |
198 else: | |
199 if has_imblearn: | |
200 out_obj = imb_make_pipeline(*pipeline_steps) | |
201 else: | |
202 out_obj = make_pipeline(*pipeline_steps) | |
203 pprint.pprint(out_obj.named_steps) | |
204 | |
182 with open('$outfile', 'wb') as out_handler: | 205 with open('$outfile', 'wb') as out_handler: |
183 final_est = pipeline_steps[-1] | 206 pickle.dump(out_obj, out_handler, pickle.HIGHEST_PROTOCOL) |
184 print(final_est) | |
185 pickle.dump(final_est, out_handler, pickle.HIGHEST_PROTOCOL) | |
186 out_obj = final_est | |
187 #else: | |
188 if has_imblearn: | |
189 pipeline = imb_make_pipeline(*pipeline_steps) | |
190 else: | |
191 pipeline = make_pipeline(*pipeline_steps) | |
192 pprint.pprint(pipeline.named_steps) | |
193 | |
194 with open('$outfile', 'wb') as out_handler: | |
195 pickle.dump(pipeline, out_handler, pickle.HIGHEST_PROTOCOL) | |
196 out_obj = pipeline | |
197 #end if | |
198 | 207 |
199 #if $get_params | 208 #if $get_params |
200 results = get_search_params(out_obj) | 209 results = get_search_params(out_obj) |
201 df = pd.DataFrame(results, columns=['', 'Parameter', 'Value']) | 210 df = pd.DataFrame(results, columns=['', 'Parameter', 'Value']) |
202 df.to_csv('$outfile_params', sep='\t', index=False) | 211 df.to_csv('$outfile_params', sep='\t', index=False) |
260 </repeat> | 269 </repeat> |
261 <section name="final_estimator" title="Final Estimator" expanded="true"> | 270 <section name="final_estimator" title="Final Estimator" expanded="true"> |
262 <conditional name="estimator_selector"> | 271 <conditional name="estimator_selector"> |
263 <param name="selected_module" type="select" label="Choose the module that contains target estimator:" > | 272 <param name="selected_module" type="select" label="Choose the module that contains target estimator:" > |
264 <expand macro="estimator_module_options"> | 273 <expand macro="estimator_module_options"> |
274 <option value="sklearn.compose">sklearn.compose</option> | |
265 <option value="binarize_target">Binarize Target Classifier or Regressor</option> | 275 <option value="binarize_target">Binarize Target Classifier or Regressor</option> |
266 <option value="custom_estimator">Load a custom estimator</option> | 276 <option value="custom_estimator">Load a custom estimator</option> |
267 <option value="none">none -- The last component of pre-processing step will turn to a final estimator</option> | 277 <option value="none">none -- The last component of pre-processing step will turn to a final estimator</option> |
268 </expand> | 278 </expand> |
269 </param> | 279 </param> |
270 <expand macro="estimator_suboptions"> | 280 <expand macro="estimator_suboptions"> |
281 <when value="sklearn.compose"> | |
282 <param name="selected_estimator" type="select" label="Choose estimator class:"> | |
283 <option value="TransformedTargetRegressor" selected="true">TransformedTargetRegressor</option> | |
284 </param> | |
285 <param name="regressor" type="data" format="zip" label="Choose the dataset containing the wrapped regressor"/> | |
286 <param name="transformer" type="data" format="zip" label="Choose the dataset containing transformer"/> | |
287 </when> | |
271 <when value="binarize_target"> | 288 <when value="binarize_target"> |
272 <param name="clf_or_regr" type="select" label="Classifier or Regressor:"> | 289 <param name="clf_or_regr" type="select" label="Classifier or Regressor:"> |
273 <option value="BinarizeTargetClassifier">BinarizeTargetClassifier</option> | 290 <option value="BinarizeTargetClassifier">BinarizeTargetClassifier</option> |
274 <option value="BinarizeTargetRegressor">BinarizeTargetRegressor</option> | 291 <option value="BinarizeTargetRegressor">BinarizeTargetRegressor</option> |
275 </param> | 292 </param> |
283 </when> | 300 </when> |
284 <when value="none"/> | 301 <when value="none"/> |
285 </expand> | 302 </expand> |
286 </conditional> | 303 </conditional> |
287 </section> | 304 </section> |
288 <param name="output_type" type="select" label="Output the final estimator instead?"> | 305 <!--param name="output_type" type="select" label="Output the final estimator instead?"> |
289 <option value="Pipeline_Builder" selected="true">Pipeline</option> | 306 <option value="Pipeline_Builder" selected="true">Pipeline</option> |
290 <option value="Final_Estimator_Builder">Final Estimator</option> | 307 <option value="Final_Estimator_Builder">Final Estimator</option> |
291 </param> | 308 </param>--> |
292 <param name="get_params" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="Output parameters for searchCV?" | 309 <param name="get_params" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="Output parameters for searchCV?" |
293 help="Optional. Tunable parameters could be obtained through `estimator_attributes` tool."/> | 310 help="Optional. Tunable parameters could be obtained through `estimator_attributes` tool."/> |
294 </inputs> | 311 </inputs> |
295 <outputs> | 312 <outputs> |
296 <data format="zip" name="outfile" label="${output_type}"/> | 313 <data format="zip" name="outfile" label="New Pipleline/Estimator"/> |
297 <data format="tabular" name="outfile_params" label="get_params for ${output_type}"> | 314 <data format="tabular" name="outfile_params" label="get_params for Pipleline/Estimator"> |
298 <filter>get_params</filter> | 315 <filter>get_params</filter> |
299 </data> | 316 </data> |
300 </outputs> | 317 </outputs> |
301 <tests> | 318 <tests> |
319 <test> | |
320 <conditional name="component_selector"> | |
321 <param name="component_type" value="pre_processor"/> | |
322 <conditional name="pre_processors"> | |
323 <param name="selected_pre_processor" value="QuantileTransformer"/> | |
324 <section name="options"> | |
325 <param name="random_state" value="10"/> | |
326 </section> | |
327 </conditional> | |
328 </conditional> | |
329 <section name="final_estimator"> | |
330 <conditional name="estimator_selector"> | |
331 <param name="selected_module" value="none"/> | |
332 </conditional> | |
333 </section> | |
334 <output name="outfile" file="pipeline17" compare="sim_size" delta="5"/> | |
335 </test> | |
336 <test> | |
337 <conditional name="component_selector"> | |
338 <param name="component_type" value="pre_processor"/> | |
339 <conditional name="pre_processors"> | |
340 <param name="selected_pre_processor" value="PowerTransformer"/> | |
341 </conditional> | |
342 </conditional> | |
343 <section name="final_estimator"> | |
344 <conditional name="estimator_selector"> | |
345 <param name="selected_module" value="sklearn.compose"/> | |
346 <param name="regressor" value="RandomForestRegressor01.zip" ftype="zip"/> | |
347 <param name="transformer" value="pipeline17" ftype="zip"/> | |
348 </conditional> | |
349 </section> | |
350 <param name="get_params" value="true"/> | |
351 <output name="outfile_params" file="pipeline_params18" ftype="tabular"/> | |
352 </test> | |
302 <test> | 353 <test> |
303 <repeat name="pipeline_component"> | 354 <repeat name="pipeline_component"> |
304 <conditional name="component_selector"> | 355 <conditional name="component_selector"> |
305 <param name="component_type" value="pre_processor"/> | 356 <param name="component_type" value="pre_processor"/> |
306 <conditional name="pre_processors"> | 357 <conditional name="pre_processors"> |
368 <param name="component_type" value="None"/> | 419 <param name="component_type" value="None"/> |
369 </conditional> | 420 </conditional> |
370 <param name="selected_module" value="ensemble"/> | 421 <param name="selected_module" value="ensemble"/> |
371 <param name="selected_estimator" value="RandomForestRegressor"/> | 422 <param name="selected_estimator" value="RandomForestRegressor"/> |
372 <param name="text_params" value="n_estimators=100, random_state=42"/> | 423 <param name="text_params" value="n_estimators=100, random_state=42"/> |
424 <param name="get_params" value="true"/> | |
373 <output name="outfile" file="pipeline05" compare="sim_size" delta="5"/> | 425 <output name="outfile" file="pipeline05" compare="sim_size" delta="5"/> |
426 <output name="outfile_params" file="pipeline_params05.tabular" ftype="tabular"/> | |
374 </test> | 427 </test> |
375 <test> | 428 <test> |
376 <conditional name="component_selector"> | 429 <conditional name="component_selector"> |
377 <param name="component_type" value="decomposition"/> | 430 <param name="component_type" value="decomposition"/> |
378 <conditional name="matrix_decomposition_selector"> | 431 <conditional name="matrix_decomposition_selector"> |
416 </conditional> | 469 </conditional> |
417 </conditional> | 470 </conditional> |
418 <param name="selected_module" value="ensemble"/> | 471 <param name="selected_module" value="ensemble"/> |
419 <param name="selected_estimator" value="RandomForestRegressor"/> | 472 <param name="selected_estimator" value="RandomForestRegressor"/> |
420 <output name="outfile" file="pipeline09" compare="sim_size" delta="5"/> | 473 <output name="outfile" file="pipeline09" compare="sim_size" delta="5"/> |
421 </test> | |
422 <test> | |
423 <conditional name="component_selector"> | |
424 <param name="component_type" value="None"/> | |
425 </conditional> | |
426 <param name="selected_module" value="ensemble"/> | |
427 <param name="selected_estimator" value="AdaBoostRegressor"/> | |
428 <output name="outfile" file="pipeline10" compare="sim_size" delta="5"/> | |
429 </test> | 474 </test> |
430 <test> | 475 <test> |
431 <conditional name="component_selector"> | 476 <conditional name="component_selector"> |
432 <param name="component_type" value="imblearn"/> | 477 <param name="component_type" value="imblearn"/> |
433 <conditional name="imblearn_selector"> | 478 <conditional name="imblearn_selector"> |
469 <conditional name="component_selector"> | 514 <conditional name="component_selector"> |
470 <param name="component_type" value="None"/> | 515 <param name="component_type" value="None"/> |
471 </conditional> | 516 </conditional> |
472 <param name="selected_module" value="ensemble"/> | 517 <param name="selected_module" value="ensemble"/> |
473 <param name="selected_estimator" value="RandomForestClassifier"/> | 518 <param name="selected_estimator" value="RandomForestClassifier"/> |
474 <param name="output_type" value="Final_Estimator_Builder"/> | |
475 <output name="outfile" file="RandomForestClassifier.zip" compare="sim_size" delta="5"/> | 519 <output name="outfile" file="RandomForestClassifier.zip" compare="sim_size" delta="5"/> |
476 </test> | 520 </test> |
477 <test> | 521 <test> |
478 <conditional name="component_selector"> | 522 <conditional name="component_selector"> |
479 <param name="component_type" value="IRAPS"/> | 523 <param name="component_type" value="IRAPS"/> |
481 <section name="final_estimator"> | 525 <section name="final_estimator"> |
482 <conditional name="estimator_selector"> | 526 <conditional name="estimator_selector"> |
483 <param name="selected_module" value="none"/> | 527 <param name="selected_module" value="none"/> |
484 </conditional> | 528 </conditional> |
485 </section> | 529 </section> |
486 <param name="output_type" value="Final_Estimator_Builder"/> | |
487 <output name="outfile" file="pipeline14" compare="sim_size" delta="5"/> | 530 <output name="outfile" file="pipeline14" compare="sim_size" delta="5"/> |
488 </test> | 531 </test> |
489 <test> | 532 <test> |
490 <conditional name="component_selector"> | 533 <conditional name="component_selector"> |
491 <param name="component_type" value="None"/> | 534 <param name="component_type" value="None"/> |
495 <param name="selected_module" value="binarize_target"/> | 538 <param name="selected_module" value="binarize_target"/> |
496 <param name="clf_or_regr" value="BinarizeTargetClassifier"/> | 539 <param name="clf_or_regr" value="BinarizeTargetClassifier"/> |
497 <param name="wrapped_estimator" value="RandomForestClassifier.zip" ftype="zip"/> | 540 <param name="wrapped_estimator" value="RandomForestClassifier.zip" ftype="zip"/> |
498 </conditional> | 541 </conditional> |
499 </section> | 542 </section> |
500 <param name="output_type" value="Final_Estimator_Builder"/> | |
501 <output name="outfile" file="pipeline15" compare="sim_size" delta="5"/> | 543 <output name="outfile" file="pipeline15" compare="sim_size" delta="5"/> |
502 </test> | 544 </test> |
503 <test> | 545 <test> |
504 <conditional name="component_selector"> | 546 <conditional name="component_selector"> |
505 <param name="component_type" value="preprocessors"/> | 547 <param name="component_type" value="preprocessors"/> |
519 </test> | 561 </test> |
520 </tests> | 562 </tests> |
521 <help> | 563 <help> |
522 <![CDATA[ | 564 <![CDATA[ |
523 **What it does** | 565 **What it does** |
524 Constructs a pipeline that contains a list of transforms and a final estimator. Pipeline assembles several steps | 566 This tool not only builds sklearn pipeline object, but also builds single main estimator or single preprocessing component. The output object type is based on the length of pipeline steps. When there is only one step (choose `None` for others), either a main estimator or preprocessor, the component is output directly instead of wrapping in a pipeline object. |
525 that can be cross-validated together while setting different parameters. | 567 |
526 please refer to `Scikit-learn pipeline Pipeline`_. | 568 A typical pipeline chains one or more preprocessing steps plus a final main estimator, for example, [VarianceThreshold, StandardScaler, SGDClassifier] which is composed of a feature selector, a preprocessing scaler and a main estimator together. |
527 | 569 For more information, please refer to `Scikit-learn pipeline Pipeline`_. |
528 **Pre-processing components** allow None, one or a combination of up to 5 transformations from `sklearn.preprocessing`_, `feature_selection`_, `decomposition`_, `kernel_approximation`_, `cluster.FeatureAgglomeration`_ and/or `skrebate`_. | 570 |
529 | 571 **Pre-processing components** come from `sklearn.preprocessing`_, `feature_selection`_, `decomposition`_, `kernel_approximation`_, `cluster.FeatureAgglomeration`_, `skrebate`_ and more. |
530 **Estimator** selector supports estimators from `xgboost`_ and many scikit-learn modules, including `svm`_, `linear_model`_, `ensemble`_, `naive_bayes`_, `tree`_ and `neighbors`_. | 572 |
573 **Final Estimator** supports estimators from `xgboost`_ and many scikit-learn modules, including `svm`_, `linear_model`_, `ensemble`_, `naive_bayes`_, `tree`_, `neighbors`_ and so on. | |
574 | |
575 **Custom estimators** | |
576 | |
577 - `GenomeOneHotEncoder`_ | |
578 | |
579 - `ProteinOnehotEncoder`_ | |
580 | |
581 - `IRAPSClassifier`_ | |
582 | |
583 - `BinarizeTargetClassifier`_ | |
584 | |
585 - `BinarizeTargetRegressor`_ | |
586 | |
587 **Output** | |
588 | |
589 - Pickled pipeline/estimator object | |
590 | |
591 - Hyperparameter of the object (optional) | |
531 | 592 |
532 | 593 |
533 .. _`Scikit-learn pipeline Pipeline`: http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html | 594 .. _`Scikit-learn pipeline Pipeline`: http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html |
534 .. _`svm`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.svm | 595 .. _`svm`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.svm |
535 .. _`linear_model`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.linear_model | 596 .. _`linear_model`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.linear_model |
544 .. _`decomposition`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.decomposition | 605 .. _`decomposition`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.decomposition |
545 .. _`kernel_approximation`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.kernel_approximation | 606 .. _`kernel_approximation`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.kernel_approximation |
546 .. _`cluster.FeatureAgglomeration`: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html | 607 .. _`cluster.FeatureAgglomeration`: http://scikit-learn.org/stable/modules/generated/sklearn.cluster.FeatureAgglomeration.html |
547 .. _`skrebate`: https://epistasislab.github.io/scikit-rebate/using/ | 608 .. _`skrebate`: https://epistasislab.github.io/scikit-rebate/using/ |
548 | 609 |
610 .. _`GenomeOneHotEncoder`: https://goeckslab.github.io/Galaxy-ML/APIs/preprocessors/#genomeonehotencoder | |
611 .. _`ProteinOnehotEncoder`: https://goeckslab.github.io/Galaxy-ML/APIs/preprocessors/#proteinonehotencoder | |
612 .. _`IRAPSClassifier`: https://goeckslab.github.io/Galaxy-ML/APIs/iraps-classifier/#irapsclassifier | |
613 .. _`BinarizeTargetClassifier`: https://goeckslab.github.io/Galaxy-ML/APIs/binarize-target/#binarizetargetclassifier | |
614 .. _`BinarizeTargetRegressor`: https://goeckslab.github.io/Galaxy-ML/APIs/binarize-target/#binarizetargetregressor | |
615 | |
549 ]]> | 616 ]]> |
550 </help> | 617 </help> |
551 <expand macro="sklearn_citation"> | 618 <expand macro="sklearn_citation"> |
552 <expand macro="skrebate_citation"/> | 619 <expand macro="skrebate_citation"/> |
553 <expand macro="xgboost_citation"/> | 620 <expand macro="xgboost_citation"/> |