Mercurial > repos > bgruening > sklearn_data_preprocess
view pre_process.xml @ 46:761269451e98 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 57a0433defa3cbc37ab34fbb0ebcfaeb680db8d5
author | bgruening |
---|---|
date | Sun, 05 Nov 2023 15:52:21 +0000 |
parents | a16f33c6ca64 |
children |
line wrap: on
line source
<tool id="sklearn_data_preprocess" name="Preprocess" version="@VERSION@" profile="@PROFILE@">
    <!--
        Galaxy wrapper that fits a transformer from sklearn.preprocessing on a
        tabular or sparse (Matrix Market) dataset, writes the transformed data,
        and optionally saves the fitted transformer.
        Tokens such as @VERSION@ and @PROFILE@ and every <expand> macro are
        resolved from main_macros.xml.
    -->
    <description>raw feature vectors into standardized datasets</description>
    <macros>
        <import>main_macros.xml</import>
    </macros>
    <expand macro="python_requirements" />
    <expand macro="macro_stdio" />
    <version_command>echo "@VERSION@"</version_command>
    <!-- Runs the generated script below; its only argument is the JSON dump of the tool parameters. -->
    <command>
        <![CDATA[
        python "$pre_processor_script" '$inputs'
        ]]>
    </command>
    <configfiles>
        <inputs name="inputs" />
        <!-- Cheetah template rendered to a Python script; the "#if/#else/#end if" lines are resolved before Python runs. -->
        <configfile name="pre_processor_script">
            <![CDATA[
import sys
import json
import pandas

from scipy.io import mmread
from scipy.io import mmwrite
from sklearn import preprocessing
from galaxy_ml.model_persist import dump_model_to_h5
from galaxy_ml.utils import read_columns, SafeEval


safe_eval = SafeEval()

## Tool parameters arrive as a JSON file whose path is the first CLI argument.
input_json_path = sys.argv[1]
with open(input_json_path, "r") as param_handler:
    params = json.load(param_handler)

## Load the feature matrix X, either from a Matrix Market file or from selected tabular columns.
#if $input_type.selected_input_type == "sparse":
X = mmread("$input_type.infile")
#else:
header = 'infer' if params["input_type"]["header1"] else None
column_option = params["input_type"]["column_selector_options_1"]["selected_column_selector_option"]
if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]:
    c = params["input_type"]["column_selector_options_1"]["col1"]
else:
    c = None
X, input_df = read_columns(
    "$input_type.infile",
    c=c,
    c_option=column_option,
    return_df=True,
    sep='\t',
    header=header,
    parse_dates=True,
    encoding=None,
    index_col=None)
X = X.astype(float)
#end if

preprocessor = params["input_type"]["pre_processors"]["selected_pre_processor"]
options = params["input_type"]["pre_processors"]["options"]
## feature_range is submitted as text, so it is evaluated into a Python value,
## falling back to (0, 1) when the evaluated value is empty.
if 'feature_range' in options:
    feature_range = safe_eval(options['feature_range'].strip())
    if not feature_range:
        feature_range = (0, 1)
    options['feature_range'] = feature_range

## Instantiate the selected sklearn.preprocessing class by name, then fit and transform the same data.
my_class = getattr(preprocessing, preprocessor)
estimator = my_class(**options)
estimator.fit(X)
result = estimator.transform(X)

#if $input_type.selected_input_type == "sparse":
with open("$outfile_transform", "wb") as transform_handler:
    mmwrite(transform_handler, result)
#else:
columns = input_df.columns
## For PolynomialFeatures the input column names are not reused and no header row is written.
if preprocessor == 'PolynomialFeatures':
    columns = None
    header = False
res = pandas.DataFrame(result, columns=columns)
res.to_csv(path_or_buf = "$outfile_transform", sep="\t", index=False, header=True if header else False)
#end if

#if $save:
dump_model_to_h5(estimator, "$outfile_fit")
#end if
            ]]>
        </configfile>
    </configfiles>
    <inputs>
        <!-- Both branches declare "infile" and a "pre_processors" conditional, so the script can address them uniformly via input_type. -->
        <conditional name="input_type">
            <param name="selected_input_type" type="select" label="Select the type of your input data:">
                <option value="tabular" selected="true">Tabular</option>
                <option value="sparse">Sparse</option>
            </param>
            <when value="tabular">
                <param name="infile" type="data" format="tabular" label="Select a tabular file you want to train your preprocessor on its data:" />
                <param name="header1" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="Does the dataset contain header:" />
                <conditional name="column_selector_options_1">
                    <expand macro="samples_column_selector_options" multiple="true" column_option="selected_column_selector_option" col_name="col1" infile="infile" />
                </conditional>
                <conditional name="pre_processors">
                    <expand macro="sparse_preprocessors_ext" />
                    <expand macro="sparse_preprocessor_options_ext" />
                </conditional>
            </when>
            <when value="sparse">
                <param name="infile" type="data" format="txt" label="Select a sparse representation you want to train your preprocessor on its data:" />
                <conditional name="pre_processors">
                    <expand macro="sparse_preprocessors" />
                    <expand macro="sparse_preprocessor_options" />
                </conditional>
            </when>
        </conditional>
        <!-- Controls both the dump_model_to_h5 call in the script and the outfile_fit output filter. -->
        <param name="save" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="Save the preprocessor" help="Saves the preprocessor after fitting to the data. The preprocessor can then be passed to other tools and used in later operations." />
    </inputs>
    <outputs>
        <!-- NOTE(review): the script writes straight to $outfile_transform and never creates ./output, so from_work_dir looks unnecessary; confirm before removing. -->
        <data format="tabular" name="outfile_transform" from_work_dir="./output" />
        <!-- Only produced when the "save" parameter is true. -->
        <data format="h5mlm" name="outfile_fit">
            <filter>save</filter>
        </data>
    </outputs>
    <tests>
        <!-- Tabular input: QuantileTransformer with explicit options. -->
        <test>
            <param name="infile" value="train.tabular" ftype="tabular" />
            <param name="selected_column_selector_option" value="all_columns" />
            <param name="selected_input_type" value="tabular" />
            <param name="selected_pre_processor" value="QuantileTransformer" />
            <param name="save" value="true" />
            <param name="random_state" value="200" />
            <param name="n_quantiles" value="10" />
            <param name="subsample" value="100" />
            <output name="outfile_transform" file="prp_result01" ftype="tabular" />
            <output name="outfile_fit" file="prp_model01" ftype="h5mlm" compare="sim_size" delta="5" />
        </test>
        <!-- Tabular input: MinMaxScaler with default options. -->
        <test>
            <param name="infile" value="train.tabular" ftype="tabular" />
            <param name="selected_column_selector_option" value="all_columns" />
            <param name="selected_input_type" value="tabular" />
            <param name="selected_pre_processor" value="MinMaxScaler" />
            <param name="save" value="true" />
            <output name="outfile_transform" file="prp_result02" ftype="tabular" />
            <output name="outfile_fit" file="prp_model02" ftype="h5mlm" compare="sim_size" delta="5" />
        </test>
        <!-- Tabular input: PolynomialFeatures (output is written without a header row). -->
        <test>
            <param name="infile" value="train.tabular" ftype="tabular" />
            <param name="selected_column_selector_option" value="all_columns" />
            <param name="selected_input_type" value="tabular" />
            <param name="selected_pre_processor" value="PolynomialFeatures" />
            <param name="save" value="true" />
            <output name="outfile_transform" file="prp_result03" ftype="tabular" />
            <output name="outfile_fit" file="prp_model03" ftype="h5mlm" compare="sim_size" delta="5" />
        </test>
        <!-- Tabular input: RobustScaler. -->
        <test>
            <param name="infile" value="train.tabular" ftype="tabular" />
            <param name="selected_column_selector_option" value="all_columns" />
            <param name="selected_input_type" value="tabular" />
            <param name="selected_pre_processor" value="RobustScaler" />
            <param name="save" value="true" />
            <output name="outfile_transform" file="prp_result04" ftype="tabular" />
            <output name="outfile_fit" file="prp_model04" ftype="h5mlm" compare="sim_size" delta="5" />
        </test>
        <!-- Sparse input: Binarizer. -->
        <test>
            <param name="infile" value="csr_sparse2.mtx" ftype="txt" />
            <param name="selected_input_type" value="sparse" />
            <param name="selected_pre_processor" value="Binarizer" />
            <param name="save" value="true" />
            <output name="outfile_transform" file="prp_result05" ftype="tabular" />
            <output name="outfile_fit" file="prp_model05" ftype="h5mlm" compare="sim_size" delta="5" />
        </test>
        <!-- Tabular input: StandardScaler. -->
        <test>
            <param name="infile" value="train.tabular" ftype="tabular" />
            <param name="selected_input_type" value="tabular" />
            <param name="selected_column_selector_option" value="all_columns" />
            <param name="selected_pre_processor" value="StandardScaler" />
            <param name="save" value="true" />
            <output name="outfile_transform" file="prp_result07" ftype="tabular" />
            <output name="outfile_fit" file="prp_model07" ftype="h5mlm" compare="sim_size" delta="5" />
        </test>
        <!-- Sparse input: MaxAbsScaler. -->
        <test>
            <param name="infile" value="csr_sparse2.mtx" ftype="txt" />
            <param name="selected_input_type" value="sparse" />
            <param name="selected_pre_processor" value="MaxAbsScaler" />
            <param name="save" value="true" />
            <output name="outfile_transform" file="prp_result08" ftype="tabular" />
            <output name="outfile_fit" file="prp_model08" ftype="h5mlm" compare="sim_size" delta="5" />
        </test>
        <!-- Sparse input: Normalizer. -->
        <test>
            <param name="infile" value="csr_sparse2.mtx" ftype="txt" />
            <param name="selected_input_type" value="sparse" />
            <param name="selected_pre_processor" value="Normalizer" />
            <param name="save" value="true" />
            <output name="outfile_transform" file="prp_result09" ftype="tabular" />
            <output name="outfile_fit" file="prp_model09" ftype="h5mlm" compare="sim_size" delta="5" />
        </test>
        <!-- Tabular input with header row: MinMaxScaler with a custom feature_range, model not saved. -->
        <test>
            <param name="infile" value="regression_X.tabular" ftype="tabular" />
            <param name="header1" value="true" />
            <param name="selected_column_selector_option" value="all_columns" />
            <param name="selected_input_type" value="tabular" />
            <param name="selected_pre_processor" value="MinMaxScaler" />
            <param name="feature_range" value="(-1, 1)" />
            <param name="save" value="false" />
            <output name="outfile_transform" file="prp_result10" ftype="tabular" />
        </test>
    </tests>
    <help>
        <![CDATA[
**What it does**
This tool provides several transformer classes to change raw feature vectors into a representation that is more suitable for the downstream estimators. The library is provided by sklearn.preprocessing package.

For information about preprocessing classes and parameter settings please refer to `Scikit-learn preprocessing`_.

.. _`Scikit-learn preprocessing`: http://scikit-learn.org/stable/modules/preprocessing.html
        ]]>
    </help>
    <expand macro="sklearn_citation" />
</tool>