Mercurial > repos > bgruening > sklearn_data_preprocess
diff pre_process.xml @ 0:29899feb4d44 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tools/sklearn commit 0e582cf1f3134c777cce3aa57d71b80ed95e6ba9
author | bgruening |
---|---|
date | Fri, 16 Feb 2018 09:18:41 -0500 |
parents | |
children | dad38f036e83 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pre_process.xml Fri Feb 16 09:18:41 2018 -0500 @@ -0,0 +1,229 @@ +<tool id="sklearn_data_preprocess" name="Preprocess" version="@VERSION@"> + <description>raw feature vectors into standardized datasets</description> + <macros> + <import>main_macros.xml</import> + </macros> + <expand macro="python_requirements"/> + <expand macro="macro_stdio"/> + <version_command>echo "@VERSION@"</version_command> + <command> + <![CDATA[ + python "$pre_processor_script" '$inputs' + ]]> + </command> + <configfiles> + <inputs name="inputs" /> + <configfile name="pre_processor_script"> + <![CDATA[ +import sys +import json +import pandas +import pickle +import numpy as np +from scipy.io import mmread +from scipy.io import mmwrite +from sklearn import preprocessing + +input_json_path = sys.argv[1] +params = json.load(open(input_json_path, "r")) + +#if $input_type.selected_input_type == "sparse": +X = mmread(open("$infile", 'r')) +#else: +X = pandas.read_csv("$infile", sep='\t', header=None, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False ) +#end if + +#if $input_type.pre_processors.infile_transform.ext == 'txt': +y = mmread(open("$infile", 'r')) +#else: +y = pandas.read_csv("$infile", sep='\t', header=None, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False ) +#end if + +preprocessor = params["input_type"]["pre_processors"]["selected_pre_processor"] +options = params["input_type"]["pre_processors"]["options"] + +my_class = getattr(preprocessing, preprocessor) +estimator = my_class(**options) +estimator.fit(X) +result = estimator.transform(y) + +#if $input_type.pre_processors.infile_transform.ext == 'txt': +mmwrite(open("$outfile_transform" , 'w+'), result) +#else: +res = pandas.DataFrame(result) +res.to_csv(path_or_buf = "$outfile_transform", sep="\t", index=False, header=None) +#end if + +#if $save: +pickle.dump(estimator,open("$outfile_fit", 'w+'), pickle.HIGHEST_PROTOCOL) +#end if + ]]> + </configfile> + </configfiles> + <inputs> + <conditional name="input_type"> + <param name="selected_input_type" type="select" label="Select the type of your input data:"> + <option value="tabular" selected="true">Tabular</option> + <option value="sparse">Sparse</option> + </param> + <when value="tabular"> + <param name="infile" type="data" format="tabular" label="Select a tabular file you want to train your preprocessor on its data:"/> + <conditional name="pre_processors"> + <expand macro="sparse_preprocessors"> + <option value="KernelCenterer">Kernel Centerer (Centers a kernel matrix)</option> + <option value="MinMaxScaler">Minmax Scaler (Scales features to a range)</option> + <option value="PolynomialFeatures">Polynomial Features (Generates polynomial and interaction features)</option> + <option value="RobustScaler">Robust Scaler (Scales features using outlier-invariance statistics)</option> + </expand> + <expand macro="sparse_preprocessor_options"> + <when value="KernelCenterer"> + <expand macro="multitype_input"/> + <section name="options" title="Advanced Options" expanded="False"> + </section> + </when> + <when value="MinMaxScaler"> + <expand macro="multitype_input"/> + <section name="options" title="Advanced Options" expanded="False"> + <!--feature_range--> + <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" + label="Use a copy of data for precomputing normalization" help=" "/> + </section> + </when> + <when value="PolynomialFeatures"> + <expand macro="multitype_input"/> + <section name="options" title="Advanced Options" expanded="False"> + <param argument="degree" type="integer" optional="true" value="2" label="The degree of the polynomial features " help=""/> + <param argument="interaction_only" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="false" label="Produce interaction features only" help="(Features that are products of at most degree distinct input features) "/> + <param argument="include_bias" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Include a bias column" help="Feature in which all polynomial powers are zero "/> + </section> + </when> + <when value="RobustScaler"> + <expand macro="multitype_input"/> + <section name="options" title="Advanced Options" expanded="False"> + <!--=True, =True, copy=True--> + <param argument="with_centering" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" + label="Center the data before scaling" help=" "/> + <param argument="with_scaling" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" + label="Scale the data to interquartile range" help=" "/> + <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" + label="Use a copy of data for inplace scaling" help=" "/> + </section> + </when> + </expand> + </conditional> + </when> + <when value="sparse"> + <param name="infile" type="data" format="txt" label="Select a sparse representation you want to train your preprocessor on its data:"/> + <conditional name="pre_processors"> + <expand macro="sparse_preprocessors"/> + <expand macro="sparse_preprocessor_options"/> + </conditional> + </when> + </conditional> + <param name="save" type="boolean" truevalue="booltrue" falsevalue="boolflase" checked="false" + label="Save the preprocessor" + help="Saves the preprocessor after fitting to the data. The preprocessor can then be passed to other tools and used in later operations."/> + </inputs> + <outputs> + <data format="tabular" name="outfile_transform" from_work_dir="./output"/> + <data format="zip" name="outfile_fit"> + <filter>save</filter> + </data> + </outputs> + <tests> + <test> + <param name="infile" value="train.tabular" ftype="tabular"/> + <param name="infile_transform" value="train.tabular" ftype="tabular"/> + <param name="selected_input_type" value="tabular"/> + <param name="selected_pre_processor" value="KernelCenterer"/> + <param name="save" value="true"/> + <output name="outfile_transform" file="prp_result01" ftype="tabular"/> + <output name="outfile_fit" file="prp_model01" ftype="zip" compare="sim_size" delta="500"/> + </test> + <test> + <param name="infile" value="train.tabular" ftype="tabular"/> + <param name="infile_transform" value="train.tabular" ftype="tabular"/> + <param name="selected_input_type" value="tabular"/> + <param name="selected_pre_processor" value="MinMaxScaler"/> + <param name="save" value="true"/> + <output name="outfile_transform" file="prp_result02" ftype="tabular"/> + <output name="outfile_fit" file="prp_model02" ftype="zip" compare="sim_size" delta="500"/> + </test> + <test> + <param name="infile" value="train.tabular" ftype="tabular"/> + <param name="infile_transform" value="train.tabular" ftype="tabular"/> + <param name="selected_input_type" value="tabular"/> + <param name="selected_pre_processor" value="PolynomialFeatures"/> + <param name="save" value="true"/> + <output name="outfile_transform" file="prp_result03" ftype="tabular"/> + <output name="outfile_fit" file="prp_model03" ftype="zip" compare="sim_size" delta="500"/> + </test> + <test> + <param name="infile" value="train.tabular" ftype="tabular"/> + <param name="infile_transform" value="train.tabular" ftype="tabular"/> + <param name="selected_input_type" value="tabular"/> + <param name="selected_pre_processor" value="RobustScaler"/> + <param name="save" value="true"/> + <output name="outfile_transform" file="prp_result04" ftype="tabular"/> + <output name="outfile_fit" file="prp_model04" ftype="zip" compare="sim_size" delta="500"/> + </test> + <test> + <param name="infile" value="csr_sparse2.mtx" ftype="txt"/> + <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/> + <param name="selected_input_type" value="sparse"/> + <param name="selected_pre_processor" value="Binarizer"/> + <param name="save" value="true"/> + <output name="outfile_transform" file="prp_result05" ftype="tabular"/> + <output name="outfile_fit" file="prp_model05" ftype="zip" compare="sim_size" delta="500"/> + </test> + <test> + <param name="infile" value="csr_sparse2.mtx" ftype="txt"/> + <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/> + <param name="selected_input_type" value="sparse"/> + <param name="selected_pre_processor" value="Imputer"/> + <param name="save" value="true"/> + <param name="axis" value="true"/> + <output name="outfile_transform" file="prp_result06" ftype="tabular"/> + <output name="outfile_fit" file="prp_model06" ftype="zip" compare="sim_size" delta="500"/> + </test> + <test> + <param name="infile" value="train.tabular" ftype="tabular"/> + <param name="infile_transform" value="train.tabular" ftype="tabular"/> + <param name="selected_input_type" value="tabular"/> + <param name="selected_pre_processor" value="StandardScaler"/> + <param name="save" value="true"/> + <output name="outfile_transform" file="prp_result07" ftype="tabular"/> + <output name="outfile_fit" file="prp_model07" ftype="zip" compare="sim_size" delta="500"/> + </test> + <test> + <param name="infile" value="csr_sparse2.mtx" ftype="txt"/> + <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/> + <param name="selected_input_type" value="sparse"/> + <param name="selected_pre_processor" value="MaxAbsScaler"/> + <param name="save" value="true"/> + <output name="outfile_transform" file="prp_result08" ftype="tabular"/> + <output name="outfile_fit" file="prp_model08" ftype="zip" compare="sim_size" delta="500"/> + </test> + <test> + <param name="infile" value="csr_sparse2.mtx" ftype="txt"/> + <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/> + <param name="selected_input_type" value="sparse"/> + <param name="selected_pre_processor" value="Normalizer"/> + <param name="save" value="true"/> + <output name="outfile_transform" file="prp_result09" ftype="tabular"/> + <output name="outfile_fit" file="prp_model09" ftype="zip" compare="sim_size" delta="500"/> + </test> + </tests> + <help> + <![CDATA[ +**What it does** + +This tool provides several transformer classes to change raw feature vectors into a representation that is more suitable for the downstream estimators. The library is provided by sklearn.preprocessing package. + +For information about preprocessing classes and parameter settings please refer to `Scikit-learn preprocessing`_. + +.. _`Scikit-learn preprocessing`: http://scikit-learn.org/stable/modules/preprocessing.html + ]]> + </help> + <expand macro="sklearn_citation"/> +</tool>