Mercurial > repos > bgruening > sklearn_data_preprocess
comparison pre_process.xml @ 0:29899feb4d44 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tools/sklearn commit 0e582cf1f3134c777cce3aa57d71b80ed95e6ba9
author | bgruening |
---|---|
date | Fri, 16 Feb 2018 09:18:41 -0500 |
parents | |
children | dad38f036e83 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:29899feb4d44 |
---|---|
1 <tool id="sklearn_data_preprocess" name="Preprocess" version="@VERSION@"> | |
2 <description>raw feature vectors into standardized datasets</description> | |
3 <macros> | |
4 <import>main_macros.xml</import> | |
5 </macros> | |
6 <expand macro="python_requirements"/> | |
7 <expand macro="macro_stdio"/> | |
8 <version_command>echo "@VERSION@"</version_command> | |
9 <!-- Runs the generated pre_processor_script configfile with Python; its single argument is the path of the "inputs" configfile (JSON dump of the tool parameters). --> <command> | |
10 <![CDATA[ | |
11 python "$pre_processor_script" '$inputs' | |
12 ]]> | |
13 </command> | |
14 <configfiles> | |
15 <inputs name="inputs" /> | |
16 <configfile name="pre_processor_script"> | |
17 <![CDATA[ | |
18 import sys | |
19 import json | |
20 import pandas | |
21 import pickle | |
22 import numpy as np | |
23 from scipy.io import mmread | |
24 from scipy.io import mmwrite | |
25 from sklearn import preprocessing | |
26 ## The only command line argument is the path of the JSON dump of the tool parameters. | |
27 input_json_path = sys.argv[1] | |
28 params = json.load(open(input_json_path, "r")) | |
29 ## X is the data the preprocessor is fitted on: Matrix Market when sparse, headerless tab separated text otherwise. | |
30 #if $input_type.selected_input_type == "sparse": | |
31 X = mmread(open("$infile", 'r')) | |
32 #else: | |
33 X = pandas.read_csv("$infile", sep='\t', header=None, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False ) | |
34 #end if | |
35 ## y is the data to transform. It is read from infile_transform (not from the training file), using the reader that matches its datatype. | |
36 #if $input_type.pre_processors.infile_transform.ext == 'txt': | |
37 y = mmread(open("$input_type.pre_processors.infile_transform", 'r')) | |
38 #else: | |
39 y = pandas.read_csv("$input_type.pre_processors.infile_transform", sep='\t', header=None, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False ) | |
40 #end if | |
41 ## The selected class is looked up in sklearn.preprocessing and built with the "options" section as keyword arguments. | |
42 preprocessor = params["input_type"]["pre_processors"]["selected_pre_processor"] | |
43 options = params["input_type"]["pre_processors"]["options"] | |
44 | |
45 my_class = getattr(preprocessing, preprocessor) | |
46 estimator = my_class(**options) | |
47 estimator.fit(X) | |
48 result = estimator.transform(y) | |
49 ## The result is written as Matrix Market when infile_transform is a txt dataset, otherwise as headerless tab separated text. | |
50 #if $input_type.pre_processors.infile_transform.ext == 'txt': | |
51 mmwrite(open("$outfile_transform" , 'w+'), result) | |
52 #else: | |
53 res = pandas.DataFrame(result) | |
54 res.to_csv(path_or_buf = "$outfile_transform", sep="\t", index=False, header=None) | |
55 #end if | |
56 ## Pickle with HIGHEST_PROTOCOL is a binary format, so the model file is opened in binary mode. | |
57 #if $save: | |
58 pickle.dump(estimator,open("$outfile_fit", 'wb'), pickle.HIGHEST_PROTOCOL) | |
59 #end if | |
60 ]]> | |
61 </configfile> | |
62 </configfiles> | |
63 <inputs> | |
64 <conditional name="input_type"> | |
65 <param name="selected_input_type" type="select" label="Select the type of your input data:"> | |
66 <option value="tabular" selected="true">Tabular</option> | |
67 <option value="sparse">Sparse</option> | |
68 </param> | |
69 <when value="tabular"> | |
70 <param name="infile" type="data" format="tabular" label="Select a tabular file you want to train your preprocessor on its data:"/> | |
71 <conditional name="pre_processors"> | |
72 <expand macro="sparse_preprocessors"> | |
73 <option value="KernelCenterer">Kernel Centerer (Centers a kernel matrix)</option> | |
74 <option value="MinMaxScaler">Minmax Scaler (Scales features to a range)</option> | |
75 <option value="PolynomialFeatures">Polynomial Features (Generates polynomial and interaction features)</option> | |
76 <option value="RobustScaler">Robust Scaler (Scales features using outlier-invariance statistics)</option> | |
77 </expand> | |
78 <expand macro="sparse_preprocessor_options"> | |
79 <when value="KernelCenterer"> | |
80 <expand macro="multitype_input"/> | |
81 <section name="options" title="Advanced Options" expanded="False"> | |
82 </section> | |
83 </when> | |
84 <when value="MinMaxScaler"> | |
85 <expand macro="multitype_input"/> | |
86 <section name="options" title="Advanced Options" expanded="False"> | |
87 <!-- TODO: feature_range is not exposed in this section, so the estimator is built without it. The params of this section are passed to the estimator as keyword arguments by the script in this file. --> | |
88 <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" | |
89 label="Use a copy of data for precomputing normalization" help=" "/> | |
90 </section> | |
91 </when> | |
92 <when value="PolynomialFeatures"> | |
93 <expand macro="multitype_input"/> | |
94 <section name="options" title="Advanced Options" expanded="False"> | |
95 <param argument="degree" type="integer" optional="true" value="2" label="The degree of the polynomial features " help=""/> | |
96 <param argument="interaction_only" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="false" label="Produce interaction features only" help="(Features that are products of at most degree distinct input features) "/> | |
97 <param argument="include_bias" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" label="Include a bias column" help="Feature in which all polynomial powers are zero "/> | |
98 </section> | |
99 </when> | |
100 <when value="RobustScaler"> | |
101 <expand macro="multitype_input"/> | |
102 <section name="options" title="Advanced Options" expanded="False"> | |
103 <!-- Exposes with_centering, with_scaling and copy; all three are checked by default. The params of this section are passed to the estimator as keyword arguments by the script in this file. --> | |
104 <param argument="with_centering" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" | |
105 label="Center the data before scaling" help=" "/> | |
106 <param argument="with_scaling" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" | |
107 label="Scale the data to interquartile range" help=" "/> | |
108 <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolflase" checked="true" | |
109 label="Use a copy of data for inplace scaling" help=" "/> | |
110 </section> | |
111 </when> | |
112 </expand> | |
113 </conditional> | |
114 </when> | |
115 <when value="sparse"> | |
116 <param name="infile" type="data" format="txt" label="Select a sparse representation you want to train your preprocessor on its data:"/> | |
117 <conditional name="pre_processors"> | |
118 <expand macro="sparse_preprocessors"/> | |
119 <expand macro="sparse_preprocessor_options"/> | |
120 </conditional> | |
121 </when> | |
122 </conditional> | |
123 <param name="save" type="boolean" truevalue="booltrue" falsevalue="boolflase" checked="false" | |
124 label="Save the preprocessor" | |
125 help="Saves the preprocessor after fitting to the data. The preprocessor can then be passed to other tools and used in later operations."/> | |
126 </inputs> | |
127 <outputs> <!-- outfile_transform receives the transformed data; outfile_fit (the pickled estimator) is only produced when "save" is checked. NOTE(review): from_work_dir="./output" is declared, but the script in this file writes to the outfile_transform path directly; confirm whether from_work_dir is still needed. --> | |
128 <data format="tabular" name="outfile_transform" from_work_dir="./output"/> | |
129 <data format="zip" name="outfile_fit"> | |
130 <filter>save</filter> | |
131 </data> | |
132 </outputs> | |
133 <tests> <!-- One test per preprocessor, each saving the fitted model. Every test passes the same file as infile and infile_transform, so these tests cannot tell the fitting data apart from the data being transformed. NOTE(review): the Imputer test sets axis to "true"; the axis param is defined outside this file, confirm the expected value type. --> | |
134 <test> | |
135 <param name="infile" value="train.tabular" ftype="tabular"/> | |
136 <param name="infile_transform" value="train.tabular" ftype="tabular"/> | |
137 <param name="selected_input_type" value="tabular"/> | |
138 <param name="selected_pre_processor" value="KernelCenterer"/> | |
139 <param name="save" value="true"/> | |
140 <output name="outfile_transform" file="prp_result01" ftype="tabular"/> | |
141 <output name="outfile_fit" file="prp_model01" ftype="zip" compare="sim_size" delta="500"/> | |
142 </test> | |
143 <test> | |
144 <param name="infile" value="train.tabular" ftype="tabular"/> | |
145 <param name="infile_transform" value="train.tabular" ftype="tabular"/> | |
146 <param name="selected_input_type" value="tabular"/> | |
147 <param name="selected_pre_processor" value="MinMaxScaler"/> | |
148 <param name="save" value="true"/> | |
149 <output name="outfile_transform" file="prp_result02" ftype="tabular"/> | |
150 <output name="outfile_fit" file="prp_model02" ftype="zip" compare="sim_size" delta="500"/> | |
151 </test> | |
152 <test> | |
153 <param name="infile" value="train.tabular" ftype="tabular"/> | |
154 <param name="infile_transform" value="train.tabular" ftype="tabular"/> | |
155 <param name="selected_input_type" value="tabular"/> | |
156 <param name="selected_pre_processor" value="PolynomialFeatures"/> | |
157 <param name="save" value="true"/> | |
158 <output name="outfile_transform" file="prp_result03" ftype="tabular"/> | |
159 <output name="outfile_fit" file="prp_model03" ftype="zip" compare="sim_size" delta="500"/> | |
160 </test> | |
161 <test> | |
162 <param name="infile" value="train.tabular" ftype="tabular"/> | |
163 <param name="infile_transform" value="train.tabular" ftype="tabular"/> | |
164 <param name="selected_input_type" value="tabular"/> | |
165 <param name="selected_pre_processor" value="RobustScaler"/> | |
166 <param name="save" value="true"/> | |
167 <output name="outfile_transform" file="prp_result04" ftype="tabular"/> | |
168 <output name="outfile_fit" file="prp_model04" ftype="zip" compare="sim_size" delta="500"/> | |
169 </test> | |
170 <test> | |
171 <param name="infile" value="csr_sparse2.mtx" ftype="txt"/> | |
172 <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/> | |
173 <param name="selected_input_type" value="sparse"/> | |
174 <param name="selected_pre_processor" value="Binarizer"/> | |
175 <param name="save" value="true"/> | |
176 <output name="outfile_transform" file="prp_result05" ftype="tabular"/> | |
177 <output name="outfile_fit" file="prp_model05" ftype="zip" compare="sim_size" delta="500"/> | |
178 </test> | |
179 <test> | |
180 <param name="infile" value="csr_sparse2.mtx" ftype="txt"/> | |
181 <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/> | |
182 <param name="selected_input_type" value="sparse"/> | |
183 <param name="selected_pre_processor" value="Imputer"/> | |
184 <param name="save" value="true"/> | |
185 <param name="axis" value="true"/> | |
186 <output name="outfile_transform" file="prp_result06" ftype="tabular"/> | |
187 <output name="outfile_fit" file="prp_model06" ftype="zip" compare="sim_size" delta="500"/> | |
188 </test> | |
189 <test> | |
190 <param name="infile" value="train.tabular" ftype="tabular"/> | |
191 <param name="infile_transform" value="train.tabular" ftype="tabular"/> | |
192 <param name="selected_input_type" value="tabular"/> | |
193 <param name="selected_pre_processor" value="StandardScaler"/> | |
194 <param name="save" value="true"/> | |
195 <output name="outfile_transform" file="prp_result07" ftype="tabular"/> | |
196 <output name="outfile_fit" file="prp_model07" ftype="zip" compare="sim_size" delta="500"/> | |
197 </test> | |
198 <test> | |
199 <param name="infile" value="csr_sparse2.mtx" ftype="txt"/> | |
200 <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/> | |
201 <param name="selected_input_type" value="sparse"/> | |
202 <param name="selected_pre_processor" value="MaxAbsScaler"/> | |
203 <param name="save" value="true"/> | |
204 <output name="outfile_transform" file="prp_result08" ftype="tabular"/> | |
205 <output name="outfile_fit" file="prp_model08" ftype="zip" compare="sim_size" delta="500"/> | |
206 </test> | |
207 <test> | |
208 <param name="infile" value="csr_sparse2.mtx" ftype="txt"/> | |
209 <param name="infile_transform" value="csr_sparse2.mtx" ftype="txt"/> | |
210 <param name="selected_input_type" value="sparse"/> | |
211 <param name="selected_pre_processor" value="Normalizer"/> | |
212 <param name="save" value="true"/> | |
213 <output name="outfile_transform" file="prp_result09" ftype="tabular"/> | |
214 <output name="outfile_fit" file="prp_model09" ftype="zip" compare="sim_size" delta="500"/> | |
215 </test> | |
216 </tests> | |
217 <help> | |
218 <![CDATA[ | |
219 **What it does** | |
220 | |
221 This tool provides several transformer classes that change raw feature vectors into a representation that is more suitable for downstream estimators. These classes are provided by the sklearn.preprocessing package. | |
222 | |
223 For information about preprocessing classes and parameter settings please refer to `Scikit-learn preprocessing`_. | |
224 | |
225 .. _`Scikit-learn preprocessing`: http://scikit-learn.org/stable/modules/preprocessing.html | |
226 ]]> | |
227 </help> | |
228 <expand macro="sklearn_citation"/> | |
229 </tool> |