Mercurial > repos > bgruening > sklearn_data_preprocess
comparison pre_process.xml @ 26:685046e0381a draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 60f0fbc0eafd7c11bc60fb6c77f2937782efd8a9-dirty
author | bgruening |
---|---|
date | Fri, 09 Aug 2019 07:16:21 -0400 |
parents | 9e43ee712723 |
children | eb79bde99328 |
Comparison legend: equal, deleted, inserted, replaced
25:9ac0b78c6b6d | 26:685046e0381a |
---|---|
17 <![CDATA[ | 17 <![CDATA[ |
18 import sys | 18 import sys |
19 import json | 19 import json |
20 import pandas | 20 import pandas |
21 import pickle | 21 import pickle |
| | 22 |
22 from scipy.io import mmread | 23 from scipy.io import mmread |
23 from scipy.io import mmwrite | 24 from scipy.io import mmwrite |
24 from sklearn import preprocessing | 25 from sklearn import preprocessing |
25 | 26 from galaxy_ml.utils import read_columns, SafeEval |
26 sys.path.insert(0, '$__tool_directory__') | 27 |
27 from utils import read_columns | 28 |
| | 29 safe_eval = SafeEval() |
28 | 30 |
29 input_json_path = sys.argv[1] | 31 input_json_path = sys.argv[1] |
30 with open(input_json_path, "r") as param_handler: | 32 with open(input_json_path, "r") as param_handler: |
31 params = json.load(param_handler) | 33 params = json.load(param_handler) |
32 | 34 |
37 column_option = params["input_type"]["column_selector_options_1"]["selected_column_selector_option"] | 39 column_option = params["input_type"]["column_selector_options_1"]["selected_column_selector_option"] |
38 if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]: | 40 if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]: |
39 c = params["input_type"]["column_selector_options_1"]["col1"] | 41 c = params["input_type"]["column_selector_options_1"]["col1"] |
40 else: | 42 else: |
41 c = None | 43 c = None |
42 X = read_columns( | 44 X, input_df = read_columns( |
43 "$input_type.infile", | 45 "$input_type.infile", |
44 c = c, | 46 c=c, |
45 c_option = column_option, | 47 c_option=column_option, |
| | 48 return_df=True, |
46 sep='\t', | 49 sep='\t', |
47 header=header, | 50 header=header, |
48 parse_dates=True, | 51 parse_dates=True, |
49 encoding=None, | 52 encoding=None, |
50 index_col=None, | 53 index_col=None, |
51 tupleize_cols=False).astype(float) | 54 tupleize_cols=False) |
| | 55 X = X.astype(float) |
52 #end if | 56 #end if |
53 | 57 |
54 preprocessor = params["input_type"]["pre_processors"]["selected_pre_processor"] | 58 preprocessor = params["input_type"]["pre_processors"]["selected_pre_processor"] |
55 options = params["input_type"]["pre_processors"]["options"] | 59 options = params["input_type"]["pre_processors"]["options"] |
| | 60 if 'feature_range' in options: |
| | 61 feature_range = safe_eval(options['feature_range'].strip()) |
| | 62 if not feature_range: |
| | 63 feature_range = (0, 1) |
| | 64 options['feature_range'] = feature_range |
56 | 65 |
57 my_class = getattr(preprocessing, preprocessor) | 66 my_class = getattr(preprocessing, preprocessor) |
58 estimator = my_class(**options) | 67 estimator = my_class(**options) |
59 estimator.fit(X) | 68 estimator.fit(X) |
60 result = estimator.transform(X) | 69 result = estimator.transform(X) |
61 | 70 |
62 #if $input_type.selected_input_type == "sparse": | 71 #if $input_type.selected_input_type == "sparse": |
63 with open("$outfile_transform", "wb") as transform_handler: | 72 with open("$outfile_transform", "wb") as transform_handler: |
64 mmwrite(transform_handler, result) | 73 mmwrite(transform_handler, result) |
65 #else: | 74 #else: |
66 res = pandas.DataFrame(result) | 75 columns = input_df.columns |
67 res.to_csv(path_or_buf = "$outfile_transform", sep="\t", index=False, header=None) | 76 if preprocessor == 'PolynomialFeatures': |
| | 77 columns = None |
| | 78 header = False |
| | 79 res = pandas.DataFrame(result, columns=columns) |
| | 80 res.to_csv(path_or_buf = "$outfile_transform", sep="\t", |
| | 81 index=False, header=True if header else False) |
68 #end if | 82 #end if |
69 | 83 |
70 #if $save: | 84 #if $save: |
71 with open("$outfile_fit", 'wb') as out_handler: | 85 with open("$outfile_fit", 'wb') as out_handler: |
72 pickle.dump(estimator, out_handler, pickle.HIGHEST_PROTOCOL) | 86 pickle.dump(estimator, out_handler, pickle.HIGHEST_PROTOCOL) |
153 <param name="save" value="true"/> | 167 <param name="save" value="true"/> |
154 <output name="outfile_transform" file="prp_result05" ftype="tabular"/> | 168 <output name="outfile_transform" file="prp_result05" ftype="tabular"/> |
155 <output name="outfile_fit" file="prp_model05" ftype="zip" compare="sim_size" delta="5"/> | 169 <output name="outfile_fit" file="prp_model05" ftype="zip" compare="sim_size" delta="5"/> |
156 </test> | 170 </test> |
157 <test> | 171 <test> |
158 <param name="infile" value="csr_sparse2.mtx" ftype="txt"/> | |
159 <param name="selected_input_type" value="sparse"/> | |
160 <param name="selected_pre_processor" value="Imputer"/> | |
161 <param name="save" value="true"/> | |
162 <param name="axis" value="true"/> | |
163 <output name="outfile_transform" file="prp_result06" ftype="tabular"/> | |
164 <output name="outfile_fit" file="prp_model06" ftype="zip" compare="sim_size" delta="50"/> | |
165 </test> | |
166 <test> | |
167 <param name="infile" value="train.tabular" ftype="tabular"/> | 172 <param name="infile" value="train.tabular" ftype="tabular"/> |
168 <param name="selected_input_type" value="tabular"/> | 173 <param name="selected_input_type" value="tabular"/> |
169 <param name="selected_column_selector_option" value="all_columns"/> | 174 <param name="selected_column_selector_option" value="all_columns"/> |
170 <param name="selected_pre_processor" value="StandardScaler"/> | 175 <param name="selected_pre_processor" value="StandardScaler"/> |
171 <param name="save" value="true"/> | 176 <param name="save" value="true"/> |
185 <param name="selected_input_type" value="sparse"/> | 190 <param name="selected_input_type" value="sparse"/> |
186 <param name="selected_pre_processor" value="Normalizer"/> | 191 <param name="selected_pre_processor" value="Normalizer"/> |
187 <param name="save" value="true"/> | 192 <param name="save" value="true"/> |
188 <output name="outfile_transform" file="prp_result09" ftype="tabular"/> | 193 <output name="outfile_transform" file="prp_result09" ftype="tabular"/> |
189 <output name="outfile_fit" file="prp_model09" ftype="zip" compare="sim_size" delta="5"/> | 194 <output name="outfile_fit" file="prp_model09" ftype="zip" compare="sim_size" delta="5"/> |
| | 195 </test> |
| | 196 <test> |
| | 197 <param name="infile" value="regression_X.tabular" ftype="tabular"/> |
| | 198 <param name="header1" value="true"/> |
| | 199 <param name="selected_column_selector_option" value="all_columns"/> |
| | 200 <param name="selected_input_type" value="tabular"/> |
| | 201 <param name="selected_pre_processor" value="MinMaxScaler"/> |
| | 202 <param name="feature_range" value="(-1, 1)"/> |
| | 203 <param name="save" value="false"/> |
| | 204 <output name="outfile_transform" file="prp_result10" ftype="tabular"/> |
190 </test> | 205 </test> |
191 </tests> | 206 </tests> |
192 <help> | 207 <help> |
193 <![CDATA[ | 208 <![CDATA[ |
194 **What it does** | 209 **What it does** |