Mercurial > repos > bgruening > sklearn_data_preprocess
diff pre_process.xml @ 26:685046e0381a draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 60f0fbc0eafd7c11bc60fb6c77f2937782efd8a9-dirty
author | bgruening |
---|---|
date | Fri, 09 Aug 2019 07:16:21 -0400 |
parents | 9e43ee712723 |
children | eb79bde99328 |
line wrap: on
line diff
```diff
--- a/pre_process.xml Tue Jul 09 19:35:04 2019 -0400
+++ b/pre_process.xml Fri Aug 09 07:16:21 2019 -0400
@@ -19,12 +19,14 @@
 import json
 import pandas
 import pickle
+
 from scipy.io import mmread
 from scipy.io import mmwrite
 from sklearn import preprocessing
+from galaxy_ml.utils import read_columns, SafeEval
 
-sys.path.insert(0, '$__tool_directory__')
-from utils import read_columns
+
+safe_eval = SafeEval()
 
 input_json_path = sys.argv[1]
 with open(input_json_path, "r") as param_handler:
@@ -39,20 +41,27 @@
     c = params["input_type"]["column_selector_options_1"]["col1"]
 else:
     c = None
-X = read_columns(
+X, input_df = read_columns(
         "$input_type.infile",
-        c = c,
-        c_option = column_option,
+        c=c,
+        c_option=column_option,
+        return_df=True,
         sep='\t',
         header=header,
         parse_dates=True,
         encoding=None,
         index_col=None,
-        tupleize_cols=False).astype(float)
+        tupleize_cols=False)
+X = X.astype(float)
 #end if
 
 preprocessor = params["input_type"]["pre_processors"]["selected_pre_processor"]
 options = params["input_type"]["pre_processors"]["options"]
+if 'feature_range' in options:
+    feature_range = safe_eval(options['feature_range'].strip())
+    if not feature_range:
+        feature_range = (0, 1)
+    options['feature_range'] = feature_range
 
 my_class = getattr(preprocessing, preprocessor)
 estimator = my_class(**options)
@@ -63,8 +72,13 @@
 with open("$outfile_transform", "wb") as transform_handler:
     mmwrite(transform_handler, result)
 #else:
-res = pandas.DataFrame(result)
-res.to_csv(path_or_buf = "$outfile_transform", sep="\t", index=False, header=None)
+columns = input_df.columns
+if preprocessor == 'PolynomialFeatures':
+    columns = None
+    header = False
+res = pandas.DataFrame(result, columns=columns)
+res.to_csv(path_or_buf = "$outfile_transform", sep="\t",
+           index=False, header=True if header else False)
 #end if
 
 #if $save:
@@ -155,15 +169,6 @@
             <output name="outfile_fit" file="prp_model05" ftype="zip" compare="sim_size" delta="5"/>
         </test>
         <test>
-            <param name="infile" value="csr_sparse2.mtx" ftype="txt"/>
-            <param name="selected_input_type" value="sparse"/>
-            <param name="selected_pre_processor" value="Imputer"/>
-            <param name="save" value="true"/>
-            <param name="axis" value="true"/>
-            <output name="outfile_transform" file="prp_result06" ftype="tabular"/>
-            <output name="outfile_fit" file="prp_model06" ftype="zip" compare="sim_size" delta="50"/>
-        </test>
-        <test>
             <param name="infile" value="train.tabular" ftype="tabular"/>
             <param name="selected_input_type" value="tabular"/>
             <param name="selected_column_selector_option" value="all_columns"/>
@@ -188,6 +193,16 @@
             <output name="outfile_transform" file="prp_result09" ftype="tabular"/>
             <output name="outfile_fit" file="prp_model09" ftype="zip" compare="sim_size" delta="5"/>
         </test>
+        <test>
+            <param name="infile" value="regression_X.tabular" ftype="tabular"/>
+            <param name="header1" value="true"/>
+            <param name="selected_column_selector_option" value="all_columns"/>
+            <param name="selected_input_type" value="tabular"/>
+            <param name="selected_pre_processor" value="MinMaxScaler"/>
+            <param name="feature_range" value="(-1, 1)"/>
+            <param name="save" value="false"/>
+            <output name="outfile_transform" file="prp_result10" ftype="tabular"/>
+        </test>
     </tests>
     <help>
         <![CDATA[
```