comparison fitted_model_eval.py @ 3:20bb2a45f922 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit eb703290e2589561ea215c84aa9f71bcfe1712c6"
author bgruening
date Fri, 01 Nov 2019 17:12:39 -0400
parents
children ead7adad8d0e
comparison
equal deleted inserted replaced
2:e23cfe4be9d4 3:20bb2a45f922
1 import argparse
2 import json
3 import pandas as pd
4 import warnings
5
6 from scipy.io import mmread
7 from sklearn.pipeline import Pipeline
8 from sklearn.metrics.scorer import _check_multimetric_scoring
9 from sklearn.model_selection._validation import _score
10 from galaxy_ml.utils import get_scoring, load_model, read_columns
11
12
def _get_X_y(params, infile1, infile2):
    """Read the feature matrix X and the target values y from the inputs.

    Parameters
    ----------
    params : dict
        Tool inputs parameter
    infile1 : str
        File path to dataset containing features
    infile2 : str
        File path to dataset containing target values

    Returns
    -------
    X
        For 'tabular' input, the columns selected by `read_columns`, cast
        to float. For 'sparse' input, whatever `mmread` returns for
        `infile1`.
    y
        The target columns selected by `read_columns`; flattened with
        `ravel()` when the result has shape (n, 1).

    Raises
    ------
    ValueError
        If the selected input type is neither 'tabular' nor 'sparse'.
    """
    # Cache of already parsed tables, keyed by path + header mode, so that
    # a file shared by features and targets is only parsed once.
    loaded_df = {}

    input_options = params['input_options']
    input_type = input_options['selected_input']

    if input_type == 'tabular':
        header = 'infer' if input_options['header1'] else None
        column_option = (input_options['column_selector_options_1']
                         ['selected_column_selector_option'])
        if column_option in ['by_index_number', 'all_but_by_index_number',
                             'by_header_name', 'all_but_by_header_name']:
            c = input_options['column_selector_options_1']['col1']
        else:
            c = None

        df_key = infile1 + repr(header)
        df = pd.read_csv(infile1, sep='\t', header=header,
                         parse_dates=True)
        loaded_df[df_key] = df

        X = read_columns(df, c=c, c_option=column_option).astype(float)
    elif input_type == 'sparse':
        # Close the handle deterministically instead of leaking it.
        with open(infile1, 'r') as sparse_handler:
            X = mmread(sparse_handler)
    else:
        # Without this guard X would stay unbound and the function would
        # only fail at `return` with an opaque UnboundLocalError.
        raise ValueError("Unsupported input type: %r" % (input_type,))

    # Get target y
    header = 'infer' if input_options['header2'] else None
    column_option = (input_options['column_selector_options_2']
                     ['selected_column_selector_option2'])
    if column_option in ['by_index_number', 'all_but_by_index_number',
                         'by_header_name', 'all_but_by_header_name']:
        c = input_options['column_selector_options_2']['col2']
    else:
        c = None

    # Reuse the table parsed for X when the path and header mode match.
    df_key = infile2 + repr(header)
    if df_key in loaded_df:
        target_df = loaded_df[df_key]
    else:
        target_df = pd.read_csv(infile2, sep='\t',
                                header=header, parse_dates=True)
        loaded_df[df_key] = target_df

    # NOTE(review): the read_csv-style keyword arguments are forwarded
    # unchanged even though a DataFrame is passed; presumably
    # read_columns ignores them for non-path input -- confirm.
    y = read_columns(
        target_df,
        c=c,
        c_option=column_option,
        sep='\t',
        header=header,
        parse_dates=True)
    # A single target column is flattened to one dimension.
    if len(y.shape) == 2 and y.shape[1] == 1:
        y = y.ravel()

    return X, y
80
81
def main(inputs, infile_estimator, outfile_eval,
         infile_weights=None, infile1=None,
         infile2=None):
    """Score a fitted estimator on a test dataset and save the results.

    Parameters
    ----------
    inputs : str
        File path to galaxy tool parameter

    infile_estimator : str
        File path to trained estimator input

    outfile_eval : str
        File path to save the evaluation results, tabular

    infile_weights : str
        File path to weights input

    infile1 : str
        File path to dataset containing features

    infile2 : str
        File path to dataset containing target values

    Raises
    ------
    ValueError
        If the final estimator exposes both `config` and `load_weights`
        but no weights file was supplied.
    """
    warnings.filterwarnings('ignore')

    with open(inputs, 'r') as param_handler:
        params = json.load(param_handler)

    X_test, y_test = _get_X_y(params, infile1, infile2)

    # load model
    with open(infile_estimator, 'rb') as est_handler:
        estimator = load_model(est_handler)

    # For a pipeline, the weights belong to its last step.
    final_step = (estimator.steps[-1][-1]
                  if isinstance(estimator, Pipeline) else estimator)
    expects_weights = (hasattr(final_step, 'config')
                       and hasattr(final_step, 'load_weights'))
    if expects_weights:
        # The literal string 'None' is treated the same as a missing path.
        weights_given = bool(infile_weights) and infile_weights != 'None'
        if not weights_given:
            raise ValueError("The selected model skeleton asks for weights, "
                             "but no dataset for weights was provided!")
        final_step.load_weights(infile_weights)

    # handle scorer, convert to scorer dict
    scorer = get_scoring(params['scoring'])
    scorer, _ = _check_multimetric_scoring(estimator, scoring=scorer)

    # Prefer the estimator's own `evaluate` method when it has one.
    if hasattr(estimator, 'evaluate'):
        scores = estimator.evaluate(X_test, y_test=y_test,
                                    scorer=scorer,
                                    is_multimetric=True)
    else:
        scores = _score(estimator, X_test, y_test, scorer,
                        is_multimetric=True)

    # handle output: a single-row table, one column per metric, with the
    # columns in sorted order
    result = pd.DataFrame(
        {metric: [value] for metric, value in scores.items()})
    result = result[sorted(result.columns)]
    result.to_csv(path_or_buf=outfile_eval, sep='\t',
                  header=True, index=False)
146
147
if __name__ == '__main__':
    aparser = argparse.ArgumentParser()
    # (short flag, long option name which is also the dest, extra kwargs)
    cli_options = (
        ("-i", "inputs", {"required": True}),
        ("-e", "infile_estimator", {}),
        ("-w", "infile_weights", {}),
        ("-X", "infile1", {}),
        ("-y", "infile2", {}),
        ("-O", "outfile_eval", {}),
    )
    for short_flag, option_name, extra_kwargs in cli_options:
        aparser.add_argument(short_flag, "--" + option_name,
                             dest=option_name, **extra_kwargs)
    args = aparser.parse_args()

    # Hand the parsed command-line values over to main().
    main(inputs=args.inputs,
         infile_estimator=args.infile_estimator,
         outfile_eval=args.outfile_eval,
         infile_weights=args.infile_weights,
         infile1=args.infile1,
         infile2=args.infile2)