comparison utils.py @ 0:9bf25dbe00ad draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
author bgruening
date Wed, 28 Aug 2019 07:19:38 -0400
parents
children 76251d1ccdcc
comparison
equal deleted inserted replaced
-1:000000000000 0:9bf25dbe00ad
1 import os
2 import numpy as np
3 import json
4 import h5py
5
6 from keras.models import model_from_json, Sequential
7 from keras.layers import Dense, GRU, Dropout
8 from keras.layers.embeddings import Embedding
9 from keras.layers.core import SpatialDropout1D
10 from keras.optimizers import RMSprop
11 from keras import backend as K
12
13
def read_file(file_path):
    """
    Read a JSON file and return its parsed content.

    :param file_path: path of the JSON file to read
    :return: the deserialised Python object (typically a dict)
    """
    with open(file_path, "r") as json_file:
        # json.load streams from the file object directly instead of
        # reading the whole text and then calling json.loads on it.
        return json.load(json_file)
21
22
def write_file(file_path, content):
    """
    Serialise content as JSON and write it to file_path.

    :param file_path: destination path (overwritten if it exists)
    :param content: any JSON-serialisable object
    """
    # Mode "w" truncates an existing file, so the explicit delete the
    # original performed before writing was redundant (and non-atomic).
    with open(file_path, "w") as json_file:
        json.dump(content, json_file)
30
31
def save_processed_workflows(file_path, unique_paths):
    """
    Write the processed workflow paths to a text file, one per line.

    :param file_path: destination path (overwritten if it exists)
    :param unique_paths: iterable of workflow path strings
    """
    with open(file_path, "w") as workflows_file:
        # writelines with a generator avoids building the whole payload
        # via quadratic string concatenation as the original did.
        workflows_file.writelines(path + "\n" for path in unique_paths)
38
39
def load_saved_model(model_config, model_weights):
    """
    Rebuild a trained Keras model from its saved JSON architecture and
    restore the trained weights into it.

    :param model_config: JSON string produced by model.to_json()
    :param model_weights: list of weight arrays from model.get_weights()
    :return: the reconstructed Keras model
    """
    # Recreate the network topology first, then attach the learned weights.
    model = model_from_json(model_config)
    model.set_weights(model_weights)
    return model
49
50
def format_tool_id(tool_link):
    """
    Extract the tool id from a tool link.

    The id is the second-to-last "/"-separated segment of the link;
    a link containing no "/" is returned unchanged.
    """
    parts = tool_link.split("/")
    if len(parts) > 1:
        return parts[-2]
    return tool_link
58
59
def get_HDF5(hf, d_key):
    """
    Read a dataset from an open h5 file.

    :param hf: an open h5py.File (or group) object
    :param d_key: name of the dataset to read
    :return: the dataset's contents as an in-memory array/scalar
    """
    # Dataset.value was deprecated and removed in h5py 3.0; indexing
    # with an empty tuple is the supported equivalent.
    return hf.get(d_key)[()]
65
66
def save_HDF5(hf_file, d_key, data, d_type=""):
    """
    Store data as a dataset in an open h5 file.

    :param hf_file: an open, writable h5py.File object
    :param d_key: dataset name to create
    :param data: the value to store
    :param d_type: when "json", the value is JSON-encoded before storing
    """
    payload = json.dumps(data) if d_type == "json" else data
    hf_file.create_dataset(d_key, data=payload)
74
75
def set_trained_model(dump_file, model_values):
    """
    Create an h5 file with the trained weights and associated dicts.

    Layer weight arrays (under the "model_weights" key) are stored one
    dataset per layer as "weight_0", "weight_1", ...; every other value
    is JSON-encoded into a single dataset named after its key.

    :param dump_file: path of the h5 file to create (overwritten)
    :param model_values: dict of values to persist
    """
    # Mode "w" truncates the file, so no dataset key can pre-exist and
    # the original's "modify if present" branches were dead code. The
    # context manager also guarantees the handle is closed even when a
    # write raises (the original leaked it on error).
    with h5py.File(dump_file, "w") as hf_file:
        for key, value in model_values.items():
            if key == "model_weights":
                for idx, item in enumerate(value):
                    hf_file.create_dataset("weight_" + str(idx), data=item)
            else:
                hf_file.create_dataset(key, data=json.dumps(value))
96
97
def remove_file(file_path):
    """
    Delete file_path if it exists; a missing file is silently ignored.
    """
    # EAFP: the original's exists()-then-remove() had a race window in
    # which another process could delete the file first and make
    # os.remove raise anyway.
    try:
        os.remove(file_path)
    except FileNotFoundError:
        pass
101
102
def extract_configuration(config_object):
    """
    Summarise optimisation trials as {index: [summary]}.

    Each trial contributes a one-element list containing a dict with its
    loss and its parameter configuration.
    """
    return {
        index: [{
            "loss": trial["result"]["loss"],
            "params_config": trial["misc"]["vals"],
        }]
        for index, trial in enumerate(config_object)
    }
112
113
def get_best_parameters(mdl_dict):
    """
    Collect the tuned hyperparameters from mdl_dict, substituting a
    default for any parameter that is missing.

    :param mdl_dict: mapping of hyperparameter name to (string) value
    :return: dict of typed hyperparameter values
    """
    # Defaults are strings because tuned values arrive as strings;
    # each is converted to its proper type on the way out.
    return {
        "lr": float(mdl_dict.get("learning_rate", "0.001")),
        "embedding_size": int(mdl_dict.get("embedding_size", "512")),
        "dropout": float(mdl_dict.get("dropout", "0.2")),
        "recurrent_dropout": float(mdl_dict.get("recurrent_dropout", "0.2")),
        "spatial_dropout": float(mdl_dict.get("spatial_dropout", "0.2")),
        "units": int(mdl_dict.get("units", "512")),
        "batch_size": int(mdl_dict.get("batch_size", "512")),
        "activation_recurrent": mdl_dict.get("activation_recurrent", "elu"),
        "activation_output": mdl_dict.get("activation_output", "sigmoid"),
    }
139
140
def weighted_loss(class_weights):
    """
    Build a loss function that scales binary cross-entropy per class so
    that misclassifying heavily used classes is penalised more.

    :param class_weights: mapping of class index to usage weight
    :return: a Keras-compatible loss function
    """
    weights = list(class_weights.values())

    def weighted_binary_crossentropy(y_true, y_pred):
        # Turn the weight list into a column vector so the dot product
        # reduces the cross-entropy over the class dimension.
        weight_column = K.expand_dims(weights, axis=-1)
        return K.dot(K.binary_crossentropy(y_true, y_pred), weight_column)

    return weighted_binary_crossentropy
153
154
def set_recurrent_network(mdl_dict, reverse_dictionary, class_weights):
    """
    Build and compile the GRU-based recurrent network for tool
    prediction.

    :param mdl_dict: hyperparameter dict (see get_best_parameters)
    :param reverse_dictionary: mapping of tool index to tool name
    :param class_weights: per-class usage weights for the loss
    :return: (compiled model, resolved hyperparameter dict)
    """
    params = get_best_parameters(mdl_dict)
    # Index 0 is reserved for padding (mask_zero=True), hence the +1.
    n_classes = len(reverse_dictionary) + 1

    # Both GRU layers share the same dropout/activation configuration.
    gru_kwargs = dict(
        dropout=params["spatial_dropout"],
        recurrent_dropout=params["recurrent_dropout"],
        activation=params["activation_recurrent"],
    )

    model = Sequential()
    model.add(Embedding(n_classes, params["embedding_size"], mask_zero=True))
    model.add(SpatialDropout1D(params["spatial_dropout"]))
    model.add(GRU(params["units"], return_sequences=True, **gru_kwargs))
    model.add(Dropout(params["dropout"]))
    model.add(GRU(params["units"], return_sequences=False, **gru_kwargs))
    model.add(Dropout(params["dropout"]))
    model.add(Dense(n_classes, activation=params["activation_output"]))
    model.compile(loss=weighted_loss(class_weights),
                  optimizer=RMSprop(lr=params["lr"]))
    return model, params
174
175
def compute_precision(model, x, y, reverse_data_dictionary, next_compatible_tools, usage_scores, actual_classes_pos, topk):
    """
    Score the model's top-k predictions for one test sample.

    :param model: trained model exposing predict()
    :param x: encoded input path (1-D sequence of tool indices)
    :param y: unused here; kept for interface compatibility
    :param reverse_data_dictionary: tool index -> tool name
    :param next_compatible_tools: unused here; kept for interface compatibility
    :param usage_scores: tool index -> usage score
    :param actual_classes_pos: positions of the true next tools
    :param topk: number of top predictions to consider
    :return: (mean usage-weighted score, absolute precision)
    """
    sample = np.reshape(x, (1, len(x)))

    # Predict a score per tool position and flatten to 1-D.
    scores = model.predict(sample, verbose=0)
    scores = np.reshape(scores, (scores.shape[1],))

    # Take the k highest-scoring positions; position 0 holds no tool.
    ranked = np.argsort(scores, axis=-1)
    top_positions = [pos for pos in ranked[-topk:] if pos > 0]

    actual_names = [reverse_data_dictionary[int(pos)] for pos in actual_classes_pos]
    predicted_names = [reverse_data_dictionary[int(pos)] for pos in top_positions]

    # Usage-weighted credit only for correctly predicted tools.
    usage_log_scores = [
        np.log(usage_scores[pos] + 1.0)
        for pos in top_positions
        if pos in usage_scores and reverse_data_dictionary[int(pos)] in actual_names
    ]
    mean_usg_score = np.sum(usage_log_scores) / float(topk) if usage_log_scores else 0

    false_positives = [name for name in predicted_names if name not in actual_names]
    absolute_precision = 1 - (len(false_positives) / float(topk))
    return mean_usg_score, absolute_precision
213
214
def verify_model(model, x, y, reverse_data_dictionary, next_compatible_tools, usage_scores, topk_list=None):
    """
    Evaluate the trained model on test data.

    :param model: trained model exposing predict()
    :param x: 2-D array of encoded test inputs (one row per sample)
    :param y: 2-D array of one/multi-hot expected outputs
    :param reverse_data_dictionary: tool index -> tool name
    :param next_compatible_tools: forwarded to compute_precision
    :param usage_scores: tool index -> usage score
    :param topk_list: k values to evaluate (defaults to [1, 2, 3])
    :return: (mean precision per k, mean usage weight per k)
    """
    # Avoid a mutable default argument; [1, 2, 3] is the effective default.
    if topk_list is None:
        topk_list = [1, 2, 3]
    print("Evaluating performance on test data...")
    print("Test data size: %d" % len(y))
    num_samples = y.shape[0]
    precision = np.zeros([num_samples, len(topk_list)])
    usage_weights = np.zeros([num_samples, len(topk_list)])
    # Score every test sample at every requested k.
    for i in range(num_samples):
        actual_classes_pos = np.where(y[i] > 0)[0]
        for index, abs_topk in enumerate(topk_list):
            abs_mean_usg_score, absolute_precision = compute_precision(
                model, x[i, :], y, reverse_data_dictionary,
                next_compatible_tools, usage_scores, actual_classes_pos, abs_topk)
            precision[i][index] = absolute_precision
            usage_weights[i][index] = abs_mean_usg_score
    mean_precision = np.mean(precision, axis=0)
    mean_usage = np.mean(usage_weights, axis=0)
    return mean_precision, mean_usage
234
235
def save_model(results, data_dictionary, compatible_next_tools, trained_model_path, class_weights):
    """
    Persist the trained model and its supporting dictionaries.

    Extracts the model architecture (JSON) and weights from the results
    dict and hands everything to set_trained_model for h5 serialisation.
    """
    trained_model = results["model"]
    model_values = {
        "data_dictionary": data_dictionary,
        "model_config": trained_model.to_json(),
        "best_parameters": results["best_parameters"],
        "model_weights": trained_model.get_weights(),
        "compatible_tools": compatible_next_tools,
        "class_weights": class_weights,
    }
    set_trained_model(trained_model_path, model_values)