Mercurial > repos > bgruening > create_tool_recommendation_model
comparison main.py @ 2:76251d1ccdcc draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 6fa2a0294d615c9f267b766337dca0b2d3637219"
author | bgruening |
---|---|
date | Fri, 11 Oct 2019 18:24:54 -0400 |
parents | 12764915e1c5 |
children | 5b3c08710e47 |
comparison (legend of line states):
- equal
- deleted
- inserted
- replaced
1:12764915e1c5 | 2:76251d1ccdcc |
---|---|
6 import numpy as np | 6 import numpy as np |
7 import argparse | 7 import argparse |
8 import time | 8 import time |
9 | 9 |
10 # machine learning library | 10 # machine learning library |
11 import tensorflow as tf | |
12 from keras import backend as K | |
11 import keras.callbacks as callbacks | 13 import keras.callbacks as callbacks |
12 | 14 |
13 import extract_workflow_connections | 15 import extract_workflow_connections |
14 import prepare_data | 16 import prepare_data |
15 import optimise_hyperparameters | 17 import optimise_hyperparameters |
17 | 19 |
18 | 20 |
19 class PredictTool: | 21 class PredictTool: |
20 | 22 |
21 @classmethod | 23 @classmethod |
22 def __init__(self): | 24 def __init__(self, num_cpus): |
23 """ Init method. """ | 25 """ Init method. """ |
26 # set the number of cpus | |
27 cpu_config = tf.ConfigProto( | |
28 device_count={"CPU": num_cpus}, | |
29 intra_op_parallelism_threads=num_cpus, | |
30 inter_op_parallelism_threads=num_cpus, | |
31 allow_soft_placement=True | |
32 ) | |
33 K.set_session(tf.Session(config=cpu_config)) | |
24 | 34 |
25 @classmethod | 35 @classmethod |
26 def find_train_best_network(self, network_config, reverse_dictionary, train_data, train_labels, test_data, test_labels, n_epochs, class_weights, usage_pred, compatible_next_tools): | 36 def find_train_best_network(self, network_config, reverse_dictionary, train_data, train_labels, test_data, test_labels, n_epochs, class_weights, usage_pred, compatible_next_tools): |
27 """ | 37 """ |
28 Define recurrent neural network and train sequential data | 38 Define recurrent neural network and train sequential data |
29 """ | 39 """ |
30 print("Start hyperparameter optimisation...") | 40 print("Start hyperparameter optimisation...") |
31 hyper_opt = optimise_hyperparameters.HyperparameterOptimisation() | 41 hyper_opt = optimise_hyperparameters.HyperparameterOptimisation() |
32 best_params = hyper_opt.train_model(network_config, reverse_dictionary, train_data, train_labels, test_data, test_labels, class_weights) | 42 best_params, best_model = hyper_opt.train_model(network_config, reverse_dictionary, train_data, train_labels, class_weights) |
33 | |
34 # retrieve the model and train on complete dataset without validation set | |
35 model, best_params = utils.set_recurrent_network(best_params, reverse_dictionary, class_weights) | |
36 | 43 |
37 # define callbacks | 44 # define callbacks |
45 early_stopping = callbacks.EarlyStopping(monitor='loss', mode='min', verbose=1, min_delta=1e-4, restore_best_weights=True) | |
38 predict_callback_test = PredictCallback(test_data, test_labels, reverse_dictionary, n_epochs, compatible_next_tools, usage_pred) | 46 predict_callback_test = PredictCallback(test_data, test_labels, reverse_dictionary, n_epochs, compatible_next_tools, usage_pred) |
39 # tensor_board = callbacks.TensorBoard(log_dir=log_directory, histogram_freq=0, write_graph=True, write_images=True) | 47 |
40 callbacks_list = [predict_callback_test] | 48 callbacks_list = [predict_callback_test, early_stopping] |
41 | 49 |
42 print("Start training on the best model...") | 50 print("Start training on the best model...") |
43 model_fit = model.fit( | 51 train_performance = dict() |
44 train_data, | |
45 train_labels, | |
46 batch_size=int(best_params["batch_size"]), | |
47 epochs=n_epochs, | |
48 verbose=2, | |
49 callbacks=callbacks_list, | |
50 shuffle="batch", | |
51 validation_data=(test_data, test_labels) | |
52 ) | |
53 | |
54 train_performance = { | |
55 "train_loss": np.array(model_fit.history["loss"]), | |
56 "model": model, | |
57 "best_parameters": best_params | |
58 } | |
59 | |
60 # if there is test data, add more information | |
61 if len(test_data) > 0: | 52 if len(test_data) > 0: |
62 train_performance["validation_loss"] = np.array(model_fit.history["val_loss"]) | 53 trained_model = best_model.fit( |
54 train_data, | |
55 train_labels, | |
56 batch_size=int(best_params["batch_size"]), | |
57 epochs=n_epochs, | |
58 verbose=2, | |
59 callbacks=callbacks_list, | |
60 shuffle="batch", | |
61 validation_data=(test_data, test_labels) | |
62 ) | |
63 train_performance["validation_loss"] = np.array(trained_model.history["val_loss"]) | |
63 train_performance["precision"] = predict_callback_test.precision | 64 train_performance["precision"] = predict_callback_test.precision |
64 train_performance["usage_weights"] = predict_callback_test.usage_weights | 65 train_performance["usage_weights"] = predict_callback_test.usage_weights |
66 else: | |
67 trained_model = best_model.fit( | |
68 train_data, | |
69 train_labels, | |
70 batch_size=int(best_params["batch_size"]), | |
71 epochs=n_epochs, | |
72 verbose=2, | |
73 callbacks=callbacks_list, | |
74 shuffle="batch" | |
75 ) | |
76 train_performance["train_loss"] = np.array(trained_model.history["loss"]) | |
77 train_performance["model"] = best_model | |
78 train_performance["best_parameters"] = best_params | |
65 return train_performance | 79 return train_performance |
66 | 80 |
67 | 81 |
68 class PredictCallback(callbacks.Callback): | 82 class PredictCallback(callbacks.Callback): |
69 def __init__(self, test_data, test_labels, reverse_data_dictionary, n_epochs, next_compatible_tools, usg_scores): | 83 def __init__(self, test_data, test_labels, reverse_data_dictionary, n_epochs, next_compatible_tools, usg_scores): |
88 print("Epoch %d usage weights: %s" % (epoch + 1, usage_weights)) | 102 print("Epoch %d usage weights: %s" % (epoch + 1, usage_weights)) |
89 | 103 |
90 | 104 |
91 if __name__ == "__main__": | 105 if __name__ == "__main__": |
92 start_time = time.time() | 106 start_time = time.time() |
107 | |
93 arg_parser = argparse.ArgumentParser() | 108 arg_parser = argparse.ArgumentParser() |
94 arg_parser.add_argument("-wf", "--workflow_file", required=True, help="workflows tabular file") | 109 arg_parser.add_argument("-wf", "--workflow_file", required=True, help="workflows tabular file") |
95 arg_parser.add_argument("-tu", "--tool_usage_file", required=True, help="tool usage file") | 110 arg_parser.add_argument("-tu", "--tool_usage_file", required=True, help="tool usage file") |
96 arg_parser.add_argument("-om", "--output_model", required=True, help="trained model file") | 111 arg_parser.add_argument("-om", "--output_model", required=True, help="trained model file") |
97 # data parameters | 112 # data parameters |
110 arg_parser.add_argument("-sd", "--spatial_dropout", required=True, help="1d dropout used for embedding layer") | 125 arg_parser.add_argument("-sd", "--spatial_dropout", required=True, help="1d dropout used for embedding layer") |
111 arg_parser.add_argument("-rd", "--recurrent_dropout", required=True, help="dropout for the recurrent layers") | 126 arg_parser.add_argument("-rd", "--recurrent_dropout", required=True, help="dropout for the recurrent layers") |
112 arg_parser.add_argument("-lr", "--learning_rate", required=True, help="learning rate") | 127 arg_parser.add_argument("-lr", "--learning_rate", required=True, help="learning rate") |
113 arg_parser.add_argument("-ar", "--activation_recurrent", required=True, help="activation function for recurrent layers") | 128 arg_parser.add_argument("-ar", "--activation_recurrent", required=True, help="activation function for recurrent layers") |
114 arg_parser.add_argument("-ao", "--activation_output", required=True, help="activation function for output layers") | 129 arg_parser.add_argument("-ao", "--activation_output", required=True, help="activation function for output layers") |
130 | |
115 # get argument values | 131 # get argument values |
116 args = vars(arg_parser.parse_args()) | 132 args = vars(arg_parser.parse_args()) |
117 tool_usage_path = args["tool_usage_file"] | 133 tool_usage_path = args["tool_usage_file"] |
118 workflows_path = args["workflow_file"] | 134 workflows_path = args["workflow_file"] |
119 cutoff_date = args["cutoff_date"] | 135 cutoff_date = args["cutoff_date"] |
131 spatial_dropout = args["spatial_dropout"] | 147 spatial_dropout = args["spatial_dropout"] |
132 recurrent_dropout = args["recurrent_dropout"] | 148 recurrent_dropout = args["recurrent_dropout"] |
133 learning_rate = args["learning_rate"] | 149 learning_rate = args["learning_rate"] |
134 activation_recurrent = args["activation_recurrent"] | 150 activation_recurrent = args["activation_recurrent"] |
135 activation_output = args["activation_output"] | 151 activation_output = args["activation_output"] |
152 num_cpus = 16 | |
136 | 153 |
137 config = { | 154 config = { |
138 'cutoff_date': cutoff_date, | 155 'cutoff_date': cutoff_date, |
139 'maximum_path_length': maximum_path_length, | 156 'maximum_path_length': maximum_path_length, |
140 'n_epochs': n_epochs, | 157 'n_epochs': n_epochs, |
159 # Process the paths from workflows | 176 # Process the paths from workflows |
160 print("Dividing data...") | 177 print("Dividing data...") |
161 data = prepare_data.PrepareData(maximum_path_length, test_share) | 178 data = prepare_data.PrepareData(maximum_path_length, test_share) |
162 train_data, train_labels, test_data, test_labels, data_dictionary, reverse_dictionary, class_weights, usage_pred = data.get_data_labels_matrices(workflow_paths, tool_usage_path, cutoff_date, compatible_next_tools) | 179 train_data, train_labels, test_data, test_labels, data_dictionary, reverse_dictionary, class_weights, usage_pred = data.get_data_labels_matrices(workflow_paths, tool_usage_path, cutoff_date, compatible_next_tools) |
163 # find the best model and start training | 180 # find the best model and start training |
164 predict_tool = PredictTool() | 181 predict_tool = PredictTool(num_cpus) |
165 # start training with weighted classes | 182 # start training with weighted classes |
166 print("Training with weighted classes and samples ...") | 183 print("Training with weighted classes and samples ...") |
167 results_weighted = predict_tool.find_train_best_network(config, reverse_dictionary, train_data, train_labels, test_data, test_labels, n_epochs, class_weights, usage_pred, compatible_next_tools) | 184 results_weighted = predict_tool.find_train_best_network(config, reverse_dictionary, train_data, train_labels, test_data, test_labels, n_epochs, class_weights, usage_pred, compatible_next_tools) |
168 print() | 185 print() |
169 print("Best parameters \n") | 186 print("Best parameters \n") |