diff main.py @ 0:9bf25dbe00ad draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
author   | bgruening
date     | Wed, 28 Aug 2019 07:19:38 -0400
parents  |
children | 12764915e1c5
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/main.py	Wed Aug 28 07:19:38 2019 -0400
@@ -0,0 +1,178 @@

"""
Predict next tools in the Galaxy workflows
using machine learning (recurrent neural network)
"""

import numpy as np
import argparse
import time

# machine learning library
import keras.callbacks as callbacks

import extract_workflow_connections
import prepare_data
import optimise_hyperparameters
import utils


class PredictTool:

    @classmethod
    def __init__(self):
        """ Init method. """

    @classmethod
    def find_train_best_network(self, network_config, reverse_dictionary, train_data, train_labels, test_data, test_labels, n_epochs, class_weights, usage_pred, compatible_next_tools):
        """
        Define recurrent neural network and train sequential data
        """
        print("Start hyperparameter optimisation...")
        hyper_opt = optimise_hyperparameters.HyperparameterOptimisation()
        best_params = hyper_opt.train_model(network_config, reverse_dictionary, train_data, train_labels, test_data, test_labels, class_weights)

        # retrieve the model and train on complete dataset without validation set
        model, best_params = utils.set_recurrent_network(best_params, reverse_dictionary, class_weights)

        # define callbacks
        predict_callback_test = PredictCallback(test_data, test_labels, reverse_dictionary, n_epochs, compatible_next_tools, usage_pred)
        # tensor_board = callbacks.TensorBoard(log_dir=log_directory, histogram_freq=0, write_graph=True, write_images=True)
        callbacks_list = [predict_callback_test]

        print("Start training on the best model...")
        model_fit = model.fit(
            train_data,
            train_labels,
            batch_size=int(best_params["batch_size"]),
            epochs=n_epochs,
            verbose=2,
            callbacks=callbacks_list,
            shuffle="batch",
            validation_data=(test_data, test_labels)
        )

        train_performance = {
            "train_loss": np.array(model_fit.history["loss"]),
            "model": model,
            "best_parameters": best_params
        }

        # if there is test data, add more information
        if len(test_data) > 0:
            train_performance["validation_loss"] = np.array(model_fit.history["val_loss"])
            train_performance["precision"] = predict_callback_test.precision
            train_performance["usage_weights"] = predict_callback_test.usage_weights
        return train_performance


class PredictCallback(callbacks.Callback):
    def __init__(self, test_data, test_labels, reverse_data_dictionary, n_epochs, next_compatible_tools, usg_scores):
        self.test_data = test_data
        self.test_labels = test_labels
        self.reverse_data_dictionary = reverse_data_dictionary
        self.precision = list()
        self.usage_weights = list()
        self.n_epochs = n_epochs
        self.next_compatible_tools = next_compatible_tools
        self.pred_usage_scores = usg_scores

    def on_epoch_end(self, epoch, logs={}):
        """
        Compute absolute and compatible precision for test data
        """
        if len(self.test_data) > 0:
            precision, usage_weights = utils.verify_model(self.model, self.test_data, self.test_labels, self.reverse_data_dictionary, self.next_compatible_tools, self.pred_usage_scores)
            self.precision.append(precision)
            self.usage_weights.append(usage_weights)
            print("Epoch %d precision: %s" % (epoch + 1, precision))
            print("Epoch %d usage weights: %s" % (epoch + 1, usage_weights))


if __name__ == "__main__":
    start_time = time.time()
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("-wf", "--workflow_file", required=True, help="workflows tabular file")
    arg_parser.add_argument("-tu", "--tool_usage_file", required=True, help="tool usage file")
arg_parser.add_argument("-om", "--output_model", required=True, help="trained model file") + # data parameters + arg_parser.add_argument("-cd", "--cutoff_date", required=True, help="earliest date for taking tool usage") + arg_parser.add_argument("-pl", "--maximum_path_length", required=True, help="maximum length of tool path") + arg_parser.add_argument("-ep", "--n_epochs", required=True, help="number of iterations to run to create model") + arg_parser.add_argument("-oe", "--optimize_n_epochs", required=True, help="number of iterations to run to find best model parameters") + arg_parser.add_argument("-me", "--max_evals", required=True, help="maximum number of configuration evaluations") + arg_parser.add_argument("-ts", "--test_share", required=True, help="share of data to be used for testing") + arg_parser.add_argument("-vs", "--validation_share", required=True, help="share of data to be used for validation") + # neural network parameters + arg_parser.add_argument("-bs", "--batch_size", required=True, help="size of the tranining batch i.e. the number of samples per batch") + arg_parser.add_argument("-ut", "--units", required=True, help="number of hidden recurrent units") + arg_parser.add_argument("-es", "--embedding_size", required=True, help="size of the fixed vector learned for each tool") + arg_parser.add_argument("-dt", "--dropout", required=True, help="percentage of neurons to be dropped") + arg_parser.add_argument("-sd", "--spatial_dropout", required=True, help="1d dropout used for embedding layer") + arg_parser.add_argument("-rd", "--recurrent_dropout", required=True, help="dropout for the recurrent layers") + arg_parser.add_argument("-lr", "--learning_rate", required=True, help="learning rate") + arg_parser.add_argument("-ar", "--activation_recurrent", required=True, help="activation function for recurrent layers") + arg_parser.add_argument("-ao", "--activation_output", required=True, help="activation function for output layers") + arg_parser.add_argument("-lt", "--loss_type", required=True, help="type of the loss/error function") + # get argument values + args = vars(arg_parser.parse_args()) + tool_usage_path = args["tool_usage_file"] + workflows_path = args["workflow_file"] + cutoff_date = args["cutoff_date"] + maximum_path_length = int(args["maximum_path_length"]) + trained_model_path = args["output_model"] + n_epochs = int(args["n_epochs"]) + optimize_n_epochs = int(args["optimize_n_epochs"]) + max_evals = int(args["max_evals"]) + test_share = float(args["test_share"]) + validation_share = float(args["validation_share"]) + batch_size = args["batch_size"] + units = args["units"] + embedding_size = args["embedding_size"] + dropout = args["dropout"] + spatial_dropout = args["spatial_dropout"] + recurrent_dropout = args["recurrent_dropout"] + learning_rate = args["learning_rate"] + activation_recurrent = args["activation_recurrent"] + activation_output = args["activation_output"] + loss_type = args["loss_type"] + + config = { + 'cutoff_date': cutoff_date, + 'maximum_path_length': maximum_path_length, + 'n_epochs': n_epochs, + 'optimize_n_epochs': optimize_n_epochs, + 'max_evals': max_evals, + 'test_share': test_share, + 'validation_share': validation_share, + 'batch_size': batch_size, + 'units': units, + 'embedding_size': embedding_size, + 'dropout': dropout, + 'spatial_dropout': spatial_dropout, + 'recurrent_dropout': recurrent_dropout, + 'learning_rate': learning_rate, + 'activation_recurrent': activation_recurrent, + 'activation_output': activation_output, + 'loss_type': 
        'loss_type': loss_type
    }

    # Extract and process workflows
    connections = extract_workflow_connections.ExtractWorkflowConnections()
    workflow_paths, compatible_next_tools = connections.read_tabular_file(workflows_path)
    # Process the paths from workflows
    print("Dividing data...")
    data = prepare_data.PrepareData(maximum_path_length, test_share)
    train_data, train_labels, test_data, test_labels, data_dictionary, reverse_dictionary, class_weights, usage_pred = data.get_data_labels_matrices(workflow_paths, tool_usage_path, cutoff_date, compatible_next_tools)
    # find the best model and start training
    predict_tool = PredictTool()
    # start training with weighted classes
    print("Training with weighted classes and samples ...")
    results_weighted = predict_tool.find_train_best_network(config, reverse_dictionary, train_data, train_labels, test_data, test_labels, n_epochs, class_weights, usage_pred, compatible_next_tools)
    print()
    print("Best parameters \n")
    print(results_weighted["best_parameters"])
    print()
    utils.save_model(results_weighted, data_dictionary, compatible_next_tools, trained_model_path, class_weights)
    end_time = time.time()
    print()
    print("Program finished in %s seconds" % str(end_time - start_time))
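
For reference, the script is driven entirely by the command-line flags defined above. A hypothetical invocation might look like the following sketch; the file names and parameter values are illustrative placeholders only, not settings taken from or recommended by this repository:

    # illustrative example only -- all paths and hyperparameter values are placeholders
    python main.py \
        --workflow_file workflow-connections.tsv \
        --tool_usage_file tool-usage.tsv \
        --output_model trained_model.hdf5 \
        --cutoff_date 2017-12-01 \
        --maximum_path_length 25 \
        --n_epochs 10 \
        --optimize_n_epochs 5 \
        --max_evals 20 \
        --test_share 0.2 \
        --validation_share 0.2 \
        --batch_size 128 \
        --units 128 \
        --embedding_size 128 \
        --dropout 0.2 \
        --spatial_dropout 0.2 \
        --recurrent_dropout 0.2 \
        --learning_rate 0.001 \
        --activation_recurrent elu \
        --activation_output sigmoid \
        --loss_type binary_crossentropy

All twenty flags are declared with required=True, so omitting any of them makes argparse exit with an error before data preparation or training starts.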