diff main.py @ 6:e94dc7945639 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
author | bgruening
--- | ---
date | Sun, 16 Oct 2022 11:52:10 +0000
parents | 4f7e6612906b
children |
```diff
--- a/main.py Fri May 06 09:05:18 2022 +0000
+++ b/main.py Sun Oct 16 11:52:10 2022 +0000
@@ -1,260 +1,36 @@
 """
 Predict next tools in the Galaxy workflows
-using machine learning (recurrent neural network)
+using deep learning learning (Transformers)
 """
-
 import argparse
 import time
 
 import extract_workflow_connections
-import keras.callbacks as callbacks
-import numpy as np
-import optimise_hyperparameters
 import prepare_data
-import utils
-
-
-class PredictTool:
-    def __init__(self, num_cpus):
-        """ Init method. """
-
-    def find_train_best_network(
-        self,
-        network_config,
-        reverse_dictionary,
-        train_data,
-        train_labels,
-        test_data,
-        test_labels,
-        n_epochs,
-        class_weights,
-        usage_pred,
-        standard_connections,
-        tool_freq,
-        tool_tr_samples,
-    ):
-        """
-        Define recurrent neural network and train sequential data
-        """
-        # get tools with lowest representation
-        lowest_tool_ids = utils.get_lowest_tools(tool_freq)
-
-        print("Start hyperparameter optimisation...")
-        hyper_opt = optimise_hyperparameters.HyperparameterOptimisation()
-        best_params, best_model = hyper_opt.train_model(
-            network_config,
-            reverse_dictionary,
-            train_data,
-            train_labels,
-            test_data,
-            test_labels,
-            tool_tr_samples,
-            class_weights,
-        )
-
-        # define callbacks
-        early_stopping = callbacks.EarlyStopping(
-            monitor="loss",
-            mode="min",
-            verbose=1,
-            min_delta=1e-1,
-            restore_best_weights=True,
-        )
-        predict_callback_test = PredictCallback(
-            test_data,
-            test_labels,
-            reverse_dictionary,
-            n_epochs,
-            usage_pred,
-            standard_connections,
-            lowest_tool_ids,
-        )
-
-        callbacks_list = [predict_callback_test, early_stopping]
-        batch_size = int(best_params["batch_size"])
-
-        print("Start training on the best model...")
-        train_performance = dict()
-        trained_model = best_model.fit_generator(
-            utils.balanced_sample_generator(
-                train_data,
-                train_labels,
-                batch_size,
-                tool_tr_samples,
-                reverse_dictionary,
-            ),
-            steps_per_epoch=len(train_data) // batch_size,
-            epochs=n_epochs,
-            callbacks=callbacks_list,
-            validation_data=(test_data, test_labels),
-            verbose=2,
-            shuffle=True,
-        )
-        train_performance["validation_loss"] = np.array(
-            trained_model.history["val_loss"]
-        )
-        train_performance["precision"] = predict_callback_test.precision
-        train_performance["usage_weights"] = predict_callback_test.usage_weights
-        train_performance[
-            "published_precision"
-        ] = predict_callback_test.published_precision
-        train_performance[
-            "lowest_pub_precision"
-        ] = predict_callback_test.lowest_pub_precision
-        train_performance[
-            "lowest_norm_precision"
-        ] = predict_callback_test.lowest_norm_precision
-        train_performance["train_loss"] = np.array(trained_model.history["loss"])
-        train_performance["model"] = best_model
-        train_performance["best_parameters"] = best_params
-        return train_performance
-
-
-class PredictCallback(callbacks.Callback):
-    def __init__(
-        self,
-        test_data,
-        test_labels,
-        reverse_data_dictionary,
-        n_epochs,
-        usg_scores,
-        standard_connections,
-        lowest_tool_ids,
-    ):
-        self.test_data = test_data
-        self.test_labels = test_labels
-        self.reverse_data_dictionary = reverse_data_dictionary
-        self.precision = list()
-        self.usage_weights = list()
-        self.published_precision = list()
-        self.n_epochs = n_epochs
-        self.pred_usage_scores = usg_scores
-        self.standard_connections = standard_connections
-        self.lowest_tool_ids = lowest_tool_ids
-        self.lowest_pub_precision = list()
-        self.lowest_norm_precision = list()
-
-    def on_epoch_end(self, epoch, logs={}):
-        """
-        Compute absolute and compatible precision for test data
-        """
-        if len(self.test_data) > 0:
-            (
-                usage_weights,
-                precision,
-                precision_pub,
-                low_pub_prec,
-                low_norm_prec,
-                low_num,
-            ) = utils.verify_model(
-                self.model,
-                self.test_data,
-                self.test_labels,
-                self.reverse_data_dictionary,
-                self.pred_usage_scores,
-                self.standard_connections,
-                self.lowest_tool_ids,
-            )
-            self.precision.append(precision)
-            self.usage_weights.append(usage_weights)
-            self.published_precision.append(precision_pub)
-            self.lowest_pub_precision.append(low_pub_prec)
-            self.lowest_norm_precision.append(low_norm_prec)
-            print("Epoch %d usage weights: %s" % (epoch + 1, usage_weights))
-            print("Epoch %d normal precision: %s" % (epoch + 1, precision))
-            print("Epoch %d published precision: %s" % (epoch + 1, precision_pub))
-            print("Epoch %d lowest published precision: %s" % (epoch + 1, low_pub_prec))
-            print("Epoch %d lowest normal precision: %s" % (epoch + 1, low_norm_prec))
-            print(
-                "Epoch %d number of test samples with lowest tool ids: %s"
-                % (epoch + 1, low_num)
-            )
-
+import train_transformer
 
 if __name__ == "__main__":
     start_time = time.time()
     arg_parser = argparse.ArgumentParser()
-    arg_parser.add_argument(
-        "-wf", "--workflow_file", required=True, help="workflows tabular file"
-    )
-    arg_parser.add_argument(
-        "-tu", "--tool_usage_file", required=True, help="tool usage file"
-    )
-    arg_parser.add_argument(
-        "-om", "--output_model", required=True, help="trained model file"
-    )
+    arg_parser.add_argument("-wf", "--workflow_file", required=True, help="workflows tabular file")
+    arg_parser.add_argument("-tu", "--tool_usage_file", required=True, help="tool usage file")
     # data parameters
-    arg_parser.add_argument(
-        "-cd",
-        "--cutoff_date",
-        required=True,
-        help="earliest date for taking tool usage",
-    )
-    arg_parser.add_argument(
-        "-pl",
-        "--maximum_path_length",
-        required=True,
-        help="maximum length of tool path",
-    )
-    arg_parser.add_argument(
-        "-ep",
-        "--n_epochs",
-        required=True,
-        help="number of iterations to run to create model",
-    )
-    arg_parser.add_argument(
-        "-oe",
-        "--optimize_n_epochs",
-        required=True,
-        help="number of iterations to run to find best model parameters",
-    )
-    arg_parser.add_argument(
-        "-me",
-        "--max_evals",
-        required=True,
-        help="maximum number of configuration evaluations",
-    )
-    arg_parser.add_argument(
-        "-ts",
-        "--test_share",
-        required=True,
-        help="share of data to be used for testing",
-    )
+    arg_parser.add_argument("-cd", "--cutoff_date", required=True, help="earliest date for taking tool usage")
+    arg_parser.add_argument("-pl", "--maximum_path_length", required=True, help="maximum length of tool path")
+    arg_parser.add_argument("-om", "--output_model", required=True, help="trained model path")
     # neural network parameters
-    arg_parser.add_argument(
-        "-bs",
-        "--batch_size",
-        required=True,
-        help="size of the tranining batch i.e. the number of samples per batch",
-    )
-    arg_parser.add_argument(
-        "-ut", "--units", required=True, help="number of hidden recurrent units"
-    )
-    arg_parser.add_argument(
-        "-es",
-        "--embedding_size",
-        required=True,
-        help="size of the fixed vector learned for each tool",
-    )
-    arg_parser.add_argument(
-        "-dt", "--dropout", required=True, help="percentage of neurons to be dropped"
-    )
-    arg_parser.add_argument(
-        "-sd",
-        "--spatial_dropout",
-        required=True,
-        help="1d dropout used for embedding layer",
-    )
-    arg_parser.add_argument(
-        "-rd",
-        "--recurrent_dropout",
-        required=True,
-        help="dropout for the recurrent layers",
-    )
-    arg_parser.add_argument(
-        "-lr", "--learning_rate", required=True, help="learning rate"
-    )
+    arg_parser.add_argument("-ti", "--n_train_iter", required=True, help="Number of training iterations run to create model")
+    arg_parser.add_argument("-nhd", "--n_heads", required=True, help="Number of head in transformer's multi-head attention")
+    arg_parser.add_argument("-ed", "--n_embed_dim", required=True, help="Embedding dimension")
+    arg_parser.add_argument("-fd", "--n_feed_forward_dim", required=True, help="Feed forward network dimension")
+    arg_parser.add_argument("-dt", "--dropout", required=True, help="Percentage of neurons to be dropped")
+    arg_parser.add_argument("-lr", "--learning_rate", required=True, help="Learning rate")
+    arg_parser.add_argument("-ts", "--te_share", required=True, help="Share of data to be used for testing")
+    arg_parser.add_argument("-trbs", "--tr_batch_size", required=True, help="Train batch size")
+    arg_parser.add_argument("-trlg", "--tr_logging_step", required=True, help="Train logging frequency")
+    arg_parser.add_argument("-telg", "--te_logging_step", required=True, help="Test logging frequency")
+    arg_parser.add_argument("-tebs", "--te_batch_size", required=True, help="Test batch size")
     # get argument values
     args = vars(arg_parser.parse_args())
@@ -262,89 +38,48 @@
     workflows_path = args["workflow_file"]
     cutoff_date = args["cutoff_date"]
     maximum_path_length = int(args["maximum_path_length"])
+
+    n_train_iter = int(args["n_train_iter"])
+    te_share = float(args["te_share"])
+    tr_batch_size = int(args["tr_batch_size"])
+    te_batch_size = int(args["te_batch_size"])
+
+    n_heads = int(args["n_heads"])
+    feed_forward_dim = int(args["n_feed_forward_dim"])
+    embedding_dim = int(args["n_embed_dim"])
+    dropout = float(args["dropout"])
+    learning_rate = float(args["learning_rate"])
+    te_logging_step = int(args["te_logging_step"])
+    tr_logging_step = int(args["tr_logging_step"])
     trained_model_path = args["output_model"]
-    n_epochs = int(args["n_epochs"])
-    optimize_n_epochs = int(args["optimize_n_epochs"])
-    max_evals = int(args["max_evals"])
-    test_share = float(args["test_share"])
-    batch_size = args["batch_size"]
-    units = args["units"]
-    embedding_size = args["embedding_size"]
-    dropout = args["dropout"]
-    spatial_dropout = args["spatial_dropout"]
-    recurrent_dropout = args["recurrent_dropout"]
-    learning_rate = args["learning_rate"]
-    num_cpus = 16
     config = {
-        "cutoff_date": cutoff_date,
-        "maximum_path_length": maximum_path_length,
-        "n_epochs": n_epochs,
-        "optimize_n_epochs": optimize_n_epochs,
-        "max_evals": max_evals,
-        "test_share": test_share,
-        "batch_size": batch_size,
-        "units": units,
-        "embedding_size": embedding_size,
-        "dropout": dropout,
-        "spatial_dropout": spatial_dropout,
-        "recurrent_dropout": recurrent_dropout,
-        "learning_rate": learning_rate,
+        'cutoff_date': cutoff_date,
+        'maximum_path_length': maximum_path_length,
+        'n_train_iter': n_train_iter,
+        'n_heads': n_heads,
+        'feed_forward_dim': feed_forward_dim,
+        'embedding_dim': embedding_dim,
+        'dropout': dropout,
+        'learning_rate': learning_rate,
+        'te_share': te_share,
+        'te_logging_step': te_logging_step,
+        'tr_logging_step': tr_logging_step,
+        'tr_batch_size': tr_batch_size,
+        'te_batch_size': te_batch_size,
+        'trained_model_path': trained_model_path
     }
-
+    print("Preprocessing workflows...")
     # Extract and process workflows
     connections = extract_workflow_connections.ExtractWorkflowConnections()
-    (
-        workflow_paths,
-        compatible_next_tools,
-        standard_connections,
-    ) = connections.read_tabular_file(workflows_path)
+    # Process raw workflow file
+    wf_dataframe, usage_df = connections.process_raw_files(workflows_path, tool_usage_path, config)
+    workflow_paths, pub_conn = connections.read_tabular_file(wf_dataframe, config)
    # Process the paths from workflows
     print("Dividing data...")
-    data = prepare_data.PrepareData(maximum_path_length, test_share)
-    (
-        train_data,
-        train_labels,
-        test_data,
-        test_labels,
-        data_dictionary,
-        reverse_dictionary,
-        class_weights,
-        usage_pred,
-        train_tool_freq,
-        tool_tr_samples,
-    ) = data.get_data_labels_matrices(
-        workflow_paths,
-        tool_usage_path,
-        cutoff_date,
-        compatible_next_tools,
-        standard_connections,
-    )
-    # find the best model and start training
-    predict_tool = PredictTool(num_cpus)
-    # start training with weighted classes
-    print("Training with weighted classes and samples ...")
-    results_weighted = predict_tool.find_train_best_network(
-        config,
-        reverse_dictionary,
-        train_data,
-        train_labels,
-        test_data,
-        test_labels,
-        n_epochs,
-        class_weights,
-        usage_pred,
-        standard_connections,
-        train_tool_freq,
-        tool_tr_samples,
-    )
-    utils.save_model(
-        results_weighted,
-        data_dictionary,
-        compatible_next_tools,
-        trained_model_path,
-        class_weights,
-        standard_connections,
-    )
+    data = prepare_data.PrepareData(maximum_path_length, te_share)
+    train_data, train_labels, test_data, test_labels, f_dict, r_dict, c_wts, c_tools, tr_tool_freq = data.get_data_labels_matrices(workflow_paths, usage_df, cutoff_date, pub_conn)
+    print(train_data.shape, train_labels.shape, test_data.shape, test_labels.shape)
+    train_transformer.create_enc_transformer(train_data, train_labels, test_data, test_labels, f_dict, r_dict, c_wts, c_tools, pub_conn, tr_tool_freq, config)
     end_time = time.time()
     print("Program finished in %s seconds" % str(end_time - start_time))
```
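For orientation, the sketch below illustrates the kind of encoder-style Transformer that the new configuration keys (`n_heads`, `embedding_dim`, `feed_forward_dim`, `dropout`, `learning_rate`) describe. It is a minimal, hypothetical example, not the repository's `train_transformer.create_enc_transformer`: the layer layout, the sigmoid multi-label output and every numeric value are illustrative assumptions only.

```python
# Hypothetical sketch of an encoder-style Transformer for next-tool prediction.
# All values below are placeholders; the actual model lives in train_transformer.py.
import tensorflow as tf
from tensorflow.keras import layers

vocab_size = 2500    # number of distinct tools (placeholder)
max_len = 25         # tool-path length, cf. --maximum_path_length (placeholder)
embed_dim = 128      # cf. --n_embed_dim
n_heads = 4          # cf. --n_heads
ff_dim = 128         # cf. --n_feed_forward_dim
dropout = 0.2        # cf. --dropout
learning_rate = 1e-3 # cf. --learning_rate


class TokenAndPositionEmbedding(layers.Layer):
    """Embed each tool id and add a learned embedding of its position in the path."""

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        positions = tf.range(start=0, limit=tf.shape(x)[-1], delta=1)
        return self.token_emb(x) + self.pos_emb(positions)


inputs = layers.Input(shape=(max_len,))
x = TokenAndPositionEmbedding(max_len, vocab_size, embed_dim)(inputs)

# one encoder block: multi-head self-attention and a feed-forward network,
# each with dropout, a residual connection and layer normalisation
attn = layers.MultiHeadAttention(num_heads=n_heads, key_dim=embed_dim)(x, x)
x = layers.LayerNormalization(epsilon=1e-6)(x + layers.Dropout(dropout)(attn))
ffn = layers.Dense(ff_dim, activation="relu")(x)
ffn = layers.Dense(embed_dim)(ffn)
x = layers.LayerNormalization(epsilon=1e-6)(x + layers.Dropout(dropout)(ffn))

# pool over the tool path and score every tool as a possible next step
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(dropout)(x)
outputs = layers.Dense(vocab_size, activation="sigmoid")(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
    loss="binary_crossentropy",
)
model.summary()
```

Under the same caveat, a run of the rewritten script would supply all of the new required flags, for example `python main.py -wf workflows.tsv -tu tool_usage.tsv -om trained_model.h5 -cd 2022-01-01 -pl 25 -ti 20 -nhd 4 -ed 128 -fd 128 -dt 0.2 -lr 0.001 -ts 0.2 -trbs 128 -trlg 10 -telg 10 -tebs 128`; the file names and values here are placeholders, not recommended settings.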