diff utils.py @ 0:9bf25dbe00ad draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
author bgruening
date Wed, 28 Aug 2019 07:19:38 -0400
parents
children 76251d1ccdcc
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils.py	Wed Aug 28 07:19:38 2019 -0400
@@ -0,0 +1,251 @@
+import os
+import numpy as np
+import json
+import h5py
+
+from keras.models import model_from_json, Sequential
+from keras.layers import Dense, GRU, Dropout
+from keras.layers.embeddings import Embedding
+from keras.layers.core import SpatialDropout1D
+from keras.optimizers import RMSprop
+from keras import backend as K
+
+
+def read_file(file_path):
+    """
+    Read a JSON file and return its parsed content
+    """
+    with open(file_path, "r") as json_file:
+        file_content = json.loads(json_file.read())
+    return file_content
+
+
+def write_file(file_path, content):
+    """
+    Write content to a file as JSON
+    """
+    remove_file(file_path)
+    with open(file_path, "w") as json_file:
+        json_file.write(json.dumps(content))
+
+
+def save_processed_workflows(file_path, unique_paths):
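+    """
+    Save the unique workflow paths to a file, one path per line
+    """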
+    workflow_paths_unique = ""
+    for path in unique_paths:
+        workflow_paths_unique += path + "\n"
+    with open(file_path, "w") as workflows_file:
+        workflows_file.write(workflow_paths_unique)
+
+
+def load_saved_model(model_config, model_weights):
+    """
+    Load a trained model from its saved network configuration and weights
+    """
+    # load the network
+    loaded_model = model_from_json(model_config)
+    # load the saved weights into the model
+    loaded_model.set_weights(model_weights)
+    return loaded_model
+
+
+def format_tool_id(tool_link):
+    """
+    Extract tool id from tool link
+    """
+    tool_id_split = tool_link.split("/")
+    tool_id = tool_id_split[-2] if len(tool_id_split) > 1 else tool_link
+    return tool_id
+
+
+def get_HDF5(hf, d_key):
+    """
+    Read a dataset (train or test data) from an h5 file
+    """
+    return hf.get(d_key).value
+
+
+def save_HDF5(hf_file, d_key, data, d_type=""):
+    """
+    Save a dataset into an h5 file
+    """
+    if (d_type == 'json'):
+        data = json.dumps(data)
+    hf_file.create_dataset(d_key, data=data)
+
+
+def set_trained_model(dump_file, model_values):
+    """
+    Create an h5 file with the trained weights and associated dicts
+    """
+    hf_file = h5py.File(dump_file, 'w')
+    for key in model_values:
+        value = model_values[key]
+        if key == 'model_weights':
+            for idx, item in enumerate(value):
+                w_key = "weight_" + str(idx)
+                if w_key in hf_file:
+                    hf_file.modify(w_key, item)
+                else:
+                    hf_file.create_dataset(w_key, data=item)
+        else:
+            if key in hf_file:
+                hf_file.modify(key, json.dumps(value))
+            else:
+                hf_file.create_dataset(key, data=json.dumps(value))
+    hf_file.close()
+
+
+def remove_file(file_path):
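+    """
+    Remove a file if it exists
+    """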
+    if os.path.exists(file_path):
+        os.remove(file_path)
+
+
+def extract_configuration(config_object):
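+    """
+    Collect the loss and parameter configuration from each entry of the configuration object
+    """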
+    config_loss = dict()
+    for index, item in enumerate(config_object):
+        config_loss[index] = list()
+        d_config = dict()
+        d_config['loss'] = item['result']['loss']
+        d_config['params_config'] = item['misc']['vals']
+        config_loss[index].append(d_config)
+    return config_loss
+
+
+def get_best_parameters(mdl_dict):
+    """
+    Get the best parameter values, falling back to defaults
+    """
+    lr = float(mdl_dict.get("learning_rate", "0.001"))
+    embedding_size = int(mdl_dict.get("embedding_size", "512"))
+    dropout = float(mdl_dict.get("dropout", "0.2"))
+    recurrent_dropout = float(mdl_dict.get("recurrent_dropout", "0.2"))
+    spatial_dropout = float(mdl_dict.get("spatial_dropout", "0.2"))
+    units = int(mdl_dict.get("units", "512"))
+    batch_size = int(mdl_dict.get("batch_size", "512"))
+    activation_recurrent = mdl_dict.get("activation_recurrent", "elu")
+    activation_output = mdl_dict.get("activation_output", "sigmoid")
+
+    return {
+        "lr": lr,
+        "embedding_size": embedding_size,
+        "dropout": dropout,
+        "recurrent_dropout": recurrent_dropout,
+        "spatial_dropout": spatial_dropout,
+        "units": units,
+        "batch_size": batch_size,
+        "activation_recurrent": activation_recurrent,
+        "activation_output": activation_output,
+    }
+
+
+def weighted_loss(class_weights):
+    """
+    Create a weighted loss function that penalises the misclassification
+    of more heavily used classes more strongly
+    """
+    weight_values = list(class_weights.values())
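+    # assumes the class weights dict is ordered by class index so the weights align with the output dimensions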
+
+    def weighted_binary_crossentropy(y_true, y_pred):
+        # add another dimension to compute dot product
+        expanded_weights = K.expand_dims(weight_values, axis=-1)
+        return K.dot(K.binary_crossentropy(y_true, y_pred), expanded_weights)
+    return weighted_binary_crossentropy
+
+
+def set_recurrent_network(mdl_dict, reverse_dictionary, class_weights):
+    """
+    Create an RNN and set its parameters
+    """
+    dimensions = len(reverse_dictionary) + 1
+    model_params = get_best_parameters(mdl_dict)
+
+    # define the architecture of the neural network
+    model = Sequential()
+    model.add(Embedding(dimensions, model_params["embedding_size"], mask_zero=True))
+    model.add(SpatialDropout1D(model_params["spatial_dropout"]))
+    model.add(GRU(model_params["units"],
+                  dropout=model_params["spatial_dropout"],
+                  recurrent_dropout=model_params["recurrent_dropout"],
+                  activation=model_params["activation_recurrent"],
+                  return_sequences=True))
+    model.add(Dropout(model_params["dropout"]))
+    model.add(GRU(model_params["units"],
+                  dropout=model_params["spatial_dropout"],
+                  recurrent_dropout=model_params["recurrent_dropout"],
+                  activation=model_params["activation_recurrent"],
+                  return_sequences=False))
+    model.add(Dropout(model_params["dropout"]))
+    model.add(Dense(dimensions, activation=model_params["activation_output"]))
+    optimizer = RMSprop(lr=model_params["lr"])
+    model.compile(loss=weighted_loss(class_weights), optimizer=optimizer)
+    return model, model_params
+
+
+def compute_precision(model, x, y, reverse_data_dictionary, next_compatible_tools, usage_scores, actual_classes_pos, topk):
+    """
+    Compute absolute precision and the mean usage score of correctly predicted tools
+    """
+    absolute_precision = 0.0
+    test_sample = np.reshape(x, (1, len(x)))
+
+    # predict next tools for a test path
+    prediction = model.predict(test_sample, verbose=0)
+
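+    # total number of output dimensions (one per tool, including the padding index 0)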
+    nw_dimension = prediction.shape[1]
+
+    # flatten the predictions; index 0 corresponds to no tool and is removed below
+    prediction = np.reshape(prediction, (nw_dimension,))
+
+    prediction_pos = np.argsort(prediction, axis=-1)
+    topk_prediction_pos = prediction_pos[-topk:]
+
+    # remove index 0 (no tool) from the predicted list of tool positions
+    topk_prediction_pos = [x for x in topk_prediction_pos if x > 0]
+
+    # read tool names using reverse dictionary
+    actual_next_tool_names = [reverse_data_dictionary[int(tool_pos)] for tool_pos in actual_classes_pos]
+    top_predicted_next_tool_names = [reverse_data_dictionary[int(tool_pos)] for tool_pos in topk_prediction_pos]
+
+    # compute the usage scores of the correctly predicted tools
+    mean_usg_score = 0
+    usg_wt_scores = list()
+    for t_id in topk_prediction_pos:
+        t_name = reverse_data_dictionary[int(t_id)]
+        if t_id in usage_scores and t_name in actual_next_tool_names:
+            usg_wt_scores.append(np.log(usage_scores[t_id] + 1.0))
+    if len(usg_wt_scores) > 0:
+        mean_usg_score = np.sum(usg_wt_scores) / float(topk)
+    false_positives = [tool_name for tool_name in top_predicted_next_tool_names if tool_name not in actual_next_tool_names]
+    absolute_precision = 1 - (len(false_positives) / float(topk))
+    return mean_usg_score, absolute_precision
+
+
+def verify_model(model, x, y, reverse_data_dictionary, next_compatible_tools, usage_scores, topk_list=[1, 2, 3]):
+    """
+    Verify the model on test data
+    """
+    print("Evaluating performance on test data...")
+    print("Test data size: %d" % len(y))
+    size = y.shape[0]
+    precision = np.zeros([len(y), len(topk_list)])
+    usage_weights = np.zeros([len(y), len(topk_list)])
+    # loop over all the test samples and find prediction precision
+    for i in range(size):
+        actual_classes_pos = np.where(y[i] > 0)[0]
+        for index, abs_topk in enumerate(topk_list):
+            abs_mean_usg_score, absolute_precision = compute_precision(
+                model, x[i, :], y, reverse_data_dictionary, next_compatible_tools,
+                usage_scores, actual_classes_pos, abs_topk)
+            precision[i][index] = absolute_precision
+            usage_weights[i][index] = abs_mean_usg_score
+    mean_precision = np.mean(precision, axis=0)
+    mean_usage = np.mean(usage_weights, axis=0)
+    return mean_precision, mean_usage
+
+
+def save_model(results, data_dictionary, compatible_next_tools, trained_model_path, class_weights):
+    """
+    Save the trained model with its weights, parameters and associated dictionaries in an h5 file
+    """
+    trained_model = results["model"]
+    best_model_parameters = results["best_parameters"]
+    model_config = trained_model.to_json()
+    model_weights = trained_model.get_weights()
+
+    model_values = {
+        'data_dictionary': data_dictionary,
+        'model_config': model_config,
+        'best_parameters': best_model_parameters,
+        'model_weights': model_weights,
+        "compatible_tools": compatible_next_tools,
+        "class_weights": class_weights
+    }
+    set_trained_model(trained_model_path, model_values)
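+
+
+# A minimal usage sketch (hypothetical names; a training script is assumed to
+# provide `results`, `data_dict`, `compatible_tools`, `usage_scores` and
+# `class_weights`):
+#   model, params = set_recurrent_network(mdl_dict, reverse_dict, class_weights)
+#   precision, usage = verify_model(model, test_x, test_y, reverse_dict, compatible_tools, usage_scores)
+#   save_model(results, data_dict, compatible_tools, "trained_model.hdf5", class_weights)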