comparison utils.py @ 6:e94dc7945639 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
author | bgruening |
---|---|
date | Sun, 16 Oct 2022 11:52:10 +0000 |
parents | 4f7e6612906b |
children | |
5:4f7e6612906b | 6:e94dc7945639 |
---|---|
1 import json | 1 import json |
2 import os | |
2 import random | 3 import random |
3 | 4 |
4 import h5py | 5 import h5py |
5 import numpy as np | 6 import numpy as np |
7 import pandas as pd | |
6 import tensorflow as tf | 8 import tensorflow as tf |
7 from numpy.random import choice | 9 |
8 from tensorflow.keras import backend | 10 binary_ce = tf.keras.losses.BinaryCrossentropy() |
11 binary_acc = tf.keras.metrics.BinaryAccuracy() | |
12 categorical_ce = tf.keras.metrics.CategoricalCrossentropy(from_logits=True) | |
9 | 13 |
10 | 14 |
11 def read_file(file_path): | 15 def read_file(file_path): |
12 """ | 16 """ |
13 Read a file | 17 Read a file |
14 """ | 18 """ |
15 with open(file_path, "r") as json_file: | 19 with open(file_path, "r") as json_file: |
16 file_content = json.loads(json_file.read()) | 20 file_content = json.loads(json_file.read()) |
17 return file_content | 21 return file_content |
18 | 22 |
19 | 23 |
24 def write_file(file_path, content): | |
25 """ | |
26 Write a file | |
27 """ | |
28 remove_file(file_path) | |
29 with open(file_path, "w") as json_file: | |
30 json_file.write(json.dumps(content)) | |
31 | |
32 | |
33 def save_h5_data(inp, tar, filename): | |
34 hf_file = h5py.File(filename, 'w') | |
35 hf_file.create_dataset("input", data=inp) | |
36 hf_file.create_dataset("target", data=tar) | |
37 hf_file.close() | |
38 | |
39 | |
40 def get_low_freq_te_samples(te_data, te_target, tr_freq_dict): | |
41 lowest_tool_te_ids = list() | |
42 lowest_t_ids = get_lowest_tools(tr_freq_dict) | |
43 for i, te_labels in enumerate(te_target): | |
44 tools_pos = np.where(te_labels > 0)[0] | |
45 tools_pos = [str(int(item)) for item in tools_pos] | |
46 intersection = list(set(tools_pos).intersection(set(lowest_t_ids))) | |
47 if len(intersection) > 0: | |
48 lowest_tool_te_ids.append(i) | |
49 lowest_t_ids = [item for item in lowest_t_ids if item not in intersection] | |
50 return lowest_tool_te_ids | |
51 | |
52 | |
53 def save_processed_workflows(file_path, unique_paths): | |
54 workflow_paths_unique = "" | |
55 for path in unique_paths: | |
56 workflow_paths_unique += path + "\n" | |
57 with open(file_path, "w") as workflows_file: | |
58 workflows_file.write(workflow_paths_unique) | |
59 | |
60 | |
20 def format_tool_id(tool_link): | 61 def format_tool_id(tool_link): |
21 """ | 62 """ |
22 Extract tool id from tool link | 63 Extract tool id from tool link |
23 """ | 64 """ |
24 tool_id_split = tool_link.split("/") | 65 tool_id_split = tool_link.split("/") |
25 tool_id = tool_id_split[-2] if len(tool_id_split) > 1 else tool_link | 66 tool_id = tool_id_split[-2] if len(tool_id_split) > 1 else tool_link |
26 return tool_id | 67 return tool_id |
27 | 68 |
28 | 69 |
29 def set_trained_model(dump_file, model_values): | 70 def save_model_file(model, r_dict, c_wts, c_tools, s_conn, model_file): |
30 """ | 71 model.save_weights(model_file, save_format="h5") |
31 Create an h5 file with the trained weights and associated dicts | 72 hf_file = h5py.File(model_file, 'r+') |
32 """ | 73 model_values = { |
33 hf_file = h5py.File(dump_file, "w") | 74 "reverse_dict": r_dict, |
34 for key in model_values: | 75 "class_weights": c_wts, |
35 value = model_values[key] | 76 "compatible_tools": c_tools, |
36 if key == "model_weights": | 77 "standard_connections": s_conn |
37 for idx, item in enumerate(value): | 78 } |
38 w_key = "weight_" + str(idx) | 79 for k in model_values: |
39 if w_key in hf_file: | 80 hf_file.create_dataset(k, data=json.dumps(model_values[k])) |
40 hf_file.modify(w_key, item) | 81 hf_file.close() |
41 else: | 82 |
42 hf_file.create_dataset(w_key, data=item) | 83 |
84 def remove_file(file_path): | |
85 if os.path.exists(file_path): | |
86 os.remove(file_path) | |
87 | |
88 | |
89 def verify_oversampling_freq(oversampled_tr_data, rev_dict): | |
90 """ | |
91 Compute the frequency of tool sequences after oversampling | |
92 """ | |
93 freq_dict = dict() | |
94 freq_dict_names = dict() | |
95 for tr_data in oversampled_tr_data: | |
96 t_pos = np.where(tr_data > 0)[0] | |
97 last_tool_id = str(int(tr_data[t_pos[-1]])) | |
98 if last_tool_id not in freq_dict: | |
99 freq_dict[last_tool_id] = 0 | |
100 freq_dict_names[rev_dict[int(last_tool_id)]] = 0 | |
101 freq_dict[last_tool_id] += 1 | |
102 freq_dict_names[rev_dict[int(last_tool_id)]] += 1 | |
103 s_freq = dict(sorted(freq_dict_names.items(), key=lambda kv: kv[1], reverse=True)) | |
104 return s_freq | |
105 | |
106 | |
107 def collect_sampled_tool_freq(collected_dict, c_freq): | |
108 for t in c_freq: | |
109 if t not in collected_dict: | |
110 collected_dict[t] = int(c_freq[t]) | |
43 else: | 111 else: |
44 if key in hf_file: | 112 collected_dict[t] += int(c_freq[t]) |
45 hf_file.modify(key, json.dumps(value)) | 113 return collected_dict |
46 else: | 114 |
47 hf_file.create_dataset(key, data=json.dumps(value)) | 115 |
48 hf_file.close() | 116 def save_data_as_dict(f_dict, r_dict, inp, tar, save_path): |
49 | 117 inp_tar = dict() |
50 | 118 for index, (i, t) in enumerate(zip(inp, tar)): |
51 def weighted_loss(class_weights): | 119 i_pos = np.where(i > 0)[0] |
52 """ | 120 i_seq = ",".join([str(int(item)) for item in i[1:i_pos[-1] + 1]]) |
53 Create a weighted loss function. Penalise the misclassification | 121 t_pos = np.where(t > 0)[0] |
54 of classes more with the higher usage | 122 t_seq = ",".join([str(int(item)) for item in t[1:t_pos[-1] + 1]]) |
55 """ | 123 if i_seq not in inp_tar: |
56 weight_values = list(class_weights.values()) | 124 inp_tar[i_seq] = list() |
57 weight_values.extend(weight_values) | 125 inp_tar[i_seq].append(t_seq) |
58 | 126 size = 0 |
59 def weighted_binary_crossentropy(y_true, y_pred): | 127 for item in inp_tar: |
60 # add another dimension to compute dot product | 128 size += len(inp_tar[item]) |
61 expanded_weights = tf.expand_dims(weight_values, axis=-1) | 129 print("Size saved file: ", size) |
62 bce = backend.binary_crossentropy(y_true, y_pred) | 130 write_file(save_path, inp_tar) |
63 return backend.dot(bce, expanded_weights) | 131 |
64 | 132 |
65 return weighted_binary_crossentropy | 133 def read_train_test(datapath): |
66 | 134 file_obj = h5py.File(datapath, 'r') |
67 | 135 data_input = np.array(file_obj["input"]) |
68 def balanced_sample_generator( | 136 data_target = np.array(file_obj["target"]) |
69 train_data, train_labels, batch_size, l_tool_tr_samples, reverse_dictionary | 137 return data_input, data_target |
70 ): | 138 |
71 while True: | 139 |
72 dimension = train_data.shape[1] | 140 def sample_balanced_tr_y(x_seqs, y_labels, ulabels_tr_y_dict, b_size, tr_t_freq, prev_sel_tools): |
73 n_classes = train_labels.shape[1] | 141 batch_y_tools = list(ulabels_tr_y_dict.keys()) |
74 tool_ids = list(l_tool_tr_samples.keys()) | 142 random.shuffle(batch_y_tools) |
75 random.shuffle(tool_ids) | 143 label_tools = list() |
76 generator_batch_data = np.zeros([batch_size, dimension]) | 144 rand_batch_indices = list() |
77 generator_batch_labels = np.zeros([batch_size, n_classes]) | 145 sel_tools = list() |
78 generated_tool_ids = choice(tool_ids, batch_size) | 146 |
79 for i in range(batch_size): | 147 unselected_tools = [t for t in batch_y_tools if t not in prev_sel_tools] |
80 random_toolid = generated_tool_ids[i] | 148 rand_selected_tools = unselected_tools[:b_size] |
81 sample_indices = l_tool_tr_samples[str(random_toolid)] | 149 |
82 random_index = random.sample(range(0, len(sample_indices)), 1)[0] | 150 for l_tool in rand_selected_tools: |
83 random_tr_index = sample_indices[random_index] | 151 seq_indices = ulabels_tr_y_dict[l_tool] |
84 generator_batch_data[i] = train_data[random_tr_index] | 152 random.shuffle(seq_indices) |
85 generator_batch_labels[i] = train_labels[random_tr_index] | 153 rand_s_index = np.random.randint(0, len(seq_indices), 1)[0] |
86 yield generator_batch_data, generator_batch_labels | 154 rand_sample = seq_indices[rand_s_index] |
87 | 155 sel_tools.append(l_tool) |
88 | 156 rand_batch_indices.append(rand_sample) |
89 def compute_precision( | 157 label_tools.append(l_tool) |
90 model, | 158 |
91 x, | 159 x_batch_train = x_seqs[rand_batch_indices] |
92 y, | 160 y_batch_train = y_labels[rand_batch_indices] |
93 reverse_data_dictionary, | 161 |
94 usage_scores, | 162 unrolled_x = tf.convert_to_tensor(x_batch_train, dtype=tf.int64) |
95 actual_classes_pos, | 163 unrolled_y = tf.convert_to_tensor(y_batch_train, dtype=tf.int64) |
96 topk, | 164 return unrolled_x, unrolled_y, sel_tools |
97 standard_conn, | 165 |
98 last_tool_id, | 166 |
99 lowest_tool_ids, | 167 def sample_balanced_te_y(x_seqs, y_labels, ulabels_tr_y_dict, b_size): |
100 ): | 168 batch_y_tools = list(ulabels_tr_y_dict.keys()) |
101 """ | 169 random.shuffle(batch_y_tools) |
102 Compute absolute and compatible precision | 170 label_tools = list() |
103 """ | 171 rand_batch_indices = list() |
104 pred_t_name = "" | 172 sel_tools = list() |
105 top_precision = 0.0 | 173 for l_tool in batch_y_tools: |
106 mean_usage = 0.0 | 174 seq_indices = ulabels_tr_y_dict[l_tool] |
107 usage_wt_score = list() | 175 random.shuffle(seq_indices) |
108 pub_precision = 0.0 | 176 rand_s_index = np.random.randint(0, len(seq_indices), 1)[0] |
109 lowest_pub_prec = 0.0 | 177 rand_sample = seq_indices[rand_s_index] |
110 lowest_norm_prec = 0.0 | 178 sel_tools.append(l_tool) |
111 pub_tools = list() | 179 if rand_sample not in rand_batch_indices: |
112 actual_next_tool_names = list() | 180 rand_batch_indices.append(rand_sample) |
113 test_sample = np.reshape(x, (1, len(x))) | 181 label_tools.append(l_tool) |
114 | 182 if len(rand_batch_indices) == b_size: |
115 # predict next tools for a test path | 183 break |
116 prediction = model.predict(test_sample, verbose=0) | 184 x_batch_train = x_seqs[rand_batch_indices] |
117 | 185 y_batch_train = y_labels[rand_batch_indices] |
118 # divide the predicted vector into two halves - one for published and | 186 |
119 # another for normal workflows | 187 unrolled_x = tf.convert_to_tensor(x_batch_train, dtype=tf.int64) |
120 nw_dimension = prediction.shape[1] | 188 unrolled_y = tf.convert_to_tensor(y_batch_train, dtype=tf.int64) |
121 half_len = int(nw_dimension / 2) | 189 return unrolled_x, unrolled_y, sel_tools |
122 | 190 |
123 # predict tools | 191 |
124 prediction = np.reshape(prediction, (nw_dimension,)) | 192 def get_u_tr_labels(y_tr): |
125 # get predictions of tools from published workflows | 193 labels = list() |
126 standard_pred = prediction[:half_len] | 194 labels_pos_dict = dict() |
127 # get predictions of tools from normal workflows | 195 for i, item in enumerate(y_tr): |
128 normal_pred = prediction[half_len:] | 196 label_pos = np.where(item > 0)[0] |
129 | 197 labels.extend(label_pos) |
130 standard_prediction_pos = np.argsort(standard_pred, axis=-1) | 198 for label in label_pos: |
131 standard_topk_prediction_pos = standard_prediction_pos[-topk] | 199 if label not in labels_pos_dict: |
132 | 200 labels_pos_dict[label] = list() |
133 normal_prediction_pos = np.argsort(normal_pred, axis=-1) | 201 labels_pos_dict[label].append(i) |
134 normal_topk_prediction_pos = normal_prediction_pos[-topk] | 202 u_labels = list(set(labels)) |
135 | 203 for item in labels_pos_dict: |
136 # get true tools names | 204 labels_pos_dict[item] = list(set(labels_pos_dict[item])) |
137 for a_t_pos in actual_classes_pos: | 205 return u_labels, labels_pos_dict |
138 if a_t_pos > half_len: | 206 |
139 t_name = reverse_data_dictionary[int(a_t_pos - half_len)] | 207 |
140 else: | 208 def compute_loss(y_true, y_pred, class_weights=None): |
141 t_name = reverse_data_dictionary[int(a_t_pos)] | 209 y_true = tf.cast(y_true, dtype=tf.float32) |
142 actual_next_tool_names.append(t_name) | 210 loss = binary_ce(y_true, y_pred) |
143 last_tool_name = reverse_data_dictionary[x[-1]] | 211 categorical_loss = categorical_ce(y_true, y_pred) |
144 # compute scores for published recommendations | 212 if class_weights is None: |
145 if standard_topk_prediction_pos in reverse_data_dictionary: | 213 return tf.reduce_mean(loss), categorical_loss |
146 pred_t_name = reverse_data_dictionary[int(standard_topk_prediction_pos)] | 214 return tf.tensordot(loss, class_weights, axes=1), categorical_loss |
147 if last_tool_name in standard_conn: | 215 |
148 pub_tools = standard_conn[last_tool_name] | 216 |
149 if pred_t_name in pub_tools: | 217 def compute_acc(y_true, y_pred): |
150 pub_precision = 1.0 | 218 return binary_acc(y_true, y_pred) |
151 # count precision only when there is actually true published tools | 219 |
152 if last_tool_id in lowest_tool_ids: | 220 |
153 lowest_pub_prec = 1.0 | 221 def validate_model(te_x, te_y, te_batch_size, model, f_dict, r_dict, ulabels_te_dict, tr_labels, lowest_t_ids): |
154 else: | 222 te_x_batch, y_train_batch, _ = sample_balanced_te_y(te_x, te_y, ulabels_te_dict, te_batch_size) |
155 lowest_pub_prec = np.nan | 223 print("Total test data size: ", te_x.shape, te_y.shape) |
156 if standard_topk_prediction_pos in usage_scores: | 224 print("Batch test data size: ", te_x_batch.shape, y_train_batch.shape) |
157 usage_wt_score.append( | 225 te_pred_batch, _ = model(te_x_batch, training=False) |
158 np.log(usage_scores[standard_topk_prediction_pos] + 1.0) | 226 test_err, _ = compute_loss(y_train_batch, te_pred_batch) |
159 ) | 227 print("Test loss:") |
160 else: | 228 print(test_err.numpy()) |
161 # count precision only when there is actually true published tools | 229 print("Test finished") |
162 # else set to np.nan. Set to 0 only when there is wrong prediction | |
163 pub_precision = np.nan | |
164 lowest_pub_prec = np.nan | |
165 # compute scores for normal recommendations | |
166 if normal_topk_prediction_pos in reverse_data_dictionary: | |
167 pred_t_name = reverse_data_dictionary[int(normal_topk_prediction_pos)] | |
168 if pred_t_name in actual_next_tool_names: | |
169 if normal_topk_prediction_pos in usage_scores: | |
170 usage_wt_score.append( | |
171 np.log(usage_scores[normal_topk_prediction_pos] + 1.0) | |
172 ) | |
173 top_precision = 1.0 | |
174 if last_tool_id in lowest_tool_ids: | |
175 lowest_norm_prec = 1.0 | |
176 else: | |
177 lowest_norm_prec = np.nan | |
178 if len(usage_wt_score) > 0: | |
179 mean_usage = np.mean(usage_wt_score) | |
180 return mean_usage, top_precision, pub_precision, lowest_pub_prec, lowest_norm_prec | |
181 | 230 |
182 | 231 |
183 def get_lowest_tools(l_tool_freq, fraction=0.25): | 232 def get_lowest_tools(l_tool_freq, fraction=0.25): |
184 l_tool_freq = dict(sorted(l_tool_freq.items(), key=lambda kv: kv[1], reverse=True)) | 233 l_tool_freq = dict(sorted(l_tool_freq.items(), key=lambda kv: kv[1], reverse=True)) |
185 tool_ids = list(l_tool_freq.keys()) | 234 tool_ids = list(l_tool_freq.keys()) |
186 lowest_ids = tool_ids[-int(len(tool_ids) * fraction):] | 235 lowest_ids = tool_ids[-int(len(tool_ids) * fraction):] |
187 return lowest_ids | 236 return lowest_ids |
188 | 237 |
189 | 238 |
190 def verify_model( | 239 def remove_pipe(file_path): |
191 model, | 240 dataframe = pd.read_csv(file_path, sep="|", header=None) |
192 x, | 241 dataframe = dataframe[1:len(dataframe.index) - 1] |
193 y, | 242 return dataframe[1:] |
194 reverse_data_dictionary, | |
195 usage_scores, | |
196 standard_conn, | |
197 lowest_tool_ids, | |
198 topk_list=[1, 2, 3], | |
199 ): | |
200 """ | |
201 Verify the model on test data | |
202 """ | |
203 print("Evaluating performance on test data...") | |
204 print("Test data size: %d" % len(y)) | |
205 size = y.shape[0] | |
206 precision = np.zeros([len(y), len(topk_list)]) | |
207 usage_weights = np.zeros([len(y), len(topk_list)]) | |
208 epo_pub_prec = np.zeros([len(y), len(topk_list)]) | |
209 epo_lowest_tools_pub_prec = list() | |
210 epo_lowest_tools_norm_prec = list() | |
211 lowest_counter = 0 | |
212 # loop over all the test samples and find prediction precision | |
213 for i in range(size): | |
214 lowest_pub_topk = list() | |
215 lowest_norm_topk = list() | |
216 actual_classes_pos = np.where(y[i] > 0)[0] | |
217 test_sample = x[i, :] | |
218 last_tool_id = str(int(test_sample[-1])) | |
219 for index, abs_topk in enumerate(topk_list): | |
220 ( | |
221 usg_wt_score, | |
222 absolute_precision, | |
223 pub_prec, | |
224 lowest_p_prec, | |
225 lowest_n_prec, | |
226 ) = compute_precision( | |
227 model, | |
228 test_sample, | |
229 y, | |
230 reverse_data_dictionary, | |
231 usage_scores, | |
232 actual_classes_pos, | |
233 abs_topk, | |
234 standard_conn, | |
235 last_tool_id, | |
236 lowest_tool_ids, | |
237 ) | |
238 precision[i][index] = absolute_precision | |
239 usage_weights[i][index] = usg_wt_score | |
240 epo_pub_prec[i][index] = pub_prec | |
241 lowest_pub_topk.append(lowest_p_prec) | |
242 lowest_norm_topk.append(lowest_n_prec) | |
243 epo_lowest_tools_pub_prec.append(lowest_pub_topk) | |
244 epo_lowest_tools_norm_prec.append(lowest_norm_topk) | |
245 if last_tool_id in lowest_tool_ids: | |
246 lowest_counter += 1 | |
247 mean_precision = np.mean(precision, axis=0) | |
248 mean_usage = np.mean(usage_weights, axis=0) | |
249 mean_pub_prec = np.nanmean(epo_pub_prec, axis=0) | |
250 mean_lowest_pub_prec = np.nanmean(epo_lowest_tools_pub_prec, axis=0) | |
251 mean_lowest_norm_prec = np.nanmean(epo_lowest_tools_norm_prec, axis=0) | |
252 return ( | |
253 mean_usage, | |
254 mean_precision, | |
255 mean_pub_prec, | |
256 mean_lowest_pub_prec, | |
257 mean_lowest_norm_prec, | |
258 lowest_counter, | |
259 ) | |
260 | |
261 | |
262 def save_model( | |
263 results, | |
264 data_dictionary, | |
265 compatible_next_tools, | |
266 trained_model_path, | |
267 class_weights, | |
268 standard_connections, | |
269 ): | |
270 # save files | |
271 trained_model = results["model"] | |
272 best_model_parameters = results["best_parameters"] | |
273 model_config = trained_model.to_json() | |
274 model_weights = trained_model.get_weights() | |
275 model_values = { | |
276 "data_dictionary": data_dictionary, | |
277 "model_config": model_config, | |
278 "best_parameters": best_model_parameters, | |
279 "model_weights": model_weights, | |
280 "compatible_tools": compatible_next_tools, | |
281 "class_weights": class_weights, | |
282 "standard_connections": standard_connections, | |
283 } | |
284 set_trained_model(trained_model_path, model_values) |
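
To show how the helpers added in this revision fit together, here is a minimal, self-contained sketch (not part of the repository). It assumes the new utils.py is importable as `utils`, uses small synthetic arrays in place of real tool-sequence data, and passes `tr_t_freq=None` because that argument is unused in the function body shown above; the file name `train.h5` and the dummy prediction tensor are illustrative only.

```python
# Sketch only: exercises the h5 round-trip, balanced batch sampling and the
# combined loss introduced in this revision of utils.py, on synthetic data.
import numpy as np
import tensorflow as tf

import utils  # assumption: this file is on the path as `utils`

# Synthetic "tool sequence" data: 20 samples, sequence length 10, 8 tool classes.
n_samples, seq_len, n_classes = 20, 10, 8
rng = np.random.default_rng(42)
x = rng.integers(0, n_classes, size=(n_samples, seq_len)).astype(np.int64)
y = (rng.random((n_samples, n_classes)) > 0.7).astype(np.int64)

# Round-trip through the h5 helpers added in this commit.
utils.save_h5_data(x, y, "train.h5")
tr_x, tr_y = utils.read_train_test("train.h5")

# Map each label (tool id) to the sample indices where it occurs.
u_labels, labels_pos_dict = utils.get_u_tr_labels(tr_y)

# Draw one balanced training batch; prev_sel_tools tracks tools already sampled
# in earlier batches, and tr_t_freq is not used in the body of this revision.
batch_x, batch_y, sel_tools = utils.sample_balanced_tr_y(
    tr_x, tr_y, labels_pos_dict, b_size=4, tr_t_freq=None, prev_sel_tools=[]
)

# Evaluate the combined loss against a dummy prediction of matching shape.
fake_pred = tf.random.uniform((batch_x.shape[0], n_classes))
bce, cce = utils.compute_loss(batch_y, fake_pred)
print("batch:", batch_x.shape, "binary CE:", float(bce))
```

The same `labels_pos_dict` structure built here for the training split is what `validate_model` expects (as `ulabels_te_dict`) for the test split, where it is passed on to `sample_balanced_te_y`.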