Mercurial > repos > bgruening > create_tool_recommendation_model
diff prepare_data.py @ 6:e94dc7945639 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
author | bgruening |
---|---|
date | Sun, 16 Oct 2022 11:52:10 +0000 |
parents | 4f7e6612906b |
children |
line wrap: on
line diff
--- a/prepare_data.py	Fri May 06 09:05:18 2022 +0000
+++ b/prepare_data.py	Sun Oct 16 11:52:10 2022 +0000
@@ -5,16 +5,15 @@
 """
 import collections
-import os
 import random
 import numpy as np
 import predict_tool_usage
-
-main_path = os.getcwd()
+from sklearn.model_selection import train_test_split
 
 
 class PrepareData:
+
     def __init__(self, max_seq_length, test_data_share):
         """
         Init method.
         """
         self.max_tool_sequence_len = max_seq_length
@@ -26,7 +25,7 @@
         """
         tokens = list()
         raw_paths = workflow_paths
-        raw_paths = [x.replace("\n", "") for x in raw_paths]
+        raw_paths = [x.replace("\n", '') for x in raw_paths]
         for item in raw_paths:
             split_items = item.split(",")
             for token in split_items:
@@ -34,12 +33,7 @@
                 tokens.append(token)
         tokens = list(set(tokens))
         tokens = np.array(tokens)
-        tokens = np.reshape(
-            tokens,
-            [
-                -1,
-            ],
-        )
+        tokens = np.reshape(tokens, [-1, ])
         return tokens, raw_paths
 
     def create_new_dict(self, new_data_dict):
@@ -62,116 +56,110 @@
         """
         count = collections.Counter(words).most_common()
         dictionary = dict()
-        for word, _ in count:
+        for index, (word, _) in enumerate(count):
+            word = word.lstrip()
+            word = word.rstrip()
             dictionary[word] = len(dictionary) + 1
-            word = word.strip()
-        dictionary, reverse_dictionary = self.assemble_dictionary(
-            dictionary, old_data_dictionary
-        )
+        dictionary, reverse_dictionary = self.assemble_dictionary(dictionary, old_data_dictionary)
         return dictionary, reverse_dictionary
 
     def decompose_paths(self, paths, dictionary):
         """
         Decompose the paths to variable length sub-paths keeping the first tool fixed
         """
+        max_len = 0
         sub_paths_pos = list()
         for index, item in enumerate(paths):
             tools = item.split(",")
             len_tools = len(tools)
-            if len_tools <= self.max_tool_sequence_len:
-                for window in range(1, len_tools):
-                    sequence = tools[0: window + 1]
-                    tools_pos = [
-                        str(dictionary[str(tool_item)]) for tool_item in sequence
-                    ]
-                    if len(tools_pos) > 1:
-                        sub_paths_pos.append(",".join(tools_pos))
+            if len_tools > max_len:
+                max_len = len_tools
+            if len_tools < self.max_tool_sequence_len:
+                sequence = tools[0: len_tools]
+                tools_pos = [str(dictionary[str(tool_item)]) for tool_item in sequence]
+                if len(tools_pos) > 1:
+                    sub_paths_pos.append(",".join(tools_pos))
         sub_paths_pos = list(set(sub_paths_pos))
+        print("Max length of tools: ", max_len)
         return sub_paths_pos
 
-    def prepare_paths_labels_dictionary(
-        self, dictionary, reverse_dictionary, paths, compatible_next_tools
-    ):
-        """
-        Create a dictionary of sequences with their labels for training and test paths
-        """
-        paths_labels = dict()
-        random.shuffle(paths)
-        for item in paths:
-            if item and item not in "":
-                tools = item.split(",")
-                label = tools[-1]
-                train_tools = tools[: len(tools) - 1]
-                last_but_one_name = reverse_dictionary[int(train_tools[-1])]
-                try:
-                    compatible_tools = compatible_next_tools[last_but_one_name].split(
-                        ","
-                    )
-                except Exception:
-                    continue
-                if len(compatible_tools) > 0:
-                    compatible_tools_ids = [
-                        str(dictionary[x]) for x in compatible_tools
-                    ]
-                    compatible_tools_ids.append(label)
-                    composite_labels = ",".join(compatible_tools_ids)
-                train_tools = ",".join(train_tools)
-                if train_tools in paths_labels:
-                    paths_labels[train_tools] += "," + composite_labels
-                else:
-                    paths_labels[train_tools] = composite_labels
-        for item in paths_labels:
-            paths_labels[item] = ",".join(list(set(paths_labels[item].split(","))))
-        return paths_labels
+    def prepare_input_one_target_paths(self, dictionary, reverse_dictionary, paths):
+        input_target_paths = dict()
+        compatible_tools = dict()
+        d_size = 0
+        for i, item in enumerate(paths):
+            input_tools = item.split(",")
+            tool_seq = input_tools
+            i_tools = ",".join(tool_seq[0:-1])
+            last_i_tool = i_tools.split(",")[-1]
+            if last_i_tool not in compatible_tools:
+                compatible_tools[last_i_tool] = list()
+            t_tools = tool_seq[-1]
+            if t_tools not in compatible_tools[last_i_tool]:
+                compatible_tools[last_i_tool].append(t_tools)
+            if i_tools not in input_target_paths:
+                input_target_paths[i_tools] = list()
+            if t_tools not in input_target_paths[i_tools]:
+                input_target_paths[i_tools].append(t_tools)
+            if i_tools not in input_target_paths:
+                input_target_paths[i_tools] = list()
+            if t_tools not in input_target_paths[i_tools]:
+                input_target_paths[i_tools].append(t_tools)
+        for item in input_target_paths:
+            d_size += len(input_target_paths[item])
+        print("Dataset size:", d_size)
+        return input_target_paths, compatible_tools, d_size
 
-    def pad_test_paths(self, paths_dictionary, num_classes):
-        """
-        Add padding to the tools sequences and create multi-hot encoded labels
-        """
-        size_data = len(paths_dictionary)
-        data_mat = np.zeros([size_data, self.max_tool_sequence_len])
-        label_mat = np.zeros([size_data, num_classes + 1])
-        train_counter = 0
-        for train_seq, train_label in list(paths_dictionary.items()):
-            positions = train_seq.split(",")
-            start_pos = self.max_tool_sequence_len - len(positions)
-            for id_pos, pos in enumerate(positions):
-                data_mat[train_counter][start_pos + id_pos] = int(pos)
-            for label_item in train_label.split(","):
-                label_mat[train_counter][int(label_item)] = 1.0
-            train_counter += 1
-        return data_mat, label_mat
+    def prepare_input_target_paths(self, dictionary, reverse_dictionary, paths):
+        input_target_paths = dict()
+        compatible_tools = dict()
+        d_size = 0
+        for i, item in enumerate(paths):
+            input_tools = item.split(",")
+            ctr = 0
+            for ctr in range(len(input_tools) - 1):
+                # uncomment this for one token target idea
+                tool_seq = input_tools[0: ctr + 2]
+                i_tools = ",".join(tool_seq[0:-1])
+                last_i_tool = i_tools.split(",")[-1]
+                if last_i_tool not in compatible_tools:
+                    compatible_tools[last_i_tool] = list()
+                t_tools = tool_seq[-1]
+                if t_tools not in compatible_tools[last_i_tool]:
+                    compatible_tools[last_i_tool].append(t_tools)
+                if i_tools not in input_target_paths:
+                    input_target_paths[i_tools] = list()
+                if t_tools not in input_target_paths[i_tools]:
+                    input_target_paths[i_tools].append(t_tools)
+                if i_tools not in input_target_paths:
+                    input_target_paths[i_tools] = list()
+                if t_tools not in input_target_paths[i_tools]:
+                    input_target_paths[i_tools].append(t_tools)
+        for item in input_target_paths:
+            d_size += len(input_target_paths[item])
+        print("Dataset size:", d_size)
+        return input_target_paths, compatible_tools, d_size
 
-    def pad_paths(
-        self, paths_dictionary, num_classes, standard_connections, reverse_dictionary
-    ):
-        """
-        Add padding to the tools sequences and create multi-hot encoded labels
-        """
-        size_data = len(paths_dictionary)
-        data_mat = np.zeros([size_data, self.max_tool_sequence_len])
-        label_mat = np.zeros([size_data, 2 * (num_classes + 1)])
-        pos_flag = 1.0
+    def pad_paths_one_tool_target(self, multi_paths, compatible_tools, d_size, rev_dict, dictionary):
+        d_size = len(multi_paths)
+        input_mat = np.zeros([d_size, self.max_tool_sequence_len])
+        target_mat = np.zeros([d_size, len(dictionary) + 1])
         train_counter = 0
-        for train_seq, train_label in list(paths_dictionary.items()):
-            pub_connections = list()
-            positions = train_seq.split(",")
-            last_tool_id = positions[-1]
-            last_tool_name = reverse_dictionary[int(last_tool_id)]
-            start_pos = self.max_tool_sequence_len - len(positions)
-            for id_pos, pos in enumerate(positions):
-                data_mat[train_counter][start_pos + id_pos] = int(pos)
-            if last_tool_name in standard_connections:
-                pub_connections = standard_connections[last_tool_name]
-            for label_item in train_label.split(","):
-                label_pos = int(label_item)
-                label_row = label_mat[train_counter]
-                if reverse_dictionary[label_pos] in pub_connections:
-                    label_row[label_pos] = pos_flag
-                else:
-                    label_row[label_pos + num_classes + 1] = pos_flag
+        for input_seq, target_seq_tools in list(multi_paths.items()):
+            input_seq_tools = input_seq.split(",")
+            last_i_tool = input_seq_tools[-1]
+            for id_pos, pos in enumerate(input_seq_tools):
+                input_mat[train_counter][id_pos] = int(pos)
+            if last_i_tool in compatible_tools:
+                compatible_targets = compatible_tools[last_i_tool]
+            for k, t_label in enumerate(target_seq_tools):
+                target_mat[train_counter][int(t_label)] = 1
+            for c_tool in compatible_targets:
+                target_mat[train_counter][int(c_tool)] = 1
             train_counter += 1
-        return data_mat, label_mat
+        print("Final data size: ", input_mat.shape, target_mat.shape)
+        train_data, test_data, train_labels, test_labels = train_test_split(input_mat, target_mat, test_size=self.test_share, random_state=42)
+        return train_data, train_labels, test_data, test_labels
 
     def split_test_train_data(self, multilabels_paths):
         """
@@ -221,6 +209,27 @@
             class_weights[key] = np.round(np.log(u_score), 6)
         return class_weights
 
+    def get_train_tool_labels_freq(self, train_paths, reverse_dictionary):
+        """
+        Get the frequency of last tool of each tool sequence
+        to estimate the frequency of tool sequences
+        """
+        last_tool_freq = dict()
+        freq_dict_names = dict()
+        for path in train_paths:
+            tools_pos = np.where(path > 0)[0]
+            path_pos = tools_pos
+            path_pos = [str(int(item)) for item in path_pos]
+
+            for tool_pos in path_pos:
+                if tool_pos not in last_tool_freq:
+                    last_tool_freq[tool_pos] = 0
+                    freq_dict_names[reverse_dictionary[int(tool_pos)]] = 0
+                last_tool_freq[tool_pos] += 1
+                freq_dict_names[reverse_dictionary[int(tool_pos)]] += 1
+        sorted_dict = dict(sorted(last_tool_freq.items(), key=lambda kv: kv[1], reverse=True))
+        return sorted_dict
+
     def get_train_last_tool_freq(self, train_paths, reverse_dictionary):
         """
         Get the frequency of last tool of each tool sequence
@@ -229,13 +238,17 @@
         last_tool_freq = dict()
         freq_dict_names = dict()
         for path in train_paths:
-            last_tool = path.split(",")[-1]
+            tools_pos = np.where(path > 0)[0]
+            path_pos = path[tools_pos]
+            path_pos = [str(int(item)) for item in path_pos]
+            last_tool = path_pos[-1]
             if last_tool not in last_tool_freq:
                 last_tool_freq[last_tool] = 0
                 freq_dict_names[reverse_dictionary[int(last_tool)]] = 0
             last_tool_freq[last_tool] += 1
             freq_dict_names[reverse_dictionary[int(last_tool)]] += 1
-        return last_tool_freq
+        sorted_dict = dict(sorted(last_tool_freq.items(), key=lambda kv: kv[1], reverse=True))
+        return sorted_dict
 
     def get_toolid_samples(self, train_data, l_tool_freq):
         l_tool_tr_samples = dict()
@@ -248,22 +261,13 @@
                 l_tool_tr_samples[last_tool_id].append(index)
         return l_tool_tr_samples
 
-    def get_data_labels_matrices(
-        self,
-        workflow_paths,
-        tool_usage_path,
-        cutoff_date,
-        compatible_next_tools,
-        standard_connections,
-        old_data_dictionary={},
-    ):
+    def get_data_labels_matrices(self, workflow_paths, usage_df, cutoff_date, standard_connections, old_data_dictionary={}):
         """
         Convert the training and test paths into corresponding numpy matrices
         """
         processed_data, raw_paths = self.process_workflow_paths(workflow_paths)
-        dictionary, rev_dict = self.create_data_dictionary(
-            processed_data, old_data_dictionary
-        )
+        dictionary, rev_dict = self.create_data_dictionary(processed_data, old_data_dictionary)
+
         num_classes = len(dictionary)
 
         print("Raw paths: %d" % len(raw_paths))
@@ -274,50 +278,26 @@
         random.shuffle(all_unique_paths)
 
         print("Creating dictionaries...")
-        multilabels_paths = self.prepare_paths_labels_dictionary(
-            dictionary, rev_dict, all_unique_paths, compatible_next_tools
-        )
+        multilabels_paths, compatible_tools, d_size = self.prepare_input_target_paths(dictionary, rev_dict, all_unique_paths)
 
-        print("Complete data: %d" % len(multilabels_paths))
-        train_paths_dict, test_paths_dict = self.split_test_train_data(
-            multilabels_paths
-        )
-
-        print("Train data: %d" % len(train_paths_dict))
-        print("Test data: %d" % len(test_paths_dict))
+        print("Complete data: %d" % d_size)
 
         print("Padding train and test data...")
-        # pad training and test data with leading zeros
-        test_data, test_labels = self.pad_paths(
-            test_paths_dict, num_classes, standard_connections, rev_dict
-        )
-        train_data, train_labels = self.pad_paths(
-            train_paths_dict, num_classes, standard_connections, rev_dict
-        )
+        # pad training and test data with trailing zeros
+        train_data, train_labels, test_data, test_labels = self.pad_paths_one_tool_target(multilabels_paths, compatible_tools, d_size, rev_dict, dictionary)
+
+        print("Train data: ", train_data.shape)
+        print("Test data: ", test_data.shape)
 
         print("Estimating sample frequency...")
-        l_tool_freq = self.get_train_last_tool_freq(train_paths_dict, rev_dict)
-        l_tool_tr_samples = self.get_toolid_samples(train_data, l_tool_freq)
+        tr_tool_freq = self.get_train_tool_labels_freq(train_labels, rev_dict)
 
         # Predict tools usage
         print("Predicting tools' usage...")
         usage_pred = predict_tool_usage.ToolPopularity()
-        usage = usage_pred.extract_tool_usage(tool_usage_path, cutoff_date, dictionary)
+        usage = usage_pred.extract_tool_usage(usage_df, cutoff_date, dictionary)
         tool_usage_prediction = usage_pred.get_pupularity_prediction(usage)
         t_pred_usage = self.get_predicted_usage(dictionary, tool_usage_prediction)
 
-        # get class weights using the predicted usage for each tool
         class_weights = self.assign_class_weights(num_classes, t_pred_usage)
-
-        return (
-            train_data,
-            train_labels,
-            test_data,
-            test_labels,
-            dictionary,
-            rev_dict,
-            class_weights,
-            t_pred_usage,
-            l_tool_freq,
-            l_tool_tr_samples,
-        )
+        return train_data, train_labels, test_data, test_labels, dictionary, rev_dict, class_weights, compatible_tools, tr_tool_freq