Mercurial > repos > bgruening > create_tool_recommendation_model
diff prepare_data.py @ 3:5b3c08710e47 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
author | bgruening |
---|---|
date | Sat, 09 May 2020 05:38:23 -0400 |
parents | 9bf25dbe00ad |
children | afec8c595124 |
line wrap: on
line diff
--- a/prepare_data.py Fri Oct 11 18:24:54 2019 -0400 +++ b/prepare_data.py Sat May 09 05:38:23 2020 -0400 @@ -10,19 +10,18 @@ import random import predict_tool_usage +import utils main_path = os.getcwd() class PrepareData: - @classmethod def __init__(self, max_seq_length, test_data_share): """ Init method. """ self.max_tool_sequence_len = max_seq_length self.test_share = test_data_share - @classmethod def process_workflow_paths(self, workflow_paths): """ Get all the tools and complete set of individual paths for each workflow @@ -40,7 +39,6 @@ tokens = np.reshape(tokens, [-1, ]) return tokens, raw_paths - @classmethod def create_new_dict(self, new_data_dict): """ Create new data dictionary @@ -48,7 +46,6 @@ reverse_dict = dict((v, k) for k, v in new_data_dict.items()) return new_data_dict, reverse_dict - @classmethod def assemble_dictionary(self, new_data_dict, old_data_dictionary={}): """ Create/update tools indices in the forward and backward dictionary @@ -56,7 +53,6 @@ new_data_dict, reverse_dict = self.create_new_dict(new_data_dict) return new_data_dict, reverse_dict - @classmethod def create_data_dictionary(self, words, old_data_dictionary={}): """ Create two dictionaries having tools names and their indexes @@ -68,7 +64,6 @@ dictionary, reverse_dictionary = self.assemble_dictionary(dictionary, old_data_dictionary) return dictionary, reverse_dictionary - @classmethod def decompose_paths(self, paths, dictionary): """ Decompose the paths to variable length sub-paths keeping the first tool fixed @@ -86,7 +81,6 @@ sub_paths_pos = list(set(sub_paths_pos)) return sub_paths_pos - @classmethod def prepare_paths_labels_dictionary(self, dictionary, reverse_dictionary, paths, compatible_next_tools): """ Create a dictionary of sequences with their labels for training and test paths @@ -116,8 +110,7 @@ paths_labels[item] = ",".join(list(set(paths_labels[item].split(",")))) return paths_labels - @classmethod - def pad_paths(self, paths_dictionary, num_classes): + def pad_test_paths(self, paths_dictionary, num_classes): """ Add padding to the tools sequences and create multi-hot encoded labels """ @@ -135,7 +128,35 @@ train_counter += 1 return data_mat, label_mat - @classmethod + def pad_paths(self, paths_dictionary, num_classes, standard_connections, reverse_dictionary): + """ + Add padding to the tools sequences and create multi-hot encoded labels + """ + size_data = len(paths_dictionary) + data_mat = np.zeros([size_data, self.max_tool_sequence_len]) + label_mat = np.zeros([size_data, 2 * (num_classes + 1)]) + pos_flag = 1.0 + train_counter = 0 + for train_seq, train_label in list(paths_dictionary.items()): + pub_connections = list() + positions = train_seq.split(",") + last_tool_id = positions[-1] + last_tool_name = reverse_dictionary[int(last_tool_id)] + start_pos = self.max_tool_sequence_len - len(positions) + for id_pos, pos in enumerate(positions): + data_mat[train_counter][start_pos + id_pos] = int(pos) + if last_tool_name in standard_connections: + pub_connections = standard_connections[last_tool_name] + for label_item in train_label.split(","): + label_pos = int(label_item) + label_row = label_mat[train_counter] + if reverse_dictionary[label_pos] in pub_connections: + label_row[label_pos] = pos_flag + else: + label_row[label_pos + num_classes + 1] = pos_flag + train_counter += 1 + return data_mat, label_mat + def split_test_train_data(self, multilabels_paths): """ Split into test and train data randomly for each run @@ -152,15 +173,6 @@ train_dict[path] = multilabels_paths[path] return train_dict, test_dict - @classmethod - def verify_overlap(self, train_paths, test_paths): - """ - Verify the overlapping of samples in train and test data - """ - intersection = list(set(train_paths).intersection(set(test_paths))) - print("Overlap in train and test: %d" % len(intersection)) - - @classmethod def get_predicted_usage(self, data_dictionary, predicted_usage): """ Get predicted usage for tools @@ -180,43 +192,53 @@ continue return usage - @classmethod def assign_class_weights(self, n_classes, predicted_usage): """ Compute class weights using usage """ class_weights = dict() class_weights[str(0)] = 0.0 - for key in range(1, n_classes): + for key in range(1, n_classes + 1): u_score = predicted_usage[key] if u_score < 1.0: u_score += 1.0 - class_weights[key] = np.log(u_score) + class_weights[key] = np.round(np.log(u_score), 6) return class_weights - @classmethod - def get_sample_weights(self, train_data, reverse_dictionary, paths_frequency): + def get_train_last_tool_freq(self, train_paths, reverse_dictionary): """ - Compute the frequency of paths in training data + Get the frequency of last tool of each tool sequence + to estimate the frequency of tool sequences """ - path_weights = np.zeros(len(train_data)) - for path_index, path in enumerate(train_data): - sample_pos = np.where(path > 0)[0] - sample_tool_pos = path[sample_pos[0]:] - path_name = ",".join([reverse_dictionary[int(tool_pos)] for tool_pos in sample_tool_pos]) - try: - path_weights[path_index] = int(paths_frequency[path_name]) - except Exception: - path_weights[path_index] = 1 - return path_weights + last_tool_freq = dict() + inv_freq = dict() + for path in train_paths: + last_tool = path.split(",")[-1] + if last_tool not in last_tool_freq: + last_tool_freq[last_tool] = 0 + last_tool_freq[last_tool] += 1 + max_freq = max(last_tool_freq.values()) + for t in last_tool_freq: + inv_freq[t] = int(np.round(max_freq / float(last_tool_freq[t]), 0)) + return last_tool_freq, inv_freq - @classmethod - def get_data_labels_matrices(self, workflow_paths, tool_usage_path, cutoff_date, compatible_next_tools, old_data_dictionary={}): + def get_toolid_samples(self, train_data, l_tool_freq): + l_tool_tr_samples = dict() + for tool_id in l_tool_freq: + for index, tr_sample in enumerate(train_data): + last_tool_id = str(int(tr_sample[-1])) + if last_tool_id == tool_id: + if last_tool_id not in l_tool_tr_samples: + l_tool_tr_samples[last_tool_id] = list() + l_tool_tr_samples[last_tool_id].append(index) + return l_tool_tr_samples + + def get_data_labels_matrices(self, workflow_paths, tool_usage_path, cutoff_date, compatible_next_tools, standard_connections, old_data_dictionary={}): """ Convert the training and test paths into corresponding numpy matrices """ processed_data, raw_paths = self.process_workflow_paths(workflow_paths) - dictionary, reverse_dictionary = self.create_data_dictionary(processed_data, old_data_dictionary) + dictionary, rev_dict = self.create_data_dictionary(processed_data, old_data_dictionary) num_classes = len(dictionary) print("Raw paths: %d" % len(raw_paths)) @@ -227,25 +249,32 @@ random.shuffle(all_unique_paths) print("Creating dictionaries...") - multilabels_paths = self.prepare_paths_labels_dictionary(dictionary, reverse_dictionary, all_unique_paths, compatible_next_tools) + multilabels_paths = self.prepare_paths_labels_dictionary(dictionary, rev_dict, all_unique_paths, compatible_next_tools) print("Complete data: %d" % len(multilabels_paths)) train_paths_dict, test_paths_dict = self.split_test_train_data(multilabels_paths) + # get sample frequency + l_tool_freq, inv_last_tool_freq = self.get_train_last_tool_freq(train_paths_dict, rev_dict) + print("Train data: %d" % len(train_paths_dict)) print("Test data: %d" % len(test_paths_dict)) - test_data, test_labels = self.pad_paths(test_paths_dict, num_classes) - train_data, train_labels = self.pad_paths(train_paths_dict, num_classes) + print("Padding train and test data...") + # pad training and test data with leading zeros + test_data, test_labels = self.pad_paths(test_paths_dict, num_classes, standard_connections, rev_dict) + train_data, train_labels = self.pad_paths(train_paths_dict, num_classes, standard_connections, rev_dict) + + l_tool_tr_samples = self.get_toolid_samples(train_data, l_tool_freq) # Predict tools usage print("Predicting tools' usage...") usage_pred = predict_tool_usage.ToolPopularity() usage = usage_pred.extract_tool_usage(tool_usage_path, cutoff_date, dictionary) tool_usage_prediction = usage_pred.get_pupularity_prediction(usage) - tool_predicted_usage = self.get_predicted_usage(dictionary, tool_usage_prediction) + t_pred_usage = self.get_predicted_usage(dictionary, tool_usage_prediction) # get class weights using the predicted usage for each tool - class_weights = self.assign_class_weights(train_labels.shape[1], tool_predicted_usage) + class_weights = self.assign_class_weights(num_classes, t_pred_usage) - return train_data, train_labels, test_data, test_labels, dictionary, reverse_dictionary, class_weights, tool_predicted_usage + return train_data, train_labels, test_data, test_labels, dictionary, rev_dict, class_weights, t_pred_usage, l_tool_freq, l_tool_tr_samples