Mercurial > repos > bgruening > create_tool_recommendation_model
comparison prepare_data.py @ 6:e94dc7945639 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
| author | bgruening |
|---|---|
| date | Sun, 16 Oct 2022 11:52:10 +0000 |
| parents | 4f7e6612906b |
| children | |
comparison legend: equal, deleted, inserted, replaced
| 5:4f7e6612906b | 6:e94dc7945639 |
|---|---|
| 3 machine learning algorithm. The paths are divided | 3 machine learning algorithm. The paths are divided |
| 4 into the test and training sets | 4 into the test and training sets |
| 5 """ | 5 """ |
| 6 | 6 |
| 7 import collections | 7 import collections |
| 8 import os | |
| 9 import random | 8 import random |
| 10 | 9 |
| 11 import numpy as np | 10 import numpy as np |
| 12 import predict_tool_usage | 11 import predict_tool_usage |
| 13 | 12 from sklearn.model_selection import train_test_split |
| 14 main_path = os.getcwd() | |
| 15 | 13 |
| 16 | 14 |
| 17 class PrepareData: | 15 class PrepareData: |
| 16 | |
| 18 def __init__(self, max_seq_length, test_data_share): | 17 def __init__(self, max_seq_length, test_data_share): |
| 19 """ Init method. """ | 18 """ Init method. """ |
| 20 self.max_tool_sequence_len = max_seq_length | 19 self.max_tool_sequence_len = max_seq_length |
| 21 self.test_share = test_data_share | 20 self.test_share = test_data_share |
| 22 | 21 |
| 24 """ | 23 """ |
| 25 Get all the tools and complete set of individual paths for each workflow | 24 Get all the tools and complete set of individual paths for each workflow |
| 26 """ | 25 """ |
| 27 tokens = list() | 26 tokens = list() |
| 28 raw_paths = workflow_paths | 27 raw_paths = workflow_paths |
| 29 raw_paths = [x.replace("\n", "") for x in raw_paths] | 28 raw_paths = [x.replace("\n", '') for x in raw_paths] |
| 30 for item in raw_paths: | 29 for item in raw_paths: |
| 31 split_items = item.split(",") | 30 split_items = item.split(",") |
| 32 for token in split_items: | 31 for token in split_items: |
| 33 if token != "": | 32 if token != "": |
| 34 tokens.append(token) | 33 tokens.append(token) |
| 35 tokens = list(set(tokens)) | 34 tokens = list(set(tokens)) |
| 36 tokens = np.array(tokens) | 35 tokens = np.array(tokens) |
| 37 tokens = np.reshape( | 36 tokens = np.reshape(tokens, [-1, ]) |
| 38 tokens, | |
| 39 [ | |
| 40 -1, | |
| 41 ], | |
| 42 ) | |
| 43 return tokens, raw_paths | 37 return tokens, raw_paths |
| 44 | 38 |
| 45 def create_new_dict(self, new_data_dict): | 39 def create_new_dict(self, new_data_dict): |
| 46 """ | 40 """ |
| 47 Create new data dictionary | 41 Create new data dictionary |
| 60 """ | 54 """ |
| 61 Create two dictionaries having tools names and their indexes | 55 Create two dictionaries having tools names and their indexes |
| 62 """ | 56 """ |
| 63 count = collections.Counter(words).most_common() | 57 count = collections.Counter(words).most_common() |
| 64 dictionary = dict() | 58 dictionary = dict() |
| 65 for word, _ in count: | 59 for index, (word, _) in enumerate(count): |
| 60 word = word.lstrip() | |
| 61 word = word.rstrip() | |
| 66 dictionary[word] = len(dictionary) + 1 | 62 dictionary[word] = len(dictionary) + 1 |
| 67 word = word.strip() | 63 dictionary, reverse_dictionary = self.assemble_dictionary(dictionary, old_data_dictionary) |
| 68 dictionary, reverse_dictionary = self.assemble_dictionary( | |
| 69 dictionary, old_data_dictionary | |
| 70 ) | |
| 71 return dictionary, reverse_dictionary | 64 return dictionary, reverse_dictionary |
| 72 | 65 |
| 73 def decompose_paths(self, paths, dictionary): | 66 def decompose_paths(self, paths, dictionary): |
| 74 """ | 67 """ |
| 75 Decompose the paths to variable length sub-paths keeping the first tool fixed | 68 Decompose the paths to variable length sub-paths keeping the first tool fixed |
| 76 """ | 69 """ |
| 70 max_len = 0 | |
| 77 sub_paths_pos = list() | 71 sub_paths_pos = list() |
| 78 for index, item in enumerate(paths): | 72 for index, item in enumerate(paths): |
| 79 tools = item.split(",") | 73 tools = item.split(",") |
| 80 len_tools = len(tools) | 74 len_tools = len(tools) |
| 81 if len_tools <= self.max_tool_sequence_len: | 75 if len_tools > max_len: |
| 82 for window in range(1, len_tools): | 76 max_len = len_tools |
| 83 sequence = tools[0: window + 1] | 77 if len_tools < self.max_tool_sequence_len: |
| 84 tools_pos = [ | 78 sequence = tools[0: len_tools] |
| 85 str(dictionary[str(tool_item)]) for tool_item in sequence | 79 tools_pos = [str(dictionary[str(tool_item)]) for tool_item in sequence] |
| 86 ] | 80 if len(tools_pos) > 1: |
| 87 if len(tools_pos) > 1: | 81 sub_paths_pos.append(",".join(tools_pos)) |
| 88 sub_paths_pos.append(",".join(tools_pos)) | |
| 89 sub_paths_pos = list(set(sub_paths_pos)) | 82 sub_paths_pos = list(set(sub_paths_pos)) |
| 83 print("Max length of tools: ", max_len) | |
| 90 return sub_paths_pos | 84 return sub_paths_pos |
| 91 | 85 |
| 92 def prepare_paths_labels_dictionary( | 86 def prepare_input_one_target_paths(self, dictionary, reverse_dictionary, paths): |
| 93 self, dictionary, reverse_dictionary, paths, compatible_next_tools | 87 input_target_paths = dict() |
| 94 ): | 88 compatible_tools = dict() |
| 95 """ | 89 d_size = 0 |
| 96 Create a dictionary of sequences with their labels for training and test paths | 90 for i, item in enumerate(paths): |
| 97 """ | 91 input_tools = item.split(",") |
| 98 paths_labels = dict() | 92 tool_seq = input_tools |
| 99 random.shuffle(paths) | 93 i_tools = ",".join(tool_seq[0:-1]) |
| 100 for item in paths: | 94 last_i_tool = i_tools.split(",")[-1] |
| 101 if item and item not in "": | 95 if last_i_tool not in compatible_tools: |
| 102 tools = item.split(",") | 96 compatible_tools[last_i_tool] = list() |
| 103 label = tools[-1] | 97 t_tools = tool_seq[-1] |
| 104 train_tools = tools[: len(tools) - 1] | 98 if t_tools not in compatible_tools[last_i_tool]: |
| 105 last_but_one_name = reverse_dictionary[int(train_tools[-1])] | 99 compatible_tools[last_i_tool].append(t_tools) |
| 106 try: | 100 if i_tools not in input_target_paths: |
| 107 compatible_tools = compatible_next_tools[last_but_one_name].split( | 101 input_target_paths[i_tools] = list() |
| 108 "," | 102 if t_tools not in input_target_paths[i_tools]: |
| 109 ) | 103 input_target_paths[i_tools].append(t_tools) |
| 110 except Exception: | 104 if i_tools not in input_target_paths: |
| 111 continue | 105 input_target_paths[i_tools] = list() |
| 112 if len(compatible_tools) > 0: | 106 if t_tools not in input_target_paths[i_tools]: |
| 113 compatible_tools_ids = [ | 107 input_target_paths[i_tools].append(t_tools) |
| 114 str(dictionary[x]) for x in compatible_tools | 108 for item in input_target_paths: |
| 115 ] | 109 d_size += len(input_target_paths[item]) |
| 116 compatible_tools_ids.append(label) | 110 print("Dataset size:", d_size) |
| 117 composite_labels = ",".join(compatible_tools_ids) | 111 return input_target_paths, compatible_tools, d_size |
| 118 train_tools = ",".join(train_tools) | 112 |
| 119 if train_tools in paths_labels: | 113 def prepare_input_target_paths(self, dictionary, reverse_dictionary, paths): |
| 120 paths_labels[train_tools] += "," + composite_labels | 114 input_target_paths = dict() |
| 121 else: | 115 compatible_tools = dict() |
| 122 paths_labels[train_tools] = composite_labels | 116 d_size = 0 |
| 123 for item in paths_labels: | 117 for i, item in enumerate(paths): |
| 124 paths_labels[item] = ",".join(list(set(paths_labels[item].split(",")))) | 118 input_tools = item.split(",") |
| 125 return paths_labels | 119 ctr = 0 |
| 126 | 120 for ctr in range(len(input_tools) - 1): |
| 127 def pad_test_paths(self, paths_dictionary, num_classes): | 121 # uncomment this for one token target idea |
| 128 """ | 122 tool_seq = input_tools[0: ctr + 2] |
| 129 Add padding to the tools sequences and create multi-hot encoded labels | 123 i_tools = ",".join(tool_seq[0:-1]) |
| 130 """ | 124 last_i_tool = i_tools.split(",")[-1] |
| 131 size_data = len(paths_dictionary) | 125 if last_i_tool not in compatible_tools: |
| 132 data_mat = np.zeros([size_data, self.max_tool_sequence_len]) | 126 compatible_tools[last_i_tool] = list() |
| 133 label_mat = np.zeros([size_data, num_classes + 1]) | 127 t_tools = tool_seq[-1] |
| 128 if t_tools not in compatible_tools[last_i_tool]: | |
| 129 compatible_tools[last_i_tool].append(t_tools) | |
| 130 if i_tools not in input_target_paths: | |
| 131 input_target_paths[i_tools] = list() | |
| 132 if t_tools not in input_target_paths[i_tools]: | |
| 133 input_target_paths[i_tools].append(t_tools) | |
| 134 if i_tools not in input_target_paths: | |
| 135 input_target_paths[i_tools] = list() | |
| 136 if t_tools not in input_target_paths[i_tools]: | |
| 137 input_target_paths[i_tools].append(t_tools) | |
| 138 for item in input_target_paths: | |
| 139 d_size += len(input_target_paths[item]) | |
| 140 print("Dataset size:", d_size) | |
| 141 return input_target_paths, compatible_tools, d_size | |
| 142 | |
| 143 def pad_paths_one_tool_target(self, multi_paths, compatible_tools, d_size, rev_dict, dictionary): | |
| 144 d_size = len(multi_paths) | |
| 145 input_mat = np.zeros([d_size, self.max_tool_sequence_len]) | |
| 146 target_mat = np.zeros([d_size, len(dictionary) + 1]) | |
| 134 train_counter = 0 | 147 train_counter = 0 |
| 135 for train_seq, train_label in list(paths_dictionary.items()): | 148 for input_seq, target_seq_tools in list(multi_paths.items()): |
| 136 positions = train_seq.split(",") | 149 input_seq_tools = input_seq.split(",") |
| 137 start_pos = self.max_tool_sequence_len - len(positions) | 150 last_i_tool = input_seq_tools[-1] |
| 138 for id_pos, pos in enumerate(positions): | 151 for id_pos, pos in enumerate(input_seq_tools): |
| 139 data_mat[train_counter][start_pos + id_pos] = int(pos) | 152 input_mat[train_counter][id_pos] = int(pos) |
| 140 for label_item in train_label.split(","): | 153 if last_i_tool in compatible_tools: |
| 141 label_mat[train_counter][int(label_item)] = 1.0 | 154 compatible_targets = compatible_tools[last_i_tool] |
| 155 for k, t_label in enumerate(target_seq_tools): | |
| 156 target_mat[train_counter][int(t_label)] = 1 | |
| 157 for c_tool in compatible_targets: | |
| 158 target_mat[train_counter][int(c_tool)] = 1 | |
| 142 train_counter += 1 | 159 train_counter += 1 |
| 143 return data_mat, label_mat | 160 print("Final data size: ", input_mat.shape, target_mat.shape) |
| 144 | 161 train_data, test_data, train_labels, test_labels = train_test_split(input_mat, target_mat, test_size=self.test_share, random_state=42) |
| 145 def pad_paths( | 162 return train_data, train_labels, test_data, test_labels |
| 146 self, paths_dictionary, num_classes, standard_connections, reverse_dictionary | |
| 147 ): | |
| 148 """ | |
| 149 Add padding to the tools sequences and create multi-hot encoded labels | |
| 150 """ | |
| 151 size_data = len(paths_dictionary) | |
| 152 data_mat = np.zeros([size_data, self.max_tool_sequence_len]) | |
| 153 label_mat = np.zeros([size_data, 2 * (num_classes + 1)]) | |
| 154 pos_flag = 1.0 | |
| 155 train_counter = 0 | |
| 156 for train_seq, train_label in list(paths_dictionary.items()): | |
| 157 pub_connections = list() | |
| 158 positions = train_seq.split(",") | |
| 159 last_tool_id = positions[-1] | |
| 160 last_tool_name = reverse_dictionary[int(last_tool_id)] | |
| 161 start_pos = self.max_tool_sequence_len - len(positions) | |
| 162 for id_pos, pos in enumerate(positions): | |
| 163 data_mat[train_counter][start_pos + id_pos] = int(pos) | |
| 164 if last_tool_name in standard_connections: | |
| 165 pub_connections = standard_connections[last_tool_name] | |
| 166 for label_item in train_label.split(","): | |
| 167 label_pos = int(label_item) | |
| 168 label_row = label_mat[train_counter] | |
| 169 if reverse_dictionary[label_pos] in pub_connections: | |
| 170 label_row[label_pos] = pos_flag | |
| 171 else: | |
| 172 label_row[label_pos + num_classes + 1] = pos_flag | |
| 173 train_counter += 1 | |
| 174 return data_mat, label_mat | |
| 175 | 163 |
| 176 def split_test_train_data(self, multilabels_paths): | 164 def split_test_train_data(self, multilabels_paths): |
| 177 """ | 165 """ |
| 178 Split into test and train data randomly for each run | 166 Split into test and train data randomly for each run |
| 179 """ | 167 """ |
| 219 if u_score < 1.0: | 207 if u_score < 1.0: |
| 220 u_score += 1.0 | 208 u_score += 1.0 |
| 221 class_weights[key] = np.round(np.log(u_score), 6) | 209 class_weights[key] = np.round(np.log(u_score), 6) |
| 222 return class_weights | 210 return class_weights |
| 223 | 211 |
| 224 def get_train_last_tool_freq(self, train_paths, reverse_dictionary): | 212 def get_train_tool_labels_freq(self, train_paths, reverse_dictionary): |
| 225 """ | 213 """ |
| 226 Get the frequency of last tool of each tool sequence | 214 Get the frequency of last tool of each tool sequence |
| 227 to estimate the frequency of tool sequences | 215 to estimate the frequency of tool sequences |
| 228 """ | 216 """ |
| 229 last_tool_freq = dict() | 217 last_tool_freq = dict() |
| 230 freq_dict_names = dict() | 218 freq_dict_names = dict() |
| 231 for path in train_paths: | 219 for path in train_paths: |
| 232 last_tool = path.split(",")[-1] | 220 tools_pos = np.where(path > 0)[0] |
| 221 path_pos = tools_pos | |
| 222 path_pos = [str(int(item)) for item in path_pos] | |
| 223 | |
| 224 for tool_pos in path_pos: | |
| 225 if tool_pos not in last_tool_freq: | |
| 226 last_tool_freq[tool_pos] = 0 | |
| 227 freq_dict_names[reverse_dictionary[int(tool_pos)]] = 0 | |
| 228 last_tool_freq[tool_pos] += 1 | |
| 229 freq_dict_names[reverse_dictionary[int(tool_pos)]] += 1 | |
| 230 sorted_dict = dict(sorted(last_tool_freq.items(), key=lambda kv: kv[1], reverse=True)) | |
| 231 return sorted_dict | |
| 232 | |
| 233 def get_train_last_tool_freq(self, train_paths, reverse_dictionary): | |
| 234 """ | |
| 235 Get the frequency of last tool of each tool sequence | |
| 236 to estimate the frequency of tool sequences | |
| 237 """ | |
| 238 last_tool_freq = dict() | |
| 239 freq_dict_names = dict() | |
| 240 for path in train_paths: | |
| 241 tools_pos = np.where(path > 0)[0] | |
| 242 path_pos = path[tools_pos] | |
| 243 path_pos = [str(int(item)) for item in path_pos] | |
| 244 last_tool = path_pos[-1] | |
| 233 if last_tool not in last_tool_freq: | 245 if last_tool not in last_tool_freq: |
| 234 last_tool_freq[last_tool] = 0 | 246 last_tool_freq[last_tool] = 0 |
| 235 freq_dict_names[reverse_dictionary[int(last_tool)]] = 0 | 247 freq_dict_names[reverse_dictionary[int(last_tool)]] = 0 |
| 236 last_tool_freq[last_tool] += 1 | 248 last_tool_freq[last_tool] += 1 |
| 237 freq_dict_names[reverse_dictionary[int(last_tool)]] += 1 | 249 freq_dict_names[reverse_dictionary[int(last_tool)]] += 1 |
| 238 return last_tool_freq | 250 sorted_dict = dict(sorted(last_tool_freq.items(), key=lambda kv: kv[1], reverse=True)) |
| 251 return sorted_dict | |
| 239 | 252 |
| 240 def get_toolid_samples(self, train_data, l_tool_freq): | 253 def get_toolid_samples(self, train_data, l_tool_freq): |
| 241 l_tool_tr_samples = dict() | 254 l_tool_tr_samples = dict() |
| 242 for tool_id in l_tool_freq: | 255 for tool_id in l_tool_freq: |
| 243 for index, tr_sample in enumerate(train_data): | 256 for index, tr_sample in enumerate(train_data): |
| 246 if last_tool_id not in l_tool_tr_samples: | 259 if last_tool_id not in l_tool_tr_samples: |
| 247 l_tool_tr_samples[last_tool_id] = list() | 260 l_tool_tr_samples[last_tool_id] = list() |
| 248 l_tool_tr_samples[last_tool_id].append(index) | 261 l_tool_tr_samples[last_tool_id].append(index) |
| 249 return l_tool_tr_samples | 262 return l_tool_tr_samples |
| 250 | 263 |
| 251 def get_data_labels_matrices( | 264 def get_data_labels_matrices(self, workflow_paths, usage_df, cutoff_date, standard_connections, old_data_dictionary={}): |
| 252 self, | |
| 253 workflow_paths, | |
| 254 tool_usage_path, | |
| 255 cutoff_date, | |
| 256 compatible_next_tools, | |
| 257 standard_connections, | |
| 258 old_data_dictionary={}, | |
| 259 ): | |
| 260 """ | 265 """ |
| 261 Convert the training and test paths into corresponding numpy matrices | 266 Convert the training and test paths into corresponding numpy matrices |
| 262 """ | 267 """ |
| 263 processed_data, raw_paths = self.process_workflow_paths(workflow_paths) | 268 processed_data, raw_paths = self.process_workflow_paths(workflow_paths) |
| 264 dictionary, rev_dict = self.create_data_dictionary( | 269 dictionary, rev_dict = self.create_data_dictionary(processed_data, old_data_dictionary) |
| 265 processed_data, old_data_dictionary | 270 |
| 266 ) | |
| 267 num_classes = len(dictionary) | 271 num_classes = len(dictionary) |
| 268 | 272 |
| 269 print("Raw paths: %d" % len(raw_paths)) | 273 print("Raw paths: %d" % len(raw_paths)) |
| 270 random.shuffle(raw_paths) | 274 random.shuffle(raw_paths) |
| 271 | 275 |
| 272 print("Decomposing paths...") | 276 print("Decomposing paths...") |
| 273 all_unique_paths = self.decompose_paths(raw_paths, dictionary) | 277 all_unique_paths = self.decompose_paths(raw_paths, dictionary) |
| 274 random.shuffle(all_unique_paths) | 278 random.shuffle(all_unique_paths) |
| 275 | 279 |
| 276 print("Creating dictionaries...") | 280 print("Creating dictionaries...") |
| 277 multilabels_paths = self.prepare_paths_labels_dictionary( | 281 multilabels_paths, compatible_tools, d_size = self.prepare_input_target_paths(dictionary, rev_dict, all_unique_paths) |
| 278 dictionary, rev_dict, all_unique_paths, compatible_next_tools | 282 |
| 279 ) | 283 print("Complete data: %d" % d_size) |
| 280 | |
| 281 print("Complete data: %d" % len(multilabels_paths)) | |
| 282 train_paths_dict, test_paths_dict = self.split_test_train_data( | |
| 283 multilabels_paths | |
| 284 ) | |
| 285 | |
| 286 print("Train data: %d" % len(train_paths_dict)) | |
| 287 print("Test data: %d" % len(test_paths_dict)) | |
| 288 | 284 |
| 289 print("Padding train and test data...") | 285 print("Padding train and test data...") |
| 290 # pad training and test data with leading zeros | 286 # pad training and test data with trailing zeros |
| 291 test_data, test_labels = self.pad_paths( | 287 train_data, train_labels, test_data, test_labels = self.pad_paths_one_tool_target(multilabels_paths, compatible_tools, d_size, rev_dict, dictionary) |
| 292 test_paths_dict, num_classes, standard_connections, rev_dict | 288 |
| 293 ) | 289 print("Train data: ", train_data.shape) |
| 294 train_data, train_labels = self.pad_paths( | 290 print("Test data: ", test_data.shape) |
| 295 train_paths_dict, num_classes, standard_connections, rev_dict | |
| 296 ) | |
| 297 | 291 |
| 298 print("Estimating sample frequency...") | 292 print("Estimating sample frequency...") |
| 299 l_tool_freq = self.get_train_last_tool_freq(train_paths_dict, rev_dict) | 293 tr_tool_freq = self.get_train_tool_labels_freq(train_labels, rev_dict) |
| 300 l_tool_tr_samples = self.get_toolid_samples(train_data, l_tool_freq) | |
| 301 | 294 |
| 302 # Predict tools usage | 295 # Predict tools usage |
| 303 print("Predicting tools' usage...") | 296 print("Predicting tools' usage...") |
| 304 usage_pred = predict_tool_usage.ToolPopularity() | 297 usage_pred = predict_tool_usage.ToolPopularity() |
| 305 usage = usage_pred.extract_tool_usage(tool_usage_path, cutoff_date, dictionary) | 298 usage = usage_pred.extract_tool_usage(usage_df, cutoff_date, dictionary) |
| 306 tool_usage_prediction = usage_pred.get_pupularity_prediction(usage) | 299 tool_usage_prediction = usage_pred.get_pupularity_prediction(usage) |
| 307 t_pred_usage = self.get_predicted_usage(dictionary, tool_usage_prediction) | 300 t_pred_usage = self.get_predicted_usage(dictionary, tool_usage_prediction) |
| 308 | |
| 309 # get class weights using the predicted usage for each tool | 301 # get class weights using the predicted usage for each tool |
| 310 class_weights = self.assign_class_weights(num_classes, t_pred_usage) | 302 class_weights = self.assign_class_weights(num_classes, t_pred_usage) |
| 311 | 303 return train_data, train_labels, test_data, test_labels, dictionary, rev_dict, class_weights, compatible_tools, tr_tool_freq |
| 312 return ( | |
| 313 train_data, | |
| 314 train_labels, | |
| 315 test_data, | |
| 316 test_labels, | |
| 317 dictionary, | |
| 318 rev_dict, | |
| 319 class_weights, | |
| 320 t_pred_usage, | |
| 321 l_tool_freq, | |
| 322 l_tool_tr_samples, | |
| 323 ) |
