Mercurial repository: bgruening / create_tool_recommendation_model
comparison: prepare_data.py @ 6:e94dc7945639 (draft, default branch, tip)

planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164

author:   bgruening
date:     Sun, 16 Oct 2022 11:52:10 +0000
parents:  4f7e6612906b
children: (none)
--- prepare_data.py (revision 5:4f7e6612906b)
+++ prepare_data.py (revision 6:e94dc7945639)
@@ -3,20 +3,19 @@
 machine learning algorithm. The paths are divided
 into the test and training sets
 """
 
 import collections
-import os
 import random
 
 import numpy as np
 import predict_tool_usage
-
-main_path = os.getcwd()
+from sklearn.model_selection import train_test_split
 
 
 class PrepareData:
+
     def __init__(self, max_seq_length, test_data_share):
         """ Init method. """
         self.max_tool_sequence_len = max_seq_length
         self.test_share = test_data_share
 
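Note: this revision drops the unused os import and module-level main_path, and brings in scikit-learn's train_test_split, which later replaces the hand-rolled dictionary split. A minimal sketch of how that call behaves (array names and shapes here are illustrative, not from this file):

    import numpy as np
    from sklearn.model_selection import train_test_split

    # toy stand-ins for the padded inputs and multi-hot targets built later
    inputs = np.arange(20).reshape(10, 2)
    targets = np.eye(10)

    # hold out 20% of the rows; random_state fixes the shuffle
    tr_x, te_x, tr_y, te_y = train_test_split(inputs, targets, test_size=0.2, random_state=42)
    print(tr_x.shape, te_x.shape)  # (8, 2) (2, 2)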
24 """ | 23 """ |
25 Get all the tools and complete set of individual paths for each workflow | 24 Get all the tools and complete set of individual paths for each workflow |
26 """ | 25 """ |
27 tokens = list() | 26 tokens = list() |
28 raw_paths = workflow_paths | 27 raw_paths = workflow_paths |
29 raw_paths = [x.replace("\n", "") for x in raw_paths] | 28 raw_paths = [x.replace("\n", '') for x in raw_paths] |
30 for item in raw_paths: | 29 for item in raw_paths: |
31 split_items = item.split(",") | 30 split_items = item.split(",") |
32 for token in split_items: | 31 for token in split_items: |
33 if token != "": | 32 if token != "": |
34 tokens.append(token) | 33 tokens.append(token) |
35 tokens = list(set(tokens)) | 34 tokens = list(set(tokens)) |
36 tokens = np.array(tokens) | 35 tokens = np.array(tokens) |
37 tokens = np.reshape( | 36 tokens = np.reshape(tokens, [-1, ]) |
38 tokens, | |
39 [ | |
40 -1, | |
41 ], | |
42 ) | |
43 return tokens, raw_paths | 37 return tokens, raw_paths |
44 | 38 |
45 def create_new_dict(self, new_data_dict): | 39 def create_new_dict(self, new_data_dict): |
46 """ | 40 """ |
47 Create new data dictionary | 41 Create new data dictionary |
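Note: process_workflow_paths, which this hunk touches only cosmetically, flattens comma-separated workflow paths into an array of unique tool tokens. A self-contained illustration with made-up tool names:

    import numpy as np

    raw_paths = ["fastqc,trimmomatic,bwa\n", "fastqc,multiqc\n"]  # hypothetical paths
    raw_paths = [x.replace("\n", "") for x in raw_paths]
    tokens = []
    for item in raw_paths:
        for token in item.split(","):
            if token != "":
                tokens.append(token)
    tokens = np.reshape(np.array(list(set(tokens))), [-1, ])
    print(tokens)  # the four unique tool names, in arbitrary order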
60 """ | 54 """ |
61 Create two dictionaries having tools names and their indexes | 55 Create two dictionaries having tools names and their indexes |
62 """ | 56 """ |
63 count = collections.Counter(words).most_common() | 57 count = collections.Counter(words).most_common() |
64 dictionary = dict() | 58 dictionary = dict() |
65 for word, _ in count: | 59 for index, (word, _) in enumerate(count): |
60 word = word.lstrip() | |
61 word = word.rstrip() | |
66 dictionary[word] = len(dictionary) + 1 | 62 dictionary[word] = len(dictionary) + 1 |
67 word = word.strip() | 63 dictionary, reverse_dictionary = self.assemble_dictionary(dictionary, old_data_dictionary) |
68 dictionary, reverse_dictionary = self.assemble_dictionary( | |
69 dictionary, old_data_dictionary | |
70 ) | |
71 return dictionary, reverse_dictionary | 64 return dictionary, reverse_dictionary |
72 | 65 |
73 def decompose_paths(self, paths, dictionary): | 66 def decompose_paths(self, paths, dictionary): |
74 """ | 67 """ |
75 Decompose the paths to variable length sub-paths keeping the first tool fixed | 68 Decompose the paths to variable length sub-paths keeping the first tool fixed |
76 """ | 69 """ |
70 max_len = 0 | |
77 sub_paths_pos = list() | 71 sub_paths_pos = list() |
78 for index, item in enumerate(paths): | 72 for index, item in enumerate(paths): |
79 tools = item.split(",") | 73 tools = item.split(",") |
80 len_tools = len(tools) | 74 len_tools = len(tools) |
81 if len_tools <= self.max_tool_sequence_len: | 75 if len_tools > max_len: |
82 for window in range(1, len_tools): | 76 max_len = len_tools |
83 sequence = tools[0: window + 1] | 77 if len_tools < self.max_tool_sequence_len: |
84 tools_pos = [ | 78 sequence = tools[0: len_tools] |
85 str(dictionary[str(tool_item)]) for tool_item in sequence | 79 tools_pos = [str(dictionary[str(tool_item)]) for tool_item in sequence] |
86 ] | 80 if len(tools_pos) > 1: |
87 if len(tools_pos) > 1: | 81 sub_paths_pos.append(",".join(tools_pos)) |
88 sub_paths_pos.append(",".join(tools_pos)) | |
89 sub_paths_pos = list(set(sub_paths_pos)) | 82 sub_paths_pos = list(set(sub_paths_pos)) |
83 print("Max length of tools: ", max_len) | |
90 return sub_paths_pos | 84 return sub_paths_pos |
91 | 85 |
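Note the behavioural change in decompose_paths: the old version expanded each path into all of its prefixes, while the new one keeps qualifying paths whole (prefix windows are generated later, in prepare_input_target_paths) and additionally reports the longest path seen. A toy comparison, assuming a small name-to-id dictionary:

    dictionary = {"a": 1, "b": 2, "c": 3}  # hypothetical vocabulary
    tools = "a,b,c".split(",")

    # old behaviour: every prefix with at least two tools
    old = [",".join(str(dictionary[t]) for t in tools[: w + 1]) for w in range(1, len(tools))]
    print(old)  # ['1,2', '1,2,3']

    # new behaviour: the full path only (when shorter than the length cap)
    print(",".join(str(dictionary[t]) for t in tools))  # 1,2,3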
@@ -92,88 +86,82 @@
-    def prepare_paths_labels_dictionary(
-        self, dictionary, reverse_dictionary, paths, compatible_next_tools
-    ):
-        """
-        Create a dictionary of sequences with their labels for training and test paths
-        """
-        paths_labels = dict()
-        random.shuffle(paths)
-        for item in paths:
-            if item and item not in "":
-                tools = item.split(",")
-                label = tools[-1]
-                train_tools = tools[: len(tools) - 1]
-                last_but_one_name = reverse_dictionary[int(train_tools[-1])]
-                try:
-                    compatible_tools = compatible_next_tools[last_but_one_name].split(
-                        ","
-                    )
-                except Exception:
-                    continue
-                if len(compatible_tools) > 0:
-                    compatible_tools_ids = [
-                        str(dictionary[x]) for x in compatible_tools
-                    ]
-                    compatible_tools_ids.append(label)
-                    composite_labels = ",".join(compatible_tools_ids)
-                train_tools = ",".join(train_tools)
-                if train_tools in paths_labels:
-                    paths_labels[train_tools] += "," + composite_labels
-                else:
-                    paths_labels[train_tools] = composite_labels
-        for item in paths_labels:
-            paths_labels[item] = ",".join(list(set(paths_labels[item].split(","))))
-        return paths_labels
-
-    def pad_test_paths(self, paths_dictionary, num_classes):
-        """
-        Add padding to the tools sequences and create multi-hot encoded labels
-        """
-        size_data = len(paths_dictionary)
-        data_mat = np.zeros([size_data, self.max_tool_sequence_len])
-        label_mat = np.zeros([size_data, num_classes + 1])
+    def prepare_input_one_target_paths(self, dictionary, reverse_dictionary, paths):
+        input_target_paths = dict()
+        compatible_tools = dict()
+        d_size = 0
+        for i, item in enumerate(paths):
+            input_tools = item.split(",")
+            tool_seq = input_tools
+            i_tools = ",".join(tool_seq[0:-1])
+            last_i_tool = i_tools.split(",")[-1]
+            if last_i_tool not in compatible_tools:
+                compatible_tools[last_i_tool] = list()
+            t_tools = tool_seq[-1]
+            if t_tools not in compatible_tools[last_i_tool]:
+                compatible_tools[last_i_tool].append(t_tools)
+            if i_tools not in input_target_paths:
+                input_target_paths[i_tools] = list()
+            if t_tools not in input_target_paths[i_tools]:
+                input_target_paths[i_tools].append(t_tools)
+            if i_tools not in input_target_paths:
+                input_target_paths[i_tools] = list()
+            if t_tools not in input_target_paths[i_tools]:
+                input_target_paths[i_tools].append(t_tools)
+        for item in input_target_paths:
+            d_size += len(input_target_paths[item])
+        print("Dataset size:", d_size)
+        return input_target_paths, compatible_tools, d_size
+
+    def prepare_input_target_paths(self, dictionary, reverse_dictionary, paths):
+        input_target_paths = dict()
+        compatible_tools = dict()
+        d_size = 0
+        for i, item in enumerate(paths):
+            input_tools = item.split(",")
+            ctr = 0
+            for ctr in range(len(input_tools) - 1):
+                # uncomment this for one token target idea
+                tool_seq = input_tools[0: ctr + 2]
+                i_tools = ",".join(tool_seq[0:-1])
+                last_i_tool = i_tools.split(",")[-1]
+                if last_i_tool not in compatible_tools:
+                    compatible_tools[last_i_tool] = list()
+                t_tools = tool_seq[-1]
+                if t_tools not in compatible_tools[last_i_tool]:
+                    compatible_tools[last_i_tool].append(t_tools)
+                if i_tools not in input_target_paths:
+                    input_target_paths[i_tools] = list()
+                if t_tools not in input_target_paths[i_tools]:
+                    input_target_paths[i_tools].append(t_tools)
+                if i_tools not in input_target_paths:
+                    input_target_paths[i_tools] = list()
+                if t_tools not in input_target_paths[i_tools]:
+                    input_target_paths[i_tools].append(t_tools)
+        for item in input_target_paths:
+            d_size += len(input_target_paths[item])
+        print("Dataset size:", d_size)
+        return input_target_paths, compatible_tools, d_size
+
+    def pad_paths_one_tool_target(self, multi_paths, compatible_tools, d_size, rev_dict, dictionary):
+        d_size = len(multi_paths)
+        input_mat = np.zeros([d_size, self.max_tool_sequence_len])
+        target_mat = np.zeros([d_size, len(dictionary) + 1])
         train_counter = 0
-        for train_seq, train_label in list(paths_dictionary.items()):
-            positions = train_seq.split(",")
-            start_pos = self.max_tool_sequence_len - len(positions)
-            for id_pos, pos in enumerate(positions):
-                data_mat[train_counter][start_pos + id_pos] = int(pos)
-            for label_item in train_label.split(","):
-                label_mat[train_counter][int(label_item)] = 1.0
+        for input_seq, target_seq_tools in list(multi_paths.items()):
+            input_seq_tools = input_seq.split(",")
+            last_i_tool = input_seq_tools[-1]
+            for id_pos, pos in enumerate(input_seq_tools):
+                input_mat[train_counter][id_pos] = int(pos)
+            if last_i_tool in compatible_tools:
+                compatible_targets = compatible_tools[last_i_tool]
+            for k, t_label in enumerate(target_seq_tools):
+                target_mat[train_counter][int(t_label)] = 1
+            for c_tool in compatible_targets:
+                target_mat[train_counter][int(c_tool)] = 1
             train_counter += 1
-        return data_mat, label_mat
-
-    def pad_paths(
-        self, paths_dictionary, num_classes, standard_connections, reverse_dictionary
-    ):
-        """
-        Add padding to the tools sequences and create multi-hot encoded labels
-        """
-        size_data = len(paths_dictionary)
-        data_mat = np.zeros([size_data, self.max_tool_sequence_len])
-        label_mat = np.zeros([size_data, 2 * (num_classes + 1)])
-        pos_flag = 1.0
-        train_counter = 0
-        for train_seq, train_label in list(paths_dictionary.items()):
-            pub_connections = list()
-            positions = train_seq.split(",")
-            last_tool_id = positions[-1]
-            last_tool_name = reverse_dictionary[int(last_tool_id)]
-            start_pos = self.max_tool_sequence_len - len(positions)
-            for id_pos, pos in enumerate(positions):
-                data_mat[train_counter][start_pos + id_pos] = int(pos)
-            if last_tool_name in standard_connections:
-                pub_connections = standard_connections[last_tool_name]
-            for label_item in train_label.split(","):
-                label_pos = int(label_item)
-                label_row = label_mat[train_counter]
-                if reverse_dictionary[label_pos] in pub_connections:
-                    label_row[label_pos] = pos_flag
-                else:
-                    label_row[label_pos + num_classes + 1] = pos_flag
-            train_counter += 1
-        return data_mat, label_mat
+        print("Final data size: ", input_mat.shape, target_mat.shape)
+        train_data, test_data, train_labels, test_labels = train_test_split(input_mat, target_mat, test_size=self.test_share, random_state=42)
+        return train_data, train_labels, test_data, test_labels
 
     def split_test_train_data(self, multilabels_paths):
         """
         Split into test and train data randomly for each run
         """
@@ -219,25 +207,50 @@
             if u_score < 1.0:
                 u_score += 1.0
             class_weights[key] = np.round(np.log(u_score), 6)
         return class_weights
 
-    def get_train_last_tool_freq(self, train_paths, reverse_dictionary):
+    def get_train_tool_labels_freq(self, train_paths, reverse_dictionary):
         """
         Get the frequency of last tool of each tool sequence
         to estimate the frequency of tool sequences
         """
         last_tool_freq = dict()
         freq_dict_names = dict()
         for path in train_paths:
-            last_tool = path.split(",")[-1]
+            tools_pos = np.where(path > 0)[0]
+            path_pos = tools_pos
+            path_pos = [str(int(item)) for item in path_pos]
+
+            for tool_pos in path_pos:
+                if tool_pos not in last_tool_freq:
+                    last_tool_freq[tool_pos] = 0
+                    freq_dict_names[reverse_dictionary[int(tool_pos)]] = 0
+                last_tool_freq[tool_pos] += 1
+                freq_dict_names[reverse_dictionary[int(tool_pos)]] += 1
+        sorted_dict = dict(sorted(last_tool_freq.items(), key=lambda kv: kv[1], reverse=True))
+        return sorted_dict
+
+    def get_train_last_tool_freq(self, train_paths, reverse_dictionary):
+        """
+        Get the frequency of last tool of each tool sequence
+        to estimate the frequency of tool sequences
+        """
+        last_tool_freq = dict()
+        freq_dict_names = dict()
+        for path in train_paths:
+            tools_pos = np.where(path > 0)[0]
+            path_pos = path[tools_pos]
+            path_pos = [str(int(item)) for item in path_pos]
+            last_tool = path_pos[-1]
             if last_tool not in last_tool_freq:
                 last_tool_freq[last_tool] = 0
                 freq_dict_names[reverse_dictionary[int(last_tool)]] = 0
             last_tool_freq[last_tool] += 1
             freq_dict_names[reverse_dictionary[int(last_tool)]] += 1
-        return last_tool_freq
+        sorted_dict = dict(sorted(last_tool_freq.items(), key=lambda kv: kv[1], reverse=True))
+        return sorted_dict
 
     def get_toolid_samples(self, train_data, l_tool_freq):
         l_tool_tr_samples = dict()
         for tool_id in l_tool_freq:
             for index, tr_sample in enumerate(train_data):
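Note: both frequency helpers now receive numpy rows instead of comma-separated strings, so the tool ids are recovered with np.where before counting. For a multi-hot label row the ids are the non-zero indices, while for a padded input row they are the non-zero values. For example:

    import numpy as np

    input_row = np.array([4.0, 2.0, 7.0, 0.0, 0.0])  # padded input: ids are the values
    print(input_row[np.where(input_row > 0)[0]])      # [4. 2. 7.]

    label_row = np.zeros(8)
    label_row[[2, 5]] = 1                             # multi-hot label: ids are the indices
    print(np.where(label_row > 0)[0])                 # [2 5]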
@@ -246,78 +259,45 @@
                 if last_tool_id not in l_tool_tr_samples:
                     l_tool_tr_samples[last_tool_id] = list()
                 l_tool_tr_samples[last_tool_id].append(index)
         return l_tool_tr_samples
 
-    def get_data_labels_matrices(
-        self,
-        workflow_paths,
-        tool_usage_path,
-        cutoff_date,
-        compatible_next_tools,
-        standard_connections,
-        old_data_dictionary={},
-    ):
+    def get_data_labels_matrices(self, workflow_paths, usage_df, cutoff_date, standard_connections, old_data_dictionary={}):
         """
         Convert the training and test paths into corresponding numpy matrices
         """
         processed_data, raw_paths = self.process_workflow_paths(workflow_paths)
-        dictionary, rev_dict = self.create_data_dictionary(
-            processed_data, old_data_dictionary
-        )
+        dictionary, rev_dict = self.create_data_dictionary(processed_data, old_data_dictionary)
+
         num_classes = len(dictionary)
 
         print("Raw paths: %d" % len(raw_paths))
         random.shuffle(raw_paths)
 
         print("Decomposing paths...")
         all_unique_paths = self.decompose_paths(raw_paths, dictionary)
         random.shuffle(all_unique_paths)
 
         print("Creating dictionaries...")
-        multilabels_paths = self.prepare_paths_labels_dictionary(
-            dictionary, rev_dict, all_unique_paths, compatible_next_tools
-        )
-
-        print("Complete data: %d" % len(multilabels_paths))
-        train_paths_dict, test_paths_dict = self.split_test_train_data(
-            multilabels_paths
-        )
-
-        print("Train data: %d" % len(train_paths_dict))
-        print("Test data: %d" % len(test_paths_dict))
+        multilabels_paths, compatible_tools, d_size = self.prepare_input_target_paths(dictionary, rev_dict, all_unique_paths)
+
+        print("Complete data: %d" % d_size)
 
         print("Padding train and test data...")
-        # pad training and test data with leading zeros
-        test_data, test_labels = self.pad_paths(
-            test_paths_dict, num_classes, standard_connections, rev_dict
-        )
-        train_data, train_labels = self.pad_paths(
-            train_paths_dict, num_classes, standard_connections, rev_dict
-        )
+        # pad training and test data with trailing zeros
+        train_data, train_labels, test_data, test_labels = self.pad_paths_one_tool_target(multilabels_paths, compatible_tools, d_size, rev_dict, dictionary)
+
+        print("Train data: ", train_data.shape)
+        print("Test data: ", test_data.shape)
 
         print("Estimating sample frequency...")
-        l_tool_freq = self.get_train_last_tool_freq(train_paths_dict, rev_dict)
-        l_tool_tr_samples = self.get_toolid_samples(train_data, l_tool_freq)
+        tr_tool_freq = self.get_train_tool_labels_freq(train_labels, rev_dict)
 
         # Predict tools usage
         print("Predicting tools' usage...")
         usage_pred = predict_tool_usage.ToolPopularity()
-        usage = usage_pred.extract_tool_usage(tool_usage_path, cutoff_date, dictionary)
+        usage = usage_pred.extract_tool_usage(usage_df, cutoff_date, dictionary)
         tool_usage_prediction = usage_pred.get_pupularity_prediction(usage)
         t_pred_usage = self.get_predicted_usage(dictionary, tool_usage_prediction)
-
         # get class weights using the predicted usage for each tool
         class_weights = self.assign_class_weights(num_classes, t_pred_usage)
-
-        return (
-            train_data,
-            train_labels,
-            test_data,
-            test_labels,
-            dictionary,
-            rev_dict,
-            class_weights,
-            t_pred_usage,
-            l_tool_freq,
-            l_tool_tr_samples,
-        )
+        return train_data, train_labels, test_data, test_labels, dictionary, rev_dict, class_weights, compatible_tools, tr_tool_freq
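Note: for orientation, the reworked entry point would be driven roughly as follows; the inputs here are placeholders, and the exact format expected for usage_df is defined in predict_tool_usage.py, which is outside this diff:

    import prepare_data  # the module shown above

    workflow_paths = ["fastqc,trimmomatic,bwa", "fastqc,multiqc"]  # toy paths
    usage_df = "tool-popularity.tsv"  # placeholder usage data
    standard_connections = {}         # no curated connections in this sketch

    data = prepare_data.PrepareData(max_seq_length=25, test_data_share=0.2)
    (train_data, train_labels, test_data, test_labels,
     dictionary, rev_dict, class_weights,
     compatible_tools, tr_tool_freq) = data.get_data_labels_matrices(
        workflow_paths, usage_df, "2017-12-01", standard_connections)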