comparison prepare_data.py @ 5:4f7e6612906b draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 5eebc0cb44e71f581d548b7e842002705dd155eb"
author bgruening
date Fri, 06 May 2022 09:05:18 +0000
parents afec8c595124
children e94dc7945639
comparison
equal deleted inserted replaced
4:afec8c595124 5:4f7e6612906b
2 Prepare the workflow paths to be used by downstream 2 Prepare the workflow paths to be used by downstream
3 machine learning algorithm. The paths are divided 3 machine learning algorithm. The paths are divided
4 into the test and training sets 4 into the test and training sets
5 """ 5 """
6 6
7 import collections
7 import os 8 import os
8 import collections 9 import random
10
9 import numpy as np 11 import numpy as np
10 import random
11
12 import predict_tool_usage 12 import predict_tool_usage
13 13
14 main_path = os.getcwd() 14 main_path = os.getcwd()
15 15
16 16
17 class PrepareData: 17 class PrepareData:
18
19 def __init__(self, max_seq_length, test_data_share): 18 def __init__(self, max_seq_length, test_data_share):
20 """ Init method. """ 19 """ Init method. """
21 self.max_tool_sequence_len = max_seq_length 20 self.max_tool_sequence_len = max_seq_length
22 self.test_share = test_data_share 21 self.test_share = test_data_share
23 22
25 """ 24 """
26 Get all the tools and complete set of individual paths for each workflow 25 Get all the tools and complete set of individual paths for each workflow
27 """ 26 """
28 tokens = list() 27 tokens = list()
29 raw_paths = workflow_paths 28 raw_paths = workflow_paths
30 raw_paths = [x.replace("\n", '') for x in raw_paths] 29 raw_paths = [x.replace("\n", "") for x in raw_paths]
31 for item in raw_paths: 30 for item in raw_paths:
32 split_items = item.split(",") 31 split_items = item.split(",")
33 for token in split_items: 32 for token in split_items:
34 if token is not "": 33 if token != "":
35 tokens.append(token) 34 tokens.append(token)
36 tokens = list(set(tokens)) 35 tokens = list(set(tokens))
37 tokens = np.array(tokens) 36 tokens = np.array(tokens)
38 tokens = np.reshape(tokens, [-1, ]) 37 tokens = np.reshape(
38 tokens,
39 [
40 -1,
41 ],
42 )
39 return tokens, raw_paths 43 return tokens, raw_paths
40 44
41 def create_new_dict(self, new_data_dict): 45 def create_new_dict(self, new_data_dict):
42 """ 46 """
43 Create new data dictionary 47 Create new data dictionary
58 """ 62 """
59 count = collections.Counter(words).most_common() 63 count = collections.Counter(words).most_common()
60 dictionary = dict() 64 dictionary = dict()
61 for word, _ in count: 65 for word, _ in count:
62 dictionary[word] = len(dictionary) + 1 66 dictionary[word] = len(dictionary) + 1
63 dictionary, reverse_dictionary = self.assemble_dictionary(dictionary, old_data_dictionary) 67 word = word.strip()
68 dictionary, reverse_dictionary = self.assemble_dictionary(
69 dictionary, old_data_dictionary
70 )
64 return dictionary, reverse_dictionary 71 return dictionary, reverse_dictionary
65 72
66 def decompose_paths(self, paths, dictionary): 73 def decompose_paths(self, paths, dictionary):
67 """ 74 """
68 Decompose the paths to variable length sub-paths keeping the first tool fixed 75 Decompose the paths to variable length sub-paths keeping the first tool fixed
72 tools = item.split(",") 79 tools = item.split(",")
73 len_tools = len(tools) 80 len_tools = len(tools)
74 if len_tools <= self.max_tool_sequence_len: 81 if len_tools <= self.max_tool_sequence_len:
75 for window in range(1, len_tools): 82 for window in range(1, len_tools):
76 sequence = tools[0: window + 1] 83 sequence = tools[0: window + 1]
77 tools_pos = [str(dictionary[str(tool_item)]) for tool_item in sequence] 84 tools_pos = [
85 str(dictionary[str(tool_item)]) for tool_item in sequence
86 ]
78 if len(tools_pos) > 1: 87 if len(tools_pos) > 1:
79 sub_paths_pos.append(",".join(tools_pos)) 88 sub_paths_pos.append(",".join(tools_pos))
80 sub_paths_pos = list(set(sub_paths_pos)) 89 sub_paths_pos = list(set(sub_paths_pos))
81 return sub_paths_pos 90 return sub_paths_pos
82 91
83 def prepare_paths_labels_dictionary(self, dictionary, reverse_dictionary, paths, compatible_next_tools): 92 def prepare_paths_labels_dictionary(
93 self, dictionary, reverse_dictionary, paths, compatible_next_tools
94 ):
84 """ 95 """
85 Create a dictionary of sequences with their labels for training and test paths 96 Create a dictionary of sequences with their labels for training and test paths
86 """ 97 """
87 paths_labels = dict() 98 paths_labels = dict()
88 random.shuffle(paths) 99 random.shuffle(paths)
89 for item in paths: 100 for item in paths:
90 if item and item not in "": 101 if item and item not in "":
91 tools = item.split(",") 102 tools = item.split(",")
92 label = tools[-1] 103 label = tools[-1]
93 train_tools = tools[:len(tools) - 1] 104 train_tools = tools[: len(tools) - 1]
94 last_but_one_name = reverse_dictionary[int(train_tools[-1])] 105 last_but_one_name = reverse_dictionary[int(train_tools[-1])]
95 try: 106 try:
96 compatible_tools = compatible_next_tools[last_but_one_name].split(",") 107 compatible_tools = compatible_next_tools[last_but_one_name].split(
108 ","
109 )
97 except Exception: 110 except Exception:
98 continue 111 continue
99 if len(compatible_tools) > 0: 112 if len(compatible_tools) > 0:
100 compatible_tools_ids = [str(dictionary[x]) for x in compatible_tools] 113 compatible_tools_ids = [
114 str(dictionary[x]) for x in compatible_tools
115 ]
101 compatible_tools_ids.append(label) 116 compatible_tools_ids.append(label)
102 composite_labels = ",".join(compatible_tools_ids) 117 composite_labels = ",".join(compatible_tools_ids)
103 train_tools = ",".join(train_tools) 118 train_tools = ",".join(train_tools)
104 if train_tools in paths_labels: 119 if train_tools in paths_labels:
105 paths_labels[train_tools] += "," + composite_labels 120 paths_labels[train_tools] += "," + composite_labels
125 for label_item in train_label.split(","): 140 for label_item in train_label.split(","):
126 label_mat[train_counter][int(label_item)] = 1.0 141 label_mat[train_counter][int(label_item)] = 1.0
127 train_counter += 1 142 train_counter += 1
128 return data_mat, label_mat 143 return data_mat, label_mat
129 144
130 def pad_paths(self, paths_dictionary, num_classes, standard_connections, reverse_dictionary): 145 def pad_paths(
146 self, paths_dictionary, num_classes, standard_connections, reverse_dictionary
147 ):
131 """ 148 """
132 Add padding to the tools sequences and create multi-hot encoded labels 149 Add padding to the tools sequences and create multi-hot encoded labels
133 """ 150 """
134 size_data = len(paths_dictionary) 151 size_data = len(paths_dictionary)
135 data_mat = np.zeros([size_data, self.max_tool_sequence_len]) 152 data_mat = np.zeros([size_data, self.max_tool_sequence_len])
229 if last_tool_id not in l_tool_tr_samples: 246 if last_tool_id not in l_tool_tr_samples:
230 l_tool_tr_samples[last_tool_id] = list() 247 l_tool_tr_samples[last_tool_id] = list()
231 l_tool_tr_samples[last_tool_id].append(index) 248 l_tool_tr_samples[last_tool_id].append(index)
232 return l_tool_tr_samples 249 return l_tool_tr_samples
233 250
234 def get_data_labels_matrices(self, workflow_paths, tool_usage_path, cutoff_date, compatible_next_tools, standard_connections, old_data_dictionary={}): 251 def get_data_labels_matrices(
252 self,
253 workflow_paths,
254 tool_usage_path,
255 cutoff_date,
256 compatible_next_tools,
257 standard_connections,
258 old_data_dictionary={},
259 ):
235 """ 260 """
236 Convert the training and test paths into corresponding numpy matrices 261 Convert the training and test paths into corresponding numpy matrices
237 """ 262 """
238 processed_data, raw_paths = self.process_workflow_paths(workflow_paths) 263 processed_data, raw_paths = self.process_workflow_paths(workflow_paths)
239 dictionary, rev_dict = self.create_data_dictionary(processed_data, old_data_dictionary) 264 dictionary, rev_dict = self.create_data_dictionary(
265 processed_data, old_data_dictionary
266 )
240 num_classes = len(dictionary) 267 num_classes = len(dictionary)
241 268
242 print("Raw paths: %d" % len(raw_paths)) 269 print("Raw paths: %d" % len(raw_paths))
243 random.shuffle(raw_paths) 270 random.shuffle(raw_paths)
244 271
245 print("Decomposing paths...") 272 print("Decomposing paths...")
246 all_unique_paths = self.decompose_paths(raw_paths, dictionary) 273 all_unique_paths = self.decompose_paths(raw_paths, dictionary)
247 random.shuffle(all_unique_paths) 274 random.shuffle(all_unique_paths)
248 275
249 print("Creating dictionaries...") 276 print("Creating dictionaries...")
250 multilabels_paths = self.prepare_paths_labels_dictionary(dictionary, rev_dict, all_unique_paths, compatible_next_tools) 277 multilabels_paths = self.prepare_paths_labels_dictionary(
278 dictionary, rev_dict, all_unique_paths, compatible_next_tools
279 )
251 280
252 print("Complete data: %d" % len(multilabels_paths)) 281 print("Complete data: %d" % len(multilabels_paths))
253 train_paths_dict, test_paths_dict = self.split_test_train_data(multilabels_paths) 282 train_paths_dict, test_paths_dict = self.split_test_train_data(
283 multilabels_paths
284 )
254 285
255 print("Train data: %d" % len(train_paths_dict)) 286 print("Train data: %d" % len(train_paths_dict))
256 print("Test data: %d" % len(test_paths_dict)) 287 print("Test data: %d" % len(test_paths_dict))
257 288
258 print("Padding train and test data...") 289 print("Padding train and test data...")
259 # pad training and test data with leading zeros 290 # pad training and test data with leading zeros
260 test_data, test_labels = self.pad_paths(test_paths_dict, num_classes, standard_connections, rev_dict) 291 test_data, test_labels = self.pad_paths(
261 train_data, train_labels = self.pad_paths(train_paths_dict, num_classes, standard_connections, rev_dict) 292 test_paths_dict, num_classes, standard_connections, rev_dict
293 )
294 train_data, train_labels = self.pad_paths(
295 train_paths_dict, num_classes, standard_connections, rev_dict
296 )
262 297
263 print("Estimating sample frequency...") 298 print("Estimating sample frequency...")
264 l_tool_freq = self.get_train_last_tool_freq(train_paths_dict, rev_dict) 299 l_tool_freq = self.get_train_last_tool_freq(train_paths_dict, rev_dict)
265 l_tool_tr_samples = self.get_toolid_samples(train_data, l_tool_freq) 300 l_tool_tr_samples = self.get_toolid_samples(train_data, l_tool_freq)
266 301
272 t_pred_usage = self.get_predicted_usage(dictionary, tool_usage_prediction) 307 t_pred_usage = self.get_predicted_usage(dictionary, tool_usage_prediction)
273 308
274 # get class weights using the predicted usage for each tool 309 # get class weights using the predicted usage for each tool
275 class_weights = self.assign_class_weights(num_classes, t_pred_usage) 310 class_weights = self.assign_class_weights(num_classes, t_pred_usage)
276 311
277 return train_data, train_labels, test_data, test_labels, dictionary, rev_dict, class_weights, t_pred_usage, l_tool_freq, l_tool_tr_samples 312 return (
313 train_data,
314 train_labels,
315 test_data,
316 test_labels,
317 dictionary,
318 rev_dict,
319 class_weights,
320 t_pred_usage,
321 l_tool_freq,
322 l_tool_tr_samples,
323 )