create_tool_recommendation_model: prepare_data.py @ 0:9bf25dbe00ad (draft)
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
author:    bgruening
date:      Wed, 28 Aug 2019 07:19:38 -0400
parents:   (none)
children:  5b3c08710e47
1 """ | |
2 Prepare the workflow paths to be used by downstream | |
3 machine learning algorithm. The paths are divided | |
4 into the test and training sets | |
5 """ | |

import collections
import os
import random

import numpy as np

import predict_tool_usage

main_path = os.getcwd()

class PrepareData:

    def __init__(self, max_seq_length, test_data_share):
        """ Init method. """
        self.max_tool_sequence_len = max_seq_length
        self.test_share = test_data_share

    def process_workflow_paths(self, workflow_paths):
        """
        Get all the tools and the complete set of individual paths for each workflow
        """
        tokens = list()
        raw_paths = [x.replace("\n", "") for x in workflow_paths]
        for item in raw_paths:
            split_items = item.split(",")
            for token in split_items:
                if token != "":
                    tokens.append(token)
        tokens = list(set(tokens))
        tokens = np.array(tokens)
        tokens = np.reshape(tokens, [-1, ])
        return tokens, raw_paths
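
    # Example: for workflow_paths ["a,b,c\n", "a,d\n"], raw_paths becomes
    # ["a,b,c", "a,d"] and tokens is an array of the unique tool names
    # {"a", "b", "c", "d"} (in arbitrary order).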

    def create_new_dict(self, new_data_dict):
        """
        Create a new data dictionary
        """
        reverse_dict = dict((v, k) for k, v in new_data_dict.items())
        return new_data_dict, reverse_dict
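
    # Example: {"a": 1, "b": 2} yields the reverse mapping {1: "a", 2: "b"},
    # so tool indices can be translated back to tool names.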

    def assemble_dictionary(self, new_data_dict, old_data_dictionary={}):
        """
        Create/update tool indices in the forward and backward dictionaries
        """
        new_data_dict, reverse_dict = self.create_new_dict(new_data_dict)
        return new_data_dict, reverse_dict

    def create_data_dictionary(self, words, old_data_dictionary={}):
        """
        Create two dictionaries mapping tool names to indices and back
        """
        count = collections.Counter(words).most_common()
        dictionary = dict()
        for word, _ in count:
            dictionary[word] = len(dictionary) + 1
        dictionary, reverse_dictionary = self.assemble_dictionary(dictionary, old_data_dictionary)
        return dictionary, reverse_dictionary
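
    # Indices start at 1, assigned in decreasing order of frequency; index 0
    # is reserved for the padding positions added in pad_paths.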

    def decompose_paths(self, paths, dictionary):
        """
        Decompose the paths into variable-length sub-paths, keeping the first tool fixed
        """
        sub_paths_pos = list()
        for item in paths:
            tools = item.split(",")
            len_tools = len(tools)
            if len_tools <= self.max_tool_sequence_len:
                for window in range(1, len_tools):
                    sequence = tools[0: window + 1]
                    tools_pos = [str(dictionary[str(tool_item)]) for tool_item in sequence]
                    if len(tools_pos) > 1:
                        sub_paths_pos.append(",".join(tools_pos))
        sub_paths_pos = list(set(sub_paths_pos))
        return sub_paths_pos
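
    # Example: with dictionary {"a": 1, "b": 2, "c": 3}, the path "a,b,c"
    # decomposes into the encoded prefixes "1,2" and "1,2,3"; paths longer
    # than max_tool_sequence_len are dropped entirely.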

    def prepare_paths_labels_dictionary(self, dictionary, reverse_dictionary, paths, compatible_next_tools):
        """
        Create a dictionary of sequences with their labels for training and test paths
        """
        paths_labels = dict()
        random.shuffle(paths)
        for item in paths:
            if item:
                tools = item.split(",")
                label = tools[-1]
                train_tools = tools[:-1]
                last_but_one_name = reverse_dictionary[int(train_tools[-1])]
                try:
                    compatible_tools = compatible_next_tools[last_but_one_name].split(",")
                except KeyError:
                    continue
                if len(compatible_tools) > 0:
                    compatible_tools_ids = [str(dictionary[x]) for x in compatible_tools]
                    compatible_tools_ids.append(label)
                    composite_labels = ",".join(compatible_tools_ids)
                train_tools = ",".join(train_tools)
                if train_tools in paths_labels:
                    paths_labels[train_tools] += "," + composite_labels
                else:
                    paths_labels[train_tools] = composite_labels
        for item in paths_labels:
            paths_labels[item] = ",".join(list(set(paths_labels[item].split(","))))
        return paths_labels
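
    # Example: for the encoded prefix "1,2" observed as "1,2,3", the stored
    # label set contains "3" (the actual next tool) plus the ids of every
    # tool declared compatible after tool 2, de-duplicated at the end.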

    def pad_paths(self, paths_dictionary, num_classes):
        """
        Add padding to the tool sequences and create multi-hot encoded labels
        """
        size_data = len(paths_dictionary)
        data_mat = np.zeros([size_data, self.max_tool_sequence_len])
        label_mat = np.zeros([size_data, num_classes + 1])
        train_counter = 0
        for train_seq, train_label in list(paths_dictionary.items()):
            positions = train_seq.split(",")
            start_pos = self.max_tool_sequence_len - len(positions)
            for id_pos, pos in enumerate(positions):
                data_mat[train_counter][start_pos + id_pos] = int(pos)
            for label_item in train_label.split(","):
                label_mat[train_counter][int(label_item)] = 1.0
            train_counter += 1
        return data_mat, label_mat
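
    # Example: with max_tool_sequence_len = 5, the sequence "1,2" becomes the
    # left-padded row [0, 0, 0, 1, 2], and a label set "3,4" sets columns 3
    # and 4 of the corresponding label row to 1.0 (multi-hot encoding).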

    def split_test_train_data(self, multilabels_paths):
        """
        Split the data randomly into test and train sets for each run
        """
        train_dict = dict()
        test_dict = dict()
        all_paths = list(multilabels_paths.keys())
        random.shuffle(all_paths)
        split_number = int(self.test_share * len(all_paths))
        for index, path in enumerate(all_paths):
            if index < split_number:
                test_dict[path] = multilabels_paths[path]
            else:
                train_dict[path] = multilabels_paths[path]
        return train_dict, test_dict
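
    # Example: with test_share = 0.2 and 100 paths, the first 20 paths in
    # shuffled order form the test set and the remaining 80 the train set.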

    def verify_overlap(self, train_paths, test_paths):
        """
        Verify the overlap of samples in the train and test data
        """
        intersection = list(set(train_paths).intersection(set(test_paths)))
        print("Overlap in train and test: %d" % len(intersection))

    def get_predicted_usage(self, data_dictionary, predicted_usage):
        """
        Get the predicted usage for each tool
        """
        usage = dict()
        epsilon = 0.0
        # index 0 does not belong to any tool
        usage[0] = epsilon
        for k, v in data_dictionary.items():
            try:
                usg = predicted_usage[k]
                if usg < epsilon:
                    usg = epsilon
                usage[v] = usg
            except KeyError:
                usage[v] = epsilon
        return usage
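
    # The result maps tool index -> predicted usage score, clipped below at
    # epsilon (0.0); tools missing from the prediction get epsilon as well.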

    def assign_class_weights(self, n_classes, predicted_usage):
        """
        Compute class weights using the predicted usage
        """
        class_weights = dict()
        class_weights[0] = 0.0
        for key in range(1, n_classes):
            u_score = predicted_usage[key]
            if u_score < 1.0:
                u_score += 1.0
            class_weights[key] = np.log(u_score)
        return class_weights
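
    # Example: a predicted usage of 0.5 is shifted to 1.5 and weighted
    # log(1.5) ≈ 0.41, while a usage of 20 is weighted log(20) ≈ 3.0, so
    # heavily used tools contribute more to the loss.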

    def get_sample_weights(self, train_data, reverse_dictionary, paths_frequency):
        """
        Compute the frequency of paths in the training data
        """
        path_weights = np.zeros(len(train_data))
        for path_index, path in enumerate(train_data):
            sample_pos = np.where(path > 0)[0]
            sample_tool_pos = path[sample_pos[0]:]
            path_name = ",".join([reverse_dictionary[int(tool_pos)] for tool_pos in sample_tool_pos])
            try:
                path_weights[path_index] = int(paths_frequency[path_name])
            except KeyError:
                path_weights[path_index] = 1
        return path_weights
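
    # Example: with reverse_dictionary {1: "a", 2: "b"}, the padded row
    # [0, 0, 1, 2] is decoded back to the tool-name path "a,b" and weighted
    # by how often that path occurred; unseen paths default to a weight of 1.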

    def get_data_labels_matrices(self, workflow_paths, tool_usage_path, cutoff_date, compatible_next_tools, old_data_dictionary={}):
        """
        Convert the training and test paths into corresponding numpy matrices
        """
        processed_data, raw_paths = self.process_workflow_paths(workflow_paths)
        dictionary, reverse_dictionary = self.create_data_dictionary(processed_data, old_data_dictionary)
        num_classes = len(dictionary)

        print("Raw paths: %d" % len(raw_paths))
        random.shuffle(raw_paths)

        print("Decomposing paths...")
        all_unique_paths = self.decompose_paths(raw_paths, dictionary)
        random.shuffle(all_unique_paths)

        print("Creating dictionaries...")
        multilabels_paths = self.prepare_paths_labels_dictionary(dictionary, reverse_dictionary, all_unique_paths, compatible_next_tools)

        print("Complete data: %d" % len(multilabels_paths))
        train_paths_dict, test_paths_dict = self.split_test_train_data(multilabels_paths)

        print("Train data: %d" % len(train_paths_dict))
        print("Test data: %d" % len(test_paths_dict))

        test_data, test_labels = self.pad_paths(test_paths_dict, num_classes)
        train_data, train_labels = self.pad_paths(train_paths_dict, num_classes)

        # Predict tool usage
        print("Predicting tools' usage...")
        usage_pred = predict_tool_usage.ToolPopularity()
        usage = usage_pred.extract_tool_usage(tool_usage_path, cutoff_date, dictionary)
        tool_usage_prediction = usage_pred.get_pupularity_prediction(usage)
        tool_predicted_usage = self.get_predicted_usage(dictionary, tool_usage_prediction)

        # Get class weights using the predicted usage for each tool
        class_weights = self.assign_class_weights(train_labels.shape[1], tool_predicted_usage)

        return train_data, train_labels, test_data, test_labels, dictionary, reverse_dictionary, class_weights, tool_predicted_usage
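

# Minimal smoke-test sketch (an illustration, not part of the Galaxy
# pipeline; the toy paths and compatibility map below are invented). It
# exercises only the steps that need no tool-usage file, so the usage
# prediction and class weighting in get_data_labels_matrices are skipped.
if __name__ == "__main__":
    prep = PrepareData(max_seq_length=5, test_data_share=0.5)
    toy_paths = ["a,b,c\n", "a,b,d\n"]
    compatible = {"a": "b", "b": "c,d"}
    tokens, raw_paths = prep.process_workflow_paths(toy_paths)
    dictionary, reverse_dictionary = prep.create_data_dictionary(tokens)
    sub_paths = prep.decompose_paths(raw_paths, dictionary)
    multilabels = prep.prepare_paths_labels_dictionary(dictionary, reverse_dictionary, sub_paths, compatible)
    train_dict, test_dict = prep.split_test_train_data(multilabels)
    train_data, train_labels = prep.pad_paths(train_dict, num_classes=len(dictionary))
    print("Encoded prefixes:", multilabels)
    print("Train matrix shape:", train_data.shape)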