Mercurial > repos > bgruening > create_tool_recommendation_model
annotate prepare_data.py @ 4:afec8c595124 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 65d36f271296a38deeceb0d0e8d471b2898ee8f4"
author | bgruening |
---|---|
date | Tue, 07 Jul 2020 03:25:49 -0400 |
parents | 5b3c08710e47 |
children | 4f7e6612906b |
rev | line source |
---|---|
0
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
1 """ |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
2 Prepare the workflow paths to be used by downstream |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
3 machine learning algorithm. The paths are divided |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
4 into the test and training sets |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
5 """ |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
6 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
7 import os |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
8 import collections |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
9 import numpy as np |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
10 import random |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
11 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
12 import predict_tool_usage |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
13 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
14 main_path = os.getcwd() |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
15 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
16 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
17 class PrepareData: |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
18 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
19 def __init__(self, max_seq_length, test_data_share): |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
20 """ Init method. """ |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
21 self.max_tool_sequence_len = max_seq_length |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
22 self.test_share = test_data_share |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
23 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
24 def process_workflow_paths(self, workflow_paths): |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
25 """ |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
26 Get all the tools and complete set of individual paths for each workflow |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
27 """ |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
28 tokens = list() |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
29 raw_paths = workflow_paths |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
30 raw_paths = [x.replace("\n", '') for x in raw_paths] |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
31 for item in raw_paths: |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
32 split_items = item.split(",") |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
33 for token in split_items: |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
34 if token is not "": |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
35 tokens.append(token) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
36 tokens = list(set(tokens)) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
37 tokens = np.array(tokens) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
38 tokens = np.reshape(tokens, [-1, ]) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
39 return tokens, raw_paths |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
40 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
41 def create_new_dict(self, new_data_dict): |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
42 """ |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
43 Create new data dictionary |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
44 """ |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
45 reverse_dict = dict((v, k) for k, v in new_data_dict.items()) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
46 return new_data_dict, reverse_dict |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
47 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
48 def assemble_dictionary(self, new_data_dict, old_data_dictionary={}): |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
49 """ |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
50 Create/update tools indices in the forward and backward dictionary |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
51 """ |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
52 new_data_dict, reverse_dict = self.create_new_dict(new_data_dict) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
53 return new_data_dict, reverse_dict |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
54 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
55 def create_data_dictionary(self, words, old_data_dictionary={}): |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
56 """ |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
57 Create two dictionaries having tools names and their indexes |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
58 """ |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
59 count = collections.Counter(words).most_common() |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
60 dictionary = dict() |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
61 for word, _ in count: |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
62 dictionary[word] = len(dictionary) + 1 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
63 dictionary, reverse_dictionary = self.assemble_dictionary(dictionary, old_data_dictionary) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
64 return dictionary, reverse_dictionary |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
65 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
66 def decompose_paths(self, paths, dictionary): |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
67 """ |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
68 Decompose the paths to variable length sub-paths keeping the first tool fixed |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
69 """ |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
70 sub_paths_pos = list() |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
71 for index, item in enumerate(paths): |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
72 tools = item.split(",") |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
73 len_tools = len(tools) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
74 if len_tools <= self.max_tool_sequence_len: |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
75 for window in range(1, len_tools): |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
76 sequence = tools[0: window + 1] |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
77 tools_pos = [str(dictionary[str(tool_item)]) for tool_item in sequence] |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
78 if len(tools_pos) > 1: |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
79 sub_paths_pos.append(",".join(tools_pos)) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
80 sub_paths_pos = list(set(sub_paths_pos)) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
81 return sub_paths_pos |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
82 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
83 def prepare_paths_labels_dictionary(self, dictionary, reverse_dictionary, paths, compatible_next_tools): |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
84 """ |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
85 Create a dictionary of sequences with their labels for training and test paths |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
86 """ |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
87 paths_labels = dict() |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
88 random.shuffle(paths) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
89 for item in paths: |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
90 if item and item not in "": |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
91 tools = item.split(",") |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
92 label = tools[-1] |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
93 train_tools = tools[:len(tools) - 1] |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
94 last_but_one_name = reverse_dictionary[int(train_tools[-1])] |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
95 try: |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
96 compatible_tools = compatible_next_tools[last_but_one_name].split(",") |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
97 except Exception: |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
98 continue |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
99 if len(compatible_tools) > 0: |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
100 compatible_tools_ids = [str(dictionary[x]) for x in compatible_tools] |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
101 compatible_tools_ids.append(label) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
102 composite_labels = ",".join(compatible_tools_ids) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
103 train_tools = ",".join(train_tools) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
104 if train_tools in paths_labels: |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
105 paths_labels[train_tools] += "," + composite_labels |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
106 else: |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
107 paths_labels[train_tools] = composite_labels |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
108 for item in paths_labels: |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
109 paths_labels[item] = ",".join(list(set(paths_labels[item].split(",")))) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
110 return paths_labels |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
111 |
3
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
112 def pad_test_paths(self, paths_dictionary, num_classes): |
0
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
113 """ |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
114 Add padding to the tools sequences and create multi-hot encoded labels |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
115 """ |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
116 size_data = len(paths_dictionary) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
117 data_mat = np.zeros([size_data, self.max_tool_sequence_len]) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
118 label_mat = np.zeros([size_data, num_classes + 1]) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
119 train_counter = 0 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
120 for train_seq, train_label in list(paths_dictionary.items()): |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
121 positions = train_seq.split(",") |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
122 start_pos = self.max_tool_sequence_len - len(positions) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
123 for id_pos, pos in enumerate(positions): |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
124 data_mat[train_counter][start_pos + id_pos] = int(pos) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
125 for label_item in train_label.split(","): |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
126 label_mat[train_counter][int(label_item)] = 1.0 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
127 train_counter += 1 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
128 return data_mat, label_mat |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
129 |
3
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
130 def pad_paths(self, paths_dictionary, num_classes, standard_connections, reverse_dictionary): |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
131 """ |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
132 Add padding to the tools sequences and create multi-hot encoded labels |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
133 """ |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
134 size_data = len(paths_dictionary) |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
135 data_mat = np.zeros([size_data, self.max_tool_sequence_len]) |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
136 label_mat = np.zeros([size_data, 2 * (num_classes + 1)]) |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
137 pos_flag = 1.0 |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
138 train_counter = 0 |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
139 for train_seq, train_label in list(paths_dictionary.items()): |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
140 pub_connections = list() |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
141 positions = train_seq.split(",") |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
142 last_tool_id = positions[-1] |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
143 last_tool_name = reverse_dictionary[int(last_tool_id)] |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
144 start_pos = self.max_tool_sequence_len - len(positions) |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
145 for id_pos, pos in enumerate(positions): |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
146 data_mat[train_counter][start_pos + id_pos] = int(pos) |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
147 if last_tool_name in standard_connections: |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
148 pub_connections = standard_connections[last_tool_name] |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
149 for label_item in train_label.split(","): |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
150 label_pos = int(label_item) |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
151 label_row = label_mat[train_counter] |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
152 if reverse_dictionary[label_pos] in pub_connections: |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
153 label_row[label_pos] = pos_flag |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
154 else: |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
155 label_row[label_pos + num_classes + 1] = pos_flag |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
156 train_counter += 1 |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
157 return data_mat, label_mat |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
158 |
0
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
159 def split_test_train_data(self, multilabels_paths): |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
160 """ |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
161 Split into test and train data randomly for each run |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
162 """ |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
163 train_dict = dict() |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
164 test_dict = dict() |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
165 all_paths = multilabels_paths.keys() |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
166 random.shuffle(list(all_paths)) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
167 split_number = int(self.test_share * len(all_paths)) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
168 for index, path in enumerate(list(all_paths)): |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
169 if index < split_number: |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
170 test_dict[path] = multilabels_paths[path] |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
171 else: |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
172 train_dict[path] = multilabels_paths[path] |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
173 return train_dict, test_dict |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
174 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
175 def get_predicted_usage(self, data_dictionary, predicted_usage): |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
176 """ |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
177 Get predicted usage for tools |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
178 """ |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
179 usage = dict() |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
180 epsilon = 0.0 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
181 # index 0 does not belong to any tool |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
182 usage[0] = epsilon |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
183 for k, v in data_dictionary.items(): |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
184 try: |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
185 usg = predicted_usage[k] |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
186 if usg < epsilon: |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
187 usg = epsilon |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
188 usage[v] = usg |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
189 except Exception: |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
190 usage[v] = epsilon |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
191 continue |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
192 return usage |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
193 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
194 def assign_class_weights(self, n_classes, predicted_usage): |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
195 """ |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
196 Compute class weights using usage |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
197 """ |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
198 class_weights = dict() |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
199 class_weights[str(0)] = 0.0 |
3
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
200 for key in range(1, n_classes + 1): |
0
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
201 u_score = predicted_usage[key] |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
202 if u_score < 1.0: |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
203 u_score += 1.0 |
3
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
204 class_weights[key] = np.round(np.log(u_score), 6) |
0
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
205 return class_weights |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
206 |
3
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
207 def get_train_last_tool_freq(self, train_paths, reverse_dictionary): |
0
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
208 """ |
3
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
209 Get the frequency of last tool of each tool sequence |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
210 to estimate the frequency of tool sequences |
0
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
211 """ |
3
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
212 last_tool_freq = dict() |
4
afec8c595124
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 65d36f271296a38deeceb0d0e8d471b2898ee8f4"
bgruening
parents:
3
diff
changeset
|
213 freq_dict_names = dict() |
3
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
214 for path in train_paths: |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
215 last_tool = path.split(",")[-1] |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
216 if last_tool not in last_tool_freq: |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
217 last_tool_freq[last_tool] = 0 |
4
afec8c595124
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 65d36f271296a38deeceb0d0e8d471b2898ee8f4"
bgruening
parents:
3
diff
changeset
|
218 freq_dict_names[reverse_dictionary[int(last_tool)]] = 0 |
3
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
219 last_tool_freq[last_tool] += 1 |
4
afec8c595124
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 65d36f271296a38deeceb0d0e8d471b2898ee8f4"
bgruening
parents:
3
diff
changeset
|
220 freq_dict_names[reverse_dictionary[int(last_tool)]] += 1 |
afec8c595124
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 65d36f271296a38deeceb0d0e8d471b2898ee8f4"
bgruening
parents:
3
diff
changeset
|
221 return last_tool_freq |
0
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
222 |
3
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
223 def get_toolid_samples(self, train_data, l_tool_freq): |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
224 l_tool_tr_samples = dict() |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
225 for tool_id in l_tool_freq: |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
226 for index, tr_sample in enumerate(train_data): |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
227 last_tool_id = str(int(tr_sample[-1])) |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
228 if last_tool_id == tool_id: |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
229 if last_tool_id not in l_tool_tr_samples: |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
230 l_tool_tr_samples[last_tool_id] = list() |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
231 l_tool_tr_samples[last_tool_id].append(index) |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
232 return l_tool_tr_samples |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
233 |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
234 def get_data_labels_matrices(self, workflow_paths, tool_usage_path, cutoff_date, compatible_next_tools, standard_connections, old_data_dictionary={}): |
0
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
235 """ |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
236 Convert the training and test paths into corresponding numpy matrices |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
237 """ |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
238 processed_data, raw_paths = self.process_workflow_paths(workflow_paths) |
3
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
239 dictionary, rev_dict = self.create_data_dictionary(processed_data, old_data_dictionary) |
0
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
240 num_classes = len(dictionary) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
241 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
242 print("Raw paths: %d" % len(raw_paths)) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
243 random.shuffle(raw_paths) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
244 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
245 print("Decomposing paths...") |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
246 all_unique_paths = self.decompose_paths(raw_paths, dictionary) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
247 random.shuffle(all_unique_paths) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
248 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
249 print("Creating dictionaries...") |
3
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
250 multilabels_paths = self.prepare_paths_labels_dictionary(dictionary, rev_dict, all_unique_paths, compatible_next_tools) |
0
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
251 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
252 print("Complete data: %d" % len(multilabels_paths)) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
253 train_paths_dict, test_paths_dict = self.split_test_train_data(multilabels_paths) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
254 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
255 print("Train data: %d" % len(train_paths_dict)) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
256 print("Test data: %d" % len(test_paths_dict)) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
257 |
3
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
258 print("Padding train and test data...") |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
259 # pad training and test data with leading zeros |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
260 test_data, test_labels = self.pad_paths(test_paths_dict, num_classes, standard_connections, rev_dict) |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
261 train_data, train_labels = self.pad_paths(train_paths_dict, num_classes, standard_connections, rev_dict) |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
262 |
4
afec8c595124
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 65d36f271296a38deeceb0d0e8d471b2898ee8f4"
bgruening
parents:
3
diff
changeset
|
263 print("Estimating sample frequency...") |
afec8c595124
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 65d36f271296a38deeceb0d0e8d471b2898ee8f4"
bgruening
parents:
3
diff
changeset
|
264 l_tool_freq = self.get_train_last_tool_freq(train_paths_dict, rev_dict) |
3
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
265 l_tool_tr_samples = self.get_toolid_samples(train_data, l_tool_freq) |
0
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
266 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
267 # Predict tools usage |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
268 print("Predicting tools' usage...") |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
269 usage_pred = predict_tool_usage.ToolPopularity() |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
270 usage = usage_pred.extract_tool_usage(tool_usage_path, cutoff_date, dictionary) |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
271 tool_usage_prediction = usage_pred.get_pupularity_prediction(usage) |
3
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
272 t_pred_usage = self.get_predicted_usage(dictionary, tool_usage_prediction) |
0
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
273 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
274 # get class weights using the predicted usage for each tool |
3
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
275 class_weights = self.assign_class_weights(num_classes, t_pred_usage) |
0
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
276 |
3
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
277 return train_data, train_labels, test_data, test_labels, dictionary, rev_dict, class_weights, t_pred_usage, l_tool_freq, l_tool_tr_samples |