Mercurial > repos > bgruening > create_tool_recommendation_model
annotate prepare_data.py @ 6:e94dc7945639 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
author | bgruening |
---|---|
date | Sun, 16 Oct 2022 11:52:10 +0000 |
parents | 4f7e6612906b |
children |
rev | line source |
---|---|
0
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
1 """ |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
2 Prepare the workflow paths to be used by downstream |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
3 machine learning algorithm. The paths are divided |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
4 into the test and training sets |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
5 """ |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
6 |
5
4f7e6612906b
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 5eebc0cb44e71f581d548b7e842002705dd155eb"
bgruening
parents:
4
diff
changeset
|
7 import collections |
0
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
8 import random |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
9 |
5
4f7e6612906b
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 5eebc0cb44e71f581d548b7e842002705dd155eb"
bgruening
parents:
4
diff
changeset
|
10 import numpy as np |
0
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
11 import predict_tool_usage |
6
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
12 from sklearn.model_selection import train_test_split |
0
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
13 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
14 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
15 class PrepareData: |
6
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
16 |
0
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
def __init__(self, max_seq_length, test_data_share):
    """
    Store the configuration used by the data preparation methods.

    Args:
        max_seq_length: stored as ``max_tool_sequence_len``;
            ``decompose_paths`` keeps only paths with fewer tools than this.
        test_data_share: stored as ``test_share``.
            NOTE(review): presumably the fraction of data held out for
            testing; no use of it is visible in this part of the file.
    """
    self.max_tool_sequence_len = max_seq_length
    self.test_share = test_data_share
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
21 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
def process_workflow_paths(self, workflow_paths):
    """
    Get all the tools and complete set of individual paths for each workflow

    Args:
        workflow_paths: iterable of strings, each a comma-separated
            sequence of tool names. Newline characters are removed.

    Returns:
        A tuple ``(tokens, raw_paths)``. ``tokens`` is a 1-D numpy array
        of the distinct, non-empty tool names in sorted order;
        ``raw_paths`` is the list of input paths with newlines removed.
    """
    raw_paths = [path.replace("\n", "") for path in workflow_paths]
    unique_tokens = set()
    for path in raw_paths:
        # Empty fields (e.g. from ",," or a trailing comma) are not tools.
        unique_tokens.update(token for token in path.split(",") if token != "")
    # Sort so the result does not depend on set iteration order, which
    # varies between interpreter runs because of string hash randomization.
    tokens = np.reshape(np.array(sorted(unique_tokens)), [-1, ])
    return tokens, raw_paths
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
38 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
def create_new_dict(self, new_data_dict):
    """
    Create new data dictionary

    Args:
        new_data_dict: mapping to invert; its values must be hashable.

    Returns:
        A tuple ``(new_data_dict, reverse_dict)`` where ``new_data_dict``
        is the same object that was passed in and ``reverse_dict`` maps
        each value back to its key. If several keys share a value, the
        key seen last in iteration order wins.
    """
    reverse_dict = {value: key for key, value in new_data_dict.items()}
    return new_data_dict, reverse_dict
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
45 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
def assemble_dictionary(self, new_data_dict, old_data_dictionary=None):
    """
    Create/update tools indices in the forward and backward dictionary

    Args:
        new_data_dict: mapping handed to ``create_new_dict``.
        old_data_dictionary: accepted for interface compatibility but not
            used here. The default is ``None`` rather than a shared
            mutable ``{}``.

    Returns:
        The ``(forward, reverse)`` pair returned by ``create_new_dict``.
    """
    new_data_dict, reverse_dict = self.create_new_dict(new_data_dict)
    return new_data_dict, reverse_dict
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
52 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
def create_data_dictionary(self, words, old_data_dictionary=None):
    """
    Create two dictionaries having tools names and their indexes

    Args:
        words: iterable of tool names (strings). Surrounding whitespace
            is stripped before a name is indexed.
        old_data_dictionary: forwarded to ``assemble_dictionary``; an
            empty dict is forwarded when omitted.

    Returns:
        The ``(dictionary, reverse_dictionary)`` pair produced by
        ``assemble_dictionary``. Indices start at 1 and are assigned in
        order of decreasing frequency in ``words``.
    """
    if old_data_dictionary is None:
        old_data_dictionary = {}
    dictionary = dict()
    for word, _ in collections.Counter(words).most_common():
        word = word.strip()
        # Two raw names can strip to the same tool name. Re-assigning the
        # existing entry would move it to ``len + 1`` and make the next
        # new word receive that same index, so keep the first index.
        if word not in dictionary:
            dictionary[word] = len(dictionary) + 1
    dictionary, reverse_dictionary = self.assemble_dictionary(dictionary, old_data_dictionary)
    return dictionary, reverse_dictionary
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
65 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
def decompose_paths(self, paths, dictionary):
    """
    Encode the tool paths as comma-separated index strings.

    A path is kept only when it has more than one tool and fewer than
    ``self.max_tool_sequence_len`` tools. Each tool name is replaced by
    its index from ``dictionary``.

    Args:
        paths: iterable of strings, each a comma-separated tool sequence.
        dictionary: mapping from tool name to index.

    Returns:
        List of distinct encoded paths, in order of first appearance.

    Raises:
        KeyError: if a tool in a kept path is not a key of ``dictionary``.
    """
    max_len = 0
    encoded_paths = dict()
    for item in paths:
        tools = item.split(",")
        len_tools = len(tools)
        # Track the longest path seen, including paths that are filtered out.
        max_len = max(max_len, len_tools)
        if len_tools < self.max_tool_sequence_len:
            tools_pos = [str(dictionary[str(tool_item)]) for tool_item in tools]
            if len(tools_pos) > 1:
                # Dict keys de-duplicate while keeping first-seen order, so
                # the result is the same on every run (unlike list(set(...))).
                encoded_paths[",".join(tools_pos)] = None
    sub_paths_pos = list(encoded_paths)
    print("Max length of tools: ", max_len)
    return sub_paths_pos
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
85 |
6
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
def prepare_input_one_target_paths(self, dictionary, reverse_dictionary, paths):
    """
    Split each path into its input tools and its final (target) tool.

    Args:
        dictionary: not used by this method.
        reverse_dictionary: not used by this method.
        paths: iterable of strings, each a comma-separated tool sequence.

    Returns:
        A tuple ``(input_target_paths, compatible_tools, d_size)``:
        ``input_target_paths`` maps the comma-joined input tools (every
        tool but the last) to the list of distinct target tools seen after
        them; ``compatible_tools`` maps the last input tool to the list of
        distinct target tools seen directly after it; ``d_size`` is the
        total number of targets across ``input_target_paths``.
    """
    input_target_paths = dict()
    compatible_tools = dict()
    for item in paths:
        tool_seq = item.split(",")
        input_seq = tool_seq[:-1]
        i_tools = ",".join(input_seq)
        # A single-tool path has no input tools; it is filed under "".
        last_i_tool = input_seq[-1] if input_seq else ""
        t_tools = tool_seq[-1]
        followers = compatible_tools.setdefault(last_i_tool, list())
        if t_tools not in followers:
            followers.append(t_tools)
        targets = input_target_paths.setdefault(i_tools, list())
        if t_tools not in targets:
            targets.append(t_tools)
    d_size = sum(len(targets) for targets in input_target_paths.values())
    print("Dataset size:", d_size)
    return input_target_paths, compatible_tools, d_size
0
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
112 |
6
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
def prepare_input_target_paths(self, dictionary, reverse_dictionary, paths):
    """
    Build the input sequence -> target tools mapping from workflow paths

    Every path is a comma-separated string of tool tokens. Each proper
    prefix of a path becomes an input sequence and the tool that follows
    the prefix becomes one of its targets.

    dictionary and reverse_dictionary are accepted for interface
    compatibility and are not read here.

    Returns a tuple of
    - input_target_paths: prefix string -> list of distinct next tools
    - compatible_tools: last tool of a prefix -> list of distinct next tools
    - d_size: total number of (prefix, target) pairs in input_target_paths
    """
    input_target_paths = dict()
    compatible_tools = dict()
    for path in paths:
        input_tools = path.split(",")
        # a path with a single tool has no (input, target) pair
        for end in range(1, len(input_tools)):
            i_tools = ",".join(input_tools[:end])
            last_i_tool = input_tools[end - 1]
            t_tools = input_tools[end]
            # lists (not sets) keep the first-seen order of the targets
            next_tools = compatible_tools.setdefault(last_i_tool, list())
            if t_tools not in next_tools:
                next_tools.append(t_tools)
            targets = input_target_paths.setdefault(i_tools, list())
            if t_tools not in targets:
                targets.append(t_tools)
    d_size = sum(len(targets) for targets in input_target_paths.values())
    print("Dataset size:", d_size)
    return input_target_paths, compatible_tools, d_size
0
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
142 |
6
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
def pad_paths_one_tool_target(self, multi_paths, compatible_tools, d_size, rev_dict, dictionary):
    """
    Encode input sequences and their targets as matrices and split
    them into train and test sets

    multi_paths maps a comma-separated string of integer tool tokens to
    the list of its target tools. Each entry becomes one row:
    - input row: the tool tokens written from column 0 onwards, the rest
      of the self.max_tool_sequence_len columns stay 0
    - target row: multi-hot vector of width len(dictionary) + 1 with a 1
      for every target tool and for every tool listed in
      compatible_tools for the last tool of the input sequence

    The incoming d_size is ignored (it is recomputed from multi_paths)
    and rev_dict is not read; both are kept for interface compatibility.

    Returns train_data, train_labels, test_data, test_labels
    """
    d_size = len(multi_paths)
    input_mat = np.zeros([d_size, self.max_tool_sequence_len])
    target_mat = np.zeros([d_size, len(dictionary) + 1])
    for row, (input_seq, target_seq_tools) in enumerate(multi_paths.items()):
        input_seq_tools = input_seq.split(",")
        for id_pos, pos in enumerate(input_seq_tools):
            input_mat[row][id_pos] = int(pos)
        for t_label in target_seq_tools:
            target_mat[row][int(t_label)] = 1
        # look the compatible tools up per row: a last tool without an
        # entry must not inherit the compatible tools of the previous row
        for c_tool in compatible_tools.get(input_seq_tools[-1], ()):
            target_mat[row][int(c_tool)] = 1
    print("Final data size: ", input_mat.shape, target_mat.shape)
    # fixed random_state: the same matrices always give the same split
    train_data, test_data, train_labels, test_labels = train_test_split(
        input_mat, target_mat, test_size=self.test_share, random_state=42
    )
    return train_data, train_labels, test_data, test_labels
3
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
163 |
0
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
def split_test_train_data(self, multilabels_paths):
    """
    Split into test and train data randomly for each run

    multilabels_paths maps a path to its labels. A share of
    self.test_share of the paths (rounded down) goes to the test
    dictionary, the remaining paths go to the train dictionary.

    Returns train_dict, test_dict
    """
    # random.shuffle works in place, so it needs a real list to act on;
    # shuffling a temporary copy of the keys view leaves the order as is
    all_paths = list(multilabels_paths)
    random.shuffle(all_paths)
    # clamp so that a negative share yields an empty test set
    split_number = max(0, int(self.test_share * len(all_paths)))
    test_dict = {path: multilabels_paths[path] for path in all_paths[:split_number]}
    train_dict = {path: multilabels_paths[path] for path in all_paths[split_number:]}
    return train_dict, test_dict
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
179 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
def get_predicted_usage(self, data_dictionary, predicted_usage):
    """
    Get predicted usage for tools

    data_dictionary maps a tool to its index and predicted_usage maps a
    tool to its predicted usage. The result maps every tool index to the
    usage of its tool, where a negative, missing or non-comparable usage
    is replaced by 0.0.

    Returns a dictionary of tool index -> usage
    """
    epsilon = 0.0
    # index 0 does not belong to any tool
    usage = {0: epsilon}
    for tool, tool_index in data_dictionary.items():
        try:
            usg = predicted_usage[tool]
            if usg < epsilon:
                usg = epsilon
        except (LookupError, TypeError):
            # no prediction for this tool, or a value that cannot be
            # compared with a number (e.g. None)
            usg = epsilon
        usage[tool_index] = usg
    return usage
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
198 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
def assign_class_weights(self, n_classes, predicted_usage):
    """
    Compute class weights using usage

    The weight of each class 1..n_classes is the natural logarithm of
    its predicted usage rounded to 6 decimals. A usage below 1.0 is
    raised by 1.0 before the logarithm is taken.

    Returns a dictionary of class -> weight
    """
    # NOTE(review): class 0 is stored under the string key "0" while all
    # other classes use integer keys - confirm that callers rely on this
    weights = {"0": 0.0}
    last_class = n_classes + 1
    for class_id in range(1, last_class):
        usage_score = predicted_usage[class_id]
        if usage_score < 1.0:
            usage_score += 1.0
        log_usage = np.log(usage_score)
        weights[class_id] = np.round(log_usage, 6)
    return weights
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
211 |
6
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
def get_train_tool_labels_freq(self, train_paths, reverse_dictionary):
    """
    Get the frequency of each label position over all rows

    For every row of train_paths the positions holding a value greater
    than 0 are collected, and the number of rows in which each position
    occurs is counted.

    reverse_dictionary is accepted for interface compatibility and is
    not read here.

    Returns a dictionary of position (as string) -> count, ordered by
    decreasing count; positions with equal counts keep first-seen order
    """
    label_freq = dict()
    for row in train_paths:
        # np.asarray lets plain sequences be compared element-wise too
        for position in np.where(np.asarray(row) > 0)[0]:
            key = str(int(position))
            label_freq[key] = label_freq.get(key, 0) + 1
    return dict(sorted(label_freq.items(), key=lambda kv: kv[1], reverse=True))
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
232 |
3
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
def get_train_last_tool_freq(self, train_paths, reverse_dictionary):
    """
    Get the frequency of last tool of each tool sequence
    to estimate the frequency of tool sequences

    The last tool of a path is its last entry greater than zero, so
    zero padding on either side of the path is ignored. Paths without
    any positive entry have no last tool and are skipped.

    Parameters:
        train_paths: iterable of 1-D numeric sequences (tool ids, with
            zeros treated as padding).
        reverse_dictionary: kept for interface compatibility; it is not
            needed to compute the returned frequencies.

    Returns:
        dict mapping the last tool id (as a string) to the number of
        paths ending with it, ordered by decreasing frequency. Ties keep
        the order in which the tools were first seen.
    """
    last_tool_freq = dict()
    for path in train_paths:
        path = np.asarray(path)
        tools_pos = np.where(path > 0)[0]
        if tools_pos.size == 0:
            # only padding in this path: nothing to count
            continue
        # only the last positive entry matters, no need to convert the whole path
        last_tool = str(int(path[tools_pos[-1]]))
        last_tool_freq[last_tool] = last_tool_freq.get(last_tool, 0) + 1
    # sorted() is stable, so equally frequent tools keep first-seen order
    return dict(sorted(last_tool_freq.items(), key=lambda kv: kv[1], reverse=True))
0
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
252 |
3
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
def get_toolid_samples(self, train_data, l_tool_freq):
    """
    Collect, for each tool id in l_tool_freq, the indices of the
    training samples whose last entry is that tool.

    Parameters:
        train_data: sequence of samples; the last element of each sample
            is read as the tool id.
        l_tool_freq: iterable of tool ids given as strings (for example
            the keys of a frequency dictionary).

    Returns:
        dict mapping a tool id to the ascending list of matching sample
        indices. Keys follow the order of l_tool_freq and tool ids
        without any matching sample are left out.
    """
    # NOTE(review): this reads tr_sample[-1], whereas
    # get_train_last_tool_freq uses the last entry greater than zero.
    # With trailing zero padding the two disagree - confirm with callers.

    # Group the sample indices by last tool in one pass instead of
    # scanning train_data again for every tool id
    samples_by_last_tool = dict()
    for index, tr_sample in enumerate(train_data):
        last_tool_id = str(int(tr_sample[-1]))
        samples_by_last_tool.setdefault(last_tool_id, list()).append(index)

    l_tool_tr_samples = dict()
    for tool_id in l_tool_freq:
        matching = samples_by_last_tool.get(tool_id)
        if matching:
            l_tool_tr_samples.setdefault(tool_id, list()).extend(matching)
    return l_tool_tr_samples
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
0
diff
changeset
|
263 |
6
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
def get_data_labels_matrices(self, workflow_paths, usage_df, cutoff_date, standard_connections, old_data_dictionary=None):
    """
    Convert the training and test paths into corresponding numpy matrices

    Parameters:
        workflow_paths: passed to process_workflow_paths.
        usage_df: tool usage data, passed to
            ToolPopularity.extract_tool_usage.
        cutoff_date: passed to ToolPopularity.extract_tool_usage.
        standard_connections: accepted for interface compatibility; it is
            not used in this method.
        old_data_dictionary: dictionary forwarded to
            create_data_dictionary. Defaults to a new empty dict for
            every call (None is treated the same way).

    Returns:
        tuple of (train_data, train_labels, test_data, test_labels,
        dictionary, rev_dict, class_weights, compatible_tools,
        tr_tool_freq)
    """
    # a fresh dict per call: a `{}` default would be shared by all calls
    if old_data_dictionary is None:
        old_data_dictionary = {}

    processed_data, raw_paths = self.process_workflow_paths(workflow_paths)
    dictionary, rev_dict = self.create_data_dictionary(processed_data, old_data_dictionary)

    num_classes = len(dictionary)

    print("Raw paths: %d" % len(raw_paths))
    random.shuffle(raw_paths)

    print("Decomposing paths...")
    all_unique_paths = self.decompose_paths(raw_paths, dictionary)
    random.shuffle(all_unique_paths)

    print("Creating dictionaries...")
    multilabels_paths, compatible_tools, d_size = self.prepare_input_target_paths(dictionary, rev_dict, all_unique_paths)

    print("Complete data: %d" % d_size)

    print("Padding train and test data...")
    # pad training and test data with trailing zeros
    train_data, train_labels, test_data, test_labels = self.pad_paths_one_tool_target(multilabels_paths, compatible_tools, d_size, rev_dict, dictionary)

    print("Train data: ", train_data.shape)
    print("Test data: ", test_data.shape)

    print("Estimating sample frequency...")
    tr_tool_freq = self.get_train_tool_labels_freq(train_labels, rev_dict)

    # Predict tools usage
    print("Predicting tools' usage...")
    usage_pred = predict_tool_usage.ToolPopularity()
    usage = usage_pred.extract_tool_usage(usage_df, cutoff_date, dictionary)
    tool_usage_prediction = usage_pred.get_pupularity_prediction(usage)
    t_pred_usage = self.get_predicted_usage(dictionary, tool_usage_prediction)
    # get class weights using the predicted usage for each tool
    class_weights = self.assign_class_weights(num_classes, t_pred_usage)
    return train_data, train_labels, test_data, test_labels, dictionary, rev_dict, class_weights, compatible_tools, tr_tool_freq