Mercurial > repos > bgruening > create_tool_recommendation_model
annotate utils.py @ 6:e94dc7945639 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
author | bgruening |
---|---|
date | Sun, 16 Oct 2022 11:52:10 +0000 |
parents | 4f7e6612906b |
children |
rev | line source |
---|---|
0
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
1 import json |
6
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
2 import os |
3
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
2
diff
changeset
|
3 import random |
5
4f7e6612906b
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 5eebc0cb44e71f581d548b7e842002705dd155eb"
bgruening
parents:
4
diff
changeset
|
4 |
4f7e6612906b
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 5eebc0cb44e71f581d548b7e842002705dd155eb"
bgruening
parents:
4
diff
changeset
|
5 import h5py |
4f7e6612906b
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 5eebc0cb44e71f581d548b7e842002705dd155eb"
bgruening
parents:
4
diff
changeset
|
6 import numpy as np |
6
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
7 import pandas as pd |
5
4f7e6612906b
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 5eebc0cb44e71f581d548b7e842002705dd155eb"
bgruening
parents:
4
diff
changeset
|
8 import tensorflow as tf |
6
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
9 |
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
# Keras loss/metric objects instantiated once at module import.
# Binary cross-entropy loss object.
binary_ce = tf.keras.losses.BinaryCrossentropy()
# Binary accuracy metric object.
binary_acc = tf.keras.metrics.BinaryAccuracy()
# NOTE(review): unlike binary_ce this is taken from tf.keras.metrics (a
# metric object, not a loss) and is configured with from_logits=True --
# confirm both are intended by the callers.
categorical_ce = tf.keras.metrics.CategoricalCrossentropy(from_logits=True)
0
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
13 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
14 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
def read_file(file_path):
    """
    Read a JSON file and return its decoded content

    file_path: path of the JSON file to read.
    Returns the Python object decoded from the file.
    Raises OSError if the file cannot be opened and json.JSONDecodeError
    if its content is not valid JSON.
    """
    with open(file_path, "r") as json_file:
        # json.load is the standard-library helper for decoding from an
        # open file object.
        return json.load(json_file)
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
22 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
23 |
6
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
def write_file(file_path, content):
    """
    Serialise content as JSON and store it at file_path

    Any file already present at file_path is removed before writing.
    """
    # Get rid of a previous file so the new one is created afresh.
    remove_file(file_path)
    with open(file_path, "w") as out_handle:
        serialised = json.dumps(content)
        out_handle.write(serialised)
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
31 |
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
32 |
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
def save_h5_data(inp, tar, filename):
    """
    Save input and target data into an HDF5 file

    inp: data stored in the "input" dataset.
    tar: data stored in the "target" dataset.
    filename: path of the HDF5 file, opened in 'w' mode.
    """
    # The context manager closes the file even when create_dataset raises,
    # so the handle is never leaked.
    with h5py.File(filename, 'w') as hf_file:
        hf_file.create_dataset("input", data=inp)
        hf_file.create_dataset("target", data=tar)
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
38 |
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
39 |
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
def get_low_freq_te_samples(te_data, te_target, tr_freq_dict):
    """
    Pick test samples whose targets cover the tools from get_lowest_tools

    te_data: not used by this function; kept for interface compatibility.
    te_target: iterable of label rows; a position with a value > 0 is
        treated as an active tool id.
    tr_freq_dict: passed unchanged to get_lowest_tools, which supplies the
        tool ids to look for (compared as strings here).

    Returns the list of row indices of te_target, in order, where each
    selected row contains at least one of the ids not yet covered by an
    earlier selected row.
    """
    lowest_tool_te_ids = list()
    # Keep the still-uncovered ids in a single set instead of rebuilding a
    # list and converting it to a set on every row.
    remaining_ids = set(get_lowest_tools(tr_freq_dict))
    for i, te_labels in enumerate(te_target):
        if not remaining_ids:
            # Every id is already covered; no later row can match.
            break
        tools_pos = {str(int(item)) for item in np.where(te_labels > 0)[0]}
        matched = remaining_ids & tools_pos
        if matched:
            lowest_tool_te_ids.append(i)
            # Each id is credited to the first row that contains it.
            remaining_ids -= matched
    return lowest_tool_te_ids
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
51 |
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
52 |
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
def save_processed_workflows(file_path, unique_paths):
    """
    Write workflow paths to a text file, one path per line

    file_path: destination file; overwritten if it exists.
    unique_paths: iterable of strings; each is written followed by a
        newline character.
    """
    # Build the content with a single join rather than repeated string
    # concatenation inside a loop.
    workflow_paths_unique = "".join(path + "\n" for path in unique_paths)
    with open(file_path, "w") as workflows_file:
        workflows_file.write(workflow_paths_unique)
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
59 |
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
60 |
0
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
def format_tool_id(tool_link):
    """
    Extract tool id from tool link

    The id is the second-to-last "/"-separated segment of the link; a
    link without any "/" is returned unchanged.
    """
    segments = tool_link.split("/")
    if len(segments) <= 1:
        return tool_link
    return segments[-2]
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
68 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
69 |
6
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
def save_model_file(model, r_dict, c_wts, c_tools, s_conn, model_file):
    """
    Save model weights and accompanying metadata into one HDF5 file

    model: object providing save_weights(path, save_format="h5").
    r_dict: stored as JSON under the "reverse_dict" dataset.
    c_wts: stored as JSON under the "class_weights" dataset.
    c_tools: stored as JSON under the "compatible_tools" dataset.
    s_conn: stored as JSON under the "standard_connections" dataset.
    model_file: path of the HDF5 file to write.
    """
    model.save_weights(model_file, save_format="h5")
    model_values = {
        "reverse_dict": r_dict,
        "class_weights": c_wts,
        "compatible_tools": c_tools,
        "standard_connections": s_conn
    }
    # Reopen the weights file in read/write mode to append the metadata.
    # The context manager closes the handle even when json.dumps or
    # create_dataset raises.
    with h5py.File(model_file, 'r+') as hf_file:
        for k, value in model_values.items():
            hf_file.create_dataset(k, data=json.dumps(value))
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
82 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
83 |
6
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
def remove_file(file_path):
    """
    Delete file_path if it exists; do nothing when it is absent

    file_path: path of the file to delete.
    Other OSError subclasses raised by os.remove (for example when the
    path is a directory) are propagated to the caller.
    """
    # Attempt the removal directly instead of checking existence first:
    # the file may disappear between the check and the removal.
    try:
        os.remove(file_path)
    except (FileNotFoundError, NotADirectoryError):
        # Nothing to remove at that path.
        pass
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
87 |
5
4f7e6612906b
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 5eebc0cb44e71f581d548b7e842002705dd155eb"
bgruening
parents:
4
diff
changeset
|
88 |
6
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
def verify_oversampling_freq(oversampled_tr_data, rev_dict):
    """
    Compute the frequency of tool sequences after oversampling

    For every row, the value at the last position holding an entry > 0 is
    taken as the last tool id and translated to a name through rev_dict.

    oversampled_tr_data: iterable of 1-D numeric arrays.
    rev_dict: mapping from integer tool id to tool name.

    Returns a dict of tool name -> number of rows ending with that tool,
    ordered by decreasing count. Rows without any entry > 0 are skipped.
    Raises KeyError if a last tool id is missing from rev_dict.
    """
    freq_dict_names = dict()
    for tr_data in oversampled_tr_data:
        t_pos = np.where(tr_data > 0)[0]
        if len(t_pos) == 0:
            # No positive entry means there is no last tool to count.
            continue
        last_tool_name = rev_dict[int(tr_data[t_pos[-1]])]
        # Counting per name directly avoids resetting a name's count when
        # two different ids map to the same name.
        freq_dict_names[last_tool_name] = freq_dict_names.get(last_tool_name, 0) + 1
    s_freq = dict(sorted(freq_dict_names.items(), key=lambda kv: kv[1], reverse=True))
    return s_freq
0
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
105 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
106 |
6
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
def collect_sampled_tool_freq(collected_dict, c_freq):
    """
    Add the counts in c_freq onto the running totals in collected_dict

    Each count is converted with int() before being added; keys that are
    not yet in collected_dict start from that count. collected_dict is
    updated in place and also returned.
    """
    for tool in c_freq:
        increment = int(c_freq[tool])
        collected_dict[tool] = collected_dict.get(tool, 0) + increment
    return collected_dict
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
114 |
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
115 |
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
def save_data_as_dict(f_dict, r_dict, inp, tar, save_path):
    """Group target sequences by input sequence and hand the mapping to ``write_file``.

    Every row of ``inp`` and ``tar`` is rendered as a comma-separated string
    of integers covering index 1 up to and including the last strictly
    positive entry of the row. Target strings are collected in a dict keyed
    by the input string; the total number of collected targets is printed
    and the dict is passed to ``write_file(save_path, ...)``.

    Args:
        f_dict: Unused; kept so existing callers keep working.
        r_dict: Unused; kept so existing callers keep working.
        inp: Iterable of 1-D numpy arrays (one input row each).
        tar: Iterable of 1-D numpy arrays, paired element-wise with ``inp``.
        save_path: Destination forwarded to ``write_file``.
    """
    def _row_to_seq(row):
        # Render one row; a row without any positive entry becomes "" rather
        # than raising IndexError on ``positive[-1]``. This matches what a row
        # whose only positive entry sits at index 0 already produces.
        positive = np.where(row > 0)[0]
        if positive.size == 0:
            return ""
        return ",".join(str(int(value)) for value in row[1:positive[-1] + 1])

    inp_tar = dict()
    for i, t in zip(inp, tar):
        inp_tar.setdefault(_row_to_seq(i), []).append(_row_to_seq(t))
    size = sum(len(targets) for targets in inp_tar.values())
    print("Size saved file: ", size)
    write_file(save_path, inp_tar)
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
131 |
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
132 |
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
def read_train_test(datapath):
    """Load the ``"input"`` and ``"target"`` datasets from an HDF5 file.

    Args:
        datapath: Path of the HDF5 file to open read-only.

    Returns:
        A ``(data_input, data_target)`` tuple of numpy arrays.
    """
    # np.array() copies the datasets into memory, so the arrays stay valid
    # after the context manager closes the file handle.
    with h5py.File(datapath, 'r') as file_obj:
        data_input = np.array(file_obj["input"])
        data_target = np.array(file_obj["target"])
    return data_input, data_target
3
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
2
diff
changeset
|
138 |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
2
diff
changeset
|
139 |
6
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
def sample_balanced_tr_y(x_seqs, y_labels, ulabels_tr_y_dict, b_size, tr_t_freq, prev_sel_tools):
    """Draw a training batch with one random sample per not-yet-selected tool.

    The keys of ``ulabels_tr_y_dict`` are shuffled, keys found in
    ``prev_sel_tools`` are dropped, and the first ``b_size`` remaining keys
    are used. For each of them one entry of its index list is drawn
    uniformly at random; the drawn indices select rows of ``x_seqs`` and
    ``y_labels``.

    Args:
        x_seqs: Array indexable with a list of row indices.
        y_labels: Array indexable with the same list of row indices.
        ulabels_tr_y_dict: Maps a tool/label key to a non-empty sequence of
            row indices. The sequences are not modified.
        b_size: Maximum number of tools (and therefore rows) in the batch.
        tr_t_freq: Unused; kept so existing callers keep working.
        prev_sel_tools: Collection of keys to exclude from this batch.

    Returns:
        ``(x_tensor, y_tensor, sel_tools)`` where the tensors are ``tf.int64``
        and ``sel_tools`` lists the keys that were sampled, in batch order.
    """
    batch_y_tools = list(ulabels_tr_y_dict.keys())
    random.shuffle(batch_y_tools)

    # Build the exclusion set once instead of scanning prev_sel_tools per key.
    already_selected = set(prev_sel_tools)
    sel_tools = [t for t in batch_y_tools if t not in already_selected][:b_size]

    rand_batch_indices = list()
    for l_tool in sel_tools:
        seq_indices = ulabels_tr_y_dict[l_tool]
        # A uniform random position is enough; shuffling the caller's list in
        # place first would not change the distribution.
        rand_s_index = np.random.randint(0, len(seq_indices), 1)[0]
        rand_batch_indices.append(seq_indices[rand_s_index])

    x_batch_train = x_seqs[rand_batch_indices]
    y_batch_train = y_labels[rand_batch_indices]

    unrolled_x = tf.convert_to_tensor(x_batch_train, dtype=tf.int64)
    unrolled_y = tf.convert_to_tensor(y_batch_train, dtype=tf.int64)
    return unrolled_x, unrolled_y, sel_tools
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
165 |
3
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
2
diff
changeset
|
166 |
6
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
def sample_balanced_te_y(x_seqs, y_labels, ulabels_tr_y_dict, b_size):
    """Draw a test batch of distinct rows, visiting tools in random order.

    The keys of ``ulabels_tr_y_dict`` are shuffled and visited one by one.
    For each key one entry of its index list is drawn uniformly at random
    and added to the batch unless that row index is already part of it.
    Sampling stops as soon as the batch holds ``b_size`` rows or the keys
    are exhausted.

    Args:
        x_seqs: Array indexable with a list of row indices.
        y_labels: Array indexable with the same list of row indices.
        ulabels_tr_y_dict: Maps a tool/label key to a non-empty sequence of
            row indices. The sequences are not modified.
        b_size: Number of distinct rows at which sampling stops.

    Returns:
        ``(x_tensor, y_tensor, sel_tools)`` where the tensors are ``tf.int64``
        and ``sel_tools`` lists every key that was visited, including keys
        whose drawn row was a duplicate and therefore not added.
    """
    batch_y_tools = list(ulabels_tr_y_dict.keys())
    random.shuffle(batch_y_tools)

    rand_batch_indices = list()
    seen_indices = set()  # O(1) duplicate check alongside the ordered list
    sel_tools = list()
    for l_tool in batch_y_tools:
        seq_indices = ulabels_tr_y_dict[l_tool]
        rand_s_index = np.random.randint(0, len(seq_indices), 1)[0]
        rand_sample = seq_indices[rand_s_index]
        sel_tools.append(l_tool)
        if rand_sample not in seen_indices:
            seen_indices.add(rand_sample)
            rand_batch_indices.append(rand_sample)
        if len(rand_batch_indices) == b_size:
            break

    x_batch_train = x_seqs[rand_batch_indices]
    y_batch_train = y_labels[rand_batch_indices]

    unrolled_x = tf.convert_to_tensor(x_batch_train, dtype=tf.int64)
    unrolled_y = tf.convert_to_tensor(y_batch_train, dtype=tf.int64)
    return unrolled_x, unrolled_y, sel_tools
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
190 |
0
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
191 |
6
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
def get_u_tr_labels(y_tr):
    """Index the rows of ``y_tr`` by the positions that hold a positive value.

    Args:
        y_tr: Iterable of numpy arrays; for each row the indices returned by
            ``np.where(row > 0)[0]`` are treated as that row's labels.

    Returns:
        ``(u_labels, labels_pos_dict)``: ``u_labels`` is the list of distinct
        labels in first-seen order, and ``labels_pos_dict`` maps each label
        to the list of row numbers (ascending, no duplicates) carrying it.
    """
    labels_pos_dict = dict()
    for i, item in enumerate(y_tr):
        # np.unique guarantees each row number is recorded once per label,
        # so no separate de-duplication pass is needed afterwards.
        for label in np.unique(np.where(item > 0)[0]):
            labels_pos_dict.setdefault(label, []).append(i)
    # The dict keys already are the distinct labels.
    u_labels = list(labels_pos_dict)
    return u_labels, labels_pos_dict
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
206 |
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
207 |
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
def compute_loss(y_true, y_pred, class_weights=None):
    """Return the binary cross-entropy loss together with the categorical one.

    ``y_true`` is cast to ``tf.float32`` and passed with ``y_pred`` to the
    module-level ``binary_ce`` and ``categorical_ce`` callables.

    Args:
        y_true: Ground-truth labels.
        y_pred: Model predictions.
        class_weights: Optional weights; when given, the binary loss is
            combined with them via ``tf.tensordot(..., axes=1)`` instead of
            being averaged.

    Returns:
        ``(binary_loss, categorical_loss)``.
    """
    targets = tf.cast(y_true, dtype=tf.float32)
    per_item_loss = binary_ce(targets, y_pred)
    categorical_loss = categorical_ce(targets, y_pred)
    if class_weights is not None:
        reduced = tf.tensordot(per_item_loss, class_weights, axes=1)
    else:
        reduced = tf.reduce_mean(per_item_loss)
    return reduced, categorical_loss
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
215 |
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
216 |
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
def compute_acc(y_true, y_pred):
    """Return the result of the module-level ``binary_acc`` for the given labels and predictions."""
    accuracy = binary_acc(y_true, y_pred)
    return accuracy
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
219 |
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
220 |
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
def validate_model(te_x, te_y, te_batch_size, model, f_dict, r_dict, ulabels_te_dict, tr_labels, lowest_t_ids):
    """Evaluate ``model`` on one sampled test batch and print the loss.

    A batch is drawn with ``sample_balanced_te_y``, the model is called on
    it with ``training=False`` and the first element of its result is scored
    with ``compute_loss`` (no class weights). Nothing is returned; sizes and
    the loss value are printed.

    Args:
        te_x: Test inputs (must expose ``.shape``).
        te_y: Test targets (must expose ``.shape``).
        te_batch_size: Batch size forwarded to the sampler.
        model: Callable returning a 2-tuple whose first item is the prediction.
        f_dict: Unused here.
        r_dict: Unused here.
        ulabels_te_dict: Label-to-row-indices mapping forwarded to the sampler.
        tr_labels: Unused here.
        lowest_t_ids: Unused here.
    """
    batch_inputs, batch_targets, _visited_tools = sample_balanced_te_y(
        te_x, te_y, ulabels_te_dict, te_batch_size
    )
    print("Total test data size: ", te_x.shape, te_y.shape)
    print("Batch test data size: ", batch_inputs.shape, batch_targets.shape)
    predictions, _extra_output = model(batch_inputs, training=False)
    batch_loss, _categorical_loss = compute_loss(batch_targets, predictions)
    print("Test loss:")
    print(batch_loss.numpy())
    print("Test finished")
0
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
230 |
9bf25dbe00ad
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
bgruening
parents:
diff
changeset
|
231 |
3
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
2
diff
changeset
|
def get_lowest_tools(l_tool_freq, fraction=0.25):
    """Return the ids of the least frequent tools.

    Args:
        l_tool_freq: Mapping of tool id to its frequency.
        fraction: Share of the tools to return, counted from the low-frequency
            end; the count is ``int(len(l_tool_freq) * fraction)``.

    Returns:
        A list of tool ids ordered from higher to lower frequency. The list
        is empty when the computed count is zero or negative.
    """
    tool_ids = sorted(l_tool_freq, key=l_tool_freq.get, reverse=True)
    n_lowest = int(len(tool_ids) * fraction)
    # Guard the zero case explicitly: ``tool_ids[-0:]`` is the whole list,
    # which would report every tool as "lowest".
    if n_lowest <= 0:
        return []
    return tool_ids[-n_lowest:]
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
2
diff
changeset
|
237 |
5b3c08710e47
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
bgruening
parents:
2
diff
changeset
|
238 |
6
e94dc7945639
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
bgruening
parents:
5
diff
changeset
|
def remove_pipe(file_path):
    """Read a ``|``-separated file without header and trim its framing rows.

    Args:
        file_path: Path of the file to read.

    Returns:
        The parsed DataFrame without its first two rows and its last row.
    """
    table = pd.read_csv(file_path, sep="|", header=None)
    row_count = len(table.index)
    # One positional slice: skip two leading rows and the single trailing row.
    return table.iloc[2:row_count - 1]