Mercurial > repos > bgruening > create_tool_recommendation_model
comparison extract_workflow_connections.py @ 6:e94dc7945639 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
author | bgruening |
---|---|
date | Sun, 16 Oct 2022 11:52:10 +0000 |
parents | 4f7e6612906b |
children |
comparison
equal
deleted
inserted
replaced
5:4f7e6612906b | 6:e94dc7945639 |
---|---|
1 """ | 1 """ |
2 Extract workflow paths from the tabular file containing | 2 Extract workflow paths from the tabular file containing |
3 input and output tools | 3 input and output tools |
4 """ | 4 """ |
5 | |
6 import csv | |
7 import random | 5 import random |
8 | 6 |
9 import utils | 7 import utils |
10 | 8 |
11 | 9 |
12 class ExtractWorkflowConnections: | 10 class ExtractWorkflowConnections: |
11 | |
13 def __init__(self): | 12 def __init__(self): |
14 """ Init method. """ | 13 """ Init method. """ |
15 | 14 |
16 def collect_standard_connections(self, row): | 15 def process_raw_files(self, wf_path, tool_popu_path, config): |
17 published = row[8] | 16 """ |
18 deleted = row[9] | 17 Remove pipe from workflows and popularity tabular files |
19 has_errors = row[10] | 18 """ |
20 if published == "t" and deleted == "f" and has_errors == "f": | 19 print("Removing pipe from tabular datasets...") |
21 return True | 20 wf_frame = utils.remove_pipe(wf_path) |
22 return False | 21 tool_popu_frame = utils.remove_pipe(tool_popu_path) |
22 return wf_frame, tool_popu_frame | |
23 | 23 |
24 def read_tabular_file(self, raw_file_path): | 24 def read_tabular_file(self, wf_dataframe, config): |
25 """ | 25 """ |
26 Read tabular file and extract workflow connections | 26 Read tabular file and extract workflow connections |
27 """ | 27 """ |
28 print("Reading workflows...") | 28 print("Reading workflows...") |
29 workflows = {} | 29 workflows = {} |
30 workflow_paths_dup = "" | 30 workflow_paths_dup = "" |
31 workflow_parents = dict() | 31 workflow_parents = dict() |
32 workflow_paths = list() | 32 workflow_paths = list() |
33 unique_paths = dict() | 33 unique_paths = dict() |
34 standard_connections = dict() | 34 standard_connections = dict() |
35 with open(raw_file_path, "rt") as workflow_connections_file: | 35 for index, row in wf_dataframe.iterrows(): |
36 workflow_connections = csv.reader(workflow_connections_file, delimiter="\t") | 36 row = row.tolist() |
37 for index, row in enumerate(workflow_connections): | 37 row = [str(item).strip() for item in row] |
38 wf_id = str(row[0]) | 38 wf_id = str(row[0]) |
39 in_tool = row[3].strip() | 39 if row[1] > config["cutoff_date"]: |
40 out_tool = row[6].strip() | 40 in_tool = row[3] |
41 out_tool = row[6] | |
41 if wf_id not in workflows: | 42 if wf_id not in workflows: |
42 workflows[wf_id] = list() | 43 workflows[wf_id] = list() |
43 if out_tool and in_tool and out_tool != in_tool: | 44 if out_tool and in_tool and out_tool != in_tool: |
44 workflows[wf_id].append((out_tool, in_tool)) | 45 workflows[wf_id].append((out_tool, in_tool)) |
45 qc = self.collect_standard_connections(row) | 46 qc = self.__collect_standard_connections(row) |
46 if qc: | 47 if qc: |
47 i_t = utils.format_tool_id(in_tool) | 48 i_t = utils.format_tool_id(in_tool) |
48 o_t = utils.format_tool_id(out_tool) | 49 o_t = utils.format_tool_id(out_tool) |
49 if i_t not in standard_connections: | 50 if i_t not in standard_connections: |
50 standard_connections[i_t] = list() | 51 standard_connections[i_t] = list() |
52 standard_connections[i_t].append(o_t) | 53 standard_connections[i_t].append(o_t) |
53 print("Processing workflows...") | 54 print("Processing workflows...") |
54 wf_ctr = 0 | 55 wf_ctr = 0 |
55 for wf_id in workflows: | 56 for wf_id in workflows: |
56 wf_ctr += 1 | 57 wf_ctr += 1 |
57 workflow_parents[wf_id] = self.read_workflow(wf_id, workflows[wf_id]) | 58 workflow_parents[wf_id] = self.__read_workflow(wf_id, workflows[wf_id]) |
58 | 59 |
59 for wf_id in workflow_parents: | 60 for wf_id in workflow_parents: |
60 flow_paths = list() | 61 flow_paths = list() |
61 parents_graph = workflow_parents[wf_id] | 62 parents_graph = workflow_parents[wf_id] |
62 roots, leaves = self.get_roots_leaves(parents_graph) | 63 roots, leaves = self.__get_roots_leaves(parents_graph) |
63 for root in roots: | 64 for root in roots: |
64 for leaf in leaves: | 65 for leaf in leaves: |
65 paths = self.find_tool_paths_workflow(parents_graph, root, leaf) | 66 paths = self.__find_tool_paths_workflow(parents_graph, root, leaf) |
66 # reverse the paths as they are computed from leaves to roots leaf | 67 # reverse the paths as they are computed from leaves to roots leaf |
67 paths = [tool_path for tool_path in paths] | 68 paths = [tool_path for tool_path in paths] |
68 if len(paths) > 0: | 69 if len(paths) > 0: |
69 flow_paths.extend(paths) | 70 flow_paths.extend(paths) |
70 workflow_paths.extend(flow_paths) | 71 workflow_paths.extend(flow_paths) |
82 | 83 |
83 # collect unique paths | 84 # collect unique paths |
84 unique_paths = list(workflow_paths_dup.split("\n")) | 85 unique_paths = list(workflow_paths_dup.split("\n")) |
85 unique_paths = list(filter(None, unique_paths)) | 86 unique_paths = list(filter(None, unique_paths)) |
86 random.shuffle(unique_paths) | 87 random.shuffle(unique_paths) |
88 print("unique_paths: {}".format(len(unique_paths))) | |
87 no_dup_paths = list(set(unique_paths)) | 89 no_dup_paths = list(set(unique_paths)) |
90 print("no_dup_paths: {}".format(len(no_dup_paths))) | |
91 return no_dup_paths, standard_connections | |
88 | 92 |
89 print("Finding compatible next tools...") | 93 def __collect_standard_connections(self, row): |
90 compatible_next_tools = self.set_compatible_next_tools(no_dup_paths) | 94 published = row[8].strip() |
91 return unique_paths, compatible_next_tools, standard_connections | 95 deleted = row[9].strip() |
96 has_errors = row[10].strip() | |
97 if published == "t" and deleted == "f" and has_errors == "f": | |
98 return True | |
99 return False | |
92 | 100 |
93 def set_compatible_next_tools(self, workflow_paths): | 101 def __set_compatible_next_tools(self, workflow_paths): |
94 """ | 102 """ |
95 Find next tools for each tool | 103 Find next tools for each tool |
96 """ | 104 """ |
97 next_tools = dict() | 105 next_tools = dict() |
98 for path in workflow_paths: | 106 for path in workflow_paths: |
107 next_tools[current_tool] = next_tool | 115 next_tools[current_tool] = next_tool |
108 for tool in next_tools: | 116 for tool in next_tools: |
109 next_tools[tool] = ",".join(list(set(next_tools[tool].split(",")))) | 117 next_tools[tool] = ",".join(list(set(next_tools[tool].split(",")))) |
110 return next_tools | 118 return next_tools |
111 | 119 |
112 def read_workflow(self, wf_id, workflow_rows): | 120 def __read_workflow(self, wf_id, workflow_rows): |
113 """ | 121 """ |
114 Read all connections for a workflow | 122 Read all connections for a workflow |
115 """ | 123 """ |
116 tool_parents = dict() | 124 tool_parents = dict() |
117 for connection in workflow_rows: | 125 for connection in workflow_rows: |
121 tool_parents[out_tool] = list() | 129 tool_parents[out_tool] = list() |
122 if in_tool not in tool_parents[out_tool]: | 130 if in_tool not in tool_parents[out_tool]: |
123 tool_parents[out_tool].append(in_tool) | 131 tool_parents[out_tool].append(in_tool) |
124 return tool_parents | 132 return tool_parents |
125 | 133 |
126 def get_roots_leaves(self, graph): | 134 def __get_roots_leaves(self, graph): |
127 roots = list() | 135 roots = list() |
128 leaves = list() | 136 leaves = list() |
129 all_parents = list() | 137 all_parents = list() |
130 for item in graph: | 138 for item in graph: |
131 all_parents.extend(graph[item]) | 139 all_parents.extend(graph[item]) |
133 children = graph.keys() | 141 children = graph.keys() |
134 roots = list(set(all_parents).difference(set(children))) | 142 roots = list(set(all_parents).difference(set(children))) |
135 leaves = list(set(children).difference(set(all_parents))) | 143 leaves = list(set(children).difference(set(all_parents))) |
136 return roots, leaves | 144 return roots, leaves |
137 | 145 |
138 def find_tool_paths_workflow(self, graph, start, end, path=[]): | 146 def __find_tool_paths_workflow(self, graph, start, end, path=[]): |
139 path = path + [end] | 147 path = path + [end] |
140 if start == end: | 148 if start == end: |
141 return [path] | 149 return [path] |
142 path_list = list() | 150 path_list = list() |
143 if end in graph: | 151 if end in graph: |
144 for node in graph[end]: | 152 for node in graph[end]: |
145 if node not in path: | 153 if node not in path: |
146 new_tools_paths = self.find_tool_paths_workflow( | 154 new_tools_paths = self.__find_tool_paths_workflow(graph, start, node, path) |
147 graph, start, node, path | |
148 ) | |
149 for tool_path in new_tools_paths: | 155 for tool_path in new_tools_paths: |
150 path_list.append(tool_path) | 156 path_list.append(tool_path) |
151 return path_list | 157 return path_list |