Mercurial > repos > bgruening > create_tool_recommendation_model
comparison extract_workflow_connections.py @ 6:e94dc7945639 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
| author | bgruening |
|---|---|
| date | Sun, 16 Oct 2022 11:52:10 +0000 |
| parents | 4f7e6612906b |
| children |
Comparison legend: equal, deleted, inserted, replaced
| 5:4f7e6612906b | 6:e94dc7945639 |
|---|---|
| 1 """ | 1 """ |
| 2 Extract workflow paths from the tabular file containing | 2 Extract workflow paths from the tabular file containing |
| 3 input and output tools | 3 input and output tools |
| 4 """ | 4 """ |
| 5 | |
| 6 import csv | |
| 7 import random | 5 import random |
| 8 | 6 |
| 9 import utils | 7 import utils |
| 10 | 8 |
| 11 | 9 |
| 12 class ExtractWorkflowConnections: | 10 class ExtractWorkflowConnections: |
| 11 | |
| 13 def __init__(self): | 12 def __init__(self): |
| 14 """ Init method. """ | 13 """ Init method. """ |
| 15 | 14 |
| 16 def collect_standard_connections(self, row): | 15 def process_raw_files(self, wf_path, tool_popu_path, config): |
| 17 published = row[8] | 16 """ |
| 18 deleted = row[9] | 17 Remove pipe from workflows and popularity tabular files |
| 19 has_errors = row[10] | 18 """ |
| 20 if published == "t" and deleted == "f" and has_errors == "f": | 19 print("Removing pipe from tabular datasets...") |
| 21 return True | 20 wf_frame = utils.remove_pipe(wf_path) |
| 22 return False | 21 tool_popu_frame = utils.remove_pipe(tool_popu_path) |
| 22 return wf_frame, tool_popu_frame | |
| 23 | 23 |
| 24 def read_tabular_file(self, raw_file_path): | 24 def read_tabular_file(self, wf_dataframe, config): |
| 25 """ | 25 """ |
| 26 Read tabular file and extract workflow connections | 26 Read tabular file and extract workflow connections |
| 27 """ | 27 """ |
| 28 print("Reading workflows...") | 28 print("Reading workflows...") |
| 29 workflows = {} | 29 workflows = {} |
| 30 workflow_paths_dup = "" | 30 workflow_paths_dup = "" |
| 31 workflow_parents = dict() | 31 workflow_parents = dict() |
| 32 workflow_paths = list() | 32 workflow_paths = list() |
| 33 unique_paths = dict() | 33 unique_paths = dict() |
| 34 standard_connections = dict() | 34 standard_connections = dict() |
| 35 with open(raw_file_path, "rt") as workflow_connections_file: | 35 for index, row in wf_dataframe.iterrows(): |
| 36 workflow_connections = csv.reader(workflow_connections_file, delimiter="\t") | 36 row = row.tolist() |
| 37 for index, row in enumerate(workflow_connections): | 37 row = [str(item).strip() for item in row] |
| 38 wf_id = str(row[0]) | 38 wf_id = str(row[0]) |
| 39 in_tool = row[3].strip() | 39 if row[1] > config["cutoff_date"]: |
| 40 out_tool = row[6].strip() | 40 in_tool = row[3] |
| 41 out_tool = row[6] | |
| 41 if wf_id not in workflows: | 42 if wf_id not in workflows: |
| 42 workflows[wf_id] = list() | 43 workflows[wf_id] = list() |
| 43 if out_tool and in_tool and out_tool != in_tool: | 44 if out_tool and in_tool and out_tool != in_tool: |
| 44 workflows[wf_id].append((out_tool, in_tool)) | 45 workflows[wf_id].append((out_tool, in_tool)) |
| 45 qc = self.collect_standard_connections(row) | 46 qc = self.__collect_standard_connections(row) |
| 46 if qc: | 47 if qc: |
| 47 i_t = utils.format_tool_id(in_tool) | 48 i_t = utils.format_tool_id(in_tool) |
| 48 o_t = utils.format_tool_id(out_tool) | 49 o_t = utils.format_tool_id(out_tool) |
| 49 if i_t not in standard_connections: | 50 if i_t not in standard_connections: |
| 50 standard_connections[i_t] = list() | 51 standard_connections[i_t] = list() |
| 52 standard_connections[i_t].append(o_t) | 53 standard_connections[i_t].append(o_t) |
| 53 print("Processing workflows...") | 54 print("Processing workflows...") |
| 54 wf_ctr = 0 | 55 wf_ctr = 0 |
| 55 for wf_id in workflows: | 56 for wf_id in workflows: |
| 56 wf_ctr += 1 | 57 wf_ctr += 1 |
| 57 workflow_parents[wf_id] = self.read_workflow(wf_id, workflows[wf_id]) | 58 workflow_parents[wf_id] = self.__read_workflow(wf_id, workflows[wf_id]) |
| 58 | 59 |
| 59 for wf_id in workflow_parents: | 60 for wf_id in workflow_parents: |
| 60 flow_paths = list() | 61 flow_paths = list() |
| 61 parents_graph = workflow_parents[wf_id] | 62 parents_graph = workflow_parents[wf_id] |
| 62 roots, leaves = self.get_roots_leaves(parents_graph) | 63 roots, leaves = self.__get_roots_leaves(parents_graph) |
| 63 for root in roots: | 64 for root in roots: |
| 64 for leaf in leaves: | 65 for leaf in leaves: |
| 65 paths = self.find_tool_paths_workflow(parents_graph, root, leaf) | 66 paths = self.__find_tool_paths_workflow(parents_graph, root, leaf) |
| 66 # reverse the paths as they are computed from leaves to roots leaf | 67 # reverse the paths as they are computed from leaves to roots leaf |
| 67 paths = [tool_path for tool_path in paths] | 68 paths = [tool_path for tool_path in paths] |
| 68 if len(paths) > 0: | 69 if len(paths) > 0: |
| 69 flow_paths.extend(paths) | 70 flow_paths.extend(paths) |
| 70 workflow_paths.extend(flow_paths) | 71 workflow_paths.extend(flow_paths) |
| 82 | 83 |
| 83 # collect unique paths | 84 # collect unique paths |
| 84 unique_paths = list(workflow_paths_dup.split("\n")) | 85 unique_paths = list(workflow_paths_dup.split("\n")) |
| 85 unique_paths = list(filter(None, unique_paths)) | 86 unique_paths = list(filter(None, unique_paths)) |
| 86 random.shuffle(unique_paths) | 87 random.shuffle(unique_paths) |
| 88 print("unique_paths: {}".format(len(unique_paths))) | |
| 87 no_dup_paths = list(set(unique_paths)) | 89 no_dup_paths = list(set(unique_paths)) |
| 90 print("no_dup_paths: {}".format(len(no_dup_paths))) | |
| 91 return no_dup_paths, standard_connections | |
| 88 | 92 |
| 89 print("Finding compatible next tools...") | 93 def __collect_standard_connections(self, row): |
| 90 compatible_next_tools = self.set_compatible_next_tools(no_dup_paths) | 94 published = row[8].strip() |
| 91 return unique_paths, compatible_next_tools, standard_connections | 95 deleted = row[9].strip() |
| 96 has_errors = row[10].strip() | |
| 97 if published == "t" and deleted == "f" and has_errors == "f": | |
| 98 return True | |
| 99 return False | |
| 92 | 100 |
| 93 def set_compatible_next_tools(self, workflow_paths): | 101 def __set_compatible_next_tools(self, workflow_paths): |
| 94 """ | 102 """ |
| 95 Find next tools for each tool | 103 Find next tools for each tool |
| 96 """ | 104 """ |
| 97 next_tools = dict() | 105 next_tools = dict() |
| 98 for path in workflow_paths: | 106 for path in workflow_paths: |
| 107 next_tools[current_tool] = next_tool | 115 next_tools[current_tool] = next_tool |
| 108 for tool in next_tools: | 116 for tool in next_tools: |
| 109 next_tools[tool] = ",".join(list(set(next_tools[tool].split(",")))) | 117 next_tools[tool] = ",".join(list(set(next_tools[tool].split(",")))) |
| 110 return next_tools | 118 return next_tools |
| 111 | 119 |
| 112 def read_workflow(self, wf_id, workflow_rows): | 120 def __read_workflow(self, wf_id, workflow_rows): |
| 113 """ | 121 """ |
| 114 Read all connections for a workflow | 122 Read all connections for a workflow |
| 115 """ | 123 """ |
| 116 tool_parents = dict() | 124 tool_parents = dict() |
| 117 for connection in workflow_rows: | 125 for connection in workflow_rows: |
| 121 tool_parents[out_tool] = list() | 129 tool_parents[out_tool] = list() |
| 122 if in_tool not in tool_parents[out_tool]: | 130 if in_tool not in tool_parents[out_tool]: |
| 123 tool_parents[out_tool].append(in_tool) | 131 tool_parents[out_tool].append(in_tool) |
| 124 return tool_parents | 132 return tool_parents |
| 125 | 133 |
| 126 def get_roots_leaves(self, graph): | 134 def __get_roots_leaves(self, graph): |
| 127 roots = list() | 135 roots = list() |
| 128 leaves = list() | 136 leaves = list() |
| 129 all_parents = list() | 137 all_parents = list() |
| 130 for item in graph: | 138 for item in graph: |
| 131 all_parents.extend(graph[item]) | 139 all_parents.extend(graph[item]) |
| 133 children = graph.keys() | 141 children = graph.keys() |
| 134 roots = list(set(all_parents).difference(set(children))) | 142 roots = list(set(all_parents).difference(set(children))) |
| 135 leaves = list(set(children).difference(set(all_parents))) | 143 leaves = list(set(children).difference(set(all_parents))) |
| 136 return roots, leaves | 144 return roots, leaves |
| 137 | 145 |
| 138 def find_tool_paths_workflow(self, graph, start, end, path=[]): | 146 def __find_tool_paths_workflow(self, graph, start, end, path=[]): |
| 139 path = path + [end] | 147 path = path + [end] |
| 140 if start == end: | 148 if start == end: |
| 141 return [path] | 149 return [path] |
| 142 path_list = list() | 150 path_list = list() |
| 143 if end in graph: | 151 if end in graph: |
| 144 for node in graph[end]: | 152 for node in graph[end]: |
| 145 if node not in path: | 153 if node not in path: |
| 146 new_tools_paths = self.find_tool_paths_workflow( | 154 new_tools_paths = self.__find_tool_paths_workflow(graph, start, node, path) |
| 147 graph, start, node, path | |
| 148 ) | |
| 149 for tool_path in new_tools_paths: | 155 for tool_path in new_tools_paths: |
| 150 path_list.append(tool_path) | 156 path_list.append(tool_path) |
| 151 return path_list | 157 return path_list |
