Mercurial > repos > bgruening > create_tool_recommendation_model
comparison extract_workflow_connections.py @ 0:9bf25dbe00ad draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 7fac577189d01cedd01118a77fc2baaefe7d5cad"
author | bgruening |
---|---|
date | Wed, 28 Aug 2019 07:19:38 -0400 |
parents | |
children | 5b3c08710e47 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:9bf25dbe00ad |
---|---|
1 """ | |
2 Extract workflow paths from the tabular file containing | |
3 input and output tools | |
4 """ | |
5 | |
6 import csv | |
7 import random | |
8 | |
9 import utils | |
10 | |
11 | |
class ExtractWorkflowConnections:
    """
    Extract tool paths from a tabular file of workflow connections and
    derive, for every tool, the tools that directly follow it.
    """

    def __init__(self):
        """ Init method. """

    @classmethod
    def read_tabular_file(cls, raw_file_path):
        """
        Read tabular file and extract workflow connections

        Each tab-separated row is read as: column 0 = workflow id,
        column 3 = "in" tool, column 6 = "out" tool. Rows where either tool
        is empty, or where both tools are the same, contribute no connection.
        Completely blank rows are skipped; a non-blank row with fewer than
        seven columns still raises IndexError.

        Returns a tuple ``(paths, compatible_next_tools)``:
        - ``paths``: shuffled list of comma-joined tool paths; paths that
          occur in several workflows are kept once per occurrence.
        - ``compatible_next_tools``: dict as returned by
          ``set_compatible_next_tools`` for the de-duplicated paths.
        """
        print("Reading workflows...")
        workflows = {}
        # newline='' is what the csv module documentation asks for on files
        # handed to csv.reader
        with open(raw_file_path, 'rt', newline='') as workflow_connections_file:
            workflow_connections = csv.reader(workflow_connections_file, delimiter='\t')
            for row in workflow_connections:
                if not row:
                    # blank line in the file, nothing to read
                    continue
                wf_id = str(row[0])
                in_tool = row[3]
                out_tool = row[6]
                # register the workflow even when the row adds no connection
                connections = workflows.setdefault(wf_id, [])
                if out_tool and in_tool and out_tool != in_tool:
                    connections.append((in_tool, out_tool))

        print("Processing workflows...")
        workflow_parents = {
            wf_id: cls.read_workflow(wf_id, connections)
            for wf_id, connections in workflows.items()
        }

        workflow_paths = []
        for parents_graph in workflow_parents.values():
            roots, leaves = cls.get_roots_leaves(parents_graph)
            for root in roots:
                for leaf in leaves:
                    # each returned path is listed from `leaf` to `root` and
                    # is used in that order (no reversal is applied)
                    workflow_paths.extend(
                        cls.find_tool_paths_workflow(parents_graph, root, leaf)
                    )

        print("Workflows processed: %d" % len(workflows))

        # normalise every tool id with utils.format_tool_id (described as
        # "remove slashes from the tool ids" by the original comment) and
        # serialise each path as one comma-separated string
        joined_paths = [
            ",".join(utils.format_tool_id(tool_id) for tool_id in path)
            for path in workflow_paths
        ]
        # keep duplicates, drop empty strings
        all_paths = [path for path in joined_paths if path]
        random.shuffle(all_paths)
        # sorted so that the derived next-tools dictionary is reproducible
        no_dup_paths = sorted(set(all_paths))

        print("Finding compatible next tools...")
        compatible_next_tools = cls.set_compatible_next_tools(no_dup_paths)
        return all_paths, compatible_next_tools

    @classmethod
    def set_compatible_next_tools(cls, workflow_paths):
        """
        Find next tools for each tool

        ``workflow_paths`` is an iterable of comma-separated tool paths.
        Returns a dict mapping each tool that has a successor in at least one
        path to a comma-separated string of its distinct direct successors,
        in sorted order.
        """
        followers = {}
        for path in workflow_paths:
            path_split = path.split(",")
            # walk over every pair of adjacent tools in the path
            for current_tool, next_tool in zip(path_split, path_split[1:]):
                followers.setdefault(current_tool, set()).add(next_tool)
        return {
            tool: ",".join(sorted(next_set))
            for tool, next_set in followers.items()
        }

    @classmethod
    def read_workflow(cls, wf_id, workflow_rows):
        """
        Read all connections for a workflow

        ``workflow_rows`` is an iterable of ``(in_tool, out_tool)`` pairs.
        Returns a dict mapping each ``out_tool`` to the list of distinct
        ``in_tool`` values paired with it, in first-seen order.
        ``wf_id`` is accepted for interface compatibility and is not used.
        """
        tool_parents = {}
        for in_tool, out_tool in workflow_rows:
            parents = tool_parents.setdefault(out_tool, [])
            if in_tool not in parents:
                parents.append(in_tool)
        return tool_parents

    @classmethod
    def get_roots_leaves(cls, graph):
        """
        Split the nodes of ``graph`` (a dict: key -> list of values) into
        roots and leaves.

        Roots are nodes that appear only inside the value lists; leaves are
        keys that never appear inside any value list. Both are returned as
        lists in no particular order.
        """
        all_parents = set()
        for parents in graph.values():
            all_parents.update(parents)
        children = set(graph)
        roots = list(all_parents - children)
        leaves = list(children - all_parents)
        return roots, leaves

    @classmethod
    def find_tool_paths_workflow(cls, graph, start, end, path=None):
        """
        Find every path from ``end`` to ``start`` in ``graph``.

        The search begins at ``end`` and repeatedly follows ``graph[node]``
        until ``start`` is reached, never visiting a node twice within one
        path. Each returned path is a list whose first element is ``end`` and
        whose last element is ``start``. ``path`` holds the nodes visited so
        far and is meant for the recursive calls.
        """
        # build a new list instead of mutating the caller's (or a shared) one
        path = ([] if path is None else path) + [end]
        if start == end:
            return [path]
        path_list = []
        for node in graph.get(end, ()):
            if node not in path:
                path_list.extend(
                    cls.find_tool_paths_workflow(graph, start, node, path)
                )
        return path_list