comparison extract_workflow_connections.py @ 3:5b3c08710e47 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
author bgruening
date Sat, 09 May 2020 05:38:23 -0400
parents 9bf25dbe00ad
children 4f7e6612906b
comparison
equal deleted inserted replaced
2:76251d1ccdcc 3:5b3c08710e47
9 import utils 9 import utils
10 10
11 11
12 class ExtractWorkflowConnections: 12 class ExtractWorkflowConnections:
13 13
14 @classmethod
15 def __init__(self): 14 def __init__(self):
16 """ Init method. """ 15 """ Init method. """
17 16
18 @classmethod 17 def collect_standard_connections(self, row):
18 published = row[8]
19 deleted = row[9]
20 has_errors = row[10]
21 if published == "t" and deleted == "f" and has_errors == "f":
22 return True
23 return False
24
19 def read_tabular_file(self, raw_file_path): 25 def read_tabular_file(self, raw_file_path):
20 """ 26 """
21 Read tabular file and extract workflow connections 27 Read tabular file and extract workflow connections
22 """ 28 """
23 print("Reading workflows...") 29 print("Reading workflows...")
24 workflows = {} 30 workflows = {}
25 workflow_paths_dup = "" 31 workflow_paths_dup = ""
26 workflow_parents = dict() 32 workflow_parents = dict()
27 workflow_paths = list() 33 workflow_paths = list()
28 unique_paths = list() 34 unique_paths = dict()
35 standard_connections = dict()
29 with open(raw_file_path, 'rt') as workflow_connections_file: 36 with open(raw_file_path, 'rt') as workflow_connections_file:
30 workflow_connections = csv.reader(workflow_connections_file, delimiter='\t') 37 workflow_connections = csv.reader(workflow_connections_file, delimiter='\t')
31 for index, row in enumerate(workflow_connections): 38 for index, row in enumerate(workflow_connections):
32 wf_id = str(row[0]) 39 wf_id = str(row[0])
33 in_tool = row[3] 40 in_tool = row[3]
34 out_tool = row[6] 41 out_tool = row[6]
35 if wf_id not in workflows: 42 if wf_id not in workflows:
36 workflows[wf_id] = list() 43 workflows[wf_id] = list()
37 if out_tool and in_tool and out_tool != in_tool: 44 if out_tool and in_tool and out_tool != in_tool:
38 workflows[wf_id].append((in_tool, out_tool)) 45 workflows[wf_id].append((out_tool, in_tool))
46 qc = self.collect_standard_connections(row)
47 if qc:
48 i_t = utils.format_tool_id(in_tool)
49 o_t = utils.format_tool_id(out_tool)
50 if i_t not in standard_connections:
51 standard_connections[i_t] = list()
52 if o_t not in standard_connections[i_t]:
53 standard_connections[i_t].append(o_t)
39 print("Processing workflows...") 54 print("Processing workflows...")
40 wf_ctr = 0 55 wf_ctr = 0
41 for wf_id in workflows: 56 for wf_id in workflows:
42 wf_ctr += 1 57 wf_ctr += 1
43 workflow_parents[wf_id] = self.read_workflow(wf_id, workflows[wf_id]) 58 workflow_parents[wf_id] = self.read_workflow(wf_id, workflows[wf_id])
52 # reverse the paths as they are computed from leaves to roots leaf 67 # reverse the paths as they are computed from leaves to roots leaf
53 paths = [tool_path for tool_path in paths] 68 paths = [tool_path for tool_path in paths]
54 if len(paths) > 0: 69 if len(paths) > 0:
55 flow_paths.extend(paths) 70 flow_paths.extend(paths)
56 workflow_paths.extend(flow_paths) 71 workflow_paths.extend(flow_paths)
57
58 print("Workflows processed: %d" % wf_ctr) 72 print("Workflows processed: %d" % wf_ctr)
59 73
60 # remove slashes from the tool ids 74 # remove slashes from the tool ids
61 wf_paths_no_slash = list() 75 wf_paths_no_slash = list()
62 for path in workflow_paths: 76 for path in workflow_paths:
73 random.shuffle(unique_paths) 87 random.shuffle(unique_paths)
74 no_dup_paths = list(set(unique_paths)) 88 no_dup_paths = list(set(unique_paths))
75 89
76 print("Finding compatible next tools...") 90 print("Finding compatible next tools...")
77 compatible_next_tools = self.set_compatible_next_tools(no_dup_paths) 91 compatible_next_tools = self.set_compatible_next_tools(no_dup_paths)
78 return unique_paths, compatible_next_tools 92 return unique_paths, compatible_next_tools, standard_connections
79 93
80 @classmethod
81 def set_compatible_next_tools(self, workflow_paths): 94 def set_compatible_next_tools(self, workflow_paths):
82 """ 95 """
83 Find next tools for each tool 96 Find next tools for each tool
84 """ 97 """
85 next_tools = dict() 98 next_tools = dict()
95 next_tools[current_tool] = next_tool 108 next_tools[current_tool] = next_tool
96 for tool in next_tools: 109 for tool in next_tools:
97 next_tools[tool] = ",".join(list(set(next_tools[tool].split(",")))) 110 next_tools[tool] = ",".join(list(set(next_tools[tool].split(","))))
98 return next_tools 111 return next_tools
99 112
100 @classmethod
101 def read_workflow(self, wf_id, workflow_rows): 113 def read_workflow(self, wf_id, workflow_rows):
102 """ 114 """
103 Read all connections for a workflow 115 Read all connections for a workflow
104 """ 116 """
105 tool_parents = dict() 117 tool_parents = dict()
110 tool_parents[out_tool] = list() 122 tool_parents[out_tool] = list()
111 if in_tool not in tool_parents[out_tool]: 123 if in_tool not in tool_parents[out_tool]:
112 tool_parents[out_tool].append(in_tool) 124 tool_parents[out_tool].append(in_tool)
113 return tool_parents 125 return tool_parents
114 126
115 @classmethod
116 def get_roots_leaves(self, graph): 127 def get_roots_leaves(self, graph):
117 roots = list() 128 roots = list()
118 leaves = list() 129 leaves = list()
119 all_parents = list() 130 all_parents = list()
120 for item in graph: 131 for item in graph:
123 children = graph.keys() 134 children = graph.keys()
124 roots = list(set(all_parents).difference(set(children))) 135 roots = list(set(all_parents).difference(set(children)))
125 leaves = list(set(children).difference(set(all_parents))) 136 leaves = list(set(children).difference(set(all_parents)))
126 return roots, leaves 137 return roots, leaves
127 138
128 @classmethod
129 def find_tool_paths_workflow(self, graph, start, end, path=[]): 139 def find_tool_paths_workflow(self, graph, start, end, path=[]):
130 path = path + [end] 140 path = path + [end]
131 if start == end: 141 if start == end:
132 return [path] 142 return [path]
133 path_list = list() 143 path_list = list()