Mercurial > repos > bgruening > create_tool_recommendation_model
diff extract_workflow_connections.py @ 6:e94dc7945639 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit 24bab7a797f53fe4bcc668b18ee0326625486164
author | bgruening |
---|---|
date | Sun, 16 Oct 2022 11:52:10 +0000 |
parents | 4f7e6612906b |
children |
line wrap: on
line diff
--- a/extract_workflow_connections.py	Fri May 06 09:05:18 2022 +0000
+++ b/extract_workflow_connections.py	Sun Oct 16 11:52:10 2022 +0000
@@ -2,26 +2,26 @@
 Extract workflow paths from the tabular file containing
 input and output tools
 """
-
-import csv
 import random
 
 import utils
 
 
 class ExtractWorkflowConnections:
+
     def __init__(self):
         """
         Init method.
         """
 
-    def collect_standard_connections(self, row):
-        published = row[8]
-        deleted = row[9]
-        has_errors = row[10]
-        if published == "t" and deleted == "f" and has_errors == "f":
-            return True
-        return False
+    def process_raw_files(self, wf_path, tool_popu_path, config):
+        """
+        Remove pipe from workflows and popularity tabular files
+        """
+        print("Removing pipe from tabular datasets...")
+        wf_frame = utils.remove_pipe(wf_path)
+        tool_popu_frame = utils.remove_pipe(tool_popu_path)
+        return wf_frame, tool_popu_frame
 
-    def read_tabular_file(self, raw_file_path):
+    def read_tabular_file(self, wf_dataframe, config):
         """
         Read tabular file and extract workflow connections
         """
@@ -32,17 +32,18 @@
         workflow_paths = list()
         unique_paths = dict()
         standard_connections = dict()
-        with open(raw_file_path, "rt") as workflow_connections_file:
-            workflow_connections = csv.reader(workflow_connections_file, delimiter="\t")
-            for index, row in enumerate(workflow_connections):
-                wf_id = str(row[0])
-                in_tool = row[3].strip()
-                out_tool = row[6].strip()
+        for index, row in wf_dataframe.iterrows():
+            row = row.tolist()
+            row = [str(item).strip() for item in row]
+            wf_id = str(row[0])
+            if row[1] > config["cutoff_date"]:
+                in_tool = row[3]
+                out_tool = row[6]
                 if wf_id not in workflows:
                     workflows[wf_id] = list()
                 if out_tool and in_tool and out_tool != in_tool:
                     workflows[wf_id].append((out_tool, in_tool))
-                    qc = self.collect_standard_connections(row)
+                    qc = self.__collect_standard_connections(row)
                     if qc:
                         i_t = utils.format_tool_id(in_tool)
                         o_t = utils.format_tool_id(out_tool)
@@ -54,15 +55,15 @@
         wf_ctr = 0
         for wf_id in workflows:
             wf_ctr += 1
-            workflow_parents[wf_id] = self.read_workflow(wf_id, workflows[wf_id])
+            workflow_parents[wf_id] = self.__read_workflow(wf_id, workflows[wf_id])
 
         for wf_id in workflow_parents:
             flow_paths = list()
             parents_graph = workflow_parents[wf_id]
-            roots, leaves = self.get_roots_leaves(parents_graph)
+            roots, leaves = self.__get_roots_leaves(parents_graph)
             for root in roots:
                 for leaf in leaves:
-                    paths = self.find_tool_paths_workflow(parents_graph, root, leaf)
+                    paths = self.__find_tool_paths_workflow(parents_graph, root, leaf)
                     # reverse the paths as they are computed from leaves to roots leaf
                     paths = [tool_path for tool_path in paths]
                     if len(paths) > 0:
@@ -84,13 +85,20 @@
         unique_paths = list(workflow_paths_dup.split("\n"))
         unique_paths = list(filter(None, unique_paths))
         random.shuffle(unique_paths)
+        print("unique_paths: {}".format(len(unique_paths)))
         no_dup_paths = list(set(unique_paths))
+        print("no_dup_paths: {}".format(len(no_dup_paths)))
+        return no_dup_paths, standard_connections
 
-        print("Finding compatible next tools...")
-        compatible_next_tools = self.set_compatible_next_tools(no_dup_paths)
-        return unique_paths, compatible_next_tools, standard_connections
+    def __collect_standard_connections(self, row):
+        published = row[8].strip()
+        deleted = row[9].strip()
+        has_errors = row[10].strip()
+        if published == "t" and deleted == "f" and has_errors == "f":
+            return True
+        return False
 
-    def set_compatible_next_tools(self, workflow_paths):
+    def __set_compatible_next_tools(self, workflow_paths):
         """
         Find next tools for each tool
         """
@@ -109,7 +117,7 @@
             next_tools[tool] = ",".join(list(set(next_tools[tool].split(","))))
         return next_tools
 
-    def read_workflow(self, wf_id, workflow_rows):
+    def __read_workflow(self, wf_id, workflow_rows):
         """
         Read all connections for a workflow
         """
@@ -123,7 +131,7 @@
                 tool_parents[out_tool].append(in_tool)
         return tool_parents
 
-    def get_roots_leaves(self, graph):
+    def __get_roots_leaves(self, graph):
         roots = list()
         leaves = list()
         all_parents = list()
@@ -135,7 +143,7 @@
         leaves = list(set(children).difference(set(all_parents)))
         return roots, leaves
 
-    def find_tool_paths_workflow(self, graph, start, end, path=[]):
+    def __find_tool_paths_workflow(self, graph, start, end, path=[]):
         path = path + [end]
         if start == end:
             return [path]
@@ -143,9 +151,7 @@
         if end in graph:
             for node in graph[end]:
                 if node not in path:
-                    new_tools_paths = self.find_tool_paths_workflow(
-                        graph, start, node, path
-                    )
+                    new_tools_paths = self.__find_tool_paths_workflow(graph, start, node, path)
                     for tool_path in new_tools_paths:
                         path_list.append(tool_path)
         return path_list