diff extract_workflow_connections.py @ 3:5b3c08710e47 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/recommendation_training/tools/tool_recommendation_model commit c635df659fe1835679438589ded43136b0e515c6"
author bgruening
date Sat, 09 May 2020 05:38:23 -0400
parents 9bf25dbe00ad
children 4f7e6612906b
--- a/extract_workflow_connections.py	Fri Oct 11 18:24:54 2019 -0400
+++ b/extract_workflow_connections.py	Sat May 09 05:38:23 2020 -0400
@@ -11,11 +11,17 @@
 
 class ExtractWorkflowConnections:
 
-    @classmethod
     def __init__(self):
         """ Init method. """
 
-    @classmethod
+    def collect_standard_connections(self, row):
+        published = row[8]
+        deleted = row[9]
+        has_errors = row[10]
+        if published == "t" and deleted == "f" and has_errors == "f":
+            return True
+        return False
+
     def read_tabular_file(self, raw_file_path):
         """
         Read tabular file and extract workflow connections
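The collect_standard_connections helper added in this hunk flags a connection as a quality-controlled ("standard") one only when the parent workflow is published, not deleted, and free of errors, based on the 't'/'f' flags in columns 8-10 of the tabular export. A minimal sketch of the expected behaviour (the layout of the first eight columns is illustrative only):

    from extract_workflow_connections import ExtractWorkflowConnections

    # hypothetical row: only columns 8-10 (published, deleted, has_errors) are inspected
    row = ["wf_id", "ts", "in_id", "in_tool", "in_ver", "out_id", "out_tool", "out_ver", "t", "f", "f"]

    extractor = ExtractWorkflowConnections()
    assert extractor.collect_standard_connections(row) is True

    row[9] = "t"  # a deleted workflow is excluded
    assert extractor.collect_standard_connections(row) is False
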
@@ -25,7 +31,8 @@
         workflow_paths_dup = ""
         workflow_parents = dict()
         workflow_paths = list()
-        unique_paths = list()
+        unique_paths = dict()
+        standard_connections = dict()
         with open(raw_file_path, 'rt') as workflow_connections_file:
             workflow_connections = csv.reader(workflow_connections_file, delimiter='\t')
             for index, row in enumerate(workflow_connections):
@@ -35,7 +42,15 @@
                 if wf_id not in workflows:
                     workflows[wf_id] = list()
                 if out_tool and in_tool and out_tool != in_tool:
-                    workflows[wf_id].append((in_tool, out_tool))
+                    workflows[wf_id].append((out_tool, in_tool))
+                    qc = self.collect_standard_connections(row)
+                    if qc:
+                        i_t = utils.format_tool_id(in_tool)
+                        o_t = utils.format_tool_id(out_tool)
+                        if i_t not in standard_connections:
+                            standard_connections[i_t] = list()
+                        if o_t not in standard_connections[i_t]:
+                            standard_connections[i_t].append(o_t)
         print("Processing workflows...")
         wf_ctr = 0
         for wf_id in workflows:
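After the loop in the hunk above, standard_connections is a plain adjacency map of formatted tool ids, with duplicates suppressed by the membership check. Its shape is roughly the following (tool names and direction are illustrative only):

    standard_connections = {
        "bowtie2": ["samtools_sort", "bamtools_filter"],
        "samtools_sort": ["featurecounts"],
    }
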
@@ -54,7 +69,6 @@
                     if len(paths) > 0:
                         flow_paths.extend(paths)
             workflow_paths.extend(flow_paths)
-
         print("Workflows processed: %d" % wf_ctr)
 
         # remove slashes from the tool ids
@@ -75,9 +89,8 @@
 
         print("Finding compatible next tools...")
         compatible_next_tools = self.set_compatible_next_tools(no_dup_paths)
-        return unique_paths, compatible_next_tools
+        return unique_paths, compatible_next_tools, standard_connections
 
-    @classmethod
     def set_compatible_next_tools(self, workflow_paths):
         """
         Find next tools for each tool
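Since read_tabular_file now returns three values instead of two, any caller must also unpack the new standard_connections dictionary. A sketch of such a call, assuming the class is imported from this module (the file name passed in is illustrative):

    from extract_workflow_connections import ExtractWorkflowConnections

    connections = ExtractWorkflowConnections()
    unique_paths, compatible_next_tools, standard_connections = \
        connections.read_tabular_file("workflow-connections.tsv")
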
@@ -97,7 +110,6 @@
             next_tools[tool] = ",".join(list(set(next_tools[tool].split(","))))
         return next_tools
 
-    @classmethod
     def read_workflow(self, wf_id, workflow_rows):
         """
         Read all connections for a workflow
@@ -112,7 +124,6 @@
                 tool_parents[out_tool].append(in_tool)
         return tool_parents
 
-    @classmethod
     def get_roots_leaves(self, graph):
         roots = list()
         leaves = list()
@@ -125,7 +136,6 @@
         leaves = list(set(children).difference(set(all_parents)))
         return roots, leaves
 
-    @classmethod
     def find_tool_paths_workflow(self, graph, start, end, path=[]):
         path = path + [end]
         if start == end:
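The @classmethod decorators removed throughout this changeset wrapped methods whose first parameter is named self; with the decorator in place Python passes the class object, not the instance, as that argument, which only worked because these methods never touch instance state. A minimal illustration of the difference (demo class, not from this repository):

    class Demo:
        @classmethod
        def with_decorator(self):
            # 'self' is actually the class Demo here
            return self

        def without_decorator(self):
            # 'self' is the Demo instance, as expected
            return self

    d = Demo()
    assert d.with_decorator() is Demo
    assert d.without_decorator() is d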