diff tools/blast2go/blast2go.py @ 5:e4419efbefad draft

Uploaded v0.0.6 of wrapper, which now supports b2g4pipe v2.5 (and no longer works with b2g4pipe v2.3.5). See manual install instructions!
author peterjc
date Fri, 22 Feb 2013 08:47:27 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/blast2go/blast2go.py	Fri Feb 22 08:47:27 2013 -0500
@@ -0,0 +1,173 @@
+#!/usr/bin/env python
+"""Galaxy wrapper for Blast2GO for pipelines, b2g4pipe v2.5.
+
+This script takes exactly three command line arguments:
+ * Input BLAST XML filename
+ * Blast2GO properties filename (settings file)
+ * Output tabular filename
+
+The properties filename can be a fully qualified path, but if not
+this will look next to the blast2go.jar file.
+
+Sadly b2g4pipe (at least v2.3.5 to v2.5.0) cannot cope with current
+style large BLAST XML files (e.g. from BLAST 2.2.25+), so we reformat
+these to avoid it crashing with a Java heap space OutOfMemoryError.
+
+As part of this reformatting, we check for BLASTP or BLASTX output
+(otherwise raise an error), and print the query count.
+
+It then calls the Java command line tool, and moves the output file to
+the location Galaxy is expecting, and removes the tempory XML file.
+"""
+import sys
+import os
+import subprocess
+
+#You may need to edit this to match your local setup,
+#blast2go_jar = "/opt/b2g4pipe/blast2go.jar"
+blast2go_jar = "/opt/b2g4pipe_v2.5/blast2go.jar"
+
+
+def stop_err(msg, error_level=1):
+    """Print error message to stdout and quit with given error level."""
+    sys.stderr.write("%s\n" % msg)
+    sys.exit(error_level)
+
+if len(sys.argv) != 4:
+    stop_err("Require three arguments: XML filename, properties filename, output tabular filename")
+
+xml_file, prop_file, tabular_file = sys.argv[1:]
+
+#We should have write access here:
+tmp_xml_file = tabular_file + ".tmp.xml"
+
+if not os.path.isfile(blast2go_jar):
+    stop_err("Blast2GO JAR file not found: %s" % blast2go_jar)
+
+if not os.path.isfile(xml_file):
+    stop_err("Input BLAST XML file not found: %s" % xml_file)
+
+if not os.path.isfile(prop_file):
+    tmp = os.path.join(os.path.split(blast2go_jar)[0], prop_file)
+    if os.path.isfile(tmp):
+        #The properties file seems to have been given relative to the JAR
+        prop_file = tmp
+    else:
+        stop_err("Blast2GO configuration file not found: %s" % prop_file)
+    del tmp
+
+def prepare_xml(original_xml, mangled_xml):
+    """Reformat BLAST XML to suit Blast2GO.
+
+    Blast2GO can't cope with 1000s of <Iteration> tags within a
+    single <BlastResult> tag, so instead split this into one
+    full XML record per interation (i.e. per query). This gives
+    a concatenated XML file mimicing old versions of BLAST.
+
+    This also checks for BLASTP or BLASTX output, and outputs
+    the number of queries. Galaxy will show this as "info".
+    """
+    in_handle = open(original_xml)
+    footer = "  </BlastOutput_iterations>\n</BlastOutput>\n"
+    header = ""
+    while True:
+        line = in_handle.readline()
+        if not line:
+            #No hits?
+            stop_err("Problem with XML file?")
+        if line.strip() == "<Iteration>":
+            break
+        header += line
+
+    if "<BlastOutput_program>blastx</BlastOutput_program>" in header:
+        print "BLASTX output identified"
+    elif "<BlastOutput_program>blastp</BlastOutput_program>" in header:
+        print "BLASTP output identified"
+    else:
+        in_handle.close()
+        stop_err("Expect BLASTP or BLASTX output")
+
+    out_handle = open(mangled_xml, "w")
+    out_handle.write(header)
+    out_handle.write(line)
+    count = 1
+    while True:
+        line = in_handle.readline()
+        if not line:
+            break
+        elif line.strip() == "<Iteration>":
+           #Insert footer/header
+           out_handle.write(footer)
+           out_handle.write(header)
+           count += 1
+        out_handle.write(line)
+
+    out_handle.close()
+    in_handle.close()
+    print "Input has %i queries" % count
+
+
+def run(cmd):
+    #Avoid using shell=True when we call subprocess to ensure if the Python
+    #script is killed, so too is the child process.
+    try:
+        child = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    except Exception, err:
+        stop_err("Error invoking command:\n%s\n\n%s\n" % (" ".join(cmd), err))
+    #Use .communicate as can get deadlocks with .wait(),
+    stdout, stderr = child.communicate()
+    return_code = child.returncode
+
+    #keep stdout minimal as shown prominently in Galaxy
+    #Record it in case a silent error needs diagnosis
+    if stdout:
+        sys.stderr.write("Standard out:\n%s\n\n" % stdout)
+    if stderr:
+        sys.stderr.write("Standard error:\n%s\n\n" % stderr)
+
+    error_msg = None
+    if return_code:
+        cmd_str = " ".join(cmd)
+        error_msg = "Return code %i from command:\n%s" % (return_code, cmd_str)
+    elif "Database or network connection (timeout) error" in stdout+stderr:
+        error_msg = "Database or network connection (timeout) error"
+    elif "Annotation of 0 seqs with 0 annots finished." in stdout+stderr:
+        error_msg = "No sequences processed!"
+
+    if error_msg:
+        print error_msg
+        stop_err(error_msg)
+
+
+blast2go_classpath = os.path.split(blast2go_jar)[0]
+assert os.path.isdir(blast2go_classpath)
+blast2go_classpath = "%s/*:%s/ext/*:" % (blast2go_classpath, blast2go_classpath)
+
+prepare_xml(xml_file, tmp_xml_file)
+#print "XML file prepared for Blast2GO"
+
+#We will have write access wherever the output should be,
+#so we'll ask Blast2GO to use that as the stem for its output
+#(it will append .annot to the filename)
+cmd = ["java", "-cp", blast2go_classpath, "es.blast2go.prog.B2GAnnotPipe",
+       "-in", tmp_xml_file,
+       "-prop", prop_file,
+       "-out", tabular_file, #Used as base name for output files
+       "-annot", # Generate *.annot tabular file
+       #NOTE: For v2.3.5 must use -a, for v2.5 must use -annot instead
+       #"-img", # Generate images, feature not in v2.3.5
+       ]
+#print " ".join(cmd)
+run(cmd)
+
+#Remove the temp XML file
+os.remove(tmp_xml_file)
+
+out_file = tabular_file + ".annot"
+if not os.path.isfile(out_file):
+    stop_err("ERROR - No output annotation file from Blast2GO")
+
+#Move the output file where Galaxy expects it to be:
+os.rename(out_file, tabular_file)
+
+print "Done"