changeset 1:0f159cf346c8

Migrated tool version 0.0.2 from old tool shed archive to new tool shed repository
author peterjc
date Tue, 07 Jun 2011 16:29:28 -0400
parents cd52c931b325
children e88a3246520e
files tools/ncbi_blast_plus/blast2go.py tools/ncbi_blast_plus/blast2go.txt tools/ncbi_blast_plus/blast2go.xml
diffstat 3 files changed, 101 insertions(+), 10 deletions(-) [+]
line wrap: on
line diff
--- a/tools/ncbi_blast_plus/blast2go.py	Tue Jun 07 16:28:31 2011 -0400
+++ b/tools/ncbi_blast_plus/blast2go.py	Tue Jun 07 16:29:28 2011 -0400
@@ -6,8 +6,15 @@
  * Blast2GO properties filename (settings file)
  * Output tabular filename
 
+Sadly b2g4pipe v2.3.5 cannot cope with current style large BLAST XML
+files (e.g. from BLAST 2.2.25+), so we have to reformat these to
+avoid it crashing with a Java heap space OutOfMemoryError.
+
+As part of this reformatting, we check for BLASTP or BLASTX output
+(otherwise raise an error), and print the query count.
+
 It then calls the Java command line tool, and moves the output file to
-the location Galaxy is expecting.
+the location Galaxy is expecting, and removes the tempory XML file.
 """
 import sys
 import os
@@ -27,12 +34,66 @@
 
 xml_file, prop_file, tabular_file = sys.argv[1:]
 
+#We should have write access here:
+tmp_xml_file = tabular_file + ".tmp.xml"
+
 if not os.path.isfile(xml_file):
    stop_err("Input BLAST XML file not found: %s" % xml_file)
 
 if not os.path.isfile(prop_file):
    stop_err("Blast2GO configuration file not found: %s" % prop_file)
 
+def prepare_xml(original_xml, mangled_xml):
+    """Reformat BLAST XML to suit Blast2GO.
+
+    Blast2GO can't cope with 1000s of <Iteration> tags within a
+    single <BlastResult> tag, so instead split this into one
+    full XML record per interation (i.e. per query). This gives
+    a concatenated XML file mimicing old versions of BLAST.
+
+    This also checks for BLASTP or BLASTX output, and outputs
+    the number of queries. Galaxy will show this as "info".
+    """
+    in_handle = open(original_xml)
+    footer = "  </BlastOutput_iterations>\n</BlastOutput>\n"
+    header = ""
+    while True:
+        line = in_handle.readline()
+        if not line:
+            #No hits?
+            stop_err("Problem with XML file?")
+        if line.strip() == "<Iteration>":
+            break
+        header += line
+
+    if "<BlastOutput_program>blastx</BlastOutput_program>" in header:
+        print "BLASTX output identified"
+    elif "<BlastOutput_program>blastp</BlastOutput_program>" in header:
+        print "BLASTP output identified"
+    else:
+        in_handle.close()
+        stop_err("Expect BLASTP or BLASTX output")
+
+    out_handle = open(mangled_xml, "w")
+    out_handle.write(header)
+    out_handle.write(line)
+    count = 1
+    while True:
+        line = in_handle.readline()
+        if not line:
+            break
+        elif line.strip() == "<Iteration>":
+           #Insert footer/header
+           out_handle.write(footer)
+           out_handle.write(header)
+           count += 1
+        out_handle.write(line)
+
+    out_handle.close()
+    in_handle.close()
+    print "Input has %i queries" % count
+
+
 def run(cmd):
     #Avoid using shell=True when we call subprocess to ensure if the Python
     #script is killed, so too is the child process.
@@ -44,10 +105,11 @@
     stdout, stderr = child.communicate()
     return_code = child.returncode
     if return_code:
+        cmd_str = " ".join(cmd)
         if stderr and stdout:
-            stop_err("Return code %i from command:\n%s\n\n%s\n\n%s" % (return_code, err, stdout, stderr))
+            stop_err("Return code %i from command:\n%s\n\n%s\n\n%s" % (return_code, cmd_str, stdout, stderr))
         else:
-            stop_err("Return code %i from command:\n%s\n%s" % (return_code, err, stderr))
+            stop_err("Return code %i from command:\n%s\n%s" % (return_code, cmd_str, stderr))
     #For early diagnostics,
     else:
        print stdout
@@ -56,16 +118,25 @@
 if not os.path.isfile(blast2go_jar):
    stop_err("Blast2GO JAR file not found: %s" % blast2go_jar)
 
-#We will have write access whereever the output should be,
+prepare_xml(xml_file, tmp_xml_file)
+#print "XML file prepared for Blast2GO"
+
+#We will have write access wherever the output should be,
 #so we'll ask Blast2GO to use that as the stem for its output
 #(it will append .annot to the filename)
 cmd = ["java", "-jar", blast2go_jar,
-       "-in", xml_file,
+       "-in", tmp_xml_file,
        "-prop", prop_file,
-       "-out", tabular_file,
-       "-a"]
+       "-out", tabular_file, #Used as base name for output files
+       "-a", # Generate *.annot tabular file
+       #"-img", # Generate images, feature not in v2.3.5
+       ]
+#print " ".join(cmd)
 run(cmd)
 
+#Remove the temp XML file
+os.remove(tmp_xml_file)
+
 out_file = tabular_file + ".annot"
 if not os.path.isfile(out_file):
    stop_err("ERROR - No output annotation file from Blast2GO")
--- a/tools/ncbi_blast_plus/blast2go.txt	Tue Jun 07 16:28:31 2011 -0400
+++ b/tools/ncbi_blast_plus/blast2go.txt	Tue Jun 07 16:29:28 2011 -0400
@@ -76,6 +76,11 @@
 =======
 
 v0.0.1 - Initial public release
+v0.0.2 - Documentation clarifications, e.g. concatenated BLAST XML is allowed.
+       - Fixed error handler in wrapper script (for when b2g4pipe fails).
+       - Reformats the XML to use old NCBI-style concatenated BLAST XML since
+         b2g4pipe crashes with heap space error on with large files using
+         current NCBI output.
 
 
 Developers
--- a/tools/ncbi_blast_plus/blast2go.xml	Tue Jun 07 16:28:31 2011 -0400
+++ b/tools/ncbi_blast_plus/blast2go.xml	Tue Jun 07 16:29:28 2011 -0400
@@ -1,4 +1,4 @@
-<tool id="blast2go" name="Blast2GO" version="0.0.1">
+<tool id="blast2go" name="Blast2GO" version="0.0.2">
     <description>Maps BLAST results to GO annotation terms</description>
     <command interpreter="python">
         blast2go.py $xml ${prop.fields.path} $tab
@@ -35,8 +35,13 @@
 for use in pipelines.
 
 It takes as input BLAST XML results against a protein database, typically
-the NCBI non-redundant (NR) database. The BLAST matches are used to assign
-Gene Ontology (GO) annotation terms to each query sequence.
+the NCBI non-redundant (NR) database. This tool will accept concatenated
+BLAST XML files (although they are technically invalid XML), which is very
+useful if you have sub-divided your protein FASTA files and run BLAST on
+them in batches.
+
+The BLAST matches are used to assign Gene Ontology (GO) annotation terms
+to each query sequence.
 
 The output from this tool is a tabular file containing three columns, with
 the order taken from query order in the original BLAST XML file:
@@ -52,6 +57,16 @@
 Note that if no GO terms are assigned to a sequence (e.g. if it had no
 BLAST matches), then it will not be present in the output file.
 
+
+**Advanced Settings**
+
+Blast2GO has a properties setting file which includes which database
+server to connect to (e.g. the public server in Valencia, Spain, or a
+local server), as well as more advanced options such as thresholds and
+evidence code weights. To change these settings, your Galaxy administrator
+must create a new properties file, and add it to the drop down menu above.
+
+
 **References**
 
 S. Götz et al.