comparison tools/blast2go/blast2go.py @ 5:e4419efbefad draft

Uploaded v0.0.6 of wrapper, which now supports b2g4pipe v2.5 (and no longer works with b2g4pipe v2.3.5). See manual install instructions!
author peterjc
date Fri, 22 Feb 2013 08:47:27 -0500
parents
children
comparison
equal deleted inserted replaced
4:c8c04cd07ca0 5:e4419efbefad
1 #!/usr/bin/env python
2 """Galaxy wrapper for Blast2GO for pipelines, b2g4pipe v2.5.
3
4 This script takes exactly three command line arguments:
5 * Input BLAST XML filename
6 * Blast2GO properties filename (settings file)
7 * Output tabular filename
8
9 The properties filename can be a fully qualified path, but if not
10 this will look next to the blast2go.jar file.
11
12 Sadly b2g4pipe (at least v2.3.5 to v2.5.0) cannot cope with current
13 style large BLAST XML files (e.g. from BLAST 2.2.25+), so we reformat
14 these to avoid it crashing with a Java heap space OutOfMemoryError.
15
16 As part of this reformatting, we check for BLASTP or BLASTX output
17 (otherwise raise an error), and print the query count.
18
19 It then calls the Java command line tool, and moves the output file to
20 the location Galaxy is expecting, and removes the tempory XML file.
21 """
22 import sys
23 import os
24 import subprocess
25
26 #You may need to edit this to match your local setup,
27 #blast2go_jar = "/opt/b2g4pipe/blast2go.jar"
28 blast2go_jar = "/opt/b2g4pipe_v2.5/blast2go.jar"
29
30
31 def stop_err(msg, error_level=1):
32 """Print error message to stdout and quit with given error level."""
33 sys.stderr.write("%s\n" % msg)
34 sys.exit(error_level)
35
36 if len(sys.argv) != 4:
37 stop_err("Require three arguments: XML filename, properties filename, output tabular filename")
38
39 xml_file, prop_file, tabular_file = sys.argv[1:]
40
41 #We should have write access here:
42 tmp_xml_file = tabular_file + ".tmp.xml"
43
44 if not os.path.isfile(blast2go_jar):
45 stop_err("Blast2GO JAR file not found: %s" % blast2go_jar)
46
47 if not os.path.isfile(xml_file):
48 stop_err("Input BLAST XML file not found: %s" % xml_file)
49
50 if not os.path.isfile(prop_file):
51 tmp = os.path.join(os.path.split(blast2go_jar)[0], prop_file)
52 if os.path.isfile(tmp):
53 #The properties file seems to have been given relative to the JAR
54 prop_file = tmp
55 else:
56 stop_err("Blast2GO configuration file not found: %s" % prop_file)
57 del tmp
58
59 def prepare_xml(original_xml, mangled_xml):
60 """Reformat BLAST XML to suit Blast2GO.
61
62 Blast2GO can't cope with 1000s of <Iteration> tags within a
63 single <BlastResult> tag, so instead split this into one
64 full XML record per interation (i.e. per query). This gives
65 a concatenated XML file mimicing old versions of BLAST.
66
67 This also checks for BLASTP or BLASTX output, and outputs
68 the number of queries. Galaxy will show this as "info".
69 """
70 in_handle = open(original_xml)
71 footer = " </BlastOutput_iterations>\n</BlastOutput>\n"
72 header = ""
73 while True:
74 line = in_handle.readline()
75 if not line:
76 #No hits?
77 stop_err("Problem with XML file?")
78 if line.strip() == "<Iteration>":
79 break
80 header += line
81
82 if "<BlastOutput_program>blastx</BlastOutput_program>" in header:
83 print "BLASTX output identified"
84 elif "<BlastOutput_program>blastp</BlastOutput_program>" in header:
85 print "BLASTP output identified"
86 else:
87 in_handle.close()
88 stop_err("Expect BLASTP or BLASTX output")
89
90 out_handle = open(mangled_xml, "w")
91 out_handle.write(header)
92 out_handle.write(line)
93 count = 1
94 while True:
95 line = in_handle.readline()
96 if not line:
97 break
98 elif line.strip() == "<Iteration>":
99 #Insert footer/header
100 out_handle.write(footer)
101 out_handle.write(header)
102 count += 1
103 out_handle.write(line)
104
105 out_handle.close()
106 in_handle.close()
107 print "Input has %i queries" % count
108
109
110 def run(cmd):
111 #Avoid using shell=True when we call subprocess to ensure if the Python
112 #script is killed, so too is the child process.
113 try:
114 child = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
115 except Exception, err:
116 stop_err("Error invoking command:\n%s\n\n%s\n" % (" ".join(cmd), err))
117 #Use .communicate as can get deadlocks with .wait(),
118 stdout, stderr = child.communicate()
119 return_code = child.returncode
120
121 #keep stdout minimal as shown prominently in Galaxy
122 #Record it in case a silent error needs diagnosis
123 if stdout:
124 sys.stderr.write("Standard out:\n%s\n\n" % stdout)
125 if stderr:
126 sys.stderr.write("Standard error:\n%s\n\n" % stderr)
127
128 error_msg = None
129 if return_code:
130 cmd_str = " ".join(cmd)
131 error_msg = "Return code %i from command:\n%s" % (return_code, cmd_str)
132 elif "Database or network connection (timeout) error" in stdout+stderr:
133 error_msg = "Database or network connection (timeout) error"
134 elif "Annotation of 0 seqs with 0 annots finished." in stdout+stderr:
135 error_msg = "No sequences processed!"
136
137 if error_msg:
138 print error_msg
139 stop_err(error_msg)
140
141
142 blast2go_classpath = os.path.split(blast2go_jar)[0]
143 assert os.path.isdir(blast2go_classpath)
144 blast2go_classpath = "%s/*:%s/ext/*:" % (blast2go_classpath, blast2go_classpath)
145
146 prepare_xml(xml_file, tmp_xml_file)
147 #print "XML file prepared for Blast2GO"
148
149 #We will have write access wherever the output should be,
150 #so we'll ask Blast2GO to use that as the stem for its output
151 #(it will append .annot to the filename)
152 cmd = ["java", "-cp", blast2go_classpath, "es.blast2go.prog.B2GAnnotPipe",
153 "-in", tmp_xml_file,
154 "-prop", prop_file,
155 "-out", tabular_file, #Used as base name for output files
156 "-annot", # Generate *.annot tabular file
157 #NOTE: For v2.3.5 must use -a, for v2.5 must use -annot instead
158 #"-img", # Generate images, feature not in v2.3.5
159 ]
160 #print " ".join(cmd)
161 run(cmd)
162
163 #Remove the temp XML file
164 os.remove(tmp_xml_file)
165
166 out_file = tabular_file + ".annot"
167 if not os.path.isfile(out_file):
168 stop_err("ERROR - No output annotation file from Blast2GO")
169
170 #Move the output file where Galaxy expects it to be:
171 os.rename(out_file, tabular_file)
172
173 print "Done"