Mercurial > repos > peterjc > blast2go
comparison tools/blast2go/blast2go.py @ 5:e4419efbefad draft
Uploaded v0.0.6 of wrapper, which now supports b2g4pipe v2.5 (and no longer works with b2g4pipe v2.3.5). See manual install instructions!
author | peterjc |
---|---|
date | Fri, 22 Feb 2013 08:47:27 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
4:c8c04cd07ca0 | 5:e4419efbefad |
---|---|
1 #!/usr/bin/env python | |
2 """Galaxy wrapper for Blast2GO for pipelines, b2g4pipe v2.5. | |
3 | |
4 This script takes exactly three command line arguments: | |
5 * Input BLAST XML filename | |
6 * Blast2GO properties filename (settings file) | |
7 * Output tabular filename | |
8 | |
9 The properties filename can be a fully qualified path, but if not | |
10 this will look next to the blast2go.jar file. | |
11 | |
12 Sadly b2g4pipe (at least v2.3.5 to v2.5.0) cannot cope with current | |
13 style large BLAST XML files (e.g. from BLAST 2.2.25+), so we reformat | |
14 these to avoid it crashing with a Java heap space OutOfMemoryError. | |
15 | |
16 As part of this reformatting, we check for BLASTP or BLASTX output | |
17 (otherwise raise an error), and print the query count. | |
18 | |
19 It then calls the Java command line tool, and moves the output file to | |
20 the location Galaxy is expecting, and removes the tempory XML file. | |
21 """ | |
22 import sys | |
23 import os | |
24 import subprocess | |
25 | |
26 #You may need to edit this to match your local setup, | |
27 #blast2go_jar = "/opt/b2g4pipe/blast2go.jar" | |
28 blast2go_jar = "/opt/b2g4pipe_v2.5/blast2go.jar" | |
29 | |
30 | |
31 def stop_err(msg, error_level=1): | |
32 """Print error message to stdout and quit with given error level.""" | |
33 sys.stderr.write("%s\n" % msg) | |
34 sys.exit(error_level) | |
35 | |
36 if len(sys.argv) != 4: | |
37 stop_err("Require three arguments: XML filename, properties filename, output tabular filename") | |
38 | |
39 xml_file, prop_file, tabular_file = sys.argv[1:] | |
40 | |
41 #We should have write access here: | |
42 tmp_xml_file = tabular_file + ".tmp.xml" | |
43 | |
44 if not os.path.isfile(blast2go_jar): | |
45 stop_err("Blast2GO JAR file not found: %s" % blast2go_jar) | |
46 | |
47 if not os.path.isfile(xml_file): | |
48 stop_err("Input BLAST XML file not found: %s" % xml_file) | |
49 | |
50 if not os.path.isfile(prop_file): | |
51 tmp = os.path.join(os.path.split(blast2go_jar)[0], prop_file) | |
52 if os.path.isfile(tmp): | |
53 #The properties file seems to have been given relative to the JAR | |
54 prop_file = tmp | |
55 else: | |
56 stop_err("Blast2GO configuration file not found: %s" % prop_file) | |
57 del tmp | |
58 | |
59 def prepare_xml(original_xml, mangled_xml): | |
60 """Reformat BLAST XML to suit Blast2GO. | |
61 | |
62 Blast2GO can't cope with 1000s of <Iteration> tags within a | |
63 single <BlastResult> tag, so instead split this into one | |
64 full XML record per interation (i.e. per query). This gives | |
65 a concatenated XML file mimicing old versions of BLAST. | |
66 | |
67 This also checks for BLASTP or BLASTX output, and outputs | |
68 the number of queries. Galaxy will show this as "info". | |
69 """ | |
70 in_handle = open(original_xml) | |
71 footer = " </BlastOutput_iterations>\n</BlastOutput>\n" | |
72 header = "" | |
73 while True: | |
74 line = in_handle.readline() | |
75 if not line: | |
76 #No hits? | |
77 stop_err("Problem with XML file?") | |
78 if line.strip() == "<Iteration>": | |
79 break | |
80 header += line | |
81 | |
82 if "<BlastOutput_program>blastx</BlastOutput_program>" in header: | |
83 print "BLASTX output identified" | |
84 elif "<BlastOutput_program>blastp</BlastOutput_program>" in header: | |
85 print "BLASTP output identified" | |
86 else: | |
87 in_handle.close() | |
88 stop_err("Expect BLASTP or BLASTX output") | |
89 | |
90 out_handle = open(mangled_xml, "w") | |
91 out_handle.write(header) | |
92 out_handle.write(line) | |
93 count = 1 | |
94 while True: | |
95 line = in_handle.readline() | |
96 if not line: | |
97 break | |
98 elif line.strip() == "<Iteration>": | |
99 #Insert footer/header | |
100 out_handle.write(footer) | |
101 out_handle.write(header) | |
102 count += 1 | |
103 out_handle.write(line) | |
104 | |
105 out_handle.close() | |
106 in_handle.close() | |
107 print "Input has %i queries" % count | |
108 | |
109 | |
110 def run(cmd): | |
111 #Avoid using shell=True when we call subprocess to ensure if the Python | |
112 #script is killed, so too is the child process. | |
113 try: | |
114 child = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) | |
115 except Exception, err: | |
116 stop_err("Error invoking command:\n%s\n\n%s\n" % (" ".join(cmd), err)) | |
117 #Use .communicate as can get deadlocks with .wait(), | |
118 stdout, stderr = child.communicate() | |
119 return_code = child.returncode | |
120 | |
121 #keep stdout minimal as shown prominently in Galaxy | |
122 #Record it in case a silent error needs diagnosis | |
123 if stdout: | |
124 sys.stderr.write("Standard out:\n%s\n\n" % stdout) | |
125 if stderr: | |
126 sys.stderr.write("Standard error:\n%s\n\n" % stderr) | |
127 | |
128 error_msg = None | |
129 if return_code: | |
130 cmd_str = " ".join(cmd) | |
131 error_msg = "Return code %i from command:\n%s" % (return_code, cmd_str) | |
132 elif "Database or network connection (timeout) error" in stdout+stderr: | |
133 error_msg = "Database or network connection (timeout) error" | |
134 elif "Annotation of 0 seqs with 0 annots finished." in stdout+stderr: | |
135 error_msg = "No sequences processed!" | |
136 | |
137 if error_msg: | |
138 print error_msg | |
139 stop_err(error_msg) | |
140 | |
141 | |
142 blast2go_classpath = os.path.split(blast2go_jar)[0] | |
143 assert os.path.isdir(blast2go_classpath) | |
144 blast2go_classpath = "%s/*:%s/ext/*:" % (blast2go_classpath, blast2go_classpath) | |
145 | |
146 prepare_xml(xml_file, tmp_xml_file) | |
147 #print "XML file prepared for Blast2GO" | |
148 | |
149 #We will have write access wherever the output should be, | |
150 #so we'll ask Blast2GO to use that as the stem for its output | |
151 #(it will append .annot to the filename) | |
152 cmd = ["java", "-cp", blast2go_classpath, "es.blast2go.prog.B2GAnnotPipe", | |
153 "-in", tmp_xml_file, | |
154 "-prop", prop_file, | |
155 "-out", tabular_file, #Used as base name for output files | |
156 "-annot", # Generate *.annot tabular file | |
157 #NOTE: For v2.3.5 must use -a, for v2.5 must use -annot instead | |
158 #"-img", # Generate images, feature not in v2.3.5 | |
159 ] | |
160 #print " ".join(cmd) | |
161 run(cmd) | |
162 | |
163 #Remove the temp XML file | |
164 os.remove(tmp_xml_file) | |
165 | |
166 out_file = tabular_file + ".annot" | |
167 if not os.path.isfile(out_file): | |
168 stop_err("ERROR - No output annotation file from Blast2GO") | |
169 | |
170 #Move the output file where Galaxy expects it to be: | |
171 os.rename(out_file, tabular_file) | |
172 | |
173 print "Done" |