0
|
1 #!/usr/bin/env python
|
|
2 """Convert a BLAST XML file to 12 column tabular output
|
|
3
|
|
4 Takes three command line options, input BLAST XML filename, output tabular
|
|
5 BLAST filename, output format (std for standard 12 columns, or ext for the
|
|
6 extended 25 columns offered in the BLAST+ wrappers).
|
|
7
|
|
8 The 12 columns output are 'qseqid sseqid pident length mismatch gapopen qstart
|
|
9 qend sstart send evalue bitscore' or 'std' at the BLAST+ command line, which
|
|
10 mean:
|
|
11
|
|
12 ====== ========= ============================================
|
|
13 Column NCBI name Description
|
|
14 ------ --------- --------------------------------------------
|
|
15 1 qseqid Query Seq-id (ID of your sequence)
|
|
16 2 sseqid Subject Seq-id (ID of the database hit)
|
|
17 3 pident Percentage of identical matches
|
|
18 4 length Alignment length
|
|
19 5 mismatch Number of mismatches
|
|
20 6 gapopen Number of gap openings
|
|
21 7 qstart Start of alignment in query
|
|
22 8 qend End of alignment in query
|
|
23 9 sstart Start of alignment in subject (database hit)
|
|
24 10 send End of alignment in subject (database hit)
|
|
25 11 evalue Expectation value (E-value)
|
|
26 12 bitscore Bit score
|
|
27 ====== ========= ============================================
|
|
28
|
|
29 The additional columns offered in the Galaxy BLAST+ wrappers are:
|
|
30
|
|
31 ====== ============= ===========================================
|
|
32 Column NCBI name Description
|
|
33 ------ ------------- -------------------------------------------
|
|
34 13 sallseqid All subject Seq-id(s), separated by a ';'
|
|
35 14 score Raw score
|
|
36 15 nident Number of identical matches
|
|
37 16 positive Number of positive-scoring matches
|
|
38 17 gaps Total number of gaps
|
|
39 18 ppos Percentage of positive-scoring matches
|
|
40 19 qframe Query frame
|
|
41 20 sframe Subject frame
|
|
42 21 qseq Aligned part of query sequence
|
|
43 22 sseq Aligned part of subject sequence
|
|
44 23 qlen Query sequence length
|
|
45 24 slen Subject sequence length
|
|
46 25 salltitles All subject titles, separated by '<>'
|
|
47 ====== ============= ===========================================
|
|
48
|
|
49 Most of these fields are given explicitly in the XML file, others some like
|
|
50 the percentage identity and the number of gap openings must be calculated.
|
|
51
|
|
52 Be aware that the sequence in the extended tabular output or XML direct from
|
|
53 BLAST+ may or may not use XXXX masking on regions of low complexity. This
|
|
54 can throw the off the calculation of percentage identity and gap openings.
|
|
55 [In fact, both BLAST 2.2.24+ and 2.2.25+ have a subtle bug in this regard,
|
|
56 with these numbers changing depending on whether or not the low complexity
|
|
57 filter is used.]
|
|
58
|
|
59 This script attempts to produce identical output to what BLAST+ would have done.
|
|
60 However, check this with "diff -b ..." since BLAST+ sometimes includes an extra
|
|
61 space character (probably a bug).
|
|
62 """
|
|
63 import sys
|
|
64 import re
|
|
65 import os
|
|
66 from optparse import OptionParser
|
|
67
|
|
68 if "-v" in sys.argv or "--version" in sys.argv:
|
|
69 print "v0.0.12"
|
|
70 sys.exit(0)
|
|
71
|
|
72 if sys.version_info[:2] >= ( 2, 5 ):
|
|
73 try:
|
|
74 from xml.etree import cElementTree as ElementTree
|
|
75 except ImportError:
|
|
76 from xml.etree import ElementTree as ElementTree
|
|
77 else:
|
|
78 from galaxy import eggs
|
|
79 import pkg_resources; pkg_resources.require( "elementtree" )
|
|
80 from elementtree import ElementTree
|
|
81
|
|
82 def stop_err( msg ):
|
|
83 sys.stderr.write("%s\n" % msg)
|
|
84 sys.exit(1)
|
|
85
|
|
86 usage = "usage: %prog [options] blastxml[,...]"
|
|
87 parser = OptionParser(usage=usage)
|
|
88 parser.add_option('-o','--output', dest='output', default = None, help='output file path', metavar="FILE")
|
|
89 parser.add_option("-c", "--columns", dest="columns", default='std', help="[std|ext|colname[,colname,...]] std: 12 column, ext: 25 column, or user specified columns")
|
|
90 parser.add_option("-a", "--allqueries", action="store_true", dest="allqueries", default=False, help="ouptut row for each query, including those with no hits")
|
|
91 parser.add_option("-d", "--qdef", action="store_true", dest="qdef", default=False, help="use Iteration_query-def for qseqid")
|
|
92 parser.add_option('-u', '--unmatched', dest='unmatched', default = None, help='unmatched queries file path', metavar="FILE")
|
|
93 parser.add_option('-m', '--maxhits', dest='maxhits', type="int", default = 1, help='maximum number of HITs for output for a Query')
|
|
94 parser.add_option('-p', '--maxhsps', dest='maxhsps', type="int", default = 1, help='maximum number of HSPs for output for a Hit')
|
|
95 (options, args) = parser.parse_args()
|
|
96
|
|
97 colnames = 'qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,sallseqid,score,nident,positive,gaps,ppos,qframe,sframe,qseq,sseq,qlen,slen,salltitles'.split(',')
|
|
98
|
|
99 if len(args) < 1:
|
|
100 parser.error("no BLASTXML input files")
|
|
101
|
|
102 if options.columns == "std":
|
|
103 out_fmt = 'std'
|
|
104 extended = False
|
|
105 columns = colnames[0:12]
|
|
106 elif options.columns == "x22":
|
|
107 stop_err("Format argument x22 has been replaced with ext (extended 25 columns)")
|
|
108 elif options.columns == "ext":
|
|
109 out_fmt = 'ext'
|
|
110 extended = True
|
|
111 columns = colnames[0:25]
|
|
112 else:
|
|
113 out_fmt = 'cols'
|
|
114 extended = True
|
|
115 sel_cols = options.columns.split(',')
|
|
116 columns = []
|
|
117 for col in sel_cols:
|
|
118 if col in colnames:
|
|
119 columns.append(col)
|
|
120
|
|
121 for in_file in args:
|
|
122 if not os.path.isfile(in_file):
|
|
123 stop_err("Input BLAST XML file not found: %s" % in_file)
|
|
124
|
|
125 unhitfile = open(options.unmatched,'w') if options.unmatched else None
|
|
126
|
|
127 re_default_query_id = re.compile("^Query_\d+$")
|
|
128 assert re_default_query_id.match("Query_101")
|
|
129 assert not re_default_query_id.match("Query_101a")
|
|
130 assert not re_default_query_id.match("MyQuery_101")
|
|
131 re_default_subject_id = re.compile("^Subject_\d+$")
|
|
132 assert re_default_subject_id.match("Subject_1")
|
|
133 assert not re_default_subject_id.match("Subject_")
|
|
134 assert not re_default_subject_id.match("Subject_12a")
|
|
135 assert not re_default_subject_id.match("TheSubject_1")
|
|
136
|
|
137 header = "#%s\n" % "\t".join(columns)
|
|
138 outfile = open(options.output, 'w') if options.output else sys.stdout
|
|
139 outfile.write(header)
|
|
140
|
|
141 def handle_event(event, elem):
|
|
142 blast_program = None
|
|
143 if event == "end" and elem.tag == "BlastOutput_program":
|
|
144 blast_program = elem.text
|
|
145 # for every <Iteration> tag
|
|
146 if event == "end" and elem.tag == "Iteration":
|
|
147 #Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA
|
|
148 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID>
|
|
149 # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>
|
|
150 # <Iteration_query-len>406</Iteration_query-len>
|
|
151 # <Iteration_hits></Iteration_hits>
|
|
152 #
|
|
153 #Or, from BLAST 2.2.24+ run online
|
|
154 # <Iteration_query-ID>Query_1</Iteration_query-ID>
|
|
155 # <Iteration_query-def>Sample</Iteration_query-def>
|
|
156 # <Iteration_query-len>516</Iteration_query-len>
|
|
157 # <Iteration_hits>...
|
|
158 qseqid = elem.findtext("Iteration_query-ID")
|
|
159 if options.qdef or re_default_query_id.match(qseqid):
|
|
160 #Place holder ID, take the first word of the query definition
|
|
161 qseqid = elem.findtext("Iteration_query-def").split(None,1)[0]
|
|
162 qlen = int(elem.findtext("Iteration_query-len"))
|
|
163 # If no hits in this iteration
|
|
164 if not elem.find("Iteration_hits/Hit"):
|
|
165 if unhitfile:
|
|
166 unhitfile.write("%s\n" % qseqid)
|
|
167 if options.allqueries:
|
|
168 if columns and len(columns) > 0:
|
|
169 v = []
|
|
170 for name in columns:
|
|
171 if name == 'qseqid':
|
|
172 v.append(qseqid)
|
|
173 elif name == 'qlen':
|
|
174 v.append(str(qlen))
|
|
175 else:
|
|
176 v.append('')
|
|
177 outfile.write("\t".join(v) + "\n")
|
|
178 # for every <Hit> within <Iteration>
|
|
179 for hitn, hit in enumerate(elem.findall("Iteration_hits/Hit")):
|
|
180 if hitn >= options.maxhits:
|
|
181 break
|
|
182 #Expecting either this,
|
|
183 # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id>
|
|
184 # <Hit_def>RecName: Full=Rhodopsin</Hit_def>
|
|
185 # <Hit_accession>P56514</Hit_accession>
|
|
186 #or,
|
|
187 # <Hit_id>Subject_1</Hit_id>
|
|
188 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def>
|
|
189 # <Hit_accession>Subject_1</Hit_accession>
|
|
190 #
|
|
191 #apparently depending on the parse_deflines switch
|
|
192 #
|
|
193 #Or, with a local database not using -parse_seqids can get this,
|
|
194 # <Hit_id>gnl|BL_ORD_ID|2</Hit_id>
|
|
195 # <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence</Hit_def>
|
|
196 # <Hit_accession>2</Hit_accession>
|
|
197 sseqid = hit.findtext("Hit_id").split(None,1)[0]
|
|
198 hit_def = sseqid + " " + hit.findtext("Hit_def")
|
|
199 if re_default_subject_id.match(sseqid) \
|
|
200 and sseqid == hit.findtext("Hit_accession"):
|
|
201 #Place holder ID, take the first word of the subject definition
|
|
202 hit_def = hit.findtext("Hit_def")
|
|
203 sseqid = hit_def.split(None,1)[0]
|
|
204 # for every <Hsp> within <Hit>
|
|
205 for hspn,hsp in enumerate(hit.findall("Hit_hsps/Hsp")):
|
|
206 if hspn >= options.maxhsps:
|
|
207 break
|
|
208 nident = hsp.findtext("Hsp_identity")
|
|
209 length = hsp.findtext("Hsp_align-len")
|
|
210 pident = "%0.2f" % (100*float(nident)/float(length))
|
|
211
|
|
212 q_seq = hsp.findtext("Hsp_qseq")
|
|
213 h_seq = hsp.findtext("Hsp_hseq")
|
|
214 m_seq = hsp.findtext("Hsp_midline")
|
|
215 assert len(q_seq) == len(h_seq) == len(m_seq) == int(length)
|
|
216 gapopen = str(len(q_seq.replace('-', ' ').split())-1 + \
|
|
217 len(h_seq.replace('-', ' ').split())-1)
|
|
218
|
|
219 mismatch = m_seq.count(' ') + m_seq.count('+') \
|
|
220 - q_seq.count('-') - h_seq.count('-')
|
|
221 #TODO - Remove this alternative mismatch calculation and test
|
|
222 #once satisifed there are no problems
|
|
223 expected_mismatch = len(q_seq) \
|
|
224 - sum(1 for q,h in zip(q_seq, h_seq) \
|
|
225 if q == h or q == "-" or h == "-")
|
|
226 xx = sum(1 for q,h in zip(q_seq, h_seq) if q=="X" and h=="X")
|
|
227 if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx):
|
|
228 stop_err("%s vs %s mismatches, expected %i <= %i <= %i" \
|
|
229 % (qseqid, sseqid, expected_mismatch - q_seq.count("X"),
|
|
230 int(mismatch), expected_mismatch))
|
|
231
|
|
232 #TODO - Remove this alternative identity calculation and test
|
|
233 #once satisifed there are no problems
|
|
234 expected_identity = sum(1 for q,h in zip(q_seq, h_seq) if q == h)
|
|
235 if not (expected_identity - xx <= int(nident) <= expected_identity + q_seq.count("X")):
|
|
236 stop_err("%s vs %s identities, expected %i <= %i <= %i" \
|
|
237 % (qseqid, sseqid, expected_identity, int(nident),
|
|
238 expected_identity + q_seq.count("X")))
|
|
239
|
|
240
|
|
241 evalue = hsp.findtext("Hsp_evalue")
|
|
242 if evalue == "0":
|
|
243 evalue = "0.0"
|
|
244 else:
|
|
245 evalue = "%0.0e" % float(evalue)
|
|
246
|
|
247 bitscore = float(hsp.findtext("Hsp_bit-score"))
|
|
248 if bitscore < 100:
|
|
249 #Seems to show one decimal place for lower scores
|
|
250 bitscore = "%0.1f" % bitscore
|
|
251 else:
|
|
252 #Note BLAST does not round to nearest int, it truncates
|
|
253 bitscore = "%i" % bitscore
|
|
254
|
|
255 values = [qseqid,
|
|
256 sseqid,
|
|
257 pident,
|
|
258 length, #hsp.findtext("Hsp_align-len")
|
|
259 str(mismatch),
|
|
260 gapopen,
|
|
261 hsp.findtext("Hsp_query-from"), #qstart,
|
|
262 hsp.findtext("Hsp_query-to"), #qend,
|
|
263 hsp.findtext("Hsp_hit-from"), #sstart,
|
|
264 hsp.findtext("Hsp_hit-to"), #send,
|
|
265 evalue, #hsp.findtext("Hsp_evalue") in scientific notation
|
|
266 bitscore, #hsp.findtext("Hsp_bit-score") rounded
|
|
267 ]
|
|
268
|
|
269 if extended:
|
|
270 sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(">"))
|
|
271 salltitles = "<>".join(name.split(None,1)[1] for name in hit_def.split(" >"))
|
|
272 #print hit_def, "-->", sallseqid
|
|
273 positive = hsp.findtext("Hsp_positive")
|
|
274 ppos = "%0.2f" % (100*float(positive)/float(length))
|
|
275 qframe = hsp.findtext("Hsp_query-frame")
|
|
276 sframe = hsp.findtext("Hsp_hit-frame")
|
|
277 if blast_program == "blastp":
|
|
278 #Probably a bug in BLASTP that they use 0 or 1 depending on format
|
|
279 if qframe == "0": qframe = "1"
|
|
280 if sframe == "0": sframe = "1"
|
|
281 slen = int(hit.findtext("Hit_len"))
|
|
282 values.extend([sallseqid,
|
|
283 hsp.findtext("Hsp_score"), #score,
|
|
284 nident,
|
|
285 positive,
|
|
286 hsp.findtext("Hsp_gaps"), #gaps,
|
|
287 ppos,
|
|
288 qframe,
|
|
289 sframe,
|
|
290 #NOTE - for blastp, XML shows original seq, tabular uses XXX masking
|
|
291 q_seq,
|
|
292 h_seq,
|
|
293 str(qlen),
|
|
294 str(slen),
|
|
295 salltitles,
|
|
296 ])
|
|
297 if out_fmt == 'cols':
|
|
298 if columns and len(columns) > 0:
|
|
299 v = []
|
|
300 for name in columns:
|
|
301 v.append(values[colnames.index(name)])
|
|
302 values = v
|
|
303 #print "\t".join(values)
|
|
304 outfile.write("\t".join(values) + "\n")
|
|
305 # prevents ElementTree from growing large datastructure
|
|
306 root.clear()
|
|
307 elem.clear()
|
|
308
|
|
309
|
|
310 for in_file in args:
|
|
311 # get an iterable
|
|
312 try:
|
|
313 context = ElementTree.iterparse(in_file, events=("start", "end"))
|
|
314 except:
|
|
315 stop_err("Invalid data format.")
|
|
316 # turn it into an iterator
|
|
317 context = iter(context)
|
|
318 # get the root element
|
|
319 try:
|
|
320 event, root = context.next()
|
|
321 except:
|
|
322 stop_err( "Invalid data format." )
|
|
323 for event, elem in context:
|
|
324 handle_event(event, elem)
|
|
325
|
|
326 if unhitfile:
|
|
327 unhitfile.close()
|
|
328 if options.output:
|
|
329 outfile.close()
|