comparison blastxml_to_tabular_selectable.py @ 0:2bd0cbccb3c6

Uploaded
author galaxyp
date Wed, 08 Oct 2014 19:38:28 -0400
parents
children 5da5dcc5e13a
comparison
equal deleted inserted replaced
-1:000000000000 0:2bd0cbccb3c6
1 #!/usr/bin/env python
2 """Convert a BLAST XML file to 12 column tabular output
3
4 Takes three command line options, input BLAST XML filename, output tabular
5 BLAST filename, output format (std for standard 12 columns, or ext for the
6 extended 25 columns offered in the BLAST+ wrappers).
7
8 The 12 columns output are 'qseqid sseqid pident length mismatch gapopen qstart
9 qend sstart send evalue bitscore' or 'std' at the BLAST+ command line, which
10 mean:
11
12 ====== ========= ============================================
13 Column NCBI name Description
14 ------ --------- --------------------------------------------
15 1 qseqid Query Seq-id (ID of your sequence)
16 2 sseqid Subject Seq-id (ID of the database hit)
17 3 pident Percentage of identical matches
18 4 length Alignment length
19 5 mismatch Number of mismatches
20 6 gapopen Number of gap openings
21 7 qstart Start of alignment in query
22 8 qend End of alignment in query
23 9 sstart Start of alignment in subject (database hit)
24 10 send End of alignment in subject (database hit)
25 11 evalue Expectation value (E-value)
26 12 bitscore Bit score
27 ====== ========= ============================================
28
29 The additional columns offered in the Galaxy BLAST+ wrappers are:
30
31 ====== ============= ===========================================
32 Column NCBI name Description
33 ------ ------------- -------------------------------------------
34 13 sallseqid All subject Seq-id(s), separated by a ';'
35 14 score Raw score
36 15 nident Number of identical matches
37 16 positive Number of positive-scoring matches
38 17 gaps Total number of gaps
39 18 ppos Percentage of positive-scoring matches
40 19 qframe Query frame
41 20 sframe Subject frame
42 21 qseq Aligned part of query sequence
43 22 sseq Aligned part of subject sequence
44 23 qlen Query sequence length
45 24 slen Subject sequence length
46 25 salltitles All subject titles, separated by '<>'
47 ====== ============= ===========================================
48
49 Most of these fields are given explicitly in the XML file, others some like
50 the percentage identity and the number of gap openings must be calculated.
51
52 Be aware that the sequence in the extended tabular output or XML direct from
53 BLAST+ may or may not use XXXX masking on regions of low complexity. This
54 can throw the off the calculation of percentage identity and gap openings.
55 [In fact, both BLAST 2.2.24+ and 2.2.25+ have a subtle bug in this regard,
56 with these numbers changing depending on whether or not the low complexity
57 filter is used.]
58
59 This script attempts to produce identical output to what BLAST+ would have done.
60 However, check this with "diff -b ..." since BLAST+ sometimes includes an extra
61 space character (probably a bug).
62 """
63 import sys
64 import re
65 import os
66 from optparse import OptionParser
67
68 if "-v" in sys.argv or "--version" in sys.argv:
69 print "v0.0.12"
70 sys.exit(0)
71
72 if sys.version_info[:2] >= ( 2, 5 ):
73 try:
74 from xml.etree import cElementTree as ElementTree
75 except ImportError:
76 from xml.etree import ElementTree as ElementTree
77 else:
78 from galaxy import eggs
79 import pkg_resources; pkg_resources.require( "elementtree" )
80 from elementtree import ElementTree
81
82 def stop_err( msg ):
83 sys.stderr.write("%s\n" % msg)
84 sys.exit(1)
85
86 usage = "usage: %prog [options] blastxml[,...]"
87 parser = OptionParser(usage=usage)
88 parser.add_option('-o','--output', dest='output', default = None, help='output file path', metavar="FILE")
89 parser.add_option("-c", "--columns", dest="columns", default='std', help="[std|ext|colname[,colname,...]] std: 12 column, ext: 25 column, or user specified columns")
90 parser.add_option("-a", "--allqueries", action="store_true", dest="allqueries", default=False, help="ouptut row for each query, including those with no hits")
91 parser.add_option("-d", "--qdef", action="store_true", dest="qdef", default=False, help="use Iteration_query-def for qseqid")
92 parser.add_option('-u', '--unmatched', dest='unmatched', default = None, help='unmatched queries file path', metavar="FILE")
93 parser.add_option('-m', '--maxhits', dest='maxhits', type="int", default = 1, help='maximum number of HITs for output for a Query')
94 parser.add_option('-p', '--maxhsps', dest='maxhsps', type="int", default = 1, help='maximum number of HSPs for output for a Hit')
95 (options, args) = parser.parse_args()
96
97 colnames = 'qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,sallseqid,score,nident,positive,gaps,ppos,qframe,sframe,qseq,sseq,qlen,slen,salltitles'.split(',')
98
99 if len(args) < 1:
100 parser.error("no BLASTXML input files")
101
102 if options.columns == "std":
103 out_fmt = 'std'
104 extended = False
105 columns = colnames[0:12]
106 elif options.columns == "x22":
107 stop_err("Format argument x22 has been replaced with ext (extended 25 columns)")
108 elif options.columns == "ext":
109 out_fmt = 'ext'
110 extended = True
111 columns = colnames[0:25]
112 else:
113 out_fmt = 'cols'
114 extended = True
115 sel_cols = options.columns.split(',')
116 columns = []
117 for col in sel_cols:
118 if col in colnames:
119 columns.append(col)
120
121 for in_file in args:
122 if not os.path.isfile(in_file):
123 stop_err("Input BLAST XML file not found: %s" % in_file)
124
125 unhitfile = open(options.unmatched,'w') if options.unmatched else None
126
127 re_default_query_id = re.compile("^Query_\d+$")
128 assert re_default_query_id.match("Query_101")
129 assert not re_default_query_id.match("Query_101a")
130 assert not re_default_query_id.match("MyQuery_101")
131 re_default_subject_id = re.compile("^Subject_\d+$")
132 assert re_default_subject_id.match("Subject_1")
133 assert not re_default_subject_id.match("Subject_")
134 assert not re_default_subject_id.match("Subject_12a")
135 assert not re_default_subject_id.match("TheSubject_1")
136
137 header = "#%s\n" % "\t".join(columns)
138 outfile = open(options.output, 'w') if options.output else sys.stdout
139 outfile.write(header)
140
141 def handle_event(event, elem):
142 blast_program = None
143 if event == "end" and elem.tag == "BlastOutput_program":
144 blast_program = elem.text
145 # for every <Iteration> tag
146 if event == "end" and elem.tag == "Iteration":
147 #Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA
148 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID>
149 # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>
150 # <Iteration_query-len>406</Iteration_query-len>
151 # <Iteration_hits></Iteration_hits>
152 #
153 #Or, from BLAST 2.2.24+ run online
154 # <Iteration_query-ID>Query_1</Iteration_query-ID>
155 # <Iteration_query-def>Sample</Iteration_query-def>
156 # <Iteration_query-len>516</Iteration_query-len>
157 # <Iteration_hits>...
158 qseqid = elem.findtext("Iteration_query-ID")
159 if options.qdef or re_default_query_id.match(qseqid):
160 #Place holder ID, take the first word of the query definition
161 qseqid = elem.findtext("Iteration_query-def").split(None,1)[0]
162 qlen = int(elem.findtext("Iteration_query-len"))
163 # If no hits in this iteration
164 if not elem.find("Iteration_hits/Hit"):
165 if unhitfile:
166 unhitfile.write("%s\n" % qseqid)
167 if options.allqueries:
168 if columns and len(columns) > 0:
169 v = []
170 for name in columns:
171 if name == 'qseqid':
172 v.append(qseqid)
173 elif name == 'qlen':
174 v.append(str(qlen))
175 else:
176 v.append('')
177 outfile.write("\t".join(v) + "\n")
178 # for every <Hit> within <Iteration>
179 for hitn, hit in enumerate(elem.findall("Iteration_hits/Hit")):
180 if hitn >= options.maxhits:
181 break
182 #Expecting either this,
183 # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id>
184 # <Hit_def>RecName: Full=Rhodopsin</Hit_def>
185 # <Hit_accession>P56514</Hit_accession>
186 #or,
187 # <Hit_id>Subject_1</Hit_id>
188 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def>
189 # <Hit_accession>Subject_1</Hit_accession>
190 #
191 #apparently depending on the parse_deflines switch
192 #
193 #Or, with a local database not using -parse_seqids can get this,
194 # <Hit_id>gnl|BL_ORD_ID|2</Hit_id>
195 # <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence</Hit_def>
196 # <Hit_accession>2</Hit_accession>
197 sseqid = hit.findtext("Hit_id").split(None,1)[0]
198 hit_def = sseqid + " " + hit.findtext("Hit_def")
199 if re_default_subject_id.match(sseqid) \
200 and sseqid == hit.findtext("Hit_accession"):
201 #Place holder ID, take the first word of the subject definition
202 hit_def = hit.findtext("Hit_def")
203 sseqid = hit_def.split(None,1)[0]
204 # for every <Hsp> within <Hit>
205 for hspn,hsp in enumerate(hit.findall("Hit_hsps/Hsp")):
206 if hspn >= options.maxhsps:
207 break
208 nident = hsp.findtext("Hsp_identity")
209 length = hsp.findtext("Hsp_align-len")
210 pident = "%0.2f" % (100*float(nident)/float(length))
211
212 q_seq = hsp.findtext("Hsp_qseq")
213 h_seq = hsp.findtext("Hsp_hseq")
214 m_seq = hsp.findtext("Hsp_midline")
215 assert len(q_seq) == len(h_seq) == len(m_seq) == int(length)
216 gapopen = str(len(q_seq.replace('-', ' ').split())-1 + \
217 len(h_seq.replace('-', ' ').split())-1)
218
219 mismatch = m_seq.count(' ') + m_seq.count('+') \
220 - q_seq.count('-') - h_seq.count('-')
221 #TODO - Remove this alternative mismatch calculation and test
222 #once satisifed there are no problems
223 expected_mismatch = len(q_seq) \
224 - sum(1 for q,h in zip(q_seq, h_seq) \
225 if q == h or q == "-" or h == "-")
226 xx = sum(1 for q,h in zip(q_seq, h_seq) if q=="X" and h=="X")
227 if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx):
228 stop_err("%s vs %s mismatches, expected %i <= %i <= %i" \
229 % (qseqid, sseqid, expected_mismatch - q_seq.count("X"),
230 int(mismatch), expected_mismatch))
231
232 #TODO - Remove this alternative identity calculation and test
233 #once satisifed there are no problems
234 expected_identity = sum(1 for q,h in zip(q_seq, h_seq) if q == h)
235 if not (expected_identity - xx <= int(nident) <= expected_identity + q_seq.count("X")):
236 stop_err("%s vs %s identities, expected %i <= %i <= %i" \
237 % (qseqid, sseqid, expected_identity, int(nident),
238 expected_identity + q_seq.count("X")))
239
240
241 evalue = hsp.findtext("Hsp_evalue")
242 if evalue == "0":
243 evalue = "0.0"
244 else:
245 evalue = "%0.0e" % float(evalue)
246
247 bitscore = float(hsp.findtext("Hsp_bit-score"))
248 if bitscore < 100:
249 #Seems to show one decimal place for lower scores
250 bitscore = "%0.1f" % bitscore
251 else:
252 #Note BLAST does not round to nearest int, it truncates
253 bitscore = "%i" % bitscore
254
255 values = [qseqid,
256 sseqid,
257 pident,
258 length, #hsp.findtext("Hsp_align-len")
259 str(mismatch),
260 gapopen,
261 hsp.findtext("Hsp_query-from"), #qstart,
262 hsp.findtext("Hsp_query-to"), #qend,
263 hsp.findtext("Hsp_hit-from"), #sstart,
264 hsp.findtext("Hsp_hit-to"), #send,
265 evalue, #hsp.findtext("Hsp_evalue") in scientific notation
266 bitscore, #hsp.findtext("Hsp_bit-score") rounded
267 ]
268
269 if extended:
270 sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(">"))
271 salltitles = "<>".join(name.split(None,1)[1] for name in hit_def.split(" >"))
272 #print hit_def, "-->", sallseqid
273 positive = hsp.findtext("Hsp_positive")
274 ppos = "%0.2f" % (100*float(positive)/float(length))
275 qframe = hsp.findtext("Hsp_query-frame")
276 sframe = hsp.findtext("Hsp_hit-frame")
277 if blast_program == "blastp":
278 #Probably a bug in BLASTP that they use 0 or 1 depending on format
279 if qframe == "0": qframe = "1"
280 if sframe == "0": sframe = "1"
281 slen = int(hit.findtext("Hit_len"))
282 values.extend([sallseqid,
283 hsp.findtext("Hsp_score"), #score,
284 nident,
285 positive,
286 hsp.findtext("Hsp_gaps"), #gaps,
287 ppos,
288 qframe,
289 sframe,
290 #NOTE - for blastp, XML shows original seq, tabular uses XXX masking
291 q_seq,
292 h_seq,
293 str(qlen),
294 str(slen),
295 salltitles,
296 ])
297 if out_fmt == 'cols':
298 if columns and len(columns) > 0:
299 v = []
300 for name in columns:
301 v.append(values[colnames.index(name)])
302 values = v
303 #print "\t".join(values)
304 outfile.write("\t".join(values) + "\n")
305 # prevents ElementTree from growing large datastructure
306 root.clear()
307 elem.clear()
308
309
310 for in_file in args:
311 # get an iterable
312 try:
313 context = ElementTree.iterparse(in_file, events=("start", "end"))
314 except:
315 stop_err("Invalid data format.")
316 # turn it into an iterator
317 context = iter(context)
318 # get the root element
319 try:
320 event, root = context.next()
321 except:
322 stop_err( "Invalid data format." )
323 for event, elem in context:
324 handle_event(event, elem)
325
326 if unhitfile:
327 unhitfile.close()
328 if options.output:
329 outfile.close()