comparison tools/ncbi_blast_plus/blastxml_to_tabular.py @ 3:643338ac83c0 draft

Uploaded v0.0.12b, same code but moving folders around to match all my other tools and make future development simpler.
author peterjc
date Thu, 23 Aug 2012 09:00:40 -0400
parents
children 70e7dcbf6573
comparison
equal deleted inserted replaced
2:ab1a8640f817 3:643338ac83c0
1 #!/usr/bin/env python
2 """Convert a BLAST XML file to 12 column tabular output
3
4 Takes three command line options, input BLAST XML filename, output tabular
5 BLAST filename, output format (std for standard 12 columns, or ext for the
6 extended 24 columns offered in the BLAST+ wrappers).
7
8 The 12 columns output are 'qseqid sseqid pident length mismatch gapopen qstart
9 qend sstart send evalue bitscore' or 'std' at the BLAST+ command line, which
10 mean:
11
12 ====== ========= ============================================
13 Column NCBI name Description
14 ------ --------- --------------------------------------------
15 1 qseqid Query Seq-id (ID of your sequence)
16 2 sseqid Subject Seq-id (ID of the database hit)
17 3 pident Percentage of identical matches
18 4 length Alignment length
19 5 mismatch Number of mismatches
20 6 gapopen Number of gap openings
21 7 qstart Start of alignment in query
22 8 qend End of alignment in query
23 9 sstart Start of alignment in subject (database hit)
24 10 send End of alignment in subject (database hit)
25 11 evalue Expectation value (E-value)
26 12 bitscore Bit score
27 ====== ========= ============================================
28
29 The additional columns offered in the Galaxy BLAST+ wrappers are:
30
31 ====== ============= ===========================================
32 Column NCBI name Description
33 ------ ------------- -------------------------------------------
34 13 sallseqid All subject Seq-id(s), separated by a ';'
35 14 score Raw score
36 15 nident Number of identical matches
37 16 positive Number of positive-scoring matches
38 17 gaps Total number of gaps
39 18 ppos Percentage of positive-scoring matches
40 19 qframe Query frame
41 20 sframe Subject frame
42 21 qseq Aligned part of query sequence
43 22 sseq Aligned part of subject sequence
44 23 qlen Query sequence length
45 24 slen Subject sequence length
46 ====== ============= ===========================================
47
48 Most of these fields are given explicitly in the XML file, others some like
49 the percentage identity and the number of gap openings must be calculated.
50
51 Be aware that the sequence in the extended tabular output or XML direct from
52 BLAST+ may or may not use XXXX masking on regions of low complexity. This
53 can throw the off the calculation of percentage identity and gap openings.
54 [In fact, both BLAST 2.2.24+ and 2.2.25+ have a subtle bug in this regard,
55 with these numbers changing depending on whether or not the low complexity
56 filter is used.]
57
58 This script attempts to produce identical output to what BLAST+ would have done.
59 However, check this with "diff -b ..." since BLAST+ sometimes includes an extra
60 space character (probably a bug).
61 """
62 import sys
63 import re
64
65 if sys.version_info[:2] >= ( 2, 5 ):
66 import xml.etree.cElementTree as ElementTree
67 else:
68 from galaxy import eggs
69 import pkg_resources; pkg_resources.require( "elementtree" )
70 from elementtree import ElementTree
71
72 def stop_err( msg ):
73 sys.stderr.write("%s\n" % msg)
74 sys.exit(1)
75
76 #Parse Command Line
77 try:
78 in_file, out_file, out_fmt = sys.argv[1:]
79 except:
80 stop_err("Expect 3 arguments: input BLAST XML file, output tabular file, out format (std or ext)")
81
82 if out_fmt == "std":
83 extended = False
84 elif out_fmt == "x22":
85 stop_err("Format argument x22 has been replaced with ext (extended 24 columns)")
86 elif out_fmt == "ext":
87 extended = True
88 else:
89 stop_err("Format argument should be std (12 column) or ext (extended 24 columns)")
90
91
92 # get an iterable
93 try:
94 context = ElementTree.iterparse(in_file, events=("start", "end"))
95 except:
96 stop_err("Invalid data format.")
97 # turn it into an iterator
98 context = iter(context)
99 # get the root element
100 try:
101 event, root = context.next()
102 except:
103 stop_err( "Invalid data format." )
104
105
106 re_default_query_id = re.compile("^Query_\d+$")
107 assert re_default_query_id.match("Query_101")
108 assert not re_default_query_id.match("Query_101a")
109 assert not re_default_query_id.match("MyQuery_101")
110 re_default_subject_id = re.compile("^Subject_\d+$")
111 assert re_default_subject_id.match("Subject_1")
112 assert not re_default_subject_id.match("Subject_")
113 assert not re_default_subject_id.match("Subject_12a")
114 assert not re_default_subject_id.match("TheSubject_1")
115
116
117 outfile = open(out_file, 'w')
118 blast_program = None
119 for event, elem in context:
120 if event == "end" and elem.tag == "BlastOutput_program":
121 blast_program = elem.text
122 # for every <Iteration> tag
123 if event == "end" and elem.tag == "Iteration":
124 #Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA
125 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID>
126 # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>
127 # <Iteration_query-len>406</Iteration_query-len>
128 # <Iteration_hits></Iteration_hits>
129 #
130 #Or, from BLAST 2.2.24+ run online
131 # <Iteration_query-ID>Query_1</Iteration_query-ID>
132 # <Iteration_query-def>Sample</Iteration_query-def>
133 # <Iteration_query-len>516</Iteration_query-len>
134 # <Iteration_hits>...
135 qseqid = elem.findtext("Iteration_query-ID")
136 if re_default_query_id.match(qseqid):
137 #Place holder ID, take the first word of the query definition
138 qseqid = elem.findtext("Iteration_query-def").split(None,1)[0]
139 qlen = int(elem.findtext("Iteration_query-len"))
140
141 # for every <Hit> within <Iteration>
142 for hit in elem.findall("Iteration_hits/Hit"):
143 #Expecting either this,
144 # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id>
145 # <Hit_def>RecName: Full=Rhodopsin</Hit_def>
146 # <Hit_accession>P56514</Hit_accession>
147 #or,
148 # <Hit_id>Subject_1</Hit_id>
149 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def>
150 # <Hit_accession>Subject_1</Hit_accession>
151 #
152 #apparently depending on the parse_deflines switch
153 sseqid = hit.findtext("Hit_id").split(None,1)[0]
154 hit_def = sseqid + " " + hit.findtext("Hit_def")
155 if re_default_subject_id.match(sseqid) \
156 and sseqid == hit.findtext("Hit_accession"):
157 #Place holder ID, take the first word of the subject definition
158 hit_def = hit.findtext("Hit_def")
159 sseqid = hit_def.split(None,1)[0]
160 # for every <Hsp> within <Hit>
161 for hsp in hit.findall("Hit_hsps/Hsp"):
162 nident = hsp.findtext("Hsp_identity")
163 length = hsp.findtext("Hsp_align-len")
164 pident = "%0.2f" % (100*float(nident)/float(length))
165
166 q_seq = hsp.findtext("Hsp_qseq")
167 h_seq = hsp.findtext("Hsp_hseq")
168 m_seq = hsp.findtext("Hsp_midline")
169 assert len(q_seq) == len(h_seq) == len(m_seq) == int(length)
170 gapopen = str(len(q_seq.replace('-', ' ').split())-1 + \
171 len(h_seq.replace('-', ' ').split())-1)
172
173 mismatch = m_seq.count(' ') + m_seq.count('+') \
174 - q_seq.count('-') - h_seq.count('-')
175 #TODO - Remove this alternative mismatch calculation and test
176 #once satisifed there are no problems
177 expected_mismatch = len(q_seq) \
178 - sum(1 for q,h in zip(q_seq, h_seq) \
179 if q == h or q == "-" or h == "-")
180 xx = sum(1 for q,h in zip(q_seq, h_seq) if q=="X" and h=="X")
181 if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx):
182 stop_err("%s vs %s mismatches, expected %i <= %i <= %i" \
183 % (qseqid, sseqid, expected_mismatch - q_seq.count("X"),
184 int(mismatch), expected_mismatch))
185
186 #TODO - Remove this alternative identity calculation and test
187 #once satisifed there are no problems
188 expected_identity = sum(1 for q,h in zip(q_seq, h_seq) if q == h)
189 if not (expected_identity - xx <= int(nident) <= expected_identity + q_seq.count("X")):
190 stop_err("%s vs %s identities, expected %i <= %i <= %i" \
191 % (qseqid, sseqid, expected_identity, int(nident),
192 expected_identity + q_seq.count("X")))
193
194
195 evalue = hsp.findtext("Hsp_evalue")
196 if evalue == "0":
197 evalue = "0.0"
198 else:
199 evalue = "%0.0e" % float(evalue)
200
201 bitscore = float(hsp.findtext("Hsp_bit-score"))
202 if bitscore < 100:
203 #Seems to show one decimal place for lower scores
204 bitscore = "%0.1f" % bitscore
205 else:
206 #Note BLAST does not round to nearest int, it truncates
207 bitscore = "%i" % bitscore
208
209 values = [qseqid,
210 sseqid,
211 pident,
212 length, #hsp.findtext("Hsp_align-len")
213 str(mismatch),
214 gapopen,
215 hsp.findtext("Hsp_query-from"), #qstart,
216 hsp.findtext("Hsp_query-to"), #qend,
217 hsp.findtext("Hsp_hit-from"), #sstart,
218 hsp.findtext("Hsp_hit-to"), #send,
219 evalue, #hsp.findtext("Hsp_evalue") in scientific notation
220 bitscore, #hsp.findtext("Hsp_bit-score") rounded
221 ]
222
223 if extended:
224 sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(">"))
225 #print hit_def, "-->", sallseqid
226 positive = hsp.findtext("Hsp_positive")
227 ppos = "%0.2f" % (100*float(positive)/float(length))
228 qframe = hsp.findtext("Hsp_query-frame")
229 sframe = hsp.findtext("Hsp_hit-frame")
230 if blast_program == "blastp":
231 #Probably a bug in BLASTP that they use 0 or 1 depending on format
232 if qframe == "0": qframe = "1"
233 if sframe == "0": sframe = "1"
234 slen = int(hit.findtext("Hit_len"))
235 values.extend([sallseqid,
236 hsp.findtext("Hsp_score"), #score,
237 nident,
238 positive,
239 hsp.findtext("Hsp_gaps"), #gaps,
240 ppos,
241 qframe,
242 sframe,
243 #NOTE - for blastp, XML shows original seq, tabular uses XXX masking
244 q_seq,
245 h_seq,
246 str(qlen),
247 str(slen),
248 ])
249 #print "\t".join(values)
250 outfile.write("\t".join(values) + "\n")
251 # prevents ElementTree from growing large datastructure
252 root.clear()
253 elem.clear()
254 outfile.close()