0
|
1 #!/usr/bin/env python
|
|
2 """Convert a BLAST XML file to a top hits description table.
|
|
3
|
|
4 Takes three command line options, input BLAST XML filename, output tabular
|
|
5 BLAST filename, number of hits to collect the descriptions of.
|
|
6 """
|
|
7 import sys
|
|
8 import re
|
|
9
|
|
10 if sys.version_info[:2] >= ( 2, 5 ):
|
|
11 import xml.etree.cElementTree as ElementTree
|
|
12 else:
|
|
13 from galaxy import eggs
|
|
14 import pkg_resources; pkg_resources.require( "elementtree" )
|
|
15 from elementtree import ElementTree
|
|
16
|
|
17 def stop_err( msg ):
|
|
18 sys.stderr.write("%s\n" % msg)
|
|
19 sys.exit(1)
|
|
20
|
|
21 #Parse Command Line
|
|
22 try:
|
|
23 in_file, out_file, topN = sys.argv[1:]
|
|
24 except:
|
|
25 stop_err("Expect 3 arguments: input BLAST XML file, output tabular file, number of hits")
|
|
26
|
|
27
|
|
28 try:
|
|
29 topN = int(topN)
|
|
30 except ValueError:
|
|
31 stop_err("Number of hits argument should be an integer (at least 1)")
|
|
32 if topN < 1:
|
|
33 stop_err("Number of hits argument should be an integer (at least 1)")
|
|
34
|
|
35 # get an iterable
|
|
36 try:
|
|
37 context = ElementTree.iterparse(in_file, events=("start", "end"))
|
|
38 except:
|
|
39 stop_err("Invalid data format.")
|
|
40 # turn it into an iterator
|
|
41 context = iter(context)
|
|
42 # get the root element
|
|
43 try:
|
|
44 event, root = context.next()
|
|
45 except:
|
|
46 stop_err( "Invalid data format." )
|
|
47
|
|
48
|
|
49 re_default_query_id = re.compile("^Query_\d+$")
|
|
50 assert re_default_query_id.match("Query_101")
|
|
51 assert not re_default_query_id.match("Query_101a")
|
|
52 assert not re_default_query_id.match("MyQuery_101")
|
|
53 re_default_subject_id = re.compile("^Subject_\d+$")
|
|
54 assert re_default_subject_id.match("Subject_1")
|
|
55 assert not re_default_subject_id.match("Subject_")
|
|
56 assert not re_default_subject_id.match("Subject_12a")
|
|
57 assert not re_default_subject_id.match("TheSubject_1")
|
|
58
|
|
59
|
|
60 count = 0
|
|
61 outfile = open(out_file, 'w')
|
|
62 outfile.write("#Query\t%s\n" % "\t".join("BLAST hit %i" % (i+1) for i in range(topN)))
|
|
63 for event, elem in context:
|
|
64 # for every <Iteration> tag
|
|
65 if event == "end" and elem.tag == "Iteration":
|
|
66 #Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA
|
|
67 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID>
|
|
68 # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>
|
|
69 # <Iteration_query-len>406</Iteration_query-len>
|
|
70 # <Iteration_hits></Iteration_hits>
|
|
71 #
|
|
72 #Or, from BLAST 2.2.24+ run online
|
|
73 # <Iteration_query-ID>Query_1</Iteration_query-ID>
|
|
74 # <Iteration_query-def>Sample</Iteration_query-def>
|
|
75 # <Iteration_query-len>516</Iteration_query-len>
|
|
76 # <Iteration_hits>...
|
|
77 qseqid = elem.findtext("Iteration_query-ID")
|
|
78 if qseqid is None:
|
|
79 stop_err("Missing <Iteration_query-ID> (could be really old BLAST XML data?)")
|
|
80 if re_default_query_id.match(qseqid):
|
|
81 #Place holder ID, take the first word of the query definition
|
|
82 qseqid = elem.findtext("Iteration_query-def").split(None,1)[0]
|
|
83 # for every <Hit> within <Iteration>
|
|
84 hit_descrs = []
|
|
85 for hit in elem.findall("Iteration_hits/Hit"):
|
|
86 #Expecting either this,
|
|
87 # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id>
|
|
88 # <Hit_def>RecName: Full=Rhodopsin</Hit_def>
|
|
89 # <Hit_accession>P56514</Hit_accession>
|
|
90 #or,
|
|
91 # <Hit_id>Subject_1</Hit_id>
|
|
92 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def>
|
|
93 # <Hit_accession>Subject_1</Hit_accession>
|
|
94 #
|
|
95 #apparently depending on the parse_deflines switch
|
|
96 sseqid = hit.findtext("Hit_id").split(None,1)[0]
|
|
97 hit_def = sseqid + " " + hit.findtext("Hit_def")
|
|
98 if re_default_subject_id.match(sseqid) \
|
|
99 and sseqid == hit.findtext("Hit_accession"):
|
|
100 #Place holder ID, take the first word of the subject definition
|
|
101 hit_def = hit.findtext("Hit_def")
|
|
102 sseqid = hit_def.split(None,1)[0]
|
|
103 assert hit_def not in hit_descrs
|
|
104 hit_descrs.append(hit_def)
|
|
105 #print "%r has %i hits" % (qseqid, len(hit_descrs))
|
|
106 hit_descrs = hit_descrs[:topN]
|
|
107 while len(hit_descrs) < topN:
|
|
108 hit_descrs.append("")
|
|
109 outfile.write("%s\t%s\n" % (qseqid, "\t".join(hit_descrs)))
|
|
110 count += 1
|
|
111 # prevents ElementTree from growing large datastructure
|
|
112 root.clear()
|
|
113 elem.clear()
|
|
114 outfile.close()
|
|
115 print "%i BLAST results" % count
|