comparison tools/ncbi_blast_plus/blastxml_to_top_descr.py @ 0:075fe5424c32 draft

Uploaded v0.0.1
author peterjc
date Thu, 07 Feb 2013 14:56:18 -0500
parents
children 8a0771c5e236
comparison
equal deleted inserted replaced
-1:000000000000 0:075fe5424c32
1 #!/usr/bin/env python
2 """Convert a BLAST XML file to a top hits description table.
3
4 Takes three command line options, input BLAST XML filename, output tabular
5 BLAST filename, number of hits to collect the descriptions of.
6 """
7 import sys
8 import re
9
10 if sys.version_info[:2] >= ( 2, 5 ):
11 import xml.etree.cElementTree as ElementTree
12 else:
13 from galaxy import eggs
14 import pkg_resources; pkg_resources.require( "elementtree" )
15 from elementtree import ElementTree
16
17 def stop_err( msg ):
18 sys.stderr.write("%s\n" % msg)
19 sys.exit(1)
20
21 #Parse Command Line
22 try:
23 in_file, out_file, topN = sys.argv[1:]
24 except:
25 stop_err("Expect 3 arguments: input BLAST XML file, output tabular file, number of hits")
26
27
28 try:
29 topN = int(topN)
30 except ValueError:
31 stop_err("Number of hits argument should be an integer (at least 1)")
32 if topN < 1:
33 stop_err("Number of hits argument should be an integer (at least 1)")
34
35 # get an iterable
36 try:
37 context = ElementTree.iterparse(in_file, events=("start", "end"))
38 except:
39 stop_err("Invalid data format.")
40 # turn it into an iterator
41 context = iter(context)
42 # get the root element
43 try:
44 event, root = context.next()
45 except:
46 stop_err( "Invalid data format." )
47
48
49 re_default_query_id = re.compile("^Query_\d+$")
50 assert re_default_query_id.match("Query_101")
51 assert not re_default_query_id.match("Query_101a")
52 assert not re_default_query_id.match("MyQuery_101")
53 re_default_subject_id = re.compile("^Subject_\d+$")
54 assert re_default_subject_id.match("Subject_1")
55 assert not re_default_subject_id.match("Subject_")
56 assert not re_default_subject_id.match("Subject_12a")
57 assert not re_default_subject_id.match("TheSubject_1")
58
59
60 count = 0
61 outfile = open(out_file, 'w')
62 outfile.write("#Query\t%s\n" % "\t".join("BLAST hit %i" % (i+1) for i in range(topN)))
63 for event, elem in context:
64 # for every <Iteration> tag
65 if event == "end" and elem.tag == "Iteration":
66 #Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA
67 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID>
68 # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>
69 # <Iteration_query-len>406</Iteration_query-len>
70 # <Iteration_hits></Iteration_hits>
71 #
72 #Or, from BLAST 2.2.24+ run online
73 # <Iteration_query-ID>Query_1</Iteration_query-ID>
74 # <Iteration_query-def>Sample</Iteration_query-def>
75 # <Iteration_query-len>516</Iteration_query-len>
76 # <Iteration_hits>...
77 qseqid = elem.findtext("Iteration_query-ID")
78 if qseqid is None:
79 stop_err("Missing <Iteration_query-ID> (could be really old BLAST XML data?)")
80 if re_default_query_id.match(qseqid):
81 #Place holder ID, take the first word of the query definition
82 qseqid = elem.findtext("Iteration_query-def").split(None,1)[0]
83 # for every <Hit> within <Iteration>
84 hit_descrs = []
85 for hit in elem.findall("Iteration_hits/Hit"):
86 #Expecting either this,
87 # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id>
88 # <Hit_def>RecName: Full=Rhodopsin</Hit_def>
89 # <Hit_accession>P56514</Hit_accession>
90 #or,
91 # <Hit_id>Subject_1</Hit_id>
92 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def>
93 # <Hit_accession>Subject_1</Hit_accession>
94 #
95 #apparently depending on the parse_deflines switch
96 sseqid = hit.findtext("Hit_id").split(None,1)[0]
97 hit_def = sseqid + " " + hit.findtext("Hit_def")
98 if re_default_subject_id.match(sseqid) \
99 and sseqid == hit.findtext("Hit_accession"):
100 #Place holder ID, take the first word of the subject definition
101 hit_def = hit.findtext("Hit_def")
102 sseqid = hit_def.split(None,1)[0]
103 assert hit_def not in hit_descrs
104 hit_descrs.append(hit_def)
105 #print "%r has %i hits" % (qseqid, len(hit_descrs))
106 hit_descrs = hit_descrs[:topN]
107 while len(hit_descrs) < topN:
108 hit_descrs.append("")
109 outfile.write("%s\t%s\n" % (qseqid, "\t".join(hit_descrs)))
110 count += 1
111 # prevents ElementTree from growing large datastructure
112 root.clear()
113 elem.clear()
114 outfile.close()
115 print "%i BLAST results" % count