Mercurial > repos > peterjc > blastxml_to_top_descr
comparison tools/ncbi_blast_plus/blastxml_to_top_descr.py @ 0:075fe5424c32 draft
Uploaded v0.0.1
author | peterjc |
---|---|
date | Thu, 07 Feb 2013 14:56:18 -0500 |
parents | |
children | 8a0771c5e236 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:075fe5424c32 |
---|---|
1 #!/usr/bin/env python | |
2 """Convert a BLAST XML file to a top hits description table. | |
3 | |
4 Takes three command line options, input BLAST XML filename, output tabular | |
5 BLAST filename, number of hits to collect the descriptions of. | |
6 """ | |
7 import sys | |
8 import re | |
9 | |
10 if sys.version_info[:2] >= ( 2, 5 ): | |
11 import xml.etree.cElementTree as ElementTree | |
12 else: | |
13 from galaxy import eggs | |
14 import pkg_resources; pkg_resources.require( "elementtree" ) | |
15 from elementtree import ElementTree | |
16 | |
17 def stop_err( msg ): | |
18 sys.stderr.write("%s\n" % msg) | |
19 sys.exit(1) | |
20 | |
21 #Parse Command Line | |
22 try: | |
23 in_file, out_file, topN = sys.argv[1:] | |
24 except: | |
25 stop_err("Expect 3 arguments: input BLAST XML file, output tabular file, number of hits") | |
26 | |
27 | |
28 try: | |
29 topN = int(topN) | |
30 except ValueError: | |
31 stop_err("Number of hits argument should be an integer (at least 1)") | |
32 if topN < 1: | |
33 stop_err("Number of hits argument should be an integer (at least 1)") | |
34 | |
35 # get an iterable | |
36 try: | |
37 context = ElementTree.iterparse(in_file, events=("start", "end")) | |
38 except: | |
39 stop_err("Invalid data format.") | |
40 # turn it into an iterator | |
41 context = iter(context) | |
42 # get the root element | |
43 try: | |
44 event, root = context.next() | |
45 except: | |
46 stop_err( "Invalid data format." ) | |
47 | |
48 | |
49 re_default_query_id = re.compile("^Query_\d+$") | |
50 assert re_default_query_id.match("Query_101") | |
51 assert not re_default_query_id.match("Query_101a") | |
52 assert not re_default_query_id.match("MyQuery_101") | |
53 re_default_subject_id = re.compile("^Subject_\d+$") | |
54 assert re_default_subject_id.match("Subject_1") | |
55 assert not re_default_subject_id.match("Subject_") | |
56 assert not re_default_subject_id.match("Subject_12a") | |
57 assert not re_default_subject_id.match("TheSubject_1") | |
58 | |
59 | |
60 count = 0 | |
61 outfile = open(out_file, 'w') | |
62 outfile.write("#Query\t%s\n" % "\t".join("BLAST hit %i" % (i+1) for i in range(topN))) | |
63 for event, elem in context: | |
64 # for every <Iteration> tag | |
65 if event == "end" and elem.tag == "Iteration": | |
66 #Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA | |
67 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID> | |
68 # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def> | |
69 # <Iteration_query-len>406</Iteration_query-len> | |
70 # <Iteration_hits></Iteration_hits> | |
71 # | |
72 #Or, from BLAST 2.2.24+ run online | |
73 # <Iteration_query-ID>Query_1</Iteration_query-ID> | |
74 # <Iteration_query-def>Sample</Iteration_query-def> | |
75 # <Iteration_query-len>516</Iteration_query-len> | |
76 # <Iteration_hits>... | |
77 qseqid = elem.findtext("Iteration_query-ID") | |
78 if qseqid is None: | |
79 stop_err("Missing <Iteration_query-ID> (could be really old BLAST XML data?)") | |
80 if re_default_query_id.match(qseqid): | |
81 #Place holder ID, take the first word of the query definition | |
82 qseqid = elem.findtext("Iteration_query-def").split(None,1)[0] | |
83 # for every <Hit> within <Iteration> | |
84 hit_descrs = [] | |
85 for hit in elem.findall("Iteration_hits/Hit"): | |
86 #Expecting either this, | |
87 # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id> | |
88 # <Hit_def>RecName: Full=Rhodopsin</Hit_def> | |
89 # <Hit_accession>P56514</Hit_accession> | |
90 #or, | |
91 # <Hit_id>Subject_1</Hit_id> | |
92 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def> | |
93 # <Hit_accession>Subject_1</Hit_accession> | |
94 # | |
95 #apparently depending on the parse_deflines switch | |
96 sseqid = hit.findtext("Hit_id").split(None,1)[0] | |
97 hit_def = sseqid + " " + hit.findtext("Hit_def") | |
98 if re_default_subject_id.match(sseqid) \ | |
99 and sseqid == hit.findtext("Hit_accession"): | |
100 #Place holder ID, take the first word of the subject definition | |
101 hit_def = hit.findtext("Hit_def") | |
102 sseqid = hit_def.split(None,1)[0] | |
103 assert hit_def not in hit_descrs | |
104 hit_descrs.append(hit_def) | |
105 #print "%r has %i hits" % (qseqid, len(hit_descrs)) | |
106 hit_descrs = hit_descrs[:topN] | |
107 while len(hit_descrs) < topN: | |
108 hit_descrs.append("") | |
109 outfile.write("%s\t%s\n" % (qseqid, "\t".join(hit_descrs))) | |
110 count += 1 | |
111 # prevents ElementTree from growing large datastructure | |
112 root.clear() | |
113 elem.clear() | |
114 outfile.close() | |
115 print "%i BLAST results" % count |