Mercurial > repos > peterjc > blastxml_to_top_descr
annotate blastxml_to_top_descr/blastxml_to_top_descr.py @ 9:6aafa0ced802 draft
Uploaded v0.0.8, moved development to GitHub; no functional changes
author | peterjc |
---|---|
date | Tue, 30 Jul 2013 08:07:28 -0400 |
parents | |
children |
rev | line source |
---|---|
9
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
1 #!/usr/bin/env python |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
2 """Convert a BLAST XML file to a top hits description table. |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
3 |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
4 Takes three command line options, input BLAST XML filename, output tabular |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
5 BLAST filename, number of hits to collect the descriptions of. |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
6 """ |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
7 import sys |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
8 import re |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
9 |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
10 if "-v" in sys.argv or "--version" in sys.argv: |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
11 print "v0.0.5" |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
12 sys.exit(0) |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
13 |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
14 if sys.version_info[:2] >= ( 2, 5 ): |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
15 import xml.etree.cElementTree as ElementTree |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
16 else: |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
17 from galaxy import eggs |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
18 import pkg_resources; pkg_resources.require( "elementtree" ) |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
19 from elementtree import ElementTree |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
20 |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
21 def stop_err( msg ): |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
22 sys.stderr.write("%s\n" % msg) |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
23 sys.exit(1) |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
24 |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
25 #Parse Command Line |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
26 try: |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
27 in_file, out_file, topN = sys.argv[1:] |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
28 except: |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
29 stop_err("Expect 3 arguments: input BLAST XML file, output tabular file, number of hits") |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
30 |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
31 |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
32 try: |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
33 topN = int(topN) |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
34 except ValueError: |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
35 stop_err("Number of hits argument should be an integer (at least 1)") |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
36 if topN < 1: |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
37 stop_err("Number of hits argument should be an integer (at least 1)") |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
38 |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
39 # get an iterable |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
40 try: |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
41 context = ElementTree.iterparse(in_file, events=("start", "end")) |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
42 except: |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
43 stop_err("Invalid data format.") |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
44 # turn it into an iterator |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
45 context = iter(context) |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
46 # get the root element |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
47 try: |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
48 event, root = context.next() |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
49 except: |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
50 stop_err( "Invalid data format." ) |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
51 |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
52 |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
53 re_default_query_id = re.compile("^Query_\d+$") |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
54 assert re_default_query_id.match("Query_101") |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
55 assert not re_default_query_id.match("Query_101a") |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
56 assert not re_default_query_id.match("MyQuery_101") |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
57 re_default_subject_id = re.compile("^Subject_\d+$") |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
58 assert re_default_subject_id.match("Subject_1") |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
59 assert not re_default_subject_id.match("Subject_") |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
60 assert not re_default_subject_id.match("Subject_12a") |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
61 assert not re_default_subject_id.match("TheSubject_1") |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
62 |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
63 |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
64 count = 0 |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
65 pos_count = 0 |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
66 outfile = open(out_file, 'w') |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
67 outfile.write("#Query\t%s\n" % "\t".join("BLAST hit %i" % (i+1) for i in range(topN))) |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
68 for event, elem in context: |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
69 # for every <Iteration> tag |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
70 if event == "end" and elem.tag == "Iteration": |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
71 #Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
72 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID> |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
73 # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def> |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
74 # <Iteration_query-len>406</Iteration_query-len> |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
75 # <Iteration_hits></Iteration_hits> |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
76 # |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
77 #Or, from BLAST 2.2.24+ run online |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
78 # <Iteration_query-ID>Query_1</Iteration_query-ID> |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
79 # <Iteration_query-def>Sample</Iteration_query-def> |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
80 # <Iteration_query-len>516</Iteration_query-len> |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
81 # <Iteration_hits>... |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
82 qseqid = elem.findtext("Iteration_query-ID") |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
83 if qseqid is None: |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
84 stop_err("Missing <Iteration_query-ID> (could be really old BLAST XML data?)") |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
85 if re_default_query_id.match(qseqid): |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
86 #Place holder ID, take the first word of the query definition |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
87 qseqid = elem.findtext("Iteration_query-def").split(None,1)[0] |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
88 # for every <Hit> within <Iteration> |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
89 hit_descrs = [] |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
90 for hit in elem.findall("Iteration_hits/Hit"): |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
91 #Expecting either this, |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
92 # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id> |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
93 # <Hit_def>RecName: Full=Rhodopsin</Hit_def> |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
94 # <Hit_accession>P56514</Hit_accession> |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
95 #or, |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
96 # <Hit_id>Subject_1</Hit_id> |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
97 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def> |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
98 # <Hit_accession>Subject_1</Hit_accession> |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
99 # |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
100 #apparently depending on the parse_deflines switch |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
101 sseqid = hit.findtext("Hit_id").split(None,1)[0] |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
102 hit_def = sseqid + " " + hit.findtext("Hit_def") |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
103 if re_default_subject_id.match(sseqid) \ |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
104 and sseqid == hit.findtext("Hit_accession"): |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
105 #Place holder ID, take the first word of the subject definition |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
106 hit_def = hit.findtext("Hit_def") |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
107 sseqid = hit_def.split(None,1)[0] |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
108 assert hit_def not in hit_descrs |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
109 hit_descrs.append(hit_def) |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
110 #print "%r has %i hits" % (qseqid, len(hit_descrs)) |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
111 if hit_descrs: |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
112 pos_count += 1 |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
113 hit_descrs = hit_descrs[:topN] |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
114 while len(hit_descrs) < topN: |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
115 hit_descrs.append("") |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
116 outfile.write("%s\t%s\n" % (qseqid, "\t".join(hit_descrs))) |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
117 count += 1 |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
118 # prevents ElementTree from growing large datastructure |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
119 root.clear() |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
120 elem.clear() |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
121 outfile.close() |
6aafa0ced802
Uploaded v0.0.8, moved development to GitHub; no functional changes
peterjc
parents:
diff
changeset
|
122 print "Of %i queries, %i had BLAST results" % (count, pos_count) |