Mercurial > repos > peterjc > blastxml_to_top_descr
annotate tools/blastxml_to_top_descr/blastxml_to_top_descr.py @ 13:8dc4ba7eba5d draft default tip
v0.1.2 with Python 3.9 declaration
author | peterjc |
---|---|
date | Sun, 17 Sep 2023 13:01:56 +0000 |
parents | 98f8431dab44 |
children |
rev | line source |
---|---|
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
1 #!/usr/bin/env python |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
2 """Convert a BLAST XML file to a top hits description table. |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
3 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
4 Takes three command line options, input BLAST XML filename, output tabular |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
5 BLAST filename, number of hits to collect the descriptions of. |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
6 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
7 Assumes the hits are pre-sorted, so "best" 3 hits gives first 3 hits. |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
8 """ |
13 | 9 from __future__ import print_function |
10 | |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
11 import os |
13 | 12 import re |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
13 import sys |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
14 from optparse import OptionParser |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
15 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
16 if "-v" in sys.argv or "--version" in sys.argv: |
13 | 17 print("v0.1.2") |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
18 sys.exit(0) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
19 |
13 | 20 if sys.version_info[:2] >= (2, 5): |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
21 import xml.etree.cElementTree as ElementTree |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
22 else: |
13 | 23 from galaxy import eggs # noqa - ignore flake8 F401 |
24 import pkg_resources | |
25 | |
26 pkg_resources.require("elementtree") | |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
27 from elementtree import ElementTree |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
28 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
29 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
30 usage = """Use as follows: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
31 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
32 $ blastxml_to_top_descr.py [-t 3] -o example.tabular input.xml |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
33 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
34 Or, |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
35 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
36 $ blastxml_to_top_descr.py [--topN 3] --output example.tabular input.xml |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
37 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
38 This will take the top 3 BLAST descriptions from the input BLAST XML file, |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
39 writing them to the specified output file in tabular format. |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
40 """ |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
41 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
42 parser = OptionParser(usage=usage) |
13 | 43 parser.add_option( |
44 "-t", | |
45 "--topN", | |
46 dest="topN", | |
47 default=3, | |
48 help="Number of descriptions to collect (in order from file)", | |
49 ) | |
50 parser.add_option( | |
51 "-o", | |
52 "--output", | |
53 dest="out_file", | |
54 default=None, | |
55 help="Output filename for tabular file", | |
56 metavar="FILE", | |
57 ) | |
58 parser.add_option( | |
59 "-f", | |
60 "--format", | |
61 dest="format", | |
62 default="blastxml", | |
63 help="Input format (blastxml or tabular)", | |
64 ) | |
65 parser.add_option( | |
66 "-q", | |
67 "--qseqid", | |
68 dest="qseqid", | |
69 default="1", | |
70 help="Column for query 'qseqid' (for tabular input; default 1)", | |
71 ) | |
72 parser.add_option( | |
73 "-s", | |
74 "--sseqid", | |
75 dest="sseqid", | |
76 default="2", | |
77 help="Column for subject 'sseqid' (for tabular input; default 2)", | |
78 ) | |
79 parser.add_option( | |
80 "-d", | |
81 "--salltitles", | |
82 dest="salltitles", | |
83 default="25", | |
84 help="Column for descriptions 'salltitles' (for tabular input; default 25)", | |
85 ) | |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
86 (options, args) = parser.parse_args() |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
87 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
88 if len(sys.argv) == 4 and len(args) == 3 and not options.out_file: |
13 | 89 sys.exit( |
90 """The API has changed, replace this: | |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
91 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
92 $ python blastxml_to_top_descr.py input.xml output.tab 3 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
93 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
94 with: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
95 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
96 $ python blastxml_to_top_descr.py -o output.tab -t 3 input.xml |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
97 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
98 Sorry. |
13 | 99 """ |
100 ) | |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
101 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
102 if not args: |
13 | 103 sys.exit("Input filename missing, try -h") |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
104 if len(args) > 1: |
13 | 105 sys.exit("Expects a single argument, one input filename") |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
106 in_file = args[0] |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
107 out_file = options.out_file |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
108 topN = options.topN |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
109 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
110 try: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
111 topN = int(topN) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
112 except ValueError: |
13 | 113 sys.exit("Number of hits argument should be an integer (at least 1)") |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
114 if topN < 1: |
13 | 115 sys.exit("Number of hits argument should be an integer (at least 1)") |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
116 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
117 if not os.path.isfile(in_file): |
13 | 118 sys.exit("Missing input file: %r" % in_file) |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
119 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
120 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
121 def get_column(value): |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
122 """Convert column number on command line to Python index.""" |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
123 if value.startswith("c"): |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
124 # Ignore c prefix, e.g. "c1" for "1" |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
125 value = value[1:] |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
126 try: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
127 col = int(value) |
13 | 128 except ValueError: |
129 sys.exit("Expected an integer column number, not %r" % value) | |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
130 if col < 1: |
13 | 131 sys.exit("Expect column numbers to be at least one, not %r" % value) |
132 return col - 1 # Python counting! | |
133 | |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
134 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
135 def tabular_hits(in_file, qseqid, sseqid, salltitles): |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
136 """Parse key data from tabular BLAST output. |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
137 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
138 Iterator returning tuples (qseqid, list_of_subject_description) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
139 """ |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
140 current_query = None |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
141 current_hits = [] |
13 | 142 with open(in_file) as input_handle: |
143 for line in input_handle: | |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
144 parts = line.rstrip("\n").split("\t") |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
145 query = parts[qseqid] |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
146 descr = "%s %s" % (parts[sseqid], parts[salltitles]) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
147 if current_query is None: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
148 # First hit |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
149 current_query = query |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
150 current_hits = [descr] |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
151 elif current_query == query: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
152 # Another hit |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
153 current_hits.append(descr) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
154 else: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
155 # New query |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
156 yield current_query, current_hits |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
157 current_query = query |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
158 current_hits = [descr] |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
159 if current_query is not None: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
160 # Final query |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
161 yield current_query, current_hits |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
162 |
13 | 163 |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
164 def blastxml_hits(in_file): |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
165 """Parse key data from BLAST XML output. |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
166 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
167 Iterator returning tuples (qseqid, list_of_subject_description) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
168 """ |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
169 try: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
170 context = ElementTree.iterparse(in_file, events=("start", "end")) |
13 | 171 except Exception: |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
172 with open(in_file) as handle: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
173 header = handle.read(100) |
13 | 174 sys.exit( |
175 "Invalid data format in XML file %r which starts: %r" % (in_file, header) | |
176 ) | |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
177 # turn it into an iterator |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
178 context = iter(context) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
179 # get the root element |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
180 try: |
13 | 181 event, root = next(context) |
182 except Exception: | |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
183 with open(in_file) as handle: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
184 header = handle.read(100) |
13 | 185 sys.exit( |
186 "Unable to get root element from XML file %r which starts: %r" | |
187 % (in_file, header) | |
188 ) | |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
189 |
13 | 190 re_default_query_id = re.compile(r"^Query_\d+$") |
191 assert re_default_query_id.match(r"Query_101") | |
192 assert not re_default_query_id.match(r"Query_101a") | |
193 assert not re_default_query_id.match(r"MyQuery_101") | |
194 re_default_subject_id = re.compile(r"^Subject_\d+$") | |
195 assert re_default_subject_id.match(r"Subject_1") | |
196 assert not re_default_subject_id.match(r"Subject_") | |
197 assert not re_default_subject_id.match(r"Subject_12a") | |
198 assert not re_default_subject_id.match(r"TheSubject_1") | |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
199 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
200 current_query = None |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
201 hit_descrs = [] |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
202 for event, elem in context: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
203 # for every <Iteration> tag |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
204 if event == "end" and elem.tag == "Iteration": |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
205 # Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
206 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID> |
13 | 207 # <Iteration_query-def>Endoplasmic reticulum resident protein 44 |
208 # OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def> | |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
209 # <Iteration_query-len>406</Iteration_query-len> |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
210 # <Iteration_hits></Iteration_hits> |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
211 # |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
212 # Or, from BLAST 2.2.24+ run online |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
213 # <Iteration_query-ID>Query_1</Iteration_query-ID> |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
214 # <Iteration_query-def>Sample</Iteration_query-def> |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
215 # <Iteration_query-len>516</Iteration_query-len> |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
216 # <Iteration_hits>... |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
217 qseqid = elem.findtext("Iteration_query-ID") |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
218 if qseqid is None: |
13 | 219 sys.exit( |
220 "Missing <Iteration_query-ID> (could be really old BLAST XML data?)" | |
221 ) | |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
222 if re_default_query_id.match(qseqid): |
13 | 223 # Place holder ID, take the first word of the query definition |
224 qseqid = elem.findtext("Iteration_query-def").split(None, 1)[0] | |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
225 if current_query is None: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
226 # First hit |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
227 current_query = qseqid |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
228 hit_descrs = [] |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
229 elif current_query != qseqid: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
230 # New hit |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
231 yield current_query, hit_descrs |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
232 current_query = qseqid |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
233 hit_descrs = [] |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
234 else: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
235 # Continuation of previous query |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
236 # i.e. This BLAST XML did not use one <Iteration> per query |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
237 # sys.stderr.write("Multiple <Iteration> blocks for %s\n" % qseqid) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
238 pass |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
239 # for every <Hit> within <Iteration> |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
240 for hit in elem.findall("Iteration_hits/Hit"): |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
241 # Expecting either this, |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
242 # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id> |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
243 # <Hit_def>RecName: Full=Rhodopsin</Hit_def> |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
244 # <Hit_accession>P56514</Hit_accession> |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
245 # or, |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
246 # <Hit_id>Subject_1</Hit_id> |
13 | 247 # <Hit_def>gi|57163783|ref|NP_001009242.1| |
248 # rhodopsin [Felis catus]</Hit_def> | |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
249 # <Hit_accession>Subject_1</Hit_accession> |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
250 # |
13 | 251 # apparently depending on the parse_deflines switch |
252 sseqid = hit.findtext("Hit_id").split(None, 1)[0] | |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
253 hit_def = sseqid + " " + hit.findtext("Hit_def") |
13 | 254 if re_default_subject_id.match(sseqid) and sseqid == hit.findtext( |
255 "Hit_accession" | |
256 ): | |
257 # Place holder ID, take the first word of the subject definition | |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
258 hit_def = hit.findtext("Hit_def") |
13 | 259 sseqid = hit_def.split(None, 1)[0] |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
260 assert hit_def not in hit_descrs |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
261 hit_descrs.append(hit_def) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
262 # prevents ElementTree from growing large datastructure |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
263 root.clear() |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
264 elem.clear() |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
265 if current_query is not None: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
266 # Final query |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
267 yield current_query, hit_descrs |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
268 |
13 | 269 |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
270 if options.format == "blastxml": |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
271 hits = blastxml_hits(in_file) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
272 elif options.format == "tabular": |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
273 qseqid = get_column(options.qseqid) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
274 sseqid = get_column(options.sseqid) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
275 salltitles = get_column(options.salltitles) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
276 hits = tabular_hits(in_file, qseqid, sseqid, salltitles) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
277 else: |
13 | 278 sys.exit("Unsupported format: %r" % options.format) |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
279 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
280 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
281 def best_hits(descriptions, topN): |
13 | 282 """Truncate given descriptions list to at most N entries.""" |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
283 if len(descriptions) < topN: |
13 | 284 return descriptions + [""] * (topN - len(descriptions)) |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
285 else: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
286 return descriptions[:topN] |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
287 |
13 | 288 |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
289 count = 0 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
290 if out_file is None: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
291 outfile = sys.stdout |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
292 else: |
13 | 293 outfile = open(out_file, "w") |
294 outfile.write("#Query\t%s\n" % "\t".join("BLAST hit %i" % (i + 1) for i in range(topN))) | |
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
295 for query, descrs in hits: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
296 count += 1 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
297 outfile.write("%s\t%s\n" % (query, "\t".join(best_hits(descrs, topN)))) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
298 if out_file is not None: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
299 outfile.close() |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
300 # Queries with no hits are not present in tabular BLAST output |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
301 print("%i queries with BLAST results" % count) |