Mercurial > repos > peterjc > blastxml_to_top_descr
comparison tools/blastxml_to_top_descr/blastxml_to_top_descr.py @ 13:8dc4ba7eba5d draft default tip
v0.1.2 with Python 3.9 declaration
author | peterjc |
---|---|
date | Sun, 17 Sep 2023 13:01:56 +0000 |
parents | 98f8431dab44 |
children |
comparison
equal
deleted
inserted
replaced
12:fe1ed74793c9 | 13:8dc4ba7eba5d |
---|---|
4 Takes three command line options, input BLAST XML filename, output tabular | 4 Takes three command line options, input BLAST XML filename, output tabular |
5 BLAST filename, number of hits to collect the descriptions of. | 5 BLAST filename, number of hits to collect the descriptions of. |
6 | 6 |
7 Assumes the hits are pre-sorted, so "best" 3 hits gives first 3 hits. | 7 Assumes the hits are pre-sorted, so "best" 3 hits gives first 3 hits. |
8 """ | 8 """ |
9 from __future__ import print_function | |
10 | |
9 import os | 11 import os |
12 import re | |
10 import sys | 13 import sys |
11 import re | |
12 from optparse import OptionParser | 14 from optparse import OptionParser |
13 | 15 |
14 if "-v" in sys.argv or "--version" in sys.argv: | 16 if "-v" in sys.argv or "--version" in sys.argv: |
15 print "v0.1.0" | 17 print("v0.1.2") |
16 sys.exit(0) | 18 sys.exit(0) |
17 | 19 |
18 if sys.version_info[:2] >= ( 2, 5 ): | 20 if sys.version_info[:2] >= (2, 5): |
19 import xml.etree.cElementTree as ElementTree | 21 import xml.etree.cElementTree as ElementTree |
20 else: | 22 else: |
21 from galaxy import eggs | 23 from galaxy import eggs # noqa - ignore flake8 F401 |
22 import pkg_resources; pkg_resources.require( "elementtree" ) | 24 import pkg_resources |
25 | |
26 pkg_resources.require("elementtree") | |
23 from elementtree import ElementTree | 27 from elementtree import ElementTree |
24 | 28 |
25 def stop_err( msg ): | |
26 sys.stderr.write("%s\n" % msg) | |
27 sys.exit(1) | |
28 | 29 |
29 usage = """Use as follows: | 30 usage = """Use as follows: |
30 | 31 |
31 $ blastxml_to_top_descr.py [-t 3] -o example.tabular input.xml | 32 $ blastxml_to_top_descr.py [-t 3] -o example.tabular input.xml |
32 | 33 |
37 This will take the top 3 BLAST descriptions from the input BLAST XML file, | 38 This will take the top 3 BLAST descriptions from the input BLAST XML file, |
38 writing them to the specified output file in tabular format. | 39 writing them to the specified output file in tabular format. |
39 """ | 40 """ |
40 | 41 |
41 parser = OptionParser(usage=usage) | 42 parser = OptionParser(usage=usage) |
42 parser.add_option("-t", "--topN", dest="topN", default=3, | 43 parser.add_option( |
43 help="Number of descriptions to collect (in order from file)") | 44 "-t", |
44 parser.add_option("-o", "--output", dest="out_file", default=None, | 45 "--topN", |
45 help="Output filename for tabular file", | 46 dest="topN", |
46 metavar="FILE") | 47 default=3, |
47 parser.add_option("-f", "--format", dest="format", default="blastxml", | 48 help="Number of descriptions to collect (in order from file)", |
48 help="Input format (blastxml or tabular)") | 49 ) |
49 parser.add_option("-q", "--qseqid", dest="qseqid", default="1", | 50 parser.add_option( |
50 help="Column for query 'qseqid' (for tabular input; default 1)") | 51 "-o", |
51 parser.add_option("-s", "--sseqid", dest="sseqid", default="2", | 52 "--output", |
52 help="Column for subject 'sseqid' (for tabular input; default 2)") | 53 dest="out_file", |
53 parser.add_option("-d", "--salltitles", dest="salltitles", default="25", | 54 default=None, |
54 help="Column for descriptions 'salltitles' (for tabular input; default 25)") | 55 help="Output filename for tabular file", |
56 metavar="FILE", | |
57 ) | |
58 parser.add_option( | |
59 "-f", | |
60 "--format", | |
61 dest="format", | |
62 default="blastxml", | |
63 help="Input format (blastxml or tabular)", | |
64 ) | |
65 parser.add_option( | |
66 "-q", | |
67 "--qseqid", | |
68 dest="qseqid", | |
69 default="1", | |
70 help="Column for query 'qseqid' (for tabular input; default 1)", | |
71 ) | |
72 parser.add_option( | |
73 "-s", | |
74 "--sseqid", | |
75 dest="sseqid", | |
76 default="2", | |
77 help="Column for subject 'sseqid' (for tabular input; default 2)", | |
78 ) | |
79 parser.add_option( | |
80 "-d", | |
81 "--salltitles", | |
82 dest="salltitles", | |
83 default="25", | |
84 help="Column for descriptions 'salltitles' (for tabular input; default 25)", | |
85 ) | |
55 (options, args) = parser.parse_args() | 86 (options, args) = parser.parse_args() |
56 | 87 |
57 if len(sys.argv) == 4 and len(args) == 3 and not options.out_file: | 88 if len(sys.argv) == 4 and len(args) == 3 and not options.out_file: |
58 stop_err("""The API has changed, replace this: | 89 sys.exit( |
90 """The API has changed, replace this: | |
59 | 91 |
60 $ python blastxml_to_top_descr.py input.xml output.tab 3 | 92 $ python blastxml_to_top_descr.py input.xml output.tab 3 |
61 | 93 |
62 with: | 94 with: |
63 | 95 |
64 $ python blastxml_to_top_descr.py -o output.tab -t 3 input.xml | 96 $ python blastxml_to_top_descr.py -o output.tab -t 3 input.xml |
65 | 97 |
66 Sorry. | 98 Sorry. |
67 """) | 99 """ |
100 ) | |
68 | 101 |
69 if not args: | 102 if not args: |
70 stop_err("Input filename missing, try -h") | 103 sys.exit("Input filename missing, try -h") |
71 if len(args) > 1: | 104 if len(args) > 1: |
72 stop_err("Expects a single argument, one input filename") | 105 sys.exit("Expects a single argument, one input filename") |
73 in_file = args[0] | 106 in_file = args[0] |
74 out_file = options.out_file | 107 out_file = options.out_file |
75 topN = options.topN | 108 topN = options.topN |
76 | 109 |
77 try: | 110 try: |
78 topN = int(topN) | 111 topN = int(topN) |
79 except ValueError: | 112 except ValueError: |
80 stop_err("Number of hits argument should be an integer (at least 1)") | 113 sys.exit("Number of hits argument should be an integer (at least 1)") |
81 if topN < 1: | 114 if topN < 1: |
82 stop_err("Number of hits argument should be an integer (at least 1)") | 115 sys.exit("Number of hits argument should be an integer (at least 1)") |
83 | 116 |
84 if not os.path.isfile(in_file): | 117 if not os.path.isfile(in_file): |
85 stop_err("Missing input file: %r" % in_file) | 118 sys.exit("Missing input file: %r" % in_file) |
86 | 119 |
87 | 120 |
88 def get_column(value): | 121 def get_column(value): |
89 """Convert column number on command line to Python index.""" | 122 """Convert column number on command line to Python index.""" |
90 if value.startswith("c"): | 123 if value.startswith("c"): |
91 # Ignore c prefix, e.g. "c1" for "1" | 124 # Ignore c prefix, e.g. "c1" for "1" |
92 value = value[1:] | 125 value = value[1:] |
93 try: | 126 try: |
94 col = int(value) | 127 col = int(value) |
95 except: | 128 except ValueError: |
96 stop_err("Expected an integer column number, not %r" % value) | 129 sys.exit("Expected an integer column number, not %r" % value) |
97 if col < 1: | 130 if col < 1: |
98 stop_err("Expect column numbers to be at least one, not %r" % value) | 131 sys.exit("Expect column numbers to be at least one, not %r" % value) |
99 return col - 1 # Python counting! | 132 return col - 1 # Python counting! |
133 | |
100 | 134 |
101 def tabular_hits(in_file, qseqid, sseqid, salltitles): | 135 def tabular_hits(in_file, qseqid, sseqid, salltitles): |
102 """Parse key data from tabular BLAST output. | 136 """Parse key data from tabular BLAST output. |
103 | 137 |
104 Iterator returning tuples (qseqid, list_of_subject_description) | 138 Iterator returning tuples (qseqid, list_of_subject_description) |
105 """ | 139 """ |
106 current_query = None | 140 current_query = None |
107 current_hits = [] | 141 current_hits = [] |
108 with open(in_file) as input: | 142 with open(in_file) as input_handle: |
109 for line in input: | 143 for line in input_handle: |
110 parts = line.rstrip("\n").split("\t") | 144 parts = line.rstrip("\n").split("\t") |
111 query = parts[qseqid] | 145 query = parts[qseqid] |
112 descr = "%s %s" % (parts[sseqid], parts[salltitles]) | 146 descr = "%s %s" % (parts[sseqid], parts[salltitles]) |
113 if current_query is None: | 147 if current_query is None: |
114 # First hit | 148 # First hit |
124 current_hits = [descr] | 158 current_hits = [descr] |
125 if current_query is not None: | 159 if current_query is not None: |
126 # Final query | 160 # Final query |
127 yield current_query, current_hits | 161 yield current_query, current_hits |
128 | 162 |
163 | |
129 def blastxml_hits(in_file): | 164 def blastxml_hits(in_file): |
130 """Parse key data from BLAST XML output. | 165 """Parse key data from BLAST XML output. |
131 | 166 |
132 Iterator returning tuples (qseqid, list_of_subject_description) | 167 Iterator returning tuples (qseqid, list_of_subject_description) |
133 """ | 168 """ |
134 try: | 169 try: |
135 context = ElementTree.iterparse(in_file, events=("start", "end")) | 170 context = ElementTree.iterparse(in_file, events=("start", "end")) |
136 except: | 171 except Exception: |
137 with open(in_file) as handle: | 172 with open(in_file) as handle: |
138 header = handle.read(100) | 173 header = handle.read(100) |
139 stop_err("Invalid data format in XML file %r which starts: %r" % (in_file, header)) | 174 sys.exit( |
175 "Invalid data format in XML file %r which starts: %r" % (in_file, header) | |
176 ) | |
140 # turn it into an iterator | 177 # turn it into an iterator |
141 context = iter(context) | 178 context = iter(context) |
142 # get the root element | 179 # get the root element |
143 try: | 180 try: |
144 event, root = context.next() | 181 event, root = next(context) |
145 except: | 182 except Exception: |
146 with open(in_file) as handle: | 183 with open(in_file) as handle: |
147 header = handle.read(100) | 184 header = handle.read(100) |
148 stop_err("Unable to get root element from XML file %r which starts: %r" % (in_file, header)) | 185 sys.exit( |
149 | 186 "Unable to get root element from XML file %r which starts: %r" |
150 re_default_query_id = re.compile("^Query_\d+$") | 187 % (in_file, header) |
151 assert re_default_query_id.match("Query_101") | 188 ) |
152 assert not re_default_query_id.match("Query_101a") | 189 |
153 assert not re_default_query_id.match("MyQuery_101") | 190 re_default_query_id = re.compile(r"^Query_\d+$") |
154 re_default_subject_id = re.compile("^Subject_\d+$") | 191 assert re_default_query_id.match(r"Query_101") |
155 assert re_default_subject_id.match("Subject_1") | 192 assert not re_default_query_id.match(r"Query_101a") |
156 assert not re_default_subject_id.match("Subject_") | 193 assert not re_default_query_id.match(r"MyQuery_101") |
157 assert not re_default_subject_id.match("Subject_12a") | 194 re_default_subject_id = re.compile(r"^Subject_\d+$") |
158 assert not re_default_subject_id.match("TheSubject_1") | 195 assert re_default_subject_id.match(r"Subject_1") |
159 | 196 assert not re_default_subject_id.match(r"Subject_") |
160 count = 0 | 197 assert not re_default_subject_id.match(r"Subject_12a") |
161 pos_count = 0 | 198 assert not re_default_subject_id.match(r"TheSubject_1") |
199 | |
162 current_query = None | 200 current_query = None |
163 hit_descrs = [] | 201 hit_descrs = [] |
164 for event, elem in context: | 202 for event, elem in context: |
165 # for every <Iteration> tag | 203 # for every <Iteration> tag |
166 if event == "end" and elem.tag == "Iteration": | 204 if event == "end" and elem.tag == "Iteration": |
167 # Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA | 205 # Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA |
168 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID> | 206 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID> |
169 # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def> | 207 # <Iteration_query-def>Endoplasmic reticulum resident protein 44 |
208 # OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def> | |
170 # <Iteration_query-len>406</Iteration_query-len> | 209 # <Iteration_query-len>406</Iteration_query-len> |
171 # <Iteration_hits></Iteration_hits> | 210 # <Iteration_hits></Iteration_hits> |
172 # | 211 # |
173 # Or, from BLAST 2.2.24+ run online | 212 # Or, from BLAST 2.2.24+ run online |
174 # <Iteration_query-ID>Query_1</Iteration_query-ID> | 213 # <Iteration_query-ID>Query_1</Iteration_query-ID> |
175 # <Iteration_query-def>Sample</Iteration_query-def> | 214 # <Iteration_query-def>Sample</Iteration_query-def> |
176 # <Iteration_query-len>516</Iteration_query-len> | 215 # <Iteration_query-len>516</Iteration_query-len> |
177 # <Iteration_hits>... | 216 # <Iteration_hits>... |
178 qseqid = elem.findtext("Iteration_query-ID") | 217 qseqid = elem.findtext("Iteration_query-ID") |
179 if qseqid is None: | 218 if qseqid is None: |
180 stop_err("Missing <Iteration_query-ID> (could be really old BLAST XML data?)") | 219 sys.exit( |
220 "Missing <Iteration_query-ID> (could be really old BLAST XML data?)" | |
221 ) | |
181 if re_default_query_id.match(qseqid): | 222 if re_default_query_id.match(qseqid): |
182 #Place holder ID, take the first word of the query definition | 223 # Place holder ID, take the first word of the query definition |
183 qseqid = elem.findtext("Iteration_query-def").split(None,1)[0] | 224 qseqid = elem.findtext("Iteration_query-def").split(None, 1)[0] |
184 if current_query is None: | 225 if current_query is None: |
185 # First hit | 226 # First hit |
186 current_query = qseqid | 227 current_query = qseqid |
187 hit_descrs = [] | 228 hit_descrs = [] |
188 elif current_query != qseqid: | 229 elif current_query != qseqid: |
201 # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id> | 242 # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id> |
202 # <Hit_def>RecName: Full=Rhodopsin</Hit_def> | 243 # <Hit_def>RecName: Full=Rhodopsin</Hit_def> |
203 # <Hit_accession>P56514</Hit_accession> | 244 # <Hit_accession>P56514</Hit_accession> |
204 # or, | 245 # or, |
205 # <Hit_id>Subject_1</Hit_id> | 246 # <Hit_id>Subject_1</Hit_id> |
206 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def> | 247 # <Hit_def>gi|57163783|ref|NP_001009242.1| |
248 # rhodopsin [Felis catus]</Hit_def> | |
207 # <Hit_accession>Subject_1</Hit_accession> | 249 # <Hit_accession>Subject_1</Hit_accession> |
208 # | 250 # |
209 #apparently depending on the parse_deflines switch | 251 # apparently depending on the parse_deflines switch |
210 sseqid = hit.findtext("Hit_id").split(None,1)[0] | 252 sseqid = hit.findtext("Hit_id").split(None, 1)[0] |
211 hit_def = sseqid + " " + hit.findtext("Hit_def") | 253 hit_def = sseqid + " " + hit.findtext("Hit_def") |
212 if re_default_subject_id.match(sseqid) \ | 254 if re_default_subject_id.match(sseqid) and sseqid == hit.findtext( |
213 and sseqid == hit.findtext("Hit_accession"): | 255 "Hit_accession" |
214 #Place holder ID, take the first word of the subject definition | 256 ): |
257 # Place holder ID, take the first word of the subject definition | |
215 hit_def = hit.findtext("Hit_def") | 258 hit_def = hit.findtext("Hit_def") |
216 sseqid = hit_def.split(None,1)[0] | 259 sseqid = hit_def.split(None, 1)[0] |
217 assert hit_def not in hit_descrs | 260 assert hit_def not in hit_descrs |
218 hit_descrs.append(hit_def) | 261 hit_descrs.append(hit_def) |
219 # prevents ElementTree from growing large datastructure | 262 # prevents ElementTree from growing large datastructure |
220 root.clear() | 263 root.clear() |
221 elem.clear() | 264 elem.clear() |
222 if current_query is not None: | 265 if current_query is not None: |
223 # Final query | 266 # Final query |
224 yield current_query, hit_descrs | 267 yield current_query, hit_descrs |
225 | 268 |
269 | |
226 if options.format == "blastxml": | 270 if options.format == "blastxml": |
227 hits = blastxml_hits(in_file) | 271 hits = blastxml_hits(in_file) |
228 elif options.format == "tabular": | 272 elif options.format == "tabular": |
229 qseqid = get_column(options.qseqid) | 273 qseqid = get_column(options.qseqid) |
230 sseqid = get_column(options.sseqid) | 274 sseqid = get_column(options.sseqid) |
231 salltitles = get_column(options.salltitles) | 275 salltitles = get_column(options.salltitles) |
232 hits = tabular_hits(in_file, qseqid, sseqid, salltitles) | 276 hits = tabular_hits(in_file, qseqid, sseqid, salltitles) |
233 else: | 277 else: |
234 stop_err("Unsupported format: %r" % options.format) | 278 sys.exit("Unsupported format: %r" % options.format) |
235 | 279 |
236 | 280 |
237 def best_hits(descriptions, topN): | 281 def best_hits(descriptions, topN): |
282 """Truncate given descriptions list to at most N entries.""" | |
238 if len(descriptions) < topN: | 283 if len(descriptions) < topN: |
239 return descriptions + [""] * (topN - len(descriptions)) | 284 return descriptions + [""] * (topN - len(descriptions)) |
240 else: | 285 else: |
241 return descriptions[:topN] | 286 return descriptions[:topN] |
287 | |
242 | 288 |
243 count = 0 | 289 count = 0 |
244 if out_file is None: | 290 if out_file is None: |
245 outfile = sys.stdout | 291 outfile = sys.stdout |
246 else: | 292 else: |
247 outfile = open(out_file, 'w') | 293 outfile = open(out_file, "w") |
248 outfile.write("#Query\t%s\n" % "\t".join("BLAST hit %i" % (i+1) for i in range(topN))) | 294 outfile.write("#Query\t%s\n" % "\t".join("BLAST hit %i" % (i + 1) for i in range(topN))) |
249 for query, descrs in hits: | 295 for query, descrs in hits: |
250 count += 1 | 296 count += 1 |
251 outfile.write("%s\t%s\n" % (query, "\t".join(best_hits(descrs, topN)))) | 297 outfile.write("%s\t%s\n" % (query, "\t".join(best_hits(descrs, topN)))) |
252 if out_file is not None: | 298 if out_file is not None: |
253 outfile.close() | 299 outfile.close() |