comparison tools/blastxml_to_top_descr/blastxml_to_top_descr.py @ 13:8dc4ba7eba5d draft default tip

v0.1.2 with Python 3.9 declaration
author peterjc
date Sun, 17 Sep 2023 13:01:56 +0000
parents 98f8431dab44
children
comparison
equal deleted inserted replaced
12:fe1ed74793c9 13:8dc4ba7eba5d
4 Takes three command line options, input BLAST XML filename, output tabular 4 Takes three command line options, input BLAST XML filename, output tabular
5 BLAST filename, number of hits to collect the descriptions of. 5 BLAST filename, number of hits to collect the descriptions of.
6 6
7 Assumes the hits are pre-sorted, so "best" 3 hits gives first 3 hits. 7 Assumes the hits are pre-sorted, so "best" 3 hits gives first 3 hits.
8 """ 8 """
9 from __future__ import print_function
10
9 import os 11 import os
12 import re
10 import sys 13 import sys
11 import re
12 from optparse import OptionParser 14 from optparse import OptionParser
13 15
14 if "-v" in sys.argv or "--version" in sys.argv: 16 if "-v" in sys.argv or "--version" in sys.argv:
15 print "v0.1.0" 17 print("v0.1.2")
16 sys.exit(0) 18 sys.exit(0)
17 19
18 if sys.version_info[:2] >= ( 2, 5 ): 20 if sys.version_info[:2] >= (2, 5):
19 import xml.etree.cElementTree as ElementTree 21 import xml.etree.cElementTree as ElementTree
20 else: 22 else:
21 from galaxy import eggs 23 from galaxy import eggs # noqa - ignore flake8 F401
22 import pkg_resources; pkg_resources.require( "elementtree" ) 24 import pkg_resources
25
26 pkg_resources.require("elementtree")
23 from elementtree import ElementTree 27 from elementtree import ElementTree
24 28
25 def stop_err( msg ):
26 sys.stderr.write("%s\n" % msg)
27 sys.exit(1)
28 29
29 usage = """Use as follows: 30 usage = """Use as follows:
30 31
31 $ blastxml_to_top_descr.py [-t 3] -o example.tabular input.xml 32 $ blastxml_to_top_descr.py [-t 3] -o example.tabular input.xml
32 33
37 This will take the top 3 BLAST descriptions from the input BLAST XML file, 38 This will take the top 3 BLAST descriptions from the input BLAST XML file,
38 writing them to the specified output file in tabular format. 39 writing them to the specified output file in tabular format.
39 """ 40 """
40 41
41 parser = OptionParser(usage=usage) 42 parser = OptionParser(usage=usage)
42 parser.add_option("-t", "--topN", dest="topN", default=3, 43 parser.add_option(
43 help="Number of descriptions to collect (in order from file)") 44 "-t",
44 parser.add_option("-o", "--output", dest="out_file", default=None, 45 "--topN",
45 help="Output filename for tabular file", 46 dest="topN",
46 metavar="FILE") 47 default=3,
47 parser.add_option("-f", "--format", dest="format", default="blastxml", 48 help="Number of descriptions to collect (in order from file)",
48 help="Input format (blastxml or tabular)") 49 )
49 parser.add_option("-q", "--qseqid", dest="qseqid", default="1", 50 parser.add_option(
50 help="Column for query 'qseqid' (for tabular input; default 1)") 51 "-o",
51 parser.add_option("-s", "--sseqid", dest="sseqid", default="2", 52 "--output",
52 help="Column for subject 'sseqid' (for tabular input; default 2)") 53 dest="out_file",
53 parser.add_option("-d", "--salltitles", dest="salltitles", default="25", 54 default=None,
54 help="Column for descriptions 'salltitles' (for tabular input; default 25)") 55 help="Output filename for tabular file",
56 metavar="FILE",
57 )
58 parser.add_option(
59 "-f",
60 "--format",
61 dest="format",
62 default="blastxml",
63 help="Input format (blastxml or tabular)",
64 )
65 parser.add_option(
66 "-q",
67 "--qseqid",
68 dest="qseqid",
69 default="1",
70 help="Column for query 'qseqid' (for tabular input; default 1)",
71 )
72 parser.add_option(
73 "-s",
74 "--sseqid",
75 dest="sseqid",
76 default="2",
77 help="Column for subject 'sseqid' (for tabular input; default 2)",
78 )
79 parser.add_option(
80 "-d",
81 "--salltitles",
82 dest="salltitles",
83 default="25",
84 help="Column for descriptions 'salltitles' (for tabular input; default 25)",
85 )
55 (options, args) = parser.parse_args() 86 (options, args) = parser.parse_args()
56 87
57 if len(sys.argv) == 4 and len(args) == 3 and not options.out_file: 88 if len(sys.argv) == 4 and len(args) == 3 and not options.out_file:
58 stop_err("""The API has changed, replace this: 89 sys.exit(
90 """The API has changed, replace this:
59 91
60 $ python blastxml_to_top_descr.py input.xml output.tab 3 92 $ python blastxml_to_top_descr.py input.xml output.tab 3
61 93
62 with: 94 with:
63 95
64 $ python blastxml_to_top_descr.py -o output.tab -t 3 input.xml 96 $ python blastxml_to_top_descr.py -o output.tab -t 3 input.xml
65 97
66 Sorry. 98 Sorry.
67 """) 99 """
100 )
68 101
69 if not args: 102 if not args:
70 stop_err("Input filename missing, try -h") 103 sys.exit("Input filename missing, try -h")
71 if len(args) > 1: 104 if len(args) > 1:
72 stop_err("Expects a single argument, one input filename") 105 sys.exit("Expects a single argument, one input filename")
73 in_file = args[0] 106 in_file = args[0]
74 out_file = options.out_file 107 out_file = options.out_file
75 topN = options.topN 108 topN = options.topN
76 109
77 try: 110 try:
78 topN = int(topN) 111 topN = int(topN)
79 except ValueError: 112 except ValueError:
80 stop_err("Number of hits argument should be an integer (at least 1)") 113 sys.exit("Number of hits argument should be an integer (at least 1)")
81 if topN < 1: 114 if topN < 1:
82 stop_err("Number of hits argument should be an integer (at least 1)") 115 sys.exit("Number of hits argument should be an integer (at least 1)")
83 116
84 if not os.path.isfile(in_file): 117 if not os.path.isfile(in_file):
85 stop_err("Missing input file: %r" % in_file) 118 sys.exit("Missing input file: %r" % in_file)
86 119
87 120
88 def get_column(value): 121 def get_column(value):
89 """Convert column number on command line to Python index.""" 122 """Convert column number on command line to Python index."""
90 if value.startswith("c"): 123 if value.startswith("c"):
91 # Ignore c prefix, e.g. "c1" for "1" 124 # Ignore c prefix, e.g. "c1" for "1"
92 value = value[1:] 125 value = value[1:]
93 try: 126 try:
94 col = int(value) 127 col = int(value)
95 except: 128 except ValueError:
96 stop_err("Expected an integer column number, not %r" % value) 129 sys.exit("Expected an integer column number, not %r" % value)
97 if col < 1: 130 if col < 1:
98 stop_err("Expect column numbers to be at least one, not %r" % value) 131 sys.exit("Expect column numbers to be at least one, not %r" % value)
99 return col - 1 # Python counting! 132 return col - 1 # Python counting!
133
100 134
101 def tabular_hits(in_file, qseqid, sseqid, salltitles): 135 def tabular_hits(in_file, qseqid, sseqid, salltitles):
102 """Parse key data from tabular BLAST output. 136 """Parse key data from tabular BLAST output.
103 137
104 Iterator returning tuples (qseqid, list_of_subject_description) 138 Iterator returning tuples (qseqid, list_of_subject_description)
105 """ 139 """
106 current_query = None 140 current_query = None
107 current_hits = [] 141 current_hits = []
108 with open(in_file) as input: 142 with open(in_file) as input_handle:
109 for line in input: 143 for line in input_handle:
110 parts = line.rstrip("\n").split("\t") 144 parts = line.rstrip("\n").split("\t")
111 query = parts[qseqid] 145 query = parts[qseqid]
112 descr = "%s %s" % (parts[sseqid], parts[salltitles]) 146 descr = "%s %s" % (parts[sseqid], parts[salltitles])
113 if current_query is None: 147 if current_query is None:
114 # First hit 148 # First hit
124 current_hits = [descr] 158 current_hits = [descr]
125 if current_query is not None: 159 if current_query is not None:
126 # Final query 160 # Final query
127 yield current_query, current_hits 161 yield current_query, current_hits
128 162
163
129 def blastxml_hits(in_file): 164 def blastxml_hits(in_file):
130 """Parse key data from BLAST XML output. 165 """Parse key data from BLAST XML output.
131 166
132 Iterator returning tuples (qseqid, list_of_subject_description) 167 Iterator returning tuples (qseqid, list_of_subject_description)
133 """ 168 """
134 try: 169 try:
135 context = ElementTree.iterparse(in_file, events=("start", "end")) 170 context = ElementTree.iterparse(in_file, events=("start", "end"))
136 except: 171 except Exception:
137 with open(in_file) as handle: 172 with open(in_file) as handle:
138 header = handle.read(100) 173 header = handle.read(100)
139 stop_err("Invalid data format in XML file %r which starts: %r" % (in_file, header)) 174 sys.exit(
175 "Invalid data format in XML file %r which starts: %r" % (in_file, header)
176 )
140 # turn it into an iterator 177 # turn it into an iterator
141 context = iter(context) 178 context = iter(context)
142 # get the root element 179 # get the root element
143 try: 180 try:
144 event, root = context.next() 181 event, root = next(context)
145 except: 182 except Exception:
146 with open(in_file) as handle: 183 with open(in_file) as handle:
147 header = handle.read(100) 184 header = handle.read(100)
148 stop_err("Unable to get root element from XML file %r which starts: %r" % (in_file, header)) 185 sys.exit(
149 186 "Unable to get root element from XML file %r which starts: %r"
150 re_default_query_id = re.compile("^Query_\d+$") 187 % (in_file, header)
151 assert re_default_query_id.match("Query_101") 188 )
152 assert not re_default_query_id.match("Query_101a") 189
153 assert not re_default_query_id.match("MyQuery_101") 190 re_default_query_id = re.compile(r"^Query_\d+$")
154 re_default_subject_id = re.compile("^Subject_\d+$") 191 assert re_default_query_id.match(r"Query_101")
155 assert re_default_subject_id.match("Subject_1") 192 assert not re_default_query_id.match(r"Query_101a")
156 assert not re_default_subject_id.match("Subject_") 193 assert not re_default_query_id.match(r"MyQuery_101")
157 assert not re_default_subject_id.match("Subject_12a") 194 re_default_subject_id = re.compile(r"^Subject_\d+$")
158 assert not re_default_subject_id.match("TheSubject_1") 195 assert re_default_subject_id.match(r"Subject_1")
159 196 assert not re_default_subject_id.match(r"Subject_")
160 count = 0 197 assert not re_default_subject_id.match(r"Subject_12a")
161 pos_count = 0 198 assert not re_default_subject_id.match(r"TheSubject_1")
199
162 current_query = None 200 current_query = None
163 hit_descrs = [] 201 hit_descrs = []
164 for event, elem in context: 202 for event, elem in context:
165 # for every <Iteration> tag 203 # for every <Iteration> tag
166 if event == "end" and elem.tag == "Iteration": 204 if event == "end" and elem.tag == "Iteration":
167 # Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA 205 # Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA
168 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID> 206 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID>
169 # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def> 207 # <Iteration_query-def>Endoplasmic reticulum resident protein 44
208 # OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>
170 # <Iteration_query-len>406</Iteration_query-len> 209 # <Iteration_query-len>406</Iteration_query-len>
171 # <Iteration_hits></Iteration_hits> 210 # <Iteration_hits></Iteration_hits>
172 # 211 #
173 # Or, from BLAST 2.2.24+ run online 212 # Or, from BLAST 2.2.24+ run online
174 # <Iteration_query-ID>Query_1</Iteration_query-ID> 213 # <Iteration_query-ID>Query_1</Iteration_query-ID>
175 # <Iteration_query-def>Sample</Iteration_query-def> 214 # <Iteration_query-def>Sample</Iteration_query-def>
176 # <Iteration_query-len>516</Iteration_query-len> 215 # <Iteration_query-len>516</Iteration_query-len>
177 # <Iteration_hits>... 216 # <Iteration_hits>...
178 qseqid = elem.findtext("Iteration_query-ID") 217 qseqid = elem.findtext("Iteration_query-ID")
179 if qseqid is None: 218 if qseqid is None:
180 stop_err("Missing <Iteration_query-ID> (could be really old BLAST XML data?)") 219 sys.exit(
220 "Missing <Iteration_query-ID> (could be really old BLAST XML data?)"
221 )
181 if re_default_query_id.match(qseqid): 222 if re_default_query_id.match(qseqid):
182 #Place holder ID, take the first word of the query definition 223 # Place holder ID, take the first word of the query definition
183 qseqid = elem.findtext("Iteration_query-def").split(None,1)[0] 224 qseqid = elem.findtext("Iteration_query-def").split(None, 1)[0]
184 if current_query is None: 225 if current_query is None:
185 # First hit 226 # First hit
186 current_query = qseqid 227 current_query = qseqid
187 hit_descrs = [] 228 hit_descrs = []
188 elif current_query != qseqid: 229 elif current_query != qseqid:
201 # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id> 242 # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id>
202 # <Hit_def>RecName: Full=Rhodopsin</Hit_def> 243 # <Hit_def>RecName: Full=Rhodopsin</Hit_def>
203 # <Hit_accession>P56514</Hit_accession> 244 # <Hit_accession>P56514</Hit_accession>
204 # or, 245 # or,
205 # <Hit_id>Subject_1</Hit_id> 246 # <Hit_id>Subject_1</Hit_id>
206 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def> 247 # <Hit_def>gi|57163783|ref|NP_001009242.1|
248 # rhodopsin [Felis catus]</Hit_def>
207 # <Hit_accession>Subject_1</Hit_accession> 249 # <Hit_accession>Subject_1</Hit_accession>
208 # 250 #
209 #apparently depending on the parse_deflines switch 251 # apparently depending on the parse_deflines switch
210 sseqid = hit.findtext("Hit_id").split(None,1)[0] 252 sseqid = hit.findtext("Hit_id").split(None, 1)[0]
211 hit_def = sseqid + " " + hit.findtext("Hit_def") 253 hit_def = sseqid + " " + hit.findtext("Hit_def")
212 if re_default_subject_id.match(sseqid) \ 254 if re_default_subject_id.match(sseqid) and sseqid == hit.findtext(
213 and sseqid == hit.findtext("Hit_accession"): 255 "Hit_accession"
214 #Place holder ID, take the first word of the subject definition 256 ):
257 # Place holder ID, take the first word of the subject definition
215 hit_def = hit.findtext("Hit_def") 258 hit_def = hit.findtext("Hit_def")
216 sseqid = hit_def.split(None,1)[0] 259 sseqid = hit_def.split(None, 1)[0]
217 assert hit_def not in hit_descrs 260 assert hit_def not in hit_descrs
218 hit_descrs.append(hit_def) 261 hit_descrs.append(hit_def)
219 # prevents ElementTree from growing large datastructure 262 # prevents ElementTree from growing large datastructure
220 root.clear() 263 root.clear()
221 elem.clear() 264 elem.clear()
222 if current_query is not None: 265 if current_query is not None:
223 # Final query 266 # Final query
224 yield current_query, hit_descrs 267 yield current_query, hit_descrs
225 268
269
226 if options.format == "blastxml": 270 if options.format == "blastxml":
227 hits = blastxml_hits(in_file) 271 hits = blastxml_hits(in_file)
228 elif options.format == "tabular": 272 elif options.format == "tabular":
229 qseqid = get_column(options.qseqid) 273 qseqid = get_column(options.qseqid)
230 sseqid = get_column(options.sseqid) 274 sseqid = get_column(options.sseqid)
231 salltitles = get_column(options.salltitles) 275 salltitles = get_column(options.salltitles)
232 hits = tabular_hits(in_file, qseqid, sseqid, salltitles) 276 hits = tabular_hits(in_file, qseqid, sseqid, salltitles)
233 else: 277 else:
234 stop_err("Unsupported format: %r" % options.format) 278 sys.exit("Unsupported format: %r" % options.format)
235 279
236 280
237 def best_hits(descriptions, topN): 281 def best_hits(descriptions, topN):
282 """Truncate given descriptions list to at most N entries."""
238 if len(descriptions) < topN: 283 if len(descriptions) < topN:
239 return descriptions + [""] * (topN - len(descriptions)) 284 return descriptions + [""] * (topN - len(descriptions))
240 else: 285 else:
241 return descriptions[:topN] 286 return descriptions[:topN]
287
242 288
243 count = 0 289 count = 0
244 if out_file is None: 290 if out_file is None:
245 outfile = sys.stdout 291 outfile = sys.stdout
246 else: 292 else:
247 outfile = open(out_file, 'w') 293 outfile = open(out_file, "w")
248 outfile.write("#Query\t%s\n" % "\t".join("BLAST hit %i" % (i+1) for i in range(topN))) 294 outfile.write("#Query\t%s\n" % "\t".join("BLAST hit %i" % (i + 1) for i in range(topN)))
249 for query, descrs in hits: 295 for query, descrs in hits:
250 count += 1 296 count += 1
251 outfile.write("%s\t%s\n" % (query, "\t".join(best_hits(descrs, topN)))) 297 outfile.write("%s\t%s\n" % (query, "\t".join(best_hits(descrs, topN))))
252 if out_file is not None: 298 if out_file is not None:
253 outfile.close() 299 outfile.close()