annotate tools/blastxml_to_top_descr/blastxml_to_top_descr.py @ 13:8dc4ba7eba5d draft default tip

v0.1.2 with Python 3.9 declaration
author peterjc
date Sun, 17 Sep 2023 13:01:56 +0000
parents 98f8431dab44
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
1 #!/usr/bin/env python
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
2 """Convert a BLAST XML file to a top hits description table.
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
3
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
4 Takes three command line options, input BLAST XML filename, output tabular
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
5 BLAST filename, number of hits to collect the descriptions of.
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
6
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
7 Assumes the hits are pre-sorted, so "best" 3 hits gives first 3 hits.
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
8 """
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
9 from __future__ import print_function
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
10
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
11 import os
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
12 import re
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
13 import sys
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
14 from optparse import OptionParser
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
15
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
16 if "-v" in sys.argv or "--version" in sys.argv:
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
17 print("v0.1.2")
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
18 sys.exit(0)
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
19
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
20 if sys.version_info[:2] >= (2, 5):
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
21 import xml.etree.cElementTree as ElementTree
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
22 else:
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
23 from galaxy import eggs # noqa - ignore flake8 F401
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
24 import pkg_resources
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
25
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
26 pkg_resources.require("elementtree")
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
27 from elementtree import ElementTree
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
28
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
29
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
30 usage = """Use as follows:
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
31
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
32 $ blastxml_to_top_descr.py [-t 3] -o example.tabular input.xml
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
33
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
34 Or,
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
35
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
36 $ blastxml_to_top_descr.py [--topN 3] --output example.tabular input.xml
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
37
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
38 This will take the top 3 BLAST descriptions from the input BLAST XML file,
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
39 writing them to the specified output file in tabular format.
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
40 """
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
41
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
42 parser = OptionParser(usage=usage)
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
43 parser.add_option(
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
44 "-t",
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
45 "--topN",
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
46 dest="topN",
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
47 default=3,
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
48 help="Number of descriptions to collect (in order from file)",
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
49 )
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
50 parser.add_option(
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
51 "-o",
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
52 "--output",
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
53 dest="out_file",
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
54 default=None,
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
55 help="Output filename for tabular file",
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
56 metavar="FILE",
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
57 )
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
58 parser.add_option(
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
59 "-f",
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
60 "--format",
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
61 dest="format",
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
62 default="blastxml",
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
63 help="Input format (blastxml or tabular)",
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
64 )
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
65 parser.add_option(
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
66 "-q",
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
67 "--qseqid",
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
68 dest="qseqid",
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
69 default="1",
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
70 help="Column for query 'qseqid' (for tabular input; default 1)",
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
71 )
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
72 parser.add_option(
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
73 "-s",
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
74 "--sseqid",
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
75 dest="sseqid",
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
76 default="2",
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
77 help="Column for subject 'sseqid' (for tabular input; default 2)",
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
78 )
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
79 parser.add_option(
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
80 "-d",
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
81 "--salltitles",
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
82 dest="salltitles",
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
83 default="25",
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
84 help="Column for descriptions 'salltitles' (for tabular input; default 25)",
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
85 )
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
86 (options, args) = parser.parse_args()
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
87
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
88 if len(sys.argv) == 4 and len(args) == 3 and not options.out_file:
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
89 sys.exit(
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
90 """The API has changed, replace this:
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
91
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
92 $ python blastxml_to_top_descr.py input.xml output.tab 3
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
93
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
94 with:
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
95
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
96 $ python blastxml_to_top_descr.py -o output.tab -t 3 input.xml
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
97
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
98 Sorry.
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
99 """
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
100 )
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
101
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
102 if not args:
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
103 sys.exit("Input filename missing, try -h")
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
104 if len(args) > 1:
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
105 sys.exit("Expects a single argument, one input filename")
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
106 in_file = args[0]
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
107 out_file = options.out_file
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
108 topN = options.topN
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
109
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
110 try:
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
111 topN = int(topN)
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
112 except ValueError:
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
113 sys.exit("Number of hits argument should be an integer (at least 1)")
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
114 if topN < 1:
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
115 sys.exit("Number of hits argument should be an integer (at least 1)")
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
116
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
117 if not os.path.isfile(in_file):
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
118 sys.exit("Missing input file: %r" % in_file)
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
119
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
120
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
121 def get_column(value):
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
122 """Convert column number on command line to Python index."""
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
123 if value.startswith("c"):
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
124 # Ignore c prefix, e.g. "c1" for "1"
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
125 value = value[1:]
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
126 try:
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
127 col = int(value)
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
128 except ValueError:
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
129 sys.exit("Expected an integer column number, not %r" % value)
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
130 if col < 1:
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
131 sys.exit("Expect column numbers to be at least one, not %r" % value)
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
132 return col - 1 # Python counting!
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
133
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
134
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
135 def tabular_hits(in_file, qseqid, sseqid, salltitles):
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
136 """Parse key data from tabular BLAST output.
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
137
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
138 Iterator returning tuples (qseqid, list_of_subject_description)
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
139 """
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
140 current_query = None
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
141 current_hits = []
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
142 with open(in_file) as input_handle:
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
143 for line in input_handle:
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
144 parts = line.rstrip("\n").split("\t")
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
145 query = parts[qseqid]
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
146 descr = "%s %s" % (parts[sseqid], parts[salltitles])
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
147 if current_query is None:
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
148 # First hit
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
149 current_query = query
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
150 current_hits = [descr]
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
151 elif current_query == query:
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
152 # Another hit
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
153 current_hits.append(descr)
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
154 else:
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
155 # New query
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
156 yield current_query, current_hits
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
157 current_query = query
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
158 current_hits = [descr]
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
159 if current_query is not None:
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
160 # Final query
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
161 yield current_query, current_hits
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
162
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
163
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
164 def blastxml_hits(in_file):
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
165 """Parse key data from BLAST XML output.
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
166
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
167 Iterator returning tuples (qseqid, list_of_subject_description)
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
168 """
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
169 try:
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
170 context = ElementTree.iterparse(in_file, events=("start", "end"))
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
171 except Exception:
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
172 with open(in_file) as handle:
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
173 header = handle.read(100)
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
174 sys.exit(
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
175 "Invalid data format in XML file %r which starts: %r" % (in_file, header)
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
176 )
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
177 # turn it into an iterator
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
178 context = iter(context)
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
179 # get the root element
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
180 try:
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
181 event, root = next(context)
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
182 except Exception:
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
183 with open(in_file) as handle:
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
184 header = handle.read(100)
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
185 sys.exit(
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
186 "Unable to get root element from XML file %r which starts: %r"
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
187 % (in_file, header)
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
188 )
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
189
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
190 re_default_query_id = re.compile(r"^Query_\d+$")
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
191 assert re_default_query_id.match(r"Query_101")
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
192 assert not re_default_query_id.match(r"Query_101a")
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
193 assert not re_default_query_id.match(r"MyQuery_101")
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
194 re_default_subject_id = re.compile(r"^Subject_\d+$")
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
195 assert re_default_subject_id.match(r"Subject_1")
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
196 assert not re_default_subject_id.match(r"Subject_")
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
197 assert not re_default_subject_id.match(r"Subject_12a")
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
198 assert not re_default_subject_id.match(r"TheSubject_1")
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
199
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
200 current_query = None
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
201 hit_descrs = []
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
202 for event, elem in context:
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
203 # for every <Iteration> tag
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
204 if event == "end" and elem.tag == "Iteration":
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
205 # Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
206 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID>
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
207 # <Iteration_query-def>Endoplasmic reticulum resident protein 44
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
208 # OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
209 # <Iteration_query-len>406</Iteration_query-len>
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
210 # <Iteration_hits></Iteration_hits>
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
211 #
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
212 # Or, from BLAST 2.2.24+ run online
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
213 # <Iteration_query-ID>Query_1</Iteration_query-ID>
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
214 # <Iteration_query-def>Sample</Iteration_query-def>
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
215 # <Iteration_query-len>516</Iteration_query-len>
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
216 # <Iteration_hits>...
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
217 qseqid = elem.findtext("Iteration_query-ID")
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
218 if qseqid is None:
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
219 sys.exit(
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
220 "Missing <Iteration_query-ID> (could be really old BLAST XML data?)"
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
221 )
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
222 if re_default_query_id.match(qseqid):
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
223 # Place holder ID, take the first word of the query definition
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
224 qseqid = elem.findtext("Iteration_query-def").split(None, 1)[0]
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
225 if current_query is None:
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
226 # First hit
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
227 current_query = qseqid
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
228 hit_descrs = []
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
229 elif current_query != qseqid:
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
230 # New hit
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
231 yield current_query, hit_descrs
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
232 current_query = qseqid
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
233 hit_descrs = []
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
234 else:
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
235 # Continuation of previous query
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
236 # i.e. This BLAST XML did not use one <Iteration> per query
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
237 # sys.stderr.write("Multiple <Iteration> blocks for %s\n" % qseqid)
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
238 pass
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
239 # for every <Hit> within <Iteration>
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
240 for hit in elem.findall("Iteration_hits/Hit"):
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
241 # Expecting either this,
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
242 # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id>
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
243 # <Hit_def>RecName: Full=Rhodopsin</Hit_def>
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
244 # <Hit_accession>P56514</Hit_accession>
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
245 # or,
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
246 # <Hit_id>Subject_1</Hit_id>
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
247 # <Hit_def>gi|57163783|ref|NP_001009242.1|
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
248 # rhodopsin [Felis catus]</Hit_def>
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
249 # <Hit_accession>Subject_1</Hit_accession>
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
250 #
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
251 # apparently depending on the parse_deflines switch
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
252 sseqid = hit.findtext("Hit_id").split(None, 1)[0]
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
253 hit_def = sseqid + " " + hit.findtext("Hit_def")
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
254 if re_default_subject_id.match(sseqid) and sseqid == hit.findtext(
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
255 "Hit_accession"
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
256 ):
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
257 # Place holder ID, take the first word of the subject definition
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
258 hit_def = hit.findtext("Hit_def")
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
259 sseqid = hit_def.split(None, 1)[0]
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
260 assert hit_def not in hit_descrs
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
261 hit_descrs.append(hit_def)
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
262 # prevents ElementTree from growing large datastructure
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
263 root.clear()
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
264 elem.clear()
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
265 if current_query is not None:
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
266 # Final query
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
267 yield current_query, hit_descrs
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
268
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
269
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
270 if options.format == "blastxml":
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
271 hits = blastxml_hits(in_file)
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
272 elif options.format == "tabular":
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
273 qseqid = get_column(options.qseqid)
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
274 sseqid = get_column(options.sseqid)
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
275 salltitles = get_column(options.salltitles)
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
276 hits = tabular_hits(in_file, qseqid, sseqid, salltitles)
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
277 else:
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
278 sys.exit("Unsupported format: %r" % options.format)
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
279
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
280
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
281 def best_hits(descriptions, topN):
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
282 """Truncate given descriptions list to at most N entries."""
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
283 if len(descriptions) < topN:
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
284 return descriptions + [""] * (topN - len(descriptions))
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
285 else:
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
286 return descriptions[:topN]
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
287
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
288
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
289 count = 0
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
290 if out_file is None:
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
291 outfile = sys.stdout
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
292 else:
13
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
293 outfile = open(out_file, "w")
8dc4ba7eba5d v0.1.2 with Python 3.9 declaration
peterjc
parents: 11
diff changeset
294 outfile.write("#Query\t%s\n" % "\t".join("BLAST hit %i" % (i + 1) for i in range(topN)))
11
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
295 for query, descrs in hits:
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
296 count += 1
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
297 outfile.write("%s\t%s\n" % (query, "\t".join(best_hits(descrs, topN))))
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
298 if out_file is not None:
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
299 outfile.close()
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
300 # Queries with no hits are not present in tabular BLAST output
98f8431dab44 Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff changeset
301 print("%i queries with BLAST results" % count)