Mercurial > repos > peterjc > blastxml_to_top_descr
annotate tools/blastxml_to_top_descr/blastxml_to_top_descr.py @ 11:98f8431dab44 draft
Uploaded v0.1.0, now also handles extended tabular BLAST output.
author | peterjc |
---|---|
date | Fri, 13 Jun 2014 07:07:35 -0400 |
parents | |
children | 8dc4ba7eba5d |
rev | line source |
---|---|
11
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
1 #!/usr/bin/env python |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
2 """Convert a BLAST XML file to a top hits description table. |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
3 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
4 Takes three command line options, input BLAST XML filename, output tabular |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
5 BLAST filename, number of hits to collect the descriptions of. |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
6 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
7 Assumes the hits are pre-sorted, so "best" 3 hits gives first 3 hits. |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
8 """ |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
9 import os |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
10 import sys |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
11 import re |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
12 from optparse import OptionParser |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
13 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
14 if "-v" in sys.argv or "--version" in sys.argv: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
15 print "v0.1.0" |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
16 sys.exit(0) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
17 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
18 if sys.version_info[:2] >= ( 2, 5 ): |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
19 import xml.etree.cElementTree as ElementTree |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
20 else: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
21 from galaxy import eggs |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
22 import pkg_resources; pkg_resources.require( "elementtree" ) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
23 from elementtree import ElementTree |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
24 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
25 def stop_err( msg ): |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
26 sys.stderr.write("%s\n" % msg) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
27 sys.exit(1) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
28 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
29 usage = """Use as follows: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
30 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
31 $ blastxml_to_top_descr.py [-t 3] -o example.tabular input.xml |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
32 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
33 Or, |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
34 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
35 $ blastxml_to_top_descr.py [--topN 3] --output example.tabular input.xml |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
36 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
37 This will take the top 3 BLAST descriptions from the input BLAST XML file, |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
38 writing them to the specified output file in tabular format. |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
39 """ |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
40 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
41 parser = OptionParser(usage=usage) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
42 parser.add_option("-t", "--topN", dest="topN", default=3, |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
43 help="Number of descriptions to collect (in order from file)") |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
44 parser.add_option("-o", "--output", dest="out_file", default=None, |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
45 help="Output filename for tabular file", |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
46 metavar="FILE") |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
47 parser.add_option("-f", "--format", dest="format", default="blastxml", |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
48 help="Input format (blastxml or tabular)") |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
49 parser.add_option("-q", "--qseqid", dest="qseqid", default="1", |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
50 help="Column for query 'qseqid' (for tabular input; default 1)") |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
51 parser.add_option("-s", "--sseqid", dest="sseqid", default="2", |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
52 help="Column for subject 'sseqid' (for tabular input; default 2)") |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
53 parser.add_option("-d", "--salltitles", dest="salltitles", default="25", |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
54 help="Column for descriptions 'salltitles' (for tabular input; default 25)") |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
55 (options, args) = parser.parse_args() |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
56 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
57 if len(sys.argv) == 4 and len(args) == 3 and not options.out_file: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
58 stop_err("""The API has changed, replace this: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
59 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
60 $ python blastxml_to_top_descr.py input.xml output.tab 3 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
61 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
62 with: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
63 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
64 $ python blastxml_to_top_descr.py -o output.tab -t 3 input.xml |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
65 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
66 Sorry. |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
67 """) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
68 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
69 if not args: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
70 stop_err("Input filename missing, try -h") |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
71 if len(args) > 1: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
72 stop_err("Expects a single argument, one input filename") |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
73 in_file = args[0] |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
74 out_file = options.out_file |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
75 topN = options.topN |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
76 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
77 try: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
78 topN = int(topN) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
79 except ValueError: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
80 stop_err("Number of hits argument should be an integer (at least 1)") |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
81 if topN < 1: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
82 stop_err("Number of hits argument should be an integer (at least 1)") |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
83 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
84 if not os.path.isfile(in_file): |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
85 stop_err("Missing input file: %r" % in_file) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
86 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
87 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
88 def get_column(value): |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
89 """Convert column number on command line to Python index.""" |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
90 if value.startswith("c"): |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
91 # Ignore c prefix, e.g. "c1" for "1" |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
92 value = value[1:] |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
93 try: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
94 col = int(value) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
95 except: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
96 stop_err("Expected an integer column number, not %r" % value) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
97 if col < 1: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
98 stop_err("Expect column numbers to be at least one, not %r" % value) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
99 return col - 1 # Python counting! |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
100 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
101 def tabular_hits(in_file, qseqid, sseqid, salltitles): |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
102 """Parse key data from tabular BLAST output. |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
103 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
104 Iterator returning tuples (qseqid, list_of_subject_description) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
105 """ |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
106 current_query = None |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
107 current_hits = [] |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
108 with open(in_file) as input: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
109 for line in input: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
110 parts = line.rstrip("\n").split("\t") |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
111 query = parts[qseqid] |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
112 descr = "%s %s" % (parts[sseqid], parts[salltitles]) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
113 if current_query is None: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
114 # First hit |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
115 current_query = query |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
116 current_hits = [descr] |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
117 elif current_query == query: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
118 # Another hit |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
119 current_hits.append(descr) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
120 else: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
121 # New query |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
122 yield current_query, current_hits |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
123 current_query = query |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
124 current_hits = [descr] |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
125 if current_query is not None: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
126 # Final query |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
127 yield current_query, current_hits |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
128 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
129 def blastxml_hits(in_file): |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
130 """Parse key data from BLAST XML output. |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
131 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
132 Iterator returning tuples (qseqid, list_of_subject_description) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
133 """ |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
134 try: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
135 context = ElementTree.iterparse(in_file, events=("start", "end")) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
136 except: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
137 with open(in_file) as handle: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
138 header = handle.read(100) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
139 stop_err("Invalid data format in XML file %r which starts: %r" % (in_file, header)) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
140 # turn it into an iterator |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
141 context = iter(context) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
142 # get the root element |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
143 try: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
144 event, root = context.next() |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
145 except: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
146 with open(in_file) as handle: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
147 header = handle.read(100) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
148 stop_err("Unable to get root element from XML file %r which starts: %r" % (in_file, header)) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
149 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
150 re_default_query_id = re.compile("^Query_\d+$") |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
151 assert re_default_query_id.match("Query_101") |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
152 assert not re_default_query_id.match("Query_101a") |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
153 assert not re_default_query_id.match("MyQuery_101") |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
154 re_default_subject_id = re.compile("^Subject_\d+$") |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
155 assert re_default_subject_id.match("Subject_1") |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
156 assert not re_default_subject_id.match("Subject_") |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
157 assert not re_default_subject_id.match("Subject_12a") |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
158 assert not re_default_subject_id.match("TheSubject_1") |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
159 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
160 count = 0 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
161 pos_count = 0 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
162 current_query = None |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
163 hit_descrs = [] |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
164 for event, elem in context: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
165 # for every <Iteration> tag |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
166 if event == "end" and elem.tag == "Iteration": |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
167 # Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
168 # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID> |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
169 # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def> |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
170 # <Iteration_query-len>406</Iteration_query-len> |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
171 # <Iteration_hits></Iteration_hits> |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
172 # |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
173 # Or, from BLAST 2.2.24+ run online |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
174 # <Iteration_query-ID>Query_1</Iteration_query-ID> |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
175 # <Iteration_query-def>Sample</Iteration_query-def> |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
176 # <Iteration_query-len>516</Iteration_query-len> |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
177 # <Iteration_hits>... |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
178 qseqid = elem.findtext("Iteration_query-ID") |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
179 if qseqid is None: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
180 stop_err("Missing <Iteration_query-ID> (could be really old BLAST XML data?)") |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
181 if re_default_query_id.match(qseqid): |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
182 #Place holder ID, take the first word of the query definition |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
183 qseqid = elem.findtext("Iteration_query-def").split(None,1)[0] |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
184 if current_query is None: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
185 # First hit |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
186 current_query = qseqid |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
187 hit_descrs = [] |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
188 elif current_query != qseqid: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
189 # New hit |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
190 yield current_query, hit_descrs |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
191 current_query = qseqid |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
192 hit_descrs = [] |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
193 else: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
194 # Continuation of previous query |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
195 # i.e. This BLAST XML did not use one <Iteration> per query |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
196 # sys.stderr.write("Multiple <Iteration> blocks for %s\n" % qseqid) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
197 pass |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
198 # for every <Hit> within <Iteration> |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
199 for hit in elem.findall("Iteration_hits/Hit"): |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
200 # Expecting either this, |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
201 # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id> |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
202 # <Hit_def>RecName: Full=Rhodopsin</Hit_def> |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
203 # <Hit_accession>P56514</Hit_accession> |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
204 # or, |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
205 # <Hit_id>Subject_1</Hit_id> |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
206 # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def> |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
207 # <Hit_accession>Subject_1</Hit_accession> |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
208 # |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
209 #apparently depending on the parse_deflines switch |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
210 sseqid = hit.findtext("Hit_id").split(None,1)[0] |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
211 hit_def = sseqid + " " + hit.findtext("Hit_def") |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
212 if re_default_subject_id.match(sseqid) \ |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
213 and sseqid == hit.findtext("Hit_accession"): |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
214 #Place holder ID, take the first word of the subject definition |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
215 hit_def = hit.findtext("Hit_def") |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
216 sseqid = hit_def.split(None,1)[0] |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
217 assert hit_def not in hit_descrs |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
218 hit_descrs.append(hit_def) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
219 # prevents ElementTree from growing large datastructure |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
220 root.clear() |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
221 elem.clear() |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
222 if current_query is not None: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
223 # Final query |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
224 yield current_query, hit_descrs |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
225 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
226 if options.format == "blastxml": |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
227 hits = blastxml_hits(in_file) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
228 elif options.format == "tabular": |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
229 qseqid = get_column(options.qseqid) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
230 sseqid = get_column(options.sseqid) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
231 salltitles = get_column(options.salltitles) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
232 hits = tabular_hits(in_file, qseqid, sseqid, salltitles) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
233 else: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
234 stop_err("Unsupported format: %r" % options.format) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
235 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
236 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
237 def best_hits(descriptions, topN): |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
238 if len(descriptions) < topN: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
239 return descriptions + [""] * (topN - len(descriptions)) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
240 else: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
241 return descriptions[:topN] |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
242 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
243 count = 0 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
244 if out_file is None: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
245 outfile = sys.stdout |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
246 else: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
247 outfile = open(out_file, 'w') |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
248 outfile.write("#Query\t%s\n" % "\t".join("BLAST hit %i" % (i+1) for i in range(topN))) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
249 for query, descrs in hits: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
250 count += 1 |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
251 outfile.write("%s\t%s\n" % (query, "\t".join(best_hits(descrs, topN)))) |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
252 if out_file is not None: |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
253 outfile.close() |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
254 # Queries with no hits are not present in tabular BLAST output |
98f8431dab44
Uploaded v0.1.0, now also handles extended tabular BLAST output.
peterjc
parents:
diff
changeset
|
255 print("%i queries with BLAST results" % count) |