comparison blastxml_to_gapped_gff3.py @ 1:497c6bb3b717 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/jbrowse commit 0887009a23d176b21536c9fd8a18c4fecc417d4f
author iuc
date Thu, 18 Jun 2015 12:10:51 -0400
parents
children 7342f467507b
comparison
equal deleted inserted replaced
0:2c9e5136b416 1:497c6bb3b717
1 #!/usr/bin/perl
2 import re
3 import sys
4 import copy
5 import argparse
6 from BCBio import GFF
7 import logging
8 logging.basicConfig(level=logging.INFO)
9 log = logging.getLogger(name='blastxml2gff3')
10
11 __author__ = "Eric Rasche"
12 __version__ = "0.4.0"
13 __maintainer__ = "Eric Rasche"
14 __email__ = "esr@tamu.edu"
15
16 __doc__ = """
17 BlastXML files, when transformed to GFF3, do not normally show gaps in the
18 blast hits. This tool aims to fill that "gap".
19 """
20
21
22 def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False):
23 from Bio.Blast import NCBIXML
24 from Bio.Seq import Seq
25 from Bio.SeqRecord import SeqRecord
26 from Bio.SeqFeature import SeqFeature, FeatureLocation
27
28 blast_records = NCBIXML.parse(blastxml)
29 records = []
30 for record in blast_records:
31 rec = SeqRecord(Seq("ACTG"), id=record.query)
32 for hit in record.alignments:
33 for hsp in hit.hsps:
34 qualifiers = {
35 "source": "blast",
36 "score": hsp.expect,
37 "accession": hit.accession,
38 "hit_id": hit.hit_id,
39 "length": hit.length,
40 "hit_titles": hit.title.split(' >')
41 }
42 desc = hit.title.split(' >')[0]
43 qualifiers['description'] = desc[desc.index(' '):]
44
45 # This required a fair bit of sketching out/match to figure out
46 # the first time.
47 #
48 # the match_start location must account for queries and
49 # subjecst that start at locations other than 1
50 parent_match_start = hsp.query_start - hsp.sbjct_start
51 # The end is the start + hit.length because the match itself
52 # may be longer than the parent feature, so we use the supplied
53 # subject/hit length to calculate the real ending of the target
54 # protein.
55 parent_match_end = hsp.query_start + hit.length + hsp.query.count('-')
56
57 # However, if the user requests that we trim the feature, then
58 # we need to cut the ``match`` start to 0 to match the parent feature.
59 # We'll also need to cut the end to match the query's end. It (maybe)
60 # should be the feature end? But we don't have access to that data, so
61 # We settle for this.
62 if trim:
63 if parent_match_start < 1:
64 parent_match_start = 0
65
66 if trim or trim_end:
67 if parent_match_end > hsp.query_end:
68 parent_match_end = hsp.query_end + 1
69
70 # The ``protein_match`` feature will hold one or more ``match_part``s
71 top_feature = SeqFeature(
72 FeatureLocation(parent_match_start, parent_match_end),
73 type="protein_match", strand=0,
74 qualifiers=qualifiers
75 )
76
77 # Unlike the parent feature, ``match_part``s have sources.
78 part_qualifiers = {
79 "source": "blast",
80 }
81 top_feature.sub_features = []
82 for start, end, cigar in generate_parts(hsp.query, hsp.match,
83 hsp.sbjct,
84 ignore_under=min_gap):
85 part_qualifiers['Gap'] = cigar
86 part_qualifiers['ID'] = hit.hit_id
87
88 if trim:
89 # If trimming, then we start relative to the
90 # protein_match's start
91 match_part_start = parent_match_start + start
92 else:
93 # Otherwise, we have to account for the subject start's location
94 match_part_start = parent_match_start + hsp.sbjct_start + start - 1
95
96 # We used to use hsp.align_length here, but that includes
97 # gaps in the parent sequence
98 #
99 # Furthermore align_length will give calculation errors in weird places
100 # So we just use (end-start) for simplicity
101 match_part_end = match_part_start + (end - start)
102
103 top_feature.sub_features.append(
104 SeqFeature(
105 FeatureLocation(match_part_start, match_part_end),
106 type="match_part", strand=0,
107 qualifiers=copy.deepcopy(part_qualifiers))
108 )
109
110 rec.features.append(top_feature)
111 records.append(rec)
112 return records
113
114
115 def __remove_query_gaps(query, match, subject):
116 """remove positions in all three based on gaps in query
117
118 In order to simplify math and calculations...we remove all of the gaps
119 based on gap locations in the query sequence::
120
121 Q:ACTG-ACTGACTG
122 S:ACTGAAC---CTG
123
124 will become::
125
126 Q:ACTGACTGACTG
127 S:ACTGAC---CTG
128
129 which greatly simplifies the process of identifying the correct location
130 for a match_part
131 """
132 prev = 0
133 fq = ''
134 fm = ''
135 fs = ''
136 for position in re.finditer('-', query):
137 fq += query[prev:position.start()]
138 fm += match[prev:position.start()]
139 fs += subject[prev:position.start()]
140 prev = position.start() + 1
141 fq += query[prev:]
142 fm += match[prev:]
143 fs += subject[prev:]
144
145 return (fq, fm, fs)
146
147
148 def generate_parts(query, match, subject, ignore_under=3):
149 region_q = []
150 region_m = []
151 region_s = []
152
153 (query, match, subject) = __remove_query_gaps(query, match, subject)
154
155 region_start = -1
156 region_end = -1
157 mismatch_count = 0
158 for i, (q, m, s) in enumerate(zip(query, match, subject)):
159
160 # If we have a match
161 if m != ' ' or m == '+':
162 if region_start == -1:
163 region_start = i
164 # It's a new region, we need to reset or it's pre-seeded with
165 # spaces
166 region_q = []
167 region_m = []
168 region_s = []
169 region_end = i
170 mismatch_count = 0
171 else:
172 mismatch_count += 1
173
174 region_q.append(q)
175 region_m.append(m)
176 region_s.append(s)
177
178 if mismatch_count >= ignore_under and region_start != -1 and region_end != -1:
179 region_q = region_q[0:-ignore_under]
180 region_m = region_m[0:-ignore_under]
181 region_s = region_s[0:-ignore_under]
182 yield region_start, region_end + 1, \
183 cigar_from_string(region_q, region_m, region_s, strict_m=True)
184 region_q = []
185 region_m = []
186 region_s = []
187
188 region_start = -1
189 region_end = -1
190 mismatch_count = 0
191
192 yield region_start, region_end + 1, \
193 cigar_from_string(region_q, region_m, region_s, strict_m=True)
194
195
196 def _qms_to_matches(query, match, subject, strict_m=True):
197 matchline = []
198
199 for (q, m, s) in zip(query, match, subject):
200 ret = ''
201
202 if m != ' ' or m == '+':
203 ret = '='
204 elif m == ' ':
205 if q == '-':
206 ret = 'D'
207 elif s == '-':
208 ret = 'I'
209 else:
210 ret = 'X'
211 else:
212 log.warn("Bad data: \n\t%s\n\t%s\n\t%s\n" % (query, match, subject))
213
214
215 if strict_m:
216 if ret == '=' or ret == 'X':
217 ret = 'M'
218
219 matchline.append(ret)
220 return matchline
221
222
223 def _matchline_to_cigar(matchline):
224 cigar_line = []
225 last_char = matchline[0]
226 count = 0
227 for char in matchline:
228 if char == last_char:
229 count += 1
230 else:
231 cigar_line.append("%s%s" % (last_char, count))
232 count = 1
233 last_char = char
234 cigar_line.append("%s%s" % (last_char, count))
235 return ' '.join(cigar_line)
236
237
238 def cigar_from_string(query, match, subject, strict_m=True):
239 matchline = _qms_to_matches(query, match, subject, strict_m=strict_m)
240 if len(matchline) > 0:
241 return _matchline_to_cigar(matchline)
242 else:
243 return ""
244
245
246 if __name__ == '__main__':
247 parser = argparse.ArgumentParser(description='Convert Blast XML to gapped GFF3', epilog='')
248 parser.add_argument('blastxml', type=file, help='Blast XML Output')
249 parser.add_argument('--min_gap', type=int, help='Maximum gap size before generating a new match_part', default=3)
250 parser.add_argument('--trim', action='store_true', help='Trim blast hits to be only as long as the parent feature')
251 parser.add_argument('--trim_end', action='store_true', help='Cut blast results off at end of gene')
252 args = parser.parse_args()
253
254 result = blastxml2gff3(**vars(args))
255
256 GFF.write(result, sys.stdout)