Mercurial > repos > fubar > jbrowse2
comparison blastxml_to_gapped_gff3.py @ 0:d78175596286 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/jbrowse2 commit cd77dffaad652cfb75b98bde5231beaa6d63cd5b-dirty
author | fubar |
---|---|
date | Mon, 08 Jan 2024 09:20:33 +0000 |
parents | |
children | 4c201a3d4755 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d78175596286 |
---|---|
1 #!/usr/bin/env python | |
2 import argparse | |
3 import copy | |
4 import logging | |
5 import re | |
6 import sys | |
7 | |
8 from BCBio import GFF | |
9 | |
10 logging.basicConfig(level=logging.INFO) | |
11 log = logging.getLogger(name="blastxml2gff3") | |
12 | |
13 __doc__ = """ | |
14 BlastXML files, when transformed to GFF3, do not normally show gaps in the | |
15 blast hits. This tool aims to fill that "gap". | |
16 """ | |
17 | |
18 | |
19 def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False, include_seq=False): | |
20 from Bio.Blast import NCBIXML | |
21 from Bio.Seq import Seq | |
22 from Bio.SeqRecord import SeqRecord | |
23 from Bio.SeqFeature import SeqFeature, SimpleLocation | |
24 | |
25 blast_records = NCBIXML.parse(blastxml) | |
26 for idx_record, record in enumerate(blast_records): | |
27 # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343 | |
28 match_type = { # Currently we can only handle BLASTN, BLASTP | |
29 "BLASTN": "nucleotide_match", | |
30 "BLASTP": "protein_match", | |
31 }.get(record.application, "match") | |
32 | |
33 recid = record.query | |
34 if " " in recid: | |
35 recid = recid[0: recid.index(" ")] | |
36 | |
37 rec = SeqRecord(Seq("ACTG"), id=recid) | |
38 for idx_hit, hit in enumerate(record.alignments): | |
39 for idx_hsp, hsp in enumerate(hit.hsps): | |
40 qualifiers = { | |
41 "ID": "b2g.%s.%s.%s" % (idx_record, idx_hit, idx_hsp), | |
42 "source": "blast", | |
43 "score": hsp.expect, | |
44 "accession": hit.accession, | |
45 "hit_id": hit.hit_id, | |
46 "length": hit.length, | |
47 "hit_titles": hit.title.split(" >"), | |
48 } | |
49 if include_seq: | |
50 qualifiers.update( | |
51 { | |
52 "blast_qseq": hsp.query, | |
53 "blast_sseq": hsp.sbjct, | |
54 "blast_mseq": hsp.match, | |
55 } | |
56 ) | |
57 | |
58 for prop in ( | |
59 "score", | |
60 "bits", | |
61 "identities", | |
62 "positives", | |
63 "gaps", | |
64 "align_length", | |
65 "strand", | |
66 "frame", | |
67 "query_start", | |
68 "query_end", | |
69 "sbjct_start", | |
70 "sbjct_end", | |
71 ): | |
72 qualifiers["blast_" + prop] = getattr(hsp, prop, None) | |
73 | |
74 desc = hit.title.split(" >")[0] | |
75 qualifiers["description"] = desc[desc.index(" "):] | |
76 | |
77 # This required a fair bit of sketching out/match to figure out | |
78 # the first time. | |
79 # | |
80 # the match_start location must account for queries and | |
81 # subjecst that start at locations other than 1 | |
82 parent_match_start = hsp.query_start - hsp.sbjct_start | |
83 # The end is the start + hit.length because the match itself | |
84 # may be longer than the parent feature, so we use the supplied | |
85 # subject/hit length to calculate the real ending of the target | |
86 # protein. | |
87 parent_match_end = hsp.query_start + hit.length + hsp.query.count("-") | |
88 | |
89 # If we trim the left end, we need to trim without losing information. | |
90 used_parent_match_start = parent_match_start | |
91 if trim: | |
92 if parent_match_start < 1: | |
93 used_parent_match_start = 0 | |
94 | |
95 if trim or trim_end: | |
96 if parent_match_end > hsp.query_end: | |
97 parent_match_end = hsp.query_end + 1 | |
98 | |
99 # The ``match`` feature will hold one or more ``match_part``s | |
100 top_feature = SeqFeature( | |
101 SimpleLocation(used_parent_match_start, parent_match_end, strand=0), | |
102 type=match_type, | |
103 qualifiers=qualifiers, | |
104 ) | |
105 | |
106 # Unlike the parent feature, ``match_part``s have sources. | |
107 part_qualifiers = { | |
108 "source": "blast", | |
109 } | |
110 top_feature.sub_features = [] | |
111 for idx_part, (start, end, cigar) in enumerate( | |
112 generate_parts( | |
113 hsp.query, hsp.match, hsp.sbjct, ignore_under=min_gap | |
114 ) | |
115 ): | |
116 part_qualifiers["Gap"] = cigar | |
117 part_qualifiers["ID"] = qualifiers["ID"] + (".%s" % idx_part) | |
118 | |
119 # Otherwise, we have to account for the subject start's location | |
120 match_part_start = parent_match_start + hsp.sbjct_start + start - 1 | |
121 | |
122 # We used to use hsp.align_length here, but that includes | |
123 # gaps in the parent sequence | |
124 # | |
125 # Furthermore align_length will give calculation errors in weird places | |
126 # So we just use (end-start) for simplicity | |
127 match_part_end = match_part_start + (end - start) | |
128 | |
129 top_feature.sub_features.append( | |
130 SeqFeature( | |
131 SimpleLocation(match_part_start, match_part_end, strand=1), | |
132 type="match_part", | |
133 qualifiers=copy.deepcopy(part_qualifiers), | |
134 ) | |
135 ) | |
136 | |
137 rec.features.append(top_feature) | |
138 rec.annotations = {} | |
139 yield rec | |
140 | |
141 | |
142 def __remove_query_gaps(query, match, subject): | |
143 """remove positions in all three based on gaps in query | |
144 | |
145 In order to simplify math and calculations...we remove all of the gaps | |
146 based on gap locations in the query sequence:: | |
147 | |
148 Q:ACTG-ACTGACTG | |
149 S:ACTGAAC---CTG | |
150 | |
151 will become:: | |
152 | |
153 Q:ACTGACTGACTG | |
154 S:ACTGAC---CTG | |
155 | |
156 which greatly simplifies the process of identifying the correct location | |
157 for a match_part | |
158 """ | |
159 prev = 0 | |
160 fq = "" | |
161 fm = "" | |
162 fs = "" | |
163 for position in re.finditer("-", query): | |
164 fq += query[prev: position.start()] | |
165 fm += match[prev: position.start()] | |
166 fs += subject[prev: position.start()] | |
167 prev = position.start() + 1 | |
168 fq += query[prev:] | |
169 fm += match[prev:] | |
170 fs += subject[prev:] | |
171 | |
172 return (fq, fm, fs) | |
173 | |
174 | |
175 def generate_parts(query, match, subject, ignore_under=3): | |
176 region_q = [] | |
177 region_m = [] | |
178 region_s = [] | |
179 | |
180 (query, match, subject) = __remove_query_gaps(query, match, subject) | |
181 | |
182 region_start = -1 | |
183 region_end = -1 | |
184 mismatch_count = 0 | |
185 for i, (q, m, s) in enumerate(zip(query, match, subject)): | |
186 | |
187 # If we have a match | |
188 if m != " " or m == "+": | |
189 if region_start == -1: | |
190 region_start = i | |
191 # It's a new region, we need to reset or it's pre-seeded with | |
192 # spaces | |
193 region_q = [] | |
194 region_m = [] | |
195 region_s = [] | |
196 region_end = i | |
197 mismatch_count = 0 | |
198 else: | |
199 mismatch_count += 1 | |
200 | |
201 region_q.append(q) | |
202 region_m.append(m) | |
203 region_s.append(s) | |
204 | |
205 if mismatch_count >= ignore_under and region_start != -1 and region_end != -1: | |
206 region_q = region_q[0:-ignore_under] | |
207 region_m = region_m[0:-ignore_under] | |
208 region_s = region_s[0:-ignore_under] | |
209 yield region_start, region_end + 1, cigar_from_string( | |
210 region_q, region_m, region_s, strict_m=True | |
211 ) | |
212 region_q = [] | |
213 region_m = [] | |
214 region_s = [] | |
215 | |
216 region_start = -1 | |
217 region_end = -1 | |
218 mismatch_count = 0 | |
219 | |
220 yield region_start, region_end + 1, cigar_from_string( | |
221 region_q, region_m, region_s, strict_m=True | |
222 ) | |
223 | |
224 | |
225 def _qms_to_matches(query, match, subject, strict_m=True): | |
226 matchline = [] | |
227 | |
228 for (q, m, s) in zip(query, match, subject): | |
229 ret = "" | |
230 | |
231 if m != " " or m == "+": | |
232 ret = "=" | |
233 elif m == " ": | |
234 if q == "-": | |
235 ret = "D" | |
236 elif s == "-": | |
237 ret = "I" | |
238 else: | |
239 ret = "X" | |
240 else: | |
241 log.warn("Bad data: \n\t%s\n\t%s\n\t%s\n" % (query, match, subject)) | |
242 | |
243 if strict_m: | |
244 if ret == "=" or ret == "X": | |
245 ret = "M" | |
246 | |
247 matchline.append(ret) | |
248 return matchline | |
249 | |
250 | |
251 def _matchline_to_cigar(matchline): | |
252 cigar_line = [] | |
253 last_char = matchline[0] | |
254 count = 0 | |
255 for char in matchline: | |
256 if char == last_char: | |
257 count += 1 | |
258 else: | |
259 cigar_line.append("%s%s" % (last_char, count)) | |
260 count = 1 | |
261 last_char = char | |
262 cigar_line.append("%s%s" % (last_char, count)) | |
263 return " ".join(cigar_line) | |
264 | |
265 | |
266 def cigar_from_string(query, match, subject, strict_m=True): | |
267 matchline = _qms_to_matches(query, match, subject, strict_m=strict_m) | |
268 if len(matchline) > 0: | |
269 return _matchline_to_cigar(matchline) | |
270 else: | |
271 return "" | |
272 | |
273 | |
274 if __name__ == "__main__": | |
275 parser = argparse.ArgumentParser( | |
276 description="Convert Blast XML to gapped GFF3", epilog="" | |
277 ) | |
278 parser.add_argument( | |
279 "blastxml", type=argparse.FileType("r"), help="Blast XML Output" | |
280 ) | |
281 parser.add_argument( | |
282 "--min_gap", | |
283 type=int, | |
284 help="Maximum gap size before generating a new match_part", | |
285 default=3, | |
286 ) | |
287 parser.add_argument( | |
288 "--trim", | |
289 action="store_true", | |
290 help="Trim blast hits to be only as long as the parent feature", | |
291 ) | |
292 parser.add_argument( | |
293 "--trim_end", action="store_true", help="Cut blast results off at end of gene" | |
294 ) | |
295 parser.add_argument("--include_seq", action="store_true", help="Include sequence") | |
296 args = parser.parse_args() | |
297 | |
298 for rec in blastxml2gff3(**vars(args)): | |
299 if len(rec.features): | |
300 GFF.write([rec], sys.stdout) |