comparison jbrowse2/blastxml_to_gapped_gff3.py @ 6:88b9b105c09b draft

Uploaded
author fubar
date Fri, 05 Jan 2024 01:58:02 +0000
parents cd5d63cd0eb5
children 234cf4490901
comparison
equal deleted inserted replaced
5:42ca8804cd93 6:88b9b105c09b
4 import logging 4 import logging
5 import re 5 import re
6 import sys 6 import sys
7 7
8 from BCBio import GFF 8 from BCBio import GFF
9
9 logging.basicConfig(level=logging.INFO) 10 logging.basicConfig(level=logging.INFO)
10 log = logging.getLogger(name='blastxml2gff3') 11 log = logging.getLogger(name="blastxml2gff3")
11 12
12 __doc__ = """ 13 __doc__ = """
13 BlastXML files, when transformed to GFF3, do not normally show gaps in the 14 BlastXML files, when transformed to GFF3, do not normally show gaps in the
14 blast hits. This tool aims to fill that "gap". 15 blast hits. This tool aims to fill that "gap".
15 """ 16 """
23 24
24 blast_records = NCBIXML.parse(blastxml) 25 blast_records = NCBIXML.parse(blastxml)
25 for idx_record, record in enumerate(blast_records): 26 for idx_record, record in enumerate(blast_records):
26 # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343 27 # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
27 match_type = { # Currently we can only handle BLASTN, BLASTP 28 match_type = { # Currently we can only handle BLASTN, BLASTP
28 'BLASTN': 'nucleotide_match', 29 "BLASTN": "nucleotide_match",
29 'BLASTP': 'protein_match', 30 "BLASTP": "protein_match",
30 }.get(record.application, 'match') 31 }.get(record.application, "match")
31 32
32 recid = record.query 33 recid = record.query
33 if ' ' in recid: 34 if " " in recid:
34 recid = recid[0:recid.index(' ')] 35 recid = recid[0 : recid.index(" ")]
35 36
36 rec = SeqRecord(Seq("ACTG"), id=recid) 37 rec = SeqRecord(Seq("ACTG"), id=recid)
37 for idx_hit, hit in enumerate(record.alignments): 38 for idx_hit, hit in enumerate(record.alignments):
38 for idx_hsp, hsp in enumerate(hit.hsps): 39 for idx_hsp, hsp in enumerate(hit.hsps):
39 qualifiers = { 40 qualifiers = {
40 "ID": 'b2g.%s.%s.%s' % (idx_record, idx_hit, idx_hsp), 41 "ID": "b2g.%s.%s.%s" % (idx_record, idx_hit, idx_hsp),
41 "source": "blast", 42 "source": "blast",
42 "score": hsp.expect, 43 "score": hsp.expect,
43 "accession": hit.accession, 44 "accession": hit.accession,
44 "hit_id": hit.hit_id, 45 "hit_id": hit.hit_id,
45 "length": hit.length, 46 "length": hit.length,
46 "hit_titles": hit.title.split(' >'), 47 "hit_titles": hit.title.split(" >"),
47 } 48 }
48 if include_seq: 49 if include_seq:
49 qualifiers.update({ 50 qualifiers.update(
50 'blast_qseq': hsp.query, 51 {
51 'blast_sseq': hsp.sbjct, 52 "blast_qseq": hsp.query,
52 'blast_mseq': hsp.match, 53 "blast_sseq": hsp.sbjct,
53 }) 54 "blast_mseq": hsp.match,
54 55 }
55 for prop in ('score', 'bits', 'identities', 'positives', 56 )
56 'gaps', 'align_length', 'strand', 'frame', 57
57 'query_start', 'query_end', 'sbjct_start', 58 for prop in (
58 'sbjct_end'): 59 "score",
59 qualifiers['blast_' + prop] = getattr(hsp, prop, None) 60 "bits",
60 61 "identities",
61 desc = hit.title.split(' >')[0] 62 "positives",
62 qualifiers['description'] = desc[desc.index(' '):] 63 "gaps",
64 "align_length",
65 "strand",
66 "frame",
67 "query_start",
68 "query_end",
69 "sbjct_start",
70 "sbjct_end",
71 ):
72 qualifiers["blast_" + prop] = getattr(hsp, prop, None)
73
74 desc = hit.title.split(" >")[0]
75 qualifiers["description"] = desc[desc.index(" ") :]
63 76
64 # This required a fair bit of sketching out/match to figure out 77 # This required a fair bit of sketching out/match to figure out
65 # the first time. 78 # the first time.
66 # 79 #
67 # the match_start location must account for queries and 80 # the match_start location must account for queries and
69 parent_match_start = hsp.query_start - hsp.sbjct_start 82 parent_match_start = hsp.query_start - hsp.sbjct_start
70 # The end is the start + hit.length because the match itself 83 # The end is the start + hit.length because the match itself
71 # may be longer than the parent feature, so we use the supplied 84 # may be longer than the parent feature, so we use the supplied
72 # subject/hit length to calculate the real ending of the target 85 # subject/hit length to calculate the real ending of the target
73 # protein. 86 # protein.
74 parent_match_end = hsp.query_start + hit.length + hsp.query.count('-') 87 parent_match_end = hsp.query_start + hit.length + hsp.query.count("-")
75 88
76 # If we trim the left end, we need to trim without losing information. 89 # If we trim the left end, we need to trim without losing information.
77 used_parent_match_start = parent_match_start 90 used_parent_match_start = parent_match_start
78 if trim: 91 if trim:
79 if parent_match_start < 1: 92 if parent_match_start < 1:
85 98
86 # The ``match`` feature will hold one or more ``match_part``s 99 # The ``match`` feature will hold one or more ``match_part``s
87 top_feature = SeqFeature( 100 top_feature = SeqFeature(
88 SimpleLocation(used_parent_match_start, parent_match_end, strand=0), 101 SimpleLocation(used_parent_match_start, parent_match_end, strand=0),
89 type=match_type, 102 type=match_type,
90 qualifiers=qualifiers 103 qualifiers=qualifiers,
91 ) 104 )
92 105
93 # Unlike the parent feature, ``match_part``s have sources. 106 # Unlike the parent feature, ``match_part``s have sources.
94 part_qualifiers = { 107 part_qualifiers = {
95 "source": "blast", 108 "source": "blast",
96 } 109 }
97 top_feature.sub_features = [] 110 top_feature.sub_features = []
98 for idx_part, (start, end, cigar) in \ 111 for idx_part, (start, end, cigar) in enumerate(
99 enumerate(generate_parts(hsp.query, hsp.match, 112 generate_parts(
100 hsp.sbjct, 113 hsp.query, hsp.match, hsp.sbjct, ignore_under=min_gap
101 ignore_under=min_gap)): 114 )
102 part_qualifiers['Gap'] = cigar 115 ):
103 part_qualifiers['ID'] = qualifiers['ID'] + ('.%s' % idx_part) 116 part_qualifiers["Gap"] = cigar
117 part_qualifiers["ID"] = qualifiers["ID"] + (".%s" % idx_part)
104 118
105 # Otherwise, we have to account for the subject start's location 119 # Otherwise, we have to account for the subject start's location
106 match_part_start = parent_match_start + hsp.sbjct_start + start - 1 120 match_part_start = parent_match_start + hsp.sbjct_start + start - 1
107 121
108 # We used to use hsp.align_length here, but that includes 122 # We used to use hsp.align_length here, but that includes
114 128
115 top_feature.sub_features.append( 129 top_feature.sub_features.append(
116 SeqFeature( 130 SeqFeature(
117 SimpleLocation(match_part_start, match_part_end, strand=1), 131 SimpleLocation(match_part_start, match_part_end, strand=1),
118 type="match_part", 132 type="match_part",
119 qualifiers=copy.deepcopy(part_qualifiers)) 133 qualifiers=copy.deepcopy(part_qualifiers),
134 )
120 ) 135 )
121 136
122 rec.features.append(top_feature) 137 rec.features.append(top_feature)
123 rec.annotations = {} 138 rec.annotations = {}
124 yield rec 139 yield rec
140 155
141 which greatly simplifies the process of identifying the correct location 156 which greatly simplifies the process of identifying the correct location
142 for a match_part 157 for a match_part
143 """ 158 """
144 prev = 0 159 prev = 0
145 fq = '' 160 fq = ""
146 fm = '' 161 fm = ""
147 fs = '' 162 fs = ""
148 for position in re.finditer('-', query): 163 for position in re.finditer("-", query):
149 fq += query[prev:position.start()] 164 fq += query[prev : position.start()]
150 fm += match[prev:position.start()] 165 fm += match[prev : position.start()]
151 fs += subject[prev:position.start()] 166 fs += subject[prev : position.start()]
152 prev = position.start() + 1 167 prev = position.start() + 1
153 fq += query[prev:] 168 fq += query[prev:]
154 fm += match[prev:] 169 fm += match[prev:]
155 fs += subject[prev:] 170 fs += subject[prev:]
156 171
168 region_end = -1 183 region_end = -1
169 mismatch_count = 0 184 mismatch_count = 0
170 for i, (q, m, s) in enumerate(zip(query, match, subject)): 185 for i, (q, m, s) in enumerate(zip(query, match, subject)):
171 186
172 # If we have a match 187 # If we have a match
173 if m != ' ' or m == '+': 188 if m != " " or m == "+":
174 if region_start == -1: 189 if region_start == -1:
175 region_start = i 190 region_start = i
176 # It's a new region, we need to reset or it's pre-seeded with 191 # It's a new region, we need to reset or it's pre-seeded with
177 # spaces 192 # spaces
178 region_q = [] 193 region_q = []
189 204
190 if mismatch_count >= ignore_under and region_start != -1 and region_end != -1: 205 if mismatch_count >= ignore_under and region_start != -1 and region_end != -1:
191 region_q = region_q[0:-ignore_under] 206 region_q = region_q[0:-ignore_under]
192 region_m = region_m[0:-ignore_under] 207 region_m = region_m[0:-ignore_under]
193 region_s = region_s[0:-ignore_under] 208 region_s = region_s[0:-ignore_under]
194 yield region_start, region_end + 1, \ 209 yield region_start, region_end + 1, cigar_from_string(
195 cigar_from_string(region_q, region_m, region_s, strict_m=True) 210 region_q, region_m, region_s, strict_m=True
211 )
196 region_q = [] 212 region_q = []
197 region_m = [] 213 region_m = []
198 region_s = [] 214 region_s = []
199 215
200 region_start = -1 216 region_start = -1
201 region_end = -1 217 region_end = -1
202 mismatch_count = 0 218 mismatch_count = 0
203 219
204 yield region_start, region_end + 1, \ 220 yield region_start, region_end + 1, cigar_from_string(
205 cigar_from_string(region_q, region_m, region_s, strict_m=True) 221 region_q, region_m, region_s, strict_m=True
222 )
206 223
207 224
208 def _qms_to_matches(query, match, subject, strict_m=True): 225 def _qms_to_matches(query, match, subject, strict_m=True):
209 matchline = [] 226 matchline = []
210 227
211 for (q, m, s) in zip(query, match, subject): 228 for (q, m, s) in zip(query, match, subject):
212 ret = '' 229 ret = ""
213 230
214 if m != ' ' or m == '+': 231 if m != " " or m == "+":
215 ret = '=' 232 ret = "="
216 elif m == ' ': 233 elif m == " ":
217 if q == '-': 234 if q == "-":
218 ret = 'D' 235 ret = "D"
219 elif s == '-': 236 elif s == "-":
220 ret = 'I' 237 ret = "I"
221 else: 238 else:
222 ret = 'X' 239 ret = "X"
223 else: 240 else:
224 log.warn("Bad data: \n\t%s\n\t%s\n\t%s\n" % (query, match, subject)) 241 log.warn("Bad data: \n\t%s\n\t%s\n\t%s\n" % (query, match, subject))
225 242
226 if strict_m: 243 if strict_m:
227 if ret == '=' or ret == 'X': 244 if ret == "=" or ret == "X":
228 ret = 'M' 245 ret = "M"
229 246
230 matchline.append(ret) 247 matchline.append(ret)
231 return matchline 248 return matchline
232 249
233 250
241 else: 258 else:
242 cigar_line.append("%s%s" % (last_char, count)) 259 cigar_line.append("%s%s" % (last_char, count))
243 count = 1 260 count = 1
244 last_char = char 261 last_char = char
245 cigar_line.append("%s%s" % (last_char, count)) 262 cigar_line.append("%s%s" % (last_char, count))
246 return ' '.join(cigar_line) 263 return " ".join(cigar_line)
247 264
248 265
249 def cigar_from_string(query, match, subject, strict_m=True): 266 def cigar_from_string(query, match, subject, strict_m=True):
250 matchline = _qms_to_matches(query, match, subject, strict_m=strict_m) 267 matchline = _qms_to_matches(query, match, subject, strict_m=strict_m)
251 if len(matchline) > 0: 268 if len(matchline) > 0:
252 return _matchline_to_cigar(matchline) 269 return _matchline_to_cigar(matchline)
253 else: 270 else:
254 return "" 271 return ""
255 272
256 273
257 if __name__ == '__main__': 274 if __name__ == "__main__":
258 parser = argparse.ArgumentParser(description='Convert Blast XML to gapped GFF3', epilog='') 275 parser = argparse.ArgumentParser(
259 parser.add_argument('blastxml', type=argparse.FileType("r"), help='Blast XML Output') 276 description="Convert Blast XML to gapped GFF3", epilog=""
260 parser.add_argument('--min_gap', type=int, help='Maximum gap size before generating a new match_part', default=3) 277 )
261 parser.add_argument('--trim', action='store_true', help='Trim blast hits to be only as long as the parent feature') 278 parser.add_argument(
262 parser.add_argument('--trim_end', action='store_true', help='Cut blast results off at end of gene') 279 "blastxml", type=argparse.FileType("r"), help="Blast XML Output"
263 parser.add_argument('--include_seq', action='store_true', help='Include sequence') 280 )
281 parser.add_argument(
282 "--min_gap",
283 type=int,
284 help="Maximum gap size before generating a new match_part",
285 default=3,
286 )
287 parser.add_argument(
288 "--trim",
289 action="store_true",
290 help="Trim blast hits to be only as long as the parent feature",
291 )
292 parser.add_argument(
293 "--trim_end", action="store_true", help="Cut blast results off at end of gene"
294 )
295 parser.add_argument("--include_seq", action="store_true", help="Include sequence")
264 args = parser.parse_args() 296 args = parser.parse_args()
265 297
266 for rec in blastxml2gff3(**vars(args)): 298 for rec in blastxml2gff3(**vars(args)):
267 if len(rec.features): 299 if len(rec.features):
268 GFF.write([rec], sys.stdout) 300 GFF.write([rec], sys.stdout)