Mercurial > repos > fubar > jbrowse2dev
comparison jbrowse2/blastxml_to_gapped_gff3.py @ 6:88b9b105c09b draft
Uploaded
author | fubar |
---|---|
date | Fri, 05 Jan 2024 01:58:02 +0000 |
parents | cd5d63cd0eb5 |
children | 234cf4490901 |
comparison
equal
deleted
inserted
replaced
5:42ca8804cd93 | 6:88b9b105c09b |
---|---|
4 import logging | 4 import logging |
5 import re | 5 import re |
6 import sys | 6 import sys |
7 | 7 |
8 from BCBio import GFF | 8 from BCBio import GFF |
9 | |
9 logging.basicConfig(level=logging.INFO) | 10 logging.basicConfig(level=logging.INFO) |
10 log = logging.getLogger(name='blastxml2gff3') | 11 log = logging.getLogger(name="blastxml2gff3") |
11 | 12 |
12 __doc__ = """ | 13 __doc__ = """ |
13 BlastXML files, when transformed to GFF3, do not normally show gaps in the | 14 BlastXML files, when transformed to GFF3, do not normally show gaps in the |
14 blast hits. This tool aims to fill that "gap". | 15 blast hits. This tool aims to fill that "gap". |
15 """ | 16 """ |
23 | 24 |
24 blast_records = NCBIXML.parse(blastxml) | 25 blast_records = NCBIXML.parse(blastxml) |
25 for idx_record, record in enumerate(blast_records): | 26 for idx_record, record in enumerate(blast_records): |
26 # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343 | 27 # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343 |
27 match_type = { # Currently we can only handle BLASTN, BLASTP | 28 match_type = { # Currently we can only handle BLASTN, BLASTP |
28 'BLASTN': 'nucleotide_match', | 29 "BLASTN": "nucleotide_match", |
29 'BLASTP': 'protein_match', | 30 "BLASTP": "protein_match", |
30 }.get(record.application, 'match') | 31 }.get(record.application, "match") |
31 | 32 |
32 recid = record.query | 33 recid = record.query |
33 if ' ' in recid: | 34 if " " in recid: |
34 recid = recid[0:recid.index(' ')] | 35 recid = recid[0 : recid.index(" ")] |
35 | 36 |
36 rec = SeqRecord(Seq("ACTG"), id=recid) | 37 rec = SeqRecord(Seq("ACTG"), id=recid) |
37 for idx_hit, hit in enumerate(record.alignments): | 38 for idx_hit, hit in enumerate(record.alignments): |
38 for idx_hsp, hsp in enumerate(hit.hsps): | 39 for idx_hsp, hsp in enumerate(hit.hsps): |
39 qualifiers = { | 40 qualifiers = { |
40 "ID": 'b2g.%s.%s.%s' % (idx_record, idx_hit, idx_hsp), | 41 "ID": "b2g.%s.%s.%s" % (idx_record, idx_hit, idx_hsp), |
41 "source": "blast", | 42 "source": "blast", |
42 "score": hsp.expect, | 43 "score": hsp.expect, |
43 "accession": hit.accession, | 44 "accession": hit.accession, |
44 "hit_id": hit.hit_id, | 45 "hit_id": hit.hit_id, |
45 "length": hit.length, | 46 "length": hit.length, |
46 "hit_titles": hit.title.split(' >'), | 47 "hit_titles": hit.title.split(" >"), |
47 } | 48 } |
48 if include_seq: | 49 if include_seq: |
49 qualifiers.update({ | 50 qualifiers.update( |
50 'blast_qseq': hsp.query, | 51 { |
51 'blast_sseq': hsp.sbjct, | 52 "blast_qseq": hsp.query, |
52 'blast_mseq': hsp.match, | 53 "blast_sseq": hsp.sbjct, |
53 }) | 54 "blast_mseq": hsp.match, |
54 | 55 } |
55 for prop in ('score', 'bits', 'identities', 'positives', | 56 ) |
56 'gaps', 'align_length', 'strand', 'frame', | 57 |
57 'query_start', 'query_end', 'sbjct_start', | 58 for prop in ( |
58 'sbjct_end'): | 59 "score", |
59 qualifiers['blast_' + prop] = getattr(hsp, prop, None) | 60 "bits", |
60 | 61 "identities", |
61 desc = hit.title.split(' >')[0] | 62 "positives", |
62 qualifiers['description'] = desc[desc.index(' '):] | 63 "gaps", |
64 "align_length", | |
65 "strand", | |
66 "frame", | |
67 "query_start", | |
68 "query_end", | |
69 "sbjct_start", | |
70 "sbjct_end", | |
71 ): | |
72 qualifiers["blast_" + prop] = getattr(hsp, prop, None) | |
73 | |
74 desc = hit.title.split(" >")[0] | |
75 qualifiers["description"] = desc[desc.index(" ") :] | |
63 | 76 |
64 # This required a fair bit of sketching out/match to figure out | 77 # This required a fair bit of sketching out/match to figure out |
65 # the first time. | 78 # the first time. |
66 # | 79 # |
67 # the match_start location must account for queries and | 80 # the match_start location must account for queries and |
69 parent_match_start = hsp.query_start - hsp.sbjct_start | 82 parent_match_start = hsp.query_start - hsp.sbjct_start |
70 # The end is the start + hit.length because the match itself | 83 # The end is the start + hit.length because the match itself |
71 # may be longer than the parent feature, so we use the supplied | 84 # may be longer than the parent feature, so we use the supplied |
72 # subject/hit length to calculate the real ending of the target | 85 # subject/hit length to calculate the real ending of the target |
73 # protein. | 86 # protein. |
74 parent_match_end = hsp.query_start + hit.length + hsp.query.count('-') | 87 parent_match_end = hsp.query_start + hit.length + hsp.query.count("-") |
75 | 88 |
76 # If we trim the left end, we need to trim without losing information. | 89 # If we trim the left end, we need to trim without losing information. |
77 used_parent_match_start = parent_match_start | 90 used_parent_match_start = parent_match_start |
78 if trim: | 91 if trim: |
79 if parent_match_start < 1: | 92 if parent_match_start < 1: |
85 | 98 |
86 # The ``match`` feature will hold one or more ``match_part``s | 99 # The ``match`` feature will hold one or more ``match_part``s |
87 top_feature = SeqFeature( | 100 top_feature = SeqFeature( |
88 SimpleLocation(used_parent_match_start, parent_match_end, strand=0), | 101 SimpleLocation(used_parent_match_start, parent_match_end, strand=0), |
89 type=match_type, | 102 type=match_type, |
90 qualifiers=qualifiers | 103 qualifiers=qualifiers, |
91 ) | 104 ) |
92 | 105 |
93 # Unlike the parent feature, ``match_part``s have sources. | 106 # Unlike the parent feature, ``match_part``s have sources. |
94 part_qualifiers = { | 107 part_qualifiers = { |
95 "source": "blast", | 108 "source": "blast", |
96 } | 109 } |
97 top_feature.sub_features = [] | 110 top_feature.sub_features = [] |
98 for idx_part, (start, end, cigar) in \ | 111 for idx_part, (start, end, cigar) in enumerate( |
99 enumerate(generate_parts(hsp.query, hsp.match, | 112 generate_parts( |
100 hsp.sbjct, | 113 hsp.query, hsp.match, hsp.sbjct, ignore_under=min_gap |
101 ignore_under=min_gap)): | 114 ) |
102 part_qualifiers['Gap'] = cigar | 115 ): |
103 part_qualifiers['ID'] = qualifiers['ID'] + ('.%s' % idx_part) | 116 part_qualifiers["Gap"] = cigar |
117 part_qualifiers["ID"] = qualifiers["ID"] + (".%s" % idx_part) | |
104 | 118 |
105 # Otherwise, we have to account for the subject start's location | 119 # Otherwise, we have to account for the subject start's location |
106 match_part_start = parent_match_start + hsp.sbjct_start + start - 1 | 120 match_part_start = parent_match_start + hsp.sbjct_start + start - 1 |
107 | 121 |
108 # We used to use hsp.align_length here, but that includes | 122 # We used to use hsp.align_length here, but that includes |
114 | 128 |
115 top_feature.sub_features.append( | 129 top_feature.sub_features.append( |
116 SeqFeature( | 130 SeqFeature( |
117 SimpleLocation(match_part_start, match_part_end, strand=1), | 131 SimpleLocation(match_part_start, match_part_end, strand=1), |
118 type="match_part", | 132 type="match_part", |
119 qualifiers=copy.deepcopy(part_qualifiers)) | 133 qualifiers=copy.deepcopy(part_qualifiers), |
134 ) | |
120 ) | 135 ) |
121 | 136 |
122 rec.features.append(top_feature) | 137 rec.features.append(top_feature) |
123 rec.annotations = {} | 138 rec.annotations = {} |
124 yield rec | 139 yield rec |
140 | 155 |
141 which greatly simplifies the process of identifying the correct location | 156 which greatly simplifies the process of identifying the correct location |
142 for a match_part | 157 for a match_part |
143 """ | 158 """ |
144 prev = 0 | 159 prev = 0 |
145 fq = '' | 160 fq = "" |
146 fm = '' | 161 fm = "" |
147 fs = '' | 162 fs = "" |
148 for position in re.finditer('-', query): | 163 for position in re.finditer("-", query): |
149 fq += query[prev:position.start()] | 164 fq += query[prev : position.start()] |
150 fm += match[prev:position.start()] | 165 fm += match[prev : position.start()] |
151 fs += subject[prev:position.start()] | 166 fs += subject[prev : position.start()] |
152 prev = position.start() + 1 | 167 prev = position.start() + 1 |
153 fq += query[prev:] | 168 fq += query[prev:] |
154 fm += match[prev:] | 169 fm += match[prev:] |
155 fs += subject[prev:] | 170 fs += subject[prev:] |
156 | 171 |
168 region_end = -1 | 183 region_end = -1 |
169 mismatch_count = 0 | 184 mismatch_count = 0 |
170 for i, (q, m, s) in enumerate(zip(query, match, subject)): | 185 for i, (q, m, s) in enumerate(zip(query, match, subject)): |
171 | 186 |
172 # If we have a match | 187 # If we have a match |
173 if m != ' ' or m == '+': | 188 if m != " " or m == "+": |
174 if region_start == -1: | 189 if region_start == -1: |
175 region_start = i | 190 region_start = i |
176 # It's a new region, we need to reset or it's pre-seeded with | 191 # It's a new region, we need to reset or it's pre-seeded with |
177 # spaces | 192 # spaces |
178 region_q = [] | 193 region_q = [] |
189 | 204 |
190 if mismatch_count >= ignore_under and region_start != -1 and region_end != -1: | 205 if mismatch_count >= ignore_under and region_start != -1 and region_end != -1: |
191 region_q = region_q[0:-ignore_under] | 206 region_q = region_q[0:-ignore_under] |
192 region_m = region_m[0:-ignore_under] | 207 region_m = region_m[0:-ignore_under] |
193 region_s = region_s[0:-ignore_under] | 208 region_s = region_s[0:-ignore_under] |
194 yield region_start, region_end + 1, \ | 209 yield region_start, region_end + 1, cigar_from_string( |
195 cigar_from_string(region_q, region_m, region_s, strict_m=True) | 210 region_q, region_m, region_s, strict_m=True |
211 ) | |
196 region_q = [] | 212 region_q = [] |
197 region_m = [] | 213 region_m = [] |
198 region_s = [] | 214 region_s = [] |
199 | 215 |
200 region_start = -1 | 216 region_start = -1 |
201 region_end = -1 | 217 region_end = -1 |
202 mismatch_count = 0 | 218 mismatch_count = 0 |
203 | 219 |
204 yield region_start, region_end + 1, \ | 220 yield region_start, region_end + 1, cigar_from_string( |
205 cigar_from_string(region_q, region_m, region_s, strict_m=True) | 221 region_q, region_m, region_s, strict_m=True |
222 ) | |
206 | 223 |
207 | 224 |
208 def _qms_to_matches(query, match, subject, strict_m=True): | 225 def _qms_to_matches(query, match, subject, strict_m=True): |
209 matchline = [] | 226 matchline = [] |
210 | 227 |
211 for (q, m, s) in zip(query, match, subject): | 228 for (q, m, s) in zip(query, match, subject): |
212 ret = '' | 229 ret = "" |
213 | 230 |
214 if m != ' ' or m == '+': | 231 if m != " " or m == "+": |
215 ret = '=' | 232 ret = "=" |
216 elif m == ' ': | 233 elif m == " ": |
217 if q == '-': | 234 if q == "-": |
218 ret = 'D' | 235 ret = "D" |
219 elif s == '-': | 236 elif s == "-": |
220 ret = 'I' | 237 ret = "I" |
221 else: | 238 else: |
222 ret = 'X' | 239 ret = "X" |
223 else: | 240 else: |
224 log.warn("Bad data: \n\t%s\n\t%s\n\t%s\n" % (query, match, subject)) | 241 log.warn("Bad data: \n\t%s\n\t%s\n\t%s\n" % (query, match, subject)) |
225 | 242 |
226 if strict_m: | 243 if strict_m: |
227 if ret == '=' or ret == 'X': | 244 if ret == "=" or ret == "X": |
228 ret = 'M' | 245 ret = "M" |
229 | 246 |
230 matchline.append(ret) | 247 matchline.append(ret) |
231 return matchline | 248 return matchline |
232 | 249 |
233 | 250 |
241 else: | 258 else: |
242 cigar_line.append("%s%s" % (last_char, count)) | 259 cigar_line.append("%s%s" % (last_char, count)) |
243 count = 1 | 260 count = 1 |
244 last_char = char | 261 last_char = char |
245 cigar_line.append("%s%s" % (last_char, count)) | 262 cigar_line.append("%s%s" % (last_char, count)) |
246 return ' '.join(cigar_line) | 263 return " ".join(cigar_line) |
247 | 264 |
248 | 265 |
def cigar_from_string(query, match, subject, strict_m=True):
    """Build a GFF3 ``Gap``-style CIGAR string from three alignment lines.

    Delegates per-column classification to ``_qms_to_matches`` and
    run-length encoding to ``_matchline_to_cigar``; returns an empty
    string for an empty alignment.
    """
    ops = _qms_to_matches(query, match, subject, strict_m=strict_m)
    if not ops:
        return ""
    return _matchline_to_cigar(ops)
255 | 272 |
256 | 273 |
if __name__ == "__main__":
    # Command-line entry point: parse options, run the converter, and
    # stream every record that produced features to stdout as GFF3.
    parser = argparse.ArgumentParser(
        description="Convert Blast XML to gapped GFF3", epilog=""
    )
    parser.add_argument(
        "blastxml", type=argparse.FileType("r"), help="Blast XML Output"
    )
    parser.add_argument(
        "--min_gap",
        type=int,
        default=3,
        help="Maximum gap size before generating a new match_part",
    )
    parser.add_argument(
        "--trim",
        action="store_true",
        help="Trim blast hits to be only as long as the parent feature",
    )
    parser.add_argument(
        "--trim_end", action="store_true", help="Cut blast results off at end of gene"
    )
    parser.add_argument("--include_seq", action="store_true", help="Include sequence")
    args = parser.parse_args()

    for record in blastxml2gff3(**vars(args)):
        # Skip records with no hits so the output contains no empty entries.
        if record.features:
            GFF.write([record], sys.stdout)