Mercurial > repos > fubar > jbrowse2dev
diff jbrowse2/blastxml_to_gapped_gff3.py @ 6:88b9b105c09b draft
Uploaded
author | fubar |
---|---|
date | Fri, 05 Jan 2024 01:58:02 +0000 |
parents | cd5d63cd0eb5 |
children | 234cf4490901 |
line wrap: on
line diff
--- a/jbrowse2/blastxml_to_gapped_gff3.py Thu Jan 04 02:18:18 2024 +0000 +++ b/jbrowse2/blastxml_to_gapped_gff3.py Fri Jan 05 01:58:02 2024 +0000 @@ -6,8 +6,9 @@ import sys from BCBio import GFF + logging.basicConfig(level=logging.INFO) -log = logging.getLogger(name='blastxml2gff3') +log = logging.getLogger(name="blastxml2gff3") __doc__ = """ BlastXML files, when transformed to GFF3, do not normally show gaps in the @@ -25,41 +26,53 @@ for idx_record, record in enumerate(blast_records): # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343 match_type = { # Currently we can only handle BLASTN, BLASTP - 'BLASTN': 'nucleotide_match', - 'BLASTP': 'protein_match', - }.get(record.application, 'match') + "BLASTN": "nucleotide_match", + "BLASTP": "protein_match", + }.get(record.application, "match") recid = record.query - if ' ' in recid: - recid = recid[0:recid.index(' ')] + if " " in recid: + recid = recid[0 : recid.index(" ")] rec = SeqRecord(Seq("ACTG"), id=recid) for idx_hit, hit in enumerate(record.alignments): for idx_hsp, hsp in enumerate(hit.hsps): qualifiers = { - "ID": 'b2g.%s.%s.%s' % (idx_record, idx_hit, idx_hsp), + "ID": "b2g.%s.%s.%s" % (idx_record, idx_hit, idx_hsp), "source": "blast", "score": hsp.expect, "accession": hit.accession, "hit_id": hit.hit_id, "length": hit.length, - "hit_titles": hit.title.split(' >'), + "hit_titles": hit.title.split(" >"), } if include_seq: - qualifiers.update({ - 'blast_qseq': hsp.query, - 'blast_sseq': hsp.sbjct, - 'blast_mseq': hsp.match, - }) + qualifiers.update( + { + "blast_qseq": hsp.query, + "blast_sseq": hsp.sbjct, + "blast_mseq": hsp.match, + } + ) - for prop in ('score', 'bits', 'identities', 'positives', - 'gaps', 'align_length', 'strand', 'frame', - 'query_start', 'query_end', 'sbjct_start', - 'sbjct_end'): - qualifiers['blast_' + prop] = getattr(hsp, prop, None) + for prop in ( + "score", + "bits", + "identities", + "positives", + "gaps", + "align_length", + "strand", + "frame", + "query_start", + "query_end", + "sbjct_start", + "sbjct_end", + ): + qualifiers["blast_" + prop] = getattr(hsp, prop, None) - desc = hit.title.split(' >')[0] - qualifiers['description'] = desc[desc.index(' '):] + desc = hit.title.split(" >")[0] + qualifiers["description"] = desc[desc.index(" ") :] # This required a fair bit of sketching out/match to figure out # the first time. @@ -71,7 +84,7 @@ # may be longer than the parent feature, so we use the supplied # subject/hit length to calculate the real ending of the target # protein. - parent_match_end = hsp.query_start + hit.length + hsp.query.count('-') + parent_match_end = hsp.query_start + hit.length + hsp.query.count("-") # If we trim the left end, we need to trim without losing information. used_parent_match_start = parent_match_start @@ -87,7 +100,7 @@ top_feature = SeqFeature( SimpleLocation(used_parent_match_start, parent_match_end, strand=0), type=match_type, - qualifiers=qualifiers + qualifiers=qualifiers, ) # Unlike the parent feature, ``match_part``s have sources. @@ -95,12 +108,13 @@ "source": "blast", } top_feature.sub_features = [] - for idx_part, (start, end, cigar) in \ - enumerate(generate_parts(hsp.query, hsp.match, - hsp.sbjct, - ignore_under=min_gap)): - part_qualifiers['Gap'] = cigar - part_qualifiers['ID'] = qualifiers['ID'] + ('.%s' % idx_part) + for idx_part, (start, end, cigar) in enumerate( + generate_parts( + hsp.query, hsp.match, hsp.sbjct, ignore_under=min_gap + ) + ): + part_qualifiers["Gap"] = cigar + part_qualifiers["ID"] = qualifiers["ID"] + (".%s" % idx_part) # Otherwise, we have to account for the subject start's location match_part_start = parent_match_start + hsp.sbjct_start + start - 1 @@ -116,7 +130,8 @@ SeqFeature( SimpleLocation(match_part_start, match_part_end, strand=1), type="match_part", - qualifiers=copy.deepcopy(part_qualifiers)) + qualifiers=copy.deepcopy(part_qualifiers), + ) ) rec.features.append(top_feature) @@ -142,13 +157,13 @@ for a match_part """ prev = 0 - fq = '' - fm = '' - fs = '' - for position in re.finditer('-', query): - fq += query[prev:position.start()] - fm += match[prev:position.start()] - fs += subject[prev:position.start()] + fq = "" + fm = "" + fs = "" + for position in re.finditer("-", query): + fq += query[prev : position.start()] + fm += match[prev : position.start()] + fs += subject[prev : position.start()] prev = position.start() + 1 fq += query[prev:] fm += match[prev:] @@ -170,7 +185,7 @@ for i, (q, m, s) in enumerate(zip(query, match, subject)): # If we have a match - if m != ' ' or m == '+': + if m != " " or m == "+": if region_start == -1: region_start = i # It's a new region, we need to reset or it's pre-seeded with @@ -191,8 +206,9 @@ region_q = region_q[0:-ignore_under] region_m = region_m[0:-ignore_under] region_s = region_s[0:-ignore_under] - yield region_start, region_end + 1, \ - cigar_from_string(region_q, region_m, region_s, strict_m=True) + yield region_start, region_end + 1, cigar_from_string( + region_q, region_m, region_s, strict_m=True + ) region_q = [] region_m = [] region_s = [] @@ -201,31 +217,32 @@ region_end = -1 mismatch_count = 0 - yield region_start, region_end + 1, \ - cigar_from_string(region_q, region_m, region_s, strict_m=True) + yield region_start, region_end + 1, cigar_from_string( + region_q, region_m, region_s, strict_m=True + ) def _qms_to_matches(query, match, subject, strict_m=True): matchline = [] for (q, m, s) in zip(query, match, subject): - ret = '' + ret = "" - if m != ' ' or m == '+': - ret = '=' - elif m == ' ': - if q == '-': - ret = 'D' - elif s == '-': - ret = 'I' + if m != " " or m == "+": + ret = "=" + elif m == " ": + if q == "-": + ret = "D" + elif s == "-": + ret = "I" else: - ret = 'X' + ret = "X" else: log.warn("Bad data: \n\t%s\n\t%s\n\t%s\n" % (query, match, subject)) if strict_m: - if ret == '=' or ret == 'X': - ret = 'M' + if ret == "=" or ret == "X": + ret = "M" matchline.append(ret) return matchline @@ -243,7 +260,7 @@ count = 1 last_char = char cigar_line.append("%s%s" % (last_char, count)) - return ' '.join(cigar_line) + return " ".join(cigar_line) def cigar_from_string(query, match, subject, strict_m=True): @@ -254,13 +271,28 @@ return "" -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Convert Blast XML to gapped GFF3', epilog='') - parser.add_argument('blastxml', type=argparse.FileType("r"), help='Blast XML Output') - parser.add_argument('--min_gap', type=int, help='Maximum gap size before generating a new match_part', default=3) - parser.add_argument('--trim', action='store_true', help='Trim blast hits to be only as long as the parent feature') - parser.add_argument('--trim_end', action='store_true', help='Cut blast results off at end of gene') - parser.add_argument('--include_seq', action='store_true', help='Include sequence') +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Convert Blast XML to gapped GFF3", epilog="" + ) + parser.add_argument( + "blastxml", type=argparse.FileType("r"), help="Blast XML Output" + ) + parser.add_argument( + "--min_gap", + type=int, + help="Maximum gap size before generating a new match_part", + default=3, + ) + parser.add_argument( + "--trim", + action="store_true", + help="Trim blast hits to be only as long as the parent feature", + ) + parser.add_argument( + "--trim_end", action="store_true", help="Cut blast results off at end of gene" + ) + parser.add_argument("--include_seq", action="store_true", help="Include sequence") args = parser.parse_args() for rec in blastxml2gff3(**vars(args)):