jbrowse2dev: jbrowse2/blastxml_to_gapped

comparison jbrowse2/blastxml_to_gapped_gff3.py @ 6:88b9b105c09b draft

Uploaded

author	fubar
date	Fri, 05 Jan 2024 01:58:02 +0000
parents	cd5d63cd0eb5
children	234cf4490901

comparison

equal deleted inserted replaced

-:42ca8804cd93
+:88b9b105c09b
 import logging
 import re
 import sys
 from BCBio import GFF
 logging.basicConfig(level=logging.INFO)
-log = logging.getLogger(name='blastxml2gff3')
+log = logging.getLogger(name="blastxml2gff3")
 __doc__ = """
 BlastXML files, when transformed to GFF3, do not normally show gaps in the
 blast hits. This tool aims to fill that "gap".
 """
 blast_records = NCBIXML.parse(blastxml)
 for idx_record, record in enumerate(blast_records):
 # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
 match_type = {  # Currently we can only handle BLASTN, BLASTP
-'BLASTN': 'nucleotide_match',
+"BLASTN": "nucleotide_match",
-'BLASTP': 'protein_match',
+"BLASTP": "protein_match",
-}.get(record.application, 'match')
+}.get(record.application, "match")
 recid = record.query
-if ' ' in recid:
+if " " in recid:
-recid = recid[0:recid.index(' ')]
+recid = recid[0 : recid.index(" ")]
 rec = SeqRecord(Seq("ACTG"), id=recid)
 for idx_hit, hit in enumerate(record.alignments):
 for idx_hsp, hsp in enumerate(hit.hsps):
 qualifiers = {
-"ID": 'b2g.%s.%s.%s' % (idx_record, idx_hit, idx_hsp),
+"ID": "b2g.%s.%s.%s" % (idx_record, idx_hit, idx_hsp),
 "source": "blast",
 "score": hsp.expect,
 "accession": hit.accession,
 "hit_id": hit.hit_id,
 "length": hit.length,
-"hit_titles": hit.title.split(' >'),
+"hit_titles": hit.title.split(" >"),
 }
 if include_seq:
-qualifiers.update({
+qualifiers.update(
-'blast_qseq': hsp.query,
+{
-'blast_sseq': hsp.sbjct,
+"blast_qseq": hsp.query,
-'blast_mseq': hsp.match,
+"blast_sseq": hsp.sbjct,
-})
+"blast_mseq": hsp.match,
+}
-for prop in ('score', 'bits', 'identities', 'positives',
+)
-'gaps', 'align_length', 'strand', 'frame',
-'query_start', 'query_end', 'sbjct_start',
+for prop in (
-'sbjct_end'):
+"score",
-qualifiers['blast_' + prop] = getattr(hsp, prop, None)
+"bits",
+"identities",
-desc = hit.title.split(' >')[0]
+"positives",
-qualifiers['description'] = desc[desc.index(' '):]
+"gaps",
+"align_length",
+"strand",
+"frame",
+"query_start",
+"query_end",
+"sbjct_start",
+"sbjct_end",
+):
+qualifiers["blast_" + prop] = getattr(hsp, prop, None)
+desc = hit.title.split(" >")[0]
+qualifiers["description"] = desc[desc.index(" ") :]
 # This required a fair bit of sketching out/match to figure out
 # the first time.
 #
 # the match_start location must account for queries and
 parent_match_start = hsp.query_start - hsp.sbjct_start
 # The end is the start + hit.length because the match itself
 # may be longer than the parent feature, so we use the supplied
 # subject/hit length to calculate the real ending of the target
 # protein.
-parent_match_end = hsp.query_start + hit.length + hsp.query.count('-')
+parent_match_end = hsp.query_start + hit.length + hsp.query.count("-")
 # If we trim the left end, we need to trim without losing information.
 used_parent_match_start = parent_match_start
 if trim:
 if parent_match_start < 1:
 # The ``match`` feature will hold one or more ``match_part``s
 top_feature = SeqFeature(
 SimpleLocation(used_parent_match_start, parent_match_end, strand=0),
 type=match_type,
-qualifiers=qualifiers
+qualifiers=qualifiers,
 )
 # Unlike the parent feature, ``match_part``s have sources.
 part_qualifiers = {
 "source": "blast",
 }
 top_feature.sub_features = []
-for idx_part, (start, end, cigar) in \
+for idx_part, (start, end, cigar) in enumerate(
-enumerate(generate_parts(hsp.query, hsp.match,
+generate_parts(
-hsp.sbjct,
+hsp.query, hsp.match, hsp.sbjct, ignore_under=min_gap
-ignore_under=min_gap)):
+)
-part_qualifiers['Gap'] = cigar
+):
-part_qualifiers['ID'] = qualifiers['ID'] + ('.%s' % idx_part)
+part_qualifiers["Gap"] = cigar
+part_qualifiers["ID"] = qualifiers["ID"] + (".%s" % idx_part)
 # Otherwise, we have to account for the subject start's location
 match_part_start = parent_match_start + hsp.sbjct_start + start - 1
 # We used to use hsp.align_length here, but that includes
 top_feature.sub_features.append(
 SeqFeature(
 SimpleLocation(match_part_start, match_part_end, strand=1),
 type="match_part",
-qualifiers=copy.deepcopy(part_qualifiers))
+qualifiers=copy.deepcopy(part_qualifiers),
+)
 )
 rec.features.append(top_feature)
 rec.annotations = {}
 yield rec
 which greatly simplifies the process of identifying the correct location
 for a match_part
 """
 prev = 0
-fq = ''
+fq = ""
-fm = ''
+fm = ""
-fs = ''
+fs = ""
-for position in re.finditer('-', query):
+for position in re.finditer("-", query):
-fq += query[prev:position.start()]
+fq += query[prev : position.start()]
-fm += match[prev:position.start()]
+fm += match[prev : position.start()]
-fs += subject[prev:position.start()]
+fs += subject[prev : position.start()]
 prev = position.start() + 1
 fq += query[prev:]
 fm += match[prev:]
 fs += subject[prev:]
 region_end = -1
 mismatch_count = 0
 for i, (q, m, s) in enumerate(zip(query, match, subject)):
 # If we have a match
-if m != ' ' or m == '+':
+if m != " " or m == "+":
 if region_start == -1:
 region_start = i
 # It's a new region, we need to reset or it's pre-seeded with
 # spaces
 region_q = []
 if mismatch_count >= ignore_under and region_start != -1 and region_end != -1:
 region_q = region_q[0:-ignore_under]
 region_m = region_m[0:-ignore_under]
 region_s = region_s[0:-ignore_under]
-yield region_start, region_end + 1, \
+yield region_start, region_end + 1, cigar_from_string(
-cigar_from_string(region_q, region_m, region_s, strict_m=True)
+region_q, region_m, region_s, strict_m=True
+)
 region_q = []
 region_m = []
 region_s = []
 region_start = -1
 region_end = -1
 mismatch_count = 0
-yield region_start, region_end + 1, \
+yield region_start, region_end + 1, cigar_from_string(
-cigar_from_string(region_q, region_m, region_s, strict_m=True)
+region_q, region_m, region_s, strict_m=True
+)
 def _qms_to_matches(query, match, subject, strict_m=True):
 matchline = []
 for (q, m, s) in zip(query, match, subject):
-ret = ''
+ret = ""
-if m != ' ' or m == '+':
+if m != " " or m == "+":
-ret = '='
+ret = "="
-elif m == ' ':
+elif m == " ":
-if q == '-':
+if q == "-":
-ret = 'D'
+ret = "D"
-elif s == '-':
+elif s == "-":
-ret = 'I'
+ret = "I"
 else:
-ret = 'X'
+ret = "X"
 else:
 log.warn("Bad data: \n\t%s\n\t%s\n\t%s\n" % (query, match, subject))
 if strict_m:
-if ret == '=' or ret == 'X':
+if ret == "=" or ret == "X":
-ret = 'M'
+ret = "M"
 matchline.append(ret)
 return matchline
 else:
 cigar_line.append("%s%s" % (last_char, count))
 count = 1
 last_char = char
 cigar_line.append("%s%s" % (last_char, count))
-return ' '.join(cigar_line)
+return " ".join(cigar_line)
 def cigar_from_string(query, match, subject, strict_m=True):
 matchline = _qms_to_matches(query, match, subject, strict_m=strict_m)
 if len(matchline) > 0:
 return _matchline_to_cigar(matchline)
 else:
 return ""
-if __name__ == '__main__':
+if __name__ == "__main__":
-parser = argparse.ArgumentParser(description='Convert Blast XML to gapped GFF3', epilog='')
+parser = argparse.ArgumentParser(
-parser.add_argument('blastxml', type=argparse.FileType("r"), help='Blast XML Output')
+description="Convert Blast XML to gapped GFF3", epilog=""
-parser.add_argument('--min_gap', type=int, help='Maximum gap size before generating a new match_part', default=3)
+)
-parser.add_argument('--trim', action='store_true', help='Trim blast hits to be only as long as the parent feature')
+parser.add_argument(
-parser.add_argument('--trim_end', action='store_true', help='Cut blast results off at end of gene')
+"blastxml", type=argparse.FileType("r"), help="Blast XML Output"
-parser.add_argument('--include_seq', action='store_true', help='Include sequence')
+)
+parser.add_argument(
+"--min_gap",
+type=int,
+help="Maximum gap size before generating a new match_part",
+default=3,
+)
+parser.add_argument(
+"--trim",
+action="store_true",
+help="Trim blast hits to be only as long as the parent feature",
+)
+parser.add_argument(
+"--trim_end", action="store_true", help="Cut blast results off at end of gene"
+)
+parser.add_argument("--include_seq", action="store_true", help="Include sequence")
 args = parser.parse_args()
 for rec in blastxml2gff3(**vars(args)):
 if len(rec.features):
 GFF.write([rec], sys.stdout)

Mercurial > repos > fubar > jbrowse2dev

comparison jbrowse2/blastxml_to_gapped_gff3.py @ 6:88b9b105c09b draft