diff jbrowse2/blastxml_to_gapped_gff3.py @ 6:88b9b105c09b draft

Uploaded
author fubar
date Fri, 05 Jan 2024 01:58:02 +0000
parents cd5d63cd0eb5
children 234cf4490901
line wrap: on
line diff
--- a/jbrowse2/blastxml_to_gapped_gff3.py	Thu Jan 04 02:18:18 2024 +0000
+++ b/jbrowse2/blastxml_to_gapped_gff3.py	Fri Jan 05 01:58:02 2024 +0000
@@ -6,8 +6,9 @@
 import sys
 
 from BCBio import GFF
+
 logging.basicConfig(level=logging.INFO)
-log = logging.getLogger(name='blastxml2gff3')
+log = logging.getLogger(name="blastxml2gff3")
 
 __doc__ = """
 BlastXML files, when transformed to GFF3, do not normally show gaps in the
@@ -25,41 +26,53 @@
     for idx_record, record in enumerate(blast_records):
         # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
         match_type = {  # Currently we can only handle BLASTN, BLASTP
-            'BLASTN': 'nucleotide_match',
-            'BLASTP': 'protein_match',
-        }.get(record.application, 'match')
+            "BLASTN": "nucleotide_match",
+            "BLASTP": "protein_match",
+        }.get(record.application, "match")
 
         recid = record.query
-        if ' ' in recid:
-            recid = recid[0:recid.index(' ')]
+        if " " in recid:
+            recid = recid[0 : recid.index(" ")]
 
         rec = SeqRecord(Seq("ACTG"), id=recid)
         for idx_hit, hit in enumerate(record.alignments):
             for idx_hsp, hsp in enumerate(hit.hsps):
                 qualifiers = {
-                    "ID": 'b2g.%s.%s.%s' % (idx_record, idx_hit, idx_hsp),
+                    "ID": "b2g.%s.%s.%s" % (idx_record, idx_hit, idx_hsp),
                     "source": "blast",
                     "score": hsp.expect,
                     "accession": hit.accession,
                     "hit_id": hit.hit_id,
                     "length": hit.length,
-                    "hit_titles": hit.title.split(' >'),
+                    "hit_titles": hit.title.split(" >"),
                 }
                 if include_seq:
-                    qualifiers.update({
-                        'blast_qseq': hsp.query,
-                        'blast_sseq': hsp.sbjct,
-                        'blast_mseq': hsp.match,
-                    })
+                    qualifiers.update(
+                        {
+                            "blast_qseq": hsp.query,
+                            "blast_sseq": hsp.sbjct,
+                            "blast_mseq": hsp.match,
+                        }
+                    )
 
-                for prop in ('score', 'bits', 'identities', 'positives',
-                             'gaps', 'align_length', 'strand', 'frame',
-                             'query_start', 'query_end', 'sbjct_start',
-                             'sbjct_end'):
-                    qualifiers['blast_' + prop] = getattr(hsp, prop, None)
+                for prop in (
+                    "score",
+                    "bits",
+                    "identities",
+                    "positives",
+                    "gaps",
+                    "align_length",
+                    "strand",
+                    "frame",
+                    "query_start",
+                    "query_end",
+                    "sbjct_start",
+                    "sbjct_end",
+                ):
+                    qualifiers["blast_" + prop] = getattr(hsp, prop, None)
 
-                desc = hit.title.split(' >')[0]
-                qualifiers['description'] = desc[desc.index(' '):]
+                desc = hit.title.split(" >")[0]
+                qualifiers["description"] = desc[desc.index(" ") :]
 
                 # This required a fair bit of sketching out/match to figure out
                 # the first time.
@@ -71,7 +84,7 @@
                 # may be longer than the parent feature, so we use the supplied
                 # subject/hit length to calculate the real ending of the target
                 # protein.
-                parent_match_end = hsp.query_start + hit.length + hsp.query.count('-')
+                parent_match_end = hsp.query_start + hit.length + hsp.query.count("-")
 
                 # If we trim the left end, we need to trim without losing information.
                 used_parent_match_start = parent_match_start
@@ -87,7 +100,7 @@
                 top_feature = SeqFeature(
                     SimpleLocation(used_parent_match_start, parent_match_end, strand=0),
                     type=match_type,
-                    qualifiers=qualifiers
+                    qualifiers=qualifiers,
                 )
 
                 # Unlike the parent feature, ``match_part``s have sources.
@@ -95,12 +108,13 @@
                     "source": "blast",
                 }
                 top_feature.sub_features = []
-                for idx_part, (start, end, cigar) in \
-                        enumerate(generate_parts(hsp.query, hsp.match,
-                                                 hsp.sbjct,
-                                                 ignore_under=min_gap)):
-                    part_qualifiers['Gap'] = cigar
-                    part_qualifiers['ID'] = qualifiers['ID'] + ('.%s' % idx_part)
+                for idx_part, (start, end, cigar) in enumerate(
+                    generate_parts(
+                        hsp.query, hsp.match, hsp.sbjct, ignore_under=min_gap
+                    )
+                ):
+                    part_qualifiers["Gap"] = cigar
+                    part_qualifiers["ID"] = qualifiers["ID"] + (".%s" % idx_part)
 
                     # Otherwise, we have to account for the subject start's location
                     match_part_start = parent_match_start + hsp.sbjct_start + start - 1
@@ -116,7 +130,8 @@
                         SeqFeature(
                             SimpleLocation(match_part_start, match_part_end, strand=1),
                             type="match_part",
-                            qualifiers=copy.deepcopy(part_qualifiers))
+                            qualifiers=copy.deepcopy(part_qualifiers),
+                        )
                     )
 
                 rec.features.append(top_feature)
@@ -142,13 +157,13 @@
     for a match_part
     """
     prev = 0
-    fq = ''
-    fm = ''
-    fs = ''
-    for position in re.finditer('-', query):
-        fq += query[prev:position.start()]
-        fm += match[prev:position.start()]
-        fs += subject[prev:position.start()]
+    fq = ""
+    fm = ""
+    fs = ""
+    for position in re.finditer("-", query):
+        fq += query[prev : position.start()]
+        fm += match[prev : position.start()]
+        fs += subject[prev : position.start()]
         prev = position.start() + 1
     fq += query[prev:]
     fm += match[prev:]
@@ -170,7 +185,7 @@
     for i, (q, m, s) in enumerate(zip(query, match, subject)):
 
         # If we have a match
-        if m != ' ' or m == '+':
+        if m != " " or m == "+":
             if region_start == -1:
                 region_start = i
                 # It's a new region, we need to reset or it's pre-seeded with
@@ -191,8 +206,9 @@
             region_q = region_q[0:-ignore_under]
             region_m = region_m[0:-ignore_under]
             region_s = region_s[0:-ignore_under]
-            yield region_start, region_end + 1, \
-                cigar_from_string(region_q, region_m, region_s, strict_m=True)
+            yield region_start, region_end + 1, cigar_from_string(
+                region_q, region_m, region_s, strict_m=True
+            )
             region_q = []
             region_m = []
             region_s = []
@@ -201,31 +217,32 @@
             region_end = -1
             mismatch_count = 0
 
-    yield region_start, region_end + 1, \
-        cigar_from_string(region_q, region_m, region_s, strict_m=True)
+    yield region_start, region_end + 1, cigar_from_string(
+        region_q, region_m, region_s, strict_m=True
+    )
 
 
 def _qms_to_matches(query, match, subject, strict_m=True):
     matchline = []
 
     for (q, m, s) in zip(query, match, subject):
-        ret = ''
+        ret = ""
 
-        if m != ' ' or m == '+':
-            ret = '='
-        elif m == ' ':
-            if q == '-':
-                ret = 'D'
-            elif s == '-':
-                ret = 'I'
+        if m != " " or m == "+":
+            ret = "="
+        elif m == " ":
+            if q == "-":
+                ret = "D"
+            elif s == "-":
+                ret = "I"
             else:
-                ret = 'X'
+                ret = "X"
         else:
             log.warn("Bad data: \n\t%s\n\t%s\n\t%s\n" % (query, match, subject))
 
         if strict_m:
-            if ret == '=' or ret == 'X':
-                ret = 'M'
+            if ret == "=" or ret == "X":
+                ret = "M"
 
         matchline.append(ret)
     return matchline
@@ -243,7 +260,7 @@
             count = 1
         last_char = char
     cigar_line.append("%s%s" % (last_char, count))
-    return ' '.join(cigar_line)
+    return " ".join(cigar_line)
 
 
 def cigar_from_string(query, match, subject, strict_m=True):
@@ -254,13 +271,28 @@
         return ""
 
 
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Convert Blast XML to gapped GFF3', epilog='')
-    parser.add_argument('blastxml', type=argparse.FileType("r"), help='Blast XML Output')
-    parser.add_argument('--min_gap', type=int, help='Maximum gap size before generating a new match_part', default=3)
-    parser.add_argument('--trim', action='store_true', help='Trim blast hits to be only as long as the parent feature')
-    parser.add_argument('--trim_end', action='store_true', help='Cut blast results off at end of gene')
-    parser.add_argument('--include_seq', action='store_true', help='Include sequence')
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Convert Blast XML to gapped GFF3", epilog=""
+    )
+    parser.add_argument(
+        "blastxml", type=argparse.FileType("r"), help="Blast XML Output"
+    )
+    parser.add_argument(
+        "--min_gap",
+        type=int,
+        help="Maximum gap size before generating a new match_part",
+        default=3,
+    )
+    parser.add_argument(
+        "--trim",
+        action="store_true",
+        help="Trim blast hits to be only as long as the parent feature",
+    )
+    parser.add_argument(
+        "--trim_end", action="store_true", help="Cut blast results off at end of gene"
+    )
+    parser.add_argument("--include_seq", action="store_true", help="Include sequence")
     args = parser.parse_args()
 
     for rec in blastxml2gff3(**vars(args)):