Mercurial > repos > iuc > blastxml_to_gapped_gff3
comparison blastxml_to_gapped_gff3.py @ 0:bd47051afe98 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/blastxml_to_gapped_gff3 commit 8f38145c94ecb1e23c3ff6f0243213dc49d2287e
| author | iuc |
|---|---|
| date | Tue, 20 Dec 2016 09:21:11 -0500 |
| parents | |
| children | 877cd0833221 |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:bd47051afe98 |
|---|---|
| 1 #!/usr/bin/perl | |
| 2 import argparse | |
| 3 import copy | |
| 4 import logging | |
| 5 import re | |
| 6 import sys | |
| 7 | |
| 8 from BCBio import GFF | |
| 9 | |
| 10 logging.basicConfig(level=logging.INFO) | |
| 11 log = logging.getLogger(name='blastxml2gff3') | |
| 12 | |
| 13 __author__ = "Eric Rasche" | |
| 14 __version__ = "0.4.0" | |
| 15 __maintainer__ = "Eric Rasche" | |
| 16 __email__ = "esr@tamu.edu" | |
| 17 | |
| 18 __doc__ = """ | |
| 19 BlastXML files, when transformed to GFF3, do not normally show gaps in the | |
| 20 blast hits. This tool aims to fill that "gap". | |
| 21 """ | |
| 22 | |
| 23 | |
| 24 def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False): | |
| 25 from Bio.Blast import NCBIXML | |
| 26 from Bio.Seq import Seq | |
| 27 from Bio.SeqRecord import SeqRecord | |
| 28 from Bio.SeqFeature import SeqFeature, FeatureLocation | |
| 29 | |
| 30 blast_records = NCBIXML.parse(blastxml) | |
| 31 records = [] | |
| 32 for record in blast_records: | |
| 33 # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343 | |
| 34 match_type = { # Currently we can only handle BLASTN, BLASTP | |
| 35 'BLASTN': 'nucleotide_match', | |
| 36 'BLASTP': 'protein_match', | |
| 37 }.get(record.application, 'match') | |
| 38 | |
| 39 rec = SeqRecord(Seq("ACTG"), id=record.query) | |
| 40 for hit in record.alignments: | |
| 41 for hsp in hit.hsps: | |
| 42 qualifiers = { | |
| 43 "source": "blast", | |
| 44 "score": hsp.expect, | |
| 45 "accession": hit.accession, | |
| 46 "hit_id": hit.hit_id, | |
| 47 "length": hit.length, | |
| 48 "hit_titles": hit.title.split(' >') | |
| 49 } | |
| 50 desc = hit.title.split(' >')[0] | |
| 51 qualifiers['description'] = desc[desc.index(' '):] | |
| 52 | |
| 53 # This required a fair bit of sketching out/match to figure out | |
| 54 # the first time. | |
| 55 # | |
| 56 # the match_start location must account for queries and | |
| 57 # subjecst that start at locations other than 1 | |
| 58 parent_match_start = hsp.query_start - hsp.sbjct_start | |
| 59 # The end is the start + hit.length because the match itself | |
| 60 # may be longer than the parent feature, so we use the supplied | |
| 61 # subject/hit length to calculate the real ending of the target | |
| 62 # protein. | |
| 63 parent_match_end = hsp.query_start + hit.length + hsp.query.count('-') | |
| 64 | |
| 65 # However, if the user requests that we trim the feature, then | |
| 66 # we need to cut the ``match`` start to 0 to match the parent feature. | |
| 67 # We'll also need to cut the end to match the query's end. It (maybe) | |
| 68 # should be the feature end? But we don't have access to that data, so | |
| 69 # We settle for this. | |
| 70 if trim: | |
| 71 if parent_match_start < 1: | |
| 72 parent_match_start = 0 | |
| 73 | |
| 74 if trim or trim_end: | |
| 75 if parent_match_end > hsp.query_end: | |
| 76 parent_match_end = hsp.query_end + 1 | |
| 77 | |
| 78 # The ``match`` feature will hold one or more ``match_part``s | |
| 79 top_feature = SeqFeature( | |
| 80 FeatureLocation(parent_match_start, parent_match_end), | |
| 81 type=match_type, strand=0, | |
| 82 qualifiers=qualifiers | |
| 83 ) | |
| 84 | |
| 85 # Unlike the parent feature, ``match_part``s have sources. | |
| 86 part_qualifiers = { | |
| 87 "source": "blast", | |
| 88 } | |
| 89 top_feature.sub_features = [] | |
| 90 for start, end, cigar in generate_parts(hsp.query, hsp.match, | |
| 91 hsp.sbjct, | |
| 92 ignore_under=min_gap): | |
| 93 part_qualifiers['Gap'] = cigar | |
| 94 part_qualifiers['ID'] = hit.hit_id | |
| 95 | |
| 96 if trim: | |
| 97 # If trimming, then we start relative to the | |
| 98 # match's start | |
| 99 match_part_start = parent_match_start + start | |
| 100 else: | |
| 101 # Otherwise, we have to account for the subject start's location | |
| 102 match_part_start = parent_match_start + hsp.sbjct_start + start - 1 | |
| 103 | |
| 104 # We used to use hsp.align_length here, but that includes | |
| 105 # gaps in the parent sequence | |
| 106 # | |
| 107 # Furthermore align_length will give calculation errors in weird places | |
| 108 # So we just use (end-start) for simplicity | |
| 109 match_part_end = match_part_start + (end - start) | |
| 110 | |
| 111 top_feature.sub_features.append( | |
| 112 SeqFeature( | |
| 113 FeatureLocation(match_part_start, match_part_end), | |
| 114 type="match_part", strand=0, | |
| 115 qualifiers=copy.deepcopy(part_qualifiers)) | |
| 116 ) | |
| 117 | |
| 118 rec.features.append(top_feature) | |
| 119 rec.annotations = {} | |
| 120 records.append(rec) | |
| 121 return records | |
| 122 | |
| 123 | |
| 124 def __remove_query_gaps(query, match, subject): | |
| 125 """remove positions in all three based on gaps in query | |
| 126 | |
| 127 In order to simplify math and calculations...we remove all of the gaps | |
| 128 based on gap locations in the query sequence:: | |
| 129 | |
| 130 Q:ACTG-ACTGACTG | |
| 131 S:ACTGAAC---CTG | |
| 132 | |
| 133 will become:: | |
| 134 | |
| 135 Q:ACTGACTGACTG | |
| 136 S:ACTGAC---CTG | |
| 137 | |
| 138 which greatly simplifies the process of identifying the correct location | |
| 139 for a match_part | |
| 140 """ | |
| 141 prev = 0 | |
| 142 fq = '' | |
| 143 fm = '' | |
| 144 fs = '' | |
| 145 for position in re.finditer('-', query): | |
| 146 fq += query[prev:position.start()] | |
| 147 fm += match[prev:position.start()] | |
| 148 fs += subject[prev:position.start()] | |
| 149 prev = position.start() + 1 | |
| 150 fq += query[prev:] | |
| 151 fm += match[prev:] | |
| 152 fs += subject[prev:] | |
| 153 | |
| 154 return (fq, fm, fs) | |
| 155 | |
| 156 | |
| 157 def generate_parts(query, match, subject, ignore_under=3): | |
| 158 region_q = [] | |
| 159 region_m = [] | |
| 160 region_s = [] | |
| 161 | |
| 162 (query, match, subject) = __remove_query_gaps(query, match, subject) | |
| 163 | |
| 164 region_start = -1 | |
| 165 region_end = -1 | |
| 166 mismatch_count = 0 | |
| 167 for i, (q, m, s) in enumerate(zip(query, match, subject)): | |
| 168 | |
| 169 # If we have a match | |
| 170 if m != ' ' or m == '+': | |
| 171 if region_start == -1: | |
| 172 region_start = i | |
| 173 # It's a new region, we need to reset or it's pre-seeded with | |
| 174 # spaces | |
| 175 region_q = [] | |
| 176 region_m = [] | |
| 177 region_s = [] | |
| 178 region_end = i | |
| 179 mismatch_count = 0 | |
| 180 else: | |
| 181 mismatch_count += 1 | |
| 182 | |
| 183 region_q.append(q) | |
| 184 region_m.append(m) | |
| 185 region_s.append(s) | |
| 186 | |
| 187 if mismatch_count >= ignore_under and region_start != -1 and region_end != -1: | |
| 188 region_q = region_q[0:-ignore_under] | |
| 189 region_m = region_m[0:-ignore_under] | |
| 190 region_s = region_s[0:-ignore_under] | |
| 191 yield region_start, region_end + 1, \ | |
| 192 cigar_from_string(region_q, region_m, region_s, strict_m=True) | |
| 193 region_q = [] | |
| 194 region_m = [] | |
| 195 region_s = [] | |
| 196 | |
| 197 region_start = -1 | |
| 198 region_end = -1 | |
| 199 mismatch_count = 0 | |
| 200 | |
| 201 yield region_start, region_end + 1, \ | |
| 202 cigar_from_string(region_q, region_m, region_s, strict_m=True) | |
| 203 | |
| 204 | |
| 205 def _qms_to_matches(query, match, subject, strict_m=True): | |
| 206 matchline = [] | |
| 207 | |
| 208 for (q, m, s) in zip(query, match, subject): | |
| 209 ret = '' | |
| 210 | |
| 211 if m != ' ' or m == '+': | |
| 212 ret = '=' | |
| 213 elif m == ' ': | |
| 214 if q == '-': | |
| 215 ret = 'D' | |
| 216 elif s == '-': | |
| 217 ret = 'I' | |
| 218 else: | |
| 219 ret = 'X' | |
| 220 else: | |
| 221 log.warn("Bad data: \n\t%s\n\t%s\n\t%s\n" % (query, match, subject)) | |
| 222 | |
| 223 if strict_m: | |
| 224 if ret == '=' or ret == 'X': | |
| 225 ret = 'M' | |
| 226 | |
| 227 matchline.append(ret) | |
| 228 return matchline | |
| 229 | |
| 230 | |
| 231 def _matchline_to_cigar(matchline): | |
| 232 cigar_line = [] | |
| 233 last_char = matchline[0] | |
| 234 count = 0 | |
| 235 for char in matchline: | |
| 236 if char == last_char: | |
| 237 count += 1 | |
| 238 else: | |
| 239 cigar_line.append("%s%s" % (last_char, count)) | |
| 240 count = 1 | |
| 241 last_char = char | |
| 242 cigar_line.append("%s%s" % (last_char, count)) | |
| 243 return ' '.join(cigar_line) | |
| 244 | |
| 245 | |
| 246 def cigar_from_string(query, match, subject, strict_m=True): | |
| 247 matchline = _qms_to_matches(query, match, subject, strict_m=strict_m) | |
| 248 if len(matchline) > 0: | |
| 249 return _matchline_to_cigar(matchline) | |
| 250 else: | |
| 251 return "" | |
| 252 | |
| 253 | |
| 254 if __name__ == '__main__': | |
| 255 parser = argparse.ArgumentParser(description='Convert Blast XML to gapped GFF3', epilog='') | |
| 256 parser.add_argument('blastxml', type=open, help='Blast XML Output') | |
| 257 parser.add_argument('--min_gap', type=int, help='Maximum gap size before generating a new match_part', default=3) | |
| 258 parser.add_argument('--trim', action='store_true', help='Trim blast hits to be only as long as the parent feature') | |
| 259 parser.add_argument('--trim_end', action='store_true', help='Cut blast results off at end of gene') | |
| 260 args = parser.parse_args() | |
| 261 | |
| 262 result = blastxml2gff3(**vars(args)) | |
| 263 GFF.write(result, sys.stdout) |
