Mercurial > repos > iuc > blastxml_to_gapped_gff3
comparison blastxml_to_gapped_gff3.py @ 2:561e827baa5f draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/blastxml_to_gapped_gff3 commit 908f16ea4eb082227437dc93e06e8cb742f5a257
| author | iuc |
|---|---|
| date | Wed, 15 Nov 2017 15:14:58 -0500 |
| parents | 877cd0833221 |
| children |
comparison
equal
deleted
inserted
replaced
| 1:877cd0833221 | 2:561e827baa5f |
|---|---|
| 4 import logging | 4 import logging |
| 5 import re | 5 import re |
| 6 import sys | 6 import sys |
| 7 | 7 |
| 8 from BCBio import GFF | 8 from BCBio import GFF |
| 9 | |
| 10 logging.basicConfig(level=logging.INFO) | 9 logging.basicConfig(level=logging.INFO) |
| 11 log = logging.getLogger(name='blastxml2gff3') | 10 log = logging.getLogger(name='blastxml2gff3') |
| 12 | |
| 13 __author__ = "Eric Rasche" | |
| 14 __version__ = "0.4.0" | |
| 15 __maintainer__ = "Eric Rasche" | |
| 16 __email__ = "esr@tamu.edu" | |
| 17 | 11 |
| 18 __doc__ = """ | 12 __doc__ = """ |
| 19 BlastXML files, when transformed to GFF3, do not normally show gaps in the | 13 BlastXML files, when transformed to GFF3, do not normally show gaps in the |
| 20 blast hits. This tool aims to fill that "gap". | 14 blast hits. This tool aims to fill that "gap". |
| 21 """ | 15 """ |
| 22 | 16 |
| 23 | 17 |
| 24 def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False): | 18 def blastxml2gff3(blastxml, min_gap=3, trim=False, trim_end=False, include_seq=False): |
| 25 from Bio.Blast import NCBIXML | 19 from Bio.Blast import NCBIXML |
| 26 from Bio.Seq import Seq | 20 from Bio.Seq import Seq |
| 27 from Bio.SeqRecord import SeqRecord | 21 from Bio.SeqRecord import SeqRecord |
| 28 from Bio.SeqFeature import SeqFeature, FeatureLocation | 22 from Bio.SeqFeature import SeqFeature, FeatureLocation |
| 29 | 23 |
| 30 blast_records = NCBIXML.parse(blastxml) | 24 blast_records = NCBIXML.parse(blastxml) |
| 31 records = [] | 25 for idx_record, record in enumerate(blast_records): |
| 32 for record in blast_records: | |
| 33 # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343 | 26 # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343 |
| 34 match_type = { # Currently we can only handle BLASTN, BLASTP | 27 match_type = { # Currently we can only handle BLASTN, BLASTP |
| 35 'BLASTN': 'nucleotide_match', | 28 'BLASTN': 'nucleotide_match', |
| 36 'BLASTP': 'protein_match', | 29 'BLASTP': 'protein_match', |
| 37 }.get(record.application, 'match') | 30 }.get(record.application, 'match') |
| 38 | 31 |
| 39 rec = SeqRecord(Seq("ACTG"), id=record.query) | 32 recid = record.query |
| 40 for hit in record.alignments: | 33 if ' ' in recid: |
| 41 for hsp in hit.hsps: | 34 recid = recid[0:recid.index(' ')] |
| 35 | |
| 36 rec = SeqRecord(Seq("ACTG"), id=recid) | |
| 37 for idx_hit, hit in enumerate(record.alignments): | |
| 38 for idx_hsp, hsp in enumerate(hit.hsps): | |
| 42 qualifiers = { | 39 qualifiers = { |
| 40 "ID": 'b2g.%s.%s.%s' % (idx_record, idx_hit, idx_hsp), | |
| 43 "source": "blast", | 41 "source": "blast", |
| 44 "score": hsp.expect, | 42 "score": hsp.expect, |
| 45 "accession": hit.accession, | 43 "accession": hit.accession, |
| 46 "hit_id": hit.hit_id, | 44 "hit_id": hit.hit_id, |
| 47 "length": hit.length, | 45 "length": hit.length, |
| 48 "hit_titles": hit.title.split(' >') | 46 "hit_titles": hit.title.split(' >'), |
| 49 } | 47 } |
| 48 if include_seq: | |
| 49 qualifiers.update({ | |
| 50 'blast_qseq': hsp.query, | |
| 51 'blast_sseq': hsp.sbjct, | |
| 52 'blast_mseq': hsp.match, | |
| 53 }) | |
| 54 | |
| 55 for prop in ('score', 'bits', 'identities', 'positives', | |
| 56 'gaps', 'align_length', 'strand', 'frame', | |
| 57 'query_start', 'query_end', 'sbjct_start', | |
| 58 'sbjct_end'): | |
| 59 qualifiers['blast_' + prop] = getattr(hsp, prop, None) | |
| 60 | |
| 50 desc = hit.title.split(' >')[0] | 61 desc = hit.title.split(' >')[0] |
| 51 qualifiers['description'] = desc[desc.index(' '):] | 62 qualifiers['description'] = desc[desc.index(' '):] |
| 52 | 63 |
| 53 # This required a fair bit of sketching out/match to figure out | 64 # This required a fair bit of sketching out/match to figure out |
| 54 # the first time. | 65 # the first time. |
| 60 # may be longer than the parent feature, so we use the supplied | 71 # may be longer than the parent feature, so we use the supplied |
| 61 # subject/hit length to calculate the real ending of the target | 72 # subject/hit length to calculate the real ending of the target |
| 62 # protein. | 73 # protein. |
| 63 parent_match_end = hsp.query_start + hit.length + hsp.query.count('-') | 74 parent_match_end = hsp.query_start + hit.length + hsp.query.count('-') |
| 64 | 75 |
| 65 # However, if the user requests that we trim the feature, then | 76 # If we trim the left end, we need to trim without losing information. |
| 66 # we need to cut the ``match`` start to 0 to match the parent feature. | 77 used_parent_match_start = parent_match_start |
| 67 # We'll also need to cut the end to match the query's end. It (maybe) | |
| 68 # should be the feature end? But we don't have access to that data, so | |
| 69 # We settle for this. | |
| 70 if trim: | 78 if trim: |
| 71 if parent_match_start < 1: | 79 if parent_match_start < 1: |
| 72 parent_match_start = 0 | 80 used_parent_match_start = 0 |
| 73 | 81 |
| 74 if trim or trim_end: | 82 if trim or trim_end: |
| 75 if parent_match_end > hsp.query_end: | 83 if parent_match_end > hsp.query_end: |
| 76 parent_match_end = hsp.query_end + 1 | 84 parent_match_end = hsp.query_end + 1 |
| 77 | 85 |
| 78 # The ``match`` feature will hold one or more ``match_part``s | 86 # The ``match`` feature will hold one or more ``match_part``s |
| 79 top_feature = SeqFeature( | 87 top_feature = SeqFeature( |
| 80 FeatureLocation(parent_match_start, parent_match_end), | 88 FeatureLocation(used_parent_match_start, parent_match_end), |
| 81 type=match_type, strand=0, | 89 type=match_type, strand=0, |
| 82 qualifiers=qualifiers | 90 qualifiers=qualifiers |
| 83 ) | 91 ) |
| 84 | 92 |
| 85 # Unlike the parent feature, ``match_part``s have sources. | 93 # Unlike the parent feature, ``match_part``s have sources. |
| 86 part_qualifiers = { | 94 part_qualifiers = { |
| 87 "source": "blast", | 95 "source": "blast", |
| 88 } | 96 } |
| 89 top_feature.sub_features = [] | 97 top_feature.sub_features = [] |
| 90 for start, end, cigar in generate_parts(hsp.query, hsp.match, | 98 for idx_part, (start, end, cigar) in \ |
| 91 hsp.sbjct, | 99 enumerate(generate_parts(hsp.query, hsp.match, |
| 92 ignore_under=min_gap): | 100 hsp.sbjct, |
| 101 ignore_under=min_gap)): | |
| 93 part_qualifiers['Gap'] = cigar | 102 part_qualifiers['Gap'] = cigar |
| 94 part_qualifiers['ID'] = hit.hit_id | 103 part_qualifiers['ID'] = qualifiers['ID'] + ('.%s' % idx_part) |
| 95 | 104 |
| 96 if trim: | 105 # Otherwise, we have to account for the subject start's location |
| 97 # If trimming, then we start relative to the | 106 match_part_start = parent_match_start + hsp.sbjct_start + start - 1 |
| 98 # match's start | |
| 99 match_part_start = parent_match_start + start | |
| 100 else: | |
| 101 # Otherwise, we have to account for the subject start's location | |
| 102 match_part_start = parent_match_start + hsp.sbjct_start + start - 1 | |
| 103 | 107 |
| 104 # We used to use hsp.align_length here, but that includes | 108 # We used to use hsp.align_length here, but that includes |
| 105 # gaps in the parent sequence | 109 # gaps in the parent sequence |
| 106 # | 110 # |
| 107 # Furthermore align_length will give calculation errors in weird places | 111 # Furthermore align_length will give calculation errors in weird places |
| 115 qualifiers=copy.deepcopy(part_qualifiers)) | 119 qualifiers=copy.deepcopy(part_qualifiers)) |
| 116 ) | 120 ) |
| 117 | 121 |
| 118 rec.features.append(top_feature) | 122 rec.features.append(top_feature) |
| 119 rec.annotations = {} | 123 rec.annotations = {} |
| 120 records.append(rec) | 124 yield rec |
| 121 return records | |
| 122 | 125 |
| 123 | 126 |
| 124 def __remove_query_gaps(query, match, subject): | 127 def __remove_query_gaps(query, match, subject): |
| 125 """remove positions in all three based on gaps in query | 128 """remove positions in all three based on gaps in query |
| 126 | 129 |
| 251 return "" | 254 return "" |
| 252 | 255 |
| 253 | 256 |
| 254 if __name__ == '__main__': | 257 if __name__ == '__main__': |
| 255 parser = argparse.ArgumentParser(description='Convert Blast XML to gapped GFF3', epilog='') | 258 parser = argparse.ArgumentParser(description='Convert Blast XML to gapped GFF3', epilog='') |
| 256 parser.add_argument('blastxml', type=open, help='Blast XML Output') | 259 parser.add_argument('blastxml', type=argparse.FileType("r"), help='Blast XML Output') |
| 257 parser.add_argument('--min_gap', type=int, help='Maximum gap size before generating a new match_part', default=3) | 260 parser.add_argument('--min_gap', type=int, help='Maximum gap size before generating a new match_part', default=3) |
| 258 parser.add_argument('--trim', action='store_true', help='Trim blast hits to be only as long as the parent feature') | 261 parser.add_argument('--trim', action='store_true', help='Trim blast hits to be only as long as the parent feature') |
| 259 parser.add_argument('--trim_end', action='store_true', help='Cut blast results off at end of gene') | 262 parser.add_argument('--trim_end', action='store_true', help='Cut blast results off at end of gene') |
| 263 parser.add_argument('--include_seq', action='store_true', help='Include sequence') | |
| 260 args = parser.parse_args() | 264 args = parser.parse_args() |
| 261 | 265 |
| 262 result = blastxml2gff3(**vars(args)) | 266 for rec in blastxml2gff3(**vars(args)): |
| 263 GFF.write(result, sys.stdout) | 267 if len(rec.features): |
| 268 GFF.write([rec], sys.stdout) |
