comparison blastxml_to_gapped_gff3.py @ 3:7342f467507b draft

Uploaded v0.4 of JBrowse
author iuc
date Thu, 31 Dec 2015 13:58:43 -0500
parents 497c6bb3b717
children ad4b9d7eae6a
comparison
equal deleted inserted replaced
2:b6a0e126dbee 3:7342f467507b
26 from Bio.SeqFeature import SeqFeature, FeatureLocation 26 from Bio.SeqFeature import SeqFeature, FeatureLocation
27 27
28 blast_records = NCBIXML.parse(blastxml) 28 blast_records = NCBIXML.parse(blastxml)
29 records = [] 29 records = []
30 for record in blast_records: 30 for record in blast_records:
31 # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343
32 match_type = { # Currently we can only handle BLASTN, BLASTP
33 'BLASTN': 'nucleotide_match',
34 'BLASTP': 'protein_match',
35 }.get(record.application, 'match')
36
31 rec = SeqRecord(Seq("ACTG"), id=record.query) 37 rec = SeqRecord(Seq("ACTG"), id=record.query)
32 for hit in record.alignments: 38 for hit in record.alignments:
33 for hsp in hit.hsps: 39 for hsp in hit.hsps:
34 qualifiers = { 40 qualifiers = {
35 "source": "blast", 41 "source": "blast",
65 71
66 if trim or trim_end: 72 if trim or trim_end:
67 if parent_match_end > hsp.query_end: 73 if parent_match_end > hsp.query_end:
68 parent_match_end = hsp.query_end + 1 74 parent_match_end = hsp.query_end + 1
69 75
70 # The ``protein_match`` feature will hold one or more ``match_part``s 76 # The ``match`` feature will hold one or more ``match_part``s
71 top_feature = SeqFeature( 77 top_feature = SeqFeature(
72 FeatureLocation(parent_match_start, parent_match_end), 78 FeatureLocation(parent_match_start, parent_match_end),
73 type="protein_match", strand=0, 79 type=match_type, strand=0,
74 qualifiers=qualifiers 80 qualifiers=qualifiers
75 ) 81 )
76 82
77 # Unlike the parent feature, ``match_part``s have sources. 83 # Unlike the parent feature, ``match_part``s have sources.
78 part_qualifiers = { 84 part_qualifiers = {
85 part_qualifiers['Gap'] = cigar 91 part_qualifiers['Gap'] = cigar
86 part_qualifiers['ID'] = hit.hit_id 92 part_qualifiers['ID'] = hit.hit_id
87 93
88 if trim: 94 if trim:
89 # If trimming, then we start relative to the 95 # If trimming, then we start relative to the
90 # protein_match's start 96 # match's start
91 match_part_start = parent_match_start + start 97 match_part_start = parent_match_start + start
92 else: 98 else:
93 # Otherwise, we have to account for the subject start's location 99 # Otherwise, we have to account for the subject start's location
94 match_part_start = parent_match_start + hsp.sbjct_start + start - 1 100 match_part_start = parent_match_start + hsp.sbjct_start + start - 1
95 101
106 type="match_part", strand=0, 112 type="match_part", strand=0,
107 qualifiers=copy.deepcopy(part_qualifiers)) 113 qualifiers=copy.deepcopy(part_qualifiers))
108 ) 114 )
109 115
110 rec.features.append(top_feature) 116 rec.features.append(top_feature)
117 rec.annotations = {}
111 records.append(rec) 118 records.append(rec)
112 return records 119 return records
113 120
114 121
115 def __remove_query_gaps(query, match, subject): 122 def __remove_query_gaps(query, match, subject):
250 parser.add_argument('--trim', action='store_true', help='Trim blast hits to be only as long as the parent feature') 257 parser.add_argument('--trim', action='store_true', help='Trim blast hits to be only as long as the parent feature')
251 parser.add_argument('--trim_end', action='store_true', help='Cut blast results off at end of gene') 258 parser.add_argument('--trim_end', action='store_true', help='Cut blast results off at end of gene')
252 args = parser.parse_args() 259 args = parser.parse_args()
253 260
254 result = blastxml2gff3(**vars(args)) 261 result = blastxml2gff3(**vars(args))
255
256 GFF.write(result, sys.stdout) 262 GFF.write(result, sys.stdout)