diff blastxmlToGff3.py @ 0:804a93e87cc8 draft

planemo upload for repository https://github.com/Yating-L/jbrowse_hub commit f22711ea7a464bdaf4d5aaea07f2eacf967aa66e-dirty
author yating-l
date Wed, 12 Apr 2017 17:41:55 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/blastxmlToGff3.py	Wed Apr 12 17:41:55 2017 -0400
@@ -0,0 +1,159 @@
+#!/usr/bin/env python
+
+
+from Bio.Blast import NCBIXML
+from collections import OrderedDict
+import utils
+
+
+def align2cigar(hsp_query, hsp_reference):
+    """
+        Build CIGAR representation from an hsp_query
+        input:
+            hsp_query
+            hsp_sbjct
+        output:
+            CIGAR string
+    """
+    query = hsp_query
+    ref = hsp_reference
+    # preType, curType:
+    # 'M' represents match,
+    # 'I' represents insert a gap into the reference sequence,
+    # 'D' represents insert a gap into the target (delete from reference)
+    # some ideas of this algin2cigar function are coming from
+    # https://gist.github.com/ozagordi/099bdb796507da8d9426
+    prevType = 'M'
+    curType = 'M'
+    count = 0
+    cigar = []
+    num = len(query)
+    for i in range(num):
+        if query[i] == '-':
+            curType = 'D'
+        elif ref[i] == '-':
+            curType = 'I'
+        else:
+            curType = 'M'
+        if curType == prevType:
+            count += 1
+        else:
+            cigar.append('%s%d' % (prevType, count))
+            prevType = curType
+            count = 1
+    cigar.append('%s%d' % (curType, count))
+    return ' '.join(cigar)
+
+def gff3_writer(blast_records, gff3_file):
+    gff3 = open(gff3_file, 'a')
+    gff3.write("##gff-version 3\n")
+    seq_regions = dict()
+    for blast_record in blast_records:
+        query_name = blast_record.query.split(" ")[0]
+        source = blast_record.application
+        method = blast_record.matrix
+        for alignment in blast_record.alignments:
+            group = {
+            "parent_field" : OrderedDict(),
+            "parent_attribute" : OrderedDict(),
+            "alignments" : []
+            }
+            title = alignment.title.split(" ")
+            contig_name = title[len(title) - 1]
+            length = alignment.length
+            group['parent_field']['seqid'] = contig_name
+            group['parent_field']['source'] = source
+            group['parent_field']['type'] = 'match'
+            group['parent_attribute']['ID'] = contig_name + '_' + query_name
+            group['parent_attribute']['method'] = method
+            group['parent_attribute']['length'] = length
+            if contig_name not in seq_regions:
+                gff3.write("##sequence-region " + contig_name + ' 1 ' + str(length) + '\n')
+                seq_regions[contig_name] = length
+            match_num = 0
+            coords = [length, 0]
+            for hsp in alignment.hsps:
+                hsp_align = {}
+                field = OrderedDict()
+                attribute = OrderedDict()
+                ref = hsp.sbjct
+                query = hsp.query
+                field['seqid'] = contig_name
+                field['source'] = source
+                field['type'] = 'match_part'
+                
+                field['start'] = hsp.sbjct_start
+                if field['start'] < coords[0]:
+                    coords[0] = field['start']
+                ref_length = len(ref.replace('-', ''))
+                # if run tblastn, the actual length of reference should be multiplied by 3
+                if source.lower() == "tblastn":
+                    ref_length *= 3
+                field['end'] = field['start'] + ref_length - 1
+                if field['end'] > coords[1]:
+                    coords[1] = field['end']
+                field['score'] = hsp.score
+                #decide if the alignment in the same strand or reverse strand
+                #reading frame
+                # (+, +), (0, 0), (-, -) => +
+                # (+, -), (-, +) => -
+                if hsp.frame[1] * hsp.frame[0] > 0:
+                    field['strand'] = '+'
+                elif hsp.frame[1] * hsp.frame[0] < 0:
+                    field['strand'] = '-'
+                else:
+                    if hsp.frame[0] + hsp.frame[1] >= 0:
+                        field['strand'] = '+'
+                    else:
+                        field['strand'] = '-'
+                field['phase'] = '.'
+
+                target_start = hsp.query_start
+                target_len = len(query.replace('-', ''))
+                # if run blastx, the actual length of query should be multiplied by 3
+                if source.lower() == "blastx":
+                    target_len *= 3
+                target_end = target_start + target_len -1
+                attribute['ID'] = group['parent_attribute']['ID'] + '_match_' + str(match_num)
+                attribute['Parent'] = group['parent_attribute']['ID']
+                attribute['Target'] = query_name + " " + str(target_start) + " " + str(target_end)
+                attribute['Gap'] = align2cigar(query, ref)
+                #store the query sequence and match string in the file in order to display alignment with BlastAlignment plugin
+                attribute['subject'] = hsp.sbjct
+                attribute['query'] = hsp.query
+                attribute['match'] = hsp.match
+                attribute['gaps'] = attribute['match'].count(' ')
+                similar = attribute['match'].count('+')
+                attribute['identities'] = len(attribute['match']) - similar - attribute['gaps']
+                attribute['positives'] = attribute['identities'] + similar
+                attribute['expect'] = hsp.expect
+                # show reading frame attribute only if the frame is not (0, 0)
+                attribute['frame'] = hsp.frame[1]
+                match_num += 1
+                hsp_align['field'] = field
+                hsp_align['attribute'] = attribute
+                group['alignments'].append(hsp_align)
+            group['parent_field']['start'] = coords[0]
+            group['parent_field']['end'] = coords[1]
+            group['parent_field']['score'] = group['parent_field']['strand'] = group['parent_field']['phase'] = '.'
+            group['parent_attribute']['match_num'] = match_num
+            group['alignments'].sort(key=lambda x: (x['field']['start'], x['field']['end']))
+            utils.write_features(group['parent_field'], group['parent_attribute'], gff3)
+            prev_end = -1
+            for align in group['alignments']:
+                overlap = ''
+                if align['field']['start'] <= prev_end:
+                    overlap += str(align['field']['start']) + ',' + str(prev_end)
+                prev_end = align['field']['end']
+                align['attribute']['overlap'] = overlap
+                utils.write_features(align['field'], align['attribute'], gff3)
+    gff3.close()
+
+def blastxml2gff3(xml_file, gff3_file):
+    result_handle = open(xml_file)
+    blast_records = NCBIXML.parse(result_handle)
+    gff3_writer(blast_records, gff3_file)
+
+if __name__ == "__main__":
+    blastxml2gff3("../dbia3/raw/tblastn_dmel-hits-translation-r6.11.fa_vs_nucleotide_BLAST_database_from_data_3.blastxml", "gff3.txt")
+