diff blastxml_to_tabular_selectable.py @ 0:2bd0cbccb3c6

Uploaded
author galaxyp
date Wed, 08 Oct 2014 19:38:28 -0400
parents
children 5da5dcc5e13a
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/blastxml_to_tabular_selectable.py	Wed Oct 08 19:38:28 2014 -0400
@@ -0,0 +1,329 @@
+#!/usr/bin/env python
+"""Convert a BLAST XML file to 12 column tabular output
+
+Takes three command line options, input BLAST XML filename, output tabular
+BLAST filename, output format (std for standard 12 columns, or ext for the
+extended 25 columns offered in the BLAST+ wrappers).
+
+The 12 columns output are 'qseqid sseqid pident length mismatch gapopen qstart
+qend sstart send evalue bitscore' or 'std' at the BLAST+ command line, which
+mean:
+   
+====== ========= ============================================
+Column NCBI name Description
+------ --------- --------------------------------------------
+     1 qseqid    Query Seq-id (ID of your sequence)
+     2 sseqid    Subject Seq-id (ID of the database hit)
+     3 pident    Percentage of identical matches
+     4 length    Alignment length
+     5 mismatch  Number of mismatches
+     6 gapopen   Number of gap openings
+     7 qstart    Start of alignment in query
+     8 qend      End of alignment in query
+     9 sstart    Start of alignment in subject (database hit)
+    10 send      End of alignment in subject (database hit)
+    11 evalue    Expectation value (E-value)
+    12 bitscore  Bit score
+====== ========= ============================================
+
+The additional columns offered in the Galaxy BLAST+ wrappers are:
+
+====== ============= ===========================================
+Column NCBI name     Description
+------ ------------- -------------------------------------------
+    13 sallseqid     All subject Seq-id(s), separated by a ';'
+    14 score         Raw score
+    15 nident        Number of identical matches
+    16 positive      Number of positive-scoring matches
+    17 gaps          Total number of gaps
+    18 ppos          Percentage of positive-scoring matches
+    19 qframe        Query frame
+    20 sframe        Subject frame
+    21 qseq          Aligned part of query sequence
+    22 sseq          Aligned part of subject sequence
+    23 qlen          Query sequence length
+    24 slen          Subject sequence length
+    25 salltitles    All subject titles, separated by '<>'
+====== ============= ===========================================
+
+Most of these fields are given explicitly in the XML file, others some like
+the percentage identity and the number of gap openings must be calculated.
+
+Be aware that the sequence in the extended tabular output or XML direct from
+BLAST+ may or may not use XXXX masking on regions of low complexity. This
+can throw the off the calculation of percentage identity and gap openings.
+[In fact, both BLAST 2.2.24+ and 2.2.25+ have a subtle bug in this regard,
+with these numbers changing depending on whether or not the low complexity
+filter is used.]
+
+This script attempts to produce identical output to what BLAST+ would have done.
+However, check this with "diff -b ..." since BLAST+ sometimes includes an extra
+space character (probably a bug).
+"""
+import sys
+import re
+import os
+from optparse import OptionParser
+
+if "-v" in sys.argv or "--version" in sys.argv:
+    print "v0.0.12"
+    sys.exit(0)
+
+if sys.version_info[:2] >= ( 2, 5 ):
+    try:
+        from xml.etree import cElementTree as ElementTree
+    except ImportError:
+        from xml.etree import ElementTree as ElementTree
+else:
+    from galaxy import eggs
+    import pkg_resources; pkg_resources.require( "elementtree" )
+    from elementtree import ElementTree
+
+def stop_err( msg ):
+    sys.stderr.write("%s\n" % msg)
+    sys.exit(1)
+
+usage = "usage: %prog [options] blastxml[,...]"
+parser = OptionParser(usage=usage)
+parser.add_option('-o','--output', dest='output', default = None, help='output file path', metavar="FILE")
+parser.add_option("-c", "--columns", dest="columns", default='std', help="[std|ext|colname[,colname,...]] std: 12 column, ext: 25 column, or user specified columns")
+parser.add_option("-a", "--allqueries", action="store_true", dest="allqueries", default=False, help="ouptut row for each query, including those with no hits")
+parser.add_option("-d", "--qdef", action="store_true", dest="qdef", default=False, help="use Iteration_query-def for qseqid")
+parser.add_option('-u', '--unmatched', dest='unmatched', default = None, help='unmatched queries file path',  metavar="FILE")
+parser.add_option('-m', '--maxhits', dest='maxhits', type="int", default = 1, help='maximum number of HITs for output for a Query')
+parser.add_option('-p', '--maxhsps', dest='maxhsps', type="int", default = 1, help='maximum number of HSPs for output for a Hit')
+(options, args) = parser.parse_args()
+
+colnames = 'qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore,sallseqid,score,nident,positive,gaps,ppos,qframe,sframe,qseq,sseq,qlen,slen,salltitles'.split(',')
+
+if len(args) < 1:
+    parser.error("no BLASTXML input files")
+
+if options.columns == "std":
+    out_fmt = 'std'
+    extended = False
+    columns = colnames[0:12]
+elif options.columns == "x22":
+    stop_err("Format argument x22 has been replaced with ext (extended 25 columns)")
+elif options.columns == "ext":
+    out_fmt = 'ext'
+    extended = True
+    columns = colnames[0:25]
+else: 
+    out_fmt = 'cols'
+    extended = True
+    sel_cols = options.columns.split(',')
+    columns = []
+    for col in sel_cols:
+        if col in colnames:
+            columns.append(col)
+
+for in_file in args:
+    if not os.path.isfile(in_file):
+        stop_err("Input BLAST XML file not found: %s" % in_file)
+
+unhitfile = open(options.unmatched,'w') if options.unmatched else None
+
+re_default_query_id = re.compile("^Query_\d+$")
+assert re_default_query_id.match("Query_101")
+assert not re_default_query_id.match("Query_101a")
+assert not re_default_query_id.match("MyQuery_101")
+re_default_subject_id = re.compile("^Subject_\d+$")
+assert re_default_subject_id.match("Subject_1")
+assert not re_default_subject_id.match("Subject_")
+assert not re_default_subject_id.match("Subject_12a")
+assert not re_default_subject_id.match("TheSubject_1")
+    
+header = "#%s\n" % "\t".join(columns)
+outfile = open(options.output, 'w') if options.output else sys.stdout
+outfile.write(header)
+
+def handle_event(event, elem):
+    blast_program = None
+    if event == "end" and elem.tag == "BlastOutput_program":
+        blast_program = elem.text
+    # for every <Iteration> tag
+    if event == "end" and elem.tag == "Iteration":
+        #Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA
+        # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID>
+        # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def>
+        # <Iteration_query-len>406</Iteration_query-len>
+        # <Iteration_hits></Iteration_hits>
+        #
+        #Or, from BLAST 2.2.24+ run online
+        # <Iteration_query-ID>Query_1</Iteration_query-ID>
+        # <Iteration_query-def>Sample</Iteration_query-def>
+        # <Iteration_query-len>516</Iteration_query-len>
+        # <Iteration_hits>...
+        qseqid = elem.findtext("Iteration_query-ID")
+        if options.qdef or re_default_query_id.match(qseqid):
+            #Place holder ID, take the first word of the query definition
+            qseqid = elem.findtext("Iteration_query-def").split(None,1)[0]
+        qlen = int(elem.findtext("Iteration_query-len"))
+        # If no hits in this iteration
+        if not elem.find("Iteration_hits/Hit"):
+            if unhitfile:
+                unhitfile.write("%s\n" % qseqid)
+            if options.allqueries:
+                if columns and len(columns) > 0:
+                    v = []
+                    for name in columns:
+                        if name == 'qseqid':
+                            v.append(qseqid)
+                        elif name == 'qlen':
+                            v.append(str(qlen))
+                        else:
+                            v.append('')
+                    outfile.write("\t".join(v) + "\n")
+        # for every <Hit> within <Iteration>
+        for hitn, hit in enumerate(elem.findall("Iteration_hits/Hit")):
+            if hitn >= options.maxhits:
+                break
+            #Expecting either this,
+            # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id>
+            # <Hit_def>RecName: Full=Rhodopsin</Hit_def>
+            # <Hit_accession>P56514</Hit_accession>
+            #or,
+            # <Hit_id>Subject_1</Hit_id>
+            # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def>
+            # <Hit_accession>Subject_1</Hit_accession>
+            #
+            #apparently depending on the parse_deflines switch
+            #
+            #Or, with a local database not using -parse_seqids can get this,
+            # <Hit_id>gnl|BL_ORD_ID|2</Hit_id>
+            # <Hit_def>chrIII gi|240255695|ref|NC_003074.8| Arabidopsis thaliana chromosome 3, complete sequence</Hit_def>
+            # <Hit_accession>2</Hit_accession>
+            sseqid = hit.findtext("Hit_id").split(None,1)[0]
+            hit_def = sseqid + " " + hit.findtext("Hit_def")
+            if re_default_subject_id.match(sseqid) \
+            and sseqid == hit.findtext("Hit_accession"):
+                #Place holder ID, take the first word of the subject definition
+                hit_def = hit.findtext("Hit_def")
+                sseqid = hit_def.split(None,1)[0]
+            # for every <Hsp> within <Hit>
+            for hspn,hsp in enumerate(hit.findall("Hit_hsps/Hsp")):
+                if hspn >= options.maxhsps:
+                    break
+                nident = hsp.findtext("Hsp_identity")
+                length = hsp.findtext("Hsp_align-len")
+                pident = "%0.2f" % (100*float(nident)/float(length))
+
+                q_seq = hsp.findtext("Hsp_qseq")
+                h_seq = hsp.findtext("Hsp_hseq")
+                m_seq = hsp.findtext("Hsp_midline")
+                assert len(q_seq) == len(h_seq) == len(m_seq) == int(length)
+                gapopen = str(len(q_seq.replace('-', ' ').split())-1  + \
+                              len(h_seq.replace('-', ' ').split())-1)
+
+                mismatch = m_seq.count(' ') + m_seq.count('+') \
+                         - q_seq.count('-') - h_seq.count('-')
+                #TODO - Remove this alternative mismatch calculation and test
+                #once satisifed there are no problems
+                expected_mismatch = len(q_seq) \
+                                  - sum(1 for q,h in zip(q_seq, h_seq) \
+                                        if q == h or q == "-" or h == "-")
+                xx = sum(1 for q,h in zip(q_seq, h_seq) if q=="X" and h=="X")
+                if not (expected_mismatch - q_seq.count("X") <= int(mismatch) <= expected_mismatch + xx):
+                    stop_err("%s vs %s mismatches, expected %i <= %i <= %i" \
+                             % (qseqid, sseqid, expected_mismatch - q_seq.count("X"),
+                                int(mismatch), expected_mismatch))
+
+                #TODO - Remove this alternative identity calculation and test
+                #once satisifed there are no problems
+                expected_identity = sum(1 for q,h in zip(q_seq, h_seq) if q == h)
+                if not (expected_identity - xx <= int(nident) <= expected_identity + q_seq.count("X")):
+                    stop_err("%s vs %s identities, expected %i <= %i <= %i" \
+                             % (qseqid, sseqid, expected_identity, int(nident),
+                                expected_identity + q_seq.count("X")))
+                
+
+                evalue = hsp.findtext("Hsp_evalue")
+                if evalue == "0":
+                    evalue = "0.0"
+                else:
+                    evalue = "%0.0e" % float(evalue)
+                
+                bitscore = float(hsp.findtext("Hsp_bit-score"))
+                if bitscore < 100:
+                    #Seems to show one decimal place for lower scores
+                    bitscore = "%0.1f" % bitscore
+                else:
+                    #Note BLAST does not round to nearest int, it truncates
+                    bitscore = "%i" % bitscore
+
+                values = [qseqid,
+                          sseqid,
+                          pident,
+                          length, #hsp.findtext("Hsp_align-len")
+                          str(mismatch),
+                          gapopen,
+                          hsp.findtext("Hsp_query-from"), #qstart,
+                          hsp.findtext("Hsp_query-to"), #qend,
+                          hsp.findtext("Hsp_hit-from"), #sstart,
+                          hsp.findtext("Hsp_hit-to"), #send,
+                          evalue, #hsp.findtext("Hsp_evalue") in scientific notation
+                          bitscore, #hsp.findtext("Hsp_bit-score") rounded
+                          ]
+
+                if extended:
+                    sallseqid = ";".join(name.split(None,1)[0] for name in hit_def.split(">"))
+                    salltitles = "<>".join(name.split(None,1)[1] for name in hit_def.split(" >"))
+                    #print hit_def, "-->", sallseqid
+                    positive = hsp.findtext("Hsp_positive")
+                    ppos = "%0.2f" % (100*float(positive)/float(length))
+                    qframe = hsp.findtext("Hsp_query-frame")
+                    sframe = hsp.findtext("Hsp_hit-frame")
+                    if blast_program == "blastp":
+                        #Probably a bug in BLASTP that they use 0 or 1 depending on format
+                        if qframe == "0": qframe = "1"
+                        if sframe == "0": sframe = "1"
+                    slen = int(hit.findtext("Hit_len"))
+                    values.extend([sallseqid,
+                                   hsp.findtext("Hsp_score"), #score,
+                                   nident,
+                                   positive,
+                                   hsp.findtext("Hsp_gaps"), #gaps,
+                                   ppos,
+                                   qframe,
+                                   sframe,
+                                   #NOTE - for blastp, XML shows original seq, tabular uses XXX masking
+                                   q_seq,
+                                   h_seq,
+                                   str(qlen),
+                                   str(slen),
+                                   salltitles,
+                                   ])
+                if out_fmt == 'cols':
+                    if columns and len(columns) > 0:
+                        v = []
+                        for name in columns:
+                            v.append(values[colnames.index(name)])
+                        values = v
+                #print "\t".join(values) 
+                outfile.write("\t".join(values) + "\n")
+        # prevents ElementTree from growing large datastructure
+        root.clear()
+        elem.clear()
+
+
+for in_file in args:
+    # get an iterable
+    try: 
+        context = ElementTree.iterparse(in_file, events=("start", "end"))
+    except:
+        stop_err("Invalid data format.")
+    # turn it into an iterator
+    context = iter(context)
+    # get the root element
+    try:
+        event, root = context.next()
+    except:
+        stop_err( "Invalid data format." )
+    for event, elem in context:
+        handle_event(event, elem)
+
+if unhitfile:
+    unhitfile.close()
+if options.output:
+    outfile.close()