Mercurial > repos > peterjc > blastxml_to_top_descr
diff tools/blastxml_to_top_descr/blastxml_to_top_descr.py @ 11:98f8431dab44 draft
Uploaded v0.1.0, now also handles extended tabular BLAST output.
author | peterjc |
---|---|
date | Fri, 13 Jun 2014 07:07:35 -0400 |
parents | |
children | 8dc4ba7eba5d |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/blastxml_to_top_descr/blastxml_to_top_descr.py Fri Jun 13 07:07:35 2014 -0400 @@ -0,0 +1,255 @@ +#!/usr/bin/env python +"""Convert a BLAST XML file to a top hits description table. + +Takes three command line options, input BLAST XML filename, output tabular +BLAST filename, number of hits to collect the descriptions of. + +Assumes the hits are pre-sorted, so "best" 3 hits gives first 3 hits. +""" +import os +import sys +import re +from optparse import OptionParser + +if "-v" in sys.argv or "--version" in sys.argv: + print "v0.1.0" + sys.exit(0) + +if sys.version_info[:2] >= ( 2, 5 ): + import xml.etree.cElementTree as ElementTree +else: + from galaxy import eggs + import pkg_resources; pkg_resources.require( "elementtree" ) + from elementtree import ElementTree + +def stop_err( msg ): + sys.stderr.write("%s\n" % msg) + sys.exit(1) + +usage = """Use as follows: + +$ blastxml_to_top_descr.py [-t 3] -o example.tabular input.xml + +Or, + +$ blastxml_to_top_descr.py [--topN 3] --output example.tabular input.xml + +This will take the top 3 BLAST descriptions from the input BLAST XML file, +writing them to the specified output file in tabular format. +""" + +parser = OptionParser(usage=usage) +parser.add_option("-t", "--topN", dest="topN", default=3, + help="Number of descriptions to collect (in order from file)") +parser.add_option("-o", "--output", dest="out_file", default=None, + help="Output filename for tabular file", + metavar="FILE") +parser.add_option("-f", "--format", dest="format", default="blastxml", + help="Input format (blastxml or tabular)") +parser.add_option("-q", "--qseqid", dest="qseqid", default="1", + help="Column for query 'qseqid' (for tabular input; default 1)") +parser.add_option("-s", "--sseqid", dest="sseqid", default="2", + help="Column for subject 'sseqid' (for tabular input; default 2)") +parser.add_option("-d", "--salltitles", dest="salltitles", default="25", + help="Column for descriptions 'salltitles' (for tabular input; default 25)") +(options, args) = parser.parse_args() + +if len(sys.argv) == 4 and len(args) == 3 and not options.out_file: + stop_err("""The API has changed, replace this: + +$ python blastxml_to_top_descr.py input.xml output.tab 3 + +with: + +$ python blastxml_to_top_descr.py -o output.tab -t 3 input.xml + +Sorry. +""") + +if not args: + stop_err("Input filename missing, try -h") +if len(args) > 1: + stop_err("Expects a single argument, one input filename") +in_file = args[0] +out_file = options.out_file +topN = options.topN + +try: + topN = int(topN) +except ValueError: + stop_err("Number of hits argument should be an integer (at least 1)") +if topN < 1: + stop_err("Number of hits argument should be an integer (at least 1)") + +if not os.path.isfile(in_file): + stop_err("Missing input file: %r" % in_file) + + +def get_column(value): + """Convert column number on command line to Python index.""" + if value.startswith("c"): + # Ignore c prefix, e.g. "c1" for "1" + value = value[1:] + try: + col = int(value) + except: + stop_err("Expected an integer column number, not %r" % value) + if col < 1: + stop_err("Expect column numbers to be at least one, not %r" % value) + return col - 1 # Python counting! + +def tabular_hits(in_file, qseqid, sseqid, salltitles): + """Parse key data from tabular BLAST output. + + Iterator returning tuples (qseqid, list_of_subject_description) + """ + current_query = None + current_hits = [] + with open(in_file) as input: + for line in input: + parts = line.rstrip("\n").split("\t") + query = parts[qseqid] + descr = "%s %s" % (parts[sseqid], parts[salltitles]) + if current_query is None: + # First hit + current_query = query + current_hits = [descr] + elif current_query == query: + # Another hit + current_hits.append(descr) + else: + # New query + yield current_query, current_hits + current_query = query + current_hits = [descr] + if current_query is not None: + # Final query + yield current_query, current_hits + +def blastxml_hits(in_file): + """Parse key data from BLAST XML output. + + Iterator returning tuples (qseqid, list_of_subject_description) + """ + try: + context = ElementTree.iterparse(in_file, events=("start", "end")) + except: + with open(in_file) as handle: + header = handle.read(100) + stop_err("Invalid data format in XML file %r which starts: %r" % (in_file, header)) + # turn it into an iterator + context = iter(context) + # get the root element + try: + event, root = context.next() + except: + with open(in_file) as handle: + header = handle.read(100) + stop_err("Unable to get root element from XML file %r which starts: %r" % (in_file, header)) + + re_default_query_id = re.compile("^Query_\d+$") + assert re_default_query_id.match("Query_101") + assert not re_default_query_id.match("Query_101a") + assert not re_default_query_id.match("MyQuery_101") + re_default_subject_id = re.compile("^Subject_\d+$") + assert re_default_subject_id.match("Subject_1") + assert not re_default_subject_id.match("Subject_") + assert not re_default_subject_id.match("Subject_12a") + assert not re_default_subject_id.match("TheSubject_1") + + count = 0 + pos_count = 0 + current_query = None + hit_descrs = [] + for event, elem in context: + # for every <Iteration> tag + if event == "end" and elem.tag == "Iteration": + # Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA + # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID> + # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def> + # <Iteration_query-len>406</Iteration_query-len> + # <Iteration_hits></Iteration_hits> + # + # Or, from BLAST 2.2.24+ run online + # <Iteration_query-ID>Query_1</Iteration_query-ID> + # <Iteration_query-def>Sample</Iteration_query-def> + # <Iteration_query-len>516</Iteration_query-len> + # <Iteration_hits>... + qseqid = elem.findtext("Iteration_query-ID") + if qseqid is None: + stop_err("Missing <Iteration_query-ID> (could be really old BLAST XML data?)") + if re_default_query_id.match(qseqid): + #Place holder ID, take the first word of the query definition + qseqid = elem.findtext("Iteration_query-def").split(None,1)[0] + if current_query is None: + # First hit + current_query = qseqid + hit_descrs = [] + elif current_query != qseqid: + # New hit + yield current_query, hit_descrs + current_query = qseqid + hit_descrs = [] + else: + # Continuation of previous query + # i.e. This BLAST XML did not use one <Iteration> per query + # sys.stderr.write("Multiple <Iteration> blocks for %s\n" % qseqid) + pass + # for every <Hit> within <Iteration> + for hit in elem.findall("Iteration_hits/Hit"): + # Expecting either this, + # <Hit_id>gi|3024260|sp|P56514.1|OPSD_BUFBU</Hit_id> + # <Hit_def>RecName: Full=Rhodopsin</Hit_def> + # <Hit_accession>P56514</Hit_accession> + # or, + # <Hit_id>Subject_1</Hit_id> + # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def> + # <Hit_accession>Subject_1</Hit_accession> + # + #apparently depending on the parse_deflines switch + sseqid = hit.findtext("Hit_id").split(None,1)[0] + hit_def = sseqid + " " + hit.findtext("Hit_def") + if re_default_subject_id.match(sseqid) \ + and sseqid == hit.findtext("Hit_accession"): + #Place holder ID, take the first word of the subject definition + hit_def = hit.findtext("Hit_def") + sseqid = hit_def.split(None,1)[0] + assert hit_def not in hit_descrs + hit_descrs.append(hit_def) + # prevents ElementTree from growing large datastructure + root.clear() + elem.clear() + if current_query is not None: + # Final query + yield current_query, hit_descrs + +if options.format == "blastxml": + hits = blastxml_hits(in_file) +elif options.format == "tabular": + qseqid = get_column(options.qseqid) + sseqid = get_column(options.sseqid) + salltitles = get_column(options.salltitles) + hits = tabular_hits(in_file, qseqid, sseqid, salltitles) +else: + stop_err("Unsupported format: %r" % options.format) + + +def best_hits(descriptions, topN): + if len(descriptions) < topN: + return descriptions + [""] * (topN - len(descriptions)) + else: + return descriptions[:topN] + +count = 0 +if out_file is None: + outfile = sys.stdout +else: + outfile = open(out_file, 'w') +outfile.write("#Query\t%s\n" % "\t".join("BLAST hit %i" % (i+1) for i in range(topN))) +for query, descrs in hits: + count += 1 + outfile.write("%s\t%s\n" % (query, "\t".join(best_hits(descrs, topN)))) +if out_file is not None: + outfile.close() +# Queries with no hits are not present in tabular BLAST output +print("%i queries with BLAST results" % count)