Mercurial > repos > peterjc > blastxml_to_top_descr
diff tools/blastxml_to_top_descr/blastxml_to_top_descr.py @ 13:8dc4ba7eba5d draft default tip
v0.1.2 with Python 3.9 declaration
author | peterjc |
---|---|
date | Sun, 17 Sep 2023 13:01:56 +0000 |
parents | 98f8431dab44 |
children |
line wrap: on
line diff
--- a/tools/blastxml_to_top_descr/blastxml_to_top_descr.py Wed Jul 30 05:36:52 2014 -0400 +++ b/tools/blastxml_to_top_descr/blastxml_to_top_descr.py Sun Sep 17 13:01:56 2023 +0000 @@ -6,25 +6,26 @@ Assumes the hits are pre-sorted, so "best" 3 hits gives first 3 hits. """ +from __future__ import print_function + import os +import re import sys -import re from optparse import OptionParser if "-v" in sys.argv or "--version" in sys.argv: - print "v0.1.0" + print("v0.1.2") sys.exit(0) -if sys.version_info[:2] >= ( 2, 5 ): +if sys.version_info[:2] >= (2, 5): import xml.etree.cElementTree as ElementTree else: - from galaxy import eggs - import pkg_resources; pkg_resources.require( "elementtree" ) + from galaxy import eggs # noqa - ignore flake8 F401 + import pkg_resources + + pkg_resources.require("elementtree") from elementtree import ElementTree -def stop_err( msg ): - sys.stderr.write("%s\n" % msg) - sys.exit(1) usage = """Use as follows: @@ -39,23 +40,54 @@ """ parser = OptionParser(usage=usage) -parser.add_option("-t", "--topN", dest="topN", default=3, - help="Number of descriptions to collect (in order from file)") -parser.add_option("-o", "--output", dest="out_file", default=None, - help="Output filename for tabular file", - metavar="FILE") -parser.add_option("-f", "--format", dest="format", default="blastxml", - help="Input format (blastxml or tabular)") -parser.add_option("-q", "--qseqid", dest="qseqid", default="1", - help="Column for query 'qseqid' (for tabular input; default 1)") -parser.add_option("-s", "--sseqid", dest="sseqid", default="2", - help="Column for subject 'sseqid' (for tabular input; default 2)") -parser.add_option("-d", "--salltitles", dest="salltitles", default="25", - help="Column for descriptions 'salltitles' (for tabular input; default 25)") +parser.add_option( + "-t", + "--topN", + dest="topN", + default=3, + help="Number of descriptions to collect (in order from file)", +) +parser.add_option( + "-o", + "--output", + dest="out_file", + default=None, + help="Output filename for tabular file", + metavar="FILE", +) +parser.add_option( + "-f", + "--format", + dest="format", + default="blastxml", + help="Input format (blastxml or tabular)", +) +parser.add_option( + "-q", + "--qseqid", + dest="qseqid", + default="1", + help="Column for query 'qseqid' (for tabular input; default 1)", +) +parser.add_option( + "-s", + "--sseqid", + dest="sseqid", + default="2", + help="Column for subject 'sseqid' (for tabular input; default 2)", +) +parser.add_option( + "-d", + "--salltitles", + dest="salltitles", + default="25", + help="Column for descriptions 'salltitles' (for tabular input; default 25)", +) (options, args) = parser.parse_args() if len(sys.argv) == 4 and len(args) == 3 and not options.out_file: - stop_err("""The API has changed, replace this: + sys.exit( + """The API has changed, replace this: $ python blastxml_to_top_descr.py input.xml output.tab 3 @@ -64,12 +96,13 @@ $ python blastxml_to_top_descr.py -o output.tab -t 3 input.xml Sorry. -""") +""" + ) if not args: - stop_err("Input filename missing, try -h") + sys.exit("Input filename missing, try -h") if len(args) > 1: - stop_err("Expects a single argument, one input filename") + sys.exit("Expects a single argument, one input filename") in_file = args[0] out_file = options.out_file topN = options.topN @@ -77,12 +110,12 @@ try: topN = int(topN) except ValueError: - stop_err("Number of hits argument should be an integer (at least 1)") + sys.exit("Number of hits argument should be an integer (at least 1)") if topN < 1: - stop_err("Number of hits argument should be an integer (at least 1)") + sys.exit("Number of hits argument should be an integer (at least 1)") if not os.path.isfile(in_file): - stop_err("Missing input file: %r" % in_file) + sys.exit("Missing input file: %r" % in_file) def get_column(value): @@ -92,11 +125,12 @@ value = value[1:] try: col = int(value) - except: - stop_err("Expected an integer column number, not %r" % value) + except ValueError: + sys.exit("Expected an integer column number, not %r" % value) if col < 1: - stop_err("Expect column numbers to be at least one, not %r" % value) - return col - 1 # Python counting! + sys.exit("Expect column numbers to be at least one, not %r" % value) + return col - 1 # Python counting! + def tabular_hits(in_file, qseqid, sseqid, salltitles): """Parse key data from tabular BLAST output. @@ -105,8 +139,8 @@ """ current_query = None current_hits = [] - with open(in_file) as input: - for line in input: + with open(in_file) as input_handle: + for line in input_handle: parts = line.rstrip("\n").split("\t") query = parts[qseqid] descr = "%s %s" % (parts[sseqid], parts[salltitles]) @@ -126,6 +160,7 @@ # Final query yield current_query, current_hits + def blastxml_hits(in_file): """Parse key data from BLAST XML output. @@ -133,32 +168,35 @@ """ try: context = ElementTree.iterparse(in_file, events=("start", "end")) - except: + except Exception: with open(in_file) as handle: header = handle.read(100) - stop_err("Invalid data format in XML file %r which starts: %r" % (in_file, header)) + sys.exit( + "Invalid data format in XML file %r which starts: %r" % (in_file, header) + ) # turn it into an iterator context = iter(context) # get the root element try: - event, root = context.next() - except: + event, root = next(context) + except Exception: with open(in_file) as handle: header = handle.read(100) - stop_err("Unable to get root element from XML file %r which starts: %r" % (in_file, header)) + sys.exit( + "Unable to get root element from XML file %r which starts: %r" + % (in_file, header) + ) - re_default_query_id = re.compile("^Query_\d+$") - assert re_default_query_id.match("Query_101") - assert not re_default_query_id.match("Query_101a") - assert not re_default_query_id.match("MyQuery_101") - re_default_subject_id = re.compile("^Subject_\d+$") - assert re_default_subject_id.match("Subject_1") - assert not re_default_subject_id.match("Subject_") - assert not re_default_subject_id.match("Subject_12a") - assert not re_default_subject_id.match("TheSubject_1") + re_default_query_id = re.compile(r"^Query_\d+$") + assert re_default_query_id.match(r"Query_101") + assert not re_default_query_id.match(r"Query_101a") + assert not re_default_query_id.match(r"MyQuery_101") + re_default_subject_id = re.compile(r"^Subject_\d+$") + assert re_default_subject_id.match(r"Subject_1") + assert not re_default_subject_id.match(r"Subject_") + assert not re_default_subject_id.match(r"Subject_12a") + assert not re_default_subject_id.match(r"TheSubject_1") - count = 0 - pos_count = 0 current_query = None hit_descrs = [] for event, elem in context: @@ -166,7 +204,8 @@ if event == "end" and elem.tag == "Iteration": # Expecting either this, from BLAST 2.2.25+ using FASTA vs FASTA # <Iteration_query-ID>sp|Q9BS26|ERP44_HUMAN</Iteration_query-ID> - # <Iteration_query-def>Endoplasmic reticulum resident protein 44 OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def> + # <Iteration_query-def>Endoplasmic reticulum resident protein 44 + # OS=Homo sapiens GN=ERP44 PE=1 SV=1</Iteration_query-def> # <Iteration_query-len>406</Iteration_query-len> # <Iteration_hits></Iteration_hits> # @@ -177,10 +216,12 @@ # <Iteration_hits>... qseqid = elem.findtext("Iteration_query-ID") if qseqid is None: - stop_err("Missing <Iteration_query-ID> (could be really old BLAST XML data?)") + sys.exit( + "Missing <Iteration_query-ID> (could be really old BLAST XML data?)" + ) if re_default_query_id.match(qseqid): - #Place holder ID, take the first word of the query definition - qseqid = elem.findtext("Iteration_query-def").split(None,1)[0] + # Place holder ID, take the first word of the query definition + qseqid = elem.findtext("Iteration_query-def").split(None, 1)[0] if current_query is None: # First hit current_query = qseqid @@ -203,17 +244,19 @@ # <Hit_accession>P56514</Hit_accession> # or, # <Hit_id>Subject_1</Hit_id> - # <Hit_def>gi|57163783|ref|NP_001009242.1| rhodopsin [Felis catus]</Hit_def> + # <Hit_def>gi|57163783|ref|NP_001009242.1| + # rhodopsin [Felis catus]</Hit_def> # <Hit_accession>Subject_1</Hit_accession> # - #apparently depending on the parse_deflines switch - sseqid = hit.findtext("Hit_id").split(None,1)[0] + # apparently depending on the parse_deflines switch + sseqid = hit.findtext("Hit_id").split(None, 1)[0] hit_def = sseqid + " " + hit.findtext("Hit_def") - if re_default_subject_id.match(sseqid) \ - and sseqid == hit.findtext("Hit_accession"): - #Place holder ID, take the first word of the subject definition + if re_default_subject_id.match(sseqid) and sseqid == hit.findtext( + "Hit_accession" + ): + # Place holder ID, take the first word of the subject definition hit_def = hit.findtext("Hit_def") - sseqid = hit_def.split(None,1)[0] + sseqid = hit_def.split(None, 1)[0] assert hit_def not in hit_descrs hit_descrs.append(hit_def) # prevents ElementTree from growing large datastructure @@ -223,6 +266,7 @@ # Final query yield current_query, hit_descrs + if options.format == "blastxml": hits = blastxml_hits(in_file) elif options.format == "tabular": @@ -231,21 +275,23 @@ salltitles = get_column(options.salltitles) hits = tabular_hits(in_file, qseqid, sseqid, salltitles) else: - stop_err("Unsupported format: %r" % options.format) + sys.exit("Unsupported format: %r" % options.format) def best_hits(descriptions, topN): + """Truncate given descriptions list to at most N entries.""" if len(descriptions) < topN: - return descriptions + [""] * (topN - len(descriptions)) + return descriptions + [""] * (topN - len(descriptions)) else: return descriptions[:topN] + count = 0 if out_file is None: outfile = sys.stdout else: - outfile = open(out_file, 'w') -outfile.write("#Query\t%s\n" % "\t".join("BLAST hit %i" % (i+1) for i in range(topN))) + outfile = open(out_file, "w") +outfile.write("#Query\t%s\n" % "\t".join("BLAST hit %i" % (i + 1) for i in range(topN))) for query, descrs in hits: count += 1 outfile.write("%s\t%s\n" % (query, "\t".join(best_hits(descrs, topN))))