Mercurial > repos > jankanis > blast2html
view blast2html.py @ 78:c0804e6443c6 py2.6
comment
author | Jan Kanis <jan.code@jankanis.nl> |
---|---|
date | Thu, 19 Jun 2014 16:29:33 +0200 |
parents | 7d0d46168fd5 |
children | 9fb1a7d67317 |
line wrap: on
line source
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Actually this program works with both python 2 and 3, tested against python 2.6

# Copyright The Hyve B.V. 2014
# License: GPL version 3 or (at your option) any higher version

"""Convert a BLAST XML result into an HTML page using a Jinja2 template."""

from __future__ import unicode_literals, division

import sys
import math
import warnings
import codecs
import argparse
from os import path
from itertools import groupby, repeat

import six
from six.moves import builtins
from lxml import objectify
import jinja2


# Maps the filter name used in templates to the *name* of the implementing
# callable. BlastVisualize._addfilters resolves these names to callables.
_filters = dict(float='float')


def filter(func_or_name):
    """Decorator to register a function as filter in the current jinja environment.

    Used bare (``@filter``) the function is registered under its own name;
    used with a string (``@filter('name')``) it is registered under that name.
    Only the function's ``__name__`` is recorded, and the decorated function
    is returned unchanged.
    """
    if isinstance(func_or_name, six.string_types):
        def inner(func):
            _filters[func_or_name] = func.__name__
            return func
        return inner
    _filters[func_or_name.__name__] = func_or_name.__name__
    return func_or_name


def color_idx(length):
    """Return the index into ``BlastVisualize.colors`` for a match length."""
    if length < 40:
        return 0
    if length < 50:
        return 1
    if length < 80:
        return 2
    if length < 200:
        return 3
    return 4


def _query_span(hsp, query_length):
    """Return the query range of an Hsp as zero-based, half-open (start, stop).

    The bounds are put in ascending order (alignment_pre treats query-from >
    query-to as valid for frame -1) and clamped to [0, query_length], so that
    ``table[start:stop] = repeat(x, stop - start)`` never resizes ``table``.
    """
    first = int(hsp['Hsp_query-from'])
    last = int(hsp['Hsp_query-to'])
    if first > last:
        first, last = last, first
    start = min(max(first - 1, 0), query_length)
    stop = max(min(last, query_length), start)
    return start, stop


@filter
def fmt(val, fmt):
    """Format ``val`` as a float using the format specification ``fmt``."""
    return format(float(val), fmt)


@filter
def numfmt(val):
    """Format numbers in decimal notation, but without excessive trailing 0's.

    Default python float formatting will use scientific notation for some
    values, or append trailing zeros with the 'f' format type, and the number
    of digits differs between python 2 and 3.
    """
    fractional = math.modf(val)[0]
    if fractional == 0:
        return str(int(val))
    # round to 10 to get identical representations in python 2 and 3
    s = format(round(val, 10), '.10f').rstrip('0')
    if s[-1] == '.':
        s += '0'
    return s


@filter
def firsttitle(hit):
    """Return the part of the hit's Hit_def text before the first '>'."""
    return hit.Hit_def.text.split('>')[0]


@filter
def othertitles(hit):
    """Split a hit.Hit_def that contains multiple titles up, splitting out
    the hit ids from the titles.

    Returns a list with one dict (keys ``id``, ``hitid``, ``fullid``,
    ``title``) for every '>'-separated entry after the first one. When an
    entry's id has too few '|'-separated fields, the full id is used instead,
    mirroring the fallback of the ``hitid`` and ``seqid`` filters.
    """
    titles = []
    for entry in hit.Hit_def.text.split('>')[1:]:
        # partition() tolerates an entry that consists of an id only.
        fullid, _, title = entry.partition(' ')
        fields = fullid.split('|', 2)
        hit_id = fields[1] if len(fields) >= 2 else fullid
        seq_id = fields[2] if len(fields) >= 3 else fullid
        titles.append(dict(id=seq_id, hitid=hit_id, fullid=fullid, title=title))
    return titles


@filter
def hitid(hit):
    """Return the second '|'-separated field of Hit_id, or the whole id."""
    full = hit.Hit_id.text
    fields = full.split('|', 2)
    if len(fields) >= 2:
        return fields[1]
    return full


@filter
def seqid(hit):
    """Return the third '|'-separated field of Hit_id, or the whole id."""
    full = hit.Hit_id.text
    fields = full.split('|', 2)
    if len(fields) >= 3:
        return fields[2]
    return full


@filter
def alignment_pre(hsp):
    """Create the preformatted alignment blocks.

    Yields one three-line string (query, midline, subject) per chunk of
    ``linewidth`` alignment columns. Inconsistencies in the input (frames
    other than +1/-1, from/to values that do not match the sequence length)
    are reported with ``warnings.warn`` rather than raised.
    """
    # line break length
    linewidth = 60

    qfrom = int(hsp['Hsp_query-from'])
    qto = int(hsp['Hsp_query-to'])
    qframe = int(hsp['Hsp_query-frame'])
    hfrom = int(hsp['Hsp_hit-from'])
    hto = int(hsp['Hsp_hit-to'])
    hframe = int(hsp['Hsp_hit-frame'])

    qseq = hsp.Hsp_qseq.text
    midline = hsp.Hsp_midline.text
    hseq = hsp.Hsp_hseq.text

    if qframe not in (1, -1):
        warnings.warn("Error in BlastXML input: Hsp node {0} has a Hsp_query-frame of {1}. (should be 1 or -1)".format(nodeid(hsp), qframe))
        qframe = -1 if qframe < 0 else 1
    if hframe not in (1, -1):
        warnings.warn("Error in BlastXML input: Hsp node {0} has a Hsp_hit-frame of {1}. (should be 1 or -1)".format(nodeid(hsp), hframe))
        hframe = -1 if hframe < 0 else 1

    def split(txt):
        return [txt[i:i+linewidth] for i in range(0, len(txt), linewidth)]

    for qs, mid, hs, offset in zip(split(qseq), split(midline), split(hseq),
                                   range(0, len(qseq), linewidth)):
        # All three prefixes are 16 characters wide so that the query,
        # midline and subject columns line up in the preformatted output.
        yield (
            "Query  {0:>7}  {1}  {2}\n".format(qfrom+offset*qframe, qs, qfrom+(offset+len(qs)-1)*qframe) +
            "       {0:7}  {1}\n".format('', mid) +
            "Subject{0:>7}  {1}  {2}".format(hfrom+offset*hframe, hs, hfrom+(offset+len(hs)-1)*hframe)
        )

    if qfrom+(len(qseq)-1)*qframe != qto:
        warnings.warn("Error in BlastXML input: Hsp node {0} qseq length mismatch: from {1} to {2} length {3}".format(
            nodeid(hsp), qfrom, qto, len(qseq)))
    if hfrom+(len(hseq)-1)*hframe != hto:
        warnings.warn("Error in BlastXML input: Hsp node {0} hseq length mismatch: from {1} to {2} length {3}".format(
            nodeid(hsp), hfrom, hto, len(hseq)))


@filter('len')
def blastxml_len(node):
    """Return the alignment length of an Hsp node or the query length of an
    Iteration node.

    Raises Exception for any other node type.
    """
    if node.tag == 'Hsp':
        return int(node['Hsp_align-len'])
    elif node.tag == 'Iteration':
        return int(node['Iteration_query-len'])
    raise Exception("Unknown XML node type: "+node.tag)


@filter
def nodeid(node):
    """Return a '-'-joined id built from the iteration, hit and hsp numbers.

    Starting at ``node`` the function walks up two parents at a time
    (Hsp -> Hit -> Iteration), prepending each level's number.
    Raises ValueError when ``node`` is not an Hsp, Hit or Iteration node.
    """
    parts = []
    if node.tag == 'Hsp':
        parts.insert(0, node.Hsp_num.text)
        node = node.getparent().getparent()
        assert node.tag == 'Hit'
    if node.tag == 'Hit':
        parts.insert(0, node.Hit_num.text)
        node = node.getparent().getparent()
        assert node.tag == 'Iteration'
    if node.tag == 'Iteration':
        parts.insert(0, node['Iteration_iter-num'].text)
        return '-'.join(parts)
    raise ValueError("The nodeid filter can only be applied to Hsp, Hit or Iteration nodes in a BlastXML document")


@filter
def asframe(frame):
    """Return 'Plus' for frame 1 and 'Minus' for frame -1; raise otherwise."""
    if frame == 1:
        return 'Plus'
    elif frame == -1:
        return 'Minus'
    raise Exception("frame should be either +1 or -1")


def genelink(hit, type='genbank', hsp=None):
    """Build the NCBI nucleotide URL for a hit.

    ``hit`` is either an id string or a Hit node (converted with ``hitid``).
    ``type`` is passed as the ``report`` query parameter. When ``hsp`` is
    given, its hit-from/hit-to values are appended as ``from``/``to``.
    """
    if not isinstance(hit, six.string_types):
        hit = hitid(hit)
    link = "http://www.ncbi.nlm.nih.gov/nucleotide/{0}?report={1}&log$=nuclalign".format(hit, type)
    # Identity test: lxml elements overload comparison operators.
    if hsp is not None:
        link += "&from={0}&to={1}".format(hsp['Hsp_hit-from'], hsp['Hsp_hit-to'])
    return link


# javascript escape filter based on Django's, from https://github.com/dsissitka/khan-website/blob/master/templatefilters.py#L112-139
# I've removed the html escapes, since html escaping is already being performed by the template engine.

# The r'\u0027' syntax doesn't work the way we need to in python 2.6 with unicode_literals
_base_js_escapes = (
    ('\\', '\\u005C'),
    ('\'', '\\u0027'),
    ('"', '\\u0022'),
    # ('>', '\\u003E'),
    # ('<', '\\u003C'),
    # ('&', '\\u0026'),
    # ('=', '\\u003D'),
    # ('-', '\\u002D'),
    # (';', '\\u003B'),
    (u'\u2028', '\\u2028'),
    (u'\u2029', '\\u2029')
)

# Escape every ASCII character with a value less than 32. This is
# needed a.o. to prevent html parsers from jumping out of javascript
# parsing mode.
_js_escapes = (_base_js_escapes +
               tuple(('%c' % z, '\\u%04X' % z) for z in range(32)))


@filter
def js_string_escape(value):
    """Javascript string literal escape.

    Note that this only escapes data for embedding within javascript string
    literals, not in general javascript snippets.
    """
    value = six.text_type(value)
    # The backslash pair comes first in _js_escapes, so the backslashes
    # introduced by later replacements are not escaped again.
    for bad, good in _js_escapes:
        value = value.replace(bad, good)
    return value


@filter
def hits(result):
    """Return the Hit nodes of an Iteration, longest hotspot first."""
    # sort hits by longest hotspot first
    return sorted(result.Iteration_hits.findall('Hit'),
                  key=lambda h: max(blastxml_len(hsp) for hsp in h.Hit_hsps.Hsp),
                  reverse=True)


class BlastVisualize(object):
    """Parse a BLAST XML document and render it through a jinja2 template."""

    # Indexed by the value returned from color_idx().
    colors = ('black', 'blue', 'green', 'magenta', 'red')

    # Upper bound used by queryscale to choose the label spacing.
    max_scale_labels = 10

    def __init__(self, input, templatedir, templatename):
        """Parse ``input`` and set up the jinja2 environment.

        ``input`` is handed to ``lxml.objectify.parse``; ``templatedir`` is
        the template search path and ``templatename`` the template to render.
        """
        self.input = input
        self.templatename = templatename

        self.blast = objectify.parse(self.input).getroot()
        self.loader = jinja2.FileSystemLoader(searchpath=templatedir, encoding='utf-8')
        self.environment = jinja2.Environment(loader=self.loader,
                                              lstrip_blocks=True, trim_blocks=True, autoescape=True)

        self._addfilters(self.environment)

    def _addfilters(self, environment):
        """Install every registered filter into ``environment``.

        Each registered name is looked up as an attribute of this instance
        first, then as a module global, then as a builtin.
        """
        for filtername, funcname in _filters.items():
            try:
                environment.filters[filtername] = getattr(self, funcname)
            except AttributeError:
                try:
                    environment.filters[filtername] = globals()[funcname]
                except KeyError:
                    environment.filters[filtername] = getattr(builtins, funcname)

    def render(self, output):
        """Render the template and write the result to ``output``."""
        template = self.environment.get_template(self.templatename)

        params = (('Query ID', self.blast["BlastOutput_query-ID"]),
                  ('Query definition', self.blast["BlastOutput_query-def"]),
                  ('Query length', self.blast["BlastOutput_query-len"]),
                  ('Program', self.blast.BlastOutput_version),
                  ('Database', self.blast.BlastOutput_db),
                  )

        result = template.render(blast=self.blast,
                                 iterations=self.blast.BlastOutput_iterations.Iteration,
                                 colors=self.colors,
                                 genelink=genelink,
                                 params=params)
        # On python 2 the rendered text is encoded to UTF-8 before writing.
        if six.PY2:
            result = result.encode('utf-8')
        output.write(result)

    @filter
    def match_colors(self, result):
        """
        An iterator that yields lists of length-color pairs.

        For every hit one dict is yielded with keys ``colors`` (a list of
        (width-in-percent, color-name) runs covering the whole query),
        ``hit`` and ``defline``.
        """
        query_length = blastxml_len(result)
        percent_multiplier = 100 / query_length

        # Table value for positions that no hsp covers.
        uncovered = 255

        for hit in hits(result):
            # sort hotspots from short to long, so we can overwrite index colors of
            # short matches with those of long ones.
            hotspots = sorted(hit.Hit_hsps.Hsp, key=lambda hsp: blastxml_len(hsp))
            table = bytearray([uncovered]) * query_length
            for hsp in hotspots:
                start, stop = _query_span(hsp, query_length)
                table[start:stop] = repeat(color_idx(blastxml_len(hsp)), stop - start)

            # Run-length encode the table into (width, color) pairs.
            matches = []
            for idx, run in groupby(table):
                count = sum(1 for _ in run)
                color = self.colors[idx] if idx != uncovered else 'transparent'
                matches.append((count * percent_multiplier, color))

            yield dict(colors=matches, hit=hit, defline=firsttitle(hit))

    @filter
    def queryscale(self, result):
        """Yield the scale labels for the query as dicts with ``label`` (a
        position) and ``width`` (in percent of the query length).

        Labels are placed at every multiple of ``skip``; a final shorter
        segment is added when the query length is not a multiple of ``skip``.
        """
        query_length = blastxml_len(result)
        # int(): math.ceil returns a float on python 2, range() needs an int.
        skip = int(math.ceil(query_length / self.max_scale_labels))
        percent_multiplier = 100 / query_length
        for i in range(skip, query_length + 1, skip):
            yield dict(label=i, width=skip * percent_multiplier)
        if query_length % skip != 0:
            yield dict(label=query_length,
                       width=(query_length % skip) * percent_multiplier)

    @filter
    def hit_info(self, result):
        """Yield one summary dict per hit of an Iteration.

        Keys: ``hit``, ``title``, ``maxscore``, ``totalscore``, ``cover``,
        ``e_value``, ``ident`` and ``accession``. The numeric values are
        preformatted strings.
        """
        query_length = blastxml_len(result)

        for hit in hits(result):
            hsps = hit.Hit_hsps.Hsp

            # Mark every query position covered by at least one hsp. The fill
            # length must equal the slice length: Hsp_align-len counts gap
            # columns as well, and using it here would resize the list.
            cover = [False] * query_length
            for hsp in hsps:
                start, stop = _query_span(hsp, query_length)
                cover[start:stop] = repeat(True, stop - start)
            cover_count = cover.count(True)

            bit_scores = [float(hsp['Hsp_bit-score']) for hsp in hsps]
            e_values = [float(hsp['Hsp_evalue']) for hsp in hsps]

            yield dict(hit=hit,
                       title=firsttitle(hit),
                       maxscore="{0:.1f}".format(max(bit_scores)),
                       totalscore="{0:.1f}".format(sum(bit_scores)),
                       cover="{0:.0%}".format(cover_count / query_length),
                       e_value="{0:.4g}".format(min(e_values)),
                       # FIXME: is this the correct formula vv?
                       # float(...) because non-flooring division doesn't work with lxml elements in python 2.6
                       ident="{0:.0%}".format(float(min(float(hsp.Hsp_identity) / blastxml_len(hsp) for hsp in hsps))),
                       accession=hit.Hit_accession)


def main():
    """Parse the command line and render the input BLAST XML to html."""
    default_template = path.join(path.dirname(__file__), 'blast2html.html.jinja')

    parser = argparse.ArgumentParser(description="Convert a BLAST XML result into a nicely readable html page",
                                     usage="{0} [-i] INPUT [-o OUTPUT]".format(sys.argv[0]))
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument('positional_arg', metavar='INPUT', nargs='?', type=argparse.FileType(mode='r'),
                             help='The input Blast XML file, same as -i/--input')
    input_group.add_argument('-i', '--input', type=argparse.FileType(mode='r'),
                             help='The input Blast XML file')
    parser.add_argument('-o', '--output', type=argparse.FileType(mode='w'), default=sys.stdout,
                        help='The output html file')
    # We just want the file name here, so jinja can open the file
    # itself. But it is easier to just use a FileType so argparse can
    # handle the errors. This introduces a small race condition when
    # jinja later tries to re-open the template file, but we don't
    # care too much.
    parser.add_argument('--template', type=argparse.FileType(mode='r'), default=default_template,
                        help='The template file to use. Defaults to blast2html.html.jinja')

    args = parser.parse_args()
    if args.input is None:
        args.input = args.positional_arg
    if args.input is None:
        parser.error('no input specified')

    templatedir, templatename = path.split(args.template.name)
    args.template.close()
    if not templatedir:
        templatedir = '.'

    b = BlastVisualize(args.input, templatedir, templatename)
    b.render(args.output)


if __name__ == '__main__':
    main()