Mercurial > repos > davidmurphy > codonlogo
diff corebio/ssearch_io/fasta.py @ 4:4d47ab2b7bcc
Uploaded
author | davidmurphy |
---|---|
date | Fri, 13 Jan 2012 07:18:19 -0500 |
parents | c55bdc2fb9fa |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/corebio/ssearch_io/fasta.py Fri Jan 13 07:18:19 2012 -0500 @@ -0,0 +1,349 @@ +# Copyright (c) 2006 John Gilman +# +# This software is distributed under the MIT Open Source License. +# <http://www.opensource.org/licenses/mit-license.html> +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + + + + +"""Read the output of a fasta sequence similarity search. + +FASTA is a DNA and Protein sequence alignment software package first described +by David J. Lipman and William R. Pearson in 1985. In addition to rapid +heuristic search methods, the FASTA package provides SSEARCH, an implementation +of the optimal Smith Waterman algorithm. + +The module can parse the output from fasta, ssearch and other search programs +in the fasta collection. It will parse both default ('-m 1') and compact +('-m 9 -d 0') output. + +Refs: + ftp.virginia.edu/pub/fasta + http://en.wikipedia.org/wiki/FASTA +""" + + + +from corebio.utils import Reiterate, Token, isblank +from corebio.ssearch_io import Report, Result, Hit, Annotation, Alignment +from math import floor +import re + +__all__ = 'read' + +_rangere = re.compile(r"\((\d+)-\d+:(\d+)-\d+\)") + + +def read(fin) : + """Read and parse a fasta search output file. + + returns: a list of Results + """ + scanner = _scan(fin) + + report = None + result = None + hit = None + #query_seq = None + #target_seq = None + alignment_num = 0 + + + for token in scanner : + #print token + typeof = token.typeof + value = token.data + + if typeof == 'begin_report' : + report = Report() + elif typeof == 'algorithm' : + report.algorithm = value + elif typeof == 'algorithm_version' : + report.algorithm_version = value + elif typeof == 'algorithm_reference' : + report.algorithm_reference = value + elif typeof == 'database_name' : + report.database_name = value + elif typeof == 'database_letters' : + report.database_letters = value + elif typeof == 'database_entries' : + report.database_entries = value + elif typeof == 'end_report' : + # Final sanity checking + break + elif typeof == 'parameter' : + key = value[0] + value = value[1] + report.parameters[key] = value + + elif typeof == 'begin_result' : + result = Result() + report.results.append(result) + + elif typeof == 'query_name' : + result.query.name = value + elif typeof == 'query_description' : + result.query.description = value + elif typeof == 'end_result' : + pass + + elif typeof == 'begin_hit' : + hit = Hit() + elif typeof == 'target_name' : + hit.target.name = value + elif typeof == 'target_description' : + hit.target.description = value + elif typeof == 'target_length' : + hit.target.length = value + elif typeof == 'raw_score' : + hit.raw_score = value + elif typeof == 'bit_score' : + hit.bit_score = value + elif typeof == 'significance' : + hit.significance = value + elif typeof == 'end_hit' : + result.hits.append(hit) + hit = None + + elif typeof == 'begin_alignment' : + alignment = Alignment() + tseq = [] + qseq = [] + elif typeof == 'end_alignment' : + tseq = ''.join(tseq) + qseq = ''.join(qseq) + L = max (len(tseq), len(qseq) ) + tseq = tseq.ljust(L).replace(' ', '.') + qseq = qseq.ljust(L).replace(' ', '.') + alignment.query_seq = tseq + alignment.target_seq = qseq + result.hits[alignment_num].alignments.append(alignment) + alignment_num+=1 + tseq = None + qseq = None + elif typeof == 'target_seq' : + tseq += value + elif typeof == 'query_seq' : + qseq += value + elif typeof == 'alignment_raw_score' : + alignment.raw_score = value + + elif typeof == 'alignment_bit_score' : + alignment.bit_score = value + elif typeof == 'alignment_significance' : + alignment.significance = value + elif typeof == 'alignment_length' : + alignment.length = value + elif typeof == 'alignment_similar' : + alignment.similar = value + elif typeof == 'alignment_identical' : + alignment.identical = value + elif typeof == 'alignment_query_start' : + alignment.query_start = value + elif typeof == 'alignment_target_start' : + alignment.target_start = value + + else: + # Should never get here. + raise RuntimeError("Unrecoverable internal parse error (SPE)") + pass + + + return report +# End method read() + + +def _scan(fin) : + + def next_nonempty(i) : + L = i.next() + while L.strip() == '': L = i.next() + return L + + + lines = Reiterate(iter(fin)) + try : + + yield Token("begin_report", lineno= lines.index()) + + # find header line : "SSEARCH searches a sequence data bank" + L = lines.next() + + if L[0] == '#' : + yield Token("parameter", ("command", L[1:].strip()), lines.index()) + L = lines.next() + + while not L : L= lines.next() + algorithm = L.split()[0] + expected = [ "SSEARCH", "FASTA","TFASTA","FASTX", + "FASTY","TFASTX","TFASTY"] + if algorithm not in expected: + raise ValueError("Parse failed: line %d" % lines.index() ) + yield Token ("algorithm", algorithm, lines.index() ) + + # Next line should be the version + L = lines.next() + if not L.startswith(" version") : + raise ValueError("Parse failed: Cannot find version.") + yield Token( "algorithm_version", L[8:].split()[0].strip(), lines.index()) + + # Algorithm reference + L = lines.next() + if not L.startswith("Please cite:") : + raise ValueError("Parse failed: Expecting citation" + L) + cite = lines.next().strip() + ' ' + lines.next().strip() + yield Token( "algorithm_reference", cite) + + # Find line "searching testset.fa library" + L = lines.next() + while not L.startswith("searching") : L = lines.next() + yield Token("database_name", L[10:-8], lines.index() ) + + # Results + L = lines.next() + while isblank(L) : L = lines.next() + if ">>>" not in L : + raise ValueError("Parse failed on line %d: " % lines.index()) + + while ">>>" in L : + yield Token("begin_result", lineno= lines.index()) + index = L.find('>>>') + (name, description) = L[index+3:].split(' ',1) + yield Token("query_name", name, lines.index()) + yield Token("query_description", description, lines.index()) + + while not L.startswith("The best scores are:") : + L = lines.next() + L = lines.next() + # hits + while not isblank(L) : + lineno = lines.index() + desc = L[0:49] + yield Token("begin_hit", lineno= lineno) + yield Token("target_description", desc, lineno, 0) + yield Token("target_name", desc.split(' ',1)[0], lineno, 0) + yield Token("target_length", int(L[52:56]), lineno, 52) + fields = L[57:].split() + raw, bit, sig = fields[0], fields[1], fields[2] + #print raw, bit, sig + yield Token("raw_score", float(raw), lineno, 57) + yield Token("bit_score", float(bit), lineno) + yield Token("significance", float(sig), lineno) + yield Token("end_hit", lineno=lineno) + L = lines.next() + + # Optimal alignment information + L = next_nonempty(lines) + #print ">>>", L, L.startswith('>>') + while L.startswith('>>'): + if L.startswith('>>>') : break + + yield Token("begin_alignment", lineno=lines.index() ) + + # 1 2 3 4 + #01234567890123456789012345678901234567890123456789 + # s-w opt: 46 Z-score: 70.7 bits: 18.5 E(): 3.6 + L = lines.next() + fields = L.split() + raw, bit, sig = fields[2], fields[6], fields[8] + yield Token("alignment_raw_score", float(raw), lineno) + yield Token("alignment_bit_score", float(bit), lineno) + yield Token("alignment_significance", float(sig), lineno) + + #Smith-Waterman score: 46; 38.095% identity (71.429% similar) in 21 aa overlap (2-22:36-56) + L = lines.next() + lineno = lines.index() + fields = L.split() + assert( len(fields) ==12) + alen = int(fields[8]) + identical = int( floor(0.5+alen* float(fields[3][:-1])/100.)) + similar = int( floor(0.5+alen* float(fields[3][:-1])/100.)) + yield Token("alignment_length", alen, lineno) + yield Token("alignment_similar", similar, lineno) + yield Token("alignment_identical", identical, lineno) + + m = _rangere.match( fields[11]) + assert (m is not None) + yield Token("alignment_query_start", int(m.group(1))-1, lineno) + yield Token("alignment_target_start", int(m.group(2))-1, lineno) + + + count = 1 + while True: + L = lines.next() + count += 1 + + + + if L.startswith('>>'): break + if '>>>' in L: + lines.push(L) + break + if 'residues' in L and 'sequences' in L : + lines.push(L) + break + if not L or L[0].isspace() : continue + + + # there are 2 lines before the first query sequence (but + # we started the count at 1). There is 1 line between query + # and target, 3 lines between target and query, unless the + # query ends before the ends and the target wraps onto another + # Then there are two lines between target and target. + +# Smith-Waterman score: 34; 35.294% identity ... +# +# 30 40 50 60 70 +# d1pfsa EGFLHLEDKPHPLQCQFFVESVIPAGSYQVPYRINVNNG-RPELAFDFKAMKRA +# : . . .:: .: .:: +# d8rxna MKKYVCTVCGYEYDPAEGDPDNGVKPGTSFDDLPADWVCPVCGA +# 10 20 30 40 +# +# d8rxna PKSEFEAA +# 50 + + lineno=lines.index() + if count==4 : + yield Token("query_seq", L[7:].rstrip(), lineno) + else : + yield Token("target_seq", L[7:].rstrip(),lineno) + count = 0 + + yield Token("end_alignment", lineno=lines.index() ) + yield Token("end_result", lineno= lines.index()) + L = next_nonempty(lines) + # End results + + # "13355 residues in 93 query sequences" + # "13355 residues in 93 library sequences" + #print '>>', L + LL = L.split() + yield Token("database_letters",int(LL[0]), lines.index() ) + yield Token("database_entries", int(LL[3]), lines.index() ) + + yield Token("end_report", lineno= lines.index()) + except StopIteration : + raise ValueError("Premature end of file ") + + + + +