Mercurial > repos > davidmurphy > codonlogo
diff corebio/seq_io/phylip_io.py @ 0:c55bdc2fb9fa
Uploaded
author | davidmurphy |
---|---|
date | Thu, 27 Oct 2011 12:09:09 -0400 |
parents | |
children |
line wrap: on
line diff
#!/usr/bin/env python

#  Copyright (c) 2005 David D. Ding <dding@berkeley.edu>
#
#  This software is distributed under the MIT Open Source License.
#  <http://www.opensource.org/licenses/mit-license.html>
#
#  Permission is hereby granted, free of charge, to any person obtaining a
#  copy of this software and associated documentation files (the "Software"),
#  to deal in the Software without restriction, including without limitation
#  the rights to use, copy, modify, merge, publish, distribute, sublicense,
#  and/or sell copies of the Software, and to permit persons to whom the
#  Software is furnished to do so, subject to the following conditions:
#
#  The above copyright notice and this permission notice shall be included
#  in all copies or substantial portions of the Software.
#
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
#  THE SOFTWARE.
#

"""Read sequences in interleaved Phylip format (not sequential) and return a
list of sequences.  Phylip is a very common sequence format used for
phylogeny generation.  It has the following traits:

1) The first line contains the number of species and the number of characters
   in each species' sequence.  Options may follow, and they can be spaced or
   unspaced.  Options are simply letters, such as A and W, after the number
   of characters.
2) The options do not have to contain U in order for a usertree to appear.
3) If there are options, then the option lines appear first, followed by the
   sequences.  In the first block of sequences the first ten columns are
   reserved for the names of options and species; the rest of the line is
   sequence data.
4) In the second and following blocks the names are omitted and only sequence
   data appears.
5) At the end of the file a usertree may appear.  First there is a number
   that indicates how many lines the usertree takes, and then the usertree
   lines follow.

Examples:
  6   50   W
W         0101001111 0101110101 01011
dmras1    GTCGTCGTTG GACCTGGAGG CGTGG
hschras   GTGGTGGTGG GCGCCGGCCG TGTGG
ddrasa    GTTATTGTTG GTGGTGGTGG TGTCG
spras     GTAGTTGTAG GAGATGGTGG TGTTG
scras1    GTAGTTGTCG GTGGAGGTGG CGTTG
scras2    GTCGTCGTTG GTGGTGGTGG TGTTG

0101001111 0101110101 01011
GTCGTCGTTG GACCTGGAGG CGTGG
GTGGTGGTGG GCGCCGGCCG TGTGG
GTTATTGTTG GTGGTGGTGG TGTCG
GTAGTTGTAG GAGATGGTGG TGTTG
GTAGTTGTCG GTGGAGGTGG CGTTG
GTCGTCGTTG GTGGTGGTGG TGTTG

1
((dmras1,ddrasa),((hschras,spras),(scras1,scras2)));


"""

from corebio.seq import *

names = ('phylip',)
extensions = ('phy',)


def iterseq(fin, alphabet=None):
    """Iterate over the sequences in the file.

    The whole file is parsed by read() first; the returned iterator walks
    the resulting sequence list.
    """
    # Default implementation
    return iter(read(fin, alphabet))


def read(fin, alphabet=None):
    """Parse an interleaved Phylip file and return its sequences.

    Arguments:
    fin      -- an open file-like object; only its readline() method is used
                (note: a file object, not a file name).
    alphabet -- passed through unchanged as the second argument of every
                Seq that is constructed.

    Returns:
    A SeqList holding one Seq per species, in file order.  The text found in
    the first ten columns of each first-block line (stripped) is passed as
    the third argument of Seq.

    Raises:
    ValueError -- whenever the input does not look like interleaved Phylip:
                  a missing or malformed header, a missing option line, a
                  misplaced usertree, a wrong number of sequences, or a
                  sequence whose length disagrees with the header.
    """
    sequence = []           # sequence text accumulated per species
    idents = []             # species names, parallel to `sequence`
    num_seq = 0             # number of species announced by the header
    num_total_seq = 0       # announced length of one species' sequence
    tracker = 0             # index of the species the next data line is for
    usertree_tracker = 0    # usertree lines still to be skipped
    options = ''            # option letters from the header, concatenated
    num_options = 0         # option lines still expected in this block
                            # (every option letter except U has one)

    line = fin.readline()
    while line:
        # Whitespace-split view of the line; not every branch needs it.
        s_line = line.split()

        if not s_line:
            # Blank line: nothing to do.
            pass

        elif (len(s_line) == 1 and s_line[0].isdigit()
              and sequence
              and len(sequence) == num_seq
              and len(sequence[0]) == num_total_seq):
            # A lone integer after all sequence data has been read announces
            # the number of usertree lines.  The `sequence` guard keeps a
            # lone integer at the very top of the input (before any header)
            # from raising IndexError; such a line now falls through to the
            # header branch and is reported as ValueError('parse error').
            usertree_tracker = int(s_line[0])

        elif num_options > 0:
            if len(sequence) < num_seq:
                # First block: an option line must begin with one of the
                # option letters given in the header.
                if s_line[0][0] in options:
                    num_options -= 1
                else:
                    raise ValueError('Not an option, but it should be one')
            else:
                # Later blocks carry no names, so the line is simply counted
                # as an option line and skipped.
                num_options -= 1

        elif usertree_tracker > 0:
            # Basically skip the usertree, but only accept it once the last
            # sequence is complete.
            if len(sequence[num_seq - 1]) == num_total_seq:
                usertree_tracker -= 1
            else:
                raise ValueError('User Tree in Wrong Place')

        elif s_line[0].isdigit():
            if len(s_line) >= 2 and len(sequence) == 0:
                # Header line: species count, sequence length, then options.
                num_seq = int(s_line[0])
                num_total_seq = int(s_line[1])
                if len(s_line) > 2:
                    options = ''.join(s_line[2:])
                    num_options = len(options) - options.count('U')
            else:
                raise ValueError('parse error')

        # Once the option lines are consumed, what remains is sequence data.
        elif num_options == 0:
            if num_seq == 0:
                raise ValueError("Empty File, or possibly wrong file")
            elif tracker < num_seq:
                if num_seq > len(sequence):
                    # First block: columns 0-9 hold the species name, the
                    # remainder is sequence data with embedded spaces.
                    sequence.append(''.join(line[10:].split()))
                    idents.append(line[0:10].strip())
                    tracker += 1
                else:
                    # Later blocks: the whole line is sequence data.
                    sequence[tracker] += ''.join(s_line)
                    tracker += 1

                if tracker == num_seq:
                    # Block finished: start again at the first species and
                    # expect a fresh set of option lines in the next block.
                    tracker = 0
                    num_options = len(options) - options.count('U')

        line = fin.readline()

    if len(sequence) != len(idents) or len(sequence) != num_seq:
        raise ValueError("Number of different sequences wrong")

    seqs = []
    for ident, residues in zip(idents, sequence):
        if len(residues) != num_total_seq:
            raise ValueError("extra sequence in list")
        seqs.append(Seq(residues, alphabet, ident))

    return SeqList(seqs)