Mercurial > repos > davidmurphy > codonlogo
comparison corebio/seq_io/phylip_io.py @ 0:c55bdc2fb9fa
Uploaded
author | davidmurphy |
---|---|
date | Thu, 27 Oct 2011 12:09:09 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c55bdc2fb9fa |
---|---|
1 #!/usr/bin/env python | |
2 | |
3 # Copyright (c) 2005 David D. Ding <dding@berkeley.edu> | |
4 # | |
5 # This software is distributed under the MIT Open Source License. | |
6 # <http://www.opensource.org/licenses/mit-license.html> | |
7 # | |
8 # Permission is hereby granted, free of charge, to any person obtaining a | |
9 # copy of this software and associated documentation files (the "Software"), | |
10 # to deal in the Software without restriction, including without limitation | |
11 # the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
12 # and/or sell copies of the Software, and to permit persons to whom the | |
13 # Software is furnished to do so, subject to the following conditions: | |
14 # | |
15 # The above copyright notice and this permission notice shall be included | |
16 # in all copies or substantial portions of the Software. | |
17 # | |
18 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
19 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
20 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
21 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
22 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
23 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
24 # THE SOFTWARE. | |
25 # | |
26 | |
27 """Read Sequences in interleaved Phylip format (not sequential) and returns a | |
28 list of sequences. Phylip is a very common phylogeny-generating sequence format | |
29 that has the following traits: | |
30 1) The first line contains the number of species and the number of characters in a species' | |
31 sequence. Options may follow, and they can be spaced or unspaced. Options are | |
32 simply letters such as A and W after the number of characters. | |
33 2) Options don't have to contain U in order for a usertree to appear. | |
34 3) If there are options then options appear first, then the sequences. For the | |
35 first iteration of sequences the first ten spaces are reserved for names of | |
36 options and species, the rest is for sequences. | |
37 4) For the second and following iterations the names are removed; only the | |
38 sequence appears. | |
39 5) At the end of the file a usertree may appear. First there is a number that indicates | |
40 the number of lines the usertree will take, and then the usertree follows. | |
41 | |
42 Examples: | |
43 6 50 W | |
44 W 0101001111 0101110101 01011 | |
45 dmras1 GTCGTCGTTG GACCTGGAGG CGTGG | |
46 hschras GTGGTGGTGG GCGCCGGCCG TGTGG | |
47 ddrasa GTTATTGTTG GTGGTGGTGG TGTCG | |
48 spras GTAGTTGTAG GAGATGGTGG TGTTG | |
49 scras1 GTAGTTGTCG GTGGAGGTGG CGTTG | |
50 scras2 GTCGTCGTTG GTGGTGGTGG TGTTG | |
51 | |
52 0101001111 0101110101 01011 | |
53 GTCGTCGTTG GACCTGGAGG CGTGG | |
54 GTGGTGGTGG GCGCCGGCCG TGTGG | |
55 GTTATTGTTG GTGGTGGTGG TGTCG | |
56 GTAGTTGTAG GAGATGGTGG TGTTG | |
57 GTAGTTGTCG GTGGAGGTGG CGTTG | |
58 GTCGTCGTTG GTGGTGGTGG TGTTG | |
59 | |
60 1 | |
61 ((dmras1,ddrasa),((hschras,spras),(scras1,scras2))); | |
62 | |
63 | |
64 """ | |
65 | |
66 from corebio.seq import * | |
67 | |
# Format name(s) and file extension(s) declared by this module.
# NOTE(review): presumably consumed by the seq_io format registry - confirm.
names = ("phylip",)
extensions = ("phy",)
70 | |
def iterseq(fin, alphabet=None):
    """Return an iterator over the sequences parsed from ``fin``.

    The whole input is parsed by ``read`` first; this simply wraps the
    result in an iterator.
    """
    parsed = read(fin, alphabet)
    return iter(parsed)
75 | |
76 | |
def read(fin, alphabet=None):
    """Parse an interleaved Phylip file and return its sequences as a SeqList.

    Args:
        fin: An open file-like object; it is consumed via ``readline()``
            until an empty string is returned.
        alphabet: Passed through unchanged to every ``Seq`` that is built.

    Returns:
        ``SeqList`` of ``Seq(residues, alphabet, name)`` objects, one per
        species. The name is the stripped first ten columns of the species'
        line in the first block; the residues are the remaining columns of
        that line plus every continuation line, with whitespace removed.

    Raises:
        ValueError: If a numeric line appears where neither a header nor a
            usertree count is acceptable, if an expected option line does
            not start with an option letter, if sequence data precedes the
            header, if the number of sequences differs from the header, or
            if any sequence length differs from the header.
    """
    sequence = []         # accumulated residues, one string per species
    idents = []           # species names, parallel to `sequence`
    num_seq = 0           # number of species, from the header line
    num_total_seq = 0     # expected length of each complete sequence
    tracker = 0           # index of the species the next data line belongs to
    usertree_tracker = 0  # usertree lines still to be skipped
    options = ''          # option letters from the header line
    num_options = 0       # option lines (all options except U) still expected

    line = fin.readline()
    while line:
        s_line = line.split()

        if not s_line:
            # Blank line: nothing to do.
            pass

        elif (s_line[0].isdigit() and len(s_line) == 1
              and sequence  # guard: never index an empty list
              and len(sequence) == num_seq
              and len(sequence[0]) == num_total_seq):
            # A lone integer after all sequence data is the usertree count.
            usertree_tracker = int(s_line[0])

        elif num_options > 0:
            # An option line is expected. Only in the first block (while
            # species are still being collected) must it start with one of
            # the option letters.
            if len(sequence) < num_seq and s_line[0][0] not in options:
                raise ValueError('Not an option, but it should be one')
            num_options -= 1

        elif usertree_tracker > 0:
            # Skip usertree lines; they are only legal once the last
            # sequence is complete.
            if len(sequence[num_seq - 1]) != num_total_seq:
                raise ValueError('User Tree in Wrong Place')
            usertree_tracker -= 1

        elif s_line[0].isdigit():
            # The only other legal numeric line is the header, and only
            # before any sequence has been read.
            if len(s_line) < 2 or sequence:
                raise ValueError('parse error')
            num_seq = int(s_line[0])
            num_total_seq = int(s_line[1])
            if len(s_line) > 2:
                options = ''.join(s_line[2:])
                num_options = len(options) - options.count('U')

        else:
            # num_options is necessarily 0 here: this is a sequence line.
            if num_seq == 0:
                raise ValueError("Empty File, or possibly wrong file")
            if tracker < num_seq:
                if num_seq > len(sequence):
                    # First block: the first ten columns hold the name.
                    sequence.append(''.join(line[10:].split()))
                    idents.append(line[0:10].strip())
                else:
                    # Later blocks: the whole line is sequence data.
                    sequence[tracker] += ''.join(s_line)
                tracker += 1

                if tracker == num_seq:
                    # Block finished: restart at the first species and
                    # expect the option lines again in the next block.
                    tracker = 0
                    num_options = len(options) - options.count('U')

        line = fin.readline()

    if len(sequence) != len(idents) or len(sequence) != num_seq:
        raise ValueError("Number of different sequences wrong")

    seqs = []
    for name, residues in zip(idents, sequence):
        if len(residues) != num_total_seq:
            raise ValueError("extra sequence in list")
        seqs.append(Seq(residues, alphabet, name))

    return SeqList(seqs)
162 | |
163 | |
164 | |
165 | |
166 | |
167 |