0
|
1 #!/usr/bin/env python
|
|
2
|
|
3 # Copyright (c) 2005 David D. Ding <dding@berkeley.edu>
|
|
4 #
|
|
5 # This software is distributed under the MIT Open Source License.
|
|
6 # <http://www.opensource.org/licenses/mit-license.html>
|
|
7 #
|
|
8 # Permission is hereby granted, free of charge, to any person obtaining a
|
|
9 # copy of this software and associated documentation files (the "Software"),
|
|
10 # to deal in the Software without restriction, including without limitation
|
|
11 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
12 # and/or sell copies of the Software, and to permit persons to whom the
|
|
13 # Software is furnished to do so, subject to the following conditions:
|
|
14 #
|
|
15 # The above copyright notice and this permission notice shall be included
|
|
16 # in all copies or substantial portions of the Software.
|
|
17 #
|
|
18 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
19 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
20 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
21 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
22 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
23 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
24 # THE SOFTWARE.
|
|
25 #
|
|
26
|
|
27 """Read Sequences in interleaved Phylip format (not sequential) and returns a
|
|
28 list of sequences. Phylips is a very common phylogeny generating sequence type
|
|
29 that has the following traits
|
|
30 1) First line contains number of species and number of characters in a species'
|
|
31 sequence. Options can may follow, and they can be spaced or unspaced. Options are
|
|
32 simply letters such as A and W after the number of characters.
|
|
33 2) Options doesn't have to contain U in order for a usertree to appear.
|
|
34 3) If there are options then options appear first, then the sequences. For the
|
|
35 first iteration of sequences the first ten spaces are reserved for names of
|
|
36 options and species, the rest is for sequences.
|
|
37 4) For the second and following iterations the names are removed, only
|
|
38 sequence appears
|
|
39 4) At end of file an usertree may appear. First there is a number that indicts
|
|
40 the number of lines the usertree will take, and then the usertrees follow.
|
|
41
|
|
42 Examples:
|
|
43 6 50 W
|
|
44 W 0101001111 0101110101 01011
|
|
45 dmras1 GTCGTCGTTG GACCTGGAGG CGTGG
|
|
46 hschras GTGGTGGTGG GCGCCGGCCG TGTGG
|
|
47 ddrasa GTTATTGTTG GTGGTGGTGG TGTCG
|
|
48 spras GTAGTTGTAG GAGATGGTGG TGTTG
|
|
49 scras1 GTAGTTGTCG GTGGAGGTGG CGTTG
|
|
50 scras2 GTCGTCGTTG GTGGTGGTGG TGTTG
|
|
51
|
|
52 0101001111 0101110101 01011
|
|
53 GTCGTCGTTG GACCTGGAGG CGTGG
|
|
54 GTGGTGGTGG GCGCCGGCCG TGTGG
|
|
55 GTTATTGTTG GTGGTGGTGG TGTCG
|
|
56 GTAGTTGTAG GAGATGGTGG TGTTG
|
|
57 GTAGTTGTCG GTGGAGGTGG CGTTG
|
|
58 GTCGTCGTTG GTGGTGGTGG TGTTG
|
|
59
|
|
60 1
|
|
61 ((dmras1,ddrasa),((hschras,spras),(scras1,scras2)));
|
|
62
|
|
63
|
|
64 """
|
|
65
|
|
66 from corebio.seq import *
|
|
67
|
|
68 names = ( 'phylip',)
|
|
69 extensions = ('phy',)
|
|
70
|
|
71 def iterseq(fin, alphabet=None):
|
|
72 """Iterate over the sequences in the file."""
|
|
73 # Default implementation
|
|
74 return iter(read(fin, alphabet) )
|
|
75
|
|
76
|
|
77 #Read takes in a phylip file name, read it, processes it, and returns a SeqList
|
|
78 def read(fin, alphabet=None):
|
|
79
|
|
80
|
|
81 sequence=[] #where sequences are stored
|
|
82 idents=[]
|
|
83 num_seq=0
|
|
84 num_total_seq=0 #length of sequence of 1 species
|
|
85 tracker=0 #track what sequence the line is on
|
|
86 usertree_tracker=0 #track usertree lines
|
|
87 options='' #options
|
|
88 num_options=0 #number/lens of options - U
|
|
89
|
|
90 line=fin.readline()
|
|
91 while line:
|
|
92 s_line=line.split() #for ease of use, not used in all scenarios, but easier on the eye
|
|
93
|
|
94 if s_line == []: #see nothing do nothing
|
|
95 pass
|
|
96
|
|
97 elif (s_line[0].isdigit() and len(s_line) == 1 and len(sequence)==num_seq and len(sequence[0])==num_total_seq): #identifies usertree
|
|
98 usertree_tracker = int(s_line[0])
|
|
99 pass
|
|
100
|
|
101 elif num_options > 0:
|
|
102 if len(sequence) < num_seq:
|
|
103 if s_line[0][0] in options:
|
|
104 num_options -= 1
|
|
105 pass
|
|
106 else:
|
|
107 raise ValueError('Not an option, but it should be one')
|
|
108 else:
|
|
109 num_options -= 1
|
|
110 pass
|
|
111
|
|
112 elif usertree_tracker > 0: #baskically skip usertree
|
|
113 if len(sequence[num_seq-1]) == num_total_seq:
|
|
114 usertree_tracker -=1
|
|
115 pass
|
|
116 else:
|
|
117 raise ValueError('User Tree in Wrong Place')
|
|
118
|
|
119 #####problems parse error unexpected
|
|
120 elif s_line[0].isdigit():
|
|
121 if len(s_line) >= 2 and len(sequence) == 0: #identifies first line of file
|
|
122 num_seq = int(s_line[0]) #get number of sequences
|
|
123 num_total_seq = int(s_line[1]) #get length of sequences
|
|
124 if len(s_line) > 2: #takes care of the options
|
|
125 options= (''.join(s_line[2:]))
|
|
126 num_options=len(options) - options.count('U')
|
|
127 else:
|
|
128 raise ValueError('parse error')
|
|
129
|
|
130
|
|
131 #when options end, this take care of the sequence
|
|
132 elif num_options == 0:
|
|
133 if (num_seq==0):
|
|
134 raise ValueError("Empty File, or possibly wrong file")
|
|
135 elif tracker < num_seq:
|
|
136 if num_seq > len(sequence):
|
|
137 sequence.append(''.join(line[10:].split())) #removes species name
|
|
138 idents.append(line[0:10].strip())
|
|
139 tracker +=1
|
|
140
|
|
141 else:
|
|
142 sequence[tracker] += (''.join(s_line))
|
|
143 tracker +=1
|
|
144
|
|
145 if tracker == num_seq:
|
|
146 tracker = 0
|
|
147 num_options = len(options)-options.count('U')
|
|
148
|
|
149 line=fin.readline()
|
|
150
|
|
151 if len(sequence) != len(idents) or len(sequence)!=num_seq:
|
|
152 raise ValueError("Number of different sequences wrong")
|
|
153
|
|
154 seqs = []
|
|
155 for i in range (0, len(idents)):
|
|
156 if len(sequence[i])==num_total_seq:
|
|
157 seqs.append(Seq(sequence[i], alphabet, idents[i]))
|
|
158 else:
|
|
159 raise ValueError("extra sequence in list")
|
|
160
|
|
161 return SeqList(seqs)
|
|
162
|
|
163
|
|
164
|
|
165
|
|
166
|
|
167
|