Mercurial > repos > davidmurphy > codonlogo
comparison corebio/seq_io/_nexus/Trees.py @ 0:c55bdc2fb9fa
Uploaded
author | davidmurphy |
---|---|
date | Thu, 27 Oct 2011 12:09:09 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c55bdc2fb9fa |
---|---|
1 | |
2 # | |
3 # Trees.py | |
4 # | |
5 # Copyright 2005 by Frank Kauff & Cymon J. Cox. All rights reserved. | |
6 # This code is part of the Biopython distribution and governed by its | |
7 # license. Please see the LICENSE file that should have been included | |
8 # as part of this package. | |
9 # | |
10 # Tree class handles phylogenetic trees. Provides a set of methods to read and write newick-format tree | |
11 # descriptions, get information about trees (monphyly of taxon sets, congruence between trees, common ancestors,...) | |
12 # and to manipulate trees (reroot trees, split terminal nodes). | |
13 # | |
14 # Bug reports welcome: fkauff@duke.edu | |
15 # | |
16 | |
17 import sys, random, sets | |
18 import Nodes | |
19 | |
20 PRECISION_BRANCHLENGTH=6 | |
21 PRECISION_SUPPORT=6 | |
22 | |
23 class TreeError(Exception): pass | |
24 | |
25 class NodeData: | |
26 """Stores tree-relevant data associated with nodes (e.g. branches or otus).""" | |
27 def __init__(self,taxon=None,branchlength=0.0,support=None): | |
28 self.taxon=taxon | |
29 self.branchlength=branchlength | |
30 self.support=support | |
31 | |
32 class Tree(Nodes.Chain): | |
33 """Represents a tree using a chain of nodes with on predecessor (=ancestor) | |
34 and multiple successors (=subclades). | |
35 """ | |
36 # A newick tree is parsed into nested list and then converted to a node list in two stages | |
37 # mostly due to historical reasons. This could be done in one swoop). Note: parentheses ( ) and | |
38 # colon : are not allowed in taxon names. This is against NEXUS standard, but makes life much | |
39 # easier when parsing trees. | |
40 | |
41 ## NOTE: Tree should store its data class in something like self.dataclass=data, | |
42 ## so that nodes that are generated have easy access to the data class | |
43 ## Some routines use automatically NodeData, this needs to be more concise | |
44 | |
45 def __init__(self,tree=None,weight=1.0,rooted=False,name='',data=NodeData,values_are_support=False,max_support=1.0): | |
46 """Ntree(self,tree).""" | |
47 Nodes.Chain.__init__(self) | |
48 self.dataclass=data | |
49 self.__values_are_support=values_are_support | |
50 self.max_support=max_support | |
51 self.weight=weight | |
52 self.rooted=rooted | |
53 self.name=name | |
54 root=Nodes.Node(data()) | |
55 self.add(root) | |
56 self.root=root.id | |
57 if tree: # use the tree we have | |
58 # if Tree is called from outside Nexus parser, we need to get rid of linebreaks, etc | |
59 tree=tree.strip().replace('\n','').replace('\r','') | |
60 # there's discrepancy whether newick allows semicolons et the end | |
61 tree=tree.rstrip(';') | |
62 self._add_subtree(parent_id=root.id,tree=self._parse(tree)[0]) | |
63 | |
64 def _parse(self,tree): | |
65 """Parses (a,b,c...)[[[xx]:]yy] into subcomponents and travels down recursively.""" | |
66 | |
67 if tree.count('(')!=tree.count(')'): | |
68 raise TreeError, 'Parentheses do not match in (sub)tree: '+tree | |
69 if tree.count('(')==0: # a leaf | |
70 colon=tree.rfind(':') | |
71 if colon>-1: | |
72 return [tree[:colon],self._get_values(tree[colon+1:])] | |
73 else: | |
74 return [tree,[None]] | |
75 else: | |
76 closing=tree.rfind(')') | |
77 val=self._get_values(tree[closing+1:]) | |
78 if not val: | |
79 val=[None] | |
80 subtrees=[] | |
81 plevel=0 | |
82 prev=1 | |
83 for p in range(1,closing): | |
84 if tree[p]=='(': | |
85 plevel+=1 | |
86 elif tree[p]==')': | |
87 plevel-=1 | |
88 elif tree[p]==',' and plevel==0: | |
89 subtrees.append(tree[prev:p]) | |
90 prev=p+1 | |
91 subtrees.append(tree[prev:closing]) | |
92 subclades=[self._parse(subtree) for subtree in subtrees] | |
93 return [subclades,val] | |
94 | |
95 def _add_subtree(self,parent_id=None,tree=None): | |
96 """Adds leaf or tree (in newick format) to a parent_id. (self,parent_id,tree).""" | |
97 | |
98 if parent_id is None: | |
99 raise TreeError('Need node_id to connect to.') | |
100 for st in tree: | |
101 if type(st[0])==list: # it's a subtree | |
102 nd=self.dataclass() | |
103 if len(st[1])>=2: # if there's two values, support comes first. Is that always so? | |
104 nd.support=st[1][0] | |
105 if st[1][1] is not None: | |
106 nd.branchlength=st[1][1] | |
107 elif len(st[1])==1: # otherwise it could be real branchlengths or support as branchlengths | |
108 if not self.__values_are_support: # default | |
109 if st[1][0] is not None: | |
110 nd.branchlength=st[1][0] | |
111 else: | |
112 nd.support=st[1][0] | |
113 sn=Nodes.Node(nd) | |
114 self.add(sn,parent_id) | |
115 self._add_subtree(sn.id,st[0]) | |
116 else: # it's a leaf | |
117 nd=self.dataclass() | |
118 nd.taxon=st[0] | |
119 if len(st)>1: | |
120 if len(st[1])>=2: # if there's two values, support comes first. Is that always so? | |
121 nd.support=st[1][0] | |
122 if st[1][1] is not None: | |
123 nd.branchlength=st[1][1] | |
124 elif len(st[1])==1: # otherwise it could be real branchlengths or support as branchlengths | |
125 if not self.__values_are_support: # default | |
126 if st[1][0] is not None: | |
127 nd.branchlength=st[1][0] | |
128 else: | |
129 nd.support=st[1][0] | |
130 leaf=Nodes.Node(nd) | |
131 self.add(leaf,parent_id) | |
132 | |
133 def _get_values(self, text): | |
134 """Extracts values (support/branchlength) from xx[:yyy], xx.""" | |
135 | |
136 if text=='': | |
137 return None | |
138 return [float(t) for t in text.split(':') if t.strip()] | |
139 | |
140 def _walk(self,node=None): | |
141 """Return all node_ids downwards from a node.""" | |
142 | |
143 if node is None: | |
144 node=self.root | |
145 for n in self.node(node).succ: | |
146 yield n | |
147 for sn in self._walk(n): | |
148 yield sn | |
149 | |
150 def node(self,node_id): | |
151 """Return the instance of node_id. | |
152 | |
153 node = node(self,node_id) | |
154 """ | |
155 if node_id not in self.chain: | |
156 raise TreeError('Unknown node_id: %d' % node_id) | |
157 return self.chain[node_id] | |
158 | |
159 def split(self,parent_id=None,n=2,branchlength=1.0): | |
160 """Speciation: generates n (default two) descendants of a node. | |
161 | |
162 [new ids] = split(self,parent_id=None,n=2,branchlength=1.0): | |
163 """ | |
164 if parent_id is None: | |
165 raise TreeError('Missing node_id.') | |
166 ids=[] | |
167 parent_data=self.chain[parent_id].data | |
168 for i in range(n): | |
169 node=Nodes.Node() | |
170 if parent_data: | |
171 node.data=self.dataclass() | |
172 # each node has taxon and branchlength attribute | |
173 if parent_data.taxon: | |
174 node.data.taxon=parent_data.taxon+str(i) | |
175 node.data.branchlength=branchlength | |
176 ids.append(self.add(node,parent_id)) | |
177 return ids | |
178 | |
179 def search_taxon(self,taxon): | |
180 """Returns the first matching taxon in self.data.taxon. Not restricted to terminal nodes. | |
181 | |
182 node_id = search_taxon(self,taxon) | |
183 """ | |
184 for id,node in self.chain.items(): | |
185 if node.data.taxon==taxon: | |
186 return id | |
187 return None | |
188 | |
189 def prune(self,taxon): | |
190 """Prunes a terminal taxon from the tree. | |
191 | |
192 id_of_previous_node = prune(self,taxon) | |
193 If taxon is from a bifurcation, the connectiong node will be collapsed | |
194 and its branchlength added to remaining terminal node. This might be no | |
195 longer a meaningful value' | |
196 """ | |
197 | |
198 id=self.search_taxon(taxon) | |
199 if id is None: | |
200 raise TreeError('Taxon not found: %s' % taxon) | |
201 elif id not in self.get_terminals(): | |
202 raise TreeError('Not a terminal taxon: %s' % taxon) | |
203 else: | |
204 prev=self.unlink(id) | |
205 self.kill(id) | |
206 if not prev==self.root and len(self.node(prev).succ)==1: | |
207 succ=self.node(prev).succ[0] | |
208 new_bl=self.node(prev).data.branchlength+self.node(succ).data.branchlength | |
209 self.collapse(prev) | |
210 self.node(succ).data.branchlength=new_bl | |
211 return prev | |
212 | |
213 def get_taxa(self,node_id=None): | |
214 """Return a list of all otus downwards from a node (self, node_id). | |
215 | |
216 nodes = get_taxa(self,node_id=None) | |
217 """ | |
218 | |
219 if node_id is None: | |
220 node_id=self.root | |
221 if node_id not in self.chain: | |
222 raise TreeError('Unknown node_id: %d.' % node_id) | |
223 if self.chain[node_id].succ==[]: | |
224 if self.chain[node_id].data: | |
225 return [self.chain[node_id].data.taxon] | |
226 else: | |
227 return None | |
228 else: | |
229 list=[] | |
230 for succ in self.chain[node_id].succ: | |
231 list.extend(self.get_taxa(succ)) | |
232 return list | |
233 | |
234 def get_terminals(self): | |
235 """Return a list of all terminal nodes.""" | |
236 return [i for i in self.all_ids() if self.node(i).succ==[]] | |
237 | |
238 def sum_branchlength(self,root=None,node=None): | |
239 """Adds up the branchlengths from root (default self.root) to node. | |
240 | |
241 sum = sum_branchlength(self,root=None,node=None) | |
242 """ | |
243 | |
244 if root is None: | |
245 root=self.root | |
246 if node is None: | |
247 raise TreeError('Missing node id.') | |
248 blen=0.0 | |
249 while node is not None and node is not root: | |
250 blen+=self.node(node).data.branchlength | |
251 node=self.node(node).prev | |
252 return blen | |
253 | |
254 def set_subtree(self,node): | |
255 """Return subtree as a set of nested sets. | |
256 | |
257 sets = set_subtree(self,node) | |
258 """ | |
259 | |
260 if self.node(node).succ==[]: | |
261 return self.node(node).data.taxon | |
262 else: | |
263 return sets.Set([self.set_subtree(n) for n in self.node(node).succ]) | |
264 | |
265 def is_identical(self,tree2): | |
266 """Compare tree and tree2 for identity. | |
267 | |
268 result = is_identical(self,tree2) | |
269 """ | |
270 return self.set_subtree(self.root)==tree2.set_subtree(tree2.root) | |
271 | |
272 def is_compatible(self,tree2,threshold,strict=True): | |
273 """Compares branches with support>threshold for compatibility. | |
274 | |
275 result = is_compatible(self,tree2,threshold) | |
276 """ | |
277 | |
278 # check if both trees have the same set of taxa. strict=True enforces this. | |
279 missing2=sets.Set(self.get_taxa())-sets.Set(tree2.get_taxa()) | |
280 missing1=sets.Set(tree2.get_taxa())-sets.Set(self.get_taxa()) | |
281 if strict and (missing1 or missing2): | |
282 if missing1: | |
283 print 'Taxon/taxa %s is/are missing in tree %s' % (','.join(missing1) , self.name) | |
284 if missing2: | |
285 print 'Taxon/taxa %s is/are missing in tree %s' % (','.join(missing2) , tree2.name) | |
286 raise TreeError, 'Can\'t compare trees with different taxon compositions.' | |
287 t1=[(sets.Set(self.get_taxa(n)),self.node(n).data.support) for n in self.all_ids() if \ | |
288 self.node(n).succ and\ | |
289 (self.node(n).data and self.node(n).data.support and self.node(n).data.support>=threshold)] | |
290 t2=[(sets.Set(tree2.get_taxa(n)),tree2.node(n).data.support) for n in tree2.all_ids() if \ | |
291 tree2.node(n).succ and\ | |
292 (tree2.node(n).data and tree2.node(n).data.support and tree2.node(n).data.support>=threshold)] | |
293 conflict=[] | |
294 for (st1,sup1) in t1: | |
295 for (st2,sup2) in t2: | |
296 if not st1.issubset(st2) and not st2.issubset(st1): # don't hiccup on upstream nodes | |
297 intersect,notin1,notin2=st1 & st2, st2-st1, st1-st2 # all three are non-empty sets | |
298 # if notin1==missing1 or notin2==missing2 <==> st1.issubset(st2) or st2.issubset(st1) ??? | |
299 if intersect and not (notin1.issubset(missing1) or notin2.issubset(missing2)): # omit conflicts due to missing taxa | |
300 conflict.append((st1,sup1,st2,sup2,intersect,notin1,notin2)) | |
301 return conflict | |
302 | |
303 def common_ancestor(self,node1,node2): | |
304 """Return the common ancestor that connects to nodes. | |
305 | |
306 node_id = common_ancestor(self,node1,node2) | |
307 """ | |
308 | |
309 l1=[self.root]+self.trace(self.root,node1) | |
310 l2=[self.root]+self.trace(self.root,node2) | |
311 return [n for n in l1 if n in l2][-1] | |
312 | |
313 | |
314 def distance(self,node1,node2): | |
315 """Add and return the sum of the branchlengths between two nodes. | |
316 dist = distance(self,node1,node2) | |
317 """ | |
318 | |
319 ca=self.common_ancestor(node1,node2) | |
320 return self.sum_branchlength(ca,node1)+self.sum_branchlength(ca,node2) | |
321 | |
322 def is_monophyletic(self,taxon_list): | |
323 """Return node_id of common ancestor if taxon_list is monophyletic, -1 otherwise. | |
324 | |
325 result = is_monophyletic(self,taxon_list) | |
326 """ | |
327 if isinstance(taxon_list,str): | |
328 taxon_set=sets.Set([taxon_list]) | |
329 else: | |
330 taxon_set=sets.Set(taxon_list) | |
331 node_id=self.root | |
332 while 1: | |
333 subclade_taxa=sets.Set(self.get_taxa(node_id)) | |
334 if subclade_taxa==taxon_set: # are we there? | |
335 return node_id | |
336 else: # check subnodes | |
337 for subnode in self.chain[node_id].succ: | |
338 if sets.Set(self.get_taxa(subnode)).issuperset(taxon_set): # taxon_set is downstream | |
339 node_id=subnode | |
340 break # out of for loop | |
341 else: | |
342 return -1 # taxon set was not with successors, for loop exhausted | |
343 | |
344 def is_bifurcating(self,node=None): | |
345 """Return True if tree downstream of node is strictly bifurcating.""" | |
346 if not node: | |
347 node=self.root | |
348 if node==self.root and len(self.node(node).succ)==3: #root can be trifurcating, because it has no ancestor | |
349 return self.is_bifurcating(self.node(node).succ[0]) and \ | |
350 self.is_bifurcating(self.node(node).succ[1]) and \ | |
351 self.is_bifurcating(self.node(node).succ[2]) | |
352 if len(self.node(node).succ)==2: | |
353 return self.is_bifurcating(self.node(node).succ[0]) and self.is_bifurcating(self.node(node).succ[1]) | |
354 elif len(self.node(node).succ)==0: | |
355 return True | |
356 else: | |
357 return False | |
358 | |
359 | |
360 | |
361 def branchlength2support(self): | |
362 """Move values stored in data.branchlength to data.support, and set branchlength to 0.0 | |
363 | |
364 This is necessary when support has been stored as branchlength (e.g. paup), and has thus | |
365 been read in as branchlength. | |
366 """ | |
367 | |
368 for n in self.chain.keys(): | |
369 self.node(n).data.support=self.node(n).data.branchlength | |
370 self.node(n).data.branchlength=0.0 | |
371 | |
372 def convert_absolute_support(self,nrep): | |
373 """Convert absolute support (clade-count) to rel. frequencies. | |
374 | |
375 Some software (e.g. PHYLIP consense) just calculate how often clades appear, instead of | |
376 calculating relative frequencies.""" | |
377 | |
378 for n in self._walk(): | |
379 if self.node(n).data.support: | |
380 self.node(n).data.support/=float(nrep) | |
381 | |
382 def randomize(self,ntax=None,taxon_list=None,branchlength=1.0,branchlength_sd=None,bifurcate=True): | |
383 """Generates a random tree with ntax taxa and/or taxa from taxlabels. | |
384 | |
385 new_tree = randomize(self,ntax=None,taxon_list=None,branchlength=1.0,branchlength_sd=None,bifurcate=True) | |
386 Trees are bifurcating by default. (Polytomies not yet supported). | |
387 """ | |
388 | |
389 if not ntax and taxon_list: | |
390 ntax=len(taxon_list) | |
391 elif not taxon_list and ntax: | |
392 taxon_list=['taxon'+str(i+1) for i in range(ntax)] | |
393 elif not ntax and not taxon_list: | |
394 raise TreeError('Either numer of taxa or list of taxa must be specified.') | |
395 elif ntax<>len(taxon_list): | |
396 raise TreeError('Length of taxon list must correspond to ntax.') | |
397 # initiate self with empty root | |
398 self.__init__() | |
399 terminals=self.get_terminals() | |
400 # bifurcate randomly at terminal nodes until ntax is reached | |
401 while len(terminals)<ntax: | |
402 newsplit=random.choice(terminals) | |
403 new_terminals=self.split(parent_id=newsplit,branchlength=branchlength) | |
404 # if desired, give some variation to the branch length | |
405 if branchlength_sd: | |
406 for nt in new_terminals: | |
407 bl=random.gauss(branchlength,branchlength_sd) | |
408 if bl<0: | |
409 bl=0 | |
410 self.node(nt).data.branchlength=bl | |
411 terminals.extend(new_terminals) | |
412 terminals.remove(newsplit) | |
413 # distribute taxon labels randomly | |
414 random.shuffle(taxon_list) | |
415 for (node,name) in zip(terminals,taxon_list): | |
416 self.node(node).data.taxon=name | |
417 | |
418 def display(self): | |
419 """Quick and dirty lists of all nodes.""" | |
420 table=[('#','taxon','prev','succ','brlen','blen (sum)','support')] | |
421 for i in self.all_ids(): | |
422 n=self.node(i) | |
423 if not n.data: | |
424 table.append((str(i),'-',str(n.prev),str(n.succ),'-','-','-')) | |
425 else: | |
426 tx=n.data.taxon | |
427 if not tx: | |
428 tx='-' | |
429 blength=n.data.branchlength | |
430 if blength is None: | |
431 blength='-' | |
432 sum_blength='-' | |
433 else: | |
434 sum_blength=self.sum_branchlength(node=i) | |
435 support=n.data.support | |
436 if support is None: | |
437 support='-' | |
438 table.append((str(i),tx,str(n.prev),str(n.succ),blength,sum_blength,support)) | |
439 print '\n'.join(['%3s %32s %15s %15s %8s %10s %8s' % l for l in table]) | |
440 print '\nRoot: ',self.root | |
441 | |
442 def to_string(self,support_as_branchlengths=False,branchlengths_only=False,plain=True,plain_newick=False): | |
443 """Return a paup compatible tree line. | |
444 | |
445 to_string(self,support_as_branchlengths=False,branchlengths_only=False,plain=True) | |
446 """ | |
447 # if there's a conflict in the arguments, we override plain=True | |
448 if support_as_branchlengths or branchlengths_only: | |
449 plain=False | |
450 self.support_as_branchlengths=support_as_branchlengths | |
451 self.branchlengths_only=branchlengths_only | |
452 self.plain=plain | |
453 | |
454 def make_info_string(data,terminal=False): | |
455 """Creates nicely formatted support/branchlengths.""" | |
456 # CHECK FORMATTING | |
457 if self.plain: # plain tree only. That's easy. | |
458 return '' | |
459 elif self.support_as_branchlengths: # support as branchlengths (eg. PAUP), ignore actual branchlengths | |
460 if terminal: # terminal branches have 100% support | |
461 return ':%1.2f' % self.max_support | |
462 else: | |
463 return ':%1.2f' % (data.support) | |
464 elif self.branchlengths_only: # write only branchlengths, ignore support | |
465 return ':%1.5f' % (data.branchlength) | |
466 else: # write suport and branchlengths (e.g. .con tree of mrbayes) | |
467 if terminal: | |
468 return ':%1.5f' % (data.branchlength) | |
469 else: | |
470 if data.branchlength is not None and data.support is not None: # we have blen and suppport | |
471 return '%1.2f:%1.5f' % (data.support,data.branchlength) | |
472 elif data.branchlength is not None: # we have only blen | |
473 return '0.00000:%1.5f' % (data.branchlength) | |
474 elif data.support is not None: # we have only support | |
475 return '%1.2f:0.00000' % (data.support) | |
476 else: | |
477 return '0.00:0.00000' | |
478 | |
479 def newickize(node): | |
480 """Convert a node tree to a newick tree recursively.""" | |
481 | |
482 if not self.node(node).succ: #terminal | |
483 return self.node(node).data.taxon+make_info_string(self.node(node).data,terminal=True) | |
484 else: | |
485 return '(%s)%s' % (','.join(map(newickize,self.node(node).succ)),make_info_string(self.node(node).data)) | |
486 return subtree | |
487 | |
488 treeline=['tree'] | |
489 if self.name: | |
490 treeline.append(self.name) | |
491 else: | |
492 treeline.append('a_tree') | |
493 treeline.append('=') | |
494 if self.weight<>1: | |
495 treeline.append('[&W%s]' % str(round(float(self.weight),3))) | |
496 if self.rooted: | |
497 treeline.append('[&R]') | |
498 treeline.append('(%s)' % ','.join(map(newickize,self.node(self.root).succ))) | |
499 treeline.append(';') | |
500 if plain_newick: | |
501 return treeline[-2] | |
502 else: | |
503 return ' '.join(treeline) | |
504 | |
505 def __str__(self): | |
506 """Short version of to_string(), gives plain tree""" | |
507 return self.to_string(plain=True) | |
508 | |
509 def unroot(self): | |
510 """Defines a unrooted Tree structure, using data of a rooted Tree.""" | |
511 | |
512 # travel down the rooted tree structure and save all branches and the nodes they connect | |
513 | |
514 def _get_branches(node): | |
515 branches=[] | |
516 for b in self.node(node).succ: | |
517 branches.append([node,b,self.node(b).data.branchlength,self.node(b).data.support]) | |
518 branches.extend(_get_branches(b)) | |
519 return branches | |
520 | |
521 self.unrooted=_get_branches(self.root) | |
522 # if root is bifurcating, then it is eliminated | |
523 if len(self.node(self.root).succ)==2: | |
524 # find the two branches that connect to root | |
525 rootbranches=[b for b in self.unrooted if self.root in b[:2]] | |
526 b1=self.unrooted.pop(self.unrooted.index(rootbranches[0])) | |
527 b2=self.unrooted.pop(self.unrooted.index(rootbranches[1])) | |
528 # Connect them two each other. If both have support, it should be identical (or one set to None?). | |
529 # If both have branchlengths, they will be added | |
530 newbranch=[b1[1],b2[1],b1[2]+b2[2]] | |
531 if b1[3] is None: | |
532 newbranch.append(b2[3]) # either None (both rootbranches are unsupported) or some support | |
533 elif b2[3] is None: | |
534 newbranch.append(b1[3]) # dito | |
535 elif b1[3]==b2[3]: | |
536 newbranch.append(b1[3]) # identical support | |
537 elif b1[3]==0 or b2[3]==0: | |
538 newbranch.append(b1[3]+b2[3]) # one is 0, take the other | |
539 else: | |
540 raise TreeError, 'Support mismatch in bifurcating root: %f, %f' % (float(b1[3]),float(b2[3])) | |
541 self.unrooted.append(newbranch) | |
542 | |
543 def root_with_outgroup(self,outgroup=None): | |
544 | |
545 def _connect_subtree(parent,child): | |
546 """Hook subtree starting with node child to parent.""" | |
547 for i,branch in enumerate(self.unrooted): | |
548 if parent in branch[:2] and child in branch[:2]: | |
549 branch=self.unrooted.pop(i) | |
550 break | |
551 else: | |
552 raise TreeError, 'Unable to connect nodes for rooting: nodes %d and %d are not connected' % (parent,child) | |
553 self.link(parent,child) | |
554 self.node(child).data.branchlength=branch[2] | |
555 self.node(child).data.support=branch[3] | |
556 #now check if there are more branches connected to the child, and if so, connect them | |
557 child_branches=[b for b in self.unrooted if child in b[:2]] | |
558 for b in child_branches: | |
559 if child==b[0]: | |
560 succ=b[1] | |
561 else: | |
562 succ=b[0] | |
563 _connect_subtree(child,succ) | |
564 | |
565 # check the outgroup we're supposed to root with | |
566 if outgroup is None: | |
567 return self.root | |
568 outgroup_node=self.is_monophyletic(outgroup) | |
569 if outgroup_node==-1: | |
570 return -1 | |
571 # if tree is already rooted with outgroup on a bifurcating root, | |
572 # or the outgroup includes all taxa on the tree, then we're fine | |
573 if (len(self.node(self.root).succ)==2 and outgroup_node in self.node(self.root).succ) or outgroup_node==self.root: | |
574 return self.root | |
575 | |
576 self.unroot() | |
577 # now we find the branch that connects outgroup and ingroup | |
578 #print self.node(outgroup_node).prev | |
579 for i,b in enumerate(self.unrooted): | |
580 if outgroup_node in b[:2] and self.node(outgroup_node).prev in b[:2]: | |
581 root_branch=self.unrooted.pop(i) | |
582 break | |
583 else: | |
584 raise TreeError, 'Unrooted and rooted Tree do not match' | |
585 if outgroup_node==root_branch[1]: | |
586 ingroup_node=root_branch[0] | |
587 else: | |
588 ingroup_node=root_branch[1] | |
589 # now we destroy the old tree structure, but keep node data. Nodes will be reconnected according to new outgroup | |
590 for n in self.all_ids(): | |
591 self.node(n).prev=None | |
592 self.node(n).succ=[] | |
593 # now we just add both subtrees (outgroup and ingroup) branch for branch | |
594 root=Nodes.Node(data=NodeData()) # new root | |
595 self.add(root) # add to tree description | |
596 self.root=root.id # set as root | |
597 self.unrooted.append([root.id,ingroup_node,root_branch[2],root_branch[3]]) # add branch to ingroup to unrooted tree | |
598 self.unrooted.append([root.id,outgroup_node,0.0,0.0]) # add branch to outgroup to unrooted tree | |
599 _connect_subtree(root.id,ingroup_node) # add ingroup | |
600 _connect_subtree(root.id,outgroup_node) # add outgroup | |
601 # if theres still a lonely node in self.chain, then it's the old root, and we delete it | |
602 oldroot=[i for i in self.all_ids() if self.node(i).prev is None and i!=self.root] | |
603 if len(oldroot)>1: | |
604 raise TreeError, 'Isolated nodes in tree description: %s' % ','.join(oldroot) | |
605 elif len(oldroot)==1: | |
606 self.kill(oldroot[0]) | |
607 return self.root | |
608 | |
609 | |
610 def consensus(trees, threshold=0.5,outgroup=None): | |
611 """Compute a majority rule consensus tree of all clades with relative frequency>=threshold from a list of trees.""" | |
612 | |
613 total=len(trees) | |
614 if total==0: | |
615 return None | |
616 # shouldn't we make sure that it's NodeData or subclass?? | |
617 dataclass=trees[0].dataclass | |
618 max_support=trees[0].max_support | |
619 clades={} | |
620 #countclades={} | |
621 alltaxa=sets.Set(trees[0].get_taxa()) | |
622 # calculate calde frequencies | |
623 c=0 | |
624 for t in trees: | |
625 c+=1 | |
626 #if c%50==0: | |
627 # print c | |
628 if alltaxa!=sets.Set(t.get_taxa()): | |
629 raise TreeError, 'Trees for consensus must contain the same taxa' | |
630 t.root_with_outgroup(outgroup=outgroup) | |
631 for st_node in t._walk(t.root): | |
632 subclade_taxa=t.get_taxa(st_node) | |
633 subclade_taxa.sort() | |
634 subclade_taxa=str(subclade_taxa) # lists are not hashable | |
635 if subclade_taxa in clades: | |
636 clades[subclade_taxa]+=float(t.weight)/total | |
637 else: | |
638 clades[subclade_taxa]=float(t.weight)/total | |
639 #if subclade_taxa in countclades: | |
640 # countclades[subclade_taxa]+=t.weight | |
641 #else: | |
642 # countclades[subclade_taxa]=t.weight | |
643 # weed out clades below threshold | |
644 for (c,p) in clades.items(): | |
645 if p<threshold: | |
646 del clades[c] | |
647 # create a tree with a root node | |
648 consensus=Tree(name='consensus_%2.1f' % float(threshold),data=dataclass) | |
649 # each clade needs a node in the new tree, add them as isolated nodes | |
650 for (c,s) in clades.items(): | |
651 node=Nodes.Node(data=dataclass()) | |
652 node.data.support=s | |
653 node.data.taxon=sets.Set(eval(c)) | |
654 consensus.add(node) | |
655 # set root node data | |
656 consensus.node(consensus.root).data.support=None | |
657 consensus.node(consensus.root).data.taxon=alltaxa | |
658 # we sort the nodes by no. of taxa in the clade, so root will be the last | |
659 consensus_ids=consensus.all_ids() | |
660 consensus_ids.sort(lambda x,y:len(consensus.node(x).data.taxon)-len(consensus.node(y).data.taxon)) | |
661 # now we just have to hook each node to the next smallest node that includes all taxa of the current | |
662 for i,current in enumerate(consensus_ids[:-1]): # skip the last one which is the root | |
663 #print '----' | |
664 #print 'current: ',consensus.node(current).data.taxon | |
665 # search remaining nodes | |
666 for parent in consensus_ids[i+1:]: | |
667 #print 'parent: ',consensus.node(parent).data.taxon | |
668 if consensus.node(parent).data.taxon.issuperset(consensus.node(current).data.taxon): | |
669 break | |
670 else: | |
671 sys.exit('corrupt tree structure?') | |
672 # internal nodes don't have taxa | |
673 if len(consensus.node(current).data.taxon)==1: | |
674 consensus.node(current).data.taxon=consensus.node(current).data.taxon.pop() | |
675 # reset the support for terminal nodes to maximum | |
676 #consensus.node(current).data.support=max_support | |
677 else: | |
678 consensus.node(current).data.taxon=None | |
679 consensus.link(parent,current) | |
680 # eliminate root taxon name | |
681 consensus.node(consensus_ids[-1]).data.taxon=None | |
682 return consensus | |
683 | |
684 | |
685 | |
686 |