1#! /usr/bin/env python 2# -*- coding: utf-8 -*- 3 4############################################################################## 5## DendroPy Phylogenetic Computing Library. 6## 7## Copyright 2010-2015 Jeet Sukumaran and Mark T. Holder. 8## All rights reserved. 9## 10## See "LICENSE.rst" for terms and conditions of usage. 11## 12## If you use this work or any portion thereof in published work, 13## please cite it as: 14## 15## Sukumaran, J. and M. T. Holder. 2010. DendroPy: a Python library 16## for phylogenetic computing. Bioinformatics 26: 1569-1571. 17## 18############################################################################## 19 20""" 21Classes and Methods for working with tree reconciliation, fitting, embedding, 22contained/containing etc. 23""" 24 25import dendropy 26from dendropy.model import coalescent 27 28class ContainingTree(dendropy.Tree): 29 """ 30 A "containing tree" is a (usually rooted) tree data structure within which 31 other trees are "contained". For example, species trees and their contained 32 gene trees; host trees and their contained parasite trees; biogeographical 33 "area" trees and their contained species or taxon trees. 34 """ 35 36 def __init__(self, 37 containing_tree, 38 contained_taxon_namespace, 39 contained_to_containing_taxon_map, 40 contained_trees=None, 41 fit_containing_edge_lengths=True, 42 collapse_empty_edges=True, 43 ultrametricity_precision=False, 44 ignore_root_deep_coalescences=True, 45 **kwargs): 46 """ 47 __init__ converts ``self`` to ContainingTree class, embedding the trees 48 given in the list, ``contained_trees.`` 49 50 51 Mandatory Arguments: 52 53 ``containing_tree`` 54 A |Tree| or |Tree|-like object that describes the topological 55 constraints or conditions of the containing tree (e.g., species, 56 host, or biogeographical area trees). 57 58 ``contained_taxon_namespace`` 59 A |TaxonNamespace| object that will be used to manage the taxa of 60 the contained trees. 61 62 ``contained_to_containing_taxon_map`` 63 A |TaxonNamespaceMapping| object mapping |Taxon| objects in the 64 contained |TaxonNamespace| to corresponding |Taxon| objects in the 65 containing tree. 66 67 Optional Arguments: 68 69 ``contained_trees`` 70 An iterable container of |Tree| or |Tree|-like objects that 71 will be contained into ``containing_tree``; e.g. gene or 72 parasite trees. 73 74 ``fit_containing_edge_lengths`` 75 If |True| [default], then the branch lengths of 76 ``containing_tree`` will be adjusted to fit the contained tree 77 as they are added. Otherwise, the containing tree edge lengths 78 will not be changed. 79 80 ``collapse_empty_edges`` 81 If |True| [default], after edge lengths are adjusted, 82 zero-length branches will be collapsed. 83 84 ``ultrametricity_precision`` 85 If |False| [default], then trees will not be checked for 86 ultrametricity. Otherwise this is the threshold within which 87 all node to tip distances for sister nodes must be equal. 88 89 ``ignore_root_deep_coalescences`` 90 If |True| [default], then deep coalescences in the root will 91 not be counted. 92 93 Other Keyword Arguments: Will be passed to Tree(). 94 95 """ 96 if "taxon_namespace" not in kwargs: 97 kwargs["taxon_namespace"] = containing_tree.taxon_namespace 98 dendropy.Tree.__init__(self, 99 containing_tree, 100 taxon_namespace=containing_tree.taxon_namespace) 101 self.original_tree = containing_tree 102 for edge in self.postorder_edge_iter(): 103 edge.head_contained_edges = {} 104 edge.tail_contained_edges = {} 105 edge.containing_taxa = set() 106 edge.contained_taxa = set() 107 self._contained_taxon_namespace = contained_taxon_namespace 108 self._contained_to_containing_taxon_map = None 109 self._contained_trees = None 110 self._set_contained_to_containing_taxon_map(contained_to_containing_taxon_map) 111 self.fit_containing_edge_lengths = fit_containing_edge_lengths 112 self.collapse_empty_edges = collapse_empty_edges 113 self.ultrametricity_precision = ultrametricity_precision 114 self.ignore_root_deep_coalescences = ignore_root_deep_coalescences 115 if contained_trees: 116 self._set_contained_trees(contained_trees) 117 if self.contained_trees: 118 self.rebuild(rebuild_taxa=False) 119 120 def _set_contained_taxon_namespace(self, taxon_namespace): 121 self._contained_taxon_namespace = taxon_namespace 122 123 def _get_contained_taxon_namespace(self): 124 if self._contained_taxon_namespace is None: 125 self._contained_taxon_namespace = dendropy.TaxonNamespace() 126 return self._contained_taxon_namespace 127 128 contained_taxon_namespace = property(_get_contained_taxon_namespace) 129 130 def _set_contained_to_containing_taxon_map(self, contained_to_containing_taxon_map): 131 """ 132 Sets mapping of |Taxon| objects of the genes/parasite/etc. to that of 133 the population/species/host/etc. 134 Creates mapping (e.g., species to genes) and decorates edges of self 135 with sets of both containing |Taxon| objects and the contained 136 |Taxon| objects that map to them. 137 """ 138 if isinstance(contained_to_containing_taxon_map, dendropy.TaxonNamespaceMapping): 139 if self._contained_taxon_namespace is not contained_to_containing_taxon_map.domain_taxon_namespace: 140 raise ValueError("Domain TaxonNamespace of TaxonNamespaceMapping ('domain_taxon_namespace') not the same as 'contained_taxon_namespace' TaxonNamespace") 141 self._contained_to_containing_taxon_map = contained_to_containing_taxon_map 142 else: 143 self._contained_to_containing_taxon_map = dendropy.TaxonNamespaceMapping( 144 mapping_dict=contained_to_containing_taxon_map, 145 domain_taxon_namespace=self.contained_taxon_namespace, 146 range_taxon_namespace=self.taxon_namespace) 147 self.build_edge_taxa_sets() 148 149 def _get_contained_to_containing_taxon_map(self): 150 return self._contained_to_containing_taxon_map 151 152 contained_to_containing_taxon_map = property(_get_contained_to_containing_taxon_map) 153 154 def _set_contained_trees(self, trees): 155 if hasattr(trees, 'taxon_namespace'): 156 if self._contained_taxon_namespace is None: 157 self._contained_taxon_namespace = trees.taxon_namespace 158 elif self._contained_taxon_namespace is not trees.taxon_namespace: 159 raise ValueError("'contained_taxon_namespace' of ContainingTree is not the same TaxonNamespace object of 'contained_trees'") 160 self._contained_trees = dendropy.TreeList(trees, taxon_namespace=self._contained_taxon_namespace) 161 if self._contained_taxon_namespace is None: 162 self._contained_taxon_namespace = self._contained_trees.taxon_namespace 163 164 def _get_contained_trees(self): 165 if self._contained_trees is None: 166 self._contained_trees = dendropy.TreeList(taxon_namespace=self._contained_taxon_namespace) 167 return self._contained_trees 168 169 contained_trees = property(_get_contained_trees) 170 171 def _get_containing_to_contained_taxa_map(self): 172 return self._contained_to_containing_taxon_map.reverse 173 174 containing_to_contained_taxa_map = property(_get_containing_to_contained_taxa_map) 175 176 def clear(self): 177 """ 178 Clears all contained trees and mapped edges. 179 """ 180 self.contained_trees = dendropy.TreeList(taxon_namespace=self._contained_to_containing_taxon_map.domain_taxa) 181 self.clear_contained_edges() 182 183 def clear_contained_edges(self): 184 """ 185 Clears all contained mapped edges. 186 """ 187 for edge in self.postorder_edge_iter(): 188 edge.head_contained_edges = {} 189 edge.tail_contained_edges = {} 190 191 def fit_edge_lengths(self, contained_trees): 192 """ 193 Recalculate node ages / edge lengths of containing tree to accomodate 194 contained trees. 195 """ 196 197 # set the ages 198 for node in self.postorder_node_iter(): 199 if node.is_internal(): 200 disjunct_leaf_set_list_split_bitmasks = [] 201 for i in node.child_nodes(): 202 disjunct_leaf_set_list_split_bitmasks.append(self.taxon_namespace.taxa_bitmask(taxa=i.edge.containing_taxa)) 203 min_age = float('inf') 204 for et in contained_trees: 205 min_age = self._find_youngest_intergroup_age(et, disjunct_leaf_set_list_split_bitmasks, min_age) 206 node.age = max( [min_age] + [cn.age for cn in node.child_nodes()] ) 207 else: 208 node.age = 0 209 210 # set the corresponding edge lengths 211 self.set_edge_lengths_from_node_ages() 212 213 # collapse 0-length branches 214 if self.collapse_empty_edges: 215 self.collapse_unweighted_edges() 216 217 def rebuild(self, rebuild_taxa=True): 218 """ 219 Recalculate edge taxa sets, node ages / edge lengths of containing 220 tree, and embed edges of contained trees. 221 """ 222 if rebuild_taxa: 223 self.build_edge_taxa_sets() 224 if self.fit_containing_edge_lengths: 225 self.fit_edge_lengths(self.contained_trees) 226 self.clear_contained_edges() 227 for et in self.contained_trees: 228 self.embed_tree(et) 229 230 def embed_tree(self, contained_tree): 231 """ 232 Map edges of contained tree into containing tree (i.e., self). 233 """ 234 if self.seed_node.age is None: 235 self.calc_node_ages(ultrametricity_precision=self.ultrametricity_precision) 236 if contained_tree not in self.contained_trees: 237 self.contained_trees.append(contained_tree) 238 if self.fit_containing_edge_lengths: 239 self.fit_edge_lengths(self.contained_trees) 240 if contained_tree.seed_node.age is None: 241 contained_tree.calc_node_ages(ultrametricity_precision=self.ultrametricity_precision) 242 contained_leaves = contained_tree.leaf_nodes() 243 taxon_to_contained = {} 244 for nd in contained_leaves: 245 containing_taxon = self.contained_to_containing_taxon_map[nd.taxon] 246 x = taxon_to_contained.setdefault(containing_taxon, set()) 247 x.add(nd.edge) 248 for containing_edge in self.postorder_edge_iter(): 249 if containing_edge.is_terminal(): 250 containing_edge.head_contained_edges[contained_tree] = taxon_to_contained[containing_edge.head_node.taxon] 251 else: 252 containing_edge.head_contained_edges[contained_tree] = set() 253 for nd in containing_edge.head_node.child_nodes(): 254 containing_edge.head_contained_edges[contained_tree].update(nd.edge.tail_contained_edges[contained_tree]) 255 256 if containing_edge.tail_node is None: 257 if containing_edge.length is not None: 258 target_age = containing_edge.head_node.age + containing_edge.length 259 else: 260 # assume all coalesce? 261 containing_edge.tail_contained_edges[contained_tree] = set([contained_tree.seed_node.edge]) 262 continue 263 else: 264 target_age = containing_edge.tail_node.age 265 266 containing_edge.tail_contained_edges[contained_tree] = set() 267 for contained_edge in containing_edge.head_contained_edges[contained_tree]: 268 if contained_edge.tail_node is not None: 269 remaining = target_age - contained_edge.tail_node.age 270 elif contained_edge.length is not None: 271 remaining = target_age - (contained_edge.head_node.age + contained_edge.length) 272 else: 273 continue 274 while remaining > 0: 275 if contained_edge.tail_node is not None: 276 contained_edge = contained_edge.tail_node.edge 277 else: 278 if contained_edge.length is not None and (remaining - contained_edge.length) <= 0: 279 contained_edge = None 280 remaining = 0 281 break 282 else: 283 remaining = 0 284 break 285 if contained_edge and remaining > 0: 286 remaining -= contained_edge.length 287 if contained_edge is not None: 288 containing_edge.tail_contained_edges[contained_tree].add(contained_edge) 289 290 def build_edge_taxa_sets(self): 291 """ 292 Rebuilds sets of containing and corresponding contained taxa at each 293 edge. 294 """ 295 for edge in self.postorder_edge_iter(): 296 if edge.is_terminal(): 297 edge.containing_taxa = set([edge.head_node.taxon]) 298 else: 299 edge.containing_taxa = set() 300 for i in edge.head_node.child_nodes(): 301 edge.containing_taxa.update(i.edge.containing_taxa) 302 edge.contained_taxa = set() 303 for t in edge.containing_taxa: 304 edge.contained_taxa.update(self.containing_to_contained_taxa_map[t]) 305 306 def num_deep_coalescences(self): 307 """ 308 Returns total number of deep coalescences of the contained trees. 309 """ 310 return sum(self.deep_coalescences().values()) 311 312 def deep_coalescences(self): 313 """ 314 Returns dictionary where the contained trees are keys, and the number of 315 deep coalescences corresponding to the tree are values. 316 """ 317 dc = {} 318 for tree in self.contained_trees: 319 for edge in self.postorder_edge_iter(): 320 if edge.tail_node is None and self.ignore_root_deep_coalescences: 321 continue 322 try: 323 dc[tree] += len(edge.tail_contained_edges[tree]) - 1 324 except KeyError: 325 dc[tree] = len(edge.tail_contained_edges[tree]) - 1 326 return dc 327 328 def embed_contained_kingman(self, 329 edge_pop_size_attr='pop_size', 330 default_pop_size=1, 331 label=None, 332 rng=None, 333 use_expected_tmrca=False): 334 """ 335 Simulates, *embeds*, and returns a "censored" (Kingman) neutral coalescence tree 336 conditional on self. 337 338 ``rng`` 339 Random number generator to use. If |None|, the default will 340 be used. 341 342 ``edge_pop_size_attr`` 343 Name of attribute of self's edges that specify the population 344 size. If this attribute does not exist, then the population 345 size is taken to be 1. 346 347 Note that all edge-associated taxon sets must be up-to-date (otherwise, 348 ``build_edge_taxa_sets()`` should be called). 349 """ 350 et = self.simulate_contained_kingman( 351 edge_pop_size_attr=edge_pop_size_attr, 352 default_pop_size=default_pop_size, 353 label=label, 354 rng=rng, 355 use_expected_tmrca=use_expected_tmrca) 356 self.embed_tree(et) 357 return et 358 359 def simulate_contained_kingman(self, 360 edge_pop_size_attr='pop_size', 361 default_pop_size=1, 362 label=None, 363 rng=None, 364 use_expected_tmrca=False): 365 """ 366 Simulates and returns a "censored" (Kingman) neutral coalescence tree 367 conditional on self. 368 369 ``rng`` 370 Random number generator to use. If |None|, the default will 371 be used. 372 373 ``edge_pop_size_attr`` 374 Name of attribute of self's edges that specify the population 375 size. If this attribute does not exist, then the population 376 size is taken to be 1. 377 378 Note that all edge-associated taxon sets must be up-to-date (otherwise, 379 ``build_edge_taxa_sets()`` should be called), and that the tree 380 is *not* added to the set of contained trees. For the latter, call 381 ``embed_contained_kingman``. 382 """ 383 384 # Dictionary that maps nodes of containing tree to list of 385 # corresponding nodes on gene tree, initially populated with leaf 386 # nodes. 387 contained_nodes = {} 388 for nd in self.leaf_node_iter(): 389 contained_nodes[nd] = [] 390 for gt in nd.edge.contained_taxa: 391 gn = dendropy.Node(taxon=gt) 392 contained_nodes[nd].append(gn) 393 394 # Generate the tree structure 395 for edge in self.postorder_edge_iter(): 396 if edge.head_node.parent_node is None: 397 # root: run unconstrained coalescence until just one gene node 398 # remaining 399 if hasattr(edge, edge_pop_size_attr): 400 pop_size = getattr(edge, edge_pop_size_attr) 401 else: 402 pop_size = default_pop_size 403 if len(contained_nodes[edge.head_node]) > 1: 404 final = coalescent.coalesce_nodes(nodes=contained_nodes[edge.head_node], 405 pop_size=pop_size, 406 period=None, 407 rng=rng, 408 use_expected_tmrca=use_expected_tmrca) 409 else: 410 final = contained_nodes[edge.head_node] 411 else: 412 # run until next coalescence event, as determined by this edge 413 # size. 414 if hasattr(edge, edge_pop_size_attr): 415 pop_size = getattr(edge, edge_pop_size_attr) 416 else: 417 pop_size = default_pop_size 418 remaining = coalescent.coalesce_nodes(nodes=contained_nodes[edge.head_node], 419 pop_size=pop_size, 420 period=edge.length, 421 rng=rng, 422 use_expected_tmrca=use_expected_tmrca) 423 try: 424 contained_nodes[edge.tail_node].extend(remaining) 425 except KeyError: 426 contained_nodes[edge.tail_node] = remaining 427 428 # Create and return the full tree 429 contained_tree = dendropy.Tree(taxon_namespace=self.contained_taxon_namespace, label=label) 430 contained_tree.seed_node = final[0] 431 contained_tree.is_rooted = True 432 return contained_tree 433 434 def _find_youngest_intergroup_age(self, contained_tree, disjunct_leaf_set_list_split_bitmasks, starting_min_age=None): 435 """ 436 Find the age of the youngest MRCA of disjunct leaf sets. 437 """ 438 if starting_min_age is None: 439 starting_min_age = float('inf') 440 if contained_tree.seed_node.age is None: 441 contained_tree.calc_node_ages(ultrametricity_precision=self.ultrametricity_precision) 442 for nd in contained_tree.ageorder_node_iter(include_leaves=False): 443 if nd.age > starting_min_age: 444 break 445 prev_intersections = False 446 for bm in disjunct_leaf_set_list_split_bitmasks: 447 if bm & nd.edge.split_bitmask: 448 if prev_intersections: 449 return nd.age 450 prev_intersections = True 451 return starting_min_age 452 453 def write_as_mesquite(self, out, **kwargs): 454 """ 455 For debugging purposes, write out a Mesquite-format file. 456 """ 457 from dendropy.dataio import nexuswriter 458 nw = nexuswriter.NexusWriter(**kwargs) 459 nw.is_write_block_titles = True 460 out.write("#NEXUS\n\n") 461 nw._write_taxa_block(out, self.taxon_namespace) 462 out.write('\n') 463 nw._write_taxa_block(out, self.contained_trees.taxon_namespace) 464 if self.contained_trees.taxon_namespace.label: 465 domain_title = self.contained_trees.taxon_namespace.label 466 else: 467 domain_title = self.contained_trees.taxon_namespace.oid 468 contained_taxon_namespace = self.contained_trees.taxon_namespace 469 contained_label = self.contained_trees.label 470 out.write('\n') 471 self._contained_to_containing_taxon_map.write_mesquite_association_block(out) 472 out.write('\n') 473 nw._write_trees_block(out, dendropy.TreeList([self], taxon_namespace=self.taxon_namespace)) 474 out.write('\n') 475 nw._write_trees_block(out, dendropy.TreeList(self.contained_trees, taxon_namespace=contained_taxon_namespace, label=contained_label)) 476 out.write('\n') 477 478def reconciliation_discordance(gene_tree, species_tree): 479 """ 480 Given two trees (with splits encoded), this returns the number of gene 481 duplications implied by the gene tree reconciled on the species tree, based 482 on the algorithm described here: 483 484 Goodman, M. J. Czelnusiniak, G. W. Moore, A. E. Romero-Herrera, and 485 G. Matsuda. 1979. Fitting the gene lineage into its species lineage, 486 a parsimony strategy illustrated by cladograms constructed from globin 487 sequences. Syst. Zool. 19: 99-113. 488 489 Maddison, W. P. 1997. Gene trees in species trees. Syst. Biol. 46: 490 523-536. 491 492 This function requires that the gene tree and species tree *have the same 493 leaf set*. Note that for correct results, 494 495 (a) trees must be rooted (i.e., is_rooted = True) 496 (b) split masks must have been added as rooted (i.e., when 497 encode_splits was called, is_rooted must have been set to True) 498 499 """ 500 taxa_mask = species_tree.taxon_namespace.all_taxa_bitmask() 501 species_node_gene_nodes = {} 502 gene_node_species_nodes = {} 503 for gnd in gene_tree.postorder_node_iter(): 504 gn_children = gnd.child_nodes() 505 if len(gn_children) > 0: 506 ssplit = 0 507 for gn_child in gn_children: 508 ssplit = ssplit | gene_node_species_nodes[gn_child].edge.leafset_bitmask 509 sanc = species_tree.mrca(start_node=species_tree.seed_node, leafset_bitmask=ssplit) 510 gene_node_species_nodes[gnd] = sanc 511 if sanc not in species_node_gene_nodes: 512 species_node_gene_nodes[sanc] = [] 513 species_node_gene_nodes[sanc].append(gnd) 514 else: 515 gene_node_species_nodes[gnd] = species_tree.find_node(lambda x : x.taxon == gnd.taxon) 516 contained_gene_lineages = {} 517 for snd in species_tree.postorder_node_iter(): 518 if snd in species_node_gene_nodes: 519 for gnd in species_node_gene_nodes[snd]: 520 for gnd_child in gnd.child_nodes(): 521 sanc = gene_node_species_nodes[gnd_child] 522 p = sanc 523 while p is not None and p != snd: 524 if p.edge not in contained_gene_lineages: 525 contained_gene_lineages[p.edge] = 0 526 contained_gene_lineages[p.edge] += 1 527 p = p.parent_node 528 529 dc = 0 530 for v in contained_gene_lineages.values(): 531 dc += v - 1 532 return dc 533 534def monophyletic_partition_discordance(tree, taxon_namespace_partition): 535 """ 536 Returns the number of deep coalescences on tree ``tree`` that would result 537 if the taxa in ``tax_sets`` formed K mutually-exclusive monophyletic groups, 538 where K = len(tax_sets) 539 ``taxon_namespace_partition`` == TaxonNamespacePartition 540 """ 541 542 tax_sets = taxon_namespace_partition.subsets() 543 544 # from dendropy.model import parsimony 545 # taxon_state_sets_map = {} 546 # assert tree.taxon_namespace is taxon_namespace_partition.taxon_namespace 547 # for taxon in tree.taxon_namespace: 548 # taxon_state_sets_map[taxon] = [0 for i in range(len(tax_sets))] 549 # for idx, ts in enumerate(tax_sets): 550 # for taxon in ts: 551 # taxon_state_sets_map[taxon][idx] = 1 552 # for taxon in tree.taxon_namespace: 553 # taxon_state_sets_map[taxon] = [set([i]) for i in taxon_state_sets_map[taxon]] 554 # return parsimony.fitch_down_pass( 555 # postorder_nodes=tree.postorder_node_iter(), 556 # taxon_state_sets_map=taxon_state_sets_map 557 # ) 558 559 dc_tree = dendropy.Tree() 560 dc_tree.taxon_namespace = dendropy.TaxonNamespace() 561 for t in range(len(tax_sets)): 562 dc_tree.taxon_namespace.add_taxon(dendropy.Taxon(label=str(t))) 563 def _get_dc_taxon(nd): 564 for idx, tax_set in enumerate(tax_sets): 565 if nd.taxon in tax_set: 566 return dc_tree.taxon_namespace[idx] 567 assert "taxon not found in partition: '%s'" % nd.taxon.label 568 src_dc_map = {} 569 for snd in tree.postorder_node_iter(): 570 nnd = dendropy.Node() 571 src_dc_map[snd] = nnd 572 children = snd.child_nodes() 573 if len(children) == 0: 574 nnd.taxon = _get_dc_taxon(snd) 575 else: 576 taxa_set = [] 577 for cnd in children: 578 dc_node = src_dc_map[cnd] 579 if len(dc_node.child_nodes()) > 1: 580 nnd.add_child(dc_node) 581 else: 582 ctax = dc_node.taxon 583 if ctax is not None and ctax not in taxa_set: 584 taxa_set.append(ctax) 585 del src_dc_map[cnd] 586 if len(taxa_set) > 1: 587 for t in taxa_set: 588 cnd = dendropy.Node() 589 cnd.taxon = t 590 nnd.add_child(cnd) 591 else: 592 if len(nnd.child_nodes()) == 0: 593 nnd.taxon = taxa_set[0] 594 elif len(taxa_set) == 1: 595 cnd = dendropy.Node() 596 cnd.taxon = taxa_set[0] 597 nnd.add_child(cnd) 598 dc_tree.seed_node = nnd 599 return len(dc_tree.leaf_nodes()) - len(tax_sets) 600 601