1#! /usr/bin/env python 2# -*- coding: utf-8 -*- 3 4############################################################################## 5## DendroPy Phylogenetic Computing Library. 6## 7## Copyright 2010-2015 Jeet Sukumaran and Mark T. Holder. 8## All rights reserved. 9## 10## See "LICENSE.rst" for terms and conditions of usage. 11## 12## If you use this work or any portion thereof in published work, 13## please cite it as: 14## 15## Sukumaran, J. and M. T. Holder. 2010. DendroPy: a Python library 16## for phylogenetic computing. Bioinformatics 26: 1569-1571. 17## 18############################################################################## 19 20""" 21Models, modeling and model-fitting of parsimony. 22""" 23 24from functools import reduce 25import operator 26import dendropy 27from dendropy.utility.error import TaxonNamespaceIdentityError 28 29class _NodeStateSetMap(dict): 30 def __init__(self, taxon_state_sets_map=None): 31 self.taxon_state_sets_map = taxon_state_sets_map 32 def __getitem__(self, key): 33 try: 34 return dict.__getitem__(self, key) 35 except KeyError: 36 v = self.taxon_state_sets_map[key.taxon] 37 self[key] = v 38 return v 39 40def _store_sets_as_attr(n, state_sets_attr_name, v): 41 setattr(n, state_sets_attr_name, v) 42 43def _retrieve_state_sets_from_attr(n, state_sets_attr_name, taxon_state_sets_map): 44 try: 45 return getattr(n, state_sets_attr_name) 46 except AttributeError: 47 v = taxon_state_sets_map[n.taxon] 48 setattr(n, state_sets_attr_name, v) 49 return v 50 51def fitch_down_pass( 52 postorder_nodes, 53 state_sets_attr_name="state_sets", 54 taxon_state_sets_map=None, 55 weights=None, 56 score_by_character_list=None, 57 ): 58 """ 59 Returns the parsimony score given a list of nodes in postorder and 60 associated states, using Fitch's (1971) unordered parsimony algorithm. 61 62 Parameters 63 ---------- 64 postorder_nodes : iterable of/over |Node| objects 65 An iterable of |Node| objects in in order of post-order 66 traversal of the tree. 67 state_sets_attr_name : str 68 Name of attribute on |Node| objects in which state set lists 69 will stored/accessed. If |None|, then state sets will not be stored on 70 the tree. 71 taxon_state_sets_map : dict[taxon] = state sets 72 A dictionary that takes a taxon object as a key and returns a state set 73 list as a value. This will be used to populate the state set of a node 74 that has not yet had its state sets scored and recorded (typically, 75 leaves of a tree that has not yet been processed). 76 weights : iterable 77 A list of weights for each pattern. 78 score_by_character_list : None or list 79 If not |None|, should be a reference to a list object. 80 This list will be populated by the scores on a character-by-character 81 basis. 82 83 Returns 84 ------- 85 s : int 86 Parismony score of tree. 87 88 Notes 89 ----- 90 Currently this requires a bifurcating tree (even at the root). 91 92 Examples 93 -------- 94 95 Assume that we have a tree, ``tree``, and an associated data set, ``data``:: 96 97 import dendropy 98 from dendropy.model.parsimony import fitch_down_pass 99 100 taxa = dendropy.TaxonNamespace() 101 data = dendropy.StandardCharacterMatrix.get_from_path( 102 "apternodus.chars.nexus", 103 "nexus", 104 taxon_namespace=taxa) 105 tree = dendropy.Tree.get_from_path( 106 "apternodus.tre", 107 "nexus", 108 taxon_namespace=taxa) 109 taxon_state_sets_map = data.taxon_state_sets_map(gaps_as_missing=True) 110 111 The following will return the parsimony score of the ``tree`` with 112 respect to the data in ``data``:: 113 114 score = fitch_down_pass( 115 nodes=tree.postorder_node_iter(), 116 taxon_state_sets_map=taxon_set_map) 117 print(score) 118 119 In the above, every |Node| object of ``tree`` will have an attribute 120 added, "state_sets", that stores the list of state sets from the analysis:: 121 122 for nd in tree: 123 print(nd.state_sets) 124 125 If you want to store the list of state sets in a different attribute, e.g., 126 "analysis1_states":: 127 128 score = fitch_down_pass( 129 nodes=tree.postorder_node_iter(), 130 state_sets_attr_name="analysis1_states", 131 taxon_state_sets_map=taxon_set_map) 132 print(score) 133 for nd in tree: 134 print(nd.analysis1_states) 135 136 Or not to store these at all:: 137 138 score = fitch_down_pass( 139 nodes=tree.postorder_node_iter(), 140 state_sets_attr_name=None, 141 taxon_state_sets_map=taxon_set_map) 142 print(score) 143 144 Scoring custom data can be done by something like the following:: 145 146 taxa = dendropy.TaxonNamespace() 147 taxon_state_sets_map = {} 148 t1 = taxa.require_taxon("A") 149 t2 = taxa.require_taxon("B") 150 t3 = taxa.require_taxon("C") 151 t4 = taxa.require_taxon("D") 152 t5 = taxa.require_taxon("E") 153 taxon_state_sets_map[t1] = [ set([0,1]), set([0,1]), set([0]), set([0]) ] 154 taxon_state_sets_map[t2] = [ set([1]), set([1]), set([1]), set([0]) ] 155 taxon_state_sets_map[t3] = [ set([0]), set([1]), set([1]), set([0]) ] 156 taxon_state_sets_map[t4] = [ set([0]), set([1]), set([0,1]), set([1]) ] 157 taxon_state_sets_map[t5] = [ set([1]), set([0]), set([1]), set([1]) ] 158 tree = dendropy.Tree.get_from_string( 159 "(A,(B,(C,(D,E))));", "newick", 160 taxon_namespace=taxa) 161 score = fitch_down_pass(tree.postorder_node_iter(), 162 taxon_state_sets_map=taxon_state_sets_map) 163 print(score) 164 165 """ 166 if score_by_character_list is not None: 167 assert len(score_by_character_list) == 0 168 for idx in range(len(list(taxon_state_sets_map.values())[0])): # this is unacceptable! 169 score_by_character_list.append(0) 170 score = 0 171 if state_sets_attr_name is None: 172 node_state_set_map = _NodeStateSetMap(taxon_state_sets_map) 173 get_node_state_sets = lambda node : node_state_set_map[node] 174 set_node_state_sets = lambda node, v : node_state_set_map.__setitem__(node, v) 175 else: 176 get_node_state_sets = lambda node : _retrieve_state_sets_from_attr(node, state_sets_attr_name, taxon_state_sets_map) 177 set_node_state_sets = lambda node, v : _store_sets_as_attr(node, state_sets_attr_name, v) 178 for nd in postorder_nodes: 179 c = nd.child_nodes() 180 if not c: 181 ss = get_node_state_sets(nd) 182 continue 183 left_c, right_c = c[:2] 184 remaining = c[2:] 185 left_ssl = get_node_state_sets(left_c) 186 while True: 187 right_ssl = get_node_state_sets(right_c) 188 result = [] 189 for n, ssp in enumerate(zip(left_ssl, right_ssl)): 190 left_ss, right_ss = ssp 191 inter = left_ss.intersection(right_ss) 192 if inter: 193 result.append(inter) 194 else: 195 if weights is None: 196 wt = 1 197 else: 198 wt = weights[n] 199 score += wt 200 result.append(left_ss.union(left_ss, right_ss)) 201 if score_by_character_list is not None: 202 # try: 203 # score_by_character_list[n] += wt 204 # except IndexError: 205 # score_by_character_list.append(wt) 206 score_by_character_list[n] += wt 207 if remaining: 208 right_c = remaining.pop(0) 209 left_ssl = result 210 else: 211 break 212 # setattr(nd, state_sets_attr_name, result) 213 set_node_state_sets(nd, result) 214 return score 215 216def fitch_up_pass( 217 preorder_node_list, 218 state_sets_attr_name="state_sets", 219 taxon_state_sets_map=None): 220 """ 221 Finalizes the state set lists associated with each node using the "final 222 phase" of Fitch's (1971) unordered parsimony algorithm. 223 224 Parameters 225 ---------- 226 postorder_nodes : iterable of/over |Node| objects 227 An iterable of |Node| objects in in order of post-order 228 traversal of the tree. 229 state_sets_attr_name : str 230 Name of attribute on |Node| objects in which state set lists 231 will stored/accessed. If |None|, then state sets will not be stored on 232 the tree. 233 taxon_state_sets_map : dict[taxon] = state sets 234 A dictionary that takes a taxon object as a key and returns a state set 235 list as a value. This will be used to populate the state set of a node 236 that has not yet had its state sets scored and recorded (typically, 237 leaves of a tree that has not yet been processed). 238 239 Notes 240 ----- 241 Currently this requires a bifurcating tree (even at the root). 242 243 Examples 244 -------- 245 246 :: 247 248 taxa = dendropy.TaxonNamespace() 249 data = dendropy.StandardCharacterMatrix.get_from_path( 250 "apternodus.chars.nexus", 251 "nexus", 252 taxon_namespace=taxa) 253 tree = dendropy.Tree.get_from_path( 254 "apternodus.tre", 255 "nexus", 256 taxon_namespace=taxa) 257 taxon_state_sets_map = data.taxon_state_sets_map(gaps_as_missing=True) 258 score = fitch_down_pass(tree.postorder_node_iter(), 259 taxon_state_sets_map=taxon_state_sets_map) 260 print(score) 261 fitch_up_pass(tree.preorder_node_iter()) 262 for nd in tree: 263 print(nd.state_sets) 264 265 """ 266 node_state_sets_map = {} 267 for nd in preorder_node_list: 268 c = nd.child_nodes() 269 p = nd.parent_node 270 if (not c) or (not p): 271 continue 272 assert(len(c) == 2) 273 left_c, right_c = c 274 try: 275 left_ssl = getattr(left_c, state_sets_attr_name) 276 except AttributeError: 277 if not taxon_state_sets_map: 278 raise 279 left_ssl = taxon_state_sets_map[left_c.taxon] 280 try: 281 right_ssl = getattr(right_c, state_sets_attr_name) 282 except AttributeError: 283 if not taxon_state_sets_map: 284 raise 285 right_ssl = taxon_state_sets_map[right_c.taxon] 286 par_ssl = getattr(p, state_sets_attr_name) 287 curr_ssl = getattr(nd, state_sets_attr_name) 288 result = [] 289 for n, ssp in enumerate(zip(par_ssl, curr_ssl, left_ssl, right_ssl)): 290 par_ss, curr_ss, left_ss, right_ss = ssp 291 292 down_parup_inter = par_ss.intersection(curr_ss) 293 if down_parup_inter == par_ss: 294 final_ss = down_parup_inter 295 else: 296 rl_inter = left_ss.intersection(right_ss) 297 if not rl_inter: 298 final_ss = par_ss.union(curr_ss) 299 else: 300 in_par_and_left = par_ss.intersection(left_ss) 301 in_par_and_right = par_ss.intersection(right_ss) 302 final_ss = in_par_and_left.union(in_par_and_right, curr_ss) 303 #_LOG.debug("downpass = %s, par = %s, left = %s, right = %s, final_ss= %s" % 304 # (str(curr_ss), str(par_ss), str(left_ss), str(right_ss), str(final_ss))) 305 result.append(final_ss) 306 setattr(nd, state_sets_attr_name, result) 307 308 309def parsimony_score( 310 tree, 311 chars, 312 gaps_as_missing=True, 313 weights=None, 314 score_by_character_list=None, 315 ): 316 """ 317 Calculates the score of a tree, ``tree``, given some character data, 318 ``chars``, under the parsimony model using the Fitch algorithm. 319 320 Parameters 321 ---------- 322 tree : a |Tree| instance 323 A |Tree| to be scored. Must reference the same |TaxonNamespace| as 324 ``chars``. 325 chars : a |CharacterMatrix| instance 326 A |CharacterMatrix|-derived object with data to be scored. Must have 327 the same |TaxonNamespace| as ``tree``. 328 gap_as_missing : bool 329 If |True| [default], then gaps will be treated as missing data. 330 If |False|, then gaps will be treated as a new/additional state. 331 weights : iterable 332 A list of weights for each pattern/column in the matrix. 333 score_by_character_list : None or list 334 If not |None|, should be a reference to a list object. 335 This list will be populated by the scores on a character-by-character 336 basis. 337 338 Returns 339 ------- 340 pscore : int 341 The parsimony score of the tree given the data. 342 343 Examples 344 -------- 345 346 :: 347 348 import dendropy 349 from dendropy.calculate import treescore 350 351 # establish common taxon namespace 352 taxon_namespace = dendropy.TaxonNamespace() 353 354 # Read data; if data is, e.g., "standard", use StandardCharacterMatrix. 355 # If unsure of data type, can do: 356 # dataset = dendropy.DataSet.get( 357 # path="path/to/file.nex", 358 # schema="nexus", 359 # taxon_namespace=tns,) 360 # chars = dataset.char_matrices[0] 361 chars = dendropy.DnaCharacterMatrix.get( 362 path="pythonidae.chars.nexus", 363 schema="nexus", 364 taxon_namespace=taxon_namespace) 365 tree = dendropy.Tree.get( 366 path="pythonidae.mle.newick", 367 schema="newick", 368 taxon_namespace=taxon_namespace) 369 370 # We store the site-specific scores here 371 # This is optional; if we do not want to 372 # use the per-site scores, just pass in |None| 373 # for the ``score_by_character_list`` argument 374 # or do not specify this argument at all. 375 score_by_character_list = [] 376 377 score = treescore.parsimony_score( 378 tree, 379 chars, 380 gaps_as_missing=False, 381 score_by_character_list=score_by_character_list) 382 383 # Print the results: the score 384 print("Score: {}".format(score)) 385 386 # Print the results: the per-site scores 387 for idx, x in enumerate(score_by_character_list): 388 print("{}: {}".format(idx+1, x)) 389 390 Notes 391 ----- 392 393 If the same data is going to be used to score multiple trees or multiple times, 394 it is probably better to generate the 'taxon_state_sets_map' once and call 395 "fitch_down_pass" directly yourself, as this function generates a new map 396 each time. 397 398 """ 399 if tree.taxon_namespace is not chars.taxon_namespace: 400 raise TaxonNamespaceIdentityError(tree, data) 401 taxon_state_sets_map = chars.taxon_state_sets_map(gaps_as_missing=gaps_as_missing) 402 nodes = tree.postorder_node_iter() 403 pscore = fitch_down_pass(nodes, 404 taxon_state_sets_map=taxon_state_sets_map, 405 weights=weights, 406 score_by_character_list=score_by_character_list) 407 return pscore 408 409