1# ---------------------------------------------------------------------------- 2# Copyright (c) 2013--, scikit-bio development team. 3# 4# Distributed under the terms of the Modified BSD License. 5# 6# The full license is in the file COPYING.txt, distributed with this software. 7# ---------------------------------------------------------------------------- 8 9import functools 10 11import numpy as np 12 13from skbio.util._decorator import experimental 14from skbio.diversity._util import (_validate_counts_matrix, 15 _validate_otu_ids_and_tree, 16 _vectorize_counts_and_tree) 17from skbio.diversity._phylogenetic import _tip_distances 18 19 20# The default value indicating whether normalization should be applied 21# for weighted UniFrac. This is used in two locations, so set in a single 22# variable to avoid the code base becoming out of sync in the event of a 23# change in this default value. 24_normalize_weighted_unifrac_by_default = False 25 26 27@experimental(as_of="0.4.1") 28def unweighted_unifrac(u_counts, v_counts, otu_ids, tree, validate=True): 29 """ Compute unweighted UniFrac 30 31 Parameters 32 ---------- 33 u_counts, v_counts: list, np.array 34 Vectors of counts/abundances of OTUs for two samples. Must be equal 35 length. 36 otu_ids: list, np.array 37 Vector of OTU ids corresponding to tip names in ``tree``. Must be the 38 same length as ``u_counts`` and ``v_counts``. 39 tree: skbio.TreeNode 40 Tree relating the OTUs in otu_ids. The set of tip names in the tree can 41 be a superset of ``otu_ids``, but not a subset. 42 validate: bool, optional 43 If `False`, validation of the input won't be performed. This step can 44 be slow, so if validation is run elsewhere it can be disabled here. 45 However, invalid input data can lead to invalid results or error 46 messages that are hard to interpret, so this step should not be 47 bypassed if you're not certain that your input data are valid. See 48 :mod:`skbio.diversity` for the description of what validation entails 49 so you can determine if you can safely disable validation. 50 51 Returns 52 ------- 53 float 54 The unweighted UniFrac distance between the two samples. 55 56 Raises 57 ------ 58 ValueError, MissingNodeError, DuplicateNodeError 59 If validation fails. Exact error will depend on what was invalid. 60 61 See Also 62 -------- 63 weighted_unifrac 64 skbio.diversity 65 skbio.diversity.beta_diversity 66 67 Notes 68 ----- 69 Unweighted UniFrac was originally described in [1]_. A discussion of 70 unweighted (qualitative) versus weighted (quantitative) diversity metrics 71 is presented in [2]_. Deeper mathematical discussions of this metric is 72 presented in [3]_. 73 74 If computing unweighted UniFrac for multiple pairs of samples, using 75 ``skbio.diversity.beta_diversity`` will be much faster than calling this 76 function individually on each sample. 77 78 This implementation differs from that in PyCogent (and therefore QIIME 79 versions less than 2.0.0) by imposing a few additional restrictions on the 80 inputs. First, the input tree must be rooted. In PyCogent, if an unrooted 81 tree was provided that had a single trifurcating node (a newick convention 82 for unrooted trees) that node was considered the root of the tree. Next, 83 all OTU IDs must be tips in the tree. PyCogent would silently ignore OTU 84 IDs that were not present the tree. To reproduce UniFrac results from 85 PyCogent with scikit-bio, ensure that your PyCogent UniFrac calculations 86 are performed on a rooted tree and that all OTU IDs are present in the 87 tree. 88 89 This implementation of unweighted UniFrac is the array-based implementation 90 described in [4]_. 91 92 References 93 ---------- 94 .. [1] Lozupone, C. & Knight, R. UniFrac: a new phylogenetic method for 95 comparing microbial communities. Appl. Environ. Microbiol. 71, 8228-8235 96 (2005). 97 98 .. [2] Lozupone, C. A., Hamady, M., Kelley, S. T. & Knight, R. Quantitative 99 and qualitative beta diversity measures lead to different insights into 100 factors that structure microbial communities. Appl. Environ. Microbiol. 101 73, 1576-1585 (2007). 102 103 .. [3] Lozupone, C., Lladser, M. E., Knights, D., Stombaugh, J. & Knight, 104 R. UniFrac: an effective distance metric for microbial community 105 comparison. ISME J. 5, 169-172 (2011). 106 107 .. [4] Hamady M, Lozupone C, Knight R. Fast UniFrac: facilitating high- 108 throughput phylogenetic analyses of microbial communities including 109 analysis of pyrosequencing and PhyloChip data. ISME J. 4(1):17-27 110 (2010). 111 112 Examples 113 -------- 114 Assume we have the following abundance data for two samples, ``u`` and 115 ``v``, represented as a pair of counts vectors. These counts represent the 116 number of times specific Operational Taxonomic Units, or OTUs, were 117 observed in each of the samples. 118 119 >>> u_counts = [1, 0, 0, 4, 1, 2, 3, 0] 120 >>> v_counts = [0, 1, 1, 6, 0, 1, 0, 0] 121 122 Because UniFrac is a phylogenetic diversity metric, we need to know which 123 OTU each count corresponds to, which we'll provide as ``otu_ids``. 124 125 >>> otu_ids = ['OTU1', 'OTU2', 'OTU3', 'OTU4', 'OTU5', 'OTU6', 'OTU7', 126 ... 'OTU8'] 127 128 We also need a phylogenetic tree that relates the OTUs to one another. 129 130 >>> from io import StringIO 131 >>> from skbio import TreeNode 132 >>> tree = TreeNode.read(StringIO( 133 ... '(((((OTU1:0.5,OTU2:0.5):0.5,OTU3:1.0):1.0):0.0,' 134 ... '(OTU4:0.75,(OTU5:0.5,((OTU6:0.33,OTU7:0.62):0.5' 135 ... ',OTU8:0.5):0.5):0.5):1.25):0.0)root;')) 136 137 We can then compute the unweighted UniFrac distance between the samples. 138 139 >>> from skbio.diversity.beta import unweighted_unifrac 140 >>> uu = unweighted_unifrac(u_counts, v_counts, otu_ids, tree) 141 >>> print(round(uu, 2)) 142 0.37 143 144 """ 145 u_node_counts, v_node_counts, _, _, tree_index =\ 146 _setup_pairwise_unifrac(u_counts, v_counts, otu_ids, tree, validate, 147 normalized=False, unweighted=True) 148 return _unweighted_unifrac(u_node_counts, v_node_counts, 149 tree_index['length']) 150 151 152@experimental(as_of="0.4.1") 153def weighted_unifrac(u_counts, v_counts, otu_ids, tree, 154 normalized=_normalize_weighted_unifrac_by_default, 155 validate=True): 156 """ Compute weighted UniFrac with or without branch length normalization 157 158 Parameters 159 ---------- 160 u_counts, v_counts: list, np.array 161 Vectors of counts/abundances of OTUs for two samples. Must be equal 162 length. 163 otu_ids: list, np.array 164 Vector of OTU ids corresponding to tip names in ``tree``. Must be the 165 same length as ``u_counts`` and ``v_counts``. 166 tree: skbio.TreeNode 167 Tree relating the OTUs in otu_ids. The set of tip names in the tree can 168 be a superset of ``otu_ids``, but not a subset. 169 normalized: boolean, optional 170 If ``True``, apply branch length normalization, which is described in 171 [1]_. Resulting distances will then be in the range ``[0, 1]``. 172 validate: bool, optional 173 If `False`, validation of the input won't be performed. This step can 174 be slow, so if validation is run elsewhere it can be disabled here. 175 However, invalid input data can lead to invalid results or error 176 messages that are hard to interpret, so this step should not be 177 bypassed if you're not certain that your input data are valid. See 178 :mod:`skbio.diversity` for the description of what validation entails 179 so you can determine if you can safely disable validation. 180 181 Returns 182 ------- 183 float 184 The weighted UniFrac distance between the two samples. 185 186 Raises 187 ------ 188 ValueError, MissingNodeError, DuplicateNodeError 189 If validation fails. Exact error will depend on what was invalid. 190 191 See Also 192 -------- 193 unweighted_unifrac 194 skbio.diversity 195 skbio.diversity.beta_diversity 196 197 Notes 198 ----- 199 Weighted UniFrac was originally described in [1]_, which includes a 200 discussion of unweighted (qualitative) versus weighted (quantitiative) 201 diversity metrics. Deeper mathemtical discussions of this metric is 202 presented in [2]_. 203 204 If computing weighted UniFrac for multiple pairs of samples, using 205 ``skbio.diversity.beta_diversity`` will be much faster than calling this 206 function individually on each sample. 207 208 This implementation differs from that in PyCogent (and therefore QIIME 209 versions less than 2.0.0) by imposing a few additional restrictions on the 210 inputs. First, the input tree must be rooted. In PyCogent, if an unrooted 211 tree was provided that had a single trifurcating node (a newick convention 212 for unrooted trees) that node was considered the root of the tree. Next, 213 all OTU IDs must be tips in the tree. PyCogent would silently ignore OTU 214 IDs that were not present the tree. To reproduce UniFrac results from 215 PyCogent with scikit-bio, ensure that your PyCogent UniFrac calculations 216 are performed on a rooted tree and that all OTU IDs are present in the 217 tree. 218 219 This implementation of weighted UniFrac is the array-based implementation 220 described in [3]_. 221 222 References 223 ---------- 224 .. [1] Lozupone, C. A., Hamady, M., Kelley, S. T. & Knight, R. Quantitative 225 and qualitative beta diversity measures lead to different insights into 226 factors that structure microbial communities. Appl. Environ. Microbiol. 227 73, 1576-1585 (2007). 228 229 .. [2] Lozupone, C., Lladser, M. E., Knights, D., Stombaugh, J. & Knight, 230 R. UniFrac: an effective distance metric for microbial community 231 comparison. ISME J. 5, 169-172 (2011). 232 233 .. [3] Hamady M, Lozupone C, Knight R. Fast UniFrac: facilitating high- 234 throughput phylogenetic analyses of microbial communities including 235 analysis of pyrosequencing and PhyloChip data. ISME J. 4(1):17-27 236 (2010). 237 238 Examples 239 -------- 240 Assume we have the following abundance data for two samples, ``u`` and 241 ``v``, represented as a pair of counts vectors. These counts represent the 242 number of times specific Operational Taxonomic Units, or OTUs, were 243 observed in each of the samples. 244 245 >>> u_counts = [1, 0, 0, 4, 1, 2, 3, 0] 246 >>> v_counts = [0, 1, 1, 6, 0, 1, 0, 0] 247 248 Because UniFrac is a phylogenetic diversity metric, we need to know which 249 OTU each count corresponds to, which we'll provide as ``otu_ids``. 250 251 >>> otu_ids = ['OTU1', 'OTU2', 'OTU3', 'OTU4', 'OTU5', 'OTU6', 'OTU7', 252 ... 'OTU8'] 253 254 We also need a phylogenetic tree that relates the OTUs to one another. 255 256 >>> from io import StringIO 257 >>> from skbio import TreeNode 258 >>> tree = TreeNode.read(StringIO( 259 ... '(((((OTU1:0.5,OTU2:0.5):0.5,OTU3:1.0):1.0):0.0,' 260 ... '(OTU4:0.75,(OTU5:0.5,((OTU6:0.33,OTU7:0.62):0.5' 261 ... ',OTU8:0.5):0.5):0.5):1.25):0.0)root;')) 262 263 Compute the weighted UniFrac distance between the samples. 264 265 >>> from skbio.diversity.beta import weighted_unifrac 266 >>> wu = weighted_unifrac(u_counts, v_counts, otu_ids, tree) 267 >>> print(round(wu, 2)) 268 1.54 269 270 Compute the weighted UniFrac distance between the samples including 271 branch length normalization so the value falls in the range ``[0.0, 1.0]``. 272 273 >>> wu = weighted_unifrac(u_counts, v_counts, otu_ids, tree, 274 ... normalized=True) 275 >>> print(round(wu, 2)) 276 0.33 277 278 """ 279 u_node_counts, v_node_counts, u_total_count, v_total_count, tree_index =\ 280 _setup_pairwise_unifrac(u_counts, v_counts, otu_ids, tree, validate, 281 normalized=normalized, unweighted=False) 282 branch_lengths = tree_index['length'] 283 284 if normalized: 285 tip_indices = _get_tip_indices(tree_index) 286 node_to_root_distances = _tip_distances(branch_lengths, tree, 287 tip_indices) 288 return _weighted_unifrac_normalized(u_node_counts, v_node_counts, 289 u_total_count, v_total_count, 290 branch_lengths, 291 node_to_root_distances) 292 else: 293 return _weighted_unifrac(u_node_counts, v_node_counts, 294 u_total_count, v_total_count, 295 branch_lengths)[0] 296 297 298def _validate(u_counts, v_counts, otu_ids, tree): 299 _validate_counts_matrix([u_counts, v_counts], suppress_cast=True) 300 _validate_otu_ids_and_tree(counts=u_counts, otu_ids=otu_ids, tree=tree) 301 302 303def _setup_pairwise_unifrac(u_counts, v_counts, otu_ids, tree, validate, 304 normalized, unweighted): 305 306 if validate: 307 _validate(u_counts, v_counts, otu_ids, tree) 308 309 # temporarily store u_counts and v_counts in a 2-D array as that's what 310 # _vectorize_counts_and_tree takes 311 u_counts = np.asarray(u_counts) 312 v_counts = np.asarray(v_counts) 313 counts = np.vstack([u_counts, v_counts]) 314 counts_by_node, tree_index, branch_lengths = \ 315 _vectorize_counts_and_tree(counts, otu_ids, tree) 316 # unpack counts vectors for single pairwise UniFrac calculation 317 u_node_counts = counts_by_node[0] 318 v_node_counts = counts_by_node[1] 319 320 u_total_count = u_counts.sum() 321 v_total_count = v_counts.sum() 322 323 return (u_node_counts, v_node_counts, u_total_count, v_total_count, 324 tree_index) 325 326 327def _unweighted_unifrac(u_node_counts, v_node_counts, branch_lengths): 328 """ 329 Parameters 330 ---------- 331 u_node_counts, v_node_counts : np.array 332 Vectors indicating presence (value greater than zero) and absence 333 (value equal to zero) of nodes in two samples, `u` and `v`. Order is 334 assumed to be the same as in `branch_lengths`. 335 branch_lengths : np.array 336 Vector of branch lengths of all nodes (tips and internal nodes) in 337 postorder representation of their tree. 338 339 Returns 340 ------- 341 float 342 Unweighted UniFrac distance between samples. 343 344 Notes 345 ----- 346 The count vectors passed here correspond to all nodes in the tree, not 347 just the tips. 348 349 """ 350 unique_nodes = np.logical_xor(u_node_counts, v_node_counts) 351 observed_nodes = np.logical_or(u_node_counts, v_node_counts) 352 unique_branch_length = (branch_lengths * unique_nodes).sum() 353 observed_branch_length = (branch_lengths * observed_nodes).sum() 354 if observed_branch_length == 0.0: 355 # handle special case to avoid division by zero 356 return 0.0 357 return unique_branch_length / observed_branch_length 358 359 360def _weighted_unifrac(u_node_counts, v_node_counts, u_total_count, 361 v_total_count, branch_lengths): 362 """ 363 Parameters 364 ---------- 365 u_node_counts, v_node_counts : np.array 366 Vectors indicating presence (value greater than zero) and absence 367 (value equal to zero) of nodes in two samples, `u` and `v`. Order is 368 assumed to be the same as in `branch_lengths`. 369 u_total_count, v_total_counts : int 370 The sum of ``u_node_counts`` and ``v_node_counts`` vectors, 371 respectively. This could be computed internally, but since this is a 372 private method and the calling function has already generated these 373 values, this saves an iteration over each of these vectors. 374 branch_lengths : np.array 375 Vector of branch lengths of all nodes (tips and internal nodes) in 376 postorder representation of their tree. 377 378 Returns 379 ------- 380 float 381 Weighted UniFrac distance between samples. 382 np.array of float 383 Proportional abundance of each node in tree in sample `u` 384 np.array of float 385 Proportional abundance of each node in tree in sample `v` 386 387 """ 388 if u_total_count > 0: 389 # convert to relative abundances if there are any counts 390 u_node_proportions = u_node_counts / u_total_count 391 else: 392 # otherwise, we'll just do the computation with u_node_counts, which 393 # is necessarily all zeros 394 u_node_proportions = u_node_counts 395 396 if v_total_count > 0: 397 v_node_proportions = v_node_counts / v_total_count 398 else: 399 v_node_proportions = v_node_counts 400 401 wu = (branch_lengths * 402 np.absolute(u_node_proportions - v_node_proportions)).sum() 403 return wu, u_node_proportions, v_node_proportions 404 405 406def _weighted_unifrac_normalized(u_node_counts, v_node_counts, u_total_count, 407 v_total_count, branch_lengths, 408 node_to_root_distances): 409 """ 410 Parameters 411 ---------- 412 u_node_counts, v_node_counts : np.array 413 Vectors indicating presence (value greater than zero) and absence 414 (value equal to zero) of nodes in two samples, `u` and `v`. Order is 415 assumed to be the same as in `branch_lengths`. 416 u_total_count, v_total_counts : int 417 The sum of ``u_node_counts`` and ``v_node_counts`` vectors, 418 respectively. This could be computed internally, but since this is a 419 private method and the calling function has already generated these 420 values, this saves an iteration over each of these vectors. 421 tree: skbio.TreeNode 422 Tree relating the OTUs. 423 424 Returns 425 ------- 426 float 427 Normalized weighted UniFrac distance between samples. 428 429 Notes 430 ----- 431 The count vectors passed here correspond to all nodes in the tree, not 432 just the tips. 433 434 """ 435 if u_total_count == 0.0 and v_total_count == 0.0: 436 # handle special case to avoid division by zero 437 return 0.0 438 u, u_node_proportions, v_node_proportions = _weighted_unifrac( 439 u_node_counts, v_node_counts, u_total_count, v_total_count, 440 branch_lengths) 441 c = _weighted_unifrac_branch_correction( 442 node_to_root_distances, u_node_proportions, v_node_proportions) 443 444 return u / c 445 446 447def _setup_multiple_unifrac(counts, otu_ids, tree, validate): 448 if validate: 449 _validate_otu_ids_and_tree(counts[0], otu_ids, tree) 450 451 counts_by_node, tree_index, branch_lengths = \ 452 _vectorize_counts_and_tree(counts, otu_ids, tree) 453 454 return counts_by_node, tree_index, branch_lengths 455 456 457def _setup_multiple_unweighted_unifrac(counts, otu_ids, tree, validate): 458 """ Create optimized pdist-compatible unweighted UniFrac function 459 460 Parameters 461 ---------- 462 counts : 2D array_like of ints or floats 463 Matrix containing count/abundance data where each row contains counts 464 of observations in a given sample. 465 otu_ids: list, np.array 466 Vector of OTU ids corresponding to tip names in ``tree``. Must be the 467 same length as ``u_counts`` and ``v_counts``. These IDs do not need to 468 be in tip order with respect to the tree. 469 tree: skbio.TreeNode 470 Tree relating the OTUs in otu_ids. The set of tip names in the tree can 471 be a superset of ``otu_ids``, but not a subset. 472 validate: bool, optional 473 If `False`, validation of the input won't be performed. 474 475 Returns 476 ------- 477 function 478 Optimized pairwise unweighted UniFrac calculator that can be passed 479 to ``scipy.spatial.distance.pdist``. 480 2D np.array of ints, floats 481 Counts of all nodes in ``tree``. 482 483 """ 484 counts_by_node, _, branch_lengths = \ 485 _setup_multiple_unifrac(counts, otu_ids, tree, validate) 486 487 f = functools.partial(_unweighted_unifrac, branch_lengths=branch_lengths) 488 489 return f, counts_by_node 490 491 492def _setup_multiple_weighted_unifrac(counts, otu_ids, tree, normalized, 493 validate): 494 """ Create optimized pdist-compatible weighted UniFrac function 495 496 Parameters 497 ---------- 498 counts : 2D array_like of ints or floats 499 Matrix containing count/abundance data where each row contains counts 500 of observations in a given sample. 501 otu_ids: list, np.array 502 Vector of OTU ids corresponding to tip names in ``tree``. Must be the 503 same length as ``u_counts`` and ``v_counts``. These IDs do not need to 504 be in tip order with respect to the tree. 505 tree: skbio.TreeNode 506 Tree relating the OTUs in otu_ids. The set of tip names in the tree can 507 be a superset of ``otu_ids``, but not a subset. 508 validate: bool, optional 509 If `False`, validation of the input won't be performed. 510 511 Returns 512 ------- 513 function 514 Optimized pairwise unweighted UniFrac calculator that can be passed 515 to ``scipy.spatial.distance.pdist``. 516 2D np.array of ints, floats 517 Counts of all nodes in ``tree``. 518 519 """ 520 counts_by_node, tree_index, branch_lengths = \ 521 _setup_multiple_unifrac(counts, otu_ids, tree, validate) 522 tip_indices = _get_tip_indices(tree_index) 523 524 if normalized: 525 node_to_root_distances = _tip_distances(branch_lengths, tree, 526 tip_indices) 527 528 def f(u_node_counts, v_node_counts): 529 u_total_count = np.take(u_node_counts, tip_indices).sum() 530 v_total_count = np.take(v_node_counts, tip_indices).sum() 531 u = _weighted_unifrac_normalized( 532 u_node_counts, v_node_counts, u_total_count, v_total_count, 533 branch_lengths, node_to_root_distances) 534 return u 535 else: 536 537 def f(u_node_counts, v_node_counts): 538 u_total_count = np.take(u_node_counts, tip_indices).sum() 539 v_total_count = np.take(v_node_counts, tip_indices).sum() 540 u, _, _ = _weighted_unifrac(u_node_counts, v_node_counts, 541 u_total_count, v_total_count, 542 branch_lengths) 543 return u 544 545 return f, counts_by_node 546 547 548def _get_tip_indices(tree_index): 549 tip_indices = np.array([n.id for n in tree_index['id_index'].values() 550 if n.is_tip()]) 551 return tip_indices 552 553 554def _weighted_unifrac_branch_correction(node_to_root_distances, 555 u_node_proportions, 556 v_node_proportions): 557 """Calculates weighted unifrac branch length correction. 558 559 Parameters 560 ---------- 561 node_to_root_distances : np.ndarray 562 1D column vector of branch lengths in post order form. There should be 563 positions in this vector for all nodes in the tree, but only tips 564 should be non-zero. 565 u_node_proportions, v_node_proportions : np.ndarray 566 Proportional abundace of observations of all nodes in the tree in 567 samples ``u`` and ``v``, respectively. 568 u_total_count, v_total_count : float 569 The sum of the observations in samples ``u`` and ``v``, respectively. 570 571 Returns 572 ------- 573 np.ndarray 574 The corrected branch lengths 575 """ 576 return (node_to_root_distances.ravel() * 577 (u_node_proportions + v_node_proportions)).sum() 578