1# ----------------------------------------------------------------------------
2# Copyright (c) 2013--, scikit-bio development team.
3#
4# Distributed under the terms of the Modified BSD License.
5#
6# The full license is in the file COPYING.txt, distributed with this software.
7# ----------------------------------------------------------------------------
8
9import functools
10
11import numpy as np
12
13from skbio.util._decorator import experimental
14from skbio.diversity._util import (_validate_counts_matrix,
15                                   _validate_otu_ids_and_tree,
16                                   _vectorize_counts_and_tree)
17from skbio.diversity._phylogenetic import _tip_distances
18
19
20# The default value indicating whether normalization should be applied
21# for weighted UniFrac. This is used in two locations, so set in a single
22# variable to avoid the code base becoming out of sync in the event of a
23# change in this default value.
24_normalize_weighted_unifrac_by_default = False
25
26
27@experimental(as_of="0.4.1")
28def unweighted_unifrac(u_counts, v_counts, otu_ids, tree, validate=True):
29    """ Compute unweighted UniFrac
30
31    Parameters
32    ----------
33    u_counts, v_counts: list, np.array
34        Vectors of counts/abundances of OTUs for two samples. Must be equal
35        length.
36    otu_ids: list, np.array
37        Vector of OTU ids corresponding to tip names in ``tree``. Must be the
38        same length as ``u_counts`` and ``v_counts``.
39    tree: skbio.TreeNode
40        Tree relating the OTUs in otu_ids. The set of tip names in the tree can
41        be a superset of ``otu_ids``, but not a subset.
42    validate: bool, optional
43        If `False`, validation of the input won't be performed. This step can
44        be slow, so if validation is run elsewhere it can be disabled here.
45        However, invalid input data can lead to invalid results or error
46        messages that are hard to interpret, so this step should not be
47        bypassed if you're not certain that your input data are valid. See
48        :mod:`skbio.diversity` for the description of what validation entails
49        so you can determine if you can safely disable validation.
50
51    Returns
52    -------
53    float
54        The unweighted UniFrac distance between the two samples.
55
56    Raises
57    ------
58    ValueError, MissingNodeError, DuplicateNodeError
59        If validation fails. Exact error will depend on what was invalid.
60
61    See Also
62    --------
63    weighted_unifrac
64    skbio.diversity
65    skbio.diversity.beta_diversity
66
67    Notes
68    -----
69    Unweighted UniFrac was originally described in [1]_. A discussion of
70    unweighted (qualitative) versus weighted (quantitative) diversity metrics
71    is presented in [2]_. Deeper mathematical discussions of this metric is
72    presented in [3]_.
73
74    If computing unweighted UniFrac for multiple pairs of samples, using
75    ``skbio.diversity.beta_diversity`` will be much faster than calling this
76    function individually on each sample.
77
78    This implementation differs from that in PyCogent (and therefore QIIME
79    versions less than 2.0.0) by imposing a few additional restrictions on the
80    inputs. First, the input tree must be rooted. In PyCogent, if an unrooted
81    tree was provided that had a single trifurcating node (a newick convention
82    for unrooted trees) that node was considered the root of the tree. Next,
83    all OTU IDs must be tips in the tree. PyCogent would silently ignore OTU
84    IDs that were not present the tree. To reproduce UniFrac results from
85    PyCogent with scikit-bio, ensure that your PyCogent UniFrac calculations
86    are performed on a rooted tree and that all OTU IDs are present in the
87    tree.
88
89    This implementation of unweighted UniFrac is the array-based implementation
90    described in [4]_.
91
92    References
93    ----------
94    .. [1] Lozupone, C. & Knight, R. UniFrac: a new phylogenetic method for
95       comparing microbial communities. Appl. Environ. Microbiol. 71, 8228-8235
96       (2005).
97
98    .. [2] Lozupone, C. A., Hamady, M., Kelley, S. T. & Knight, R. Quantitative
99       and qualitative beta diversity measures lead to different insights into
100       factors that structure microbial communities. Appl. Environ. Microbiol.
101       73, 1576-1585 (2007).
102
103    .. [3] Lozupone, C., Lladser, M. E., Knights, D., Stombaugh, J. & Knight,
104       R. UniFrac: an effective distance metric for microbial community
105       comparison. ISME J. 5, 169-172 (2011).
106
107    .. [4] Hamady M, Lozupone C, Knight R. Fast UniFrac: facilitating high-
108       throughput phylogenetic analyses of microbial communities including
109       analysis of pyrosequencing and PhyloChip data.  ISME J. 4(1):17-27
110       (2010).
111
112    Examples
113    --------
114    Assume we have the following abundance data for two samples, ``u`` and
115    ``v``, represented as a pair of counts vectors. These counts represent the
116    number of times specific Operational Taxonomic Units, or OTUs, were
117    observed in each of the samples.
118
119    >>> u_counts = [1, 0, 0, 4, 1, 2, 3, 0]
120    >>> v_counts = [0, 1, 1, 6, 0, 1, 0, 0]
121
122    Because UniFrac is a phylogenetic diversity metric, we need to know which
123    OTU each count corresponds to, which we'll provide as ``otu_ids``.
124
125    >>> otu_ids = ['OTU1', 'OTU2', 'OTU3', 'OTU4', 'OTU5', 'OTU6', 'OTU7',
126    ...            'OTU8']
127
128    We also need a phylogenetic tree that relates the OTUs to one another.
129
130    >>> from io import StringIO
131    >>> from skbio import TreeNode
132    >>> tree = TreeNode.read(StringIO(
133    ...                      '(((((OTU1:0.5,OTU2:0.5):0.5,OTU3:1.0):1.0):0.0,'
134    ...                      '(OTU4:0.75,(OTU5:0.5,((OTU6:0.33,OTU7:0.62):0.5'
135    ...                      ',OTU8:0.5):0.5):0.5):1.25):0.0)root;'))
136
137    We can then compute the unweighted UniFrac distance between the samples.
138
139    >>> from skbio.diversity.beta import unweighted_unifrac
140    >>> uu = unweighted_unifrac(u_counts, v_counts, otu_ids, tree)
141    >>> print(round(uu, 2))
142    0.37
143
144    """
145    u_node_counts, v_node_counts, _, _, tree_index =\
146        _setup_pairwise_unifrac(u_counts, v_counts, otu_ids, tree, validate,
147                                normalized=False, unweighted=True)
148    return _unweighted_unifrac(u_node_counts, v_node_counts,
149                               tree_index['length'])
150
151
152@experimental(as_of="0.4.1")
153def weighted_unifrac(u_counts, v_counts, otu_ids, tree,
154                     normalized=_normalize_weighted_unifrac_by_default,
155                     validate=True):
156    """ Compute weighted UniFrac with or without branch length normalization
157
158    Parameters
159    ----------
160    u_counts, v_counts: list, np.array
161        Vectors of counts/abundances of OTUs for two samples. Must be equal
162        length.
163    otu_ids: list, np.array
164        Vector of OTU ids corresponding to tip names in ``tree``. Must be the
165        same length as ``u_counts`` and ``v_counts``.
166    tree: skbio.TreeNode
167        Tree relating the OTUs in otu_ids. The set of tip names in the tree can
168        be a superset of ``otu_ids``, but not a subset.
169    normalized: boolean, optional
170        If ``True``, apply branch length normalization, which is described in
171        [1]_. Resulting distances will then be in the range ``[0, 1]``.
172    validate: bool, optional
173        If `False`, validation of the input won't be performed. This step can
174        be slow, so if validation is run elsewhere it can be disabled here.
175        However, invalid input data can lead to invalid results or error
176        messages that are hard to interpret, so this step should not be
177        bypassed if you're not certain that your input data are valid. See
178        :mod:`skbio.diversity` for the description of what validation entails
179        so you can determine if you can safely disable validation.
180
181    Returns
182    -------
183    float
184        The weighted UniFrac distance between the two samples.
185
186    Raises
187    ------
188    ValueError, MissingNodeError, DuplicateNodeError
189        If validation fails. Exact error will depend on what was invalid.
190
191    See Also
192    --------
193    unweighted_unifrac
194    skbio.diversity
195    skbio.diversity.beta_diversity
196
197    Notes
198    -----
199    Weighted UniFrac was originally described in [1]_, which includes a
200    discussion of unweighted (qualitative) versus weighted (quantitiative)
201    diversity metrics. Deeper mathemtical discussions of this metric is
202    presented in [2]_.
203
204    If computing weighted UniFrac for multiple pairs of samples, using
205    ``skbio.diversity.beta_diversity`` will be much faster than calling this
206    function individually on each sample.
207
208    This implementation differs from that in PyCogent (and therefore QIIME
209    versions less than 2.0.0) by imposing a few additional restrictions on the
210    inputs. First, the input tree must be rooted. In PyCogent, if an unrooted
211    tree was provided that had a single trifurcating node (a newick convention
212    for unrooted trees) that node was considered the root of the tree. Next,
213    all OTU IDs must be tips in the tree. PyCogent would silently ignore OTU
214    IDs that were not present the tree. To reproduce UniFrac results from
215    PyCogent with scikit-bio, ensure that your PyCogent UniFrac calculations
216    are performed on a rooted tree and that all OTU IDs are present in the
217    tree.
218
219    This implementation of weighted UniFrac is the array-based implementation
220    described in [3]_.
221
222    References
223    ----------
224    .. [1] Lozupone, C. A., Hamady, M., Kelley, S. T. & Knight, R. Quantitative
225       and qualitative beta diversity measures lead to different insights into
226       factors that structure microbial communities. Appl. Environ. Microbiol.
227       73, 1576-1585 (2007).
228
229    .. [2] Lozupone, C., Lladser, M. E., Knights, D., Stombaugh, J. & Knight,
230       R. UniFrac: an effective distance metric for microbial community
231       comparison. ISME J. 5, 169-172 (2011).
232
233    .. [3] Hamady M, Lozupone C, Knight R. Fast UniFrac: facilitating high-
234       throughput phylogenetic analyses of microbial communities including
235       analysis of pyrosequencing and PhyloChip data.  ISME J. 4(1):17-27
236       (2010).
237
238    Examples
239    --------
240    Assume we have the following abundance data for two samples, ``u`` and
241    ``v``, represented as a pair of counts vectors. These counts represent the
242    number of times specific Operational Taxonomic Units, or OTUs, were
243    observed in each of the samples.
244
245    >>> u_counts = [1, 0, 0, 4, 1, 2, 3, 0]
246    >>> v_counts = [0, 1, 1, 6, 0, 1, 0, 0]
247
248    Because UniFrac is a phylogenetic diversity metric, we need to know which
249    OTU each count corresponds to, which we'll provide as ``otu_ids``.
250
251    >>> otu_ids = ['OTU1', 'OTU2', 'OTU3', 'OTU4', 'OTU5', 'OTU6', 'OTU7',
252    ...            'OTU8']
253
254    We also need a phylogenetic tree that relates the OTUs to one another.
255
256    >>> from io import StringIO
257    >>> from skbio import TreeNode
258    >>> tree = TreeNode.read(StringIO(
259    ...                      '(((((OTU1:0.5,OTU2:0.5):0.5,OTU3:1.0):1.0):0.0,'
260    ...                      '(OTU4:0.75,(OTU5:0.5,((OTU6:0.33,OTU7:0.62):0.5'
261    ...                      ',OTU8:0.5):0.5):0.5):1.25):0.0)root;'))
262
263    Compute the weighted UniFrac distance between the samples.
264
265    >>> from skbio.diversity.beta import weighted_unifrac
266    >>> wu = weighted_unifrac(u_counts, v_counts, otu_ids, tree)
267    >>> print(round(wu, 2))
268    1.54
269
270    Compute the weighted UniFrac distance between the samples including
271    branch length normalization so the value falls in the range ``[0.0, 1.0]``.
272
273    >>> wu = weighted_unifrac(u_counts, v_counts, otu_ids, tree,
274    ...                       normalized=True)
275    >>> print(round(wu, 2))
276    0.33
277
278    """
279    u_node_counts, v_node_counts, u_total_count, v_total_count, tree_index =\
280        _setup_pairwise_unifrac(u_counts, v_counts, otu_ids, tree, validate,
281                                normalized=normalized, unweighted=False)
282    branch_lengths = tree_index['length']
283
284    if normalized:
285        tip_indices = _get_tip_indices(tree_index)
286        node_to_root_distances = _tip_distances(branch_lengths, tree,
287                                                tip_indices)
288        return _weighted_unifrac_normalized(u_node_counts, v_node_counts,
289                                            u_total_count, v_total_count,
290                                            branch_lengths,
291                                            node_to_root_distances)
292    else:
293        return _weighted_unifrac(u_node_counts, v_node_counts,
294                                 u_total_count, v_total_count,
295                                 branch_lengths)[0]
296
297
298def _validate(u_counts, v_counts, otu_ids, tree):
299    _validate_counts_matrix([u_counts, v_counts], suppress_cast=True)
300    _validate_otu_ids_and_tree(counts=u_counts, otu_ids=otu_ids, tree=tree)
301
302
303def _setup_pairwise_unifrac(u_counts, v_counts, otu_ids, tree, validate,
304                            normalized, unweighted):
305
306    if validate:
307        _validate(u_counts, v_counts, otu_ids, tree)
308
309    # temporarily store u_counts and v_counts in a 2-D array as that's what
310    # _vectorize_counts_and_tree takes
311    u_counts = np.asarray(u_counts)
312    v_counts = np.asarray(v_counts)
313    counts = np.vstack([u_counts, v_counts])
314    counts_by_node, tree_index, branch_lengths = \
315        _vectorize_counts_and_tree(counts, otu_ids, tree)
316    # unpack counts vectors for single pairwise UniFrac calculation
317    u_node_counts = counts_by_node[0]
318    v_node_counts = counts_by_node[1]
319
320    u_total_count = u_counts.sum()
321    v_total_count = v_counts.sum()
322
323    return (u_node_counts, v_node_counts, u_total_count, v_total_count,
324            tree_index)
325
326
327def _unweighted_unifrac(u_node_counts, v_node_counts, branch_lengths):
328    """
329    Parameters
330    ----------
331    u_node_counts, v_node_counts : np.array
332        Vectors indicating presence (value greater than zero) and absence
333        (value equal to zero) of nodes in two samples, `u` and `v`. Order is
334        assumed to be the same as in `branch_lengths`.
335    branch_lengths : np.array
336        Vector of branch lengths of all nodes (tips and internal nodes) in
337        postorder representation of their tree.
338
339    Returns
340    -------
341    float
342        Unweighted UniFrac distance between samples.
343
344    Notes
345    -----
346    The count vectors passed here correspond to all nodes in the tree, not
347    just the tips.
348
349    """
350    unique_nodes = np.logical_xor(u_node_counts, v_node_counts)
351    observed_nodes = np.logical_or(u_node_counts, v_node_counts)
352    unique_branch_length = (branch_lengths * unique_nodes).sum()
353    observed_branch_length = (branch_lengths * observed_nodes).sum()
354    if observed_branch_length == 0.0:
355        # handle special case to avoid division by zero
356        return 0.0
357    return unique_branch_length / observed_branch_length
358
359
360def _weighted_unifrac(u_node_counts, v_node_counts, u_total_count,
361                      v_total_count, branch_lengths):
362    """
363    Parameters
364    ----------
365    u_node_counts, v_node_counts : np.array
366        Vectors indicating presence (value greater than zero) and absence
367        (value equal to zero) of nodes in two samples, `u` and `v`. Order is
368        assumed to be the same as in `branch_lengths`.
369    u_total_count, v_total_counts : int
370        The sum of ``u_node_counts`` and ``v_node_counts`` vectors,
371        respectively. This could be computed internally, but since this is a
372        private method and the calling function has already generated these
373        values, this saves an iteration over each of these vectors.
374    branch_lengths : np.array
375        Vector of branch lengths of all nodes (tips and internal nodes) in
376        postorder representation of their tree.
377
378    Returns
379    -------
380    float
381        Weighted UniFrac distance between samples.
382    np.array of float
383        Proportional abundance of each node in tree in sample `u`
384    np.array of float
385        Proportional abundance of each node in tree in sample `v`
386
387    """
388    if u_total_count > 0:
389        # convert to relative abundances if there are any counts
390        u_node_proportions = u_node_counts / u_total_count
391    else:
392        # otherwise, we'll just do the computation with u_node_counts, which
393        # is necessarily all zeros
394        u_node_proportions = u_node_counts
395
396    if v_total_count > 0:
397        v_node_proportions = v_node_counts / v_total_count
398    else:
399        v_node_proportions = v_node_counts
400
401    wu = (branch_lengths *
402          np.absolute(u_node_proportions - v_node_proportions)).sum()
403    return wu, u_node_proportions, v_node_proportions
404
405
406def _weighted_unifrac_normalized(u_node_counts, v_node_counts, u_total_count,
407                                 v_total_count, branch_lengths,
408                                 node_to_root_distances):
409    """
410    Parameters
411    ----------
412    u_node_counts, v_node_counts : np.array
413         Vectors indicating presence (value greater than zero) and absence
414         (value equal to zero) of nodes in two samples, `u` and `v`. Order is
415         assumed to be the same as in `branch_lengths`.
416    u_total_count, v_total_counts : int
417         The sum of ``u_node_counts`` and ``v_node_counts`` vectors,
418         respectively. This could be computed internally, but since this is a
419         private method and the calling function has already generated these
420         values, this saves an iteration over each of these vectors.
421    tree: skbio.TreeNode
422         Tree relating the OTUs.
423
424    Returns
425    -------
426    float
427        Normalized weighted UniFrac distance between samples.
428
429    Notes
430    -----
431    The count vectors passed here correspond to all nodes in the tree, not
432    just the tips.
433
434    """
435    if u_total_count == 0.0 and v_total_count == 0.0:
436        # handle special case to avoid division by zero
437        return 0.0
438    u, u_node_proportions, v_node_proportions = _weighted_unifrac(
439        u_node_counts, v_node_counts, u_total_count, v_total_count,
440        branch_lengths)
441    c = _weighted_unifrac_branch_correction(
442        node_to_root_distances, u_node_proportions, v_node_proportions)
443
444    return u / c
445
446
447def _setup_multiple_unifrac(counts, otu_ids, tree, validate):
448    if validate:
449        _validate_otu_ids_and_tree(counts[0], otu_ids, tree)
450
451    counts_by_node, tree_index, branch_lengths = \
452        _vectorize_counts_and_tree(counts, otu_ids, tree)
453
454    return counts_by_node, tree_index, branch_lengths
455
456
457def _setup_multiple_unweighted_unifrac(counts, otu_ids, tree, validate):
458    """ Create optimized pdist-compatible unweighted UniFrac function
459
460    Parameters
461    ----------
462    counts : 2D array_like of ints or floats
463        Matrix containing count/abundance data where each row contains counts
464        of observations in a given sample.
465    otu_ids: list, np.array
466        Vector of OTU ids corresponding to tip names in ``tree``. Must be the
467        same length as ``u_counts`` and ``v_counts``. These IDs do not need to
468        be in tip order with respect to the tree.
469    tree: skbio.TreeNode
470        Tree relating the OTUs in otu_ids. The set of tip names in the tree can
471        be a superset of ``otu_ids``, but not a subset.
472    validate: bool, optional
473        If `False`, validation of the input won't be performed.
474
475    Returns
476    -------
477    function
478        Optimized pairwise unweighted UniFrac calculator that can be passed
479        to ``scipy.spatial.distance.pdist``.
480    2D np.array of ints, floats
481        Counts of all nodes in ``tree``.
482
483    """
484    counts_by_node, _, branch_lengths = \
485        _setup_multiple_unifrac(counts, otu_ids, tree, validate)
486
487    f = functools.partial(_unweighted_unifrac, branch_lengths=branch_lengths)
488
489    return f, counts_by_node
490
491
492def _setup_multiple_weighted_unifrac(counts, otu_ids, tree, normalized,
493                                     validate):
494    """ Create optimized pdist-compatible weighted UniFrac function
495
496    Parameters
497    ----------
498    counts : 2D array_like of ints or floats
499        Matrix containing count/abundance data where each row contains counts
500        of observations in a given sample.
501    otu_ids: list, np.array
502        Vector of OTU ids corresponding to tip names in ``tree``. Must be the
503        same length as ``u_counts`` and ``v_counts``. These IDs do not need to
504        be in tip order with respect to the tree.
505    tree: skbio.TreeNode
506        Tree relating the OTUs in otu_ids. The set of tip names in the tree can
507        be a superset of ``otu_ids``, but not a subset.
508    validate: bool, optional
509        If `False`, validation of the input won't be performed.
510
511    Returns
512    -------
513    function
514        Optimized pairwise unweighted UniFrac calculator that can be passed
515        to ``scipy.spatial.distance.pdist``.
516    2D np.array of ints, floats
517        Counts of all nodes in ``tree``.
518
519    """
520    counts_by_node, tree_index, branch_lengths = \
521        _setup_multiple_unifrac(counts, otu_ids, tree, validate)
522    tip_indices = _get_tip_indices(tree_index)
523
524    if normalized:
525        node_to_root_distances = _tip_distances(branch_lengths, tree,
526                                                tip_indices)
527
528        def f(u_node_counts, v_node_counts):
529            u_total_count = np.take(u_node_counts, tip_indices).sum()
530            v_total_count = np.take(v_node_counts, tip_indices).sum()
531            u = _weighted_unifrac_normalized(
532                    u_node_counts, v_node_counts, u_total_count, v_total_count,
533                    branch_lengths, node_to_root_distances)
534            return u
535    else:
536
537        def f(u_node_counts, v_node_counts):
538            u_total_count = np.take(u_node_counts, tip_indices).sum()
539            v_total_count = np.take(v_node_counts, tip_indices).sum()
540            u, _, _ = _weighted_unifrac(u_node_counts, v_node_counts,
541                                        u_total_count, v_total_count,
542                                        branch_lengths)
543            return u
544
545    return f, counts_by_node
546
547
548def _get_tip_indices(tree_index):
549    tip_indices = np.array([n.id for n in tree_index['id_index'].values()
550                            if n.is_tip()])
551    return tip_indices
552
553
554def _weighted_unifrac_branch_correction(node_to_root_distances,
555                                        u_node_proportions,
556                                        v_node_proportions):
557    """Calculates weighted unifrac branch length correction.
558
559    Parameters
560    ----------
561    node_to_root_distances : np.ndarray
562        1D column vector of branch lengths in post order form. There should be
563        positions in this vector for all nodes in the tree, but only tips
564        should be non-zero.
565    u_node_proportions, v_node_proportions : np.ndarray
566        Proportional abundace of observations of all nodes in the tree in
567        samples ``u`` and ``v``, respectively.
568    u_total_count, v_total_count : float
569        The sum of the observations in samples ``u`` and ``v``, respectively.
570
571    Returns
572    -------
573    np.ndarray
574        The corrected branch lengths
575    """
576    return (node_to_root_distances.ravel() *
577            (u_node_proportions + v_node_proportions)).sum()
578