1#! /usr/bin/env python
2# -*- coding: utf-8 -*-
3
4##############################################################################
5##  DendroPy Phylogenetic Computing Library.
6##
7##  Copyright 2010-2015 Jeet Sukumaran and Mark T. Holder.
8##  All rights reserved.
9##
10##  See "LICENSE.rst" for terms and conditions of usage.
11##
12##  If you use this work or any portion thereof in published work,
13##  please cite it as:
14##
15##     Sukumaran, J. and M. T. Holder. 2010. DendroPy: a Python library
16##     for phylogenetic computing. Bioinformatics 26: 1569-1571.
17##
18##############################################################################
19
20"""
21Models, modeling and model-fitting of parsimony.
22"""
23
24from functools import reduce
25import operator
26import dendropy
27from dendropy.utility.error import TaxonNamespaceIdentityError
28
29class _NodeStateSetMap(dict):
30    def __init__(self, taxon_state_sets_map=None):
31        self.taxon_state_sets_map = taxon_state_sets_map
32    def __getitem__(self, key):
33        try:
34            return dict.__getitem__(self, key)
35        except KeyError:
36            v = self.taxon_state_sets_map[key.taxon]
37            self[key] = v
38            return v
39
40def _store_sets_as_attr(n, state_sets_attr_name, v):
41    setattr(n, state_sets_attr_name, v)
42
43def _retrieve_state_sets_from_attr(n, state_sets_attr_name, taxon_state_sets_map):
44    try:
45        return getattr(n, state_sets_attr_name)
46    except AttributeError:
47        v = taxon_state_sets_map[n.taxon]
48        setattr(n, state_sets_attr_name, v)
49        return v
50
51def fitch_down_pass(
52        postorder_nodes,
53        state_sets_attr_name="state_sets",
54        taxon_state_sets_map=None,
55        weights=None,
56        score_by_character_list=None,
57        ):
58    """
59    Returns the parsimony score given a list of nodes in postorder and
60    associated states, using Fitch's (1971) unordered parsimony algorithm.
61
62    Parameters
63    ----------
64    postorder_nodes : iterable of/over |Node| objects
65        An iterable of |Node| objects in in order of post-order
66        traversal of the tree.
67    state_sets_attr_name : str
68        Name of attribute on |Node| objects in which state set lists
69        will stored/accessed. If |None|, then state sets will not be stored on
70        the tree.
71    taxon_state_sets_map : dict[taxon] = state sets
72        A dictionary that takes a taxon object as a key and returns a state set
73        list as a value. This will be used to populate the state set of a node
74        that has not yet had its state sets scored and recorded (typically,
75        leaves of a tree that has not yet been processed).
76    weights : iterable
77        A list of weights for each pattern.
78    score_by_character_list : None or list
79        If not |None|, should be a reference to a list object.
80        This list will be populated by the scores on a character-by-character
81        basis.
82
83    Returns
84    -------
85    s : int
86        Parismony score of tree.
87
88    Notes
89    -----
90    Currently this requires a bifurcating tree (even at the root).
91
92    Examples
93    --------
94
95    Assume that we have a tree, ``tree``, and an associated data set, ``data``::
96
97        import dendropy
98        from dendropy.model.parsimony import fitch_down_pass
99
100        taxa = dendropy.TaxonNamespace()
101        data = dendropy.StandardCharacterMatrix.get_from_path(
102                "apternodus.chars.nexus",
103                "nexus",
104                taxon_namespace=taxa)
105        tree = dendropy.Tree.get_from_path(
106                "apternodus.tre",
107                "nexus",
108                taxon_namespace=taxa)
109        taxon_state_sets_map = data.taxon_state_sets_map(gaps_as_missing=True)
110
111    The following will return the parsimony score of the ``tree`` with
112    respect to the data in ``data``::
113
114        score = fitch_down_pass(
115                nodes=tree.postorder_node_iter(),
116                taxon_state_sets_map=taxon_set_map)
117        print(score)
118
119    In the above, every |Node| object of ``tree`` will have an attribute
120    added, "state_sets", that stores the list of state sets from the analysis::
121
122        for nd in tree:
123            print(nd.state_sets)
124
125    If you want to store the list of state sets in a different attribute, e.g.,
126    "analysis1_states"::
127
128        score = fitch_down_pass(
129                nodes=tree.postorder_node_iter(),
130                state_sets_attr_name="analysis1_states",
131                taxon_state_sets_map=taxon_set_map)
132        print(score)
133        for nd in tree:
134            print(nd.analysis1_states)
135
136    Or not to store these at all::
137
138        score = fitch_down_pass(
139                nodes=tree.postorder_node_iter(),
140                state_sets_attr_name=None,
141                taxon_state_sets_map=taxon_set_map)
142        print(score)
143
144    Scoring custom data can be done by something like the following::
145
146        taxa = dendropy.TaxonNamespace()
147        taxon_state_sets_map = {}
148        t1 = taxa.require_taxon("A")
149        t2 = taxa.require_taxon("B")
150        t3 = taxa.require_taxon("C")
151        t4 = taxa.require_taxon("D")
152        t5 = taxa.require_taxon("E")
153        taxon_state_sets_map[t1] = [ set([0,1]),  set([0,1]),  set([0]),     set([0]) ]
154        taxon_state_sets_map[t2] = [ set([1]),    set([1]),    set([1]),     set([0]) ]
155        taxon_state_sets_map[t3] = [ set([0]),    set([1]),    set([1]),     set([0]) ]
156        taxon_state_sets_map[t4] = [ set([0]),    set([1]),    set([0,1]),   set([1]) ]
157        taxon_state_sets_map[t5] = [ set([1]),    set([0]),    set([1]),     set([1]) ]
158        tree = dendropy.Tree.get_from_string(
159                "(A,(B,(C,(D,E))));", "newick",
160                taxon_namespace=taxa)
161        score = fitch_down_pass(tree.postorder_node_iter(),
162                taxon_state_sets_map=taxon_state_sets_map)
163        print(score)
164
165    """
166    if score_by_character_list is not None:
167        assert len(score_by_character_list) == 0
168        for idx in range(len(list(taxon_state_sets_map.values())[0])): # this is unacceptable!
169            score_by_character_list.append(0)
170    score = 0
171    if state_sets_attr_name is None:
172        node_state_set_map = _NodeStateSetMap(taxon_state_sets_map)
173        get_node_state_sets = lambda node : node_state_set_map[node]
174        set_node_state_sets = lambda node, v : node_state_set_map.__setitem__(node, v)
175    else:
176        get_node_state_sets = lambda node : _retrieve_state_sets_from_attr(node, state_sets_attr_name, taxon_state_sets_map)
177        set_node_state_sets = lambda node, v : _store_sets_as_attr(node, state_sets_attr_name, v)
178    for nd in postorder_nodes:
179        c = nd.child_nodes()
180        if not c:
181            ss = get_node_state_sets(nd)
182            continue
183        left_c, right_c = c[:2]
184        remaining = c[2:]
185        left_ssl = get_node_state_sets(left_c)
186        while True:
187            right_ssl = get_node_state_sets(right_c)
188            result = []
189            for n, ssp in enumerate(zip(left_ssl, right_ssl)):
190                left_ss, right_ss = ssp
191                inter = left_ss.intersection(right_ss)
192                if inter:
193                    result.append(inter)
194                else:
195                    if weights is None:
196                        wt = 1
197                    else:
198                        wt = weights[n]
199                    score += wt
200                    result.append(left_ss.union(left_ss, right_ss))
201                    if score_by_character_list is not None:
202                        # try:
203                        #     score_by_character_list[n] += wt
204                        # except IndexError:
205                        #     score_by_character_list.append(wt)
206                        score_by_character_list[n] += wt
207            if remaining:
208                right_c = remaining.pop(0)
209                left_ssl = result
210            else:
211                break
212        # setattr(nd, state_sets_attr_name, result)
213        set_node_state_sets(nd, result)
214    return score
215
216def fitch_up_pass(
217        preorder_node_list,
218        state_sets_attr_name="state_sets",
219        taxon_state_sets_map=None):
220    """
221    Finalizes the state set lists associated with each node using the "final
222    phase" of Fitch's (1971) unordered parsimony algorithm.
223
224    Parameters
225    ----------
226    postorder_nodes : iterable of/over |Node| objects
227        An iterable of |Node| objects in in order of post-order
228        traversal of the tree.
229    state_sets_attr_name : str
230        Name of attribute on |Node| objects in which state set lists
231        will stored/accessed. If |None|, then state sets will not be stored on
232        the tree.
233    taxon_state_sets_map : dict[taxon] = state sets
234        A dictionary that takes a taxon object as a key and returns a state set
235        list as a value. This will be used to populate the state set of a node
236        that has not yet had its state sets scored and recorded (typically,
237        leaves of a tree that has not yet been processed).
238
239    Notes
240    -----
241    Currently this requires a bifurcating tree (even at the root).
242
243    Examples
244    --------
245
246    ::
247
248        taxa = dendropy.TaxonNamespace()
249        data = dendropy.StandardCharacterMatrix.get_from_path(
250                "apternodus.chars.nexus",
251                "nexus",
252                taxon_namespace=taxa)
253        tree = dendropy.Tree.get_from_path(
254                "apternodus.tre",
255                "nexus",
256                taxon_namespace=taxa)
257        taxon_state_sets_map = data.taxon_state_sets_map(gaps_as_missing=True)
258        score = fitch_down_pass(tree.postorder_node_iter(),
259                taxon_state_sets_map=taxon_state_sets_map)
260        print(score)
261        fitch_up_pass(tree.preorder_node_iter())
262        for nd in tree:
263            print(nd.state_sets)
264
265    """
266    node_state_sets_map = {}
267    for nd in preorder_node_list:
268        c = nd.child_nodes()
269        p = nd.parent_node
270        if (not c) or (not p):
271            continue
272        assert(len(c) == 2)
273        left_c, right_c = c
274        try:
275            left_ssl = getattr(left_c, state_sets_attr_name)
276        except AttributeError:
277            if not taxon_state_sets_map:
278                raise
279            left_ssl = taxon_state_sets_map[left_c.taxon]
280        try:
281            right_ssl = getattr(right_c, state_sets_attr_name)
282        except AttributeError:
283            if not taxon_state_sets_map:
284                raise
285            right_ssl = taxon_state_sets_map[right_c.taxon]
286        par_ssl = getattr(p, state_sets_attr_name)
287        curr_ssl = getattr(nd, state_sets_attr_name)
288        result = []
289        for n, ssp in enumerate(zip(par_ssl, curr_ssl, left_ssl, right_ssl)):
290            par_ss, curr_ss, left_ss, right_ss = ssp
291
292            down_parup_inter = par_ss.intersection(curr_ss)
293            if down_parup_inter == par_ss:
294                final_ss = down_parup_inter
295            else:
296                rl_inter = left_ss.intersection(right_ss)
297                if not rl_inter:
298                    final_ss = par_ss.union(curr_ss)
299                else:
300                    in_par_and_left = par_ss.intersection(left_ss)
301                    in_par_and_right = par_ss.intersection(right_ss)
302                    final_ss = in_par_and_left.union(in_par_and_right, curr_ss)
303            #_LOG.debug("downpass = %s, par = %s, left = %s, right = %s, final_ss= %s" %
304            #                    (str(curr_ss), str(par_ss), str(left_ss), str(right_ss), str(final_ss)))
305            result.append(final_ss)
306        setattr(nd, state_sets_attr_name, result)
307
308
309def parsimony_score(
310        tree,
311        chars,
312        gaps_as_missing=True,
313        weights=None,
314        score_by_character_list=None,
315        ):
316    """
317    Calculates the score of a tree, ``tree``, given some character data,
318    ``chars``, under the parsimony model using the Fitch algorithm.
319
320    Parameters
321    ----------
322    tree : a |Tree| instance
323        A |Tree| to be scored. Must reference the same |TaxonNamespace| as
324        ``chars``.
325    chars : a |CharacterMatrix| instance
326        A |CharacterMatrix|-derived object with data to be scored. Must have
327        the same |TaxonNamespace| as ``tree``.
328    gap_as_missing : bool
329        If |True| [default], then gaps will be treated as missing data.
330        If |False|, then gaps will be treated as a new/additional state.
331    weights : iterable
332        A list of weights for each pattern/column in the matrix.
333    score_by_character_list : None or list
334        If not |None|, should be a reference to a list object.
335        This list will be populated by the scores on a character-by-character
336        basis.
337
338    Returns
339    -------
340    pscore : int
341        The parsimony score of the tree given the data.
342
343    Examples
344    --------
345
346    ::
347
348        import dendropy
349        from dendropy.calculate import treescore
350
351        # establish common taxon namespace
352        taxon_namespace = dendropy.TaxonNamespace()
353
354        # Read data; if data is, e.g., "standard", use StandardCharacterMatrix.
355        # If unsure of data type, can do:
356        #       dataset = dendropy.DataSet.get(
357        #               path="path/to/file.nex",
358        #               schema="nexus",
359        #               taxon_namespace=tns,)
360        #       chars = dataset.char_matrices[0]
361        chars = dendropy.DnaCharacterMatrix.get(
362                path="pythonidae.chars.nexus",
363                schema="nexus",
364                taxon_namespace=taxon_namespace)
365        tree = dendropy.Tree.get(
366                path="pythonidae.mle.newick",
367                schema="newick",
368                taxon_namespace=taxon_namespace)
369
370        # We store the site-specific scores here
371        # This is optional; if we do not want to
372        # use the per-site scores, just pass in |None|
373        # for the ``score_by_character_list`` argument
374        # or do not specify this argument at all.
375        score_by_character_list = []
376
377        score = treescore.parsimony_score(
378                tree,
379                chars,
380                gaps_as_missing=False,
381                score_by_character_list=score_by_character_list)
382
383        # Print the results: the score
384        print("Score: {}".format(score))
385
386        # Print the results: the per-site scores
387        for idx, x in enumerate(score_by_character_list):
388            print("{}: {}".format(idx+1, x))
389
390    Notes
391    -----
392
393    If the same data is going to be used to score multiple trees or multiple times,
394    it is probably better to generate the 'taxon_state_sets_map' once and call
395    "fitch_down_pass" directly yourself, as this function generates a new map
396    each time.
397
398    """
399    if tree.taxon_namespace is not chars.taxon_namespace:
400        raise TaxonNamespaceIdentityError(tree, data)
401    taxon_state_sets_map = chars.taxon_state_sets_map(gaps_as_missing=gaps_as_missing)
402    nodes = tree.postorder_node_iter()
403    pscore = fitch_down_pass(nodes,
404            taxon_state_sets_map=taxon_state_sets_map,
405            weights=weights,
406            score_by_character_list=score_by_character_list)
407    return pscore
408
409