#!/usr/bin/env python3
"""
This module implements the FP-growth [1] frequent pattern mining algorithm
with a bucketing optimization [2] for conditional databases of few items.

The entry points are the :obj:`frequent_itemsets()`, :obj:`association_rules()`,
and :obj:`rules_stats()` functions below.


[1]: J. Han, J. Pei, Y. Yin, R. Mao.
     Mining Frequent Patterns without Candidate Generation: A
     Frequent-Pattern Tree Approach. 2004.
     https://www.cs.sfu.ca/~jpei/publications/dami03_fpgrowth.pdf

[2]: R. Agrawal, C. Aggarwal, V. Prasad.
     Depth first generation of long patterns. 2000.
     http://www.cs.tau.ac.il/~fiat/dmsem03/Depth%20First%20Generation%20of%20Long%20Patterns%20-%202000.pdf

[3]: R. Agrawal, et al.
     Fast Discovery of Association Rules. 1996.
     http://cs-people.bu.edu/evimaria/cs565/advances.pdf


Examples
--------
Here's an example from R. Agrawal's original Apriori article [3] § 12.2.2.
Given a database of transactions:

>>> T = [[1,    3, 4   ],
...      [   2, 3,    5],
...      [1, 2, 3,    5],
...      [   2,       5]]

We can enumerate all frequent itemsets supported by at least two
transactions:

>>> from orangecontrib.associate.fpgrowth import *  # doctest: +SKIP
>>> itemsets = frequent_itemsets(T, 2)

Note that the functions in this module produce generators. The result space
can explode quite quickly and can easily become too large to fit in your RAM.
By using generators, you can filter the results to your liking `as they are
generated`.

>>> itemsets
<generator object ...>
>>> list(itemsets)
[(frozenset({1}), 2),
 (frozenset({2}), 3),
 (frozenset({3}), 3),
 (frozenset({1, 3}), 2),
 (frozenset({2, 3}), 2),
 (frozenset({5}), 3),
 (frozenset({2, 5}), 3),
 (frozenset({3, 5}), 2),
 (frozenset({2, 3, 5}), 2)]

We can try it on a larger, real-world database of categorical values:

>>> import Orange
>>> data = Orange.data.Table('zoo')
>>> data
[[1, 0, 0, 1, 0, ... | mammal] {aardvark},
 [1, 0, 0, 1, 0, ... | mammal] {antelope},
 [0, 0, 1, 0, 0, ... | fish] {bass},
 [1, 0, 0, 1, 0, ... | mammal] {bear},
 [1, 0, 0, 1, 0, ... | mammal] {boar},
 ...
]

We can't use the table data directly; we first have to one-hot transform it:

>>> X, mapping = OneHot.encode(data, include_class=True)

We get a database we can use to find frequent itemsets, and a mapping we will
use later to revert the transformation.

>>> X
array([[False,  True, ...,  True, False],
       [False,  True, ...,  True, False],
       [ True, False, ..., False, False],
       ...,
       [False,  True, ...,  True, False],
       [ True, False, ..., False, False],
       [ True, False, ..., False, False]], dtype=bool)
>>> sorted(mapping.items())
[(0, (0, 0)),
 (1, (0, 1)),
 (2, (1, 0)),
 (3, (1, 1)),
 ...
 (40, (16, 4)),
 (41, (16, 5)),
 (42, (16, 6))]

We want itemsets with at least 40% support:

>>> itemsets = dict(frequent_itemsets(X, .4))
>>> len(itemsets)
520

The transaction-coded items corresponding to class values are:

>>> class_items = {item
...                for item, var, _ in OneHot.decode(mapping, data, mapping)
...                if var is data.domain.class_var}
>>> sorted(class_items)
[36, 37, 38, 39, 40, 41, 42]

That makes sense, as our class variable has seven values:

>>> data.domain.class_var.values
['amphibian', 'bird', 'fish', 'insect', 'invertebrate', 'mammal', 'reptile']

Now we can generate all association rules that have a consequent equal to one
of the class values and at least 80% confidence (i.e. classification rules):

>>> rules = [(P, Q, supp, conf)
...          for P, Q, supp, conf in association_rules(itemsets, .8)
...          if len(Q) == 1 and Q & class_items]
>>> len(rules)
18
>>> rules
[(frozenset({17, 2, 19, 20, 7}), frozenset({41}), 41, 1.0),
 (frozenset({17, 2, 19, 7}), frozenset({41}), 41, 1.0),
 ...
 (frozenset({20, 7}), frozenset({41}), 41, 1.0),
 (frozenset({7}), frozenset({41}), 41, 1.0)]

To make them more helpful, we can use ``mapping`` to transform the rules'
items back into table domain values, e.g. for the first five rules:

>>> names = {item: '{}={}'.format(var.name, val)
...          for item, var, val in OneHot.decode(mapping, data, mapping)}
>>> for ante, cons, supp, conf in rules[:5]:
...     print(', '.join(names[i] for i in ante), '-->',
...           names[next(iter(cons))],
...           '(supp: {}, conf: {})'.format(supp, conf))
backbone=1, feathers=0, breathes=1, venomous=0, milk=1 --> type=mammal (supp: 41, conf: 1.0)
backbone=1, feathers=0, breathes=1, milk=1 --> type=mammal (supp: 41, conf: 1.0)
backbone=1, breathes=1, venomous=0, milk=1 --> type=mammal (supp: 41, conf: 1.0)
feathers=0, breathes=1, venomous=0, milk=1 --> type=mammal (supp: 41, conf: 1.0)
backbone=1, feathers=0, breathes=1, venomous=0 --> type=mammal (supp: 41, conf: 0.87...)
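
If we also need each rule's coverage, strength, lift, and leverage, we can
pass the rules through :obj:`rules_stats()`. A sketch for the first rule
(the exact float values are abbreviated here, hence the skip):

>>> next(rules_stats(rules, itemsets, len(data)))  # doctest: +SKIP
(frozenset({17, 2, 19, 20, 7}), frozenset({41}), 41, 1.0, 0.40..., 1.0, 2.46..., 0.24...)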


The full reference, with further examples, follows below.
"""

# TODO: Consider FPClose from "Efficiently using prefix-trees in mining frequent itemsets"
# TODO: Consider ExAnte: Anticipated data reduction in constrained pattern mining

from collections import defaultdict
from collections.abc import Iterator
from itertools import combinations, chain
from functools import reduce

import numpy as np
from scipy.sparse import issparse, spmatrix


_FP_TREE_EMPTY = (None, [])
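# Bucketing ([2] § 5) is used instead of recursing on a conditional FP-tree
# whenever a conditional database has at most this many frequent items
# (see _fp_tree() below)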
_BUCKETING_FEW_ITEMS = 10


class _Node(dict):
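    """
    A node of an FP-tree: a dict mapping child items to child _Node objects,
    with a link to its parent node (followed by _prefix_paths()) and the
    count of transactions sharing the path down to this node.
    """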
    def __init__(self, item=None, parent=None, count=None):
        self.item = item
        self.parent = parent
        self.count = count


def _bucketing_count(db, frequent_items, min_support):
    """
    Bucket counting (bucketing) optimization for databases where few items
    are frequent ([2] § 5).
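
    A tiny illustration on a hypothetical three-transaction database (the
    optional C implementation below may order the results differently,
    hence the skip):

    >>> sorted(_bucketing_count([(1, [1, 2]), (1, [1]), (1, [2])],
    ...                         frozenset({1, 2}), 2),
    ...        key=lambda r: sorted(r[0]))  # doctest: +SKIP
    [(frozenset({1}), 2), (frozenset({2}), 2)]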
    """
    # Forward and inverse mapping of frequent_items to [0, n_items)
    inv_map = dict(enumerate(frequent_items)).__getitem__
    fwd_map = {v: k for k, v in inv_map.__self__.items()}.__getitem__
    # Project transactions
    k = len(frequent_items)
    buckets = [0] * 2**k
    for count, transaction in db:
        set_bits = (fwd_map(i) for i in frequent_items.intersection(transaction))
        tid = reduce(lambda a, b: a | 1 << b, set_bits, 0)
        buckets[tid] += count
    # Aggregate bucketing counts ([2], Figure 5)
    for bit in range(k):
        i = 1 << bit
        for j in range(2**k):
            if j & i == 0:
                buckets[j] += buckets[j + i]
    # Announce results
    buckets = enumerate(buckets)
    next(buckets)  # Skip 000...0
    for tid, count in buckets:
        if count >= min_support:
            yield frozenset(inv_map(i)
                            for i, b in enumerate(reversed(bin(tid)))
                            if b == '1'), count


# Replace the above bucketing count with the one from the compiled C module,
# if available
try:
    from orangecontrib.associate._fpgrowth import bucketing_count as _bucketing_count, \
                                                  BUCKETING_FEW_ITEMS as _BUCKETING_FEW_ITEMS
except ImportError:
    # The C module may not have been compiled (e.g. no compiler available
    # on Windows); just use the pure-Python implementation above
    pass


def _fp_tree_insert(item, T, node_links, count):
    """ Insert item into the _Node tree T and return its (new or updated) node """
    node = T.get(item)
    if node is None:
        node = T[item] = _Node(item, T, count)
        node_links[item].append(node)
    else:  # A node for this item is already in T; just increase its count
        node.count += count
    return node


def _fp_tree(db, min_support):
    """
    FP-tree construction ([1] § 2.1, Algorithm 1).

    If the number of frequent items in db does not exceed a threshold
    (_BUCKETING_FEW_ITEMS), "bucketing" [2] is used instead of tree
    construction.

    Returns
    -------
    tuple
        (FP-tree, None) or (None, iterable of frequent itemsets with support)
    """
    if not isinstance(db, list):
        db = list(db)

    if not db:
        return _FP_TREE_EMPTY

    # Used to count item support so it can be reported when generating itemsets
    item_support = defaultdict(int)
    # Used for ordering transactions' items for an "optimally compressed" tree
    node_support = defaultdict(int)
    for count, transaction in db:
        for item in transaction:
            item_support[item] += count
            node_support[item] += 1
    # Only ever consider items that have min_support
    frequent_items = {item
                      for item, support in item_support.items()
                      if support >= min_support}

    # Short-circuit, if possible
    n_items = len(frequent_items)
    if 0 == n_items:
        return _FP_TREE_EMPTY
    if 1 == n_items:
        item = frequent_items.pop()
        return None, ((frozenset({item}), item_support[item]),)
    if n_items <= _BUCKETING_FEW_ITEMS:
        return None, ((frozenset(itemset), support)
                      for itemset, support in _bucketing_count(db, frequent_items, min_support))

    # "The items [...] should be ordered in the frequency descending order of
    # node occurrence of each item instead of its support" ([1], p. 12, bottom)
    sort_index = {item: i
                  for i, item in
                      enumerate(sorted(frequent_items,
                                       key=node_support.__getitem__,
                                       reverse=True))}.__getitem__
    # Only retain frequent items and sort them
    db = ((count, sorted(frequent_items.intersection(transaction),
                         key=sort_index))
          for count, transaction in db)

    root = _Node()
    node_links = defaultdict(list)
    for count, transaction in db:
        T = root
        for item in transaction:
            T = _fp_tree_insert(item, T, node_links, count)
    # Header links, sorted support-descending but stored in reverse, so the
    # most frequent items can be popped off the back efficiently
    root.node_links = sorted(node_links.items(), key=lambda i: -sort_index(i[0]))
    return root, None


def _powerset(lst):
    """
    >>> list(_powerset([1, 2, 3]))
    [(1,), (2,), (3,), (1, 2), (1, 3), (2, 3), (1, 2, 3)]
    """
    return chain.from_iterable(combinations(lst, r)
                               for r in range(1, len(lst) + 1))


def _single_prefix_path(root):
    """ Return (single-prefix path, rest of tree with new root) """
    path = []
    tree = root
    node_links = root.node_links
    while len(tree) == 1:
        tree = next(iter(tree.values()))
        path.append((tree.item, tree.count))
        node_links.pop()
    tree.parent, tree.item, tree.node_links = None, None, node_links
    return path, tree


def _prefix_paths(tree, nodes):
    """ Generate all paths of tree leading to all item nodes """
    for node in nodes:
        path = []
        support = node.count
        node = node.parent
        while node.item is not None:
            path.append(node.item)
            node = node.parent
        if path:
            yield support, path


def _freq_patterns_single(P, alpha, min_support):
    """ Yield subsets of the single-prefix path P as (frequent itemset, support) """
    for itemset in _powerset(P):
        yield alpha.union(i[0] for i in itemset), itemset[-1][1]


def _freq_patterns_multi(Q, alpha, min_support):
    """ Mine a multipath FP-tree Q """
    for item, nodes in reversed(Q.node_links):
        support = sum(n.count for n in nodes)
        beta = alpha.union({item})
        yield beta, support
        tree, got_itemsets = _fp_tree(_prefix_paths(Q, nodes), min_support)
        if got_itemsets:
            for itemset, support in got_itemsets:
                yield beta.union(itemset), support
        elif tree is not None:
            yield from _fp_growth(tree, beta, min_support)


def _fp_growth(tree, alpha, min_support):
    """ FP-growth ([1], § 3.3, Algorithm 2). """
    # Single prefix path optimization ([1] § 3.1): split the tree into its
    # single-prefix path P and the remaining multipath tree Q
    P, Q = _single_prefix_path(tree) if len(tree) == 1 else ([], tree)
    # The frequent patterns are the patterns of P, the patterns of Q, and
    # their cross product P×Q
    yield from _freq_patterns_single(P, alpha, min_support)
    for itemsetQ, supportQ in _freq_patterns_multi(Q, alpha, min_support):
        yield itemsetQ, supportQ
        for itemsetP, supportP in _freq_patterns_single(P, alpha, min_support):
            yield itemsetQ | itemsetP, supportQ


def frequent_itemsets(X, min_support=.2):
    """
    Generator yielding frequent itemsets from database X.

    Parameters
    ----------
    X : list or numpy.ndarray or scipy.sparse.spmatrix or iterator
        The database of transactions where each transaction is a collection
        of integer items. If `numpy.ndarray`, the items are considered to be
        indices of non-zero columns.
    min_support : float or int
        If a float in range (0, 1), the percent of minimal support for an
        itemset to be considered frequent. If an int > 1, the absolute
        number of instances. Note that general iterators don't have a
        defined length, so for those you need to pass the absolute minimal
        support as an int.

    Yields
    ------
    itemset: frozenset
        Iteratively yields all itemsets (as frozensets of item indices) with
        support greater or equal to the specified `min_support`.
    support: int
        Itemset's support as a number of instances.

    Examples
    --------
    Have a database of 50 transactions and 100 possible items:

    >>> import numpy as np
    >>> np.random.seed(0)
    >>> X = np.random.random((50, 100)) > .9

    Convert it to sparse, to show this type is supported too:

    >>> from scipy.sparse import lil_matrix  # other types would convert to LIL anyway
    >>> X = lil_matrix(X)

    Count the number of itemsets of at least two items with at least 5%
    support:

    >>> sum(1 for itemset, support in frequent_itemsets(X, .05)
    ...     if len(itemset) >= 2)
    72

    Let's get all the itemsets with at least 20% support:

    >>> gen = frequent_itemsets(X, .2)
    >>> gen
    <generator object ...>

    >>> itemsets = list(gen)
    >>> itemsets
    [(frozenset({4}), 11), (frozenset({25}), 10)]

    We get the same result by specifying the support as an absolute number:

    >>> list(frequent_itemsets(X, 10)) == itemsets
    True

    So the items '4' and '25' (the fifth and twenty-sixth columns of X) are
    the only items (and itemsets) that appear 10 or more times. Let's check
    this:

    >>> (X.sum(axis=0) >= 10).nonzero()[1]
    array([ 4, 25])

    Conclusion: Given databases of uniformly distributed random data,
    there's not much to work with.
    """
    if not isinstance(X, (np.ndarray, spmatrix, list, Iterator)):
        raise TypeError('X must be a (sparse) array of boolean values, or '
                        'a list of lists of ints, or an iterator of such')
    if not (isinstance(min_support, int) and min_support > 0 or
            isinstance(min_support, float) and 0 < min_support <= 1):
        raise ValueError('min_support must be an integer number of instances, '
                         'or a percent fraction in (0, 1]')
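
    # Normalize min_support to an absolute number of transactions; percent
    # values are scaled by the number of transactions in X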
    min_support *= (1 if isinstance(min_support, int) else
                    len(X) if isinstance(X, list) else
                    X.shape[0])
    min_support = max(1, int(np.ceil(min_support)))

    if issparse(X):
        X = X.tolil().rows
    elif isinstance(X, np.ndarray):
        X = (t.nonzero()[-1] for t in X)
    elif isinstance(X, (list, tuple)):
        is_list_of_lists = isinstance(next(iter(X), []), (list, tuple))
        has_int_elements = isinstance(next(iter(next(filter(None, iter(X)), [])), 1), int)
        if not is_list_of_lists or not has_int_elements:
            raise ValueError('X must be a list of lists of int')

    db = ((1, transaction) for transaction in X)  # 1 is initial item support
    tree, itemsets = _fp_tree(db, min_support)
    if itemsets:
        yield from itemsets
    if tree:
        yield from _fp_growth(tree, frozenset(), min_support)


def _association_rules(left, right, last_item, support, min_confidence, itemsets):
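    """
    Yield the rule `left` => `right` if confident enough, then recurse,
    moving one item at a time from the antecedent to the consequent;
    `last_item` bounds which items may still be moved, so that no rule is
    generated twice.
    """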
    if not left:
        return
    confidence = support / itemsets[left]
    if confidence >= min_confidence:
        yield left, right, support, confidence
        for item in left:
            if item > last_item:
                continue  # This ensures the same rules aren't visited twice
            yield from _association_rules(
                left - {item}, right | {item},
                item, support, min_confidence, itemsets)


def association_rules(itemsets, min_confidence, itemset=None):
    """
    Generate association rules ([3] § 12.3) from a dict of itemsets' supports
    (as produced by :obj:`frequent_itemsets()`). If `itemset` is provided,
    only generate its rules.

    Parameters
    ----------
    itemsets: dict
        A `dict` mapping itemsets to their supports. Can be generated by
        feeding the output of `frequent_itemsets()` to the `dict` constructor.
    min_confidence: float
        The minimum confidence, as a fraction in (0, 1]. Confidence is
        defined as `itemset_support / antecedent_support`.
    itemset: frozenset
        The itemset whose association rules we are interested in.

    Yields
    ------
    antecedent: frozenset
        The LHS of the association rule.
    consequent: frozenset
        The RHS of the association rule.
    support: int
        The number of instances supporting (containing) this rule.
    confidence: float
        ``total_support / lhs_support``.

    Examples
    --------
    >>> np.random.seed(0)
    >>> N = 100
    >>> X = np.random.random((N, 100)) > .9

    Find all itemsets with at least 5% support:

    >>> itemsets = dict(frequent_itemsets(X, .05))
    >>> len(itemsets)
    116

    Generate all association rules from these itemsets with minimum
    50% confidence:

    >>> rules = association_rules(itemsets, .5)
    >>> rules
    <generator object ...>
    >>> rules = list(rules)
    >>> len(rules)
    7
    >>> rules
    [(frozenset({36}), frozenset({25}), 5, 0.55...),
     (frozenset({63}), frozenset({58}), 5, 0.5),
     ...
     (frozenset({30}), frozenset({32}), 5, 0.55...),
     (frozenset({75}), frozenset({98}), 5, 0.5)]

    Or only the rules for a particular itemset:

    >>> list(association_rules(itemsets, .3, frozenset({75, 98})))
    [(frozenset({75}), frozenset({98}), 5, 0.5),
     (frozenset({98}), frozenset({75}), 5, 0.45...)]

    """
    assert (isinstance(itemsets, dict) and
            isinstance(next(iter(itemsets), frozenset()), frozenset))
    assert 0 < min_confidence <= 1
    from_itemsets = (itemset,) if itemset else sorted(itemsets, key=len, reverse=True)
    for itemset in from_itemsets:
        support = itemsets[itemset]
        for item in itemset:
            right = frozenset({item})
            yield from _association_rules(
                itemset - right, right,
                item, support, min_confidence, itemsets)


def rules_stats(rules, itemsets, n_examples):
    """
    Generate additional stats for rules generated by :obj:`association_rules()`.

    Parameters
    ----------
    rules: iterable
        Rules as output by `association_rules()`.
    itemsets: dict
        The itemsets as obtained by `dict(frequent_itemsets(...))`.
    n_examples: int
        The total number of instances (for calculating coverage, lift,
        and leverage).

    Yields
    ------
    antecedent: frozenset
        The LHS of the association rule.
    consequent: frozenset
        The RHS of the association rule.
    support: int
        Support as an absolute number of instances.
    confidence: float
        The confidence fraction, calculated as: ``total_support / lhs_support``.
    coverage: float
        Calculated as: ``lhs_support / n_examples``
    strength: float
        Calculated as: ``rhs_support / lhs_support``
    lift: float
        Calculated as: ``n_examples * total_support / lhs_support / rhs_support``
    leverage: float
        Calculated as: ``(total_support * n_examples - lhs_support * rhs_support) / n_examples**2``

    Examples
    --------
    >>> N = 30
    >>> X = np.random.random((N, 50)) > .9
    >>> itemsets = dict(frequent_itemsets(X, .1))
    >>> rules = association_rules(itemsets, .6)
    >>> list(rules_stats(rules, itemsets, N))
    [(frozenset({15}), frozenset({0}), 3, 0.75, 0.13..., 1.5, 3.75, 0.073...),
     (frozenset({47}), frozenset({22}), 3, 0.6, 0.16..., 1.4, 2.57..., 0.061...),
     (frozenset({27}), frozenset({22}), 4, 0.66..., 0.2, 1.16..., 2.85..., 0.086...),
     (frozenset({19}), frozenset({22}), 3, 0.6, 0.16..., 1.4, 2.57..., 0.061...)]

    """
    assert (isinstance(itemsets, dict) and
            isinstance(next(iter(itemsets), frozenset()), frozenset))
    assert n_examples > 0
    for left, right, support, confidence in rules:
        l_support, r_support = itemsets[left], itemsets[right]
        coverage = l_support / n_examples
        strength = r_support / l_support
        lift = n_examples * confidence / r_support
        leverage = (support*n_examples - l_support*r_support) / n_examples**2
        yield (left, right, support, confidence,
               coverage, strength, lift, leverage)


def __fp_tree_count_nodes(tree):
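    """ Return the number of item nodes in an FP-tree (internal helper) """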
    count = 1 if tree.item is not None else 0
    for t in tree.values():
        count += __fp_tree_count_nodes(t)
    return count


def __fp_tree_max_height(tree):
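    """ Return the height of an FP-tree, in item nodes (internal helper) """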
    if tree:
        return max((1 if tree.item is not None else 0) +
                   __fp_tree_max_height(child) for child in tree.values())
    return 1 if tree.item is not None else 0


class OneHot:
    """
    Encode a discrete Orange.data.Table into a 2D array of binary attributes.
    """
    @staticmethod
    def encode(table, include_class=False):
        """
        Return a tuple of
        (bool (one hot) ndarray, {col: (variable_index, value_index)} mapping).

        If the input table is sparse, a list of nonzero column indices
        per row (LIL rows) is returned instead of the one-hot ndarray.
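
        A sketch of typical use, mirroring the module-level example
        (skipped here since it requires the Orange 'zoo' dataset):

        >>> import Orange  # doctest: +SKIP
        >>> table = Orange.data.Table('zoo')  # doctest: +SKIP
        >>> X, mapping = OneHot.encode(table, include_class=True)  # doctest: +SKIP
        >>> X.shape, len(mapping)  # doctest: +SKIP
        ((101, 43), 43)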
        """
        X, encoded, mapping = table.X, [], {}
        if issparse(X):
            encoded = X.tolil().rows.tolist()
            for i, var in enumerate(table.domain.attributes):
                mapping[i] = i, 0
        else:
            for i, var in enumerate(table.domain.attributes):
                if not var.is_discrete:
                    continue
                for j, val in enumerate(var.values):
                    mapping[len(mapping)] = i, j
                    encoded.append(X[:, i] == j)

        if include_class and table.domain.has_discrete_class:
            i, var = len(table.domain.attributes), table.domain.class_var
            for j, val in enumerate(var.values):
                mapping[len(mapping)] = i, j
                if issparse(X):
                    for row in encoded:
                        row.append(i + j)
                else:
                    encoded.append(table.Y == j)

        if not issparse(X):
            encoded = np.column_stack(encoded) if encoded else None
        return encoded, mapping

    @staticmethod
    def decode(itemset, table, mapping):
        """ Yield sorted (item, variable, value) tuples (one for each item) """
        attributes = table.domain.attributes
        for item in itemset:
            ivar, ival = mapping[item]
            var = attributes[ivar] if ivar < len(attributes) else table.domain.class_var
            yield item, var, (var.values[ival] if var.is_discrete else 0)


def preprocess(table):
    """
    Apply a one-hot transform to an Orange data table, making it suitable
    as an `X` input into :obj:`frequent_itemsets()` above.

    For more fine-grained control, use the :obj:`OneHot` methods directly.

    Parameters
    ----------
    table: Orange.data.Table
        The table to encode into an `X` compatible with `frequent_itemsets()`
        above.

    Returns
    -------
    X: numpy.ndarray
        The table's `X` with the one-hot transform applied.


    Examples
    --------
    For a more concrete example, using non-uniform data:

    >>> from Orange.data import Table
    >>> table = Table('voting')
    >>> table
    [[n, y, n, y, y, ... | republican],
     [n, y, n, y, y, ... | republican],
     [?, y, y, ?, y, ... | democrat],
     [n, y, y, n, ?, ... | democrat],
     [y, y, y, n, y, ... | democrat],
     ...
    ]

    The table, as-is, can't be used with :obj:`frequent_itemsets()` directly
    (well, it can, but it would produce garbage). We first need to one-hot
    transform it, i.e. make binary columns for each value of each of its
    discrete variables.

    >>> X = preprocess(table)
    >>> X
    array([[ True, False, False, ...,  True,  True, False],
           [ True, False, False, ..., False,  True, False],
           ...,
           [ True, False,  True, ...,  True,  True, False],
           [ True, False, False, ..., False,  True, False]], dtype=bool)

    Now we `can` use it.

    Note: the transformation includes the class if it's discrete. For
    finer-grained control, including the mapping of variable values to
    columns, use the :obj:`OneHot` class directly.
    """
    if table.domain.has_continuous_attributes():
        raise ValueError('Frequent itemsets require all variables to be discrete')
    encoded, mapping = OneHot.encode(table, table.domain.has_discrete_class)
    return encoded


if __name__ == '__main__':
    import doctest
    import __main__, builtins

    class Context(dict):
        # See http://bugs.python.org/issue26303
        def copy(self): return self
        def clear(self): pass

    globals = __main__.__dict__.copy()
    globals.update(builtins.__dict__)

    doctest.testmod(globs=Context(globals),
                    optionflags=doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS)