1#!/usr/bin/env python3 2""" 3This module implements FP-growth [1] frequent pattern mining algorithm with 4bucketing optimization [2] for conditional databases of few items. 5 6The entry points are :obj:`frequent_itemsets()`, :obj:`association_rules()`, and 7:obj:`rules_stats()` functions below. 8 9 10[1]: J. Han, J. Pei, Y. Yin, R. Mao. 11 Mining Frequent Patterns without Candidate Generation: A 12 Frequent-Pattern Tree Approach. 2004. 13 https://www.cs.sfu.ca/~jpei/publications/dami03_fpgrowth.pdf 14 15[2]: R. Agrawal, C. Aggarwal, V. Prasad. 16 Depth first generation of long patterns. 2000. 17 http://www.cs.tau.ac.il/~fiat/dmsem03/Depth%20First%20Generation%20of%20Long%20Patterns%20-%202000.pdf 18 19[3]: R. Agrawal, et al. 20 Fast Discovery of Association Rules. 1996. 21 http://cs-people.bu.edu/evimaria/cs565/advances.pdf 22 23 24Examples 25-------- 26Here's an example from R. Agrawal's original Apriori article [3 § 12.2.2]. 27Given a database of transactions: 28 29>>> T = [[1, 3, 4 ], 30... [ 2, 3, 5], 31... [1, 2, 3, 5], 32... [ 2, 5]] 33 34We can enumerate all frequent itemsets with support greater than two 35transactions: 36 37>>> from orangecontrib.associate.fpgrowth import * # doctest: +SKIP 38>>> itemsets = frequent_itemsets(T, 2) 39 40Note, functions in this module produce generators. 41The results space can explode quite quickly 42and can easily be too large to fit in your RAM. By using generators, you can 43filter the results to your liking `as you pass them`. 44 45>>> itemsets 46<generator object ...> 47>>> list(itemsets) 48[(frozenset({1}), 2), 49 (frozenset({2}), 3), 50 (frozenset({3}), 3), 51 (frozenset({1, 3}), 2), 52 (frozenset({2, 3}), 2), 53 (frozenset({5}), 3), 54 (frozenset({2, 5}), 3), 55 (frozenset({3, 5}), 2), 56 (frozenset({2, 3, 5}), 2)] 57 58We can try it with a larger and more real-world database of categorical values: 59 60>>> import Orange 61>>> data = Orange.data.Table('zoo') 62>>> data 63[[1, 0, 0, 1, 0, ... | mammal] {aardvark}, 64 [1, 0, 0, 1, 0, ... | mammal] {antelope}, 65 [0, 0, 1, 0, 0, ... | fish] {bass}, 66 [1, 0, 0, 1, 0, ... | mammal] {bear}, 67 [1, 0, 0, 1, 0, ... | mammal] {boar}, 68 ... 69] 70 71We can't use table data directly; we first have to one-hot transform it: 72 73>>> X, mapping = OneHot.encode(data, include_class=True) 74 75We get a database we can use to find frequent itemsets, and a mapping we will 76use later to revert the transformation. 77 78>>> X 79array([[False, True, ..., True, False], 80 [False, True, ..., True, False], 81 [ True, False, ..., False, False], 82 ..., 83 [False, True, ..., True, False], 84 [ True, False, ..., False, False], 85 [ True, False, ..., False, False]], dtype=bool) 86>>> sorted(mapping.items()) 87[(0, (0, 0)), 88 (1, (0, 1)), 89 (2, (1, 0)), 90 (3, (1, 1)), 91 ... 92 (40, (16, 4)), 93 (41, (16, 5)), 94 (42, (16, 6))] 95 96We want itemsets with >40% support: 97 98>>> itemsets = dict(frequent_itemsets(X, .4)) 99>>> len(itemsets) 100520 101 102The transaction-coded items corresponding to class values are: 103 104>>> class_items = {item 105... for item, var, _ in OneHot.decode(mapping, data, mapping) 106... if var is data.domain.class_var} 107>>> sorted(class_items) 108[36, 37, 38, 39, 40, 41, 42] 109 110That makes sense as our class variable has seven values: 111 112>>> data.domain.class_var.values 113['amphibian', 'bird', 'fish', 'insect', 'invertebrate', 'mammal', 'reptile'] 114 115Now we can generate all association rules that have consequent equal to one 116of the class values and >80% confidence (i.e. classification rules): 117 118>>> rules = [(P, Q, supp, conf) 119... for P, Q, supp, conf in association_rules(itemsets, .8) 120... if len(Q) == 1 and Q & class_items] 121>>> len(rules) 12218 123>>> rules 124[(frozenset({17, 2, 19, 20, 7}), frozenset({41}), 41, 1.0), 125 (frozenset({17, 2, 19, 7}), frozenset({41}), 41, 1.0), 126 ... 127 (frozenset({20, 7}), frozenset({41}), 41, 1.0), 128 (frozenset({7}), frozenset({41}), 41, 1.0)] 129 130To make them more helpful, we can use ``mapping`` to transform the rules' items 131back into table domain values, e.g. for first five rules: 132 133>>> names = {item: '{}={}'.format(var.name, val) 134... for item, var, val in OneHot.decode(mapping, data, mapping)} 135>>> for ante, cons, supp, conf in rules[:5]: 136... print(', '.join(names[i] for i in ante), '-->', 137... names[next(iter(cons))], 138... '(supp: {}, conf: {})'.format(supp, conf)) 139backbone=1, feathers=0, breathes=1, venomous=0, milk=1 --> type=mammal (supp: 41, conf: 1.0) 140backbone=1, feathers=0, breathes=1, milk=1 --> type=mammal (supp: 41, conf: 1.0) 141backbone=1, breathes=1, venomous=0, milk=1 --> type=mammal (supp: 41, conf: 1.0) 142feathers=0, breathes=1, venomous=0, milk=1 --> type=mammal (supp: 41, conf: 1.0) 143backbone=1, feathers=0, breathes=1, venomous=0 --> type=mammal (supp: 41, conf: 0.87...) 144 145 146Reference with further examples below. 147""" 148 149# TODO: Consider FPClose from "Efficiently using prefix-trees in mining frequent itemsets" 150# TODO: Consider ExAnte: Anticipated data reduction in constrained pattern mining 151 152from collections import defaultdict, Iterator 153from itertools import combinations, chain 154from functools import reduce 155 156import numpy as np 157from scipy.sparse import issparse, spmatrix 158 159 160_FP_TREE_EMPTY = (None, []) 161_BUCKETING_FEW_ITEMS = 10 162 163 164class _Node(dict): 165 def __init__(self, item=None, parent=None, count=None): 166 self.item = item 167 self.parent = parent 168 self.count = count 169 170 171def _bucketing_count(db, frequent_items, min_support): 172 """ 173 Bucket counting (bucketing) optimization for databases where few items 174 are frequent ([2] § 5). 175 """ 176 # Forward and inverse mapping of frequent_items to [0, n_items) 177 inv_map = dict(enumerate(frequent_items)).__getitem__ 178 fwd_map = {v: k for k, v in inv_map.__self__.items()}.__getitem__ 179 # Project transactions 180 k = len(frequent_items) 181 buckets = [0] * 2**k 182 for count, transaction in db: 183 set_bits = (fwd_map(i) for i in frequent_items.intersection(transaction)) 184 tid = reduce(lambda a, b: a | 1 << b, set_bits, 0) 185 buckets[tid] += count 186 # Aggregate bucketing counts ([2], Figure 5) 187 for i in range(0, k): 188 i = 2**i 189 for j in range(2**k): 190 if j & i == 0: 191 buckets[j] += buckets[j + i] 192 # Announce results 193 buckets = enumerate(buckets) 194 next(buckets) # Skip 000...0 195 for tid, count in buckets: 196 if count >= min_support: 197 yield frozenset(inv_map(i) for i, b in enumerate(reversed(bin(tid))) if b == '1'), count 198 199 200# Replace above bucketing count with the one from C module 201try: 202 from orangecontrib.associate._fpgrowth import bucketing_count as _bucketing_count, \ 203 BUCKETING_FEW_ITEMS as _BUCKETING_FEW_ITEMS 204except ImportError: 205 # The module may not have been compiled due to compiler missing (e.g. on WinDOS); 206 # just use above Python code 207 pass 208 209 210def _fp_tree_insert(item, T, node_links, count): 211 """ Insert item into _Node-tree T and return the new node """ 212 node = T.get(item) 213 if node is None: 214 node = T[item] = _Node(item, T, count) 215 node_links[item].append(node) 216 else: # Node for this item already in T, just inc its count 217 node.count += count 218 return node 219 220 221def _fp_tree(db, min_support): 222 """ 223 FP-tree construction ([1] § 2.1, Algorithm 1). 224 225 If frequent items in db are determined to be less than threshold, 226 "bucketing" [2] is used instead. 227 228 Returns 229 ------- 230 tuple 231 (FP-tree, None) or (None, list of frequent itemsets with support) 232 """ 233 if not isinstance(db, list): db = list(db) 234 235 if not db: 236 return _FP_TREE_EMPTY 237 238 # Used to count item support so it can be reported when generating itemsets 239 item_support = defaultdict(int) 240 # Used for ordering transactions' items for "optimally" "compressed" tree 241 node_support = defaultdict(int) 242 for count, transaction in db: 243 for item in transaction: 244 item_support[item] += count 245 node_support[item] += 1 246 # Only ever consider items that have min_support 247 frequent_items = {item 248 for item, support in item_support.items() 249 if support >= min_support} 250 251 # Short-circuit, if possible 252 n_items = len(frequent_items) 253 if 0 == n_items: 254 return _FP_TREE_EMPTY 255 if 1 == n_items: 256 item = frequent_items.pop() 257 return None, ((frozenset({item}), item_support[item]),) 258 if n_items <= _BUCKETING_FEW_ITEMS: 259 return None, ((frozenset(itemset), support) 260 for itemset, support in _bucketing_count(db, frequent_items, min_support)) 261 262 # "The items [...] should be ordered in the frequency descending order of 263 # node occurrence of each item instead of its support" ([1], p. 12, bottom) 264 sort_index = {item: i 265 for i, item in 266 enumerate(sorted(frequent_items, 267 key=node_support.__getitem__, 268 reverse=True))}.__getitem__ 269 # Only retain frequent items and sort them 270 db = ((count, sorted(frequent_items.intersection(transaction), 271 key=sort_index)) 272 for count, transaction in db) 273 274 root = _Node() 275 node_links = defaultdict(list) 276 for count, transaction in db: 277 T = root 278 for item in transaction: 279 T = _fp_tree_insert(item, T, node_links, count) 280 # Sorted support-descending (in reverse because popping from the back for efficiency) 281 root.node_links = sorted(node_links.items(), key=lambda i: -sort_index(i[0])) 282 return root, None 283 284 285def _powerset(lst): 286 """ 287 >>> list(_powerset([1, 2, 3])) 288 [(1,), (2,), (3,), (1, 2), (1, 3), (2, 3), (1, 2, 3)] 289 """ 290 return chain.from_iterable(combinations(lst, r) 291 for r in range(1, len(lst) + 1)) 292 293 294def _single_prefix_path(root): 295 """ Return (single-prefix path, rest of tree with new root) """ 296 path = [] 297 tree = root 298 node_links = root.node_links 299 while len(tree) == 1: 300 tree = next(iter(tree.values())) 301 path.append((tree.item, tree.count)) 302 node_links.pop() 303 tree.parent, tree.item, tree.node_links = None, None, node_links 304 return path, tree 305 306 307def _prefix_paths(tree, nodes): 308 """ Generate all paths of tree leading to all item nodes """ 309 for node in nodes: 310 path = [] 311 support = node.count 312 node = node.parent 313 while node.item is not None: 314 path.append(node.item) 315 node = node.parent 316 if path: 317 yield support, path 318 319 320def _freq_patterns_single(P, alpha, min_support): 321 """ Yield subsets of P as (frequent itemset, support) """ 322 for itemset in _powerset(P): 323 yield alpha.union(i[0] for i in itemset), itemset[-1][1] 324 325 326def _freq_patterns_multi(Q, alpha, min_support): 327 """ Mine multi-path FP-tree """ 328 for item, nodes in reversed(Q.node_links): 329 support = sum(n.count for n in nodes) 330 beta = alpha.union({item}) 331 yield beta, support 332 tree, got_itemsets = _fp_tree(_prefix_paths(Q, nodes), min_support) 333 if got_itemsets: 334 for itemset, support in got_itemsets: 335 yield beta.union(itemset), support 336 elif tree is not None: 337 yield from _fp_growth(tree, beta, min_support) 338 339 340def _fp_growth(tree, alpha, min_support): 341 """ FP-growth ([1], § 3.3, Algorithm 2). """ 342 # Single prefix path optimization ([1] § 3.1) 343 P, Q = _single_prefix_path(tree) if len(tree) == 1 else ([], tree) 344 # Return P×Q 345 yield from _freq_patterns_single(P, alpha, min_support) 346 for itemsetQ, supportQ in _freq_patterns_multi(Q, alpha, min_support): 347 yield itemsetQ, supportQ 348 for itemsetP, supportP in _freq_patterns_single(P, alpha, min_support): 349 yield itemsetQ | itemsetP, supportQ 350 351 352def frequent_itemsets(X, min_support=.2): 353 """ 354 Generator yielding frequent itemsets from database X. 355 356 Parameters 357 ---------- 358 X : list or numpy.ndarray or scipy.sparse.spmatrix or iterator 359 The database of transactions where each transaction is a collection 360 of integer items. If `numpy.ndarray`, the items are considered to be 361 indices of non-zero columns. 362 min_support : float or int 363 If float in range (0, 1), percent of minimal support for itemset to 364 be considered frequent. If int > 1, the absolute number of instances. 365 For example, general iterators don't have defined length, so you need 366 to pass the absolute minimal support as int. 367 368 Yields 369 ------ 370 itemset: frozenset 371 Iteratively yields all itemsets (as frozensets of item indices) with 372 support greater or equal to specified `min_support`. 373 support: int 374 Itemset's support as number of instaances. 375 376 Examples 377 -------- 378 Have a database of 50 transactions, 100 possible items: 379 380 >>> import numpy as np 381 >>> np.random.seed(0) 382 >>> X = np.random.random((50, 100)) > .9 383 384 Convert it to sparse so we show this type is supported: 385 386 >>> from scipy.sparse import lil_matrix # other types would convert to LIL anyway 387 >>> X = lil_matrix(X) 388 389 Count the number of itemsets of at least two items with support greater 390 than 4%: 391 392 >>> sum(1 for itemset, support in frequent_itemsets(X, .05) 393 ... if len(itemset) >= 2) 394 72 395 396 Let's get all the itemsets with at least 20% support: 397 398 >>> gen = frequent_itemsets(X, .2) 399 >>> gen 400 <generator object ...> 401 402 >>> itemsets = list(gen) 403 >>> itemsets 404 [(frozenset({4}), 11), (frozenset({25}), 10)] 405 406 We get the same result by specifying the support as absolute number: 407 408 >>> list(frequent_itemsets(X, 10)) == itemsets 409 True 410 411 So the items '4' and '25' (fifth and twenty sixth columns of X) are the 412 only items (and itemsets) that appear 10 or more times. Let's check this: 413 414 >>> (X.sum(axis=0) >= 10).nonzero()[1] 415 array([ 4, 25]) 416 417 Conclusion: Given databases of uniformly distributed random data, 418 there's not much to work with. 419 """ 420 if not isinstance(X, (np.ndarray, spmatrix, list, Iterator)): 421 raise TypeError('X must be (sparse) array of boolean values, or' 422 'list of lists of ints, or iterator of such') 423 if not (isinstance(min_support, int) and min_support > 0 or 424 isinstance(min_support, float) and 0 < min_support <= 1): 425 raise ValueError('min_support must be an integer number of instances,' 426 'or a percent fraction in (0, 1]') 427 428 min_support *= (1 if isinstance(min_support, int) else 429 len(X) if isinstance(X, list) else 430 X.shape[0]) 431 min_support = max(1, int(np.ceil(min_support))) 432 433 if issparse(X): 434 X = X.tolil().rows 435 elif isinstance(X, np.ndarray): 436 X = (t.nonzero()[-1] for t in X) 437 elif isinstance(X, (list, tuple)): 438 is_list_of_lists = isinstance(next(iter(X), []), (list, tuple)) 439 has_int_elements = isinstance(next(iter(next(filter(None, iter(X)), [])), 1), int) 440 if not is_list_of_lists or not has_int_elements: 441 raise ValueError('X must be a list of lists of int') 442 443 db = ((1, transaction) for transaction in X) # 1 is initial item support 444 tree, itemsets = _fp_tree(db, min_support) 445 if itemsets: 446 yield from itemsets 447 if tree: 448 yield from _fp_growth(tree, frozenset(), min_support) 449 450 451def _association_rules(left, right, last_item, support, min_confidence, itemsets): 452 if not left: return 453 confidence = support / itemsets[left] 454 if confidence >= min_confidence: 455 yield left, right, support, confidence 456 for item in left: 457 if item > last_item: continue # This ensures same rules aren't visited twice 458 yield from _association_rules( 459 left - {item}, right | {item}, 460 item, support, min_confidence, itemsets) 461 462 463def association_rules(itemsets, min_confidence, itemset=None): 464 """ 465 Generate association rules ([3] § 12.3) from dict of itemsets' supports 466 (from :obj:`frequent_itemsets()`). If `itemset` is provided, only generate 467 its rules. 468 469 Parameters 470 ---------- 471 itemsets: dict 472 A `dict` mapping itemsets to their supports. Can be generated by 473 feeding the output of `frequent_itemsets()` to `dict` constructor. 474 min_confidence: float 475 Confidence percent. Defined as `itemset_support / antecedent_support`. 476 itemset: frozenset 477 Itemset the association rules of which we are interested in. 478 479 Yields 480 ------ 481 antecedent: frozenset 482 The LHS of the association rule. 483 consequent: frozenset 484 The RHS of the association rule. 485 support: int 486 The number of instances supporting (containing) this rule. 487 confidence: float 488 ``total_support / lhs_support``. 489 490 Examples 491 -------- 492 >>> np.random.seed(0) 493 >>> N = 100 494 >>> X = np.random.random((N, 100)) > .9 495 496 Find all itemsets with at least 5% support: 497 498 >>> itemsets = dict(frequent_itemsets(X, .05)) 499 >>> len(itemsets) 500 116 501 502 Generate all association rules from these itemsets with minimum 503 50% confidence: 504 505 >>> rules = association_rules(itemsets, .5) 506 >>> rules 507 <generator object ...> 508 >>> rules = list(rules) 509 >>> len(rules) 510 7 511 >>> rules 512 [(frozenset({36}), frozenset({25}), 5, 0.55...), 513 (frozenset({63}), frozenset({58}), 5, 0.5), 514 ... 515 (frozenset({30}), frozenset({32}), 5, 0.55...), 516 (frozenset({75}), frozenset({98}), 5, 0.5)] 517 518 Or only the rules for a particular itemset: 519 520 >>> list(association_rules(itemsets, .3, frozenset({75, 98}))) 521 [(frozenset({75}), frozenset({98}), 5, 0.5), 522 (frozenset({98}), frozenset({75}), 5, 0.45...)] 523 524 """ 525 assert (isinstance(itemsets, dict) and 526 isinstance(next(iter(itemsets), frozenset()), frozenset)) 527 assert 0 < min_confidence <= 1 528 from_itemsets = (itemset,) if itemset else sorted(itemsets, key=len, reverse=True) 529 for itemset in from_itemsets: 530 support = itemsets[itemset] 531 for item in itemset: 532 right = frozenset({item}) 533 yield from _association_rules( 534 itemset - right, right, 535 item, support, min_confidence, itemsets) 536 537 538def rules_stats(rules, itemsets, n_examples): 539 """ 540 Generate additional stats for rules generated by :obj:`association_rules()`. 541 542 Parameters 543 ---------- 544 rules: iterable 545 Rules as output by `association_rules()`. 546 itemsets: dict 547 The itemsets as obtained by `dict(frequent_itemsets(...))`. 548 n_examples: int 549 The total number of instances (for calculating coverage, lift, 550 and leverage). 551 552 Yields 553 ------ 554 atecedent: frozenset 555 The LHS of the association rule. 556 consequent: frozenset 557 The RHS of the association rule. 558 support: int 559 Support as an absolute number of instances. 560 confidence: float 561 The confidence percent, calculated as: ``total_support / lhs_rupport``. 562 coverage: float 563 Calculated as: ``lhs_support / n_examples`` 564 strength: float 565 Calculated as: ``rhs_support / lhs_examples`` 566 lift: float 567 Calculated as: ``n_examples * total_support / lhs_support / rhs_support`` 568 leverage: float 569 Calculated as: ``(total_support * n_examples - lhs_support * rhs_support) / n_examples**2`` 570 571 Examples 572 -------- 573 >>> N = 30 574 >>> X = np.random.random((N, 50)) > .9 575 >>> itemsets = dict(frequent_itemsets(X, .1)) 576 >>> rules = association_rules(itemsets, .6) 577 >>> list(rules_stats(rules, itemsets, N)) 578 [(frozenset({15}), frozenset({0}), 3, 0.75, 0.13..., 1.5, 3.75, 0.073...), 579 (frozenset({47}), frozenset({22}), 3, 0.6, 0.16..., 1.4, 2.57..., 0.061...), 580 (frozenset({27}), frozenset({22}), 4, 0.66..., 0.2, 1.16..., 2.85..., 0.086...), 581 (frozenset({19}), frozenset({22}), 3, 0.6, 0.16..., 1.4, 2.57..., 0.061...)] 582 583 """ 584 assert (isinstance(itemsets, dict) and 585 isinstance(next(iter(itemsets), frozenset()), frozenset)) 586 assert n_examples > 0 587 for left, right, support, confidence in rules: 588 l_support, r_support = itemsets[left], itemsets[right] 589 coverage = l_support / n_examples 590 strength = r_support / l_support 591 lift = n_examples * confidence / r_support 592 leverage = (support*n_examples - l_support*r_support) / n_examples**2 593 yield (left, right, support, confidence, 594 coverage, strength, lift, leverage) 595 596 597def __fp_tree_count_nodes(tree): 598 count = 1 if tree.item is not None else 0 599 for t in tree.values(): 600 count += __fp_tree_count_nodes(t) 601 return count 602 603 604def __fp_tree_max_height(tree): 605 if tree: 606 return max((1 if tree.item is not None else 0) + 607 __fp_tree_max_height(child) for child in tree.values()) 608 return 1 if tree.item is not None else 0 609 610 611class OneHot: 612 """ 613 Encode discrete Orange.data.Table into a 2D array of binary attributes. 614 """ 615 @staticmethod 616 def encode(table, include_class=False): 617 """ 618 Return a tuple of 619 (bool (one hot) ndarray, {col: (variable_index, value_index)} mapping) 620 621 If the input table is sparse, a list of nonzero column indices 622 per row (LIL rows) is returned instead of the one-hot ndarray. 623 """ 624 X, encoded, mapping = table.X, [], {} 625 if issparse(X): 626 encoded = X.tolil().rows.tolist() 627 for i, var in enumerate(table.domain.attributes): 628 mapping[i] = i, 0 629 else: 630 for i, var in enumerate(table.domain.attributes): 631 if not var.is_discrete: continue 632 for j, val in enumerate(var.values): 633 mapping[len(mapping)] = i, j 634 encoded.append(X[:, i] == j) 635 636 if include_class and table.domain.has_discrete_class: 637 i, var = len(table.domain.attributes), table.domain.class_var 638 for j, val in enumerate(var.values): 639 mapping[len(mapping)] = i, j 640 if issparse(X): 641 for row in encoded: 642 row.append(i + j) 643 else: 644 encoded.append(table.Y == j) 645 646 if not issparse(X): 647 encoded = np.column_stack(encoded) if encoded else None 648 return encoded, mapping 649 650 @staticmethod 651 def decode(itemset, table, mapping): 652 """Yield sorted (item, variable, value) tuples (one for each item)""" 653 attributes = table.domain.attributes 654 for item in itemset: 655 ivar, ival = mapping[item] 656 var = attributes[ivar] if ivar < len(attributes) else table.domain.class_var 657 yield item, var, (var.values[ival] if var.is_discrete else 0) 658 659 660def preprocess(table): 661 """ 662 This function applies a one-hot transform to Orange data table, making it 663 suitable as an `X` input into :obj:`frequent_itemsets()` above. 664 665 For a more fine-grained control, use :obj:`OneHot` methods directly. 666 667 Parameters 668 ---------- 669 table: Orange.data.Table 670 The table to encode into `X` compatible with `frequent_itemsets()` 671 above. 672 673 Returns 674 ------- 675 X: numpy.ndarray 676 The table's `X` with one-hot tranfsorm applied. 677 678 679 Examples 680 -------- 681 For a more concrete example, i.e. using non-uniform data: 682 683 >>> from Orange.data import Table 684 >>> table = Table('voting') 685 >>> table 686 [[n, y, n, y, y, ... | republican], 687 [n, y, n, y, y, ... | republican], 688 [?, y, y, ?, y, ... | democrat], 689 [n, y, y, n, ?, ... | democrat], 690 [y, y, y, n, y, ... | democrat], 691 ... 692 ] 693 694 Table, as-is, can't be used with :obj:`frequent_itemsets()` directly (it can, 695 but it would produce garbage). We first need to one-hot transform it, i.e. 696 make binary columns for each value of each of its discrete variables. 697 698 >>> X = preprocess(table) 699 >>> X 700 array([[ True, False, False, ..., True, True, False], 701 [ True, False, False, ..., False, True, False], 702 ..., 703 [ True, False, True, ..., True, True, False], 704 [ True, False, False, ..., False, True, False]], dtype=bool) 705 706 Now we `can` use it. 707 708 Note: the transformation includes class if it's discrete. For a 709 finer-grained control, including the variable values to columns mapping, 710 use :obj:`OneHot` class directly. 711 """ 712 if table.domain.has_continuous_attributes(): 713 raise ValueError('Frequent itemsets require all variables to be discrete') 714 encoded, mapping = OneHot.encode(table, table.domain.has_discrete_class) 715 return encoded 716 717 718if __name__ == '__main__': 719 import doctest 720 import __main__, builtins 721 722 class Context(dict): 723 # See http://bugs.python.org/issue26303 724 def copy(self): return self 725 def clear(self): pass 726 727 globals = __main__.__dict__.copy() 728 globals.update(builtins.__dict__) 729 730 doctest.testmod(globs=Context(globals), 731 optionflags=doctest.NORMALIZE_WHITESPACE | doctest.ELLIPSIS) 732