1# Tree::Trie, a module implementing a trie data structure. 2# A formal description of tries can be found at: 3# http://www.cs.queensu.ca/home/daver/235/Notes/Tries.pdf 4 5package Tree::Trie; 6 7use strict; 8use warnings; 9 10our $VERSION = "1.9"; 11 12# A handful of helpful constants 13use constant DEFAULT_END_MARKER => ''; 14 15use constant BOOLEAN => 0; 16use constant CHOOSE => 1; 17use constant COUNT => 2; 18use constant PREFIX => 3; 19use constant EXACT => 4; 20 21## Public methods begin here 22 23# The constructor method. It's very simple. 24sub new { 25 my($proto) = shift; 26 my($options) = shift; 27 my($class) = ref($proto) || $proto; 28 my($self) = {}; 29 bless($self, $class); 30 $self->{_MAINHASHREF} = {}; 31 # These are default values 32 $self->{_END} = &DEFAULT_END_MARKER; 33 $self->{_DEEPSEARCH} = CHOOSE; 34 $self->{_FREEZE_END} = 0; 35 unless ( defined($options) && (ref($options) eq "HASH") ) { 36 $options = {}; 37 } 38 $self->deepsearch($options->{'deepsearch'}); 39 if (exists $options->{end_marker}) { 40 $self->end_marker($options->{end_marker}); 41 } 42 if (exists $options->{freeze_end_marker}) { 43 $self->freeze_end_marker($options->{freeze_end_marker}); 44 } 45 return($self); 46} 47 48# Sets the value of the end marker, for those people who think they know 49# better than Tree::Trie. Note it does not allow the setting of single 50# character end markers. 51sub end_marker { 52 my $self = shift; 53 if ($_[0] && length $_[0] > 1) { 54 # If they decide to set a new end marker, we have to be sure to 55 # go through and update all existing markers. 56 my $newend = shift; 57 my @refs = ($self->{_MAINHASHREF}); 58 while (@refs) { 59 my $ref = shift @refs; 60 for my $key (keys %{$ref}) { 61 if ($key eq $self->{_END}) { 62 $ref->{$newend} = $ref->{$key}; 63 delete $ref->{$key}; 64 } 65 else { 66 push(@refs, $ref->{$key}); 67 } 68 } 69 } 70 $self->{_END} = $newend; 71 } 72 return $self->{_END}; 73} 74 75# Sets the option to not attempt to update the end marker based on added 76# letters. 77# The above is the most awkward sentence I have ever written. 78sub freeze_end_marker { 79 my $self = shift; 80 if (scalar @_) { 81 if (shift) { 82 $self->{_FREEZE_END} = 1; 83 } 84 else { 85 $self->{_FREEZE_END} = 0; 86 } 87 } 88 return $self->{_FREEZE_END}; 89} 90 91# Sets the value of the deepsearch parameter. Can be passed either words 92# describing the parameter, or their numerical equivalents. Legal values 93# are: 94# boolean => 0 95# choose => 1 96# count => 2 97# prefix => 3 98# exact => 4 99# See the POD for the 'lookup' method for details on this option. 100sub deepsearch { 101 my($self) = shift; 102 my($option) = shift; 103 if(defined($option)) { 104 if ($option eq BOOLEAN || $option eq 'boolean') { 105 $self->{_DEEPSEARCH} = BOOLEAN; 106 } 107 elsif ($option eq CHOOSE || $option eq 'choose') { 108 $self->{_DEEPSEARCH} = CHOOSE; 109 } 110 elsif ($option eq COUNT || $option eq 'count') { 111 $self->{_DEEPSEARCH} = COUNT; 112 } 113 elsif ($option eq PREFIX || $option eq 'prefix') { 114 $self->{_DEEPSEARCH} = PREFIX; 115 } 116 elsif ($option eq EXACT || $option eq 'exact') { 117 $self->{_DEEPSEARCH} = EXACT; 118 } 119 } 120 return $self->{_DEEPSEARCH}; 121} 122 123# The add() method takes a list of words as arguments and attempts to add 124# them to the trie. In list context, returns a list of words successfully 125# added. In scalar context, returns a count of these words. As of this 126# version, the only reason a word can fail to be added is if it is already 127# in the trie. Or, I suppose, if there was a bug. :) 128sub add { 129 my($self) = shift; 130 my(@words) = @_; 131 132 my @retarray; 133 my $retnum = 0; 134 135 # Process each word... 136 for my $word (@words) { 137 # And just call the internal thingy for it. 138 if ($self->_add_internal($word, undef)) { 139 # Updating return values as needed 140 if (wantarray) { 141 push(@retarray,$word); 142 } 143 else { 144 $retnum++; 145 } 146 } 147 } 148 # When done, return results. 149 return (wantarray ? @retarray : $retnum); 150} 151 152# add_data() takes a hash of word => data pairs, adds the words to the trie and 153# associates the data to those words. 154sub add_data { 155 my($self) = shift; 156 my($retnum, @retarray); 157 my $word = ""; 158 # Making sure that we've gotten data in pairs. Can't just turn @_ 159 # into %data, because that would stringify arrayrefs 160 while(defined($word = shift) && @_) { 161 # This also just uses the internal add method. 162 if ($self->_add_internal($word, shift())) { 163 if (wantarray) { 164 push(@retarray, $word); 165 } 166 else { 167 $retnum++; 168 } 169 } 170 } 171 return @retarray if wantarray; 172 return $retnum; 173} 174 175# add_all() takes one or more other tries and adds all of their entries 176# to the trie. If both tries have data stored for the same key, the data 177# from the trie on which this method was invoked will be overwritten. I can't 178# think of anything useful to return from this method, so it has no return 179# value. If you can think of anything that would make sense, please let me 180# know. 181# This idea and most of its implementation come from Aaron Stone. 182# Thanks! 183sub add_all { 184 my $self = shift; 185 for my $trie (@_) { 186 my $ignore_end = ( 187 $self->{_FREEZE_END} || 188 ($self->{_END} eq $trie->{_END}) 189 ); 190 my @nodepairs = ({ 191 from => $trie->{_MAINHASHREF}, 192 to => $self->{_MAINHASHREF}, 193 }); 194 while (scalar @nodepairs) { 195 my $np = pop @nodepairs; 196 for my $letter (keys %{$np->{from}}) { 197 unless ($ignore_end) { 198 if ($letter eq $self->{_END}) { 199 $self->end_marker($self->_gen_new_marker( 200 bad => [$letter], 201 )); 202 } 203 } 204 if ($letter eq $trie->{_END}) { 205 $np->{to}{$self->{_END}} = $np->{from}{$trie->{_END}}; 206 } 207 else { 208 unless (exists $np->{to}{$letter}) { 209 $np->{to}{$letter} = {}; 210 } 211 push @nodepairs, { 212 from => $np->{from}{$letter}, 213 to => $np->{to}->{$letter}, 214 }; 215 } 216 } 217 } 218 } 219} 220 221# delete_data() takes a list of words in the trie and deletes the associated 222# data from the internal data store. In list context, returns a list of words 223# whose associated data have been removed -- in scalar context, returns a count 224# thereof. 225sub delete_data { 226 my($self, @words) = @_; 227 my($retnum, @retarray) = 0; 228 my @letters; 229 # Process each word... 230 for my $word (@words) { 231 if (ref($word) eq 'ARRAY') { 232 @letters = (@{$word}); 233 } 234 else { 235 @letters = split(//, $word); 236 } 237 my $ref = $self->{_MAINHASHREF}; 238 # Walk down the tree... 239 for my $letter (@letters) { 240 if ($ref->{$letter}) { 241 $ref = $ref->{$letter}; 242 } 243 else { 244 # This will cause the test right after this loop to fail and 245 # skip the the next word -- we want that because if we're here 246 # it means the word isn't in the trie. 247 $ref = {}; 248 last; 249 } 250 } 251 next unless (exists $ref->{$self->{_END}}); 252 # This is all we need to do to clear out the data 253 $ref->{$self->{_END}} = undef; 254 if (wantarray) { 255 push(@retarray, $word); 256 } 257 else { 258 $retnum++; 259 } 260 } 261 if (wantarray) { 262 return @retarray; 263 } 264 else { 265 return $retnum; 266 } 267} 268 269# The lookup() method searches for words (or beginnings of words) in the trie. 270# It takes a single word as an argument and, in list context, returns a list 271# of all the words in the trie which begin with the given word. In scalar 272# context, the return value depends on the value of the deepsearch parameter. 273# An optional second argument is available: This should be a numerical 274# argument, and specifies 2 things: first, that you want only word suffixes 275# to be returned, and second, the maximum length of those suffices. All 276# other configurations still apply. See the POD on this method for more 277# details. 278sub lookup { 279 my($self) = shift; 280 my($word) = shift; 281 # This is the argument for doing suffix lookup. 282 my($suff_length) = shift; 283 284 # Abstraction is kind of cool 285 return $self->_lookup_internal( 286 word => $word, 287 suff_len => $suff_length, 288 want_arr => wantarray(), 289 data => 0, 290 ); 291} 292 293# lookup_data() works basically the same as lookup, with the following 294# exceptions -- in list context, returns a hash of ward => data pairings, 295# and in scalar context, wherever it would return a word, it will instead 296# return the datum associated with that word. Note that, depending on 297# the deepsearch setting, lookup_data and lookup may return exactly the 298# same scalar context. 299sub lookup_data { 300 my($self, $word) = @_; 301 302 return $self->_lookup_internal( 303 word => $word, 304 want_arr => wantarray(), 305 data => 1, 306 ); 307} 308 309# The remove() method takes a list of words and, surprisingly, removes them 310# from the trie. It returns, in scalar context, the number of words removed. 311# In list context, returns a list of the words removed. As of now, the only 312# reason a word would fail to be removed is if it's not in the trie in the 313# first place. Or, again, if there's a bug... :) 314sub remove { 315 my($self) = shift; 316 my(@words) = @_; 317 318 my($letter,$ref) = ("","",""); 319 my(@letters,@ldn,@retarray); 320 my($retnum) = 0; 321 # The basic strategy here is as follows: 322 ## 323 # We walk down the trie one node at a time. If at any point, we see that a 324 # node can be deleted (that is, its only child is the one which continues the 325 # word we're deleting) then we mark it as the 'last deleteable'. If at any 326 # point we find a node which *cannot* be deleted (it has more children other 327 # than the one for the word we're working on), then we unmark our 'last 328 # deleteable' from before. Once done, delete from the last deleteable node 329 # down. 330 331 for my $word (@words) { 332 if (ref($word) eq 'ARRAY') { 333 @letters = (@{$word}); 334 } 335 else { 336 @letters = split('',$word); 337 } 338 # For each word, we need to put the leaf node entry at the end of the list 339 # of letters. We then reset the starting ref, and @ldn, which stands for 340 # 'last deleteable node'. It contains the ref of the hash and the key to 341 # be deleted. It does not seem possible to store a value passable to 342 # the 'delete' builtin in a scalar, so we're forced to do this. 343 push(@letters,$self->{_END}); 344 $ref = $self->{_MAINHASHREF}; 345 @ldn = (); 346 347 # This is a special case, if the first letter of the word is the only 348 # key of the main hash. I might not really need it, but this works as 349 # it is. 350 if (((scalar keys(%{ $ref })) == 1) && (exists $ref->{$letters[0]})) { 351 @ldn = ($ref); 352 } 353 # And now we go down the trie, as described above. 354 while (defined($letter = shift(@letters))) { 355 # We break out if we're at the end, or if we're run out of trie before 356 # finding the end of the word -- that is, if the word isn't in the 357 # trie. 358 last if ($letter eq $self->{_END}); 359 last unless exists($ref->{$letter}); 360 if ( 361 scalar keys(%{ $ref->{$letter} }) == 1 && 362 exists $ref->{$letter}{$letters[0]} 363 ) { 364 unless (scalar @ldn) { 365 @ldn = ($ref,$letter); 366 } 367 } 368 else { 369 @ldn = (); 370 } 371 $ref = $ref->{$letter}; 372 } 373 # If we broke out and there were still letters left in @letters, then the 374 # word must not be in the trie. Furthermore, if we got all the way to 375 # the end, but there's no leaf node, the word must not be in the trie. 376 next if (scalar @letters); 377 next unless (exists($ref->{$self->{_END}})); 378 # If @ldn is empty, then the only deleteable node is the leaf node, so 379 # we set this up. 380 if (scalar @ldn == 0) { 381 @ldn = ($ref,$self->{_END}); 382 } 383 # If there's only one entry in @ldn, then it's the ref of the top of our 384 # Trie. If that's marked as deleteable, then we can just nuke the entire 385 # hash. 386 if (scalar @ldn == 1) { 387 %{ $ldn[0] } = (); 388 } 389 # Otherwise, we just delete the key we want to. 390 else { 391 delete($ldn[0]->{$ldn[1]}); 392 } 393 # And then just return stuff. 394 if (wantarray) { 395 push (@retarray,$word); 396 } 397 else { 398 $retnum++; 399 } 400 } 401 if (wantarray) { 402 return @retarray; 403 } 404 return $retnum; 405} 406 407## These are PRIVATE METHODS. Don't call them directly unless you really 408 # know what you're doing, or you enjoy things working funny. 409 410# The _walktree() sub takes a word beginning and a hashref (hopefully to a trie) 411# and walks down the trie, gathering all of the word endings and retuning them 412# appended to the word beginning. 413sub _walktree { 414 my($self, %args) = @_; 415 my $word = $args{word}; 416 my $ref = $args{ref}; 417 # These 2 arguments are used to control how far down the tree this 418 # path will go. 419 # This first argument is passed in by external subs 420 my $suffix_length = $args{suf_len} || 0; 421 # And this one is used only by the recursive calls. 422 my $walked_suffix_length = $args{walked} || 0; 423 424 my $wantref = ref($word) eq 'ARRAY'; 425 426 my($key) = ""; 427 my(@retarray) = (); 428 my($ret) = 0; 429 430 # For some reason, I used to think this was complicated and had a lot of 431 # stupid, useless code here. It's a lot simpler now. If the key we find 432 # is our magic reference, then we just give back the word. Otherwise, we 433 # walk down the new subtree we've discovered. 434 foreach $key (keys %{ $ref }) { 435 if ($key eq $self->{_END}) { 436 if (wantarray) { 437 push(@retarray,$word); 438 if ($args{data}) { 439 push(@retarray, $ref->{$key}); 440 } 441 } 442 else { 443 $ret++; 444 } 445 next; 446 } 447 my $nextval = $wantref ? [(@{$word}), $key] : $word . $key; 448 # If we've reached the max depth we need to travel for the suffix (if 449 # specified), then stop and collect everything up. 450 if ($suffix_length > 0 && ($suffix_length - $walked_suffix_length == 1)) { 451 if (wantarray) { 452 push(@retarray, $nextval); 453 } 454 else { 455 $ret++; 456 } 457 } 458 else { 459 # Look, recursion! 460 my %arguments = ( 461 word => $nextval, 462 'ref' => $ref->{$key}, 463 suf_len => $suffix_length, 464 walked => $walked_suffix_length + 1, 465 data => $args{data}, 466 ); 467 if (wantarray) { 468 push(@retarray, $self->_walktree(%arguments)); 469 } 470 else { 471 $ret += scalar $self->_walktree(%arguments); 472 } 473 } 474 } 475 if (wantarray) { 476 return @retarray; 477 } 478 else { 479 return $ret; 480 } 481} 482 483# This code used to use some fairly hoary recursive code which caused it to 484# run fairly slowly, mainly due to the relatively slow way that perl handles 485# OO method invocation. This was pointed out to me by Justin Hicks, and he 486# helped me fix it up, to be quite a bit more reasonable now. 487sub _lookup_internal { 488 my $self = shift; 489 my %args = @_; 490 my($ref) = $self->{_MAINHASHREF}; 491 492 my($letter, $nextletter) = ("", ""); 493 my(@letters) = (); 494 my(@retarray) = (); 495 my($wantref) = 0; 496 497 my $word = $args{word}; 498 499 # Here we split the word up into letters in the appropriate way. 500 if (ref($word) eq 'ARRAY') { 501 @letters = (@{$word}); 502 # Keeping track of what kind of word it was. 503 $wantref = 1; 504 } 505 else { 506 @letters = split('',$word); 507 } 508 509 # These three are to keep hold of possibly returned values. 510 my $lastword = $wantref ? [] : ""; 511 my $lastwordref = undef; 512 my $pref = $wantref ? [] : ""; 513 514 # Like everything else, we step across each letter. 515 while(defined($letter = shift(@letters))) { 516 # This is to keep track of stuff for the "prefix" version of deepsearch. 517 if ($self->{_DEEPSEARCH} == PREFIX && !$args{want_arr}) { 518 if (exists $ref->{$self->{_END}}) { 519 # The "data" argument tells us if we want to return the word 520 # or the data associated with it. 521 if ($args{data}) { 522 $lastwordref = $ref; 523 } 524 elsif ($wantref) { 525 push(@{$lastword}, @{$pref}); 526 } 527 else { 528 $lastword .= $pref; 529 } 530 $pref = $wantref ? [] : ""; 531 } 532 unless ($args{data}) { 533 if ($wantref) { 534 push(@{$pref}, $letter); 535 } 536 else { 537 $pref .= $letter; 538 } 539 } 540 } 541 # If, at any point, we find that we've run out of tree before we've run out 542 # of word, then there is nothing in the trie that begins with the input 543 # word, so we return appropriately. 544 unless (exists $ref->{$letter}) { 545 # Array case. 546 if ($args{want_arr}) { 547 return (); 548 } 549 # "count" case. 550 elsif ($self->{_DEEPSEARCH} == COUNT) { 551 return 0; 552 } 553 # "prefix" case. 554 elsif ($self->{_DEEPSEARCH} == PREFIX) { 555 if ($args{data} && $lastwordref) { 556 return $lastwordref->{$self->{_END}}; 557 } 558 if (($wantref && scalar @{$lastword}) || length $lastword) { 559 return $lastword; 560 } 561 return undef; 562 } 563 # All other deepsearch cases are the same. 564 else { 565 return undef; 566 } 567 } 568 # If the letter is there, we just walk one step down the trie. 569 $ref = $ref->{$letter}; 570 } 571 # Once we've walked all the way down the tree to the end of the word we were 572 # given, there are a few things that can be done, depending on the context 573 # that the method was called in. 574 if ($args{want_arr}) { 575 # If they want an array, then we use the walktree subroutine to collect all 576 # of the words beneath our current location in the trie, and return them. 577 @retarray = $self->_walktree( 578 # When fetching suffixes, we don't want to give the word begnning. 579 word => $args{suff_len} ? "" : $word, 580 'ref' => $ref, 581 suf_len => $args{suff_len}, 582 data => $args{data}, 583 ); 584 return @retarray; 585 } 586 else { 587 if ($self->{_DEEPSEARCH} == BOOLEAN) { 588 # Here, the user only wants to know if any words in the trie begin 589 # with their word, so that's what we give them. 590 return 1; 591 } 592 elsif ($self->{_DEEPSEARCH} == EXACT) { 593 # In this case, the user wants us to return something only if the 594 # exact word exists in the trie, and undef otherwise. 595 # This option only really makes sense with when looking up data, 596 # as otherwise it's essentially the same as BOOLEAN, above, but it 597 # doesn't hurt to allow it to work with normal lookup, either. 598 # I'd initially left this out because I didn't see a use for it, but 599 # thanks to Otmal Lendl for pointing out to me a situation in which 600 # it would be helpful to have. 601 if (exists $ref->{$self->{_END}}) { 602 if ($args{data}) { 603 return $ref->{$self->{_END}}; 604 } 605 return $word; 606 } 607 return undef; 608 } 609 elsif ($self->{_DEEPSEARCH} == CHOOSE) { 610 # If they want this, then we continue to walk down the trie, collecting 611 # letters, until we find a leaf node, at which point we stop. Note that 612 # this works properly if the exact word is in the trie. Yay. 613 # Of course, making it work that way means that we tend to get shorter 614 # words in choose... is this a bad thing? I dunno. 615 my($stub) = $wantref ? [] : ""; 616 while (scalar keys %{$ref} && !exists $ref->{$self->{_END}}) { 617 $nextletter = each(%{ $ref }); 618 # I need to call this to clear the each() call. Wish I didn't... 619 keys(%{ $ref }); 620 if ($wantref) { 621 push(@{$stub}, $nextletter); 622 } 623 else { 624 $stub .= $nextletter; 625 } 626 $ref = $ref->{$nextletter}; 627 # If we're doing suffixes, bail out early once it's the right length. 628 if ($args{suff_len}) { 629 my $cmpr = $wantref ? scalar @{$stub} : length $stub; 630 last if $cmpr == $args{suff_len}; 631 } 632 } 633 if ($args{data}) { 634 return $ref->{$self->{_END}}; 635 } 636 # If they've specified a suffix length, then they don't want the 637 # beginning part of the word. 638 if ($args{suff_len}) { 639 return $stub; 640 } 641 # Otherwise, they do. 642 else { 643 return $wantref ? [@{$word}, @{$stub}] : $word . $stub; 644 } 645 } 646 elsif ($self->{_DEEPSEARCH} == COUNT) { 647 # Here, the user simply wants a count of words in the trie that begin 648 # with their word, so we get that by calling our walktree method in 649 # scalar context. 650 return scalar $self->_walktree( 651 # When fetching suffixes, we don't want to give the word begnning. 652 word => $args{suff_len} ? "" : $word, 653 'ref' => $ref, 654 suf_len => $args{suff_len}, 655 ); 656 } 657 elsif ($self->{_DEEPSEARCH} == PREFIX) { 658 # This is the "longest prefix found" case. 659 if (exists $ref->{$self->{_END}}) { 660 if ($args{data}) { 661 return $ref->{$self->{_END}}; 662 } 663 if ($wantref) { 664 return [@{$lastword}, @{$pref}]; 665 } 666 else { 667 return $lastword . $pref; 668 } 669 } 670 if ($args{data}) { 671 return $lastwordref->{$self->{_END}}; 672 } 673 return $lastword; 674 } 675 } 676} 677 678# This is the method which does all of the heavy lifting for add and 679# add_data. Given a word and a datum, it walks down the trie until 680# it finds a branch that hasn't been created yet. It then makes the rest 681# of the branch, and slaps an end marker and the datum inside of it. 682sub _add_internal { 683 my $self = shift; 684 my $word = shift; 685 my $datum = shift; 686 my @letters; 687 # We don't NEED to split a string into letters; Any array of tokens 688 # will do. 689 if (ref($word) eq 'ARRAY') { 690 # Note: this is a copy 691 @letters = (@{$word}); 692 # Because in this case, a "letter" can be more than on character 693 # long, we have to make sure we don't collide with whatever we're 694 # using as an end marker. 695 # However, if the user is feeling all fanciful and told us not to 696 # bother, we won't. 697 unless ($self->{_FREEZE_END}) { 698 for my $letter (@letters) { 699 if ($letter eq $self->{_END}) { 700 # If we had a collision, then make a new end marker. 701 $self->end_marker($self->_gen_new_marker( 702 bad => \@letters, 703 )); 704 last; 705 } 706 } 707 } 708 } 709 else { 710 @letters = split('',$word); 711 } 712 # Start at the top of the Trie... 713 my $ref = $self->{_MAINHASHREF}; 714 # This will walk down the trie as far as it can, until it either runs 715 # out of word or out of trie. 716 while ( 717 (scalar @letters) && 718 exists($ref->{$letters[0]}) 719 ) { 720 $ref = $ref->{shift(@letters)}; 721 } 722 # If it ran out of trie before it ran out of word then this will create 723 # the rest of the trie structure. 724 for my $letter (@letters) { 725 $ref = $ref->{$letter} = {}; 726 } 727 # In either case, this will make the new end marker for the end of the 728 # word (assuming it wasn't already there) and set the return value 729 # appropriately. 730 my $ret = 1; 731 if (exists $ref->{$self->{_END}}) { 732 $ret = 0; 733 } 734 else { 735 $ref->{$self->{_END}} = undef; 736 } 737 # This will set the data if it was provided. 738 if (defined $datum) { 739 $ref->{$self->{_END}} = $datum; 740 } 741 return $ret; 742} 743 744# This uses a heuristic (that is, a crappy method) to generate a new 745# end marker for the trie. In addition to being sure that whatever is 746# generated is not in use as a letter in the trie, it also makes a bold 747# yet mostly vain attempt to try to make something that might not be 748# used in the future. 749# In general, I do not try to make this functionality good or fast or 750# perfect -- if it's being called often, the module is being mis-used. 751# If a user is using multi-character letters, then they ought to find 752# a string that will be safe and set it themselves. 753sub _gen_new_marker { 754 my $self = shift; 755 my %args = @_; 756 # This will keep track of all of the letters used in the trie already 757 my %used = (); 758 # This will keep track of what lengths they are 759 my %sizes = (); 760 # First we process the letters of the word which sparked this 761 # re-evaluation. 762 for my $letter (@{$args{bad}}) { 763 my $len = length($letter); 764 if ($len != 1) { 765 $used{$letter}++; 766 $sizes{$len}++; 767 } 768 } 769 # Then we walk the tree and get the info on all the other letters. 770 my @refs = ($self->{_MAINHASHREF}); 771 while (@refs) { 772 my $ref = shift @refs; 773 for my $key (keys %{$ref}) { 774 # Note we don't even care about length 1 letters. 775 if ( 776 (length($key) != 1) && 777 ($key ne $self->{_END}) 778 ) { 779 $used{$key}++; 780 $sizes{length($key)}++; 781 push(@refs, $ref->{$key}); 782 } 783 } 784 } 785 # The idea here is that we want to make the end marker as small as possible, 786 # as it's stuck all over the place. However, we don't want to spend forever 787 # trying to find one that isn't in use. 788 # So, we find the smallest length such that there are fewer than 1/4 of 789 # the total number of possible letters in use of that length, and we make 790 # a key of that length. 791 my $newlen = 2; 792 for my $len (sort keys %sizes) { 793 # Yes, I know there are well more than 26 available compositors, but 794 # this will only mean I'm being too careful. 795 if ($sizes{$len} < ((26 ** $len) / 4)) { 796 $newlen = $len; 797 last; 798 } 799 else { 800 # This makes it so that if all existing lengths are too full ( !! ) 801 # then we will just use a key that's one longer than the longest 802 # one already there. 803 $newlen = $len + 1; 804 } 805 } 806 # Now we just generate end markers until we find one that isn't in use. 807 my $newend; 808 do { 809 $newend = join '', map { chr(int(rand(128))) } (('') x $newlen); 810 } while (exists($used{$newend})); 811 # And return it. 812 return $newend; 813} 814 815# Strewth! 8161; 817 818__END__ 819 820=head1 NAME 821 822 823Tree::Trie - A data structure optimized for prefix lookup. 824 825=head1 SYNOPSIS 826 827 use Tree::Trie; 828 use strict; 829 830 my($trie) = new Tree::Trie; 831 $trie->add(qw[aeode calliope clio erato euterpe melete melpomene mneme 832 polymnia terpsichore thalia urania]); 833 my(@all) = $trie->lookup(""); 834 my(@ms) = $trie->lookup("m"); 835 $" = "--"; 836 print "All muses: @all\nMuses beginning with 'm': @ms\n"; 837 my(@deleted) = $trie->remove(qw[calliope thalia doc]); 838 print "Deleted muses: @deleted\n"; 839 840 841=head1 DESCRIPTION 842 843This module implements a trie data structure. The term "trie" comes from the 844word reB<trie>val, but is generally pronounced like "try". A trie is a tree 845structure (or directed acyclic graph), the nodes of which represent letters 846in a word. For example, the final lookup for the word 'bob' would look 847something like C<$ref-E<gt>{'b'}{'o'}{'b'}{'00'}> (the 00 being an 848end marker). Only nodes which would represent words in the trie exist, making 849the structure slightly smaller than a hash of the same data set. 850 851The advantages of the trie over other data storage methods is that lookup 852times are O(1) WRT the size of the index. For sparse data sets, it is probably 853not as efficient as performing a binary search on a sorted list, and for small 854files, it has a lot of overhead. The main advantage (at least from my 855perspective) is that it provides a relatively cheap method for finding a list 856of words in a large, dense data set which B<begin> with a certain string. 857 858The term "word" in this documentation can refer to one of two things: either a 859reference to an array of strings, or a scalar which is not a reference. In 860the case of the former, each element of the array is treated as a "letter" 861of the "word". In the case of the latter, the scalar is evaluated in string 862context and it is split into its component letters. Return values of methods 863match the values of what is passed in -- that is, if you call lookup() with 864an array reference, the return value will be an array reference (if 865appropriate). 866 867NOTE: The return semantics of the lookup_data method have CHANGED from version 8681.0 to version 1.1. If you use this method, be sure to see the perldoc on 869that method for details. 870 871=head1 METHODS 872 873=over 4 874 875 876=item new() 877 878=item new({I<option0> => I<value0>, I<option1> => I<value1>, ...}) 879 880This is the constructor method for the class. You may optionally pass it 881a hash reference with a set of I<option> => I<value> pairs. The options 882which can be set at object creation-time are "deepsearch", "end_marker" and 883"freeze_end_marker". See the documentation on the methods which set and 884report those values for more information. 885 886=item $trie->add(I<I<word>>, I<word1>, ...) 887 888This method attempts to add the words to the trie. Returns, in list 889context, the words successfully added to the trie. In scalar context, returns 890the number of words successfully added. As of this release, the only reason 891a word would fail to be added is if it is already in the trie. 892 893=item $trie->add_all(I<I<trie>>, I<trie1>, ...) 894 895This method adds all of the words from the argument tries to the trie. By 896performing the traversal of both source and target tries simultaneously, 897this mechanism is much faster first doing a lookup on one trie and then an 898add on the other. Has no return value. 899 900=item $trie->add_data(I<I<word>> => I<data0>, I<word1> => I<data1>, ...) 901 902This method works in basically the same way as C<add()>, except in addition to 903adding words to the trie, it also adds data associated with those words. Data 904values may be overwritten by adding data for words already in the trie. Its 905return value is the same and applies only to new words added to the trie, not 906data modified in existing words. 907 908=item $trie->remove(I<I<word>>, I<word1>, ...) 909 910This method attempts to remove the words from the trie. Returns, in 911list context, the words successfully removed from the trie. In scalar context, 912returns the number of words successfully removed. As of this release, the only 913reason a word would fail to be removed is if it is not already in the trie. 914 915=item $trie->delete_data(I<I<word>>, I<word1>, ...) 916 917This method simply deletes data associated with words in the trie. It 918is the equivalent to perl's delete builtin operating on a hash. It returns 919the number of data items deleted in scalar context, or a list of words 920for which data has been removed, in list context. 921 922=item $trie->lookup(I<word>) 923 924=item $trie->lookup(I<word>, I<suffix_length>) 925 926This method performs lookups on the trie. In list context, it returns a 927complete list of words in the trie which begin with I<word>. 928In scalar context, the value returned depends on the setting of the 'deepsearch' 929option. You can set this option while creating your Trie object, or by using 930the deepsearch method. Valid deepsearch values are: 931 932boolean: Will return a true value if any word in the trie begins with I<word>. 933This setting is the fastest. 934 935choose: Will return one word in the trie that begins with I<word>, or undef if 936nothing is found. If I<word> exists in the trie exactly, it will be returned. 937 938count: Will return a count of the words in the trie that begin with I<word>. 939This operation may require walking the entire tree, so it can possibly be 940significantly slower than other options. 941 942prefix: Will return the longest entry in the trie that is a prefix of I<word>. 943For example, if you had a list of file system mount points in your trie, you 944could use this option, pass in the full path of a file, and would be returned 945the name of the mount point on which the file could be found. 946 947exact: If the exact word searched for exists in the trie, will return that 948word (or the data associated therewith), undef otherwise. This is essentially 949equivalent to a hash lookup, but it does have utility in some cases. 950 951For reasons of backwards compatibility, 'choose' is the default value 952of this option. 953 954To get a list of all words in the trie, use C<lookup("")> in list context. 955 956If the I<suffix_length> option is provided, the behavior is a little bit 957different: Instead of returning words from the trie, it will instead return 958suffixes that follow I<word>, and those suffixes will be no longer than the 959numerical value of the option. If the option's value is negative, suffixes 960of all lengths will be returned. This option only has effect if the 961call to lookup() is in list context, or if the 'deepsearch' parameter 962is set to either 'count' or 'choose'. It has no meaning for the other 963scalar deepsearch settings, and will be ignored in those cases. 964 965For example, assume your trie contains 'foo', 'food' and 'fish'. 966C<lookup('f', 1)> would return 'o' and 'i'. C<lookup('f', 3)> would 967return 'oo', 'ood' and 'ish'. C<lookup('fo', -1)> would return 'o' and 968'od'. In scalar context, these calls would return what you'd expect, based 969on the value of deepsearch, with the 'count' and 'choose' options operating 970only over the set of suffixes. That is, The first call would return 2 971with 'count', and either 'o' or 'i' with 'choose'. 972 973Note that C<lookup("", -1)> is the same as C<lookup("")>. 974 975=item $trie->lookup_data(I<word>) 976 977This method operates in essentially the same way as C<lookup()>, with the 978exception that in list context it returns a list of word => data value 979pairs and in scalar context, where C<lookup()> would return a word, 980C<lookup_data()> returns the data value associated with that word. In 981cases where the deepsearch setting is such that C<lookup()> would 982return a number, C<lookup_data()> will return the same number. 983 984Please note that the return value in list context is NOT a hash. It can 985be coerced into a hash, and if you are not using any multi-character letters 986in your trie, this will work fine. However otherwise, if it is coerced into 987a hash, all the of the array references (remember, words are array refs when 988using multi-character letters) will be stringified, which renders them (for 989the most part) useless. 990 991=item $trie->deepsearch() 992 993=item $trie->deepsearch(I<new_setting>) 994 995If option is specified, sets the deepsearch parameter. Option may be one of: 996'boolean', 'choose', 'count', 'prefix'. Please see the documentation for the 997lookup method for the details of what these options mean. Returns the 998current (new) value of the deepsearch parameter. 999 1000=item $trie->end_marker() 1001 1002=item $trie->end_marker(I<new_marker>) 1003 1004If the marker is provided, sets the string used internally to indicate the 1005end of a word in the trie to that marker. Doing this causes a complete 1006traversal of the trie, where all old end markers are replaced with the new 1007one. This can get very slow, so try to call this method when the trie is 1008still small. Returns the current (new) end marker value. 1009 1010=item $trie->freeze_end_marker() 1011 1012=item $trie->freeze_end_marker(I<new_flag>) 1013 1014If flag is provided and a true value, turns off checking and automatic 1015updating of the end marker. If flag is provided and false, turns this 1016checking on. Returns the current (new) truth value of this setting. 1017 1018=back 1019 1020=head1 End Markers 1021 1022=head2 Overview 1023 1024The following discussion is only important for those people using 1025multi-character letters, or words as array references. If you are just 1026using this module with words as simple strings, you may disregard this 1027section. 1028 1029First, it's important to understand how data is stored in the trie. As 1030described above, the trie structure is basically just a complicated hash of 1031hashes, with each key of each has being a letter. There needs to be a distinct 1032way of determining when we're at the end of a word; we can't just use the 1033end of the hash structure as a guide, because we need to distinguish between 1034the word "barn" being in the trie and the words "bar" and "barn" being there. 1035 1036The answer is an end marker -- a distinct token that signifies that we're 1037at the end of the word. Using the above example, if "bar" and "barn" are 1038in the trie, then the keys of the hash at "r" would be "n" and this end 1039marker. Choosing this end marker is easy when all letters are just one 1040character -- we just choose any two-character string and we know that it will 1041never match a letter. However, once we allow arbitrary multi-character 1042letters, then things get much more difficult: there is no possible end 1043marker which can be guaranteed to always work. Here is where we enter 1044some dark water. 1045 1046=head2 Dark Water 1047 1048In order to make sure that the end marker is always safe, we must check 1049incoming letters on every word submission. If the word is an array ref, then 1050each letter in it is compared to the current end marker. This does add 1051overhead, but it's necessary. If it is found that a letter does conflict 1052with the end marker, then we choose a new end marker. 1053 1054In order to find a new end marker, we obviously need to find a string that 1055isn't already being used in the trie. This requires a complete traversal of 1056the trie to collect a complete set of the letters in use. Once we have this 1057it is a simple exercise to generate a new marker which is not in use. 1058 1059Then we must replace the marker. This of course requires a complete 1060traversal once again. As you can see, this adds a bit of overhead to working 1061with multi-character letters, but it's neccessary to make sure things keep 1062working correctly. This should be fine for people with small data sets, 1063or who just do a bunch of additions ahead of time and then only do lookups. 1064However, if computation time is important to you, there are ways to 1065avoid this mess. 1066 1067=head2 Speeding Things Up 1068 1069One way to speed things up is to avoid the need to replace the end marker. 1070You can set the trie's end marker using the C<end_marker()> method, or at 1071creation time, by passing the C<end_marker> option to the trie in its 1072constructor's option hashref. Note that setting the end marker causes 1073a trie traversal, as it must update existing data. As such, you want to 1074set the end marker as soon as possible. 1075 1076Note that end marker MUST be at least 2 characters long. 1077 1078Just setting the end marker though, won't stop the trie from checking each 1079letter as you add arrayref words. If you are 100% sure that the end 1080marker you set won't ever show up in an added word, you can either use 1081the C<freeze_end_marker()> method or the C<freeze_end_marker> construction 1082option to tell the trie not to check any more. However, be careful -- 1083once this option is enabled, the data structure is no longer self-policing, 1084so if a letter that matches your end marker does end up slipping in, strange 1085things will begin to happen. 1086 1087=head2 Examples 1088 1089Here are some situations in which you might want to use the methods described 1090in the previous section. 1091 1092Let's say your application takes user input data describing travel across 1093the united states, and each node in the trie is a two-letter state abbreviation. 1094In this case, it would probably be fairly safe to set your end marker to 1095something like '00'. However, since this is user-supplied data, you don't 1096want to let some user break your whole system by entering '00', so you should 1097probably not freeze the end marker in this case. 1098 1099Let's say you're using the trie for a networking application -- your words 1100will be IP addresses, and your letters will be the four "quads" of an IP 1101address. In this case you can safely set your end marker to 'xx' or anything 1102with letters in it, and know that there will never be a collision. It is 1103entirely reasonable to set the freeze tag in this case. 1104 1105=head1 Future Work 1106 1107=over 4 1108 1109=item * 1110 1111There are a few methods of compression that allow you same some amount of space 1112in the trie. I have to figure out which ones are worth implementing. I may 1113end up making the different compression methods configurable. 1114 1115I have now made one of them the default. It's the least effective one, of 1116course. 1117 1118=item * 1119 1120The ability to have Tree::Trie be backed by a "live" file instead of keeping 1121data in memory. This is, unfortunately, more complicated than simply using 1122TIE, so this will take some amount of work. 1123 1124=back 1125 1126=head1 Known Problems 1127 1128=over 4 1129 1130=item * 1131 1132None at this time. 1133 1134=back 1135 1136=head1 AUTHOR 1137 1138Copyright 2011 Avi Finkel <F<avi@finkel.org>> 1139 1140This package is free software and is provided "as is" without express 1141or implied warranty. It may be used, redistributed and/or modified 1142under the terms of the Perl Artistic License (see 1143http://www.perl.com/perl/misc/Artistic.html) 1144 1145=cut 1146