1# Tree::Trie, a module implementing a trie data structure.
2# A formal description of tries can be found at:
3# http://www.cs.queensu.ca/home/daver/235/Notes/Tries.pdf
4
5package Tree::Trie;
6
7use strict;
8use warnings;
9
10our $VERSION = "1.9";
11
12# A handful of helpful constants
13use constant DEFAULT_END_MARKER => '';
14
15use constant BOOLEAN => 0;
16use constant CHOOSE  => 1;
17use constant COUNT   => 2;
18use constant PREFIX  => 3;
19use constant EXACT   => 4;
20
21##   Public methods begin here
22
23# The constructor method.  It's very simple.
24sub new {
25	my($proto) = shift;
26	my($options) = shift;
27	my($class) = ref($proto) || $proto;
28	my($self) = {};
29	bless($self, $class);
30	$self->{_MAINHASHREF} = {};
31	# These are default values
32	$self->{_END} = &DEFAULT_END_MARKER;
33	$self->{_DEEPSEARCH} = CHOOSE;
34	$self->{_FREEZE_END} = 0;
35	unless ( defined($options) && (ref($options) eq "HASH") ) {
36		$options = {};
37	}
38	$self->deepsearch($options->{'deepsearch'});
39	if (exists $options->{end_marker}) {
40		$self->end_marker($options->{end_marker});
41	}
42	if (exists $options->{freeze_end_marker}) {
43		$self->freeze_end_marker($options->{freeze_end_marker});
44	}
45	return($self);
46}
47
48# Sets the value of the end marker, for those people who think they know
49# better than Tree::Trie.  Note it does not allow the setting of single
50# character end markers.
51sub end_marker {
52	my $self = shift;
53	if ($_[0] && length $_[0] > 1) {
54		# If they decide to set a new end marker, we have to be sure to
55		# go through and update all existing markers.
56		my $newend = shift;
57		my @refs = ($self->{_MAINHASHREF});
58		while (@refs) {
59			my $ref = shift @refs;
60			for my $key (keys %{$ref}) {
61				if ($key eq $self->{_END}) {
62					$ref->{$newend} = $ref->{$key};
63					delete $ref->{$key};
64				}
65				else {
66					push(@refs, $ref->{$key});
67				}
68			}
69		}
70		$self->{_END} = $newend;
71	}
72	return $self->{_END};
73}
74
75# Sets the option to not attempt to update the end marker based on added
76# letters.
77# The above is the most awkward sentence I have ever written.
78sub freeze_end_marker {
79	my $self = shift;
80	if (scalar @_) {
81		if (shift) {
82			$self->{_FREEZE_END} = 1;
83		}
84		else {
85			$self->{_FREEZE_END} = 0;
86		}
87	}
88	return $self->{_FREEZE_END};
89}
90
91# Sets the value of the deepsearch parameter.  Can be passed either words
92# describing the parameter, or their numerical equivalents.  Legal values
93# are:
94# boolean => 0
95# choose => 1
96# count => 2
97# prefix => 3
98# exact => 4
99# See the POD for the 'lookup' method for details on this option.
100sub deepsearch {
101	my($self) = shift;
102	my($option) = shift;
103	if(defined($option)) {
104		if ($option eq BOOLEAN || $option eq 'boolean') {
105			$self->{_DEEPSEARCH} = BOOLEAN;
106		}
107		elsif ($option eq CHOOSE || $option eq 'choose') {
108			$self->{_DEEPSEARCH} = CHOOSE;
109		}
110		elsif ($option eq COUNT || $option eq 'count') {
111			$self->{_DEEPSEARCH} = COUNT;
112		}
113		elsif ($option eq PREFIX || $option eq 'prefix') {
114			$self->{_DEEPSEARCH} = PREFIX;
115		}
116		elsif ($option eq EXACT || $option eq 'exact') {
117			$self->{_DEEPSEARCH} = EXACT;
118		}
119	}
120	return $self->{_DEEPSEARCH};
121}
122
123# The add() method takes a list of words as arguments and attempts to add
124# them to the trie. In list context, returns a list of words successfully
125# added.  In scalar context, returns a count of these words.  As of this
126# version, the only reason a word can fail to be added is if it is already
127# in the trie.  Or, I suppose, if there was a bug. :)
128sub add {
129	my($self) = shift;
130	my(@words) = @_;
131
132	my @retarray;
133	my $retnum = 0;
134
135	# Process each word...
136	for my $word (@words) {
137		# And just call the internal thingy for it.
138		if ($self->_add_internal($word, undef)) {
139			# Updating return values as needed
140			if (wantarray) {
141				push(@retarray,$word);
142			}
143			else {
144				$retnum++;
145			}
146		}
147	}
148	# When done, return results.
149	return (wantarray ? @retarray : $retnum);
150}
151
152# add_data() takes a hash of word => data pairs, adds the words to the trie and
153# associates the data to those words.
154sub add_data {
155	my($self) = shift;
156	my($retnum, @retarray);
157	my $word = "";
158	# Making sure that we've gotten data in pairs.  Can't just turn @_
159	# into %data, because that would stringify arrayrefs
160	while(defined($word = shift) && @_) {
161		# This also just uses the internal add method.
162		if ($self->_add_internal($word, shift())) {
163			if (wantarray) {
164				push(@retarray, $word);
165			}
166			else {
167				$retnum++;
168			}
169		}
170	}
171	return @retarray if wantarray;
172	return $retnum;
173}
174
175# add_all() takes one or more other tries and adds all of their entries
176# to the trie.  If both tries have data stored for the same key, the data
177# from the trie on which this method was invoked will be overwritten.  I can't
178# think of anything useful to return from this method, so it has no return
179# value.  If you can think of anything that would make sense, please let me
180# know.
181# This idea and most of its implementation come from Aaron Stone.
182# Thanks!
183sub add_all {
184	my $self = shift;
185	for my $trie (@_) {
186		my $ignore_end = (
187			 $self->{_FREEZE_END} ||
188			($self->{_END} eq $trie->{_END})
189		);
190		my @nodepairs = ({
191			from => $trie->{_MAINHASHREF},
192			to   => $self->{_MAINHASHREF},
193		});
194		while (scalar @nodepairs) {
195			my $np = pop @nodepairs;
196			for my $letter (keys %{$np->{from}}) {
197				unless ($ignore_end) {
198					if ($letter eq $self->{_END}) {
199						$self->end_marker($self->_gen_new_marker(
200							bad => [$letter],
201						));
202					}
203				}
204				if ($letter eq $trie->{_END}) {
205					$np->{to}{$self->{_END}} = $np->{from}{$trie->{_END}};
206				}
207				else {
208					unless (exists $np->{to}{$letter}) {
209						$np->{to}{$letter} = {};
210					}
211					push @nodepairs, {
212						from => $np->{from}{$letter},
213						to   => $np->{to}->{$letter},
214					};
215				}
216			}
217		}
218	}
219}
220
221# delete_data() takes a list of words in the trie and deletes the associated
222# data from the internal data store.  In list context, returns a list of words
223# whose associated data have been removed -- in scalar context, returns a count
224# thereof.
225sub delete_data {
226	my($self, @words) = @_;
227	my($retnum, @retarray) = 0;
228	my @letters;
229	# Process each word...
230	for my $word (@words) {
231		if (ref($word) eq 'ARRAY') {
232			@letters = (@{$word});
233		}
234		else {
235			@letters = split(//, $word);
236		}
237		my $ref = $self->{_MAINHASHREF};
238		# Walk down the tree...
239		for my $letter (@letters) {
240			if ($ref->{$letter}) {
241				$ref = $ref->{$letter};
242			}
243			else {
244				# This will cause the test right after this loop to fail and
245				# skip the the next word -- we want that because if we're here
246				# it means the word isn't in the trie.
247				$ref = {};
248				last;
249			}
250		}
251		next unless (exists $ref->{$self->{_END}});
252		# This is all we need to do to clear out the data
253		$ref->{$self->{_END}} = undef;
254		if (wantarray) {
255			push(@retarray, $word);
256		}
257		else {
258			$retnum++;
259		}
260	}
261	if (wantarray) {
262		return @retarray;
263	}
264	else {
265		return $retnum;
266	}
267}
268
269# The lookup() method searches for words (or beginnings of words) in the trie.
270# It takes a single word as an argument and, in list context, returns a list
271# of all the words in the trie which begin with the given word.  In scalar
272# context, the return value depends on the value of the deepsearch parameter.
273# An optional second argument is available:  This should be a numerical
274# argument, and specifies 2 things: first, that you want only word suffixes
275# to be returned, and second, the maximum length of those suffices.  All
276# other configurations still apply. See the POD on this method for more
277# details.
278sub lookup {
279	my($self) = shift;
280	my($word) = shift;
281	# This is the argument for doing suffix lookup.
282	my($suff_length) = shift;
283
284	# Abstraction is kind of cool
285	return $self->_lookup_internal(
286		word     => $word,
287		suff_len => $suff_length,
288		want_arr => wantarray(),
289		data     => 0,
290	);
291}
292
293# lookup_data() works basically the same as lookup, with the following
294# exceptions -- in list context, returns a hash of ward => data pairings,
295# and in scalar context, wherever it would return a word, it will instead
296# return the datum associated with that word.  Note that, depending on
297# the deepsearch setting, lookup_data and lookup may return exactly the
298# same scalar context.
299sub lookup_data {
300	my($self, $word) = @_;
301
302	return $self->_lookup_internal(
303		word     => $word,
304		want_arr => wantarray(),
305		data     => 1,
306	);
307}
308
309# The remove() method takes a list of words and, surprisingly, removes them
310# from the trie.  It returns, in scalar context, the number of words removed.
311# In list context, returns a list of the words removed.  As of now, the only
312# reason a word would fail to be removed is if it's not in the trie in the
313# first place.  Or, again, if there's a bug...  :)
314sub remove {
315	my($self) = shift;
316	my(@words) = @_;
317
318	my($letter,$ref) = ("","","");
319	my(@letters,@ldn,@retarray);
320	my($retnum) = 0;
321	# The basic strategy here is as follows:
322	##
323	# We walk down the trie one node at a time.  If at any point, we see that a
324	# node can be deleted (that is, its only child is the one which continues the
325	# word we're deleting) then we mark it as the 'last deleteable'.  If at any
326	# point we find a node which *cannot* be deleted (it has more children other
327	# than the one for the word we're working on), then we unmark our 'last
328	# deleteable' from before.  Once done, delete from the last deleteable node
329	# down.
330
331	for my $word (@words) {
332		if (ref($word) eq 'ARRAY') {
333			@letters = (@{$word});
334		}
335		else {
336			@letters = split('',$word);
337		}
338		# For each word, we need to put the leaf node entry at the end of the list
339		# of letters.  We then reset the starting ref, and @ldn, which stands for
340		# 'last deleteable node'.  It contains the ref of the hash and the key to
341		# be deleted.  It does not seem possible to store a value passable to
342		# the 'delete' builtin in a scalar, so we're forced to do this.
343		push(@letters,$self->{_END});
344		$ref = $self->{_MAINHASHREF};
345		@ldn = ();
346
347		# This is a special case, if the first letter of the word is the only
348		# key of the main hash.  I might not really need it, but this works as
349		# it is.
350		if (((scalar keys(%{ $ref })) == 1) && (exists $ref->{$letters[0]})) {
351			@ldn = ($ref);
352		}
353		# And now we go down the trie, as described above.
354		while (defined($letter = shift(@letters))) {
355			# We break out if we're at the end, or if we're run out of trie before
356			# finding the end of the word -- that is, if the word isn't in the
357			# trie.
358			last if ($letter eq $self->{_END});
359			last unless exists($ref->{$letter});
360			if (
361				scalar keys(%{ $ref->{$letter} }) == 1 &&
362				exists $ref->{$letter}{$letters[0]}
363			) {
364				unless (scalar @ldn) {
365					@ldn = ($ref,$letter);
366				}
367			}
368			else {
369				@ldn = ();
370			}
371			$ref = $ref->{$letter};
372		}
373		# If we broke out and there were still letters left in @letters, then the
374		# word must not be in the trie.  Furthermore, if we got all the way to
375		# the end, but there's no leaf node, the word must not be in the trie.
376		next if (scalar @letters);
377		next unless (exists($ref->{$self->{_END}}));
378		# If @ldn is empty, then the only deleteable node is the leaf node, so
379		# we set this up.
380		if (scalar @ldn == 0) {
381			@ldn = ($ref,$self->{_END});
382		}
383		# If there's only one entry in @ldn, then it's the ref of the top of our
384		# Trie.  If that's marked as deleteable, then we can just nuke the entire
385		# hash.
386		if (scalar @ldn == 1) {
387			%{ $ldn[0] } = ();
388		}
389		# Otherwise, we just delete the key we want to.
390		else {
391			delete($ldn[0]->{$ldn[1]});
392		}
393		# And then just return stuff.
394		if (wantarray) {
395			push (@retarray,$word);
396		}
397		else {
398			$retnum++;
399		}
400	}
401	if (wantarray) {
402		return @retarray;
403	}
404	return $retnum;
405}
406
407## These are PRIVATE METHODS.  Don't call them directly unless you really
408 # know what you're doing, or you enjoy things working funny.
409
410# The _walktree() sub takes a word beginning and a hashref (hopefully to a trie)
411# and walks down the trie, gathering all of the word endings and retuning them
412# appended to the word beginning.
413sub _walktree {
414	my($self, %args) = @_;
415	my $word = $args{word};
416	my $ref = $args{ref};
417	# These 2 arguments are used to control how far down the tree this
418	# path will go.
419	# This first argument is passed in by external subs
420	my $suffix_length = $args{suf_len} || 0;
421	# And this one is used only by the recursive calls.
422	my $walked_suffix_length = $args{walked} || 0;
423
424	my $wantref = ref($word) eq 'ARRAY';
425
426	my($key) = "";
427	my(@retarray) = ();
428	my($ret) = 0;
429
430	# For some reason, I used to think this was complicated and had a lot of
431	# stupid, useless code here.  It's a lot simpler now.  If the key we find
432	# is our magic reference, then we just give back the word.  Otherwise, we
433	# walk down the new subtree we've discovered.
434	foreach $key (keys %{ $ref }) {
435		if ($key eq $self->{_END}) {
436			if (wantarray) {
437				push(@retarray,$word);
438				if ($args{data}) {
439					push(@retarray, $ref->{$key});
440				}
441			}
442			else {
443				$ret++;
444			}
445			next;
446		}
447		my $nextval = $wantref ? [(@{$word}), $key] : $word . $key;
448		# If we've reached the max depth we need to travel for the suffix (if
449		# specified), then stop and collect everything up.
450		if ($suffix_length > 0 && ($suffix_length - $walked_suffix_length == 1)) {
451			if (wantarray) {
452				push(@retarray, $nextval);
453			}
454			else {
455				$ret++;
456			}
457		}
458		else {
459			# Look, recursion!
460			my %arguments = (
461				word    => $nextval,
462				'ref'   => $ref->{$key},
463				suf_len => $suffix_length,
464				walked  => $walked_suffix_length + 1,
465				data    => $args{data},
466			);
467			if (wantarray) {
468				push(@retarray, $self->_walktree(%arguments));
469			}
470			else {
471				$ret += scalar $self->_walktree(%arguments);
472			}
473		}
474	}
475	if (wantarray) {
476		return @retarray;
477	}
478	else {
479		return $ret;
480	}
481}
482
483# This code used to use some fairly hoary recursive code which caused it to
484# run fairly slowly, mainly due to the relatively slow way that perl handles
485# OO method invocation.  This was pointed out to me by Justin Hicks, and he
486# helped me fix it up, to be quite a bit more reasonable now.
487sub _lookup_internal {
488	my $self = shift;
489	my %args = @_;
490	my($ref) = $self->{_MAINHASHREF};
491
492	my($letter, $nextletter) = ("", "");
493	my(@letters) = ();
494	my(@retarray) = ();
495	my($wantref) = 0;
496
497	my $word = $args{word};
498
499	# Here we split the word up into letters in the appropriate way.
500	if (ref($word) eq 'ARRAY') {
501		@letters = (@{$word});
502		# Keeping track of what kind of word it was.
503		$wantref = 1;
504	}
505	else {
506		@letters = split('',$word);
507	}
508
509	# These three are to keep hold of possibly returned values.
510	my $lastword = $wantref ? [] : "";
511	my $lastwordref = undef;
512	my $pref = $wantref ? [] : "";
513
514	# Like everything else, we step across each letter.
515	while(defined($letter = shift(@letters))) {
516		# This is to keep track of stuff for the "prefix" version of deepsearch.
517		if ($self->{_DEEPSEARCH} == PREFIX && !$args{want_arr}) {
518			if (exists $ref->{$self->{_END}}) {
519				# The "data" argument tells us if we want to return the word
520				# or the data associated with it.
521				if ($args{data}) {
522					$lastwordref = $ref;
523				}
524				elsif ($wantref) {
525					push(@{$lastword}, @{$pref});
526				}
527				else {
528					$lastword .= $pref;
529				}
530				$pref = $wantref ? [] : "";
531			}
532			unless ($args{data}) {
533				if ($wantref) {
534					push(@{$pref}, $letter);
535				}
536				else {
537					$pref .= $letter;
538				}
539			}
540		}
541		# If, at any point, we find that we've run out of tree before we've run out
542		# of word, then there is nothing in the trie that begins with the input
543		# word, so we return appropriately.
544		unless (exists $ref->{$letter}) {
545			# Array case.
546			if ($args{want_arr}) {
547				return ();
548			}
549			# "count" case.
550			elsif ($self->{_DEEPSEARCH} == COUNT) {
551				return 0;
552			}
553			# "prefix" case.
554			elsif ($self->{_DEEPSEARCH} == PREFIX) {
555				if ($args{data} && $lastwordref) {
556					return $lastwordref->{$self->{_END}};
557				}
558				if (($wantref && scalar @{$lastword}) || length $lastword) {
559					return $lastword;
560				}
561				return undef;
562			}
563			# All other deepsearch cases are the same.
564			else {
565				return undef;
566			}
567		}
568		# If the letter is there, we just walk one step down the trie.
569		$ref = $ref->{$letter};
570	}
571	# Once we've walked all the way down the tree to the end of the word we were
572	# given, there are a few things that can be done, depending on the context
573	# that the method was called in.
574	if ($args{want_arr}) {
575		# If they want an array, then we use the walktree subroutine to collect all
576		# of the words beneath our current location in the trie, and return them.
577		@retarray = $self->_walktree(
578			# When fetching suffixes, we don't want to give the word begnning.
579			word    => $args{suff_len} ? "" : $word,
580			'ref'   => $ref,
581			suf_len => $args{suff_len},
582			data    => $args{data},
583		);
584		return @retarray;
585	}
586	else {
587		if ($self->{_DEEPSEARCH} == BOOLEAN) {
588			# Here, the user only wants to know if any words in the trie begin
589			# with their word, so that's what we give them.
590			return 1;
591		}
592		elsif ($self->{_DEEPSEARCH} == EXACT) {
593			# In this case, the user wants us to return something only if the
594			# exact word exists in the trie, and undef otherwise.
595			# This option only really makes sense with when looking up data,
596			# as otherwise it's essentially the same as BOOLEAN, above, but it
597			# doesn't hurt to allow it to work with normal lookup, either.
598			# I'd initially left this out because I didn't see a use for it, but
599			# thanks to Otmal Lendl for pointing out to me a situation in which
600			# it would be helpful to have.
601			if (exists $ref->{$self->{_END}}) {
602				if ($args{data}) {
603					return $ref->{$self->{_END}};
604				}
605				return $word;
606			}
607			return undef;
608		}
609		elsif ($self->{_DEEPSEARCH} == CHOOSE) {
610			# If they want this, then we continue to walk down the trie, collecting
611			# letters, until we find a leaf node, at which point we stop.  Note that
612			# this works properly if the exact word is in the trie.  Yay.
613			# Of course, making it work that way means that we tend to get shorter
614			# words in choose...  is this a bad thing?  I dunno.
615			my($stub) = $wantref ? [] : "";
616			while (scalar keys %{$ref} && !exists $ref->{$self->{_END}}) {
617				$nextletter = each(%{ $ref });
618				# I need to call this to clear the each() call.  Wish I didn't...
619				keys(%{ $ref });
620				if ($wantref) {
621					push(@{$stub}, $nextletter);
622				}
623				else {
624					$stub .= $nextletter;
625				}
626				$ref = $ref->{$nextletter};
627				# If we're doing suffixes, bail out early once it's the right length.
628				if ($args{suff_len}) {
629					my $cmpr = $wantref ? scalar @{$stub} : length $stub;
630					last if $cmpr == $args{suff_len};
631				}
632			}
633			if ($args{data}) {
634				return $ref->{$self->{_END}};
635			}
636			# If they've specified a suffix length, then they don't want the
637			# beginning part of the word.
638			if ($args{suff_len}) {
639				return $stub;
640			}
641			# Otherwise, they do.
642			else {
643				return $wantref ? [@{$word}, @{$stub}] : $word . $stub;
644			}
645		}
646		elsif ($self->{_DEEPSEARCH} == COUNT) {
647			# Here, the user simply wants a count of words in the trie that begin
648			# with their word, so we get that by calling our walktree method in
649			# scalar context.
650			return scalar $self->_walktree(
651				# When fetching suffixes, we don't want to give the word begnning.
652				word    => $args{suff_len} ? "" : $word,
653				'ref'   => $ref,
654				suf_len => $args{suff_len},
655			);
656		}
657		elsif ($self->{_DEEPSEARCH} == PREFIX) {
658			# This is the "longest prefix found" case.
659			if (exists $ref->{$self->{_END}}) {
660				if ($args{data}) {
661					return $ref->{$self->{_END}};
662				}
663				if ($wantref) {
664					return [@{$lastword}, @{$pref}];
665				}
666				else {
667					return $lastword . $pref;
668				}
669			}
670			if ($args{data}) {
671				return $lastwordref->{$self->{_END}};
672			}
673			return $lastword;
674		}
675	}
676}
677
678# This is the method which does all of the heavy lifting for add and
679# add_data.  Given a word and a datum, it walks down the trie until
680# it finds a branch that hasn't been created yet.  It then makes the rest
681# of the branch, and slaps an end marker and the datum inside of it.
682sub _add_internal {
683	my $self = shift;
684	my $word = shift;
685	my $datum = shift;
686	my @letters;
687	# We don't NEED to split a string into letters; Any array of tokens
688	# will do.
689	if (ref($word) eq 'ARRAY') {
690		# Note: this is a copy
691		@letters = (@{$word});
692		# Because in this case, a "letter" can be more than on character
693		# long, we have to make sure we don't collide with whatever we're
694		# using as an end marker.
695		# However, if the user is feeling all fanciful and told us not to
696		# bother, we won't.
697		unless ($self->{_FREEZE_END}) {
698			for my $letter (@letters) {
699				if ($letter eq $self->{_END}) {
700					# If we had a collision, then make a new end marker.
701					$self->end_marker($self->_gen_new_marker(
702						bad => \@letters,
703					));
704					last;
705				}
706			}
707		}
708	}
709	else {
710		@letters = split('',$word);
711	}
712	# Start at the top of the Trie...
713	my $ref = $self->{_MAINHASHREF};
714	# This will walk down the trie as far as it can, until it either runs
715	# out of word or out of trie.
716	while (
717		(scalar @letters) &&
718		exists($ref->{$letters[0]})
719	) {
720		$ref = $ref->{shift(@letters)};
721	}
722	# If it ran out of trie before it ran out of word then this will create
723	# the rest of the trie structure.
724	for my $letter (@letters) {
725		$ref = $ref->{$letter} = {};
726	}
727	# In either case, this will make the new end marker for the end of the
728	# word (assuming it wasn't already there) and set the return value
729	# appropriately.
730	my $ret = 1;
731	if (exists $ref->{$self->{_END}}) {
732		$ret = 0;
733	}
734	else {
735		$ref->{$self->{_END}} = undef;
736	}
737	# This will set the data if it was provided.
738	if (defined $datum) {
739		$ref->{$self->{_END}} = $datum;
740	}
741	return $ret;
742}
743
744# This uses a heuristic (that is, a crappy method) to generate a new
745# end marker for the trie.  In addition to being sure that whatever is
746# generated is not in use as a letter in the trie, it also makes a bold
747# yet mostly vain attempt to try to make something that might not be
748# used in the future.
749# In general, I do not try to make this functionality good or fast or
750# perfect -- if it's being called often, the module is being mis-used.
751# If a user is using multi-character letters, then they ought to find
752# a string that will be safe and set it themselves.
753sub _gen_new_marker {
754	my $self = shift;
755	my %args = @_;
756	# This will keep track of all of the letters used in the trie already
757	my %used = ();
758	# This will keep track of what lengths they are
759	my %sizes = ();
760	# First we process the letters of the word which sparked this
761	# re-evaluation.
762	for my $letter (@{$args{bad}}) {
763		my $len = length($letter);
764		if ($len != 1) {
765			$used{$letter}++;
766			$sizes{$len}++;
767		}
768	}
769	# Then we walk the tree and get the info on all the other letters.
770	my @refs = ($self->{_MAINHASHREF});
771	while (@refs) {
772		my $ref = shift @refs;
773		for my $key (keys %{$ref}) {
774			# Note we don't even care about length 1 letters.
775			if (
776				(length($key) != 1) &&
777				($key ne $self->{_END})
778			) {
779				$used{$key}++;
780				$sizes{length($key)}++;
781				push(@refs, $ref->{$key});
782			}
783		}
784	}
785	# The idea here is that we want to make the end marker as small as possible,
786	# as it's stuck all over the place.  However, we don't want to spend forever
787	# trying to find one that isn't in use.
788	# So, we find the smallest length such that there are fewer than 1/4 of
789	# the total number of possible letters in use of that length, and we make
790	# a key of that length.
791	my $newlen = 2;
792	for my $len (sort keys %sizes) {
793		# Yes, I know there are well more than 26 available compositors, but
794		# this will only mean I'm being too careful.
795		if ($sizes{$len} < ((26 ** $len) / 4)) {
796			$newlen = $len;
797			last;
798		}
799		else {
800			# This makes it so that if all existing lengths are too full ( !! )
801			# then we will just use a key that's one longer than the longest
802			# one already there.
803			$newlen = $len + 1;
804		}
805	}
806	# Now we just generate end markers until we find one that isn't in use.
807	my $newend;
808	do {
809		$newend = join '', map { chr(int(rand(128))) } (('') x $newlen);
810	} while (exists($used{$newend}));
811	# And return it.
812	return $newend;
813}
814
815# Strewth!
8161;
817
818__END__
819
820=head1 NAME
821
822
823Tree::Trie - A data structure optimized for prefix lookup.
824
825=head1 SYNOPSIS
826
827 use Tree::Trie;
828 use strict;
829
830 my($trie) = new Tree::Trie;
831 $trie->add(qw[aeode calliope clio erato euterpe melete melpomene mneme
832   polymnia terpsichore thalia urania]);
833 my(@all) = $trie->lookup("");
834 my(@ms)  = $trie->lookup("m");
835 $" = "--";
836 print "All muses: @all\nMuses beginning with 'm': @ms\n";
837 my(@deleted) = $trie->remove(qw[calliope thalia doc]);
838 print "Deleted muses: @deleted\n";
839
840
841=head1 DESCRIPTION
842
843This module implements a trie data structure.  The term "trie" comes from the
844word reB<trie>val, but is generally pronounced like "try".  A trie is a tree
845structure (or directed acyclic graph), the nodes of which represent letters
846in a word.  For example, the final lookup for the word 'bob' would look
847something like C<$ref-E<gt>{'b'}{'o'}{'b'}{'00'}> (the 00 being an
848end marker).  Only nodes which would represent words in the trie exist, making
849the structure slightly smaller than a hash of the same data set.
850
851The advantages of the trie over other data storage methods is that lookup
852times are O(1) WRT the size of the index.  For sparse data sets, it is probably
853not as efficient as performing a binary search on a sorted list, and for small
854files, it has a lot of overhead.  The main advantage (at least from my
855perspective) is that it provides a relatively cheap method for finding a list
856of words in a large, dense data set which B<begin> with a certain string.
857
858The term "word" in this documentation can refer to one of two things: either a
859reference to an array of strings, or a scalar which is not a reference.  In
860the case of the former, each element of the array is treated as a "letter"
861of the "word".  In the case of the latter, the scalar is evaluated in string
862context and it is split into its component letters.  Return values of methods
863match the values of what is passed in -- that is, if you call lookup() with
864an array reference, the return value will be an array reference (if
865appropriate).
866
867NOTE: The return semantics of the lookup_data method have CHANGED from version
8681.0 to version 1.1.  If you use this method, be sure to see the perldoc on
869that method for details.
870
871=head1 METHODS
872
873=over 4
874
875
876=item new()
877
878=item new({I<option0> => I<value0>, I<option1> => I<value1>, ...})
879
880This is the constructor method for the class.  You may optionally pass it
881a hash reference with a set of I<option> => I<value> pairs.  The options
882which can be set at object creation-time are "deepsearch", "end_marker" and
883"freeze_end_marker".  See the documentation on the methods which set and
884report those values for more information.
885
886=item $trie->add(I<I<word>>, I<word1>, ...)
887
888This method attempts to add the words to the trie.  Returns, in list
889context, the words successfully added to the trie.  In scalar context, returns
890the number of words successfully added.  As of this release, the only reason
891a word would fail to be added is if it is already in the trie.
892
893=item $trie->add_all(I<I<trie>>, I<trie1>, ...)
894
895This method adds all of the words from the argument tries to the trie.  By
896performing the traversal of both source and target tries simultaneously,
897this mechanism is much faster first doing a lookup on one trie and then an
898add on the other.  Has no return value.
899
900=item $trie->add_data(I<I<word>> => I<data0>, I<word1> => I<data1>, ...)
901
902This method works in basically the same way as C<add()>, except in addition to
903adding words to the trie, it also adds data associated with those words.  Data
904values may be overwritten by adding data for words already in the trie.  Its
905return value is the same and applies only to new words added to the trie, not
906data modified in existing words.
907
908=item $trie->remove(I<I<word>>, I<word1>, ...)
909
910This method attempts to remove the words from the trie.  Returns, in
911list context, the words successfully removed from the trie.  In scalar context,
912returns the number of words successfully removed.  As of this release, the only
913reason a word would fail to be removed is if it is not already in the trie.
914
915=item $trie->delete_data(I<I<word>>, I<word1>, ...)
916
917This method simply deletes data associated with words in the trie.  It
918is the equivalent to perl's delete builtin operating on a hash.  It returns
919the number of data items deleted in scalar context, or a list of words
920for which data has been removed, in list context.
921
922=item $trie->lookup(I<word>)
923
924=item $trie->lookup(I<word>, I<suffix_length>)
925
926This method performs lookups on the trie.  In list context, it returns a
927complete list of words in the trie which begin with I<word>.
928In scalar context, the value returned depends on the setting of the 'deepsearch'
929option.  You can set this option while creating your Trie object, or by using
930the deepsearch method.  Valid deepsearch values are:
931
932boolean: Will return a true value if any word in the trie begins with I<word>.
933This setting is the fastest.
934
935choose: Will return one word in the trie that begins with I<word>, or undef if
936nothing is found.  If I<word> exists in the trie exactly, it will be returned.
937
938count: Will return a count of the words in the trie that begin with I<word>.
939This operation may require walking the entire tree, so it can possibly be
940significantly slower than other options.
941
942prefix: Will return the longest entry in the trie that is a prefix of I<word>.
943For example, if you had a list of file system mount points in your trie, you
944could use this option, pass in the full path of a file, and would be returned
945the name of the mount point on which the file could be found.
946
947exact: If the exact word searched for exists in the trie, will return that
948word (or the data associated therewith), undef otherwise.  This is essentially
949equivalent to a hash lookup, but it does have utility in some cases.
950
951For reasons of backwards compatibility, 'choose' is the default value
952of this option.
953
954To get a list of all words in the trie, use C<lookup("")> in list context.
955
956If the I<suffix_length> option is provided, the behavior is a little bit
957different:  Instead of returning words from the trie, it will instead return
958suffixes that follow I<word>, and those suffixes will be no longer than the
959numerical value of the option.  If the option's value is negative, suffixes
960of all lengths will be returned.  This option only has effect if the
961call to lookup() is in list context, or if the 'deepsearch' parameter
962is set to either 'count' or 'choose'.  It has no meaning for the other
963scalar deepsearch settings, and will be ignored in those cases.
964
965For example, assume your trie contains 'foo', 'food' and 'fish'.
966C<lookup('f', 1)> would return 'o' and 'i'.  C<lookup('f', 3)> would
967return 'oo', 'ood' and 'ish'.  C<lookup('fo', -1)> would return 'o' and
968'od'.  In scalar context, these calls would return what you'd expect, based
969on the value of deepsearch, with the 'count' and 'choose' options operating
970only over the set of suffixes.  That is, The first call would return 2
971with 'count', and either 'o' or 'i' with 'choose'.
972
973Note that C<lookup("", -1)> is the same as C<lookup("")>.
974
975=item $trie->lookup_data(I<word>)
976
977This method operates in essentially the same way as C<lookup()>, with the
978exception that in list context it returns a list of word => data value
979pairs and in scalar context, where C<lookup()> would return a word,
980C<lookup_data()> returns the data value associated with that word.  In
981cases where the deepsearch setting is such that C<lookup()> would
982return a number, C<lookup_data()> will return the same number.
983
984Please note that the return value in list context is NOT a hash.  It can
985be coerced into a hash, and if you are not using any multi-character letters
986in your trie, this will work fine.  However otherwise, if it is coerced into
987a hash, all the of the array references (remember, words are array refs when
988using multi-character letters) will be stringified, which renders them (for
989the most part) useless.
990
991=item $trie->deepsearch()
992
993=item $trie->deepsearch(I<new_setting>)
994
995If option is specified, sets the deepsearch parameter.  Option may be one of:
996'boolean', 'choose', 'count', 'prefix'.  Please see the documentation for the
997lookup method for the details of what these options mean.  Returns the
998current (new) value of the deepsearch parameter.
999
1000=item $trie->end_marker()
1001
1002=item $trie->end_marker(I<new_marker>)
1003
1004If the marker is provided, sets the string used internally to indicate the
1005end of a word in the trie to that marker.  Doing this causes a complete
1006traversal of the trie, where all old end markers are replaced with the new
1007one.  This can get very slow, so try to call this method when the trie is
1008still small.  Returns the current (new) end marker value.
1009
1010=item $trie->freeze_end_marker()
1011
1012=item $trie->freeze_end_marker(I<new_flag>)
1013
1014If flag is provided and a true value, turns off checking and automatic
1015updating of the end marker.  If flag is provided and false, turns this
1016checking on.  Returns the current (new) truth value of this setting.
1017
1018=back
1019
1020=head1 End Markers
1021
1022=head2 Overview
1023
1024The following discussion is only important for those people using
1025multi-character letters, or words as array references.  If you are just
1026using this module with words as simple strings, you may disregard this
1027section.
1028
1029First, it's important to understand how data is stored in the trie.  As
1030described above, the trie structure is basically just a complicated hash of
1031hashes, with each key of each has being a letter.  There needs to be a distinct
1032way of determining when we're at the end of a word; we can't just use the
1033end of the hash structure as a guide, because we need to distinguish between
1034the word "barn" being in the trie and the words "bar" and "barn" being there.
1035
1036The answer is an end marker -- a distinct token that signifies that we're
1037at the end of the word.  Using the above example, if "bar" and "barn" are
1038in the trie, then the keys of the hash at "r" would be "n" and this end
1039marker.  Choosing this end marker is easy when all letters are just one
1040character -- we just choose any two-character string and we know that it will
1041never match a letter.  However, once we allow arbitrary multi-character
1042letters, then things get much more difficult: there is no possible end
1043marker which can be guaranteed to always work.  Here is where we enter
1044some dark water.
1045
1046=head2 Dark Water
1047
1048In order to make sure that the end marker is always safe, we must check
1049incoming letters on every word submission.  If the word is an array ref, then
1050each letter in it is compared to the current end marker.  This does add
1051overhead, but it's necessary.  If it is found that a letter does conflict
1052with the end marker, then we choose a new end marker.
1053
1054In order to find a new end marker, we obviously need to find a string that
1055isn't already being used in the trie.  This requires a complete traversal of
1056the trie to collect a complete set of the letters in use.  Once we have this
1057it is a simple exercise to generate a new marker which is not in use.
1058
1059Then we must replace the marker.  This of course requires a complete
1060traversal once again.  As you can see, this adds a bit of overhead to working
1061with multi-character letters, but it's neccessary to make sure things keep
1062working correctly.  This should be fine for people with small data sets,
1063or who just do a bunch of additions ahead of time and then only do lookups.
1064However, if computation time is important to you, there are ways to
1065avoid this mess.
1066
1067=head2 Speeding Things Up
1068
1069One way to speed things up is to avoid the need to replace the end marker.
1070You can set the trie's end marker using the C<end_marker()> method, or at
1071creation time, by passing the C<end_marker> option to the trie in its
1072constructor's option hashref.  Note that setting the end marker causes
1073a trie traversal, as it must update existing data.  As such, you want to
1074set the end marker as soon as possible.
1075
1076Note that end marker MUST be at least 2 characters long.
1077
1078Just setting the end marker though, won't stop the trie from checking each
1079letter as you add arrayref words.  If you are 100% sure that the end
1080marker you set won't ever show up in an added word, you can either use
1081the C<freeze_end_marker()> method or the C<freeze_end_marker> construction
1082option to tell the trie not to check any more.  However, be careful --
1083once this option is enabled, the data structure is no longer self-policing,
1084so if a letter that matches your end marker does end up slipping in, strange
1085things will begin to happen.
1086
1087=head2 Examples
1088
1089Here are some situations in which you might want to use the methods described
1090in the previous section.
1091
1092Let's say your application takes user input data describing travel across
1093the united states, and each node in the trie is a two-letter state abbreviation.
1094In this case, it would probably be fairly safe to set your end marker to
1095something like '00'.  However, since this is user-supplied data, you don't
1096want to let some user break your whole system by entering '00', so you should
1097probably not freeze the end marker in this case.
1098
1099Let's say you're using the trie for a networking application -- your words
1100will be IP addresses, and your letters will be the four "quads" of an IP
1101address.  In this case you can safely set your end marker to 'xx' or anything
1102with letters in it, and know that there will never be a collision.  It is
1103entirely reasonable to set the freeze tag in this case.
1104
1105=head1 Future Work
1106
1107=over 4
1108
1109=item *
1110
1111There are a few methods of compression that allow you same some amount of space
1112in the trie.  I have to figure out which ones are worth implementing.  I may
1113end up making the different compression methods configurable.
1114
1115I have now made one of them the default.  It's the least effective one, of
1116course.
1117
1118=item *
1119
1120The ability to have Tree::Trie be backed by a "live" file instead of keeping
1121data in memory.  This is, unfortunately, more complicated than simply using
1122TIE, so this will take some amount of work.
1123
1124=back
1125
1126=head1 Known Problems
1127
1128=over 4
1129
1130=item *
1131
1132None at this time.
1133
1134=back
1135
1136=head1 AUTHOR
1137
1138Copyright 2011 Avi Finkel <F<avi@finkel.org>>
1139
1140This package is free software and is provided "as is" without express
1141or implied warranty.  It may be used, redistributed and/or modified
1142under the terms of the Perl Artistic License (see
1143http://www.perl.com/perl/misc/Artistic.html)
1144
1145=cut
1146