1package Locale::Maketext::Fuzzy;
2$Locale::Maketext::Fuzzy::VERSION = '0.11';
3
4use 5.005;
5use strict;
6use Locale::Maketext;
7use base 'Locale::Maketext';
8
9sub override_maketext {
10    my ( $class, $flag ) = @_;
11    $class = ref($class) if ref($class);
12
13    no strict 'refs';
14
15    if ($flag) {
16        *{"$class\::maketext"} = \&maketext_fuzzy;
17    }
18    elsif ( @_ >= 2 ) {
19        delete ${"$class\::"}{maketext};
20    }
21
22    return ( defined &{"$class\::maketext"} ? 1 : 0 );
23}
24
25# Global cache of entries and their regexified forms
26my %regex_cache;
27
28sub maketext_fuzzy {
29    my ( $handle, $phrase ) = splice( @_, 0, 2 );
30
31    # An array of all lexicon hashrefs
32    my @lexicons = @{ $handle->_lex_refs };
33
34    # Try exact match if possible at all.
35    foreach my $lex (@lexicons) {
36        return $handle->SUPER::maketext( $phrase, @_ )
37          if exists $lex->{$phrase};
38    }
39
40    # Keys are matched entries; values are arrayrefs of extracted params
41    my %candidate;
42
43    # Fuzzy match phase 1 -- extract all candidates
44    foreach my $lex (@lexicons) {
45
46        # We're not interested in non-bracketed entries, so ignore them
47        foreach my $entry ( grep /(?:(?<!~)(?:~~)*)\[/, keys %{$lex} ) {
48            # Skip entries which are _only_ brackets and whitespace.
49            # The most value they could add is rearrangement, and that
50            # is almost certainly incorrect.
51            next if $entry =~ /^\s*(\[[^]]+\]\s*)+$/;
52
53            my $re = ( $regex_cache{$entry} ||= [ _regexify($entry) ] );
54            my @vars = ( $phrase =~ $re->[0] ) or next;
55            $candidate{$entry} ||=
56              ( @{ $re->[1] } ? [ @vars[ @{ $re->[1] } ] ] : \@vars );
57        }
58    }
59
60    # Fail early if we cannot find anything that matches
61    return $phrase unless %candidate;
62
63    # Fuzzy match phase 2 -- select the best candidate
64    $phrase = (
65        sort {
66
67            # For now, we just use a very crude heuristic: "Longer is better"
68            length($b) <=> length($a)
69              or $b cmp $a
70          } keys %candidate
71    )[0];
72
73    return $handle->SUPER::maketext( $phrase, @{ $candidate{$phrase} }, @_ );
74}
75
76sub _regexify {
77    my $text = quotemeta(shift);
78    my @ords;
79
80    $text =~ s{
81	(				# capture into $1...
82	    (?<!\\~)(?:\\~\\~)*		#   an even number of ~ characters
83	)				#   (to be restored back)
84	\\\[				# opening bracket
85
86	(				# capture into $2...
87	    (?:				#   any numbers of
88		[^~\]]			#     ordinary non-] characters
89		    |			#       or
90		~\\?.			#     escaped characters
91	    )*
92	)
93	\\\]				# closing bracket
94    }{
95	$1._paramify($2, \@ords)
96    }egx;
97
98    $text =~ s/\Q.*?\E$/.*/;
99    return qr/^$text$/, \@ords;
100}
101
102sub _paramify {
103    my ( $text, $ordref ) = @_;
104    my $out = '(.*?)';
105    my @choices = split( /\\,/, $text );
106
107    if ( $choices[0] =~ /^(?:\w+|\\#|\\\*)$/ ) {
108
109        # Do away with the function name
110        shift @choices unless $choices[0] =~ /^_(?:\d+|\\\*)$/;
111
112        # Build an alternate regex to weed out vars
113        $out .= '(?:' . join(
114            '|',
115            sort {
116                length($b) <=> length($a)    # longest first
117              } map {
118                /^_(?:(\d+)|\\\*)$/
119                  ? do {
120                    push @{$ordref}, ( $1 - 1 ) if defined $1;
121                    '';
122                  }
123                  : $_                       # turn _1, _2, _*... into ''
124              } @choices
125        ) . ')';
126
127        $out =~ s/\Q(?:)\E$//;
128    }
129
130    return $out;
131}
132
1331;
134
135=head1 NAME
136
137Locale::Maketext::Fuzzy - Maketext from already interpolated strings
138
139=head1 SYNOPSIS
140
141    package MyApp::L10N;
142    use base 'Locale::Maketext::Fuzzy'; # instead of Locale::Maketext
143
144    package MyApp::L10N::de;
145    use base 'MyApp::L10N';
146    our %Lexicon = (
147	# Exact match should always be preferred if possible
148	"0 camels were released."
149	    => "Exact match",
150
151	# Fuzzy match candidate
152	"[quant,_1,camel was,camels were] released."
153	    => "[quant,_1,Kamel wurde,Kamele wurden] freigegeben.",
154
155	# This could also match fuzzily, but is less preferred
156	"[_2] released[_1]"
157	    => "[_1][_2] ist frei[_1]",
158    );
159
160    package main;
161    my $lh = MyApp::L10N->get_handle('de');
162
163    # All ->maketext calls below will become ->maketext_fuzzy instead
164    $lh->override_maketext(1);
165
166    # This prints "Exact match"
167    print $lh->maketext('0 camels were released.');
168
169    # "1 Kamel wurde freigegeben." -- quant() gets 1
170    print $lh->maketext('1 camel was released.');
171
172    # "2 Kamele wurden freigegeben." -- quant() gets 2
173    print $lh->maketext('2 camels were released.');
174
175    # "3 Kamele wurden freigegeben." -- parameters are ignored
176    print $lh->maketext('3 released.');
177
178    # "4 Kamele wurden freigegeben." -- normal usage
179    print $lh->maketext('[*,_1,camel was,camels were] released.', 4);
180
181    # "!Perl ist frei!" -- matches the broader one
182    # Note that the sequence ([_2] before [_1]) is preserved
183    print $lh->maketext('Perl released!');
184
185=head1 DESCRIPTION
186
187This module is a subclass of C<Locale::Maketext>, with additional
188support for localizing messages that already contains interpolated
189variables.
190
191This is most useful when the messages are returned by external sources
192-- for example, to match C<dir: command not found> against
193C<[_1]: command not found>.
194
195Of course, this module is also useful if you're simply too lazy
196to use the
197
198    $lh->maketext("[quant,_1,file,files] deleted.", $count);
199
200syntax, but wish to write
201
202    $lh->maketext_fuzzy("$count files deleted");
203
204instead, and have the correct plural form figured out automatically.
205
206If C<maketext_fuzzy> seems too long to type for you, this module
207also provides a C<override_maketext> method to turn I<all> C<maketext>
208calls into C<maketext_fuzzy> calls.
209
210=head1 METHODS
211
212=head2 $lh->maketext_fuzzy(I<key>[, I<parameters...>]);
213
214That method takes exactly the same arguments as the C<maketext> method
215of C<Locale::Maketext>.
216
217If I<key> is found in lexicons, it is applied in the same way as
218C<maketext>.  Otherwise, it looks at all lexicon entries that could
219possibly yield I<key>, by turning C<[...]> sequences into C<(.*?)> and
220match the resulting regular expression against I<key>.
221
222Once it finds all candidate entries, the longest one replaces the
223I<key> for the real C<maketext> call.  Variables matched by its bracket
224sequences (C<$1>, C<$2>...) are placed before I<parameters>; the order
225of variables in the matched entry are correctly preserved.
226
227For example, if the matched entry in C<%Lexicon> is C<Test [_1]>,
228this call:
229
230    $fh->maketext_fuzzy("Test string", "param");
231
232is equivalent to this:
233
234    $fh->maketext("Test [_1]", "string", "param");
235
236However, most of the time you won't need to supply I<parameters> to
237a C<maketext_fuzzy> call, since all parameters are already interpolated
238into the string.
239
240=head2 $lh->override_maketext([I<flag>]);
241
242If I<flag> is true, this accessor method turns C<$lh-E<gt>maketext>
243into an alias for C<$lh-E<gt>maketext_fuzzy>, so all consecutive
244C<maketext> calls in the C<$lh>'s packages are automatically fuzzy.
245A false I<flag> restores the original behaviour.  If the flag is not
246specified, returns the current status of override; the default is
2470 (no overriding).
248
249Note that this call only modifies the symbol table of the I<language
250class> that C<$lh> belongs to, so other languages are not affected.
251If you want to override all language handles in a certain application,
252try this:
253
254    MyApp::L10N->override_maketext(1);
255
256=head1 CAVEATS
257
258=over 4
259
260=item *
261
262The "longer is better" heuristic to determine the best match is
263reasonably good, but could certainly be improved.
264
265=item *
266
267Currently, C<"[quant,_1,file] deleted"> won't match C<"3 files deleted">;
268you'll have to write C<"[quant,_1,file,files] deleted"> instead, or
269simply use C<"[_1] file deleted"> as the lexicon key and put the correct
270plural form handling into the corresponding value.
271
272=item *
273
274When used in combination with C<Locale::Maketext::Lexicon>'s C<Tie>
275backend, all keys would be iterated over each time a fuzzy match is
276performed, and may cause serious speed penalty.  Patches welcome.
277
278=back
279
280=head1 SEE ALSO
281
282L<Locale::Maketext>, L<Locale::Maketext::Lexicon>
283
284=head1 HISTORY
285
286This particular module was written to facilitate an I<auto-extraction>
287layer for Slashcode's I<Template Toolkit> provider, based on
288C<HTML::Parser> and C<Template::Parser>.  It would work like this:
289
290    Input | <B>from the [% story.dept %] dept.</B>
291    Output| <B>[%|loc( story.dept )%]from the [_1] dept.[%END%]</B>
292
293Now, this layer suffers from the same linguistic problems as an
294ordinary C<Msgcat> or C<Gettext> framework does -- what if we want
295to make ordinals from C<[% story.dept %]> (i.e. C<from the 3rd dept.>),
296or expand the C<dept.> to C<department> / C<departments>?
297
298The same problem occurred in RT's web interface, where it had to
299localize messages returned by external modules, which may already
300contain interpolated variables, e.g. C<"Successfully deleted 7
301ticket(s) in 'c:\temp'.">.
302
303Since I didn't have the time to refactor C<DBI> and C<DBI::SearchBuilder>,
304I devised a C<loc_match> method to pre-process their messages into one
305of the I<candidate strings>, then applied the matched string to C<maketext>.
306
307Afterwards, I realized that instead of preparing a set of candidate
308strings, I could actually match against the original I<lexicon file>
309(i.e. PO files via C<Locale::Maketext::Lexicon>).  This is how
310C<Locale::Maketext::Fuzzy> was born.
311
312=head1 AUTHORS
313
314Audrey Tang E<lt>cpan@audreyt.orgE<gt>
315
316=head1 CC0 1.0 Universal
317
318To the extent possible under law, 唐鳳 has waived all copyright and related
319or neighboring rights to Locale-Maketext-Fuzzy.
320
321This work is published from Taiwan.
322
323L<http://creativecommons.org/publicdomain/zero/1.0>
324
325=cut
326