1package Locale::Maketext::Fuzzy; 2$Locale::Maketext::Fuzzy::VERSION = '0.11'; 3 4use 5.005; 5use strict; 6use Locale::Maketext; 7use base 'Locale::Maketext'; 8 9sub override_maketext { 10 my ( $class, $flag ) = @_; 11 $class = ref($class) if ref($class); 12 13 no strict 'refs'; 14 15 if ($flag) { 16 *{"$class\::maketext"} = \&maketext_fuzzy; 17 } 18 elsif ( @_ >= 2 ) { 19 delete ${"$class\::"}{maketext}; 20 } 21 22 return ( defined &{"$class\::maketext"} ? 1 : 0 ); 23} 24 25# Global cache of entries and their regexified forms 26my %regex_cache; 27 28sub maketext_fuzzy { 29 my ( $handle, $phrase ) = splice( @_, 0, 2 ); 30 31 # An array of all lexicon hashrefs 32 my @lexicons = @{ $handle->_lex_refs }; 33 34 # Try exact match if possible at all. 35 foreach my $lex (@lexicons) { 36 return $handle->SUPER::maketext( $phrase, @_ ) 37 if exists $lex->{$phrase}; 38 } 39 40 # Keys are matched entries; values are arrayrefs of extracted params 41 my %candidate; 42 43 # Fuzzy match phase 1 -- extract all candidates 44 foreach my $lex (@lexicons) { 45 46 # We're not interested in non-bracketed entries, so ignore them 47 foreach my $entry ( grep /(?:(?<!~)(?:~~)*)\[/, keys %{$lex} ) { 48 # Skip entries which are _only_ brackets and whitespace. 49 # The most value they could add is rearrangement, and that 50 # is almost certainly incorrect. 51 next if $entry =~ /^\s*(\[[^]]+\]\s*)+$/; 52 53 my $re = ( $regex_cache{$entry} ||= [ _regexify($entry) ] ); 54 my @vars = ( $phrase =~ $re->[0] ) or next; 55 $candidate{$entry} ||= 56 ( @{ $re->[1] } ? [ @vars[ @{ $re->[1] } ] ] : \@vars ); 57 } 58 } 59 60 # Fail early if we cannot find anything that matches 61 return $phrase unless %candidate; 62 63 # Fuzzy match phase 2 -- select the best candidate 64 $phrase = ( 65 sort { 66 67 # For now, we just use a very crude heuristic: "Longer is better" 68 length($b) <=> length($a) 69 or $b cmp $a 70 } keys %candidate 71 )[0]; 72 73 return $handle->SUPER::maketext( $phrase, @{ $candidate{$phrase} }, @_ ); 74} 75 76sub _regexify { 77 my $text = quotemeta(shift); 78 my @ords; 79 80 $text =~ s{ 81 ( # capture into $1... 82 (?<!\\~)(?:\\~\\~)* # an even number of ~ characters 83 ) # (to be restored back) 84 \\\[ # opening bracket 85 86 ( # capture into $2... 87 (?: # any numbers of 88 [^~\]] # ordinary non-] characters 89 | # or 90 ~\\?. # escaped characters 91 )* 92 ) 93 \\\] # closing bracket 94 }{ 95 $1._paramify($2, \@ords) 96 }egx; 97 98 $text =~ s/\Q.*?\E$/.*/; 99 return qr/^$text$/, \@ords; 100} 101 102sub _paramify { 103 my ( $text, $ordref ) = @_; 104 my $out = '(.*?)'; 105 my @choices = split( /\\,/, $text ); 106 107 if ( $choices[0] =~ /^(?:\w+|\\#|\\\*)$/ ) { 108 109 # Do away with the function name 110 shift @choices unless $choices[0] =~ /^_(?:\d+|\\\*)$/; 111 112 # Build an alternate regex to weed out vars 113 $out .= '(?:' . join( 114 '|', 115 sort { 116 length($b) <=> length($a) # longest first 117 } map { 118 /^_(?:(\d+)|\\\*)$/ 119 ? do { 120 push @{$ordref}, ( $1 - 1 ) if defined $1; 121 ''; 122 } 123 : $_ # turn _1, _2, _*... into '' 124 } @choices 125 ) . ')'; 126 127 $out =~ s/\Q(?:)\E$//; 128 } 129 130 return $out; 131} 132 1331; 134 135=head1 NAME 136 137Locale::Maketext::Fuzzy - Maketext from already interpolated strings 138 139=head1 SYNOPSIS 140 141 package MyApp::L10N; 142 use base 'Locale::Maketext::Fuzzy'; # instead of Locale::Maketext 143 144 package MyApp::L10N::de; 145 use base 'MyApp::L10N'; 146 our %Lexicon = ( 147 # Exact match should always be preferred if possible 148 "0 camels were released." 149 => "Exact match", 150 151 # Fuzzy match candidate 152 "[quant,_1,camel was,camels were] released." 153 => "[quant,_1,Kamel wurde,Kamele wurden] freigegeben.", 154 155 # This could also match fuzzily, but is less preferred 156 "[_2] released[_1]" 157 => "[_1][_2] ist frei[_1]", 158 ); 159 160 package main; 161 my $lh = MyApp::L10N->get_handle('de'); 162 163 # All ->maketext calls below will become ->maketext_fuzzy instead 164 $lh->override_maketext(1); 165 166 # This prints "Exact match" 167 print $lh->maketext('0 camels were released.'); 168 169 # "1 Kamel wurde freigegeben." -- quant() gets 1 170 print $lh->maketext('1 camel was released.'); 171 172 # "2 Kamele wurden freigegeben." -- quant() gets 2 173 print $lh->maketext('2 camels were released.'); 174 175 # "3 Kamele wurden freigegeben." -- parameters are ignored 176 print $lh->maketext('3 released.'); 177 178 # "4 Kamele wurden freigegeben." -- normal usage 179 print $lh->maketext('[*,_1,camel was,camels were] released.', 4); 180 181 # "!Perl ist frei!" -- matches the broader one 182 # Note that the sequence ([_2] before [_1]) is preserved 183 print $lh->maketext('Perl released!'); 184 185=head1 DESCRIPTION 186 187This module is a subclass of C<Locale::Maketext>, with additional 188support for localizing messages that already contains interpolated 189variables. 190 191This is most useful when the messages are returned by external sources 192-- for example, to match C<dir: command not found> against 193C<[_1]: command not found>. 194 195Of course, this module is also useful if you're simply too lazy 196to use the 197 198 $lh->maketext("[quant,_1,file,files] deleted.", $count); 199 200syntax, but wish to write 201 202 $lh->maketext_fuzzy("$count files deleted"); 203 204instead, and have the correct plural form figured out automatically. 205 206If C<maketext_fuzzy> seems too long to type for you, this module 207also provides a C<override_maketext> method to turn I<all> C<maketext> 208calls into C<maketext_fuzzy> calls. 209 210=head1 METHODS 211 212=head2 $lh->maketext_fuzzy(I<key>[, I<parameters...>]); 213 214That method takes exactly the same arguments as the C<maketext> method 215of C<Locale::Maketext>. 216 217If I<key> is found in lexicons, it is applied in the same way as 218C<maketext>. Otherwise, it looks at all lexicon entries that could 219possibly yield I<key>, by turning C<[...]> sequences into C<(.*?)> and 220match the resulting regular expression against I<key>. 221 222Once it finds all candidate entries, the longest one replaces the 223I<key> for the real C<maketext> call. Variables matched by its bracket 224sequences (C<$1>, C<$2>...) are placed before I<parameters>; the order 225of variables in the matched entry are correctly preserved. 226 227For example, if the matched entry in C<%Lexicon> is C<Test [_1]>, 228this call: 229 230 $fh->maketext_fuzzy("Test string", "param"); 231 232is equivalent to this: 233 234 $fh->maketext("Test [_1]", "string", "param"); 235 236However, most of the time you won't need to supply I<parameters> to 237a C<maketext_fuzzy> call, since all parameters are already interpolated 238into the string. 239 240=head2 $lh->override_maketext([I<flag>]); 241 242If I<flag> is true, this accessor method turns C<$lh-E<gt>maketext> 243into an alias for C<$lh-E<gt>maketext_fuzzy>, so all consecutive 244C<maketext> calls in the C<$lh>'s packages are automatically fuzzy. 245A false I<flag> restores the original behaviour. If the flag is not 246specified, returns the current status of override; the default is 2470 (no overriding). 248 249Note that this call only modifies the symbol table of the I<language 250class> that C<$lh> belongs to, so other languages are not affected. 251If you want to override all language handles in a certain application, 252try this: 253 254 MyApp::L10N->override_maketext(1); 255 256=head1 CAVEATS 257 258=over 4 259 260=item * 261 262The "longer is better" heuristic to determine the best match is 263reasonably good, but could certainly be improved. 264 265=item * 266 267Currently, C<"[quant,_1,file] deleted"> won't match C<"3 files deleted">; 268you'll have to write C<"[quant,_1,file,files] deleted"> instead, or 269simply use C<"[_1] file deleted"> as the lexicon key and put the correct 270plural form handling into the corresponding value. 271 272=item * 273 274When used in combination with C<Locale::Maketext::Lexicon>'s C<Tie> 275backend, all keys would be iterated over each time a fuzzy match is 276performed, and may cause serious speed penalty. Patches welcome. 277 278=back 279 280=head1 SEE ALSO 281 282L<Locale::Maketext>, L<Locale::Maketext::Lexicon> 283 284=head1 HISTORY 285 286This particular module was written to facilitate an I<auto-extraction> 287layer for Slashcode's I<Template Toolkit> provider, based on 288C<HTML::Parser> and C<Template::Parser>. It would work like this: 289 290 Input | <B>from the [% story.dept %] dept.</B> 291 Output| <B>[%|loc( story.dept )%]from the [_1] dept.[%END%]</B> 292 293Now, this layer suffers from the same linguistic problems as an 294ordinary C<Msgcat> or C<Gettext> framework does -- what if we want 295to make ordinals from C<[% story.dept %]> (i.e. C<from the 3rd dept.>), 296or expand the C<dept.> to C<department> / C<departments>? 297 298The same problem occurred in RT's web interface, where it had to 299localize messages returned by external modules, which may already 300contain interpolated variables, e.g. C<"Successfully deleted 7 301ticket(s) in 'c:\temp'.">. 302 303Since I didn't have the time to refactor C<DBI> and C<DBI::SearchBuilder>, 304I devised a C<loc_match> method to pre-process their messages into one 305of the I<candidate strings>, then applied the matched string to C<maketext>. 306 307Afterwards, I realized that instead of preparing a set of candidate 308strings, I could actually match against the original I<lexicon file> 309(i.e. PO files via C<Locale::Maketext::Lexicon>). This is how 310C<Locale::Maketext::Fuzzy> was born. 311 312=head1 AUTHORS 313 314Audrey Tang E<lt>cpan@audreyt.orgE<gt> 315 316=head1 CC0 1.0 Universal 317 318To the extent possible under law, 唐鳳 has waived all copyright and related 319or neighboring rights to Locale-Maketext-Fuzzy. 320 321This work is published from Taiwan. 322 323L<http://creativecommons.org/publicdomain/zero/1.0> 324 325=cut 326