# -*- coding: utf-8 -*-

require 5;
use 5.8.0;
package Text::Unidecode;
our $Last_Modified = ' Time-stamp: "2016-11-26 05:01:56 MST"';
use utf8;
use strict;
use warnings;
use integer;    # vroom vroom!

our $VERSION = '1.30';

# Package globals.  They stay package variables (not lexicals) because the
# bank modules Text::Unidecode::xNN are loaded on demand and code outside
# this file may reach them by their fully qualified names.
our (
    @ISA,                   # inheritance list (Exporter)
    @EXPORT,                # default exports
    @Char,                  # $Char[$bank] = arrayref of replacement strings
    $UNKNOWN,               # replacement used for characters we cannot map
    $NULLMAP,               # shared all-$UNKNOWN table for unloadable banks
    $TABLE_SIZE,            # expected number of entries per bank table
    $Note_Broken_Tables,    # if true, t() records details of odd-sized tables
    %Broken_Table_Size,     # bank number => size the table had when loaded
    %Broken_Table_Copy,     # bank number => copy of the table as loaded
);

require Exporter;
@ISA    = ('Exporter');
@EXPORT = ('unidecode');

$Note_Broken_Tables = 0;
BEGIN { *DEBUG = sub () {0} unless defined &DEBUG }
$UNKNOWN    = '[?] ';
$TABLE_SIZE = 256;
$NULLMAP    = [ ($UNKNOWN) x $TABLE_SIZE ];    # for blocks we can't load

#--------------------------------------------------------------------------
# Load-time self-test: every 7-bit character must come back unchanged.
{
    # Build the string from code points.  The string range "\x00" .. "\x7F"
    # yields only its first element (see "Range Operators" in perlop), so it
    # would exercise a single character instead of all 128.
    my $ascii = join '', map { chr } 0x00 .. 0x7F;
    die "the 7-bit purity test fails!" unless $ascii eq unidecode($ascii);
}

#--------------------------------------------------------------------------

# unidecode(LIST)
#
# Transliterates every non-ASCII character of each defined item in LIST
# into its US-ASCII replacement text.
#
#   list context   -- returns transliterated copies of the items;
#   scalar context -- returns the single transliterated copy, or, when
#                     several items were passed, their concatenation;
#   void context   -- alters the caller's variables IN PLACE.
#
# Undefined items are left alone.  With no arguments at all, returns the
# empty list (list context) or the empty string (otherwise).
sub unidecode {

    unless (@_) {    # Sanity: Nothing coming in!
        return () if wantarray;
        return '';
    }

    # In list or scalar context, make @_'s items no longer be aliases of the
    # caller's variables.  In void context we deliberately leave the aliases
    # alone, so that the substitution below edits the caller's data.
    @_ = map { $_ } @_ if defined wantarray;

    foreach my $n (@_) {
        next unless defined $n;

        # Shut up potentially fatal warnings about UTF-16 surrogate
        # characters when running under perl -w
        # This is per https://rt.cpan.org/Ticket/Display.html?id=97456
        no warnings 'utf8';

        # The warp core:
        #   Replace character 0xABCD with $Char[0xAB][0xCD], loading
        #   the table 0xAB as needed.
        #
        # Spelled out as pseudocode, for each non-ASCII character:
        #
        #   $charnum          = ord($character);
        #   $charnum_lowbits  = $charnum & 255;
        #   $charnum_highbits = $charnum >> 8;
        #   $table_ref        = $Char[$charnum_highbits]
        #                       || t($character);  # loads that bank, and
        #                                          # stores it in @Char
        #   $replacement      = ${ $table_ref }[ $charnum_lowbits ];
        #                       # same as $table_ref->[ $charnum_lowbits ]
        #
        # (Why s~x~y~ instead of s/x/y/ ?  It is all the same to Perl; the
        # other delimiter is just kinder to syntax highlighters when the
        # insides are as gory as they are here.)
        $n =~ s~([^\x00-\x7f])~${$Char[ord($1)>>8]||t($1)}[ord($1)&255]~egs;
    }

    return unless defined wantarray;    # void context
    return @_ if wantarray;    # normal list context -- return the copies

    # Else normal scalar context:
    return $_[0] if @_ == 1;

    # Rarer fallthru: a list in, but a scalar out.  Undefined items
    # contribute nothing, and do so without an "uninitialized" warning.
    return join '', map { defined $_ ? $_ : '' } @_;
}

#======================================================================

# Returns a fresh table of $TABLE_SIZE copies of $UNKNOWN.
sub make_placeholder_map {
    return [ ($UNKNOWN) x $TABLE_SIZE ];
}

# Returns a fresh table of $TABLE_SIZE empty strings.
sub make_placeholder_map_nulls {
    return [ ("") x $TABLE_SIZE ];
}

#======================================================================

# t(CHARACTER)  -- "t" is for "t"able.
#
# Loads (and returns) the char table for the bank that CHARACTER is in.
# The result is stored in @Char, so this should get called only once per
# table per session.  A bank that cannot be loaded gets the shared
# $NULLMAP.  A loaded table is normalized to exactly $TABLE_SIZE defined
# entries before it is returned.
sub t {
    my $bank = ord( $_[0] ) >> 8;
    return $Char[$bank] if $Char[$bank];

    load_bank($bank);

    # Now see how that fared...
    if ( ref( $Char[$bank] || '' ) ne 'ARRAY' ) {
        DEBUG > 1
          and print
          " Loading failed for bank $bank (err $@).  Using null map.\n";
        return $Char[$bank] = $NULLMAP;
    }

    DEBUG > 1 and print " Loading succeeded.\n";
    my $cb = $Char[$bank];

    # Sanity-check its size:
    if ( @$cb != $TABLE_SIZE ) {
        if ($Note_Broken_Tables) {
            $Broken_Table_Size{$bank} = scalar @$cb;
            $Broken_Table_Copy{$bank} = [@$cb];
        }

        if ( @$cb > $TABLE_SIZE ) {
            DEBUG
              and print "Bank $bank is too large-- it has ", scalar @$cb,
              " entries in it.  Pruning.\n";

            # That two-argument form splices everything off into nowhere,
            # starting with the first overage character.
            splice @$cb, $TABLE_SIZE;
        }
        else {
            DEBUG
              and print "Bank $bank is too small-- it has ", scalar @$cb,
              " entries in it.  Now padding it.\n";
            DEBUG and @$cb == 0 and print "  (Yes, ZERO entries!)\n";

            # i.e., however many items, times the deficit
            push @$cb, ($UNKNOWN) x ( $TABLE_SIZE - @$cb );
        }
    }

    # Check for undefness in block:
    for my $i ( 0 .. $TABLE_SIZE - 1 ) {
        next if defined $cb->[$i];
        DEBUG
          and printf "Undef at position %d in block x%02x\n", $i, $bank;
        $cb->[$i] = '';
    }

    return $cb;
}

#-----------------------------------------------------------------------

# Set to 1 by load_bank() when the bank module loaded, 0 otherwise.
our $eval_loaded_okay;

# load_bank(BANKNUM)
#
# Tries to load the module <this package>::xNN, where NN is BANKNUM in
# (at least two-digit) lowercase hex.  Returns 1 on success and 0 on
# failure; on failure the reason is left in $@.  Nothing in this sub
# touches @Char directly -- t() inspects it after calling us.
sub load_bank {
    my ($banknum) = @_;    # just as an integer value

    DEBUG
      and printf "# Eval-loading %s::x%02x ...\n", __PACKAGE__, $banknum;

    # The file that "require <this package>::xNN" would look for.
    ( my $path = __PACKAGE__ ) =~ s{::}{/}g;
    $path = sprintf '%s/x%02x.pm', $path, $banknum;

    $eval_loaded_okay = 0;
    {
        # Keep any caller-installed die handler from firing merely
        # because a bank does not exist.
        local $SIG{'__DIE__'};
        eval {
            require $path;
            $eval_loaded_okay = 1;
        };
    }

    return $eval_loaded_okay ? 1 : 0;
}

#======================================================================

1;
__END__

=encoding utf8

=head1 NAME

Text::Unidecode -- plain ASCII transliterations of Unicode text

=head1 SYNOPSIS

  use utf8;
  use Text::Unidecode;
  print unidecode(
    "北亰\n"
    # Chinese characters for Beijing (U+5317 U+4EB0)
  );

  # That prints: Bei Jing

=head1 DESCRIPTION

It often happens that you have non-Roman text data in Unicode, but
you can't display it-- usually because you're trying to
show it to a user via an application that doesn't support Unicode,
or because the fonts you need aren't accessible.  You could
represent the Unicode characters as "???????" or
"\15BA\15A0\1610...", but that's nearly useless to the user who
actually wants to read what the text says.
257 258What Text::Unidecode provides is a function, C<unidecode(...)> that 259takes Unicode data and tries to represent it in US-ASCII characters 260(i.e., the universally displayable characters between 0x00 and 2610x7F). The representation is 262almost always an attempt at I<transliteration>-- i.e., conveying, 263in Roman letters, the pronunciation expressed by the text in 264some other writing system. (See the example in the synopsis.) 265 266 267NOTE: 268 269To make sure your perldoc/Pod viewing setup for viewing this page is 270working: The six-letter word "résumé" should look like "resume" with 271an "/" accent on each "e". 272 273For further tests, and help if that doesn't work, see below, 274L</A POD ENCODING TEST>. 275 276 277=head1 DESIGN PHILOSOPHY 278 279Unidecode's ability to transliterate from a given language is limited 280by two factors: 281 282=over 283 284=item * The amount and quality of data in the written form of the 285original language 286 287So if you have Hebrew data 288that has no vowel points in it, then Unidecode cannot guess what 289vowels should appear in a pronunciation. 290S f y hv n vwls n th npt, y wn't gt ny vwls 291n th tpt. (This is a specific application of the general principle 292of "Garbage In, Garbage Out".) 293 294=item * Basic limitations in the Unidecode design 295 296Writing a real and clever transliteration algorithm for any single 297language usually requires a lot of time, and at least a passable 298knowledge of the language involved. But Unicode text can convey 299more languages than I could possibly learn (much less create a 300transliterator for) in the entire rest of my lifetime. So I put 301a cap on how intelligent Unidecode could be, by insisting that 302it support only context-I<in>sensitive transliteration. That means 303missing the finer details of any given writing system, 304while still hopefully being useful. 305 306=back 307 308Unidecode, in other words, is quick and 309dirty. 
Sometimes the output is not so dirty at all: 310Russian and Greek seem to work passably; and 311while Thaana (Divehi, AKA Maldivian) is a definitely non-Western 312writing system, setting up a mapping from it to Roman letters 313seems to work pretty well. But sometimes the output is I<very 314dirty:> Unidecode does quite badly on Japanese and Thai. 315 316If you want a smarter transliteration for a particular language 317than Unidecode provides, then you should look for (or write) 318a transliteration algorithm specific to that language, and apply 319it instead of (or at least before) applying Unidecode. 320 321In other words, Unidecode's 322approach is broad (knowing about dozens of writing systems), but 323shallow (not being meticulous about any of them). 324 325=head1 FUNCTIONS 326 327Text::Unidecode provides one function, C<unidecode(...)>, which 328is exported by default. It can be used in a variety of calling contexts: 329 330=over 331 332=item C<$out = unidecode( $in );> # scalar context 333 334This returns a copy of $in, transliterated. 335 336=item C<$out = unidecode( @in );> # scalar context 337 338This is the same as C<$out = unidecode(join "", @in);> 339 340=item C<@out = unidecode( @in );> # list context 341 342This returns a list consisting of copies of @in, each transliterated. This 343is the same as C<@out = map scalar(unidecode($_)), @in;> 344 345=item C<unidecode( @items );> # void context 346 347=item C<unidecode( @bar, $foo, @baz );> # void context 348 349Each item on input is replaced with its transliteration. This 350is the same as C<for(@bar, $foo, @baz) { $_ = unidecode($_) }> 351 352=back 353 354You should make a minimum of assumptions about the output of 355C<unidecode(...)>. 
For example, if you assume an all-alphabetic 356(Unicode) string passed to C<unidecode(...)> will return an all-alphabetic 357string, you're wrong-- some alphabetic Unicode characters are 358transliterated as strings containing punctuation (e.g., the 359Armenian letter "Թ" (U+0539), currently transliterates as "T`" 360(capital-T then a backtick)). 361 362However, these are the assumptions you I<can> make: 363 364=over 365 366=item * 367 368Each character 0x0000 - 0x007F transliterates as itself. That is, 369C<unidecode(...)> is 7-bit pure. 370 371=item * 372 373The output of C<unidecode(...)> always consists entirely of US-ASCII 374characters-- i.e., characters 0x0000 - 0x007F. 375 376=item * 377 378All Unicode characters translate to a sequence of (any number of) 379characters that are newline ("\n") or in the range 0x0020-0x007E. That 380is, no Unicode character translates to "\x01", for example. (Although if 381you have a "\x01" on input, you'll get a "\x01" in output.) 382 383=item * 384 385Yes, some transliterations produce a "\n" but it's just a few, and 386only with good reason. Note that the value of newline ("\n") varies 387from platform to platform-- see L<perlport>. 388 389=item * 390 391Some Unicode characters may transliterate to nothing (i.e., empty string). 392 393=item * 394 395Very many Unicode characters transliterate to multi-character sequences. 396E.g., Unihan character U+5317, "北", transliterates as the four-character string 397"Bei ". 398 399=item * 400 401Within these constraints, I<I may change> the transliteration of characters 402in future versions. For example, if someone convinces me 403that the Armenian letter "Թ", currently transliterated as "T`", would 404be better transliterated as "D", I I<may> well make that change. 405 406=item * 407 408Unfortunately, there are many characters that Unidecode doesn't know a 409transliteration for. 
This is generally because the character has been 410added since I last revised the Unidecode data tables. I'm I<always> 411catching up! 412 413=back 414 415=head1 DESIGN GOALS AND CONSTRAINTS 416 417Text::Unidecode is meant to be a transliterator of last resort, 418to be used once you've decided that you can't just display the 419Unicode data as is, I<and once you've decided you don't have a 420more clever, language-specific transliterator available,> or once 421you've I<already applied> smarter algorithms or mappings that you prefer 422and you now just want Unidecode to do cleanup. 423 424Unidecode 425transliterates context-insensitively-- that is, a given character is 426replaced with the same US-ASCII (7-bit ASCII) character or characters, 427no matter what the surrounding characters are. 428 429The main reason I'm making Text::Unidecode work with only 430context-insensitive substitution is that it's fast, dumb, and 431straightforward enough to be feasible. It doesn't tax my 432(quite limited) knowledge of world languages. It doesn't require 433me writing a hundred lines of code to get the Thai syllabification 434right (and never knowing whether I've gotten it wrong, because I 435don't know Thai), or spending a year trying to get Text::Unidecode 436to use the ChaSen algorithm for Japanese, or trying to write heuristics 437for telling the difference between Japanese, Chinese, or Korean, so 438it knows how to transliterate any given Uni-Han glyph. And 439moreover, context-insensitive substitution is still mostly useful, 440but still clearly couldn't be mistaken for authoritative. 441 442Text::Unidecode is an example of the 80/20 rule in 443action-- you get 80% of the usefulness using just 20% of a 444"real" solution. 
445 446A "real" approach to transliteration for any given language can 447involve such increasingly tricky contextual factors as these: 448 449=over 450 451=item The previous / preceding character(s) 452 453What a given symbol "X" means, could 454depend on whether it's followed by a consonant, or by vowel, or 455by some diacritic character. 456 457=item Syllables 458 459A character "X" at end of a syllable could mean something 460different from when it's at the start-- which is especially problematic 461when the language involved doesn't explicitly mark where one syllable 462stops and the next starts. 463 464=item Parts of speech 465 466What "X" sounds like at the end of a word, 467depends on whether that word is a noun, or a verb, or what. 468 469=item Meaning 470 471By semantic context, you can tell that this ideogram "X" means "shoe" 472(pronounced one way) and not "time" (pronounced another), 473and that's how you know to transliterate it one way instead of the other. 474 475=item Origin of the word 476 477"X" means one thing in loanwords and/or placenames (and 478derivatives thereof), and another in native words. 479 480=item "It's just that way" 481 482"X" normally makes 483the /X/ sound, except for this list of seventy exceptions (and words based 484on them, sometimes indirectly). Or: you never can tell which of the three 485ways to pronounce "X" this word actually uses; you just have to know 486which it is, so keep a dictionary on hand! 487 488=item Language 489 490The character "X" is actually used in several different languages, and you 491have to figure out which you're looking at before you can determine how 492to transliterate it. 493 494=back 495 496Out of a desire to avoid being mired in I<any> of these kinds of 497contextual factors, I chose to exclude I<all of them> and just stick 498with context-insensitive replacement. 
499 500 501=head1 A POD ENCODING TEST 502 503=over 504 505=item * 506 507"Brontë" is six characters that should look like "Bronte", but 508with double-dots on the "e" character. 509 510=item * 511 512"Résumé" is six characters that should look like "Resume", but 513with /-shaped accents on the "e" characters. 514 515=item * 516 517"læti" should be I<four> letters long-- the second letter should not 518be two letters "ae", but should be a single letter that 519looks like an "a" entirely fused with an "e". 520 521=item * 522 523"χρονος" is six Greek characters that should look kind of like: xpovoc 524 525=item * 526 527"КАК ВАС ЗОВУТ" is three short Russian words that should look a 528lot like: KAK BAC 3OBYT 529 530=item * 531 532"ടധ" is two Malayalam characters that should look like: sw 533 534=item * 535 536"丫二十一" is four Chinese characters that should look like: C<Y=+-> 537 538=item * 539 540"Hello" is five characters that should look like: Hello 541 542=back 543 544If all of those come out right, your Pod viewing setup is working 545fine-- welcome to the 2010s! If those are full of garbage characters, 546consider viewing this page as HTML at 547L<https://metacpan.org/pod/Text::Unidecode> 548or 549L<http://search.cpan.org/perldoc?Text::Unidecode> 550 551 552If things look mostly okay, but the Malayalam and/or the Chinese are 553just question-marks or empty boxes, it's probably just that your 554computer lacks the fonts for those. 555 556=head1 TODO 557 558Lots: 559 560* Rebuild the Unihan database. (Talk about hitting a moving target!) 561 562* Add tone-numbers for Mandarin hanzi? Namely: In Unihan, when tone 563marks are present (like in "kMandarin: dào"), should I continue to 564transliterate as just "Dao", or should I put in the tone number: 565"Dao4"? It would be pretty jarring to have digits appear where 566previously there was just alphabetic stuff-- But tone numbers 567make Chinese more readable. 
568(I have a clever idea about doing this, for Unidecode v2 or v3.) 569 570* Start dealing with characters over U+FFFF. Cuneiform! Emojis! Whatever! 571 572* Fill in all the little characters that have crept into the Misc Symbols 573Etc blocks. 574 575* More things that need tending to are detailed in the TODO.txt file, 576included in this distribution. Normal installs probably don't leave 577the TODO.txt lying around, but if nothing else, you can see it at 578L<http://search.cpan.org/search?dist=Text::Unidecode> 579 580=head1 MOTTO 581 582The Text::Unidecode motto is: 583 584 It's better than nothing! 585 586...in I<both> meanings: 1) seeing the output of C<unidecode(...)> is 587better than just having all font-unavailable Unicode characters 588replaced with "?"'s, or rendered as gibberish; and 2) it's the 589worst, i.e., there's nothing that Text::Unidecode's algorithm is 590better than. All sensible transliteration algorithms (like for 591German, see below) are going to be smarter than Unidecode's. 592 593=head1 WHEN YOU DON'T LIKE WHAT UNIDECODE DOES 594 595I will repeat the above, because some people miss it: 596 597Text::Unidecode is meant to be a transliterator of I<last resort,> 598to be used once you've decided that you can't just display the 599Unicode data as is, I<and once you've decided you don't have a 600more clever, language-specific transliterator available>-- or once 601you've I<already applied> a smarter algorithm and now just want Unidecode 602to do cleanup. 603 604In other words, when you don't like what Unidecode does, I<do it 605yourself.> Really, that's what the above says. Here's how 606you would do this for German, for example: 607 608In German, there's the typographical convention that an umlaut (the 609double-dots on: ä ö ü) can be written as an "-e", like with "Schön" 610becoming "Schoen". But Unidecode doesn't do that-- I have Unidecode 611simply drop the umlaut accent and give back "Schon". 
612 613(I chose this not because I'm a big meanie, but because 614I<generally> changing "ü" to "ue" is disastrous for all text 615that's I<not in German>. Finnish "Hyvää päivää" would turn 616into "Hyvaeae paeivaeae". And I discourage you from being I<yet 617another> German who emails me, trying to impel me to consider 618a typographical nicety of German to be more important than 619I<all other languages>.) 620 621If you know that the text you're handling is probably in German, and 622you want to apply the "umlaut becomes -e" rule, here's how to do it 623for yourself (and then use Unidecode as I<the fallback> afterwards): 624 625 use utf8; # <-- probably necessary. 626 627 our( %German_Characters ) = qw( 628 Ä AE ä ae 629 Ö OE ö oe 630 Ü UE ü ue 631 ß ss 632 ); 633 634 use Text::Unidecode qw(unidecode); 635 636 sub german_to_ascii { 637 my($german_text) = @_; 638 639 $german_text =~ 640 s/([ÄäÖöÜüß])/$German_Characters{$1}/g; 641 642 # And now, as a *fallthrough*: 643 $german_text = unidecode( $german_text ); 644 return $german_text; 645 } 646 647To pick another example, here's something that's not about a 648specific language, but simply having a preference that may or 649may not agree with Unidecode's (i.e., mine). Consider the "¥" 650symbol. Unidecode changes that to "Y=". If you want "¥" as 651"YEN", then... 652 653 use Text::Unidecode qw(unidecode); 654 655 sub my_favorite_unidecode { 656 my($text) = @_; 657 658 $text =~ s/¥/YEN/g; 659 660 # ...and anything else you like, such as: 661 $text =~ s/€/Euro/g; 662 663 # And then, as a fallback,... 664 $text = unidecode($text); 665 666 return $text; 667 } 668 669Then if you do: 670 671 print my_favorite_unidecode("You just won ¥250,000 and €40,000!!!"); 672 673...you'll get: 674 675 You just won YEN250,000 and Euro40,000!!! 676 677...just as you like it. 678 679(By the way, the reason I<I> don't have Unidecode just turn "¥" into "YEN" 680is that the same symbol also stands for yuan, the Chinese 681currency. 
A "Y=" is nicely, I<safely> neutral as to whether 682we're talking about yen or yuan-- Japan, or China.) 683 684Another example: for hanzi/kanji/hanja, I have designed 685Unidecode to transliterate according to the value that that 686character has in Mandarin (otherwise Cantonese,...). Some 687users have complained that applying Unidecode to Japanese 688produces gibberish. 689 690To make a long story short: transliterating from Japanese is 691I<difficult> and it requires a I<lot> of context-sensitivity. 692If you have text that you're fairly sure is in 693Japanese, you're going to have to use a Japanese-specific 694algorithm to transliterate Japanese into ASCII. (And then 695you can call Unidecode on the output from that-- it is useful 696for, for example, turning fullwidth characters into 697their normal (ASCII) forms.) 698 699(Note, as of August 2016: I have titanic but tentative plans for 700making the value of Unihan characters be something you could set 701parameters for at runtime, in changing the order of "Mandarin else 702Cantonese else..." in the value retrieval. Currently that preference 703list is hardwired on my end, at module-build time. Other options I'm 704considering allowing for: whether the Mandarin and Cantonese values 705should have the tone numbers on them; whether every Unihan value 706should have a terminal space; and maybe other clever stuff I haven't 707thought of yet.) 708 709 710=head1 CAVEATS 711 712If you get really implausible nonsense out of C<unidecode(...)>, make 713sure that the input data really is a utf8 string. See 714L<perlunicode> and L<perlunitut>. 715 716I<Unidecode will work disastrously badly on Japanese.> That's because 717Japanese is very very hard. 
To extend the Unidecode motto, 718Unidecode is better than nothing, and with Japanese, I<just barely!> 719 720On pure Mandarin, Unidecode will frequently give odd values-- 721that's because a single hanzi can have several readings, and Unidecode 722only knows what the Unihan database says is the most common one. 723 724 725=head1 THANKS 726 727Thanks to (in only the sloppiest of sorta-chronological order): 728Jordan Lachler, Harald Tveit Alvestrand, Melissa Axelrod, 729Abhijit Menon-Sen, Mark-Jason Dominus, Joe Johnston, 730Conrad Heiney, fileformat.info, 731Philip Newton, 唐鳳, Tomaž Šolc, Mike Doherty, JT Smith and the 732MadMongers, Arden Ogg, Craig Copris, 733David Cusimano, Brendan Byrd, Hex Martin, 734and 735I<many> 736other pals who have helped with the ideas or values for Unidecode's 737transliterations, or whose help has been in the 738secret F5 tornado that constitutes the internals of Unidecode's 739implementation. 740 741And thank you to the many people who have encouraged me to plug away 742at this project. A decade went by before I had any idea that more 743than about 4 or 5 people were using or getting any value 744out of Unidecode. I am told that actually 745my figure was missing some zeroes on the end! 746 747 748=head1 PORTS 749 750Some wonderful people have ported Unidecode to other languages! 751 752=over 753 754=item * 755 756Python: L<https://pypi.python.org/pypi/Unidecode> 757 758=item * 759 760PHP: L<https://github.com/silverstripe-labs/silverstripe-unidecode> 761 762=item * 763 764Ruby: L<http://www.rubydoc.info/gems/unidecode/1.0.0/frames> 765 766=item * 767 768JavaScript: L<https://www.npmjs.org/package/unidecode> 769 770=item * 771 772Java: L<https://github.com/xuender/unidecode> 773 774=back 775 776I can't vouch for the details of each port, but these are clever 777people, so I'm sure they did a fine job. 
778 779 780=head1 SEE ALSO 781 782An article I wrote for I<The Perl Journal> about 783Unidecode: L<http://interglacial.com/tpj/22/> 784(B<READ IT!>) 785 786Jukka Korpela's L<http://www.cs.tut.fi/~jkorpela/fui.html8> which is 787brilliantly useful, and its code is brilliant (so, view source!). I 788was I<kinda> thinking about maybe doing something I<sort of> like that 789for the v2.x versions of Unidecode-- but now he's got me convinced that 790I should go right ahead. 791 792Tom Christiansen's 793I<Perl Unicode Cookbook>, 794L<http://www.perl.com/pub/2012/04/perlunicook-standard-preamble.html> 795 796Unicode Consortium: L<http://www.unicode.org/> 797 798Searchable Unihan database: 799L<http://www.unicode.org/cgi-bin/GetUnihanData.pl> 800 801Geoffrey Sampson. 1990. I<Writing Systems: A Linguistic Introduction.> 802ISBN: 0804717567 803 804Randall K. Barry (editor). 1997. I<ALA-LC Romanization Tables: 805Transliteration Schemes for Non-Roman Scripts.> 806ISBN: 0844409405 807[ALA is the American Library Association; LC is the Library of 808Congress.] 809 810Rupert Snell. 2000. I<Beginner's Hindi Script (Teach Yourself 811Books).> ISBN: 0658009109 812 813=head1 LICENSE 814 815Copyright (c) 2001, 2014, 2015, 2016 Sean M. Burke. 816 817Unidecode is distributed under the Perl Artistic License 818( L<perlartistic> ), namely: 819 820This library is free software; you can redistribute it and/or modify 821it under the same terms as Perl itself. 822 823This program is distributed in the hope that it will be useful, but 824without any warranty; without even the implied warranty of 825merchantability or fitness for a particular purpose. 826 827=head1 DISCLAIMER 828 829Much of Text::Unidecode's internal data is based on data from The 830Unicode Consortium, with which I am unaffiliated. A good deal of the 831internal data comes from suggestions that have been contributed by 832people other than myself. 
833 834The views and conclusions contained in my software and documentation 835are my own-- they should not be interpreted as representing official 836policies, either expressed or implied, of The Unicode Consortium; nor 837should they be interpreted as necessarily the views or conclusions of 838people who have contributed to this project. 839 840Moreover, I discourage you from inferring that choices that I've made 841in Unidecode reflect political or linguistic prejudices on my 842part. Just because Unidecode doesn't do great on your language, 843or just because it might seem to do better on some other 844language, please don't think I'm out to get you! 845 846=head1 AUTHOR 847 848Your pal, Sean M. Burke C<sburke@cpan.org> 849 850=head1 O HAI! 851 852If you're using Unidecode for anything interesting, be cool and 853email me, I'm always curious what people use this for. (The 854answers so far have surprised me!) 855 856=cut 857 858#################### SCOOBIE SNACK #################### 859 860Lest there be any REMAINING doubt that the Unicode Consortium has 861a sense of humor, the CDROM that comes with /The Unicode Standard, 862Version 3.0/ book, has an audio track of the Unicode anthem [!]. 863The lyrics are: 864 865 Unicode, Oh Unicode! 866 -------------------- 867 868 Oh, beautiful for Uni-Han, 869 for spacious User Zone! 870 For rampant scripts of India 871 and polar Nunavut! 872 873 Chorus: 874 Unicode, Oh Unicode! 875 May all your code points shine forever 876 and your beacon light the world! 877 878 Oh, marvelous for sixteen bits, 879 for precious surrogates! 880 For Bi-Di algorithm dear 881 and stalwart I-P-A! 882 883 Oh, glorious for Hangul fair, 884 for symbols mathematical! 885 For myriad exotic scripts 886 and punctuation we adore! 887 888# End. 889