1;;;;# -*-coding:utf-8;-*-                                               µ ← col73
2
# Minimum interpreter requirements: Perl 5, and specifically 5.8.0 or later.
require 5;
use 5.8.0;
package Text::Unidecode;
# Note: this assignment appears before "use strict" below; the variable is
#  also listed in the "use vars" declaration further down.
$Last_Modified =' Time-stamp: "2016-11-26 05:01:56 MST"';
use utf8;
use strict;
use integer; # vroom vroom!
# Package globals.  @Char holds the per-bank tables, indexed by bank number
#  (character code >> 8), filled in on demand by t().
use vars qw($VERSION @ISA @EXPORT @Char $UNKNOWN $NULLMAP $TABLE_SIZE $Last_Modified
   $Note_Broken_Tables %Broken_Table_Size %Broken_Table_Copy
);
$VERSION = '1.30';
require Exporter;
@ISA = ('Exporter');
@EXPORT = ('unidecode');   # unidecode() is exported by default
# When true, t() records the size and a copy of every wrongly-sized table
#  in %Broken_Table_Size and %Broken_Table_Copy before repairing it.
$Note_Broken_Tables = 0;
# Define DEBUG as a constant 0 unless a DEBUG sub already exists by the
#  time this BEGIN block runs.
BEGIN { *DEBUG = sub () {0} unless defined &DEBUG }
$UNKNOWN = '[?] ';     # placeholder text for characters with no known value
$TABLE_SIZE = 256;     # number of entries expected in each bank's table
$NULLMAP = [( $UNKNOWN ) x $TABLE_SIZE];  # for blocks we can't load
22
23#--------------------------------------------------------------------------
{
  # Load-time self-check: every 7-bit character (0x00 through 0x7F) must
  #  come back from unidecode() unchanged.
  #
  # The test string is deliberately built with chr() over a NUMERIC range.
  #  A string range such as "\x00" .. "\x7F" is not treated as numeric, and
  #  because its left operand is not part of a magical-increment sequence
  #  it yields only its first element -- so the check would cover nothing
  #  but "\x00".
  my $x = join '', map { chr($_) } 0x00 .. 0x7F;
  die "the 7-bit purity test fails!" unless $x eq unidecode($x);
}
28
29#--------------------------------------------------------------------------
30
sub unidecode {
  # Transliterate every argument into US-ASCII.
  #
  # What happens to the result depends on the caller's context:
  #   void   -- the arguments themselves are rewritten IN PLACE;
  #   list   -- transliterated copies are returned, one per argument;
  #   scalar -- one transliterated copy is returned; if several arguments
  #             were given, their transliterations are joined together.
  # Undefined arguments are skipped and left undefined.

  my $context = wantarray;  # true = list, false-but-defined = scalar,
                            #  undef = void

  # Called with no arguments: give back "nothing" in a suitable shape.
  return $context ? () : '' unless @_;

  # The elements of @_ are aliases to the caller's own variables.  Unless
  #  we are in void context, replace them with private copies so that the
  #  substitution below cannot touch the caller's data.
  if( defined $context ) {
    my @copies = @_;
    @_ = @copies;
  }

  for my $text (@_) {
    defined $text or next;

    # Keep potentially fatal warnings about UTF-16 surrogate characters
    #  quiet when running under perl -w.
    # See https://rt.cpan.org/Ticket/Display.html?id=97456
    no warnings 'utf8';

    # For each non-ASCII character, split its code number into a bank
    #  (code >> 8) and an offset within the bank (code & 255), then
    #  substitute $Char[bank][offset].  If $Char[bank] is not populated
    #  yet, t() is called with the character; it loads the bank and
    #  returns the table's arrayref, which is then indexed the same way.
    #  ( ${ $ref }[ $i ] is simply another spelling of $ref->[ $i ]. )
    #
    # The s~x~y~ delimiters mean exactly the same as s/x/y/; they are
    #  used only because the innards here upset some editors' syntax
    #  highlighters when written with slashes.
    $text =~ s~([^\x00-\x7f])~${$Char[ord($1)>>8]||t($1)}[ord($1)&255]~egs;
  }

  return unless defined $context;   # void context: work was done in place
  return @_ if $context;            # list context: hand back the copies

  # Scalar context: a lone item is returned as-is; a list is joined.
  return @_ == 1 ? $_[0] : join('', @_);
}
115
116#======================================================================
117
sub make_placeholder_map {
  # Return a reference to a newly built array containing $TABLE_SIZE
  #  copies of the $UNKNOWN placeholder string.
  my @map = ( $UNKNOWN ) x $TABLE_SIZE;
  return \@map;
}
sub make_placeholder_map_nulls {
  # Return a reference to a newly built array containing $TABLE_SIZE
  #  empty strings.
  my @map = ( "" ) x $TABLE_SIZE;
  return \@map;
}
124
125#======================================================================
126
sub t {   # "t" is for "t"able.
  # Given a character in $_[0], return the arrayref stored in @Char for
  #  that character's bank (character code >> 8), loading the bank via
  #  load_bank() if nothing is stored there yet.
  #
  # A freshly loaded table is normalised before being returned: it is
  #  trimmed or padded (with $UNKNOWN) to exactly $TABLE_SIZE entries,
  #  and any undefined entries become empty strings.  If loading did not
  #  leave an arrayref in $Char[bank], the shared $NULLMAP is stored and
  #  returned instead.
  #
  # Intended to do real work only once per bank; later calls take the
  #  early return.

  my $bank = ord($_[0]) >> 8;
  my $cached = $Char[$bank];
  return $cached if $cached;

  load_bank($bank);

  # Now see how that fared...
  my $table = $Char[$bank];
  unless( ref($table || '') eq 'ARRAY' ) {
    DEBUG > 1 and print
      " Loading failed for bank $bank (err $@).  Using null map.\n";
    return $Char[$bank] = $NULLMAP;
  }

  DEBUG > 1 and print " Loading succeeded.\n";

  # Sanity-check the number of entries.
  my $have = scalar @$table;
  if($have != $TABLE_SIZE) {

    # Optionally keep a record of what the broken table looked like.
    if($Note_Broken_Tables) {
      $Broken_Table_Size{$bank} = $have;
      $Broken_Table_Copy{$bank} = [ @$table ];
    }

    if($have > $TABLE_SIZE) {
      DEBUG and print "Bank $bank is too large-- it has ", $have,
        " entries in it.  Pruning.\n";
      # Drop everything from the first overage entry onward.
      $#{$table} = $TABLE_SIZE - 1;

    } elsif($have < $TABLE_SIZE) {
      DEBUG and print "Bank $bank is too small-- it has ", $have,
        " entries in it.  Now padding it.\n";
      $have == 0 and DEBUG and print "  (Yes, ZERO entries!)\n";
      # Append one placeholder for each missing entry.
      push @$table, ( $UNKNOWN ) x ( $TABLE_SIZE - $have );

    } else {
      die "UNREACHABLE CODE HERE (INSANE)";
    }
  }

  # Replace any undefined entries with empty strings.
  for my $slot (0 .. $TABLE_SIZE - 1) {
    next if defined $table->[$slot];
    DEBUG and printf "Undef at position %d in block x%02x\n",
      $slot, $bank;
    $table->[$slot] = '';
  }

  return $Char[$bank];
}
193
194#-----------------------------------------------------------------------
195
# Set to 1 by the code string that load_bank() evals, once its "require"
#  has completed; load_bank() resets it to 0 before every attempt.
our $eval_loaded_okay;

sub load_bank {

  # Try to load the module holding the table for bank number $_[0],
  #  i.e. <this package>::xNN, where NN is the bank number in lowercase
  #  hex, at least two digits wide.
  #
  # Returns 1 if the "require" completed, 0 otherwise.  On failure, $@ is
  #  left as set by the eval, so the caller can report it.
  #
  # This is in its own sub, for sake of sweeping the scary thing
  #  (namely, a call to string-eval) under the rug.
  # I.e., to paraphrase what Larry Wall once said to me: if
  #  you're going to do something odd, maybe you should do it
  #  in private.

  my($banknum) = @_;  # just as an integer value

  # The format has two conversions, so both values must be supplied.
  DEBUG and printf
      "# Eval-loading %s::x%02x ...\n",
      __PACKAGE__, $banknum;

  $eval_loaded_okay = 0;
  my $code =
      sprintf( "require %s::x%02x; \$eval_loaded_okay = 1;\n",
               __PACKAGE__,
               $banknum);

  {
    # Keep any caller-installed __DIE__ handler from firing if the
    #  require fails; a missing bank is an expected condition here.
    local $SIG{'__DIE__'};
    eval($code);
  }

  return $eval_loaded_okay ? 1 : 0;
}
225
226#======================================================================
227
2281;
229__END__
230
231=encoding utf8
232
233=head1 NAME
234
235Text::Unidecode -- plain ASCII transliterations of Unicode text
236
237=head1 SYNOPSIS
238
239  use utf8;
240  use Text::Unidecode;
241  print unidecode(
242    "北亰\n"
243    # Chinese characters for Beijing (U+5317 U+4EB0)
244  );
245
246  # That prints: Bei Jing
247
248=head1 DESCRIPTION
249
250It often happens that you have non-Roman text data in Unicode, but
251you can't display it-- usually because you're trying to
252show it to a user via an application that doesn't support Unicode,
253or because the fonts you need aren't accessible.  You could
254represent the Unicode characters as "???????" or
255"\15BA\15A0\1610...", but that's nearly useless to the user who
256actually wants to read what the text says.
257
258What Text::Unidecode provides is a function, C<unidecode(...)> that
259takes Unicode data and tries to represent it in US-ASCII characters
260(i.e., the universally displayable characters between 0x00 and
2610x7F).  The representation is
262almost always an attempt at I<transliteration>-- i.e., conveying,
263in Roman letters, the pronunciation expressed by the text in
264some other writing system.  (See the example in the synopsis.)
265
266
267NOTE:
268
269To make sure your perldoc/Pod viewing setup for viewing this page is
270working: The six-letter word "résumé" should look like "resume" with
271an "/" accent on each "e".
272
273For further tests, and help if that doesn't work, see below,
274L</A POD ENCODING TEST>.
275
276
277=head1 DESIGN PHILOSOPHY
278
279Unidecode's ability to transliterate from a given language is limited
280by two factors:
281
282=over
283
284=item * The amount and quality of data in the written form of the
285original language
286
287So if you have Hebrew data
288that has no vowel points in it, then Unidecode cannot guess what
289vowels should appear in a pronunciation.
290S f y hv n vwls n th npt, y wn't gt ny vwls
291n th tpt.  (This is a specific application of the general principle
292of "Garbage In, Garbage Out".)
293
294=item * Basic limitations in the Unidecode design
295
296Writing a real and clever transliteration algorithm for any single
297language usually requires a lot of time, and at least a passable
298knowledge of the language involved.  But Unicode text can convey
299more languages than I could possibly learn (much less create a
300transliterator for) in the entire rest of my lifetime.  So I put
301a cap on how intelligent Unidecode could be, by insisting that
302it support only context-I<in>sensitive transliteration.  That means
303missing the finer details of any given writing system,
304while still hopefully being useful.
305
306=back
307
308Unidecode, in other words, is quick and
309dirty.  Sometimes the output is not so dirty at all:
310Russian and Greek seem to work passably; and
311while Thaana (Divehi, AKA Maldivian) is a definitely non-Western
312writing system, setting up a mapping from it to Roman letters
313seems to work pretty well.  But sometimes the output is I<very
314dirty:> Unidecode does quite badly on Japanese and Thai.
315
316If you want a smarter transliteration for a particular language
317than Unidecode provides, then you should look for (or write)
318a transliteration algorithm specific to that language, and apply
319it instead of (or at least before) applying Unidecode.
320
321In other words, Unidecode's
322approach is broad (knowing about dozens of writing systems), but
323shallow (not being meticulous about any of them).
324
325=head1 FUNCTIONS
326
327Text::Unidecode provides one function, C<unidecode(...)>, which
328is exported by default.  It can be used in a variety of calling contexts:
329
330=over
331
332=item C<$out = unidecode( $in );> # scalar context
333
334This returns a copy of $in, transliterated.
335
336=item C<$out = unidecode( @in );> # scalar context
337
338This is the same as C<$out = unidecode(join "", @in);>
339
340=item C<@out = unidecode( @in );> # list context
341
342This returns a list consisting of copies of @in, each transliterated.  This
343is the same as C<@out = map scalar(unidecode($_)), @in;>
344
345=item C<unidecode( @items );> # void context
346
347=item C<unidecode( @bar, $foo, @baz );> # void context
348
349Each item on input is replaced with its transliteration.  This
350is the same as C<for(@bar, $foo, @baz) { $_ = unidecode($_) }>
351
352=back
353
354You should make a minimum of assumptions about the output of
355C<unidecode(...)>.  For example, if you assume an all-alphabetic
356(Unicode) string passed to C<unidecode(...)> will return an all-alphabetic
357string, you're wrong-- some alphabetic Unicode characters are
358transliterated as strings containing punctuation (e.g., the
359Armenian letter "Թ" (U+0539), currently transliterates as "T`"
(capital-T then a backtick)).
361
362However, these are the assumptions you I<can> make:
363
364=over
365
366=item *
367
368Each character 0x0000 - 0x007F transliterates as itself.  That is,
369C<unidecode(...)> is 7-bit pure.
370
371=item *
372
373The output of C<unidecode(...)> always consists entirely of US-ASCII
374characters-- i.e., characters 0x0000 - 0x007F.
375
376=item *
377
378All Unicode characters translate to a sequence of (any number of)
379characters that are newline ("\n") or in the range 0x0020-0x007E.  That
380is, no Unicode character translates to "\x01", for example.  (Although if
381you have a "\x01" on input, you'll get a "\x01" in output.)
382
383=item *
384
385Yes, some transliterations produce a "\n" but it's just a few, and
386only with good reason.  Note that the value of newline ("\n") varies
387from platform to platform-- see L<perlport>.
388
389=item *
390
391Some Unicode characters may transliterate to nothing (i.e., empty string).
392
393=item *
394
395Very many Unicode characters transliterate to multi-character sequences.
396E.g., Unihan character U+5317, "北", transliterates as the four-character string
397"Bei ".
398
399=item *
400
401Within these constraints, I<I may change> the transliteration of characters
in future versions.  For example, if someone convinces me
403that the Armenian letter "Թ", currently transliterated as "T`", would
404be better transliterated as "D", I I<may> well make that change.
405
406=item *
407
408Unfortunately, there are many characters that Unidecode doesn't know a
409transliteration for.  This is generally because the character has been
410added since I last revised the Unidecode data tables.  I'm I<always>
411catching up!
412
413=back
414
415=head1 DESIGN GOALS AND CONSTRAINTS
416
417Text::Unidecode is meant to be a transliterator of last resort,
418to be used once you've decided that you can't just display the
419Unicode data as is, I<and once you've decided you don't have a
420more clever, language-specific transliterator available,> or once
421you've I<already applied> smarter algorithms or mappings that you prefer
422and you now just want Unidecode to do cleanup.
423
424Unidecode
425transliterates context-insensitively-- that is, a given character is
426replaced with the same US-ASCII (7-bit ASCII) character or characters,
427no matter what the surrounding characters are.
428
429The main reason I'm making Text::Unidecode work with only
430context-insensitive substitution is that it's fast, dumb, and
431straightforward enough to be feasible.  It doesn't tax my
432(quite limited) knowledge of world languages.  It doesn't require
433me writing a hundred lines of code to get the Thai syllabification
434right (and never knowing whether I've gotten it wrong, because I
435don't know Thai), or spending a year trying to get Text::Unidecode
436to use the ChaSen algorithm for Japanese, or trying to write heuristics
437for telling the difference between Japanese, Chinese, or Korean, so
438it knows how to transliterate any given Uni-Han glyph.  And
439moreover, context-insensitive substitution is still mostly useful,
440but still clearly couldn't be mistaken for authoritative.
441
442Text::Unidecode is an example of the 80/20 rule in
443action-- you get 80% of the usefulness using just 20% of a
444"real" solution.
445
446A "real" approach to transliteration for any given language can
447involve such increasingly tricky contextual factors as these:
448
449=over
450
=item The preceding / following character(s)
452
453What a given symbol "X" means, could
454depend on whether it's followed by a consonant, or by vowel, or
455by some diacritic character.
456
457=item Syllables
458
459A character "X" at end of a syllable could mean something
460different from when it's at the start-- which is especially problematic
461when the language involved doesn't explicitly mark where one syllable
462stops and the next starts.
463
464=item Parts of speech
465
466What "X" sounds like at the end of a word,
467depends on whether that word is a noun, or a verb, or what.
468
469=item Meaning
470
471By semantic context, you can tell that this ideogram "X" means "shoe"
472(pronounced one way) and not "time" (pronounced another),
473and that's how you know to transliterate it one way instead of the other.
474
475=item Origin of the word
476
477"X" means one thing in loanwords and/or placenames (and
478derivatives thereof), and another in native words.
479
480=item "It's just that way"
481
482"X" normally makes
483the /X/ sound, except for this list of seventy exceptions (and words based
484on them, sometimes indirectly).  Or: you never can tell which of the three
485ways to pronounce "X" this word actually uses; you just have to know
486which it is, so keep a dictionary on hand!
487
488=item Language
489
490The character "X" is actually used in several different languages, and you
491have to figure out which you're looking at before you can determine how
492to transliterate it.
493
494=back
495
496Out of a desire to avoid being mired in I<any> of these kinds of
497contextual factors, I chose to exclude I<all of them> and just stick
498with context-insensitive replacement.
499
500
501=head1 A POD ENCODING TEST
502
503=over
504
505=item *
506
507"Brontë" is six characters that should look like "Bronte", but
508with double-dots on the "e" character.
509
510=item *
511
512"Résumé" is six characters that should look like "Resume", but
513with /-shaped accents on the "e" characters.
514
515=item *
516
517"læti" should be I<four> letters long-- the second letter should not
518be two letters "ae", but should be a single letter that
519looks like an "a" entirely fused with an "e".
520
521=item *
522
523"χρονος" is six Greek characters that should look kind of like: xpovoc
524
525=item *
526
527"КАК ВАС ЗОВУТ" is three short Russian words that should look a
528lot like: KAK BAC 3OBYT
529
530=item *
531
532"ടധ" is two Malayalam characters that should look like: sw
533
534=item *
535
536"丫二十一" is four Chinese characters that should look like: C<Y=+->
537
538=item *
539
540"Hello" is five characters that should look like: Hello
541
542=back
543
544If all of those come out right, your Pod viewing setup is working
545fine-- welcome to the 2010s!  If those are full of garbage characters,
546consider viewing this page as HTML at
547L<https://metacpan.org/pod/Text::Unidecode>
548or
549L<http://search.cpan.org/perldoc?Text::Unidecode>
550
551
552If things look mostly okay, but the Malayalam and/or the Chinese are
553just question-marks or empty boxes, it's probably just that your
554computer lacks the fonts for those.
555
556=head1 TODO
557
558Lots:
559
560* Rebuild the Unihan database.  (Talk about hitting a moving target!)
561
562* Add tone-numbers for Mandarin hanzi?  Namely: In Unihan, when tone
marks are present (like in "kMandarin: dào"), should I continue to
564transliterate as just "Dao", or should I put in the tone number:
565"Dao4"?  It would be pretty jarring to have digits appear where
566previously there was just alphabetic stuff-- But tone numbers
567make Chinese more readable.
568(I have a clever idea about doing this, for Unidecode v2 or v3.)
569
570* Start dealing with characters over U+FFFF.  Cuneiform! Emojis! Whatever!
571
572* Fill in all the little characters that have crept into the Misc Symbols
573Etc blocks.
574
575* More things that need tending to are detailed in the TODO.txt file,
576included in this distribution.  Normal installs probably don't leave
577the TODO.txt lying around, but if nothing else, you can see it at
578L<http://search.cpan.org/search?dist=Text::Unidecode>
579
580=head1 MOTTO
581
582The Text::Unidecode motto is:
583
584  It's better than nothing!
585
586...in I<both> meanings: 1) seeing the output of C<unidecode(...)> is
587better than just having all font-unavailable Unicode characters
588replaced with "?"'s, or rendered as gibberish; and 2) it's the
589worst, i.e., there's nothing that Text::Unidecode's algorithm is
590better than.  All sensible transliteration algorithms (like for
591German, see below) are going to be smarter than Unidecode's.
592
593=head1 WHEN YOU DON'T LIKE WHAT UNIDECODE DOES
594
595I will repeat the above, because some people miss it:
596
597Text::Unidecode is meant to be a transliterator of I<last resort,>
598to be used once you've decided that you can't just display the
599Unicode data as is, I<and once you've decided you don't have a
600more clever, language-specific transliterator available>-- or once
601you've I<already applied> a smarter algorithm and now just want Unidecode
602to do cleanup.
603
604In other words, when you don't like what Unidecode does, I<do it
605yourself.>  Really, that's what the above says.  Here's how
606you would do this for German, for example:
607
608In German, there's the typographical convention that an umlaut (the
609double-dots on: ä ö ü) can be written as an "-e", like with "Schön"
610becoming "Schoen".  But Unidecode doesn't do that-- I have Unidecode
611simply drop the umlaut accent and give back "Schon".
612
613(I chose this not because I'm a big meanie, but because
614I<generally> changing "ü" to "ue" is disastrous for all text
615that's I<not in German>.  Finnish "Hyvää päivää" would turn
616into "Hyvaeae paeivaeae".  And I discourage you from being I<yet
617another> German who emails me, trying to impel me to consider
618a typographical nicety of German to be more important than
619I<all other languages>.)
620
621If you know that the text you're handling is probably in German, and
622you want to apply the "umlaut becomes -e" rule, here's how to do it
623for yourself (and then use Unidecode as I<the fallback> afterwards):
624
625  use utf8;  # <-- probably necessary.
626
627  our( %German_Characters ) = qw(
628   Ä AE   ä ae
629   Ö OE   ö oe
630   Ü UE   ü ue
631   ß ss
632  );
633
634  use Text::Unidecode qw(unidecode);
635
636  sub german_to_ascii {
637    my($german_text) = @_;
638
639    $german_text =~
640      s/([ÄäÖöÜüß])/$German_Characters{$1}/g;
641
642    # And now, as a *fallthrough*:
643    $german_text = unidecode( $german_text );
644    return $german_text;
645  }
646
647To pick another example, here's something that's not about a
648specific language, but simply having a preference that may or
649may not agree with Unidecode's (i.e., mine).  Consider the "¥"
650symbol.  Unidecode changes that to "Y=".  If you want "¥" as
651"YEN", then...
652
653  use Text::Unidecode qw(unidecode);
654
655  sub my_favorite_unidecode {
656    my($text) = @_;
657
658    $text =~ s/¥/YEN/g;
659
660    # ...and anything else you like, such as:
661    $text =~ s/€/Euro/g;
662
663    # And then, as a fallback,...
664    $text = unidecode($text);
665
666    return $text;
667  }
668
669Then if you do:
670
671  print my_favorite_unidecode("You just won ¥250,000 and €40,000!!!");
672
673...you'll get:
674
675  You just won YEN250,000 and Euro40,000!!!
676
677...just as you like it.
678
679(By the way, the reason I<I> don't have Unidecode just turn "¥" into "YEN"
680is that the same symbol also stands for yuan, the Chinese
681currency.  A "Y=" is nicely, I<safely> neutral as to whether
682we're talking about yen or yuan-- Japan, or China.)
683
684Another example: for hanzi/kanji/hanja, I have designed
685Unidecode to transliterate according to the value that that
686character has in Mandarin (otherwise Cantonese,...).  Some
687users have complained that applying Unidecode to Japanese
688produces gibberish.
689
690To make a long story short: transliterating from Japanese is
691I<difficult> and it requires a I<lot> of context-sensitivity.
692If you have text that you're fairly sure is in
693Japanese, you're going to have to use a Japanese-specific
694algorithm to transliterate Japanese into ASCII.  (And then
695you can call Unidecode on the output from that-- it is useful
696for, for example, turning fullwidth characters into
their normal (ASCII) forms.)
698
699(Note, as of August 2016: I have titanic but tentative plans for
700making the value of Unihan characters be something you could set
701parameters for at runtime, in changing the order of "Mandarin else
702Cantonese else..." in the value retrieval.  Currently that preference
703list is hardwired on my end, at module-build time.  Other options I'm
704considering allowing for: whether the Mandarin and Cantonese values
705should have the tone numbers on them; whether every Unihan value
706should have a terminal space; and maybe other clever stuff I haven't
707thought of yet.)
708
709
710=head1 CAVEATS
711
712If you get really implausible nonsense out of C<unidecode(...)>, make
713sure that the input data really is a utf8 string.  See
714L<perlunicode> and L<perlunitut>.
715
I<Unidecode will work disastrously badly on Japanese.> That's because
717Japanese is very very hard.  To extend the Unidecode motto,
718Unidecode is better than nothing, and with Japanese, I<just barely!>
719
720On pure Mandarin, Unidecode will frequently give odd values--
721that's because a single hanzi can have several readings, and Unidecode
722only knows what the Unihan database says is the most common one.
723
724
725=head1 THANKS
726
727Thanks to (in only the sloppiest of sorta-chronological order):
728Jordan Lachler, Harald Tveit Alvestrand, Melissa Axelrod,
729Abhijit Menon-Sen, Mark-Jason Dominus, Joe Johnston,
730Conrad Heiney, fileformat.info,
731Philip Newton, 唐鳳, Tomaž Šolc, Mike Doherty, JT Smith and the
732MadMongers, Arden Ogg, Craig Copris,
733David Cusimano, Brendan Byrd, Hex Martin,
734and
735I<many>
736other pals who have helped with the ideas or values for Unidecode's
737transliterations, or whose help has been in the
738secret F5 tornado that constitutes the internals of Unidecode's
739implementation.
740
741And thank you to the many people who have encouraged me to plug away
742at this project.  A decade went by before I had any idea that more
743than about 4 or 5 people were using or getting any value
744out of Unidecode.  I am told that actually
745my figure was missing some zeroes on the end!
746
747
748=head1 PORTS
749
750Some wonderful people have ported Unidecode to other languages!
751
752=over
753
754=item *
755
756Python: L<https://pypi.python.org/pypi/Unidecode>
757
758=item *
759
760PHP: L<https://github.com/silverstripe-labs/silverstripe-unidecode>
761
762=item *
763
764Ruby: L<http://www.rubydoc.info/gems/unidecode/1.0.0/frames>
765
766=item *
767
768JavaScript: L<https://www.npmjs.org/package/unidecode>
769
770=item *
771
772Java: L<https://github.com/xuender/unidecode>
773
774=back
775
776I can't vouch for the details of each port, but these are clever
777people, so I'm sure they did a fine job.
778
779
780=head1 SEE ALSO
781
782An article I wrote for I<The Perl Journal> about
783Unidecode:  L<http://interglacial.com/tpj/22/>
784(B<READ IT!>)
785
786Jukka Korpela's L<http://www.cs.tut.fi/~jkorpela/fui.html8> which is
787brilliantly useful, and its code is brilliant (so, view source!).  I
788was I<kinda> thinking about maybe doing something I<sort of> like that
for the v2.x versions of Unidecode-- but now he's got me convinced that
790I should go right ahead.
791
792Tom Christiansen's
793I<Perl Unicode Cookbook>,
794L<http://www.perl.com/pub/2012/04/perlunicook-standard-preamble.html>
795
796Unicode Consortium: L<http://www.unicode.org/>
797
798Searchable Unihan database:
799L<http://www.unicode.org/cgi-bin/GetUnihanData.pl>
800
801Geoffrey Sampson.  1990.  I<Writing Systems: A Linguistic Introduction.>
802ISBN: 0804717567
803
804Randall K. Barry (editor).  1997.  I<ALA-LC Romanization Tables:
805Transliteration Schemes for Non-Roman Scripts.>
806ISBN: 0844409405
807[ALA is the American Library Association; LC is the Library of
808Congress.]
809
810Rupert Snell.  2000.  I<Beginner's Hindi Script (Teach Yourself
811Books).>  ISBN: 0658009109
812
813=head1 LICENSE
814
815Copyright (c) 2001, 2014, 2015, 2016 Sean M. Burke.
816
817Unidecode is distributed under the Perl Artistic License
818( L<perlartistic> ), namely:
819
820This library is free software; you can redistribute it and/or modify
821it under the same terms as Perl itself.
822
823This program is distributed in the hope that it will be useful, but
824without any warranty; without even the implied warranty of
825merchantability or fitness for a particular purpose.
826
827=head1 DISCLAIMER
828
829Much of Text::Unidecode's internal data is based on data from The
830Unicode Consortium, with which I am unaffiliated.  A good deal of the
831internal data comes from suggestions that have been contributed by
832people other than myself.
833
834The views and conclusions contained in my software and documentation
835are my own-- they should not be interpreted as representing official
836policies, either expressed or implied, of The Unicode Consortium; nor
837should they be interpreted as necessarily the views or conclusions of
838people who have contributed to this project.
839
840Moreover, I discourage you from inferring that choices that I've made
841in Unidecode reflect political or linguistic prejudices on my
842part.  Just because Unidecode doesn't do great on your language,
or just because it might seem to do better on some other
844language, please don't think I'm out to get you!
845
846=head1 AUTHOR
847
848Your pal, Sean M. Burke C<sburke@cpan.org>
849
850=head1 O HAI!
851
852If you're using Unidecode for anything interesting, be cool and
853email me, I'm always curious what people use this for.  (The
854answers so far have surprised me!)
855
856=cut
857
858#################### SCOOBIE SNACK ####################
859
860Lest there be any REMAINING doubt that the Unicode Consortium has
861a sense of humor, the CDROM that comes with /The Unicode Standard,
862Version 3.0/ book, has an audio track of the Unicode anthem [!].
863The lyrics are:
864
865	Unicode, Oh Unicode!
866	--------------------
867
868	Oh, beautiful for Uni-Han,
869	for spacious User Zone!
870	For rampant scripts of India
871	and polar Nunavut!
872
873	  Chorus:
874		Unicode, Oh Unicode!
875		May all your code points shine forever
876		and your beacon light the world!
877
878	Oh, marvelous for sixteen bits,
879	for precious surrogates!
880	For Bi-Di algorithm dear
881	and stalwart I-P-A!
882
883	Oh, glorious for Hangul fair,
884	for symbols mathematical!
885	For myriad exotic scripts
886	and punctuation we adore!
887
888# End.
889