1package Unicode::Normalize;
2
3use 5.006;
4use strict;
5use warnings;
6use Carp;
7
8no warnings 'utf8';
9
10our $VERSION = '1.32';
11our $PACKAGE = __PACKAGE__;
12
13our @EXPORT = qw( NFC NFD NFKC NFKD );
14our @EXPORT_OK = qw(
15    normalize decompose reorder compose
16    checkNFD checkNFKD checkNFC checkNFKC check
17    getCanon getCompat getComposite getCombinClass
18    isExclusion isSingleton isNonStDecomp isComp2nd isComp_Ex
19    isNFD_NO isNFC_NO isNFC_MAYBE isNFKD_NO isNFKC_NO isNFKC_MAYBE
20    FCD checkFCD FCC checkFCC composeContiguous splitOnLastStarter
21    normalize_partial NFC_partial NFD_partial NFKC_partial NFKD_partial
22);
23our %EXPORT_TAGS = (
24    all       => [ @EXPORT, @EXPORT_OK ],
25    normalize => [ @EXPORT, qw/normalize decompose reorder compose/ ],
26    check     => [ qw/checkNFD checkNFKD checkNFC checkNFKC check/ ],
27    fast      => [ qw/FCD checkFCD FCC checkFCC composeContiguous/ ],
28);
29
30##
31## utilities for tests
32##
33
34                             # No EBCDIC support on early perls
35*to_native = ($::IS_ASCII || $] < 5.008)
36             ? sub { return shift }
37             : sub { utf8::unicode_to_native(shift) };
38
39*from_native = ($::IS_ASCII || $] < 5.008)
40             ? sub { return shift }
41             : sub { utf8::native_to_unicode(shift) };
42
43# The .t files are all in terms of Unicode, so xlate to/from native
44sub dot_t_pack_U {
45    return pack('U*', map { to_native($_) } @_);
46}
47
48sub dot_t_unpack_U {
49
50    # The empty pack returns an empty UTF-8 string, so the effect is to force
51    # the shifted parameter into being UTF-8.  This allows this to work on
52    # Perl 5.6, where there is no utf8::upgrade().
53    return map { from_native($_) } unpack('U*', shift(@_).pack('U*'));
54}
55
56sub get_printable_string ($) {
57    use bytes;
58    my $s = shift;
59
60    # DeMorgan's laws cause this to mean ascii printables
61    return $s if $s =~ /[^[:^ascii:][:^print:]]/;
62
63    return join " ", map { sprintf "\\x%02x", ord $_ } split "", $s;
64}
65
66sub ok ($$;$) {
67    my $count_ref = shift;  # Test number in caller
68    my $p = my $r = shift;
69    my $x;
70    if (@_) {
71        $x = shift;
72        $p = !defined $x ? !defined $r : !defined $r ? 0 : $r eq $x;
73    }
74
75    print $p ? "ok" : "not ok", ' ', ++$$count_ref, "\n";
76
77    return if $p;
78
79    my (undef, $file, $line) = caller(1);
80    print STDERR "# Failed test $$count_ref at $file line $line\n";
81
82    return unless defined $x;
83
84    print STDERR "#      got ", get_printable_string($r), "\n";
85    print STDERR "# expected ", get_printable_string($x), "\n";
86}
87
88require Exporter;
89
90##### The above part is common to XS and PP #####
91
92our @ISA = qw(Exporter);
93use XSLoader ();
94XSLoader::load( 'Unicode::Normalize', $VERSION );
95
96##### The below part is common to XS and PP #####
97
98##
99## normalize
100##
101
102sub FCD ($) {
103    my $str = shift;
104    return checkFCD($str) ? $str : NFD($str);
105}
106
107our %formNorm = (
108    NFC  => \&NFC,	C  => \&NFC,
109    NFD  => \&NFD,	D  => \&NFD,
110    NFKC => \&NFKC,	KC => \&NFKC,
111    NFKD => \&NFKD,	KD => \&NFKD,
112    FCD  => \&FCD,	FCC => \&FCC,
113);
114
115sub normalize($$)
116{
117    my $form = shift;
118    my $str = shift;
119    if (exists $formNorm{$form}) {
120	return $formNorm{$form}->($str);
121    }
122    croak($PACKAGE."::normalize: invalid form name: $form");
123}
124
125##
126## partial
127##
128
129sub normalize_partial ($$) {
130    if (exists $formNorm{$_[0]}) {
131	my $n = normalize($_[0], $_[1]);
132	my($p, $u) = splitOnLastStarter($n);
133	$_[1] = $u;
134	return $p;
135    }
136    croak($PACKAGE."::normalize_partial: invalid form name: $_[0]");
137}
138
139sub NFD_partial ($) { return normalize_partial('NFD', $_[0]) }
140sub NFC_partial ($) { return normalize_partial('NFC', $_[0]) }
141sub NFKD_partial($) { return normalize_partial('NFKD',$_[0]) }
142sub NFKC_partial($) { return normalize_partial('NFKC',$_[0]) }
143
144##
145## check
146##
147
148our %formCheck = (
149    NFC  => \&checkNFC, 	C  => \&checkNFC,
150    NFD  => \&checkNFD, 	D  => \&checkNFD,
151    NFKC => \&checkNFKC,	KC => \&checkNFKC,
152    NFKD => \&checkNFKD,	KD => \&checkNFKD,
153    FCD  => \&checkFCD, 	FCC => \&checkFCC,
154);
155
156sub check($$)
157{
158    my $form = shift;
159    my $str = shift;
160    if (exists $formCheck{$form}) {
161	return $formCheck{$form}->($str);
162    }
163    croak($PACKAGE."::check: invalid form name: $form");
164}
165
1661;
167__END__
168
169=head1 NAME
170
171Unicode::Normalize - Unicode Normalization Forms
172
173=head1 SYNOPSIS
174
175(1) using function names exported by default:
176
177  use Unicode::Normalize;
178
179  $NFD_string  = NFD($string);  # Normalization Form D
180  $NFC_string  = NFC($string);  # Normalization Form C
181  $NFKD_string = NFKD($string); # Normalization Form KD
182  $NFKC_string = NFKC($string); # Normalization Form KC
183
184(2) using function names exported on request:
185
186  use Unicode::Normalize 'normalize';
187
188  $NFD_string  = normalize('D',  $string);  # Normalization Form D
189  $NFC_string  = normalize('C',  $string);  # Normalization Form C
190  $NFKD_string = normalize('KD', $string);  # Normalization Form KD
191  $NFKC_string = normalize('KC', $string);  # Normalization Form KC
192
193=head1 DESCRIPTION
194
195Parameters:
196
197C<$string> is used as a string under character semantics (see L<perlunicode>).
198
199C<$code_point> should be an unsigned integer representing a Unicode code point.
200
201Note: Between XSUB and pure Perl, there is an incompatibility
202about the interpretation of C<$code_point> as a decimal number.
203XSUB converts C<$code_point> to an unsigned integer, but pure Perl does not.
204Do not use a floating point nor a negative sign in C<$code_point>.
205
206=head2 Normalization Forms
207
208=over 4
209
210=item C<$NFD_string = NFD($string)>
211
212It returns the Normalization Form D (formed by canonical decomposition).
213
214=item C<$NFC_string = NFC($string)>
215
216It returns the Normalization Form C (formed by canonical decomposition
217followed by canonical composition).
218
219=item C<$NFKD_string = NFKD($string)>
220
221It returns the Normalization Form KD (formed by compatibility decomposition).
222
223=item C<$NFKC_string = NFKC($string)>
224
225It returns the Normalization Form KC (formed by compatibility decomposition
226followed by B<canonical> composition).
227
228=item C<$FCD_string = FCD($string)>
229
230If the given string is in FCD ("Fast C or D" form; cf. UTN #5),
231it returns the string without modification; otherwise it returns an FCD string.
232
233Note: FCD is not always unique, then plural forms may be equivalent
234each other. C<FCD()> will return one of these equivalent forms.
235
236=item C<$FCC_string = FCC($string)>
237
238It returns the FCC form ("Fast C Contiguous"; cf. UTN #5).
239
240Note: FCC is unique, as well as four normalization forms (NF*).
241
242=item C<$normalized_string = normalize($form_name, $string)>
243
244It returns the normalization form of C<$form_name>.
245
246As C<$form_name>, one of the following names must be given.
247
248  'C'  or 'NFC'  for Normalization Form C  (UAX #15)
249  'D'  or 'NFD'  for Normalization Form D  (UAX #15)
250  'KC' or 'NFKC' for Normalization Form KC (UAX #15)
251  'KD' or 'NFKD' for Normalization Form KD (UAX #15)
252
253  'FCD'          for "Fast C or D" Form  (UTN #5)
254  'FCC'          for "Fast C Contiguous" (UTN #5)
255
256=back
257
258=head2 Decomposition and Composition
259
260=over 4
261
262=item C<$decomposed_string = decompose($string [, $useCompatMapping])>
263
264It returns the concatenation of the decomposition of each character
265in the string.
266
267If the second parameter (a boolean) is omitted or false,
268the decomposition is canonical decomposition;
269if the second parameter (a boolean) is true,
270the decomposition is compatibility decomposition.
271
272The string returned is not always in NFD/NFKD. Reordering may be required.
273
274 $NFD_string  = reorder(decompose($string));       # eq. to NFD()
275 $NFKD_string = reorder(decompose($string, TRUE)); # eq. to NFKD()
276
277=item C<$reordered_string = reorder($string)>
278
279It returns the result of reordering the combining characters
280according to Canonical Ordering Behavior.
281
282For example, when you have a list of NFD/NFKD strings,
283you can get the concatenated NFD/NFKD string from them, by saying
284
285    $concat_NFD  = reorder(join '', @NFD_strings);
286    $concat_NFKD = reorder(join '', @NFKD_strings);
287
288=item C<$composed_string = compose($string)>
289
290It returns the result of canonical composition
291without applying any decomposition.
292
293For example, when you have a NFD/NFKD string,
294you can get its NFC/NFKC string, by saying
295
296    $NFC_string  = compose($NFD_string);
297    $NFKC_string = compose($NFKD_string);
298
299=item C<($processed, $unprocessed) = splitOnLastStarter($normalized)>
300
301It returns two strings: the first one, C<$processed>, is a part
302before the last starter, and the second one, C<$unprocessed> is
303another part after the first part. A starter is a character having
304a combining class of zero (see UAX #15).
305
306Note that C<$processed> may be empty (when C<$normalized> contains no
307starter or starts with the last starter), and then C<$unprocessed>
308should be equal to the entire C<$normalized>.
309
310When you have a C<$normalized> string and an C<$unnormalized> string
311following it, a simple concatenation is wrong:
312
313 $concat = $normalized . normalize($form, $unnormalized); # wrong!
314
315Instead of it, do like this:
316
317 ($processed, $unprocessed) = splitOnLastStarter($normalized);
318 $concat = $processed . normalize($form,$unprocessed.$unnormalized);
319
320C<splitOnLastStarter()> should be called with a pre-normalized parameter
321C<$normalized>, that is in the same form as C<$form> you want.
322
323If you have an array of C<@string> that should be concatenated and then
324normalized, you can do like this:
325
326    my $result = "";
327    my $unproc = "";
328    foreach my $str (@string) {
329        $unproc .= $str;
330        my $n = normalize($form, $unproc);
331        my($p, $u) = splitOnLastStarter($n);
332        $result .= $p;
333        $unproc  = $u;
334    }
335    $result .= $unproc;
336    # instead of normalize($form, join('', @string))
337
338=item C<$processed = normalize_partial($form, $unprocessed)>
339
340A wrapper for the combination of C<normalize()> and C<splitOnLastStarter()>.
341Note that C<$unprocessed> will be modified as a side-effect.
342
343If you have an array of C<@string> that should be concatenated and then
344normalized, you can do like this:
345
346    my $result = "";
347    my $unproc = "";
348    foreach my $str (@string) {
349        $unproc .= $str;
350        $result .= normalize_partial($form, $unproc);
351    }
352    $result .= $unproc;
353    # instead of normalize($form, join('', @string))
354
355=item C<$processed = NFD_partial($unprocessed)>
356
357It does like C<normalize_partial('NFD', $unprocessed)>.
358Note that C<$unprocessed> will be modified as a side-effect.
359
360=item C<$processed = NFC_partial($unprocessed)>
361
362It does like C<normalize_partial('NFC', $unprocessed)>.
363Note that C<$unprocessed> will be modified as a side-effect.
364
365=item C<$processed = NFKD_partial($unprocessed)>
366
367It does like C<normalize_partial('NFKD', $unprocessed)>.
368Note that C<$unprocessed> will be modified as a side-effect.
369
370=item C<$processed = NFKC_partial($unprocessed)>
371
372It does like C<normalize_partial('NFKC', $unprocessed)>.
373Note that C<$unprocessed> will be modified as a side-effect.
374
375=back
376
377=head2 Quick Check
378
379(see Annex 8, UAX #15; and F<lib/unicore/DerivedNormalizationProps.txt>)
380
381The following functions check whether the string is in that normalization form.
382
383The result returned will be one of the following:
384
385    YES     The string is in that normalization form.
386    NO      The string is not in that normalization form.
387    MAYBE   Dubious. Maybe yes, maybe no.
388
389=over 4
390
391=item C<$result = checkNFD($string)>
392
393It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
394
395=item C<$result = checkNFC($string)>
396
397It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
398C<undef> if C<MAYBE>.
399
400=item C<$result = checkNFKD($string)>
401
402It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
403
404=item C<$result = checkNFKC($string)>
405
406It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
407C<undef> if C<MAYBE>.
408
409=item C<$result = checkFCD($string)>
410
411It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>.
412
413=item C<$result = checkFCC($string)>
414
415It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
416C<undef> if C<MAYBE>.
417
418Note: If a string is not in FCD, it must not be in FCC.
419So C<checkFCC($not_FCD_string)> should return C<NO>.
420
421=item C<$result = check($form_name, $string)>
422
423It returns true (C<1>) if C<YES>; false (C<empty string>) if C<NO>;
424C<undef> if C<MAYBE>.
425
426As C<$form_name>, one of the following names must be given.
427
428  'C'  or 'NFC'  for Normalization Form C  (UAX #15)
429  'D'  or 'NFD'  for Normalization Form D  (UAX #15)
430  'KC' or 'NFKC' for Normalization Form KC (UAX #15)
431  'KD' or 'NFKD' for Normalization Form KD (UAX #15)
432
433  'FCD'          for "Fast C or D" Form  (UTN #5)
434  'FCC'          for "Fast C Contiguous" (UTN #5)
435
436=back
437
438B<Note>
439
440In the cases of NFD, NFKD, and FCD, the answer must be
441either C<YES> or C<NO>. The answer C<MAYBE> may be returned
442in the cases of NFC, NFKC, and FCC.
443
444A C<MAYBE> string should contain at least one combining character
445or the like. For example, C<COMBINING ACUTE ACCENT> has
446the MAYBE_NFC/MAYBE_NFKC property.
447
448Both C<checkNFC("A\N{COMBINING ACUTE ACCENT}")>
449and C<checkNFC("B\N{COMBINING ACUTE ACCENT}")> will return C<MAYBE>.
450C<"A\N{COMBINING ACUTE ACCENT}"> is not in NFC
451(its NFC is C<"\N{LATIN CAPITAL LETTER A WITH ACUTE}">),
452while C<"B\N{COMBINING ACUTE ACCENT}"> is in NFC.
453
454If you want to check exactly, compare the string with its NFC/NFKC/FCC.
455
456    if ($string eq NFC($string)) {
457        # $string is exactly normalized in NFC;
458    } else {
459        # $string is not normalized in NFC;
460    }
461
462    if ($string eq NFKC($string)) {
463        # $string is exactly normalized in NFKC;
464    } else {
465        # $string is not normalized in NFKC;
466    }
467
468=head2 Character Data
469
470These functions are interface of character data used internally.
471If you want only to get Unicode normalization forms, you don't need
472call them yourself.
473
474=over 4
475
476=item C<$canonical_decomposition = getCanon($code_point)>
477
478If the character is canonically decomposable (including Hangul Syllables),
479it returns the (full) canonical decomposition as a string.
480Otherwise it returns C<undef>.
481
482B<Note:> According to the Unicode standard, the canonical decomposition
483of the character that is not canonically decomposable is same as
484the character itself.
485
486=item C<$compatibility_decomposition = getCompat($code_point)>
487
488If the character is compatibility decomposable (including Hangul Syllables),
489it returns the (full) compatibility decomposition as a string.
490Otherwise it returns C<undef>.
491
492B<Note:> According to the Unicode standard, the compatibility decomposition
493of the character that is not compatibility decomposable is same as
494the character itself.
495
496=item C<$code_point_composite = getComposite($code_point_here, $code_point_next)>
497
498If two characters here and next (as code points) are composable
499(including Hangul Jamo/Syllables and Composition Exclusions),
500it returns the code point of the composite.
501
502If they are not composable, it returns C<undef>.
503
504=item C<$combining_class = getCombinClass($code_point)>
505
506It returns the combining class (as an integer) of the character.
507
508=item C<$may_be_composed_with_prev_char = isComp2nd($code_point)>
509
510It returns a boolean whether the character of the specified codepoint
511may be composed with the previous one in a certain composition
512(including Hangul Compositions, but excluding
513Composition Exclusions and Non-Starter Decompositions).
514
515=item C<$is_exclusion = isExclusion($code_point)>
516
517It returns a boolean whether the code point is a composition exclusion.
518
519=item C<$is_singleton = isSingleton($code_point)>
520
521It returns a boolean whether the code point is a singleton
522
523=item C<$is_non_starter_decomposition = isNonStDecomp($code_point)>
524
525It returns a boolean whether the code point has Non-Starter Decomposition.
526
527=item C<$is_Full_Composition_Exclusion = isComp_Ex($code_point)>
528
529It returns a boolean of the derived property Comp_Ex
530(Full_Composition_Exclusion). This property is generated from
531Composition Exclusions + Singletons + Non-Starter Decompositions.
532
533=item C<$NFD_is_NO = isNFD_NO($code_point)>
534
535It returns a boolean of the derived property NFD_NO
536(NFD_Quick_Check=No).
537
538=item C<$NFC_is_NO = isNFC_NO($code_point)>
539
540It returns a boolean of the derived property NFC_NO
541(NFC_Quick_Check=No).
542
543=item C<$NFC_is_MAYBE = isNFC_MAYBE($code_point)>
544
545It returns a boolean of the derived property NFC_MAYBE
546(NFC_Quick_Check=Maybe).
547
548=item C<$NFKD_is_NO = isNFKD_NO($code_point)>
549
550It returns a boolean of the derived property NFKD_NO
551(NFKD_Quick_Check=No).
552
553=item C<$NFKC_is_NO = isNFKC_NO($code_point)>
554
555It returns a boolean of the derived property NFKC_NO
556(NFKC_Quick_Check=No).
557
558=item C<$NFKC_is_MAYBE = isNFKC_MAYBE($code_point)>
559
560It returns a boolean of the derived property NFKC_MAYBE
561(NFKC_Quick_Check=Maybe).
562
563=back
564
565=head1 EXPORT
566
567C<NFC>, C<NFD>, C<NFKC>, C<NFKD>: by default.
568
569C<normalize> and other some functions: on request.
570
571=head1 CAVEATS
572
573=over 4
574
575=item Perl's version vs. Unicode version
576
577Since this module refers to perl core's Unicode database in the directory
578F</lib/unicore> (or formerly F</lib/unicode>), the Unicode version of
579normalization implemented by this module depends on what has been
580compiled into your perl.  The following table lists the default Unicode
581version that comes with various perl versions.  (It is possible to change
582the Unicode version in any perl version to be any earlier Unicode version,
583so one could cause Unicode 3.2 to be used in any perl version starting with
5845.8.0.  Read F<C<$Config{privlib}>/unicore/README.perl> for details.
585
586    perl's version     implemented Unicode version
587       5.6.1              3.0.1
588       5.7.2              3.1.0
589       5.7.3              3.1.1 (normalization is same as 3.1.0)
590       5.8.0              3.2.0
591         5.8.1-5.8.3      4.0.0
592         5.8.4-5.8.6      4.0.1 (normalization is same as 4.0.0)
593         5.8.7-5.8.8      4.1.0
594       5.10.0             5.0.0
595        5.8.9, 5.10.1     5.1.0
596       5.12.x             5.2.0
597       5.14.x             6.0.0
598       5.16.x             6.1.0
599       5.18.x             6.2.0
600       5.20.x             6.3.0
601       5.22.x             7.0.0
602
603=item Correction of decomposition mapping
604
605In older Unicode versions, a small number of characters (all of which are
606CJK compatibility ideographs as far as they have been found) may have
607an erroneous decomposition mapping (see
608F<lib/unicore/NormalizationCorrections.txt>).
609Anyhow, this module will neither refer to
610F<lib/unicore/NormalizationCorrections.txt>
611nor provide any specific version of normalization. Therefore this module
612running on an older perl with an older Unicode database may use
613the erroneous decomposition mapping blindly conforming to the Unicode database.
614
615=item Revised definition of canonical composition
616
617In Unicode 4.1.0, the definition D2 of canonical composition (which
618affects NFC and NFKC) has been changed (see Public Review Issue #29
619and recent UAX #15). This module has used the newer definition
620since the version 0.07 (Oct 31, 2001).
621This module will not support the normalization according to the older
622definition, even if the Unicode version implemented by perl is
623lower than 4.1.0.
624
625=back
626
627=head1 AUTHOR
628
629SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
630
631Currently maintained by <perl5-porters@perl.org>
632
633Copyright(C) 2001-2012, SADAHIRO Tomoyuki. Japan. All rights reserved.
634
635=head1 LICENSE
636
637This module is free software; you can redistribute it
638and/or modify it under the same terms as Perl itself.
639
640=head1 SEE ALSO
641
642=over 4
643
644=item L<http://www.unicode.org/reports/tr15/>
645
646Unicode Normalization Forms - UAX #15
647
648=item L<http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt>
649
650Composition Exclusion Table
651
652=item L<http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt>
653
654Derived Normalization Properties
655
656=item L<http://www.unicode.org/Public/UNIDATA/NormalizationCorrections.txt>
657
658Normalization Corrections
659
660=item L<http://www.unicode.org/review/pr-29.html>
661
662Public Review Issue #29: Normalization Issue
663
664=item L<http://www.unicode.org/notes/tn5/>
665
666Canonical Equivalence in Applications - UTN #5
667
668=back
669
670=cut
671