xref: /openbsd/gnu/usr.bin/perl/lib/Unicode/UCD.t (revision 404b540a)
1#!perl -w
2BEGIN {
3    if (ord("A") != 65) {
4	print "1..0 # Skip: EBCDIC\n";
5	exit 0;
6    }
7    chdir 't' if -d 't';
8    @INC = '../lib';
9    @INC = "::lib" if $^O eq 'MacOS'; # module parses @INC itself
10    require Config; import Config;
11    if ($Config{'extensions'} !~ /\bStorable\b/) {
12        print "1..0 # Skip: Storable was not built; Unicode::UCD uses Storable\n";
13        exit 0;
14    }
15}
16
17use strict;
18use Unicode::UCD;
19use Test::More;
20
21BEGIN { plan tests => 239 };
22
23use Unicode::UCD 'charinfo';
24
25my $charinfo;
26
27$charinfo = charinfo(0x41);
28
29is($charinfo->{code},           '0041', 'LATIN CAPITAL LETTER A');
30is($charinfo->{name},           'LATIN CAPITAL LETTER A');
31is($charinfo->{category},       'Lu');
32is($charinfo->{combining},      '0');
33is($charinfo->{bidi},           'L');
34is($charinfo->{decomposition},  '');
35is($charinfo->{decimal},        '');
36is($charinfo->{digit},          '');
37is($charinfo->{numeric},        '');
38is($charinfo->{mirrored},       'N');
39is($charinfo->{unicode10},      '');
40is($charinfo->{comment},        '');
41is($charinfo->{upper},          '');
42is($charinfo->{lower},          '0061');
43is($charinfo->{title},          '');
44is($charinfo->{block},          'Basic Latin');
45is($charinfo->{script},         'Latin');
46
47$charinfo = charinfo(0x100);
48
49is($charinfo->{code},           '0100', 'LATIN CAPITAL LETTER A WITH MACRON');
50is($charinfo->{name},           'LATIN CAPITAL LETTER A WITH MACRON');
51is($charinfo->{category},       'Lu');
52is($charinfo->{combining},      '0');
53is($charinfo->{bidi},           'L');
54is($charinfo->{decomposition},  '0041 0304');
55is($charinfo->{decimal},        '');
56is($charinfo->{digit},          '');
57is($charinfo->{numeric},        '');
58is($charinfo->{mirrored},       'N');
59is($charinfo->{unicode10},      'LATIN CAPITAL LETTER A MACRON');
60is($charinfo->{comment},        '');
61is($charinfo->{upper},          '');
62is($charinfo->{lower},          '0101');
63is($charinfo->{title},          '');
64is($charinfo->{block},          'Latin Extended-A');
65is($charinfo->{script},         'Latin');
66
67# 0x0590 is in the Hebrew block but unused.
68
69$charinfo = charinfo(0x590);
70
71is($charinfo->{code},          undef,	'0x0590 - unused Hebrew');
72is($charinfo->{name},          undef);
73is($charinfo->{category},      undef);
74is($charinfo->{combining},     undef);
75is($charinfo->{bidi},          undef);
76is($charinfo->{decomposition}, undef);
77is($charinfo->{decimal},       undef);
78is($charinfo->{digit},         undef);
79is($charinfo->{numeric},       undef);
80is($charinfo->{mirrored},      undef);
81is($charinfo->{unicode10},     undef);
82is($charinfo->{comment},       undef);
83is($charinfo->{upper},         undef);
84is($charinfo->{lower},         undef);
85is($charinfo->{title},         undef);
86is($charinfo->{block},         undef);
87is($charinfo->{script},        undef);
88
89# 0x05d0 is in the Hebrew block and used.
90
91$charinfo = charinfo(0x5d0);
92
93is($charinfo->{code},           '05D0', '05D0 - used Hebrew');
94is($charinfo->{name},           'HEBREW LETTER ALEF');
95is($charinfo->{category},       'Lo');
96is($charinfo->{combining},      '0');
97is($charinfo->{bidi},           'R');
98is($charinfo->{decomposition},  '');
99is($charinfo->{decimal},        '');
100is($charinfo->{digit},          '');
101is($charinfo->{numeric},        '');
102is($charinfo->{mirrored},       'N');
103is($charinfo->{unicode10},      '');
104is($charinfo->{comment},        '');
105is($charinfo->{upper},          '');
106is($charinfo->{lower},          '');
107is($charinfo->{title},          '');
108is($charinfo->{block},          'Hebrew');
109is($charinfo->{script},         'Hebrew');
110
111# An open syllable in Hangul.
112
113$charinfo = charinfo(0xAC00);
114
115is($charinfo->{code},           'AC00', 'HANGUL SYLLABLE-AC00');
116is($charinfo->{name},           'HANGUL SYLLABLE-AC00');
117is($charinfo->{category},       'Lo');
118is($charinfo->{combining},      '0');
119is($charinfo->{bidi},           'L');
120is($charinfo->{decomposition},  undef);
121is($charinfo->{decimal},        '');
122is($charinfo->{digit},          '');
123is($charinfo->{numeric},        '');
124is($charinfo->{mirrored},       'N');
125is($charinfo->{unicode10},      '');
126is($charinfo->{comment},        '');
127is($charinfo->{upper},          '');
128is($charinfo->{lower},          '');
129is($charinfo->{title},          '');
130is($charinfo->{block},          'Hangul Syllables');
131is($charinfo->{script},         'Hangul');
132
133# A closed syllable in Hangul.
134
135$charinfo = charinfo(0xAE00);
136
137is($charinfo->{code},           'AE00', 'HANGUL SYLLABLE-AE00');
138is($charinfo->{name},           'HANGUL SYLLABLE-AE00');
139is($charinfo->{category},       'Lo');
140is($charinfo->{combining},      '0');
141is($charinfo->{bidi},           'L');
142is($charinfo->{decomposition},  undef);
143is($charinfo->{decimal},        '');
144is($charinfo->{digit},          '');
145is($charinfo->{numeric},        '');
146is($charinfo->{mirrored},       'N');
147is($charinfo->{unicode10},      '');
148is($charinfo->{comment},        '');
149is($charinfo->{upper},          '');
150is($charinfo->{lower},          '');
151is($charinfo->{title},          '');
152is($charinfo->{block},          'Hangul Syllables');
153is($charinfo->{script},         'Hangul');
154
155$charinfo = charinfo(0x1D400);
156
157is($charinfo->{code},           '1D400', 'MATHEMATICAL BOLD CAPITAL A');
158is($charinfo->{name},           'MATHEMATICAL BOLD CAPITAL A');
159is($charinfo->{category},       'Lu');
160is($charinfo->{combining},      '0');
161is($charinfo->{bidi},           'L');
162is($charinfo->{decomposition},  '<font> 0041');
163is($charinfo->{decimal},        '');
164is($charinfo->{digit},          '');
165is($charinfo->{numeric},        '');
166is($charinfo->{mirrored},       'N');
167is($charinfo->{unicode10},      '');
168is($charinfo->{comment},        '');
169is($charinfo->{upper},          '');
170is($charinfo->{lower},          '');
171is($charinfo->{title},          '');
172is($charinfo->{block},          'Mathematical Alphanumeric Symbols');
173is($charinfo->{script},         'Common');
174
175$charinfo = charinfo(0x9FBA);	#Bug 58428
176
177is($charinfo->{code},           '9FBA', 'U+9FBA');
178is($charinfo->{name},           'CJK UNIFIED IDEOGRAPH-9FBA');
179is($charinfo->{category},       'Lo');
180is($charinfo->{combining},      '0');
181is($charinfo->{bidi},           'L');
182is($charinfo->{decomposition},  '');
183is($charinfo->{decimal},        '');
184is($charinfo->{digit},          '');
185is($charinfo->{numeric},        '');
186is($charinfo->{mirrored},       'N');
187is($charinfo->{unicode10},      '');
188is($charinfo->{comment},        '');
189is($charinfo->{upper},          '');
190is($charinfo->{lower},          '');
191is($charinfo->{title},          '');
192is($charinfo->{block},          'CJK Unified Ideographs');
193is($charinfo->{script},         'Han');
194
195use Unicode::UCD qw(charblock charscript);
196
197# 0x0590 is in the Hebrew block but unused.
198
199is(charblock(0x590),          'Hebrew', '0x0590 - Hebrew unused charblock');
200is(charscript(0x590),         undef,    '0x0590 - Hebrew unused charscript');
201
202$charinfo = charinfo(0xbe);
203
204is($charinfo->{code},           '00BE', 'VULGAR FRACTION THREE QUARTERS');
205is($charinfo->{name},           'VULGAR FRACTION THREE QUARTERS');
206is($charinfo->{category},       'No');
207is($charinfo->{combining},      '0');
208is($charinfo->{bidi},           'ON');
209is($charinfo->{decomposition},  '<fraction> 0033 2044 0034');
210is($charinfo->{decimal},        '');
211is($charinfo->{digit},          '');
212is($charinfo->{numeric},        '3/4');
213is($charinfo->{mirrored},       'N');
214is($charinfo->{unicode10},      'FRACTION THREE QUARTERS');
215is($charinfo->{comment},        '');
216is($charinfo->{upper},          '');
217is($charinfo->{lower},          '');
218is($charinfo->{title},          '');
219is($charinfo->{block},          'Latin-1 Supplement');
220is($charinfo->{script},         'Common');
221
222use Unicode::UCD qw(charblocks charscripts);
223
224my $charblocks = charblocks();
225
226ok(exists $charblocks->{Thai}, 'Thai charblock exists');
227is($charblocks->{Thai}->[0]->[0], hex('0e00'));
228ok(!exists $charblocks->{PigLatin}, 'PigLatin charblock does not exist');
229
230my $charscripts = charscripts();
231
232ok(exists $charscripts->{Armenian}, 'Armenian charscript exists');
233is($charscripts->{Armenian}->[0]->[0], hex('0531'));
234ok(!exists $charscripts->{PigLatin}, 'PigLatin charscript does not exist');
235
236my $charscript;
237
238$charscript = charscript("12ab");
239is($charscript, 'Ethiopic', 'Ethiopic charscript');
240
241$charscript = charscript("0x12ab");
242is($charscript, 'Ethiopic');
243
244$charscript = charscript("U+12ab");
245is($charscript, 'Ethiopic');
246
247my $ranges;
248
249$ranges = charscript('Ogham');
250is($ranges->[1]->[0], hex('1681'), 'Ogham charscript');
251is($ranges->[1]->[1], hex('169a'));
252
253use Unicode::UCD qw(charinrange);
254
255$ranges = charscript('Cherokee');
256ok(!charinrange($ranges, "139f"), 'Cherokee charscript');
257ok( charinrange($ranges, "13a0"));
258ok( charinrange($ranges, "13f4"));
259ok(!charinrange($ranges, "13f5"));
260
261use Unicode::UCD qw(general_categories);
262
263my $gc = general_categories();
264
265ok(exists $gc->{L}, 'has L');
266is($gc->{L}, 'Letter', 'L is Letter');
267is($gc->{Lu}, 'UppercaseLetter', 'Lu is UppercaseLetter');
268
269use Unicode::UCD qw(bidi_types);
270
271my $bt = bidi_types();
272
273ok(exists $bt->{L}, 'has L');
274is($bt->{L}, 'Left-to-Right', 'L is Left-to-Right');
275is($bt->{AL}, 'Right-to-Left Arabic', 'AL is Right-to-Left Arabic');
276
277# If this fails, then maybe one should look at the Unicode changes to see
278# what else might need to be updated.
279is(Unicode::UCD::UnicodeVersion, '5.1.0', 'UnicodeVersion');
280
281use Unicode::UCD qw(compexcl);
282
283ok(!compexcl(0x0100), 'compexcl');
284ok( compexcl(0x0958));
285
286use Unicode::UCD qw(casefold);
287
288my $casefold;
289
290$casefold = casefold(0x41);
291
292is($casefold->{code}, '0041', 'casefold 0x41 code');
293is($casefold->{status}, 'C', 'casefold 0x41 status');
294is($casefold->{mapping}, '0061', 'casefold 0x41 mapping');
295is($casefold->{full}, '0061', 'casefold 0x41 full');
296is($casefold->{simple}, '0061', 'casefold 0x41 simple');
297is($casefold->{turkic}, "", 'casefold 0x41 turkic');
298
299$casefold = casefold(0xdf);
300
301is($casefold->{code}, '00DF', 'casefold 0xDF code');
302is($casefold->{status}, 'F', 'casefold 0xDF status');
303is($casefold->{mapping}, '0073 0073', 'casefold 0xDF mapping');
304is($casefold->{full}, '0073 0073', 'casefold 0xDF full');
305is($casefold->{simple}, "", 'casefold 0xDF simple');
306is($casefold->{turkic}, "", 'casefold 0xDF turkic');
307
308# Do different tests depending on if version <= 3.1, or not.
309(my $version = Unicode::UCD::UnicodeVersion) =~ /^(\d+)\.(\d+)/;
310if (defined $1 && ($1 <= 2 || $1 == 3 && defined $2 && $2 <= 1)) {
311	$casefold = casefold(0x130);
312
313	is($casefold->{code}, '0130', 'casefold 0x130 code');
314	is($casefold->{status}, 'I' , 'casefold 0x130 status');
315	is($casefold->{mapping}, '0069', 'casefold 0x130 mapping');
316	is($casefold->{full}, '0069', 'casefold 0x130 full');
317	is($casefold->{simple}, "0069", 'casefold 0x130 simple');
318	is($casefold->{turkic}, "0069", 'casefold 0x130 turkic');
319
320	$casefold = casefold(0x131);
321
322	is($casefold->{code}, '0131', 'casefold 0x131 code');
323	is($casefold->{status}, 'I' , 'casefold 0x131 status');
324	is($casefold->{mapping}, '0069', 'casefold 0x131 mapping');
325	is($casefold->{full}, '0069', 'casefold 0x131 full');
326	is($casefold->{simple}, "0069", 'casefold 0x131 simple');
327	is($casefold->{turkic}, "0069", 'casefold 0x131 turkic');
328} else {
329	$casefold = casefold(0x49);
330
331	is($casefold->{code}, '0049', 'casefold 0x49 code');
332	is($casefold->{status}, 'C' , 'casefold 0x49 status');
333	is($casefold->{mapping}, '0069', 'casefold 0x49 mapping');
334	is($casefold->{full}, '0069', 'casefold 0x49 full');
335	is($casefold->{simple}, "0069", 'casefold 0x49 simple');
336	is($casefold->{turkic}, "0131", 'casefold 0x49 turkic');
337
338	$casefold = casefold(0x130);
339
340	is($casefold->{code}, '0130', 'casefold 0x130 code');
341	is($casefold->{status}, 'F' , 'casefold 0x130 status');
342	is($casefold->{mapping}, '0069 0307', 'casefold 0x130 mapping');
343	is($casefold->{full}, '0069 0307', 'casefold 0x130 full');
344	is($casefold->{simple}, "", 'casefold 0x130 simple');
345	is($casefold->{turkic}, "0069", 'casefold 0x130 turkic');
346}
347
348$casefold = casefold(0x1F88);
349
350is($casefold->{code}, '1F88', 'casefold 0x1F88 code');
351is($casefold->{status}, 'S' , 'casefold 0x1F88 status');
352is($casefold->{mapping}, '1F80', 'casefold 0x1F88 mapping');
353is($casefold->{full}, '1F00 03B9', 'casefold 0x1F88 full');
354is($casefold->{simple}, '1F80', 'casefold 0x1F88 simple');
355is($casefold->{turkic}, "", 'casefold 0x1F88 turkic');
356
357ok(!casefold(0x20));
358
359use Unicode::UCD qw(casespec);
360
361my $casespec;
362
363ok(!casespec(0x41));
364
365$casespec = casespec(0xdf);
366
367ok($casespec->{code} eq '00DF' &&
368   $casespec->{lower} eq '00DF'  &&
369   $casespec->{title} eq '0053 0073'  &&
370   $casespec->{upper} eq '0053 0053' &&
371   !defined $casespec->{condition}, 'casespec 0xDF');
372
373$casespec = casespec(0x307);
374
375ok($casespec->{az}->{code} eq '0307' &&
376   !defined $casespec->{az}->{lower} &&
377   $casespec->{az}->{title} eq '0307'  &&
378   $casespec->{az}->{upper} eq '0307' &&
379   $casespec->{az}->{condition} eq 'az After_I',
380   'casespec 0x307');
381
382# perl #7305 UnicodeCD::compexcl is weird
383
384for (1) {my $a=compexcl $_}
385ok(1, 'compexcl read-only $_: perl #7305');
386map {compexcl $_} %{{1=>2}};
387ok(1, 'compexcl read-only hash: perl #7305');
388
389is(Unicode::UCD::_getcode('123'),     123, "_getcode(123)");
390is(Unicode::UCD::_getcode('0123'),  0x123, "_getcode(0123)");
391is(Unicode::UCD::_getcode('0x123'), 0x123, "_getcode(0x123)");
392is(Unicode::UCD::_getcode('0X123'), 0x123, "_getcode(0X123)");
393is(Unicode::UCD::_getcode('U+123'), 0x123, "_getcode(U+123)");
394is(Unicode::UCD::_getcode('u+123'), 0x123, "_getcode(u+123)");
395is(Unicode::UCD::_getcode('U+1234'),   0x1234, "_getcode(U+1234)");
396is(Unicode::UCD::_getcode('U+12345'), 0x12345, "_getcode(U+12345)");
397is(Unicode::UCD::_getcode('123x'),    undef, "_getcode(123x)");
398is(Unicode::UCD::_getcode('x123'),    undef, "_getcode(x123)");
399is(Unicode::UCD::_getcode('0x123x'),  undef, "_getcode(x123)");
400is(Unicode::UCD::_getcode('U+123x'),  undef, "_getcode(x123)");
401
402{
403    my $r1 = charscript('Latin');
404    my $n1 = @$r1;
405    is($n1, 42, "number of ranges in Latin script (Unicode 5.1.0)");
406    shift @$r1 while @$r1;
407    my $r2 = charscript('Latin');
408    is(@$r2, $n1, "modifying results should not mess up internal caches");
409}
410
411{
412	is(charinfo(0xdeadbeef), undef, "[perl #23273] warnings in Unicode::UCD");
413}
414
415use Unicode::UCD qw(namedseq);
416
417is(namedseq("KATAKANA LETTER AINU P"), "\x{31F7}\x{309A}", "namedseq");
418is(namedseq("KATAKANA LETTER AINU Q"), undef);
419is(namedseq(), undef);
420is(namedseq(qw(foo bar)), undef);
421my @ns = namedseq("KATAKANA LETTER AINU P");
422is(scalar @ns, 2);
423is($ns[0], 0x31F7);
424is($ns[1], 0x309A);
425my %ns = namedseq();
426is($ns{"KATAKANA LETTER AINU P"}, "\x{31F7}\x{309A}");
427@ns = namedseq(42);
428is(@ns, 0);
429
430