1%%
2%% %CopyrightBegin%
3%%
4%% Copyright Ericsson AB 2008-2018. All Rights Reserved.
5%%
6%% Licensed under the Apache License, Version 2.0 (the "License");
7%% you may not use this file except in compliance with the License.
8%% You may obtain a copy of the License at
9%%
10%%     http://www.apache.org/licenses/LICENSE-2.0
11%%
12%% Unless required by applicable law or agreed to in writing, software
13%% distributed under the License is distributed on an "AS IS" BASIS,
14%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15%% See the License for the specific language governing permissions and
16%% limitations under the License.
17%%
18%% %CopyrightEnd%
19%%
20
21-module(bs_utf_SUITE).
22
23-export([all/0, suite/0,groups/0,init_per_suite/1, end_per_suite/1,
24	 init_per_group/2,end_per_group/2,
25	 utf8_roundtrip/1,unused_utf_char/1,utf16_roundtrip/1,
26	 utf32_roundtrip/1,guard/1,extreme_tripping/1,
27	 literals/1,coverage/1]).
28
29-include_lib("common_test/include/ct.hrl").
30
%% Suite-level Common Test info: installs the ts_install_cth hook.
suite() ->
    Hooks = [ts_install_cth],
    [{ct_hooks,Hooks}].
32
%% Returns the names of all test cases in this suite, in order.
all() ->
    Roundtrips = [utf8_roundtrip, unused_utf_char,
                  utf16_roundtrip, utf32_roundtrip],
    Others = [guard, extreme_tripping, literals, coverage],
    Roundtrips ++ Others.
37
%% This suite defines no test case groups.
groups() -> [].
40
%% Suite setup: calls test_lib:recompile/1 on this module and hands
%% the configuration back unchanged.
init_per_suite(Cfg) ->
    test_lib:recompile(?MODULE),
    Cfg.
44
%% Suite teardown: nothing to clean up.
end_per_suite(_) -> ok.
47
%% Group setup: returns the configuration unchanged.
init_per_group(_Group, Cfg) -> Cfg.
50
%% Group teardown: returns the configuration unchanged.
end_per_group(_Group, Cfg) -> Cfg.
53
54
%% Test case: runs the UTF-8 round-trip checks on every entry
%% returned by utf_data/0.
utf8_roundtrip(Config) when is_list(Config) ->
    lists:foreach(fun utf8_roundtrip_1/1, utf_data()),
    ok.
58
%% Checks one {Chars,Utf8,Utf8} entry: the two binaries must be equal,
%% decoding the binary must give Chars, and encoding Chars must give
%% the binary back. Every character is also run through utf8_guard/2,
%% once with a binary that has the expected 42 prefix (must give ok)
%% and once with a binary that lacks it (must give error).
utf8_roundtrip_1({Chars,Utf8,Utf8}) ->
    Chars = utf8_to_list(Utf8),
    Utf8 = list_to_utf8(Chars),
    WithPrefix = fun(Char) -> ok = utf8_guard(Char, <<42,Char/utf8>>) end,
    WithoutPrefix = fun(Char) -> error = utf8_guard(Char, <<Char/utf8>>) end,
    lists:foreach(WithPrefix, Chars),
    lists:foreach(WithoutPrefix, Chars),
    ok.
65
%% Returns ok when Encoded is exactly the byte 42 followed by the
%% UTF-8 encoding of Char; otherwise error. The binary is constructed
%% inside the guard, so a Char that cannot be encoded makes the guard
%% fail and also yields error.
utf8_guard(Char, Encoded) when <<42,Char/utf8>> =:= Encoded ->
    ok;
utf8_guard(_Char, _Encoded) ->
    error.
68
%% Decodes a UTF-8 binary into a list of code points. Fails with
%% function_clause if the binary is not valid UTF-8.
utf8_to_list(<<>>) ->
    [];
utf8_to_list(<<Char/utf8,Rest/binary>>) ->
    [Char|utf8_to_list(Rest)].
72
%% Encodes a list of code points as a UTF-8 binary.
list_to_utf8(Chars) ->
    list_to_utf8(Chars, <<>>).

%% Accumulator loop: appends the UTF-8 encoding of each code point.
list_to_utf8([], Acc) ->
    Acc;
list_to_utf8([Char|Rest], Acc) ->
    list_to_utf8(Rest, <<Acc/binary,Char/utf8>>).
79
%% Test case: matches out UTF-8 characters without using their value
%% (<<_/utf8,...>> in utf8_len/2) and checks that the number of
%% characters found equals the length of the reference string.
%%
%% utf_data/0 returns 3-tuples {Str,Utf8,ExpectedBytes}. The entries
%% are destructured in a fun head rather than in a list comprehension
%% generator, because a generator pattern that does not match silently
%% skips the element (a 2-tuple pattern would make this test vacuous),
%% whereas a fun head that does not match fails with function_clause.
unused_utf_char(Config) when is_list(Config) ->
    lists:foreach(fun({Str,Utf8,_ExpectedBytes}) ->
                          true = utf8_len(Utf8) =:= length(Str)
                  end, utf_data()),
    ok.
84
%% Counts the characters in a UTF-8 binary.
utf8_len(Utf8) ->
    utf8_len(Utf8, 0).

%% Counting loop: each character is matched out but its value is
%% deliberately left unused.
utf8_len(<<>>, Count) ->
    Count;
utf8_len(<<_/utf8,Rest/binary>>, Count) ->
    utf8_len(Rest, Count+1).
91
%% Test case: UTF-16 round trip in both byte orders, using the data
%% from utf16_data/0. The head match also asserts that each pair of
%% binaries (constructed with utf16 segments vs. written out byte by
%% byte) is equal.
utf16_roundtrip(Config) when is_list(Config) ->
    {Chars,BigEndian,BigEndian,LittleEndian,LittleEndian} = utf16_data(),

    %% Character counts.
    4 = utf16_big_len(BigEndian),
    4 = utf16_little_len(LittleEndian),

    %% Decoding.
    Chars = big_utf16_to_list(BigEndian),
    Chars = little_utf16_to_list(LittleEndian),

    %% Encoding.
    BigEndian = list_to_big_utf16(Chars),
    LittleEndian = list_to_little_utf16(Chars),

    ok.
103
%% Counts the characters in a big-endian UTF-16 binary.
utf16_big_len(Utf16) ->
    utf16_big_len(Utf16, 0).

%% Counting loop; the matched character value is not used.
utf16_big_len(<<>>, Count) ->
    Count;
utf16_big_len(<<_/utf16,Rest/binary>>, Count) ->
    utf16_big_len(Rest, Count+1).
110
%% Counts the characters in a little-endian UTF-16 binary.
utf16_little_len(Utf16) ->
    utf16_little_len(Utf16, 0).

%% Counting loop; the matched character value is not used.
utf16_little_len(<<>>, Count) ->
    Count;
utf16_little_len(<<_/little-utf16,Rest/binary>>, Count) ->
    utf16_little_len(Rest, Count+1).
117
%% Encodes a list of code points as a big-endian UTF-16 binary.
list_to_big_utf16(Chars) ->
    list_to_big_utf16(Chars, <<>>).

%% Accumulator loop: appends one encoded character per step.
list_to_big_utf16([], Acc) ->
    Acc;
list_to_big_utf16([Char|Rest], Acc) ->
    list_to_big_utf16(Rest, <<Acc/binary,Char/utf16>>).
124
%% Encodes a list of code points as a little-endian UTF-16 binary.
list_to_little_utf16(Chars) ->
    list_to_little_utf16(Chars, <<>>).

%% Accumulator loop: appends one encoded character per step.
list_to_little_utf16([], Acc) ->
    Acc;
list_to_little_utf16([Char|Rest], Acc) ->
    list_to_little_utf16(Rest, <<Acc/binary,Char/little-utf16>>).
131
%% Decodes a big-endian UTF-16 binary into a list of code points.
big_utf16_to_list(<<>>) ->
    [];
big_utf16_to_list(<<Char/utf16,Rest/binary>>) ->
    [Char|big_utf16_to_list(Rest)].
135
%% Decodes a little-endian UTF-16 binary into a list of code points.
little_utf16_to_list(<<>>) ->
    [];
little_utf16_to_list(<<Char/little-utf16,Rest/binary>>) ->
    [Char|little_utf16_to_list(Rest)].
139
%% Test case: UTF-32 round trip in both byte orders, using the data
%% from utf32_data/0. The head match also asserts that each pair of
%% binaries (constructed with utf32 segments vs. with 32-bit integer
%% segments) is equal.
utf32_roundtrip(Config) when is_list(Config) ->
    {Chars,BigEndian,BigEndian,LittleEndian,LittleEndian} = utf32_data(),

    %% Character counts.
    4 = utf32_big_len(BigEndian),
    4 = utf32_little_len(LittleEndian),

    %% Decoding.
    Chars = big_utf32_to_list(BigEndian),
    Chars = little_utf32_to_list(LittleEndian),

    %% Encoding.
    BigEndian = list_to_big_utf32(Chars),
    LittleEndian = list_to_little_utf32(Chars),

    ok.
151
%% Counts the characters in a big-endian UTF-32 binary.
utf32_big_len(Utf32) ->
    utf32_big_len(Utf32, 0).

%% Counting loop; the matched character value is not used.
utf32_big_len(<<>>, Count) ->
    Count;
utf32_big_len(<<_/utf32,Rest/binary>>, Count) ->
    utf32_big_len(Rest, Count+1).
158
%% Counts the characters in a little-endian UTF-32 binary.
utf32_little_len(Utf32) ->
    utf32_little_len(Utf32, 0).

%% Counting loop; the matched character value is not used.
utf32_little_len(<<>>, Count) ->
    Count;
utf32_little_len(<<_/little-utf32,Rest/binary>>, Count) ->
    utf32_little_len(Rest, Count+1).
165
%% Encodes a list of code points as a big-endian UTF-32 binary.
list_to_big_utf32(Chars) ->
    list_to_big_utf32(Chars, <<>>).

%% Accumulator loop: appends one encoded character per step.
list_to_big_utf32([], Acc) ->
    Acc;
list_to_big_utf32([Char|Rest], Acc) ->
    list_to_big_utf32(Rest, <<Acc/binary,Char/utf32>>).
172
%% Encodes a list of code points as a little-endian UTF-32 binary.
list_to_little_utf32(Chars) ->
    list_to_little_utf32(Chars, <<>>).

%% Accumulator loop: appends one encoded character per step.
list_to_little_utf32([], Acc) ->
    Acc;
list_to_little_utf32([Char|Rest], Acc) ->
    list_to_little_utf32(Rest, <<Acc/binary,Char/little-utf32>>).
179
%% Decodes a big-endian UTF-32 binary into a list of code points.
big_utf32_to_list(<<>>) ->
    [];
big_utf32_to_list(<<Char/utf32,Rest/binary>>) ->
    [Char|big_utf32_to_list(Rest)].
183
%% Decodes a little-endian UTF-32 binary into a list of code points.
little_utf32_to_list(<<>>) ->
    [];
little_utf32_to_list(<<Char/little-utf32,Rest/binary>>) ->
    [Char|little_utf32_to_list(Rest)].
187
188
%% Test case: 16#D800 must be rejected by every UTF construction
%% attempted in the guards of do_guard/1, so the catch-all clause
%% (returning error) is the one expected to be selected.
guard(Config) when is_list(Config) ->
    Surrogate = 16#D800,
    error = do_guard(Surrogate),
    ok.
192
%% Tries to encode Char as UTF-8, UTF-16 and UTF-32 inside guards.
%% A successful construction can never be 42 bytes long, so the first
%% encoding that succeeds returns ok. A failed construction makes its
%% guard fail; error is returned only when all three fail.
do_guard(Char) when byte_size(<<Char/utf8>>) =/= 42 ->
    ok;
do_guard(Char) when byte_size(<<Char/utf16>>) =/= 42 ->
    ok;
do_guard(Char) when byte_size(<<Char/utf32>>) =/= 42 ->
    ok;
do_guard(_Char) ->
    error.
197
%% Test case whose purpose is to make sure that the delayed creation
%% of sub-binaries works. The code points 0..1024 are converted
%% between the UTF-8, UTF-16 and UTF-32 encodings along several
%% routes, and every route must arrive at the same binaries and
%% finally back at the original list.

extreme_tripping(Config) when is_list(Config) ->
    CodePoints = lists:seq(0, 1024),
    AsUtf8 = unicode_to_utf8(CodePoints, <<>>),
    AsUtf16 = utf8_to_utf16(AsUtf8, <<>>),
    AsUtf32 = utf8_to_utf32(AsUtf8, <<>>),
    %% Going through UTF-16 must give the same UTF-32 binary.
    AsUtf32 = utf16_to_utf32(AsUtf16, <<>>),
    %% ...and the way back must reproduce the inputs.
    AsUtf8 = utf32_to_utf8(AsUtf32, <<>>),
    CodePoints = utf32_to_unicode(AsUtf32),
    ok.
210
%% Appends the UTF-8 encoding of each code point in the list to Acc.
unicode_to_utf8([Char|Rest], Acc) ->
    Acc1 = <<Acc/bytes,Char/utf8>>,
    unicode_to_utf8(Rest, Acc1);
unicode_to_utf8([], Acc) ->
    Acc.
214
%% Re-encodes a UTF-8 binary as big-endian UTF-16, appending to Acc.
utf8_to_utf16(<<Char/utf8,Rest/binary>>, Acc) ->
    Acc1 = <<Acc/bytes,Char/utf16>>,
    utf8_to_utf16(Rest, Acc1);
utf8_to_utf16(<<>>, Acc) ->
    Acc.
218
%% Re-encodes a big-endian UTF-16 binary as big-endian UTF-32,
%% appending to Acc.
utf16_to_utf32(<<Char/utf16,Rest/binary>>, Acc) ->
    Acc1 = <<Acc/bytes,Char/utf32>>,
    utf16_to_utf32(Rest, Acc1);
utf16_to_utf32(<<>>, Acc) ->
    Acc.
222
%% Re-encodes a UTF-8 binary as big-endian UTF-32, appending to Acc.
utf8_to_utf32(<<Char/utf8,Rest/binary>>, Acc) ->
    Acc1 = <<Acc/bytes,Char/utf32>>,
    utf8_to_utf32(Rest, Acc1);
utf8_to_utf32(<<>>, Acc) ->
    Acc.
226
%% Re-encodes a big-endian UTF-32 binary as UTF-8, appending to Acc.
utf32_to_utf8(<<Char/utf32,Rest/binary>>, Acc) ->
    Acc1 = <<Acc/bytes,Char/utf8>>,
    utf32_to_utf8(Rest, Acc1);
utf32_to_utf8(<<>>, Acc) ->
    Acc.
230
%% Decodes a big-endian UTF-32 binary into a list of code points.
utf32_to_unicode(<<Char/utf32,Rest/binary>>) ->
    Tail = utf32_to_unicode(Rest),
    [Char|Tail];
utf32_to_unicode(<<>>) ->
    [].
234
%% Test case: UTF segments whose values are literals (strings and
%% integers), both when constructing binaries and when matching.
literals(Config) when is_list(Config) ->
    %% "abc" in UTF-8, written three different ways.
    abc_utf8 = match_literal(<<"abc"/utf8>>),
    abc_utf8 = match_literal(<<$a,$b,$c>>),
    abc_utf8 = match_literal(<<$a/utf8,$b/utf8,$c/utf8>>),

    %% "abc" in UTF-16, both byte orders.
    abc_utf16be = match_literal(<<"abc"/utf16>>),
    abc_utf16be = match_literal(<<$a:16,$b:16,$c:16>>),
    abc_utf16le = match_literal(<<"abc"/little-utf16>>),
    abc_utf16le = match_literal(<<$a:16/little,$b:16/little,$c:16/little>>),

    %% "abc" in UTF-32, both byte orders.
    abc_utf32be = match_literal(<<"abc"/utf32>>),
    abc_utf32be = match_literal(<<$a:32,$b:32,$c:32>>),
    abc_utf32le = match_literal(<<"abc"/little-utf32>>),
    abc_utf32le = match_literal(<<$a:32/little,$b:32/little,$c:32/little>>),

    %% A string with a character above 127 (\366 is code point 246).
    bjorn_utf8 = match_literal(<<"bj\366rn"/utf8>>),
    bjorn_utf8 = match_literal(<<$b,$j,195,182,$r,$n>>),

    bjorn_utf16be = match_literal(<<"bj\366rn"/utf16>>),
    bjorn_utf16be = match_literal(<<$b:16,$j:16,246:16,$r:16,$n:16>>),
    bjorn_utf16le = match_literal(<<"bj\366rn"/little-utf16>>),
    bjorn_utf16le = match_literal(<<$b:16/little,$j:16/little,
                                    246:16/little,$r:16/little,
                                    $n:16/little>>),

    %% The largest valid code point.
    <<244,143,191,191>> = <<16#10ffff/utf8>>,

    %% Invalid literals: negative values and 16#D800 must fail with
    %% badarg. Each binary also has a segment built from a variable.
    Zero = 0,
    {'EXIT',{badarg,_}} = (catch <<(-1)/utf8,Zero/utf8>>),
    {'EXIT',{badarg,_}} = (catch <<(-1)/utf16,Zero/utf8>>),
    {'EXIT',{badarg,_}} = (catch <<(-1)/little-utf16,Zero/utf8>>),
    {'EXIT',{badarg,_}} = (catch <<(-1)/utf32,Zero/utf8>>),
    {'EXIT',{badarg,_}} = (catch <<(-1)/little-utf32,Zero/utf8>>),
    {'EXIT',{badarg,_}} = (catch <<16#D800/utf8,Zero/utf8>>),
    {'EXIT',{badarg,_}} = (catch <<16#D800/utf16,Zero/utf8>>),
    {'EXIT',{badarg,_}} = (catch <<16#D800/little-utf16,Zero/utf8>>),
    {'EXIT',{badarg,_}} = (catch <<16#D800/utf32,Zero/utf8>>),
    {'EXIT',{badarg,_}} = (catch <<16#D800/little-utf32,Zero/utf8>>),

    %% One past the largest valid code point.
    TooBig = 16#10FFFF+1,
    {'EXIT',{badarg,_}} = (catch <<TooBig/utf8>>),
    {'EXIT',{badarg,_}} = (catch <<TooBig/utf16>>),
    {'EXIT',{badarg,_}} = (catch <<TooBig/little-utf16>>),
    {'EXIT',{badarg,_}} = (catch <<TooBig/utf32>>),
    {'EXIT',{badarg,_}} = (catch <<TooBig/little-utf32>>),

    %% Matching of bad literals.
    error = bad_literal_match(<<237,160,128>>),     %16#D800 in UTF-8
    error = bad_literal_match(<<244,144,128,128>>), %16#110000 in UTF-8

    error = bad_literal_match(<<16#D800:32>>),
    error = bad_literal_match(<<16#110000:32>>),
    error = bad_literal_match(<<16#D800:32/little>>),
    error = bad_literal_match(<<16#110000:32/little>>),

    ok.
291
%% Classifies a binary by matching it against string literals with
%% UTF type specifiers; the returned atom names the string and the
%% encoding that matched. Any other binary fails with function_clause.
match_literal(<<"abc"/utf8>>) ->
    abc_utf8;
match_literal(<<"abc"/big-utf16>>) ->
    abc_utf16be;
match_literal(<<"abc"/little-utf16>>) ->
    abc_utf16le;
match_literal(<<"abc"/big-utf32>>) ->
    abc_utf32be;
match_literal(<<"abc"/little-utf32>>) ->
    abc_utf32le;
match_literal(<<"bj\366rn"/utf8>>) ->
    bjorn_utf8;
match_literal(<<"bj\366rn"/big-utf16>>) ->
    bjorn_utf16be;
match_literal(<<"bj\366rn"/little-utf16>>) ->
    bjorn_utf16le.
300
%% Every pattern except the last uses an invalid code point (16#D800
%% or 16#110000) as a UTF literal. The literals/1 test case expects
%% error for all of its inputs, i.e. none of those patterns may match.
bad_literal_match(<<16#D800/utf8>>) ->
    ok;
bad_literal_match(<<16#110000/utf8>>) ->
    ok;
bad_literal_match(<<16#D800/utf32>>) ->
    ok;
bad_literal_match(<<16#110000/utf32>>) ->
    ok;
bad_literal_match(<<16#D800/little-utf32>>) ->
    ok;
bad_literal_match(<<16#110000/little-utf32>>) ->
    ok;
bad_literal_match(_Other) ->
    error.
308
%% Test case: small functions written to cover specific compiler
%% passes.
coverage(Config) when is_list(Config) ->
    %% Cover bit syntax matching optimizations in v3_kernel.
    0 = coverage_1(<<4096/utf8,65536/utf8,0>>),
    1 = coverage_1(<<4096/utf8,65536/utf8,1>>),

    0 = coverage_2(<<4096/utf8,65536/utf8,0>>),
    1 = coverage_2(<<1024/utf8,1025/utf8,1>>),

    %% coverage_3/1 has no clause that matches an integer.
    NoMatch = (catch coverage_3(1)),
    fc(NoMatch),

    %% Cover beam_flatten (combining the heap allocation in
    %% a subsequent test_heap instruction into the bs_init2
    %% instruction).
    {ok,<<533/utf8>>} = cover_test_heap_utf8(533),
    {ok,<<1024/utf16>>} = cover_test_heap_utf16(1024),
    {ok,<<7966/utf32>>} = cover_test_heap_utf32(7966),

    ok.
327
%% Two clauses that share the same two UTF-8 literals and differ only
%% in the final byte, which is also the return value.
coverage_1(<<4096/utf8,65536/utf8,0>>) ->
    0;
coverage_1(<<4096/utf8,65536/utf8,1>>) ->
    1.
330
%% Two clauses whose UTF-8 literals differ from the first segment on;
%% the final byte is also the return value.
coverage_2(<<4096/utf8,65536/utf8,0>>) ->
    0;
coverage_2(<<1024/utf8,1025/utf8,1>>) ->
    1.
333
%% Single clause whose first segment is the literal 16#7fffffff,
%% which is above the largest code point accepted by the construction
%% checks in literals/1. The coverage/1 test case calls this with an
%% integer and expects the call to fail.
coverage_3(<<16#7fffffff/utf8,65536/utf8,0>>) ->
    0.
335
%% Builds a UTF-8 binary from Char and wraps it in an ok tuple.
cover_test_heap_utf8(Char) ->
    {ok,<<Char/utf8>>}.
%% Builds a big-endian UTF-16 binary from Char and wraps it in an
%% ok tuple.
cover_test_heap_utf16(Char) ->
    {ok,<<Char/utf16>>}.
%% Builds a big-endian UTF-32 binary from Char and wraps it in an
%% ok tuple.
cover_test_heap_utf32(Char) ->
    {ok,<<Char/utf32>>}.
339
%% UTF-8 test data, from RFC-3629. Returns a list of 3-tuples:
%% the string as a list of code points, the same string built with
%% utf8 segments, and the expected encoding written out byte by byte.
utf_data() ->
    %% Give the compiler a chance to do some constant propagation.
    NotIdentical = 16#2262,

    %% "A<NOT IDENTICAL TO><ALPHA>."
    Symbols =
        {[16#0041,NotIdentical,16#0391,16#002E],
         <<16#0041/utf8,NotIdentical/utf8,16#0391/utf8,16#002E/utf8>>,
         <<16#41,16#E2,16#89,16#A2,16#CE,16#91,16#2E>>},

    %% Korean "hangugeo" (meaning "the Korean language")
    Korean =
        {[16#D55C,16#AD6D,16#C5B4],
         <<16#D55C/utf8,16#AD6D/utf8,16#C5B4/utf8>>,
         <<16#ED,16#95,16#9C,16#EA,16#B5,16#AD,16#EC,16#96,16#B4>>},

    %% Japanese "nihongo" (meaning "the Japanese language").
    Japanese =
        {[16#65E5,16#672C,16#8A9E],
         <<16#65E5/utf8,16#672C/utf8,16#8A9E/utf8>>,
         <<16#E6,16#97,16#A5,16#E6,16#9C,16#AC,16#E8,16#AA,16#9E>>},

    [Symbols,Korean,Japanese].
362
%% UTF-16 test data. Example from RFC-2781: "*=Ra", where "*"
%% represents a hypothetical Ra hieroglyph (code point 16#12345).
%% Returns {Chars,BigBuilt,BigBytes,LittleBuilt,LittleBytes}.
utf16_data() ->
    %% Give the compiler a chance to do some constant propagation.
    RaHieroglyph = 16#12345,

    %% First as a list of Unicode characters.
    Chars = [RaHieroglyph,16#3D,16#52,16#61],

    %% Big endian (the two binaries should be equal).
    BigBuilt = <<RaHieroglyph/big-utf16,16#3D/big-utf16,
                 16#52/big-utf16,16#61/big-utf16>>,
    BigBytes = <<16#D8,16#08,16#DF,16#45,16#00,16#3D,16#00,16#52,16#00,16#61>>,

    %% Little endian (the two binaries should be equal).
    LittleBuilt = <<RaHieroglyph/little-utf16,16#3D/little-utf16,
                    16#52/little-utf16,16#61/little-utf16>>,
    LittleBytes = <<16#08,16#D8,16#45,16#DF,16#3D,16#00,16#52,16#00,16#61,16#00>>,

    {Chars,BigBuilt,BigBytes,LittleBuilt,LittleBytes}.
381
%% UTF-32 test data for the string "A<NOT IDENTICAL TO><ALPHA>.".
%% Returns {Chars,BigBuilt,BigInts,LittleBuilt,LittleInts}, where the
%% "Built" binaries use utf32 segments and the "Ints" binaries use
%% plain 32-bit integer segments.
utf32_data() ->
    NotIdentical = 16#2262,
    Chars = [16#0041,NotIdentical,16#0391,16#002E],

    %% Big endian.
    BigBuilt = <<16#0041/utf32,NotIdentical/utf32,16#0391/utf32,16#002E/utf32>>,
    BigInts = <<16#41:32,NotIdentical:32,16#0391:32,16#2E:32>>,

    %% Little endian.
    LittleBuilt = <<16#0041/little-utf32,NotIdentical/little-utf32,
                    16#0391/little-utf32,16#002E/little-utf32>>,
    LittleInts = <<16#41:32/little,NotIdentical:32/little,
                   16#0391:32/little,16#2E:32/little>>,

    {Chars,BigBuilt,BigInts,LittleBuilt,LittleInts}.
396
%% Asserts that a caught result is a function_clause failure.
%% A case_clause failure is accepted as well, but only when this
%% module is compiled under the name bs_utf_inline_SUITE.
fc({'EXIT',{function_clause,_Stack}}) ->
    ok;
fc({'EXIT',{{case_clause,_Value},_Stack}})
  when ?MODULE =:= bs_utf_inline_SUITE ->
    ok.
399