1%% -*- erlang-indent-level: 2 -*-
2%%-------------------------------------------------------------------
3%% Purpose: test support for UTF datatypes in binaries
4%%
5%% Most of it taken from emulator/test/bs_utf_SUITE.erl
6%%-------------------------------------------------------------------
7
8-module(bs_utf).
9
10-export([test/0]).
11
12-include_lib("common_test/include/ct.hrl").
13
14test() ->
15  ok = utf8_cm65(),
16  ok = utf8_roundtrip(),
17  ok = utf16_roundtrip(),
18  ok = utf32_roundtrip(),
19  %% The following were problematic for the LLVM backend
20  ok = utf8_illegal_sequences(),
21  ok = utf16_illegal_sequences(),
22  ok = utf32_illegal_sequences(),
23  ok.
24
25%%-------------------------------------------------------------------
26%% A test with construction and matching
27
28utf8_cm65() ->
29  <<65>> = b65utf8(),
30  ok = m(<<65>>).
31
32b65utf8() ->
33  <<65/utf8>>.
34
35m(<<65/utf8>>) ->
36  ok.
37
38%%-------------------------------------------------------------------
39
40utf8_roundtrip() ->
41  ok = utf8_roundtrip(0, 16#D7FF),
42  ok = utf8_roundtrip(16#E000, 16#10FFFF),
43  ok.
44
45utf8_roundtrip(First, Last) when First =< Last ->
46  Bin = int_to_utf8(First),
47  Bin = id(<<First/utf8>>),
48  Bin = id(<<(id(<<>>))/binary,First/utf8>>),
49  Unaligned = id(<<3:2,First/utf8>>),
50  <<_:2,Bin/binary>> = Unaligned,
51  <<First/utf8>> = Bin,
52  <<First/utf8>> = make_unaligned(Bin),
53  utf8_roundtrip(First+1, Last);
54utf8_roundtrip(_, _) ->
55  ok.
56
57%%-------------------------------------------------------------------
58
59utf16_roundtrip() ->
60  Big = fun utf16_big_roundtrip/1,
61  Little = fun utf16_little_roundtrip/1,
62  PidRefs = [spawn_monitor(fun() -> do_utf16_roundtrip(Fun) end) ||
63	      Fun <- [Big,Little]],
64  [receive {'DOWN', Ref, process, Pid, Reason} -> normal=Reason end ||
65    {Pid, Ref} <- PidRefs],
66  ok.
67
68do_utf16_roundtrip(Fun) ->
69  do_utf16_roundtrip(0, 16#D7FF, Fun),
70  do_utf16_roundtrip(16#E000, 16#10FFFF, Fun).
71
72do_utf16_roundtrip(First, Last, Fun) when First =< Last ->
73  Fun(First),
74  do_utf16_roundtrip(First+1, Last, Fun);
75do_utf16_roundtrip(_, _, _) -> ok.
76
77utf16_big_roundtrip(Char) ->
78  Bin = id(<<Char/utf16>>),
79  Bin = id(<<(id(<<>>))/binary,Char/utf16>>),
80  Unaligned = id(<<3:2,Char/utf16>>),
81  <<_:2,Bin/binary>> = Unaligned,
82  <<Char/utf16>> = Bin,
83  <<Char/utf16>> = make_unaligned(Bin),
84  ok.
85
86utf16_little_roundtrip(Char) ->
87  Bin = id(<<Char/little-utf16>>),
88  Bin = id(<<(id(<<>>))/binary,Char/little-utf16>>),
89  Unaligned = id(<<3:2,Char/little-utf16>>),
90  <<_:2,Bin/binary>> = Unaligned,
91  <<Char/little-utf16>> = Bin,
92  <<Char/little-utf16>> = make_unaligned(Bin),
93  ok.
94
95%%-------------------------------------------------------------------
96
97utf32_roundtrip() ->
98  Big = fun utf32_big_roundtrip/1,
99  Little = fun utf32_little_roundtrip/1,
100  PidRefs = [spawn_monitor(fun() -> do_utf32_roundtrip(Fun) end) ||
101	      Fun <- [Big,Little]],
102  [receive {'DOWN', Ref, process, Pid, Reason} -> normal=Reason end ||
103    {Pid, Ref} <- PidRefs],
104  ok.
105
106do_utf32_roundtrip(Fun) ->
107  do_utf32_roundtrip(0, 16#D7FF, Fun),
108  do_utf32_roundtrip(16#E000, 16#10FFFF, Fun).
109
110do_utf32_roundtrip(First, Last, Fun) when First =< Last ->
111  Fun(First),
112  do_utf32_roundtrip(First+1, Last, Fun);
113do_utf32_roundtrip(_, _, _) -> ok.
114
115utf32_big_roundtrip(Char) ->
116  Bin = id(<<Char/utf32>>),
117  Bin = id(<<(id(<<>>))/binary,Char/utf32>>),
118  Unaligned = id(<<3:2,Char/utf32>>),
119  <<_:2,Bin/binary>> = Unaligned,
120  <<Char/utf32>> = Bin,
121  <<Char/utf32>> = make_unaligned(Bin),
122  ok.
123
124utf32_little_roundtrip(Char) ->
125  Bin = id(<<Char/little-utf32>>),
126  Bin = id(<<(id(<<>>))/binary,Char/little-utf32>>),
127  Unaligned = id(<<3:2,Char/little-utf32>>),
128  <<_:2,Bin/binary>> = Unaligned,
129  <<Char/little-utf32>> = Bin,
130  <<Char/little-utf32>> = make_unaligned(Bin),
131  ok.
132
133%%-------------------------------------------------------------------
134
135utf8_illegal_sequences() ->
136  fail_range(16#10FFFF+1, 16#10FFFF+512), % Too large.
137  fail_range(16#D800, 16#DFFF),	    % Reserved for UTF-16.
138
139  %% Illegal first character.
140  [fail(<<I,16#8F,16#8F,16#8F>>) || I <- lists:seq(16#80, 16#BF)],
141
142  %% Short sequences.
143  short_sequences(16#80, 16#10FFFF),
144
145  %% Overlong sequences. (Using more bytes than necessary
146  %% is not allowed.)
147  overlong(0, 127, 2),
148  overlong(128, 16#7FF, 3),
149  overlong(16#800, 16#FFFF, 4),
150  ok.
151
152fail_range(Char, End) when Char =< End ->
153  {'EXIT', _} = (catch <<Char/utf8>>),
154  Bin = int_to_utf8(Char),
155  fail(Bin),
156  fail_range(Char+1, End);
157fail_range(_, _) -> ok.
158
159short_sequences(Char, End) ->
160  Step = (End - Char) div erlang:system_info(schedulers) + 1,
161  PidRefs = short_sequences_1(Char, Step, End),
162  [receive {'DOWN', Ref, process, Pid, Reason} -> normal=Reason end ||
163    {Pid, Ref} <- PidRefs],
164  ok.
165
166short_sequences_1(Char, Step, End) when Char =< End ->
167  CharEnd = lists:min([Char+Step-1,End]),
168  [spawn_monitor(fun() ->
169		     %% io:format("~p - ~p\n", [Char, CharEnd]),
170		     do_short_sequences(Char, CharEnd)
171		 end)|short_sequences_1(Char+Step, Step, End)];
172short_sequences_1(_, _, _) -> [].
173
174do_short_sequences(Char, End) when Char =< End ->
175  short_sequence(Char),
176  do_short_sequences(Char+1, End);
177do_short_sequences(_, _) -> ok.
178
179short_sequence(I) ->
180  case int_to_utf8(I) of
181    <<S0:3/binary,_:8>> ->
182      <<S1:2/binary,R1:8>> = S0,
183      <<S2:1/binary,_:8>> = S1,
184      fail(S0),
185      fail(S1),
186      fail(S2),
187      fail(<<S2/binary,16#7F,R1,R1>>),
188      fail(<<S1/binary,16#7F,R1>>),
189      fail(<<S0/binary,16#7F>>);
190    <<S0:2/binary,_:8>> ->
191      <<S1:1/binary,R1:8>> = S0,
192      fail(S0),
193      fail(S1),
194      fail(<<S0/binary,16#7F>>),
195      fail(<<S1/binary,16#7F>>),
196      fail(<<S1/binary,16#7F,R1>>);
197    <<S:1/binary,_:8>> ->
198      fail(S),
199      fail(<<S/binary,16#7F>>)
200  end.
201
202overlong(Char, Last, NumBytes) when Char =< Last ->
203  overlong(Char, NumBytes),
204  overlong(Char+1, Last, NumBytes);
205overlong(_, _, _) -> ok.
206
207overlong(Char, NumBytes) when NumBytes < 5 ->
208  case int_to_utf8(Char, NumBytes) of
209    <<Char/utf8>>=Bin ->
210      ?t:fail({illegal_encoding_accepted,Bin,Char});
211    <<OtherChar/utf8>>=Bin ->
212      ?t:fail({illegal_encoding_accepted,Bin,Char,OtherChar});
213    _ -> ok
214  end,
215  overlong(Char, NumBytes+1);
216overlong(_, _) -> ok.
217
218fail(Bin) ->
219  fail_1(Bin),
220  fail_1(make_unaligned(Bin)).
221
222fail_1(<<Char/utf8>> = Bin) ->
223  ?t:fail({illegal_encoding_accepted, Bin, Char});
224fail_1(_) -> ok.
225
226%%-------------------------------------------------------------------
227
228utf16_illegal_sequences() ->
229  utf16_fail_range(16#10FFFF+1, 16#10FFFF+512), % Too large.
230  utf16_fail_range(16#D800, 16#DFFF),		% Reserved for UTF-16.
231  lonely_hi_surrogate(16#D800, 16#DFFF),
232  leading_lo_surrogate(16#DC00, 16#DFFF),
233  ok.
234
235utf16_fail_range(Char, End) when Char =< End ->
236  {'EXIT', _} = (catch <<Char/big-utf16>>),
237  {'EXIT', _} = (catch <<Char/little-utf16>>),
238  utf16_fail_range(Char+1, End);
239utf16_fail_range(_, _) -> ok.
240
241lonely_hi_surrogate(Char, End) when Char =< End ->
242  BinBig = <<Char:16/big>>,
243  BinLittle = <<Char:16/little>>,
244  case {BinBig,BinLittle} of
245    {<<Bad/big-utf16>>,_} ->
246      ?t:fail({lonely_hi_surrogate_accepted,Bad});
247    {_,<<Bad/little-utf16>>} ->
248      ?t:fail({lonely_hi_surrogate_accepted,Bad});
249    {_,_} ->
250      ok
251  end,
252  lonely_hi_surrogate(Char+1, End);
253lonely_hi_surrogate(_, _) -> ok.
254
255leading_lo_surrogate(Char, End) when Char =< End ->
256  leading_lo_surrogate(Char, 16#D800, 16#DFFF),
257  leading_lo_surrogate(Char+1, End);
258leading_lo_surrogate(_, _) -> ok.
259
260leading_lo_surrogate(HiSurr, LoSurr, End) when LoSurr =< End ->
261  BinBig = <<HiSurr:16/big,LoSurr:16/big>>,
262  BinLittle = <<HiSurr:16/little,LoSurr:16/little>>,
263  case {BinBig,BinLittle} of
264    {<<Bad/big-utf16,_/bits>>,_} ->
265      ?t:fail({leading_lo_surrogate_accepted,Bad});
266    {_,<<Bad/little-utf16,_/bits>>} ->
267      ?t:fail({leading_lo_surrogate_accepted,Bad});
268    {_,_} ->
269      ok
270  end,
271  leading_lo_surrogate(HiSurr, LoSurr+1, End);
272leading_lo_surrogate(_, _, _) -> ok.
273
274%%-------------------------------------------------------------------
275
276utf32_illegal_sequences() ->
277  utf32_fail_range(16#10FFFF+1, 16#10FFFF+512), % Too large.
278  utf32_fail_range(16#D800, 16#DFFF),		% Reserved for UTF-16.
279  utf32_fail_range(-100, -1),
280  ok.
281
282utf32_fail_range(Char, End) when Char =< End ->
283  {'EXIT', _} = (catch <<Char/big-utf32>>),
284  {'EXIT', _} = (catch <<Char/little-utf32>>),
285  case {<<Char:32>>,<<Char:32/little>>} of
286    {<<Unexpected/utf32>>,_} ->
287      ?t:fail(Unexpected);
288    {_,<<Unexpected/little-utf32>>} ->
289      ?t:fail(Unexpected);
290    {_,_} -> ok
291  end,
292  utf32_fail_range(Char+1, End);
293utf32_fail_range(_, _) -> ok.
294
295%%-------------------------------------------------------------------
296%% This function intentionally allows construction of UTF-8 sequence
297%% in illegal ranges.
298
299int_to_utf8(I) when I =< 16#7F ->
300  <<I>>;
301int_to_utf8(I) when I =< 16#7FF ->
302  B2 = I,
303  B1 = (I bsr 6),
304  <<1:1,1:1,0:1,B1:5,1:1,0:1,B2:6>>;
305int_to_utf8(I) when I =< 16#FFFF ->
306  B3 = I,
307  B2 = (I bsr 6),
308  B1 = (I bsr 12),
309  <<1:1,1:1,1:1,0:1,B1:4,1:1,0:1,B2:6,1:1,0:1,B3:6>>;
310int_to_utf8(I) when I =< 16#3FFFFF ->
311  B4 = I,
312  B3 = (I bsr 6),
313  B2 = (I bsr 12),
314  B1 = (I bsr 18),
315  <<1:1,1:1,1:1,1:1,0:1,B1:3,1:1,0:1,B2:6,1:1,0:1,B3:6,1:1,0:1,B4:6>>;
316int_to_utf8(I) when I =< 16#3FFFFFF ->
317  B5 = I,
318  B4 = (I bsr 6),
319  B3 = (I bsr 12),
320  B2 = (I bsr 18),
321  B1 = (I bsr 24),
322  <<1:1,1:1,1:1,1:1,1:1,0:1,B1:2,1:1,0:1,B2:6,1:1,0:1,B3:6,1:1,0:1,B4:6,
323    1:1,0:1,B5:6>>.
324
325%% int_to_utf8(I, NumberOfBytes) -> Binary.
326%%  This function can be used to construct overlong sequences.
327int_to_utf8(I, 1) ->
328  <<I>>;
329int_to_utf8(I, 2) ->
330  B2 = I,
331  B1 = (I bsr 6),
332  <<1:1,1:1,0:1,B1:5,1:1,0:1,B2:6>>;
333int_to_utf8(I, 3) ->
334  B3 = I,
335  B2 = (I bsr 6),
336  B1 = (I bsr 12),
337  <<1:1,1:1,1:1,0:1,B1:4,1:1,0:1,B2:6,1:1,0:1,B3:6>>;
338int_to_utf8(I, 4) ->
339  B4 = I,
340  B3 = (I bsr 6),
341  B2 = (I bsr 12),
342  B1 = (I bsr 18),
343  <<1:1,1:1,1:1,1:1,0:1,B1:3,1:1,0:1,B2:6,1:1,0:1,B3:6,1:1,0:1,B4:6>>.
344
345%%-------------------------------------------------------------------
346
347make_unaligned(Bin0) when is_binary(Bin0) ->
348  Bin1 = <<0:3,Bin0/binary,31:5>>,
349  Sz = byte_size(Bin0),
350  <<0:3,Bin:Sz/binary,31:5>> = id(Bin1),
351  Bin.
352
353%%-------------------------------------------------------------------
354%% Just to prevent compiler optimizations
355
356id(X) -> X.
357