1%%
2%% %CopyrightBegin%
3%%
4%% Copyright Ericsson AB 2008-2016. All Rights Reserved.
5%%
6%% Licensed under the Apache License, Version 2.0 (the "License");
7%% you may not use this file except in compliance with the License.
8%% You may obtain a copy of the License at
9%%
10%%     http://www.apache.org/licenses/LICENSE-2.0
11%%
12%% Unless required by applicable law or agreed to in writing, software
13%% distributed under the License is distributed on an "AS IS" BASIS,
14%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15%% See the License for the specific language governing permissions and
16%% limitations under the License.
17%%
18%% %CopyrightEnd%
19%%
20
21-module(bs_utf_SUITE).
22
23-export([all/0, suite/0,
24	 utf8_roundtrip/1,utf16_roundtrip/1,utf32_roundtrip/1,
25	 utf8_illegal_sequences/1,utf16_illegal_sequences/1,
26	 utf32_illegal_sequences/1,
27	 bad_construction/1]).
28
29-include_lib("common_test/include/ct.hrl").
30
31-define(FAIL(Expr), fail_check(catch Expr, ??Expr, [])).
32
33suite() ->
34    [{ct_hooks,[ts_install_cth]},
35     {timetrap, {minutes, 6}}].
36
37all() ->
38    [utf8_roundtrip, utf16_roundtrip, utf32_roundtrip,
39     utf8_illegal_sequences, utf16_illegal_sequences,
40     utf32_illegal_sequences, bad_construction].
41
42utf8_roundtrip(Config) when is_list(Config) ->
43    utf8_roundtrip(0, 16#D7FF),
44    utf8_roundtrip(16#E000, 16#10FFFF),
45    ok.
46
47utf8_roundtrip(First, Last) when First =< Last ->
48    Bin = int_to_utf8(First),
49    Bin = id(<<First/utf8>>),
50    Bin = id(<<(id(<<>>))/binary,First/utf8>>),
51    Unaligned = id(<<3:2,First/utf8>>),
52    <<_:2,Bin/binary>> = Unaligned,
53    <<First/utf8>> = Bin,
54    <<First/utf8>> = make_unaligned(Bin),
55    utf8_roundtrip(First+1, Last);
56utf8_roundtrip(_, _) -> ok.
57
58utf16_roundtrip(Config) when is_list(Config) ->
59    Big = fun utf16_big_roundtrip/1,
60    Little = fun utf16_little_roundtrip/1,
61    PidRefs = [spawn_monitor(fun() ->
62                                     do_utf16_roundtrip(Fun)
63                             end) || Fun <- [Big,Little]],
64    [receive {'DOWN',Ref,process,Pid,Reason} -> normal=Reason end || {Pid,Ref} <- PidRefs],
65    ok.
66
67do_utf16_roundtrip(Fun) ->
68    do_utf16_roundtrip(0, 16#D7FF, Fun),
69    do_utf16_roundtrip(16#E000, 16#10FFFF, Fun).
70
71do_utf16_roundtrip(First, Last, Fun) when First =< Last ->
72    Fun(First),
73    do_utf16_roundtrip(First+1, Last, Fun);
74do_utf16_roundtrip(_, _, _) -> ok.
75
76utf16_big_roundtrip(Char) ->
77    Bin = id(<<Char/utf16>>),
78    Bin = id(<<(id(<<>>))/binary,Char/utf16>>),
79    Unaligned = id(<<3:2,Char/utf16>>),
80    <<_:2,Bin/binary>> = Unaligned,
81    <<Char/utf16>> = Bin,
82    <<Char/utf16>> = make_unaligned(Bin),
83    ok.
84
85utf16_little_roundtrip(Char) ->
86    Bin = id(<<Char/little-utf16>>),
87    Bin = id(<<(id(<<>>))/binary,Char/little-utf16>>),
88    Unaligned = id(<<3:2,Char/little-utf16>>),
89    <<_:2,Bin/binary>> = Unaligned,
90    <<Char/little-utf16>> = Bin,
91    <<Char/little-utf16>> = make_unaligned(Bin),
92    ok.
93
94utf32_roundtrip(Config) when is_list(Config) ->
95    Big = fun utf32_big_roundtrip/1,
96    Little = fun utf32_little_roundtrip/1,
97    PidRefs = [spawn_monitor(fun() ->
98				     do_utf32_roundtrip(Fun)
99			     end) || Fun <- [Big,Little]],
100    [receive {'DOWN',Ref,process,Pid,Reason} -> normal=Reason end ||
101	{Pid,Ref} <- PidRefs],
102    ok.
103
104do_utf32_roundtrip(Fun) ->
105    do_utf32_roundtrip(0, 16#D7FF, Fun),
106    do_utf32_roundtrip(16#E000, 16#10FFFF, Fun).
107
108do_utf32_roundtrip(First, Last, Fun) when First =< Last ->
109    Fun(First),
110    do_utf32_roundtrip(First+1, Last, Fun);
111do_utf32_roundtrip(_, _, _) -> ok.
112
113utf32_big_roundtrip(Char) ->
114    Bin = id(<<Char/utf32>>),
115    Bin = id(<<(id(<<>>))/binary,Char/utf32>>),
116    Unaligned = id(<<3:2,Char/utf32>>),
117    <<_:2,Bin/binary>> = Unaligned,
118    <<Char/utf32>> = Bin,
119    <<Char/utf32>> = make_unaligned(Bin),
120    ok.
121
122utf32_little_roundtrip(Char) ->
123    Bin = id(<<Char/little-utf32>>),
124    Bin = id(<<(id(<<>>))/binary,Char/little-utf32>>),
125    Unaligned = id(<<3:2,Char/little-utf32>>),
126    <<_:2,Bin/binary>> = Unaligned,
127    <<Char/little-utf32>> = Bin,
128    <<Char/little-utf32>> = make_unaligned(Bin),
129    ok.
130
131utf8_illegal_sequences(Config) when is_list(Config) ->
132    fail_range(16#10FFFF+1, 16#10FFFF+512), %Too large.
133    fail_range(16#D800, 16#DFFF),		%Reserved for UTF-16.
134
135    %% Illegal first character.
136    [fail(<<I,16#8F,16#8F,16#8F>>) || I <- lists:seq(16#80, 16#BF)],
137
138    %% Short sequences.
139    short_sequences(16#80, 16#10FFFF),
140
141    %% Overlong sequences. (Using more bytes than necessary
142    %% is not allowed.)
143    overlong(0, 127, 2),
144    overlong(128, 16#7FF, 3),
145    overlong(16#800, 16#FFFF, 4),
146    ok.
147
148fail_range(Char, End) when Char =< End ->
149    {'EXIT',_} = (catch <<Char/utf8>>),
150    Bin = int_to_utf8(Char),
151    fail(Bin),
152    fail_range(Char+1, End);
153fail_range(_, _) -> ok.
154
155short_sequences(Char, End) ->
156    Step = (End - Char) div erlang:system_info(schedulers) + 1,
157    PidRefs = short_sequences_1(Char, Step, End),
158    [receive {'DOWN',Ref,process,Pid,Reason} -> normal=Reason end ||
159	{Pid,Ref} <- PidRefs],
160    ok.
161
162short_sequences_1(Char, Step, End) when Char =< End ->
163    CharEnd = lists:min([Char+Step-1,End]),
164    [spawn_monitor(fun() ->
165                           io:format("~p - ~p\n", [Char,CharEnd]),
166                           do_short_sequences(Char, CharEnd)
167                   end)|short_sequences_1(Char+Step, Step, End)];
168short_sequences_1(_, _, _) -> [].
169
170do_short_sequences(Char, End) when Char =< End ->
171    short_sequence(Char),
172    do_short_sequences(Char+1, End);
173do_short_sequences(_, _) -> ok.
174
175short_sequence(I) ->
176    case int_to_utf8(I) of
177	<<S0:3/binary,_:8>> ->
178	    <<S1:2/binary,R1:8>> = S0,
179	    <<S2:1/binary,_:8>> = S1,
180	    fail(S0),
181	    fail(S1),
182	    fail(S2),
183	    fail(<<S2/binary,16#7F,R1,R1>>),
184	    fail(<<S1/binary,16#7F,R1>>),
185	    fail(<<S0/binary,16#7F>>);
186	<<S0:2/binary,_:8>> ->
187	    <<S1:1/binary,R1:8>> = S0,
188	    fail(S0),
189	    fail(S1),
190	    fail(<<S0/binary,16#7F>>),
191	    fail(<<S1/binary,16#7F>>),
192	    fail(<<S1/binary,16#7F,R1>>);
193	<<S:1/binary,_:8>> ->
194	    fail(S),
195	    fail(<<S/binary,16#7F>>)
196    end.
197
198overlong(Char, Last, NumBytes) when Char =< Last ->
199    overlong(Char, NumBytes),
200    overlong(Char+1, Last, NumBytes);
201overlong(_, _, _) -> ok.
202
203overlong(Char, NumBytes) when NumBytes < 5 ->
204    case int_to_utf8(Char, NumBytes) of
205	<<Char/utf8>>=Bin ->
206	    ct:fail({illegal_encoding_accepted,Bin,Char});
207	<<OtherChar/utf8>>=Bin ->
208	    ct:fail({illegal_encoding_accepted,Bin,Char,OtherChar});
209	_ -> ok
210    end,
211    overlong(Char, NumBytes+1);
212overlong(_, _) -> ok.
213
214fail(Bin) ->
215    fail_1(Bin),
216    fail_1(make_unaligned(Bin)).
217
218fail_1(<<Char/utf8>>=Bin) ->
219    ct:fail({illegal_encoding_accepted,Bin,Char});
220fail_1(_) -> ok.
221
222
223utf16_illegal_sequences(Config) when is_list(Config) ->
224    utf16_fail_range(16#10FFFF+1, 16#10FFFF+512), %Too large.
225    utf16_fail_range(16#D800, 16#DFFF),	          %Reserved for UTF-16.
226
227    lonely_hi_surrogate(16#D800, 16#DFFF),
228    leading_lo_surrogate(16#DC00, 16#DFFF),
229
230    ok.
231
232utf16_fail_range(Char, End) when Char =< End ->
233    {'EXIT',_} = (catch <<Char/big-utf16>>),
234    {'EXIT',_} = (catch <<Char/little-utf16>>),
235    utf16_fail_range(Char+1, End);
236utf16_fail_range(_, _) -> ok.
237
238lonely_hi_surrogate(Char, End) when Char =< End ->
239    BinBig = <<Char:16/big>>,
240    BinLittle = <<Char:16/little>>,
241    case {BinBig,BinLittle} of
242	{<<Bad/big-utf16>>,_} ->
243	    ct:fail({lonely_hi_surrogate_accepted,Bad});
244	{_,<<Bad/little-utf16>>} ->
245	    ct:fail({lonely_hi_surrogate_accepted,Bad});
246	{_,_} ->
247	    ok
248    end,
249    lonely_hi_surrogate(Char+1, End);
250lonely_hi_surrogate(_, _) -> ok.
251
252leading_lo_surrogate(Char, End) when Char =< End ->
253    leading_lo_surrogate(Char, 16#D800, 16#DFFF),
254    leading_lo_surrogate(Char+1, End);
255leading_lo_surrogate(_, _) -> ok.
256
257leading_lo_surrogate(HiSurr, LoSurr, End) when LoSurr =< End ->
258    BinBig = <<HiSurr:16/big,LoSurr:16/big>>,
259    BinLittle = <<HiSurr:16/little,LoSurr:16/little>>,
260    case {BinBig,BinLittle} of
261	{<<Bad/big-utf16,_/bits>>,_} ->
262	    ct:fail({leading_lo_surrogate_accepted,Bad});
263	{_,<<Bad/little-utf16,_/bits>>} ->
264	    ct:fail({leading_lo_surrogate_accepted,Bad});
265	{_,_} ->
266	    ok
267    end,
268    leading_lo_surrogate(HiSurr, LoSurr+1, End);
269leading_lo_surrogate(_, _, _) -> ok.
270
271utf32_illegal_sequences(Config) when is_list(Config) ->
272    utf32_fail_range(16#10FFFF+1, 16#10FFFF+512), %Too large.
273    utf32_fail_range(16#D800, 16#DFFF),		%Reserved for UTF-16.
274    utf32_fail_range(-100, -1),
275    ok.
276
277utf32_fail_range(Char, End) when Char =< End ->
278    {'EXIT',_} = (catch <<Char/big-utf32>>),
279    {'EXIT',_} = (catch <<Char/little-utf32>>),
280    case {<<Char:32>>,<<Char:32/little>>} of
281        {<<Unexpected/utf32>>,_} ->
282            ct:fail(Unexpected);
283        {_,<<Unexpected/little-utf32>>} ->
284            ct:fail(Unexpected);
285        {_,_} -> ok
286    end,
287    utf32_fail_range(Char+1, End);
288utf32_fail_range(_, _) -> ok.
289
290bad_construction(Config) when is_list(Config) ->
291    ?FAIL(<<3.14/utf8>>),
292    ?FAIL(<<3.1415/utf16>>),
293    ?FAIL(<<3.1415/utf32>>),
294
295    ?FAIL(<<(-1)/utf8>>),
296    ?FAIL(<<(-1)/utf16>>),
297    {'EXIT',_} = (catch <<(id(-1))/utf8>>),
298    {'EXIT',_} = (catch <<(id(-1))/utf16>>),
299    {'EXIT',_} = (catch <<(id(-1))/utf32>>),
300
301    ?FAIL(<<16#D800/utf8>>),
302    ?FAIL(<<16#D800/utf16>>),
303    ?FAIL(<<16#D800/utf32>>),
304
305    ok.
306
307%% This function intentionally allows construction of
308%% UTF-8 sequence in illegal ranges.
309int_to_utf8(I) when I =< 16#7F ->
310    <<I>>;
311int_to_utf8(I) when I =< 16#7FF ->
312    B2 = I,
313    B1 = (I bsr 6),
314    <<1:1,1:1,0:1,B1:5,1:1,0:1,B2:6>>;
315int_to_utf8(I) when I =< 16#FFFF ->
316    B3 = I,
317    B2 = (I bsr 6),
318    B1 = (I bsr 12),
319    <<1:1,1:1,1:1,0:1,B1:4,1:1,0:1,B2:6,1:1,0:1,B3:6>>;
320int_to_utf8(I) when I =< 16#3FFFFF ->
321    B4 = I,
322    B3 = (I bsr 6),
323    B2 = (I bsr 12),
324    B1 = (I bsr 18),
325    <<1:1,1:1,1:1,1:1,0:1,B1:3,1:1,0:1,B2:6,1:1,0:1,B3:6,1:1,0:1,B4:6>>;
326int_to_utf8(I) when I =< 16#3FFFFFF ->
327    B5 = I,
328    B4 = (I bsr 6),
329    B3 = (I bsr 12),
330    B2 = (I bsr 18),
331    B1 = (I bsr 24),
332    <<1:1,1:1,1:1,1:1,1:1,0:1,B1:2,1:1,0:1,B2:6,1:1,0:1,B3:6,1:1,0:1,B4:6,
333     1:1,0:1,B5:6>>.
334
335%% int_to_utf8(I, NumberOfBytes) -> Binary.
336%%  This function can be used to construct overlong sequences.
337int_to_utf8(I, 1) ->
338    <<I>>;
339int_to_utf8(I, 2) ->
340    B2 = I,
341    B1 = (I bsr 6),
342    <<1:1,1:1,0:1,B1:5,1:1,0:1,B2:6>>;
343int_to_utf8(I, 3) ->
344    B3 = I,
345    B2 = (I bsr 6),
346    B1 = (I bsr 12),
347    <<1:1,1:1,1:1,0:1,B1:4,1:1,0:1,B2:6,1:1,0:1,B3:6>>;
348int_to_utf8(I, 4) ->
349    B4 = I,
350    B3 = (I bsr 6),
351    B2 = (I bsr 12),
352    B1 = (I bsr 18),
353    <<1:1,1:1,1:1,1:1,0:1,B1:3,1:1,0:1,B2:6,1:1,0:1,B3:6,1:1,0:1,B4:6>>.
354
355make_unaligned(Bin0) when is_binary(Bin0) ->
356    Bin1 = <<0:3,Bin0/binary,31:5>>,
357    Sz = byte_size(Bin0),
358    <<0:3,Bin:Sz/binary,31:5>> = id(Bin1),
359    Bin.
360
361fail_check({'EXIT',{badarg,_}}, Str, Vars) ->
362    try	evaluate(Str, Vars) of
363	Res ->
364	    io:format("Interpreted result: ~p", [Res]),
365	    ct:fail(did_not_fail_in_intepreted_code)
366    catch
367	error:badarg ->
368	    ok
369    end;
370fail_check(Res, _, _) ->
371    io:format("Compiled result: ~p", [Res]),
372    ct:fail(did_not_fail_in_compiled_code).
373
374evaluate(Str, Vars) ->
375    {ok,Tokens,_} =
376	erl_scan:string(Str ++ " . "),
377    {ok, [Expr]} = erl_parse:parse_exprs(Tokens),
378    case erl_eval:expr(Expr, Vars) of
379	{value, Result, _} ->
380	    Result
381    end.
382
383id(I) -> I.
384