1%% 2%% %CopyrightBegin% 3%% 4%% Copyright Ericsson AB 2008-2016. All Rights Reserved. 5%% 6%% Licensed under the Apache License, Version 2.0 (the "License"); 7%% you may not use this file except in compliance with the License. 8%% You may obtain a copy of the License at 9%% 10%% http://www.apache.org/licenses/LICENSE-2.0 11%% 12%% Unless required by applicable law or agreed to in writing, software 13%% distributed under the License is distributed on an "AS IS" BASIS, 14%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15%% See the License for the specific language governing permissions and 16%% limitations under the License. 17%% 18%% %CopyrightEnd% 19%% 20 21-module(bs_utf_SUITE). 22 23-export([all/0, suite/0, 24 utf8_roundtrip/1,utf16_roundtrip/1,utf32_roundtrip/1, 25 utf8_illegal_sequences/1,utf16_illegal_sequences/1, 26 utf32_illegal_sequences/1, 27 bad_construction/1]). 28 29-include_lib("common_test/include/ct.hrl"). 30 31-define(FAIL(Expr), fail_check(catch Expr, ??Expr, [])). 32 33suite() -> 34 [{ct_hooks,[ts_install_cth]}, 35 {timetrap, {minutes, 6}}]. 36 37all() -> 38 [utf8_roundtrip, utf16_roundtrip, utf32_roundtrip, 39 utf8_illegal_sequences, utf16_illegal_sequences, 40 utf32_illegal_sequences, bad_construction]. 41 42utf8_roundtrip(Config) when is_list(Config) -> 43 utf8_roundtrip(0, 16#D7FF), 44 utf8_roundtrip(16#E000, 16#10FFFF), 45 ok. 46 47utf8_roundtrip(First, Last) when First =< Last -> 48 Bin = int_to_utf8(First), 49 Bin = id(<<First/utf8>>), 50 Bin = id(<<(id(<<>>))/binary,First/utf8>>), 51 Unaligned = id(<<3:2,First/utf8>>), 52 <<_:2,Bin/binary>> = Unaligned, 53 <<First/utf8>> = Bin, 54 <<First/utf8>> = make_unaligned(Bin), 55 utf8_roundtrip(First+1, Last); 56utf8_roundtrip(_, _) -> ok. 57 58utf16_roundtrip(Config) when is_list(Config) -> 59 Big = fun utf16_big_roundtrip/1, 60 Little = fun utf16_little_roundtrip/1, 61 PidRefs = [spawn_monitor(fun() -> 62 do_utf16_roundtrip(Fun) 63 end) || Fun <- [Big,Little]], 64 [receive {'DOWN',Ref,process,Pid,Reason} -> normal=Reason end || {Pid,Ref} <- PidRefs], 65 ok. 66 67do_utf16_roundtrip(Fun) -> 68 do_utf16_roundtrip(0, 16#D7FF, Fun), 69 do_utf16_roundtrip(16#E000, 16#10FFFF, Fun). 70 71do_utf16_roundtrip(First, Last, Fun) when First =< Last -> 72 Fun(First), 73 do_utf16_roundtrip(First+1, Last, Fun); 74do_utf16_roundtrip(_, _, _) -> ok. 75 76utf16_big_roundtrip(Char) -> 77 Bin = id(<<Char/utf16>>), 78 Bin = id(<<(id(<<>>))/binary,Char/utf16>>), 79 Unaligned = id(<<3:2,Char/utf16>>), 80 <<_:2,Bin/binary>> = Unaligned, 81 <<Char/utf16>> = Bin, 82 <<Char/utf16>> = make_unaligned(Bin), 83 ok. 84 85utf16_little_roundtrip(Char) -> 86 Bin = id(<<Char/little-utf16>>), 87 Bin = id(<<(id(<<>>))/binary,Char/little-utf16>>), 88 Unaligned = id(<<3:2,Char/little-utf16>>), 89 <<_:2,Bin/binary>> = Unaligned, 90 <<Char/little-utf16>> = Bin, 91 <<Char/little-utf16>> = make_unaligned(Bin), 92 ok. 93 94utf32_roundtrip(Config) when is_list(Config) -> 95 Big = fun utf32_big_roundtrip/1, 96 Little = fun utf32_little_roundtrip/1, 97 PidRefs = [spawn_monitor(fun() -> 98 do_utf32_roundtrip(Fun) 99 end) || Fun <- [Big,Little]], 100 [receive {'DOWN',Ref,process,Pid,Reason} -> normal=Reason end || 101 {Pid,Ref} <- PidRefs], 102 ok. 103 104do_utf32_roundtrip(Fun) -> 105 do_utf32_roundtrip(0, 16#D7FF, Fun), 106 do_utf32_roundtrip(16#E000, 16#10FFFF, Fun). 107 108do_utf32_roundtrip(First, Last, Fun) when First =< Last -> 109 Fun(First), 110 do_utf32_roundtrip(First+1, Last, Fun); 111do_utf32_roundtrip(_, _, _) -> ok. 112 113utf32_big_roundtrip(Char) -> 114 Bin = id(<<Char/utf32>>), 115 Bin = id(<<(id(<<>>))/binary,Char/utf32>>), 116 Unaligned = id(<<3:2,Char/utf32>>), 117 <<_:2,Bin/binary>> = Unaligned, 118 <<Char/utf32>> = Bin, 119 <<Char/utf32>> = make_unaligned(Bin), 120 ok. 121 122utf32_little_roundtrip(Char) -> 123 Bin = id(<<Char/little-utf32>>), 124 Bin = id(<<(id(<<>>))/binary,Char/little-utf32>>), 125 Unaligned = id(<<3:2,Char/little-utf32>>), 126 <<_:2,Bin/binary>> = Unaligned, 127 <<Char/little-utf32>> = Bin, 128 <<Char/little-utf32>> = make_unaligned(Bin), 129 ok. 130 131utf8_illegal_sequences(Config) when is_list(Config) -> 132 fail_range(16#10FFFF+1, 16#10FFFF+512), %Too large. 133 fail_range(16#D800, 16#DFFF), %Reserved for UTF-16. 134 135 %% Illegal first character. 136 [fail(<<I,16#8F,16#8F,16#8F>>) || I <- lists:seq(16#80, 16#BF)], 137 138 %% Short sequences. 139 short_sequences(16#80, 16#10FFFF), 140 141 %% Overlong sequences. (Using more bytes than necessary 142 %% is not allowed.) 143 overlong(0, 127, 2), 144 overlong(128, 16#7FF, 3), 145 overlong(16#800, 16#FFFF, 4), 146 ok. 147 148fail_range(Char, End) when Char =< End -> 149 {'EXIT',_} = (catch <<Char/utf8>>), 150 Bin = int_to_utf8(Char), 151 fail(Bin), 152 fail_range(Char+1, End); 153fail_range(_, _) -> ok. 154 155short_sequences(Char, End) -> 156 Step = (End - Char) div erlang:system_info(schedulers) + 1, 157 PidRefs = short_sequences_1(Char, Step, End), 158 [receive {'DOWN',Ref,process,Pid,Reason} -> normal=Reason end || 159 {Pid,Ref} <- PidRefs], 160 ok. 161 162short_sequences_1(Char, Step, End) when Char =< End -> 163 CharEnd = lists:min([Char+Step-1,End]), 164 [spawn_monitor(fun() -> 165 io:format("~p - ~p\n", [Char,CharEnd]), 166 do_short_sequences(Char, CharEnd) 167 end)|short_sequences_1(Char+Step, Step, End)]; 168short_sequences_1(_, _, _) -> []. 169 170do_short_sequences(Char, End) when Char =< End -> 171 short_sequence(Char), 172 do_short_sequences(Char+1, End); 173do_short_sequences(_, _) -> ok. 174 175short_sequence(I) -> 176 case int_to_utf8(I) of 177 <<S0:3/binary,_:8>> -> 178 <<S1:2/binary,R1:8>> = S0, 179 <<S2:1/binary,_:8>> = S1, 180 fail(S0), 181 fail(S1), 182 fail(S2), 183 fail(<<S2/binary,16#7F,R1,R1>>), 184 fail(<<S1/binary,16#7F,R1>>), 185 fail(<<S0/binary,16#7F>>); 186 <<S0:2/binary,_:8>> -> 187 <<S1:1/binary,R1:8>> = S0, 188 fail(S0), 189 fail(S1), 190 fail(<<S0/binary,16#7F>>), 191 fail(<<S1/binary,16#7F>>), 192 fail(<<S1/binary,16#7F,R1>>); 193 <<S:1/binary,_:8>> -> 194 fail(S), 195 fail(<<S/binary,16#7F>>) 196 end. 197 198overlong(Char, Last, NumBytes) when Char =< Last -> 199 overlong(Char, NumBytes), 200 overlong(Char+1, Last, NumBytes); 201overlong(_, _, _) -> ok. 202 203overlong(Char, NumBytes) when NumBytes < 5 -> 204 case int_to_utf8(Char, NumBytes) of 205 <<Char/utf8>>=Bin -> 206 ct:fail({illegal_encoding_accepted,Bin,Char}); 207 <<OtherChar/utf8>>=Bin -> 208 ct:fail({illegal_encoding_accepted,Bin,Char,OtherChar}); 209 _ -> ok 210 end, 211 overlong(Char, NumBytes+1); 212overlong(_, _) -> ok. 213 214fail(Bin) -> 215 fail_1(Bin), 216 fail_1(make_unaligned(Bin)). 217 218fail_1(<<Char/utf8>>=Bin) -> 219 ct:fail({illegal_encoding_accepted,Bin,Char}); 220fail_1(_) -> ok. 221 222 223utf16_illegal_sequences(Config) when is_list(Config) -> 224 utf16_fail_range(16#10FFFF+1, 16#10FFFF+512), %Too large. 225 utf16_fail_range(16#D800, 16#DFFF), %Reserved for UTF-16. 226 227 lonely_hi_surrogate(16#D800, 16#DFFF), 228 leading_lo_surrogate(16#DC00, 16#DFFF), 229 230 ok. 231 232utf16_fail_range(Char, End) when Char =< End -> 233 {'EXIT',_} = (catch <<Char/big-utf16>>), 234 {'EXIT',_} = (catch <<Char/little-utf16>>), 235 utf16_fail_range(Char+1, End); 236utf16_fail_range(_, _) -> ok. 237 238lonely_hi_surrogate(Char, End) when Char =< End -> 239 BinBig = <<Char:16/big>>, 240 BinLittle = <<Char:16/little>>, 241 case {BinBig,BinLittle} of 242 {<<Bad/big-utf16>>,_} -> 243 ct:fail({lonely_hi_surrogate_accepted,Bad}); 244 {_,<<Bad/little-utf16>>} -> 245 ct:fail({lonely_hi_surrogate_accepted,Bad}); 246 {_,_} -> 247 ok 248 end, 249 lonely_hi_surrogate(Char+1, End); 250lonely_hi_surrogate(_, _) -> ok. 251 252leading_lo_surrogate(Char, End) when Char =< End -> 253 leading_lo_surrogate(Char, 16#D800, 16#DFFF), 254 leading_lo_surrogate(Char+1, End); 255leading_lo_surrogate(_, _) -> ok. 256 257leading_lo_surrogate(HiSurr, LoSurr, End) when LoSurr =< End -> 258 BinBig = <<HiSurr:16/big,LoSurr:16/big>>, 259 BinLittle = <<HiSurr:16/little,LoSurr:16/little>>, 260 case {BinBig,BinLittle} of 261 {<<Bad/big-utf16,_/bits>>,_} -> 262 ct:fail({leading_lo_surrogate_accepted,Bad}); 263 {_,<<Bad/little-utf16,_/bits>>} -> 264 ct:fail({leading_lo_surrogate_accepted,Bad}); 265 {_,_} -> 266 ok 267 end, 268 leading_lo_surrogate(HiSurr, LoSurr+1, End); 269leading_lo_surrogate(_, _, _) -> ok. 270 271utf32_illegal_sequences(Config) when is_list(Config) -> 272 utf32_fail_range(16#10FFFF+1, 16#10FFFF+512), %Too large. 273 utf32_fail_range(16#D800, 16#DFFF), %Reserved for UTF-16. 274 utf32_fail_range(-100, -1), 275 ok. 276 277utf32_fail_range(Char, End) when Char =< End -> 278 {'EXIT',_} = (catch <<Char/big-utf32>>), 279 {'EXIT',_} = (catch <<Char/little-utf32>>), 280 case {<<Char:32>>,<<Char:32/little>>} of 281 {<<Unexpected/utf32>>,_} -> 282 ct:fail(Unexpected); 283 {_,<<Unexpected/little-utf32>>} -> 284 ct:fail(Unexpected); 285 {_,_} -> ok 286 end, 287 utf32_fail_range(Char+1, End); 288utf32_fail_range(_, _) -> ok. 289 290bad_construction(Config) when is_list(Config) -> 291 ?FAIL(<<3.14/utf8>>), 292 ?FAIL(<<3.1415/utf16>>), 293 ?FAIL(<<3.1415/utf32>>), 294 295 ?FAIL(<<(-1)/utf8>>), 296 ?FAIL(<<(-1)/utf16>>), 297 {'EXIT',_} = (catch <<(id(-1))/utf8>>), 298 {'EXIT',_} = (catch <<(id(-1))/utf16>>), 299 {'EXIT',_} = (catch <<(id(-1))/utf32>>), 300 301 ?FAIL(<<16#D800/utf8>>), 302 ?FAIL(<<16#D800/utf16>>), 303 ?FAIL(<<16#D800/utf32>>), 304 305 ok. 306 307%% This function intentionally allows construction of 308%% UTF-8 sequence in illegal ranges. 309int_to_utf8(I) when I =< 16#7F -> 310 <<I>>; 311int_to_utf8(I) when I =< 16#7FF -> 312 B2 = I, 313 B1 = (I bsr 6), 314 <<1:1,1:1,0:1,B1:5,1:1,0:1,B2:6>>; 315int_to_utf8(I) when I =< 16#FFFF -> 316 B3 = I, 317 B2 = (I bsr 6), 318 B1 = (I bsr 12), 319 <<1:1,1:1,1:1,0:1,B1:4,1:1,0:1,B2:6,1:1,0:1,B3:6>>; 320int_to_utf8(I) when I =< 16#3FFFFF -> 321 B4 = I, 322 B3 = (I bsr 6), 323 B2 = (I bsr 12), 324 B1 = (I bsr 18), 325 <<1:1,1:1,1:1,1:1,0:1,B1:3,1:1,0:1,B2:6,1:1,0:1,B3:6,1:1,0:1,B4:6>>; 326int_to_utf8(I) when I =< 16#3FFFFFF -> 327 B5 = I, 328 B4 = (I bsr 6), 329 B3 = (I bsr 12), 330 B2 = (I bsr 18), 331 B1 = (I bsr 24), 332 <<1:1,1:1,1:1,1:1,1:1,0:1,B1:2,1:1,0:1,B2:6,1:1,0:1,B3:6,1:1,0:1,B4:6, 333 1:1,0:1,B5:6>>. 334 335%% int_to_utf8(I, NumberOfBytes) -> Binary. 336%% This function can be used to construct overlong sequences. 337int_to_utf8(I, 1) -> 338 <<I>>; 339int_to_utf8(I, 2) -> 340 B2 = I, 341 B1 = (I bsr 6), 342 <<1:1,1:1,0:1,B1:5,1:1,0:1,B2:6>>; 343int_to_utf8(I, 3) -> 344 B3 = I, 345 B2 = (I bsr 6), 346 B1 = (I bsr 12), 347 <<1:1,1:1,1:1,0:1,B1:4,1:1,0:1,B2:6,1:1,0:1,B3:6>>; 348int_to_utf8(I, 4) -> 349 B4 = I, 350 B3 = (I bsr 6), 351 B2 = (I bsr 12), 352 B1 = (I bsr 18), 353 <<1:1,1:1,1:1,1:1,0:1,B1:3,1:1,0:1,B2:6,1:1,0:1,B3:6,1:1,0:1,B4:6>>. 354 355make_unaligned(Bin0) when is_binary(Bin0) -> 356 Bin1 = <<0:3,Bin0/binary,31:5>>, 357 Sz = byte_size(Bin0), 358 <<0:3,Bin:Sz/binary,31:5>> = id(Bin1), 359 Bin. 360 361fail_check({'EXIT',{badarg,_}}, Str, Vars) -> 362 try evaluate(Str, Vars) of 363 Res -> 364 io:format("Interpreted result: ~p", [Res]), 365 ct:fail(did_not_fail_in_intepreted_code) 366 catch 367 error:badarg -> 368 ok 369 end; 370fail_check(Res, _, _) -> 371 io:format("Compiled result: ~p", [Res]), 372 ct:fail(did_not_fail_in_compiled_code). 373 374evaluate(Str, Vars) -> 375 {ok,Tokens,_} = 376 erl_scan:string(Str ++ " . "), 377 {ok, [Expr]} = erl_parse:parse_exprs(Tokens), 378 case erl_eval:expr(Expr, Vars) of 379 {value, Result, _} -> 380 Result 381 end. 382 383id(I) -> I. 384