1%% -*- erlang-indent-level: 2 -*- 2%%------------------------------------------------------------------- 3%% Purpose: test support for UTF datatypes in binaries 4%% 5%% Most of it taken from emulator/test/bs_utf_SUITE.erl 6%%------------------------------------------------------------------- 7 8-module(bs_utf). 9 10-export([test/0]). 11 12-include_lib("common_test/include/ct.hrl"). 13 14test() -> 15 ok = utf8_cm65(), 16 ok = utf8_roundtrip(), 17 ok = utf16_roundtrip(), 18 ok = utf32_roundtrip(), 19 %% The following were problematic for the LLVM backend 20 ok = utf8_illegal_sequences(), 21 ok = utf16_illegal_sequences(), 22 ok = utf32_illegal_sequences(), 23 ok. 24 25%%------------------------------------------------------------------- 26%% A test with construction and matching 27 28utf8_cm65() -> 29 <<65>> = b65utf8(), 30 ok = m(<<65>>). 31 32b65utf8() -> 33 <<65/utf8>>. 34 35m(<<65/utf8>>) -> 36 ok. 37 38%%------------------------------------------------------------------- 39 40utf8_roundtrip() -> 41 ok = utf8_roundtrip(0, 16#D7FF), 42 ok = utf8_roundtrip(16#E000, 16#10FFFF), 43 ok. 44 45utf8_roundtrip(First, Last) when First =< Last -> 46 Bin = int_to_utf8(First), 47 Bin = id(<<First/utf8>>), 48 Bin = id(<<(id(<<>>))/binary,First/utf8>>), 49 Unaligned = id(<<3:2,First/utf8>>), 50 <<_:2,Bin/binary>> = Unaligned, 51 <<First/utf8>> = Bin, 52 <<First/utf8>> = make_unaligned(Bin), 53 utf8_roundtrip(First+1, Last); 54utf8_roundtrip(_, _) -> 55 ok. 56 57%%------------------------------------------------------------------- 58 59utf16_roundtrip() -> 60 Big = fun utf16_big_roundtrip/1, 61 Little = fun utf16_little_roundtrip/1, 62 PidRefs = [spawn_monitor(fun() -> do_utf16_roundtrip(Fun) end) || 63 Fun <- [Big,Little]], 64 [receive {'DOWN', Ref, process, Pid, Reason} -> normal=Reason end || 65 {Pid, Ref} <- PidRefs], 66 ok. 67 68do_utf16_roundtrip(Fun) -> 69 do_utf16_roundtrip(0, 16#D7FF, Fun), 70 do_utf16_roundtrip(16#E000, 16#10FFFF, Fun). 71 72do_utf16_roundtrip(First, Last, Fun) when First =< Last -> 73 Fun(First), 74 do_utf16_roundtrip(First+1, Last, Fun); 75do_utf16_roundtrip(_, _, _) -> ok. 76 77utf16_big_roundtrip(Char) -> 78 Bin = id(<<Char/utf16>>), 79 Bin = id(<<(id(<<>>))/binary,Char/utf16>>), 80 Unaligned = id(<<3:2,Char/utf16>>), 81 <<_:2,Bin/binary>> = Unaligned, 82 <<Char/utf16>> = Bin, 83 <<Char/utf16>> = make_unaligned(Bin), 84 ok. 85 86utf16_little_roundtrip(Char) -> 87 Bin = id(<<Char/little-utf16>>), 88 Bin = id(<<(id(<<>>))/binary,Char/little-utf16>>), 89 Unaligned = id(<<3:2,Char/little-utf16>>), 90 <<_:2,Bin/binary>> = Unaligned, 91 <<Char/little-utf16>> = Bin, 92 <<Char/little-utf16>> = make_unaligned(Bin), 93 ok. 94 95%%------------------------------------------------------------------- 96 97utf32_roundtrip() -> 98 Big = fun utf32_big_roundtrip/1, 99 Little = fun utf32_little_roundtrip/1, 100 PidRefs = [spawn_monitor(fun() -> do_utf32_roundtrip(Fun) end) || 101 Fun <- [Big,Little]], 102 [receive {'DOWN', Ref, process, Pid, Reason} -> normal=Reason end || 103 {Pid, Ref} <- PidRefs], 104 ok. 105 106do_utf32_roundtrip(Fun) -> 107 do_utf32_roundtrip(0, 16#D7FF, Fun), 108 do_utf32_roundtrip(16#E000, 16#10FFFF, Fun). 109 110do_utf32_roundtrip(First, Last, Fun) when First =< Last -> 111 Fun(First), 112 do_utf32_roundtrip(First+1, Last, Fun); 113do_utf32_roundtrip(_, _, _) -> ok. 114 115utf32_big_roundtrip(Char) -> 116 Bin = id(<<Char/utf32>>), 117 Bin = id(<<(id(<<>>))/binary,Char/utf32>>), 118 Unaligned = id(<<3:2,Char/utf32>>), 119 <<_:2,Bin/binary>> = Unaligned, 120 <<Char/utf32>> = Bin, 121 <<Char/utf32>> = make_unaligned(Bin), 122 ok. 123 124utf32_little_roundtrip(Char) -> 125 Bin = id(<<Char/little-utf32>>), 126 Bin = id(<<(id(<<>>))/binary,Char/little-utf32>>), 127 Unaligned = id(<<3:2,Char/little-utf32>>), 128 <<_:2,Bin/binary>> = Unaligned, 129 <<Char/little-utf32>> = Bin, 130 <<Char/little-utf32>> = make_unaligned(Bin), 131 ok. 132 133%%------------------------------------------------------------------- 134 135utf8_illegal_sequences() -> 136 fail_range(16#10FFFF+1, 16#10FFFF+512), % Too large. 137 fail_range(16#D800, 16#DFFF), % Reserved for UTF-16. 138 139 %% Illegal first character. 140 [fail(<<I,16#8F,16#8F,16#8F>>) || I <- lists:seq(16#80, 16#BF)], 141 142 %% Short sequences. 143 short_sequences(16#80, 16#10FFFF), 144 145 %% Overlong sequences. (Using more bytes than necessary 146 %% is not allowed.) 147 overlong(0, 127, 2), 148 overlong(128, 16#7FF, 3), 149 overlong(16#800, 16#FFFF, 4), 150 ok. 151 152fail_range(Char, End) when Char =< End -> 153 {'EXIT', _} = (catch <<Char/utf8>>), 154 Bin = int_to_utf8(Char), 155 fail(Bin), 156 fail_range(Char+1, End); 157fail_range(_, _) -> ok. 158 159short_sequences(Char, End) -> 160 Step = (End - Char) div erlang:system_info(schedulers) + 1, 161 PidRefs = short_sequences_1(Char, Step, End), 162 [receive {'DOWN', Ref, process, Pid, Reason} -> normal=Reason end || 163 {Pid, Ref} <- PidRefs], 164 ok. 165 166short_sequences_1(Char, Step, End) when Char =< End -> 167 CharEnd = lists:min([Char+Step-1,End]), 168 [spawn_monitor(fun() -> 169 %% io:format("~p - ~p\n", [Char, CharEnd]), 170 do_short_sequences(Char, CharEnd) 171 end)|short_sequences_1(Char+Step, Step, End)]; 172short_sequences_1(_, _, _) -> []. 173 174do_short_sequences(Char, End) when Char =< End -> 175 short_sequence(Char), 176 do_short_sequences(Char+1, End); 177do_short_sequences(_, _) -> ok. 178 179short_sequence(I) -> 180 case int_to_utf8(I) of 181 <<S0:3/binary,_:8>> -> 182 <<S1:2/binary,R1:8>> = S0, 183 <<S2:1/binary,_:8>> = S1, 184 fail(S0), 185 fail(S1), 186 fail(S2), 187 fail(<<S2/binary,16#7F,R1,R1>>), 188 fail(<<S1/binary,16#7F,R1>>), 189 fail(<<S0/binary,16#7F>>); 190 <<S0:2/binary,_:8>> -> 191 <<S1:1/binary,R1:8>> = S0, 192 fail(S0), 193 fail(S1), 194 fail(<<S0/binary,16#7F>>), 195 fail(<<S1/binary,16#7F>>), 196 fail(<<S1/binary,16#7F,R1>>); 197 <<S:1/binary,_:8>> -> 198 fail(S), 199 fail(<<S/binary,16#7F>>) 200 end. 201 202overlong(Char, Last, NumBytes) when Char =< Last -> 203 overlong(Char, NumBytes), 204 overlong(Char+1, Last, NumBytes); 205overlong(_, _, _) -> ok. 206 207overlong(Char, NumBytes) when NumBytes < 5 -> 208 case int_to_utf8(Char, NumBytes) of 209 <<Char/utf8>>=Bin -> 210 ?t:fail({illegal_encoding_accepted,Bin,Char}); 211 <<OtherChar/utf8>>=Bin -> 212 ?t:fail({illegal_encoding_accepted,Bin,Char,OtherChar}); 213 _ -> ok 214 end, 215 overlong(Char, NumBytes+1); 216overlong(_, _) -> ok. 217 218fail(Bin) -> 219 fail_1(Bin), 220 fail_1(make_unaligned(Bin)). 221 222fail_1(<<Char/utf8>> = Bin) -> 223 ?t:fail({illegal_encoding_accepted, Bin, Char}); 224fail_1(_) -> ok. 225 226%%------------------------------------------------------------------- 227 228utf16_illegal_sequences() -> 229 utf16_fail_range(16#10FFFF+1, 16#10FFFF+512), % Too large. 230 utf16_fail_range(16#D800, 16#DFFF), % Reserved for UTF-16. 231 lonely_hi_surrogate(16#D800, 16#DFFF), 232 leading_lo_surrogate(16#DC00, 16#DFFF), 233 ok. 234 235utf16_fail_range(Char, End) when Char =< End -> 236 {'EXIT', _} = (catch <<Char/big-utf16>>), 237 {'EXIT', _} = (catch <<Char/little-utf16>>), 238 utf16_fail_range(Char+1, End); 239utf16_fail_range(_, _) -> ok. 240 241lonely_hi_surrogate(Char, End) when Char =< End -> 242 BinBig = <<Char:16/big>>, 243 BinLittle = <<Char:16/little>>, 244 case {BinBig,BinLittle} of 245 {<<Bad/big-utf16>>,_} -> 246 ?t:fail({lonely_hi_surrogate_accepted,Bad}); 247 {_,<<Bad/little-utf16>>} -> 248 ?t:fail({lonely_hi_surrogate_accepted,Bad}); 249 {_,_} -> 250 ok 251 end, 252 lonely_hi_surrogate(Char+1, End); 253lonely_hi_surrogate(_, _) -> ok. 254 255leading_lo_surrogate(Char, End) when Char =< End -> 256 leading_lo_surrogate(Char, 16#D800, 16#DFFF), 257 leading_lo_surrogate(Char+1, End); 258leading_lo_surrogate(_, _) -> ok. 259 260leading_lo_surrogate(HiSurr, LoSurr, End) when LoSurr =< End -> 261 BinBig = <<HiSurr:16/big,LoSurr:16/big>>, 262 BinLittle = <<HiSurr:16/little,LoSurr:16/little>>, 263 case {BinBig,BinLittle} of 264 {<<Bad/big-utf16,_/bits>>,_} -> 265 ?t:fail({leading_lo_surrogate_accepted,Bad}); 266 {_,<<Bad/little-utf16,_/bits>>} -> 267 ?t:fail({leading_lo_surrogate_accepted,Bad}); 268 {_,_} -> 269 ok 270 end, 271 leading_lo_surrogate(HiSurr, LoSurr+1, End); 272leading_lo_surrogate(_, _, _) -> ok. 273 274%%------------------------------------------------------------------- 275 276utf32_illegal_sequences() -> 277 utf32_fail_range(16#10FFFF+1, 16#10FFFF+512), % Too large. 278 utf32_fail_range(16#D800, 16#DFFF), % Reserved for UTF-16. 279 utf32_fail_range(-100, -1), 280 ok. 281 282utf32_fail_range(Char, End) when Char =< End -> 283 {'EXIT', _} = (catch <<Char/big-utf32>>), 284 {'EXIT', _} = (catch <<Char/little-utf32>>), 285 case {<<Char:32>>,<<Char:32/little>>} of 286 {<<Unexpected/utf32>>,_} -> 287 ?t:fail(Unexpected); 288 {_,<<Unexpected/little-utf32>>} -> 289 ?t:fail(Unexpected); 290 {_,_} -> ok 291 end, 292 utf32_fail_range(Char+1, End); 293utf32_fail_range(_, _) -> ok. 294 295%%------------------------------------------------------------------- 296%% This function intentionally allows construction of UTF-8 sequence 297%% in illegal ranges. 298 299int_to_utf8(I) when I =< 16#7F -> 300 <<I>>; 301int_to_utf8(I) when I =< 16#7FF -> 302 B2 = I, 303 B1 = (I bsr 6), 304 <<1:1,1:1,0:1,B1:5,1:1,0:1,B2:6>>; 305int_to_utf8(I) when I =< 16#FFFF -> 306 B3 = I, 307 B2 = (I bsr 6), 308 B1 = (I bsr 12), 309 <<1:1,1:1,1:1,0:1,B1:4,1:1,0:1,B2:6,1:1,0:1,B3:6>>; 310int_to_utf8(I) when I =< 16#3FFFFF -> 311 B4 = I, 312 B3 = (I bsr 6), 313 B2 = (I bsr 12), 314 B1 = (I bsr 18), 315 <<1:1,1:1,1:1,1:1,0:1,B1:3,1:1,0:1,B2:6,1:1,0:1,B3:6,1:1,0:1,B4:6>>; 316int_to_utf8(I) when I =< 16#3FFFFFF -> 317 B5 = I, 318 B4 = (I bsr 6), 319 B3 = (I bsr 12), 320 B2 = (I bsr 18), 321 B1 = (I bsr 24), 322 <<1:1,1:1,1:1,1:1,1:1,0:1,B1:2,1:1,0:1,B2:6,1:1,0:1,B3:6,1:1,0:1,B4:6, 323 1:1,0:1,B5:6>>. 324 325%% int_to_utf8(I, NumberOfBytes) -> Binary. 326%% This function can be used to construct overlong sequences. 327int_to_utf8(I, 1) -> 328 <<I>>; 329int_to_utf8(I, 2) -> 330 B2 = I, 331 B1 = (I bsr 6), 332 <<1:1,1:1,0:1,B1:5,1:1,0:1,B2:6>>; 333int_to_utf8(I, 3) -> 334 B3 = I, 335 B2 = (I bsr 6), 336 B1 = (I bsr 12), 337 <<1:1,1:1,1:1,0:1,B1:4,1:1,0:1,B2:6,1:1,0:1,B3:6>>; 338int_to_utf8(I, 4) -> 339 B4 = I, 340 B3 = (I bsr 6), 341 B2 = (I bsr 12), 342 B1 = (I bsr 18), 343 <<1:1,1:1,1:1,1:1,0:1,B1:3,1:1,0:1,B2:6,1:1,0:1,B3:6,1:1,0:1,B4:6>>. 344 345%%------------------------------------------------------------------- 346 347make_unaligned(Bin0) when is_binary(Bin0) -> 348 Bin1 = <<0:3,Bin0/binary,31:5>>, 349 Sz = byte_size(Bin0), 350 <<0:3,Bin:Sz/binary,31:5>> = id(Bin1), 351 Bin. 352 353%%------------------------------------------------------------------- 354%% Just to prevent compiler optimizations 355 356id(X) -> X. 357