%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 2008-2018. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
%% %CopyrightEnd%
%%

-module(bs_utf_SUITE).

-export([all/0, suite/0, groups/0,
         init_per_suite/1, end_per_suite/1,
         init_per_group/2, end_per_group/2,
         utf8_roundtrip/1, unused_utf_char/1, utf16_roundtrip/1,
         utf32_roundtrip/1, guard/1, extreme_tripping/1,
         literals/1, coverage/1]).

-include_lib("common_test/include/ct.hrl").

%% Suite-wide Common Test options: installs the ts_install_cth hook.
suite() ->
    [{ct_hooks,[ts_install_cth]}].

%% The test cases of this suite; no groups are used.
all() ->
    [utf8_roundtrip,
     unused_utf_char,
     utf16_roundtrip,
     utf32_roundtrip,
     guard,
     extreme_tripping,
     literals,
     coverage].

groups() ->
    [].

%% Hands this module to test_lib:recompile/1 before any test case runs.
init_per_suite(Config) ->
    test_lib:recompile(?MODULE),
    Config.

end_per_suite(_Config) ->
    ok.

init_per_group(_GroupName, Config) ->
    Config.

end_per_group(_GroupName, Config) ->
    Config.


%% Runs the UTF-8 round-trip check on every sample from utf_data/0.
utf8_roundtrip(Config) when is_list(Config) ->
    lists:foreach(fun utf8_roundtrip_1/1, utf_data()).

%% One sample is {CodePoints, BuiltBinary, ExpectedBytes}; using the
%% same variable twice in the head asserts that the two binaries are
%% equal.  Decoding and re-encoding must reproduce both representations,
%% and utf8_guard/2 must accept only the binary carrying the 42 prefix.
utf8_roundtrip_1({Chars,Utf8,Utf8}) ->
    Chars = utf8_to_list(Utf8),
    Utf8 = list_to_utf8(Chars),
    [ok = utf8_guard(Char, <<42,Char/utf8>>) || Char <- Chars],
    [error = utf8_guard(Char, <<Char/utf8>>) || Char <- Chars],
    ok.

%% Builds a binary with a UTF-8 segment inside a guard and compares it
%% with the given binary.
utf8_guard(Char, Expected) when <<42,Char/utf8>> =:= Expected -> ok;
utf8_guard(_, _) -> error.

%% Decodes a UTF-8 binary into a list of code points.
utf8_to_list(<<Char/utf8,Rest/binary>>) ->
    [Char|utf8_to_list(Rest)];
utf8_to_list(<<>>) -> [].

%% Encodes a list of code points as a UTF-8 binary.
list_to_utf8(Chars) ->
    list_to_utf8(Chars, <<>>).
%% Accumulator loop for list_to_utf8/1: appends every code point as a
%% UTF-8 segment to the binary built so far.
list_to_utf8([H|T], Bin) ->
    list_to_utf8(T, <<Bin/binary,H/utf8>>);
list_to_utf8([], Bin) -> Bin.

%% Matches UTF-8 characters whose value is never used (see utf8_len/2)
%% and checks that the number of characters found equals the length of
%% the corresponding code point list.
%%
%% utf_data/0 returns {Str,Utf8,Bytes} triples.  A two-element pattern
%% in a list comprehension generator never matches such a triple and
%% the comprehension silently skips it, so nothing would be verified.
%% The fun head below crashes with function_clause on an unexpected
%% shape instead.
unused_utf_char(Config) when is_list(Config) ->
    lists:foreach(fun({Str,Utf8,_Bytes}) ->
                          true = utf8_len(Utf8) =:= length(Str)
                  end, utf_data()),
    ok.

%% Number of UTF-8 encoded characters in the binary B.
utf8_len(B) ->
    utf8_len(B, 0).

%% The decoded character is deliberately matched against '_'.
utf8_len(<<_/utf8,T/binary>>, N) ->
    utf8_len(T, N+1);
utf8_len(<<>>, N) -> N.

%% Checks the UTF-16 sample from utf16_data/0 in both byte orders:
%% character count, decoding to a list and encoding back to a binary.
%% Matching the tuple with repeated variables asserts that the built
%% and the hand-written binaries are equal.
utf16_roundtrip(Config) when is_list(Config) ->
    {Str,Big,Big,Little,Little} = utf16_data(),
    4 = utf16_big_len(Big),
    4 = utf16_little_len(Little),
    Str = big_utf16_to_list(Big),
    Str = little_utf16_to_list(Little),

    Big = list_to_big_utf16(Str),
    Little = list_to_little_utf16(Str),

    ok.

%% Number of characters in a big-endian UTF-16 binary.
utf16_big_len(B) ->
    utf16_big_len(B, 0).

utf16_big_len(<<_/utf16,T/binary>>, N) ->
    utf16_big_len(T, N+1);
utf16_big_len(<<>>, N) -> N.

%% Number of characters in a little-endian UTF-16 binary.
utf16_little_len(B) ->
    utf16_little_len(B, 0).

utf16_little_len(<<_/little-utf16,T/binary>>, N) ->
    utf16_little_len(T, N+1);
utf16_little_len(<<>>, N) -> N.

%% Encodes a list of code points as big-endian UTF-16.
list_to_big_utf16(List) ->
    list_to_big_utf16(List, <<>>).

list_to_big_utf16([H|T], Bin) ->
    list_to_big_utf16(T, <<Bin/binary,H/utf16>>);
list_to_big_utf16([], Bin) -> Bin.

%% Encodes a list of code points as little-endian UTF-16.
list_to_little_utf16(List) ->
    list_to_little_utf16(List, <<>>).

list_to_little_utf16([H|T], Bin) ->
    list_to_little_utf16(T, <<Bin/binary,H/little-utf16>>);
list_to_little_utf16([], Bin) -> Bin.

%% Decodes a big-endian UTF-16 binary into a list of code points.
big_utf16_to_list(<<H/utf16,T/binary>>) ->
    [H|big_utf16_to_list(T)];
big_utf16_to_list(<<>>) -> [].

%% Decodes a little-endian UTF-16 binary into a list of code points.
little_utf16_to_list(<<H/little-utf16,T/binary>>) ->
    [H|little_utf16_to_list(T)];
little_utf16_to_list(<<>>) -> [].
%% Checks the UTF-32 sample from utf32_data/0 in both byte orders:
%% character count, decoding to a list and encoding back to a binary.
%% Repeated variables in the tuple pattern assert that the built and
%% the hand-written binaries are equal.
utf32_roundtrip(Config) when is_list(Config) ->
    {Chars,BigBin,BigBin,LittleBin,LittleBin} = utf32_data(),
    4 = utf32_big_len(BigBin),
    4 = utf32_little_len(LittleBin),
    Chars = big_utf32_to_list(BigBin),
    Chars = little_utf32_to_list(LittleBin),

    BigBin = list_to_big_utf32(Chars),
    LittleBin = list_to_little_utf32(Chars),

    ok.

%% Number of characters in a big-endian UTF-32 binary.
utf32_big_len(Bin) ->
    utf32_big_len(Bin, 0).

utf32_big_len(<<_/utf32,Rest/binary>>, Count) ->
    utf32_big_len(Rest, Count+1);
utf32_big_len(<<>>, Count) -> Count.

%% Number of characters in a little-endian UTF-32 binary.
utf32_little_len(Bin) ->
    utf32_little_len(Bin, 0).

utf32_little_len(<<_/little-utf32,Rest/binary>>, Count) ->
    utf32_little_len(Rest, Count+1);
utf32_little_len(<<>>, Count) -> Count.

%% Encodes a list of code points as big-endian UTF-32.
list_to_big_utf32(Chars) ->
    list_to_big_utf32(Chars, <<>>).

list_to_big_utf32([Char|Rest], Acc) ->
    list_to_big_utf32(Rest, <<Acc/binary,Char/utf32>>);
list_to_big_utf32([], Acc) -> Acc.

%% Encodes a list of code points as little-endian UTF-32.
list_to_little_utf32(Chars) ->
    list_to_little_utf32(Chars, <<>>).

list_to_little_utf32([Char|Rest], Acc) ->
    list_to_little_utf32(Rest, <<Acc/binary,Char/little-utf32>>);
list_to_little_utf32([], Acc) -> Acc.

%% Decodes a big-endian UTF-32 binary into a list of code points.
big_utf32_to_list(<<Char/utf32,Rest/binary>>) ->
    [Char|big_utf32_to_list(Rest)];
big_utf32_to_list(<<>>) -> [].

%% Decodes a little-endian UTF-32 binary into a list of code points.
little_utf32_to_list(<<Char/little-utf32,Rest/binary>>) ->
    [Char|little_utf32_to_list(Rest)];
little_utf32_to_list(<<>>) -> [].


%% 16#D800 cannot be encoded, so every guard in do_guard/1 is expected
%% to fail and the catch-all clause to be chosen.
guard(Config) when is_list(Config) ->
    error = do_guard(16#D800),
    ok.

%% Each guard constructs a binary with a UTF segment.  The comparison
%% with 42 is true for any binary that can actually be built, so a
%% clause is skipped only when the construction itself fails.
do_guard(Char) when byte_size(<<Char/utf8>>) =/= 42 -> ok;
do_guard(Char) when byte_size(<<Char/utf16>>) =/= 42 -> ok;
do_guard(Char) when byte_size(<<Char/utf32>>) =/= 42 -> ok;
do_guard(_) -> error.

%% The purpose of this test is to make sure that
%% the delayed creation of sub-binaries works.
%% Code points 0..1024 are converted between UTF-8, UTF-16 and UTF-32
%% along several routes; every route must produce the same result.
extreme_tripping(Config) when is_list(Config) ->
    CodePoints = lists:seq(0, 1024),
    AsUtf8 = unicode_to_utf8(CodePoints, <<>>),
    AsUtf16 = utf8_to_utf16(AsUtf8, <<>>),
    AsUtf32 = utf8_to_utf32(AsUtf8, <<>>),
    AsUtf32 = utf16_to_utf32(AsUtf16, <<>>),
    AsUtf8 = utf32_to_utf8(AsUtf32, <<>>),
    CodePoints = utf32_to_unicode(AsUtf32),
    ok.

%% List of code points -> UTF-8, appended to Acc.
unicode_to_utf8([Char|Rest], Acc) ->
    unicode_to_utf8(Rest, <<Acc/bytes,Char/utf8>>);
unicode_to_utf8([], Acc) -> Acc.

%% UTF-8 -> big-endian UTF-16, appended to Acc.
utf8_to_utf16(<<Char/utf8,Rest/binary>>, Acc) ->
    utf8_to_utf16(Rest, <<Acc/bytes,Char/utf16>>);
utf8_to_utf16(<<>>, Acc) -> Acc.

%% Big-endian UTF-16 -> big-endian UTF-32, appended to Acc.
utf16_to_utf32(<<Char/utf16,Rest/binary>>, Acc) ->
    utf16_to_utf32(Rest, <<Acc/bytes,Char/utf32>>);
utf16_to_utf32(<<>>, Acc) -> Acc.

%% UTF-8 -> big-endian UTF-32, appended to Acc.
utf8_to_utf32(<<Char/utf8,Rest/binary>>, Acc) ->
    utf8_to_utf32(Rest, <<Acc/bytes,Char/utf32>>);
utf8_to_utf32(<<>>, Acc) -> Acc.

%% Big-endian UTF-32 -> UTF-8, appended to Acc.
utf32_to_utf8(<<Char/utf32,Rest/binary>>, Acc) ->
    utf32_to_utf8(Rest, <<Acc/bytes,Char/utf8>>);
utf32_to_utf8(<<>>, Acc) -> Acc.

%% Big-endian UTF-32 -> list of code points.
utf32_to_unicode(<<Char/utf32,Rest/binary>>) ->
    [Char|utf32_to_unicode(Rest)];
utf32_to_unicode(<<>>) -> [].
%% Checks construction and matching of UTF segments written as
%% literals: empty strings, string literals recognised by
%% match_literal/1, code points that cannot be encoded, and binaries
%% that must not match the invalid literals in bad_literal_match/1.
literals(Config) when is_list(Config) ->
    %% An empty string literal yields an empty binary for every
    %% UTF type and byte order.
    <<>> = id(<<""/utf8>>),
    <<>> = id(<<""/utf16>>),
    <<>> = id(<<""/little-utf16>>),
    <<>> = id(<<""/native-utf16>>),
    <<>> = id(<<""/utf32>>),
    <<>> = id(<<""/little-utf32>>),
    <<>> = id(<<""/native-utf32>>),

    %% The same text built in different ways must select the same
    %% clause of match_literal/1.
    abc_utf8 = match_literal(<<"abc"/utf8>>),
    abc_utf8 = match_literal(<<$a,$b,$c>>),
    abc_utf8 = match_literal(<<$a/utf8,$b/utf8,$c/utf8>>),

    abc_utf16be = match_literal(<<"abc"/utf16>>),
    abc_utf16be = match_literal(<<$a:16,$b:16,$c:16>>),
    abc_utf16le = match_literal(<<"abc"/little-utf16>>),
    abc_utf16le = match_literal(<<$a:16/little,$b:16/little,$c:16/little>>),

    abc_utf32be = match_literal(<<"abc"/utf32>>),
    abc_utf32be = match_literal(<<$a:32,$b:32,$c:32>>),
    abc_utf32le = match_literal(<<"abc"/little-utf32>>),
    abc_utf32le = match_literal(<<$a:32/little,$b:32/little,$c:32/little>>),

    mm_utf8 = match_literal(<<"Мастер и Маргарита"/utf8>>),
    mm_utf16be = match_literal(<<"Мастер и Маргарита"/utf16>>),
    mm_utf32be = match_literal(<<"Мастер и Маргарита"/utf32>>),

    %% \366 is code point 246; its UTF-8 form is the two bytes 195,182.
    bjorn_utf8 = match_literal(<<"bj\366rn"/utf8>>),
    bjorn_utf8 = match_literal(<<$b,$j,195,182,$r,$n>>),

    bjorn_utf16be = match_literal(<<"bj\366rn"/utf16>>),
    bjorn_utf16be = match_literal(<<$b:16,$j:16,246:16,$r:16,$n:16>>),
    bjorn_utf16le = match_literal(<<"bj\366rn"/little-utf16>>),
    bjorn_utf16le = match_literal(<<$b:16/little,$j:16/little,
                                    246:16/little,$r:16/little,
                                    $n:16/little>>),
    <<244,143,191,191>> = <<16#10ffff/utf8>>,

    %% Invalid literals.
    %% NOTE(review): the trailing I/utf8 segment presumably keeps each
    %% construction from being a pure compile-time constant - confirm.
    I = 0,
    {'EXIT',{badarg,_}} = (catch <<(-1)/utf8,I/utf8>>),
    {'EXIT',{badarg,_}} = (catch <<(-1)/utf16,I/utf8>>),
    {'EXIT',{badarg,_}} = (catch <<(-1)/little-utf16,I/utf8>>),
    {'EXIT',{badarg,_}} = (catch <<(-1)/utf32,I/utf8>>),
    {'EXIT',{badarg,_}} = (catch <<(-1)/little-utf32,I/utf8>>),
    {'EXIT',{badarg,_}} = (catch <<16#D800/utf8,I/utf8>>),
    {'EXIT',{badarg,_}} = (catch <<16#D800/utf16,I/utf8>>),
    {'EXIT',{badarg,_}} = (catch <<16#D800/little-utf16,I/utf8>>),
    {'EXIT',{badarg,_}} = (catch <<16#D800/utf32,I/utf8>>),
    {'EXIT',{badarg,_}} = (catch <<16#D800/little-utf32,I/utf8>>),

    %% One more than 16#10FFFF, which is successfully encoded above;
    %% every encoding of B is expected to fail with badarg.
    B = 16#10FFFF+1,
    {'EXIT',{badarg,_}} = (catch <<B/utf8>>),
    {'EXIT',{badarg,_}} = (catch <<B/utf16>>),
    {'EXIT',{badarg,_}} = (catch <<B/little-utf16>>),
    {'EXIT',{badarg,_}} = (catch <<B/utf32>>),
    {'EXIT',{badarg,_}} = (catch <<B/little-utf32>>),

    %% Matching of bad literals.
    error = bad_literal_match(<<237,160,128>>), %16#D800 in UTF-8
    error = bad_literal_match(<<244,144,128,128>>), %16#110000 in UTF-8

    error = bad_literal_match(<<16#D800:32>>),
    error = bad_literal_match(<<16#110000:32>>),
    error = bad_literal_match(<<16#D800:32/little>>),
    error = bad_literal_match(<<16#110000:32/little>>),

    ok.

%% Maps a binary to an atom naming the text and encoding it holds.
%% Every pattern is a string literal with a UTF type.  There is no
%% catch-all clause, so any other binary raises function_clause.
match_literal(<<"abc"/utf8>>) -> abc_utf8;
match_literal(<<"abc"/big-utf16>>) -> abc_utf16be;
match_literal(<<"abc"/little-utf16>>) -> abc_utf16le;
match_literal(<<"abc"/big-utf32>>) -> abc_utf32be;
match_literal(<<"abc"/little-utf32>>) -> abc_utf32le;
match_literal(<<"Мастер и Маргарита"/utf8>>) -> mm_utf8;
match_literal(<<"Мастер и Маргарита"/utf16>>) -> mm_utf16be;
match_literal(<<"Мастер и Маргарита"/big-utf32>>) -> mm_utf32be;
match_literal(<<"bj\366rn"/utf8>>) -> bjorn_utf8;
match_literal(<<"bj\366rn"/big-utf16>>) -> bjorn_utf16be;
match_literal(<<"bj\366rn"/little-utf16>>) -> bjorn_utf16le.
%% Returns ok only when the binary matches one of the invalid code
%% point literals used as patterns here, and error otherwise.
bad_literal_match(<<16#D800/utf8>>) -> ok;
bad_literal_match(<<16#110000/utf8>>) -> ok;
bad_literal_match(<<16#D800/utf32>>) -> ok;
bad_literal_match(<<16#110000/utf32>>) -> ok;
bad_literal_match(<<16#D800/little-utf32>>) -> ok;
bad_literal_match(<<16#110000/little-utf32>>) -> ok;
bad_literal_match(_) -> error.

coverage(Config) when is_list(Config) ->
    %% Exercise the bit syntax matching optimizations in v3_kernel.
    0 = coverage_1(<<4096/utf8,65536/utf8,0>>),
    1 = coverage_1(<<4096/utf8,65536/utf8,1>>),

    0 = coverage_2(<<4096/utf8,65536/utf8,0>>),
    1 = coverage_2(<<1024/utf8,1025/utf8,1>>),

    %% No clause of coverage_3/1 accepts an integer, so the call fails
    %% and fc/1 checks the exception that comes out of it.
    fc(catch coverage_3(1)),

    %% Exercise beam_flatten (the heap allocation of a following
    %% test_heap instruction is folded into the bs_init2
    %% instruction).
    {ok,<<533/utf8>>} = cover_test_heap_utf8(533),
    {ok,<<1024/utf16>>} = cover_test_heap_utf16(1024),
    {ok,<<7966/utf32>>} = cover_test_heap_utf32(7966),

    ok.

%% Both clauses share their two leading UTF-8 segments.
coverage_1(<<4096/utf8,65536/utf8,0>>) -> 0;
coverage_1(<<4096/utf8,65536/utf8,1>>) -> 1.

%% The clauses differ already in the first UTF-8 segment.
coverage_2(<<4096/utf8,65536/utf8,0>>) -> 0;
coverage_2(<<1024/utf8,1025/utf8,1>>) -> 1.

coverage_3(<<16#7fffffff/utf8,65536/utf8,0>>) -> 0.

%% Each helper builds a binary with a single UTF segment and wraps it
%% in an ok tuple.
cover_test_heap_utf8(CodePoint) -> {ok,<<CodePoint/utf8>>}.
cover_test_heap_utf16(CodePoint) -> {ok,<<CodePoint/utf16>>}.
cover_test_heap_utf32(CodePoint) -> {ok,<<CodePoint/utf32>>}.

%% UTF-8 samples from RFC-3629.  Every element has the form
%% {CodePoints, BinaryBuiltWithUtf8Segments, ExpectedBytes}.
utf_data() ->
    %% Bound to a variable to give the compiler a chance to do some
    %% constant propagation.
    NotIdentical = 16#2262,

    %% "A<NOT IDENTICAL TO><ALPHA>."
    AlphaSample =
        {[16#0041,NotIdentical,16#0391,16#002E],
         <<16#0041/utf8,NotIdentical/utf8,16#0391/utf8,16#002E/utf8>>,
         <<16#41,16#E2,16#89,16#A2,16#CE,16#91,16#2E>>},

    %% Korean "hangugeo" (meaning "the Korean language")
    KoreanSample =
        {[16#D55C,16#AD6D,16#C5B4],
         <<16#D55C/utf8,16#AD6D/utf8,16#C5B4/utf8>>,
         <<16#ED,16#95,16#9C,16#EA,16#B5,16#AD,16#EC,16#96,16#B4>>},

    %% Japanese "nihongo" (meaning "the Japanese language").
    JapaneseSample =
        {[16#65E5,16#672C,16#8A9E],
         <<16#65E5/utf8,16#672C/utf8,16#8A9E/utf8>>,
         <<16#E6,16#97,16#A5,16#E6,16#9C,16#AC,16#E8,16#AA,16#9E>>},

    [AlphaSample,KoreanSample,JapaneseSample].

%% UTF-16 sample from RFC-2781: "*=Ra", where "*" stands for a
%% hypothetical Ra hieroglyph (code point 16#12345).
utf16_data() ->
    %% Bound to a variable to give the compiler a chance to do some
    %% constant propagation.
    RaHieroglyph = 16#12345,

    %% The text as a list of Unicode characters.
    Chars = [RaHieroglyph,16#3D,16#52,16#61],

    %% Big endian (the two binaries should be equal).
    BigBuilt = <<RaHieroglyph/big-utf16,16#3D/big-utf16,
                 16#52/big-utf16,16#61/big-utf16>>,
    BigBytes = <<16#D8,16#08,16#DF,16#45,16#00,16#3D,16#00,16#52,16#00,16#61>>,

    %% Little endian (the two binaries should be equal).
    LittleBuilt = <<RaHieroglyph/little-utf16,16#3D/little-utf16,
                    16#52/little-utf16,16#61/little-utf16>>,
    LittleBytes = <<16#08,16#D8,16#45,16#DF,16#3D,16#00,16#52,16#00,16#61,16#00>>,

    {Chars,BigBuilt,BigBytes,LittleBuilt,LittleBytes}.

%% UTF-32 sample: "A<NOT IDENTICAL TO><ALPHA>."
utf32_data() ->
    NotIdentical = 16#2262,
    Chars = [16#0041,NotIdentical,16#0391,16#002E],

    %% Big endian.
    BigBuilt = <<16#0041/utf32,NotIdentical/utf32,16#0391/utf32,16#002E/utf32>>,
    BigBytes = <<16#41:32,NotIdentical:32,16#0391:32,16#2E:32>>,

    %% Little endian.
    LittleBuilt = <<16#0041/little-utf32,NotIdentical/little-utf32,
                    16#0391/little-utf32,16#002E/little-utf32>>,
    LittleBytes = <<16#41:32/little,NotIdentical:32/little,
                    16#0391:32/little,16#2E:32/little>>,

    {Chars,BigBuilt,BigBytes,LittleBuilt,LittleBytes}.

%% Accepts the result of a caught call that failed to match: a
%% function_clause error, or a case_clause error when this code is
%% compiled as the module bs_utf_inline_SUITE.
fc({'EXIT',{function_clause,_}}) -> ok;
fc({'EXIT',{{case_clause,_},_}}) when ?MODULE =:= bs_utf_inline_SUITE -> ok.

%% Identity function.
id(Term) -> Term.