1%% @copyright 2010 Mochi Media, Inc. 2%% @author Bob Ippolito <bob@mochimedia.com> 3%% 4%% Permission is hereby granted, free of charge, to any person obtaining a 5%% copy of this software and associated documentation files (the "Software"), 6%% to deal in the Software without restriction, including without limitation 7%% the rights to use, copy, modify, merge, publish, distribute, sublicense, 8%% and/or sell copies of the Software, and to permit persons to whom the 9%% Software is furnished to do so, subject to the following conditions: 10%% 11%% The above copyright notice and this permission notice shall be included in 12%% all copies or substantial portions of the Software. 13%% 14%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17%% THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19%% FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 20%% DEALINGS IN THE SOFTWARE. 21 22%% @doc Algorithm to convert any binary to a valid UTF-8 sequence by ignoring 23%% invalid bytes. 24 25-module(mochiutf8). 26-export([valid_utf8_bytes/1, codepoint_to_bytes/1, codepoints_to_bytes/1]). 27-export([bytes_to_codepoints/1, bytes_foldl/3, codepoint_foldl/3]). 28-export([read_codepoint/1, len/1]). 29 30%% External API 31 32-type unichar_low() :: 0..16#d7ff. 33-type unichar_high() :: 16#e000..16#10ffff. 34-type unichar() :: unichar_low() | unichar_high(). 35 36-spec codepoint_to_bytes(unichar()) -> binary(). 37%% @doc Convert a unicode codepoint to UTF-8 bytes. 38codepoint_to_bytes(C) when (C >= 16#00 andalso C =< 16#7f) -> 39 %% U+0000 - U+007F - 7 bits 40 <<C>>; 41codepoint_to_bytes(C) when (C >= 16#080 andalso C =< 16#07FF) -> 42 %% U+0080 - U+07FF - 11 bits 43 <<0:5, B1:5, B0:6>> = <<C:16>>, 44 <<2#110:3, B1:5, 45 2#10:2, B0:6>>; 46codepoint_to_bytes(C) when (C >= 16#0800 andalso C =< 16#FFFF) andalso 47 (C < 16#D800 orelse C > 16#DFFF) -> 48 %% U+0800 - U+FFFF - 16 bits (excluding UTC-16 surrogate code points) 49 <<B2:4, B1:6, B0:6>> = <<C:16>>, 50 <<2#1110:4, B2:4, 51 2#10:2, B1:6, 52 2#10:2, B0:6>>; 53codepoint_to_bytes(C) when (C >= 16#010000 andalso C =< 16#10FFFF) -> 54 %% U+10000 - U+10FFFF - 21 bits 55 <<0:3, B3:3, B2:6, B1:6, B0:6>> = <<C:24>>, 56 <<2#11110:5, B3:3, 57 2#10:2, B2:6, 58 2#10:2, B1:6, 59 2#10:2, B0:6>>. 60 61-spec codepoints_to_bytes([unichar()]) -> binary(). 62%% @doc Convert a list of codepoints to a UTF-8 binary. 63codepoints_to_bytes(L) -> 64 <<<<(codepoint_to_bytes(C))/binary>> || C <- L>>. 65 66-spec read_codepoint(binary()) -> {unichar(), binary(), binary()}. 67read_codepoint(Bin = <<2#0:1, C:7, Rest/binary>>) -> 68 %% U+0000 - U+007F - 7 bits 69 <<B:1/binary, _/binary>> = Bin, 70 {C, B, Rest}; 71read_codepoint(Bin = <<2#110:3, B1:5, 72 2#10:2, B0:6, 73 Rest/binary>>) -> 74 %% U+0080 - U+07FF - 11 bits 75 case <<B1:5, B0:6>> of 76 <<C:11>> when C >= 16#80 -> 77 <<B:2/binary, _/binary>> = Bin, 78 {C, B, Rest} 79 end; 80read_codepoint(Bin = <<2#1110:4, B2:4, 81 2#10:2, B1:6, 82 2#10:2, B0:6, 83 Rest/binary>>) -> 84 %% U+0800 - U+FFFF - 16 bits (excluding UTC-16 surrogate code points) 85 case <<B2:4, B1:6, B0:6>> of 86 <<C:16>> when (C >= 16#0800 andalso C =< 16#FFFF) andalso 87 (C < 16#D800 orelse C > 16#DFFF) -> 88 <<B:3/binary, _/binary>> = Bin, 89 {C, B, Rest} 90 end; 91read_codepoint(Bin = <<2#11110:5, B3:3, 92 2#10:2, B2:6, 93 2#10:2, B1:6, 94 2#10:2, B0:6, 95 Rest/binary>>) -> 96 %% U+10000 - U+10FFFF - 21 bits 97 case <<B3:3, B2:6, B1:6, B0:6>> of 98 <<C:21>> when (C >= 16#010000 andalso C =< 16#10FFFF) -> 99 <<B:4/binary, _/binary>> = Bin, 100 {C, B, Rest} 101 end. 102 103-spec codepoint_foldl(fun((unichar(), _) -> _), _, binary()) -> _. 104codepoint_foldl(F, Acc, <<>>) when is_function(F, 2) -> 105 Acc; 106codepoint_foldl(F, Acc, Bin) -> 107 {C, _, Rest} = read_codepoint(Bin), 108 codepoint_foldl(F, F(C, Acc), Rest). 109 110-spec bytes_foldl(fun((binary(), _) -> _), _, binary()) -> _. 111bytes_foldl(F, Acc, <<>>) when is_function(F, 2) -> 112 Acc; 113bytes_foldl(F, Acc, Bin) -> 114 {_, B, Rest} = read_codepoint(Bin), 115 bytes_foldl(F, F(B, Acc), Rest). 116 117-spec bytes_to_codepoints(binary()) -> [unichar()]. 118bytes_to_codepoints(B) -> 119 lists:reverse(codepoint_foldl(fun (C, Acc) -> [C | Acc] end, [], B)). 120 121-spec len(binary()) -> non_neg_integer(). 122len(<<>>) -> 123 0; 124len(B) -> 125 {_, _, Rest} = read_codepoint(B), 126 1 + len(Rest). 127 128-spec valid_utf8_bytes(B::binary()) -> binary(). 129%% @doc Return only the bytes in B that represent valid UTF-8. Uses 130%% the following recursive algorithm: skip one byte if B does not 131%% follow UTF-8 syntax (a 1-4 byte encoding of some number), 132%% skip sequence of 2-4 bytes if it represents an overlong encoding 133%% or bad code point (surrogate U+D800 - U+DFFF or > U+10FFFF). 134valid_utf8_bytes(B) when is_binary(B) -> 135 binary_skip_bytes(B, invalid_utf8_indexes(B)). 136 137%% Internal API 138 139-spec binary_skip_bytes(binary(), [non_neg_integer()]) -> binary(). 140%% @doc Return B, but skipping the 0-based indexes in L. 141binary_skip_bytes(B, []) -> 142 B; 143binary_skip_bytes(B, L) -> 144 binary_skip_bytes(B, L, 0, []). 145 146%% @private 147-spec binary_skip_bytes(binary(), [non_neg_integer()], non_neg_integer(), iolist()) -> binary(). 148binary_skip_bytes(B, [], _N, Acc) -> 149 iolist_to_binary(lists:reverse([B | Acc])); 150binary_skip_bytes(<<_, RestB/binary>>, [N | RestL], N, Acc) -> 151 binary_skip_bytes(RestB, RestL, 1 + N, Acc); 152binary_skip_bytes(<<C, RestB/binary>>, L, N, Acc) -> 153 binary_skip_bytes(RestB, L, 1 + N, [C | Acc]). 154 155-spec invalid_utf8_indexes(binary()) -> [non_neg_integer()]. 156%% @doc Return the 0-based indexes in B that are not valid UTF-8. 157invalid_utf8_indexes(B) -> 158 invalid_utf8_indexes(B, 0, []). 159 160%% @private. 161-spec invalid_utf8_indexes(binary(), non_neg_integer(), [non_neg_integer()]) -> [non_neg_integer()]. 162invalid_utf8_indexes(<<C, Rest/binary>>, N, Acc) when C < 16#80 -> 163 %% U+0000 - U+007F - 7 bits 164 invalid_utf8_indexes(Rest, 1 + N, Acc); 165invalid_utf8_indexes(<<C1, C2, Rest/binary>>, N, Acc) 166 when C1 band 16#E0 =:= 16#C0, 167 C2 band 16#C0 =:= 16#80 -> 168 %% U+0080 - U+07FF - 11 bits 169 case ((C1 band 16#1F) bsl 6) bor (C2 band 16#3F) of 170 C when C < 16#80 -> 171 %% Overlong encoding. 172 invalid_utf8_indexes(Rest, 2 + N, [1 + N, N | Acc]); 173 _ -> 174 %% Upper bound U+07FF does not need to be checked 175 invalid_utf8_indexes(Rest, 2 + N, Acc) 176 end; 177invalid_utf8_indexes(<<C1, C2, C3, Rest/binary>>, N, Acc) 178 when C1 band 16#F0 =:= 16#E0, 179 C2 band 16#C0 =:= 16#80, 180 C3 band 16#C0 =:= 16#80 -> 181 %% U+0800 - U+FFFF - 16 bits 182 case ((((C1 band 16#0F) bsl 6) bor (C2 band 16#3F)) bsl 6) bor 183 (C3 band 16#3F) of 184 C when (C < 16#800) orelse (C >= 16#D800 andalso C =< 16#DFFF) -> 185 %% Overlong encoding or surrogate. 186 invalid_utf8_indexes(Rest, 3 + N, [2 + N, 1 + N, N | Acc]); 187 _ -> 188 %% Upper bound U+FFFF does not need to be checked 189 invalid_utf8_indexes(Rest, 3 + N, Acc) 190 end; 191invalid_utf8_indexes(<<C1, C2, C3, C4, Rest/binary>>, N, Acc) 192 when C1 band 16#F8 =:= 16#F0, 193 C2 band 16#C0 =:= 16#80, 194 C3 band 16#C0 =:= 16#80, 195 C4 band 16#C0 =:= 16#80 -> 196 %% U+10000 - U+10FFFF - 21 bits 197 case ((((((C1 band 16#0F) bsl 6) bor (C2 band 16#3F)) bsl 6) bor 198 (C3 band 16#3F)) bsl 6) bor (C4 band 16#3F) of 199 C when (C < 16#10000) orelse (C > 16#10FFFF) -> 200 %% Overlong encoding or invalid code point. 201 invalid_utf8_indexes(Rest, 4 + N, [3 + N, 2 + N, 1 + N, N | Acc]); 202 _ -> 203 invalid_utf8_indexes(Rest, 4 + N, Acc) 204 end; 205invalid_utf8_indexes(<<_, Rest/binary>>, N, Acc) -> 206 %% Invalid char 207 invalid_utf8_indexes(Rest, 1 + N, [N | Acc]); 208invalid_utf8_indexes(<<>>, _N, Acc) -> 209 lists:reverse(Acc). 210 211%% 212%% Tests 213%% 214-ifdef(TEST). 215-include_lib("eunit/include/eunit.hrl"). 216 217binary_skip_bytes_test() -> 218 ?assertEqual(<<"foo">>, 219 binary_skip_bytes(<<"foo">>, [])), 220 ?assertEqual(<<"foobar">>, 221 binary_skip_bytes(<<"foo bar">>, [3])), 222 ?assertEqual(<<"foo">>, 223 binary_skip_bytes(<<"foo bar">>, [3, 4, 5, 6])), 224 ?assertEqual(<<"oo bar">>, 225 binary_skip_bytes(<<"foo bar">>, [0])), 226 ok. 227 228invalid_utf8_indexes_test() -> 229 ?assertEqual( 230 [], 231 invalid_utf8_indexes(<<"unicode snowman for you: ", 226, 152, 131>>)), 232 ?assertEqual( 233 [0], 234 invalid_utf8_indexes(<<128>>)), 235 ?assertEqual( 236 [57,59,60,64,66,67], 237 invalid_utf8_indexes(<<"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; (", 238 167, 65, 170, 186, 73, 83, 80, 166, 87, 186, 217, 41, 41>>)), 239 ok. 240 241codepoint_to_bytes_test() -> 242 %% U+0000 - U+007F - 7 bits 243 %% U+0080 - U+07FF - 11 bits 244 %% U+0800 - U+FFFF - 16 bits (excluding UTC-16 surrogate code points) 245 %% U+10000 - U+10FFFF - 21 bits 246 ?assertEqual( 247 <<"a">>, 248 codepoint_to_bytes($a)), 249 ?assertEqual( 250 <<16#c2, 16#80>>, 251 codepoint_to_bytes(16#80)), 252 ?assertEqual( 253 <<16#df, 16#bf>>, 254 codepoint_to_bytes(16#07ff)), 255 ?assertEqual( 256 <<16#ef, 16#bf, 16#bf>>, 257 codepoint_to_bytes(16#ffff)), 258 ?assertEqual( 259 <<16#f4, 16#8f, 16#bf, 16#bf>>, 260 codepoint_to_bytes(16#10ffff)), 261 ok. 262 263bytes_foldl_test() -> 264 ?assertEqual( 265 <<"abc">>, 266 bytes_foldl(fun (B, Acc) -> <<Acc/binary, B/binary>> end, <<>>, <<"abc">>)), 267 ?assertEqual( 268 <<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>, 269 bytes_foldl(fun (B, Acc) -> <<Acc/binary, B/binary>> end, <<>>, 270 <<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>)), 271 ok. 272 273bytes_to_codepoints_test() -> 274 ?assertEqual( 275 "abc" ++ [16#2603, 16#4e2d, 16#85, 16#10ffff], 276 bytes_to_codepoints(<<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>)), 277 ok. 278 279codepoint_foldl_test() -> 280 ?assertEqual( 281 "cba", 282 codepoint_foldl(fun (C, Acc) -> [C | Acc] end, [], <<"abc">>)), 283 ?assertEqual( 284 [16#10ffff, 16#85, 16#4e2d, 16#2603 | "cba"], 285 codepoint_foldl(fun (C, Acc) -> [C | Acc] end, [], 286 <<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>)), 287 ok. 288 289len_test() -> 290 ?assertEqual( 291 29, 292 len(<<"unicode snowman for you: ", 226, 152, 131, 228, 184, 173, 194, 133, 244, 143, 191, 191>>)), 293 ok. 294 295codepoints_to_bytes_test() -> 296 ?assertEqual( 297 iolist_to_binary(lists:map(fun codepoint_to_bytes/1, lists:seq(1, 1000))), 298 codepoints_to_bytes(lists:seq(1, 1000))), 299 ok. 300 301valid_utf8_bytes_test() -> 302 ?assertEqual( 303 <<"invalid U+11ffff: ">>, 304 valid_utf8_bytes(<<"invalid U+11ffff: ", 244, 159, 191, 191>>)), 305 ?assertEqual( 306 <<"U+10ffff: ", 244, 143, 191, 191>>, 307 valid_utf8_bytes(<<"U+10ffff: ", 244, 143, 191, 191>>)), 308 ?assertEqual( 309 <<"overlong 2-byte encoding (a): ">>, 310 valid_utf8_bytes(<<"overlong 2-byte encoding (a): ", 2#11000001, 2#10100001>>)), 311 ?assertEqual( 312 <<"overlong 2-byte encoding (!): ">>, 313 valid_utf8_bytes(<<"overlong 2-byte encoding (!): ", 2#11000000, 2#10100001>>)), 314 ?assertEqual( 315 <<"mu: ", 194, 181>>, 316 valid_utf8_bytes(<<"mu: ", 194, 181>>)), 317 ?assertEqual( 318 <<"bad coding bytes: ">>, 319 valid_utf8_bytes(<<"bad coding bytes: ", 2#10011111, 2#10111111, 2#11111111>>)), 320 ?assertEqual( 321 <<"low surrogate (unpaired): ">>, 322 valid_utf8_bytes(<<"low surrogate (unpaired): ", 237, 176, 128>>)), 323 ?assertEqual( 324 <<"high surrogate (unpaired): ">>, 325 valid_utf8_bytes(<<"high surrogate (unpaired): ", 237, 191, 191>>)), 326 ?assertEqual( 327 <<"unicode snowman for you: ", 226, 152, 131>>, 328 valid_utf8_bytes(<<"unicode snowman for you: ", 226, 152, 131>>)), 329 ?assertEqual( 330 <<"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; (AISPW))">>, 331 valid_utf8_bytes(<<"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; (", 332 167, 65, 170, 186, 73, 83, 80, 166, 87, 186, 217, 41, 41>>)), 333 ok. 334 335-endif. 336