1%% @copyright 2010 Mochi Media, Inc.
2%% @author Bob Ippolito <bob@mochimedia.com>
3%%
4%% Permission is hereby granted, free of charge, to any person obtaining a
5%% copy of this software and associated documentation files (the "Software"),
6%% to deal in the Software without restriction, including without limitation
7%% the rights to use, copy, modify, merge, publish, distribute, sublicense,
8%% and/or sell copies of the Software, and to permit persons to whom the
9%% Software is furnished to do so, subject to the following conditions:
10%%
11%% The above copyright notice and this permission notice shall be included in
12%% all copies or substantial portions of the Software.
13%%
14%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15%% IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16%% FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17%% THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18%% LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19%% FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20%% DEALINGS IN THE SOFTWARE.
21
22%% @doc Algorithm to convert any binary to a valid UTF-8 sequence by ignoring
23%%      invalid bytes.
24
25-module(mochiutf8).
26-export([valid_utf8_bytes/1, codepoint_to_bytes/1, codepoints_to_bytes/1]).
27-export([bytes_to_codepoints/1, bytes_foldl/3, codepoint_foldl/3]).
28-export([read_codepoint/1, len/1]).
29
30%% External API
31
32-type unichar_low() :: 0..16#d7ff.
33-type unichar_high() :: 16#e000..16#10ffff.
34-type unichar() :: unichar_low() | unichar_high().
35
36-spec codepoint_to_bytes(unichar()) -> binary().
37%% @doc Convert a unicode codepoint to UTF-8 bytes.
38codepoint_to_bytes(C) when (C >= 16#00 andalso C =< 16#7f) ->
39    %% U+0000 - U+007F - 7 bits
40    <<C>>;
41codepoint_to_bytes(C) when (C >= 16#080 andalso C =< 16#07FF) ->
42    %% U+0080 - U+07FF - 11 bits
43    <<0:5, B1:5, B0:6>> = <<C:16>>,
44    <<2#110:3, B1:5,
45      2#10:2, B0:6>>;
46codepoint_to_bytes(C) when (C >= 16#0800 andalso C =< 16#FFFF) andalso
47                           (C < 16#D800 orelse C > 16#DFFF) ->
48    %% U+0800 - U+FFFF - 16 bits (excluding UTC-16 surrogate code points)
49    <<B2:4, B1:6, B0:6>> = <<C:16>>,
50    <<2#1110:4, B2:4,
51      2#10:2, B1:6,
52      2#10:2, B0:6>>;
53codepoint_to_bytes(C) when (C >= 16#010000 andalso C =< 16#10FFFF) ->
54    %% U+10000 - U+10FFFF - 21 bits
55    <<0:3, B3:3, B2:6, B1:6, B0:6>> = <<C:24>>,
56    <<2#11110:5, B3:3,
57      2#10:2, B2:6,
58      2#10:2, B1:6,
59      2#10:2, B0:6>>.
60
61-spec codepoints_to_bytes([unichar()]) -> binary().
62%% @doc Convert a list of codepoints to a UTF-8 binary.
63codepoints_to_bytes(L) ->
64    <<<<(codepoint_to_bytes(C))/binary>> || C <- L>>.
65
66-spec read_codepoint(binary()) -> {unichar(), binary(), binary()}.
67read_codepoint(Bin = <<2#0:1, C:7, Rest/binary>>) ->
68    %% U+0000 - U+007F - 7 bits
69    <<B:1/binary, _/binary>> = Bin,
70    {C, B, Rest};
71read_codepoint(Bin = <<2#110:3, B1:5,
72                       2#10:2, B0:6,
73                       Rest/binary>>) ->
74    %% U+0080 - U+07FF - 11 bits
75    case <<B1:5, B0:6>> of
76        <<C:11>> when C >= 16#80 ->
77            <<B:2/binary, _/binary>> = Bin,
78            {C, B, Rest}
79    end;
80read_codepoint(Bin = <<2#1110:4, B2:4,
81                       2#10:2, B1:6,
82                       2#10:2, B0:6,
83                       Rest/binary>>) ->
84    %% U+0800 - U+FFFF - 16 bits (excluding UTC-16 surrogate code points)
85    case <<B2:4, B1:6, B0:6>> of
86        <<C:16>> when (C >= 16#0800 andalso C =< 16#FFFF) andalso
87                      (C < 16#D800 orelse C > 16#DFFF) ->
88            <<B:3/binary, _/binary>> = Bin,
89            {C, B, Rest}
90    end;
91read_codepoint(Bin = <<2#11110:5, B3:3,
92                       2#10:2, B2:6,
93                       2#10:2, B1:6,
94                       2#10:2, B0:6,
95                       Rest/binary>>) ->
96    %% U+10000 - U+10FFFF - 21 bits
97    case <<B3:3, B2:6, B1:6, B0:6>> of
98        <<C:21>> when (C >= 16#010000 andalso C =< 16#10FFFF) ->
99            <<B:4/binary, _/binary>> = Bin,
100            {C, B, Rest}
101    end.
102
103-spec codepoint_foldl(fun((unichar(), _) -> _), _, binary()) -> _.
104codepoint_foldl(F, Acc, <<>>) when is_function(F, 2) ->
105    Acc;
106codepoint_foldl(F, Acc, Bin) ->
107    {C, _, Rest} = read_codepoint(Bin),
108    codepoint_foldl(F, F(C, Acc), Rest).
109
110-spec bytes_foldl(fun((binary(), _) -> _), _, binary()) -> _.
111bytes_foldl(F, Acc, <<>>) when is_function(F, 2) ->
112    Acc;
113bytes_foldl(F, Acc, Bin) ->
114    {_, B, Rest} = read_codepoint(Bin),
115    bytes_foldl(F, F(B, Acc), Rest).
116
117-spec bytes_to_codepoints(binary()) -> [unichar()].
118bytes_to_codepoints(B) ->
119    lists:reverse(codepoint_foldl(fun (C, Acc) -> [C | Acc] end, [], B)).
120
121-spec len(binary()) -> non_neg_integer().
122len(<<>>) ->
123    0;
124len(B) ->
125    {_, _, Rest} = read_codepoint(B),
126    1 + len(Rest).
127
128-spec valid_utf8_bytes(B::binary()) -> binary().
129%% @doc Return only the bytes in B that represent valid UTF-8. Uses
130%%      the following recursive algorithm: skip one byte if B does not
131%%      follow UTF-8 syntax (a 1-4 byte encoding of some number),
132%%      skip sequence of 2-4 bytes if it represents an overlong encoding
133%%      or bad code point (surrogate U+D800 - U+DFFF or > U+10FFFF).
134valid_utf8_bytes(B) when is_binary(B) ->
135    binary_skip_bytes(B, invalid_utf8_indexes(B)).
136
137%% Internal API
138
139-spec binary_skip_bytes(binary(), [non_neg_integer()]) -> binary().
140%% @doc Return B, but skipping the 0-based indexes in L.
141binary_skip_bytes(B, []) ->
142    B;
143binary_skip_bytes(B, L) ->
144    binary_skip_bytes(B, L, 0, []).
145
146%% @private
147-spec binary_skip_bytes(binary(), [non_neg_integer()], non_neg_integer(), iolist()) -> binary().
148binary_skip_bytes(B, [], _N, Acc) ->
149    iolist_to_binary(lists:reverse([B | Acc]));
150binary_skip_bytes(<<_, RestB/binary>>, [N | RestL], N, Acc) ->
151    binary_skip_bytes(RestB, RestL, 1 + N, Acc);
152binary_skip_bytes(<<C, RestB/binary>>, L, N, Acc) ->
153    binary_skip_bytes(RestB, L, 1 + N, [C | Acc]).
154
155-spec invalid_utf8_indexes(binary()) -> [non_neg_integer()].
156%% @doc Return the 0-based indexes in B that are not valid UTF-8.
157invalid_utf8_indexes(B) ->
158    invalid_utf8_indexes(B, 0, []).
159
160%% @private.
161-spec invalid_utf8_indexes(binary(), non_neg_integer(), [non_neg_integer()]) -> [non_neg_integer()].
162invalid_utf8_indexes(<<C, Rest/binary>>, N, Acc) when C < 16#80 ->
163    %% U+0000 - U+007F - 7 bits
164    invalid_utf8_indexes(Rest, 1 + N, Acc);
165invalid_utf8_indexes(<<C1, C2, Rest/binary>>, N, Acc)
166  when C1 band 16#E0 =:= 16#C0,
167       C2 band 16#C0 =:= 16#80 ->
168    %% U+0080 - U+07FF - 11 bits
169    case ((C1 band 16#1F) bsl 6) bor (C2 band 16#3F) of
170	C when C < 16#80 ->
171            %% Overlong encoding.
172            invalid_utf8_indexes(Rest, 2 + N, [1 + N, N | Acc]);
173        _ ->
174            %% Upper bound U+07FF does not need to be checked
175            invalid_utf8_indexes(Rest, 2 + N, Acc)
176    end;
177invalid_utf8_indexes(<<C1, C2, C3, Rest/binary>>, N, Acc)
178  when C1 band 16#F0 =:= 16#E0,
179       C2 band 16#C0 =:= 16#80,
180       C3 band 16#C0 =:= 16#80 ->
181    %% U+0800 - U+FFFF - 16 bits
182    case ((((C1 band 16#0F) bsl 6) bor (C2 band 16#3F)) bsl 6) bor
183	(C3 band 16#3F) of
184	C when (C < 16#800) orelse (C >= 16#D800 andalso C =< 16#DFFF) ->
185	    %% Overlong encoding or surrogate.
186            invalid_utf8_indexes(Rest, 3 + N, [2 + N, 1 + N, N | Acc]);
187	_ ->
188            %% Upper bound U+FFFF does not need to be checked
189	    invalid_utf8_indexes(Rest, 3 + N, Acc)
190    end;
191invalid_utf8_indexes(<<C1, C2, C3, C4, Rest/binary>>, N, Acc)
192  when C1 band 16#F8 =:= 16#F0,
193       C2 band 16#C0 =:= 16#80,
194       C3 band 16#C0 =:= 16#80,
195       C4 band 16#C0 =:= 16#80 ->
196    %% U+10000 - U+10FFFF - 21 bits
197    case ((((((C1 band 16#0F) bsl 6) bor (C2 band 16#3F)) bsl 6) bor
198           (C3 band 16#3F)) bsl 6) bor (C4 band 16#3F) of
199	C when (C < 16#10000) orelse (C > 16#10FFFF) ->
200	    %% Overlong encoding or invalid code point.
201	    invalid_utf8_indexes(Rest, 4 + N, [3 + N, 2 + N, 1 + N, N | Acc]);
202	_ ->
203	    invalid_utf8_indexes(Rest, 4 + N, Acc)
204    end;
205invalid_utf8_indexes(<<_, Rest/binary>>, N, Acc) ->
206    %% Invalid char
207    invalid_utf8_indexes(Rest, 1 + N, [N | Acc]);
208invalid_utf8_indexes(<<>>, _N, Acc) ->
209    lists:reverse(Acc).
210
211%%
212%% Tests
213%%
214-ifdef(TEST).
215-include_lib("eunit/include/eunit.hrl").
216
217binary_skip_bytes_test() ->
218    ?assertEqual(<<"foo">>,
219                 binary_skip_bytes(<<"foo">>, [])),
220    ?assertEqual(<<"foobar">>,
221                 binary_skip_bytes(<<"foo bar">>, [3])),
222    ?assertEqual(<<"foo">>,
223                 binary_skip_bytes(<<"foo bar">>, [3, 4, 5, 6])),
224    ?assertEqual(<<"oo bar">>,
225                 binary_skip_bytes(<<"foo bar">>, [0])),
226    ok.
227
228invalid_utf8_indexes_test() ->
229    ?assertEqual(
230       [],
231       invalid_utf8_indexes(<<"unicode snowman for you: ", 226, 152, 131>>)),
232    ?assertEqual(
233       [0],
234       invalid_utf8_indexes(<<128>>)),
235    ?assertEqual(
236       [57,59,60,64,66,67],
237       invalid_utf8_indexes(<<"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; (",
238                              167, 65, 170, 186, 73, 83, 80, 166, 87, 186, 217, 41, 41>>)),
239    ok.
240
241codepoint_to_bytes_test() ->
242    %% U+0000 - U+007F - 7 bits
243    %% U+0080 - U+07FF - 11 bits
244    %% U+0800 - U+FFFF - 16 bits (excluding UTC-16 surrogate code points)
245    %% U+10000 - U+10FFFF - 21 bits
246    ?assertEqual(
247       <<"a">>,
248       codepoint_to_bytes($a)),
249    ?assertEqual(
250       <<16#c2, 16#80>>,
251       codepoint_to_bytes(16#80)),
252    ?assertEqual(
253       <<16#df, 16#bf>>,
254       codepoint_to_bytes(16#07ff)),
255    ?assertEqual(
256       <<16#ef, 16#bf, 16#bf>>,
257       codepoint_to_bytes(16#ffff)),
258    ?assertEqual(
259       <<16#f4, 16#8f, 16#bf, 16#bf>>,
260       codepoint_to_bytes(16#10ffff)),
261    ok.
262
263bytes_foldl_test() ->
264    ?assertEqual(
265       <<"abc">>,
266       bytes_foldl(fun (B, Acc) -> <<Acc/binary, B/binary>> end, <<>>, <<"abc">>)),
267    ?assertEqual(
268       <<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>,
269       bytes_foldl(fun (B, Acc) -> <<Acc/binary, B/binary>> end, <<>>,
270                   <<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>)),
271    ok.
272
273bytes_to_codepoints_test() ->
274    ?assertEqual(
275       "abc" ++ [16#2603, 16#4e2d, 16#85, 16#10ffff],
276       bytes_to_codepoints(<<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>)),
277    ok.
278
279codepoint_foldl_test() ->
280    ?assertEqual(
281       "cba",
282       codepoint_foldl(fun (C, Acc) -> [C | Acc] end, [], <<"abc">>)),
283    ?assertEqual(
284       [16#10ffff, 16#85, 16#4e2d, 16#2603 | "cba"],
285       codepoint_foldl(fun (C, Acc) -> [C | Acc] end, [],
286                       <<"abc", 226, 152, 131, 228, 184, 173, 194, 133, 244,143,191,191>>)),
287    ok.
288
289len_test() ->
290    ?assertEqual(
291       29,
292       len(<<"unicode snowman for you: ", 226, 152, 131, 228, 184, 173, 194, 133, 244, 143, 191, 191>>)),
293    ok.
294
295codepoints_to_bytes_test() ->
296    ?assertEqual(
297       iolist_to_binary(lists:map(fun codepoint_to_bytes/1, lists:seq(1, 1000))),
298       codepoints_to_bytes(lists:seq(1, 1000))),
299    ok.
300
301valid_utf8_bytes_test() ->
302    ?assertEqual(
303       <<"invalid U+11ffff: ">>,
304       valid_utf8_bytes(<<"invalid U+11ffff: ", 244, 159, 191, 191>>)),
305    ?assertEqual(
306       <<"U+10ffff: ", 244, 143, 191, 191>>,
307       valid_utf8_bytes(<<"U+10ffff: ", 244, 143, 191, 191>>)),
308    ?assertEqual(
309       <<"overlong 2-byte encoding (a): ">>,
310       valid_utf8_bytes(<<"overlong 2-byte encoding (a): ", 2#11000001, 2#10100001>>)),
311    ?assertEqual(
312       <<"overlong 2-byte encoding (!): ">>,
313       valid_utf8_bytes(<<"overlong 2-byte encoding (!): ", 2#11000000, 2#10100001>>)),
314    ?assertEqual(
315       <<"mu: ", 194, 181>>,
316       valid_utf8_bytes(<<"mu: ", 194, 181>>)),
317    ?assertEqual(
318       <<"bad coding bytes: ">>,
319       valid_utf8_bytes(<<"bad coding bytes: ", 2#10011111, 2#10111111, 2#11111111>>)),
320    ?assertEqual(
321       <<"low surrogate (unpaired): ">>,
322       valid_utf8_bytes(<<"low surrogate (unpaired): ", 237, 176, 128>>)),
323    ?assertEqual(
324       <<"high surrogate (unpaired): ">>,
325       valid_utf8_bytes(<<"high surrogate (unpaired): ", 237, 191, 191>>)),
326    ?assertEqual(
327       <<"unicode snowman for you: ", 226, 152, 131>>,
328       valid_utf8_bytes(<<"unicode snowman for you: ", 226, 152, 131>>)),
329    ?assertEqual(
330       <<"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; (AISPW))">>,
331       valid_utf8_bytes(<<"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; (",
332                          167, 65, 170, 186, 73, 83, 80, 166, 87, 186, 217, 41, 41>>)),
333    ok.
334
335-endif.
336