1 /*
2 * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3 *
4 * SPDX-License-Identifier: MPL-2.0
5 *
6 * This Source Code Form is subject to the terms of the Mozilla Public
7 * License, v. 2.0. If a copy of the MPL was not distributed with this
8 * file, you can obtain one at https://mozilla.org/MPL/2.0/.
9 *
10 * See the COPYRIGHT file distributed with this work for additional
11 * information regarding copyright ownership.
12 */
13
14 #include <string.h>
15
16 #include <isc/utf8.h>
17 #include <isc/util.h>
18
/*
 * UTF-8 is defined in "The Unicode Standard -- Version 4.0"
 * Also see RFC 3629.
 *
 * Char. number range  | UTF-8 octet sequence
 *    (hexadecimal)    | (binary)
 * --------------------+---------------------------------------------
 * 0000 0000-0000 007F | 0xxxxxxx
 * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * isc_utf8_valid() checks whether the 'len' octets starting at 'buf'
 * are well-formed UTF-8.
 *
 * Requires:
 *	'buf' is not NULL (even when 'len' is 0).
 *
 * Returns:
 *	true	every octet belongs to a complete, correctly formed
 *		sequence (an empty buffer is valid);
 *	false	on a stray continuation octet, an invalid lead octet, a
 *		truncated sequence, an overlong encoding, an encoded
 *		UTF-16 surrogate (U+D800..U+DFFF), or a code point above
 *		U+10FFFF.
 */
bool
isc_utf8_valid(const unsigned char *buf, size_t len) {
	REQUIRE(buf != NULL);

	for (size_t i = 0; i < len; i++) {
		/* Single octet (ASCII). */
		if (buf[i] <= 0x7f) {
			continue;
		}

		/* Two octet sequence: 110xxxxx 10xxxxxx */
		if ((i + 1) < len && (buf[i] & 0xe0) == 0xc0 &&
		    (buf[i + 1] & 0xc0) == 0x80)
		{
			unsigned int w;
			w = (buf[i] & 0x1f) << 6;
			w |= (buf[++i] & 0x3f);
			/* Reject overlong encodings of U+0000..U+007F. */
			if (w < 0x80) {
				return (false);
			}
			continue;
		}

		/* Three octet sequence: 1110xxxx 10xxxxxx 10xxxxxx */
		if ((i + 2) < len && (buf[i] & 0xf0) == 0xe0 &&
		    (buf[i + 1] & 0xc0) == 0x80 && (buf[i + 2] & 0xc0) == 0x80)
		{
			unsigned int w;
			w = (buf[i] & 0x0f) << 12;
			w |= (buf[++i] & 0x3f) << 6;
			w |= (buf[++i] & 0x3f);
			/* Reject overlong encodings of U+0000..U+07FF. */
			if (w < 0x0800) {
				return (false);
			}
			/*
			 * RFC 3629 prohibits encoding the UTF-16 surrogate
			 * code points U+D800..U+DFFF in UTF-8.
			 */
			if (w >= 0xd800 && w <= 0xdfff) {
				return (false);
			}
			continue;
		}

		/* Four octet sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
		if ((i + 3) < len && (buf[i] & 0xf8) == 0xf0 &&
		    (buf[i + 1] & 0xc0) == 0x80 &&
		    (buf[i + 2] & 0xc0) == 0x80 && (buf[i + 3] & 0xc0) == 0x80)
		{
			unsigned int w;
			w = (buf[i] & 0x07) << 18;
			w |= (buf[++i] & 0x3f) << 12;
			w |= (buf[++i] & 0x3f) << 6;
			w |= (buf[++i] & 0x3f);
			/*
			 * Reject overlong encodings and anything beyond the
			 * last Unicode code point, U+10FFFF.
			 */
			if (w < 0x10000 || w > 0x10FFFF) {
				return (false);
			}
			continue;
		}

		/*
		 * Stray continuation octet, invalid lead octet, or a
		 * sequence cut short by the end of the buffer.
		 */
		return (false);
	}
	return (true);
}
79
/*
 * Report whether the buffer begins with the UTF-8 encoded byte order
 * mark (octets EF BB BF).
 *
 * Requires:
 *	'buf' is not NULL.
 *
 * Returns:
 *	true if 'len' is at least 3 and the first three octets of 'buf'
 *	are the BOM; false otherwise.
 */
bool
isc_utf8_bom(const unsigned char *buf, size_t len) {
	REQUIRE(buf != NULL);

	/* Too short to hold the three octet mark. */
	if (len < 3U) {
		return (false);
	}

	return (memcmp(buf, "\xef\xbb\xbf", 3) == 0);
}
89