1 /*
2  * Copyright (C) Internet Systems Consortium, Inc. ("ISC")
3  *
4  * SPDX-License-Identifier: MPL-2.0
5  *
6  * This Source Code Form is subject to the terms of the Mozilla Public
7  * License, v. 2.0. If a copy of the MPL was not distributed with this
8  * file, you can obtain one at https://mozilla.org/MPL/2.0/.
9  *
10  * See the COPYRIGHT file distributed with this work for additional
11  * information regarding copyright ownership.
12  */
13 
14 #include <string.h>
15 
16 #include <isc/utf8.h>
17 #include <isc/util.h>
18 
/*
 * UTF-8 is defined in "The Unicode Standard -- Version 4.0"
 * Also see RFC 3629.
 *
 * Char. number range  |        UTF-8 octet sequence
 *    (hexadecimal)    |              (binary)
 *  --------------------+---------------------------------------------
 * 0000 0000-0000 007F | 0xxxxxxx
 * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * isc_utf8_valid() returns true when the 'len' octets starting at 'buf'
 * consist solely of well-formed UTF-8 sequences, and false otherwise.
 * An empty buffer (len == 0) is considered valid.
 *
 * A sequence is rejected when:
 *	- its lead octet matches none of the patterns above (including a
 *	  stray continuation octet);
 *	- it is truncated by the end of the buffer or a continuation octet
 *	  does not have the form 10xxxxxx;
 *	- it is an overlong encoding, i.e. the decoded value would fit in
 *	  a shorter sequence;
 *	- it decodes to a surrogate code point (D800-DFFF), which RFC 3629
 *	  prohibits in UTF-8;
 *	- it decodes to a value above 10FFFF.
 *
 * Requires:
 *	'buf' is not NULL (checked with REQUIRE).
 */
bool
isc_utf8_valid(const unsigned char *buf, size_t len) {
	REQUIRE(buf != NULL);

	for (size_t i = 0; i < len; i++) {
		/* Single octet: plain ASCII. */
		if (buf[i] <= 0x7f) {
			continue;
		}

		/* Two octet sequence. */
		if ((i + 1) < len && (buf[i] & 0xe0) == 0xc0 &&
		    (buf[i + 1] & 0xc0) == 0x80)
		{
			unsigned int w;
			w = (buf[i] & 0x1f) << 6;
			w |= (buf[++i] & 0x3f);
			/* Overlong: value fits in a single octet. */
			if (w < 0x80) {
				return (false);
			}
			continue;
		}

		/* Three octet sequence. */
		if ((i + 2) < len && (buf[i] & 0xf0) == 0xe0 &&
		    (buf[i + 1] & 0xc0) == 0x80 && (buf[i + 2] & 0xc0) == 0x80)
		{
			unsigned int w;
			w = (buf[i] & 0x0f) << 12;
			w |= (buf[++i] & 0x3f) << 6;
			w |= (buf[++i] & 0x3f);
			/* Overlong: value fits in two octets. */
			if (w < 0x0800) {
				return (false);
			}
			/*
			 * UTF-16 surrogates are not characters and must
			 * not appear in UTF-8 (RFC 3629, section 3).
			 */
			if (w >= 0xd800 && w <= 0xdfff) {
				return (false);
			}
			continue;
		}

		/* Four octet sequence. */
		if ((i + 3) < len && (buf[i] & 0xf8) == 0xf0 &&
		    (buf[i + 1] & 0xc0) == 0x80 &&
		    (buf[i + 2] & 0xc0) == 0x80 && (buf[i + 3] & 0xc0) == 0x80)
		{
			unsigned int w;
			w = (buf[i] & 0x07) << 18;
			w |= (buf[++i] & 0x3f) << 12;
			w |= (buf[++i] & 0x3f) << 6;
			w |= (buf[++i] & 0x3f);
			/* Overlong, or beyond the last Unicode code point. */
			if (w < 0x10000 || w > 0x10FFFF) {
				return (false);
			}
			continue;
		}

		/*
		 * Unknown lead octet, stray continuation octet, bad
		 * continuation octet, or sequence cut short by 'len'.
		 */
		return (false);
	}
	return (true);
}
79 
/*
 * Report whether the buffer starts with the UTF-8 byte order mark,
 * i.e. the three octets EF BB BF.
 *
 * Returns true only when at least three octets are available and they
 * match the mark exactly; shorter buffers always yield false.
 *
 * Requires:
 *	'buf' is not NULL (checked with REQUIRE).
 */
bool
isc_utf8_bom(const unsigned char *buf, size_t len) {
	REQUIRE(buf != NULL);

	/* Too short to hold the mark at all. */
	if (len < 3U) {
		return (false);
	}

	return (memcmp(buf, "\xef\xbb\xbf", 3) == 0);
}
89