xref: /freebsd/contrib/sendmail/libsm/utf8_valid.c (revision 2fb4f839)
1 /*
2  * Copyright (c) 2020 Proofpoint, Inc. and its suppliers.
3  *	All rights reserved.
4  *
5  * By using this file, you agree to the terms and conditions set
6  * forth in the LICENSE file which can be found at the top level of
7  * the sendmail distribution.
8  *
9  */
10 
11 #include <sm/gen.h>
12 #include <sm/sendmail.h>
13 #include <sm/ixlen.h>
14 
15 #if USE_EAI
16 
17 /*
18 **  legal utf-8 byte sequence
19 **  http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
20 **
21 **   Code Points        1st       2nd      3rd      4th
22 **  U+0000..U+007F     00..7F
23 **  U+0080..U+07FF     C2..DF   80..BF
24 **  U+0800..U+0FFF     E0       A0..BF   80..BF
25 **  U+1000..U+CFFF     E1..EC   80..BF   80..BF
26 **  U+D000..U+D7FF     ED       80..9F   80..BF
27 **  U+E000..U+FFFF     EE..EF   80..BF   80..BF
28 **  U+10000..U+3FFFF   F0       90..BF   80..BF   80..BF
29 **  U+40000..U+FFFFF   F1..F3   80..BF   80..BF   80..BF
30 **  U+100000..U+10FFFF F4       80..8F   80..BF   80..BF
31 */
32 
33 /*
34 **  based on
35 **  https://github.com/lemire/fastvalidate-utf-8.git
36 **  which is distributed under an MIT license (besides others).
37 */
38 
/*
**  UTF8_VALID -- check whether a buffer contains only well-formed UTF-8
**
**	Parameters:
**		b -- buffer to check (does not need to be NUL-terminated;
**			embedded NUL bytes are treated as ordinary ASCII)
**		length -- number of bytes of b to examine
**
**	Returns:
**		true iff the first length bytes of b form a legal UTF-8
**		byte sequence: no stray or missing trailing bytes, no
**		overlong encodings, no surrogates (U+D800..U+DFFF), and
**		nothing above U+10FFFF.  An empty buffer is valid.
*/

/* trailing (continuation) bytes are exactly 80..BF, i.e., 10xxxxxx */
#define UTF8_IS_CONT(c)	(((c) & 0xC0) == 0x80)

bool
utf8_valid(b, length)
	const char *b;
	size_t length;
{
	const unsigned char *bytes;
	size_t index;

	/* work on unsigned bytes so range comparisons are well defined */
	bytes = (const unsigned char *)b;
	index = 0;
	while (true)
	{
		unsigned char byte1, byte2;

		do { /* fast ASCII Path */
			if (index >= length)
				return true;
			byte1 = bytes[index++];
		} while (byte1 < 0x80);

		/*
		**  Here index <= length, hence length - index cannot wrap
		**  and is the number of bytes left after the lead byte.
		*/

		if (byte1 < 0xE0)
		{
			/* Two-byte form: C2..DF 80..BF */
			if (length - index < 1)
				return false;

			/*
			**  80..BF as first byte is a stray trailing byte,
			**  C0 and C1 can only start overlong encodings.
			*/

			if (byte1 < 0xC2 || !UTF8_IS_CONT(bytes[index++]))
				return false;
		}
		else if (byte1 < 0xF0)
		{
			/* Three-byte form: E0..EF 80..BF 80..BF */
			if (length - index < 2)
				return false;
			byte2 = bytes[index++];
			if (!UTF8_IS_CONT(byte2)
			    /* Overlong? E0 requires A0..BF as second byte. */
			    || (byte1 == 0xE0 && byte2 < 0xA0)
			    /* Check for illegal surrogate codepoints. */
			    || (byte1 == 0xED && byte2 >= 0xA0)
			    /* Third byte trailing-byte test. */
			    || !UTF8_IS_CONT(bytes[index++]))
				return false;
		}
		else
		{
			/* Four-byte form: F0..F4 80..BF 80..BF 80..BF */
			if (length - index < 3)
				return false;
			byte2 = bytes[index++];

			/*
			**  Check that 1 <= plane <= 16 with plain
			**  comparisons; shifting the promoted lead byte
			**  left by 28 would overflow a signed int.
			*/

			if (byte1 > 0xF4
			    || !UTF8_IS_CONT(byte2)
			    /* Overlong? F0 requires 90..BF as second byte. */
			    || (byte1 == 0xF0 && byte2 < 0x90)
			    /* Beyond U+10FFFF? F4 allows only 80..8F. */
			    || (byte1 == 0xF4 && byte2 > 0x8F)
			    /* Third byte trailing-byte test */
			    || !UTF8_IS_CONT(bytes[index++])
			    /* Fourth byte trailing-byte test */
			    || !UTF8_IS_CONT(bytes[index++]))
				return false;
		}
	}
	/* NOTREACHED */
	return false;
}
104 #endif /* USE_EAI */
105