1 /* utf8.c - UTF-8 character handling functions */
2
3 /* Copyright (C) 2018 Free Software Foundation, Inc.
4
5 This file is part of GNU Bash, the Bourne Again SHell.
6
7 Bash is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
11
12 Bash is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with Bash. If not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #include <config.h>
22
23 #ifdef HAVE_STDLIB_H
24 # include <stdlib.h>
25 #endif
26
27 #include "bashansi.h"
28 #include "shmbutil.h"
29
30 extern int locale_mb_cur_max;
31 extern int locale_utf8locale;
32
33 #if defined (HANDLE_MULTIBYTE)
34
/* Find the first occurrence of C in the string S.  Returns a pointer to
   the matching byte, or a null pointer if there is none.  The search is
   handed straight to strchr, so C is matched as a single byte; this is a
   placeholder implementation (for now). */
char *
utf8_mbschr (s, c)
     const char *s;
     int c;
{
  char *found;

  found = strchr (s, c);
  return (found);
}
42
/* Compare the strings S1 and S2.  Returns the result of strcmp: zero when
   the strings are equal, a negative value when S1 sorts before S2, and a
   positive value when S1 sorts after S2. */
int
utf8_mbscmp (s1, s2)
     const char *s1, *s2;
{
  int order;

  /* A plain byte comparison is enough because the UTF-8 encoding
     preserves lexicographic order. */
  order = strcmp (s1, s2);
  return (order);
}
50
/* Walk STR up to its terminating NUL looking for a byte of the form
   10xxxxxx (a UTF-8 continuation byte).  Returns a pointer to the first
   such byte, or a null pointer if STR contains none. */
char *
utf8_mbsmbchar (str)
     const char *str;
{
  const char *p;

  p = str;
  while (*p != '\0')
    {
      if ((*p & 0xc0) == 0x80)
        return ((char *)p);
      p++;
    }
  return ((char *)0);
}
62
/* Count the characters in the UTF-8 string SRC by counting every byte
   that is not a continuation byte (10xxxxxx).

   SRC     the bytes to examine
   SRCLEN  the number of bytes available at SRC; no byte at or beyond
           this offset is read
   MAXLEN  the highest byte index that is examined

   The scan stops at the first NUL byte, after SRCLEN bytes, or once the
   byte index exceeds MAXLEN, whichever happens first.  Returns the
   number of non-continuation bytes seen; a negative MAXLEN yields 0.

   NOTE(review): the MAXLEN bound is inclusive, so up to MAXLEN + 1 bytes
   are examined.  That is kept exactly as it was; confirm against callers
   whether it is intended. */
int
utf8_mbsnlen(src, srclen, maxlen)
     const char *src;
     size_t srclen;
     int maxlen;
{
  register int sind, count;

  count = 0;
  /* Test SRCLEN before dereferencing so that a buffer which is not
     NUL-terminated within SRCLEN bytes is never read past its end. */
  for (sind = 0;
       (size_t)sind < srclen && src[sind] && sind <= maxlen;
       sind++)
    {
      if ((src[sind] & 0xc0) != 0x80)
        count++;
    }
  return (count);
}
78
/* Adapted from GNU gnulib.  Handles UTF-8 characters up to 4 bytes long.

   Examine at most N bytes starting at S and report the length in bytes
   of the UTF-8 character that begins there.

   Returns:
      0     if S is a null pointer (there are no shift states) or the
            first byte is NUL
      1..4  the number of bytes in a well-formed character
     -1     if N is 0 or the bytes cannot begin a well-formed character
     -2     if the bytes seen so far are acceptable but N runs out before
            the character is complete

   No byte at offset N or beyond is ever read. */
int
utf8_mblen (s, n)
     const char *s;
     size_t n;
{
  unsigned char c, c1, c2, c3;

  if (s == 0)
    return (0);         /* no shift states */
  if (n == 0)
    return (-1);

  c = (unsigned char)*s;
  if (c < 0x80)
    return (c != 0);

  /* 0x80..0xc1 and 0xf5..0xff can never start a well-formed sequence. */
  if (c < 0xc2 || c > 0xf4)
    return (-1);

  /* Every remaining lead byte needs at least one more byte.  Check this
     before touching s[1] so we never read past the N bytes we were given. */
  if (n == 1)
    return (-2);

  c1 = (unsigned char)s[1];
  if (c < 0xe0)
    {
      /*
       *                        c       c1
       *
       *    U+0080..U+07FF      C2..DF  80..BF
       */
      if ((c1 ^ 0x80) < 0x40)           /* 0x80..0xbf */
        return (2);
    }
  else if (c < 0xf0)
    {
      /*
       *                        c       c1      c2
       *
       *    U+0800..U+0FFF      E0      A0..BF  80..BF
       *    U+1000..U+CFFF      E1..EC  80..BF  80..BF
       *    U+D000..U+D7FF      ED      80..9F  80..BF
       *    U+E000..U+FFFF      EE..EF  80..BF  80..BF
       */
      if ((c1 ^ 0x80) < 0x40
          && (c >= 0xe1 || c1 >= 0xa0)
          && (c != 0xed || c1 < 0xa0))
        {
          if (n == 2)
            return (-2);                /* incomplete */

          c2 = (unsigned char)s[2];
          if ((c2 ^ 0x80) < 0x40)
            return (3);
        }
    }
  else
    {
      /*
       *                        c       c1      c2      c3
       *
       *    U+10000..U+3FFFF    F0      90..BF  80..BF  80..BF
       *    U+40000..U+FFFFF    F1..F3  80..BF  80..BF  80..BF
       *    U+100000..U+10FFFF  F4      80..8F  80..BF  80..BF
       */
      if (((c1 ^ 0x80) < 0x40)
          && (c >= 0xf1 || c1 >= 0x90)
          && (c < 0xf4 || (c == 0xf4 && c1 < 0x90)))
        {
          if (n == 2)
            return (-2);                /* incomplete */

          c2 = (unsigned char)s[2];
          if ((c2 ^ 0x80) < 0x40)
            {
              if (n == 3)
                return (-2);            /* incomplete */

              c3 = (unsigned char)s[3];
              if ((c3 ^ 0x80) < 0x40)
                return (4);
            }
        }
    }

  /* invalid multibyte character */
  return (-1);
}
173
/* Return the number of characters in the string S, measuring each one
   with utf8_mblen and offering it up to MB_CUR_MAX bytes.  A malformed
   or incomplete sequence is counted as one single-byte character so the
   scan always advances. */
size_t
utf8_mbstrlen(s)
     const char *s;
{
  const char *p;
  size_t step, nchars;
  int maxbytes;

  maxbytes = MB_CUR_MAX;
  nchars = 0;
  for (p = s; *p; p += step)
    {
      step = (size_t)utf8_mblen (p, maxbytes);
      if (step == 0)
        break;
      if (MB_INVALIDCH (step))
        step = 1;       /* assume single byte */
      nchars++;
    }
  return (nchars);
}
195
196 #endif
197