1 /* utf8.c - UTF-8 character handling functions */
2 
3 /* Copyright (C) 2018 Free Software Foundation, Inc.
4 
5    This file is part of GNU Bash, the Bourne Again SHell.
6 
7    Bash is free software: you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation, either version 3 of the License, or
10    (at your option) any later version.
11 
12    Bash is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16 
17    You should have received a copy of the GNU General Public License
18    along with Bash.  If not, see <http://www.gnu.org/licenses/>.
19 */
20 
21 #include <config.h>
22 
23 #ifdef HAVE_STDLIB_H
24 #  include <stdlib.h>
25 #endif
26 
27 #include "bashansi.h"
28 #include "shmbutil.h"
29 
30 extern int locale_mb_cur_max;
31 extern int locale_utf8locale;
32 
33 #if defined (HANDLE_MULTIBYTE)
34 
/* Locate the first occurrence of C in the NUL-terminated string S.
   Currently this is a plain byte search delegated to strchr, so the result
   is whatever strchr returns: a pointer to the matching byte, or a null
   pointer if there is no match. */
char *
utf8_mbschr (s, c)
     const char *s;
     int c;
{
  char *found;

  found = strchr (s, c);		/* for now */
  return found;
}
42 
/* Compare the NUL-terminated strings S1 and S2 and return the result of
   strcmp: negative, zero, or positive as S1 sorts before, equal to, or
   after S2. */
int
utf8_mbscmp (s1, s2)
     const char *s1, *s2;
{
  int order;

  /* A byte-wise comparison is enough here, relying on the fact that the
     UTF-8 encoding preserves lexicographic order.  */
  order = strcmp (s1, s2);
  return order;
}
50 
/* Scan the NUL-terminated string STR for the first byte whose top two bits
   are `10' (a UTF-8 continuation byte, 0x80..0xBF).  Return a pointer to
   that byte, or a null pointer if the string contains no such byte.  Note
   that the pointer returned addresses the continuation byte itself, not the
   lead byte that precedes it. */
char *
utf8_mbsmbchar (str)
     const char *str;
{
  const char *p;

  p = str;
  while (*p != '\0')
    {
      if ((*p & 0xc0) == 0x80)
	return (char *)p;
      p++;
    }
  return ((char *)0);
}
62 
/* Count the characters in SRC by counting every byte that is not a UTF-8
   continuation byte (top two bits `10').  Scanning stops at the first NUL
   byte, after SRCLEN bytes have been examined, or once the byte index
   exceeds MAXLEN, whichever comes first.  Returns the number of
   non-continuation bytes seen.

   NOTE(review): the `sind <= maxlen' test means byte indices 0..MAXLEN
   (MAXLEN + 1 bytes) may be examined.  That is kept unchanged -- confirm
   with callers whether MAXLEN is meant as an inclusive index or a count. */
int
utf8_mbsnlen(src, srclen, maxlen)
     const char *src;
     size_t srclen;
     int maxlen;
{
  register int sind, count;

  /* Test both bounds before dereferencing, so we never read beyond the
     SRCLEN bytes the caller says are available.  SIND is non-negative
     whenever the cast is evaluated because the MAXLEN test comes first. */
  for (sind = count = 0;
       sind <= maxlen && (size_t)sind < srclen && src[sind];
       sind++)
    {
      if ((src[sind] & 0xc0) != 0x80)
	count++;
    }
  return (count);
}
78 
/* Adapted from GNU gnulib. Handles UTF-8 characters up to 4 bytes long.

   Determine the length in bytes of the UTF-8 character starting at S,
   looking at no more than N bytes.

   Returns:
      0		if S is a null pointer (no shift states) or S points to NUL
      1..4	the length of the well-formed character at S
     -1		if N is 0 or the bytes do not start a well-formed character
     -2		if the N bytes available are a truncated (incomplete) character

   A byte of S is only read once N is known to be large enough to cover it. */
int
utf8_mblen (s, n)
     const char *s;
     size_t n;
{
  unsigned char c, c1, c2, c3;

  if (s == 0)
    return (0);	/* no shift states */
  if (n == 0)
    return (-1);

  c = (unsigned char)*s;
  if (c < 0x80)
    return (c != 0);

  /* 0x80..0xC1 are stray continuation bytes or overlong two-byte leads;
     0xF5..0xFF can never start a character. */
  if (c < 0xc2 || c > 0xf4)
    return -1;

  /* Every remaining lead byte needs at least one continuation byte.  Check
     this before touching s[1] so a one-byte buffer is never over-read. */
  if (n == 1)
    return -2;

  c1 = (unsigned char)s[1];

  if (c < 0xe0)
    {
      /*
       *				c	c1
       *
       *    U+0080..U+07FF       C2..DF   80..BF
       */
      if ((c1 ^ 0x80) < 0x40)		/* 0x80..0xbf */
	return 2;
      return -1;
    }

  if (c < 0xf0)
    {
      /*
       *				c	c1	c2
       *
       *    U+0800..U+0FFF       E0       A0..BF   80..BF
       *    U+1000..U+CFFF       E1..EC   80..BF   80..BF
       *    U+D000..U+D7FF       ED       80..9F   80..BF
       *    U+E000..U+FFFF       EE..EF   80..BF   80..BF
       */
      if ((c1 ^ 0x80) < 0x40
	    && (c >= 0xe1 || c1 >= 0xa0)
	    && (c != 0xed || c1 < 0xa0))
	{
	  if (n == 2)
	    return -2;		/* incomplete */

	  c2 = (unsigned char)s[2];
	  if ((c2 ^ 0x80) < 0x40)
	    return 3;
	}
      return -1;
    }

  /*
   *				c	c1	c2	c3
   *
   *    U+10000..U+3FFFF     F0       90..BF   80..BF   80..BF
   *    U+40000..U+FFFFF     F1..F3   80..BF   80..BF   80..BF
   *    U+100000..U+10FFFF   F4       80..8F   80..BF   80..BF
   */
  if (((c1 ^ 0x80) < 0x40)
	&& (c >= 0xf1 || c1 >= 0x90)
	&& (c < 0xf4 || (c == 0xf4 && c1 < 0x90)))
    {
      if (n == 2)
	return -2;		/* incomplete */

      c2 = (unsigned char)s[2];
      if ((c2 ^ 0x80) < 0x40)
	{
	  if (n == 3)
	    return -2;		/* incomplete */

	  c3 = (unsigned char)s[3];
	  if ((c3 ^ 0x80) < 0x40)
	    return 4;
	}
    }

  /* invalid multibyte character */
  return -1;
}
173 
/* Return the number of characters in the NUL-terminated string S, using
   utf8_mblen to measure each one.  We can optimize this if we know the
   locale is UTF-8, but it needs to handle malformed byte sequences: any
   byte position utf8_mblen reports as invalid or incomplete is counted as
   one single-byte character so the scan always advances.

   NOTE(review): MB_CUR_MAX is held in an int and passed where utf8_mblen
   expects a size_t; presumably a prototype is in scope to convert it --
   confirm. */
size_t
utf8_mbstrlen(s)
     const char *s;
{
  size_t nchars, step;
  int maxbytes;

  maxbytes = MB_CUR_MAX;
  for (nchars = 0; *s != '\0'; nchars++)
    {
      step = (size_t)utf8_mblen (s, maxbytes);
      if (step == 0)
	break;

      if (MB_INVALIDCH (step))
	step = 1;	/* assume single byte */

      s += step;
    }
  return nchars;
}
195 
196 #endif
197