1 /*-------------------------------------------------------------------------
2  *
3  * Multibyte character printing support for frontend code
4  *
5  *
6  * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  * src/fe_utils/mbprint.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "postgres_fe.h"
14 
15 #include "fe_utils/mbprint.h"
16 
17 #include "libpq-fe.h"
18 
19 
/*
 * To avoid version-skew problems, this file must not use declarations
 * from pg_wchar.h: the encoding IDs we are dealing with are determined
 * by the libpq.so we are linked with, and that might not match the
 * numbers we see at compile time.  (If this file were inside libpq,
 * the problem would go away...)
 *
 * Hence, we have our own definition of pg_wchar, and we get the values
 * of any needed encoding IDs on-the-fly.
 *
 * Within this file a pg_wchar holds one decoded Unicode code point, as
 * returned by utf8_to_unicode().
 */

typedef unsigned int pg_wchar;
32 
/*
 * Return the encoding ID that the linked libpq uses for UTF-8.
 *
 * The ID is looked up by name on first use and remembered in a static
 * variable.  A negative lookup result is not treated as cached, so the
 * lookup is simply retried on the next call.
 */
static int
pg_get_utf8_id(void)
{
	static int	cached_id = -1;

	if (cached_id >= 0)
		return cached_id;

	cached_id = pg_char_to_encoding("utf8");
	return cached_id;
}

/* Runtime stand-in for the compile-time PG_UTF8 constant of pg_wchar.h */
#define PG_UTF8		pg_get_utf8_id()
44 
45 
46 /*
47  * Convert a UTF-8 character to a Unicode code point.
48  * This is a one-character version of pg_utf2wchar_with_len.
49  *
50  * No error checks here, c must point to a long-enough string.
51  */
52 static pg_wchar
utf8_to_unicode(const unsigned char * c)53 utf8_to_unicode(const unsigned char *c)
54 {
55 	if ((*c & 0x80) == 0)
56 		return (pg_wchar) c[0];
57 	else if ((*c & 0xe0) == 0xc0)
58 		return (pg_wchar) (((c[0] & 0x1f) << 6) |
59 						   (c[1] & 0x3f));
60 	else if ((*c & 0xf0) == 0xe0)
61 		return (pg_wchar) (((c[0] & 0x0f) << 12) |
62 						   ((c[1] & 0x3f) << 6) |
63 						   (c[2] & 0x3f));
64 	else if ((*c & 0xf8) == 0xf0)
65 		return (pg_wchar) (((c[0] & 0x07) << 18) |
66 						   ((c[1] & 0x3f) << 12) |
67 						   ((c[2] & 0x3f) << 6) |
68 						   (c[3] & 0x3f));
69 	else
70 		/* that is an invalid code on purpose */
71 		return 0xffffffff;
72 }
73 
74 
/*
 * Validate the UTF-8 sequence starting at c.
 *
 * Returns the length of the sequence in bytes (1..4) if it is acceptable,
 * or -1 if it is not.
 *
 * Unicode 3.1 compliant validation: for each sequence length, the lead byte
 * and every continuation byte are checked, overlong encodings are refused,
 * and -1 is also returned for the following UCS values:
 *		ucs > 0x10ffff
 *		(ucs & 0xfffe) == 0xfffe		(U+xxFFFE and U+xxFFFF)
 *		0xfdd0 <= ucs <= 0xfdef			(noncharacters)
 *		(ucs & 0xf800) == 0xd800		(surrogates)
 *
 * Continuation bytes are tested left to right and testing stops at the first
 * byte that is not of the form 10xxxxxx, so a NUL-terminated string is never
 * read beyond its terminator.
 */
static int
utf_charcheck(const unsigned char *c)
{
	if ((*c & 0x80) == 0)
		return 1;
	else if ((*c & 0xe0) == 0xc0)
	{
		/* two-byte char; payload bits <= 0x01 would be an overlong form */
		if (((c[1] & 0xc0) == 0x80) && ((c[0] & 0x1f) > 0x01))
			return 2;
		return -1;
	}
	else if ((*c & 0xf0) == 0xe0)
	{
		/* three-byte char; the middle test refuses overlong forms (< 0x800) */
		if (((c[1] & 0xc0) == 0x80) &&
			(((c[0] & 0x0f) != 0x00) || ((c[1] & 0x20) == 0x20)) &&
			((c[2] & 0xc0) == 0x80))
		{
			/* the code point is 0xZYYX: z is bits 12..15, yx is bits 0..11 */
			int			z = c[0] & 0x0f;
			int			yx = ((c[1] & 0x3f) << 6) | (c[2] & 0x3f);

			/* U+FFFE and U+FFFF */
			if ((z == 0x0f) && ((yx & 0xffe) == 0xffe))
				return -1;

			/* noncharacters U+FDD0 .. U+FDEF */
			if ((z == 0x0f) && (yx >= 0xdd0) && (yx <= 0xdef))
				return -1;

			/* surrogates U+D800 .. U+DFFF */
			if ((z == 0x0d) && ((yx & 0x800) == 0x800))
				return -1;

			return 3;
		}
		return -1;
	}
	else if ((*c & 0xf8) == 0xf0)
	{
		/* u is bits 16..20 of the code point, i.e. its plane number */
		int			u = ((c[0] & 0x07) << 2) | ((c[1] & 0x30) >> 4);

		/* four-byte char; u == 0 is overlong, u > 0x10 is beyond U+10FFFF */
		if (((c[1] & 0xc0) == 0x80) &&
			(u > 0x00) && (u <= 0x10) &&
			((c[2] & 0xc0) == 0x80) && ((c[3] & 0xc0) == 0x80))
		{
			/* test for 0xzzfffe/0xzzffff */
			if (((c[1] & 0x0f) == 0x0f) && ((c[2] & 0x3f) == 0x3f) &&
				((c[3] & 0x3e) == 0x3e))
				return -1;
			return 4;
		}
		return -1;
	}
	return -1;
}
133 
134 
/*
 * Remove, in place, every byte of the NUL-terminated string pwcs that
 * utf_charcheck() does not accept as part of a valid character.
 *
 * The string is scanned with a read pointer and compacted through a write
 * pointer.  Nothing is written at all as long as no byte has been dropped,
 * so a fully valid string is left untouched.
 */
static void
mb_utf_validate(unsigned char *pwcs)
{
	unsigned char *src = pwcs;	/* next byte to examine */
	unsigned char *dst = pwcs;	/* where the next kept byte goes */

	while (*src)
	{
		int			nbytes = utf_charcheck(src);

		if (nbytes <= 0)
		{
			/* not a valid character start: drop this one byte */
			src++;
			continue;
		}

		if (dst == src)
		{
			/* nothing dropped so far, the character is already in place */
			src += nbytes;
			dst += nbytes;
			continue;
		}

		/* slide the character down over the gap left by dropped bytes */
		while (nbytes-- > 0)
			*dst++ = *src++;
	}

	/* re-terminate only if the string actually got shorter */
	if (dst != src)
		*dst = '\0';
}
166 
167 /*
168  * public functions : wcswidth and mbvalidate
169  */
170 
/*
 * pg_wcswidth is the dumb display-width function.
 * It assumes that everything will appear on one line.
 * OTOH it is easier to use than pg_wcssize if this applies to you.
 *
 * Sums the display widths, as reported by PQdsplen, of the characters in
 * the first len bytes of pwcs.  Characters whose reported width is not
 * positive contribute nothing.  Scanning stops early if a character's
 * length, as reported by PQmblen, exceeds the bytes remaining.
 */
int
pg_wcswidth(const char *pwcs, size_t len, int encoding)
{
	int			total = 0;

	while (len > 0)
	{
		int			nbytes = PQmblen(pwcs, encoding);
		int			cols;

		if ((size_t) nbytes > len)
			break;				/* Invalid string */

		cols = PQdsplen(pwcs, encoding);
		if (cols > 0)
			total += cols;

		pwcs += nbytes;
		len -= nbytes;
	}

	return total;
}
199 
/*
 * pg_wcssize takes the given string in the given encoding and returns three
 * values:
 *	  result_width: Width in display characters of the longest line in string
 *	  result_height: Number of lines in display output
 *	  result_format_size: Number of bytes required to store formatted
 *		representation of string
 *
 * Any of the three result pointers may be NULL if the caller does not need
 * that value.  Scanning stops at the first NUL byte, after len bytes, or at
 * a character that would extend past len, whichever comes first.
 *
 * This MUST be kept in sync with pg_wcsformat!
 */
void
pg_wcssize(const unsigned char *pwcs, size_t len, int encoding,
		   int *result_width, int *result_height, int *result_format_size)
{
	int			max_width = 0;	/* widest finished line so far */
	int			cur_width = 0;	/* width of the line being scanned */
	int			nlines = 1;
	int			nbytes = 0;		/* output bytes needed so far */

	while (*pwcs && len > 0)
	{
		int			chlen = PQmblen((const char *) pwcs, encoding);
		int			w;

		if ((size_t) chlen > len)
			break;
		w = PQdsplen((const char *) pwcs, encoding);

		if (chlen != 1)
		{
			if (w < 0)
			{
				/* Non-ascii control char, shown as \u0000 */
				cur_width += 6;
				nbytes += 6;
			}
			else
			{
				/* All other multibyte chars are copied unchanged */
				cur_width += w;
				nbytes += chlen;
			}
		}
		else
		{
			switch (*pwcs)
			{
				case '\n':		/* Newline: finish the current line */
					if (cur_width > max_width)
						max_width = cur_width;
					cur_width = 0;
					nlines += 1;
					nbytes += 1;	/* For NUL char */
					break;

				case '\r':		/* Carriage return, shown as \r */
					cur_width += 2;
					nbytes += 2;
					break;

				case '\t':		/* Tab: pad to the next multiple of 8 */
					{
						int			pad = 8 - cur_width % 8;

						cur_width += pad;
						nbytes += pad;
					}
					break;

				default:
					if (w < 0)
					{
						/* Other control char, shown as \xNN */
						cur_width += 4;
						nbytes += 4;
					}
					else
					{
						/* Output it as-is */
						cur_width += w;
						nbytes += 1;
					}
					break;
			}
		}

		pwcs += chlen;
		len -= chlen;
	}

	/* account for the last (unterminated) line and its terminator */
	if (cur_width > max_width)
		max_width = cur_width;
	nbytes += 1;				/* For NUL char */

	/* Set results */
	if (result_width)
		*result_width = max_width;
	if (result_height)
		*result_height = nlines;
	if (result_format_size)
		*result_format_size = nbytes;
}
286 
287 /*
288  *	Format a string into one or more "struct lineptr" lines.
289  *	lines[i].ptr == NULL indicates the end of the array.
290  *
291  * This MUST be kept in sync with pg_wcssize!
292  */
293 void
pg_wcsformat(const unsigned char * pwcs,size_t len,int encoding,struct lineptr * lines,int count)294 pg_wcsformat(const unsigned char *pwcs, size_t len, int encoding,
295 			 struct lineptr *lines, int count)
296 {
297 	int			w,
298 				chlen = 0;
299 	int			linewidth = 0;
300 	unsigned char *ptr = lines->ptr;	/* Pointer to data area */
301 
302 	for (; *pwcs && len > 0; pwcs += chlen)
303 	{
304 		chlen = PQmblen((const char *) pwcs, encoding);
305 		if (len < (size_t) chlen)
306 			break;
307 		w = PQdsplen((const char *) pwcs, encoding);
308 
309 		if (chlen == 1)			/* single-byte char */
310 		{
311 			if (*pwcs == '\n')	/* Newline */
312 			{
313 				*ptr++ = '\0';
314 				lines->width = linewidth;
315 				linewidth = 0;
316 				lines++;
317 				count--;
318 				if (count <= 0)
319 					exit(1);	/* Screwup */
320 
321 				/* make next line point to remaining memory */
322 				lines->ptr = ptr;
323 			}
324 			else if (*pwcs == '\r') /* Linefeed */
325 			{
326 				strcpy((char *) ptr, "\\r");
327 				linewidth += 2;
328 				ptr += 2;
329 			}
330 			else if (*pwcs == '\t') /* Tab */
331 			{
332 				do
333 				{
334 					*ptr++ = ' ';
335 					linewidth++;
336 				} while (linewidth % 8 != 0);
337 			}
338 			else if (w < 0)		/* Other control char */
339 			{
340 				sprintf((char *) ptr, "\\x%02X", *pwcs);
341 				linewidth += 4;
342 				ptr += 4;
343 			}
344 			else				/* Output it as-is */
345 			{
346 				linewidth += w;
347 				*ptr++ = *pwcs;
348 			}
349 		}
350 		else if (w < 0)			/* Non-ascii control char */
351 		{
352 			if (encoding == PG_UTF8)
353 				sprintf((char *) ptr, "\\u%04X", utf8_to_unicode(pwcs));
354 			else
355 			{
356 				/*
357 				 * This case cannot happen in the current code because only
358 				 * UTF-8 signals multibyte control characters. But we may need
359 				 * to support it at some stage
360 				 */
361 				sprintf((char *) ptr, "\\u????");
362 			}
363 			ptr += 6;
364 			linewidth += 6;
365 		}
366 		else					/* All other chars */
367 		{
368 			int			i;
369 
370 			for (i = 0; i < chlen; i++)
371 				*ptr++ = pwcs[i];
372 			linewidth += w;
373 		}
374 		len -= chlen;
375 	}
376 	lines->width = linewidth;
377 	*ptr++ = '\0';				/* Terminate formatted string */
378 
379 	if (count <= 0)
380 		exit(1);				/* Screwup */
381 
382 	(lines + 1)->ptr = NULL;	/* terminate line array */
383 }
384 
385 
386 /*
387  * Encoding validation: delete any unvalidatable characters from the string
388  *
389  * This seems redundant with existing functionality elsewhere?
390  */
391 unsigned char *
mbvalidate(unsigned char * pwcs,int encoding)392 mbvalidate(unsigned char *pwcs, int encoding)
393 {
394 	if (encoding == PG_UTF8)
395 		mb_utf_validate(pwcs);
396 	else
397 	{
398 		/*
399 		 * other encodings needing validation should add their own routines
400 		 * here
401 		 */
402 	}
403 
404 	return pwcs;
405 }
406