1 /*------------------------------------------------------------------------- 2 * 3 * Multibyte character printing support for frontend code 4 * 5 * add_third_party_module_dirs(lldb_root)6 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group 7 * Portions Copyright (c) 1994, Regents of the University of California 8 * 9 * src/fe_utils/mbprint.c 10 * 11 *------------------------------------------------------------------------- 12 */ 13 #include "postgres_fe.h" 14 15 #include "fe_utils/mbprint.h" 16 17 #include "libpq-fe.h" 18 19 20 /* 21 * To avoid version-skew problems, this file must not use declarations 22 * from pg_wchar.h: the encoding IDs we are dealing with are determined 23 * by the libpq.so we are linked with, and that might not match the 24 * numbers we see at compile time. (If this file were inside libpq, 25 * the problem would go away...) 26 * 27 * Hence, we have our own definition of pg_wchar, and we get the values 28 * of any needed encoding IDs on-the-fly. 29 */ 30 31 typedef unsigned int pg_wchar; 32 33 static int 34 pg_get_utf8_id(void) 35 { 36 static int utf8_id = -1; 37 38 if (utf8_id < 0) 39 utf8_id = pg_char_to_encoding("utf8"); 40 return utf8_id; 41 } 42 43 #define PG_UTF8 pg_get_utf8_id() 44 45 46 /* 47 * Convert a UTF-8 character to a Unicode code point. 48 * This is a one-character version of pg_utf2wchar_with_len. 49 * 50 * No error checks here, c must point to a long-enough string. 51 */ 52 static pg_wchar 53 utf8_to_unicode(const unsigned char *c) 54 { 55 if ((*c & 0x80) == 0) 56 return (pg_wchar) c[0]; 57 else if ((*c & 0xe0) == 0xc0) 58 return (pg_wchar) (((c[0] & 0x1f) << 6) | 59 (c[1] & 0x3f)); 60 else if ((*c & 0xf0) == 0xe0) 61 return (pg_wchar) (((c[0] & 0x0f) << 12) | 62 ((c[1] & 0x3f) << 6) | 63 (c[2] & 0x3f)); 64 else if ((*c & 0xf8) == 0xf0) 65 return (pg_wchar) (((c[0] & 0x07) << 18) | 66 ((c[1] & 0x3f) << 12) | 67 ((c[2] & 0x3f) << 6) | 68 (c[3] & 0x3f)); 69 else 70 /* that is an invalid code on purpose */ 71 return 0xffffffff; 72 } 73 74 75 /* 76 * Unicode 3.1 compliant validation : for each category, it checks the 77 * combination of each byte to make sure it maps to a valid range. It also 78 * returns -1 for the following UCS values: ucs > 0x10ffff ucs & 0xfffe = 79 * 0xfffe 0xfdd0 < ucs < 0xfdef ucs & 0xdb00 = 0xd800 (surrogates) 80 */ 81 static int 82 utf_charcheck(const unsigned char *c) 83 { 84 if ((*c & 0x80) == 0) 85 return 1; 86 else if ((*c & 0xe0) == 0xc0) 87 { 88 /* two-byte char */ 89 if (((c[1] & 0xc0) == 0x80) && ((c[0] & 0x1f) > 0x01)) 90 return 2; 91 return -1; 92 } 93 else if ((*c & 0xf0) == 0xe0) 94 { 95 /* three-byte char */ 96 if (((c[1] & 0xc0) == 0x80) && 97 (((c[0] & 0x0f) != 0x00) || ((c[1] & 0x20) == 0x20)) && 98 ((c[2] & 0xc0) == 0x80)) 99 { 100 int z = c[0] & 0x0f; 101 int yx = ((c[1] & 0x3f) << 6) | (c[0] & 0x3f); 102 int lx = yx & 0x7f; 103 104 /* check 0xfffe/0xffff, 0xfdd0..0xfedf range, surrogates */ 105 if (((z == 0x0f) && 106 (((yx & 0xffe) == 0xffe) || 107 (((yx & 0xf80) == 0xd80) && (lx >= 0x30) && (lx <= 0x4f)))) || 108 ((z == 0x0d) && ((yx & 0xb00) == 0x800))) 109 return -1; 110 return 3; 111 } 112 return -1; 113 } 114 else if ((*c & 0xf8) == 0xf0) 115 { 116 int u = ((c[0] & 0x07) << 2) | ((c[1] & 0x30) >> 4); 117 118 /* four-byte char */ 119 if (((c[1] & 0xc0) == 0x80) && 120 (u > 0x00) && (u <= 0x10) && 121 ((c[2] & 0xc0) == 0x80) && ((c[3] & 0xc0) == 0x80)) 122 { 123 /* test for 0xzzzzfffe/0xzzzzfffff */ 124 if (((c[1] & 0x0f) == 0x0f) && ((c[2] & 0x3f) == 0x3f) && 125 ((c[3] & 0x3e) == 0x3e)) 126 return -1; 127 return 4; 128 } 129 return -1; 130 } 131 return -1; 132 } 133 134 135 static void 136 mb_utf_validate(unsigned char *pwcs) 137 { 138 unsigned char *p = pwcs; 139 140 while (*pwcs) 141 { 142 int len; 143 144 if ((len = utf_charcheck(pwcs)) > 0) 145 { 146 if (p != pwcs) 147 { 148 int i; 149 150 for (i = 0; i < len; i++) 151 *p++ = *pwcs++; 152 } 153 else 154 { 155 pwcs += len; 156 p += len; 157 } 158 } 159 else 160 /* we skip the char */ 161 pwcs++; 162 } 163 if (p != pwcs) 164 *p = '\0'; 165 } 166 167 /* 168 * public functions : wcswidth and mbvalidate 169 */ 170 171 /* 172 * pg_wcswidth is the dumb display-width function. 173 * It assumes that everything will appear on one line. 174 * OTOH it is easier to use than pg_wcssize if this applies to you. 175 */ 176 int 177 pg_wcswidth(const char *pwcs, size_t len, int encoding) 178 { 179 int width = 0; 180 181 while (len > 0) 182 { 183 int chlen, 184 chwidth; 185 186 chlen = PQmblen(pwcs, encoding); 187 if (len < (size_t) chlen) 188 break; /* Invalid string */ 189 190 chwidth = PQdsplen(pwcs, encoding); 191 if (chwidth > 0) 192 width += chwidth; 193 194 pwcs += chlen; 195 len -= chlen; 196 } 197 return width; 198 } 199 200 /* 201 * pg_wcssize takes the given string in the given encoding and returns three 202 * values: 203 * result_width: Width in display characters of the longest line in string 204 * result_height: Number of lines in display output 205 * result_format_size: Number of bytes required to store formatted 206 * representation of string 207 * 208 * This MUST be kept in sync with pg_wcsformat! 209 */ 210 void 211 pg_wcssize(const unsigned char *pwcs, size_t len, int encoding, 212 int *result_width, int *result_height, int *result_format_size) 213 { 214 int w, 215 chlen = 0, 216 linewidth = 0; 217 int width = 0; 218 int height = 1; 219 int format_size = 0; 220 221 for (; *pwcs && len > 0; pwcs += chlen) 222 { 223 chlen = PQmblen((const char *) pwcs, encoding); 224 if (len < (size_t) chlen) 225 break; 226 w = PQdsplen((const char *) pwcs, encoding); 227 228 if (chlen == 1) /* single-byte char */ 229 { 230 if (*pwcs == '\n') /* Newline */ 231 { 232 if (linewidth > width) 233 width = linewidth; 234 linewidth = 0; 235 height += 1; 236 format_size += 1; /* For NUL char */ 237 } 238 else if (*pwcs == '\r') /* Linefeed */ 239 { 240 linewidth += 2; 241 format_size += 2; 242 } 243 else if (*pwcs == '\t') /* Tab */ 244 { 245 do 246 { 247 linewidth++; 248 format_size++; 249 } while (linewidth % 8 != 0); 250 } 251 else if (w < 0) /* Other control char */ 252 { 253 linewidth += 4; 254 format_size += 4; 255 } 256 else /* Output it as-is */ 257 { 258 linewidth += w; 259 format_size += 1; 260 } 261 } 262 else if (w < 0) /* Non-ascii control char */ 263 { 264 linewidth += 6; /* \u0000 */ 265 format_size += 6; 266 } 267 else /* All other chars */ 268 { 269 linewidth += w; 270 format_size += chlen; 271 } 272 len -= chlen; 273 } 274 if (linewidth > width) 275 width = linewidth; 276 format_size += 1; /* For NUL char */ 277 278 /* Set results */ 279 if (result_width) 280 *result_width = width; 281 if (result_height) 282 *result_height = height; 283 if (result_format_size) 284 *result_format_size = format_size; 285 } 286 287 /* 288 * Format a string into one or more "struct lineptr" lines. 289 * lines[i].ptr == NULL indicates the end of the array. 290 * 291 * This MUST be kept in sync with pg_wcssize! 292 */ 293 void 294 pg_wcsformat(const unsigned char *pwcs, size_t len, int encoding, 295 struct lineptr *lines, int count) 296 { 297 int w, 298 chlen = 0; 299 int linewidth = 0; 300 unsigned char *ptr = lines->ptr; /* Pointer to data area */ 301 302 for (; *pwcs && len > 0; pwcs += chlen) 303 { 304 chlen = PQmblen((const char *) pwcs, encoding); 305 if (len < (size_t) chlen) 306 break; 307 w = PQdsplen((const char *) pwcs, encoding); 308 309 if (chlen == 1) /* single-byte char */ 310 { 311 if (*pwcs == '\n') /* Newline */ 312 { 313 *ptr++ = '\0'; 314 lines->width = linewidth; 315 linewidth = 0; 316 lines++; 317 count--; 318 if (count <= 0) 319 exit(1); /* Screwup */ 320 321 /* make next line point to remaining memory */ 322 lines->ptr = ptr; 323 } 324 else if (*pwcs == '\r') /* Linefeed */ 325 { 326 strcpy((char *) ptr, "\\r"); 327 linewidth += 2; 328 ptr += 2; 329 } 330 else if (*pwcs == '\t') /* Tab */ 331 { 332 do 333 { 334 *ptr++ = ' '; 335 linewidth++; 336 } while (linewidth % 8 != 0); 337 } 338 else if (w < 0) /* Other control char */ 339 { 340 sprintf((char *) ptr, "\\x%02X", *pwcs); 341 linewidth += 4; 342 ptr += 4; 343 } 344 else /* Output it as-is */ 345 { 346 linewidth += w; 347 *ptr++ = *pwcs; 348 } 349 } 350 else if (w < 0) /* Non-ascii control char */ 351 { 352 if (encoding == PG_UTF8) 353 sprintf((char *) ptr, "\\u%04X", utf8_to_unicode(pwcs)); 354 else 355 { 356 /* 357 * This case cannot happen in the current code because only 358 * UTF-8 signals multibyte control characters. But we may need 359 * to support it at some stage 360 */ 361 sprintf((char *) ptr, "\\u????"); 362 } 363 ptr += 6; 364 linewidth += 6; 365 } 366 else /* All other chars */ 367 { 368 int i; 369 370 for (i = 0; i < chlen; i++) 371 *ptr++ = pwcs[i]; 372 linewidth += w; 373 } 374 len -= chlen; 375 } 376 lines->width = linewidth; 377 *ptr++ = '\0'; /* Terminate formatted string */ 378 379 if (count <= 0) 380 exit(1); /* Screwup */ 381 382 (lines + 1)->ptr = NULL; /* terminate line array */ 383 } 384 385 386 /* 387 * Encoding validation: delete any unvalidatable characters from the string 388 * 389 * This seems redundant with existing functionality elsewhere? 390 */ 391 unsigned char * 392 mbvalidate(unsigned char *pwcs, int encoding) 393 { 394 if (encoding == PG_UTF8) 395 mb_utf_validate(pwcs); 396 else 397 { 398 /* 399 * other encodings needing validation should add their own routines 400 * here 401 */ 402 } 403 404 return pwcs; 405 } 406