/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 1998-1999, 2001-2002 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#ifndef COMMON_DEFS_H
#define COMMON_DEFS_H

#include <sys/types.h>

/* Following are replacement characters for non-identical character cases. */

#define ICV_TYPE_NON_IDENTICAL_CHAR     (-1)
#define ICV_TYPE_ILLEGAL_CHAR           (-2)

#define ICV_CHAR_ASCII_REPLACEMENT      ('?')
#define ICV_CHAR_UTF8_REPLACEMENT       (0x00efbfbd)
#define ICV_CHAR_UCS2_REPLACEMENT       (0xfffd)

/* Short alias used only inside the table below; #undef'd right after it. */
#define IL_                             ICV_TYPE_ILLEGAL_CHAR

/*
 * Local boolean type.
 *
 * NOTE: the enumerators are named `false' and `true', so this header cannot
 * be combined with <stdbool.h> (which defines macros of the same names) in
 * one translation unit.
 */
typedef enum { false = 0, true = 1 } boolean;

/*
 * Number of bytes in a UTF-8 character, indexed by the value of the
 * character's first byte.  Bytes that cannot start a character hold
 * ICV_TYPE_ILLEGAL_CHAR.
 *
 * The element type is explicitly `signed char': ICV_TYPE_ILLEGAL_CHAR is
 * negative and the signedness of plain `char' is implementation-defined, so
 * with plain `char' the sentinel would read back as a large positive value
 * on platforms where `char' is unsigned.
 */
static const signed char number_of_bytes_in_utf8_char[0x100] = {
    /* 00 - 7F: single-byte (ASCII) characters */
    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
    1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,

    /* 80  81  82  83  84  85  86  87  88  89  8A  8B  8C  8D  8E  8F */
    IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,

    /* 90  91  92  93  94  95  96  97  98  99  9A  9B  9C  9D  9E  9F */
    IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,

    /* A0  A1  A2  A3  A4  A5  A6  A7  A8  A9  AA  AB  AC  AD  AE  AF */
    IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,

    /* B0  B1  B2  B3  B4  B5  B6  B7  B8  B9  BA  BB  BC  BD  BE  BF */
    IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,

    /* C0  C1  C2  C3  C4  C5  C6  C7  C8  C9  CA  CB  CC  CD  CE  CF */
    IL_, IL_, 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

    /* D0  D1  D2  D3  D4  D5  D6  D7  D8  D9  DA  DB  DC  DD  DE  DF */
    2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,

    /* E0  E1  E2  E3  E4  E5  E6  E7  E8  E9  EA  EB  EC  ED  EE  EF */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,

    /* F0  F1  F2  F3  F4  F5  F6  F7  F8  F9  FA  FB  FC  FD  FE  FF */
    4,  4,  4,  4,  4,  IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_, IL_,
};

#undef IL_

/*
 * Following is a vector of bit-masks to get used bits in the first byte of
 * a UTF-8 character.  Index is the number of bytes in the UTF-8 character
 * and the index value comes from above table.
 */
static const char masks_tbl[7] = { 0x00, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };

/*
 * The following two vectors are to provide valid minimum and
 * maximum values for the 2nd byte of a multibyte UTF-8 character for
 * better illegal sequence checking. The index value must be the value of
 * the first byte of the UTF-8 character.
 */
static const unsigned char valid_min_2nd_byte[0x100] = {
    /* 00 - 0F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 10 - 1F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 20 - 2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 30 - 3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 40 - 4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 50 - 5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 60 - 6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 70 - 7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 80 - 8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 90 - 9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* A0 - AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* B0 - BF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* C0    C1    C2    C3    C4    C5    C6    C7 */
    0,    0,    0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    /* C8    C9    CA    CB    CC    CD    CE    CF */
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    /* D0    D1    D2    D3    D4    D5    D6    D7 */
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    /* D8    D9    DA    DB    DC    DD    DE    DF */
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    /* E0    E1    E2    E3    E4    E5    E6    E7 */
    0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    /* E8    E9    EA    EB    EC    ED    EE    EF */
    0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
    /* F0    F1    F2    F3    F4    F5    F6    F7 */
    0x90, 0x80, 0x80, 0x80, 0x80, 0,    0,    0,
    /* F8    F9    FA    FB    FC    FD    FE    FF */
    0,    0,    0,    0,    0,    0,    0,    0,
};

static const unsigned char valid_max_2nd_byte[0x100] = {
    /* 00 - 0F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 10 - 1F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 20 - 2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 30 - 3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 40 - 4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 50 - 5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 60 - 6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 70 - 7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 80 - 8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 90 - 9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* A0 - AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* B0 - BF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* C0    C1    C2    C3    C4    C5    C6    C7 */
    0,    0,    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
    /* C8    C9    CA    CB    CC    CD    CE    CF */
    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
    /* D0    D1    D2    D3    D4    D5    D6    D7 */
    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
    /* D8    D9    DA    DB    DC    DD    DE    DF */
    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
    /* E0    E1    E2    E3    E4    E5    E6    E7 */
    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf,
    /* E8    E9    EA    EB    EC    ED    EE    EF */
    0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf,
    /* F0    F1    F2    F3    F4    F5    F6    F7 */
    0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0,    0,    0,
    /* F8    F9    FA    FB    FC    FD    FE    FF */
    0,    0,    0,    0,    0,    0,    0,    0,
};


/*
 * Following "6" and "0x3f" came from 10xx xxxx bit representation of UTF-8
 * characters' second to sixth bytes.
 */
#define ICV_UTF8_BIT_SHIFT              6
#define ICV_UTF8_BIT_MASK               0x3f
#define ICV_FETCH_UTF8_BOM_SIZE         6

#define ICV_FETCH_UCS4_SIZE             4
#if defined(UCS_2) || defined(UCS_2BE) || defined(UCS_2LE) || \
    defined(UTF_16) || defined(UTF_16BE) || defined(UTF_16LE)
#define ICV_FETCH_UCS_SIZE              2
#define ICV_FETCH_UCS_SIZE_TWO          4
#elif defined(UCS_4) || defined(UCS_4BE) || defined(UCS_4LE) || \
    defined(UTF_32) || defined(UTF_32BE) || defined(UTF_32LE)
#define ICV_FETCH_UCS_SIZE              4
#define ICV_FETCH_UCS_SIZE_TWO          8
#endif


/*
 * UTF-8 representations of critical values
 */
#define ICV_UTF8_REPRESENTATION_d800            (0x00eda080UL)
#define ICV_UTF8_REPRESENTATION_dfff            (0x00edbfbfUL)
#define ICV_UTF8_REPRESENTATION_fffe            (0x00efbfbeUL)
#define ICV_UTF8_REPRESENTATION_ffff            (0x00efbfbfUL)
#define ICV_UTF8_REPRESENTATION_7fffffff        (0x00fdbfbfbfbfbfULL)

/*
 * common utility to convert utf8 string to unicode
 *
 * The byte-pointer parameter is spelled `unsigned char *' (the type the
 * Solaris `uchar_t' typedef names) so that both prototypes below agree and
 * do not depend on a non-standard typedef.
 */
extern int convert_utf8_to_ucs4(unsigned char *, int, unsigned int *);

extern int is_valid_utf8_string(unsigned char *, int);

/* UCS-2/UCS-4/UTF-16/UTF-32 requires state management. */
typedef struct {
    boolean bom_written;
    boolean little_endian;
} ucs_state_t;

#endif  /* COMMON_DEFS_H */