1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * sub3.c ... ALE enhancement. 28 * Since a typical Asian language has a huge character set, it is not 29 * ideal to index an array by a character code itself, which requires 30 * as large as 2**16 entries per array. 31 * To get arround this problem, we identify a set of characters that 32 * causes the same transition on all states and call it character group. 33 * Every character in a same character group has a unique number called 34 * character group id. A function yycgid(c) maps the character c (in process 35 * code) to the id. This mapping is determined by analyzing all regular 36 * expressions in the lex program. 37 * 38 */ 39 #include <stdlib.h> 40 #include <widec.h> 41 #include <search.h> 42 #include "ldefs.h" 43 44 /* 45 * "lchar" stands for linearized character. It is a variant of 46 * process code. AT&T's 16-bit process code has a drawback in which 47 * for three three process code C, D and E where C <= D <= E, 48 * codeset(C)==codeset(E) does not mean codeset(D)==codeset(C). 49 * In other words, four codesets alternates as the magnitude 50 * of character increases. 51 * The lchar representation holds this property: 52 * If three lchar C', D' and E' have the relationship C' < D' < E' and 53 * codeset(C') == codeset(E') then D' is guaranteed to belong to 54 * the same codeset as C' and E'. 55 * lchar is implemented as 32 bit entities and the function linearize() 56 * that maps a wchar_t to lchar is defined below. There is no 57 * reverse function for it though. 58 * The 32-bit process code by AT&T, used only for Taiwanese version at the 59 * time of wrting, has no such problem and we use it as it is. 60 */ 61 62 lchar yycgidtbl[MAXNCG] = { 63 0, /* For ease of computation of the id. */ 64 '\n', /* Newline is always special because '.' exclude it. */ 65 0x000000ff, /* The upper limit of codeset 0. */ 66 0x20ffffff, /* The upper limit of codeset 2. */ 67 0x40ffffff /* The upper limit of codeset 3. */ 68 /* 0x60ffffff The upper limit of codeset 1. */ 69 /* Above assumes the number of significant bits of wchar_t is <= 24. */ 70 }; 71 int ncgidtbl = 5; /* # elements in yycgidtbl. */ 72 int ncg; /* Should set to ncgidtbl*2; this is the largest value yycgid() */ 73 /* returns plus 1. */ 74 75 static void setsymbol(int i); 76 77 /* 78 * For given 16-bit wchar_t (See NOTE), lchar is computed as illustrated below: 79 * 80 * wc: axxxxxxbyyyyyyy 81 * 82 * returns: 0ab0000000000000axxxxxxxbyyyyyyy 83 * 84 * linearize() doesn't do any if compiled with 32-bit wchar_t, use of 85 * which is flagged with LONG_WCHAR_T macro. 86 * NOTE: 87 * The implementation is highly depends on the process code representation. 88 * This function should be modified when 32-bit process code is used. 89 * There is no need to keep 'a' and 'b' bits in the lower half of lchar. 90 * You can actually omit these and squeeze the xxxxxx part one bit right. 91 * We don't do that here just in sake of speed. 92 */ 93 lchar 94 linearize(wchar_t wc) 95 { 96 #ifdef LONG_WCHAR_T 97 return ((lchar)wc); /* Don't do anything. */ 98 #else 99 100 lchar prefix; 101 switch (wc&0x8080) { 102 case 0x0000: prefix = 0x00000000; break; 103 case 0x0080: prefix = 0x20000000; break; 104 case 0x8000: prefix = 0x40000000; break; 105 case 0x8080: prefix = 0x60000000; break; 106 } 107 return (prefix|wc); 108 #endif 109 } 110 111 /* compare liniear characters pointed to by pc1 and pc2 */ 112 int 113 cmplc(const void *arg1, const void *arg2) 114 { 115 lchar *pc1 = (lchar *)arg1; 116 lchar *pc2 = (lchar *)arg2; 117 118 if (*pc1 > *pc2) 119 return (1); 120 else if (*pc1 == *pc2) 121 return (0); 122 else 123 return (-1); 124 } 125 126 void 127 remch(wchar_t c) 128 { 129 lchar lc = linearize(c); 130 size_t local_ncgidtbl; 131 132 /* 133 * User-friendliness consideration: 134 * Make sure no EUC chars are used in reg. exp. 135 */ 136 if (!handleeuc) { 137 if (!isascii(c)) 138 if (iswprint(c)) 139 warning( 140 "Non-ASCII character '%wc' in pattern; use -w or -e lex option.", c); 141 else warning( 142 "Non-ASCII character of value %#x in pattern; use -w or -e lex option.", c); 143 /* In any case, we don't need to construct ncgidtbl[]. */ 144 return; 145 } 146 147 /* 148 * lsearch wants ncgidtbl to be size_t, but it is int. Hence, 149 * the use of local_ncgidtbl to satisfy the calling interface. 150 */ 151 local_ncgidtbl = ncgidtbl; 152 (void) lsearch(&lc, yycgidtbl, 153 &local_ncgidtbl, sizeof (lchar), cmplc); 154 ncgidtbl = (int)local_ncgidtbl; 155 } 156 157 void 158 sortcgidtbl(void) 159 { 160 if (!handleeuc) 161 return; 162 qsort(yycgidtbl, ncgidtbl, sizeof (lchar), cmplc); 163 } 164 165 /* 166 * int yycgid(wchar_t c) 167 * Takes c and returns its character group id, determind by the 168 * following algorithm. The program also uses the binary search 169 * algorithm, generalized from Knuth (6.2.1) Algorithm B. 170 * 171 * This function computes the "character group id" based on 172 * a table yycgidtbl of which each lchar entry is pre-sorted 173 * in ascending sequence The number of valid entries is given 174 * by YYNCGIDTBL. There is no duplicate entries in yycgidtbl. 175 * const int YYNCGIDTBL; 176 * lchar yycgidtbl[YYNCGIDTBL]; 177 * 178 * yycgidtbl[0] is guaranteed to have zero. 179 * 180 * For given c, yycgid(c) returns: 181 * 2*i iff yycgidtbl[i] == lc 182 * 2*i+1 iff yycgidtbl[i] < lc < yycgidtbl[i+1] 183 * YYNCGIDTBL*2-1 184 * iff yycgidtbl[YYNCGIDTBL-1] < lc 185 * where lc=linearize(c). 186 * 187 * Some interesting properties.: 188 * 1. For any c, 0 <= yycgid(c) <= 2*YYNCGIDTBL-1 189 * 2. yycgid(c) == 0 iff c == 0. 190 * 3. For any wchar_t c and d, if linearize(c) < linearize(d) then 191 * yycgid(c) <= yycgid(d). 192 * 4. For any wchar_t c and d, if yycgid(c) < yycgid(d) then 193 * linearize(c) < linearize(d). 194 */ 195 #define YYNCGIDTBL ncgidtbl 196 197 int 198 yycgid(wchar_t c) 199 { 200 int first = 0; 201 int last = YYNCGIDTBL - 1; 202 lchar lc; 203 204 /* 205 * In ASCII compat. mode, each character forms a "group" and the 206 * group-id is itself... 207 */ 208 if (!handleeuc) 209 return (c); 210 211 lc = linearize(c); 212 213 /* An exceptional case: yycgidtbl[YYNCGIDTBL-1] < lc */ 214 if (yycgidtbl[YYNCGIDTBL - 1] < lc) 215 return (YYNCGIDTBL*2 - 1); 216 217 while (last >= 0) { 218 int i = (first+last)/2; 219 if (lc == yycgidtbl[i]) 220 return (2*i); /* lc exactly matches an element. */ 221 else if (yycgidtbl[i] < lc) { 222 if (lc < yycgidtbl[i+1]) { 223 /* lc is in between two elements */ 224 return (2*i+1); 225 } 226 else 227 first = i + 1; 228 } else 229 last = i - 1; 230 } 231 error( 232 "system error in yycgid():binary search failed for c=0x%04x\n", c); 233 return (0); 234 } 235 236 /* 237 * repbycgid --- replaces each character in the parsing tree by its 238 * character group id. This, however, should be called even in 239 * the ASCII compat. mode to process DOT nodes and to call cclinter() 240 * for the DOT and CCL nodes. 241 */ 242 void 243 repbycgid(void) 244 { 245 int i, c; 246 247 for (i = 0; i < tptr; ++i) { 248 c = name[i]; 249 if (!ISOPERATOR(c)) { 250 /* If not an operator, it must be a char. */ 251 name[i] = yycgid((wchar_t)c); /* So replace it. */ 252 #ifdef DEBUG 253 if (debug) { 254 printf("name[%d]:'%c'->%d;\n", i, c, name[i]); 255 } 256 #endif 257 } else if (c == RSTR) { 258 c = right[i]; 259 right[i] = yycgid((wchar_t)c); 260 #ifdef DEBUG 261 if (debug) { 262 printf( 263 "name[%d].right:'%c'->%d;\n", 264 i, c, right[i]); 265 } 266 #endif 267 } else if ((c == RCCL) || (c == RNCCL)) { 268 CHR cc, *s; 269 int j; 270 CHR ccltoken[CCLSIZE]; 271 CHR *ccp; 272 int m; 273 /* 274 * This node represetns a character class RE [ccccc] 275 * s points to the string of characters that forms 276 * the class and/or a special prefix notation 277 * <RANGE>XY which corresponds to the RE X-Y, 278 * characters in the range of X and Y. Here, 279 * X <= Y is guranteed. 280 * We transform these characters into a string 281 * of sorted character group ids. 282 * 283 * There is another mechanism of packing tables 284 * that is inherited from the ASCII lex. Call of 285 * cclinter() is required for this packing. 286 * This used to be done as yylex() reads the lex 287 * rules but we have to do this here because the 288 * transition table is made to work on the char-group 289 * ids and the mapping cannot be determined until 290 * the entire file is read. 291 */ 292 #ifdef DEBUG 293 if (debug) { 294 printf("name[%d]:R[N]CCL of \"", i); 295 strpt(left[i]); 296 printf(" -> {"); 297 } 298 #endif 299 /* Prepare symbol[] for cclinter(). */ 300 for (j = 0; j < ncg; ++j) 301 symbol[j] = FALSE; 302 303 s = (CHR *) left[i]; 304 while (cc = *s++) { 305 if (cc == RANGE) { 306 int low, high, i; 307 /* 308 * Special form: <RANGE>XY 309 * This means the range X-Y. 310 * We mark all symbols[] 311 * elements for yycgid(X) thru 312 * yycgid(Y), inclusively. 313 */ 314 low = yycgid(*s++); 315 high = yycgid(*s++); 316 for (i = low; i <= high; ++i) 317 setsymbol(i); 318 } else { 319 setsymbol(yycgid(cc)); 320 } 321 } 322 323 /* Now make a transformed string of cgids. */ 324 s = ccptr; 325 m = 0; 326 for (j = 0; j < ncg; ++j) 327 if (symbol[j]) { 328 ccltoken[m++] = (CHR)j; 329 #ifdef DEBUG 330 if (debug) printf("%d, ", j); 331 #endif 332 } 333 334 #ifdef DEBUG 335 if (debug) printf("}\n"); 336 #endif 337 ccltoken[m] = 0; 338 ccp = ccl; 339 while (ccp < ccptr && scomp(ccltoken, ccp) != 0) 340 ccp++; 341 if (ccp < ccptr) { /* character class found in ccl */ 342 left[i] = (int)ccp; 343 } else { /* not in ccl, add it */ 344 left[i] = (int)ccptr; 345 scopy(ccltoken, ccptr); 346 ccptr += slength(ccltoken) + 1; 347 if (ccptr > ccl + CCLSIZE) 348 error( 349 "Too many large character classes"); 350 } 351 cclinter(c == RCCL); 352 } else if (c == DOT) { 353 if (psave == 0) { /* First DOT node. */ 354 int j, nlid; 355 /* 356 * Make symbol[k]=TRUE for all k 357 * except k == yycgid('\n'). 358 */ 359 nlid = yycgid('\n'); 360 psave = ccptr; 361 for (j = 1; j < ncg; ++j) { 362 if (j == nlid) { 363 symbol[j] = FALSE; 364 } else { 365 symbol[j] = TRUE; 366 *ccptr++ = (CHR) j; 367 } 368 } 369 *ccptr++ = 0; 370 if (ccptr > ccl + CCLSIZE) 371 error( 372 "Too many large character classes"); 373 } 374 /* Mimic mn1(RCCL,psave)... */ 375 name[i] = RCCL; 376 left[i] = (int)psave; 377 cclinter(1); 378 } 379 } 380 #ifdef DEBUG 381 if (debug) { 382 printf("treedump after repbycgid().\n"); 383 treedump(); 384 } 385 #endif 386 } 387 388 static void 389 setsymbol(int i) 390 { 391 if (i > sizeof (symbol)) 392 error("setsymbol: (SYSERR) %d out of range", i); 393 symbol[i] = TRUE; 394 } 395