1 /* slwclut.c: wide character lookup tables */
2 /*
3 Copyright (C) 2004-2017,2018 John E. Davis
4
5 This file is part of the S-Lang Library.
6
7 The S-Lang Library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public License as
9 published by the Free Software Foundation; either version 2 of the
10 License, or (at your option) any later version.
11
12 The S-Lang Library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this library; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20 USA.
21 */
22 #include "slinclud.h"
23 #include <string.h>
24
25 #include "slang.h"
26 #include "_slang.h"
27 #include "slischar.h"
28
29 #define IS_ASCII256(x) ((x) < 256)
30
31 struct SLwchar_Lut_Type
32 {
33 unsigned char lut[256]; /* for chars < 256 */
34 int utf8_mode;
35
36 SLwchar_Type *chmin, *chmax;
37 unsigned int table_len;
38 unsigned int malloced_len;
39 unsigned int char_class;
40 };
41
/* Release a lookup table together with the two range arrays it owns.
 * Passing NULL is permitted and does nothing.
 */
void SLwchar_free_lut (SLwchar_Lut_Type *r)
{
   if (r != NULL)
     {
        SLfree ((char *) r->chmin);
        SLfree ((char *) r->chmax);
        SLfree ((char *) r);
     }
}
52
SLwchar_create_lut(unsigned int num_entries)53 SLwchar_Lut_Type *SLwchar_create_lut (unsigned int num_entries)
54 {
55 SLwchar_Lut_Type *r;
56
57 r = (SLwchar_Lut_Type *)SLcalloc (sizeof (SLwchar_Lut_Type), 1);
58 if (r == NULL)
59 return NULL;
60
61 r->chmin = (SLwchar_Type *) _SLcalloc (num_entries, sizeof(SLwchar_Type));
62 r->chmax = (SLwchar_Type *) _SLcalloc (num_entries, sizeof(SLwchar_Type));
63 if ((r->chmin == NULL) || (r->chmax == NULL))
64 {
65 SLwchar_free_lut (r);
66 return NULL;
67 }
68
69 r->malloced_len = num_entries;
70 r->utf8_mode = _pSLinterp_UTF8_Mode;
71 return r;
72 }
73
/* Add the inclusive range of characters between a and b (given in either
 * order) to the table.  The portion below 256 is marked in the byte table;
 * any remainder is appended to the range arrays, which are grown 5 entries
 * at a time.  Returns 0 upon success, or -1 if the arrays cannot be grown.
 */
int SLwchar_add_range_to_lut (SLwchar_Lut_Type *r, SLwchar_Type a, SLwchar_Type b)
{
   SLwchar_Type lo, hi;
   unsigned int slot;

   if (a <= b)
     {
        lo = a;
        hi = b;
     }
   else
     {
        lo = b;
        hi = a;
     }

   /* Characters below 256 live in the byte table. */
   while ((lo < 256) && (lo <= hi))
     {
        r->lut[lo] = 1;
        lo++;
     }

   if (hi < 256)
     return 0;                         /* nothing left for the range arrays */

   /* Here lo >= 256.  Make room for one more range if necessary. */
   if (r->table_len == r->malloced_len)
     {
        unsigned int new_len = r->malloced_len + 5;
        SLwchar_Type *grown;

        grown = (SLwchar_Type *) _SLrecalloc ((char *)r->chmin, new_len, sizeof (SLwchar_Type));
        if (grown == NULL)
          return -1;
        r->chmin = grown;

        grown = (SLwchar_Type *) _SLrecalloc ((char *)r->chmax, new_len, sizeof (SLwchar_Type));
        if (grown == NULL)
          return -1;
        r->chmax = grown;

        r->malloced_len = new_len;
     }

   slot = r->table_len;
   r->chmin[slot] = lo;
   r->chmax[slot] = hi;
   r->table_len = slot + 1;

   return 0;
}
127
/* Add every character belonging to any of the classes in the char_class
 * bit mask.  The mask is remembered in the table, and the characters below
 * 256 that SL_CLASSIFICATION_LOOKUP reports as members are marked in the
 * byte table.
 */
static void add_char_class (SLwchar_Lut_Type *r, unsigned int char_class)
{
   unsigned int ch;

   r->char_class |= char_class;

   for (ch = 0; ch < 256; ch++)
     {
        if (0 == (SL_CLASSIFICATION_LOOKUP(ch) & char_class))
          continue;

        r->lut[ch] = 1;
     }
}
142
/* Test whether wch is a member of the set.  Characters below 256 return the
 * byte-table entry; larger ones return 1 if they match a stored class bit
 * or fall inside one of the stored ranges, and 0 otherwise.
 */
static int wch_in_lut (SLwchar_Lut_Type *r, SLwchar_Type wch)
{
   unsigned int k;

   if (wch < 256)
     return r->lut[wch];

   if ((r->char_class != 0)
       && (0 != (SL_CLASSIFICATION_LOOKUP(wch) & r->char_class)))
     return 1;

   /* Linear scan of the ranges; a binary search would need them sorted. */
   for (k = 0; k < r->table_len; k++)
     {
        if (wch < r->chmin[k])
          continue;
        if (wch > r->chmax[k])
          continue;

        return 1;
     }
   return 0;
}
168
/* Public membership test.  Returns -1 if r is NULL, otherwise the result of
 * wch_in_lut for the character.
 */
int SLwchar_in_lut (SLwchar_Lut_Type *r, SLwchar_Type wch)
{
   return (r == NULL) ? -1 : wch_in_lut (r, wch);
}
176
/* Scan forward from p towards pmax.  With invert == 0, characters that are
 * members of the table are skipped; with invert != 0, characters that are
 * NOT members are skipped.  If ignore_combining is non-zero, decoded
 * characters whose SLwchar_wcwidth is 0 are always skipped.
 *
 * Returns a pointer to the character that stopped the scan, or the position
 * reached when the scan ran to pmax.  Returns NULL if any pointer argument
 * is NULL.
 */
SLuchar_Type *SLwchar_skip_range (SLwchar_Lut_Type *r, SLuchar_Type *p,
                                  SLuchar_Type *pmax, int ignore_combining,
                                  int invert)
{
   int utf8_mode;

   if ((r == NULL) || (p == NULL) || (pmax == NULL))
     return NULL;

   invert = (invert != 0);             /* normalize so it compares with 0/1 */
   utf8_mode = r->utf8_mode;

   while (p < pmax)
     {
        SLwchar_Type wch;
        SLstrlen_Type nbytes;
        SLuchar_Type byte = *p;

        /* Single bytes are looked up directly in the byte table. */
        if ((utf8_mode == 0) || (byte < 0x80))
          {
             if ((int) r->lut[byte] == invert)
               break;

             p++;
             continue;
          }

        /* An undecodable byte is handled as a non-member. */
        if (NULL == SLutf8_decode (p, pmax, &wch, &nbytes))
          {
             if (invert == 0)
               break;

             p++;
             continue;
          }

        if (ignore_combining && (0 == SLwchar_wcwidth (wch)))
          {
             p += nbytes;
             continue;
          }

        if (invert == wch_in_lut (r, wch))
          break;

        p += nbytes;
     }

   return p;
}
230
/* Scan backward from p towards pmin.  With invert == 0, characters that are
 * members of the table are skipped; with invert != 0, characters that are
 * NOT members are skipped.  If ignore_combining is non-zero, decoded
 * characters whose SLwchar_wcwidth is 0 are always skipped.
 *
 * Returns the position just after the character that stopped the scan, or
 * the position reached when the scan ran to pmin.  Returns NULL if any
 * pointer argument is NULL.
 */
SLuchar_Type *SLwchar_bskip_range (SLwchar_Lut_Type *r, SLuchar_Type *pmin,
                                   SLuchar_Type *p,
                                   int ignore_combining,
                                   int invert)
{
   unsigned char *lut;
   SLuchar_Type *pmax;
   int utf8_mode;

   if ((r == NULL) || (p == NULL) || (pmin == NULL))
     return NULL;

   lut = r->lut;
   pmax = p;                           /* decode limit: the starting position */

   invert = (invert != 0);             /* normalize so it compares with 0/1 */
   utf8_mode = r->utf8_mode;

   while (p > pmin)
     {
        SLuchar_Type *p0;
        SLwchar_Type wch;
        SLstrlen_Type dn;

        p0 = p - 1;
        if ((*p0 < 0x80) || (utf8_mode == 0))
          {
             if ((int)lut[*p0] == invert)
               return p;

             p = p0;
             continue;
          }

        p0 = SLutf8_bskip_char (pmin, p);

        if (NULL == SLutf8_decode (p0, pmax, &wch, &dn))
          {
             /* An undecodable sequence is treated as a non-member, exactly
              * as SLwchar_skip_range does and as the wch_in_lut test below
              * would for a character that is not in the table: it stops a
              * normal scan and is skipped by an inverted one.  (The test
              * used to be the other way around.)
              */
             if (invert == 0)
               return p;

             p = p0;
             continue;
          }

        if ((ignore_combining)
            && (0 == SLwchar_wcwidth (wch)))
          {
             p = p0;
             continue;
          }

        if (invert == wch_in_lut (r, wch))
          return p;

        p = p0;
     }

   return p;
}
291
292 /*
293 * Special Range characters:
294 *
295 * \w matches a unicode "word" character, taken to be alphanumeric.
296 * \a alpha character, excluding digits
297 * \s matches whitespace
298 * \l matches lowercase
299 * \u matches uppercase
300 * \d matches a digit
301 */
302
303 /* QUESTION: What is the encoding of the range? Is it utf-8? I suspect
304 * it ought to be. For example, a jed .sl file may use:
305 *
306 * skip_chars ("\\w\u{ADFF}-\u{AFFF}");
307 *
308 * to skip words chars and chars in the range 0xADFF-0xAFFF. By the time it
309 * gets here, the parser will have converted the wchars \u{ADFF} and \u{AFFF}
310 * to their UTF-8 equivalents. Hence the function needs to use SLutf8_decode
311 * to get characters.
312 */
313
314 typedef struct
315 {
316 SLCONST char *name;
317 char escaped_form;
318 }
319 Posix_Char_Class_Type;
320
321 static Posix_Char_Class_Type Posix_Char_Class_Table [] =
322 {
323 {"alnum", 'w'},
324 {"alpha", 'a'},
325 {"blank", 'b'},
326 {"cntrl", 'c'},
327 {"digit", 'd'},
328 {"graph", 'g'},
329 {"lower", 'l'},
330 {"print", 'p'},
331 {"punct", ','},
332 {"space", 's'},
333 {"upper", 'u'},
334 {"xdigit", 'x'},
335 {NULL, 0}
336 };
337
/* Try to parse the remainder of a POSIX character class.  Upon entry *up
 * points just past a '[' character.  If the text has the form ":name:]"
 * where name consists of the letters a-z, the name is looked up in
 * Posix_Char_Class_Table.
 *
 * Returns:
 *    1  the class is known; its code is stored in *char_classp and *up is
 *       advanced past the closing ":]".
 *    0  the text does not have the ":name:]" form; nothing is modified.
 *   -1  the form is right but the name is unknown; an error is generated.
 */
static int is_posix_charclass (SLuchar_Type **up, SLuchar_Type *umax, SLwchar_Type *char_classp)
{
   SLuchar_Type *name, *name_end;
   Posix_Char_Class_Type *entry;
   size_t name_len;

   if (**up != ':')
     return 0;
   name = *up + 1;

   /* Find the end of the run of lowercase letters. */
   name_end = name;
   while (name_end < umax)
     {
        if ((*name_end < 'a') || (*name_end > 'z'))
          break;
        name_end++;
     }

   /* The name must be followed by ":]". */
   if ((name_end + 1) >= umax)
     return 0;
   if ((name_end[0] != ':') || (name_end[1] != ']'))
     return 0;

   name_len = name_end - name;
   for (entry = Posix_Char_Class_Table; entry->name != NULL; entry++)
     {
        if (0 != strncmp (entry->name, (char *) name, name_len))
          continue;
        if (entry->name[name_len] != 0)
          continue;                    /* only a prefix of the entry matched */

        *char_classp = entry->escaped_form;
        *up = name_end + 2;
        return 1;
     }

   _pSLang_verror (SL_NotImplemented_Error, "Character class in range specification is unknown or unsupported");
   return -1;
}
374
/* Fetch the next character of a range specification starting at *up.
 *
 * Upon success, *chp receives the character and *char_classp is either 0
 * (an ordinary character) or the code of a character class.  A class is
 * recognized only when allow_charclass is non-zero, and is written either
 * as "[:name:]" or as a backslash followed by the class code.  "\\" and
 * "\^" denote a literal backslash and caret, and a backslash that ends the
 * string stands for itself.  *up is advanced past what was consumed.
 *
 * Returns -1 if decoding fails or the POSIX class name is unknown, 1 when a
 * "[:name:]" class was read, and 0 otherwise.  At the end of the string, 0
 * is returned with both outputs set to 0.
 */
static int get_lex_char (SLuchar_Type **up, SLuchar_Type *umax,
                         int allow_charclass,
                         SLwchar_Type *chp, SLwchar_Type *char_classp)
{
   SLuchar_Type *cursor = *up;
   SLwchar_Type wc;

   if (cursor == umax)
     {
        *chp = 0;
        *char_classp = 0;
        return 0;
     }

   cursor = _pSLinterp_decode_wchar (cursor, umax, &wc);
   if (cursor == NULL)
     return -1;

   if (allow_charclass && (wc == '['))
     {
        int status = is_posix_charclass (&cursor, umax, &wc);

        if (status == 1)
          {
             *char_classp = wc;
             *chp = wc;
             *up = cursor;
             return 1;
          }
        if (status != 0)
          return status;

        /* status == 0: an ordinary '[' */
     }

   if (allow_charclass && (wc == '\\') && (cursor != umax))
     {
        /* A backslash followed by at least one more character. */
        SLuchar_Type next = *cursor;

        /* \\ and \^ stand for the characters themselves.  \^ is useful in
         * constructs such as "\\^x", since a leading "^x" would otherwise
         * mean anything but x.
         */
        if ((next == '\\') || (next == '^'))
          {
             *char_classp = 0;
             *chp = next;
             *up = cursor + 1;
             return 0;
          }

        cursor = _pSLinterp_decode_wchar (cursor, umax, &wc);
        if (cursor == NULL)
          return -1;

        /* The character after the backslash is the class code. */
        *char_classp = wc;
        *chp = wc;
        *up = cursor;
        return 0;
     }

   /* An ordinary character, including a trailing backslash. */
   *char_classp = 0;
   *chp = wc;
   *up = cursor;
   return 0;
}
438
439 typedef struct
440 {
441 #define LEXICAL_CHAR_TYPE 1
442 #define LEXICAL_RANGE_TYPE 2
443 #define LEXICAL_CLASS_TYPE 3
444 int lexical_type;
445 union
446 {
447 SLwchar_Type range[2];
448 SLwchar_Type wch;
449 int char_class;
450 }
451 e;
452 }
453 Lexical_Element_Type;
454
/* Parse one element (a character, a range "a-b", or a character class)
 * from the specification in [u, umax) and store it in *lex.  Ranges are
 * recognized only if allow_range is non-zero, and classes only if
 * allow_charclass is non-zero.  A '-' that ends the string is an ordinary
 * character.
 *
 * Returns a pointer just past the element, or NULL if u == umax or the
 * specification is malformed.
 */
static SLuchar_Type *get_lexical_element (SLuchar_Type *u, SLuchar_Type *umax,
                                          int allow_range,
                                          int allow_charclass,
                                          Lexical_Element_Type *lex)
{
   /* Class codes and the classification bits they stand for.  The digit
    * class 'd' is absent because it is stored as the range 0-9.
    */
   static const struct
     {
        SLwchar_Type code;
        int char_class;
     }
   class_codes[] =
     {
        {'7', SLCHARCLASS_ASCII},
        {'a', SLCHARCLASS_ALPHA},
        {'b', SLCHARCLASS_BLANK},
        {'c', SLCHARCLASS_CNTRL},
        {'g', SLCHARCLASS_GRAPH},
        {'l', SLCHARCLASS_LOWER},
        {'p', SLCHARCLASS_PRINT},
        {',', SLCHARCLASS_PUNCT},
        {'s', SLCHARCLASS_SPACE},
        {'u', SLCHARCLASS_UPPER},
        {'x', SLCHARCLASS_XDIGIT},
        {'w', SLCHARCLASS_ALPHA|SLCHARCLASS_XDIGIT}   /* alphanumeric */
     };
   SLwchar_Type first, last;
   SLwchar_Type char_class;

   if (u == umax)
     return NULL;

   if (-1 == get_lex_char (&u, umax, allow_charclass, &first, &char_class))
     return NULL;

   if (char_class != 0)
     {
        unsigned int i;
        unsigned int num_codes = sizeof (class_codes) / sizeof (class_codes[0]);

        lex->lexical_type = LEXICAL_CLASS_TYPE;

        if (char_class == 'd')
          {
             lex->lexical_type = LEXICAL_RANGE_TYPE;
             lex->e.range[0] = '0';
             lex->e.range[1] = '9';
             return u;
          }

        for (i = 0; i < num_codes; i++)
          {
             if (class_codes[i].code != char_class)
               continue;

             lex->e.char_class = class_codes[i].char_class;
             return u;
          }

        _pSLang_verror (SL_INVALID_PARM, "Invalid character class '%c'.", char_class);
        return NULL;
     }

   /* A lone character unless a '-' follows that does not end the string. */
   if ((allow_range == 0) || (*u != '-') || (u + 1 == umax))
     {
        lex->lexical_type = LEXICAL_CHAR_TYPE;
        lex->e.wch = first;
        return u;
     }

   u++;                                /* skip the '-' */

   if (-1 == get_lex_char (&u, umax, allow_charclass, &last, &char_class))
     return NULL;

   if (char_class != 0)
     {
        _pSLang_verror (SL_INVALID_PARM, "Character class not allowed in a range");
        return NULL;
     }

   if (last == 0)
     {
        _pSLang_verror (SL_INVALID_PARM, "Unfinished range specification");
        return NULL;
     }

   lex->lexical_type = LEXICAL_RANGE_TYPE;
   lex->e.range[0] = first;
   lex->e.range[1] = last;
   return u;
}
565
SLwchar_strtolut(SLuchar_Type * u,int allow_range,int allow_charclass)566 SLwchar_Lut_Type *SLwchar_strtolut (SLuchar_Type *u,
567 int allow_range, int allow_charclass)
568 {
569 SLuchar_Type *umax;
570 SLwchar_Lut_Type *r;
571 Lexical_Element_Type lex;
572
573 r = SLwchar_create_lut (32);
574 if (r == NULL)
575 return NULL;
576
577 umax = u + strlen ((char *) u);
578
579 while (u < umax)
580 {
581 if (NULL == (u = get_lexical_element (u, umax, allow_range, allow_charclass, &lex)))
582 goto return_error;
583
584 switch (lex.lexical_type)
585 {
586 case LEXICAL_CHAR_TYPE:
587 if (-1 == SLwchar_add_range_to_lut (r, lex.e.wch, lex.e.wch))
588 goto return_error;
589 break;
590
591 case LEXICAL_RANGE_TYPE:
592 if (-1 == SLwchar_add_range_to_lut (r, lex.e.range[0], lex.e.range[1]))
593 goto return_error;
594 break;
595
596 case LEXICAL_CLASS_TYPE:
597 add_char_class (r, lex.e.char_class);
598 break;
599 }
600 }
601 return r;
602
603 return_error:
604 SLwchar_free_lut (r);
605 return NULL;
606 }
607
608 /* This structure is used for mapping 1 character to another, and is used
609 * by, e.g., strtrans.
610 *
611 * The most efficient implementation that I have come up with requires a
612 * many-1 mapping between _constructs_ in the "from" list and the "to" list.
613 * Here a _construct_ is a single character, range, or a character class.
614 * The following mappings are legal:
615 *
616 * Character --> Character
617 * Range --> Character
618 * Range --> Equal length range
619 * Range --> Class (upper or lower)
620 * Class --> Character
621 * Class --> Compatible Class
622 *
623 * For inversion, the only mapping that makes sense is a many to one mapping.
624 * For example, strtrans(str, "^A-Za-z", "x"), should replace any character
625 * that is not one of the ranges A-Z and a-z by x.
626 */
627 typedef struct Char_Map_Type
628 {
629 int (*map_function)(Lexical_Element_Type *, Lexical_Element_Type *, int,
630 SLwchar_Type, SLwchar_Type *);
631
632 Lexical_Element_Type from;
633 Lexical_Element_Type to;
634
635 struct Char_Map_Type *next;
636 }
637 Char_Map_Type;
638
639 struct SLwchar_Map_Type
640 {
641 /* for chars < 256. */
642 SLwchar_Type chmap[256];
643
644 int invert;
645 Char_Map_Type *list;
646 };
647
/* Character --> Character.  The mapping applies when `in' equals the "from"
 * character (or, if invert is set, when it does not).  Returns 1 with the
 * "to" character stored in *out when it applies, otherwise 0.
 */
static int map_char_to_char_method (Lexical_Element_Type *from,
                                    Lexical_Element_Type *to, int invert,
                                    SLwchar_Type in, SLwchar_Type *out)
{
   if ((in == from->e.wch) != invert)
     {
        *out = to->e.wch;
        return 1;
     }
   return 0;
}
659
/* Range --> Character.  The mapping applies when `in' lies within
 * [range[0], range[1]] of the "from" element (or, if invert is set, when it
 * does not).  Returns 1 with the "to" character stored in *out when it
 * applies, otherwise 0.
 */
static int map_range_to_char_method (Lexical_Element_Type *from,
                                     Lexical_Element_Type *to, int invert,
                                     SLwchar_Type in, SLwchar_Type *out)
{
   int in_range = ((from->e.range[0] <= in) && (in <= from->e.range[1]));

   if (in_range == invert)
     return 0;

   *out = to->e.wch;
   return 1;
}
671
/* Range --> Range.  When the mapping applies (same test as for
 * Range --> Character), *out receives the character at the same offset from
 * to->e.range[0] as `in' is from from->e.range[0], and 1 is returned.
 * Otherwise 0 is returned.
 */
static int map_range_to_range_method (Lexical_Element_Type *from,
                                      Lexical_Element_Type *to, int invert,
                                      SLwchar_Type in, SLwchar_Type *out)
{
   SLwchar_Type from_start = from->e.range[0];
   int in_range = ((from_start <= in) && (in <= from->e.range[1]));

   if (in_range == invert)
     return 0;

   *out = to->e.range[0] + (in - from_start);
   return 1;
}
683
/* Range --> Class.  When the mapping applies (same test as for
 * Range --> Character), `in' is converted to upper or lower case according
 * to the "to" class and 1 is returned.  Returns 0 if the mapping does not
 * apply or the "to" class is neither upper nor lower.
 */
static int map_range_to_class_method (Lexical_Element_Type *from,
                                      Lexical_Element_Type *to, int invert,
                                      SLwchar_Type in, SLwchar_Type *out)
{
   int in_range = ((from->e.range[0] <= in) && (in <= from->e.range[1]));

   if (in_range == invert)
     return 0;

   switch (to->e.char_class)
     {
      case SLCHARCLASS_UPPER:
        *out = SLwchar_toupper (in);
        return 1;

      case SLCHARCLASS_LOWER:
        *out = SLwchar_tolower (in);
        return 1;

      default:
        return 0;
     }
}
701
/* Test whether the character w belongs to the class denoted by char_class.
 * Every class that check_char_mapping accepts as a "from" class is handled
 * here, so that characters >= 256 are classified the same way as those that
 * go through the 256-entry table.
 *
 * Returns exactly 1 for a member and 0 otherwise (0 also for a class that is
 * not listed).  The callers XOR the result with a 0/1 inversion flag, so a
 * "true" value other than 1 must not escape from here.
 * NOTE(review): the exact return values of the SLwchar_is* predicates are
 * not visible in this file; they are only assumed to be non-zero for true.
 */
static int is_of_class (int char_class, SLwchar_Type w)
{
   int member;

   switch (char_class)
     {
      case SLCHARCLASS_ALPHA:
        member = SLwchar_isalpha (w);
        break;

      case SLCHARCLASS_ALPHA|SLCHARCLASS_XDIGIT:
        member = SLwchar_isalnum (w);
        break;

      case SLCHARCLASS_UPPER:
        member = SLwchar_isupper (w);
        break;

      case SLCHARCLASS_LOWER:
        member = SLwchar_islower (w);
        break;

      case SLCHARCLASS_SPACE:
        member = SLwchar_isspace (w);
        break;

      case SLCHARCLASS_ASCII:
        member = (w < (SLwchar_Type)0x80);
        break;

      case SLCHARCLASS_BLANK:
        member = SLwchar_isblank (w);
        break;

      case SLCHARCLASS_CNTRL:
        member = SLwchar_iscntrl (w);
        break;

      case SLCHARCLASS_GRAPH:
        member = SLwchar_isgraph (w);
        break;

      case SLCHARCLASS_PRINT:
        member = SLwchar_isprint (w);
        break;

      case SLCHARCLASS_PUNCT:
        member = SLwchar_ispunct (w);
        break;

      case SLCHARCLASS_XDIGIT:
        member = SLwchar_isxdigit (w);
        break;

      default:
        return 0;
     }

   return (member != 0);
}
727
/* Class --> Character.  The mapping applies when the value is_of_class
 * returns for `in' and the "from" class differs from invert.  Returns 1
 * with the "to" character stored in *out when it applies, otherwise 0.
 */
static int map_class_to_char_method (Lexical_Element_Type *from,
                                     Lexical_Element_Type *to, int invert,
                                     SLwchar_Type in, SLwchar_Type *out)
{
   int member = is_of_class (from->e.char_class, in);

   if (member == invert)
     return 0;

   *out = to->e.wch;
   return 1;
}
739
/* Class --> Class.  When the mapping applies (same test as for
 * Class --> Character), `in' is converted to upper or lower case according
 * to the "to" class and 1 is returned.  Returns 0 if the mapping does not
 * apply or the "to" class is neither upper nor lower.
 */
static int map_class_to_class_method (Lexical_Element_Type *from,
                                      Lexical_Element_Type *to, int invert,
                                      SLwchar_Type in, SLwchar_Type *out)
{
   int member = is_of_class (from->e.char_class, in);

   if (member == invert)
     return 0;

   switch (to->e.char_class)
     {
      case SLCHARCLASS_UPPER:
        *out = SLwchar_toupper (in);
        return 1;

      case SLCHARCLASS_LOWER:
        *out = SLwchar_tolower (in);
        return 1;

      default:
        return 0;
     }
}
757
/* Fill the 256-entry table chmap.  Entry 0 is always 0.  Each of the other
 * entries is set to (*to_func)(i) if to_func is non-NULL, and to wch
 * otherwise.
 */
static void init_chmap (SLwchar_Type *chmap, SLwchar_Type wch,
                        SLwchar_Type (*to_func)(SLwchar_Type))
{
   unsigned int ch;

   chmap[0] = 0;

   for (ch = 1; ch < 256; ch++)
     chmap[ch] = (to_func == NULL) ? wch : (*to_func) (ch);
}
775
/* Return the end points of a range element in increasing order via *chminp
 * and *chmaxp.  *range_dirp is set to 1 if the range was written in
 * increasing order (or has equal end points), and to -1 if it was written
 * in decreasing order.
 */
static void get_range_values (Lexical_Element_Type *lex,
                              SLwchar_Type *chminp, SLwchar_Type *chmaxp,
                              int *range_dirp)
{
   SLwchar_Type first = lex->e.range[0];
   SLwchar_Type last = lex->e.range[1];

   if (first <= last)
     {
        *range_dirp = 1;
        *chminp = first;
        *chmaxp = last;
        return;
     }

   /* Descending range.
    * NOTE(review): these two stores write back the values that lex already
    * holds, so the element is left in its original (descending) order.  If
    * the intent was to normalize lex, the stores are swapped -- confirm
    * before changing, since the map_range_* methods read lex->e.range.
    */
   lex->e.range[0] = first;
   lex->e.range[1] = last;

   *range_dirp = -1;
   *chminp = last;
   *chmaxp = first;
}
797
/* Returns 1 if wch is a 7-bit character (0 through 0x7F), otherwise 0. */
static int is_ascii (SLwchar_Type wch)
{
   return (wch <= (SLwchar_Type) 0x7F);
}
802
check_char_mapping(SLwchar_Map_Type * map,Char_Map_Type * list,int first_time)803 static int check_char_mapping (SLwchar_Map_Type *map, Char_Map_Type *list, int first_time)
804 {
805 Lexical_Element_Type *lex_from, *lex_to;
806 SLwchar_Type chmin, chmax, wch, wch1;
807 SLwchar_Type (*to_func) (SLwchar_Type);
808 int (*is_func) (SLwchar_Type);
809 SLwchar_Type *chmap;
810 int invert, from_range_dir, to_range_dir;
811
812 lex_to = &list->to;
813 lex_from = &list->from;
814 chmap = map->chmap;
815 invert = map->invert;
816
817 switch (lex_from->lexical_type)
818 {
819 default:
820 return -1;
821
822 case LEXICAL_CHAR_TYPE:
823 if (lex_to->lexical_type != LEXICAL_CHAR_TYPE)
824 return -1;
825
826 wch = lex_to->e.wch;
827 if (invert && first_time)
828 init_chmap (chmap, wch, NULL);
829
830 list->map_function = map_char_to_char_method;
831
832 if (0 == IS_ASCII256(lex_from->e.wch))
833 break;
834
835 if (invert)
836 map->chmap[lex_from->e.wch] = lex_from->e.wch;
837 else
838 {
839 map->chmap[lex_from->e.wch] = wch;
840 list->map_function = NULL;
841 }
842 break;
843
844 case LEXICAL_RANGE_TYPE:
845 get_range_values (lex_from, &chmin, &chmax, &from_range_dir);
846
847 switch (lex_to->lexical_type)
848 {
849 case LEXICAL_CHAR_TYPE:
850 wch = lex_to->e.wch;
851 if (invert && first_time)
852 init_chmap (chmap, wch, NULL);
853
854 while ((chmin < 256) && (chmin <= chmax))
855 {
856 chmap[chmin] = (invert ? chmin : wch);
857 chmin++;
858 }
859 list->map_function = map_range_to_char_method;
860 break;
861
862 case LEXICAL_CLASS_TYPE:
863 if (lex_to->e.char_class == SLCHARCLASS_UPPER)
864 to_func = SLwchar_toupper;
865 else if (lex_to->e.char_class == SLCHARCLASS_LOWER)
866 to_func = SLwchar_tolower;
867 else return -1;
868
869 if (invert && first_time)
870 init_chmap (chmap, 0, to_func);
871
872 while ((chmin < 256) && (chmin <= chmax))
873 {
874 chmap[chmin] = (invert ? chmin : (*to_func) (chmin));
875 chmin++;
876 }
877 list->map_function = map_range_to_class_method;
878 break;
879
880 case LEXICAL_RANGE_TYPE:
881 if (invert)
882 {
883 _pSLang_verror (SL_INVALID_PARM, "Inversion from a range to a range not permitted");
884 return -1;
885 }
886
887 get_range_values (lex_to, &wch, &wch1, &to_range_dir);
888
889 if ((chmax - chmin) != (wch1 - wch))
890 {
891 _pSLang_verror (SL_INVALID_PARM, "Character mapping of unequal ranges is forbidden");
892 return -1;
893 }
894 if (from_range_dir != to_range_dir)
895 {
896 wch = wch1;
897 to_range_dir = -1;
898 }
899 else to_range_dir = 1;
900
901 while ((chmin < 256) && (chmin <= chmax))
902 {
903 chmap[chmin] = wch;
904 chmin++;
905 wch += to_range_dir;
906 }
907 list->map_function = map_range_to_range_method;
908 break;
909
910 default:
911 return -1;
912 }
913 if ((chmax < 256) && (invert == 0))
914 list->map_function = NULL;
915 break;
916
917 case LEXICAL_CLASS_TYPE:
918 switch (lex_from->e.char_class)
919 {
920 case SLCHARCLASS_ALPHA:
921 is_func = SLwchar_isalpha;
922 break;
923
924 case SLCHARCLASS_ALPHA|SLCHARCLASS_XDIGIT:
925 is_func = SLwchar_isalnum;
926 break;
927
928 case SLCHARCLASS_UPPER:
929 is_func = SLwchar_isupper;
930 break;
931
932 case SLCHARCLASS_LOWER:
933 is_func = SLwchar_islower;
934 break;
935
936 case SLCHARCLASS_SPACE:
937 is_func = SLwchar_isspace;
938 break;
939
940 case SLCHARCLASS_ASCII:
941 is_func = is_ascii;
942 break;
943
944 case SLCHARCLASS_BLANK:
945 is_func = SLwchar_isblank;
946 break;
947
948 case SLCHARCLASS_CNTRL:
949 is_func = SLwchar_iscntrl;
950 break;
951
952 case SLCHARCLASS_GRAPH:
953 is_func = SLwchar_isgraph;
954 break;
955
956 case SLCHARCLASS_PRINT:
957 is_func = SLwchar_isprint;
958 break;
959
960 case SLCHARCLASS_PUNCT:
961 is_func = SLwchar_ispunct;
962 break;
963
964 case SLCHARCLASS_XDIGIT:
965 is_func = SLwchar_isxdigit;
966 break;
967
968 default:
969 _pSLang_verror (SL_INVALID_PARM, "Invalid character class in character map");
970 return -1;
971 }
972 switch (lex_to->lexical_type)
973 {
974 case LEXICAL_CHAR_TYPE:
975 wch = lex_to->e.wch;
976
977 if (first_time && invert)
978 init_chmap (chmap, wch, NULL);
979
980 for (chmin = 0; chmin < 256; chmin++)
981 {
982 if ((*is_func)(chmin))
983 chmap[chmin] = (invert ? chmin : wch);
984 }
985 list->map_function = map_class_to_char_method;
986 break;
987
988 case LEXICAL_CLASS_TYPE:
989 switch (lex_to->e.char_class)
990 {
991 case SLCHARCLASS_LOWER:
992 to_func = SLwchar_tolower;
993 break;
994 case SLCHARCLASS_UPPER:
995 to_func = SLwchar_toupper;
996 break;
997
998 default:
999 return -1;
1000 }
1001
1002 if (invert && first_time)
1003 init_chmap (chmap, 0, to_func);
1004
1005 for (chmin = 0; chmin < 256; chmin++)
1006 {
1007 if ((*is_func)(chmin))
1008 chmap[chmin] = (invert ? chmin : (*to_func)(chmin));
1009 }
1010 list->map_function = map_class_to_class_method;
1011 break;
1012
1013 default:
1014 return -1;
1015 }
1016 break;
1017 }
1018 return 0;
1019 }
1020
/* Release a single list node.  Nodes own no other allocations. */
static void free_char_map_type (Char_Map_Type *m)
{
   SLfree ((char *) m);
}
1025
/* Release a character map and every node on its list.  Passing NULL is
 * permitted and does nothing.
 */
void SLwchar_free_char_map (SLwchar_Map_Type *map)
{
   Char_Map_Type *node, *next;

   if (map == NULL)
     return;

   for (node = map->list; node != NULL; node = next)
     {
        next = node->next;             /* fetch before the node is freed */
        free_char_map_type (node);
     }

   SLfree ((char *) map);
}
1042
SLwchar_allocate_char_map(SLuchar_Type * from,SLuchar_Type * to)1043 SLwchar_Map_Type *SLwchar_allocate_char_map (SLuchar_Type *from, SLuchar_Type *to)
1044 {
1045 SLwchar_Map_Type *map;
1046 Char_Map_Type *list, *prev;
1047 SLuchar_Type *from_max, *to_max;
1048 unsigned int i;
1049 int invert = 0, first_time;
1050
1051 if (*from == '^')
1052 {
1053 invert = 1;
1054 from++;
1055 }
1056
1057 #if 0
1058 if (*from == 0)
1059 {
1060 _pSLang_verror (SL_INVALID_PARM, "Illegal empty string in character map specification");
1061 return NULL;
1062 }
1063 #endif
1064 map = (SLwchar_Map_Type *)SLcalloc (1, sizeof (SLwchar_Map_Type));
1065 if (map == NULL)
1066 return NULL;
1067
1068 map->invert = invert;
1069
1070 for (i = 0; i < 256; i++)
1071 map->chmap[i] = i;
1072
1073 from_max = from + strlen ((char *) from);
1074 to_max = to + strlen ((char *) to);
1075
1076 list = NULL;
1077
1078 while (from < from_max)
1079 {
1080 Char_Map_Type *next;
1081 SLuchar_Type *next_to;
1082
1083 if (NULL == (next = (Char_Map_Type *) SLcalloc (1, sizeof (Char_Map_Type))))
1084 goto return_error;
1085
1086 if (list == NULL)
1087 map->list = next;
1088 else
1089 list->next = next;
1090 list = next;
1091
1092 if (NULL == (from = get_lexical_element (from, from_max, 1, 1, &list->from)))
1093 goto return_error;
1094
1095 if (NULL == (next_to = get_lexical_element (to, to_max, 1, 1, &list->to)))
1096 goto return_error;
1097
1098 /* If the mapping is not 1-1, then the last "to" object applies to the
1099 * remaining "from" objects. This will permit, e.g.,
1100 * A-Za-z --> X
1101 */
1102 if (next_to != to_max)
1103 {
1104 if (invert)
1105 {
1106 _pSLang_verror (SL_INVALID_PARM, "Character map inversion must specify a many-to-one mapping");
1107 goto return_error;
1108 }
1109 to = next_to;
1110 }
1111 }
1112
1113 list = map->list;
1114 prev = NULL;
1115 first_time = 1;
1116 while (list != NULL)
1117 {
1118 Char_Map_Type *next = list->next;
1119
1120 if (-1 == check_char_mapping (map, list, first_time))
1121 {
1122 _pSLang_verror (SL_INVALID_PARM, "Specified character mapping is invalid");
1123 goto return_error;
1124 }
1125 first_time = 0;
1126
1127 if (list->map_function == NULL)
1128 {
1129 if (prev == NULL)
1130 map->list = next;
1131 else
1132 prev->next = next;
1133
1134 free_char_map_type (list);
1135 }
1136 else prev = list;
1137 list = next;
1138 }
1139 return map;
1140
1141 return_error:
1142 SLwchar_free_char_map (map);
1143 return NULL;
1144 }
1145
/* Run wc_in through the list of pairs.  For a normal map, the first pair
 * whose method succeeds decides the result.  For an inverted map, the first
 * pair that has a method decides it, whether or not the method succeeds.
 * Returns the deciding method's status (non-zero means *wc_out was set), or
 * 0 if no pair applied.
 */
static int apply_lexical_map (SLwchar_Map_Type *map, SLwchar_Type wc_in, SLwchar_Type *wc_out)
{
   Char_Map_Type *node;
   int invert = map->invert;

   for (node = map->list; node != NULL; node = node->next)
     {
        int status;

        if (node->map_function == NULL)
          continue;

        status = (*node->map_function)(&node->from, &node->to, invert, wc_in, wc_out);
        if ((status != 0) || invert)
          return status;
     }
   return 0;
}
1167
/* Map the num characters of input[] into output[].  Characters below 0x100
 * are translated via the table; others are run through the list of pairs
 * and copied unchanged if no pair applies.  Returns 0 upon success, or -1
 * if any pointer argument is NULL.
 */
int SLwchar_apply_char_map (SLwchar_Map_Type *map, SLwchar_Type *input, SLwchar_Type *output, unsigned int num)
{
   unsigned int k;

   if ((map == NULL) || (input == NULL) || (output == NULL))
     return -1;

   for (k = 0; k < num; k++)
     {
        SLwchar_Type wc = input[k];

        if (wc < 0x100)
          output[k] = map->chmap[wc];
        else if (0 == apply_lexical_map (map, wc, &output[k]))
          output[k] = wc;
     }

   return 0;
}
1194
1195 /* This function returns a malloced string */
SLuchar_apply_char_map(SLwchar_Map_Type * map,SLuchar_Type * str)1196 SLuchar_Type *SLuchar_apply_char_map (SLwchar_Map_Type *map, SLuchar_Type *str)
1197 {
1198 SLuchar_Type *str_max;
1199 SLuchar_Type *output, *output_max, *outptr;
1200 int use_chmap;
1201 size_t len;
1202 SLwchar_Type *chmap;
1203
1204 if ((map == NULL) || (str == NULL))
1205 return NULL;
1206
1207 use_chmap = 1;
1208 if (_pSLinterp_UTF8_Mode == 0)
1209 str_max = str + strlen ((char *)str);
1210 else
1211 {
1212 str_max = str;
1213 while (*str_max)
1214 {
1215 if (*str_max & 0x80)
1216 use_chmap = 0;
1217 str_max++;
1218 }
1219 }
1220
1221 len = str_max - str;
1222 chmap = map->chmap;
1223
1224 if (use_chmap)
1225 {
1226 unsigned int i;
1227
1228 output = (SLuchar_Type *)SLmalloc (len+1);
1229 if (output == NULL)
1230 return NULL;
1231
1232 for (i = 0; i < len; i++)
1233 output[i] = chmap[str[i]];
1234
1235 output[len] = 0;
1236 return output;
1237 }
1238
1239 /* Hard way */
1240 len += SLUTF8_MAX_MBLEN;
1241 if (NULL == (output = (SLuchar_Type *)SLmalloc (len + 1)))
1242 return NULL;
1243 output_max = output + len;
1244 outptr = output;
1245
1246 while (str < str_max)
1247 {
1248 SLwchar_Type w_out, w_in;
1249 unsigned int encoded_len;
1250
1251 w_in = (SLwchar_Type) *str;
1252 if (w_in < 0x80)
1253 str++;
1254 else if (NULL == (str = _pSLinterp_decode_wchar (str, str_max, &w_in)))
1255 goto return_error;
1256
1257 if (w_in < 0x100)
1258 {
1259 w_out = chmap[w_in];
1260 if ((w_out < 0x80) && (outptr < output_max))
1261 {
1262 *outptr++ = (SLuchar_Type) w_out;
1263 continue;
1264 }
1265 }
1266 else
1267 {
1268 if (-1 == SLwchar_apply_char_map (map, &w_in, &w_out, 1))
1269 goto return_error;
1270 }
1271
1272 if (outptr + SLUTF8_MAX_MBLEN >= output_max)
1273 {
1274 SLuchar_Type *tmp;
1275
1276 len += 32 * SLUTF8_MAX_MBLEN;
1277 if (NULL == (tmp = (SLuchar_Type *)SLrealloc ((char *)output, len)))
1278 goto return_error;
1279
1280 outptr = tmp + (outptr - output);
1281 output = tmp;
1282 output_max = output + len;
1283 }
1284
1285 if (NULL == (outptr = _pSLinterp_encode_wchar (w_out, outptr, &encoded_len)))
1286 goto return_error;
1287 }
1288
1289 *outptr = 0;
1290
1291 return output;
1292
1293 return_error:
1294 SLfree ((char *) output);
1295 return NULL;
1296 }
1297