1 /* slwclut.c: wide character lookup tables */
2 /*
3 Copyright (C) 2004-2017,2018 John E. Davis
4 
5 This file is part of the S-Lang Library.
6 
7 The S-Lang Library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU General Public License as
9 published by the Free Software Foundation; either version 2 of the
10 License, or (at your option) any later version.
11 
12 The S-Lang Library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15 General Public License for more details.
16 
17 You should have received a copy of the GNU General Public License
18 along with this library; if not, write to the Free Software
19 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
20 USA.
21 */
22 #include "slinclud.h"
23 #include <string.h>
24 
25 #include "slang.h"
26 #include "_slang.h"
27 #include "slischar.h"
28 
/* Nonzero when x is below 256; used to decide whether a character can index
 * one of the 256-entry tables in this file.
 */
#define IS_ASCII256(x) ((x) < 256)
30 
/* A set of characters.  Characters below 256 are flagged in a byte table;
 * larger characters are described by inclusive ranges and/or class bits.
 */
struct SLwchar_Lut_Type
{
   unsigned char lut[256];             /* for chars < 256 */
   /* Copied from _pSLinterp_UTF8_Mode when the table is created. */
   int utf8_mode;

   /* Parallel arrays: entry i describes the inclusive range chmin[i]..chmax[i]. */
   SLwchar_Type *chmin, *chmax;
   /* Number of range entries currently in use. */
   unsigned int table_len;
   /* Number of range entries the two arrays have room for. */
   unsigned int malloced_len;
   /* OR of every class mask added through add_char_class. */
   unsigned int char_class;
};
41 
/* Release a lookup table together with its two range arrays.
 * Passing NULL is allowed and does nothing.
 */
void SLwchar_free_lut (SLwchar_Lut_Type *r)
{
   if (r != NULL)
     {
        SLfree ((char *) r->chmin);
        SLfree ((char *) r->chmax);
        SLfree ((char *) r);
     }
}
52 
SLwchar_create_lut(unsigned int num_entries)53 SLwchar_Lut_Type *SLwchar_create_lut (unsigned int num_entries)
54 {
55    SLwchar_Lut_Type *r;
56 
57    r = (SLwchar_Lut_Type *)SLcalloc (sizeof (SLwchar_Lut_Type), 1);
58    if (r == NULL)
59      return NULL;
60 
61    r->chmin = (SLwchar_Type *) _SLcalloc (num_entries, sizeof(SLwchar_Type));
62    r->chmax = (SLwchar_Type *) _SLcalloc (num_entries, sizeof(SLwchar_Type));
63    if ((r->chmin == NULL) || (r->chmax == NULL))
64      {
65         SLwchar_free_lut (r);
66         return NULL;
67      }
68 
69    r->malloced_len = num_entries;
70    r->utf8_mode = _pSLinterp_UTF8_Mode;
71    return r;
72 }
73 
/* Add the inclusive range a..b to the table (the bounds may be given in
 * either order).
 *
 * The part of the range below 256 is flagged in the byte table.  Any part at
 * or above 256 is appended as a new range entry, growing the range arrays by
 * five slots when they are full.
 *
 * Returns 0 on success, -1 if the arrays could not be grown.
 */
int SLwchar_add_range_to_lut (SLwchar_Lut_Type *r, SLwchar_Type a, SLwchar_Type b)
{
   SLwchar_Type lo = a;
   SLwchar_Type hi = b;
   unsigned int slot;

   if (hi < lo)
     {
        lo = b;
        hi = a;
     }

   /* Flag every member below 256; lo ends at 256 when the range extends
    * beyond the byte table.
    */
   while ((lo < 256) && (lo <= hi))
     {
        r->lut[lo] = 1;
        lo++;
     }

   if (hi < 256)
     return 0;

   slot = r->table_len;
   if (slot == r->malloced_len)
     {
        unsigned int new_len = r->malloced_len + 5;
        SLwchar_Type *grown;

        grown = (SLwchar_Type *) _SLrecalloc ((char *)r->chmin, new_len, sizeof (SLwchar_Type));
        if (grown == NULL)
          return -1;
        r->chmin = grown;

        grown = (SLwchar_Type *) _SLrecalloc ((char *)r->chmax, new_len, sizeof (SLwchar_Type));
        if (grown == NULL)
          return -1;
        r->chmax = grown;

        /* Only record the new capacity once both arrays have it. */
        r->malloced_len = new_len;
     }

   r->chmin[slot] = lo;
   r->chmax[slot] = hi;
   r->table_len = slot + 1;

   return 0;
}
127 
/* Add a character class to the table: remember the class bits for wide
 * characters and flag every character below 256 that has any of those bits.
 */
static void add_char_class (SLwchar_Lut_Type *r, unsigned int char_class)
{
   unsigned int ch = 0;

   r->char_class |= char_class;

   do
     {
        if (SL_CLASSIFICATION_LOOKUP(ch) & char_class)
          r->lut[ch] = 1;
        ch++;
     }
   while (ch < 256);
}
142 
/* Test whether wch belongs to the table.
 *
 * Characters below 256 are answered from the byte table.  Larger characters
 * match if they carry one of the stored class bits or fall inside one of the
 * stored ranges.  Returns nonzero on membership, 0 otherwise.
 */
static int wch_in_lut (SLwchar_Lut_Type *r, SLwchar_Type wch)
{
   unsigned int idx, count;

   if (wch < 256)
     return r->lut[wch];

   if ((r->char_class != 0)
       && (SL_CLASSIFICATION_LOOKUP(wch) & r->char_class))
     return 1;

   /* Linear scan of the range list. */
   count = r->table_len;
   idx = 0;
   while (idx < count)
     {
        if ((r->chmin[idx] <= wch) && (wch <= r->chmax[idx]))
          return 1;
        idx++;
     }
   return 0;
}
168 
/* Public membership test: -1 for a NULL table, otherwise the result of
 * wch_in_lut.
 */
int SLwchar_in_lut (SLwchar_Lut_Type *r, SLwchar_Type wch)
{
   return (r == NULL) ? -1 : wch_in_lut (r, wch);
}
176 
/* Scan forward from p (never reaching pmax) over characters selected by the
 * table and return the position where the scan stops.
 *
 * With invert == 0 the scan steps over characters that are in the table;
 * with invert != 0 it steps over characters that are not.  When
 * ignore_combining is nonzero, decoded characters whose SLwchar_wcwidth is 0
 * are stepped over without consulting the table.  A byte sequence that
 * SLutf8_decode rejects is treated as a one-byte character that is not in
 * the table.
 *
 * Returns NULL if r, p or pmax is NULL.
 */
SLuchar_Type *SLwchar_skip_range (SLwchar_Lut_Type *r, SLuchar_Type *p,
                                  SLuchar_Type *pmax, int ignore_combining,
                                  int invert)
{
   int stop_value;                     /* membership value that ends the scan */
   int multibyte;

   if ((r == NULL) || (p == NULL) || (pmax == NULL))
     return NULL;

   stop_value = (invert != 0);
   multibyte = (r->utf8_mode != 0);

   while (p < pmax)
     {
        SLwchar_Type wch;
        SLstrlen_Type nbytes;

        if ((multibyte == 0) || (*p < 0x80))
          {
             /* Single byte: answer straight from the byte table. */
             if ((int) r->lut[*p] == stop_value)
               break;
             p++;
          }
        else if (NULL == SLutf8_decode (p, pmax, &wch, &nbytes))
          {
             if (stop_value == 0)
               break;
             p++;
          }
        else if (ignore_combining && (0 == SLwchar_wcwidth (wch)))
          p += nbytes;
        else if (stop_value == wch_in_lut (r, wch))
          break;
        else
          p += nbytes;
     }

   return p;
}
230 
/* Scan backwards from p (never going below pmin) over characters selected by
 * the table and return the position where the scan stops.
 *
 * With invert == 0 the scan steps back over characters that are in the
 * table; with invert != 0 it steps back over characters that are not.  When
 * ignore_combining is nonzero, decoded characters whose SLwchar_wcwidth is 0
 * are stepped over without consulting the table.
 *
 * A byte sequence that SLutf8_decode rejects is treated as a character that
 * is not in the table, exactly as SLwchar_skip_range treats it: it stops a
 * normal scan and is stepped over by an inverted one.
 *
 * Returns NULL if r, p or pmin is NULL.
 */
SLuchar_Type *SLwchar_bskip_range (SLwchar_Lut_Type *r, SLuchar_Type *pmin,
                                   SLuchar_Type *p,
                                   int ignore_combining,
                                   int invert)
{
   unsigned char *lut;
   SLuchar_Type *pmax;
   int utf8_mode;

   if ((r == NULL) || (p == NULL) || (pmin == NULL))
     return NULL;

   lut = r->lut;
   pmax = p;                           /* decoding never reads past the start point */

   invert = (invert != 0);
   utf8_mode = r->utf8_mode;

   while (p > pmin)
     {
        SLuchar_Type *p0;
        SLwchar_Type wch;
        SLstrlen_Type dn;

        p0 = p - 1;
        if ((*p0 < 0x80) || (utf8_mode == 0))
          {
             if ((int)lut[*p0] == invert)
               return p;

             p = p0;
             continue;
          }

        p0 = SLutf8_bskip_char (pmin, p);

        if (NULL == SLutf8_decode (p0, pmax, &wch, &dn))
          {
             /* Not a member of the table: this ends a normal scan and is
              * skipped by an inverted one.  (The test used to be the other
              * way round, which disagreed with the forward scanner and with
              * the single-byte branch above.)
              */
             if (invert == 0)
               return p;

             p = p0;
             continue;
          }

        if ((ignore_combining)
            && (0 == SLwchar_wcwidth (wch)))
          {
             p = p0;
             continue;
          }

        if (invert == wch_in_lut (r, wch))
          return p;

        p = p0;
     }

   return p;
}
291 
292 /*
293  * Special Range characters:
294  *
295  * \w matches a unicode "word" character, taken to be alphanumeric.
296  * \a alpha character, excluding digits
297  * \s matches whitespace
298  * \l matches lowercase
299  * \u matches uppercase
300  * \d matches a digit
301  */
302 
303 /* QUESTION: What is the encoding of the range?  Is it utf-8?  I suspect
304  * it ought to be.  For example, a jed .sl file may use:
305  *
306  *    skip_chars ("\\w\u{ADFF}-\u{AFFF}");
307  *
308  * to skip words chars and chars in the range 0xADFF-0xAFFF.  By the time it
309  * gets here, the parser will have converted the wchars \u{ADFF} and \u{AFFF}
310  * to their UTF-8 equivalents.  Hence the function needs to use SLutf8_decode
311  * to get characters.
312  */
313 
/* Pairs a POSIX character-class name with the single-character class code
 * used by the range parser in this file.
 */
typedef struct
{
   /* Class name, as written between "[:" and ":]". */
   SLCONST char *name;
   /* Class code stored for that name (e.g. 'w' for "alnum"). */
   char escaped_form;
}
Posix_Char_Class_Type;
320 
/* Names recognised by is_posix_charclass and the class code each one maps
 * to.  The list ends with an entry whose name is NULL.
 */
static Posix_Char_Class_Type Posix_Char_Class_Table [] =
{
     {"alnum", 'w'},
     {"alpha", 'a'},
     {"blank", 'b'},
     {"cntrl", 'c'},
     {"digit", 'd'},
     {"graph", 'g'},
     {"lower", 'l'},
     {"print", 'p'},
     {"punct", ','},
     {"space", 's'},
     {"upper", 'u'},
     {"xdigit", 'x'},
     {NULL, 0}
};
337 
/* Try to parse the remainder of a POSIX class specification ":name:]" at
 * *up (the leading '[' has already been consumed by the caller).
 *
 * Returns:
 *    0  the text does not have the form ":lowercase-letters:]"; nothing
 *       is modified.
 *    1  a known class was found; its class code is stored in *char_classp
 *       and *up is advanced past the closing ":]".
 *   -1  the text has the right form but the name is not in
 *       Posix_Char_Class_Table; an error is raised.
 */
static int is_posix_charclass (SLuchar_Type **up, SLuchar_Type *umax, SLwchar_Type *char_classp)
{
   SLuchar_Type *name, *name_end;
   Posix_Char_Class_Type *entry;
   size_t name_len;

   name = *up;
   if (*name != ':')
     return 0;
   name++;

   /* The name is a run of lowercase ASCII letters. */
   for (name_end = name; name_end < umax; name_end++)
     {
        if ((*name_end < 'a') || (*name_end > 'z'))
          break;
     }

   /* It must be followed by ":]", both inside the buffer. */
   if ((name_end + 1) >= umax)
     return 0;
   if ((name_end[0] != ':') || (name_end[1] != ']'))
     return 0;

   name_len = name_end - name;
   for (entry = Posix_Char_Class_Table; entry->name != NULL; entry++)
     {
        if (0 != strncmp (entry->name, (char *) name, name_len))
          continue;
        if (entry->name[name_len] != 0)
          continue;                    /* only a prefix of this table name */

        *char_classp = entry->escaped_form;
        *up = name_end + 2;
        return 1;
     }

   _pSLang_verror (SL_NotImplemented_Error, "Character class in range specification is unknown or unsupported");
   return -1;
}
374 
/* Read one lexical character from *up.
 *
 * On success *chp receives the character, *char_classp receives the class
 * code (0 when the item is an ordinary character) and *up is advanced.
 * When allow_charclass is nonzero, "[:name:]" and "\c" are read as
 * character classes, while "\\" and "\^" stand for a literal backslash and
 * caret.  A lone backslash at the very end is taken literally.
 *
 * Returns -1 on a decoding error or an unknown POSIX class, 1 when a POSIX
 * class was read, and 0 otherwise (including at end of input, where both
 * outputs are set to 0).
 */
static int get_lex_char (SLuchar_Type **up, SLuchar_Type *umax,
                         int allow_charclass,
                         SLwchar_Type *chp, SLwchar_Type *char_classp)
{
   SLuchar_Type *pos = *up;
   SLwchar_Type wc;

   if (pos == umax)
     {
        *chp = 0;
        *char_classp = 0;
        return 0;
     }

   pos = _pSLinterp_decode_wchar (pos, umax, &wc);
   if (pos == NULL)
     return -1;

   if (allow_charclass && (wc == '['))
     {
        int status = is_posix_charclass (&pos, umax, &wc);

        if (status == 1)
          {
             *chp = *char_classp = wc;
             *up = pos;
             return 1;
          }
        if (status != 0)
          return status;
        /* status == 0: an ordinary '['; fall through. */
     }

   if (allow_charclass && (wc == '\\') && (pos != umax))
     {
        /* A backslash followed by something. */
        SLwchar_Type next = *pos;

        /* "\\" and "\^" denote the literal characters.  Supporting \^ is
         * useful in constructs such as "\\^x", since a leading "^x" would
         * mean anything but x.
         */
        if ((next == '\\') || (next == '^'))
          {
             *char_classp = 0;
             *chp = next;
             *up = pos + 1;
             return 0;
          }

        /* Anything else after the backslash is a class code. */
        pos = _pSLinterp_decode_wchar (pos, umax, &wc);
        if (pos == NULL)
          return -1;

        *chp = *char_classp = wc;
        *up = pos;
        return 0;
     }

   /* Ordinary character (this also permits a single backslash as the last
    * character).
    */
   *char_classp = 0;
   *chp = wc;
   *up = pos;
   return 0;
}
438 
/* One parsed item of a character specification: a single character, an
 * inclusive range, or a character class.  lexical_type says which member of
 * the union is meaningful.
 */
typedef struct
{
#define LEXICAL_CHAR_TYPE	1
#define LEXICAL_RANGE_TYPE	2
#define LEXICAL_CLASS_TYPE	3
   /* One of the LEXICAL_*_TYPE values above. */
   int lexical_type;
   union
     {
	/* LEXICAL_RANGE_TYPE: the two end points as written. */
	SLwchar_Type range[2];
	/* LEXICAL_CHAR_TYPE: the character. */
	SLwchar_Type wch;
	/* LEXICAL_CLASS_TYPE: SLCHARCLASS_* mask. */
	int char_class;
     }
   e;
}
Lexical_Element_Type;
454 
/* Parse the next lexical element (character, range, or character class)
 * starting at u and store it in *lex.
 *
 * allow_range      nonzero to interpret "a-b" as a range.
 * allow_charclass  nonzero to interpret "\c" and "[:name:]" as classes.
 *
 * Returns a pointer just past the element, or NULL at end of input or on
 * error.  An error is raised for an unknown class code, a class used as a
 * range end point, or a range whose end point is the 0 character; NULL at
 * end of input raises no error here.
 */
static SLuchar_Type *get_lexical_element (SLuchar_Type *u, SLuchar_Type *umax,
                                          int allow_range,
                                          int allow_charclass,
                                          Lexical_Element_Type *lex)
{
   SLwchar_Type r0, r1;
   SLwchar_Type char_class;

   if (u == umax)
     return NULL;

   if (-1 == get_lex_char (&u, umax, allow_charclass, &r0, &char_class))
     return NULL;

   if (char_class)
     {
        lex->lexical_type = LEXICAL_CLASS_TYPE;
        switch (char_class)
          {
           case '7':
             lex->e.char_class = SLCHARCLASS_ASCII;
             break;

           case 'a':           /* alpha */
             lex->e.char_class = SLCHARCLASS_ALPHA;
             break;

           case 'b':
             lex->e.char_class = SLCHARCLASS_BLANK;
             break;

           case 'c':
             lex->e.char_class = SLCHARCLASS_CNTRL;
             break;

           case 'd':           /* digit: expressed as the range '0'-'9' */
             lex->lexical_type = LEXICAL_RANGE_TYPE;
             lex->e.range[0] = '0';
             lex->e.range[1] = '9';
             break;

           case 'g':
             lex->e.char_class = SLCHARCLASS_GRAPH;
             break;

           case 'l':           /* lowercase */
             lex->e.char_class = SLCHARCLASS_LOWER;
             break;

           case 'p':           /* printable */
             lex->e.char_class = SLCHARCLASS_PRINT;
             break;

           case ',':           /* punctuation */
             lex->e.char_class = SLCHARCLASS_PUNCT;
             break;

           case 's':           /* whitespace */
             lex->e.char_class = SLCHARCLASS_SPACE;
             break;

           case 'u':           /* uppercase */
             lex->e.char_class = SLCHARCLASS_UPPER;
             break;

           case 'x':
             lex->e.char_class = SLCHARCLASS_XDIGIT;
             break;

           case 'w':           /* alphanumeric */
             lex->e.char_class = SLCHARCLASS_ALPHA|SLCHARCLASS_XDIGIT;
             break;

           default:
             /* %c consumes an int; char_class is a SLwchar_Type, whose width
              * is not declared in this file, so convert it explicitly.
              */
             _pSLang_verror (SL_INVALID_PARM, "Invalid character class '%c'.", (int) char_class);
             return NULL;
          }
        return u;
     }

   /* A '-' introduces a range only if ranges are enabled and something
    * follows it; a trailing '-' is an ordinary character.  The end-of-input
    * test comes first so that *u is never read at umax.
    */
   if ((u == umax) || (*u != '-') || (allow_range == 0)
       || (u+1 == umax))
     {
        lex->lexical_type = LEXICAL_CHAR_TYPE;
        lex->e.wch = r0;
        return u;
     }

   u++;                                /* skip the '-' */

   if (-1 == get_lex_char (&u, umax, allow_charclass, &r1, &char_class))
     return NULL;

   if (char_class)
     {
        _pSLang_verror (SL_INVALID_PARM, "Character class not allowed in a range");
        return NULL;
     }

   if (r1 == 0)
     {
        _pSLang_verror (SL_INVALID_PARM, "Unfinished range specification");
        return NULL;
     }

   lex->lexical_type = LEXICAL_RANGE_TYPE;
   lex->e.range[0] = r0;
   lex->e.range[1] = r1;
   return u;
}
565 
SLwchar_strtolut(SLuchar_Type * u,int allow_range,int allow_charclass)566 SLwchar_Lut_Type *SLwchar_strtolut (SLuchar_Type *u,
567 				    int allow_range, int allow_charclass)
568 {
569    SLuchar_Type *umax;
570    SLwchar_Lut_Type *r;
571    Lexical_Element_Type lex;
572 
573    r = SLwchar_create_lut (32);
574    if (r == NULL)
575      return NULL;
576 
577    umax = u + strlen ((char *) u);
578 
579    while (u < umax)
580      {
581 	if (NULL == (u = get_lexical_element (u, umax, allow_range, allow_charclass, &lex)))
582 	  goto return_error;
583 
584 	switch (lex.lexical_type)
585 	  {
586 	   case LEXICAL_CHAR_TYPE:
587 	     if (-1 == SLwchar_add_range_to_lut (r, lex.e.wch, lex.e.wch))
588 	       goto return_error;
589 	     break;
590 
591 	   case LEXICAL_RANGE_TYPE:
592 	     if (-1 == SLwchar_add_range_to_lut (r, lex.e.range[0], lex.e.range[1]))
593 	       goto return_error;
594 	     break;
595 
596 	   case LEXICAL_CLASS_TYPE:
597 	     add_char_class (r, lex.e.char_class);
598 	     break;
599 	  }
600      }
601    return r;
602 
603    return_error:
604    SLwchar_free_lut (r);
605    return NULL;
606 }
607 
608 /* This structure is used for mapping 1 character to another, and is used
609  * by, e.g., strtrans.
610  *
611  * The most efficient implementation that I have come up with requires a
612  * many-1 mapping between _constructs_ in the "from" list and the "to" list.
613  * Here a _construct_ is a single character, range, or a character class.
614  * The following mappings are legal:
615  *
616  *    Character --> Character
617  *    Range     --> Character
618  *    Range     --> Equal length range
619  *    Range	--> Class (upper or lower)
620  *    Class     --> Character
621  *    Class     --> Compatible Class
622  *
623  * For inversion, the only mapping that makes sense is a many to one mapping.
624  * For example, strtrans(str, "^A-Za-z", "x"), should replace any character
625  * that is not one of the ranges A-Z and a-z by x.
626  */
/* One from-element/to-element pair of a character map, kept in a singly
 * linked list.
 */
typedef struct Char_Map_Type
{
   /* Maps one character: called as (from, to, invert, in, out).  The methods
    * in this file return nonzero after storing the result in *out, and 0 when
    * the pair does not apply.  check_char_mapping sets this to NULL when the
    * 256-entry table alone covers the pair.
    */
   int (*map_function)(Lexical_Element_Type *, Lexical_Element_Type *, int,
		       SLwchar_Type, SLwchar_Type *);

   /* Element taken from the "from" specification. */
   Lexical_Element_Type from;
   /* Element taken from the "to" specification. */
   Lexical_Element_Type to;

   /* Next pair, or NULL. */
   struct Char_Map_Type *next;
}
Char_Map_Type;
638 
/* A complete character map: a direct table for characters below 256 plus a
 * list of element pairs consulted for larger characters.
 */
struct SLwchar_Map_Type
{
   /* for chars < 256. */
   SLwchar_Type chmap[256];

   /* 1 when the "from" specification began with '^', else 0. */
   int invert;
   /* Element pairs that still need a map_function call. */
   Char_Map_Type *list;
};
647 
/* Character --> character.  Applies when `in' equals the from-character
 * (or, with invert set, when it does not).  On a hit stores the
 * to-character in *out and returns 1; otherwise returns 0.
 */
static int map_char_to_char_method (Lexical_Element_Type *from,
                                    Lexical_Element_Type *to, int invert,
                                    SLwchar_Type in, SLwchar_Type *out)
{
   int matched = (from->e.wch == in);

   if (matched == invert)
     return 0;

   *out = to->e.wch;
   return 1;
}
659 
/* Range --> character.  Applies when `in' lies within range[0]..range[1]
 * of the from-element (or, with invert set, when it does not).  On a hit
 * stores the to-character in *out and returns 1; otherwise returns 0.
 */
static int map_range_to_char_method (Lexical_Element_Type *from,
                                     Lexical_Element_Type *to, int invert,
                                     SLwchar_Type in, SLwchar_Type *out)
{
   int in_range = ((from->e.range[0] <= in) && (in <= from->e.range[1]));

   if (in_range == invert)
     return 0;

   *out = to->e.wch;
   return 1;
}
671 
/* Range --> range.  Applies when `in' lies within range[0]..range[1] of
 * the from-element (or, with invert set, when it does not).  On a hit the
 * output is the to-range start plus the offset of `in' from the from-range
 * start, and 1 is returned; otherwise returns 0.
 */
static int map_range_to_range_method (Lexical_Element_Type *from,
                                      Lexical_Element_Type *to, int invert,
                                      SLwchar_Type in, SLwchar_Type *out)
{
   SLwchar_Type from_start = from->e.range[0];
   int in_range = ((from_start <= in) && (in <= from->e.range[1]));

   if (in_range == invert)
     return 0;

   *out = to->e.range[0] + (in - from_start);
   return 1;
}
683 
/* Range --> class.  Applies when `in' lies within range[0]..range[1] of
 * the from-element (or, with invert set, when it does not).  Only the
 * upper- and lower-case classes are valid targets: the output is `in'
 * converted with SLwchar_toupper or SLwchar_tolower and 1 is returned.
 * Returns 0 when the element does not apply or the class is another one.
 */
static int map_range_to_class_method (Lexical_Element_Type *from,
                                      Lexical_Element_Type *to, int invert,
                                      SLwchar_Type in, SLwchar_Type *out)
{
   int in_range = ((from->e.range[0] <= in) && (in <= from->e.range[1]));

   if (in_range == invert)
     return 0;

   switch (to->e.char_class)
     {
      case SLCHARCLASS_UPPER:
        *out = SLwchar_toupper (in);
        return 1;

      case SLCHARCLASS_LOWER:
        *out = SLwchar_tolower (in);
        return 1;

      default:
        return 0;
     }
}
701 
/* Return 1 if w belongs to the class described by char_class, else 0.
 *
 * Every class that check_char_mapping accepts as a "from" class is handled,
 * so a class that works for characters below 256 also works for wide
 * characters.  The result is normalised to exactly 0 or 1 because the
 * callers combine it with the invert flag using XOR; the SLwchar_is*
 * helpers are not known here to return exactly 1.
 */
static int is_of_class (int char_class, SLwchar_Type w)
{
   switch (char_class)
     {
      case SLCHARCLASS_ALPHA:
        return (0 != SLwchar_isalpha (w));

      case SLCHARCLASS_ALPHA|SLCHARCLASS_XDIGIT:
        return (0 != SLwchar_isalnum (w));

      case SLCHARCLASS_UPPER:
        return (0 != SLwchar_isupper (w));

      case SLCHARCLASS_LOWER:
        return (0 != SLwchar_islower (w));

      case SLCHARCLASS_SPACE:
        return (0 != SLwchar_isspace (w));

      case SLCHARCLASS_ASCII:
        return w < (SLwchar_Type)0x80;

      case SLCHARCLASS_BLANK:
        return (0 != SLwchar_isblank (w));

      case SLCHARCLASS_CNTRL:
        return (0 != SLwchar_iscntrl (w));

      case SLCHARCLASS_GRAPH:
        return (0 != SLwchar_isgraph (w));

      case SLCHARCLASS_PRINT:
        return (0 != SLwchar_isprint (w));

      case SLCHARCLASS_PUNCT:
        return (0 != SLwchar_ispunct (w));

      case SLCHARCLASS_XDIGIT:
        return (0 != SLwchar_isxdigit (w));

      default:
        break;
     }

   return 0;
}
727 
/* Class --> character.  Applies when is_of_class reports `in' as a member
 * of the from-class (or, with invert set, when its result differs from the
 * invert flag).  On a hit stores the to-character in *out and returns 1;
 * otherwise returns 0.
 */
static int map_class_to_char_method (Lexical_Element_Type *from,
                                     Lexical_Element_Type *to, int invert,
                                     SLwchar_Type in, SLwchar_Type *out)
{
   int member = is_of_class (from->e.char_class, in);

   if (member == invert)
     return 0;

   *out = to->e.wch;
   return 1;
}
739 
/* Class --> class.  Applies when is_of_class reports `in' as a member of
 * the from-class (or, with invert set, when its result differs from the
 * invert flag).  Only the upper- and lower-case classes are valid targets:
 * the output is `in' converted with SLwchar_toupper or SLwchar_tolower and
 * 1 is returned.  Returns 0 when the element does not apply or the target
 * class is another one.
 */
static int map_class_to_class_method (Lexical_Element_Type *from,
                                      Lexical_Element_Type *to, int invert,
                                      SLwchar_Type in, SLwchar_Type *out)
{
   int member = is_of_class (from->e.char_class, in);

   if (member == invert)
     return 0;

   switch (to->e.char_class)
     {
      case SLCHARCLASS_UPPER:
        *out = SLwchar_toupper (in);
        return 1;

      case SLCHARCLASS_LOWER:
        *out = SLwchar_tolower (in);
        return 1;

      default:
        return 0;
     }
}
757 
/* Preset the 256-entry map used for inverted mappings.
 *
 * Entry 0 is always set to 0.  Entries 1..255 are set to wch when to_func
 * is NULL, otherwise to (*to_func)(index).
 */
static void init_chmap (SLwchar_Type *chmap, SLwchar_Type wch,
                        SLwchar_Type (*to_func)(SLwchar_Type))
{
   unsigned int code;

   chmap[0] = 0;

   for (code = 1; code < 256; code++)
     chmap[code] = (to_func == NULL) ? wch : (*to_func) (code);
}
775 
/* Normalise a range element and report its bounds and direction.
 *
 * On return *chminp <= *chmaxp hold the bounds, and *range_dirp is 1 if the
 * range was written in ascending order or -1 if it was written descending.
 * A descending range is rewritten in place so that range[0] <= range[1]:
 * the map_range_* methods test `range[0] <= in <= range[1]' and could never
 * match a range left in descending order.
 */
static void get_range_values (Lexical_Element_Type *lex,
                              SLwchar_Type *chminp, SLwchar_Type *chmaxp,
                              int *range_dirp)
{
   SLwchar_Type first = lex->e.range[0];
   SLwchar_Type last = lex->e.range[1];

   if (first <= last)
     {
        *range_dirp = 1;
        *chminp = first;
        *chmaxp = last;
        return;
     }

   /* Descending: store the end points back in ascending order. */
   lex->e.range[0] = last;
   lex->e.range[1] = first;

   *range_dirp = -1;
   *chminp = last;
   *chmaxp = first;
}
797 
/* Class predicate for SLCHARCLASS_ASCII: true for characters 0..0x7F. */
static int is_ascii (SLwchar_Type wch)
{
   return (wch <= (SLwchar_Type) 0x7F);
}
802 
/* Validate one from/to element pair and fold it into the map.
 *
 * For characters below 256 the effect of the pair is written directly into
 * map->chmap.  list->map_function is set to the method that handles wide
 * characters, or to NULL when the pair is not inverted and is completely
 * covered by the table (a character or a range lying entirely below 256).
 *
 * For an inverted map, the first pair processed (first_time != 0) presets
 * the table with the target; each pair then restores identity for the
 * characters it names.
 *
 * Returns 0 on success and -1 if the combination is not permitted (some
 * cases also raise a specific error).
 */
static int check_char_mapping (SLwchar_Map_Type *map, Char_Map_Type *list, int first_time)
{
   Lexical_Element_Type *src = &list->from;
   Lexical_Element_Type *dst = &list->to;
   SLwchar_Type *table = map->chmap;
   SLwchar_Type (*convert) (SLwchar_Type);
   int (*belongs) (SLwchar_Type);
   SLwchar_Type lo, hi, target, target_hi, code;
   int src_dir, dst_dir, step;
   int invert = map->invert;
   int preset_table = (invert && first_time);

   /* ---- from: single character ---- */
   if (src->lexical_type == LEXICAL_CHAR_TYPE)
     {
        if (dst->lexical_type != LEXICAL_CHAR_TYPE)
          return -1;

        target = dst->e.wch;
        if (preset_table)
          init_chmap (table, target, NULL);

        list->map_function = map_char_to_char_method;

        code = src->e.wch;
        if (IS_ASCII256(code))
          {
             if (invert)
               table[code] = code;
             else
               {
                  table[code] = target;
                  list->map_function = NULL;
               }
          }
        return 0;
     }

   /* ---- from: range ---- */
   if (src->lexical_type == LEXICAL_RANGE_TYPE)
     {
        get_range_values (src, &lo, &hi, &src_dir);

        switch (dst->lexical_type)
          {
           case LEXICAL_CHAR_TYPE:
             target = dst->e.wch;
             if (preset_table)
               init_chmap (table, target, NULL);

             for (code = lo; (code < 256) && (code <= hi); code++)
               table[code] = (invert ? code : target);

             list->map_function = map_range_to_char_method;
             break;

           case LEXICAL_CLASS_TYPE:
             if (dst->e.char_class == SLCHARCLASS_UPPER)
               convert = SLwchar_toupper;
             else if (dst->e.char_class == SLCHARCLASS_LOWER)
               convert = SLwchar_tolower;
             else
               return -1;

             if (preset_table)
               init_chmap (table, 0, convert);

             for (code = lo; (code < 256) && (code <= hi); code++)
               table[code] = (invert ? code : (*convert) (code));

             list->map_function = map_range_to_class_method;
             break;

           case LEXICAL_RANGE_TYPE:
             if (invert)
               {
                  _pSLang_verror (SL_INVALID_PARM, "Inversion from a range to a range not permitted");
                  return -1;
               }

             get_range_values (dst, &target, &target_hi, &dst_dir);

             if ((hi - lo) != (target_hi - target))
               {
                  _pSLang_verror (SL_INVALID_PARM, "Character mapping of unequal ranges is forbidden");
                  return -1;
               }

             /* Ranges written in opposite directions map from the far end
              * of the target range downwards.
              */
             if (src_dir != dst_dir)
               {
                  target = target_hi;
                  step = -1;
               }
             else
               step = 1;

             for (code = lo; (code < 256) && (code <= hi); code++)
               {
                  table[code] = target;
                  target += step;
               }

             list->map_function = map_range_to_range_method;
             break;

           default:
             return -1;
          }

        /* A non-inverted range that fits entirely below 256 needs no
         * per-character method.
         */
        if ((hi < 256) && (invert == 0))
          list->map_function = NULL;

        return 0;
     }

   /* ---- from: character class ---- */
   if (src->lexical_type != LEXICAL_CLASS_TYPE)
     return -1;

   switch (src->e.char_class)
     {
      case SLCHARCLASS_ALPHA:
        belongs = SLwchar_isalpha;
        break;

      case SLCHARCLASS_ALPHA|SLCHARCLASS_XDIGIT:
        belongs = SLwchar_isalnum;
        break;

      case SLCHARCLASS_UPPER:
        belongs = SLwchar_isupper;
        break;

      case SLCHARCLASS_LOWER:
        belongs = SLwchar_islower;
        break;

      case SLCHARCLASS_SPACE:
        belongs = SLwchar_isspace;
        break;

      case SLCHARCLASS_ASCII:
        belongs = is_ascii;
        break;

      case SLCHARCLASS_BLANK:
        belongs = SLwchar_isblank;
        break;

      case SLCHARCLASS_CNTRL:
        belongs = SLwchar_iscntrl;
        break;

      case SLCHARCLASS_GRAPH:
        belongs = SLwchar_isgraph;
        break;

      case SLCHARCLASS_PRINT:
        belongs = SLwchar_isprint;
        break;

      case SLCHARCLASS_PUNCT:
        belongs = SLwchar_ispunct;
        break;

      case SLCHARCLASS_XDIGIT:
        belongs = SLwchar_isxdigit;
        break;

      default:
        _pSLang_verror (SL_INVALID_PARM, "Invalid character class in character map");
        return -1;
     }

   if (dst->lexical_type == LEXICAL_CHAR_TYPE)
     {
        target = dst->e.wch;

        if (preset_table)
          init_chmap (table, target, NULL);

        for (code = 0; code < 256; code++)
          {
             if ((*belongs)(code))
               table[code] = (invert ? code : target);
          }
        list->map_function = map_class_to_char_method;
        return 0;
     }

   if (dst->lexical_type == LEXICAL_CLASS_TYPE)
     {
        if (dst->e.char_class == SLCHARCLASS_LOWER)
          convert = SLwchar_tolower;
        else if (dst->e.char_class == SLCHARCLASS_UPPER)
          convert = SLwchar_toupper;
        else
          return -1;

        if (preset_table)
          init_chmap (table, 0, convert);

        for (code = 0; code < 256; code++)
          {
             if ((*belongs)(code))
               table[code] = (invert ? code : (*convert)(code));
          }
        list->map_function = map_class_to_class_method;
        return 0;
     }

   return -1;
}
1020 
/* Release a single list node. */
static void free_char_map_type (Char_Map_Type *m)
{
   char *raw = (char *) m;

   SLfree (raw);
}
1025 
/* Release a character map and every node on its list.  NULL is ignored. */
void SLwchar_free_char_map (SLwchar_Map_Type *map)
{
   Char_Map_Type *node, *following;

   if (map == NULL)
     return;

   for (node = map->list; node != NULL; node = following)
     {
        /* Fetch the link before the node is released. */
        following = node->next;
        free_char_map_type (node);
     }

   SLfree ((char *) map);
}
1042 
SLwchar_allocate_char_map(SLuchar_Type * from,SLuchar_Type * to)1043 SLwchar_Map_Type *SLwchar_allocate_char_map (SLuchar_Type *from, SLuchar_Type *to)
1044 {
1045    SLwchar_Map_Type *map;
1046    Char_Map_Type *list, *prev;
1047    SLuchar_Type *from_max, *to_max;
1048    unsigned int i;
1049    int invert = 0, first_time;
1050 
1051    if (*from == '^')
1052      {
1053 	invert = 1;
1054 	from++;
1055      }
1056 
1057 #if 0
1058    if (*from == 0)
1059      {
1060 	_pSLang_verror (SL_INVALID_PARM, "Illegal empty string in character map specification");
1061 	return NULL;
1062      }
1063 #endif
1064    map = (SLwchar_Map_Type *)SLcalloc (1, sizeof (SLwchar_Map_Type));
1065    if (map == NULL)
1066      return NULL;
1067 
1068    map->invert = invert;
1069 
1070    for (i = 0; i < 256; i++)
1071      map->chmap[i] = i;
1072 
1073    from_max = from + strlen ((char *) from);
1074    to_max = to + strlen ((char *) to);
1075 
1076    list = NULL;
1077 
1078    while (from < from_max)
1079      {
1080 	Char_Map_Type *next;
1081 	SLuchar_Type *next_to;
1082 
1083 	if (NULL == (next = (Char_Map_Type *) SLcalloc (1, sizeof (Char_Map_Type))))
1084 	  goto return_error;
1085 
1086 	if (list == NULL)
1087 	  map->list = next;
1088 	else
1089 	  list->next = next;
1090 	list = next;
1091 
1092 	if (NULL == (from = get_lexical_element (from, from_max, 1, 1, &list->from)))
1093 	  goto return_error;
1094 
1095 	if (NULL == (next_to = get_lexical_element (to, to_max, 1, 1, &list->to)))
1096 	  goto return_error;
1097 
1098 	/* If the mapping is not 1-1, then the last "to" object applies to the
1099 	 * remaining "from" objects.  This will permit, e.g.,
1100 	 *  A-Za-z --> X
1101 	 */
1102 	if (next_to != to_max)
1103 	  {
1104 	     if (invert)
1105 	       {
1106 		  _pSLang_verror (SL_INVALID_PARM, "Character map inversion must specify a many-to-one mapping");
1107 		  goto return_error;
1108 	       }
1109 	     to = next_to;
1110 	  }
1111      }
1112 
1113    list = map->list;
1114    prev = NULL;
1115    first_time = 1;
1116    while (list != NULL)
1117      {
1118 	Char_Map_Type *next = list->next;
1119 
1120 	if (-1 == check_char_mapping (map, list, first_time))
1121 	  {
1122 	     _pSLang_verror (SL_INVALID_PARM, "Specified character mapping is invalid");
1123 	     goto return_error;
1124 	  }
1125 	first_time = 0;
1126 
1127 	if (list->map_function == NULL)
1128 	  {
1129 	     if (prev == NULL)
1130 	       map->list = next;
1131 	     else
1132 	       prev->next = next;
1133 
1134 	     free_char_map_type (list);
1135 	  }
1136 	else prev = list;
1137 	list = next;
1138      }
1139    return map;
1140 
1141    return_error:
1142    SLwchar_free_char_map (map);
1143    return NULL;
1144 }
1145 
/* Map one wide character through the element list of `map'.
 *
 * Normal map: the first element whose method reports a hit decides the
 * result; its status is returned with *wc_out set.
 *
 * Inverted map: the from-elements together describe the set of characters
 * that must be left alone.  The character is mapped only if no element
 * claims it, so every element is consulted; as soon as one method returns
 * 0 (the character belongs to that element) the function returns 0.
 * Previously the first element alone decided, so a character belonging to
 * a later element of the set was mapped anyway.
 *
 * Returns nonzero if *wc_out holds the mapped character, and 0 if the
 * character is to be left unchanged.  *wc_out may have been written even
 * when 0 is returned.
 */
static int apply_lexical_map (SLwchar_Map_Type *map, SLwchar_Type wc_in, SLwchar_Type *wc_out)
{
   Char_Map_Type *list;
   int invert = map->invert;
   int mapped = 0;

   for (list = map->list; list != NULL; list = list->next)
     {
        int status;

        if (list->map_function == NULL)
          continue;

        status = (*list->map_function)(&list->from, &list->to, invert, wc_in, wc_out);

        if (invert)
          {
             if (status == 0)
               return 0;               /* wc_in is in the excluded set */
             mapped = status;
          }
        else if (status != 0)
          return status;
     }

   return mapped;
}
1167 
/* Map `num' wide characters from input[] into output[].
 *
 * Characters below 0x100 are looked up in the map's direct table; larger
 * ones go through the element list and are copied unchanged when no element
 * applies.
 *
 * Returns 0 on success, -1 if map, input or output is NULL.
 */
int SLwchar_apply_char_map (SLwchar_Map_Type *map, SLwchar_Type *input, SLwchar_Type *output, unsigned int num)
{
   unsigned int idx;

   if ((map == NULL) || (input == NULL) || (output == NULL))
     return -1;

   for (idx = 0; idx < num; idx++)
     {
        SLwchar_Type wc = input[idx];

        if (wc < 0x100)
          output[idx] = map->chmap[wc];
        else if (0 == apply_lexical_map (map, wc, &output[idx]))
          output[idx] = wc;
     }

   return 0;
}
1194 
1195 /* This function returns a malloced string */
SLuchar_apply_char_map(SLwchar_Map_Type * map,SLuchar_Type * str)1196 SLuchar_Type *SLuchar_apply_char_map (SLwchar_Map_Type *map, SLuchar_Type *str)
1197 {
1198    SLuchar_Type *str_max;
1199    SLuchar_Type *output, *output_max, *outptr;
1200    int use_chmap;
1201    size_t len;
1202    SLwchar_Type *chmap;
1203 
1204    if ((map == NULL) || (str == NULL))
1205      return NULL;
1206 
1207    use_chmap = 1;
1208    if (_pSLinterp_UTF8_Mode == 0)
1209      str_max = str + strlen ((char *)str);
1210    else
1211      {
1212 	str_max = str;
1213 	while (*str_max)
1214 	  {
1215 	     if (*str_max & 0x80)
1216 	       use_chmap = 0;
1217 	     str_max++;
1218 	  }
1219      }
1220 
1221    len = str_max - str;
1222    chmap = map->chmap;
1223 
1224    if (use_chmap)
1225      {
1226 	unsigned int i;
1227 
1228 	output = (SLuchar_Type *)SLmalloc (len+1);
1229 	if (output == NULL)
1230 	  return NULL;
1231 
1232 	for (i = 0; i < len; i++)
1233 	  output[i] = chmap[str[i]];
1234 
1235 	output[len] = 0;
1236 	return output;
1237      }
1238 
1239    /* Hard way */
1240    len += SLUTF8_MAX_MBLEN;
1241    if (NULL == (output = (SLuchar_Type *)SLmalloc (len + 1)))
1242      return NULL;
1243    output_max = output + len;
1244    outptr = output;
1245 
1246    while (str < str_max)
1247      {
1248 	SLwchar_Type w_out, w_in;
1249 	unsigned int encoded_len;
1250 
1251 	w_in = (SLwchar_Type) *str;
1252 	if (w_in < 0x80)
1253 	  str++;
1254 	else if (NULL == (str = _pSLinterp_decode_wchar (str, str_max, &w_in)))
1255 	  goto return_error;
1256 
1257 	if (w_in < 0x100)
1258 	  {
1259 	     w_out = chmap[w_in];
1260 	     if ((w_out < 0x80) && (outptr < output_max))
1261 	       {
1262 		  *outptr++ = (SLuchar_Type) w_out;
1263 		  continue;
1264 	       }
1265 	  }
1266 	else
1267 	  {
1268 	     if (-1 == SLwchar_apply_char_map (map, &w_in, &w_out, 1))
1269 	       goto return_error;
1270 	  }
1271 
1272 	if (outptr + SLUTF8_MAX_MBLEN >= output_max)
1273 	  {
1274 	     SLuchar_Type *tmp;
1275 
1276 	     len += 32 * SLUTF8_MAX_MBLEN;
1277 	     if (NULL == (tmp = (SLuchar_Type *)SLrealloc ((char *)output, len)))
1278 	       goto return_error;
1279 
1280 	     outptr = tmp + (outptr - output);
1281 	     output = tmp;
1282 	     output_max = output + len;
1283 	  }
1284 
1285 	if (NULL == (outptr = _pSLinterp_encode_wchar (w_out, outptr, &encoded_len)))
1286 	  goto return_error;
1287      }
1288 
1289    *outptr = 0;
1290 
1291    return output;
1292 
1293    return_error:
1294    SLfree ((char *) output);
1295    return NULL;
1296 }
1297