1 /*------------------------------------------------------------------------*
2  * `ConvertRE' and `ConvertSubstituteRE' -- regular expression conversion
3  *
4  * This is a HIGHLY ALTERED VERSION of Henry Spencer's `regcomp'
5  * code adapted for NEdit.
6  *
7  * .-------------------------------------------------------------------.
8  * | ORIGINAL COPYRIGHT NOTICE:                                        |
9  * |                                                                   |
10  * | Copyright (c) 1986 by University of Toronto.                      |
11  * | Written by Henry Spencer.  Not derived from licensed software.    |
12  * |                                                                   |
13  * | Permission is granted to anyone to use this software for any      |
14  * | purpose on any computer system, and to redistribute it freely,    |
15  * | subject to the following restrictions:                            |
16  * |                                                                   |
17  * | 1. The author is not responsible for the consequences of use of   |
18  * |      this software, no matter how awful, even if they arise       |
19  * |      from defects in it.                                          |
20  * |                                                                   |
21  * | 2. The origin of this software must not be misrepresented, either |
22  * |      by explicit claim or by omission.                            |
23  * |                                                                   |
24  * | 3. Altered versions must be plainly marked as such, and must not  |
25  * |      be misrepresented as being the original software.            |
26  * `-------------------------------------------------------------------'
27  *
28  * This is free software; you can redistribute it and/or modify it under the
29  * terms of the GNU General Public License as published by the Free Software
30  * Foundation; either version 2 of the License, or (at your option) any later
31  * version. In addition, you may distribute version of this program linked to
32  * Motif or Open Motif. See README for details.
33  *
34  * This software is distributed in the hope that it will be useful, but WITHOUT
35  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
36  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
37  * for more details.
38  *
39  * You should have received a copy of the GNU General Public License along with
40  * software; if not, write to the Free Software Foundation, Inc., 59 Temple
41  * Place, Suite 330, Boston, MA  02111-1307 USA
42  *
43  */
44 
45 #ifdef HAVE_CONFIG_H
46 #include "../config.h"
47 #endif
48 
49 #include "regexConvert.h"
50 #include "../util/nedit_malloc.h"
51 
52 #include <stdio.h>
53 #include <stdlib.h>
54 #include <string.h>
55 #include <ctype.h>
56 #include <limits.h>
57 
58 #include <X11/Intrinsic.h>
59 
60 #ifdef HAVE_DEBUG_H
61 #include "../debug.h"
62 #endif
63 
64 
65 /* Utility definitions. */
66 
67 #define NSUBEXP 50  /* Limit that `chunk' enforces on Total_Paren. */
68 
69 #define CONVERT_FAIL(m)  {*Error_Ptr = (m); return 0;}  /* Store the message through Error_Ptr and return 0 from the enclosing function. */
70 #define IS_QUANTIFIER(c) ((c) == '*' || (c) == '+' || (c) == '?')
71 #define U_CHAR_AT(p)     ((unsigned int) *(unsigned char *)(p))
72 
73 /* Flags to be passed up and down via function parameters during compile. */
74 
75 #define WORST             0  /* Worst case. No assumptions can be made. */
76 #define HAS_WIDTH         1  /* Known never to match null string. */
77 #define SIMPLE            2  /* Simple enough to be STAR/PLUS operand. */
78 
79 #define NO_PAREN          0  /* Only set by initial call to "chunk". */
80 #define PAREN             1  /* Used for normal capturing parentheses. */
81 
82 #define REG_ZERO        0UL  /* Minimum repeat count `piece' uses for '*' and '?'. */
83 #define REG_ONE         1UL  /* Minimum repeat count `piece' uses for '+'. */
84 
85 /* Global work variables for `ConvertRE'. */
86 
87 static unsigned char *Reg_Parse;       /* Input scan ptr (scans user's regex) */
88 static int            Total_Paren;     /* Parentheses, (),  counter. */
89 static unsigned long  Convert_Size;    /* Output byte count accumulated during the sizing pass. */
90 static unsigned char *Code_Emit_Ptr;   /* When Code_Emit_Ptr is set to
91                                           &Compute_Size no code is emitted.
92                                           Instead, the size of code that WOULD
93                                           have been generated is accumulated in
94                                           Convert_Size.  Otherwise,
95                                           Code_Emit_Ptr points to where converted
96                                           regex text is to be written. */
97 static unsigned char  Compute_Size;    /* Address of this used as flag (see Code_Emit_Ptr). */
98 static char         **Error_Ptr;       /* Place to store error messages so
99                                           they can be returned by `ConvertRE' */
100 static char           Error_Text [128];/* String to build error messages in. */
101 
102 static unsigned char  Meta_Char [] = ".*+?[(|)^<>$";  /* Characters that end a literal run in `atom'. */
103 
104 static unsigned char *Convert_Str;     /* Output buffer allocated by `ConvertRE'. */
105 
106 /* Forward declarations for functions used by `ConvertRE'. */
107 
108 static int            alternative       (int *flag_param);
109 static int            chunk             (int paren, int *flag_param);
110 static void           emit_convert_byte (unsigned char c);
111 static unsigned char  literal_escape    (unsigned char c, int);
112 static int            atom              (int *flag_param);
113 static void           reg_error         (char *str);
114 static int            piece             (int *flag_param);
115 
116 /*----------------------------------------------------------------------*
117  * ConvertRE
118  *
119  * Compiles a regular expression into the internal format used by
120  * `ExecRE'.
121  *
122  * Beware that the optimization and preparation code in here knows about
123  * some of the structure of the compiled regexp.
124  *----------------------------------------------------------------------*/
125 
ConvertRE(const char * exp,char ** errorText)126 char * ConvertRE (const char *exp, char **errorText) {
127 
128    int  flags_local, pass;
129 
130    /* Set up `errorText' to receive failure reports. */
131 
132     Error_Ptr = errorText;
133    *Error_Ptr = "";
134 
135    if (exp == NULL) CONVERT_FAIL ("NULL argument to `ConvertRE\'");
136 
137    Code_Emit_Ptr = &Compute_Size;
138    Convert_Size  = 0UL;
139 
140    /* We can't allocate space until we know how big the compiled form will be,
141       but we can't compile it (and thus know how big it is) until we've got a
142       place to put the code.  So we cheat: we compile it twice, once with code
143       generation turned off and size counting turned on, and once "for real".
144       This also means that we don't allocate space until we are sure that the
145       thing really will compile successfully, and we never have to move the
146       code and thus invalidate pointers into it.  (Note that it has to be in
147       one piece because free() must be able to free it all.) */
148 
149    for (pass = 1; pass <= 2; pass++) {
150       /*-------------------------------------------*
151        * FIRST  PASS: Determine size and legality. *
152        * SECOND PASS: Emit converted code.         *
153        *-------------------------------------------*/
154 
155       Reg_Parse   = (unsigned char *) exp;
156       Total_Paren = 1;
157 
158       if (chunk (NO_PAREN, &flags_local) == 0) return (NULL); /* Something
159                                                                  went wrong */
160       emit_convert_byte ('\0');
161 
162       if (pass == 1) {
163          /* Allocate memory. */
164 
165          Convert_Str =
166             (unsigned char *) NEditMalloc(sizeof (unsigned char) * Convert_Size);
167 
168          if (Convert_Str == NULL) {
169             CONVERT_FAIL ("out of memory in `ConvertRE\'");
170          }
171 
172          Code_Emit_Ptr = Convert_Str;
173       }
174    }
175 
176    return (char *) Convert_Str;
177 }
178 
179 /*----------------------------------------------------------------------*
180  * chunk                                                                *
181  *                                                                      *
182  * Process main body of regex or process a parenthesized "thing".       *
183  *                                                                      *
184  * Caller must absorb opening parenthesis.
185  *----------------------------------------------------------------------*/
186 
chunk(int paren,int * flag_param)187 static int chunk (int paren, int *flag_param) {
188 
189    register int   this_branch;
190             int   flags_local;
191 
192    *flag_param = HAS_WIDTH;  /* Tentatively. */
193 
194    /* Make an OPEN node, if parenthesized. */
195 
196    if (paren == PAREN) {
197       if (Total_Paren >= NSUBEXP) {
198          sprintf (Error_Text, "number of ()'s > %d", (int) NSUBEXP);
199          CONVERT_FAIL (Error_Text);
200       }
201 
202       Total_Paren++;
203    }
204 
205    /* Pick up the branches, linking them together. */
206 
207    do {
208       this_branch = alternative (&flags_local);
209 
210       if (this_branch == 0) return 0;
211 
212       /* If any alternative could be zero width, consider the whole
213          parenthisized thing to be zero width. */
214 
215       if (!(flags_local & HAS_WIDTH)) *flag_param &= ~HAS_WIDTH;
216 
217       /* Are there more alternatives to process? */
218 
219       if (*Reg_Parse != '|') break;
220 
221       emit_convert_byte ('|');
222 
223       Reg_Parse++;
224    } while (1);
225 
226    /* Check for proper termination. */
227 
228    if (paren != NO_PAREN && *Reg_Parse != ')') {
229       CONVERT_FAIL ("missing right parenthesis \')\'");
230 
231    } else if (paren != NO_PAREN) {
232       emit_convert_byte (')');
233       Reg_Parse++;
234 
235    } else if (paren == NO_PAREN && *Reg_Parse != '\0') {
236       if (*Reg_Parse == ')') {
237          CONVERT_FAIL ("missing left parenthesis \'(\'");
238       } else {
239          CONVERT_FAIL ("junk on end");  /* "Can't happen" - NOTREACHED */
240       }
241    }
242 
243    return 1;
244 }
245 
246 /*----------------------------------------------------------------------*
247  * alternative - Processes one alternative of an '|' operator.
248  *----------------------------------------------------------------------*/
249 
alternative(int * flag_param)250 static int alternative (int *flag_param) {
251 
252    int  ret_val;
253    int  flags_local;
254 
255    *flag_param = WORST;  /* Tentatively. */
256 
257    /* Loop until we hit the start of the next alternative, the end of this set
258       of alternatives (end of parentheses), or the end of the regex. */
259 
260    while (*Reg_Parse != '|' && *Reg_Parse != ')' && *Reg_Parse != '\0') {
261       ret_val = piece (&flags_local);
262 
263       if (ret_val == 0) return 0; /* Something went wrong. */
264 
265       *flag_param |= flags_local & HAS_WIDTH;
266    }
267 
268    return 1;
269 }
270 
271 /*----------------------------------------------------------------------*
272  * piece - something followed by possible '*', '+', or '?'.
273  *----------------------------------------------------------------------*/
274 
piece(int * flag_param)275 static int piece (int *flag_param) {
276 
277    register int            ret_val;
278    register unsigned char  op_code;
279             unsigned long  min_val = REG_ZERO;
280             int            flags_local;
281 
282    ret_val = atom (&flags_local);
283 
284    if (ret_val == 0) return 0;  /* Something went wrong. */
285 
286    op_code = *Reg_Parse;
287 
288    if (!IS_QUANTIFIER (op_code)) {
289       *flag_param = flags_local;
290 
291       return (ret_val);
292    }
293 
294    Reg_Parse++;
295 
296    if (op_code == '+') min_val = REG_ONE;
297 
298    /* It is dangerous to apply certain quantifiers to a possibly zero width
299       item. */
300 
301    if (!(flags_local & HAS_WIDTH) && min_val > REG_ZERO) {
302       sprintf (Error_Text, "%c operand could be empty", op_code);
303 
304       CONVERT_FAIL (Error_Text);
305    }
306 
307    *flag_param = (min_val > REG_ZERO) ? (WORST | HAS_WIDTH) : WORST;
308 
309    if ( !((op_code == '*') || (op_code == '+') || (op_code == '?')) ) {
310       /* We get here if the IS_QUANTIFIER macro is not coordinated properly
311          with this function. */
312 
313       CONVERT_FAIL ("internal error #2, `piece\'");
314    }
315 
316    if (IS_QUANTIFIER (*Reg_Parse)) {
317       sprintf (Error_Text, "nested quantifiers, %c%c", op_code, *Reg_Parse);
318 
319       CONVERT_FAIL (Error_Text);
320    }
321 
322    emit_convert_byte (op_code);
323 
324    return (ret_val);
325 }
326 
327 /*----------------------------------------------------------------------*
328  * atom - Process one regex item at the lowest level
329  *----------------------------------------------------------------------*/
330 
atom(int * flag_param)331 static int atom (int *flag_param) {
332    int            ret_val = 1;
333    unsigned char  test;
334    int            flags_local;
335 
336    *flag_param = WORST;  /* Tentatively. */
337 
338    switch (*Reg_Parse++) {
339       case '^':
340          emit_convert_byte ('^');
341          break;
342 
343       case '$':
344          emit_convert_byte ('$');
345          break;
346 
347       case '<':
348          emit_convert_byte ('<');
349          break;
350 
351       case '>':
352          emit_convert_byte ('>');
353          break;
354 
355       case '.':
356          emit_convert_byte ('.');
357 
358          *flag_param |= (HAS_WIDTH | SIMPLE); break;
359 
360       case '(':
361          emit_convert_byte ('(');
362 
363          ret_val = chunk (PAREN, &flags_local);
364 
365          if (ret_val == 0) return 0;  /* Something went wrong. */
366 
367          /* Add HAS_WIDTH flag if it was set by call to chunk. */
368 
369          *flag_param |= flags_local & HAS_WIDTH;
370 
371          break;
372 
373       case '\0':
374       case '|':
375       case ')':
376          CONVERT_FAIL ("internal error #3, `atom\'");  /* Supposed to be  */
377                                                        /* caught earlier. */
378       case '?':
379       case '+':
380       case '*':
381          sprintf (Error_Text, "%c follows nothing", *(Reg_Parse - 1));
382          CONVERT_FAIL (Error_Text);
383 
384       case '{':
385          emit_convert_byte ('\\'); /* Quote braces. */
386          emit_convert_byte ('{');
387 
388          break;
389 
390       case '[':
391          {
392             register unsigned int  last_value;
393                      unsigned char last_emit = 0;
394                      unsigned char buffer [500];
395                               int  head = 0;
396                               int  negated = 0;
397                               int  do_brackets  = 1;
398                               int  a_z_flag     = 0;
399                               int  A_Z_flag     = 0;
400                               int  zero_nine    = 0;
401                               int  u_score_flag = 0;
402 
403             buffer [0]  = '\0';
404 
405             /* Handle characters that can only occur at the start of a class. */
406 
407             if (*Reg_Parse == '^') { /* Complement of range. */
408                negated = 1;
409 
410                Reg_Parse++;
411             }
412 
413             if (*Reg_Parse == ']' || *Reg_Parse == '-') {
414                /* If '-' or ']' is the first character in a class,
415                   it is a literal character in the class. */
416 
417                last_emit = *Reg_Parse;
418 
419                if (head >= 498) {
420                   CONVERT_FAIL ("too much data in [] to convert.");
421                }
422 
423                buffer [head++] = '\\'; /* Escape `]' and '-' for clarity. */
424                buffer [head++] = *Reg_Parse;
425 
426                Reg_Parse++;
427             }
428 
429             /* Handle the rest of the class characters. */
430 
431             while (*Reg_Parse != '\0' && *Reg_Parse != ']') {
432                if (*Reg_Parse == '-') { /* Process a range, e.g [a-z]. */
433                   Reg_Parse++;
434 
435                   if (*Reg_Parse == ']' || *Reg_Parse == '\0') {
436                      /* If '-' is the last character in a class it is a literal
437                         character.  If `Reg_Parse' points to the end of the
438                         regex string, an error will be generated later. */
439 
440                      last_emit = '-';
441 
442                      if (head >= 498) {
443                         CONVERT_FAIL ("too much data in [] to convert.");
444                      }
445 
446                      buffer [head++] = '\\'; /* Escape '-' for clarity. */
447                      buffer [head++] = '-';
448 
449                   } else {
450                      if (*Reg_Parse == '\\') {
451                         /* Handle escaped characters within a class range. */
452 
453                         Reg_Parse++;
454 
455                         if ((test = literal_escape (*Reg_Parse, 0))) {
456 
457                            buffer [head++] = '-';
458 
459                            if (*Reg_Parse != '\"') {
460                               emit_convert_byte ('\\');
461                            }
462 
463                            buffer [head++] = *Reg_Parse;
464                            last_value = (unsigned int) test;
465                         } else {
466                            sprintf (
467                               Error_Text,
468                               "\\%c is an invalid escape sequence(3)",
469                               *Reg_Parse);
470 
471                            CONVERT_FAIL (Error_Text);
472                         }
473                      } else {
474                         last_value = U_CHAR_AT (Reg_Parse);
475 
476                         if (last_emit == '0' && last_value == '9') {
477                            zero_nine = 1;
478                            head--;
479                         } else if (last_emit == 'a' && last_value == 'z') {
480                            a_z_flag  = 1;
481                            head--;
482                         } else if (last_emit == 'A' && last_value == 'Z') {
483                            A_Z_flag = 1;
484                            head--;
485                         } else {
486                            buffer [head++] = '-';
487 
488                            if ((test = literal_escape (*Reg_Parse, 1))) {
489                               /* Ordinary character matches an escape sequence;
490                                  convert it to the escape sequence. */
491 
492                               if (head >= 495) {
493                                  CONVERT_FAIL (
494                                     "too much data in [] to convert.");
495                               }
496 
497                               buffer [head++] = '\\';
498 
499                               if (test == '0') { /* Make octal escape. */
500                                  test = *Reg_Parse;
501                                  buffer [head++] = '0';
502                                  buffer [head++] = ('0' + (test / 64));
503                                  test -= (test / 64) * 64;
504                                  buffer [head++] = ('0' + (test / 8));
505                                  test -= (test / 8) * 8;
506                                  buffer [head++] = ('0' +  test);
507                               } else {
508                                  buffer [head++] = test;
509                               }
510                            } else {
511                               buffer [head++] = last_value;
512                            }
513                         }
514                      }
515 
516                      if (last_emit > last_value) {
517                         CONVERT_FAIL ("invalid [] range");
518                      }
519 
520                      last_emit = (unsigned char) last_value;
521 
522                      Reg_Parse++;
523 
524                   } /* End class character range code. */
525                } else if (*Reg_Parse == '\\') {
526                   Reg_Parse++;
527 
528                   if ((test = literal_escape (*Reg_Parse, 0)) != '\0') {
529                      last_emit = test;
530 
531                      if (head >= 498) {
532                         CONVERT_FAIL ("too much data in [] to convert.");
533                      }
534 
535                      if (*Reg_Parse != '\"') {
536                         buffer [head++] = '\\';
537                      }
538 
539                      buffer [head++] = *Reg_Parse;
540 
541                   } else {
542                      sprintf (Error_Text,
543                               "\\%c is an invalid escape sequence(1)",
544                               *Reg_Parse);
545 
546                      CONVERT_FAIL (Error_Text);
547                   }
548 
549                   Reg_Parse++;
550 
551                   /* End of class escaped sequence code */
552                } else {
553                   last_emit = *Reg_Parse;
554 
555                   if (*Reg_Parse == '_') {
556                      u_score_flag = 1; /* Emit later if we can't do `\w'. */
557 
558                   } else if ((test = literal_escape (*Reg_Parse, 1))) {
559                      /* Ordinary character matches an escape sequence;
560                         convert it to the escape sequence. */
561 
562                      if (head >= 495) {
563                         CONVERT_FAIL ("too much data in [] to convert.");
564                      }
565 
566                      buffer [head++] = '\\';
567 
568                      if (test == '0') {  /* Make octal escape. */
569                         test = *Reg_Parse;
570                         buffer [head++] = '0';
571                         buffer [head++] = ('0' + (test / 64));
572                         test -= (test / 64) * 64;
573                         buffer [head++] = ('0' + (test / 8));
574                         test -= (test / 8) * 8;
575                         buffer [head++] = ('0' +  test);
576                      } else {
577                         if (head >= 499) {
578                            CONVERT_FAIL ("too much data in [] to convert.");
579                         }
580 
581                         buffer [head++] = test;
582                      }
583                   } else {
584                      if (head >= 499) {
585                         CONVERT_FAIL ("too much data in [] to convert.");
586                      }
587 
588                      buffer [head++] = *Reg_Parse;
589                   }
590 
591                   Reg_Parse++;
592                }
593             } /* End of while (*Reg_Parse != '\0' && *Reg_Parse != ']') */
594 
595             if (*Reg_Parse != ']') CONVERT_FAIL ("missing right \']\'");
596 
597             buffer [head] = '\0';
598 
599             /* NOTE: it is impossible to specify an empty class.  This is
600                because [] would be interpreted as "begin character class"
601                followed by a literal ']' character and no "end character class"
602                delimiter (']').  Because of this, it is always safe to assume
603                that a class HAS_WIDTH. */
604 
605             Reg_Parse++; *flag_param |= HAS_WIDTH | SIMPLE;
606 
607             if (head == 0) {
608                if (( a_z_flag &&  A_Z_flag &&  zero_nine &&  u_score_flag) ||
609                    ( a_z_flag &&  A_Z_flag && !zero_nine && !u_score_flag) ||
610                    (!a_z_flag && !A_Z_flag &&  zero_nine && !u_score_flag)) {
611 
612                    do_brackets = 0;
613                }
614             }
615 
616             if (do_brackets) {
617                emit_convert_byte ('[');
618                if (negated) emit_convert_byte ('^');
619             }
620 
621             /* Output any shortcut escapes if we can. */
622 
623             while (a_z_flag || A_Z_flag || zero_nine || u_score_flag) {
624                if (a_z_flag && A_Z_flag && zero_nine && u_score_flag) {
625                   emit_convert_byte ('\\');
626 
627                   if (negated && !do_brackets) {
628                      emit_convert_byte ('W');
629                   } else {
630                      emit_convert_byte ('w');
631                   }
632 
633                   a_z_flag = A_Z_flag = zero_nine = u_score_flag = 0;
634                } else if (a_z_flag && A_Z_flag) {
635                   emit_convert_byte ('\\');
636 
637                   if (negated && !do_brackets) {
638                      emit_convert_byte ('L');
639                   } else {
640                      emit_convert_byte ('l');
641                   }
642 
643                   a_z_flag = A_Z_flag = 0;
644                } else if (zero_nine) {
645                   emit_convert_byte ('\\');
646 
647                   if (negated && !do_brackets) {
648                      emit_convert_byte ('D');
649                   } else {
650                      emit_convert_byte ('d');
651                   }
652 
653                   zero_nine = 0;
654                } else if (a_z_flag) {
655                   emit_convert_byte ('a');
656                   emit_convert_byte ('-');
657                   emit_convert_byte ('z');
658 
659                   a_z_flag = 0;
660                } else if (A_Z_flag) {
661                   emit_convert_byte ('A');
662                   emit_convert_byte ('-');
663                   emit_convert_byte ('Z');
664 
665                   A_Z_flag = 0;
666                } else if (u_score_flag) {
667                   emit_convert_byte ('_');
668 
669                   u_score_flag = 0;
670                }
671             }
672 
673             /* Output our buffered class characters. */
674 
675             for (head = 0; buffer [head] != '\0'; head++) {
676                emit_convert_byte (buffer [head]);
677             }
678 
679             if (do_brackets) {
680                emit_convert_byte (']');
681             }
682          }
683 
684          break; /* End of character class code. */
685 
686          /* Fall through to Default case to handle literal escapes. */
687 
688       default:
689          Reg_Parse--; /* If we fell through from the above code, we are now
690                          pointing at the back slash (\) character. */
691          {
692             unsigned char *parse_save, *emit_save;
693                      int   emit_diff, len = 0;
694 
695             /* Loop until we find a meta character or end of regex string. */
696 
697             for (; *Reg_Parse != '\0' &&
698                    !strchr ((char *) Meta_Char, (int) *Reg_Parse);
699                  len++) {
700 
701                /* Save where we are in case we have to back
702                   this character out. */
703 
704                parse_save = Reg_Parse;
705                emit_save  = Code_Emit_Ptr;
706 
707                if (*Reg_Parse == '\\') {
708                   if ((test = literal_escape (*(Reg_Parse + 1), 0))) {
709                      if (*(Reg_Parse + 1) != '\"') {
710                         emit_convert_byte ('\\');
711                      }
712 
713                      Reg_Parse++; /* Point to escaped character */
714                      emit_convert_byte (*Reg_Parse);
715 
716                   } else {
717                      sprintf (Error_Text,
718                               "\\%c is an invalid escape sequence(2)",
719                               *(Reg_Parse + 1));
720 
721                      CONVERT_FAIL (Error_Text);
722                   }
723 
724                   Reg_Parse++;
725                } else {
726                   /* Ordinary character */
727 
728                   if ((test = literal_escape (*Reg_Parse, 1))) {
729                      /* Ordinary character matches an escape sequence;
730                         convert it to the escape sequence. */
731 
732                      emit_convert_byte ('\\');
733 
734                      if (test == '0') {
735                         test = *Reg_Parse;
736                         emit_convert_byte ('0');
737                         emit_convert_byte ('0' + (test / 64));
738                         test -= (test / 64) * 64;
739                         emit_convert_byte ('0' + (test / 8));
740                         test -= (test / 8) * 8;
741                         emit_convert_byte ('0' +  test);
742                      } else {
743                         emit_convert_byte (test);
744                      }
745                   } else {
746                      emit_convert_byte (*Reg_Parse);
747                   }
748 
749                   Reg_Parse++;
750                }
751 
752                /* If next regex token is a quantifier (?, +. *, or {m,n}) and
753                   our EXACTLY node so far is more than one character, leave the
754                   last character to be made into an EXACTLY node one character
755                   wide for the multiplier to act on.  For example 'abcd* would
756                   have an EXACTLY node with an 'abc' operand followed by a STAR
757                   node followed by another EXACTLY node with a 'd' operand. */
758 
759                if (IS_QUANTIFIER (*Reg_Parse) && len > 0) {
760                   Reg_Parse = parse_save; /* Point to previous regex token. */
761                   emit_diff = (Code_Emit_Ptr - emit_save);
762 
763                   if (Code_Emit_Ptr == &Compute_Size) {
764                      Convert_Size -= emit_diff;
765                   } else { /* Write over previously emitted byte. */
766                      Code_Emit_Ptr = emit_save;
767                   }
768 
769                   break;
770                }
771             }
772 
773             if (len <= 0) CONVERT_FAIL ("internal error #4, `atom\'");
774 
775             *flag_param |= HAS_WIDTH;
776 
777             if (len == 1) *flag_param |= SIMPLE;
778          }
779       } /* END switch (*Reg_Parse++) */
780 
781    return (ret_val);
782 }
783 
784 /*----------------------------------------------------------------------*
785  * emit_convert_byte
786  *
787  * Emit (if appropriate) a byte of converted code.
788  *----------------------------------------------------------------------*/
789 
emit_convert_byte(unsigned char c)790 static void emit_convert_byte (unsigned char c) {
791 
792    if (Code_Emit_Ptr == &Compute_Size) {
793       Convert_Size++;
794    } else {
795       *Code_Emit_Ptr++ = c;
796    }
797 }
798 
/*--------------------------------------------------------------------*
 * literal_escape
 *
 * Maps between escape letters and the characters they stand for.
 *
 * action == 0: `c' is the character that followed a backslash.
 *    Returns the character the escape denotes, or 0 if `c' does not
 *    form a valid escape.
 *
 * action == 1: `c' is an ordinary character.  Returns the letter to
 *    put after a backslash if `c' is one of the known control
 *    characters, '0' if `c' is any other non-printable character
 *    (the caller then writes a numeric escape), or 0 if `c' needs no
 *    escape.
 *
 * Any other action returns 0.
 *--------------------------------------------------------------------*/

static unsigned char literal_escape (unsigned char c, int action) {

   /* Escape letter, the control character it denotes, and whether the
      letter is accepted as an escape on input (action 0).  `e' is only
      ever produced, never accepted. */

   static const struct {
      unsigned char  letter;
      unsigned char  actual;
      int            accepted;
   } control [] = {
      { 'a', '\a', 1 },
      { 'b', '\b', 1 },
#ifdef EBCDIC_CHARSET
      { 'e', 0x27, 0 },  /* Escape character in IBM's EBCDIC character set. */
#else
      { 'e', 0x1B, 0 },  /* Escape character in ASCII character set. */
#endif
      { 'f', '\f', 1 },
      { 'n', '\n', 1 },
      { 'r', '\r', 1 },
      { 't', '\t', 1 },
      { 'v', '\v', 1 }
   };

   /* Characters that, after a backslash, stand for themselves. */

   static const unsigned char  self_quoting [] = "()[]<>.\\|^$*+?&\"";

   const int             control_count =
                            (int) (sizeof control / sizeof control [0]);
   const unsigned char  *p;
   int                   k;

   if (action == 0) {
      for (k = 0; k < control_count; k++) {
         if (control [k].accepted && c == control [k].letter) {
            return control [k].actual;
         }
      }

      for (p = self_quoting; *p != '\0'; p++) {
         if (c == *p) return c;
      }

      return 0;
   }

   if (action == 1) {
      for (k = 0; k < control_count; k++) {
         if (c == control [k].actual) return control [k].letter;
      }

      /* Signal to generate a numeric (octal) escape. */

      return isprint (c) ? 0 : '0';
   }

   return 0;
}
862 
863 /*----------------------------------------------------------------------*
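 * ConvertSubstituteRE - Convert the substitution (replacement) string in
 * `source' from the old escape conventions to the new ones, storing the
 * NUL terminated result in `dest' (capacity `max' bytes).  No substitution
 * is performed here; only the string itself is rewritten.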
865  *----------------------------------------------------------------------*/
866 
void ConvertSubstituteRE (
   const char   *source,
   char   *dest,
   int     max) {

   /* Copies the substitution string `source' into `dest', rewriting it on
      the way:

        - the case altering tokens (backslash followed by u, U, l or L) are
          copied through unchanged;
        - `&' stays `&', and backslash-0 becomes `&';
        - backslash-1 .. backslash-9 and every escape accepted by
          literal_escape (x, 0) keep their backslash;
        - a backslash that ends the string is kept as a literal backslash;
        - any other escaped character is stored without its backslash;
        - an unescaped character for which literal_escape (c, 1) returns a
          letter is written as backslash + letter, and as a backslash
          followed by 0 and three octal digits when it returns '0'.

      `max' is the capacity of `dest' in bytes, terminating NUL included.
      Every converted sequence is written completely or not at all: as soon
      as the next sequence does not fit, conversion stops and the result is
      terminated.  Nothing is written when `max' is not positive.  A NULL
      `source' or `dest' is reported through reg_error and nothing is
      written. */

   const unsigned char *src;
   unsigned char       *dst;
   unsigned char       *dst_end;  /* Slot reserved for the final NUL. */
   unsigned char        c;
   unsigned char        test;

   if (source == NULL || dest == NULL) {
      reg_error ("NULL parm to `ConvertSubstituteRE\'");

      return;
   }

   if (max <= 0) return;  /* No room, not even for the terminator. */

   src     = (const unsigned char *) source;
   dst     = (unsigned char *) dest;
   dst_end = dst + (max - 1);

   while ((c = *src++) != '\0') {
      unsigned char out [5];  /* Longest output: backslash, 0, 3 digits. */
      int           len = 0;
      int           i;

      if (c == '\\' &&
          (*src == 'u' || *src == 'U' || *src == 'l' || *src == 'L')) {

         /* Process any case altering tokens, i.e \u, \U, \l, \L. */

         if ((dst_end - dst) < 2) break;

         *dst++ = '\\';
         *dst++ = *src++;

         /* The character after the token is converted below.  Stop here
            if the string ends with the token, so that the terminator is
            neither converted nor stepped over. */

         c = *src++;

         if (c == '\0') break;
      }

      if (c == '&') {
         out [len++] = '&';

      } else if (c == '\\') {
         if (*src == '0') {
            /* Convert `\0' to `&' */

            out [len++] = '&';
            src++;

         } else if (('1' <= *src && *src <= '9') ||
                    literal_escape (*src, 0) != '\0') {
            /* Back-reference or recognized escape: keep it escaped. */

            out [len++] = '\\';
            out [len++] = *src++;

         } else if (*src == '\0') {
            /* If '\' is the last character of the replacement string, it
               is interpreted as a literal backslash. */

            out [len++] = '\\';

         } else {
            /* Old regex's allowed any escape sequence.  Convert these to
               unescaped characters that replace themselves; i.e. they
               don't need to be escaped. */

            out [len++] = *src++;
         }

      } else if ((test = literal_escape (c, 1)) != '\0') {
         /* Ordinary character matches an escape sequence; convert it to
            the escape sequence. */

         out [len++] = '\\';

         if (test == '0') { /* Make octal escape. */
            out [len++] = '0';
            out [len++] = (unsigned char) ('0' + (c / 64));
            out [len++] = (unsigned char) ('0' + ((c / 8) % 8));
            out [len++] = (unsigned char) ('0' + (c % 8));
         } else {
            out [len++] = test;
         }

      } else {
         /* Ordinary character. */

         out [len++] = c;
      }

      /* Never store a partial sequence and never run past `dest'. */

      if ((dst_end - dst) < len) break;

      for (i = 0; i < len; i++) {
         *dst++ = out [i];
      }
   }

   *dst = '\0';
}
966 
967 /*----------------------------------------------------------------------*
968  * reg_error
969  *----------------------------------------------------------------------*/
970 
static void reg_error (char *str) {

   /* Report only: `str' is placed between the parentheses of a fixed
      message written to stderr, then control returns to the caller. */

   fprintf (stderr,
            "NEdit: Internal error processing regular expression (%s)\n",
            str);
}
978