1 /*  $Id: uwildmat.c 10326 2019-02-04 14:22:34Z iulius $
2 **
3 **  wildmat pattern matching with Unicode UTF-8 extensions.
4 **
5 **  Do shell-style pattern matching for ?, \, [], and * characters.  Might not
6 **  be robust in face of malformed patterns; e.g., "foo[a-" could cause a
7 **  segmentation violation.  It is 8-bit clean.  (Robustness hopefully fixed
8 **  July 2000; all malformed patterns should now just fail to match anything.)
9 **
10 **  Original by Rich $alz, mirror!rs, Wed Nov 26 19:03:17 EST 1986.
11 **  Rich $alz is now <rsalz@osf.org>.
12 **
13 **  April, 1991:  Replaced mutually-recursive calls with in-line code for the
14 **  star character.
15 **
16 **  Special thanks to Lars Mathiesen <thorinn@diku.dk> for the ABORT code.
17 **  This can greatly speed up failing wildcard patterns.  For example:
18 **
19 **	pattern: -*-*-*-*-*-*-12-*-*-*-m-*-*-*
20 **	text 1:	 -adobe-courier-bold-o-normal--12-120-75-75-m-70-iso8859-1
21 **	text 2:	 -adobe-courier-bold-o-normal--12-120-75-75-X-70-iso8859-1
22 **
23 **  Text 1 matches with 51 calls, while text 2 fails with 54 calls.  Without
24 **  the ABORT code, it takes 22310 calls to fail.  Ugh.  The following
25 **  explanation is from Lars:
26 **
27 **  The precondition that must be fulfilled is that DoMatch will consume at
28 **  least one character in text.  This is true if *p is neither '*' nor '\0'.)
29 **  The last return has ABORT instead of false to avoid quadratic behaviour in
30 **  cases like pattern "*a*b*c*d" with text "abcxxxxx".  With false, each
31 **  star-loop has to run to the end of the text; with ABORT only the last one
32 **  does.
33 **
34 **  Once the control of one instance of DoMatch enters the star-loop, that
35 **  instance will return either true or ABORT, and any calling instance will
36 **  therefore return immediately after (without calling recursively again).
37 **  In effect, only one star-loop is ever active.  It would be possible to
38 **  modify the code to maintain this context explicitly, eliminating all
39 **  recursive calls at the cost of some complication and loss of clarity (and
40 **  the ABORT stuff seems to be unclear enough by itself).  I think it would
41 **  be unwise to try to get this into a released version unless you have a
42 **  good test data base to try it out on.
43 **
44 **  June, 1991:  Robert Elz <kre@munnari.oz.au> added minus and close bracket
45 **  handling for character sets.
46 **
47 **  July, 2000:  Largely rewritten by Russ Allbery <eagle@eyrie.org> to add
48 **  support for ',', '!', and optionally '@' to the core wildmat routine.
49 **  Broke the character class matching into a separate function for clarity
50 **  since it's infrequently used in practice, and added some simple lookahead
51 **  to significantly decrease the recursive calls in the '*' matching code.
52 **  Added support for UTF-8 as the default character set for any high-bit
53 **  characters.
54 **
55 **  For more information on UTF-8, see RFC 3629.
56 **
57 **  Please note that this file is intentionally written so that conditionally
58 **  executed expressions are on separate lines from the condition to
59 **  facilitate analysis of the coverage of the test suite using purecov.
60 **  Please preserve this.  As of March 11, 2001, purecov reports that the
61 **  accompanying test suite achieves 100% coverage of this file.
62 */
63 
64 #include "config.h"
65 #include "clibrary.h"
66 #include <ctype.h>
67 
68 #include "inn/libinn.h"
69 
70 #define ABORT -1
71 
72 /* Whether or not an octet looks like the start of a UTF-8 character. */
73 #define ISUTF8(c)       (((c) & 0xc0) == 0xc0)
74 
75 
76 /*
77 **  Determine the length of a non-ASCII character in octets (for advancing
78 **  pointers when skipping over characters).  Takes a pointer to the start of
79 **  the character and to the last octet of the string.  If end is NULL, expect
80 **  the string pointed to by start to be nul-terminated.  If the character is
81 **  malformed UTF-8, return 1 to treat it like an eight-bit local character.
82 */
83 static int
utf8_length(const unsigned char * start,const unsigned char * end)84 utf8_length(const unsigned char *start, const unsigned char *end)
85 {
86     unsigned char mask = 0x80;
87     const unsigned char *p;
88     int length = 0;
89     int left;
90 
91     for (; mask > 0 && (*start & mask) == mask; mask >>= 1)
92         length++;
93     if (length < 2 || length > 6)
94         return 1;
95     if (end != NULL && (end - start + 1) < length)
96         return 1;
97     left = length - 1;
98     for (p = start + 1; left > 0 && (*p & 0xc0) == 0x80; p++)
99         left--;
100     return (left == 0) ? length : 1;
101 }
102 
103 
104 /*
105 **  Check whether a string contains only valid UTF-8 characters, without
106 **  any ASCII control characters except for \r, \n and \t.
107 */
108 bool
is_valid_utf8(const char * text)109 is_valid_utf8(const char *text)
110 {
111     unsigned char mask;
112     const unsigned char *p;
113     int length;
114     int left;
115 
116     for (p = (const unsigned char *)text; *p != '\0';) {
117         mask = 0x80;
118         length = 0;
119 
120         /* Find out the expected length of the character. */
121         for (; mask > 0 && (*p & mask) == mask; mask >>= 1)
122             length++;
123 
124         p++;
125 
126         /* Valid printable ASCII character or CR, LF or HTAB. */
127         if (length == 0) {
128             if(isprint((unsigned char) p[-1])
129                || p[-1] == '\r' || p[-1] == '\n' || p[-1] == '\t') {
130                 continue;
131             } else {
132                 return false;
133             }
134         }
135 
136         /* Invalid length. */
137         if (length < 2 || length > 6)
138             return false;
139 
140         /* Check that each byte looks like 10xxxxxx, except for the first. */
141         left = length - 1;
142         for (; left > 0 && (*p & 0xc0) == 0x80; p++)
143             left--;
144 
145         if (left > 0)
146             return false;
147     }
148 
149     return true;
150 }
151 
152 
153 /*
154 **  Convert a UTF-8 character to UCS-4.  Takes a pointer to the start of the
155 **  character and to the last octet of the string, and to a uint32_t into
156 **  which to put the decoded UCS-4 value.  If end is NULL, expect the string
157 **  pointed to by start to be nul-terminated.  Returns the number of octets in
158 **  the UTF-8 encoding.  If the UTF-8 character is malformed, set result to
159 **  the decimal value of the first octet; this is wrong, but it will generally
160 **  cause the rest of the wildmat matching to do the right thing for non-UTF-8
161 **  input.
162 */
163 static int
utf8_decode(const unsigned char * start,const unsigned char * end,uint32_t * result)164 utf8_decode(const unsigned char *start, const unsigned char *end,
165             uint32_t *result)
166 {
167     uint32_t value = 0;
168     int length, i;
169     const unsigned char *p = start;
170     unsigned char mask;
171 
172     length = utf8_length(start, end);
173     if (length < 2) {
174         *result = *start;
175         return 1;
176     }
177     mask = (1 << (7 - length)) - 1;
178     value = *p & mask;
179     p++;
180     for (i = length - 1; i > 0; i--) {
181         value = (value << 6) | (*p & 0x3f);
182         p++;
183     }
184     *result = value;
185     return length;
186 }
187 
188 
189 /*
190 **  Match a character class against text, a UCS-4 character.  start is a
191 **  pointer to the first character of the character class, end a pointer to
192 **  the last.  Returns whether the class matches that character.
193 */
194 static bool
match_class(uint32_t text,const unsigned char * start,const unsigned char * end)195 match_class(uint32_t text, const unsigned char *start,
196             const unsigned char *end)
197 {
198     bool reversed, allowrange;
199     const unsigned char *p = start;
200     uint32_t first = 0;
201     uint32_t last;
202 
203     /* Check for an inverted character class (starting with ^).  If the
204        character matches the character class, we return !reversed; that way,
205        we return true if it's a regular character class and false if it's a
206        reversed one.  If the character doesn't match, we return reversed. */
207     reversed = (*p == '^');
208     if (reversed)
209         p++;
210 
211     /* Walk through the character class until we reach the end or find a
212        match, handling character ranges as we go.  Only permit a range to
213        start when allowrange is true; this allows - to be treated like a
214        normal character as the first character of the class and catches
215        malformed ranges like a-e-n.  We treat the character at the beginning
216        of a range as both a regular member of the class and the beginning of
217        the range; this is harmless (although it means that malformed ranges
218        like m-a will match m and nothing else). */
219     allowrange = false;
220     while (p <= end) {
221         if (allowrange && *p == '-' && p < end) {
222             p++;
223             p += utf8_decode(p, end, &last);
224             if (text >= first && text <= last)
225                 return !reversed;
226             allowrange = false;
227         } else {
228             p += utf8_decode(p, end, &first);
229             if (text == first)
230                 return !reversed;
231             allowrange = true;
232         }
233     }
234     return reversed;
235 }
236 
237 
238 /*
239 **  Match the text against the pattern between start and end.  This is a
240 **  single pattern; a leading ! or @ must already be taken care of, and
241 **  commas must be dealt with outside of this routine.
242 */
243 static int
match_pattern(const unsigned char * text,const unsigned char * start,const unsigned char * end)244 match_pattern(const unsigned char *text, const unsigned char *start,
245               const unsigned char *end)
246 {
247     const unsigned char *q, *endclass;
248     const unsigned char *p = start;
249     bool ismeta;
250     int matched, width;
251     uint32_t c;
252 
253     for (; p <= end; p++) {
254         if (!*text && *p != '*')
255             return ABORT;
256 
257         switch (*p) {
258         case '\\':
259             if (!*++p)
260                 return ABORT;
261             /* Fall through. */
262 
263         default:
264             if (*text++ != *p)
265                 return false;
266             break;
267 
268         case '?':
269             text += ISUTF8(*text) ? utf8_length(text, NULL) : 1;
270             break;
271 
272         case '*':
273             /* Consecutive stars are equivalent to one.  Advance pattern to
274                the character after the star. */
275             for (++p; *p == '*'; p++)
276                 ;
277 
278             /* A trailing star will match anything. */
279             if (p > end)
280                 return true;
281 
282             /* Basic algorithm: Recurse at each point where the * could
283                possibly match.  If the match succeeds or aborts, return
284                immediately; otherwise, try the next position.
285 
286                Optimization: If the character after the * in the pattern
287                isn't a metacharacter (the common case), then the * has to
288                consume characters at least up to the next occurrence of that
289                character in the text.  Scan forward for those points rather
290                than recursing at every possible point to save the extra
291                function call overhead. */
292             ismeta = (*p == '[' || *p == '?' || *p == '\\');
293             while (*text) {
294                 width = ISUTF8(*text) ? utf8_length(text, NULL) : 1;
295                 if (ismeta) {
296                     matched = match_pattern(text, p, end);
297                     text += width;
298                 } else {
299                     while (*text && *text != *p) {
300                         text += width;
301                         width = ISUTF8(*text) ? utf8_length(text, NULL) : 1;
302                     }
303                     if (!*text)
304                         return ABORT;
305                     matched = match_pattern(++text, p + 1, end);
306                 }
307                 if (matched != false)
308                     return matched;
309             }
310             return ABORT;
311 
312         case '[':
313             /* Find the end of the character class, making sure not to pick
314                up a close bracket at the beginning of the class. */
315             p++;
316             q = p + (*p == '^') + 1;
317             if (q > end)
318                 return ABORT;
319             endclass = memchr(q, ']', (size_t) (end - q + 1));
320             if (!endclass)
321                 return ABORT;
322 
323             /* Do the heavy lifting in another function for clarity, since
324                character classes are an uncommon case. */
325             text += utf8_decode(text, NULL, &c);
326             if (!match_class(c, p, endclass - 1))
327                 return false;
328             p = endclass;
329             break;
330         }
331     }
332 
333     return (*text == '\0');
334 }
335 
336 
337 /*
338 **  Takes text and a wildmat expression; a wildmat expression is a
339 **  comma-separated list of wildmat patterns, optionally preceded by ! to
340 **  invert the sense of the expression.  Returns UWILDMAT_MATCH if that
341 **  expression matches the text, UWILDMAT_FAIL otherwise.  If allowpoison is
342 **  set, allow @ to introduce a poison expression (the same as !, but if it
343 **  triggers the failed match the routine returns UWILDMAT_POISON instead).
344 */
345 static enum uwildmat
match_expression(const unsigned char * text,const unsigned char * start,bool allowpoison)346 match_expression(const unsigned char *text, const unsigned char *start,
347                  bool allowpoison)
348 {
349     const unsigned char *end, *split;
350     const unsigned char *p = start;
351     bool reverse, escaped;
352     bool match = false;
353     bool poison = false;
354     bool poisoned = false;
355 
356     /* Handle the empty expression separately, since otherwise end will be
357        set to an invalid pointer. */
358     if (!*p)
359         return !*text ? UWILDMAT_MATCH : UWILDMAT_FAIL;
360     end = start + strlen((const char *) start) - 1;
361 
362     /* Main match loop.  Find each comma that separates patterns, and attempt
363        to match the text with each pattern in order.  The last matching
364        pattern determines whether the whole expression matches. */
365     for (; p <= end + 1; p = split + 1) {
366         if (allowpoison)
367             poison = (*p == '@');
368         reverse = (*p == '!') || poison;
369         if (reverse)
370             p++;
371 
372         /* Find the first unescaped comma, if any.  If there is none, split
373            will be one greater than end and point at the nul at the end of
374            the string. */
375         for (escaped = false, split = p; split <= end; split++) {
376             if (*split == '[') {
377                 split++;
378                 if (*split == ']')
379                     split++;
380                 while (split <= end && *split != ']')
381                     split++;
382             }
383             if (*split == ',' && !escaped)
384                 break;
385             escaped = (*split == '\\') ? !escaped : false;
386         }
387 
388         /* Optimization: If match == !reverse and poison == poisoned, this
389            pattern can't change the result, so don't do any work. */
390         if (match == !reverse && poison == poisoned)
391             continue;
392         if (match_pattern(text, p, split - 1) == true) {
393             poisoned = poison;
394             match = !reverse;
395         }
396     }
397     if (poisoned)
398         return UWILDMAT_POISON;
399     return match ? UWILDMAT_MATCH : UWILDMAT_FAIL;
400 }
401 
402 
403 /*
404 **  User-level routine used for wildmats where @ should be treated as a
405 **  regular character.
406 */
407 bool
uwildmat(const char * text,const char * pat)408 uwildmat(const char *text, const char *pat)
409 {
410     const unsigned char *utext = (const unsigned char *) text;
411     const unsigned char *upat = (const unsigned char *) pat;
412 
413     if (upat[0] == '*' && upat[1] == '\0')
414         return true;
415     else
416         return (match_expression(utext, upat, false) == UWILDMAT_MATCH);
417 }
418 
419 
420 /*
421 **  User-level routine used for wildmats that support poison matches.
422 */
423 enum uwildmat
uwildmat_poison(const char * text,const char * pat)424 uwildmat_poison(const char *text, const char *pat)
425 {
426     const unsigned char *utext = (const unsigned char *) text;
427     const unsigned char *upat = (const unsigned char *) pat;
428 
429     if (upat[0] == '*' && upat[1] == '\0')
430         return UWILDMAT_MATCH;
431     else
432         return match_expression(utext, upat, true);
433 }
434 
435 
436 /*
437 **  User-level routine for simple expressions (neither , nor ! are special).
438 */
439 bool
uwildmat_simple(const char * text,const char * pat)440 uwildmat_simple(const char *text, const char *pat)
441 {
442     const unsigned char *utext = (const unsigned char *) text;
443     const unsigned char *upat = (const unsigned char *) pat;
444     size_t length;
445 
446     if (upat[0] == '*' && upat[1] == '\0')
447         return true;
448     else {
449         length = strlen(pat);
450         return (match_pattern(utext, upat, upat + length - 1) == true);
451     }
452 }
453