xref: /openbsd/usr.bin/rsync/rmatch.c (revision 5dea098c)
1 /*	$OpenBSD: rmatch.c,v 1.4 2024/02/20 10:36:23 claudio Exp $	*/
2 
3 /*
4  * Copyright (c) 2021 Claudio Jeker <claudio@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 /*
20  * Copyright (c) 1989, 1993, 1994
21  *	The Regents of the University of California.  All rights reserved.
22  *
23  * This code is derived from software contributed to Berkeley by
24  * Guido van Rossum.
25  *
26  * Redistribution and use in source and binary forms, with or without
27  * modification, are permitted provided that the following conditions
28  * are met:
29  * 1. Redistributions of source code must retain the above copyright
30  *    notice, this list of conditions and the following disclaimer.
31  * 2. Redistributions in binary form must reproduce the above copyright
32  *    notice, this list of conditions and the following disclaimer in the
33  *    documentation and/or other materials provided with the distribution.
34  * 3. Neither the name of the University nor the names of its contributors
35  *    may be used to endorse or promote products derived from this software
36  *    without specific prior written permission.
37  *
38  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
39  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
40  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
41  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
42  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
43  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
44  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
45  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
46  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
47  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48  * SUCH DAMAGE.
49  */
50 
51 #include <ctype.h>
52 #include <stdio.h>
53 #include <string.h>
54 #include <limits.h>
55 
56 #include "charclass.h"
57 #include "extern.h"
58 
59 #define	RANGE_MATCH	1
60 #define	RANGE_NOMATCH	0
61 #define	RANGE_ERROR	(-1)
62 
63 static int
64 classmatch(const char *pattern, char test, const char **ep)
65 {
66 	const char *mismatch = pattern;
67 	const struct cclass *cc;
68 	const char *colon;
69 	size_t len;
70 	int rval = RANGE_NOMATCH;
71 
72 	if (*pattern++ != ':') {
73 		*ep = mismatch;
74 		return RANGE_ERROR;
75 	}
76 	if ((colon = strchr(pattern, ':')) == NULL || colon[1] != ']') {
77 		*ep = mismatch;
78 		return RANGE_ERROR;
79 	}
80 	*ep = colon + 2;
81 	len = (size_t)(colon - pattern);
82 
83 	for (cc = cclasses; cc->name != NULL; cc++) {
84 		if (!strncmp(pattern, cc->name, len) && cc->name[len] == '\0') {
85 			if (cc->isctype((unsigned char)test))
86 				rval = RANGE_MATCH;
87 			return rval;
88 		}
89 	}
90 
91 	/* invalid character class, treat as normal text */
92 	*ep = mismatch;
93 	return RANGE_ERROR;
94 }
95 
96 static int
97 rangematch(const char **pp, char test)
98 {
99 	const char *pattern = *pp;
100 	int negate, ok;
101 	char c, c2;
102 
103 	/*
104 	 * A bracket expression starting with an unquoted circumflex
105 	 * character produces unspecified results (IEEE 1003.2-1992,
106 	 * 3.13.2).  This implementation treats it like '!', for
107 	 * consistency with the regular expression syntax.
108 	 * J.T. Conklin (conklin@ngai.kaleida.com)
109 	 */
110 	if ((negate = (*pattern == '!' || *pattern == '^')))
111 		++pattern;
112 
113 	/*
114 	 * A right bracket shall lose its special meaning and represent
115 	 * itself in a bracket expression if it occurs first in the list.
116 	 * -- POSIX.2 2.8.3.2
117 	 */
118 	ok = 0;
119 	c = *pattern++;
120 	do {
121 		if (c == '[') {
122 			switch (classmatch(pattern, test, &pattern)) {
123 			case RANGE_MATCH:
124 				ok = 1;
125 				continue;
126 			case RANGE_NOMATCH:
127 				continue;
128 			default:
129 				/* invalid character class, treat literally. */
130 				break;
131 			}
132 		}
133 		if (c == '\\')
134 			c = *pattern++;
135 		if (c == '\0')
136 			return RANGE_ERROR;
137 		/* patterns can not match on '/' */
138 		if (c == '/')
139 			return RANGE_NOMATCH;
140 		if (*pattern == '-'
141 		    && (c2 = *(pattern + 1)) != '\0' && c2 != ']') {
142 			pattern += 2;
143 			if (c2 == '\\')
144 				c2 = *pattern++;
145 			if (c2 == '\0')
146 				return RANGE_ERROR;
147 			if (c <= test && test <= c2)
148 				ok = 1;
149 		} else if (c == test)
150 			ok = 1;
151 	} while ((c = *pattern++) != ']');
152 
153 	*pp = pattern;
154 	return (ok == negate ? RANGE_NOMATCH : RANGE_MATCH);
155 }
156 
157 /*
158  * Single character match, advances pattern as much as needed.
159  * Return 0 on match and !0 (aka 1) on mismatch.
160  * When matched pp is advanced to the end of the pattern matched.
161  */
162 static int
163 matchchar(const char **pp, const char in)
164 {
165 	const char *pattern = *pp;
166 	char c;
167 	int rv = 0;
168 
169 	switch (c = *pattern++) {
170 	case '?':
171 		if (in == '\0')
172 			rv = 1;
173 		if (in == '/')
174 			rv = 1;
175 		break;
176 	case '[':
177 		if (in == '\0')
178 			rv = 1;
179 		if (in == '/')
180 			rv = 1;
181 		if (rv == 1)
182 			break;
183 
184 		switch (rangematch(&pattern, in)) {
185 		case RANGE_ERROR:
186 			/* not a good range, treat as normal text */
187 			goto normal;
188 		case RANGE_MATCH:
189 			break;
190 		case RANGE_NOMATCH:
191 			rv = 1;
192 		}
193 		break;
194 	case '\\':
195 		if ((c = *pattern++) == '\0') {
196 			c = '\\';
197 			--pattern;
198 		}
199 		/* FALLTHROUGH */
200 	default:
201 	normal:
202 		if (c != in)
203 			rv = 1;
204 		break;
205 	}
206 
207 	*pp = pattern;
208 	return rv;
209 }
210 
211 /*
212  * Do a substring match. If wild is set then the pattern started with a '*'.
213  * The match will go until '*', '/' or '\0' is encountered in pattern or
214  * the input string is consumed up to end.
215  * The pattern and string handles pp and ss are updated only on success.
216  */
217 static int
218 matchsub(const char **pp, const char **ss, const char *end, int wild)
219 {
220 	const char *pattern = *pp;
221 	const char *p = pattern;
222 	const char *string = *ss;
223 	size_t matchlen;
224 
225 	/* first calculate how many characters the submatch will consume */
226 	for (matchlen = 0; *p != '\0'; matchlen++) {
227 		if (p[0] == '*')
228 			break;
229 		/* '/' acts as barrier */
230 		if (p[0] == '/' || (p[0] == '\\' && p[1] == '/')) {
231 			if (wild) {
232 				/* match needs to match up to end of segment */
233 				if (string > end - matchlen)
234 					return 1;
235 				string = end - matchlen;
236 				wild = 0;
237 			}
238 			break;
239 		}
240 		/*
241 		 * skip forward one character in pattern by doing a
242 		 * dummy lookup.
243 		 */
244 		matchchar(&p, ' ');
245 	}
246 
247 	/* not enough char to match */
248 	if (string > end - matchlen)
249 		return 1;
250 
251 	if (*p == '\0') {
252 		if (wild) {
253 			/* match needs to match up to end of segment */
254 			string = end - matchlen;
255 			wild = 0;
256 		}
257 	}
258 
259 	while (*pattern != '\0' && *pattern != '*') {
260 		/* eat possible escape char before '/' */
261 		if (pattern[0] == '\\' && pattern[1] == '/')
262 			pattern++;
263 		if (pattern[0] == '/') {
264 			/* hit the barrier but we still have characters left */
265 			if (string < end)
266 				return 1;
267 			break;
268 		}
269 
270 		/* check if there are still characters available to compare */
271 		if (string >= end)
272 			return 1;
273 		/* Compare one char at a time. */
274 		if (!matchchar(&pattern, *string++))
275 			continue;
276 		if (wild) {
277 			/* skip forward one char and restart match */
278 			string = ++*ss;
279 			pattern = *pp;
280 			/* can it still match? */
281 			if (string > end - matchlen)
282 				return 1;
283 		} else {
284 			/* failed match */
285 			return 1;
286 		}
287 	}
288 
289 	*pp = pattern;
290 	*ss = string;
291 	return 0;
292 }
293 
294 /*
295  * File matching with the addition of the special '**'.
296  * Returns 0 on match and !0 for strings that do not match pattern.
297  */
298 int
299 rmatch(const char *pattern, const char *string, int leading_dir)
300 {
301 	const char *segend, *segnext, *mismatch = NULL;
302 	int wild, starstar;
303 
304 	while (*pattern && *string) {
305 
306 		/* handle leading '/' first */
307 		if (pattern[0] == '\\' && pattern[1] == '/')
308 			pattern++;
309 		if (*string == '/' && *pattern == '/') {
310 			string++;
311 			pattern++;
312 		}
313 
314 		/* match to the next '/' in string */
315 		segend = strchr(string, '/');
316 		if (segend == NULL)
317 			segend = strchr(string, '\0');
318 
319 		while (*pattern) {
320 			/*
321 			 * Check for '*' and '**'. For '*' reduce '*' and '?'
322 			 * sequences into n-'?' and trailing '*'.
323 			 * For '**' this optimisation can not be done
324 			 * since '**???/' will match 'a/aa/aaa/' but not
325 			 * 'a/aa/aa/' still additional '*' will be reduced.
326 			 */
327 			wild = 0;
328 			starstar = 0;
329 			for ( ; *pattern == '*' || *pattern == '?'; pattern++) {
330 				if (pattern[0] == '*') {
331 					if (pattern[1] == '*') {
332 						starstar = 1;
333 						pattern++;
334 					}
335 					wild = 1;
336 				} else if (!starstar) {	/* pattern[0] == '?' */
337 					if (string < segend && *string != '/')
338 						string++;
339 					else
340 						/* no match possible */
341 						return 1;
342 				} else
343 					break;
344 			}
345 
346 			/* pattern ends in '**' so it is a match */
347 			if (starstar && *pattern == '\0')
348 				return 0;
349 
350 			if (starstar) {
351 				segnext = segend;
352 				mismatch = pattern;
353 			}
354 
355 			while (string < segend) {
356 				if (matchsub(&pattern, &string, segend, wild)) {
357 failed_match:
358 					/*
359 					 * failed to match, if starstar retry
360 					 * with the next segment.
361 					 */
362 					if (mismatch) {
363 						pattern = mismatch;
364 						wild = 1;
365 						string = segnext;
366 						if (*string == '/')
367 							string++;
368 						segend = strchr(string, '/');
369 						if (!segend)
370 							segend = strchr(string,
371 							    '\0');
372 						segnext = segend;
373 						if (string < segend)
374 							continue;
375 					}
376 					/* no match possible */
377 					return 1;
378 				}
379 				break;
380 			}
381 
382 			/* at end of string segment, eat up any extra '*' */
383 			if (string >= segend && *pattern != '*')
384 				break;
385 		}
386 		if (*string != '\0' && *string != '/')
387 			goto failed_match;
388 		if (*pattern != '\0' && *pattern != '/')
389 			goto failed_match;
390 	}
391 
392 	/* if both pattern and string are consumed it was a match */
393 	if (*pattern == '\0' && *string == '\0')
394 		return 0;
395 	/* if leading_dir is set then string can also be '/' for success */
396 	if (leading_dir && *pattern == '\0' && *string == '/')
397 		return 0;
398 	/* else failure */
399 	return 1;
400 }
401