xref: /openbsd/usr.bin/rsync/rmatch.c (revision 5a38ef86)
1 /*	$OpenBSD: rmatch.c,v 1.2 2021/08/29 15:37:58 claudio Exp $	*/
2 
3 /*
4  * Copyright (c) 2021 Claudio Jeker <claudio@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 /*
20  * Copyright (c) 1989, 1993, 1994
21  *	The Regents of the University of California.  All rights reserved.
22  *
23  * This code is derived from software contributed to Berkeley by
24  * Guido van Rossum.
25  *
26  * Redistribution and use in source and binary forms, with or without
27  * modification, are permitted provided that the following conditions
28  * are met:
29  * 1. Redistributions of source code must retain the above copyright
30  *    notice, this list of conditions and the following disclaimer.
31  * 2. Redistributions in binary form must reproduce the above copyright
32  *    notice, this list of conditions and the following disclaimer in the
33  *    documentation and/or other materials provided with the distribution.
34  * 3. Neither the name of the University nor the names of its contributors
35  *    may be used to endorse or promote products derived from this software
36  *    without specific prior written permission.
37  *
38  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
39  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
40  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
41  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
42  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
43  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
44  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
45  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
46  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
47  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
48  * SUCH DAMAGE.
49  */
50 
51 #include <ctype.h>
52 #include <stdio.h>
53 #include <string.h>
54 #include <limits.h>
55 
56 #include "charclass.h"
57 #include "extern.h"
58 
59 #define	RANGE_MATCH	1
60 #define	RANGE_NOMATCH	0
61 #define	RANGE_ERROR	(-1)
62 
63 static int
64 classmatch(const char *pattern, char test, const char **ep)
65 {
66 	const char *mismatch = pattern;
67 	const struct cclass *cc;
68 	const char *colon;
69 	size_t len;
70 	int rval = RANGE_NOMATCH;
71 
72 	if (*pattern++ != ':') {
73 		*ep = mismatch;
74 		return RANGE_ERROR;
75 	}
76 	if ((colon = strchr(pattern, ':')) == NULL || colon[1] != ']') {
77 		*ep = mismatch;
78 		return RANGE_ERROR;
79 	}
80 	*ep = colon + 2;
81 	len = (size_t)(colon - pattern);
82 
83 	for (cc = cclasses; cc->name != NULL; cc++) {
84 		if (!strncmp(pattern, cc->name, len) && cc->name[len] == '\0') {
85 			if (cc->isctype((unsigned char)test))
86 				rval = RANGE_MATCH;
87 			return rval;
88 		}
89 	}
90 
91 	/* invalid character class, treat as normal text */
92 	*ep = mismatch;
93 	return RANGE_ERROR;
94 }
95 
96 static int
97 rangematch(const char **pp, char test)
98 {
99 	const char *pattern = *pp;
100 	int negate, ok;
101 	char c, c2;
102 
103 	/*
104 	 * A bracket expression starting with an unquoted circumflex
105 	 * character produces unspecified results (IEEE 1003.2-1992,
106 	 * 3.13.2).  This implementation treats it like '!', for
107 	 * consistency with the regular expression syntax.
108 	 * J.T. Conklin (conklin@ngai.kaleida.com)
109 	 */
110 	if ((negate = (*pattern == '!' || *pattern == '^')))
111 		++pattern;
112 
113 	/*
114 	 * A right bracket shall lose its special meaning and represent
115 	 * itself in a bracket expression if it occurs first in the list.
116 	 * -- POSIX.2 2.8.3.2
117 	 */
118 	ok = 0;
119 	c = *pattern++;
120 	do {
121 		if (c == '[') {
122 			switch (classmatch(pattern, test, &pattern)) {
123 			case RANGE_MATCH:
124 				ok = 1;
125 				continue;
126 			case RANGE_NOMATCH:
127 				continue;
128 			default:
129 				/* invalid character class, treat litterally. */
130 				break;
131 			}
132 		}
133 		if (c == '\\')
134 			c = *pattern++;
135 		if (c == '\0')
136 			return RANGE_ERROR;
137 		/* patterns can not match on '/' */
138 		if (c == '/')
139 			return RANGE_NOMATCH;
140 		if (*pattern == '-'
141 		    && (c2 = *(pattern + 1)) != '\0' && c2 != ']') {
142 			pattern += 2;
143 			if (c2 == '\\')
144 				c2 = *pattern++;
145 			if (c2 == '\0')
146 				return RANGE_ERROR;
147 			if (c <= test && test <= c2)
148 				ok = 1;
149 		} else if (c == test)
150 			ok = 1;
151 	} while ((c = *pattern++) != ']');
152 
153 	*pp = pattern;
154 	return (ok == negate ? RANGE_NOMATCH : RANGE_MATCH);
155 }
156 
157 /*
158  * Single character match, advances pattern as much as needed.
159  * Return 0 on match and !0 (aka 1) on missmatch.
160  * When matched pp is advanced to the end of the pattern matched.
161  */
162 static int
163 matchchar(const char **pp, const char in)
164 {
165 	const char *pattern = *pp;
166 	char c;
167 	int rv = 0;
168 
169 	switch (c = *pattern++) {
170 	case '?':
171 		if (in == '\0')
172 			rv = 1;
173 		if (in == '/')
174 			rv = 1;
175 		break;
176 	case '[':
177 		if (in == '\0')
178 			rv = 1;
179 		if (in == '/')
180 			rv = 1;
181 		if (rv == 1)
182 			break;
183 
184 		switch (rangematch(&pattern, in)) {
185 		case RANGE_ERROR:
186 			/* not a good range, treat as normal text */
187 			goto normal;
188 		case RANGE_MATCH:
189 			break;
190 		case RANGE_NOMATCH:
191 			rv = 1;
192 		}
193 		break;
194 	case '\\':
195 		if ((c = *pattern++) == '\0') {
196 			c = '\\';
197 			--pattern;
198 		}
199 		/* FALLTHROUGH */
200 	default:
201 	normal:
202 		if (c != in)
203 			rv = 1;
204 		break;
205 	}
206 
207 	*pp = pattern;
208 	return rv;
209 }
210 
211 /*
212  * Do a substring match. If wild is set then the pattern started with a '*'.
213  * The match will go until '*', '/' or '\0' is encountered in pattern or
214  * the input string is consumed up to end.
215  * The pattern and string handles pp and ss are updated only on success.
216  */
217 static int
218 matchsub(const char **pp, const char **ss, const char *end, int wild)
219 {
220 	const char *pattern = *pp;
221 	const char *p = pattern;
222 	const char *string = *ss;
223 	size_t matchlen;
224 
225 	/* first calculate how many characters the submatch will consume */
226 	for (matchlen = 0; *p != '\0'; matchlen++) {
227 		if (p[0] == '*')
228 			break;
229 		/* '/' acts as barrier */
230 		if (p[0] == '/' || (p[0] == '\\' && p[1] == '/')) {
231 			if (wild) {
232 				/* match needs to match up to end of segment */
233 				if (string > end - matchlen)
234 					return 1;
235 				string = end - matchlen;
236 				wild = 0;
237 			}
238 			break;
239 		}
240 		/*
241 		 * skip forward one character in pattern by doing a
242 		 * dummy lookup.
243 		 */
244 		matchchar(&p, ' ');
245 	}
246 
247 	/* not enough char to match */
248 	if (string > end - matchlen)
249 		return 1;
250 
251 	if (*p == '\0') {
252 		if (wild) {
253 			/* match needs to match up to end of segment */
254 			string = end - matchlen;
255 			wild = 0;
256 		}
257 	}
258 
259 	while (*pattern != '\0' && *pattern != '*') {
260 		/* eat possible escape char before '/' */
261 		if (pattern[0] == '\\' && pattern[1] == '/')
262 			pattern++;
263 		if (pattern[0] == '/')
264 			break;
265 
266 		/* check if there are still characters available to compare */
267 		if (string >= end)
268 			return 1;
269 		/* Compare one char at a time. */
270 		if (!matchchar(&pattern, *string++))
271 			continue;
272 		if (wild) {
273 			/* skip forward one char and restart match */
274 			string = ++*ss;
275 			pattern = *pp;
276 			/* can it still match? */
277 			if (string > end - matchlen)
278 				return 1;
279 		} else {
280 			/* failed match */
281 			return 1;
282 		}
283 	}
284 
285 	*pp = pattern;
286 	*ss = string;
287 	return 0;
288 }
289 
290 /*
291  * File matching with the addition of the special '**'.
292  * Returns 0 on match and !0 for strings that do not match pattern.
293  */
294 int
295 rmatch(const char *pattern, const char *string, int leading_dir)
296 {
297 	const char *segend, *segnext, *mismatch = NULL;
298 	int wild, starstar;
299 
300 	while (*pattern && *string) {
301 
302 		/* handle leading '/' first */
303 		if (pattern[0] == '\\' && pattern[1] == '/')
304 			pattern++;
305 		if (*string == '/' && *pattern == '/') {
306 			string++;
307 			pattern++;
308 		}
309 
310 		/* match to the next '/' in string */
311 		segend = strchr(string, '/');
312 		if (segend == NULL)
313 			segend = strchr(string, '\0');
314 
315 		while (*pattern) {
316 			/*
317 			 * Check for '*' and '**'. For '*' reduce '*' and '?'
318 			 * sequences into n-'?' and trailing '*'.
319 			 * For '**' this optimisation can not be done
320 			 * since '**???/' will match 'a/aa/aaa/' but not
321 			 * 'a/aa/aa/' still additional '*' will be reduced.
322 			 */
323 			wild = 0;
324 			starstar = 0;
325 			for ( ; *pattern == '*' || *pattern == '?'; pattern++) {
326 				if (pattern[0] == '*') {
327 					if (pattern[1] == '*') {
328 						starstar = 1;
329 						pattern++;
330 					}
331 					wild = 1;
332 				} else if (!starstar) {	/* pattern[0] == '?' */
333 					if (string < segend && *string != '/')
334 						string++;
335 					else
336 						/* no match possible */
337 						return 1;
338 				} else
339 					break;
340 			}
341 
342 			/* pattern ends in '**' so it is a match */
343 			if (starstar && *pattern == '\0')
344 				return 0;
345 
346 			if (starstar) {
347 				segnext = segend;
348 				mismatch = pattern;
349 			}
350 
351 			while (string < segend) {
352 				if (matchsub(&pattern, &string, segend, wild)) {
353 failed_match:
354 					/*
355 					 * failed to match, if starstar retry
356 					 * with the next segment.
357 					 */
358 					if (mismatch) {
359 						pattern = mismatch;
360 						wild = 1;
361 						string = segnext;
362 						if (*string == '/')
363 							string++;
364 						segend = strchr(string, '/');
365 						if (!segend)
366 							segend = strchr(string,
367 							    '\0');
368 						segnext = segend;
369 						if (string < segend)
370 							continue;
371 					}
372 					/* no match possible */
373 					return 1;
374 				}
375 				break;
376 			}
377 
378 			/* at end of string segment, eat up any extra '*' */
379 			if (string >= segend && *pattern != '*')
380 				break;
381 		}
382 		if (*string != '\0' && *string != '/')
383 			goto failed_match;
384 		if (*pattern != '\0' && *pattern != '/')
385 			goto failed_match;
386 	}
387 
388 	/* if both pattern and string are consumed it was a match */
389 	if (*pattern == '\0' && *string == '\0')
390 		return 0;
391 	/* if leading_dir is set then string can also be '/' for success */
392 	if (leading_dir && *pattern == '\0' && *string == '/')
393 		return 0;
394 	/* else failure */
395 	return 1;
396 }
397