1 /*-
2  * Copyright (c) 2003-2007 Tim Kientzle
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer
10  *    in this position and unchanged.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18  * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #include "archive_platform.h"
28 
29 #ifdef HAVE_STRING_H
30 #include <string.h>
31 #endif
32 #ifdef HAVE_WCHAR_H
33 #include <wchar.h>
34 #endif
35 
36 #include "archive_pathmatch.h"
37 
38 /*
39  * Check whether a character 'c' is matched by a list specification [...]:
40  *    * Leading '!' or '^' negates the class.
41  *    * <char>-<char> is a range of characters
42  *    * \<char> removes any special meaning for <char>
43  *
44  * Some interesting boundary cases:
45  *   a-d-e is one range (a-d) followed by two single characters - and e.
46  *   \a-\d is same as a-d
47  *   a\-d is three single characters: a, d, -
48  *   Trailing - is not special (so [a-] is two characters a and -).
49  *   Initial - is not special ([a-] is same as [-a] is same as [\\-a])
50  *   This function never sees a trailing \.
51  *   [] always fails
52  *   [!] always succeeds
53  */
54 static int
pm_list(const char * start,const char * end,const char c,int flags)55 pm_list(const char *start, const char *end, const char c, int flags)
56 {
57 	const char *p = start;
58 	char rangeStart = '\0', nextRangeStart;
59 	int match = 1, nomatch = 0;
60 
61 	/* This will be used soon... */
62 	(void)flags; /* UNUSED */
63 
64 	/* If this is a negated class, return success for nomatch. */
65 	if ((*p == '!' || *p == '^') && p < end) {
66 		match = 0;
67 		nomatch = 1;
68 		++p;
69 	}
70 
71 	while (p < end) {
72 		nextRangeStart = '\0';
73 		switch (*p) {
74 		case '-':
75 			/* Trailing or initial '-' is not special. */
76 			if ((rangeStart == '\0') || (p == end - 1)) {
77 				if (*p == c)
78 					return (match);
79 			} else {
80 				char rangeEnd = *++p;
81 				if (rangeEnd == '\\')
82 					rangeEnd = *++p;
83 				if ((rangeStart <= c) && (c <= rangeEnd))
84 					return (match);
85 			}
86 			break;
87 		case '\\':
88 			++p;
89 			/* Fall through */
90 		default:
91 			if (*p == c)
92 				return (match);
93 			nextRangeStart = *p; /* Possible start of range. */
94 		}
95 		rangeStart = nextRangeStart;
96 		++p;
97 	}
98 	return (nomatch);
99 }
100 
101 static int
pm_list_w(const wchar_t * start,const wchar_t * end,const wchar_t c,int flags)102 pm_list_w(const wchar_t *start, const wchar_t *end, const wchar_t c, int flags)
103 {
104 	const wchar_t *p = start;
105 	wchar_t rangeStart = L'\0', nextRangeStart;
106 	int match = 1, nomatch = 0;
107 
108 	/* This will be used soon... */
109 	(void)flags; /* UNUSED */
110 
111 	/* If this is a negated class, return success for nomatch. */
112 	if ((*p == L'!' || *p == L'^') && p < end) {
113 		match = 0;
114 		nomatch = 1;
115 		++p;
116 	}
117 
118 	while (p < end) {
119 		nextRangeStart = L'\0';
120 		switch (*p) {
121 		case L'-':
122 			/* Trailing or initial '-' is not special. */
123 			if ((rangeStart == L'\0') || (p == end - 1)) {
124 				if (*p == c)
125 					return (match);
126 			} else {
127 				wchar_t rangeEnd = *++p;
128 				if (rangeEnd == L'\\')
129 					rangeEnd = *++p;
130 				if ((rangeStart <= c) && (c <= rangeEnd))
131 					return (match);
132 			}
133 			break;
134 		case L'\\':
135 			++p;
136 			/* Fall through */
137 		default:
138 			if (*p == c)
139 				return (match);
140 			nextRangeStart = *p; /* Possible start of range. */
141 		}
142 		rangeStart = nextRangeStart;
143 		++p;
144 	}
145 	return (nomatch);
146 }
147 
148 /*
149  * If s is pointing to "./", ".//", "./././" or the like, skip it.
150  */
151 static const char *
pm_slashskip(const char * s)152 pm_slashskip(const char *s) {
153 	while ((*s == '/')
154 	    || (s[0] == '.' && s[1] == '/')
155 	    || (s[0] == '.' && s[1] == '\0'))
156 		++s;
157 	return (s);
158 }
159 
160 static const wchar_t *
pm_slashskip_w(const wchar_t * s)161 pm_slashskip_w(const wchar_t *s) {
162 	while ((*s == L'/')
163 	    || (s[0] == L'.' && s[1] == L'/')
164 	    || (s[0] == L'.' && s[1] == L'\0'))
165 		++s;
166 	return (s);
167 }
168 
169 static int
pm(const char * p,const char * s,int flags)170 pm(const char *p, const char *s, int flags)
171 {
172 	const char *end;
173 
174 	/*
175 	 * Ignore leading './', './/', '././', etc.
176 	 */
177 	if (s[0] == '.' && s[1] == '/')
178 		s = pm_slashskip(s + 1);
179 	if (p[0] == '.' && p[1] == '/')
180 		p = pm_slashskip(p + 1);
181 
182 	for (;;) {
183 		switch (*p) {
184 		case '\0':
185 			if (s[0] == '/') {
186 				if (flags & PATHMATCH_NO_ANCHOR_END)
187 					return (1);
188 				/* "dir" == "dir/" == "dir/." */
189 				s = pm_slashskip(s);
190 			}
191 			return (*s == '\0');
192 		case '?':
193 			/* ? always succeeds, unless we hit end of 's' */
194 			if (*s == '\0')
195 				return (0);
196 			break;
197 		case '*':
198 			/* "*" == "**" == "***" ... */
199 			while (*p == '*')
200 				++p;
201 			/* Trailing '*' always succeeds. */
202 			if (*p == '\0')
203 				return (1);
204 			while (*s) {
205 				if (archive_pathmatch(p, s, flags))
206 					return (1);
207 				++s;
208 			}
209 			return (0);
210 		case '[':
211 			/*
212 			 * Find the end of the [...] character class,
213 			 * ignoring \] that might occur within the class.
214 			 */
215 			end = p + 1;
216 			while (*end != '\0' && *end != ']') {
217 				if (*end == '\\' && end[1] != '\0')
218 					++end;
219 				++end;
220 			}
221 			if (*end == ']') {
222 				/* We found [...], try to match it. */
223 				if (!pm_list(p + 1, end, *s, flags))
224 					return (0);
225 				p = end; /* Jump to trailing ']' char. */
226 				break;
227 			} else
228 				/* No final ']', so just match '['. */
229 				if (*p != *s)
230 					return (0);
231 			break;
232 		case '\\':
233 			/* Trailing '\\' matches itself. */
234 			if (p[1] == '\0') {
235 				if (*s != '\\')
236 					return (0);
237 			} else {
238 				++p;
239 				if (*p != *s)
240 					return (0);
241 			}
242 			break;
243 		case '/':
244 			if (*s != '/' && *s != '\0')
245 				return (0);
246 			/* Note: pattern "/\./" won't match "/";
247 			 * pm_slashskip() correctly stops at backslash. */
248 			p = pm_slashskip(p);
249 			s = pm_slashskip(s);
250 			if (*p == '\0' && (flags & PATHMATCH_NO_ANCHOR_END))
251 				return (1);
252 			--p; /* Counteract the increment below. */
253 			--s;
254 			break;
255 		case '$':
256 			/* '$' is special only at end of pattern and only
257 			 * if PATHMATCH_NO_ANCHOR_END is specified. */
258 			if (p[1] == '\0' && (flags & PATHMATCH_NO_ANCHOR_END)){
259 				/* "dir" == "dir/" == "dir/." */
260 				return (*pm_slashskip(s) == '\0');
261 			}
262 			/* Otherwise, '$' is not special. */
263 			/* FALL THROUGH */
264 		default:
265 			if (*p != *s)
266 				return (0);
267 			break;
268 		}
269 		++p;
270 		++s;
271 	}
272 }
273 
274 static int
pm_w(const wchar_t * p,const wchar_t * s,int flags)275 pm_w(const wchar_t *p, const wchar_t *s, int flags)
276 {
277 	const wchar_t *end;
278 
279 	/*
280 	 * Ignore leading './', './/', '././', etc.
281 	 */
282 	if (s[0] == L'.' && s[1] == L'/')
283 		s = pm_slashskip_w(s + 1);
284 	if (p[0] == L'.' && p[1] == L'/')
285 		p = pm_slashskip_w(p + 1);
286 
287 	for (;;) {
288 		switch (*p) {
289 		case L'\0':
290 			if (s[0] == L'/') {
291 				if (flags & PATHMATCH_NO_ANCHOR_END)
292 					return (1);
293 				/* "dir" == "dir/" == "dir/." */
294 				s = pm_slashskip_w(s);
295 			}
296 			return (*s == L'\0');
297 		case L'?':
298 			/* ? always succeeds, unless we hit end of 's' */
299 			if (*s == L'\0')
300 				return (0);
301 			break;
302 		case L'*':
303 			/* "*" == "**" == "***" ... */
304 			while (*p == L'*')
305 				++p;
306 			/* Trailing '*' always succeeds. */
307 			if (*p == L'\0')
308 				return (1);
309 			while (*s) {
310 				if (archive_pathmatch_w(p, s, flags))
311 					return (1);
312 				++s;
313 			}
314 			return (0);
315 		case L'[':
316 			/*
317 			 * Find the end of the [...] character class,
318 			 * ignoring \] that might occur within the class.
319 			 */
320 			end = p + 1;
321 			while (*end != L'\0' && *end != L']') {
322 				if (*end == L'\\' && end[1] != L'\0')
323 					++end;
324 				++end;
325 			}
326 			if (*end == L']') {
327 				/* We found [...], try to match it. */
328 				if (!pm_list_w(p + 1, end, *s, flags))
329 					return (0);
330 				p = end; /* Jump to trailing ']' char. */
331 				break;
332 			} else
333 				/* No final ']', so just match '['. */
334 				if (*p != *s)
335 					return (0);
336 			break;
337 		case L'\\':
338 			/* Trailing '\\' matches itself. */
339 			if (p[1] == L'\0') {
340 				if (*s != L'\\')
341 					return (0);
342 			} else {
343 				++p;
344 				if (*p != *s)
345 					return (0);
346 			}
347 			break;
348 		case L'/':
349 			if (*s != L'/' && *s != L'\0')
350 				return (0);
351 			/* Note: pattern "/\./" won't match "/";
352 			 * pm_slashskip() correctly stops at backslash. */
353 			p = pm_slashskip_w(p);
354 			s = pm_slashskip_w(s);
355 			if (*p == L'\0' && (flags & PATHMATCH_NO_ANCHOR_END))
356 				return (1);
357 			--p; /* Counteract the increment below. */
358 			--s;
359 			break;
360 		case L'$':
361 			/* '$' is special only at end of pattern and only
362 			 * if PATHMATCH_NO_ANCHOR_END is specified. */
363 			if (p[1] == L'\0' && (flags & PATHMATCH_NO_ANCHOR_END)){
364 				/* "dir" == "dir/" == "dir/." */
365 				return (*pm_slashskip_w(s) == L'\0');
366 			}
367 			/* Otherwise, '$' is not special. */
368 			/* FALL THROUGH */
369 		default:
370 			if (*p != *s)
371 				return (0);
372 			break;
373 		}
374 		++p;
375 		++s;
376 	}
377 }
378 
379 /* Main entry point. */
380 int
__archive_pathmatch(const char * p,const char * s,int flags)381 __archive_pathmatch(const char *p, const char *s, int flags)
382 {
383 	/* Empty pattern only matches the empty string. */
384 	if (p == NULL || *p == '\0')
385 		return (s == NULL || *s == '\0');
386 	else if (s == NULL)
387 		return (0);
388 
389 	/* Leading '^' anchors the start of the pattern. */
390 	if (*p == '^') {
391 		++p;
392 		flags &= ~PATHMATCH_NO_ANCHOR_START;
393 	}
394 
395 	if (*p == '/' && *s != '/')
396 		return (0);
397 
398 	/* Certain patterns anchor implicitly. */
399 	if (*p == '*' || *p == '/') {
400 		while (*p == '/')
401 			++p;
402 		while (*s == '/')
403 			++s;
404 		return (pm(p, s, flags));
405 	}
406 
407 	/* If start is unanchored, try to match start of each path element. */
408 	if (flags & PATHMATCH_NO_ANCHOR_START) {
409 		for ( ; s != NULL; s = strchr(s, '/')) {
410 			if (*s == '/')
411 				s++;
412 			if (pm(p, s, flags))
413 				return (1);
414 		}
415 		return (0);
416 	}
417 
418 	/* Default: Match from beginning. */
419 	return (pm(p, s, flags));
420 }
421 
422 int
__archive_pathmatch_w(const wchar_t * p,const wchar_t * s,int flags)423 __archive_pathmatch_w(const wchar_t *p, const wchar_t *s, int flags)
424 {
425 	/* Empty pattern only matches the empty string. */
426 	if (p == NULL || *p == L'\0')
427 		return (s == NULL || *s == L'\0');
428 	else if (s == NULL)
429 		return (0);
430 
431 	/* Leading '^' anchors the start of the pattern. */
432 	if (*p == L'^') {
433 		++p;
434 		flags &= ~PATHMATCH_NO_ANCHOR_START;
435 	}
436 
437 	if (*p == L'/' && *s != L'/')
438 		return (0);
439 
440 	/* Certain patterns anchor implicitly. */
441 	if (*p == L'*' || *p == L'/') {
442 		while (*p == L'/')
443 			++p;
444 		while (*s == L'/')
445 			++s;
446 		return (pm_w(p, s, flags));
447 	}
448 
449 	/* If start is unanchored, try to match start of each path element. */
450 	if (flags & PATHMATCH_NO_ANCHOR_START) {
451 		for ( ; s != NULL; s = wcschr(s, L'/')) {
452 			if (*s == L'/')
453 				s++;
454 			if (pm_w(p, s, flags))
455 				return (1);
456 		}
457 		return (0);
458 	}
459 
460 	/* Default: Match from beginning. */
461 	return (pm_w(p, s, flags));
462 }
463