1 /*-
2  * Copyright (c) 2003-2007 Tim Kientzle
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer
10  *    in this position and unchanged.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
17  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
18  * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
19  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
20  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
21  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
22  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
24  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #include "archive_platform.h"
28 __FBSDID("$FreeBSD$");
29 
30 #ifdef HAVE_STRING_H
31 #include <string.h>
32 #endif
33 #ifdef HAVE_WCHAR_H
34 #include <wchar.h>
35 #endif
36 
37 #include "archive_pathmatch.h"
38 
39 /*
40  * Check whether a character 'c' is matched by a list specification [...]:
41  *    * Leading '!' or '^' negates the class.
42  *    * <char>-<char> is a range of characters
43  *    * \<char> removes any special meaning for <char>
44  *
45  * Some interesting boundary cases:
46  *   a-d-e is one range (a-d) followed by two single characters - and e.
47  *   \a-\d is same as a-d
48  *   a\-d is three single characters: a, d, -
49  *   Trailing - is not special (so [a-] is two characters a and -).
50  *   Initial - is not special ([a-] is same as [-a] is same as [\\-a])
51  *   This function never sees a trailing \.
52  *   [] always fails
53  *   [!] always succeeds
54  */
55 static int
56 pm_list(const char *start, const char *end, const char c, int flags)
57 {
58 	const char *p = start;
59 	char rangeStart = '\0', nextRangeStart;
60 	int match = 1, nomatch = 0;
61 
62 	/* This will be used soon... */
63 	(void)flags; /* UNUSED */
64 
65 	/* If this is a negated class, return success for nomatch. */
66 	if ((*p == '!' || *p == '^') && p < end) {
67 		match = 0;
68 		nomatch = 1;
69 		++p;
70 	}
71 
72 	while (p < end) {
73 		nextRangeStart = '\0';
74 		switch (*p) {
75 		case '-':
76 			/* Trailing or initial '-' is not special. */
77 			if ((rangeStart == '\0') || (p == end - 1)) {
78 				if (*p == c)
79 					return (match);
80 			} else {
81 				char rangeEnd = *++p;
82 				if (rangeEnd == '\\')
83 					rangeEnd = *++p;
84 				if ((rangeStart <= c) && (c <= rangeEnd))
85 					return (match);
86 			}
87 			break;
88 		case '\\':
89 			++p;
90 			/* Fall through */
91 		default:
92 			if (*p == c)
93 				return (match);
94 			nextRangeStart = *p; /* Possible start of range. */
95 		}
96 		rangeStart = nextRangeStart;
97 		++p;
98 	}
99 	return (nomatch);
100 }
101 
102 static int
103 pm_list_w(const wchar_t *start, const wchar_t *end, const wchar_t c, int flags)
104 {
105 	const wchar_t *p = start;
106 	wchar_t rangeStart = L'\0', nextRangeStart;
107 	int match = 1, nomatch = 0;
108 
109 	/* This will be used soon... */
110 	(void)flags; /* UNUSED */
111 
112 	/* If this is a negated class, return success for nomatch. */
113 	if ((*p == L'!' || *p == L'^') && p < end) {
114 		match = 0;
115 		nomatch = 1;
116 		++p;
117 	}
118 
119 	while (p < end) {
120 		nextRangeStart = L'\0';
121 		switch (*p) {
122 		case L'-':
123 			/* Trailing or initial '-' is not special. */
124 			if ((rangeStart == L'\0') || (p == end - 1)) {
125 				if (*p == c)
126 					return (match);
127 			} else {
128 				wchar_t rangeEnd = *++p;
129 				if (rangeEnd == L'\\')
130 					rangeEnd = *++p;
131 				if ((rangeStart <= c) && (c <= rangeEnd))
132 					return (match);
133 			}
134 			break;
135 		case L'\\':
136 			++p;
137 			/* Fall through */
138 		default:
139 			if (*p == c)
140 				return (match);
141 			nextRangeStart = *p; /* Possible start of range. */
142 		}
143 		rangeStart = nextRangeStart;
144 		++p;
145 	}
146 	return (nomatch);
147 }
148 
149 /*
150  * If s is pointing to "./", ".//", "./././" or the like, skip it.
151  */
152 static const char *
153 pm_slashskip(const char *s) {
154 	while ((*s == '/')
155 	    || (s[0] == '.' && s[1] == '/')
156 	    || (s[0] == '.' && s[1] == '\0'))
157 		++s;
158 	return (s);
159 }
160 
161 static const wchar_t *
162 pm_slashskip_w(const wchar_t *s) {
163 	while ((*s == L'/')
164 	    || (s[0] == L'.' && s[1] == L'/')
165 	    || (s[0] == L'.' && s[1] == L'\0'))
166 		++s;
167 	return (s);
168 }
169 
170 static int
171 pm(const char *p, const char *s, int flags)
172 {
173 	const char *end;
174 
175 	/*
176 	 * Ignore leading './', './/', '././', etc.
177 	 */
178 	if (s[0] == '.' && s[1] == '/')
179 		s = pm_slashskip(s + 1);
180 	if (p[0] == '.' && p[1] == '/')
181 		p = pm_slashskip(p + 1);
182 
183 	for (;;) {
184 		switch (*p) {
185 		case '\0':
186 			if (s[0] == '/') {
187 				if (flags & PATHMATCH_NO_ANCHOR_END)
188 					return (1);
189 				/* "dir" == "dir/" == "dir/." */
190 				s = pm_slashskip(s);
191 			}
192 			return (*s == '\0');
193 		case '?':
194 			/* ? always succeeds, unless we hit end of 's' */
195 			if (*s == '\0')
196 				return (0);
197 			break;
198 		case '*':
199 			/* "*" == "**" == "***" ... */
200 			while (*p == '*')
201 				++p;
202 			/* Trailing '*' always succeeds. */
203 			if (*p == '\0')
204 				return (1);
205 			while (*s) {
206 				if (archive_pathmatch(p, s, flags))
207 					return (1);
208 				++s;
209 			}
210 			return (0);
211 		case '[':
212 			/*
213 			 * Find the end of the [...] character class,
214 			 * ignoring \] that might occur within the class.
215 			 */
216 			end = p + 1;
217 			while (*end != '\0' && *end != ']') {
218 				if (*end == '\\' && end[1] != '\0')
219 					++end;
220 				++end;
221 			}
222 			if (*end == ']') {
223 				/* We found [...], try to match it. */
224 				if (!pm_list(p + 1, end, *s, flags))
225 					return (0);
226 				p = end; /* Jump to trailing ']' char. */
227 				break;
228 			} else
229 				/* No final ']', so just match '['. */
230 				if (*p != *s)
231 					return (0);
232 			break;
233 		case '\\':
234 			/* Trailing '\\' matches itself. */
235 			if (p[1] == '\0') {
236 				if (*s != '\\')
237 					return (0);
238 			} else {
239 				++p;
240 				if (*p != *s)
241 					return (0);
242 			}
243 			break;
244 		case '/':
245 			if (*s != '/' && *s != '\0')
246 				return (0);
247 			/* Note: pattern "/\./" won't match "/";
248 			 * pm_slashskip() correctly stops at backslash. */
249 			p = pm_slashskip(p);
250 			s = pm_slashskip(s);
251 			if (*p == '\0' && (flags & PATHMATCH_NO_ANCHOR_END))
252 				return (1);
253 			--p; /* Counteract the increment below. */
254 			--s;
255 			break;
256 		case '$':
257 			/* '$' is special only at end of pattern and only
258 			 * if PATHMATCH_NO_ANCHOR_END is specified. */
259 			if (p[1] == '\0' && (flags & PATHMATCH_NO_ANCHOR_END)){
260 				/* "dir" == "dir/" == "dir/." */
261 				return (*pm_slashskip(s) == '\0');
262 			}
263 			/* Otherwise, '$' is not special. */
264 			/* FALL THROUGH */
265 		default:
266 			if (*p != *s)
267 				return (0);
268 			break;
269 		}
270 		++p;
271 		++s;
272 	}
273 }
274 
275 static int
276 pm_w(const wchar_t *p, const wchar_t *s, int flags)
277 {
278 	const wchar_t *end;
279 
280 	/*
281 	 * Ignore leading './', './/', '././', etc.
282 	 */
283 	if (s[0] == L'.' && s[1] == L'/')
284 		s = pm_slashskip_w(s + 1);
285 	if (p[0] == L'.' && p[1] == L'/')
286 		p = pm_slashskip_w(p + 1);
287 
288 	for (;;) {
289 		switch (*p) {
290 		case L'\0':
291 			if (s[0] == L'/') {
292 				if (flags & PATHMATCH_NO_ANCHOR_END)
293 					return (1);
294 				/* "dir" == "dir/" == "dir/." */
295 				s = pm_slashskip_w(s);
296 			}
297 			return (*s == L'\0');
298 		case L'?':
299 			/* ? always succeeds, unless we hit end of 's' */
300 			if (*s == L'\0')
301 				return (0);
302 			break;
303 		case L'*':
304 			/* "*" == "**" == "***" ... */
305 			while (*p == L'*')
306 				++p;
307 			/* Trailing '*' always succeeds. */
308 			if (*p == L'\0')
309 				return (1);
310 			while (*s) {
311 				if (archive_pathmatch_w(p, s, flags))
312 					return (1);
313 				++s;
314 			}
315 			return (0);
316 		case L'[':
317 			/*
318 			 * Find the end of the [...] character class,
319 			 * ignoring \] that might occur within the class.
320 			 */
321 			end = p + 1;
322 			while (*end != L'\0' && *end != L']') {
323 				if (*end == L'\\' && end[1] != L'\0')
324 					++end;
325 				++end;
326 			}
327 			if (*end == L']') {
328 				/* We found [...], try to match it. */
329 				if (!pm_list_w(p + 1, end, *s, flags))
330 					return (0);
331 				p = end; /* Jump to trailing ']' char. */
332 				break;
333 			} else
334 				/* No final ']', so just match '['. */
335 				if (*p != *s)
336 					return (0);
337 			break;
338 		case L'\\':
339 			/* Trailing '\\' matches itself. */
340 			if (p[1] == L'\0') {
341 				if (*s != L'\\')
342 					return (0);
343 			} else {
344 				++p;
345 				if (*p != *s)
346 					return (0);
347 			}
348 			break;
349 		case L'/':
350 			if (*s != L'/' && *s != L'\0')
351 				return (0);
352 			/* Note: pattern "/\./" won't match "/";
353 			 * pm_slashskip() correctly stops at backslash. */
354 			p = pm_slashskip_w(p);
355 			s = pm_slashskip_w(s);
356 			if (*p == L'\0' && (flags & PATHMATCH_NO_ANCHOR_END))
357 				return (1);
358 			--p; /* Counteract the increment below. */
359 			--s;
360 			break;
361 		case L'$':
362 			/* '$' is special only at end of pattern and only
363 			 * if PATHMATCH_NO_ANCHOR_END is specified. */
364 			if (p[1] == L'\0' && (flags & PATHMATCH_NO_ANCHOR_END)){
365 				/* "dir" == "dir/" == "dir/." */
366 				return (*pm_slashskip_w(s) == L'\0');
367 			}
368 			/* Otherwise, '$' is not special. */
369 			/* FALL THROUGH */
370 		default:
371 			if (*p != *s)
372 				return (0);
373 			break;
374 		}
375 		++p;
376 		++s;
377 	}
378 }
379 
380 /* Main entry point. */
381 int
382 __archive_pathmatch(const char *p, const char *s, int flags)
383 {
384 	/* Empty pattern only matches the empty string. */
385 	if (p == NULL || *p == '\0')
386 		return (s == NULL || *s == '\0');
387 	else if (s == NULL)
388 		return (0);
389 
390 	/* Leading '^' anchors the start of the pattern. */
391 	if (*p == '^') {
392 		++p;
393 		flags &= ~PATHMATCH_NO_ANCHOR_START;
394 	}
395 
396 	if (*p == '/' && *s != '/')
397 		return (0);
398 
399 	/* Certain patterns anchor implicitly. */
400 	if (*p == '*' || *p == '/') {
401 		while (*p == '/')
402 			++p;
403 		while (*s == '/')
404 			++s;
405 		return (pm(p, s, flags));
406 	}
407 
408 	/* If start is unanchored, try to match start of each path element. */
409 	if (flags & PATHMATCH_NO_ANCHOR_START) {
410 		for ( ; s != NULL; s = strchr(s, '/')) {
411 			if (*s == '/')
412 				s++;
413 			if (pm(p, s, flags))
414 				return (1);
415 		}
416 		return (0);
417 	}
418 
419 	/* Default: Match from beginning. */
420 	return (pm(p, s, flags));
421 }
422 
423 int
424 __archive_pathmatch_w(const wchar_t *p, const wchar_t *s, int flags)
425 {
426 	/* Empty pattern only matches the empty string. */
427 	if (p == NULL || *p == L'\0')
428 		return (s == NULL || *s == L'\0');
429 	else if (s == NULL)
430 		return (0);
431 
432 	/* Leading '^' anchors the start of the pattern. */
433 	if (*p == L'^') {
434 		++p;
435 		flags &= ~PATHMATCH_NO_ANCHOR_START;
436 	}
437 
438 	if (*p == L'/' && *s != L'/')
439 		return (0);
440 
441 	/* Certain patterns anchor implicitly. */
442 	if (*p == L'*' || *p == L'/') {
443 		while (*p == L'/')
444 			++p;
445 		while (*s == L'/')
446 			++s;
447 		return (pm_w(p, s, flags));
448 	}
449 
450 	/* If start is unanchored, try to match start of each path element. */
451 	if (flags & PATHMATCH_NO_ANCHOR_START) {
452 		for ( ; s != NULL; s = wcschr(s, L'/')) {
453 			if (*s == L'/')
454 				s++;
455 			if (pm_w(p, s, flags))
456 				return (1);
457 		}
458 		return (0);
459 	}
460 
461 	/* Default: Match from beginning. */
462 	return (pm_w(p, s, flags));
463 }
464