1 /*
2  * scan.c - Helper routines for directory tree scans
3  */
4 
5 /*
6  * Copyright (C) 2013-2017 Eric Biggers
7  *
8  * This file is free software; you can redistribute it and/or modify it under
9  * the terms of the GNU Lesser General Public License as published by the Free
10  * Software Foundation; either version 3 of the License, or (at your option) any
11  * later version.
12  *
13  * This file is distributed in the hope that it will be useful, but WITHOUT
14  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
15  * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
16  * details.
17  *
18  * You should have received a copy of the GNU Lesser General Public License
19  * along with this file; if not, see http://www.gnu.org/licenses/.
20  */
21 
22 #ifdef HAVE_CONFIG_H
23 #  include "config.h"
24 #endif
25 
26 #include <string.h>
27 
28 #include "wimlib/blob_table.h"
29 #include "wimlib/dentry.h"
30 #include "wimlib/error.h"
31 #include "wimlib/paths.h"
32 #include "wimlib/pattern.h"
33 #include "wimlib/progress.h"
34 #include "wimlib/scan.h"
35 #include "wimlib/textfile.h"
36 
37 /*
38  * Tally a file (or directory) that has been scanned for a capture operation,
39  * and possibly call the progress function provided by the library user.
40  *
41  * @params
42  *	Current path, flags, optional progress function, and progress data for
43  *	the scan operation.
44  * @status
45  *	Status of the scanned file.
46  * @inode
47  *	If @status is WIMLIB_SCAN_DENTRY_OK, this is a pointer to the WIM inode
48  *	that has been created for the scanned file.  The first time the file is
49  *	seen, inode->i_nlink will be 1.  On subsequent visits of the same inode
50  *	via additional hard links, inode->i_nlink will be greater than 1.
51  */
52 int
do_scan_progress(struct scan_params * params,int status,const struct wim_inode * inode)53 do_scan_progress(struct scan_params *params, int status,
54 		 const struct wim_inode *inode)
55 {
56 	int ret;
57 	tchar *cookie;
58 
59 	switch (status) {
60 	case WIMLIB_SCAN_DENTRY_OK:
61 		if (!(params->add_flags & WIMLIB_ADD_FLAG_VERBOSE))
62 			return 0;
63 		break;
64 	case WIMLIB_SCAN_DENTRY_UNSUPPORTED:
65 	case WIMLIB_SCAN_DENTRY_EXCLUDED:
66 	case WIMLIB_SCAN_DENTRY_FIXED_SYMLINK:
67 	case WIMLIB_SCAN_DENTRY_NOT_FIXED_SYMLINK:
68 		if (!(params->add_flags & WIMLIB_ADD_FLAG_EXCLUDE_VERBOSE))
69 			return 0;
70 		break;
71 	}
72 	params->progress.scan.cur_path = params->cur_path;
73 	params->progress.scan.status = status;
74 	if (status == WIMLIB_SCAN_DENTRY_OK) {
75 
76 		/* The first time the inode is seen, tally all its streams.  */
77 		if (inode->i_nlink == 1) {
78 			for (unsigned i = 0; i < inode->i_num_streams; i++) {
79 				const struct blob_descriptor *blob =
80 					stream_blob_resolved(&inode->i_streams[i]);
81 				if (blob)
82 					params->progress.scan.num_bytes_scanned += blob->size;
83 			}
84 		}
85 
86 		/* Tally the file itself, counting every hard link.  It's
87 		 * debatable whether every link should be counted, but counting
88 		 * every link makes the statistics consistent with the ones
89 		 * placed in the FILECOUNT and DIRCOUNT elements of the WIM
90 		 * file's XML document.  It also avoids possible user confusion
91 		 * if the number of files reported were to be lower than that
92 		 * displayed by some other software such as file browsers.  */
93 		if (inode_is_directory(inode))
94 			params->progress.scan.num_dirs_scanned++;
95 		else
96 			params->progress.scan.num_nondirs_scanned++;
97 	}
98 
99 	/* Call the user-provided progress function.  */
100 
101 	cookie = progress_get_win32_path(params->progress.scan.cur_path);
102 	ret = call_progress(params->progfunc, WIMLIB_PROGRESS_MSG_SCAN_DENTRY,
103 			     &params->progress, params->progctx);
104 	progress_put_win32_path(cookie);
105 	return ret;
106 }
107 
108 /*
109  * Given a null-terminated pathname pattern @pat that has been read from line
110  * @line_no of the file @path, validate and canonicalize the pattern.
111  *
112  * On success, returns 0.
113  * On failure, returns WIMLIB_ERR_INVALID_CAPTURE_CONFIG.
114  * In either case, @pat may have been modified in-place (and possibly
115  * shortened).
116  */
117 int
mangle_pat(tchar * pat,const tchar * path,unsigned long line_no)118 mangle_pat(tchar *pat, const tchar *path, unsigned long line_no)
119 {
120 	if (!is_any_path_separator(pat[0]) &&
121 	    pat[0] != T('\0') && pat[1] == T(':'))
122 	{
123 		/* Pattern begins with drive letter.  */
124 
125 		if (!is_any_path_separator(pat[2])) {
126 			/* Something like c:file, which is actually a path
127 			 * relative to the current working directory on the c:
128 			 * drive.  We require paths with drive letters to be
129 			 * absolute.  */
130 			ERROR("%"TS":%lu: Invalid pattern \"%"TS"\":\n"
131 			      "        Patterns including drive letters must be absolute!\n"
132 			      "        Maybe try \"%"TC":%"TC"%"TS"\"?\n",
133 			      path, line_no, pat,
134 			      pat[0], OS_PREFERRED_PATH_SEPARATOR, &pat[2]);
135 			return WIMLIB_ERR_INVALID_CAPTURE_CONFIG;
136 		}
137 
138 		WARNING("%"TS":%lu: Pattern \"%"TS"\" starts with a drive "
139 			"letter, which is being removed.",
140 			path, line_no, pat);
141 
142 		/* Strip the drive letter.  */
143 		tmemmove(pat, pat + 2, tstrlen(pat + 2) + 1);
144 	}
145 
146 	/* Collapse consecutive path separators, and translate both / and \ into
147 	 * / (UNIX) or \ (Windows).
148 	 *
149 	 * Note: we expect that this function produces patterns that can be used
150 	 * for both filesystem paths and WIM paths, so the desired path
151 	 * separators must be the same.  */
152 	STATIC_ASSERT(OS_PREFERRED_PATH_SEPARATOR == WIM_PATH_SEPARATOR);
153 	do_canonicalize_path(pat, pat);
154 
155 	/* Relative patterns can only match file names, so they must be
156 	 * single-component only.  */
157 	if (pat[0] != OS_PREFERRED_PATH_SEPARATOR &&
158 	    tstrchr(pat, OS_PREFERRED_PATH_SEPARATOR))
159 	{
160 		ERROR("%"TS":%lu: Invalid pattern \"%"TS"\":\n"
161 		      "        Relative patterns can only include one path component!\n"
162 		      "        Maybe try \"%"TC"%"TS"\"?",
163 		      path, line_no, pat, OS_PREFERRED_PATH_SEPARATOR, pat);
164 		return WIMLIB_ERR_INVALID_CAPTURE_CONFIG;
165 	}
166 
167 	return 0;
168 }
169 
170 /*
171  * Read, parse, and validate a capture configuration file from either an on-disk
172  * file or an in-memory buffer.
173  *
174  * To read from a file, specify @config_file, and use NULL for @buf.
175  * To read from a buffer, specify @buf and @bufsize.
176  *
177  * @config must be initialized to all 0's.
178  *
179  * On success, 0 will be returned, and the resulting capture configuration will
180  * be stored in @config.
181  *
182  * On failure, a positive error code will be returned, and the contents of
183  * @config will be invalidated.
184  */
185 int
read_capture_config(const tchar * config_file,const void * buf,size_t bufsize,struct capture_config * config)186 read_capture_config(const tchar *config_file, const void *buf,
187 		    size_t bufsize, struct capture_config *config)
188 {
189 	int ret;
190 
191 	/* [PrepopulateList] is used for apply, not capture.  But since we do
192 	 * understand it, recognize it, thereby avoiding the unrecognized
193 	 * section warning, but discard the resulting strings.
194 	 *
195 	 * We currently ignore [CompressionExclusionList] and
196 	 * [CompressionFolderList].  This is a known issue that doesn't seem to
197 	 * have any real consequences, so don't issue warnings about not
198 	 * recognizing those sections.  */
199 	STRING_LIST(prepopulate_pats);
200 	STRING_LIST(compression_exclusion_pats);
201 	STRING_LIST(compression_folder_pats);
202 
203 	struct text_file_section sections[] = {
204 		{T("ExclusionList"),
205 			&config->exclusion_pats},
206 		{T("ExclusionException"),
207 			&config->exclusion_exception_pats},
208 		{T("PrepopulateList"),
209 			&prepopulate_pats},
210 		{T("CompressionExclusionList"),
211 			&compression_exclusion_pats},
212 		{T("CompressionFolderList"),
213 			&compression_folder_pats},
214 	};
215 	void *mem;
216 
217 	ret = load_text_file(config_file, buf, bufsize, &mem,
218 			     sections, ARRAY_LEN(sections),
219 			     LOAD_TEXT_FILE_REMOVE_QUOTES, mangle_pat);
220 	if (ret) {
221 		ERROR("Failed to load capture configuration file \"%"TS"\"",
222 		      config_file);
223 		switch (ret) {
224 		case WIMLIB_ERR_INVALID_UTF8_STRING:
225 		case WIMLIB_ERR_INVALID_UTF16_STRING:
226 			ERROR("Note: the capture configuration file must be "
227 			      "valid UTF-8 or UTF-16LE");
228 			ret = WIMLIB_ERR_INVALID_CAPTURE_CONFIG;
229 			break;
230 		case WIMLIB_ERR_OPEN:
231 		case WIMLIB_ERR_STAT:
232 		case WIMLIB_ERR_NOMEM:
233 		case WIMLIB_ERR_READ:
234 			ret = WIMLIB_ERR_UNABLE_TO_READ_CAPTURE_CONFIG;
235 			break;
236 		}
237 		return ret;
238 	}
239 
240 	FREE(prepopulate_pats.strings);
241 	FREE(compression_exclusion_pats.strings);
242 	FREE(compression_folder_pats.strings);
243 
244 	config->buf = mem;
245 	return 0;
246 }
247 
248 void
destroy_capture_config(struct capture_config * config)249 destroy_capture_config(struct capture_config *config)
250 {
251 	FREE(config->exclusion_pats.strings);
252 	FREE(config->exclusion_exception_pats.strings);
253 	FREE(config->buf);
254 }
255 
256 /*
257  * Determine whether @path matches any of the patterns in @list.
258  * Path separators in @path must be WIM_PATH_SEPARATOR.
259  */
260 bool
match_pattern_list(const tchar * path,const struct string_list * list,int match_flags)261 match_pattern_list(const tchar *path, const struct string_list *list,
262 		   int match_flags)
263 {
264 	for (size_t i = 0; i < list->num_strings; i++)
265 		if (match_path(path, list->strings[i], match_flags))
266 			return true;
267 	return false;
268 }
269 
270 /*
271  * Determine if a file should be excluded from capture.
272  *
273  * This function tests exclusions from both possible sources of exclusions:
274  *
275  *	(1) The capture configuration file
276  *	(2) The user-provided progress function
277  *
278  * params->root_path_nchars must have been set beforehand.  Example for UNIX: if
279  * the capture root directory is "foobar/subdir", then all paths will be
280  * provided starting with "foobar/subdir", so params->root_path_nchars must have
281  * been set to strlen("foobar/subdir") so that the appropriate path suffix can
282  * be matched against the patterns in the exclusion list.
283  *
284  * Returns:
285  *	< 0 if excluded
286  *	= 0 if not excluded and no error
287  *	> 0 (wimlib error code) if error
288  */
289 int
try_exclude(const struct scan_params * params)290 try_exclude(const struct scan_params *params)
291 {
292 	int ret;
293 
294 	if (params->config) {
295 		const tchar *path = params->cur_path + params->root_path_nchars;
296 		if (match_pattern_list(path, &params->config->exclusion_pats,
297 				       MATCH_RECURSIVELY) &&
298 		    !match_pattern_list(path, &params->config->exclusion_exception_pats,
299 					MATCH_RECURSIVELY | MATCH_ANCESTORS))
300 			return -1;
301 	}
302 
303 	if (unlikely(params->add_flags & WIMLIB_ADD_FLAG_TEST_FILE_EXCLUSION)) {
304 
305 		union wimlib_progress_info info;
306 		tchar *cookie;
307 
308 		info.test_file_exclusion.path = params->cur_path;
309 		info.test_file_exclusion.will_exclude = false;
310 
311 		cookie = progress_get_win32_path(info.test_file_exclusion.path);
312 
313 		ret = call_progress(params->progfunc, WIMLIB_PROGRESS_MSG_TEST_FILE_EXCLUSION,
314 				    &info, params->progctx);
315 
316 		progress_put_win32_path(cookie);
317 
318 		if (ret)
319 			return ret;
320 		if (info.test_file_exclusion.will_exclude)
321 			return -1;
322 	}
323 
324 	return 0;
325 }
326 
327 /*
328  * Determine whether a directory entry of the specified name should be ignored.
329  * This is a lower level function which runs prior to try_exclude().  It handles
330  * the standard '.' and '..' entries, which show up in directory listings but
331  * should not be archived.  It also checks for odd filenames that usually should
332  * not exist but could cause problems if archiving them were to be attempted.
333  */
334 bool
should_ignore_filename(const tchar * name,const int name_nchars)335 should_ignore_filename(const tchar *name, const int name_nchars)
336 {
337 	if (name_nchars <= 0) {
338 		WARNING("Ignoring empty filename");
339 		return true;
340 	}
341 
342 	if (name[0] == T('.') &&
343 	    (name_nchars == 1 || (name_nchars == 2 && name[1] == T('.'))))
344 		return true;
345 
346 	for (int i = 0; i < name_nchars; i++) {
347 		if (name[i] == T('\0')) {
348 			WARNING("Ignoring filename containing embedded null character");
349 			return true;
350 		}
351 		if (name[i] == OS_PREFERRED_PATH_SEPARATOR) {
352 			WARNING("Ignoring filename containing embedded path separator");
353 			return true;
354 		}
355 	}
356 
357 	return false;
358 }
359 
360 /* Attach a newly scanned directory tree to its parent directory, with duplicate
361  * handling.  */
362 void
attach_scanned_tree(struct wim_dentry * parent,struct wim_dentry * child,struct blob_table * blob_table)363 attach_scanned_tree(struct wim_dentry *parent, struct wim_dentry *child,
364 		    struct blob_table *blob_table)
365 {
366 	struct wim_dentry *duplicate;
367 
368 	if (child && (duplicate = dentry_add_child(parent, child))) {
369 		WARNING("Duplicate file path: \"%"TS"\".  Only capturing "
370 			"the first version.", dentry_full_path(duplicate));
371 		free_dentry_tree(child, blob_table);
372 	}
373 }
374 
375 /* Set the path at which the directory tree scan is beginning. */
376 int
pathbuf_init(struct scan_params * params,const tchar * root_path)377 pathbuf_init(struct scan_params *params, const tchar *root_path)
378 {
379 	size_t nchars = tstrlen(root_path);
380 	size_t alloc_nchars = nchars + 1 + 1024;
381 
382 	params->cur_path = MALLOC(alloc_nchars * sizeof(tchar));
383 	if (!params->cur_path)
384 		return WIMLIB_ERR_NOMEM;
385 	tmemcpy(params->cur_path, root_path, nchars + 1);
386 	params->cur_path_nchars = nchars;
387 	params->cur_path_alloc_nchars = alloc_nchars;
388 	params->root_path_nchars = nchars;
389 	return 0;
390 }
391 
392 /*
393  * Append a filename to the current path.
394  *
395  * If successful, returns a pointer to the filename component and sets
396  * *orig_path_nchars_ret to the old path length, which can be restored later
397  * using pathbuf_truncate().  Otherwise returns NULL (out of memory).
398  */
399 const tchar *
pathbuf_append_name(struct scan_params * params,const tchar * name,size_t name_nchars,size_t * orig_path_nchars_ret)400 pathbuf_append_name(struct scan_params *params, const tchar *name,
401 		    size_t name_nchars, size_t *orig_path_nchars_ret)
402 {
403 	size_t path_nchars = params->cur_path_nchars;
404 	size_t required_nchars = path_nchars + 1 + name_nchars + 1;
405 	tchar *buf = params->cur_path;
406 
407 	if (unlikely(required_nchars > params->cur_path_alloc_nchars)) {
408 		required_nchars += 1024;
409 		buf = REALLOC(buf, required_nchars * sizeof(tchar));
410 		if (!buf)
411 			return NULL;
412 		params->cur_path = buf;
413 		params->cur_path_alloc_nchars = required_nchars;
414 	}
415 	*orig_path_nchars_ret = path_nchars;
416 
417 	/*
418 	 * Add the slash, but not if it will be a duplicate (which can happen if
419 	 * the path to the capture root directory ends in a slash), because
420 	 * on Windows duplicate slashes sometimes don't work as expected.
421 	 */
422 	if (path_nchars && buf[path_nchars - 1] != OS_PREFERRED_PATH_SEPARATOR)
423 		buf[path_nchars++] = OS_PREFERRED_PATH_SEPARATOR;
424 
425 	tmemcpy(&buf[path_nchars], name, name_nchars);
426 	path_nchars += name_nchars;
427 	buf[path_nchars] = T('\0');
428 	params->cur_path_nchars = path_nchars;
429 	return &buf[path_nchars - name_nchars];
430 }
431 
432 /* Truncate the current path to the specified number of characters. */
433 void
pathbuf_truncate(struct scan_params * params,size_t nchars)434 pathbuf_truncate(struct scan_params *params, size_t nchars)
435 {
436 	wimlib_assert(nchars <= params->cur_path_nchars);
437 	params->cur_path[nchars] = T('\0');
438 	params->cur_path_nchars = nchars;
439 }
440