1 // Author: Bruce Allen
2 // Created: 2/25/2013
3 //
4 // The software provided here is released by the Naval Postgraduate
5 // School, an agency of the U.S. Department of Navy. The software
6 // bears no warranty, either expressed or implied. NPS does not assume
7 // legal liability nor responsibility for a User's use of the software
8 // or the results of such use.
9 //
10 // Please note that within the United States, copyright protection,
11 // under Section 105 of the United States Code, Title 17, is not
12 // available for any work of the United States Government and/or for
13 // any works created by United States Government employees. User
14 // acknowledges that this software contains work which was created by
15 // NPS government employees and is therefore in the public domain and
16 // not subject to copyright.
17 //
18 // Released into the public domain on February 25, 2013 by Bruce Allen.
19
20 /**
21 * \file
22 * provide a filename list from a given path.
23 * The path is utf8. The list is string for POSIX or wstring for Win.
24 */
25
26 #include <config.h>
27 // this process of getting WIN32 defined was inspired
28 // from i686-w64-mingw32/sys-root/mingw/include/windows.h.
29 // All this to include winsock2.h before windows.h to avoid a warning.
30 #if defined(__MINGW64__) && defined(__cplusplus)
31 # ifndef WIN32
32 # define WIN32
33 # endif
34 #endif
35 #ifdef WIN32
36 // including winsock2.h now keeps an included header somewhere from
37 // including windows.h first, resulting in a warning.
38 #include <winsock2.h>
39 #endif
40
41 #include <cassert>
42 #include <string>
43 #include <sstream>
44 #include <string.h>
45 #ifdef WIN32
46 #include <strsafe.h>
47 #else
48 #include <sys/types.h>
49 #include <dirent.h>
50 #endif
51 #include <iostream>
52 #include <sys/stat.h>
53 #include <unistd.h>
54 #include <fcntl.h>
55 #include <stack>
56 #include <set>
57 #include "filename_t.hpp"
58
59 namespace hasher {
60
61 // ************************************************************
62 // helper functions
63 // ************************************************************
64 // return true if filename ends with suffix
ends_with(const filename_t & filename,const filename_t & suffix)65 static bool ends_with(const filename_t& filename, const filename_t& suffix) {
66 if(suffix.size() > filename.size()) return false;
67 return filename.substr(filename.size()-suffix.size())==suffix;
68 }
69
70 // return true if filename can be the first filename of a multipart series
is_multipart(const filename_t & filename)71 static bool is_multipart(const filename_t& filename) {
72 #ifdef WIN32
73 return (ends_with(filename, L".E01") ||
74 ends_with(filename, L".e01"));
75 #else
76 return (ends_with(filename, ".E01") ||
77 ends_with(filename, ".e01"));
78 #endif
79 }
80
81 // strip out non-first multipart filenames
strip_non_first_multipart_filenames(filenames_t & filenames)82 static void strip_non_first_multipart_filenames(filenames_t& filenames) {
83 const filenames_t names(filenames);
84
85 for (filenames_t::const_iterator it = names.begin(); it != names.end();
86 ++it) {
87 filename_t filename = *it;
88 if (is_multipart(filename)) {
89 // get root
90 filename_t filename_root = filename.substr(0, filename.size()-2);
91
92 // remove filenames of root after first
93 for (size_t i=2;;++i) {
94 #ifdef WIN32
95 std::wstringstream ss;
96 #else
97 std::stringstream ss;
98 #endif
99 // concatenate root and digit
100 ss << filename_root;
101 if (i < 10) {
102 // prepend 0 if <= 09
103 #ifdef WIN32
104 ss << L"0";
105 #else
106 ss << "0";
107 #endif
108 }
109 ss << i;
110 filename_t next_filename = ss.str();
111
112 // remove next filename from filenames
113 size_t num_erased = filenames.erase(next_filename);
114 if (num_erased == 1) {
115 // keep going
116 } else {
117 // done with this root
118 break;
119 }
120 }
121 }
122 }
123 }
124
125 // ************************************************************
126 // filename_list
127 // ************************************************************
128 #ifdef WIN32 // Windows implementation
129
130 // adapted from stackoverflow.com/questions/67273/how-do-you-iterate-through-every-file-directory-recursively-in-standard-c
131
132 // get files, return error_message or ""
filename_list(const std::string & utf8_filename,filenames_t * files)133 std::string filename_list(const std::string& utf8_filename,
134 filenames_t* files) {
135
136 // get native filename
137 const std::wstring native_filename = hasher::utf8_to_native(utf8_filename);
138
139 // clear files
140 files->clear();
141
142 // first make sure the filename is a directory
143 DWORD file_attributes = 0;
144 file_attributes = GetFileAttributes(native_filename.c_str());
145 if (file_attributes == INVALID_FILE_ATTRIBUTES) {
146 std::stringstream ss;
147 ss << "Invalid file attributes for file "
148 << hasher::native_to_utf8(native_filename) << ".";
149 return ss.str();
150 }
151 if (!(file_attributes & FILE_ATTRIBUTE_DIRECTORY)) {
152 // not directory so just use filename
153 files->insert(native_filename);
154 return "";
155 }
156
157 // stack for processing found directories
158 std::stack<std::wstring> directories;
159
160 // push the first directory for processing
161 directories.push(native_filename);
162
163 // process directories until empty
164 std::set<uint64_t> seen_file_indexes;
165 while (!directories.empty()) {
166 const std::wstring path = directories.top();
167 directories.pop();
168
169 // prepare filename with '\*' appended
170 const std::wstring filename_star = path + L"\\*";
171 HANDLE filehandle = INVALID_HANDLE_VALUE;
172 WIN32_FIND_DATA file_data;
173 filehandle = FindFirstFile(filename_star.c_str(), &file_data);
174
175 do {
176
177 // make sure the file handle is valid
178 if (filehandle == INVALID_HANDLE_VALUE) {
179 std::stringstream ss;
180 ss << "Invalid file handle for file "
181 << hasher::native_to_utf8(filename_star) << ".";
182 return ss.str();
183 }
184
185 // skip file if "." or ".."
186 if (wcscmp(file_data.cFileName, L".") == 0 ||
187 wcscmp(file_data.cFileName, L"..") == 0) {
188 continue;
189 }
190
191 // prepare absolute_filename
192 std::wstring absolute_filename = path + L"\\" +
193 std::wstring(file_data.cFileName);
194
195 // skip file if seen before
196 HANDLE opened_filehandle = CreateFile(absolute_filename.c_str(),
197 0, // desired access
198 FILE_SHARE_READ,
199 NULL,
200 OPEN_EXISTING,
201 (FILE_FLAG_OPEN_REPARSE_POINT | FILE_FLAG_BACKUP_SEMANTICS),
202 NULL);
203 if (opened_filehandle == INVALID_HANDLE_VALUE) {
204 std::stringstream ss;
205 ss << "Invalid file handle for file "
206 << hasher::native_to_utf8(absolute_filename) << ".";
207 return ss.str();
208 }
209 BY_HANDLE_FILE_INFORMATION fileinfo;
210 bool got_info = GetFileInformationByHandle(opened_filehandle, &fileinfo);
211 CloseHandle(opened_filehandle);
212 if (!got_info) {
213 std::stringstream ss;
214 ss << "Invalid information by file handle for file "
215 << hasher::native_to_utf8(absolute_filename) << ".";
216 return ss.str();
217 }
218 uint64_t file_index = (((uint64_t)fileinfo.nFileIndexHigh)<<32) |
219 (fileinfo.nFileIndexLow);
220 if (seen_file_indexes.find(file_index) == seen_file_indexes.end()) {
221 // new
222 seen_file_indexes.insert(file_index);
223 } else {
224 // seen so skip
225 continue;
226 }
227
228 if (file_data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) {
229 directories.push(absolute_filename);
230
231 } else {
232 files->insert(absolute_filename);
233 }
234
235 // next
236 } while (FindNextFile(filehandle, &file_data) != 0);
237
238 if (GetLastError() != ERROR_NO_MORE_FILES) {
239 FindClose(filehandle);
240 std::stringstream ss;
241 ss << "Invalid file path from invalid last error while processing "
242 << hasher::native_to_utf8(filename_star) << ".";
243 return ss.str();
244 }
245
246 FindClose(filehandle);
247 filehandle = INVALID_HANDLE_VALUE;
248 }
249
250 // strip out non-first recursive filenames such as *.E02, etc.
251 strip_non_first_multipart_filenames(*files);
252
253 // done
254 return "";
255 }
256 #else // POSIX implementation
257
/**
 * A (device, inode) pair identifying a file, so that a file which has
 * been seen before can be recognized.  From bulk_extractor/src/dig.cpp.
 *
 * Instances are ordered by device first and inode second, which lets
 * them be used as keys in an ordered container such as std::set.
 * Copying uses the compiler-generated member-wise copy.
 */
class dev_inode_t {
  public:
  /**
   * \param dev_ the device id of the file.
   * \param ino_ the inode number of the file.
   */
  dev_inode_t(dev_t dev_, ino_t ino_) : dev(dev_), ino(ino_) {}

  dev_t dev;
  ino_t ino;

  /**
   * Strict ordering by (dev, ino).
   *
   * \param t2 the right-hand operand; taken by const reference so that a
   *        comparison does not copy it.
   * \return true if this pair orders before t2.
   */
  bool operator<(const dev_inode_t& t2) const {
    if (dev != t2.dev) {
      return dev < t2.dev;
    }
    return ino < t2.ino;
  }
};
270
271 // get files, return error_message or ""
filename_list(const std::string & filename,filenames_t * files)272 std::string filename_list(const std::string& filename, filenames_t* files) {
273
274 // clear files
275 files->clear();
276
277 // first make sure the filename is a directory
278 DIR *d = opendir(filename.c_str());
279 if (d == NULL) {
280 // filename is not a directory
281 files->insert(filename);
282 return "";
283 } else {
284 // close resource
285 closedir(d);
286 }
287
288 // stack for processing found directories
289 std::stack<std::string> directories;
290
291 // push the first directory for processing
292 directories.push(filename);
293
294 // process directories until empty
295 std::set<dev_inode_t> seen_dev_inodes;
296
297 while (!directories.empty()) {
298 const std::string path = directories.top();
299 directories.pop();
300
301 // read POSIX directory entry
302 DIR *dir= opendir(path.c_str());
303 if (dir == NULL) {
304 std::stringstream ss;
305 ss << "failure in opendir reading path " << path
306 << ", " << strerror(errno);
307 return ss.str();
308 }
309
310 // read files in directory
311 while (true) {
312 struct dirent *entry = readdir(dir);
313 if (entry == NULL) {
314 // done with readdir stream
315 break;
316 }
317
318 // skip files "." and ".."
319 const std::string file_suffix(entry->d_name);
320 if (file_suffix == "." || file_suffix == "..") {
321 continue;
322 }
323
324 // get next filename
325 std::stringstream ss;
326 ss << path << "/" << std::string(entry->d_name);
327 const std::string next_filename = ss.str();
328
329 // stat the file and maybe skip it
330 struct stat st;
331 if (stat(next_filename.c_str(), &st)) {
332 // can't stat
333 continue;
334 }
335 if(S_ISFIFO(st.st_mode)) continue; // FIFO
336 if(S_ISSOCK(st.st_mode)) continue; // socket
337 if(S_ISBLK(st.st_mode)) continue; // block device
338 if(S_ISCHR(st.st_mode)) continue; // character device
339 dev_inode_t dev_inode(st.st_dev, st.st_ino);
340 if (seen_dev_inodes.find(dev_inode) == seen_dev_inodes.end()) {
341 // new
342 seen_dev_inodes.insert(dev_inode);
343 } else {
344 // seen
345 continue;
346 }
347
348 // send filename to the directories stack or to the filenames vector
349 DIR *name = opendir(next_filename.c_str());
350 if (name == NULL) {
351 // filename is not a directory
352 files->insert(next_filename);
353 } else {
354 // filename is a directory
355 directories.push(next_filename);
356
357 // close resource
358 closedir(name);
359 }
360 }
361 // close resource
362 closedir(dir);
363 }
364
365 // strip out non-first recursive filenames such as *.E02, etc.
366 strip_non_first_multipart_filenames(*files);
367
368 // done
369 return "";
370 }
371
372 #endif
373
374 } // end namespace hasher
375