1 // Author:  Bruce Allen
2 // Created: 2/25/2013
3 //
4 // The software provided here is released by the Naval Postgraduate
5 // School, an agency of the U.S. Department of Navy.  The software
6 // bears no warranty, either expressed or implied. NPS does not assume
7 // legal liability nor responsibility for a User's use of the software
8 // or the results of such use.
9 //
10 // Please note that within the United States, copyright protection,
11 // under Section 105 of the United States Code, Title 17, is not
12 // available for any work of the United States Government and/or for
13 // any works created by United States Government employees. User
14 // acknowledges that this software contains work which was created by
15 // NPS government employees and is therefore in the public domain and
16 // not subject to copyright.
17 //
18 // Released into the public domain on February 25, 2013 by Bruce Allen.
19 
20 /**
21  * \file
22  * provide a filename list from a given path.
23  * The path is utf8.  The list is string for POSIX or wstring for Win.
24  */
25 
26 #include <config.h>
27 // this process of getting WIN32 defined was inspired
28 // from i686-w64-mingw32/sys-root/mingw/include/windows.h.
29 // All this to include winsock2.h before windows.h to avoid a warning.
30 #if defined(__MINGW64__) && defined(__cplusplus)
31 #  ifndef WIN32
32 #    define WIN32
33 #  endif
34 #endif
35 #ifdef WIN32
  // Include winsock2.h here, before any other header can pull in
  // windows.h first, which would result in a warning.
38   #include <winsock2.h>
39 #endif
40 
41 #include <cassert>
42 #include <string>
43 #include <sstream>
44 #include <string.h>
45 #ifdef WIN32
46 #include <strsafe.h>
47 #else
48 #include <sys/types.h>
49 #include <dirent.h>
50 #endif
51 #include <iostream>
52 #include <sys/stat.h>
53 #include <unistd.h>
54 #include <fcntl.h>
55 #include <stack>
56 #include <set>
57 #include "filename_t.hpp"
58 
59 namespace hasher {
60 
61 // ************************************************************
62 // helper functions
63 // ************************************************************
64 // return true if filename ends with suffix
ends_with(const filename_t & filename,const filename_t & suffix)65 static bool ends_with(const filename_t& filename, const filename_t& suffix) {
66   if(suffix.size() > filename.size()) return false;
67   return filename.substr(filename.size()-suffix.size())==suffix;
68 }
69 
70 // return true if filename can be the first filename of a multipart series
is_multipart(const filename_t & filename)71 static bool is_multipart(const filename_t& filename) {
72 #ifdef WIN32
73   return (ends_with(filename, L".E01") ||
74           ends_with(filename, L".e01"));
75 #else
76   return (ends_with(filename, ".E01") ||
77           ends_with(filename, ".e01"));
78 #endif
79 }
80 
81 // strip out non-first multipart filenames
strip_non_first_multipart_filenames(filenames_t & filenames)82 static void strip_non_first_multipart_filenames(filenames_t& filenames) {
83   const filenames_t names(filenames);
84 
85   for (filenames_t::const_iterator it = names.begin(); it != names.end();
86                                                                      ++it) {
87     filename_t filename = *it;
88     if (is_multipart(filename)) {
89       // get root
90       filename_t filename_root = filename.substr(0, filename.size()-2);
91 
92       // remove filenames of root after first
93       for (size_t i=2;;++i) {
94 #ifdef WIN32
95         std::wstringstream ss;
96 #else
97         std::stringstream ss;
98 #endif
99         // concatenate root and digit
100         ss << filename_root;
101         if (i < 10) {
102           // prepend 0 if <= 09
103 #ifdef WIN32
104           ss << L"0";
105 #else
106           ss << "0";
107 #endif
108         }
109         ss << i;
110         filename_t next_filename = ss.str();
111 
112         // remove next filename from filenames
113         size_t num_erased = filenames.erase(next_filename);
114         if (num_erased == 1) {
115           // keep going
116         } else {
117           // done with this root
118           break;
119         }
120       }
121     }
122   }
123 }
124 
125 // ************************************************************
126 // filename_list
127 // ************************************************************
128 #ifdef WIN32 // Windows implementation
129 
130 // adapted from stackoverflow.com/questions/67273/how-do-you-iterate-through-every-file-directory-recursively-in-standard-c
131 
132 // get files, return error_message or ""
filename_list(const std::string & utf8_filename,filenames_t * files)133 std::string filename_list(const std::string& utf8_filename,
134                           filenames_t* files) {
135 
136   // get native filename
137   const std::wstring native_filename = hasher::utf8_to_native(utf8_filename);
138 
139   // clear files
140   files->clear();
141 
142   // first make sure the filename is a directory
143   DWORD file_attributes = 0;
144   file_attributes = GetFileAttributes(native_filename.c_str());
145   if (file_attributes == INVALID_FILE_ATTRIBUTES) {
146     std::stringstream ss;
147     ss << "Invalid file attributes for file "
148        << hasher::native_to_utf8(native_filename) << ".";
149     return ss.str();
150   }
151   if (!(file_attributes & FILE_ATTRIBUTE_DIRECTORY)) {
152     // not directory so just use filename
153     files->insert(native_filename);
154     return "";
155   }
156 
157   // stack for processing found directories
158   std::stack<std::wstring> directories;
159 
160   // push the first directory for processing
161   directories.push(native_filename);
162 
163   // process directories until empty
164   std::set<uint64_t> seen_file_indexes;
165   while (!directories.empty()) {
166     const std::wstring path = directories.top();
167     directories.pop();
168 
169     // prepare filename with '\*' appended
170     const std::wstring filename_star = path + L"\\*";
171     HANDLE filehandle = INVALID_HANDLE_VALUE;
172     WIN32_FIND_DATA file_data;
173     filehandle = FindFirstFile(filename_star.c_str(), &file_data);
174 
175     do {
176 
177       // make sure the file handle is valid
178       if (filehandle == INVALID_HANDLE_VALUE)  {
179         std::stringstream ss;
180         ss << "Invalid file handle for file "
181            << hasher::native_to_utf8(filename_star) << ".";
182         return ss.str();
183       }
184 
185       // skip file if "." or ".."
186       if (wcscmp(file_data.cFileName, L".") == 0 ||
187                               wcscmp(file_data.cFileName, L"..") == 0) {
188         continue;
189       }
190 
191       // prepare absolute_filename
192       std::wstring absolute_filename = path + L"\\" +
193                                          std::wstring(file_data.cFileName);
194 
195       // skip file if seen before
196       HANDLE opened_filehandle = CreateFile(absolute_filename.c_str(),
197              0,   // desired access
198              FILE_SHARE_READ,
199              NULL,
200              OPEN_EXISTING,
201              (FILE_FLAG_OPEN_REPARSE_POINT | FILE_FLAG_BACKUP_SEMANTICS),
202              NULL);
203       if (opened_filehandle == INVALID_HANDLE_VALUE) {
204         std::stringstream ss;
205         ss << "Invalid file handle for file "
206            << hasher::native_to_utf8(absolute_filename) << ".";
207         return ss.str();
208       }
209       BY_HANDLE_FILE_INFORMATION fileinfo;
210       bool got_info = GetFileInformationByHandle(opened_filehandle, &fileinfo);
211       CloseHandle(opened_filehandle);
212       if (!got_info) {
213         std::stringstream ss;
214         ss << "Invalid information by file handle for file "
215            << hasher::native_to_utf8(absolute_filename) << ".";
216         return ss.str();
217       }
218       uint64_t file_index = (((uint64_t)fileinfo.nFileIndexHigh)<<32) |
219                             (fileinfo.nFileIndexLow);
220       if (seen_file_indexes.find(file_index) == seen_file_indexes.end()) {
221         // new
222         seen_file_indexes.insert(file_index);
223       } else {
224         // seen so skip
225         continue;
226       }
227 
228       if (file_data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) {
229         directories.push(absolute_filename);
230 
231       } else {
232         files->insert(absolute_filename);
233       }
234 
235     // next
236     } while (FindNextFile(filehandle, &file_data) != 0);
237 
238     if (GetLastError() != ERROR_NO_MORE_FILES) {
239       FindClose(filehandle);
240       std::stringstream ss;
241       ss << "Invalid file path from invalid last error while processing "
242          << hasher::native_to_utf8(filename_star) << ".";
243       return ss.str();
244     }
245 
246     FindClose(filehandle);
247     filehandle = INVALID_HANDLE_VALUE;
248   }
249 
250   // strip out non-first recursive filenames such as *.E02, etc.
251   strip_non_first_multipart_filenames(*files);
252 
253   // done
254   return "";
255 }
256 #else // POSIX implementation
257 
// Provide a (device, inode) encoding so we can see if the file
// has been seen before.  From bulk_extractor/src/dig.cpp.
// Copying uses the compiler-generated member-wise copy.
class dev_inode_t {
public:
    dev_inode_t(dev_t dev_, ino_t ino_) : dev(dev_), ino(ino_) {}

    dev_t dev;
    ino_t ino;

    // Strict ordering: by device first, then by inode within a device.
    // The argument is taken by reference so a comparison makes no copy.
    bool operator<(const dev_inode_t& t2) const {
        if (dev != t2.dev) {
            return dev < t2.dev;
        }
        return ino < t2.ino;
    }
};
270 
271 // get files, return error_message or ""
filename_list(const std::string & filename,filenames_t * files)272 std::string filename_list(const std::string& filename, filenames_t* files) {
273 
274   // clear files
275   files->clear();
276 
277   // first make sure the filename is a directory
278   DIR *d = opendir(filename.c_str());
279   if (d == NULL) {
280     // filename is not a directory
281     files->insert(filename);
282     return "";
283   } else {
284     // close resource
285     closedir(d);
286   }
287 
288   // stack for processing found directories
289   std::stack<std::string> directories;
290 
291   // push the first directory for processing
292   directories.push(filename);
293 
294   // process directories until empty
295   std::set<dev_inode_t> seen_dev_inodes;
296 
297   while (!directories.empty()) {
298     const std::string path = directories.top();
299     directories.pop();
300 
301     // read POSIX directory entry
302     DIR *dir= opendir(path.c_str());
303     if (dir == NULL) {
304       std::stringstream ss;
305       ss << "failure in opendir reading path " << path
306          << ", " << strerror(errno);
307       return ss.str();
308     }
309 
310     // read files in directory
311     while (true) {
312       struct dirent *entry = readdir(dir);
313       if (entry == NULL) {
314         // done with readdir stream
315         break;
316       }
317 
318       // skip files "." and ".."
319       const std::string file_suffix(entry->d_name);
320       if (file_suffix == "." || file_suffix == "..") {
321         continue;
322       }
323 
324       // get next filename
325       std::stringstream ss;
326       ss << path << "/" << std::string(entry->d_name);
327       const std::string next_filename = ss.str();
328 
329       // stat the file and maybe skip it
330       struct stat st;
331       if (stat(next_filename.c_str(), &st)) {
332         // can't stat
333         continue;
334       }
335       if(S_ISFIFO(st.st_mode)) continue; // FIFO
336       if(S_ISSOCK(st.st_mode)) continue; // socket
337       if(S_ISBLK(st.st_mode)) continue;  // block device
338       if(S_ISCHR(st.st_mode)) continue;  // character device
339       dev_inode_t dev_inode(st.st_dev, st.st_ino);
340       if (seen_dev_inodes.find(dev_inode) == seen_dev_inodes.end()) {
341         // new
342         seen_dev_inodes.insert(dev_inode);
343       } else {
344         // seen
345         continue;
346       }
347 
348       // send filename to the directories stack or to the filenames vector
349       DIR *name = opendir(next_filename.c_str());
350       if (name == NULL) {
351         // filename is not a directory
352         files->insert(next_filename);
353       } else {
354         // filename is a directory
355         directories.push(next_filename);
356 
357         // close resource
358         closedir(name);
359       }
360     }
361     // close resource
362     closedir(dir);
363   }
364 
365   // strip out non-first recursive filenames such as *.E02, etc.
366   strip_non_first_multipart_filenames(*files);
367 
368   // done
369   return "";
370 }
371 
372 #endif
373 
374 } // end namespace hasher
375