1 // MD5DEEP - dig.c
2 //
3 // By Jesse Kornblum
4 //
5 // This is a work of the US Government. In accordance with 17 USC 105,
6 // copyright protection is not available for any work of the US Government.
7 //
8 // This program is distributed in the hope that it will be useful, but
9 // WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 //
12 // $Id$
13 
14 #include "ssdeep.h"
15 
16 #define STATUS_OK   FALSE
17 
// Returns nonzero when d is exactly "." or "..", the two directory
// entries that must never be descended into; returns zero otherwise.
static int is_special_dir(TCHAR *d)
{
  const size_t name_len = _tcslen(d);

  if (1 == name_len)
    return (0 == _tcsncmp(d,_TEXT("."),1));

  if (2 == name_len)
    return (0 == _tcsncmp(d,_TEXT(".."),2));

  return 0;
}
23 
24 #ifndef _WIN32
25 
26 static TCHAR DOUBLE_DIR[4] =
27   { (TCHAR)DIR_SEPARATOR,
28     (TCHAR)DIR_SEPARATOR,
29     0
30   };
31 
// Collapses every run of consecutive directory separators in fn into a
// single separator, editing the string in place.
//
// This function is only built inside the surrounding #ifndef _WIN32
// section, so no special treatment of leading UNC slashes is needed.
static void remove_double_slash(TCHAR *fn)
{
  const size_t step = sizeof(TCHAR);
  TCHAR *hit;

  // Each pass deletes one separator from the first doubled pair found,
  // then searches again from the start of the string.
  for (hit = _tcsstr(fn,DOUBLE_DIR) ;
       hit != NULL ;
       hit = _tcsstr(fn,DOUBLE_DIR))
  {
    // NOTE(review): the offset is added to a TCHAR pointer and the count
    // is a character count; both are only right if TCHAR is one byte
    // wide here -- confirm against the _tmemmove definition.
    _tmemmove(hit,hit+step,_tcslen(hit));
  }
}
63 
64 
// Removes "./" path components from fn in place and truncates a
// trailing "/." (e.g. "/foo/." becomes "/foo/").
//
// A "./" is only removed when it directly follows a directory separator
// and at least one other character has been seen before it, so a
// leading "./" is left untouched.
static void remove_single_dirs(TCHAR *fn)
{
  size_t pos, chars_found = 0;
  size_t sz = _tcslen(fn);

  for (pos = 0 ; pos < sz ; pos++)
  {
    // Catch strings that end with /. (e.g. /foo/.)
    if (pos > 0 &&
        fn[pos-1] == _TEXT(DIR_SEPARATOR) &&
        fn[pos]   == _TEXT('.') &&
        fn[pos+1] == 0)
      fn[pos] = 0;

    if (fn[pos] == _TEXT('.') && fn[pos+1] == _TEXT(DIR_SEPARATOR))
    {
      // chars_found is nonzero only once pos > 0, so fn[pos-1] is valid
      if (chars_found && fn[pos-1] == _TEXT(DIR_SEPARATOR))
      {
        // Shift the tail (everything after the "./", including the
        // terminator) down over the "./". Copy exactly the tail so we
        // never read past the end of the string.
        _tmemmove(fn+pos,fn+pos+2,((sz - (pos+2)) + 1) * sizeof(TCHAR));

        // Keep the cached length in step with the shortened string so
        // the loop does not wander over stale characters.
        sz -= 2;

        // In case we have ././ we shift back one!
        --pos;
      }
    }
    else
      ++chars_found;
  }
}
94 
95 // Removes all "../" references from the absolute path fn
remove_double_dirs(TCHAR * fn)96 void remove_double_dirs(TCHAR *fn)
97 {
98   size_t pos, next_dir, sz = _tcslen(fn), tsize = sizeof(TCHAR);
99 
100   for (pos = 0; pos < _tcslen(fn) ; pos++)
101   {
102     if (fn[pos] == _TEXT('.') && fn[pos+1] == _TEXT('.'))
103     {
104       if (pos > 0)
105       {
106 
107 	/* We have to keep this next if statement and the one above separate.
108 	   If not, we can't tell later on if the pos <= 0 or
109 	   that the previous character was a DIR_SEPARATOR.
110 	   This matters when we're looking at ..foo/ as an input */
111 
112 	if (fn[pos-1] == _TEXT(DIR_SEPARATOR))
113 	{
114 
115 	  next_dir = pos + 2;
116 
117 	  /* Back up to just before the previous DIR_SEPARATOR
118 	     unless we're already at the start of the string */
119 	  if (pos > 1)
120 	    pos -= 2;
121 	  else
122 	    pos = 0;
123 
124 	  while (fn[pos] != _TEXT(DIR_SEPARATOR) && pos > 0)
125 	    --pos;
126 
127 	  switch(fn[next_dir])
128 	  {
129 	  case DIR_SEPARATOR:
130 	    _tmemmove(fn+pos,fn+next_dir,((sz - next_dir) + 1) * tsize);
131 	    break;
132 
133 	  case 0:
134 	    /* If we have /.. ending the filename */
135 	    fn[pos+1] = 0;
136 	    break;
137 
138 	    /* If we have ..foo, we should do nothing, but skip
139 	       over these double dots */
140 	  default:
141 	    pos = next_dir;
142 	  }
143 	}
144       }
145 
146       /* If we have two dots starting off the string, we should prepend
147 	 a DIR_SEPARATOR and ignore the two dots. That is:
148 	 from the root directory the path ../foo is really just /foo */
149 
150       else
151       {
152 	fn[pos] = _TEXT(DIR_SEPARATOR);
153 	_tmemmove(fn+pos+1,fn+pos+3,sz-(pos+3));
154 
155 
156       }
157     }
158   }
159 }
160 
161 
162 // We don't need to call these functions when running in Windows
163 // as we've already called real_path() on them in main.c. These
164 // functions are necessary in *nix so that we can clean up the
165 // path names without removing the names of symbolic links. They
166 // are also called when the user has specified an absolute path
167 // but has included extra double dots or such.
168 
clean_name(state * s,TCHAR * fn)169 static void clean_name(state *s, TCHAR *fn)
170 {
171   if (not (s->mode & mode_relative)) {
172     remove_double_slash(fn);
173     remove_single_dirs(fn);
174     remove_double_dirs(fn);
175   }
176 }
177 
178 
// Processes every entry of the directory fn by handing
// "<fn><DIR_SEPARATOR><entry>" to process_normal(). The "." and ".."
// entries are skipped.
//
// The directory is registered with the cycle checker for the duration
// of the traversal; a directory that is already registered is reported
// as a symlink cycle and skipped.
//
// Returns STATUS_OK when the directory is skipped or cannot be opened,
// otherwise the value returned by process_normal() for the last entry
// processed (STATUS_OK if there were no entries).
static int process_dir(state *s, TCHAR *fn)
{
  int return_value = STATUS_OK;
  TCHAR *new_file;
  _TDIR *current_dir;
  struct _tdirent *entry;

  if (have_processed_dir(fn))
  {
    print_error_unicode(s,fn,"symlink creates cycle");
    return STATUS_OK;
  }

  if (!processing_dir(fn))
    internal_error("%s: Cycle checking failed to register directory.", fn);

  if ((current_dir = _topendir(fn)) == NULL)
  {
    print_error_unicode(s,fn,"%s", strerror(errno));

    // The directory was registered above; unregister it on this exit
    // path too, so a later visit is not misreported as a cycle.
    if (!done_processing_dir(fn))
      internal_error("%s: Cycle checking failed to unregister directory.", fn);

    return STATUS_OK;
  }

  // One scratch buffer is reused for every entry's full path
  new_file = (TCHAR *)malloc(sizeof(TCHAR) * SSDEEP_PATH_MAX);
  if (NULL == new_file)
    internal_error("%s: Out of memory", __progname);

  while ((entry = _treaddir(current_dir)) != NULL)
  {
    if (is_special_dir(entry->d_name))
      continue;

    _sntprintf(new_file,SSDEEP_PATH_MAX,_TEXT("%s%c%s"),
               fn,DIR_SEPARATOR,entry->d_name);

    // NOTE(review): only the status of the last entry survives the loop
    return_value = process_normal(s,new_file);
  }

  free(new_file);
  _tclosedir(current_dir);

  if (!done_processing_dir(fn))
    internal_error("%s: Cycle checking failed to unregister directory.", fn);

  return return_value;
}
224 
225 
// Maps the st_mode field of a stat result onto one of the file_*
// type codes. Returns file_unknown when no test matches.
static int file_type_helper(_tstat_t sb)
{
  int kind = file_unknown;

  if (S_ISREG(sb.st_mode))
    kind = file_regular;
  else if (S_ISDIR(sb.st_mode))
    kind = file_directory;
  else if (S_ISBLK(sb.st_mode))
    kind = file_block;
  else if (S_ISCHR(sb.st_mode))
    kind = file_character;
  else if (S_ISFIFO(sb.st_mode))
    kind = file_pipe;
  /* These file types do not exist in Win32 */
#ifndef _WIN32
  else if (S_ISSOCK(sb.st_mode))
    kind = file_socket;
  else if (S_ISLNK(sb.st_mode))
    kind = file_symlink;
#endif   /* ifndef _WIN32 */
  /* Used to detect Solaris doors */
#if defined(S_IFDOOR) && defined(S_ISDOOR)
  else if (S_ISDOOR(sb.st_mode))
    kind = file_door;
#endif

  return kind;
}
264 
265 
// Determines the type of the file named fn using _lstat(), so a
// symbolic link is examined itself rather than followed.
//
// Returns one of the file_* codes. Returns file_unknown when either
// argument is NULL or when _lstat() fails; in the latter case the
// system error text is reported through print_error_unicode().
static int file_type(state *s, TCHAR *fn)
{
  _tstat_t info;

  if (s != NULL && fn != NULL)
  {
    if (0 == _lstat(fn,&info))
      return file_type_helper(info);

    print_error_unicode(s,fn,"%s", strerror(errno));
  }

  return file_unknown;
}
281 
282 
// Decides whether the target of the symbolic link fn should be hashed.
//
// The link is examined with _sstat() so that we see what it points to;
// file_type() uses _lstat() and would only see the link itself.
//
// If the target is a directory it is traversed via process_dir() in
// recursive mode, or reported as "Is a directory" otherwise, and FALSE
// is returned. FALSE is also returned when _sstat() fails. In every
// other case TRUE is returned and, when link_type is not NULL, the
// target's type code is stored in *link_type.
static int should_hash_symlink(state *s, TCHAR *fn, int *link_type)
{
  _tstat_t target;
  int target_type;

  if (NULL == s || NULL == fn)
    fatal_error("%s: Null state passed into should_hash_symlink", __progname);

  if (_sstat(fn,&target) != 0)
  {
    print_error_unicode(s,fn,"%s",strerror(errno));
    return FALSE;
  }

  target_type = file_type_helper(target);

  if (target_type != file_directory)
  {
    if (link_type != NULL)
      *link_type = target_type;
    return TRUE;
  }

  // The link points at a directory
  if (s->mode & mode_recursive)
    process_dir(s,fn);
  else
    print_error_unicode(s,fn,"Is a directory");

  return FALSE;
}
317 
318 
// Helper for use inside a switch case in a function that has a
// `state *s` in scope: returns TRUE from the enclosing function when any
// of the mode bits in A is set, otherwise breaks out of the switch.
// The argument is parenthesized so that compound expressions such as
// (x | y) are tested as a whole.
#define RETURN_IF_MODE(A) \
if (s->mode & (A)) \
  return TRUE; \
break;
323 
324 
// Decides whether the file named fn should be hashed.
//
// Directories are never hashed: in recursive mode they are traversed
// via process_dir(), otherwise "Is a directory" is reported. Symbolic
// links are delegated to should_hash_symlink(). Files whose type could
// not be determined are skipped. Everything else is hashed.
//
// Returns TRUE if the caller should hash fn, FALSE otherwise.
static int should_hash(state *s, TCHAR *fn)
{
  // file_type() tolerates NULL arguments, so it is safe to call it
  // before the NULL check below.
  const int kind = file_type(s, fn);

  if (NULL == s || NULL == fn)
    fatal_error("%s: Null state passed into should_hash", __progname);

  if (kind == file_symlink)
    return should_hash_symlink(s,fn,NULL);

  if (kind == file_unknown)
    return FALSE;

  if (kind != file_directory)
  {
    // By default we hash anything we can't identify as a "bad thing"
    return TRUE;
  }

  if (s->mode & mode_recursive)
    process_dir(s,fn);
  else
    print_error_unicode(s,fn,"Is a directory");

  return FALSE;
}
352 
353 
// Cleans up the path fn in place and hashes it if should_hash() says so.
// Returns the result of hash_file() when the file is hashed, FALSE
// otherwise.
int process_normal(state *s, TCHAR *fn)
{
  clean_name(s,fn);

  if (!should_hash(s,fn))
    return FALSE;

  return hash_file(s,fn);
}
363 #endif   // ifndef _WIN32
364 
365 
// Computes the fuzzy hash of standard input and displays it under the
// name "stdin".
//
// Returns FALSE on success. Returns TRUE when s is NULL or when
// fuzzy_hash_stream() reports a failure (an error is printed in the
// latter case).
int process_stdin(state *s)
{
  char sum[FUZZY_MAX_RESULT];

  if (NULL == s)
    return TRUE;

  if (fuzzy_hash_stream(stdin, sum) != 0)
  {
    print_error_unicode(s,_TEXT("stdin"),"Error processing stdin");
    return TRUE;
  }

  display_result(s,_TEXT("stdin"),sum);
  return FALSE;
}
384 
385 
386 
387 
388 #ifdef _WIN32
// Returns TRUE when fn names a Win32 device, FALSE otherwise.
//
// Specifications for device files came from
// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/fileio/base/createfile.asp
//
//   -- Physical devices (like hard drives) are
//      \\.\PhysicalDriveX where X is a digit from 0 to 9
//   -- Tape devices are \\.\tapeX where X is a digit from 0 to 9
//   -- Logical volumes are \\.\X: where X is a letter
//
// The prefixes are matched case-insensitively. The digit and letter
// tests use explicit ASCII ranges rather than isdigit()/isalpha(),
// because passing a TCHAR that is not representable as an unsigned
// char to the <ctype.h> functions is undefined behaviour.
static int is_win32_device_file(TCHAR *fn)
{
  const size_t len = _tcslen(fn);

  if (!_tcsnicmp(fn, _TEXT("\\\\.\\physicaldrive"),17) &&
      (len == 18) &&
      (fn[17] >= _TEXT('0') && fn[17] <= _TEXT('9')))
    return TRUE;

  if (!_tcsnicmp(fn, _TEXT("\\\\.\\tape"),8) &&
      (len == 9) &&
      (fn[8] >= _TEXT('0') && fn[8] <= _TEXT('9')))
    return TRUE;

  if ((!_tcsnicmp(fn,_TEXT("\\\\.\\"),4)) &&
      (len == 6) &&
      ((fn[4] >= _TEXT('a') && fn[4] <= _TEXT('z')) ||
       (fn[4] >= _TEXT('A') && fn[4] <= _TEXT('Z'))) &&
      (fn[5] == _TEXT(':')))
    return TRUE;

  return FALSE;
}
417 
418 
// Processes the contents of the directory fn by passing the wildcard
// pattern "<fn>\*" to process_win32(). The directory is registered with
// the cycle checker while it is being traversed.
//
// Returns true if fn is already being processed (a cycle), false
// otherwise. The result of process_win32() is not propagated.
bool process_dir_win32(state *s, TCHAR *fn) {
  TCHAR new_fn[SSDEEP_PATH_MAX];

  if (have_processed_dir(fn)) {
    print_error_unicode(s, fn, "Cycle detected");
    return true;
  }

  processing_dir(fn);

  _sntprintf(new_fn,
             SSDEEP_PATH_MAX,
             _TEXT("%s\\*"),
             fn);
  // The Microsoft _snprintf family does not write a terminator when the
  // output is truncated, so make sure the buffer is always a string.
  new_fn[SSDEEP_PATH_MAX - 1] = 0;

  process_win32(s, new_fn);

  done_processing_dir(fn);
  return false;
}
439 
440 
// Hashes the file(s) matching fn on Windows. fn may name a device file,
// a single file, a directory, or a pattern with a wildcard in its last
// component. Directories found are traversed via process_dir_win32()
// in recursive mode and silently skipped otherwise.
//
// fn may be modified in place: "c:" is rewritten to "c:\" and, in
// recursive mode, a trailing "\" has "*" appended.
// NOTE(review): both rewrites assume the caller's buffer has room for
// the extra characters -- confirm against the callers.
//
// Returns true when an argument is NULL or when the directory traversal
// ends with an unexpected error; returns the result of hash_file() for
// device files; returns false in all other cases.
bool process_win32(state *s, TCHAR *fn)
{
  int rc;
  size_t len;
  HANDLE hFind;
  TCHAR dirname[SSDEEP_PATH_MAX], new_fn[SSDEEP_PATH_MAX], expanded_fn[SSDEEP_PATH_MAX];
  WIN32_FIND_DATAW FindFileData;

  if (NULL == s or NULL == fn)
    return true;

  // new_fn is reported in an error message after the loop below even if
  // no entry was ever stored in it, so give it a defined value.
  new_fn[0] = 0;

  //print_status("process_win32 got %S", fn);

  if (is_win32_device_file(fn))
    return hash_file(s, fn);
  if (is_special_dir(fn))
    return false;

  // Most Win32 programs reject 'c:'
  // as an error or use it to alias the current working directory on c:.
  // As a convenience to users, we're going to accept 'c:'. To do this
  // we change it into 'c:\'
  // The drive letter is tested with explicit ASCII ranges; isalpha() is
  // undefined for TCHAR values that do not fit in an unsigned char.
  if (_tcslen(fn) == 2 and
      ((fn[0] >= _TEXT('a') and fn[0] <= _TEXT('z')) or
       (fn[0] >= _TEXT('A') and fn[0] <= _TEXT('Z'))) and
      fn[1] == _TEXT(':')) {
    fn[2] = _TEXT(DIR_SEPARATOR);
    fn[3] = 0;
  }

  // FindFirstFile doesn't accept '\' as the trailing character.
  // If we get '\' as a trailing character, we assume this is a directory
  // and handle that accordingly. In recursive mode, go through the
  // directory entries. Otherwise, return an error.
  // The len > 0 test keeps us from reading fn[-1] for an empty name.
  len = _tcslen(fn);
  if (len > 0 and fn[len-1] == _TEXT(DIR_SEPARATOR)) {
    if (s->mode & mode_recursive) {
      fn[len]   = _TEXT('*');
      fn[len+1] = 0;
    } else {
      print_error_unicode(s, fn, "Is a directory");
      return false;
    }
  }

  //print_status("cleaned name %S", fn);

  // If we don't have it already, create the expanded filename.
  // "C:\foo\bar.txt" --> "\\?\C:\foo\bar.txt"
  if (not expanded_path(fn) and
      not (s->mode & mode_relative)) {
    _sntprintf(expanded_fn,
               SSDEEP_PATH_MAX,
               _TEXT("\\\\?\\%s"),
               fn);
  }
  else {
    _tcsncpy(expanded_fn, fn, SSDEEP_PATH_MAX);
  }
  // Neither call above terminates the buffer when the input is too long
  expanded_fn[SSDEEP_PATH_MAX - 1] = 0;
  //print_status("expanded filename %S", expanded_fn);

  hFind = FindFirstFile(expanded_fn, &FindFileData);
  if (INVALID_HANDLE_VALUE == hFind)
  {
    // We don't display an error if there was a wildcard anywhere in the
    // original filename, e.g. C:\foo\*. When this happens it means we just
    // didn't find any matching files.
    // Note that we still display errors with the original 'fn'
    if (not _tcsstr(fn, _TEXT("*")))
      print_error_unicode(s, fn, "No such file or directory");
    return false;
  }

  _tcsncpy(dirname, fn, SSDEEP_PATH_MAX);
  dirname[SSDEEP_PATH_MAX - 1] = 0;
  my_dirname(dirname);

  do {
    // The filename we've found doesn't include any path information.
    // That is, for the file C:\foo\bar.txt, we have bar.txt.
    // We have to add the path information back in manually.
    // Thankfully Windows doesn't allow wildcards in the early part
    // of the path. For example, we will never see:  c:\bin\*\tools
    //
    // Because the wildcard is always in the last part of the input
    // (e.g. c:\bin\*.exe) we can use the original dirname, combined
    // with the filename we've found, to make the new filename.
    if (not is_special_dir(FindFileData.cFileName)) {

      //      print_status("Found file: %S", FindFileData.cFileName);

      _sntprintf(new_fn,
                 SSDEEP_PATH_MAX,
                 _TEXT("%s%s"),
                 dirname,
                 FindFileData.cFileName);
      new_fn[SSDEEP_PATH_MAX - 1] = 0;

      // NOTE(review): expanded_fn is rebuilt here but nothing below
      // reads it; everything downstream is handed new_fn. Kept as is
      // because expanded_path() is defined elsewhere.
      if (not expanded_path(new_fn) and
          not (s->mode & mode_relative)) {
        _sntprintf(expanded_fn,
                   SSDEEP_PATH_MAX,
                   _TEXT("\\\\?\\%s"),
                   new_fn);
      } else {
        _tcsncpy(expanded_fn, new_fn, SSDEEP_PATH_MAX);
      }
      expanded_fn[SSDEEP_PATH_MAX - 1] = 0;

      //      print_status("Getting attributes for %S", expanded_fn);
      DWORD dwFileAttributes = FindFileData.dwFileAttributes;

      if (INVALID_FILE_ATTRIBUTES == dwFileAttributes) {
        print_error_unicode(s, new_fn, "File read error");
      } else if (dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) {
        if (s->mode & mode_recursive) {
          process_dir_win32(s, new_fn);
        }

      // TODO Add support for symbolic links.
      // The intended handling, when FILE_ATTRIBUTE_REPARSE_POINT is set,
      // is to switch on FindFileData.dwReserved0 and:
      //   IO_REPARSE_TAG_MOUNT_POINT -> report "Junction point, skipping."
      //   IO_REPARSE_TAG_SYMLINK     -> report "Symbolic link, skipping."
      //   IO_REPARSE_TAG_SIS         -> hash the file (Single Instance
      //                                 Storage keeps one copy of content
      //                                 that multiple users share)
      //   anything else              -> report the unknown tag value
      // See http://blogs.technet.com/b/filecab/archive/2006/02/03/single-instance-store-sis-in-windows-storage-server-r2.aspx

      } else {
        hash_file(s, new_fn);
      }
    }

    rc = FindNextFile(hFind, &FindFileData);
  } while (rc != 0);

  if (GetLastError() != ERROR_NO_MORE_FILES) {
    // The Windows API for getting an intelligible error message
    // is berserk. Rather than play their silly games, we
    // acknowledge that an unknown error occurred and hope we
    // can continue.
    print_error_unicode(s, new_fn, "Unknown error during directory traversal");
    // Release the search handle on this exit path as well
    FindClose(hFind);
    return true;
  }

  rc = FindClose(hFind);
  if (0 == rc) {
    print_error_unicode(s, fn, "Unknown error cleaning up directory traversal");
  }

  return false;
}
608 #endif
609