1 /*
2 
3     File: file_txt.c
4 
5     Copyright (C) 2005-2012 Christophe GRENIER <grenier@cgsecurity.org>
6 
7     This software is free software; you can redistribute it and/or modify
8     it under the terms of the GNU General Public License as published by
9     the Free Software Foundation; either version 2 of the License, or
10     (at your option) any later version.
11 
12     This program is distributed in the hope that it will be useful,
13     but WITHOUT ANY WARRANTY; without even the implied warranty of
14     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15     GNU General Public License for more details.
16 
17     You should have received a copy of the GNU General Public License along
18     with this program; if not, write the Free Software Foundation, Inc., 51
19     Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 
21  */
22 
23 #ifdef HAVE_CONFIG_H
24 #include <config.h>
25 #endif
26 #ifdef HAVE_STDLIB_H
27 #include <stdlib.h>
28 #endif
29 #ifdef HAVE_STRING_H
30 #include <string.h>
31 #endif
32 #ifdef HAVE_TIME_H
33 #include <time.h>
34 #endif
35 #include <ctype.h>      /* tolower */
36 #include <stdio.h>
37 #include "types.h"
38 #include "common.h"
39 #include "filegen.h"
40 #include "log.h"
41 #include "memmem.h"
42 #include "file_txt.h"
43 
44 extern const file_hint_t file_hint_doc;
45 extern const file_hint_t file_hint_jpg;
46 extern const file_hint_t file_hint_pdf;
47 extern const file_hint_t file_hint_sld;
48 extern const file_hint_t file_hint_tiff;
49 extern const file_hint_t file_hint_zip;
50 
51 static inline int filtre(unsigned int car);
52 
53 static void register_header_check_txt(file_stat_t *file_stat);
54 static int header_check_txt(const unsigned char *buffer, const unsigned int buffer_size, const unsigned int safe_header_only, const file_recovery_t *file_recovery, file_recovery_t *file_recovery_new);
55 static void register_header_check_fasttxt(file_stat_t *file_stat);
56 static void register_header_check_snz(file_stat_t *file_stat);
57 static int header_check_fasttxt(const unsigned char *buffer, const unsigned int buffer_size, const unsigned int safe_header_only, const file_recovery_t *file_recovery, file_recovery_t *file_recovery_new);
58 #ifdef UTF16
59 static int header_check_le16_txt(const unsigned char *buffer, const unsigned int buffer_size, const unsigned int safe_header_only, const file_recovery_t *file_recovery, file_recovery_t *file_recovery_new);
60 #endif
61 
62 const file_hint_t file_hint_snz= {
63   .extension="snz",
64   .description="Olfaction SeeNez odorama",
65   .max_filesize=PHOTOREC_MAX_FILE_SIZE,
66   .recover=1,
67   .enable_by_default=1,
68   .register_header_check=&register_header_check_snz
69 };
70 
71 const file_hint_t file_hint_fasttxt= {
72   .extension="tx?",
73   .description="Text files with header: rtf,xml,xhtml,mbox/imm,pm,ram,reg,sh,slk,stp,jad,url",
74   .max_filesize=PHOTOREC_MAX_FILE_SIZE,
75   .recover=1,
76   .enable_by_default=1,
77   .register_header_check=&register_header_check_fasttxt
78 };
79 
80 const file_hint_t file_hint_txt= {
81   .extension="txt",
82   .description="Other text files: txt,html,asp,bat,C,jsp,perl,php,py/emlx... scripts",
83   .max_filesize=PHOTOREC_MAX_FILE_SIZE,
84   .recover=1,
85   .enable_by_default=1,
86   .register_header_check=&register_header_check_txt
87 };
88 
89 static unsigned char ascii_char[256];
90 
register_header_check_txt(file_stat_t * file_stat)91 static void register_header_check_txt(file_stat_t *file_stat)
92 {
93   unsigned int i;
94   for(i=0; i<256; i++)
95     ascii_char[i]=i;
96   for(i=0; i<256; i++)
97   {
98     if(filtre(i) || i==0xE2 || i==0xC2 || i==0xC3 || i==0xC5 || i==0xC6 || i==0xCB)
99       register_header_check(0, &ascii_char[i], 1, &header_check_txt, file_stat);
100   }
101 #ifdef UTF16
102   register_header_check(1, &ascii_char[0], 1, &header_check_le16_txt, file_stat);
103 #endif
104 }
105 
/* One entry of a header table: a textual signature, its length in bytes
 * (without the terminating NUL) and the extension given to matching files. */
typedef struct
{
  const char *string;
  const unsigned int len;
  const char *extension;
} txt_header_t;

/* Signatures of text formats that start with a fixed header.
 * Every len must equal strlen(string); the table ends with an all-NULL entry.
 * When DJGPP is defined, a shorter extension is used for some entries. */
static const txt_header_t fasttxt_headers[] = {
  /* Unix shell scripts */
  { "#!/bin/bash", 11, "sh" },
  { "#!/bin/ksh", 10, "sh" },
  { "#!/bin/sh", 9, "sh" },
  { "#! /bin/bash", 12, "sh" },
  { "#! /bin/ksh", 11, "sh" },
  { "#! /bin/sh", 10, "sh" },
  /* Opera Hotlist: bookmarks, contact list, notes */
  { "Opera Hotlist version 2.0", 25, "adr" },
  /* Microsoft VB class module */
  { "VERSION 1.0 CLASS\r\nBEGIN", 24, "cls" },
  /* Cue sheets often start with the music genre or with the file name,
   * see http://wiki.hydrogenaudio.org/index.php?title=Cue_sheet */
  { "REM GENRE ", 10, "cue" },
  { "FILE \"", 6, "cue" },
  /* Lotus Data Interchange Format */
  { "TABLE\r\n0,1\r\n", 12, "dif" },
  /* Designer, a photobook designer software */
  { "vSg4q7j8GLrtf", 13, "dp" },
  /* DSA private key */
  { "-----BEGIN DSA PRIVATE KEY-----", 31, "dsa" },
  /* EMKA IOX file */
  { "1\t\t\t\t\tthis file\t", 16,
#ifdef DJGPP
    "emk"
#else
    "emka"
#endif
  },
  /* Go source code */
  { "package main", 12, "go" },
  /* ENVI header */
  { "ENVI\r\ndescription", 17, "hdr" },
  /* Java Application Descriptor,
   * see http://en.wikipedia.org/wiki/JAD_%28file_format%29 */
  { "MIDlet-1:", 9, "jad" },
  /* JSON beginning with an empty title, an id and a dateAdded field */
  { "{\"title\":\"\",\"id\":1,\"dateAdded\":", 31, "json" },
  /* RSA private key */
  { "-----BEGIN RSA PRIVATE KEY-----", 31, "key" },
  /* LyX, http://www.lyx.org */
  { "#LyX 1.", 7, "lyx" },
  { "#LyX 2.", 7, "lyx" },
  /* LilyPond, http://lilypond.org */
  { "\n\\version \"", 11, "ly" },
  /* MP3 URL playlist (extended M3U) */
  { "#EXTM3U", 7, "m3u" },
  /* Mnemosyne flash-card program, http://www.mnemosyne-proj.org/ */
  { "--- Mnemosyne Data Base --- Format Version 2 ---", 48, "mem" },
  /* Mozilla/Firefox/Thunderbird Mail Summary File */
  { "// <!-- <mdb:mork:z", 19, "msf" },
  /* SQL dumps: MySQL, phpMyAdmin, PostgreSQL */
  { "-- MySQL dump ", 14, "sql" },
  { "-- phpMyAdmin SQL Dump", 22, "sql" },
  { "--\n-- PostgreSQL database cluster dump", 38, "sql" },
  { "--\r\n-- PostgreSQL database cluster dump", 39, "sql" },
  /* SSH2 public key and PuTTY key file */
  { "---- BEGIN SSH2 PUBLIC KEY ----", 31, "ppk" },
  { "PuTTY-User-Key-File-2:", 22, "ppk" },
  /* PGP key blocks */
  { "-----BEGIN PGP PRIVATE KEY BLOCK-----", 37, "priv" },
  { "-----BEGIN PGP PUBLIC KEY BLOCK-----", 36, "pub" },
  /* PTGui, panoramic stitching software */
  { "# ptGui project file", 20, "pts" },
  /* OpenSSH public keys */
  { "ssh-dss AAAAB3", 14, "pub" },
  { "ssh-rsa AAAAB3", 14, "pub" },
  /* Quantum GIS */
  { "<!DOCTYPE qgis ", 15, "qgs" },
  /* Real Media */
  { "rtsp://", 7, "ram" },
  /* Windows registry file */
  { "REGEDIT4", 8, "reg" },
  /* Reaper project */
  { "<REAPER_PROJECT ", 16, "rpp" },
  /* Olfaction SeeNez subtitle */
  { "#SeeNez ", 8, "SeeNezSST" },
  /* SYLK, Multiplan Symbolic Link Interchange */
  { "ID;PSCALC3", 10, "slk" },
  /* ISO 10303-21 (STEP): standard for the exchange of product model data */
  { "ISO-10303-21;", 13, "stp" },
  /* URL / Internet shortcut */
  { "[InternetShortcut]", 18, "url" },
  /* Veeam backup metadata */
  { "<BackupMeta Id=\"", 16, "vbm" },
  /* Windows playlist */
  { "<?wpl version=\"1.0\"?>", 21, "wpl" },
  /* VBKM bookmark, recovered with the url extension */
  { "BEGIN:VBKM", 10, "url" },
  /* Firefox session store */
  { "({\"windows\":[{\"tabs\":[{\"entries\":[{\"url\":\"", 42,
#ifdef DJGPP
    "js"
#else
    "sessionstore.js"
#endif
  },
  /* Matlab model (.mdl) */
  { "Model {", 7, "mdl" },
  /* Windows folder settings for the file explorer */
  { "[.ShellClassInfo]", 17, "Desktop.ini" },
  /* Fotobook */
  { "<fotobook ", 10, "mcf" },
  /* End of table */
  { NULL, 0, NULL }
};
219 
220 
221 // #define DEBUG_FILETXT
222 
/* Tell whether a byte value may appear in a text file.
 *
 * car: value to test; anything above 0xfc is rejected.
 * Returns 1 when the value is accepted, 0 otherwise.
 */
static int filtre(unsigned int car)
{
  /* Printable ASCII, space to tilde (this includes 0x7c, the vertical bar) */
  if(car>=' ' && car<='~')
    return 1;
  /* Backspace, tabulation, line feed and carriage return */
  if(car=='\b' || car=='\t' || car=='\n' || car=='\r')
    return 1;
  /* Two contiguous runs of accepted bytes above 0x7f */
  if((car>=0x82 && car<=0x8d) || (car>=0x93 && car<=0x98))
    return 1;
  /* Remaining accepted bytes: symbols and accented letters of a
   * single-byte western character set */
  switch(car)
  {
    case 0x80: case 0x92: case 0x99: case 0x9c:
    case 0xa0: case 0xa1: case 0xa2: case 0xa3:
    case 0xa7: case 0xa8: case 0xa9: case 0xab: case 0xae:
    case 0xb0: case 0xb4: case 0xb7: case 0xbb:
    case 0xc0: case 0xc7: case 0xc9:
    case 0xd6: case 0xd7: case 0xd9: case 0xdf:
    case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xe4:
    case 0xe6: case 0xe7: case 0xe8: case 0xe9: case 0xea: case 0xeb:
    case 0xed: case 0xee: case 0xef:
    case 0xf4: case 0xf6:
    case 0xf8: case 0xf9: case 0xfa: case 0xfb: case 0xfc:
      return 1;
    default:
      return 0;
  }
}
283 
/* Convert UTF-8 text to lower-case single-byte text.
 *
 * Recognised two- and three-byte UTF-8 sequences are replaced by one byte,
 * every other byte is passed through tolower().  Conversion stops at the
 * first NUL byte, at the first sequence without a single-byte equivalent,
 * at the first character rejected by filtre(), or after buf_len bytes.
 *
 * buffer_lower: destination; at most one byte is written per source byte,
 *               plus a terminating NUL, so it needs room for buf_len+1 bytes.
 * buffer:       source text, buf_len bytes long, not necessarily
 *               NUL-terminated.
 * buf_len:      number of readable bytes in buffer.
 *
 * Returns the number of source bytes converted.  This is the offset of the
 * offending character when the conversion stopped early, and never more than
 * buf_len.
 *
 * A multi-byte sequence is only decoded when all of its bytes lie inside
 * buffer; a lead byte too close to the end is handled like any other single
 * byte instead of reading past the end of the buffer.
 */
int UTF2Lat(unsigned char *buffer_lower, const unsigned char *buffer, const int buf_len)
{
  const unsigned char *p;	/* current position in the source buffer */
  unsigned char *q;		/* current position in the destination buffer */
  int i;			/* bytes still available in the destination buffer */
  for (i = buf_len, p = buffer, q = buffer_lower; p-buffer<buf_len && i > 0 && *p!='\0';)
  {
    const unsigned char *p_org=p;
    /* Source bytes still readable, *p included; guards the lookahead below */
    const int remaining=buf_len-(int)(p-buffer);
    if(remaining>=3 && (*p & 0xf0)==0xe0 && (*(p+1) & 0xc0)==0x80 && (*(p+2) & 0xc0)==0x80)
    { /* UTF8 l=3 */
#ifdef DEBUG_TXT
      log_info("UTF8 l=3 0x%02x 0x%02x 0x%02x\n", *p, *(p+1),*(p+2));
#endif
      /* Stays 0, and is rejected below, unless a mapping is found */
      *q = '\0';
      switch (*p)
      {
        case 0xE2 :
          switch (*(p+1))
          {
            case 0x80 :
              switch (*(p+2))
              {
                case 0x93 : (*q) = 150; break;
                case 0x94 : (*q) = 151; break;
                case 0x98 : (*q) = 145; break;
                /* case 0x99 : (*q) = 146; break; */
                case 0x99 : (*q) = '\''; break;
                case 0x9A : (*q) = 130; break;
                case 0x9C : (*q) = 147; break;
                case 0x9D : (*q) = 148; break;
                case 0x9E : (*q) = 132; break;
                case 0xA0 : (*q) = 134; break;
                case 0xA1 : (*q) = 135; break;
                case 0xA2 : (*q) = 149; break;
                case 0xA6 : (*q) = 133; break;
                case 0xB0 : (*q) = 137; break;
                case 0xB9 : (*q) = 139; break;
                case 0xBA : (*q) = 155; break;
              }
              break;
            case 0x82 :
              switch (*(p+2))
              {
                case 0xAC : (*q) = 128; break;
              }
              break;
            case 0x84 :
              switch (*(p+2))
              {
                case 0xA2 : (*q) = 153; break;
              }
              break;
          }
          break;
      }
      p+=3;
    }
    else if(remaining>=2 && (*p & 0xe0)==0xc0 && (*(p+1) & 0xc0)==0x80)
    { /* UTF8 l=2 */
      /* Stays 0, and is rejected below, unless a mapping is found */
      *q = '\0';
      switch (*p)
      {
        case 0xC2 :
          (*q) = ((*(p+1)) | 0x80) & 0xBF; /* A0-BF and a few 80-9F */
          /* A non-breaking space becomes a plain space */
          if((*q)==0xA0)
            (*q)=' ';
          break;
        case 0xC3 :
          switch (*(p+1))
          {
            case 0xB3 : (*q) = 162; break;
            default:
              (*q) = (*(p+1)) | 0xC0; /* C0-FF */
              break;
          }
          break;
        case 0xC5 :
          switch (*(p+1)) {
            case 0x92 : (*q) = 140; break;
            case 0x93 : (*q) = 156; break;
            case 0xA0 : (*q) = 138; break;
            case 0xA1 : (*q) = 154; break;
            case 0xB8 : (*q) = 143; break;
            case 0xBD : (*q) = 142; break;
            case 0xBE : (*q) = 158; break;
          }
          break;
        case 0xC6:
          switch (*(p+1)) {
            case 0x92 : (*q) = 131; break;
          }
          break;
        case 0xCB :
          switch (*(p+1)) {
            case 0x86 : (*q) = 136; break;
            case 0x9C : (*q) = 152; break;
          }
          break;
      }
      p+=2;
    }
    else
    { /* Ascii UCS */
#ifdef DEBUG_TXT
      log_info("UTF8 Ascii UCS 0x%02x\n", *p);
#endif
      *q = tolower(*p);
      p++;
    }
    if (*q=='\0' || filtre(*q)==0)
    {
#ifdef DEBUG_TXT
      log_warning("UTF2Lat reject 0x%x\n",*q);
#endif
      /* Terminate the output and report the offset of the rejected character */
      *q = '\0';
      return(p_org-buffer);
    }
    q++;
    i--;
  }
  *q = '\0';
  return(p-buffer);
}
409 
/* Measure the leading part of buffer that looks like UTF-8 or ASCII text.
 *
 * buffer:  bytes to examine.
 * buf_len: number of readable bytes in buffer.
 *
 * Returns the offset of the first NUL byte, rejected control character or
 * invalid lead byte, or buf_len when none is found.  A two- or three-byte
 * sequence cut short by the end of the buffer is accepted.
 */
static int UTFsize(const unsigned char *buffer, const unsigned int buf_len)
{
  unsigned int pos=0;
  while(pos<buf_len && buffer[pos]!='\0')
  {
    const unsigned char c=buffer[pos];
    /* Reject some invalid UTF-8 sequences */
    if(c==0xc0 || c==0xc1 || c==0xf7 || c>=0xfd)
      return pos;
    if((c & 0xf0)==0xe0 &&
	(pos+2 >= buf_len || ((buffer[pos+1] & 0xc0)==0x80 && (buffer[pos+2] & 0xc0)==0x80)))
    { /* Three-byte sequence */
#ifdef DEBUG_TXT
      log_info("UTFsize i=%u l=3\n", pos);
#endif
      pos+=3;
      continue;
    }
    if((c & 0xe0)==0xc0 &&
	(pos+1 >= buf_len || (buffer[pos+1] & 0xc0)==0x80))
    { /* Two-byte sequence */
#ifdef DEBUG_TXT
      log_info("UTFsize i=%u l=2\n", pos);
#endif
      pos+=2;
      continue;
    }
    /* Single byte */
#ifdef DEBUG_TXT
    log_info("UTFsize i=%u l=1 ? *p=%c\n", pos, c);
#endif
    /* Control characters other than 0x08-0x0a and 0x0d-0x0f end the text */
    if(c<=0x07 || c==0x0b || c==0x0c || (c>=0x10 && c<=0x1f) || c==0x7f)
      return pos;
    pos++;
  }
  /* A sequence truncated by the end of the buffer may have stepped past it */
  return (pos<buf_len?pos:buf_len);
}
477 
data_check_html(const unsigned char * buffer,const unsigned int buffer_size,file_recovery_t * file_recovery)478 static data_check_t data_check_html(const unsigned char *buffer, const unsigned int buffer_size, file_recovery_t *file_recovery)
479 {
480   const char sign_html_end[]	= "</html>";
481   const unsigned int i=UTFsize(&buffer[buffer_size/2], buffer_size/2);
482   unsigned int j;
483   for(j=(buffer_size/2>sizeof(sign_html_end)?buffer_size/2-sizeof(sign_html_end):0);
484       j+sizeof(sign_html_end)-1 < buffer_size;
485       j++)
486   {
487     if(buffer[j]=='<' && strncasecmp((const char *)&buffer[j], sign_html_end, sizeof(sign_html_end)-1)==0)
488     {
489       file_recovery->calculated_file_size+=j-buffer_size/2+sizeof(sign_html_end)-1;
490       return DC_STOP;
491     }
492   }
493   if(i<buffer_size/2)
494   {
495     if(i>=10)
496       file_recovery->calculated_file_size=file_recovery->file_size+i;
497     return DC_STOP;
498   }
499   file_recovery->calculated_file_size=file_recovery->file_size+(buffer_size/2);
500   return DC_CONTINUE;
501 }
502 
data_check_txt(const unsigned char * buffer,const unsigned int buffer_size,file_recovery_t * file_recovery)503 static data_check_t data_check_txt(const unsigned char *buffer, const unsigned int buffer_size, file_recovery_t *file_recovery)
504 {
505   const unsigned int i=UTFsize(&buffer[buffer_size/2], buffer_size/2);
506   if(i<buffer_size/2)
507   {
508     if(i>=10)
509       file_recovery->calculated_file_size=file_recovery->file_size+i;
510     return DC_STOP;
511   }
512   file_recovery->calculated_file_size=file_recovery->file_size+(buffer_size/2);
513   return DC_CONTINUE;
514 }
515 
data_check_ttd(const unsigned char * buffer,const unsigned int buffer_size,file_recovery_t * file_recovery)516 static data_check_t data_check_ttd(const unsigned char *buffer, const unsigned int buffer_size, file_recovery_t *file_recovery)
517 {
518   unsigned int i;
519   for(i=buffer_size/2; i<buffer_size; i++)
520   {
521     const unsigned char car=buffer[i];
522     if((car>='A' && car<='F') || (car >='0' && car <='9') || car==' ' || car=='\n')
523       continue;
524     file_recovery->calculated_file_size=file_recovery->file_size + i - buffer_size/2;
525     return DC_STOP;
526   }
527   file_recovery->calculated_file_size=file_recovery->file_size+(buffer_size/2);
528   return DC_CONTINUE;
529 }
530 
/* Header check for ttd files.
 *
 * The byte at offset 56 must be a decimal digit.  On success
 * file_recovery_new is reset and set up for the "ttd" extension.
 *
 * Returns 1 when the header is accepted, 0 otherwise.
 */
static int header_check_ttd(const unsigned char *buffer, const unsigned int buffer_size, const unsigned int safe_header_only, const file_recovery_t *file_recovery, file_recovery_t *file_recovery_new)
{
  const unsigned char digit=buffer[56];
  if(!(digit>='0' && digit<='9'))
    return 0;
  reset_file_recovery(file_recovery_new);
  file_recovery_new->extension="ttd";
  file_recovery_new->data_check=&data_check_ttd;
  file_recovery_new->file_check=&file_check_size;
  return 1;
}
541 
/* File check for ER Mapper files: look for the "DatasetHeader End" footer,
 * then allow a line ending (LF, CRLF or CR) after it. */
static void file_check_ers(file_recovery_t *file_recovery)
{
  static const char footer[]="DatasetHeader End";
  file_search_footer(file_recovery, footer, sizeof(footer)-1, 0);
  file_allow_nl(file_recovery, NL_BARENL|NL_CRLF|NL_BARECR);
}
547 
/* Header check for ER Mapper Rasters (ERS).
 *
 * The header is accepted unconditionally: file_recovery_new is reset and
 * set up for the "ers" extension.  Always returns 1.
 */
static int header_check_ers(const unsigned char *buffer, const unsigned int buffer_size, const unsigned int safe_header_only, const file_recovery_t *file_recovery, file_recovery_t *file_recovery_new)
{
  reset_file_recovery(file_recovery_new);
  file_recovery_new->extension="ers";
  file_recovery_new->file_check=&file_check_ers;
  file_recovery_new->data_check=&data_check_txt;
  return 1;
}
557 
/* Header check for vCalendar files (extension "ics").
 *
 * The header is rejected when the byte at offset 15 is NUL.  Otherwise
 * file_recovery_new is reset and set up, and its time is taken from the
 * first DTSTART property when one is found, e.g.
 *   DTSTART:19970714T133000                    ;Local time
 *   DTSTART:19970714T173000Z                   ;UTC time
 *   DTSTART;TZID=US-Eastern:19970714T133000    ;Local time and time zone
 *
 * Returns 1 when the header is accepted, 0 otherwise.
 */
static int header_check_ics(const unsigned char *buffer, const unsigned int buffer_size, const unsigned int safe_header_only, const file_recovery_t *file_recovery, file_recovery_t *file_recovery_new)
{
  char *text;
  const char *dtstart;
  const char *colon=NULL;
  if(buffer[15]=='\0')
    return 0;
  reset_file_recovery(file_recovery_new);
  file_recovery_new->extension="ics";
  file_recovery_new->data_check=&data_check_txt;
  file_recovery_new->file_check=&file_check_size;
  /* buffer may not be NUL-terminated: search in a terminated copy */
  text=(char *)MALLOC(buffer_size+1);
  memcpy(text, buffer, buffer_size);
  text[buffer_size]='\0';
  dtstart=strstr(text, "DTSTART");
  if(dtstart!=NULL)
    colon=strchr(dtstart, ':');
  /* The date is only read when it lies completely inside the copy */
  if(colon!=NULL && colon+1+14 < text+buffer_size)
    file_recovery_new->time=get_time_from_YYYYMMDD_HHMMSS(colon+1);
  free(text);
  return 1;
}
586 
/* Header check for Perl modules and Java source code.
 *
 * The first statement must end with ';' before any line feed, within
 * roughly the first 128 bytes.  The file is taken for Java when "class",
 * "private static" or "public interface" occurs in the first 2048 bytes
 * (or in the whole buffer when it is smaller), and for a Perl module
 * otherwise.
 *
 * Returns 1 when the header is accepted, 0 otherwise.
 */
static int header_check_perlm(const unsigned char *buffer, const unsigned int buffer_size, const unsigned int safe_header_only, const file_recovery_t *file_recovery, file_recovery_t *file_recovery_new)
{
  const unsigned int buffer_size_test=(buffer_size < 2048 ? buffer_size : 2048);
  unsigned int pos=0;
  int is_java;
  /* Stop on the first ';' or line feed */
  while(pos<128 && buffer[pos]!=';' && buffer[pos]!='\n')
    pos++;
  if(buffer[pos]!=';')
    return 0;
  reset_file_recovery(file_recovery_new);
  file_recovery_new->data_check=&data_check_txt;
  file_recovery_new->file_check=&file_check_size;
  is_java=(td_memmem(buffer, buffer_size_test, "class", 5)!=NULL ||
      td_memmem(buffer, buffer_size_test, "private static", 14)!=NULL ||
      td_memmem(buffer, buffer_size_test, "public interface", 16)!=NULL);
  if(!is_java)
  {
    /* Perl module */
    file_recovery_new->extension="pm";
    return 1;
  }
  /* Java source code */
#ifdef DJGPP
  file_recovery_new->extension="jav";
#else
  file_recovery_new->extension="java";
#endif
  return 1;
}
615 
/* Header check for TSCe Survey Controller DC v10.0 files.
 *
 * The buffer must start with the two characters "00".  On success
 * file_recovery_new is reset and set up for the "dc" extension.
 *
 * Returns 1 when the header is accepted, 0 otherwise.
 */
static int header_check_dc(const unsigned char *buffer, const unsigned int buffer_size, const unsigned int safe_header_only, const file_recovery_t *file_recovery, file_recovery_t *file_recovery_new)
{
  if(buffer[0]!='0' || buffer[1]!='0')
    return 0;
  reset_file_recovery(file_recovery_new);
  file_recovery_new->extension="dc";
  file_recovery_new->data_check=&data_check_txt;
  file_recovery_new->file_check=&file_check_size;
  return 1;
}
630 
/* Rename a recovered OpenDocument flat XML file after its document title.
 *
 * Up to the first 4095 bytes of the file are read and searched for
 * "<office:meta><dc:title>" (case-insensitive); the text up to the next '<'
 * is passed to file_rename().  Nothing happens when the file cannot be
 * opened or read, or when no title is found.
 *
 * The file is closed as soon as it has been read, so that it is no longer
 * open when file_rename() is called.
 */
static void file_rename_fods(file_recovery_t *file_recovery)
{
  static const char title_tag[]="<office:meta><dc:title>";
  FILE *file;
  char buffer[4096];
  char *tmp;
  size_t lu;
  if((file=fopen(file_recovery->filename, "rb"))==NULL)
    return;
  /* One byte is kept free for the terminating NUL */
  lu=fread(buffer, 1, sizeof(buffer)-1, file);
  fclose(file);
  if(lu==0)
    return;
  buffer[lu]='\0';
  for(tmp=strchr(buffer,'<'); tmp!=NULL; tmp=strchr(tmp+1,'<'))
  {
    if(strncasecmp(tmp, title_tag, sizeof(title_tag)-1)==0)
    {
      char *title=tmp+sizeof(title_tag)-1;
      char *title_end=strchr(title,'<');
      /* Cut the title at the start of the next tag, if any */
      if(title_end!=NULL)
	*title_end='\0';
      file_rename(file_recovery, (const unsigned char*)title, strlen(title), 0, NULL, 1);
      return;
    }
  }
}
663 
/* Rename a recovered HTML file after the content of its <title> element.
 *
 * Up to the first 4095 bytes of the file are read.  Tags are examined in
 * order (case-insensitive): the search is abandoned at "</head", and at
 * "<title>" the text up to the next '<' is passed to file_rename().
 * Nothing happens when the file cannot be opened or read, or when no title
 * is found.
 *
 * The file is closed as soon as it has been read, so that it is no longer
 * open when file_rename() is called.
 */
static void file_rename_html(file_recovery_t *file_recovery)
{
  static const char head_end[]="</head";
  static const char title_tag[]="<title>";
  FILE *file;
  char buffer[4096];
  char *tmp;
  size_t lu;
  if((file=fopen(file_recovery->filename, "rb"))==NULL)
    return;
  /* One byte is kept free for the terminating NUL */
  lu=fread(buffer, 1, sizeof(buffer)-1, file);
  fclose(file);
  if(lu==0)
    return;
  buffer[lu]='\0';
  for(tmp=strchr(buffer,'<'); tmp!=NULL; tmp=strchr(tmp+1,'<'))
  {
    /* No title can follow the end of the head section */
    if(strncasecmp(tmp, head_end, sizeof(head_end)-1)==0)
      return;
    if(strncasecmp(tmp, title_tag, sizeof(title_tag)-1)==0)
    {
      char *title=tmp+sizeof(title_tag)-1;
      char *title_end=strchr(title,'<');
      /* Cut the title at the start of the next tag, if any */
      if(title_end!=NULL)
	*title_end='\0';
      file_rename(file_recovery, (const unsigned char*)title, strlen(title), 0, NULL, 1);
      return;
    }
  }
}
701 
/* Header check for Hypertext Markup Language (HTML) files.
 *
 * The header is rejected while an "mbox" file of file_hint_fasttxt is being
 * recovered, and when the byte at offset 14 is NUL.  Otherwise
 * file_recovery_new is reset and set up for HTML.
 *
 * Returns 1 when the header is accepted, 0 otherwise.
 */
static int header_check_html(const unsigned char *buffer, const unsigned int buffer_size, const unsigned int safe_header_only, const file_recovery_t *file_recovery, file_recovery_t *file_recovery_new)
{
  /* Do not split an mbox file at an HTML message */
  const int inside_mbox=(file_recovery->file_stat!=NULL &&
      file_recovery->file_stat->file_hint==&file_hint_fasttxt &&
      strcmp(file_recovery->extension,"mbox")==0);
  if(inside_mbox || buffer[14]==0)
    return 0;
  reset_file_recovery(file_recovery_new);
#ifdef DJGPP
  file_recovery_new->extension="htm";
#else
  file_recovery_new->extension="html";
#endif
  file_recovery_new->data_check=&data_check_html;
  file_recovery_new->file_check=&file_check_size;
  file_recovery_new->file_rename=&file_rename_html;
  return 1;
}
722 
/* File check for Veeam backup metadata: look for the "</BackupMeta>"
 * footer, then allow a line ending (LF, CRLF or CR) after it. */
static void file_check_vbm(file_recovery_t *file_recovery)
{
  static const char footer[]="</BackupMeta>";
  file_search_footer(file_recovery, footer, sizeof(footer)-1, 0);
  file_allow_nl(file_recovery, NL_BARENL|NL_CRLF|NL_BARECR);
}
728 
/* Header check for Veeam backup metadata files.
 *
 * The header is accepted unconditionally: file_recovery_new is reset and
 * set up for the "vbm" extension.  Always returns 1.
 */
static int header_check_vbm(const unsigned char *buffer, const unsigned int buffer_size, const unsigned int safe_header_only, const file_recovery_t *file_recovery, file_recovery_t *file_recovery_new)
{
  reset_file_recovery(file_recovery_new);
  file_recovery_new->extension="vbm";
  file_recovery_new->file_check=&file_check_vbm;
  file_recovery_new->data_check=&data_check_txt;
  return 1;
}
737 
/* File check for GPX files: look for the "</gpx>" footer, then allow a
 * line ending (LF, CRLF or CR) after it. */
static void file_check_gpx(file_recovery_t *file_recovery)
{
  static const char footer[]="</gpx>";
  file_search_footer(file_recovery, footer, sizeof(footer)-1, 0);
  file_allow_nl(file_recovery, NL_BARENL|NL_CRLF|NL_BARECR);
}
743 
/* File check for XML files: look for a ">" footer, i.e. the end of a tag,
 * then allow a line ending (LF, CRLF or CR) after it. */
static void file_check_xml(file_recovery_t *file_recovery)
{
  static const char footer[]=">";
  file_search_footer(file_recovery, footer, sizeof(footer)-1, 0);
  file_allow_nl(file_recovery, NL_BARENL|NL_CRLF|NL_BARECR);
}
749 
/* File check for SVG files: look for the "</svg>" footer, then allow a
 * line ending (LF, CRLF or CR) after it. */
static void file_check_svg(file_recovery_t *file_recovery)
{
  static const char footer[]="</svg>";
  file_search_footer(file_recovery, footer, sizeof(footer)-1, 0);
  file_allow_nl(file_recovery, NL_BARENL|NL_CRLF|NL_BARECR);
}
755 
data_check_xml_utf8(const unsigned char * buffer,const unsigned int buffer_size,file_recovery_t * file_recovery)756 static data_check_t data_check_xml_utf8(const unsigned char *buffer, const unsigned int buffer_size, file_recovery_t *file_recovery)
757 {
758   unsigned int i;
759   if(buffer_size<=8)
760     return DC_CONTINUE;
761   i=UTFsize(&buffer[buffer_size/2+4], buffer_size/2-4)+4;
762   if(i<buffer_size/2)
763   {
764     file_recovery->calculated_file_size=file_recovery->file_size+i;
765     return DC_STOP;
766   }
767   file_recovery->calculated_file_size=file_recovery->file_size+(buffer_size/2);
768   file_recovery->data_check=&data_check_txt;
769   return DC_CONTINUE;
770 }
771 
/* Header check for XML files handled by data_check_xml_utf8().
 *
 * file_recovery_new is reset and set up.  The extension is "ghx"
 * (Grasshopper archive) when a tag starting with "<Archive" is found
 * (case-insensitive), "xml" otherwise.  Always returns 1.
 */
static int header_check_xml_utf8(const unsigned char *buffer, const unsigned int buffer_size, const unsigned int safe_header_only, const file_recovery_t *file_recovery, file_recovery_t *file_recovery_new)
{
  const char *tag;
  /* buffer may not be NUL-terminated: search in a terminated copy */
  char *text=(char *)MALLOC(buffer_size+1);
  memcpy(text, buffer, buffer_size);
  text[buffer_size]='\0';
  reset_file_recovery(file_recovery_new);
  file_recovery_new->data_check=&data_check_xml_utf8;
  file_recovery_new->file_check=&file_check_xml;
  file_recovery_new->extension="xml";
  for(tag=strchr(text,'<'); tag!=NULL; tag=strchr(tag+1,'<'))
  {
    /* Only the first 8 characters, "<Archive", are compared */
    if(strncasecmp(tag, "<Archive name=\"Root\">", 8)==0)
    {
      /* Grasshopper archive */
      file_recovery_new->extension="ghx";
      break;
    }
  }
  free(text);
  return 1;
}
801 
/* Header check for UTF-16 XML files.
 *
 * The header is rejected while a file of file_hint_doc is being recovered,
 * to avoid false positives with .sldprt files.  Otherwise
 * file_recovery_new is reset and given the "xml" extension.
 *
 * Returns 1 when the header is accepted, 0 otherwise.
 */
static int header_check_xml_utf16(const unsigned char *buffer, const unsigned int buffer_size, const unsigned int safe_header_only, const file_recovery_t *file_recovery, file_recovery_t *file_recovery_new)
{
  const file_stat_t *current=file_recovery->file_stat;
  if(current!=NULL && current->file_hint==&file_hint_doc)
    return 0;
  reset_file_recovery(file_recovery_new);
  file_recovery_new->extension="xml";
  return 1;
}
812 
/*
 * Header check for XML documents.
 * A NUL-terminated copy of the buffer is walked from one '<' to the next;
 * the first recognised tag selects the extension (and, for some formats,
 * dedicated data_check/file_check/file_rename callbacks).  When no tag is
 * recognised the extension is "xml".
 * Always returns 1.  safe_header_only and file_recovery are not used.
 */
static int header_check_xml(const unsigned char *buffer, const unsigned int buffer_size, const unsigned int safe_header_only, const file_recovery_t *file_recovery, file_recovery_t *file_recovery_new)
{
  const char *tag;
  /* buffer may not be null-terminated: search in a terminated copy */
  char *text=(char *)MALLOC(buffer_size+1);
  memcpy(text, buffer, buffer_size);
  text[buffer_size]='\0';
  reset_file_recovery(file_recovery_new);
  file_recovery_new->data_check=&data_check_txt;
  /* Default file checker; only svg and gpx install their own one */
  file_recovery_new->file_check=&file_check_xml;
  file_recovery_new->extension=NULL;
  /* Stop as soon as one tag has decided the extension */
  for(tag=strchr(text,'<');
      tag!=NULL && file_recovery_new->extension==NULL;
      tag=strchr(tag+1,'<'))
  {
    if(strncasecmp(tag, "<Grisbi>", 8)==0)
    {
      /* Grisbi - Personal Finance Manager XML data */
      file_recovery_new->extension="gsb";
    }
    else if(strncasecmp(tag, "<collection type=\"GC", 20)==0)
    {
      /* GCstart, personal collections manager, http://www.gcstar.org/ */
      file_recovery_new->extension="gcs";
    }
    else if(strncasecmp(tag, "<html", 5)==0)
    {
      file_recovery_new->data_check=&data_check_html;
      file_recovery_new->file_rename=&file_rename_html;
#ifdef DJGPP
      file_recovery_new->extension="htm";
#else
      file_recovery_new->extension="html";
#endif
    }
    else if(strncasecmp(tag, "<Version>QBFSD", 14)==0)
    {
      /* QuickBook */
      file_recovery_new->extension="fst";
    }
    else if(strncasecmp(tag, "<svg", 4)==0)
    {
      /* Scalable Vector Graphics */
      file_recovery_new->file_check=&file_check_svg;
      file_recovery_new->extension="svg";
    }
    else if(strncasecmp(tag, "<!DOCTYPE plist ", 16)==0)
    {
      /* Mac OS X property list */
#ifdef DJGPP
      file_recovery_new->extension="pli";
#else
      file_recovery_new->extension="plist";
#endif
    }
    else if(strncasecmp(tag, "<gpx ", 5)==0)
    {
      /* GPS eXchange Format */
      file_recovery_new->file_check=&file_check_gpx;
      file_recovery_new->extension="gpx";
    }
    else if(strncasecmp(tag, "<PremiereData Version=", 22)==0)
    {
      /* Adobe Premiere project  */
      file_recovery_new->data_check=NULL;
      file_recovery_new->extension="prproj";
    }
    else if(strncasecmp(tag, "<SCRIBUS", 8)==0)
    {
      /* Scribus XML file */
      file_recovery_new->extension="sla";
    }
    else if(strncasecmp(tag, "<FictionBook", 12)==0)
    {
      /* FictionBook, see http://www.fictionbook.org */
      file_recovery_new->extension="fb2";
    }
    else if(strncasecmp(tag, "<office:document", 16)==0)
    {
      /* OpenDocument Flat XML Spreadsheet */
      file_recovery_new->data_check=NULL;
      file_recovery_new->file_rename=&file_rename_fods;
      file_recovery_new->extension="fods";
    }
  }
  free(text);
  if(file_recovery_new->extension==NULL)
  {
    /* XML Extensible Markup Language */
    file_recovery_new->extension="xml";
  }
  return 1;
}
911 
/*
 * Header check for Rich Text Format.
 * Rejects the candidate (returns 0) when one of the first 16 bytes is NUL
 * or when the file currently being recovered was identified by
 * file_hint_doc.  Otherwise file_recovery_new is initialised for a "rtf"
 * file and 1 is returned.
 */
static int header_check_rtf(const unsigned char *buffer, const unsigned int buffer_size, const unsigned int safe_header_only, const file_recovery_t *file_recovery, file_recovery_t *file_recovery_new)
{
  /* No NUL byte is accepted in the first 16 bytes */
  if(memchr(buffer, '\0', 16)!=NULL)
    return 0;
  /* Avoid a false positive with .snt */
  if(file_recovery->file_stat!=NULL)
  {
    if(file_recovery->file_stat->file_hint==&file_hint_doc)
      return 0;
  }
  reset_file_recovery(file_recovery_new);
  /* Rich Text Format */
  file_recovery_new->extension="rtf";
  file_recovery_new->file_check=&file_check_size;
  file_recovery_new->data_check=&data_check_txt;
  return 1;
}
929 
/*
 * Header check for Adobe's Extensible Metadata Platform (XMP).
 * Returns 0 when the byte at offset 35 is NUL, or when the file currently
 * being recovered was identified by the jpg, pdf or tiff hint.
 * Otherwise file_recovery_new is initialised for a "xmp" file and 1 is
 * returned.
 */
static int header_check_xmp(const unsigned char *buffer, const unsigned int buffer_size, const unsigned int safe_header_only, const file_recovery_t *file_recovery, file_recovery_t *file_recovery_new)
{
  if(buffer[35]=='\0')
    return 0;
  if(file_recovery->file_stat!=NULL)
  {
    /* Do not start a new file while a jpg, pdf or tiff is being recovered */
    if(file_recovery->file_stat->file_hint==&file_hint_jpg)
      return 0;
    if(file_recovery->file_stat->file_hint==&file_hint_pdf)
      return 0;
    if(file_recovery->file_stat->file_hint==&file_hint_tiff)
      return 0;
  }
  /* Adobe's Extensible Metadata Platform */
  reset_file_recovery(file_recovery_new);
  file_recovery_new->extension="xmp";
  file_recovery_new->file_check=&file_check_size;
  file_recovery_new->data_check=&data_check_txt;
  return 1;
}
946 
/*
 * File check used for files detected by header_check_thunderbird.
 * The file is discarded (file_size set to 0) when fewer bytes than
 * calculated_file_size were recovered; otherwise it is truncated to
 * calculated_file_size.
 */
static void file_check_thunderbird(file_recovery_t *file_recovery)
{
  if(file_recovery->file_size >= file_recovery->calculated_file_size)
    file_recovery->file_size=file_recovery->calculated_file_size;
  else
    file_recovery->file_size=0;
}
956 
/*
 * Header check for Thunderbird mailboxes.
 * Returns 0 when the file currently being recovered is already a "mbox"
 * found by file_hint_fasttxt, or when one of the first 64 bytes is NUL.
 * Otherwise file_recovery_new is initialised for a "mbox" file checked by
 * file_check_thunderbird and 1 is returned.
 */
static int header_check_thunderbird(const unsigned char *buffer, const unsigned int buffer_size, const unsigned int safe_header_only, const file_recovery_t *file_recovery, file_recovery_t *file_recovery_new)
{
  /* Do not split a mailbox that is already being recovered */
  if(file_recovery->file_stat!=NULL &&
      file_recovery->file_stat->file_hint==&file_hint_fasttxt)
  {
    if(strcmp(file_recovery->extension,"mbox")==0)
      return 0;
  }
  /* The first 64 bytes must not contain any NUL byte */
  if(memchr(buffer, 0, 64)!=NULL)
    return 0;
  reset_file_recovery(file_recovery_new);
  file_recovery_new->extension="mbox";
  file_recovery_new->file_check=&file_check_thunderbird;
  file_recovery_new->data_check=&data_check_txt;
  return 1;
}
973 
/*
 * Header check for mailboxes and mail headers.
 * Returns 0 when
 *  - the file currently being recovered is already a "mbox" found by
 *    file_hint_fasttxt,
 *  - one of the first 64 bytes is NUL,
 *  - the buffer starts with "From " (but not "From MAILER-DAEMON ") and
 *    the word following it does not contain a '@'.
 * Otherwise file_recovery_new is initialised for a "mbox" file and 1 is
 * returned.
 */
static int header_check_mbox(const unsigned char *buffer, const unsigned int buffer_size, const unsigned int safe_header_only, const file_recovery_t *file_recovery, file_recovery_t *file_recovery_new)
{
  /* Do not split a mailbox that is already being recovered */
  if(file_recovery->file_stat!=NULL &&
      file_recovery->file_stat->file_hint==&file_hint_fasttxt)
  {
    if(strcmp(file_recovery->extension,"mbox")==0)
      return 0;
  }
  /* The first 64 bytes must not contain any NUL byte */
  if(memchr(buffer, 0, 64)!=NULL)
    return 0;
  if( memcmp(buffer, "From ", 5)==0 &&
      memcmp(buffer, "From MAILER-DAEMON ", 19)!=0)
  {
    /* From someone@somewhere: stop on the first space or '@', at offset 200 at most */
    unsigned int pos=5;
    while(pos<200 && buffer[pos]!=' ' && buffer[pos]!='@')
      pos++;
    if(buffer[pos]!='@')
      return 0;
  }
  reset_file_recovery(file_recovery_new);
  /* Incredimail has .imm extension but this extension isn't frequent */
  file_recovery_new->extension="mbox";
  file_recovery_new->file_check=&file_check_size;
  file_recovery_new->data_check=&data_check_txt;
  return 1;
}
999 
/*
 * Header check shared by every signature listed in fasttxt_headers.
 * The table is scanned up to its terminating entry (len == 0).  For the
 * first entry whose string matches the beginning of buffer:
 *  - 0 is returned when the byte following the signature is NUL,
 *  - otherwise file_recovery_new is initialised with the extension of the
 *    entry, min_filesize is set to the signature length + 1 and 1 is
 *    returned.
 * Returns 0 when no entry matches.
 */
static int header_check_fasttxt(const unsigned char *buffer, const unsigned int buffer_size, const unsigned int safe_header_only, const file_recovery_t *file_recovery, file_recovery_t *file_recovery_new)
{
  const txt_header_t *entry;
  for(entry=&fasttxt_headers[0]; entry->len > 0; entry++)
  {
    if(memcmp(buffer, entry->string, entry->len)!=0)
      continue;
    /* A text file continues after its signature */
    if(buffer[entry->len]=='\0')
      return 0;
    reset_file_recovery(file_recovery_new);
    file_recovery_new->extension=entry->extension;
    file_recovery_new->min_filesize=entry->len+1;
    file_recovery_new->file_check=&file_check_size;
    file_recovery_new->data_check=&data_check_txt;
    return 1;
  }
  return 0;
}
1020 
/*
 * Tell whether a NUL-terminated string starts with an ini-style section
 * header: '[', then only alphanumeric characters or spaces, then ']'.
 * The closing bracket must be located after offset 3, i.e. the section
 * name holds at least 3 characters.
 *
 * buffer: NUL-terminated string to test
 * Returns 1 when such a header is found, 0 otherwise (including when the
 * string ends before the closing bracket).
 */
static int is_ini(const char *buffer)
{
  const char *src=buffer;
  if(*src!='[')
    return 0;
  for(src++; *src!=']'; src++)
  {
    /* <ctype.h> functions require a value representable as unsigned char:
     * passing a negative plain char is undefined behaviour */
    const unsigned char c=(unsigned char)*src;
    /* The terminating NUL is rejected here too, so the scan never leaves the string */
    if(!isalnum(c) && c!=' ')
      return 0;
  }
  return (src > buffer + 3) ? 1 : 0;
}
1040 
#ifdef UTF16
/*
 * Header check for little-endian 16-bit text.
 * The buffer is read two bytes at a time; a pair is accepted when its
 * second byte is NUL and its first byte is printable, '\n', '\r' or 0xbb.
 * Returns 0 when a rejected pair is met before offset 40.  Otherwise
 * file_recovery_new is initialised for a "utf16" file whose
 * calculated_file_size is the offset where the scan stopped, and 1 is
 * returned.
 */
static int header_check_le16_txt(const unsigned char *buffer, const unsigned int buffer_size, const unsigned int safe_header_only, const file_recovery_t *file_recovery, file_recovery_t *file_recovery_new)
{
  unsigned int pos=0;
  while(pos+1 < buffer_size)
  {
    const unsigned char low=buffer[pos];
    if(buffer[pos+1]!='\0')
      break;
    if(!isprint(low) && low!='\n' && low!='\r' && low!=0xbb)
      break;
    pos+=2;
  }
  /* pos+1 < buffer_size means the scan stopped on a rejected pair */
  if(pos+1 < buffer_size && pos<40)
    return 0;
  reset_file_recovery(file_recovery_new);
  file_recovery_new->extension="utf16";
  file_recovery_new->calculated_file_size=pos;
  file_recovery_new->file_check=&file_check_size;
  file_recovery_new->data_check=&data_check_size;
  return 1;
}
#endif
1067 
/*
 * File check for Mac OSX mail (.emlx).
 * The file is discarded (file_size set to 0) when it is shorter than
 * calculated_file_size.  Otherwise at most 2048 bytes are kept after
 * calculated_file_size and file_search_footer() is called with the
 * "</plist>\n" footer.
 */
static void file_check_emlx(file_recovery_t *file_recovery)
{
  if(file_recovery->file_size < file_recovery->calculated_file_size)
  {
    file_recovery->file_size=0;
    return;
  }
  if(file_recovery->file_size > file_recovery->calculated_file_size+2048)
    file_recovery->file_size=file_recovery->calculated_file_size+2048;
  file_search_footer(file_recovery, "</plist>\n", 9, 0);
}
1079 
/*
 * Generic text header check.
 *
 * The candidate is examined in the following order:
 *  1. Mac OSX mail (.emlx): decimal number, newline, then a mail header.
 *  2. Fixed case-insensitive prefixes: bat, asp, vb, vcf.
 *  3. "#!" scripts: perl, python or ruby named on the first line.
 *  4. When safe_header_only is set, nothing else is tried.
 *  5. When a file is already being recovered, the heuristics below only
 *     run if it was found by file_hint_fasttxt or file_hint_txt and its
 *     filename contains ".html".
 *  6. At most 2048 bytes are converted with UTF2Lat() into the static
 *     buffer_lower; fewer than 10 resulting characters or no '\n' in the
 *     first 512 bytes reject the candidate.
 *  7. "rem " (bat) and "dn: " (ldif) prefixes.
 *  8. Content heuristics (keywords, csv layout, character statistics)
 *     select the extension.
 *
 * buffer_lower is kept between calls and only reallocated when it is too
 * small (deliberate small memory leak).
 * NOTE(review): the code treats buffer_lower as a NUL-terminated string of
 * length l after UTF2Lat() - confirm against UTF2Lat().
 *
 * Returns 1 when file_recovery_new has been initialised, 0 otherwise.
 */
static int header_check_txt(const unsigned char *buffer, const unsigned int buffer_size, const unsigned int safe_header_only, const file_recovery_t *file_recovery, file_recovery_t *file_recovery_new)
{
  static char *buffer_lower=NULL;
  static unsigned int buffer_lower_size=0;
  unsigned int l;
  const unsigned int buffer_size_test=(buffer_size < 2048 ? buffer_size : 2048);
  /* The "#!" line is searched in the first 512 bytes, never beyond the buffer */
  const unsigned int shebang_window=(buffer_size < 512 ? buffer_size : 512);
  {
    unsigned int i;
    unsigned int tmp=0;
    /* Up to 10 leading digits are read as a decimal number */
    for(i=0;i<10 && isdigit(buffer[i]);i++)
      tmp=tmp*10+buffer[i]-'0';
    if(buffer[i]==0x0a &&
      (memcmp(buffer+i+1, "Return-Path: ", 13)==0 ||
       memcmp(buffer+i+1, "Received: from", 14)==0) &&
        !(file_recovery->file_stat!=NULL &&
          file_recovery->file_stat->file_hint==&file_hint_fasttxt &&
          strcmp(file_recovery->extension,"mbox")==0))
    {
      reset_file_recovery(file_recovery_new);
      /* The number is counted from the byte following the first line */
      file_recovery_new->calculated_file_size=tmp+i+1;
      file_recovery_new->data_check=NULL;
      file_recovery_new->file_check=&file_check_emlx;
      /* Mac OSX mail */
      file_recovery_new->extension="emlx";
      return 1;
    }
  }
  if(strncasecmp((const char *)buffer, "@echo off", 9)==0)
  {
    if(buffer[9]=='\0')
      return 0;
    reset_file_recovery(file_recovery_new);
    file_recovery_new->data_check=&data_check_txt;
    file_recovery_new->file_check=&file_check_size;
    /* Dos/Windows batch */
    file_recovery_new->extension="bat";
    return 1;
  }
  if(strncasecmp((const char *)buffer, "<%@ language=\"vbscript", 22)==0)
  {
    if(buffer[22]=='\0')
      return 0;
    reset_file_recovery(file_recovery_new);
    file_recovery_new->data_check=&data_check_txt;
    file_recovery_new->file_check=&file_check_size;
    /* Microsoft Active Server Pages */
    file_recovery_new->extension="asp";
    return 1;
  }
  if(strncasecmp((const char *)buffer, "version 4.00\r\nbegin", 19)==0)
  {
    if(buffer[19]=='\0')
      return 0;
    reset_file_recovery(file_recovery_new);
    file_recovery_new->data_check=&data_check_txt;
    file_recovery_new->file_check=&file_check_size;
    /* Microsoft Visual Basic */
    file_recovery_new->extension="vb";
    return 1;
  }
  if(strncasecmp((const char *)buffer, "begin:vcard", 11)==0)
  {
    if(buffer[11]=='\0')
      return 0;
    reset_file_recovery(file_recovery_new);
    file_recovery_new->data_check=&data_check_txt;
    file_recovery_new->file_check=&file_check_size;
    /* vcard, electronic business cards */
    file_recovery_new->extension="vcf";
    return 1;
  }
  if(shebang_window > 2 && buffer[0]=='#' && buffer[1]=='!')
  {
    /* Interpreter name is searched on the first line only */
    unsigned int ll=shebang_window-2;
    const unsigned char *haystack=(const unsigned char *)buffer+2;
    const unsigned char *res;
    res=(const unsigned char *)memchr(haystack,'\n',ll);
    if(res!=NULL)
      ll=res-haystack;
    if(td_memmem(haystack, ll, "perl", 4) != NULL)
    {
      reset_file_recovery(file_recovery_new);
      file_recovery_new->data_check=&data_check_txt;
      file_recovery_new->file_check=&file_check_size;
      /* Perl script */
      file_recovery_new->extension="pl";
      return 1;
    }
    if(td_memmem(haystack, ll, "python", 6) != NULL)
    {
      reset_file_recovery(file_recovery_new);
      file_recovery_new->data_check=&data_check_txt;
      file_recovery_new->file_check=&file_check_size;
      /* Python script */
      file_recovery_new->extension="py";
      return 1;
    }
    if(td_memmem(haystack, ll, "ruby", 4) != NULL)
    {
      reset_file_recovery(file_recovery_new);
      file_recovery_new->data_check=&data_check_txt;
      file_recovery_new->file_check=&file_check_size;
      /* Ruby script */
      file_recovery_new->extension="rb";
      return 1;
    }
  }
  /* Everything below is heuristic */
  if(safe_header_only!=0)
  {
    return 0;
  }
  if(file_recovery->file_stat!=NULL)
  {
    /* Only a .html file found by the text hints may be interrupted */
    if(file_recovery->file_stat->file_hint == &file_hint_fasttxt ||
        file_recovery->file_stat->file_hint == &file_hint_txt)
    {
      if(strstr(file_recovery->filename,".html")==NULL)
        return 0;
    }
    else
      return 0;
  }
  if(buffer_lower_size<buffer_size_test+16)
  {
    free(buffer_lower);
    buffer_lower=NULL;
  }
  /* Don't malloc/free memory every time, small memory leak */
  if(buffer_lower==NULL)
  {
    buffer_lower_size=buffer_size_test+16;
    buffer_lower=(char *)MALLOC(buffer_lower_size);
  }
  l=UTF2Lat((unsigned char*)buffer_lower, buffer, buffer_size_test);
  if(l<10)
    return 0;
  {
    unsigned int line_nbr=0;
    unsigned int i;
    for(i=0; i<512 && i<l; i++)
    {
      if(buffer[i]=='\n')
        line_nbr++;
    }
    /* A text file must contains several lines */
    if(line_nbr==0)
      return 0;
  }
  if(strncasecmp((const char *)buffer, "rem ", 4)==0)
  {
    reset_file_recovery(file_recovery_new);
    file_recovery_new->data_check=&data_check_txt;
    file_recovery_new->file_check=&file_check_size;
    /* Dos/Windows batch */
    file_recovery_new->extension="bat";
    return 1;
  }
  if(strncasecmp((const char *)buffer, "dn: ", 4)==0)
  {
    reset_file_recovery(file_recovery_new);
    file_recovery_new->data_check=&data_check_txt;
    file_recovery_new->file_check=&file_check_size;
    file_recovery_new->extension="ldif";
    return 1;
  }
  {
    const char *ext=NULL;
    /* ind=~0: random
     * ind=~1: constant */
    double ind;
    unsigned int nbrf=0;
    unsigned int is_csv=1;
    char *str;
    /* Detect Fortran: count the lines starting with 6 spaces */
    {
      str=buffer_lower;
      while((str=strstr(str, "\n      "))!=NULL)
      {
        nbrf++;
        str++;
      }
    }
    /* Detect csv: every line must hold the same number of ';' as the first one */
    {
      unsigned int csv_per_line_current=0;
      unsigned int csv_per_line=0;
      unsigned int line_nbr=0;
      unsigned int i;
      for(i=0;i<l && is_csv>0;i++)
      {
        if(buffer_lower[i]==';')
        {
          csv_per_line_current++;
        }
        else if(buffer_lower[i]=='\n')
        {
          if(line_nbr==0)
            csv_per_line=csv_per_line_current;
          if(csv_per_line_current!=csv_per_line)
            is_csv=0;
          line_nbr++;
          csv_per_line_current=0;
        }
      }
      /* At least one separator per line and 10 lines are required */
      if(csv_per_line<1 || line_nbr<10)
        is_csv=0;
    }
    /* if(l>1) */
    {
      /* ind = sum(n*(n-1)) / (l*(l-1)) where n is the count of each byte value;
       * l>=10 here, so the division is safe */
      unsigned int stats[256];
      unsigned int i;
      memset(&stats, 0, sizeof(stats));
      for(i=0;i<l;i++)
        stats[(unsigned char)buffer_lower[i]]++;
      ind=0;
      for(i=0;i<256;i++)
        if(stats[i]>0)
          ind+=stats[i]*(stats[i]-1);
      ind=ind/l/(l-1);
    }
    /* Windows Autorun */
    if(strstr(buffer_lower, "[autorun]")!=NULL)
      ext="inf";
    /* Detect .ini */
    else if(buffer[0]=='[' && l>50 && is_ini(buffer_lower))
      ext="ini";
    /* php (Hypertext Preprocessor) script */
    else if(strstr(buffer_lower, "<?php")!=NULL)
      ext="php";
    /* Comma separated values */
    else if(is_csv>0)
      ext="csv";
    /* Detect LaTeX, C, PHP, JSP, ASP, HTML, C header */
    else if(strstr(buffer_lower, "\\begin{")!=NULL)
      ext="tex";
    else if(strstr(buffer_lower, "#include")!=NULL)
      ext="c";
    else if(l>20 && strstr(buffer_lower, "<%@")!=NULL)
      ext="jsp";
    else if(l>20 && strstr(buffer_lower, "<%=")!=NULL)
      ext="jsp";
    else if(l>20 && strstr(buffer_lower, "<% ")!=NULL)
      ext="asp";
    else if(strstr(buffer_lower, "<html")!=NULL)
      ext="html";
    else if(strstr(buffer_lower, "private static")!=NULL ||
        strstr(buffer_lower, "public interface")!=NULL)
    {
#ifdef DJGPP
      ext="jav";
#else
      ext="java";
#endif
    }
    else if(strstr(buffer_lower, "\nimport (")!=NULL)
    {
      ext="go";
    }
    else if((str=strstr(buffer_lower, "\nimport "))!=NULL)
    {
      /* An import statement ended by ';' on the same line is java, otherwise python */
      str+=8;
      while(*str!='\0' && *str!='\n' && *str!=';')
        str++;
      if(*str==';')
        ext="java";
      else
        ext="py";
    }
    else if(strstr(buffer_lower, "class ")!=NULL &&
        (l>=100 || file_recovery->file_stat==NULL))
    {
#ifdef DJGPP
      ext="jav";
#else
      ext="java";
#endif
    }
    /* Fortran */
    else if(nbrf>10 && ind<0.9 && strstr(buffer_lower, "integer")!=NULL)
      ext="f";
    /* LilyPond http://lilypond.org*/
    else if(strstr(buffer_lower, "\\score {")!=NULL)
      ext="ly";
    /* C header file */
    else if(strstr(buffer_lower, "/*")!=NULL && l>50)
      ext="h";
    /* Too short, too random or too constant to be plain text */
    else if(l<100 || ind<0.03 || ind>0.90)
      ext=NULL;
    /* JavaScript Object Notation  */
    else if(memcmp(buffer_lower, "{\"", 2)==0)
      ext="json";
    else
      ext=file_hint_txt.extension;
    if(ext==NULL)
      return 0;
    if(strcmp(ext,"txt")==0 &&
        (strstr(buffer_lower,"<br>")!=NULL || strstr(buffer_lower,"<p>")!=NULL))
    {
      ext="html";
    }
    if(file_recovery->file_stat!=NULL)
    {
      /* NOTE(review): this file_hint_doc branch cannot be reached, the filter
       * applied before the conversion already returned 0 for every hint other
       * than file_hint_fasttxt and file_hint_txt */
      if(file_recovery->file_stat->file_hint == &file_hint_doc)
      {
        unsigned int i;
        unsigned int txt_nl=0;
        /* file_recovery->filename is .doc */
        if(ind>0.20)
          return 0;
        /* Unix: \n (0xA)
         * Dos: \r\n (0xD 0xA)
         * Doc: \r (0xD) */
        for(i=0; i<l-1; i++)
        {
          if(buffer_lower[i]=='\r' && buffer_lower[i+1]!='\n')
            return 0;
        }
        for(i=0; i<l && i<512; i++)
          if(buffer_lower[i]=='\n')
            txt_nl++;
        if(txt_nl<=1)
          return 0;
      }
      else if(file_recovery->file_stat->file_hint == &file_hint_fasttxt ||
          file_recovery->file_stat->file_hint == &file_hint_txt)
      {
        /* file_recovery->filename is a .html */
        /* Search "<html" in the first 511 characters only; the write is
         * skipped when the allocation is too small to hold offset 511 */
        if(buffer_lower_size > 511)
          buffer_lower[511]='\0';
        if(strstr(buffer_lower, "<html")==NULL)
          return 0;
        /* Special case: two consecutive HTML files */
      }
    }
    reset_file_recovery(file_recovery_new);
    if(strcmp(ext, "html")==0)
    {
      file_recovery_new->file_rename=&file_rename_html;
      file_recovery_new->data_check=&data_check_html;
    }
    else
      file_recovery_new->data_check=&data_check_txt;
    file_recovery_new->file_check=&file_check_size;
    file_recovery_new->extension=ext;
    return 1;
  }
}
1426 
/*
 * File check for SMIL files: calls file_search_footer() with the "</smil>"
 * footer, then file_allow_nl() with the bare LF, CR LF and bare CR flags.
 */
static void file_check_smil(file_recovery_t *file_recovery)
{
  static const char smil_footer[]="</smil>";
  file_search_footer(file_recovery, smil_footer, sizeof(smil_footer)-1, 0);
  file_allow_nl(file_recovery, NL_BARENL|NL_CRLF|NL_BARECR);
}
1432 
/*
 * Header check for Synchronized Multimedia Integration Language files
 * http://en.wikipedia.org/wiki/Synchronized_Multimedia_Integration_Language
 * No additional test is done on the buffer: file_recovery_new is
 * initialised for a "smil" file checked by file_check_smil and 1 is
 * always returned.
 */
static int header_check_smil(const unsigned char *buffer, const unsigned int buffer_size, const unsigned int safe_header_only, const file_recovery_t *file_recovery, file_recovery_t *file_recovery_new)
{
  reset_file_recovery(file_recovery_new);
  file_recovery_new->extension="smil";
  file_recovery_new->file_check=&file_check_smil;
  file_recovery_new->data_check=&data_check_txt;
  return 1;
}
1443 
/*
 * Header check for StereoLithography files - STL Ascii format
 * http://www.ennex.com/~fabbers/StL.asp
 * "facet normal" must be present in the first 512 bytes of the buffer
 * (or in the whole buffer when it is shorter), otherwise 0 is returned.
 * On success file_recovery_new is initialised for a "stl" file and 1 is
 * returned.
 */
static int header_check_stl(const unsigned char *buffer, const unsigned int buffer_size, const unsigned int safe_header_only, const file_recovery_t *file_recovery, file_recovery_t *file_recovery_new)
{
  unsigned int search_size=512;
  if(buffer_size < search_size)
    search_size=buffer_size;
  if(td_memmem(buffer, search_size, "facet normal", 12)!=NULL)
  {
    reset_file_recovery(file_recovery_new);
    file_recovery_new->extension="stl";
    file_recovery_new->file_check=&file_check_size;
    file_recovery_new->data_check=&data_check_txt;
    return 1;
  }
  return 0;
}
1457 
/*
 * Header check for Scalable Vector Graphics.
 * No additional test is done on the buffer: file_recovery_new is
 * initialised for a "svg" file checked by file_check_svg (no data_check
 * is installed) and 1 is always returned.
 */
static int header_check_svg(const unsigned char *buffer, const unsigned int buffer_size, const unsigned int safe_header_only, const file_recovery_t *file_recovery, file_recovery_t *file_recovery_new)
{
  reset_file_recovery(file_recovery_new);
  file_recovery_new->file_check=&file_check_svg;
  file_recovery_new->extension="svg";
  return 1;
}
1466 
/*
 * Header check for "snz" files.
 * ".snz" must be present in the first 512 bytes of the buffer (or in the
 * whole buffer when it is shorter), otherwise 0 is returned.
 * On success file_recovery_new is initialised for a "snz" file whose
 * min_filesize is the offset of the ".snz" string, and 1 is returned.
 */
static int header_check_snz(const unsigned char *buffer, const unsigned int buffer_size, const unsigned int safe_header_only, const file_recovery_t *file_recovery, file_recovery_t *file_recovery_new)
{
  const unsigned char *match;
  unsigned int search_size=512;
  if(buffer_size < search_size)
    search_size=buffer_size;
  match=(const unsigned char *)td_memmem(buffer, search_size, ".snz", 4);
  if(match==NULL)
    return 0;
  reset_file_recovery(file_recovery_new);
  file_recovery_new->extension="snz";
  file_recovery_new->min_filesize=match-buffer;
  file_recovery_new->file_check=&file_check_size;
  file_recovery_new->data_check=&data_check_txt;
  return 1;
}
1480 
/*
 * Register header_check_snz for the "DEFAULT" signature at offset 0,
 * once with a LF and once with a CR LF line ending.
 */
static void register_header_check_snz(file_stat_t *file_stat)
{
  static const char snz_magic_lf[]="DEFAULT\n";
  static const char snz_magic_crlf[]="DEFAULT\r\n";
  /* sizeof()-1: the terminating NUL is not part of the signature */
  register_header_check(0, snz_magic_lf,   sizeof(snz_magic_lf)-1,   &header_check_snz, file_stat);
  register_header_check(0, snz_magic_crlf, sizeof(snz_magic_crlf)-1, &header_check_snz, file_stat);
}
1486 
/*
 * Register every signature handled by the fast text detection:
 * first all the entries of fasttxt_headers (up to the terminating entry
 * whose len is 0) with header_check_fasttxt, then the signatures that
 * have a dedicated header check.
 * The registration order of the original implementation is kept.
 */
static void register_header_check_fasttxt(file_stat_t *file_stat)
{
  /* "<?xml version=" preceded by the UTF-8 byte order mark */
  static const unsigned char header_xml_utf8[17]	= {0xef, 0xbb, 0xbf, '<', '?', 'x', 'm', 'l', ' ', 'v', 'e', 'r', 's', 'i', 'o', 'n', '='};
  /* "<?xml version=" as 16-bit little-endian characters preceded by 0xff 0xfe */
  static const unsigned char header_xml_utf16[30]	= {0xff, 0xfe, '<', 0, '?', 0, 'x', 0, 'm', 0, 'l', 0, ' ', 0, 'v', 0, 'e', 0, 'r', 0, 's', 0, 'i', 0, 'o', 0, 'n', 0, '=', 0};
  const txt_header_t *entry;
  for(entry=&fasttxt_headers[0]; entry->len > 0; entry++)
    register_header_check(0, entry->string, entry->len, &header_check_fasttxt, file_stat);

  register_header_check(4, "SC V10",		6,  &header_check_dc, file_stat);
  register_header_check(0, "DatasetHeader Begin", 19, &header_check_ers, file_stat);

  /* HTML; the "\n<!DOCTYPE html" and bare "<html" signatures are disabled */
  register_header_check(0, "<!DOCTYPE html",	14, &header_check_html, file_stat);
  register_header_check(0, "<!DOCTYPE HTML",	14, &header_check_html, file_stat);

  register_header_check(0, "BEGIN:VCALENDAR",	15, &header_check_ics, file_stat);

  /* Mailboxes and mail headers */
  register_header_check(0, "From - ",		 7, &header_check_thunderbird, file_stat);
  register_header_check(0, "From ",		 5, &header_check_mbox, file_stat);
  register_header_check(0, "Message-ID: ",	12, &header_check_mbox, file_stat);
  register_header_check(0, "MIME-Version:",	13, &header_check_mbox, file_stat);
  register_header_check(0, "Received: from ",	15, &header_check_mbox, file_stat);
  register_header_check(0, "Reply-To: ",	10, &header_check_mbox, file_stat);
  register_header_check(0, "Return-path: ",	13, &header_check_mbox, file_stat);
  register_header_check(0, "Return-Path: ",	13, &header_check_mbox, file_stat);

  register_header_check(0, "package ",		 8, &header_check_perlm, file_stat);
  register_header_check(0, "package\t",		 8, &header_check_perlm, file_stat);
  register_header_check(0, "{\\rtf",		 5, &header_check_rtf, file_stat);
  register_header_check(0, "<smil>",		 6, &header_check_smil, file_stat);
  register_header_check(0, "solid ",		 6, &header_check_stl, file_stat);

  /* XML: plain, with UTF-8 byte order mark, UTF-16 */
  register_header_check(0, "<?xml version=",	14, &header_check_xml, file_stat);
  register_header_check(0, header_xml_utf8, sizeof(header_xml_utf8), &header_check_xml_utf8, file_stat);
  register_header_check(0, header_xml_utf16, sizeof(header_xml_utf16), &header_check_xml_utf16, file_stat);

  /* Veeam Backup */
  register_header_check(0, "<BackupMeta Version=",	20, &header_check_vbm, file_stat);
  /* TinyTag */
  register_header_check(0, "FF 09 FF FF FF FF FF FF FF FF FF FF FF FF FF FF FFFF 00", 55, &header_check_ttd, file_stat);
  /* Adobe XMP */
  register_header_check(0, "<x:xmpmeta xmlns:x=\"adobe:ns:meta/\"", 35, &header_check_xmp, file_stat);
  /* Scalable Vector Graphics */
  register_header_check(0, "<svg xmlns=\"http://www.w3.org/2000/svg\"", 39, &header_check_svg, file_stat);
}
1527