1 /*
2      This file is part of libextractor.
3      Copyright (C) 2004, 2005, 2006, 2007, 2009, 2012, 2018 Vidyut Samanta and Christian Grothoff
4 
5      libextractor is free software; you can redistribute it and/or modify
6      it under the terms of the GNU General Public License as published
7      by the Free Software Foundation; either version 3, or (at your
8      option) any later version.
9 
10      libextractor is distributed in the hope that it will be useful, but
11      WITHOUT ANY WARRANTY; without even the implied warranty of
12      MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13      General Public License for more details.
14 
15      You should have received a copy of the GNU General Public License
16      along with libextractor; see the file COPYING.  If not, write to the
17      Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18      Boston, MA 02110-1301, USA.
19 
20      This code makes extensive use of libgsf
21      -- the Gnome Structured File Library
22      Copyright Copyright (C) 2002-2004 Jody Goldberg (jody@gnome.org)
23 
24      Part of this code was adapted from wordleaker.
25 */
26 /**
27  * @file plugins/ole2_extractor.c
28  * @brief plugin to support OLE2 (DOC, XLS, etc.) files
29  * @author Christian Grothoff
30  */
31 #include "platform.h"
32 #include "extractor.h"
33 #include "convert.h"
34 #include <glib-object.h>
35 #include <string.h>
36 #include <stdio.h>
37 #include <ctype.h>
38 #include <gsf/gsf-utils.h>
39 #include <gsf/gsf-input-impl.h>
40 #include <gsf/gsf-input-memory.h>
41 #include <gsf/gsf-impl-utils.h>
42 #include <gsf/gsf-infile.h>
43 #include <gsf/gsf-infile-msole.h>
44 #include <gsf/gsf-msole-utils.h>
45 
46 
/**
 * Selects the GsfInput implementation used by this plugin.
 *
 * 1: use our own GsfInput subclass, which supports seeking and
 *    thus can handle very large files.
 * 0: use the simple gsf in-memory buffer (which can only access
 *    the first ~16k); intended for debugging only.
 */
#define USE_LE_INPUT 1
54 
55 
56 /**
57  * Give the given UTF8 string to LE by calling 'proc'.
58  *
59  * @param proc callback to invoke
60  * @param proc_cls closure for proc
61  * @param phrase metadata string to pass; may include spaces
62  *        just double-quotes or just a space in a double quote;
63  *        in those cases, nothing should be done
64  * @param type meta data type to use
65  * @return if 'proc' returned 1, otherwise 0
66  */
67 static int
add_metadata(EXTRACTOR_MetaDataProcessor proc,void * proc_cls,const char * phrase,enum EXTRACTOR_MetaType type)68 add_metadata (EXTRACTOR_MetaDataProcessor proc,
69               void *proc_cls,
70               const char *phrase,
71               enum EXTRACTOR_MetaType type)
72 {
73   char *tmp;
74   int ret;
75 
76   if (0 == strlen (phrase))
77     return 0;
78   if (0 == strcmp (phrase, "\"\""))
79     return 0;
80   if (0 == strcmp (phrase, "\" \""))
81     return 0;
82   if (0 == strcmp (phrase, " "))
83     return 0;
84   if (NULL == (tmp = strdup (phrase)))
85     return 0;
86 
87   while ( (strlen (tmp) > 0) &&
88           (isblank ((unsigned char) tmp [strlen (tmp) - 1])) )
89     tmp [strlen (tmp) - 1] = '\0';
90   ret = proc (proc_cls,
91               "ole2",
92               type,
93               EXTRACTOR_METAFORMAT_UTF8,
94               "text/plain",
95               tmp,
96               strlen (tmp) + 1);
97   free (tmp);
98   return ret;
99 }
100 
101 
102 /**
103  * Entry in the map from OLE meta type  strings
104  * to LE types.
105  */
106 struct Matches
107 {
108   /**
109    * OLE description.
110    */
111   const char *text;
112 
113   /**
114    * Corresponding LE type.
115    */
116   enum EXTRACTOR_MetaType type;
117 };
118 
119 
120 static struct Matches tmap[] = {
121   { "Title", EXTRACTOR_METATYPE_TITLE },
122   { "PresentationFormat", EXTRACTOR_METATYPE_FORMAT },
123   { "Category", EXTRACTOR_METATYPE_SECTION },
124   { "Manager", EXTRACTOR_METATYPE_MANAGER },
125   { "Company", EXTRACTOR_METATYPE_COMPANY },
126   { "Subject", EXTRACTOR_METATYPE_SUBJECT },
127   { "Author", EXTRACTOR_METATYPE_AUTHOR_NAME },
128   { "Keywords", EXTRACTOR_METATYPE_KEYWORDS },
129   { "Comments", EXTRACTOR_METATYPE_COMMENT },
130   { "Template", EXTRACTOR_METATYPE_TEMPLATE },
131   { "NumPages", EXTRACTOR_METATYPE_PAGE_COUNT },
132   { "AppName", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE },
133   { "RevisionNumber", EXTRACTOR_METATYPE_REVISION_NUMBER },
134   { "NumBytes", EXTRACTOR_METATYPE_EMBEDDED_FILE_SIZE },
135   { "CreatedTime", EXTRACTOR_METATYPE_CREATION_DATE },
136   { "LastSavedTime", EXTRACTOR_METATYPE_MODIFICATION_DATE },
137   { "gsf:company", EXTRACTOR_METATYPE_COMPANY },
138   { "gsf:character-count", EXTRACTOR_METATYPE_CHARACTER_COUNT },
139   { "gsf:page-count", EXTRACTOR_METATYPE_PAGE_COUNT },
140   { "gsf:line-count", EXTRACTOR_METATYPE_LINE_COUNT },
141   { "gsf:word-count", EXTRACTOR_METATYPE_WORD_COUNT },
142   { "gsf:paragraph-count", EXTRACTOR_METATYPE_PARAGRAPH_COUNT },
143   { "gsf:last-saved-by", EXTRACTOR_METATYPE_LAST_SAVED_BY },
144   { "gsf:manager", EXTRACTOR_METATYPE_MANAGER },
145   { "dc:title", EXTRACTOR_METATYPE_TITLE },
146   { "dc:creator", EXTRACTOR_METATYPE_CREATOR },
147   { "dc:date", EXTRACTOR_METATYPE_UNKNOWN_DATE },
148   { "dc:subject", EXTRACTOR_METATYPE_SUBJECT },
149   { "dc:keywords", EXTRACTOR_METATYPE_KEYWORDS },
150   { "dc:last-printed", EXTRACTOR_METATYPE_LAST_PRINTED },
151   { "dc:description", EXTRACTOR_METATYPE_DESCRIPTION },
152   { "meta:creation-date", EXTRACTOR_METATYPE_CREATION_DATE },
153   { "meta:generator", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
154   { "meta:template", EXTRACTOR_METATYPE_TEMPLATE },
155   { "meta:editing-cycles", EXTRACTOR_METATYPE_EDITING_CYCLES },
156   /* { "Dictionary", EXTRACTOR_METATYPE_LANGUAGE },  */
157   /* { "gsf:security", EXTRACTOR_SECURITY }, */
158   /* { "gsf:scale", EXTRACTOR_SCALE }, // always "false"? */
159   /* { "meta:editing-duration", EXTRACTOR_METATYPE_TOTAL_EDITING_TIME }, // encoding? */
160   /* { "msole:codepage", EXTRACTOR_CHARACTER_SET }, */
161   { NULL, 0 }
162 };
163 
164 
165 /**
166  * Closure for 'process_metadata'.
167  */
168 struct ProcContext
169 {
170   /**
171    * Function to call for meta data that was found.
172    */
173   EXTRACTOR_MetaDataProcessor proc;
174 
175   /**
176    * Closure for @e proc.
177    */
178   void *proc_cls;
179 
180   /**
181    * Return value; 0 to continue to extract, 1 if we are done
182    */
183   int ret;
184 };
185 
186 
187 /**
188  * Function invoked by 'gst_msole_metadata_read' with
189  * metadata found in the document.
190  *
191  * @param key 'const char *' describing the meta data
192  * @param value the UTF8 representation of the meta data
193  * @param user_data our 'struct ProcContext' (closure)
194  */
195 static void
process_metadata(gpointer key,gpointer value,gpointer user_data)196 process_metadata (gpointer key,
197                   gpointer value,
198                   gpointer user_data)
199 {
200   const char *type = key;
201   const GsfDocProp *prop = value;
202   struct ProcContext *pc = user_data;
203   const GValue *gval;
204   char *contents;
205   int pos;
206 
207   if ( (NULL == key) ||
208        (NULL == value) )
209     return;
210   if (0 != pc->ret)
211     return;
212   gval = gsf_doc_prop_get_val (prop);
213 
214   if (G_VALUE_TYPE (gval) == G_TYPE_STRING)
215   {
216     const char *gvals;
217 
218     gvals = g_value_get_string (gval);
219     if (NULL == gvals)
220       return;
221     contents = strdup (gvals);
222   }
223   else
224   {
225     /* convert other formats? */
226     contents = g_strdup_value_contents (gval);
227   }
228   if (NULL == contents)
229     return;
230   if (0 == strcmp (type,
231                    "meta:generator"))
232   {
233     const char *mimetype = "application/vnd.ms-files";
234     struct
235     {
236       const char *v;
237       const char *m;
238     } mm[] = {
239       { "Microsoft Word", "application/msword" },
240       { "Microsoft Office Word", "application/msword" },
241       { "Microsoft Excel", "application/vnd.ms-excel" },
242       { "Microsoft Office Excel", "application/vnd.ms-excel" },
243       { "Microsoft PowerPoint", "application/vnd.ms-powerpoint" },
244       { "Microsoft Office PowerPoint", "application/vnd.ms-powerpoint"},
245       { "Microsoft Project", "application/vnd.ms-project" },
246       { "Microsoft Visio", "application/vnd.visio" },
247       { "Microsoft Office", "application/vnd.ms-office" },
248       { NULL, NULL }
249     };
250     int i;
251 
252     for (i = 0; NULL != mm[i].v; i++)
253       if (0 == strncmp (value,
254                         mm[i].v,
255                         strlen (mm[i].v) + 1))
256       {
257         mimetype = mm[i].m;
258         break;
259       }
260     if (0 != add_metadata (pc->proc,
261                            pc->proc_cls,
262                            mimetype,
263                            EXTRACTOR_METATYPE_MIMETYPE))
264     {
265       free (contents);
266       pc->ret = 1;
267       return;
268     }
269   }
270   for (pos = 0; NULL != tmap[pos].text; pos++)
271     if (0 == strcmp (tmap[pos].text,
272                      type))
273       break;
274   if ( (NULL != tmap[pos].text) &&
275        (0 != add_metadata (pc->proc, pc->proc_cls,
276                            contents,
277                            tmap[pos].type)) )
278   {
279     free (contents);
280     pc->ret = 1;
281     return;
282   }
283   free (contents);
284 }
285 
286 
287 /**
288  * Function called on (Document)SummaryInformation OLE
289  * streams.
290  *
291  * @param in the input OLE stream
292  * @param proc function to call on meta data found
293  * @param proc_cls closure for proc
294  * @return 0 to continue to extract, 1 if we are done
295  */
296 static int
process(GsfInput * in,EXTRACTOR_MetaDataProcessor proc,void * proc_cls)297 process (GsfInput *in,
298          EXTRACTOR_MetaDataProcessor proc,
299          void *proc_cls)
300 {
301   struct ProcContext pc;
302   GsfDocMetaData *sections;
303   GError *error;
304 
305   pc.proc = proc;
306   pc.proc_cls = proc_cls;
307   pc.ret = 0;
308   sections = gsf_doc_meta_data_new ();
309 #ifdef HAVE_GSF_DOC_META_DATA_READ_FROM_MSOLE
310   error = gsf_doc_meta_data_read_from_msole (sections, in);
311 #else
312   error = gsf_msole_metadata_read (in, sections);
313 #endif
314   if (NULL == error)
315   {
316     gsf_doc_meta_data_foreach (sections,
317                                &process_metadata,
318                                &pc);
319   }
320   else
321   {
322     g_error_free (error);
323   }
324   g_object_unref (G_OBJECT (sections));
325   return pc.ret;
326 }
327 
328 
329 /**
330  * Function called on SfxDocumentInfo OLE
331  * streams.
332  *
333  * @param in the input OLE stream
334  * @param proc function to call on meta data found
335  * @param proc_cls closure for proc
336  * @return 0 to continue to extract, 1 if we are done
337  */
338 static int
process_star_office(GsfInput * src,EXTRACTOR_MetaDataProcessor proc,void * proc_cls)339 process_star_office (GsfInput *src,
340                      EXTRACTOR_MetaDataProcessor proc,
341                      void *proc_cls)
342 {
343   off_t size = gsf_input_size (src);
344 
345   if ( (size < 0x374) ||
346        (size > 4 * 1024 * 1024) ) /* == 0x375?? */
347     return 0;
348   {
349     char buf[size];
350 
351     gsf_input_read (src, size, (unsigned char*) buf);
352     if ( (buf[0] != 0x0F) ||
353          (buf[1] != 0x0) ||
354          (0 != strncmp (&buf[2],
355                         "SfxDocumentInfo",
356                         strlen ("SfxDocumentInfo"))) ||
357          (buf[0x11] != 0x0B) ||
358          (buf[0x13] != 0x00) || /* pw protected! */
359          (buf[0x12] != 0x00) )
360       return 0;
361     buf[0xd3] = '\0';
362     if ( (buf[0x94] + buf[0x93] > 0) &&
363          (0 != add_metadata (proc, proc_cls,
364                              &buf[0x95],
365                              EXTRACTOR_METATYPE_TITLE)) )
366       return 1;
367     buf[0x114] = '\0';
368     if ( (buf[0xd5] + buf[0xd4] > 0) &&
369          (0 != add_metadata (proc, proc_cls,
370                              &buf[0xd6],
371                              EXTRACTOR_METATYPE_SUBJECT)) )
372       return 1;
373     buf[0x215] = '\0';
374     if ( (buf[0x115] + buf[0x116] > 0) &&
375          (0 != add_metadata (proc, proc_cls,
376                              &buf[0x117],
377                              EXTRACTOR_METATYPE_COMMENT)) )
378       return 1;
379     buf[0x296] = '\0';
380     if ( (buf[0x216] + buf[0x217] > 0) &&
381          (0 != add_metadata (proc, proc_cls,
382                              &buf[0x218],
383                              EXTRACTOR_METATYPE_KEYWORDS)) )
384       return 1;
385     /* fixme: do timestamps,
386        mime-type, user-defined info's */
387   }
388   return 0;
389 }
390 
391 
/**
 * We use "__" to translate using the "iso-639" text domain
 * (plain "_" is used for strings that are not taken from there).
 *
 * @param a string to translate
 * @return translated string
 */
#define __(a) dgettext ("iso-639", a)


/**
 * Get the language string for the given language ID (lid)
 * value.
 *
 * @param lid language id value
 * @return language string corresponding to the lid,
 *         NULL if the lid is not known
 */
static const char *
lid_to_language (unsigned int lid)
{
  switch (lid)
  {
  case 0x0400:
    return _ ("No Proofing");
  case 0x0401:
    return __ ("Arabic");
  case 0x0402:
    return __ ("Bulgarian");
  case 0x0403:
    return __ ("Catalan");
  case 0x0404:
    return _ ("Traditional Chinese");
  case 0x0804:
    return _ ("Simplified Chinese");
  case 0x0405:
    /* 0x0405 is the language ID of Czech (not Chechen) */
    return __ ("Czech");
  case 0x0406:
    return __ ("Danish");
  case 0x0407:
    return __ ("German");
  case 0x0807:
    return _ ("Swiss German");
  case 0x0408:
    return __ ("Greek");
  case 0x0409:
    return _ ("U.S. English");
  case 0x0809:
    return _ ("U.K. English");
  case 0x0c09:
    return _ ("Australian English");
  case 0x040a:
    return _ ("Castilian Spanish");
  case 0x080a:
    return _ ("Mexican Spanish");
  case 0x040b:
    return __ ("Finnish");
  case 0x040c:
    return __ ("French");
  case 0x080c:
    return _ ("Belgian French");
  case 0x0c0c:
    return _ ("Canadian French");
  case 0x100c:
    return _ ("Swiss French");
  case 0x040d:
    return __ ("Hebrew");
  case 0x040e:
    return __ ("Hungarian");
  case 0x040f:
    return __ ("Icelandic");
  case 0x0410:
    return __ ("Italian");
  case 0x0810:
    return _ ("Swiss Italian");
  case 0x0411:
    return __ ("Japanese");
  case 0x0412:
    return __ ("Korean");
  case 0x0413:
    return __ ("Dutch");
  case 0x0813:
    return _ ("Belgian Dutch");
  case 0x0414:
    return _ ("Norwegian Bokmal");
  case 0x0814:
    return __ ("Norwegian Nynorsk");
  case 0x0415:
    return __ ("Polish");
  case 0x0416:
    return __ ("Brazilian Portuguese");
  case 0x0816:
    return __ ("Portuguese");
  case 0x0417:
    return _ ("Rhaeto-Romanic");
  case 0x0418:
    return __ ("Romanian");
  case 0x0419:
    return __ ("Russian");
  case 0x041a:
    return _ ("Croato-Serbian (Latin)");
  case 0x081a:
    return _ ("Serbo-Croatian (Cyrillic)");
  case 0x041b:
    return __ ("Slovak");
  case 0x041c:
    return __ ("Albanian");
  case 0x041d:
    return __ ("Swedish");
  case 0x041e:
    return __ ("Thai");
  case 0x041f:
    return __ ("Turkish");
  case 0x0420:
    return __ ("Urdu");
  case 0x0421:
    return __ ("Bahasa");
  case 0x0422:
    return __ ("Ukrainian");
  case 0x0423:
    return __ ("Byelorussian");
  case 0x0424:
    return __ ("Slovenian");
  case 0x0425:
    return __ ("Estonian");
  case 0x0426:
    return __ ("Latvian");
  case 0x0427:
    return __ ("Lithuanian");
  case 0x0429:
    return _ ("Farsi");
  case 0x042D:
    return __ ("Basque");
  case 0x042F:
    return __ ("Macedonian");
  case 0x0436:
    return __ ("Afrikaans");
  case 0x043E:
    return __ ("Malayalam");
  default:
    return NULL;
  }
}
533 
534 
535 /**
536  * Extract editing history from XTable stream.
537  *
538  * @param stream OLE stream to process
539  * @param lcSttbSavedBy length of the revision history in bytes
540  * @param fcSttbSavedBy offset of the revision history in the stream
541  * @param proc function to call on meta data found
542  * @param proc_cls closure for proc
543  * @return 0 to continue to extract, 1 if we are done
544  */
545 static int
history_extract(GsfInput * stream,unsigned int lcbSttbSavedBy,unsigned int fcSttbSavedBy,EXTRACTOR_MetaDataProcessor proc,void * proc_cls)546 history_extract (GsfInput *stream,
547                  unsigned int lcbSttbSavedBy,
548                  unsigned int fcSttbSavedBy,
549                  EXTRACTOR_MetaDataProcessor proc,
550                  void *proc_cls)
551 {
552   unsigned int where;
553   unsigned char *lbuffer;
554   unsigned int i;
555   unsigned int length;
556   char *author;
557   char *filename;
558   char *rbuf;
559   unsigned int nRev;
560   int ret;
561 
562   /* goto offset of revision information */
563   gsf_input_seek (stream, fcSttbSavedBy, G_SEEK_SET);
564   if (gsf_input_remaining (stream) < lcbSttbSavedBy)
565     return 0;
566   if (NULL == (lbuffer = malloc (lcbSttbSavedBy)))
567     return 0;
568   /* read all the revision history */
569   gsf_input_read (stream, lcbSttbSavedBy, lbuffer);
570   /* there are n strings, so n/2 revisions (author & file) */
571   nRev = (lbuffer[2] + (lbuffer[3] << 8)) / 2;
572   where = 6;
573   ret = 0;
574   for (i = 0; i < nRev; i++)
575   {
576     if (where >= lcbSttbSavedBy)
577       break;
578     length = lbuffer[where++];
579     if ( (where + 2 * length + 2 >= lcbSttbSavedBy) ||
580          (where + 2 * length + 2 <= where) )
581       break;
582     author = EXTRACTOR_common_convert_to_utf8 ((const char*) &lbuffer[where],
583                                                length * 2,
584                                                "UTF-16BE");
585     where += length * 2 + 1;
586     length = lbuffer[where++];
587     if ( (where + 2 * length >= lcbSttbSavedBy) ||
588          (where + 2 * length + 1 <= where) )
589     {
590       if (NULL != author)
591         free (author);
592       break;
593     }
594     filename = EXTRACTOR_common_convert_to_utf8 ((const char*) &lbuffer[where],
595                                                  length * 2,
596                                                  "UTF-16BE");
597     where += length * 2 + 1;
598     if ( (NULL != author) &&
599          (NULL != filename) )
600     {
601       size_t bsize;
602 
603       bsize = strlen (author) + strlen (filename) + 512;
604       if (NULL != (rbuf = malloc (bsize)))
605       {
606         int snret;
607 
608         snret = snprintf (rbuf,
609                           bsize,
610                           _ ("Revision #%u: Author `%s' worked on `%s'"),
611                           i,
612                           author,
613                           filename);
614         if ( (-1 != snret) &&
615              (bsize > (size_t) snret) )
616         {
617           ret = add_metadata (proc,
618                               proc_cls,
619                               rbuf,
620                               EXTRACTOR_METATYPE_REVISION_HISTORY);
621         }
622         free (rbuf);
623       }
624     }
625     if (NULL != author)
626       free (author);
627     if (NULL != filename)
628       free (filename);
629     if (0 != ret)
630       break;
631   }
632   free (lbuffer);
633   return ret;
634 }
635 
636 
637 /* *************************** custom GSF input method ***************** */
638 
/* Type, cast and type-check macros for the LeInput class. */
#define LE_TYPE_INPUT \
  (le_input_get_type ())
#define LE_INPUT(obj) \
  (G_TYPE_CHECK_INSTANCE_CAST ((obj), LE_TYPE_INPUT, LeInput))
#define LE_INPUT_CLASS(klass) \
  (G_TYPE_CHECK_CLASS_CAST ((klass), LE_TYPE_INPUT, LeInputClass))
#define IS_LE_INPUT(obj) \
  (G_TYPE_CHECK_INSTANCE_TYPE ((obj), LE_TYPE_INPUT))
#define IS_LE_INPUT_CLASS(klass) \
  (G_TYPE_CHECK_CLASS_TYPE ((klass), LE_TYPE_INPUT))
#define LE_INPUT_GET_CLASS(obj) \
  (G_TYPE_INSTANCE_GET_CLASS ((obj), LE_TYPE_INPUT, LeInputClass))
653 
/**
 * Private, per-instance state of an "LeInput" object.
 */
struct _LeInputPrivate
{
  /**
   * Extraction context this input reads from.
   */
  struct EXTRACTOR_ExtractContext *ec;
};

typedef struct _LeInputPrivate LeInputPrivate;
664 
665 
666 /**
667  * Overall state of an "LeInput" object.
668  */
669 typedef struct _LeInput
670 {
671   /**
672    * Inherited state from parent (GsfInput).
673    */
674   GsfInput input;
675 
676   /*< private > */
677   /**
678    * Private state of the LeInput.
679    */
680   LeInputPrivate *priv;
681 } LeInput;
682 
683 
684 /**
685  * LeInput's class state.
686  */
687 typedef struct _LeInputClass
688 {
689   /**
690    * GsfInput is our parent class.
691    */
692   GsfInputClass parent_class;
693 
694   /* Padding for future expansion */
695   void (*_gtk_reserved1)(void);
696   void (*_gtk_reserved2)(void);
697   void (*_gtk_reserved3)(void);
698   void (*_gtk_reserved4)(void);
699 } LeInputClass;
700 
701 
702 /**
703  * Constructor for LeInput objects.
704  *
705  * @param ec extraction context to use
706  * @return the LeInput, NULL on error
707  */
708 GsfInput *
709 le_input_new (struct EXTRACTOR_ExtractContext *ec);
710 
711 
712 /**
713  * Class initializer for the "LeInput" class.
714  *
715  * @param class class object to initialize
716  */
717 static void
718 le_input_class_init (LeInputClass *class);
719 
720 
721 /**
722  * Initialize internal state of fresh input object.
723  *
724  * @param input object to initialize
725  */
726 static void
727 le_input_init (LeInput *input);
728 
729 
730 /**
731  * Macro to create LeInput type definition and register the class.
732  */
GSF_CLASS(LeInput,le_input,le_input_class_init,le_input_init,GSF_INPUT_TYPE)733 GSF_CLASS (LeInput, le_input, le_input_class_init, le_input_init,
734            GSF_INPUT_TYPE)
735 
736 
737 /**
738  * Duplicate input, leaving the new one at the same offset.
739  *
740  * @param input the input to duplicate
741  * @param err location for error reporting, can be NULL
742  * @return NULL on error (always)
743  */
744 static GsfInput *
745 le_input_dup (GsfInput * input,
746               GError * *err)
747 {
748   if (NULL != err)
749     *err = g_error_new (gsf_input_error_id (), 0,
750                         "dup not supported on LeInput");
751   return NULL;
752 }
753 
754 
755 /**
756  * Read at least num_bytes. Does not change the current position if
757  * there is an error. Will only read if the entire amount can be
758  * read. Invalidates the buffer associated with previous calls to
759  * gsf_input_read.
760  *
761  * @param input
762  * @param num_bytes
763  * @param optional_buffer
764  * @return buffer where num_bytes data are available, or NULL on error
765  */
766 static const guint8 *
le_input_read(GsfInput * input,size_t num_bytes,guint8 * optional_buffer)767 le_input_read (GsfInput *input,
768                size_t num_bytes,
769                guint8 *optional_buffer)
770 {
771   LeInput *li = LE_INPUT (input);
772   struct EXTRACTOR_ExtractContext *ec;
773   void *buf;
774   uint64_t old_off;
775   ssize_t ret;
776 
777   ec = li->priv->ec;
778   old_off = ec->seek (ec->cls, 0, SEEK_CUR);
779   if (num_bytes
780       != (ret = ec->read (ec->cls,
781                           &buf,
782                           num_bytes)))
783   {
784     /* we don't support partial reads;
785  most other GsfInput implementations in this case
786  allocate some huge temporary buffer just to avoid
787  the partial read; we might need to do that as well!? */
788     ec->seek (ec->cls, SEEK_SET, old_off);
789     return NULL;
790   }
791   if (NULL != optional_buffer)
792   {
793     memcpy (optional_buffer, buf, num_bytes);
794     return optional_buffer;
795   }
796   return buf;
797 }
798 
799 
800 /**
801  * Move the current location in an input stream
802  *
803  * @param input stream to seek
804  * @param offset target offset
805  * @param whence determines to what the offset is relative to
806  * @return TRUE on error
807  */
808 static gboolean
le_input_seek(GsfInput * input,gsf_off_t offset,GSeekType whence)809 le_input_seek (GsfInput *input,
810                gsf_off_t offset,
811                GSeekType whence)
812 {
813   LeInput *li = LE_INPUT (input);
814   struct EXTRACTOR_ExtractContext *ec;
815   int w;
816   int64_t ret;
817 
818   ec = li->priv->ec;
819   switch (whence)
820   {
821   case G_SEEK_SET:
822     w = SEEK_SET;
823     break;
824   case G_SEEK_CUR:
825     w = SEEK_CUR;
826     break;
827   case G_SEEK_END:
828     w = SEEK_END;
829     break;
830   default:
831     return TRUE;
832   }
833   if (-1 ==
834       (ret = ec->seek (ec->cls,
835                        offset,
836                        w)))
837     return TRUE;
838   return FALSE;
839 }
840 
841 
842 /**
843  * Class initializer for the "LeInput" class.
844  *
845  * @param class class object to initialize
846  */
847 static void
le_input_class_init(LeInputClass * class)848 le_input_class_init (LeInputClass *class)
849 {
850   GsfInputClass *input_class;
851 
852   input_class = (GsfInputClass *) class;
853   input_class->Dup = le_input_dup;
854   input_class->Read = le_input_read;
855   input_class->Seek = le_input_seek;
856   g_type_class_add_private (class, sizeof (LeInputPrivate));
857 }
858 
859 
860 /**
861  * Initialize internal state of fresh input object.
862  *
863  * @param input object to initialize
864  */
865 static void
le_input_init(LeInput * input)866 le_input_init (LeInput *input)
867 {
868   LeInputPrivate *priv;
869 
870   input->priv =
871     G_TYPE_INSTANCE_GET_PRIVATE (input, LE_TYPE_INPUT,
872                                  LeInputPrivate);
873   priv = input->priv;
874   priv->ec = NULL;
875 }
876 
877 
878 /**
879  * Creates a new LeInput object.
880  *
881  * @param ec extractor context to wrap
882  * @return NULL on error
883  */
884 GsfInput *
le_input_new(struct EXTRACTOR_ExtractContext * ec)885 le_input_new (struct EXTRACTOR_ExtractContext *ec)
886 {
887   LeInput *input;
888 
889   input = g_object_new (LE_TYPE_INPUT, NULL);
890   gsf_input_set_size (GSF_INPUT (input),
891                       ec->get_size (ec->cls));
892   gsf_input_seek_emulate (GSF_INPUT (input),
893                           0);
894   input->input.name = NULL;
895   input->input.container = NULL;
896   input->priv->ec = ec;
897 
898   return GSF_INPUT (input);
899 }
900 
901 
902 /* *********************** end of custom GSF input method ************* */
903 
904 
905 /**
906  * Main entry method for the OLE2 extraction plugin.
907  *
908  * @param ec extraction context provided to the plugin
909  */
910 void
EXTRACTOR_ole2_extract_method(struct EXTRACTOR_ExtractContext * ec)911 EXTRACTOR_ole2_extract_method (struct EXTRACTOR_ExtractContext *ec)
912 {
913   GsfInput *input;
914   GsfInfile *infile;
915   GsfInput *src;
916   const char *name;
917   unsigned int i;
918   unsigned int lcb;
919   unsigned int fcb;
920   const unsigned char *data512;
921   unsigned int lid;
922   const char *lang;
923   int ret;
924   void *data;
925   uint64_t fsize;
926   ssize_t data_size;
927 
928   fsize = ec->get_size (ec->cls);
929   if (fsize < 512 + 898)
930   {
931     /* File too small for OLE2 */
932     return;   /* can hardly be OLE2 */
933   }
934   if (512 + 898 > (data_size = ec->read (ec->cls, &data, fsize)))
935   {
936     /* Failed to read minimum file size to buffer */
937     return;
938   }
939   data512 = (const unsigned char*) data + 512;
940   lid = data512[6] + (data512[7] << 8);
941   if ( (NULL != (lang = lid_to_language (lid))) &&
942        (0 != (ret = add_metadata (ec->proc, ec->cls,
943                                   lang,
944                                   EXTRACTOR_METATYPE_LANGUAGE))) )
945     return;
946   lcb = data512[726] + (data512[727] << 8) + (data512[728] << 16)
947         + (data512[729] << 24);
948   fcb = data512[722] + (data512[723] << 8) + (data512[724] << 16)
949         + (data512[725] << 24);
950   if (0 != ec->seek (ec->cls, 0, SEEK_SET))
951   {
952     /* seek failed!? */
953     return;
954   }
955 #if USE_LE_INPUT
956   if (NULL == (input = le_input_new (ec)))
957   {
958     fprintf (stderr, "le_input_new failed\n");
959     return;
960   }
961 #else
962   input = gsf_input_memory_new ((const guint8 *) data,
963                                 data_size,
964                                 FALSE);
965 #endif
966   if (NULL == (infile = gsf_infile_msole_new (input, NULL)))
967   {
968     g_object_unref (G_OBJECT (input));
969     return;
970   }
971   ret = 0;
972   for (i = 0; i<gsf_infile_num_children (infile); i++)
973   {
974     if (0 != ret)
975       break;
976     if (NULL == (name = gsf_infile_name_by_index (infile, i)))
977       continue;
978     src = NULL;
979     if ( ( (0 == strcmp (name, "\005SummaryInformation")) ||
980            (0 == strcmp (name, "\005DocumentSummaryInformation")) ) &&
981          (NULL != (src = gsf_infile_child_by_index (infile, i))) )
982       ret = process (src,
983                      ec->proc,
984                      ec->cls);
985     if ( (0 == strcmp (name, "SfxDocumentInfo")) &&
986          (NULL != (src = gsf_infile_child_by_index (infile, i))) )
987       ret = process_star_office (src,
988                                  ec->proc,
989                                  ec->cls);
990     if (NULL != src)
991       g_object_unref (G_OBJECT (src));
992   }
993   if (0 != ret)
994     goto CLEANUP;
995 
996   if (lcb < 6)
997     goto CLEANUP;
998   for (i = 0; i<gsf_infile_num_children (infile); i++)
999   {
1000     if (ret != 0)
1001       break;
1002     if (NULL == (name = gsf_infile_name_by_index (infile, i)))
1003       continue;
1004     if ( ( (0 == strcmp (name, "1Table")) ||
1005            (0 == strcmp (name, "0Table")) ) &&
1006          (NULL != (src = gsf_infile_child_by_index (infile, i))) )
1007     {
1008       ret = history_extract (src,
1009                              lcb,
1010                              fcb,
1011                              ec->proc, ec->cls);
1012       g_object_unref (G_OBJECT (src));
1013     }
1014   }
1015 CLEANUP:
1016   g_object_unref (G_OBJECT (infile));
1017   g_object_unref (G_OBJECT (input));
1018 }
1019 
1020 
1021 /**
1022  * Custom log function we give to GSF to disable logging.
1023  *
1024  * @param log_domain unused
1025  * @param log_level unused
1026  * @param message unused
1027  * @param user_data unused
1028  */
1029 static void
nolog(const gchar * log_domain,GLogLevelFlags log_level,const gchar * message,gpointer user_data)1030 nolog (const gchar *log_domain,
1031        GLogLevelFlags log_level,
1032        const gchar *message,
1033        gpointer user_data)
1034 {
1035   /* do nothing */
1036 }
1037 
1038 
1039 /**
1040  * OLE2 plugin constructor. Initializes glib and gsf, in particular
1041  * gsf logging is disabled.
1042  */
1043 void __attribute__ ((constructor))
ole2_ltdl_init()1044 ole2_ltdl_init ()
1045 {
1046 #if ! GLIB_CHECK_VERSION (2, 35, 0)
1047   g_type_init ();
1048 #endif
1049 #ifdef HAVE_GSF_INIT
1050   gsf_init ();
1051 #endif
1052   /* disable logging -- thanks, Jody! */
1053   g_log_set_handler ("libgsf:msole",
1054                      G_LOG_LEVEL_CRITICAL | G_LOG_LEVEL_WARNING,
1055                      &nolog, NULL);
1056 }
1057 
1058 
/**
 * OLE2 plugin destructor.  Shuts down gsf, provided it was
 * initialized by the constructor (i.e. HAVE_GSF_INIT is defined).
 */
void __attribute__ ((destructor))
ole2_ltdl_fini ()
{
#if defined(HAVE_GSF_INIT)
  gsf_shutdown ();
#endif
}
1069 
1070 
1071 /* end of ole2_extractor.c */
1072