1 /*
2 This file is part of libextractor.
3 Copyright (C) 2004, 2005, 2006, 2007, 2009, 2012, 2018 Vidyut Samanta and Christian Grothoff
4
5 libextractor is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published
7 by the Free Software Foundation; either version 3, or (at your
8 option) any later version.
9
10 libextractor is distributed in the hope that it will be useful, but
11 WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with libextractor; see the file COPYING. If not, write to the
17 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 Boston, MA 02110-1301, USA.
19
20 This code makes extensive use of libgsf
21 -- the Gnome Structured File Library
22 Copyright Copyright (C) 2002-2004 Jody Goldberg (jody@gnome.org)
23
24 Part of this code was adapted from wordleaker.
25 */
26 /**
27 * @file plugins/ole2_extractor.c
28 * @brief plugin to support OLE2 (DOC, XLS, etc.) files
29 * @author Christian Grothoff
30 */
31 #include "platform.h"
32 #include "extractor.h"
33 #include "convert.h"
34 #include <glib-object.h>
35 #include <string.h>
36 #include <stdio.h>
37 #include <ctype.h>
38 #include <gsf/gsf-utils.h>
39 #include <gsf/gsf-input-impl.h>
40 #include <gsf/gsf-input-memory.h>
41 #include <gsf/gsf-impl-utils.h>
42 #include <gsf/gsf-infile.h>
43 #include <gsf/gsf-infile-msole.h>
44 #include <gsf/gsf-msole-utils.h>
45
46
/**
 * Selects the GsfInput backend used by the plugin.
 *
 * 1: wrap the extraction context in our own GsfInput subclass
 *    (LeInput), which supports seeking and thus very large files.
 * 0: use the simple gsf in-memory buffer instead; it can only access
 *    the first ~16k of the file and is intended for debugging.
 */
#define USE_LE_INPUT 1
54
55
56 /**
57 * Give the given UTF8 string to LE by calling 'proc'.
58 *
59 * @param proc callback to invoke
60 * @param proc_cls closure for proc
61 * @param phrase metadata string to pass; may include spaces
62 * just double-quotes or just a space in a double quote;
63 * in those cases, nothing should be done
64 * @param type meta data type to use
65 * @return if 'proc' returned 1, otherwise 0
66 */
67 static int
add_metadata(EXTRACTOR_MetaDataProcessor proc,void * proc_cls,const char * phrase,enum EXTRACTOR_MetaType type)68 add_metadata (EXTRACTOR_MetaDataProcessor proc,
69 void *proc_cls,
70 const char *phrase,
71 enum EXTRACTOR_MetaType type)
72 {
73 char *tmp;
74 int ret;
75
76 if (0 == strlen (phrase))
77 return 0;
78 if (0 == strcmp (phrase, "\"\""))
79 return 0;
80 if (0 == strcmp (phrase, "\" \""))
81 return 0;
82 if (0 == strcmp (phrase, " "))
83 return 0;
84 if (NULL == (tmp = strdup (phrase)))
85 return 0;
86
87 while ( (strlen (tmp) > 0) &&
88 (isblank ((unsigned char) tmp [strlen (tmp) - 1])) )
89 tmp [strlen (tmp) - 1] = '\0';
90 ret = proc (proc_cls,
91 "ole2",
92 type,
93 EXTRACTOR_METAFORMAT_UTF8,
94 "text/plain",
95 tmp,
96 strlen (tmp) + 1);
97 free (tmp);
98 return ret;
99 }
100
101
102 /**
103 * Entry in the map from OLE meta type strings
104 * to LE types.
105 */
106 struct Matches
107 {
108 /**
109 * OLE description.
110 */
111 const char *text;
112
113 /**
114 * Corresponding LE type.
115 */
116 enum EXTRACTOR_MetaType type;
117 };
118
119
120 static struct Matches tmap[] = {
121 { "Title", EXTRACTOR_METATYPE_TITLE },
122 { "PresentationFormat", EXTRACTOR_METATYPE_FORMAT },
123 { "Category", EXTRACTOR_METATYPE_SECTION },
124 { "Manager", EXTRACTOR_METATYPE_MANAGER },
125 { "Company", EXTRACTOR_METATYPE_COMPANY },
126 { "Subject", EXTRACTOR_METATYPE_SUBJECT },
127 { "Author", EXTRACTOR_METATYPE_AUTHOR_NAME },
128 { "Keywords", EXTRACTOR_METATYPE_KEYWORDS },
129 { "Comments", EXTRACTOR_METATYPE_COMMENT },
130 { "Template", EXTRACTOR_METATYPE_TEMPLATE },
131 { "NumPages", EXTRACTOR_METATYPE_PAGE_COUNT },
132 { "AppName", EXTRACTOR_METATYPE_PRODUCED_BY_SOFTWARE },
133 { "RevisionNumber", EXTRACTOR_METATYPE_REVISION_NUMBER },
134 { "NumBytes", EXTRACTOR_METATYPE_EMBEDDED_FILE_SIZE },
135 { "CreatedTime", EXTRACTOR_METATYPE_CREATION_DATE },
136 { "LastSavedTime", EXTRACTOR_METATYPE_MODIFICATION_DATE },
137 { "gsf:company", EXTRACTOR_METATYPE_COMPANY },
138 { "gsf:character-count", EXTRACTOR_METATYPE_CHARACTER_COUNT },
139 { "gsf:page-count", EXTRACTOR_METATYPE_PAGE_COUNT },
140 { "gsf:line-count", EXTRACTOR_METATYPE_LINE_COUNT },
141 { "gsf:word-count", EXTRACTOR_METATYPE_WORD_COUNT },
142 { "gsf:paragraph-count", EXTRACTOR_METATYPE_PARAGRAPH_COUNT },
143 { "gsf:last-saved-by", EXTRACTOR_METATYPE_LAST_SAVED_BY },
144 { "gsf:manager", EXTRACTOR_METATYPE_MANAGER },
145 { "dc:title", EXTRACTOR_METATYPE_TITLE },
146 { "dc:creator", EXTRACTOR_METATYPE_CREATOR },
147 { "dc:date", EXTRACTOR_METATYPE_UNKNOWN_DATE },
148 { "dc:subject", EXTRACTOR_METATYPE_SUBJECT },
149 { "dc:keywords", EXTRACTOR_METATYPE_KEYWORDS },
150 { "dc:last-printed", EXTRACTOR_METATYPE_LAST_PRINTED },
151 { "dc:description", EXTRACTOR_METATYPE_DESCRIPTION },
152 { "meta:creation-date", EXTRACTOR_METATYPE_CREATION_DATE },
153 { "meta:generator", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE },
154 { "meta:template", EXTRACTOR_METATYPE_TEMPLATE },
155 { "meta:editing-cycles", EXTRACTOR_METATYPE_EDITING_CYCLES },
156 /* { "Dictionary", EXTRACTOR_METATYPE_LANGUAGE }, */
157 /* { "gsf:security", EXTRACTOR_SECURITY }, */
158 /* { "gsf:scale", EXTRACTOR_SCALE }, // always "false"? */
159 /* { "meta:editing-duration", EXTRACTOR_METATYPE_TOTAL_EDITING_TIME }, // encoding? */
160 /* { "msole:codepage", EXTRACTOR_CHARACTER_SET }, */
161 { NULL, 0 }
162 };
163
164
165 /**
166 * Closure for 'process_metadata'.
167 */
168 struct ProcContext
169 {
170 /**
171 * Function to call for meta data that was found.
172 */
173 EXTRACTOR_MetaDataProcessor proc;
174
175 /**
176 * Closure for @e proc.
177 */
178 void *proc_cls;
179
180 /**
181 * Return value; 0 to continue to extract, 1 if we are done
182 */
183 int ret;
184 };
185
186
187 /**
188 * Function invoked by 'gst_msole_metadata_read' with
189 * metadata found in the document.
190 *
191 * @param key 'const char *' describing the meta data
192 * @param value the UTF8 representation of the meta data
193 * @param user_data our 'struct ProcContext' (closure)
194 */
195 static void
process_metadata(gpointer key,gpointer value,gpointer user_data)196 process_metadata (gpointer key,
197 gpointer value,
198 gpointer user_data)
199 {
200 const char *type = key;
201 const GsfDocProp *prop = value;
202 struct ProcContext *pc = user_data;
203 const GValue *gval;
204 char *contents;
205 int pos;
206
207 if ( (NULL == key) ||
208 (NULL == value) )
209 return;
210 if (0 != pc->ret)
211 return;
212 gval = gsf_doc_prop_get_val (prop);
213
214 if (G_VALUE_TYPE (gval) == G_TYPE_STRING)
215 {
216 const char *gvals;
217
218 gvals = g_value_get_string (gval);
219 if (NULL == gvals)
220 return;
221 contents = strdup (gvals);
222 }
223 else
224 {
225 /* convert other formats? */
226 contents = g_strdup_value_contents (gval);
227 }
228 if (NULL == contents)
229 return;
230 if (0 == strcmp (type,
231 "meta:generator"))
232 {
233 const char *mimetype = "application/vnd.ms-files";
234 struct
235 {
236 const char *v;
237 const char *m;
238 } mm[] = {
239 { "Microsoft Word", "application/msword" },
240 { "Microsoft Office Word", "application/msword" },
241 { "Microsoft Excel", "application/vnd.ms-excel" },
242 { "Microsoft Office Excel", "application/vnd.ms-excel" },
243 { "Microsoft PowerPoint", "application/vnd.ms-powerpoint" },
244 { "Microsoft Office PowerPoint", "application/vnd.ms-powerpoint"},
245 { "Microsoft Project", "application/vnd.ms-project" },
246 { "Microsoft Visio", "application/vnd.visio" },
247 { "Microsoft Office", "application/vnd.ms-office" },
248 { NULL, NULL }
249 };
250 int i;
251
252 for (i = 0; NULL != mm[i].v; i++)
253 if (0 == strncmp (value,
254 mm[i].v,
255 strlen (mm[i].v) + 1))
256 {
257 mimetype = mm[i].m;
258 break;
259 }
260 if (0 != add_metadata (pc->proc,
261 pc->proc_cls,
262 mimetype,
263 EXTRACTOR_METATYPE_MIMETYPE))
264 {
265 free (contents);
266 pc->ret = 1;
267 return;
268 }
269 }
270 for (pos = 0; NULL != tmap[pos].text; pos++)
271 if (0 == strcmp (tmap[pos].text,
272 type))
273 break;
274 if ( (NULL != tmap[pos].text) &&
275 (0 != add_metadata (pc->proc, pc->proc_cls,
276 contents,
277 tmap[pos].type)) )
278 {
279 free (contents);
280 pc->ret = 1;
281 return;
282 }
283 free (contents);
284 }
285
286
287 /**
288 * Function called on (Document)SummaryInformation OLE
289 * streams.
290 *
291 * @param in the input OLE stream
292 * @param proc function to call on meta data found
293 * @param proc_cls closure for proc
294 * @return 0 to continue to extract, 1 if we are done
295 */
296 static int
process(GsfInput * in,EXTRACTOR_MetaDataProcessor proc,void * proc_cls)297 process (GsfInput *in,
298 EXTRACTOR_MetaDataProcessor proc,
299 void *proc_cls)
300 {
301 struct ProcContext pc;
302 GsfDocMetaData *sections;
303 GError *error;
304
305 pc.proc = proc;
306 pc.proc_cls = proc_cls;
307 pc.ret = 0;
308 sections = gsf_doc_meta_data_new ();
309 #ifdef HAVE_GSF_DOC_META_DATA_READ_FROM_MSOLE
310 error = gsf_doc_meta_data_read_from_msole (sections, in);
311 #else
312 error = gsf_msole_metadata_read (in, sections);
313 #endif
314 if (NULL == error)
315 {
316 gsf_doc_meta_data_foreach (sections,
317 &process_metadata,
318 &pc);
319 }
320 else
321 {
322 g_error_free (error);
323 }
324 g_object_unref (G_OBJECT (sections));
325 return pc.ret;
326 }
327
328
329 /**
330 * Function called on SfxDocumentInfo OLE
331 * streams.
332 *
333 * @param in the input OLE stream
334 * @param proc function to call on meta data found
335 * @param proc_cls closure for proc
336 * @return 0 to continue to extract, 1 if we are done
337 */
338 static int
process_star_office(GsfInput * src,EXTRACTOR_MetaDataProcessor proc,void * proc_cls)339 process_star_office (GsfInput *src,
340 EXTRACTOR_MetaDataProcessor proc,
341 void *proc_cls)
342 {
343 off_t size = gsf_input_size (src);
344
345 if ( (size < 0x374) ||
346 (size > 4 * 1024 * 1024) ) /* == 0x375?? */
347 return 0;
348 {
349 char buf[size];
350
351 gsf_input_read (src, size, (unsigned char*) buf);
352 if ( (buf[0] != 0x0F) ||
353 (buf[1] != 0x0) ||
354 (0 != strncmp (&buf[2],
355 "SfxDocumentInfo",
356 strlen ("SfxDocumentInfo"))) ||
357 (buf[0x11] != 0x0B) ||
358 (buf[0x13] != 0x00) || /* pw protected! */
359 (buf[0x12] != 0x00) )
360 return 0;
361 buf[0xd3] = '\0';
362 if ( (buf[0x94] + buf[0x93] > 0) &&
363 (0 != add_metadata (proc, proc_cls,
364 &buf[0x95],
365 EXTRACTOR_METATYPE_TITLE)) )
366 return 1;
367 buf[0x114] = '\0';
368 if ( (buf[0xd5] + buf[0xd4] > 0) &&
369 (0 != add_metadata (proc, proc_cls,
370 &buf[0xd6],
371 EXTRACTOR_METATYPE_SUBJECT)) )
372 return 1;
373 buf[0x215] = '\0';
374 if ( (buf[0x115] + buf[0x116] > 0) &&
375 (0 != add_metadata (proc, proc_cls,
376 &buf[0x117],
377 EXTRACTOR_METATYPE_COMMENT)) )
378 return 1;
379 buf[0x296] = '\0';
380 if ( (buf[0x216] + buf[0x217] > 0) &&
381 (0 != add_metadata (proc, proc_cls,
382 &buf[0x218],
383 EXTRACTOR_METATYPE_KEYWORDS)) )
384 return 1;
385 /* fixme: do timestamps,
386 mime-type, user-defined info's */
387 }
388 return 0;
389 }
390
391
/**
 * Translate a string via the "iso-639" gettext domain
 * (used for plain language names).
 *
 * @param a string to translate
 * @return translated string
 */
#define __(a) dgettext ("iso-639", a)
399
400
/**
 * Get the language string for the given language ID (lid)
 * value.  Names translated with '__' are looked up in the
 * "iso-639" domain, names translated with '_' in the
 * plugin's own domain.
 *
 * @param lid language id value
 * @return language string corresponding to the lid,
 *         NULL if the lid is not known
 */
static const char *
lid_to_language (unsigned int lid)
{
  switch (lid)
  {
  case 0x0400:
    return _ ("No Proofing");
  case 0x0401:
    return __ ("Arabic");
  case 0x0402:
    return __ ("Bulgarian");
  case 0x0403:
    return __ ("Catalan");
  case 0x0404:
    return _ ("Traditional Chinese");
  case 0x0804:
    return _ ("Simplified Chinese");
  case 0x0405:
    /* 0x0405 is cs-CZ */
    return __ ("Czech");
  case 0x0406:
    return __ ("Danish");
  case 0x0407:
    return __ ("German");
  case 0x0807:
    return _ ("Swiss German");
  case 0x0408:
    return __ ("Greek");
  case 0x0409:
    return _ ("U.S. English");
  case 0x0809:
    return _ ("U.K. English");
  case 0x0c09:
    return _ ("Australian English");
  case 0x040a:
    return _ ("Castilian Spanish");
  case 0x080a:
    return _ ("Mexican Spanish");
  case 0x040b:
    return __ ("Finnish");
  case 0x040c:
    return __ ("French");
  case 0x080c:
    return _ ("Belgian French");
  case 0x0c0c:
    return _ ("Canadian French");
  case 0x100c:
    return _ ("Swiss French");
  case 0x040d:
    return __ ("Hebrew");
  case 0x040e:
    return __ ("Hungarian");
  case 0x040f:
    return __ ("Icelandic");
  case 0x0410:
    return __ ("Italian");
  case 0x0810:
    return _ ("Swiss Italian");
  case 0x0411:
    return __ ("Japanese");
  case 0x0412:
    return __ ("Korean");
  case 0x0413:
    return __ ("Dutch");
  case 0x0813:
    return _ ("Belgian Dutch");
  case 0x0414:
    return _ ("Norwegian Bokmal");
  case 0x0814:
    return __ ("Norwegian Nynorsk");
  case 0x0415:
    return __ ("Polish");
  case 0x0416:
    return __ ("Brazilian Portuguese");
  case 0x0816:
    return __ ("Portuguese");
  case 0x0417:
    return _ ("Rhaeto-Romanic");
  case 0x0418:
    return __ ("Romanian");
  case 0x0419:
    return __ ("Russian");
  case 0x041a:
    return _ ("Croato-Serbian (Latin)");
  case 0x081a:
    return _ ("Serbo-Croatian (Cyrillic)");
  case 0x041b:
    return __ ("Slovak");
  case 0x041c:
    return __ ("Albanian");
  case 0x041d:
    return __ ("Swedish");
  case 0x041e:
    return __ ("Thai");
  case 0x041f:
    return __ ("Turkish");
  case 0x0420:
    return __ ("Urdu");
  case 0x0421:
    return __ ("Bahasa");
  case 0x0422:
    return __ ("Ukrainian");
  case 0x0423:
    return __ ("Byelorussian");
  case 0x0424:
    return __ ("Slovenian");
  case 0x0425:
    return __ ("Estonian");
  case 0x0426:
    return __ ("Latvian");
  case 0x0427:
    return __ ("Lithuanian");
  case 0x0429:
    return _ ("Farsi");
  case 0x042D:
    return __ ("Basque");
  case 0x042F:
    return __ ("Macedonian");
  case 0x0436:
    return __ ("Afrikaans");
  case 0x043E:
    /* 0x043E is ms-MY; Malayalam would be 0x044C */
    return __ ("Malay");
  default:
    return NULL;
  }
}
533
534
535 /**
536 * Extract editing history from XTable stream.
537 *
538 * @param stream OLE stream to process
539 * @param lcSttbSavedBy length of the revision history in bytes
540 * @param fcSttbSavedBy offset of the revision history in the stream
541 * @param proc function to call on meta data found
542 * @param proc_cls closure for proc
543 * @return 0 to continue to extract, 1 if we are done
544 */
545 static int
history_extract(GsfInput * stream,unsigned int lcbSttbSavedBy,unsigned int fcSttbSavedBy,EXTRACTOR_MetaDataProcessor proc,void * proc_cls)546 history_extract (GsfInput *stream,
547 unsigned int lcbSttbSavedBy,
548 unsigned int fcSttbSavedBy,
549 EXTRACTOR_MetaDataProcessor proc,
550 void *proc_cls)
551 {
552 unsigned int where;
553 unsigned char *lbuffer;
554 unsigned int i;
555 unsigned int length;
556 char *author;
557 char *filename;
558 char *rbuf;
559 unsigned int nRev;
560 int ret;
561
562 /* goto offset of revision information */
563 gsf_input_seek (stream, fcSttbSavedBy, G_SEEK_SET);
564 if (gsf_input_remaining (stream) < lcbSttbSavedBy)
565 return 0;
566 if (NULL == (lbuffer = malloc (lcbSttbSavedBy)))
567 return 0;
568 /* read all the revision history */
569 gsf_input_read (stream, lcbSttbSavedBy, lbuffer);
570 /* there are n strings, so n/2 revisions (author & file) */
571 nRev = (lbuffer[2] + (lbuffer[3] << 8)) / 2;
572 where = 6;
573 ret = 0;
574 for (i = 0; i < nRev; i++)
575 {
576 if (where >= lcbSttbSavedBy)
577 break;
578 length = lbuffer[where++];
579 if ( (where + 2 * length + 2 >= lcbSttbSavedBy) ||
580 (where + 2 * length + 2 <= where) )
581 break;
582 author = EXTRACTOR_common_convert_to_utf8 ((const char*) &lbuffer[where],
583 length * 2,
584 "UTF-16BE");
585 where += length * 2 + 1;
586 length = lbuffer[where++];
587 if ( (where + 2 * length >= lcbSttbSavedBy) ||
588 (where + 2 * length + 1 <= where) )
589 {
590 if (NULL != author)
591 free (author);
592 break;
593 }
594 filename = EXTRACTOR_common_convert_to_utf8 ((const char*) &lbuffer[where],
595 length * 2,
596 "UTF-16BE");
597 where += length * 2 + 1;
598 if ( (NULL != author) &&
599 (NULL != filename) )
600 {
601 size_t bsize;
602
603 bsize = strlen (author) + strlen (filename) + 512;
604 if (NULL != (rbuf = malloc (bsize)))
605 {
606 int snret;
607
608 snret = snprintf (rbuf,
609 bsize,
610 _ ("Revision #%u: Author `%s' worked on `%s'"),
611 i,
612 author,
613 filename);
614 if ( (-1 != snret) &&
615 (bsize > (size_t) snret) )
616 {
617 ret = add_metadata (proc,
618 proc_cls,
619 rbuf,
620 EXTRACTOR_METATYPE_REVISION_HISTORY);
621 }
622 free (rbuf);
623 }
624 }
625 if (NULL != author)
626 free (author);
627 if (NULL != filename)
628 free (filename);
629 if (0 != ret)
630 break;
631 }
632 free (lbuffer);
633 return ret;
634 }
635
636
637 /* *************************** custom GSF input method ***************** */
638
/* GObject type-system helpers for the LeInput class: type id,
   checked instance/class casts, type tests and class lookup. */
#define LE_TYPE_INPUT \
  (le_input_get_type ())
#define LE_INPUT(obj) \
  (G_TYPE_CHECK_INSTANCE_CAST ((obj), LE_TYPE_INPUT, LeInput))
#define LE_INPUT_CLASS(klass) \
  (G_TYPE_CHECK_CLASS_CAST ((klass), LE_TYPE_INPUT, LeInputClass))
#define IS_LE_INPUT(obj) \
  (G_TYPE_CHECK_INSTANCE_TYPE ((obj), LE_TYPE_INPUT))
#define IS_LE_INPUT_CLASS(klass) \
  (G_TYPE_CHECK_CLASS_TYPE ((klass), LE_TYPE_INPUT))
#define LE_INPUT_GET_CLASS(obj) \
  (G_TYPE_INSTANCE_GET_CLASS ((obj), LE_TYPE_INPUT, LeInputClass))
653
/**
 * Private per-instance data of an "LeInput" object.
 */
typedef struct _LeInputPrivate
{
  /**
   * Extraction context this input reads from and seeks in.
   */
  struct EXTRACTOR_ExtractContext *ec;
} LeInputPrivate;
664
665
666 /**
667 * Overall state of an "LeInput" object.
668 */
669 typedef struct _LeInput
670 {
671 /**
672 * Inherited state from parent (GsfInput).
673 */
674 GsfInput input;
675
676 /*< private > */
677 /**
678 * Private state of the LeInput.
679 */
680 LeInputPrivate *priv;
681 } LeInput;
682
683
684 /**
685 * LeInput's class state.
686 */
687 typedef struct _LeInputClass
688 {
689 /**
690 * GsfInput is our parent class.
691 */
692 GsfInputClass parent_class;
693
694 /* Padding for future expansion */
695 void (*_gtk_reserved1)(void);
696 void (*_gtk_reserved2)(void);
697 void (*_gtk_reserved3)(void);
698 void (*_gtk_reserved4)(void);
699 } LeInputClass;
700
701
702 /**
703 * Constructor for LeInput objects.
704 *
705 * @param ec extraction context to use
706 * @return the LeInput, NULL on error
707 */
708 GsfInput *
709 le_input_new (struct EXTRACTOR_ExtractContext *ec);
710
711
712 /**
713 * Class initializer for the "LeInput" class.
714 *
715 * @param class class object to initialize
716 */
717 static void
718 le_input_class_init (LeInputClass *class);
719
720
721 /**
722 * Initialize internal state of fresh input object.
723 *
724 * @param input object to initialize
725 */
726 static void
727 le_input_init (LeInput *input);
728
729
730 /**
731 * Macro to create LeInput type definition and register the class.
732 */
GSF_CLASS(LeInput,le_input,le_input_class_init,le_input_init,GSF_INPUT_TYPE)733 GSF_CLASS (LeInput, le_input, le_input_class_init, le_input_init,
734 GSF_INPUT_TYPE)
735
736
737 /**
738 * Duplicate input, leaving the new one at the same offset.
739 *
740 * @param input the input to duplicate
741 * @param err location for error reporting, can be NULL
742 * @return NULL on error (always)
743 */
744 static GsfInput *
745 le_input_dup (GsfInput * input,
746 GError * *err)
747 {
748 if (NULL != err)
749 *err = g_error_new (gsf_input_error_id (), 0,
750 "dup not supported on LeInput");
751 return NULL;
752 }
753
754
755 /**
756 * Read at least num_bytes. Does not change the current position if
757 * there is an error. Will only read if the entire amount can be
758 * read. Invalidates the buffer associated with previous calls to
759 * gsf_input_read.
760 *
761 * @param input
762 * @param num_bytes
763 * @param optional_buffer
764 * @return buffer where num_bytes data are available, or NULL on error
765 */
766 static const guint8 *
le_input_read(GsfInput * input,size_t num_bytes,guint8 * optional_buffer)767 le_input_read (GsfInput *input,
768 size_t num_bytes,
769 guint8 *optional_buffer)
770 {
771 LeInput *li = LE_INPUT (input);
772 struct EXTRACTOR_ExtractContext *ec;
773 void *buf;
774 uint64_t old_off;
775 ssize_t ret;
776
777 ec = li->priv->ec;
778 old_off = ec->seek (ec->cls, 0, SEEK_CUR);
779 if (num_bytes
780 != (ret = ec->read (ec->cls,
781 &buf,
782 num_bytes)))
783 {
784 /* we don't support partial reads;
785 most other GsfInput implementations in this case
786 allocate some huge temporary buffer just to avoid
787 the partial read; we might need to do that as well!? */
788 ec->seek (ec->cls, SEEK_SET, old_off);
789 return NULL;
790 }
791 if (NULL != optional_buffer)
792 {
793 memcpy (optional_buffer, buf, num_bytes);
794 return optional_buffer;
795 }
796 return buf;
797 }
798
799
800 /**
801 * Move the current location in an input stream
802 *
803 * @param input stream to seek
804 * @param offset target offset
805 * @param whence determines to what the offset is relative to
806 * @return TRUE on error
807 */
808 static gboolean
le_input_seek(GsfInput * input,gsf_off_t offset,GSeekType whence)809 le_input_seek (GsfInput *input,
810 gsf_off_t offset,
811 GSeekType whence)
812 {
813 LeInput *li = LE_INPUT (input);
814 struct EXTRACTOR_ExtractContext *ec;
815 int w;
816 int64_t ret;
817
818 ec = li->priv->ec;
819 switch (whence)
820 {
821 case G_SEEK_SET:
822 w = SEEK_SET;
823 break;
824 case G_SEEK_CUR:
825 w = SEEK_CUR;
826 break;
827 case G_SEEK_END:
828 w = SEEK_END;
829 break;
830 default:
831 return TRUE;
832 }
833 if (-1 ==
834 (ret = ec->seek (ec->cls,
835 offset,
836 w)))
837 return TRUE;
838 return FALSE;
839 }
840
841
842 /**
843 * Class initializer for the "LeInput" class.
844 *
845 * @param class class object to initialize
846 */
847 static void
le_input_class_init(LeInputClass * class)848 le_input_class_init (LeInputClass *class)
849 {
850 GsfInputClass *input_class;
851
852 input_class = (GsfInputClass *) class;
853 input_class->Dup = le_input_dup;
854 input_class->Read = le_input_read;
855 input_class->Seek = le_input_seek;
856 g_type_class_add_private (class, sizeof (LeInputPrivate));
857 }
858
859
860 /**
861 * Initialize internal state of fresh input object.
862 *
863 * @param input object to initialize
864 */
865 static void
le_input_init(LeInput * input)866 le_input_init (LeInput *input)
867 {
868 LeInputPrivate *priv;
869
870 input->priv =
871 G_TYPE_INSTANCE_GET_PRIVATE (input, LE_TYPE_INPUT,
872 LeInputPrivate);
873 priv = input->priv;
874 priv->ec = NULL;
875 }
876
877
878 /**
879 * Creates a new LeInput object.
880 *
881 * @param ec extractor context to wrap
882 * @return NULL on error
883 */
884 GsfInput *
le_input_new(struct EXTRACTOR_ExtractContext * ec)885 le_input_new (struct EXTRACTOR_ExtractContext *ec)
886 {
887 LeInput *input;
888
889 input = g_object_new (LE_TYPE_INPUT, NULL);
890 gsf_input_set_size (GSF_INPUT (input),
891 ec->get_size (ec->cls));
892 gsf_input_seek_emulate (GSF_INPUT (input),
893 0);
894 input->input.name = NULL;
895 input->input.container = NULL;
896 input->priv->ec = ec;
897
898 return GSF_INPUT (input);
899 }
900
901
902 /* *********************** end of custom GSF input method ************* */
903
904
905 /**
906 * Main entry method for the OLE2 extraction plugin.
907 *
908 * @param ec extraction context provided to the plugin
909 */
910 void
EXTRACTOR_ole2_extract_method(struct EXTRACTOR_ExtractContext * ec)911 EXTRACTOR_ole2_extract_method (struct EXTRACTOR_ExtractContext *ec)
912 {
913 GsfInput *input;
914 GsfInfile *infile;
915 GsfInput *src;
916 const char *name;
917 unsigned int i;
918 unsigned int lcb;
919 unsigned int fcb;
920 const unsigned char *data512;
921 unsigned int lid;
922 const char *lang;
923 int ret;
924 void *data;
925 uint64_t fsize;
926 ssize_t data_size;
927
928 fsize = ec->get_size (ec->cls);
929 if (fsize < 512 + 898)
930 {
931 /* File too small for OLE2 */
932 return; /* can hardly be OLE2 */
933 }
934 if (512 + 898 > (data_size = ec->read (ec->cls, &data, fsize)))
935 {
936 /* Failed to read minimum file size to buffer */
937 return;
938 }
939 data512 = (const unsigned char*) data + 512;
940 lid = data512[6] + (data512[7] << 8);
941 if ( (NULL != (lang = lid_to_language (lid))) &&
942 (0 != (ret = add_metadata (ec->proc, ec->cls,
943 lang,
944 EXTRACTOR_METATYPE_LANGUAGE))) )
945 return;
946 lcb = data512[726] + (data512[727] << 8) + (data512[728] << 16)
947 + (data512[729] << 24);
948 fcb = data512[722] + (data512[723] << 8) + (data512[724] << 16)
949 + (data512[725] << 24);
950 if (0 != ec->seek (ec->cls, 0, SEEK_SET))
951 {
952 /* seek failed!? */
953 return;
954 }
955 #if USE_LE_INPUT
956 if (NULL == (input = le_input_new (ec)))
957 {
958 fprintf (stderr, "le_input_new failed\n");
959 return;
960 }
961 #else
962 input = gsf_input_memory_new ((const guint8 *) data,
963 data_size,
964 FALSE);
965 #endif
966 if (NULL == (infile = gsf_infile_msole_new (input, NULL)))
967 {
968 g_object_unref (G_OBJECT (input));
969 return;
970 }
971 ret = 0;
972 for (i = 0; i<gsf_infile_num_children (infile); i++)
973 {
974 if (0 != ret)
975 break;
976 if (NULL == (name = gsf_infile_name_by_index (infile, i)))
977 continue;
978 src = NULL;
979 if ( ( (0 == strcmp (name, "\005SummaryInformation")) ||
980 (0 == strcmp (name, "\005DocumentSummaryInformation")) ) &&
981 (NULL != (src = gsf_infile_child_by_index (infile, i))) )
982 ret = process (src,
983 ec->proc,
984 ec->cls);
985 if ( (0 == strcmp (name, "SfxDocumentInfo")) &&
986 (NULL != (src = gsf_infile_child_by_index (infile, i))) )
987 ret = process_star_office (src,
988 ec->proc,
989 ec->cls);
990 if (NULL != src)
991 g_object_unref (G_OBJECT (src));
992 }
993 if (0 != ret)
994 goto CLEANUP;
995
996 if (lcb < 6)
997 goto CLEANUP;
998 for (i = 0; i<gsf_infile_num_children (infile); i++)
999 {
1000 if (ret != 0)
1001 break;
1002 if (NULL == (name = gsf_infile_name_by_index (infile, i)))
1003 continue;
1004 if ( ( (0 == strcmp (name, "1Table")) ||
1005 (0 == strcmp (name, "0Table")) ) &&
1006 (NULL != (src = gsf_infile_child_by_index (infile, i))) )
1007 {
1008 ret = history_extract (src,
1009 lcb,
1010 fcb,
1011 ec->proc, ec->cls);
1012 g_object_unref (G_OBJECT (src));
1013 }
1014 }
1015 CLEANUP:
1016 g_object_unref (G_OBJECT (infile));
1017 g_object_unref (G_OBJECT (input));
1018 }
1019
1020
1021 /**
1022 * Custom log function we give to GSF to disable logging.
1023 *
1024 * @param log_domain unused
1025 * @param log_level unused
1026 * @param message unused
1027 * @param user_data unused
1028 */
1029 static void
nolog(const gchar * log_domain,GLogLevelFlags log_level,const gchar * message,gpointer user_data)1030 nolog (const gchar *log_domain,
1031 GLogLevelFlags log_level,
1032 const gchar *message,
1033 gpointer user_data)
1034 {
1035 /* do nothing */
1036 }
1037
1038
1039 /**
1040 * OLE2 plugin constructor. Initializes glib and gsf, in particular
1041 * gsf logging is disabled.
1042 */
1043 void __attribute__ ((constructor))
ole2_ltdl_init()1044 ole2_ltdl_init ()
1045 {
1046 #if ! GLIB_CHECK_VERSION (2, 35, 0)
1047 g_type_init ();
1048 #endif
1049 #ifdef HAVE_GSF_INIT
1050 gsf_init ();
1051 #endif
1052 /* disable logging -- thanks, Jody! */
1053 g_log_set_handler ("libgsf:msole",
1054 G_LOG_LEVEL_CRITICAL | G_LOG_LEVEL_WARNING,
1055 &nolog, NULL);
1056 }
1057
1058
/**
 * OLE2 plugin destructor.  Shuts gsf down again if the
 * build has gsf_init / gsf_shutdown available.
 */
void __attribute__ ((destructor))
ole2_ltdl_fini ()
{
#ifdef HAVE_GSF_INIT
  gsf_shutdown ();
#endif
}
1069
1070
1071 /* end of ole2_extractor.c */
1072