1 /* vim: set sw=8: -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
2 /*
3 * gsf-msole-utils.c:
4 *
5 * Copyright (C) 2002-2006 Jody Goldberg (jody@gnome.org)
6 * Copyright (C) 2002-2006 Dom Lachowicz (cinamod@hotmail.com)
7 * excel_iconv* family of functions (C) 2001 by Vlad Harchev <hvv@hippo.ru>
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of version 2.1 of the GNU Lesser General Public
11 * License as published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
21 * USA
22 */
23
24 #include <gsf-config.h>
25 #include <gsf/gsf-msole-utils.h>
26 #include <gsf/gsf.h>
27
28 #include <locale.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <time.h>
32 #include <glib/gi18n-lib.h>
33
/*
 * Tracing hook for the property parser.  Statements wrapped in d(...) are
 * compiled in only when NO_DEBUG_OLE_PROPS is not defined; since it is
 * defined right here, d(...) expands to nothing.
 */
#define NO_DEBUG_OLE_PROPS
#ifdef NO_DEBUG_OLE_PROPS
#define d(code)
#else
#define d(code)	do { code } while (0)
#endif
40
/* Bit flags tested by msole_debug(). */
enum {
	DEBUG_UNKNOWN_PROPS = 1 << 0	/* selected by the "msole_prop" debug key */
};
44
45 static gboolean
msole_debug(guint what)46 msole_debug (guint what)
47 {
48 static guint flags;
49 static gboolean inited = FALSE;
50
51 if (!inited) {
52 /* not static */
53 const GDebugKey keys[] = {
54 { (char*)"msole_prop", DEBUG_UNKNOWN_PROPS },
55 };
56
57 const char *val = g_getenv ("GSF_DEBUG");
58 flags = val
59 ? g_parse_debug_string (val, keys, G_N_ELEMENTS (keys))
60 : 0;
61
62 inited = TRUE;
63 }
64
65 return (flags & what) != 0;
66 }
67
68 /*
69 * The Format Identifier for Summary Information
70 * F29F85E0-4FF9-1068-AB91-08002B27B3D9
71 */
72 static guint8 const component_guid [] = {
73 0xe0, 0x85, 0x9f, 0xf2, 0xf9, 0x4f, 0x68, 0x10,
74 0xab, 0x91, 0x08, 0x00, 0x2b, 0x27, 0xb3, 0xd9
75 };
76
77 /*
78 * The Format Identifier for Document Summary Information
79 * D5CDD502-2E9C-101B-9397-08002B2CF9AE
80 */
81 static guint8 const document_guid [] = {
82 0x02, 0xd5, 0xcd, 0xd5, 0x9c, 0x2e, 0x1b, 0x10,
83 0x93, 0x97, 0x08, 0x00, 0x2b, 0x2c, 0xf9, 0xae
84 };
85
86 /*
87 * The Format Identifier for User-Defined Properties
88 * D5CDD505-2E9C-101B-9397-08002B2CF9AE
89 */
90 static guint8 const user_guid [] = {
91 0x05, 0xd5, 0xcd, 0xd5, 0x9c, 0x2e, 0x1b, 0x10,
92 0x93, 0x97, 0x08, 0x00, 0x2b, 0x2c, 0xf9, 0xae
93 };
94
/* Which property set a property id belongs to. */
typedef enum {
	COMMON_PROP    = 0,	/* in either summary or docsummary */
	COMPONENT_PROP = 1,	/* SummaryInformation properties */
	DOC_PROP       = 2,	/* DocumentSummaryInformation properties */
	USER_PROP      = 3
} GsfMSOleMetaDataType;
101
/*
 * Variant type tags as stored in a property set.  VT_VECTOR is a flag bit
 * that is OR-ed onto one of the other tags.
 */
typedef enum {
	VT_EMPTY	   = 0x00,
	VT_NULL		   = 0x01,
	VT_I2		   = 0x02,
	VT_I4		   = 0x03,
	VT_R4		   = 0x04,
	VT_R8		   = 0x05,
	VT_CY		   = 0x06,
	VT_DATE		   = 0x07,
	VT_BSTR		   = 0x08,
	VT_DISPATCH	   = 0x09,
	VT_ERROR	   = 0x0a,
	VT_BOOL		   = 0x0b,
	VT_VARIANT	   = 0x0c,
	VT_UNKNOWN	   = 0x0d,
	VT_DECIMAL	   = 0x0e,
	/* 0x0f is not assigned */
	VT_I1		   = 0x10,
	VT_UI1		   = 0x11,
	VT_UI2		   = 0x12,
	VT_UI4		   = 0x13,
	VT_I8		   = 0x14,
	VT_UI8		   = 0x15,
	VT_INT		   = 0x16,
	VT_UINT		   = 0x17,
	VT_VOID		   = 0x18,
	VT_HRESULT	   = 0x19,
	VT_PTR		   = 0x1a,
	VT_SAFEARRAY	   = 0x1b,
	VT_CARRAY	   = 0x1c,
	VT_USERDEFINED	   = 0x1d,
	VT_LPSTR	   = 0x1e,
	VT_LPWSTR	   = 0x1f,

	VT_FILETIME	   = 0x40,
	VT_BLOB		   = 0x41,
	VT_STREAM	   = 0x42,
	VT_STORAGE	   = 0x43,
	VT_STREAMED_OBJECT = 0x44,
	VT_STORED_OBJECT   = 0x45,
	VT_BLOB_OBJECT	   = 0x46,
	VT_CF		   = 0x47,
	VT_CLSID	   = 0x48,

	VT_VECTOR	   = 0x1000
} GsfMSOleVariantType;
147
148 typedef struct {
149 char const *ms_name;
150 GsfMSOleMetaDataType section;
151 char const *gsf_name;
152 guint32 id;
153 GsfMSOleVariantType prefered_type;
154 } GsfMSOleMetaDataPropMap;
155
156 typedef struct {
157 guint32 id;
158 gsf_off_t offset;
159 } GsfMSOleMetaDataProp;
160
161 typedef struct {
162 GsfMSOleMetaDataType type;
163 gsf_off_t offset;
164 guint32 size, num_props;
165 GIConv iconv_handle;
166 unsigned char_size;
167 GHashTable *dict;
168 } GsfMSOleMetaDataSection;
169
170 static GsfMSOleMetaDataPropMap const builtin_props [] = {
171 { "Dictionary", COMMON_PROP, GSF_META_NAME_DICTIONARY, 0, 0, /* magic */},
172 { "CodePage", COMMON_PROP, GSF_META_NAME_CODEPAGE, 1, VT_I2 },
173 { "LOCALE_SYSTEM_DEFAULT",COMMON_PROP, GSF_META_NAME_LOCALE_SYSTEM_DEFAULT, 0x80000000, VT_UI4},
174 { "CASE_SENSITIVE", COMMON_PROP, GSF_META_NAME_CASE_SENSITIVE, 0x80000003, VT_UI4},
175 { "Category", DOC_PROP, GSF_META_NAME_CATEGORY, 2, VT_LPSTR },
176 { "PresentationFormat", DOC_PROP, GSF_META_NAME_PRESENTATION_FORMAT, 3, VT_LPSTR },
177 { "NumBytes", DOC_PROP, GSF_META_NAME_BYTE_COUNT, 4, VT_I4 },
178 { "NumLines", DOC_PROP, GSF_META_NAME_LINE_COUNT, 5, VT_I4 },
179 { "NumParagraphs", DOC_PROP, GSF_META_NAME_PARAGRAPH_COUNT, 6, VT_I4 },
180 { "NumSlides", DOC_PROP, GSF_META_NAME_SLIDE_COUNT, 7, VT_I4 },
181 { "NumNotes", DOC_PROP, GSF_META_NAME_NOTE_COUNT, 8, VT_I4 },
182 { "NumHiddenSlides", DOC_PROP, GSF_META_NAME_HIDDEN_SLIDE_COUNT, 9, VT_I4 },
183 { "NumMMClips", DOC_PROP, GSF_META_NAME_MM_CLIP_COUNT, 10, VT_I4 },
184 { "Scale", DOC_PROP, GSF_META_NAME_SCALE, 11, VT_BOOL },
185 { "HeadingPairs", DOC_PROP, GSF_META_NAME_HEADING_PAIRS, 12, VT_VECTOR | VT_VARIANT },
186 { "DocumentParts", DOC_PROP, GSF_META_NAME_DOCUMENT_PARTS, 13, VT_VECTOR | VT_LPSTR },
187 { "Manager", DOC_PROP, GSF_META_NAME_MANAGER, 14, VT_LPSTR },
188 { "Company", DOC_PROP, GSF_META_NAME_COMPANY, 15, VT_LPSTR },
189 { "LinksDirty", DOC_PROP, GSF_META_NAME_LINKS_DIRTY, 16, VT_BOOL },
190 // Possible match: { 0x0011, 0x0003, "PIDDSI_CCHWITHSPACES", "Number of characters with white-space" },
191 { "DocSumInfo_17", DOC_PROP, GSF_META_NAME_MSOLE_UNKNOWN_17, 17, VT_UNKNOWN },
192 { "DocSumInfo_18", DOC_PROP, GSF_META_NAME_MSOLE_UNKNOWN_18, 18, VT_UNKNOWN },
193 // Possible match: { 0x0013, 0x000b, "PIDDSI_SHAREDDOC", "Shared document" },
194 { "DocSumInfo_19", DOC_PROP, GSF_META_NAME_MSOLE_UNKNOWN_19, 19, VT_BOOL },
195 // Possible match: + PIDDSI_LINKBASE = 0x0014
196 { "DocSumInfo_20", DOC_PROP, GSF_META_NAME_MSOLE_UNKNOWN_20, 20, VT_UNKNOWN },
197 // Possible match: + PIDDSI_HLINKS= 0x0015,
198 { "DocSumInfo_21", DOC_PROP, GSF_META_NAME_MSOLE_UNKNOWN_21, 21, VT_UNKNOWN },
199 // Possible match: { 0x0016, 0x000b, "PIDDSI_HYPERLINKSCHANGED", "Hyper links changed" },
200 { "DocSumInfo_22", DOC_PROP, GSF_META_NAME_MSOLE_UNKNOWN_22, 22, VT_BOOL },
201 // Possible match: { 0x0017, 0x0003, "PIDDSI_VERSION", "Creating application version" },
202 { "DocSumInfo_23", DOC_PROP, GSF_META_NAME_MSOLE_UNKNOWN_23, 23, VT_I4 },
203 { "Title", COMPONENT_PROP, GSF_META_NAME_TITLE, 2, VT_LPSTR },
204 { "Subject", COMPONENT_PROP, GSF_META_NAME_SUBJECT, 3, VT_LPSTR },
205 { "Author", COMPONENT_PROP, GSF_META_NAME_CREATOR, 4, VT_LPSTR },
206 { "Keywords", COMPONENT_PROP, GSF_META_NAME_KEYWORDS, 5, VT_LPSTR },
207 { "Comments", COMPONENT_PROP, GSF_META_NAME_DESCRIPTION, 6, VT_LPSTR },
208 { "Template", COMPONENT_PROP, GSF_META_NAME_TEMPLATE, 7, VT_LPSTR },
209 { "LastSavedBy", COMPONENT_PROP, GSF_META_NAME_LAST_SAVED_BY, 8, VT_LPSTR },
210 { "RevisionNumber", COMPONENT_PROP, GSF_META_NAME_REVISION_COUNT, 9, VT_LPSTR },
211 { "TotalEditingTime", COMPONENT_PROP, GSF_META_NAME_EDITING_DURATION, 10, VT_FILETIME },
212 { "LastPrinted", COMPONENT_PROP, GSF_META_NAME_LAST_PRINTED, 11, VT_FILETIME },
213 { "CreateTime", COMPONENT_PROP, GSF_META_NAME_DATE_CREATED, 12, VT_FILETIME },
214 { "LastSavedTime", COMPONENT_PROP, GSF_META_NAME_DATE_MODIFIED, 13, VT_FILETIME },
215 { "NumPages", COMPONENT_PROP, GSF_META_NAME_PAGE_COUNT, 14, VT_I4 },
216 { "NumWords", COMPONENT_PROP, GSF_META_NAME_WORD_COUNT, 15, VT_I4 },
217 { "NumCharacters", COMPONENT_PROP, GSF_META_NAME_CHARACTER_COUNT, 16, VT_I4 },
218 { "Thumbnail", COMPONENT_PROP, GSF_META_NAME_THUMBNAIL, 17, VT_CF },
219 { "AppName", COMPONENT_PROP, GSF_META_NAME_GENERATOR, 18, VT_LPSTR },
220 { "Security", COMPONENT_PROP, GSF_META_NAME_SECURITY, 19, VT_I4 }
221 };
222
223 static GHashTable *name_to_prop_hash = NULL;
224
225 static char const *
msole_vt_name(GsfMSOleVariantType type)226 msole_vt_name (GsfMSOleVariantType type)
227 {
228 static char const *names[] = {
229 "VT_EMPTY", "VT_NULL", "VT_I2", "VT_I4", "VT_R4",
230 "VT_R8", "VT_CY", "VT_DATE", "VT_BSTR", "VT_DISPATCH",
231 "VT_ERROR", "VT_BOOL", "VT_VARIANT", "VT_UNKNOWN", "VT_DECIMAL",
232 NULL, "VT_I1", "VT_UI1", "VT_UI2", "VT_UI4",
233 "VT_I8", "VT_UI8", "VT_INT", "VT_UINT", "VT_VOID",
234 "VT_HRESULT", "VT_PTR", "VT_SAFEARRAY", "VT_CARRAY", "VT_USERDEFINED",
235 "VT_LPSTR", "VT_LPWSTR",
236 };
237 static char const *names2[] = {
238 "VT_FILETIME",
239 "VT_BLOB", "VT_STREAM", "VT_STORAGE", "VT_STREAMED_OBJECT",
240 "VT_STORED_OBJECT", "VT_BLOB_OBJECT", "VT_CF", "VT_CLSID"
241 };
242
243 type &= ~VT_VECTOR;
244 if (type <= VT_LPWSTR)
245 return names[type];
246 g_return_val_if_fail (type >= VT_FILETIME, "_UNKNOWN_");
247 g_return_val_if_fail (type <= VT_CLSID, "_UNKNOWN_");
248 return names2[type-VT_FILETIME];
249 }
250
251 static char const *
msole_prop_id_to_gsf(GsfMSOleMetaDataSection * section,guint32 id,gboolean * linked)252 msole_prop_id_to_gsf (GsfMSOleMetaDataSection *section, guint32 id, gboolean *linked)
253 {
254 char const *res = NULL;
255 GsfMSOleMetaDataPropMap const *map = NULL;
256 unsigned i = 0;
257
258 *linked = FALSE;
259 if (section->dict != NULL) {
260 if (id & 0x1000000) {
261 *linked = TRUE;
262 id &= ~0x1000000;
263 d (g_print ("LINKED "););
264 }
265
266 res = g_hash_table_lookup (section->dict, GINT_TO_POINTER (id));
267
268 if (res != NULL) {
269 d (g_print ("%s", res););
270 return res;
271 }
272 }
273
274 map = builtin_props ;
275 i = G_N_ELEMENTS (builtin_props);
276 while (i-- > 0)
277 if (map[i].id == id &&
278 (map[i].section == COMMON_PROP || map[i].section == section->type)) {
279 d (g_print ("%s\n", map[i].gsf_name););
280 return map[i].gsf_name;
281 }
282
283 d (g_print ("_UNKNOWN_(0x%x %d)\n", id, id););
284
285 return NULL;
286 }
287
288 static GsfMSOleMetaDataPropMap const *
msole_gsf_name_to_prop(char const * name)289 msole_gsf_name_to_prop (char const *name)
290 {
291 if (NULL == name_to_prop_hash) {
292 int i;
293 name_to_prop_hash = g_hash_table_new (g_str_hash, g_str_equal);
294 for (i = G_N_ELEMENTS (builtin_props); i-- > 0; )
295 g_hash_table_replace (name_to_prop_hash,
296 (gpointer) builtin_props[i].gsf_name,
297 (gpointer) (builtin_props+i));
298 }
299
300 return g_hash_table_lookup (name_to_prop_hash, (gpointer)name);
301 }
302
303 static void
set_error_missing_data(GError ** error,const char * property_name,gsize size_needed,gsize size_gotten)304 set_error_missing_data (GError **error, const char *property_name, gsize size_needed, gsize size_gotten)
305 {
306 gchar *size_needed_str, *size_gotten_str;
307
308 size_needed_str = g_strdup_printf ("%" G_GSIZE_FORMAT, size_needed);
309 size_gotten_str = g_strdup_printf ("%" G_GSIZE_FORMAT, size_gotten);
310 g_set_error (error,
311 GSF_ERROR,
312 GSF_ERROR_INVALID_DATA,
313 _("Missing data when reading the %s property; got %s bytes, "
314 "but %s bytes at least are needed."),
315 property_name,
316 size_needed_str,
317 size_gotten_str);
318 g_free (size_needed_str);
319 g_free (size_gotten_str);
320 }
321
322 /* Can return errors from gsf_blob_new() and GSF_ERROR_INVALID_DATA */
323 static gboolean
parse_vt_cf(GValue * res,guint8 const ** data,guint8 const * data_end,GError ** error)324 parse_vt_cf (GValue *res, guint8 const **data, guint8 const *data_end, GError **error)
325 {
326 /* clipboard size uint32 sizeof (clipboard format tag) + sizeof (clipboard data)
327 * clipboard format tag int32 see below
328 * clipboard data byte[] see below
329 *
330 * Clipboard format tag:
331 * -1 - Windows clipboard format
332 * -2 - Macintosh clipboard format
333 * -3 - GUID that contains a format identifier (FMTID)
334 * >0 - custom clipboard format name plus data (see msdn site below)
335 * 0 - No data
336 *
337 * References:
338 * http://msdn.microsoft.com/library/default.asp?url=/library/en-us/stg/stg/propvariant.asp
339 * http://jakarta.apache.org/poi/hpsf/thumbnails.html
340 * http://linux.com.hk/docs/poi/org/apache/poi/hpsf/Thumbnail.html
341 * http://sparks.discreet.com/knowledgebase/public/solutions/ExtractThumbnailImg.htm
342 */
343 guint32 clip_size, clip_data_size;
344 gint32 clip_format;
345 GsfBlob *blob;
346 GsfClipData *clip_data;
347
348 /* Clipboard size field */
349
350 if (data_end < *data + 4) {
351 set_error_missing_data (error, "VT_CF", 4, data_end - *data);
352 return FALSE;
353 }
354
355 clip_size = GSF_LE_GET_GUINT32 (*data);
356
357 if (clip_size < 4) { /* must emcompass int32 format plus data size */
358 gchar *size_str;
359
360 size_str = g_strdup_printf ("%" G_GSIZE_FORMAT, (gsize) clip_size);
361 g_set_error (error,
362 GSF_ERROR,
363 GSF_ERROR_INVALID_DATA,
364 _("Corrupt data in the VT_CF property; clipboard data length must be at least 4 bytes, "
365 "but the data says it only has %s bytes available."),
366 size_str);
367 g_free (size_str);
368 return FALSE;
369 }
370
371 *data += 4;
372
373 /* Check clipboard format plus data size */
374
375 if (data_end < *data + clip_size) {
376 set_error_missing_data (error, "VT_CF", clip_size, data_end - *data);
377 return FALSE;
378 }
379
380 clip_format = GSF_LE_GET_GINT32 (*data);
381 *data += 4;
382
383 switch (clip_format) {
384 case GSF_CLIP_FORMAT_WINDOWS_CLIPBOARD:
385 case GSF_CLIP_FORMAT_MACINTOSH_CLIPBOARD:
386 case GSF_CLIP_FORMAT_GUID:
387 case GSF_CLIP_FORMAT_NO_DATA:
388 /* everything is ok */
389 break;
390
391 default:
392 if (clip_format > 0)
393 clip_format = GSF_CLIP_FORMAT_CLIPBOARD_FORMAT_NAME;
394 else
395 clip_format = GSF_CLIP_FORMAT_UNKNOWN;
396
397 break;
398 }
399
400 clip_data_size = clip_size - 4;
401
402 blob = gsf_blob_new (clip_data_size, *data, error);
403
404 *data += clip_data_size;
405
406 if (!blob)
407 return FALSE;
408
409 clip_data = gsf_clip_data_new (clip_format, blob);
410 g_object_unref (blob);
411
412 g_value_init (res, GSF_TYPE_CLIP_DATA);
413 g_value_set_object (res, clip_data);
414 g_object_unref (clip_data);
415
416 return TRUE;
417 }
418
/*
 * msole_codepage_char_size:
 * @codepage: a code page number.
 *
 * Returns: 2 for code pages 1200 and 1201, 1 for every other code page.
 */
static unsigned
msole_codepage_char_size (int codepage)
{
	switch (codepage) {
	case 1200:
	case 1201:
		return 2;
	default:
		return 1;
	}
}
426
427 /*
428 * Return a number no bigger than the number of bytes used for a property
429 * value of a given type. The returned number might be too small, but
430 * we try to return as big a value as possible.
431 */
432 static size_t
msole_prop_min_size(guint32 type)433 msole_prop_min_size (guint32 type)
434 {
435 switch (type) {
436 case VT_EMPTY:
437 case VT_NULL:
438 return 0;
439
440 case VT_BOOL:
441 case VT_I1:
442 case VT_UI1:
443 return 1;
444
445 case VT_I2:
446 case VT_UI2:
447 return 2;
448
449 case VT_I4:
450 case VT_R4:
451 case VT_ERROR:
452 case VT_VARIANT:
453 case VT_UI4:
454 case VT_LPSTR:
455 case VT_LPWSTR:
456 case VT_BLOB:
457 case VT_BLOB_OBJECT:
458 case VT_CF:
459 case VT_VECTOR:
460 return 4;
461
462 case VT_BSTR:
463 return 5;
464
465 case VT_R8:
466 case VT_CY:
467 case VT_DATE:
468 case VT_I8:
469 case VT_UI8:
470 case VT_FILETIME:
471 return 8;
472
473 case VT_CLSID:
474 return 16;
475
476 case VT_DISPATCH:
477 case VT_UNKNOWN:
478 case VT_DECIMAL:
479 case VT_INT:
480 case VT_UINT:
481 case VT_VOID:
482 case VT_HRESULT:
483 case VT_PTR:
484 case VT_SAFEARRAY:
485 case VT_CARRAY:
486 case VT_USERDEFINED:
487 case VT_STREAM:
488 case VT_STORAGE:
489 case VT_STREAMED_OBJECT:
490 case VT_STORED_OBJECT:
491 default:
492 return 0;
493 }
494 }
495
496 #define NEED_RECS(_n,_size1) \
497 do { \
498 guint _s1 = (_size1); \
499 bytes_needed = (_n); \
500 if (_s1 > 0 && (data_end - *data) / _s1 < bytes_needed) { \
501 g_warning ("Invalid MS property or file truncated"); \
502 g_free (res); \
503 return NULL; \
504 } \
505 bytes_needed *= _s1; \
506 } while (0)
507
508 #define NEED_BYTES(_n) NEED_RECS(_n,1)
509
510 #define ADVANCE do { *data += bytes_needed; } while (0)
511
512 static GValue *
msole_prop_parse(GsfMSOleMetaDataSection * section,guint32 type,guint8 const ** data,guint8 const * data_end)513 msole_prop_parse (GsfMSOleMetaDataSection *section,
514 guint32 type, guint8 const **data, guint8 const *data_end)
515 {
516 GValue *res = NULL;
517 char *str;
518 guint32 len;
519 gboolean const is_vector = type & VT_VECTOR;
520 GError *error;
521 guint bytes_needed;
522
523 g_return_val_if_fail (!(type & (unsigned)(~0x1fff)), NULL); /* not valid in a prop set */
524
525 type &= 0xfff;
526
527 if (is_vector) {
528 /*
529 * A vector is basically an array. If the type associated with
530 * it is a variant, then each element can have a different
531 * variant type. Otherwise, each element has the same variant
532 * type associated with the vector.
533 */
534 unsigned i, n, size1;
535 GsfDocPropVector *vector;
536
537 NEED_BYTES (4);
538 n = GSF_LE_GET_GUINT32 (*data);
539 ADVANCE;
540
541 d (g_print (" array with %d elem\n", n);
542 gsf_mem_dump (*data, (unsigned)(data_end - *data)););
543
544 size1 = msole_prop_min_size (type);
545 NEED_RECS(n, size1);
546
547 vector = gsf_docprop_vector_new ();
548
549 for (i = 0 ; i < n ; i++) {
550 GValue *v;
551 guint8 const *data0 = *data;
552 d (g_print ("\t[%d] ", i););
553 v = msole_prop_parse (section, type, data, data_end);
554 if (v) {
555 if (G_IS_VALUE (v)) {
556 gsf_docprop_vector_append (vector, v);
557 g_value_unset (v);
558 }
559 g_free (v);
560 }
561 if (*data == data0)
562 break;
563 }
564
565 res = g_new0 (GValue, 1);
566 g_value_init (res, GSF_DOCPROP_VECTOR_TYPE);
567 g_value_set_object (res, vector);
568 g_object_unref (vector);
569 return res;
570 }
571
572 res = g_new0 (GValue, 1);
573 d (g_print ("%s\n", msole_vt_name (type)););
574 switch (type) {
575 case VT_EMPTY:
576 /*
577 * A property with a type indicator of VT_EMPTY has no data
578 * associated with it; that is, the size of the value is zero.
579 */
580 /* value::unset == empty */
581 break;
582
583 case VT_NULL:
584 /* This is like a pointer to NULL */
585 /* value::unset == null too :-) do we need to distinguish ? */
586 break;
587
588 case VT_I2:
589 /* 2-byte signed integer */
590 NEED_BYTES (2);
591 g_value_init (res, G_TYPE_INT);
592 g_value_set_int (res, GSF_LE_GET_GINT16 (*data));
593 ADVANCE;
594 break;
595
596 case VT_I4:
597 /* 4-byte signed integer */
598 NEED_BYTES (4);
599 g_value_init (res, G_TYPE_INT);
600 g_value_set_int (res, GSF_LE_GET_GINT32 (*data));
601 ADVANCE;
602 break;
603
604 case VT_R4:
605 /* 32-bit IEEE floating-point value */
606 NEED_BYTES (4);
607 g_value_init (res, G_TYPE_FLOAT);
608 g_value_set_float (res, GSF_LE_GET_FLOAT (*data));
609 ADVANCE;
610 break;
611
612 case VT_R8:
613 /* 64-bit IEEE floating-point value */
614 NEED_BYTES (8);
615 g_value_init (res, G_TYPE_DOUBLE);
616 g_value_set_double (res, GSF_LE_GET_DOUBLE (*data));
617 ADVANCE;
618 break;
619
620 case VT_CY:
621 /* 8-byte two's complement integer (scaled by 10,000) */
622 NEED_BYTES (8);
623 /* CHEAT : just store as an int64 for now */
624 g_value_init (res, G_TYPE_INT64);
625 g_value_set_int64 (res, GSF_LE_GET_GINT64 (*data));
626 break;
627
628 case VT_DATE:
629 /*
630 * 64-bit floating-point number representing the number of days
631 * (not seconds) since December 31, 1899.
632 */
633 if (msole_debug (DEBUG_UNKNOWN_PROPS))
634 g_warning ("Unhandled property value type %d (0x%x)",
635 type, type);
636 NEED_BYTES (8);
637 ADVANCE;
638 break;
639
640 case VT_BSTR:
641 /*
642 * Pointer to null-terminated Unicode string; the string is pre-
643 * ceeded by a DWORD representing the byte count of the number
644 * of bytes in the string (including the terminating null).
645 */
646 if (msole_debug (DEBUG_UNKNOWN_PROPS))
647 g_warning ("Unhandled property value type %d (0x%x)",
648 type, type);
649 NEED_BYTES (4);
650 ADVANCE;
651 break;
652
653 case VT_DISPATCH:
654 if (msole_debug (DEBUG_UNKNOWN_PROPS))
655 g_warning ("Unhandled property value type %d (0x%x)",
656 type, type);
657 break;
658
659 case VT_BOOL:
660 /* A boolean (WORD) value containg 0 (false) or -1 (true). */
661 NEED_BYTES (1);
662 g_value_init (res, G_TYPE_BOOLEAN);
663 g_value_set_boolean (res, **data ? TRUE : FALSE);
664 ADVANCE;
665 break;
666
667 case VT_VARIANT : d (g_print ("\tcontaining a "););
668 /*
669 * A type indicator (a DWORD) followed by the corresponding
670 * value. VT_VARIANT is only used in conjunction with
671 * VT_VECTOR.
672 */
673 NEED_BYTES (4);
674 g_free (res);
675 type = GSF_LE_GET_GUINT32 (*data);
676 ADVANCE;
677 return msole_prop_parse (section, type, data, data_end);
678
679 case VT_UI1:
680 /* 1-byte unsigned integer */
681 NEED_BYTES (1);
682 g_value_init (res, G_TYPE_UCHAR);
683 g_value_set_uchar (res, GSF_LE_GET_GUINT8 (*data));
684 ADVANCE;
685 break;
686
687 case VT_I1:
688 /* 1-byte signed integer */
689 NEED_BYTES (1);
690 g_value_init (res, G_TYPE_CHAR);
691 g_value_set_schar (res, GSF_LE_GET_GINT8 (*data));
692 ADVANCE;
693 break;
694
695 case VT_UI2:
696 /* 2-byte unsigned integer */
697 NEED_BYTES (2);
698 g_value_init (res, G_TYPE_UINT);
699 g_value_set_uint (res, GSF_LE_GET_GUINT16 (*data));
700 ADVANCE;
701 break;
702
703 case VT_UI4:
704 /* 4-type unsigned integer */
705 NEED_BYTES (4);
706 g_value_init (res, G_TYPE_UINT);
707 g_value_set_uint (res, GSF_LE_GET_GUINT32 (*data));
708 ADVANCE;
709 break;
710
711 case VT_I8 : d (g_print ("VT_I8\n"););
712 /* 8-byte signed integer */
713 NEED_BYTES (8);
714 g_value_init (res, G_TYPE_INT64);
715 g_value_set_int64 (res, GSF_LE_GET_GINT64 (*data));
716 ADVANCE;
717 break;
718
719 case VT_UI8:
720 /* 8-byte unsigned integer */
721 NEED_BYTES (8);
722 g_value_init (res, G_TYPE_UINT64);
723 g_value_set_uint64 (res, GSF_LE_GET_GUINT64 (*data));
724 ADVANCE;
725 break;
726
727 case VT_LPSTR: {
728 guint32 need;
729 /*
730 * This is the representation of many strings. It is stored in
731 * the same representation as VT_BSTR. Note that the serialized
732 * representation of VP_LPSTR has a preceding byte count,
733 * whereas the in-memory representation does not.
734 */
735 NEED_BYTES (4);
736 len = GSF_LE_GET_GUINT32 (*data);
737 ADVANCE;
738
739 g_return_val_if_fail (len < 0x10000, NULL);
740
741 need = len;
742 if (section->char_size > 1 && (need & 3))
743 need = (need & ~3) + 4;
744 NEED_BYTES (need);
745
746 error = NULL;
747 d (gsf_mem_dump (*data, len * section->char_size););
748 str = g_convert_with_iconv (*data,
749 len > section->char_size ? len - section->char_size : 0,
750 section->iconv_handle, NULL, NULL, &error);
751
752 g_value_init (res, G_TYPE_STRING);
753 if (NULL != str) {
754 g_value_take_string (res, str);
755 } else if (NULL != error) {
756 g_warning ("error: %s", error->message);
757 g_error_free (error);
758 } else {
759 g_warning ("unknown error converting string property, using blank");
760 }
761 ADVANCE;
762 break;
763 }
764
765 case VT_LPWSTR:
766 /*
767 * A counted and null-terminated Unicode string; a DWORD character
768 * count (where the count includes the terminating null) followed
769 * by that many Unicode (16-bit) characters. Note that the count
770 * is character count, not byte count.
771 */
772
773 NEED_BYTES (4);
774 len = GSF_LE_GET_GUINT32 (*data);
775 ADVANCE;
776
777 NEED_RECS (len, 2);
778
779 g_return_val_if_fail (len < 0x10000, NULL);
780
781 error = NULL;
782 d (gsf_mem_dump (*data, len * 2););
783 str = g_convert (*data, len * 2,
784 "UTF-8", "UTF-16LE", NULL, NULL, &error);
785
786 g_value_init (res, G_TYPE_STRING);
787 if (NULL != str) {
788 g_value_set_string (res, str);
789 g_free (str);
790 } else if (NULL != error) {
791 g_warning ("error: %s", error->message);
792 g_error_free (error);
793 } else {
794 g_warning ("unknown error converting string property, using blank");
795 }
796 ADVANCE;
797 break;
798
799 case VT_FILETIME : {
800 /* 64-bit FILETIME structure, as defined by Win32. */
801 guint64 ft;
802 GsfTimestamp *ts;
803
804 NEED_BYTES (8);
805
806 /* ft * 100ns since Jan 1 1601 */
807 ft = GSF_LE_GET_GUINT64 (*data);
808
809 ft /= 10000000; /* convert to seconds */
810 ft -= G_GINT64_CONSTANT (11644473600); /* move to Jan 1 1970 */
811 ts = gsf_timestamp_new ();
812 gsf_timestamp_set_time (ts, ft);
813 g_value_init (res, GSF_TIMESTAMP_TYPE);
814 gsf_timestamp_to_value (ts, res);
815 gsf_timestamp_free (ts);
816
817 ADVANCE;
818 break;
819 }
820
821 case VT_BLOB:
822 /*
823 * A DWORD count of bytes, followed by that many bytes of data.
824 * The byte count does not include the four bytes for the length
825 * of the count itself: An empty blob would have a count of
826 * zero, followed by zero bytes. Thus the serialized represen-
827 * tation of a VT_BLOB is similar to that of a VT_BSTR but does
828 * not guarantee a null byte at the end of the data.
829 */
830 NEED_BYTES (4);
831 ADVANCE;
832 if (msole_debug (DEBUG_UNKNOWN_PROPS))
833 g_warning ("Unhandled property value type %d (0x%x)",
834 type, type);
835 g_free (res);
836 res = NULL;
837 break;
838
839 case VT_STREAM:
840 /*
841 * Indicates the value is stored in a stream that is sibling
842 * to the CONTENTS stream. Following this type indicator is
843 * data in the format of a serialized VT_LPSTR, which names
844 * the stream containing the data.
845 */
846 if (msole_debug (DEBUG_UNKNOWN_PROPS))
847 g_warning ("Unhandled property value type %d (0x%x)",
848 type, type);
849 g_free (res);
850 res = NULL;
851 break;
852
853 case VT_STORAGE:
854 /*
855 * Indicates the value is stored in an IStorage that is
856 * sibling to the CONTENTS stream. Following this type
857 * indicator is data in the format of a serialized VT_LPSTR,
858 * which names the IStorage containing the data.
859 */
860 if (msole_debug (DEBUG_UNKNOWN_PROPS))
861 g_warning ("Unhandled property value type %d (0x%x)",
862 type, type);
863 g_free (res);
864 res = NULL;
865 break;
866
867 case VT_STREAMED_OBJECT:
868 /*
869 * Same as VT_STREAM, but indicates that the stream contains a
870 * serialized object, which is a class ID followed by initiali-
871 * zation data for the class.
872 */
873 if (msole_debug (DEBUG_UNKNOWN_PROPS))
874 g_warning ("Unhandled property value type %d (0x%x)",
875 type, type);
876 g_free (res);
877 res = NULL;
878 break;
879
880 case VT_STORED_OBJECT:
881 /*
882 * Same as VT_STORAGE, but indicates that the designated
883 * IStorage contains a loadable object.
884 */
885 if (msole_debug (DEBUG_UNKNOWN_PROPS))
886 g_warning ("Unhandled property value type %d (0x%x)",
887 type, type);
888 g_free (res);
889 res = NULL;
890 break;
891
892 case VT_BLOB_OBJECT:
893 /*
894 * Contains a serialized object in the same representation as
895 * would appear in a VT_STREAMED_OBJECT. That is, following
896 * the VT_BLOB_OBJECT tag is a DWORD byte count of the
897 * remaining data (where the byte count does not include the
898 * size of itself) which is in the format of a class ID
899 * followed by initialization data for that class
900 */
901 if (msole_debug (DEBUG_UNKNOWN_PROPS))
902 g_warning ("Unhandled property value type %d (0x%x)",
903 type, type);
904 g_free (res);
905 res = NULL;
906 break;
907
908 case VT_CF:
909 error = NULL;
910 if (!parse_vt_cf (res, data, data_end, &error)) {
911 /* suck, we can't propagate the error upwards */
912 if (error) {
913 g_warning ("error: %s", error->message);
914 g_error_free (error);
915 }
916 else {
917 g_warning ("unknown error parsing vt_cf");
918 }
919 g_free (res);
920 res = NULL;
921 }
922 break;
923
924 case VT_CLSID:
925 /* A class ID (or other GUID) */
926 NEED_BYTES (16);
927 ADVANCE;
928 g_free (res);
929 res = NULL;
930 break;
931
932 case VT_ERROR:
933 /* A DWORD containing a status code. */
934 case VT_UNKNOWN:
935 case VT_DECIMAL:
936 case VT_INT:
937 case VT_UINT:
938 case VT_VOID:
939 case VT_HRESULT:
940 case VT_PTR:
941 case VT_SAFEARRAY:
942 case VT_CARRAY:
943 case VT_USERDEFINED:
944 g_warning ("type %s (0x%x) is not permitted in property sets",
945 msole_vt_name (type), type);
946 g_free (res);
947 res = NULL;
948 break;
949
950 default:
951 if (msole_debug (DEBUG_UNKNOWN_PROPS))
952 g_warning ("Unknown property type %d (0x%x)",
953 type, type);
954 g_free (res);
955 res = NULL;
956 }
957
958 if (res != NULL && G_IS_VALUE (res)) {
959 d ( {
960 char *val = g_strdup_value_contents (res);
961 g_print ("%s\n", val);
962 g_free (val);
963 });
964 } else {
965 d ({
966 char const *type_name = msole_vt_name (type);
967 if (type_name) {
968 g_printerr ("A '%s' property could not be parsed\n", type_name);
969 } else {
970 g_printerr ("A %d property could not be parsed\n", type);
971 }
972 });
973 g_free (res);
974 res = NULL;
975 }
976 return res;
977 }
978 #undef NEED_BYTES
979 #undef NEED_RECS
980 #undef ADVANCE
981
982 static gboolean
msole_prop_read(GsfInput * in,GsfMSOleMetaDataSection * section,GsfMSOleMetaDataProp * props,unsigned i,GsfDocMetaData * accum)983 msole_prop_read (GsfInput *in,
984 GsfMSOleMetaDataSection *section,
985 GsfMSOleMetaDataProp *props,
986 unsigned i,
987 GsfDocMetaData *accum)
988 {
989 guint32 type;
990 guint8 const *data;
991 gsf_off_t size = ((i+1) >= section->num_props)
992 ? section->size
993 : props[i+1].offset;
994 char *name;
995 GValue *val;
996
997 g_return_val_if_fail (i < section->num_props, FALSE);
998 g_return_val_if_fail (size >= props[i].offset + 4, FALSE);
999
1000 size -= props[i].offset; /* includes the type id */
1001 /* From now on, size is actually a size. */
1002 if (gsf_input_seek (in, section->offset+props[i].offset, G_SEEK_SET) ||
1003 NULL == (data = gsf_input_read (in, size, NULL))) {
1004 g_warning ("failed to read prop #%d", i);
1005 return FALSE;
1006 }
1007
1008 type = GSF_LE_GET_GUINT32 (data);
1009 data += 4;
1010
1011 /* dictionary is magic */
1012 if (props[i].id == 0) {
1013 guint32 len, id, j, n;
1014 gsize gslen;
1015 char *name;
1016 guint8 const *start = data;
1017 guint8 const *end = start + (size - 4);
1018
1019 g_return_val_if_fail (section->dict == NULL, FALSE);
1020
1021 section->dict = g_hash_table_new_full (
1022 g_direct_hash, g_direct_equal,
1023 NULL, g_free);
1024
1025 d ({ g_print ("Dictionary = \n"); gsf_mem_dump (data-4, size); });
1026 n = type;
1027 for (j = 0; j < n; j++) {
1028 g_return_val_if_fail (end - data >= 8, FALSE);
1029
1030 id = GSF_LE_GET_GUINT32 (data);
1031 len = GSF_LE_GET_GUINT32 (data + 4);
1032
1033 g_return_val_if_fail (len < 0x10000, FALSE);
1034 g_return_val_if_fail (len <= end - data + 8, FALSE);
1035
1036 gslen = 0;
1037 name = g_convert_with_iconv (data + 8,
1038 len * section->char_size,
1039 section->iconv_handle, &gslen, NULL, NULL);
1040 len = (guint32)gslen;
1041 data += 8 + len;
1042
1043 d (g_print ("\t%u == %s\n", id, name););
1044 g_hash_table_replace (section->dict,
1045 GINT_TO_POINTER (id), name);
1046
1047 /* MS documentation blows goats !
1048 * The docs claim there are padding bytes in the dictionary.
1049 * Their examples show padding bytes.
1050 * In reality non-unicode strings do not seem to
1051 * have padding.
1052 */
1053 if (section->char_size != 1 && (data - start) % 4)
1054 data += 4 - ((data - start) % 4);
1055 }
1056 } else {
1057 gboolean linked;
1058 d (g_print ("===> %u) ", i);
1059 gsf_mem_dump (data-4, size););
1060
1061 name = g_strdup (msole_prop_id_to_gsf (section, props[i].id, &linked));
1062 d (g_print (" @ %x %x = ", (unsigned)props[i].offset, (unsigned)size););
1063 val = msole_prop_parse (section, type, &data, data + size - 4);
1064
1065 if (NULL != name && NULL != val) {
1066 if (linked) {
1067 GsfDocProp *prop = gsf_doc_meta_data_lookup (accum, name);
1068 if (NULL == prop) {
1069 g_warning ("linking property '%s' before it\'s value is specified",
1070 (name ? name : "<null>"));
1071 } else if (!G_VALUE_HOLDS_STRING (val)) {
1072 g_warning ("linking property '%s' before it\'s value is specified",
1073 (name ? name : "<null>"));
1074 } else
1075 gsf_doc_prop_set_link (prop,
1076 g_value_dup_string (val));
1077 } else {
1078 gsf_doc_meta_data_insert (accum, name, val);
1079 val = NULL;
1080 name = NULL;
1081 }
1082 }
1083
1084 if (NULL != val) {
1085 if (G_IS_VALUE (val))
1086 g_value_unset (val);
1087 g_free (val);
1088 }
1089 g_free (name);
1090 }
1091
1092 return TRUE;
1093 }
1094
1095 static int
msole_prop_cmp(gconstpointer a,gconstpointer b)1096 msole_prop_cmp (gconstpointer a, gconstpointer b)
1097 {
1098 GsfMSOleMetaDataProp const *prop_a = a;
1099 GsfMSOleMetaDataProp const *prop_b = b;
1100
1101 if (prop_a->offset < prop_b->offset)
1102 return -1;
1103 else if (prop_a->offset > prop_b->offset)
1104 return +1;
1105 else
1106 return 0;
1107 }
1108
1109 /**
1110 * gsf_doc_meta_data_read_from_msole:
1111 * @accum: #GsfDocMetaData
1112 * @in: #GsfInput
1113 *
1114 * Read a stream formated as a set of MS OLE properties from @in and store the
1115 * results in @accum.
1116 *
1117 * Since: 1.14.24
1118 *
1119 * Returns: (transfer full): A #GError if there was an error.
1120 **/
1121 GError *
gsf_doc_meta_data_read_from_msole(GsfDocMetaData * accum,GsfInput * in)1122 gsf_doc_meta_data_read_from_msole (GsfDocMetaData *accum, GsfInput *in)
1123 {
1124 guint8 const *data;
1125 guint16 version;
1126 guint32 os, num_sections;
1127 unsigned i, j;
1128 GsfMSOleMetaDataSection *sections;
1129 GsfMSOleMetaDataProp *props;
1130 GsfDocProp *prop;
1131
1132 /* http://bugzilla.gnome.org/show_bug.cgi?id=352055
1133 * psiwin generates files with empty property sections */
1134 if (gsf_input_size (in) <= 0)
1135 return NULL;
1136
1137 data = gsf_input_read (in, 28, NULL);
1138 if (NULL == data)
1139 return g_error_new (gsf_input_error_id (), 0,
1140 _("Unable to read MS property stream header"));
1141
1142 d ({g_print ("===================================\n"
1143 "header class id ==\n");
1144 gsf_mem_dump (data, 28);});
1145 /*
1146 * Validate the Property Set Header.
1147 * Format (bytes):
1148 * 00 - 01 Byte order 0xfffe
1149 * 02 - 03 Format 0
1150 * 04 - 05 OS Version high word is the OS
1151 * 06 - 07 low word is the OS version
1152 * 0 = win16
1153 * 1 = mac
1154 * 2 = win32
1155 * 08 - 23 Class Identifier Usually Format ID
1156 * 24 - 27 Section count Should be at least 1
1157 */
1158 os = GSF_LE_GET_GUINT16 (data + 6);
1159 version = GSF_LE_GET_GUINT16 (data + 2);
1160 num_sections = GSF_LE_GET_GUINT32 (data + 24);
1161 if (GSF_LE_GET_GUINT16 (data + 0) != 0xfffe
1162 || (version != 0 && version != 1)
1163 || os > 2
1164 || num_sections > gsf_input_size(in) / 20
1165 || num_sections > 100) /* arbitrary sanity check */
1166 return g_error_new (gsf_input_error_id (), 0,
1167 _("Invalid MS property stream header"));
1168
1169 /* extract the section info */
1170 /*
1171 * The Format ID/Offset list follows.
1172 * Format:
1173 * 00 - 16 Section Name Format ID
1174 * 16 - 19 Section Offset The offset is the number of
1175 * bytes from the start of the
1176 * whole stream to where the
1177 * section begins.
1178 */
1179 sections = (GsfMSOleMetaDataSection *)g_alloca (sizeof (GsfMSOleMetaDataSection)* num_sections);
1180 for (i = 0 ; i < num_sections ; i++) {
1181 data = gsf_input_read (in, 20, NULL);
1182 if (NULL == data)
1183 return g_error_new (gsf_input_error_id (), 0,
1184 _("Unable to read MS property stream header"));
1185 if (!memcmp (data, component_guid, sizeof (component_guid)))
1186 sections [i].type = COMPONENT_PROP;
1187 else if (!memcmp (data, document_guid, sizeof (document_guid)))
1188 sections [i].type = DOC_PROP;
1189 else if (!memcmp (data, user_guid, sizeof (user_guid)))
1190 sections [i].type = USER_PROP;
1191 else {
1192 sections [i].type = USER_PROP;
1193 g_warning ("Unknown property section type, treating it as USER");
1194 gsf_mem_dump (data, 16);
1195 }
1196
1197 sections [i].offset = GSF_LE_GET_GUINT32 (data + 16);
1198 }
1199
1200 /*
1201 * A section is the third part of the property set stream.
1202 * Format (bytes):
1203 * 00 - 03 Section size A byte count for the section (which is inclusive
1204 * of the byte count itself and should always be a
1205 * multiple of 4);
1206 * 04 - 07 Property count A count of the number of properties
1207 * 08 - xx An array of 32-bit Property ID/Offset pairs
1208 * yy - zz An array of Property Type indicators/Value pairs
1209 */
1210 for (i = 0 ; i < num_sections ; i++) {
1211 if (gsf_input_seek (in, sections[i].offset, G_SEEK_SET) ||
1212 NULL == (data = gsf_input_read (in, 8, NULL)))
1213 return g_error_new (gsf_input_error_id (), 0,
1214 _("Invalid MS property section"));
1215
1216 sections[i].iconv_handle = (GIConv)-1;
1217 sections[i].char_size = 1;
1218 sections[i].dict = NULL;
1219 sections[i].size = GSF_LE_GET_GUINT32 (data); /* includes header */
1220 sections[i].num_props = GSF_LE_GET_GUINT32 (data + 4);
1221
1222 d (g_print ("=============================================\n"
1223 "===> section #%d : type %d at offset 0x%x, size 0x%x, numprops = %u\n",
1224 i, (int)sections [i].type,
1225 (guint32)sections [i].offset,
1226 sections[i].size,
1227 sections[i].num_props););
1228
1229 if (sections[i].num_props <= 0)
1230 continue;
1231 if (sections[i].num_props > gsf_input_remaining(in) / 8)
1232 return g_error_new (gsf_input_error_id (), 0,
1233 _("Invalid MS property stream header or file truncated"));
1234
1235 if (sections[i].offset + sections[i].size > gsf_input_size(in))
1236 return g_error_new (gsf_input_error_id (), 0,
1237 _("Invalid MS property stream header or file truncated"));
1238
1239 /*
1240 * Get and save all the Property ID/Offset pairs.
1241 * Format (bytes):
1242 * 00 - 03 id Property ID
1243 * 04 - 07 offset The distance from the start of the section to the
1244 * start of the Property Type/Value pair.
1245 */
1246 d (g_print ("Offsets\n"););
1247 props = g_new (GsfMSOleMetaDataProp, sections[i].num_props);
1248 for (j = 0; j < sections[i].num_props; j++) {
1249 if (NULL == (data = gsf_input_read (in, 8, NULL))) {
1250 g_free (props);
1251 return g_error_new (gsf_input_error_id (), 0,
1252 _("Invalid MS property section"));
1253 }
1254
1255 props[j].id = GSF_LE_GET_GUINT32 (data);
1256 props[j].offset = GSF_LE_GET_GUINT32 (data + 4);
1257 d (g_print ("%d) ID=%d, offset=0x%x\n", j,
1258 props [j].id, (unsigned)props [j].offset););
1259 }
1260
1261 /* FIXME: Should we check that ids are distinct? */
1262
1263 /* order prop info by offset to facilitate bounds checking */
1264 qsort (props, sections[i].num_props,
1265 sizeof (GsfMSOleMetaDataProp),
1266 msole_prop_cmp);
1267
1268 /* Sanity checks. */
1269 for (j = 0; j < sections[i].num_props; j++) {
1270 guint end = (j == sections[i].num_props - 1)
1271 ? sections[i].size
1272 : props[j + 1].offset;
1273 if (props[j].offset < 0 || props[j].offset + 4 > end) {
1274 g_free (props);
1275 return g_error_new (gsf_input_error_id (), 0,
1276 _("Invalid MS property section"));
1277 }
1278 }
1279
1280 /*
1281 * Find and process the code page.
1282 * Property ID 1 is reserved as an indicator of the code page.
1283 */
1284 sections[i].iconv_handle = (GIConv)-1;
1285 sections[i].char_size = 1;
1286 for (j = 0; j < sections[i].num_props; j++) /* first codepage */
1287 if (props[j].id == 1) {
1288 msole_prop_read (in, sections+i, props, j, accum);
1289 if (NULL != (prop = gsf_doc_meta_data_lookup (accum, GSF_META_NAME_CODEPAGE))) {
1290 GValue const *val = gsf_doc_prop_get_val (prop);
1291 if (NULL != val && G_VALUE_HOLDS_INT (val)) {
1292 int codepage = g_value_get_int (val);
1293 sections[i].iconv_handle =
1294 gsf_msole_iconv_open_for_import (codepage);
1295 sections[i].char_size = msole_codepage_char_size (codepage);
1296 }
1297 }
1298 }
1299
1300 if (sections[i].iconv_handle == (GIConv)-1)
1301 sections[i].iconv_handle = gsf_msole_iconv_open_for_import (1252);
1302
1303 /*
1304 * Find and process the Property Set Dictionary
1305 * Property ID 0 is reserved as an indicator of the dictionary.
1306 * For User Defined Sections, Property ID 0 is NOT a dictionary.
1307 */
1308 for (j = 0; j < sections[i].num_props; j++) /* then dictionary */
1309 if (props[j].id == 0)
1310 msole_prop_read (in, sections+i, props, j, accum);
1311
1312 /* Process all the properties */
1313 for (j = 0; j < sections[i].num_props; j++) /* the rest */
1314 if (props[j].id > 1)
1315 msole_prop_read (in, sections+i, props, j, accum);
1316
1317 gsf_iconv_close (sections[i].iconv_handle);
1318 g_free (props);
1319 if (sections[i].dict != NULL)
1320 g_hash_table_destroy (sections[i].dict);
1321 }
1322 return NULL;
1323 }
1324
1325 /**
1326 * gsf_msole_metadata_read: (skip)
1327 * @in: #GsfInput
1328 * @accum: #GsfDocMetaData
1329 *
1330 * Read a stream formated as a set of MS OLE properties from @in and store the
1331 * results in @accum.
1332 *
1333 * Deprecated: 1.14.24, use gsf_doc_meta_data_read_from_msole
1334 *
1335 * Returns: (transfer full): A #GError if there was an error.
1336 **/
1337 GError *
gsf_msole_metadata_read(GsfInput * in,GsfDocMetaData * accum)1338 gsf_msole_metadata_read (GsfInput *in, GsfDocMetaData *accum)
1339 {
1340 return gsf_doc_meta_data_read_from_msole (accum, in);
1341 }
1342
1343 /****************************************************************************/
1344
1345 typedef struct {
1346 GsfOutput *out;
1347 gboolean doc_not_component;
1348
1349 GHashTable *dict;
1350 struct {
1351 unsigned count; /* includes 2nd prop for links */
1352 GSList *props;
1353 } builtin, user;
1354
1355 unsigned codepage;
1356 GIConv iconv_handle;
1357 unsigned char_size;
1358 } WritePropState;
1359
1360 static GsfMSOleVariantType
gvalue_to_msole_vt(GValue const * value,GsfMSOleMetaDataPropMap const * map)1361 gvalue_to_msole_vt (GValue const *value, GsfMSOleMetaDataPropMap const *map)
1362 {
1363 g_return_val_if_fail (value != NULL, VT_EMPTY);
1364
1365 switch (G_TYPE_FUNDAMENTAL (G_VALUE_TYPE (value))) {
1366 case G_TYPE_BOOLEAN: return VT_BOOL;
1367 case G_TYPE_UCHAR: return VT_UI1;
1368 case G_TYPE_FLOAT: return VT_R4;
1369 case G_TYPE_DOUBLE: return VT_R8;
1370 case G_TYPE_STRING: return VT_LPSTR;
1371 case G_TYPE_INT:
1372 return (NULL != map && map->prefered_type == VT_I2)
1373 ? VT_I2 : VT_I4;
1374 case G_TYPE_UINT:
1375 return (NULL != map && map->prefered_type == VT_UI2)
1376 ? VT_UI2 : VT_UI4;
1377 case G_TYPE_BOXED:
1378 if (VAL_IS_GSF_TIMESTAMP (value))
1379 return VT_FILETIME;
1380 return VT_UNKNOWN;
1381 case G_TYPE_OBJECT:
1382 if (VAL_IS_GSF_DOCPROP_VECTOR (value)) {
1383 GArray *vector = gsf_value_get_docprop_array (value);
1384 unsigned i, n;
1385 GsfMSOleVariantType type, tmp;
1386
1387 if (vector == NULL)
1388 return VT_UNKNOWN;
1389
1390 if (map != NULL) {
1391 type = map->prefered_type & (~VT_VECTOR);
1392 if (type == VT_VARIANT)
1393 return VT_VECTOR | VT_VARIANT;
1394 } else
1395 type = VT_UNKNOWN;
1396 n = vector->len;
1397 for (i = 0; i < n; i++) {
1398 tmp = gvalue_to_msole_vt (
1399 &g_array_index (vector, GValue, i), NULL);
1400 if (type == VT_UNKNOWN)
1401 type = tmp;
1402 else if (type != tmp)
1403 return VT_VECTOR | VT_VARIANT;
1404 }
1405 return VT_VECTOR | type;
1406 }
1407 break;
1408 }
1409 return VT_UNKNOWN;
1410 }
1411
1412 static gboolean
msole_metadata_write_string(WritePropState * state,const char * txt)1413 msole_metadata_write_string (WritePropState *state, const char *txt)
1414 {
1415 guint8 buf[4];
1416 guint32 len;
1417 gchar *ctxt;
1418 gsize bytes_written;
1419 gboolean res;
1420
1421 if (!txt) txt = "";
1422 len = strlen (txt);
1423 ctxt = g_convert_with_iconv (txt, len, state->iconv_handle,
1424 NULL, &bytes_written, NULL);
1425 if (!ctxt) {
1426 /* See bug #703952 */
1427 g_warning ("Failed to write metadata string");
1428 bytes_written = 0;
1429 }
1430
1431 // *Bytes*, not characters, including the termination, but not the
1432 // padding.
1433 GSF_LE_SET_GUINT32 (buf, bytes_written + state->char_size);
1434 res = gsf_output_write (state->out, 4, buf);
1435
1436 res = res && gsf_output_write (state->out, bytes_written, ctxt);
1437
1438 GSF_LE_SET_GUINT32 (buf, 0);
1439 res = res && gsf_output_write (state->out, state->char_size, buf);
1440
1441 if (state->char_size > 1) {
1442 unsigned padding = 4 - (bytes_written + state->char_size) % 4;
1443 if (padding < 4)
1444 res = res && gsf_output_write (state->out, padding, buf);
1445 }
1446
1447 g_free (ctxt);
1448 return res;
1449 }
1450
1451
1452 /* Returns: TRUE on success */
1453 static gboolean
msole_metadata_write_prop(WritePropState * state,char const * name,GValue const * value,gboolean suppress_type)1454 msole_metadata_write_prop (WritePropState *state,
1455 char const *name,
1456 GValue const *value,
1457 gboolean suppress_type)
1458 {
1459 GsfMSOleMetaDataPropMap const *map =
1460 (name != NULL) ? msole_gsf_name_to_prop (name) : NULL;
1461 GsfMSOleVariantType type;
1462 guint8 buf[8];
1463
1464 g_return_val_if_fail (value != NULL, FALSE);
1465
1466 type = gvalue_to_msole_vt (value, map);
1467 if (!suppress_type) {
1468 GSF_LE_SET_GUINT32 (buf, type);
1469 gsf_output_write (state->out, 4, buf);
1470 }
1471 if (NULL != map && map->prefered_type != type) {
1472 d(g_print ("Exporting property '%s' with type 0x%x rather than the usual 0x%x\n",
1473 map->gsf_name, type, map->prefered_type););
1474 }
1475
1476 if (type & VT_VECTOR) {
1477 GArray *vector = gsf_value_get_docprop_array (value);
1478 unsigned i, n = vector->len;
1479 gboolean res;
1480
1481 GSF_LE_SET_GINT32 (buf, n);
1482 res = gsf_output_write (state->out, 4, buf);
1483 for (i = 0; i < n; i++) {
1484 gboolean suppress = type != (VT_VECTOR | VT_VARIANT);
1485 res &= msole_metadata_write_prop (state, NULL,
1486 &g_array_index (vector, GValue, i),
1487 suppress);
1488 }
1489 return res;
1490 }
1491
1492 switch (type) {
1493 case VT_BOOL:
1494 if (g_value_get_boolean (value))
1495 GSF_LE_SET_GINT32 (buf, 0xffffffff);
1496 else
1497 GSF_LE_SET_GINT32 (buf, 0);
1498 return gsf_output_write (state->out, 4, buf);
1499 case VT_UI1:
1500 GSF_LE_SET_GUINT32 (buf, g_value_get_uchar (value));
1501 return gsf_output_write (state->out, 4, buf);
1502 case VT_I2:
1503 GSF_LE_SET_GINT16 (buf, g_value_get_int (value));
1504 GSF_LE_SET_GUINT16 (buf+2, 0);
1505 return gsf_output_write (state->out, 4, buf);
1506 case VT_I4:
1507 GSF_LE_SET_GINT32 (buf, g_value_get_int (value));
1508 return gsf_output_write (state->out, 4, buf);
1509 case VT_UI2:
1510 case VT_UI4:
1511 GSF_LE_SET_GUINT32 (buf, g_value_get_uint (value));
1512 return gsf_output_write (state->out, 4, buf);
1513 case VT_R4:
1514 GSF_LE_SET_FLOAT (buf, g_value_get_float (value));
1515 return gsf_output_write (state->out, 4, buf);
1516 case VT_R8:
1517 GSF_LE_SET_DOUBLE (buf, g_value_get_double (value));
1518 return gsf_output_write (state->out, 8, buf);
1519
1520 case VT_LPSTR:
1521 return msole_metadata_write_string (state, g_value_get_string (value));
1522
1523 case VT_FILETIME : {
1524 GsfTimestamp const *ts = g_value_get_boxed (value);
1525 gint32 timet_signed = (gint32) ts->timet;
1526 guint64 ft;
1527
1528 ft = timet_signed + G_GINT64_CONSTANT (11644473600);
1529 ft *= 10000000;
1530
1531 GSF_LE_SET_GUINT64 (buf, ft);
1532
1533 return gsf_output_write (state->out, 8, buf);
1534 }
1535
1536 default:
1537 break;
1538 }
1539
1540 g_warning ("Ignoring property '%s', how do we export a property of type '%s'",
1541 name ? name : "<unnamed>",
1542 g_type_name (G_TYPE_FUNDAMENTAL (G_VALUE_TYPE (value))));
1543 return FALSE;
1544 }
1545
1546 static void
cb_write_dict(char const * name,gpointer id,WritePropState * state)1547 cb_write_dict (char const *name, gpointer id, WritePropState *state)
1548 {
1549 guint8 buf[4];
1550
1551 GSF_LE_SET_GUINT32 (buf, GPOINTER_TO_UINT (id));
1552 gsf_output_write (state->out, 4, buf);
1553 msole_metadata_write_string (state, name);
1554 }
1555
1556 static gboolean
msole_metadata_write_section(WritePropState * state,gboolean user)1557 msole_metadata_write_section (WritePropState *state, gboolean user)
1558 {
1559 char const *name;
1560 guint8 buf [8];
1561 GSList *ptr = user ? state->user.props : state->builtin.props;
1562 unsigned count = user ? state->user.count : state->builtin.count;
1563 gsf_off_t len, base = gsf_output_tell (state->out);
1564 GsfMSOleMetaDataProp *offsets;
1565 GsfMSOleMetaDataPropMap const *map;
1566 GsfDocProp const *prop;
1567 gpointer tmp;
1568 unsigned i;
1569 GValue scratch;
1570
1571 if (user && state->dict == NULL)
1572 return TRUE;
1573
1574 // Skip past the size+count and id/offset pairs
1575 GSF_LE_SET_GUINT32 (buf, 0);
1576 for (i = 0; i < 1 + 1 + 2 * count; i++)
1577 gsf_output_write (state->out, 4, buf);
1578
1579 memset (&scratch, 0, sizeof (GValue));
1580 g_value_init (&scratch, G_TYPE_STRING);
1581
1582 offsets = g_alloca (sizeof (GsfMSOleMetaDataProp) * count);
1583
1584 i = 0;
1585
1586 /* 0) codepage */
1587 if (i < count) {
1588 offsets[0].id = 1;
1589 offsets[0].offset = gsf_output_tell (state->out);
1590 GSF_LE_SET_GUINT32 (buf, VT_I2);
1591 GSF_LE_SET_GUINT32 (buf+4, state->codepage);
1592 gsf_output_write (state->out, 8, buf);
1593 i++;
1594 }
1595
1596 /* 1) dictionary */
1597 if (user && i < count) {
1598 offsets[1].id = 0;
1599 offsets[1].offset = gsf_output_tell (state->out);
1600 GSF_LE_SET_GUINT32 (buf, g_hash_table_size (state->dict));
1601 gsf_output_write (state->out, 4, buf);
1602 g_hash_table_foreach (state->dict,
1603 (GHFunc) cb_write_dict, state);
1604 i++;
1605 }
1606
1607 /* 2) props */
1608 for (; ptr != NULL && i < count ; ptr = ptr->next, i++) {
1609 offsets[i].offset = gsf_output_tell (state->out);
1610 prop = ptr->data;
1611 name = gsf_doc_prop_get_name (prop);
1612 if (user) {
1613 tmp = g_hash_table_lookup (state->dict, name);
1614 offsets[i].id = GPOINTER_TO_INT (tmp);
1615 if (offsets[i].id < 2) {
1616 g_warning ("Invalid ID (%d) for custom name '%s'", offsets[i].id, name);
1617 continue;
1618 }
1619 } else {
1620 map = msole_gsf_name_to_prop (name);
1621 if (map == NULL) {
1622 g_warning ("Missing map for built-in property '%s'", name);
1623 continue;
1624 }
1625 offsets[i].id = map->id;
1626 }
1627
1628 msole_metadata_write_prop (state, name,
1629 gsf_doc_prop_get_val (prop), FALSE);
1630 if (gsf_doc_prop_get_link (prop)) {
1631 i++;
1632 offsets[i].id = offsets[i-1].id | 0x1000000;
1633 offsets[i].offset = gsf_output_tell (state->out);
1634 g_value_set_static_string (&scratch,
1635 gsf_doc_prop_get_link (prop));
1636 msole_metadata_write_prop (state, NULL, &scratch, FALSE);
1637 }
1638 }
1639
1640 while (i < count) {
1641 static gboolean warned = FALSE;
1642 if (!warned) {
1643 warned = TRUE;
1644 g_warning ("Something strange in msole_metadata_write_section");
1645 }
1646 offsets[i].id = 0;
1647 offsets[i].offset = offsets[i - 1].offset;
1648 i++;
1649 }
1650
1651 len = gsf_output_tell (state->out) - base;
1652 gsf_output_seek (state->out, base, G_SEEK_SET);
1653 GSF_LE_SET_GUINT32 (buf, len);
1654 GSF_LE_SET_GUINT32 (buf+4, count);
1655 gsf_output_write (state->out, 8, buf);
1656 for (i = 0 ; i < count ; i++) {
1657 GSF_LE_SET_GUINT32 (buf, offsets[i].id);
1658 GSF_LE_SET_GUINT32 (buf+4, offsets[i].offset - base);
1659 gsf_output_write (state->out, 8, buf);
1660 }
1661
1662 return gsf_output_seek (state->out, 0, G_SEEK_END);
1663 }
1664
1665 static void
cb_count_props(char const * name,GsfDocProp * prop,WritePropState * state)1666 cb_count_props (char const *name, GsfDocProp *prop, WritePropState *state)
1667 {
1668 GsfMSOleMetaDataPropMap const *map = msole_gsf_name_to_prop (name);
1669
1670 /* allocate predefined ids or add it to the dictionary */
1671 if (map != NULL) {
1672 if (map->id == 0) return; /* dictionary is handled elsewhere */
1673 if (map->section == (state->doc_not_component ? COMPONENT_PROP : DOC_PROP))
1674 return;
1675 if (map->id == 1) { /*codepage */
1676 GValue const *val = gsf_doc_prop_get_val (prop);
1677 if (NULL != val && G_VALUE_HOLDS_INT (val))
1678 state->codepage = g_value_get_int (val);
1679 return;
1680 }
1681
1682 d (g_print ("%d) Adding builtin %s'\n",
1683 state->builtin.count, map->gsf_name););
1684 state->builtin.count += gsf_doc_prop_get_link (prop) ? 2 : 1;
1685 state->builtin.props = g_slist_prepend (state->builtin.props, prop);
1686 } else if (state->doc_not_component) { /* keep user props in the document */
1687 d (g_print("user defined named '%s' assigned id = %d\n",
1688 name, state->user.count););
1689 if (NULL == state->dict)
1690 state->dict = g_hash_table_new (g_str_hash, g_str_equal);
1691 g_hash_table_insert (state->dict,
1692 (gpointer) name, GINT_TO_POINTER (state->user.count));
1693 state->user.count += gsf_doc_prop_get_link (prop) ? 2 : 1;
1694 state->user.props = g_slist_prepend (state->user.props, prop);
1695 }
1696 }
1697
1698 /**
1699 * gsf_doc_meta_data_write_to_msole:
1700 * @out: #GsfOutput
1701 * @meta_data: #GsfDocMetaData
1702 * @doc_not_component: a kludge to differentiate DocumentSummary from Summary
1703 *
1704 * Since: 1.14.24
1705 *
1706 * Returns: %TRUE on success;
1707 **/
1708 gboolean
gsf_doc_meta_data_write_to_msole(GsfDocMetaData const * meta_data,GsfOutput * out,gboolean doc_not_component)1709 gsf_doc_meta_data_write_to_msole (GsfDocMetaData const *meta_data,
1710 GsfOutput *out,
1711 gboolean doc_not_component)
1712 {
1713 static guint8 const header[] = {
1714 0xfe, 0xff, /* byte order */
1715 0, 0, /* Format */
1716 0x04, 0x0a, /* OS : XP == 0xA04 */
1717 0x02, 0x00, /* win32 == 2 */
1718 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, /* clasid = 0 */
1719 };
1720
1721 gboolean success = FALSE;
1722 guint8 buf [4];
1723 WritePropState state;
1724
1725 state.codepage = 1252;
1726 state.iconv_handle = (GIConv)-1;
1727 state.char_size = 1;
1728 state.out = out;
1729 state.dict = NULL;
1730 state.builtin.count = 1; /* codepage */
1731 state.user.count = 2; /* codepage and dictionary */
1732 state.builtin.props = state.user.props = NULL;
1733 state.doc_not_component = doc_not_component;
1734 d (g_print ("================================\nFinding props\n"););
1735 gsf_doc_meta_data_foreach (meta_data,
1736 (GHFunc) cb_count_props, &state);
1737 d (g_print ("Done\n"
1738 "================================\n"););
1739
1740 state.iconv_handle = gsf_msole_iconv_open_codepage_for_export (state.codepage);
1741 state.char_size = msole_codepage_char_size (state.codepage);
1742
1743 /* Write stream header */
1744 GSF_LE_SET_GUINT32 (buf, (state.dict != NULL) ? 2 : 1);
1745 if (!gsf_output_write (out, sizeof (header), header) ||
1746 !gsf_output_write (out, 4, buf))
1747 goto err;
1748
1749 /* Write section header(s) */
1750 GSF_LE_SET_GUINT32 (buf, (state.dict != NULL) ? 0x44 : 0x30);
1751 if (!gsf_output_write (out, 16,
1752 doc_not_component ? document_guid : component_guid) ||
1753 !gsf_output_write (out, 4, buf))
1754 goto err;
1755 if (state.dict != NULL) {
1756 GSF_LE_SET_GUINT32 (buf, 0);
1757 if (!gsf_output_write (out, sizeof (user_guid), user_guid) ||
1758 !gsf_output_write (out, 4, buf)) /* bogus position, fix it later */
1759 goto err;
1760 }
1761
1762 /* Write section(s) */
1763 if (!msole_metadata_write_section (&state, FALSE))
1764 goto err;
1765 if (state.dict != NULL) {
1766 gsf_off_t base = gsf_output_tell (state.out);
1767 GSF_LE_SET_GUINT32 (buf, base);
1768 if (!gsf_output_seek (state.out, 0x40, G_SEEK_SET) ||
1769 !gsf_output_write (out, 4, buf) ||
1770 !gsf_output_seek (state.out, 0, G_SEEK_END) ||
1771 !msole_metadata_write_section (&state, TRUE))
1772 goto err;
1773 }
1774
1775 success = TRUE;
1776 err:
1777 gsf_iconv_close (state.iconv_handle);
1778 g_slist_free (state.builtin.props);
1779 g_slist_free (state.user.props);
1780 if (state.dict != NULL)
1781 g_hash_table_destroy (state.dict);
1782 return success;
1783 }
1784
1785 /**
1786 * gsf_msole_metadata_write: (skip)
1787 * @out: #GsfOutput
1788 * @meta_data: #GsfDocMetaData
1789 * @doc_not_component: a kludge to differentiate DocumentSummary from Summary
1790 *
1791 * Deprecated: 1.14.24, use gsf_doc_meta_data_write_to_msole
1792 *
1793 * Returns: %TRUE on success;
1794 **/
1795 gboolean
gsf_msole_metadata_write(GsfOutput * out,GsfDocMetaData const * meta_data,gboolean doc_not_component)1796 gsf_msole_metadata_write (GsfOutput *out,
1797 GsfDocMetaData const *meta_data,
1798 gboolean doc_not_component)
1799 {
1800 return gsf_doc_meta_data_write_to_msole (meta_data, out, doc_not_component);
1801 }
1802
1803 static struct {
1804 char const *tag;
1805 guint lid;
1806 } const gsf_msole_language_ids[] = {
1807 { "-none-", 0x0000 }, /* none (language neutral) */
1808 { "-none-", 0x0400 }, /* none */
1809 { "af_ZA", 0x0436 }, /* Afrikaans */
1810 { "am", 0x045e }, /* Amharic */
1811 { "sq_AL", 0x041c }, /* Albanian */
1812 { "ar_SA", 0x0401 }, /* Arabic (Saudi) */
1813 { "ar_IQ", 0x0801 }, /* Arabic (Iraq) */
1814 { "ar_EG", 0x0c01 }, /* Arabic (Egypt) */
1815 { "ar_LY", 0x1001 }, /* Arabic (Libya) */
1816 { "ar_DZ", 0x1401 }, /* Arabic (Algeria) */
1817 { "ar_MA", 0x1801 }, /* Arabic (Morocco) */
1818 { "ar_TN", 0x1c01 }, /* Arabic (Tunisia) */
1819 { "ar_OM", 0x2001 }, /* Arabic (Oman) */
1820 { "ar_YE", 0x2401 }, /* Arabic (Yemen) */
1821 { "ar_SY", 0x2801 }, /* Arabic (Syria) */
1822 { "ar_JO", 0x2c01 }, /* Arabic (Jordan) */
1823 { "ar_LB", 0x3001 }, /* Arabic (Lebanon) */
1824 { "ar_KW", 0x3401 }, /* Arabic (Kuwait) */
1825 { "ar_AE", 0x3801 }, /* Arabic (United Arab Emirates) */
1826 { "ar_BH", 0x3c01 }, /* Arabic (Bahrain) */
1827 { "ar_QA", 0x4001 }, /* Arabic (Qatar) */
1828 { "as", 0x044d }, /* Assamese */
1829 { "az", 0x042c }, /* Azerbaijani */
1830 { "hy_AM", 0x042b }, /* Armenian */
1831 { "az", 0x044c }, /* Azeri (Latin) az_ */
1832 { "az", 0x082c }, /* Azeri (Cyrillic) az_ */
1833 { "eu_ES", 0x042d }, /* Basque */
1834 { "be_BY", 0x0423 }, /* Belarussian */
1835 { "bn", 0x0445 }, /* Bengali bn_ */
1836 { "bg_BG", 0x0402 }, /* Bulgarian */
1837 { "ca_ES", 0x0403 }, /* Catalan */
1838 { "zh_TW", 0x0404 }, /* Chinese (Taiwan) */
1839 { "zh_CN", 0x0804 }, /* Chinese (PRC) */
1840 { "zh_HK", 0x0c04 }, /* Chinese (Hong Kong) */
1841 { "zh_SG", 0x1004 }, /* Chinese (Singapore) */
1842 { "ch_MO", 0x1404 }, /* Chinese (Macau SAR) */
1843 { "hr_HR", 0x041a }, /* Croatian */
1844 { "cs_CZ", 0x0405 }, /* Czech */
1845 { "da_DK", 0x0406 }, /* Danish */
1846 { "div", 0x465 }, /* Divehi div_*/
1847 { "nl_NL", 0x0413 }, /* Dutch (Netherlands) */
1848 { "nl_BE", 0x0813 }, /* Dutch (Belgium) */
1849 { "en_US", 0x0409 }, /* English (USA) */
1850 { "en_GB", 0x0809 }, /* English (UK) */
1851 { "en_AU", 0x0c09 }, /* English (Australia) */
1852 { "en_CA", 0x1009 }, /* English (Canada) */
1853 { "en_NZ", 0x1409 }, /* English (New Zealand) */
1854 { "en_IE", 0x1809 }, /* English (Ireland) */
1855 { "en_ZA", 0x1c09 }, /* English (South Africa) */
1856 { "en_JM", 0x2009 }, /* English (Jamaica) */
1857 { "en", 0x2409 }, /* English (Caribbean) */
1858 { "en_BZ", 0x2809 }, /* English (Belize) */
1859 { "en_TT", 0x2c09 }, /* English (Trinidad) */
1860 { "en_ZW", 0x3009 }, /* English (Zimbabwe) */
1861 { "en_PH", 0x3409 }, /* English (Phillipines) */
1862 { "et_EE", 0x0425 }, /* Estonian */
1863 { "fo", 0x0438 }, /* Faeroese fo_ */
1864 { "fa_IR", 0x0429 }, /* Farsi */
1865 { "fi_FI", 0x040b }, /* Finnish */
1866 { "fr_FR", 0x040c }, /* French (France) */
1867 { "fr_BE", 0x080c }, /* French (Belgium) */
1868 { "fr_CA", 0x0c0c }, /* French (Canada) */
1869 { "fr_CH", 0x100c }, /* French (Switzerland) */
1870 { "fr_LU", 0x140c }, /* French (Luxembourg) */
1871 { "fr_MC", 0x180c }, /* French (Monaco) */
1872 { "gl", 0x0456 }, /* Galician gl_ */
1873 { "ga_IE", 0x083c }, /* Irish Gaelic */
1874 { "gd_GB", 0x100c }, /* Scottish Gaelic */
1875 { "ka_GE", 0x0437 }, /* Georgian */
1876 { "de_DE", 0x0407 }, /* German (Germany) */
1877 { "de_CH", 0x0807 }, /* German (Switzerland) */
1878 { "de_AT", 0x0c07 }, /* German (Austria) */
1879 { "de_LU", 0x1007 }, /* German (Luxembourg) */
1880 { "de_LI", 0x1407 }, /* German (Liechtenstein) */
1881 { "el_GR", 0x0408 }, /* Greek */
1882 { "gu", 0x0447 }, /* Gujarati gu_ */
1883 { "ha", 0x0468 }, /* Hausa */
1884 { "he_IL", 0x040d }, /* Hebrew */
1885 { "hi_IN", 0x0439 }, /* Hindi */
1886 { "hu_HU", 0x040e }, /* Hungarian */
1887 { "is_IS", 0x040f }, /* Icelandic */
1888 { "id_ID", 0x0421 }, /* Indonesian */
1889 { "iu", 0x045d }, /* Inkutitut */
1890 { "it_IT", 0x0410 }, /* Italian (Italy) */
1891 { "it_CH", 0x0810 }, /* Italian (Switzerland) */
1892 { "ja_JP", 0x0411}, /* Japanese */
1893 { "kn", 0x044b }, /* Kannada kn_ */
1894 { "ks", 0x0860 }, /* Kashmiri (India) ks_ */
1895 { "kk", 0x043f }, /* Kazakh kk_ */
1896 { "kok", 0x0457 }, /* Konkani kok_ */
1897 { "ko_KR", 0x0412 }, /* Korean */
1898 { "ko", 0x0812 }, /* Korean (Johab) ko_ */
1899 { "kir", 0x0440 }, /* Kyrgyz */
1900 { "la", 0x0476 }, /* Latin */
1901 { "lo", 0x0454 }, /* Laothian */
1902 { "lv_LV", 0x0426 }, /* Latvian */
1903 { "lt_LT", 0x0427 }, /* Lithuanian */
1904 { "lt_LT", 0x0827 }, /* Lithuanian (Classic) */
1905 { "mk", 0x042f }, /* FYRO Macedonian */
1906 { "my_MY", 0x043e }, /* Malaysian */
1907 { "my_BN", 0x083e }, /* Malay Brunei Darussalam */
1908 { "ml", 0x044c }, /* Malayalam ml_ */
1909 { "mr", 0x044e }, /* Marathi mr_ */
1910 { "mt", 0x043a }, /* Maltese */
1911 { "mo", 0x0450 }, /* Mongolian */
1912 { "ne_NP", 0x0461 }, /* Napali (Nepal) */
1913 { "ne_IN", 0x0861 }, /* Nepali (India) */
1914 { "nb_NO", 0x0414 }, /* Norwegian (Bokmaal) */
1915 { "nn_NO", 0x0814 }, /* Norwegian (Nynorsk) */
1916 { "or", 0x0448 }, /* Oriya or_ */
1917 { "om", 0x0472 }, /* Oromo (Afan, Galla) */
1918 { "pl_PL", 0x0415 }, /* Polish */
1919 { "pt_BR", 0x0416 }, /* Portuguese (Brazil) */
1920 { "pt_PT", 0x0816 }, /* Portuguese (Portugal) */
1921 { "pa", 0x0446 }, /* Punjabi pa_ */
1922 { "ps", 0x0463 }, /* Pashto (Pushto) */
1923 { "rm", 0x0417 }, /* Rhaeto_Romanic rm_ */
1924 { "ro_RO", 0x0418 }, /* Romanian */
1925 { "ro_MD", 0x0818 }, /* Romanian (Moldova) */
1926 { "ru_RU", 0x0419 }, /* Russian */
1927 { "ru_MD", 0x0819 }, /* Russian (Moldova) */
1928 { "se", 0x043b }, /* Sami (Lappish) se_ */
1929 { "sa", 0x044f }, /* Sanskrit sa_ */
1930 { "sr", 0x0c1a }, /* Serbian (Cyrillic) sr_ */
1931 { "sr", 0x081a }, /* Serbian (Latin) sr_ */
1932 { "sd", 0x0459 }, /* Sindhi sd_ */
1933 { "sk_SK", 0x041b }, /* Slovak */
1934 { "sl_SI", 0x0424 }, /* Slovenian */
1935 { "wen", 0x042e }, /* Sorbian wen_ */
1936 { "so", 0x0477 }, /* Somali */
1937 { "es_ES", 0x040a }, /* Spanish (Spain, Traditional) */
1938 { "es_MX", 0x080a }, /* Spanish (Mexico) */
1939 { "es_ES", 0x0c0a }, /* Spanish (Modern) */
1940 { "es_GT", 0x100a }, /* Spanish (Guatemala) */
1941 { "es_CR", 0x140a }, /* Spanish (Costa Rica) */
1942 { "es_PA", 0x180a }, /* Spanish (Panama) */
1943 { "es_DO", 0x1c0a }, /* Spanish (Dominican Republic) */
1944 { "es_VE", 0x200a }, /* Spanish (Venezuela) */
1945 { "es_CO", 0x240a }, /* Spanish (Colombia) */
1946 { "es_PE", 0x280a }, /* Spanish (Peru) */
1947 { "es_AR", 0x2c0a }, /* Spanish (Argentina) */
1948 { "es_EC", 0x300a }, /* Spanish (Ecuador) */
1949 { "es_CL", 0x340a }, /* Spanish (Chile) */
1950 { "es_UY", 0x380a }, /* Spanish (Uruguay) */
1951 { "es_PY", 0x3c0a }, /* Spanish (Paraguay) */
1952 { "es_BO", 0x400a }, /* Spanish (Bolivia) */
1953 { "es_SV", 0x440a }, /* Spanish (El Salvador) */
1954 { "es_HN", 0x480a }, /* Spanish (Honduras) */
1955 { "es_NI", 0x4c0a }, /* Spanish (Nicaragua) */
1956 { "es_PR", 0x500a }, /* Spanish (Puerto Rico) */
1957 { "sx", 0x0430 }, /* Sutu */
1958 { "sw", 0x0441 }, /* Swahili (Kiswahili/Kenya) */
1959 { "sv_SE", 0x041d }, /* Swedish */
1960 { "sv_FI", 0x081d }, /* Swedish (Finland) */
1961 { "ta", 0x0449 }, /* Tamil ta_ */
1962 { "tt", 0x0444 }, /* Tatar (Tatarstan) tt_ */
1963 { "te", 0x044a }, /* Telugu te_ */
1964 { "th_TH", 0x041e }, /* Thai */
1965 { "ts", 0x0431 }, /* Tsonga ts_ */
1966 { "tn", 0x0432 }, /* Tswana tn_ */
1967 { "tr_TR", 0x041f }, /* Turkish */
1968 { "tl", 0x0464 }, /* Tagalog */
1969 { "tg", 0x0428 }, /* Tajik */
1970 { "bo", 0x0451 }, /* Tibetan */
1971 { "ti", 0x0473 }, /* Tigrinya */
1972 { "uk_UA", 0x0422 }, /* Ukrainian */
1973 { "ur_PK", 0x0420 }, /* Urdu (Pakistan) */
1974 { "ur_IN", 0x0820 }, /* Urdu (India) */
1975 { "uz", 0x0443 }, /* Uzbek (Latin) uz_ */
1976 { "uz", 0x0843 }, /* Uzbek (Cyrillic) uz_ */
1977 { "ven", 0x0433 }, /* Venda ven_ */
1978 { "vi_VN", 0x042a }, /* Vietnamese */
1979 { "cy_GB", 0x0452 }, /* Welsh */
1980 { "xh", 0x0434 }, /* Xhosa xh */
1981 { "yi", 0x043d }, /* Yiddish yi_ */
1982 { "yo", 0x046a }, /* Yoruba */
1983 { "zu", 0x0435 }, /* Zulu zu_ */
1984 { "en_US", 0x0800 } /* Default */
1985 };
1986
1987 /**
1988 * gsf_msole_lid_for_language:
1989 * @lang: (allow-none): Language id, i.e., locale name.
1990 *
1991 * Returns: the LID (Language Identifier) for the input language.
1992 * If lang is %NULL, return 0x0400 ("-none-"), and not 0x0000 ("no proofing")
1993 **/
1994 guint
gsf_msole_lid_for_language(char const * lang)1995 gsf_msole_lid_for_language (char const *lang)
1996 {
1997 guint i = 0 ;
1998 size_t len;
1999
2000 if (lang == NULL)
2001 return 0x0400; /* return -none- */
2002
2003 /* Allow lang to match as a prefix (eg fr == fr_FR@euro) */
2004 len = strlen (lang);
2005 for (i = 0 ; i < G_N_ELEMENTS(gsf_msole_language_ids); i++)
2006 if (!strncmp (lang, gsf_msole_language_ids[i].tag, len))
2007 return gsf_msole_language_ids[i].lid;
2008
2009 return 0x0400 ; /* return -none- */
2010 }
2011
2012 /**
2013 * gsf_msole_language_for_lid:
2014 * @lid: numerical language id
2015 *
2016 * Returns: (transfer none): the xx_YY style string (can be just xx or
2017 * xxx) for the given LID. If the LID is not found, is set to 0x0400,
2018 * or is set to 0x0000, will return "-none-"
2019 **/
2020 char const *
gsf_msole_language_for_lid(guint lid)2021 gsf_msole_language_for_lid (guint lid)
2022 {
2023 guint i = 0 ;
2024
2025 for (i = 0 ; i < G_N_ELEMENTS(gsf_msole_language_ids); i++)
2026 if (gsf_msole_language_ids[i].lid == lid)
2027 return gsf_msole_language_ids[i].tag;
2028
2029 return "-none-"; /* default */
2030 }
2031
2032 /**
2033 * gsf_msole_locale_to_lid:
2034 * @codepage: character code page.
2035 *
2036 * Convert the the codepage into an applicable LID
2037 **/
2038 guint
gsf_msole_codepage_to_lid(int codepage)2039 gsf_msole_codepage_to_lid (int codepage)
2040 {
2041 switch (codepage) {
2042 case 77: /* MAC_CHARSET */
2043 return 0xFFF; /* This number is a hack */
2044 case 128: /* SHIFTJIS_CHARSET */
2045 return 0x411; /* Japanese */
2046 case 129: /* HANGEUL_CHARSET */
2047 return 0x412; /* Korean */
2048 case 130: /* JOHAB_CHARSET */
2049 return 0x812; /* Korean (Johab) */
2050 case 134: /* GB2312_CHARSET - Chinese Simplified */
2051 return 0x804; /* China PRC - And others!! */
2052 case 136: /* CHINESEBIG5_CHARSET - Chinese Traditional */
2053 return 0x404; /* Taiwan - And others!! */
2054 case 161: /* GREEK_CHARSET */
2055 return 0x408; /* Greek */
2056 case 162: /* TURKISH_CHARSET */
2057 return 0x41f; /* Turkish */
2058 case 163: /* VIETNAMESE_CHARSET */
2059 return 0x42a; /* Vietnamese */
2060 case 177: /* HEBREW_CHARSET */
2061 return 0x40d; /* Hebrew */
2062 case 178: /* ARABIC_CHARSET */
2063 return 0x01; /* Arabic */
2064 case 186: /* BALTIC_CHARSET */
2065 return 0x425; /* Estonian - And others!! */
2066 case 204: /* RUSSIAN_CHARSET */
2067 return 0x419; /* Russian - And others!! */
2068 case 222: /* THAI_CHARSET */
2069 return 0x41e; /* Thai */
2070 case 238: /* EASTEUROPE_CHARSET */
2071 return 0x405; /* Czech - And many others!! */
2072 }
2073
2074 /* default */
2075 return 0x0;
2076 }
2077
2078 /**
2079 * gsf_msole_lid_to_codepage:
2080 * @lid: numerical language id
2081 *
2082 * Returns: our best guess at the codepage for the given language id
2083 **/
2084 int
gsf_msole_lid_to_codepage(guint lid)2085 gsf_msole_lid_to_codepage (guint lid)
2086 {
2087 if (lid == 0x0FFF) /* Macintosh Hack */
2088 return 0x0FFF;
2089
2090 switch (lid & 0xff) {
2091 case 0x01: /* Arabic */
2092 return 1256;
2093 case 0x02: /* Bulgarian */
2094 return 1251;
2095 case 0x03: /* Catalan */
2096 return 1252;
2097 case 0x04: /* Chinese */
2098 switch (lid) {
2099 case 0x1004: /* Chinese (Singapore) */
2100 case 0x0404: /* Chinese (Taiwan) */
2101 case 0x1404: /* Chinese (Macau SAR) */
2102 case 0x0c04: /* Chinese (Hong Kong SAR, PRC) */
2103 return 950;
2104
2105 case 0x0804: /* Chinese (PRC) */
2106 return 936;
2107 default:
2108 break;
2109 }
2110 break;
2111 case 0x05: /* Czech */
2112 return 1250;
2113 case 0x06: /* Danish */
2114 return 1252;
2115 case 0x07: /* German */
2116 return 1252;
2117 case 0x08: /* Greek */
2118 return 1253;
2119 case 0x09: /* English */
2120 return 1252;
2121 case 0x0a: /* Spanish */
2122 return 1252;
2123 case 0x0b: /* Finnish */
2124 return 1252;
2125 case 0x0c: /* French */
2126 return 1252;
2127 case 0x0d: /* Hebrew */
2128 return 1255;
2129 case 0x0e: /* Hungarian */
2130 return 1250;
2131 case 0x0f: /* Icelandic */
2132 return 1252;
2133 case 0x10: /* Italian */
2134 return 1252;
2135 case 0x11: /* Japanese */
2136 return 932;
2137 case 0x12: /* Korean */
2138 switch (lid) {
2139 case 0x0812: /* Korean (Johab) */
2140 return 1361;
2141 case 0x0412: /* Korean */
2142 return 949;
2143 default:
2144 break;
2145 }
2146 break;
2147 case 0x13: /* Dutch */
2148 return 1252;
2149 case 0x14: /* Norwegian */
2150 return 1252;
2151 case 0x15: /* Polish */
2152 return 1250;
2153 case 0x16: /* Portuguese */
2154 return 1252;
2155 case 0x17: /* Rhaeto-Romanic */
2156 return 1252;
2157 case 0x18: /* Romanian */
2158 return 1250;
2159 case 0x19: /* Russian */
2160 return 1251;
2161 case 0x1a: /* Serbian, Croatian, (Bosnian?) */
2162 switch (lid) {
2163 case 0x041a: /* Croatian */
2164 return 1252;
2165 case 0x0c1a: /* Serbian (Cyrillic) */
2166 return 1251;
2167 case 0x081a: /* Serbian (Latin) */
2168 return 1252;
2169 default:
2170 break;
2171 }
2172 break;
2173 case 0x1b: /* Slovak */
2174 return 1250;
2175 case 0x1c: /* Albanian */
2176 return 1251;
2177 case 0x1d: /* Swedish */
2178 return 1252;
2179 case 0x1e: /* Thai */
2180 return 874;
2181 case 0x1f: /* Turkish */
2182 return 1254;
2183 case 0x20: /* Urdu. This is Unicode only. */
2184 return 0;
2185 case 0x21: /* Bahasa Indonesian */
2186 return 1252;
2187 case 0x22: /* Ukrainian */
2188 return 1251;
2189 case 0x23: /* Byelorussian / Belarusian */
2190 return 1251;
2191 case 0x24: /* Slovenian */
2192 return 1250;
2193 case 0x25: /* Estonian */
2194 return 1257;
2195 case 0x26: /* Latvian */
2196 return 1257;
2197 case 0x27: /* Lithuanian */
2198 return 1257;
2199 case 0x29: /* Farsi / Persian. This is Unicode only. */
2200 return 0;
2201 case 0x2a: /* Vietnamese */
2202 return 1258;
2203 case 0x2b: /* Windows 2000: Armenian. This is Unicode only. */
2204 return 0;
2205 case 0x2c: /* Azeri */
2206 switch (lid) {
2207 case 0x082c: /* Azeri (Cyrillic) */
2208 return 1251;
2209 default:
2210 break;
2211 }
2212 break;
2213 case 0x2d: /* Basque */
2214 return 1252;
2215 case 0x2f: /* Macedonian */
2216 return 1251;
2217 case 0x36: /* Afrikaans */
2218 return 1252;
2219 case 0x37: /* Windows 2000: Georgian. This is Unicode only. */
2220 return 0;
2221 case 0x38: /* Faeroese */
2222 return 1252;
2223 case 0x39: /* Windows 2000: Hindi. This is Unicode only. */
2224 return 0;
2225 case 0x3E: /* Malaysian / Malay */
2226 return 1252;
2227 case 0x41: /* Swahili */
2228 return 1252;
2229 case 0x43: /* Uzbek */
2230 switch (lid) {
2231 case 0x0843: /* Uzbek (Cyrillic) */
2232 return 1251;
2233 default:
2234 break;
2235 }
2236 break;
2237 case 0x45: /* Windows 2000: Bengali. This is Unicode only. */
2238 case 0x46: /* Windows 2000: Punjabi. This is Unicode only. */
2239 case 0x47: /* Windows 2000: Gujarati. This is Unicode only. */
2240 case 0x48: /* Windows 2000: Oriya. This is Unicode only. */
2241 case 0x49: /* Windows 2000: Tamil. This is Unicode only. */
2242 case 0x4a: /* Windows 2000: Telugu. This is Unicode only. */
2243 case 0x4b: /* Windows 2000: Kannada. This is Unicode only. */
2244 case 0x4c: /* Windows 2000: Malayalam. This is Unicode only. */
2245 case 0x4d: /* Windows 2000: Assamese. This is Unicode only. */
2246 case 0x4e: /* Windows 2000: Marathi. This is Unicode only. */
2247 case 0x4f: /* Windows 2000: Sanskrit. This is Unicode only. */
2248 case 0x55: /* Myanmar / Burmese. This is Unicode only. */
2249 case 0x57: /* Windows 2000: Konkani. This is Unicode only. */
2250 case 0x61: /* Windows 2000: Nepali (India). This is Unicode only. */
2251 return 0;
2252
2253 #if 0
2254 /******************************************************************
2255 * Below this line is untested, unproven, and are just guesses. *
2256 * Insert above and use at your own risk *
2257 ******************************************************************/
2258
2259 case 0x042c: /* Azeri (Latin) */
2260 case 0x0443: /* Uzbek (Latin) */
2261 case 0x30: /* Sutu */
2262 return 1252; /* UNKNOWN, believed to be CP1252 */
2263
2264 case 0x3f: /* Kazakh */
2265 return 1251; /* JUST UNKNOWN, probably CP1251 */
2266
2267 case 0x44: /* Tatar */
2268 case 0x58: /* Manipuri */
2269 case 0x59: /* Sindhi */
2270 case 0x60: /* Kashmiri (India) */
2271 return 0; /* UNKNOWN, believed to be Unicode only */
2272 #endif
2273 };
2274
2275 /* This is just a guess, but it will be a frequent guess */
2276 return 1252;
2277 }
2278
2279 /**
2280 * gsf_msole_lid_to_codepage_str:
2281 * @lid: numerical language id
2282 *
2283 * Returns: (transfer full): the Iconv codepage string for the given
2284 * LID.
2285 **/
2286 gchar *
gsf_msole_lid_to_codepage_str(guint lid)2287 gsf_msole_lid_to_codepage_str (guint lid)
2288 {
2289 guint cp = 0;
2290
2291 if (lid == 0x0FFF) /* Macintosh Hack */
2292 return g_strdup ("MACINTOSH");
2293
2294 cp = gsf_msole_lid_to_codepage (lid);
2295 return g_strdup_printf ("CP%d", cp);
2296 }
2297
2298 /**
2299 * gsf_msole_iconv_win_codepage:
2300 *
2301 * Returns: our best guess at the applicable windows code page based on an
2302 * environment variable or the current locale.
2303 **/
2304 int
gsf_msole_iconv_win_codepage(void)2305 gsf_msole_iconv_win_codepage (void)
2306 {
2307 const char *win_lang;
2308 char *lang = NULL;
2309
2310 win_lang = g_getenv("WINDOWS_LANGUAGE");
2311 if (win_lang) {
2312 lang = g_strdup (win_lang);
2313 } else {
2314 char const *locale = setlocale (LC_CTYPE, NULL);
2315 if (locale != NULL) {
2316 char const *lang_sep = strchr (locale, '.');
2317 if (lang_sep)
2318 lang = g_strndup (locale, lang_sep - locale);
2319 else
2320 lang = g_strdup (locale);
2321 }
2322 }
2323
2324 if (lang != NULL) {
2325 guint lid = gsf_msole_lid_for_language (lang);
2326 g_free (lang);
2327 return gsf_msole_lid_to_codepage (lid);
2328 }
2329 return 1252; /* default ansi */
2330 }
2331
2332 static GSList *
gsf_msole_iconv_get_codepage_string_list(int codepage)2333 gsf_msole_iconv_get_codepage_string_list (int codepage)
2334 {
2335 GSList *cp_list = NULL;
2336
2337 switch (codepage) {
2338 case 1200:
2339 cp_list = g_slist_prepend (cp_list, g_strdup ("UTF-16LE"));
2340 break;
2341 case 1201:
2342 cp_list = g_slist_prepend (cp_list, g_strdup ("UTF-16BE"));
2343 break;
2344 case 0x8000:
2345 case 10000:
2346 cp_list = g_slist_prepend (cp_list, g_strdup ("MACROMAN"));
2347 cp_list = g_slist_prepend (cp_list, g_strdup ("MACINTOSH"));
2348 break;
2349 case -535:
2350 case 65001:
2351 cp_list = g_slist_prepend (cp_list, g_strdup ("UTF-8"));
2352 break;
2353 case 0x8001:
2354 /* according to OOo docs 8001 is a synonym CP1252 */
2355 codepage = 1252;
2356 /* fallthrough */
2357
2358 default:
2359 cp_list = g_slist_prepend (cp_list, g_strdup_printf ("CP%u", codepage));
2360 }
2361
2362 return cp_list;
2363 }
2364
2365 /**
2366 * gsf_msole_iconv_open_codepage_for_import: (skip)
2367 * @to: the target encoding.
2368 * @codepage: the source code page.
2369 *
2370 * NOTE: skipped since GIConv is not exported to introspection.
2371 *
2372 * Returns: an iconv converter for @codepage -> utf8.
2373 **/
2374 GIConv
gsf_msole_iconv_open_codepage_for_import(char const * to,int codepage)2375 gsf_msole_iconv_open_codepage_for_import (char const *to, int codepage)
2376 {
2377 GIConv iconv_handle = (GIConv)(-1);
2378 gchar *codepage_str;
2379 GSList *codepage_list, *cp;
2380 g_return_val_if_fail (to != NULL, (GIConv)(-1));
2381
2382 cp = codepage_list = gsf_msole_iconv_get_codepage_string_list (codepage);
2383 while (cp) {
2384 codepage_str = cp->data;
2385 if (iconv_handle == (GIConv)(-1))
2386 iconv_handle = g_iconv_open (to, codepage_str);
2387 g_free (codepage_str);
2388 cp = cp->next;
2389 }
2390 g_slist_free (codepage_list);
2391
2392 if (iconv_handle == (GIConv)(-1))
2393 g_warning ("Unable to open an iconv handle from codepage %d -> %s",
2394 codepage, to);
2395 return iconv_handle;
2396 }
2397
2398 /**
2399 * gsf_msole_iconv_open_for_import: (skip)
2400 * @codepage: the source code page.
2401 *
2402 * NOTE: skipped since GIConv is not exported to introspection.
2403 *
2404 * Returns: an iconv converter for single byte encodings @codepage -> utf8.
2405 * Attempt to handle the semantics of a specification for multibyte encodings
2406 * since this is only supposed to be used for single bytes.
2407 **/
2408 GIConv
gsf_msole_iconv_open_for_import(int codepage)2409 gsf_msole_iconv_open_for_import (int codepage)
2410 {
2411 return gsf_msole_iconv_open_codepage_for_import ("UTF-8", codepage);
2412 }
2413
2414 /**
2415 * gsf_msole_iconv_open_codepages_for_export: (skip)
2416 * @codepage_to: the target code page.
2417 * @from: the source encoding.
2418 *
2419 * NOTE: skipped since GIConv is not exported to introspection.
2420 *
2421 * Returns: an iconv converter to go from utf8 -> to our best guess at a useful
2422 * windows codepage.
2423 **/
2424 GIConv
gsf_msole_iconv_open_codepages_for_export(int codepage_to,char const * from)2425 gsf_msole_iconv_open_codepages_for_export (int codepage_to, char const *from)
2426 {
2427 GIConv iconv_handle = (GIConv)(-1);
2428 gchar *codepage_str;
2429 GSList *codepage_list, *cp;
2430 g_return_val_if_fail (from != NULL, (GIConv)(-1));
2431
2432 cp = codepage_list = gsf_msole_iconv_get_codepage_string_list (codepage_to);
2433 while (cp) {
2434 codepage_str = cp->data;
2435 if (iconv_handle == (GIConv)(-1))
2436 iconv_handle = g_iconv_open (codepage_str, from);
2437 g_free (codepage_str);
2438 cp = cp->next;
2439 }
2440 g_slist_free (codepage_list);
2441
2442 if (iconv_handle == (GIConv)(-1))
2443 g_warning ("Unable to open an iconv handle from %s -> codepage %u",
2444 from, codepage_to);
2445 return iconv_handle;
2446 }
2447
2448 /**
2449 * gsf_msole_iconv_open_codepage_for_export: (skip)
2450 * @codepage_to: the target code page.
2451 *
2452 * NOTE: skipped since GIConv is not exported to introspection.
2453 *
2454 * Returns: an iconv converter to go from utf8 -> to our best guess at a useful
2455 * windows codepage.
2456 **/
2457 GIConv
gsf_msole_iconv_open_codepage_for_export(int codepage_to)2458 gsf_msole_iconv_open_codepage_for_export (int codepage_to)
2459 {
2460 return gsf_msole_iconv_open_codepages_for_export (codepage_to, "UTF-8");
2461 }
2462
2463 /**
2464 * gsf_msole_iconv_open_for_export: (skip)
2465 *
2466 * NOTE: skipped since GIConv is not exported to introspection.
2467 *
2468 * Returns: an iconv convert to go from utf8 -> to our best guess at a useful
2469 * windows codepage.
2470 **/
2471 GIConv
gsf_msole_iconv_open_for_export(void)2472 gsf_msole_iconv_open_for_export (void)
2473 {
2474 return gsf_msole_iconv_open_codepage_for_export (gsf_msole_iconv_win_codepage ());
2475 }
2476
2477 #define VBA_COMPRESSION_WINDOW 4096
2478
2479 /**
2480 * gsf_msole_inflate:
2481 * @input: stream to read from
2482 * @offset: offset into it for start byte of compresse stream
2483 *
2484 * Decompresses an LZ compressed stream.
2485 *
2486 * Return value: (transfer full): A GByteArray that the caller is responsible for freeing
2487 **/
2488 GByteArray *
gsf_msole_inflate(GsfInput * input,gsf_off_t offset)2489 gsf_msole_inflate (GsfInput *input, gsf_off_t offset)
2490 {
2491 GByteArray *res;
2492 unsigned i, win_pos, pos = 0;
2493 unsigned mask, shift, distance;
2494 guint8 flag, buffer [VBA_COMPRESSION_WINDOW];
2495 guint8 const *tmp;
2496 guint16 token, len;
2497 gboolean clean = TRUE;
2498
2499 if (gsf_input_seek (input, offset, G_SEEK_SET))
2500 return NULL;
2501
2502 res = g_byte_array_new ();
2503
2504 /* explaination from libole2/ms-ole-vba.c */
2505 /* The first byte is a flag byte. Each bit in this byte
2506 * determines what the next byte is. If the bit is zero,
2507 * the next byte is a character. Otherwise the next two
2508 * bytes contain the number of characters to copy from the
2509 * umcompresed buffer and where to copy them from (offset,
2510 * length).
2511 */
2512 while (NULL != gsf_input_read (input, 1, &flag))
2513 for (mask = 1; mask < 0x100 ; mask <<= 1)
2514 if (flag & mask) {
2515 if (NULL == (tmp = gsf_input_read (input, 2, NULL)))
2516 break;
2517 win_pos = pos % VBA_COMPRESSION_WINDOW;
2518 if (win_pos <= 0x80) {
2519 if (win_pos <= 0x20)
2520 shift = (win_pos <= 0x10) ? 12 : 11;
2521 else
2522 shift = (win_pos <= 0x40) ? 10 : 9;
2523 } else {
2524 if (win_pos <= 0x200)
2525 shift = (win_pos <= 0x100) ? 8 : 7;
2526 else if (win_pos <= 0x800)
2527 shift = (win_pos <= 0x400) ? 6 : 5;
2528 else
2529 shift = 4;
2530 }
2531
2532 token = GSF_LE_GET_GUINT16 (tmp);
2533 len = (token & ((1 << shift) - 1)) + 3;
2534 distance = token >> shift;
2535 clean = TRUE;
2536 /* fprintf (stderr, "Shift %d, token len %d, distance %d bytes %.2x %.2x\n",
2537 shift, len, distance, (token & 0xff), (token >> 8)); */
2538
2539 if (distance >= pos) {
2540 g_warning ("Corrupted compressed stream");
2541 break;
2542 }
2543
2544 for (i = 0; i < len; i++) {
2545 unsigned srcpos = (pos - distance - 1) % VBA_COMPRESSION_WINDOW;
2546 guint8 c = buffer [srcpos];
2547 buffer [pos++ % VBA_COMPRESSION_WINDOW] = c;
2548 }
2549 } else {
2550 if ((pos != 0) && ((pos % VBA_COMPRESSION_WINDOW) == 0) && clean) {
2551 (void) gsf_input_read (input, 2, NULL);
2552 clean = FALSE;
2553 g_byte_array_append (res, buffer, VBA_COMPRESSION_WINDOW);
2554 break;
2555 }
2556 if (NULL != gsf_input_read (input, 1, buffer + (pos % VBA_COMPRESSION_WINDOW)))
2557 pos++;
2558 clean = TRUE;
2559 }
2560
2561 if (pos % VBA_COMPRESSION_WINDOW)
2562 g_byte_array_append (res, buffer, pos % VBA_COMPRESSION_WINDOW);
2563 return res;
2564 }
2565
2566
2567 struct GsfMSOleSortingKey_ {
2568 gunichar2 *name;
2569 size_t len;
2570 };
2571
2572 GsfMSOleSortingKey *
gsf_msole_sorting_key_new(const char * name)2573 gsf_msole_sorting_key_new (const char *name)
2574 {
2575 GsfMSOleSortingKey *res = g_new (GsfMSOleSortingKey, 1);
2576 size_t name_len;
2577 const char *p;
2578
2579 if (!name)
2580 name = "";
2581 name_len = strlen (name);
2582
2583 res->name = g_new (gunichar2, name_len + 1);
2584 res->len = 0;
2585
2586 /* This code is a bit like g_utf8_to_utf16. */
2587
2588 for (p = name; *p; p = g_utf8_next_char (p)) {
2589 gunichar wc =
2590 g_utf8_get_char_validated (p, name_len - (p - name));
2591 if (wc & 0x80000000)
2592 break; /* Something invalid or incomplete */
2593 if (wc < 0x10000) {
2594 wc = g_unichar_toupper (wc);
2595 /* Let's hope no uppercase char is above 0xffff! */
2596 res->name[res->len++] = wc;
2597 } else {
2598 res->name[res->len++] = (wc - 0x10000) / 0x400 + 0xd800;
2599 res->name[res->len++] = (wc - 0x10000) % 0x400 + 0xdc00;
2600 }
2601 }
2602 res->name[res->len] = 0;
2603
2604 return res;
2605 }
2606
2607 void
gsf_msole_sorting_key_free(GsfMSOleSortingKey * sk)2608 gsf_msole_sorting_key_free (GsfMSOleSortingKey *sk)
2609 {
2610 if (sk) {
2611 g_free (sk->name);
2612 g_free (sk);
2613 }
2614 }
2615
2616 static GsfMSOleSortingKey *
gsf_ms_ole_sorting_key_copy(GsfMSOleSortingKey * sk)2617 gsf_ms_ole_sorting_key_copy (GsfMSOleSortingKey *sk)
2618 {
2619 GsfMSOleSortingKey *res = g_new (GsfMSOleSortingKey, 1);
2620 res->len = sk->len;
2621 res->name = g_new (gunichar2, sk->len + 1);
2622 memcpy (res->name, sk->name, (sk->len + 1) * sizeof (gunichar2));
2623 return res;
2624 }
2625
2626 GType
gsf_msole_sorting_key_get_type(void)2627 gsf_msole_sorting_key_get_type (void)
2628 {
2629 static GType type = 0;
2630
2631 if (type == 0)
2632 type = g_boxed_type_register_static
2633 ("GsfMSOleSortingKey",
2634 (GBoxedCopyFunc) gsf_ms_ole_sorting_key_copy,
2635 (GBoxedFreeFunc) gsf_msole_sorting_key_free);
2636
2637 return type;
2638 }
2639
2640 int
gsf_msole_sorting_key_cmp(const GsfMSOleSortingKey * a,const GsfMSOleSortingKey * b)2641 gsf_msole_sorting_key_cmp (const GsfMSOleSortingKey *a,
2642 const GsfMSOleSortingKey *b)
2643 {
2644 long diff;
2645 /* According to the docs length is more important than lexical order */
2646 if (a->len != b->len)
2647 diff = a->len - b->len;
2648 else {
2649 const gunichar2 *pa = a->name;
2650 const gunichar2 *pb = b->name;
2651 while (*pa == *pb && *pa)
2652 pa++, pb++;
2653 diff = *pa - *pb;
2654 }
2655
2656 /* Note, that diff might not fit "int" */
2657 return diff > 0 ? +1 : (diff < 0 ? -1 : 0);
2658 }
2659