1 /* vim: set sw=8: -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
2 /*
3  * gsf-msole-utils.c:
4  *
5  * Copyright (C) 2002-2006 Jody Goldberg (jody@gnome.org)
6  * Copyright (C) 2002-2006 Dom Lachowicz (cinamod@hotmail.com)
7  * excel_iconv* family of functions (C) 2001 by Vlad Harchev <hvv@hippo.ru>
8  *
9  * This program is free software; you can redistribute it and/or
10  * modify it under the terms of version 2.1 of the GNU Lesser General Public
11  * License as published by the Free Software Foundation.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
21  * USA
22  */
23 
24 #include <gsf-config.h>
25 #include <gsf/gsf-msole-utils.h>
26 #include <gsf/gsf.h>
27 
28 #include <locale.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <time.h>
32 #include <glib/gi18n-lib.h>
33 
/*
 * Debug tracing switch: while NO_DEBUG_OLE_PROPS is defined, d(code)
 * expands to nothing.  Remove the define to have the wrapped statements
 * compiled in and executed.
 */
#define NO_DEBUG_OLE_PROPS
#ifdef NO_DEBUG_OLE_PROPS
#define d(code)
#else
#define d(code)	do { code } while (0)
#endif
40 
/* Bit flags understood by msole_debug(). */
enum {
	DEBUG_UNKNOWN_PROPS = 1 << 0
};
44 
45 static gboolean
msole_debug(guint what)46 msole_debug (guint what)
47 {
48 	static guint flags;
49 	static gboolean inited = FALSE;
50 
51 	if (!inited) {
52 		/* not static */
53 		const GDebugKey keys[] = {
54 			{ (char*)"msole_prop", DEBUG_UNKNOWN_PROPS },
55 		};
56 
57 		const char *val = g_getenv ("GSF_DEBUG");
58 		flags = val
59 			? g_parse_debug_string (val, keys, G_N_ELEMENTS (keys))
60 			: 0;
61 
62 		inited = TRUE;
63 	}
64 
65 	return (flags & what) != 0;
66 }
67 
68 /*
69  * The Format Identifier for Summary Information
70  * F29F85E0-4FF9-1068-AB91-08002B27B3D9
71  */
72 static guint8 const component_guid [] = {
73 	0xe0, 0x85, 0x9f, 0xf2, 0xf9, 0x4f, 0x68, 0x10,
74 	0xab, 0x91, 0x08, 0x00, 0x2b, 0x27, 0xb3, 0xd9
75 };
76 
77 /*
78  * The Format Identifier for Document Summary Information
79  * D5CDD502-2E9C-101B-9397-08002B2CF9AE
80  */
81 static guint8 const document_guid [] = {
82 	0x02, 0xd5, 0xcd, 0xd5, 0x9c, 0x2e, 0x1b, 0x10,
83 	0x93, 0x97, 0x08, 0x00, 0x2b, 0x2c, 0xf9, 0xae
84 };
85 
86 /*
87  * The Format Identifier for User-Defined Properties
88  * D5CDD505-2E9C-101B-9397-08002B2CF9AE
89  */
90 static guint8 const user_guid [] = {
91 	0x05, 0xd5, 0xcd, 0xd5, 0x9c, 0x2e, 0x1b, 0x10,
92 	0x93, 0x97, 0x08, 0x00, 0x2b, 0x2c, 0xf9, 0xae
93 };
94 
/* Which property set a property (or a parsed section) belongs to. */
typedef enum {
	COMMON_PROP    = 0,	/* in either summary or docsummary */
	COMPONENT_PROP = 1,	/* SummaryInformation properties */
	DOC_PROP       = 2,	/* DocumentSummaryInformation properties */
	USER_PROP      = 3
} GsfMSOleMetaDataType;
101 
/*
 * Variant type tags used in serialized property sets.  VT_VECTOR is a
 * flag that gets OR-ed onto an element type (e.g. VT_VECTOR | VT_LPSTR)
 * rather than a type of its own.  Values 15 and 32..63 are unassigned.
 */
typedef enum {
	VT_EMPTY           = 0,		/* no data at all */
	VT_NULL            = 1,
	VT_I2              = 2,		/* 2-byte signed integer */
	VT_I4              = 3,		/* 4-byte signed integer */
	VT_R4              = 4,		/* 32-bit IEEE float */
	VT_R8              = 5,		/* 64-bit IEEE float */
	VT_CY              = 6,		/* 8-byte integer scaled by 10,000 */
	VT_DATE            = 7,
	VT_BSTR            = 8,
	VT_DISPATCH        = 9,
	VT_ERROR           = 10,
	VT_BOOL            = 11,
	VT_VARIANT         = 12,	/* type tag followed by a value */
	VT_UNKNOWN         = 13,
	VT_DECIMAL         = 14,

	VT_I1              = 16,	/* 1-byte signed integer */
	VT_UI1             = 17,	/* 1-byte unsigned integer */
	VT_UI2             = 18,	/* 2-byte unsigned integer */
	VT_UI4             = 19,	/* 4-byte unsigned integer */
	VT_I8              = 20,	/* 8-byte signed integer */
	VT_UI8             = 21,	/* 8-byte unsigned integer */
	VT_INT             = 22,
	VT_UINT            = 23,
	VT_VOID            = 24,
	VT_HRESULT         = 25,
	VT_PTR             = 26,
	VT_SAFEARRAY       = 27,
	VT_CARRAY          = 28,
	VT_USERDEFINED     = 29,
	VT_LPSTR           = 30,	/* byte-counted string */
	VT_LPWSTR          = 31,	/* character-counted 16-bit string */

	VT_FILETIME        = 64,	/* 64-bit Win32 FILETIME */
	VT_BLOB            = 65,
	VT_STREAM          = 66,
	VT_STORAGE         = 67,
	VT_STREAMED_OBJECT = 68,
	VT_STORED_OBJECT   = 69,
	VT_BLOB_OBJECT     = 70,
	VT_CF              = 71,	/* clipboard data */
	VT_CLSID           = 72,	/* class ID / GUID */

	VT_VECTOR          = 0x1000
} GsfMSOleVariantType;
147 
148 typedef struct {
149 	char const		*ms_name;
150 	GsfMSOleMetaDataType	 section;
151 	char const		*gsf_name;
152 	guint32			 id;
153 	GsfMSOleVariantType	 prefered_type;
154 } GsfMSOleMetaDataPropMap;
155 
156 typedef struct {
157 	guint32		id;
158 	gsf_off_t	offset;
159 } GsfMSOleMetaDataProp;
160 
161 typedef struct {
162 	GsfMSOleMetaDataType type;
163 	gsf_off_t   offset;
164 	guint32	    size, num_props;
165 	GIConv	    iconv_handle;
166 	unsigned    char_size;
167 	GHashTable *dict;
168 } GsfMSOleMetaDataSection;
169 
170 static GsfMSOleMetaDataPropMap const builtin_props [] = {
171 	{ "Dictionary",		  COMMON_PROP,	GSF_META_NAME_DICTIONARY,            0,	   0, /* magic */},
172 	{ "CodePage",		  COMMON_PROP,	GSF_META_NAME_CODEPAGE,              1,	   VT_I2 },
173 	{ "LOCALE_SYSTEM_DEFAULT",COMMON_PROP,	GSF_META_NAME_LOCALE_SYSTEM_DEFAULT, 0x80000000, VT_UI4},
174 	{ "CASE_SENSITIVE",	  COMMON_PROP,	GSF_META_NAME_CASE_SENSITIVE,        0x80000003, VT_UI4},
175 	{ "Category",		DOC_PROP,	GSF_META_NAME_CATEGORY,             2,	VT_LPSTR },
176 	{ "PresentationFormat",	DOC_PROP,	GSF_META_NAME_PRESENTATION_FORMAT,  3,	VT_LPSTR },
177 	{ "NumBytes",		DOC_PROP,	GSF_META_NAME_BYTE_COUNT,           4,	VT_I4 },
178 	{ "NumLines",		DOC_PROP,	GSF_META_NAME_LINE_COUNT,           5,	VT_I4 },
179 	{ "NumParagraphs",	DOC_PROP,	GSF_META_NAME_PARAGRAPH_COUNT,      6,	VT_I4 },
180 	{ "NumSlides",		DOC_PROP,	GSF_META_NAME_SLIDE_COUNT,          7,	VT_I4 },
181 	{ "NumNotes",		DOC_PROP,	GSF_META_NAME_NOTE_COUNT,           8,	VT_I4 },
182 	{ "NumHiddenSlides",	DOC_PROP,	GSF_META_NAME_HIDDEN_SLIDE_COUNT,   9,	VT_I4 },
183 	{ "NumMMClips",		DOC_PROP,	GSF_META_NAME_MM_CLIP_COUNT,       10,	VT_I4 },
184 	{ "Scale",		DOC_PROP,	GSF_META_NAME_SCALE,               11,	VT_BOOL },
185 	{ "HeadingPairs",	DOC_PROP,	GSF_META_NAME_HEADING_PAIRS,       12,	VT_VECTOR | VT_VARIANT },
186 	{ "DocumentParts",	DOC_PROP,	GSF_META_NAME_DOCUMENT_PARTS,      13,	VT_VECTOR | VT_LPSTR },
187 	{ "Manager",		DOC_PROP,	GSF_META_NAME_MANAGER,             14,	VT_LPSTR },
188 	{ "Company",		DOC_PROP,	GSF_META_NAME_COMPANY,             15,	VT_LPSTR },
189 	{ "LinksDirty",		DOC_PROP,	GSF_META_NAME_LINKS_DIRTY,         16,	VT_BOOL },
190 	// Possible match: { 0x0011, 0x0003, "PIDDSI_CCHWITHSPACES", "Number of characters with white-space" },
191 	{ "DocSumInfo_17",      DOC_PROP,	GSF_META_NAME_MSOLE_UNKNOWN_17,    17,	VT_UNKNOWN },
192 	{ "DocSumInfo_18",      DOC_PROP,	GSF_META_NAME_MSOLE_UNKNOWN_18,    18,	VT_UNKNOWN },
193 	// Possible match: { 0x0013, 0x000b, "PIDDSI_SHAREDDOC", "Shared document" },
194 	{ "DocSumInfo_19",      DOC_PROP,	GSF_META_NAME_MSOLE_UNKNOWN_19,    19,	VT_BOOL },
195 	// Possible match: +  PIDDSI_LINKBASE  = 0x0014
196 	{ "DocSumInfo_20",      DOC_PROP,	GSF_META_NAME_MSOLE_UNKNOWN_20,    20,	VT_UNKNOWN },
197 	// Possible match: +  PIDDSI_HLINKS= 0x0015,
198 	{ "DocSumInfo_21",      DOC_PROP,	GSF_META_NAME_MSOLE_UNKNOWN_21,    21,	VT_UNKNOWN },
199 	// Possible match: { 0x0016, 0x000b, "PIDDSI_HYPERLINKSCHANGED", "Hyper links changed" },
200 	{ "DocSumInfo_22",      DOC_PROP,	GSF_META_NAME_MSOLE_UNKNOWN_22,    22,	VT_BOOL },
201 	// Possible match: { 0x0017, 0x0003, "PIDDSI_VERSION", "Creating application version" },
202 	{ "DocSumInfo_23",      DOC_PROP,	GSF_META_NAME_MSOLE_UNKNOWN_23,    23,	VT_I4 },
203 	{ "Title",		COMPONENT_PROP, GSF_META_NAME_TITLE,		    2,	VT_LPSTR },
204 	{ "Subject",		COMPONENT_PROP, GSF_META_NAME_SUBJECT,		    3,	VT_LPSTR },
205 	{ "Author",		COMPONENT_PROP, GSF_META_NAME_CREATOR,		    4,	VT_LPSTR },
206 	{ "Keywords",		COMPONENT_PROP, GSF_META_NAME_KEYWORDS,		    5,	VT_LPSTR },
207 	{ "Comments",		COMPONENT_PROP, GSF_META_NAME_DESCRIPTION,	    6,	VT_LPSTR },
208 	{ "Template",		COMPONENT_PROP, GSF_META_NAME_TEMPLATE,		    7,	VT_LPSTR },
209 	{ "LastSavedBy",	COMPONENT_PROP, GSF_META_NAME_LAST_SAVED_BY,	    8,	VT_LPSTR },
210 	{ "RevisionNumber",	COMPONENT_PROP, GSF_META_NAME_REVISION_COUNT,	    9,	VT_LPSTR },
211 	{ "TotalEditingTime",	COMPONENT_PROP, GSF_META_NAME_EDITING_DURATION,	   10,	VT_FILETIME },
212 	{ "LastPrinted",	COMPONENT_PROP, GSF_META_NAME_LAST_PRINTED,	   11,	VT_FILETIME },
213 	{ "CreateTime",		COMPONENT_PROP, GSF_META_NAME_DATE_CREATED,	   12,	VT_FILETIME },
214 	{ "LastSavedTime",	COMPONENT_PROP, GSF_META_NAME_DATE_MODIFIED,	   13,	VT_FILETIME },
215 	{ "NumPages",		COMPONENT_PROP, GSF_META_NAME_PAGE_COUNT,	   14,	VT_I4 },
216 	{ "NumWords",		COMPONENT_PROP, GSF_META_NAME_WORD_COUNT,	   15,	VT_I4 },
217 	{ "NumCharacters",	COMPONENT_PROP, GSF_META_NAME_CHARACTER_COUNT,	   16,	VT_I4 },
218 	{ "Thumbnail",		COMPONENT_PROP, GSF_META_NAME_THUMBNAIL,	   17,	VT_CF },
219 	{ "AppName",		COMPONENT_PROP, GSF_META_NAME_GENERATOR,	   18,	VT_LPSTR },
220 	{ "Security",		COMPONENT_PROP, GSF_META_NAME_SECURITY,		   19,	VT_I4 }
221 };
222 
223 static GHashTable *name_to_prop_hash = NULL;
224 
225 static char const *
msole_vt_name(GsfMSOleVariantType type)226 msole_vt_name (GsfMSOleVariantType type)
227 {
228 	static char const *names[] = {
229 		"VT_EMPTY",	"VT_NULL",	"VT_I2",	"VT_I4",	"VT_R4",
230 		"VT_R8",	"VT_CY",	"VT_DATE",	"VT_BSTR",	"VT_DISPATCH",
231 		"VT_ERROR",	"VT_BOOL",	"VT_VARIANT",	"VT_UNKNOWN",	"VT_DECIMAL",
232 		NULL,		"VT_I1",	"VT_UI1",	"VT_UI2",	"VT_UI4",
233 		"VT_I8",	"VT_UI8",	"VT_INT",	"VT_UINT",	"VT_VOID",
234 		"VT_HRESULT",	"VT_PTR",	"VT_SAFEARRAY",	"VT_CARRAY",	"VT_USERDEFINED",
235 		"VT_LPSTR",	"VT_LPWSTR",
236 	};
237 	static char const *names2[] = {
238 		"VT_FILETIME",
239 		"VT_BLOB",	"VT_STREAM",	"VT_STORAGE",	"VT_STREAMED_OBJECT",
240 		"VT_STORED_OBJECT", "VT_BLOB_OBJECT", "VT_CF",	"VT_CLSID"
241 	};
242 
243 	type &= ~VT_VECTOR;
244 	if (type <= VT_LPWSTR)
245 		return names[type];
246 	g_return_val_if_fail (type >= VT_FILETIME, "_UNKNOWN_");
247 	g_return_val_if_fail (type <= VT_CLSID, "_UNKNOWN_");
248 	return names2[type-VT_FILETIME];
249 }
250 
251 static char const *
msole_prop_id_to_gsf(GsfMSOleMetaDataSection * section,guint32 id,gboolean * linked)252 msole_prop_id_to_gsf (GsfMSOleMetaDataSection *section, guint32 id, gboolean *linked)
253 {
254 	char const *res = NULL;
255 	GsfMSOleMetaDataPropMap const *map = NULL;
256 	unsigned i = 0;
257 
258 	*linked = FALSE;
259 	if (section->dict != NULL) {
260 		if (id & 0x1000000) {
261 			*linked = TRUE;
262 			id &= ~0x1000000;
263 			d (g_print ("LINKED "););
264 		}
265 
266 		res = g_hash_table_lookup (section->dict, GINT_TO_POINTER (id));
267 
268 		if (res != NULL) {
269 			d (g_print ("%s", res););
270 			return res;
271 		}
272 	}
273 
274 	map = builtin_props ;
275 	i = G_N_ELEMENTS (builtin_props);
276 	while (i-- > 0)
277 		if (map[i].id == id &&
278 		    (map[i].section == COMMON_PROP || map[i].section == section->type)) {
279 			d (g_print ("%s\n", map[i].gsf_name););
280 			return map[i].gsf_name;
281 		}
282 
283 	d (g_print ("_UNKNOWN_(0x%x %d)\n", id, id););
284 
285 	return NULL;
286 }
287 
288 static GsfMSOleMetaDataPropMap const *
msole_gsf_name_to_prop(char const * name)289 msole_gsf_name_to_prop (char const *name)
290 {
291 	if (NULL == name_to_prop_hash) {
292 		int i;
293 		name_to_prop_hash = g_hash_table_new (g_str_hash, g_str_equal);
294 		for (i = G_N_ELEMENTS (builtin_props); i-- > 0; )
295 			g_hash_table_replace (name_to_prop_hash,
296 				(gpointer) builtin_props[i].gsf_name,
297 				(gpointer) (builtin_props+i));
298 	}
299 
300 	return g_hash_table_lookup (name_to_prop_hash, (gpointer)name);
301 }
302 
303 static void
set_error_missing_data(GError ** error,const char * property_name,gsize size_needed,gsize size_gotten)304 set_error_missing_data (GError **error, const char *property_name, gsize size_needed, gsize size_gotten)
305 {
306 	gchar *size_needed_str, *size_gotten_str;
307 
308 	size_needed_str = g_strdup_printf ("%" G_GSIZE_FORMAT, size_needed);
309 	size_gotten_str = g_strdup_printf ("%" G_GSIZE_FORMAT, size_gotten);
310 	g_set_error (error,
311 		     GSF_ERROR,
312 		     GSF_ERROR_INVALID_DATA,
313 		     _("Missing data when reading the %s property; got %s bytes, "
314 		       "but %s bytes at least are needed."),
315 		     property_name,
316 		     size_needed_str,
317 		     size_gotten_str);
318 	g_free (size_needed_str);
319 	g_free (size_gotten_str);
320 }
321 
322 /* Can return errors from gsf_blob_new() and GSF_ERROR_INVALID_DATA */
323 static gboolean
parse_vt_cf(GValue * res,guint8 const ** data,guint8 const * data_end,GError ** error)324 parse_vt_cf (GValue *res, guint8 const **data, guint8 const *data_end, GError **error)
325 {
326 	/* clipboard size		uint32		sizeof (clipboard format tag) + sizeof (clipboard data)
327 	 * clipboard format tag		int32		see below
328 	 * clipboard data		byte[]		see below
329 	 *
330 	 * Clipboard format tag:
331 	 * -1 - Windows clipboard format
332 	 * -2 - Macintosh clipboard format
333 	 * -3 - GUID that contains a format identifier (FMTID)
334 	 * >0 - custom clipboard format name plus data (see msdn site below)
335 	 *  0 - No data
336 	 *
337 	 * References:
338 	 * http://msdn.microsoft.com/library/default.asp?url=/library/en-us/stg/stg/propvariant.asp
339 	 * http://jakarta.apache.org/poi/hpsf/thumbnails.html
340 	 * http://linux.com.hk/docs/poi/org/apache/poi/hpsf/Thumbnail.html
341 	 * http://sparks.discreet.com/knowledgebase/public/solutions/ExtractThumbnailImg.htm
342 	 */
343 	guint32 clip_size, clip_data_size;
344 	gint32 clip_format;
345 	GsfBlob *blob;
346 	GsfClipData *clip_data;
347 
348 	/* Clipboard size field */
349 
350 	if (data_end < *data + 4) {
351 		set_error_missing_data (error, "VT_CF", 4, data_end - *data);
352 		return FALSE;
353 	}
354 
355 	clip_size = GSF_LE_GET_GUINT32 (*data);
356 
357 	if (clip_size < 4) {	/* must emcompass int32 format plus data size */
358 		gchar *size_str;
359 
360 		size_str = g_strdup_printf ("%" G_GSIZE_FORMAT, (gsize) clip_size);
361 		g_set_error (error,
362 			     GSF_ERROR,
363 			     GSF_ERROR_INVALID_DATA,
364 			     _("Corrupt data in the VT_CF property; clipboard data length must be at least 4 bytes, "
365 			       "but the data says it only has %s bytes available."),
366 			     size_str);
367 		g_free (size_str);
368 		return FALSE;
369 	}
370 
371 	*data += 4;
372 
373 	/* Check clipboard format plus data size */
374 
375 	if (data_end < *data + clip_size) {
376 		set_error_missing_data (error, "VT_CF", clip_size, data_end - *data);
377 		return FALSE;
378 	}
379 
380 	clip_format = GSF_LE_GET_GINT32 (*data);
381 	*data += 4;
382 
383 	switch (clip_format) {
384 	case GSF_CLIP_FORMAT_WINDOWS_CLIPBOARD:
385 	case GSF_CLIP_FORMAT_MACINTOSH_CLIPBOARD:
386 	case GSF_CLIP_FORMAT_GUID:
387 	case GSF_CLIP_FORMAT_NO_DATA:
388 		/* everything is ok */
389 		break;
390 
391 	default:
392 		if (clip_format > 0)
393 			clip_format = GSF_CLIP_FORMAT_CLIPBOARD_FORMAT_NAME;
394 		else
395 			clip_format = GSF_CLIP_FORMAT_UNKNOWN;
396 
397 		break;
398 	}
399 
400 	clip_data_size = clip_size - 4;
401 
402 	blob = gsf_blob_new (clip_data_size, *data, error);
403 
404 	*data += clip_data_size;
405 
406 	if (!blob)
407 		return FALSE;
408 
409 	clip_data = gsf_clip_data_new (clip_format, blob);
410 	g_object_unref (blob);
411 
412 	g_value_init (res, GSF_TYPE_CLIP_DATA);
413 	g_value_set_object (res, clip_data);
414 	g_object_unref (clip_data);
415 
416 	return TRUE;
417 }
418 
/*
 * msole_codepage_char_size:
 * @codepage: codepage number.
 *
 * Returns: 2 for codepages 1200 and 1201 (presumably the two UTF-16
 * codepages), 1 for every other value.
 */
static unsigned
msole_codepage_char_size (int codepage)
{
	switch (codepage) {
	case 1200:
	case 1201:
		return 2;
	default:
		return 1;
	}
}
426 
427 /*
428  * Return a number no bigger than the number of bytes used for a property
429  * value of a given type.  The returned number might be too small, but
430  * we try to return as big a value as possible.
431  */
432 static size_t
msole_prop_min_size(guint32 type)433 msole_prop_min_size (guint32 type)
434 {
435 	switch (type) {
436 	case VT_EMPTY:
437 	case VT_NULL:
438 		return 0;
439 
440 	case VT_BOOL:
441 	case VT_I1:
442 	case VT_UI1:
443 		return 1;
444 
445 	case VT_I2:
446 	case VT_UI2:
447 		return 2;
448 
449 	case VT_I4:
450 	case VT_R4:
451 	case VT_ERROR:
452 	case VT_VARIANT:
453 	case VT_UI4:
454 	case VT_LPSTR:
455 	case VT_LPWSTR:
456 	case VT_BLOB:
457 	case VT_BLOB_OBJECT:
458 	case VT_CF:
459 	case VT_VECTOR:
460 		return 4;
461 
462 	case VT_BSTR:
463 		return 5;
464 
465 	case VT_R8:
466 	case VT_CY:
467 	case VT_DATE:
468 	case VT_I8:
469 	case VT_UI8:
470 	case VT_FILETIME:
471 		return 8;
472 
473 	case VT_CLSID:
474 		return 16;
475 
476 	case VT_DISPATCH:
477 	case VT_UNKNOWN:
478 	case VT_DECIMAL:
479 	case VT_INT:
480 	case VT_UINT:
481 	case VT_VOID:
482 	case VT_HRESULT:
483 	case VT_PTR:
484 	case VT_SAFEARRAY:
485 	case VT_CARRAY:
486 	case VT_USERDEFINED:
487 	case VT_STREAM:
488 	case VT_STORAGE:
489 	case VT_STREAMED_OBJECT:
490 	case VT_STORED_OBJECT:
491 	default:
492 		return 0;
493 	}
494 }
495 
496 #define NEED_RECS(_n,_size1)						\
497   do {									\
498         guint _s1 = (_size1);						\
499 	bytes_needed = (_n);						\
500 	if (_s1 > 0 && (data_end - *data) / _s1 < bytes_needed) {	\
501 		g_warning ("Invalid MS property or file truncated");	\
502 		g_free (res);						\
503 		return NULL;						\
504 	}								\
505 	bytes_needed *= _s1;						\
506   } while (0)
507 
508 #define NEED_BYTES(_n) NEED_RECS(_n,1)
509 
510 #define ADVANCE do { *data += bytes_needed; } while (0)
511 
512 static GValue *
msole_prop_parse(GsfMSOleMetaDataSection * section,guint32 type,guint8 const ** data,guint8 const * data_end)513 msole_prop_parse (GsfMSOleMetaDataSection *section,
514 		  guint32 type, guint8 const **data, guint8 const *data_end)
515 {
516 	GValue *res = NULL;
517 	char *str;
518 	guint32 len;
519 	gboolean const is_vector = type & VT_VECTOR;
520 	GError *error;
521 	guint bytes_needed;
522 
523 	g_return_val_if_fail (!(type & (unsigned)(~0x1fff)), NULL); /* not valid in a prop set */
524 
525 	type &= 0xfff;
526 
527 	if (is_vector) {
528 		/*
529 		 *  A vector is basically an array.  If the type associated with
530 		 *  it is a variant, then each element can have a different
531 		 *  variant type.  Otherwise, each element has the same variant
532 		 *  type associated with the vector.
533 		 */
534 		unsigned i, n, size1;
535 		GsfDocPropVector *vector;
536 
537 		NEED_BYTES (4);
538 		n = GSF_LE_GET_GUINT32 (*data);
539 		ADVANCE;
540 
541 		d (g_print (" array with %d elem\n", n);
542 		   gsf_mem_dump (*data, (unsigned)(data_end - *data)););
543 
544 		size1 = msole_prop_min_size (type);
545 		NEED_RECS(n, size1);
546 
547 		vector = gsf_docprop_vector_new ();
548 
549 		for (i = 0 ; i < n ; i++) {
550 			GValue *v;
551 			guint8 const *data0 = *data;
552 			d (g_print ("\t[%d] ", i););
553 			v = msole_prop_parse (section, type, data, data_end);
554 			if (v) {
555 				if (G_IS_VALUE (v)) {
556 					gsf_docprop_vector_append (vector, v);
557 					g_value_unset (v);
558 				}
559 				g_free (v);
560 			}
561 			if (*data == data0)
562 				break;
563 		}
564 
565 		res = g_new0 (GValue, 1);
566 		g_value_init (res, GSF_DOCPROP_VECTOR_TYPE);
567 		g_value_set_object (res, vector);
568 		g_object_unref (vector);
569 		return res;
570 	}
571 
572 	res = g_new0 (GValue, 1);
573 	d (g_print ("%s\n", msole_vt_name (type)););
574 	switch (type) {
575 	case VT_EMPTY:
576 		/*
577 		 * A property with a type indicator of VT_EMPTY has no data
578 		 * associated with it; that is, the size of the value is zero.
579 		 */
580 		/* value::unset == empty */
581 		break;
582 
583 	case VT_NULL:
584 		/* This is like a pointer to NULL */
585 		/* value::unset == null too :-) do we need to distinguish ? */
586 		break;
587 
588 	case VT_I2:
589 		/* 2-byte signed integer */
590 		NEED_BYTES (2);
591 		g_value_init (res, G_TYPE_INT);
592 		g_value_set_int	(res, GSF_LE_GET_GINT16 (*data));
593 		ADVANCE;
594 		break;
595 
596 	case VT_I4:
597 		/* 4-byte signed integer */
598 		NEED_BYTES (4);
599 		g_value_init (res, G_TYPE_INT);
600 		g_value_set_int	(res, GSF_LE_GET_GINT32 (*data));
601 		ADVANCE;
602 		break;
603 
604 	case VT_R4:
605 		/* 32-bit IEEE floating-point value */
606 		NEED_BYTES (4);
607 		g_value_init (res, G_TYPE_FLOAT);
608 		g_value_set_float (res, GSF_LE_GET_FLOAT (*data));
609 		ADVANCE;
610 		break;
611 
612 	case VT_R8:
613 		/* 64-bit IEEE floating-point value */
614 		NEED_BYTES (8);
615 		g_value_init (res, G_TYPE_DOUBLE);
616 		g_value_set_double (res, GSF_LE_GET_DOUBLE (*data));
617 		ADVANCE;
618 		break;
619 
620 	case VT_CY:
621 		/* 8-byte two's complement integer (scaled by 10,000) */
622 		NEED_BYTES (8);
623 		/* CHEAT : just store as an int64 for now */
624 		g_value_init (res, G_TYPE_INT64);
625 		g_value_set_int64 (res, GSF_LE_GET_GINT64 (*data));
626 		break;
627 
628 	case VT_DATE:
629 		/*
630 		 * 64-bit floating-point number representing the number of days
631 		 * (not seconds) since December 31, 1899.
632 		 */
633 		if (msole_debug (DEBUG_UNKNOWN_PROPS))
634 			g_warning ("Unhandled property value type %d (0x%x)",
635 				   type, type);
636 		NEED_BYTES (8);
637 		ADVANCE;
638 		break;
639 
640 	case VT_BSTR:
641 		/*
642 		 * Pointer to null-terminated Unicode string; the string is pre-
643 		 * ceeded by a DWORD representing the byte count of the number
644 		 * of bytes in the string (including the  terminating null).
645 		 */
646 		if (msole_debug (DEBUG_UNKNOWN_PROPS))
647 			g_warning ("Unhandled property value type %d (0x%x)",
648 				   type, type);
649 		NEED_BYTES (4);
650 		ADVANCE;
651 		break;
652 
653 	case VT_DISPATCH:
654 		if (msole_debug (DEBUG_UNKNOWN_PROPS))
655 			g_warning ("Unhandled property value type %d (0x%x)",
656 				   type, type);
657 		break;
658 
659 	case VT_BOOL:
660 		/* A boolean (WORD) value containg 0 (false) or -1 (true). */
661 		NEED_BYTES (1);
662 		g_value_init (res, G_TYPE_BOOLEAN);
663 		g_value_set_boolean (res, **data ? TRUE : FALSE);
664 		ADVANCE;
665 		break;
666 
667 	case VT_VARIANT :	 d (g_print ("\tcontaining a "););
668 		/*
669 		 * A type indicator (a DWORD) followed by the corresponding
670 		 *  value.  VT_VARIANT is only used in conjunction with
671 		 *  VT_VECTOR.
672 		 */
673 		NEED_BYTES (4);
674 		g_free (res);
675 		type = GSF_LE_GET_GUINT32 (*data);
676 		ADVANCE;
677 		return msole_prop_parse (section, type, data, data_end);
678 
679 	case VT_UI1:
680 		/* 1-byte unsigned integer */
681 		NEED_BYTES (1);
682 		g_value_init (res, G_TYPE_UCHAR);
683 		g_value_set_uchar (res, GSF_LE_GET_GUINT8 (*data));
684 		ADVANCE;
685 		break;
686 
687 	case VT_I1:
688 		/* 1-byte signed integer */
689 		NEED_BYTES (1);
690 		g_value_init (res, G_TYPE_CHAR);
691 		g_value_set_schar (res, GSF_LE_GET_GINT8 (*data));
692 		ADVANCE;
693 		break;
694 
695 	case VT_UI2:
696 		/* 2-byte unsigned integer */
697 		NEED_BYTES (2);
698 		g_value_init (res, G_TYPE_UINT);
699 		g_value_set_uint (res, GSF_LE_GET_GUINT16 (*data));
700 		ADVANCE;
701 		break;
702 
703 	case VT_UI4:
704 		/* 4-type unsigned integer */
705 		NEED_BYTES (4);
706 		g_value_init (res, G_TYPE_UINT);
707 		g_value_set_uint (res, GSF_LE_GET_GUINT32 (*data));
708 		ADVANCE;
709 		break;
710 
711 	case VT_I8 :		 d (g_print ("VT_I8\n"););
712 		/* 8-byte signed integer */
713 		NEED_BYTES (8);
714 		g_value_init (res, G_TYPE_INT64);
715 		g_value_set_int64 (res, GSF_LE_GET_GINT64 (*data));
716 		ADVANCE;
717 		break;
718 
719 	case VT_UI8:
720 		/* 8-byte unsigned integer */
721 		NEED_BYTES (8);
722 		g_value_init (res, G_TYPE_UINT64);
723 		g_value_set_uint64 (res, GSF_LE_GET_GUINT64 (*data));
724 		ADVANCE;
725 		break;
726 
727 	case VT_LPSTR: {
728 		guint32 need;
729 		/*
730 		 * This is the representation of many strings.  It is stored in
731 		 * the same representation as VT_BSTR.  Note that the serialized
732 		 * representation of VP_LPSTR has a preceding byte count,
733 		 * whereas the in-memory representation does not.
734 		 */
735 		NEED_BYTES (4);
736 		len = GSF_LE_GET_GUINT32 (*data);
737 		ADVANCE;
738 
739 		g_return_val_if_fail (len < 0x10000, NULL);
740 
741 		need = len;
742 		if (section->char_size > 1 && (need & 3))
743 			need = (need & ~3) + 4;
744 		NEED_BYTES (need);
745 
746 		error = NULL;
747 		d (gsf_mem_dump (*data, len * section->char_size););
748 		str = g_convert_with_iconv (*data,
749 					    len > section->char_size ? len - section->char_size : 0,
750 			section->iconv_handle, NULL, NULL, &error);
751 
752 		g_value_init (res, G_TYPE_STRING);
753 		if (NULL != str) {
754 			g_value_take_string (res, str);
755 		} else if (NULL != error) {
756 			g_warning ("error: %s", error->message);
757 			g_error_free (error);
758 		} else {
759 			g_warning ("unknown error converting string property, using blank");
760 		}
761 		ADVANCE;
762 		break;
763 	}
764 
765 	case VT_LPWSTR:
766 		/*
767 		 * A counted and null-terminated Unicode string; a DWORD character
768 		 * count (where the count includes the terminating null) followed
769 		 * by that many Unicode (16-bit) characters.  Note that the count
770 		 * is character count, not byte count.
771 		 */
772 
773 		NEED_BYTES (4);
774 		len = GSF_LE_GET_GUINT32 (*data);
775 		ADVANCE;
776 
777 		NEED_RECS (len, 2);
778 
779 		g_return_val_if_fail (len < 0x10000, NULL);
780 
781 		error = NULL;
782 		d (gsf_mem_dump (*data, len * 2););
783 		str = g_convert (*data, len * 2,
784 				 "UTF-8", "UTF-16LE", NULL, NULL, &error);
785 
786 		g_value_init (res, G_TYPE_STRING);
787 		if (NULL != str) {
788 			g_value_set_string (res, str);
789 			g_free (str);
790 		} else if (NULL != error) {
791 			g_warning ("error: %s", error->message);
792 			g_error_free (error);
793 		} else {
794 			g_warning ("unknown error converting string property, using blank");
795 		}
796 		ADVANCE;
797 		break;
798 
799 	case VT_FILETIME : {
800 		/* 64-bit FILETIME structure, as defined by Win32. */
801 		guint64 ft;
802 		GsfTimestamp *ts;
803 
804 		NEED_BYTES (8);
805 
806 		/* ft * 100ns since Jan 1 1601 */
807 		ft = GSF_LE_GET_GUINT64 (*data);
808 
809 		ft /= 10000000; /* convert to seconds */
810 		ft -= G_GINT64_CONSTANT (11644473600); /* move to Jan 1 1970 */
811 		ts = gsf_timestamp_new ();
812 		gsf_timestamp_set_time (ts, ft);
813 		g_value_init (res, GSF_TIMESTAMP_TYPE);
814 		gsf_timestamp_to_value (ts, res);
815 		gsf_timestamp_free (ts);
816 
817 		ADVANCE;
818 		break;
819 	}
820 
821 	case VT_BLOB:
822 		/*
823 		 * A DWORD count of bytes, followed by that many bytes of data.
824 		 * The byte count does not include the four bytes for the length
825 		 * of the count itself:  An empty blob would have a count of
826 		 * zero, followed by zero bytes.  Thus the serialized represen-
827 		 * tation of a VT_BLOB is similar to that of a VT_BSTR but does
828 		 * not guarantee a null byte at the end of the data.
829 		 */
830 		NEED_BYTES (4);
831 		ADVANCE;
832 		if (msole_debug (DEBUG_UNKNOWN_PROPS))
833 			g_warning ("Unhandled property value type %d (0x%x)",
834 				   type, type);
835 		g_free (res);
836 		res = NULL;
837 		break;
838 
839 	case VT_STREAM:
840 		/*
841 		 * Indicates the value is stored in a stream that is sibling
842 		 * to the CONTENTS stream.  Following this type indicator is
843 		 * data in the format of a serialized VT_LPSTR, which names
844 		 * the stream containing the data.
845 		 */
846 		if (msole_debug (DEBUG_UNKNOWN_PROPS))
847 			g_warning ("Unhandled property value type %d (0x%x)",
848 				   type, type);
849 		g_free (res);
850 		res = NULL;
851 		break;
852 
853 	case VT_STORAGE:
854 		/*
855 		 * Indicates the value is stored in an IStorage that is
856 		 * sibling to the CONTENTS stream.  Following this type
857 		 * indicator is data in the format of a serialized VT_LPSTR,
858 		 * which names the IStorage containing the data.
859 		 */
860 		if (msole_debug (DEBUG_UNKNOWN_PROPS))
861 			g_warning ("Unhandled property value type %d (0x%x)",
862 				   type, type);
863 		g_free (res);
864 		res = NULL;
865 		break;
866 
867 	case VT_STREAMED_OBJECT:
868 		/*
869 		 * Same as VT_STREAM, but indicates that the stream contains a
870 		 * serialized object, which is a class ID followed by initiali-
871 		 * zation data for the class.
872 		 */
873 		if (msole_debug (DEBUG_UNKNOWN_PROPS))
874 			g_warning ("Unhandled property value type %d (0x%x)",
875 				   type, type);
876 		g_free (res);
877 		res = NULL;
878 		break;
879 
880 	case VT_STORED_OBJECT:
881 		/*
882 		 * Same as VT_STORAGE, but indicates that the designated
883 		 * IStorage contains a loadable object.
884 		 */
885 		if (msole_debug (DEBUG_UNKNOWN_PROPS))
886 			g_warning ("Unhandled property value type %d (0x%x)",
887 				   type, type);
888 		g_free (res);
889 		res = NULL;
890 		break;
891 
892 	case VT_BLOB_OBJECT:
893 		/*
894 		 * Contains a serialized object in the same representation as
895 		 * would appear in a VT_STREAMED_OBJECT.  That is, following
896 		 * the VT_BLOB_OBJECT tag is a DWORD byte count of the
897 		 * remaining data (where the byte count does not include the
898 		 * size of itself) which is in the format of a class ID
899 		 * followed by initialization data for that class
900 		 */
901 		if (msole_debug (DEBUG_UNKNOWN_PROPS))
902 			g_warning ("Unhandled property value type %d (0x%x)",
903 				   type, type);
904 		g_free (res);
905 		res = NULL;
906 		break;
907 
908 	case VT_CF:
909 		error = NULL;
910 		if (!parse_vt_cf (res, data, data_end, &error)) {
911 			/* suck, we can't propagate the error upwards */
912 			if (error) {
913 				g_warning ("error: %s", error->message);
914 				g_error_free (error);
915 			}
916 			else {
917 				g_warning ("unknown error parsing vt_cf");
918 			}
919 			g_free (res);
920 			res = NULL;
921 		}
922 		break;
923 
924 	case VT_CLSID:
925 		/* A class ID (or other GUID) */
926 		NEED_BYTES (16);
927 		ADVANCE;
928 		g_free (res);
929 		res = NULL;
930 		break;
931 
932 	case VT_ERROR:
933 		/* A DWORD containing a status code. */
934 	case VT_UNKNOWN:
935 	case VT_DECIMAL:
936 	case VT_INT:
937 	case VT_UINT:
938 	case VT_VOID:
939 	case VT_HRESULT:
940 	case VT_PTR:
941 	case VT_SAFEARRAY:
942 	case VT_CARRAY:
943 	case VT_USERDEFINED:
944 		g_warning ("type %s (0x%x) is not permitted in property sets",
945 			   msole_vt_name (type), type);
946 		g_free (res);
947 		res = NULL;
948 		break;
949 
950 	default:
951 		if (msole_debug (DEBUG_UNKNOWN_PROPS))
952 			g_warning ("Unknown property type %d (0x%x)",
953 				   type, type);
954 		g_free (res);
955 		res = NULL;
956 	}
957 
958 	if (res != NULL && G_IS_VALUE (res)) {
959 		d ( {
960 			char *val = g_strdup_value_contents (res);
961 			g_print ("%s\n", val);
962 			g_free (val);
963 		});
964 	} else {
965 		d ({
966 			char const *type_name = msole_vt_name (type);
967 			if (type_name) {
968 				g_printerr ("A '%s' property could not be parsed\n", type_name);
969 			} else {
970 				g_printerr ("A %d property could not be parsed\n", type);
971 			}
972 		});
973 		g_free (res);
974 		res = NULL;
975 	}
976 	return res;
977 }
978 #undef NEED_BYTES
979 #undef NEED_RECS
980 #undef ADVANCE
981 
982 static gboolean
msole_prop_read(GsfInput * in,GsfMSOleMetaDataSection * section,GsfMSOleMetaDataProp * props,unsigned i,GsfDocMetaData * accum)983 msole_prop_read (GsfInput *in,
984 		 GsfMSOleMetaDataSection *section,
985 		 GsfMSOleMetaDataProp    *props,
986 		 unsigned		  i,
987 		 GsfDocMetaData		 *accum)
988 {
989 	guint32 type;
990 	guint8 const *data;
991 	gsf_off_t size = ((i+1) >= section->num_props)
992 		? section->size
993 		: props[i+1].offset;
994 	char   *name;
995 	GValue *val;
996 
997 	g_return_val_if_fail (i < section->num_props, FALSE);
998 	g_return_val_if_fail (size >= props[i].offset + 4, FALSE);
999 
1000 	size -= props[i].offset; /* includes the type id */
1001 	/* From now on, size is actually a size.  */
1002 	if (gsf_input_seek (in, section->offset+props[i].offset, G_SEEK_SET) ||
1003 	    NULL == (data = gsf_input_read (in, size, NULL))) {
1004 		g_warning ("failed to read prop #%d", i);
1005 		return FALSE;
1006 	}
1007 
1008 	type = GSF_LE_GET_GUINT32 (data);
1009 	data += 4;
1010 
1011 	/* dictionary is magic */
1012 	if (props[i].id == 0) {
1013 		guint32 len, id, j, n;
1014 		gsize gslen;
1015 		char *name;
1016 		guint8 const *start = data;
1017 		guint8 const *end = start + (size - 4);
1018 
1019 		g_return_val_if_fail (section->dict == NULL, FALSE);
1020 
1021 		section->dict = g_hash_table_new_full (
1022 			g_direct_hash, g_direct_equal,
1023 			NULL, g_free);
1024 
1025 		d ({ g_print ("Dictionary = \n"); gsf_mem_dump (data-4, size); });
1026 		n = type;
1027 		for (j = 0; j < n; j++) {
1028 			g_return_val_if_fail (end - data >= 8, FALSE);
1029 
1030 			id = GSF_LE_GET_GUINT32 (data);
1031 			len = GSF_LE_GET_GUINT32 (data + 4);
1032 
1033 			g_return_val_if_fail (len < 0x10000, FALSE);
1034 			g_return_val_if_fail (len <= end - data + 8, FALSE);
1035 
1036 			gslen = 0;
1037 			name = g_convert_with_iconv (data + 8,
1038 				len * section->char_size,
1039 				section->iconv_handle, &gslen, NULL, NULL);
1040 			len = (guint32)gslen;
1041 			data += 8 + len;
1042 
1043 			d (g_print ("\t%u == %s\n", id, name););
1044 			g_hash_table_replace (section->dict,
1045 				GINT_TO_POINTER (id), name);
1046 
1047 			/* MS documentation blows goats !
1048 			 * The docs claim there are padding bytes in the dictionary.
1049 			 * Their examples show padding bytes.
1050 			 * In reality non-unicode strings do not seem to
1051 			 * have padding.
1052 			 */
1053 			if (section->char_size != 1 && (data - start) % 4)
1054 				data += 4 - ((data - start) % 4);
1055 		}
1056 	} else {
1057 		gboolean linked;
1058 		d (g_print ("===> %u) ", i);
1059 		   gsf_mem_dump (data-4, size););
1060 
1061 		name = g_strdup (msole_prop_id_to_gsf (section, props[i].id, &linked));
1062 		d (g_print (" @ %x %x = ", (unsigned)props[i].offset, (unsigned)size););
1063 		val = msole_prop_parse (section, type, &data, data + size - 4);
1064 
1065 		if (NULL != name && NULL != val) {
1066 			if (linked) {
1067 				GsfDocProp *prop = gsf_doc_meta_data_lookup (accum, name);
1068 				if (NULL == prop) {
1069 					g_warning ("linking property '%s' before it\'s value is specified",
1070 						   (name ? name : "<null>"));
1071 				} else if (!G_VALUE_HOLDS_STRING (val)) {
1072 					g_warning ("linking property '%s' before it\'s value is specified",
1073 						   (name ? name : "<null>"));
1074 				} else
1075 					gsf_doc_prop_set_link (prop,
1076 						g_value_dup_string (val));
1077 			} else {
1078 				gsf_doc_meta_data_insert (accum, name, val);
1079 				val = NULL;
1080 				name = NULL;
1081 			}
1082 		}
1083 
1084 		if (NULL != val) {
1085 			if (G_IS_VALUE (val))
1086 				g_value_unset (val);
1087 			g_free (val);
1088 		}
1089 		g_free (name);
1090 	}
1091 
1092 	return TRUE;
1093 }
1094 
1095 static int
msole_prop_cmp(gconstpointer a,gconstpointer b)1096 msole_prop_cmp (gconstpointer a, gconstpointer b)
1097 {
1098 	GsfMSOleMetaDataProp const *prop_a = a;
1099 	GsfMSOleMetaDataProp const *prop_b = b;
1100 
1101 	if (prop_a->offset < prop_b->offset)
1102 		return -1;
1103 	else if (prop_a->offset > prop_b->offset)
1104 		return +1;
1105 	else
1106 		return 0;
1107 }
1108 
1109 /**
1110  * gsf_doc_meta_data_read_from_msole:
1111  * @accum: #GsfDocMetaData
1112  * @in: #GsfInput
1113  *
1114  * Read a stream formated as a set of MS OLE properties from @in and store the
1115  * results in @accum.
1116  *
1117  * Since: 1.14.24
1118  *
1119  * Returns: (transfer full): A #GError if there was an error.
1120  **/
1121 GError *
gsf_doc_meta_data_read_from_msole(GsfDocMetaData * accum,GsfInput * in)1122 gsf_doc_meta_data_read_from_msole (GsfDocMetaData *accum, GsfInput *in)
1123 {
1124 	guint8 const *data;
1125 	guint16 version;
1126 	guint32 os, num_sections;
1127 	unsigned i, j;
1128 	GsfMSOleMetaDataSection *sections;
1129 	GsfMSOleMetaDataProp	*props;
1130 	GsfDocProp		*prop;
1131 
1132 	/* http://bugzilla.gnome.org/show_bug.cgi?id=352055
1133 	 * psiwin generates files with empty property sections */
1134 	if (gsf_input_size (in) <= 0)
1135 		return NULL;
1136 
1137 	data = gsf_input_read (in, 28, NULL);
1138 	if (NULL == data)
1139 		return g_error_new (gsf_input_error_id (), 0,
1140 				    _("Unable to read MS property stream header"));
1141 
1142 	d ({g_print ("===================================\n"
1143 		   "header class id ==\n");
1144 	   gsf_mem_dump (data, 28);});
1145 	/*
1146 	 * Validate the Property Set Header.
1147 	 * Format (bytes):
1148 	 *   00 - 01	Byte order		0xfffe
1149 	 *   02 - 03	Format			0
1150 	 *   04 - 05	OS Version		high word is the OS
1151 	 *   06 - 07				low  word is the OS version
1152 	 *					  0 = win16
1153 	 *					  1 = mac
1154 	 *					  2 = win32
1155 	 *   08 - 23	Class Identifier	Usually Format ID
1156 	 *   24 - 27	Section count		Should be at least 1
1157 	 */
1158 	os	     = GSF_LE_GET_GUINT16 (data + 6);
1159 	version	     = GSF_LE_GET_GUINT16 (data + 2);
1160 	num_sections = GSF_LE_GET_GUINT32 (data + 24);
1161 	if (GSF_LE_GET_GUINT16 (data + 0) != 0xfffe
1162 	    || (version != 0 && version != 1)
1163 	    || os > 2
1164 	    || num_sections > gsf_input_size(in) / 20
1165 	    || num_sections > 100) /* arbitrary sanity check */
1166 		return g_error_new (gsf_input_error_id (), 0,
1167 				    _("Invalid MS property stream header"));
1168 
1169 	/* extract the section info */
1170 	/*
1171 	 * The Format ID/Offset list follows.
1172 	 * Format:
1173 	 *   00 - 16	Section Name		Format ID
1174 	 *   16 - 19	Section Offset		The offset is the number of
1175 	 *					bytes from the start of the
1176 	 *					whole stream to where the
1177 	 *					section begins.
1178 	 */
1179 	sections = (GsfMSOleMetaDataSection *)g_alloca (sizeof (GsfMSOleMetaDataSection)* num_sections);
1180 	for (i = 0 ; i < num_sections ; i++) {
1181 		data = gsf_input_read (in, 20, NULL);
1182 		if (NULL == data)
1183 			return g_error_new (gsf_input_error_id (), 0,
1184 					    _("Unable to read MS property stream header"));
1185 		if (!memcmp (data, component_guid, sizeof (component_guid)))
1186 			sections [i].type = COMPONENT_PROP;
1187 		else if (!memcmp (data, document_guid, sizeof (document_guid)))
1188 			sections [i].type = DOC_PROP;
1189 		else if (!memcmp (data, user_guid, sizeof (user_guid)))
1190 			sections [i].type = USER_PROP;
1191 		else {
1192 			sections [i].type = USER_PROP;
1193 			g_warning ("Unknown property section type, treating it as USER");
1194 			gsf_mem_dump (data, 16);
1195 		}
1196 
1197 		sections [i].offset = GSF_LE_GET_GUINT32 (data + 16);
1198 	}
1199 
1200 	/*
1201 	 * A section is the third part of the property set stream.
1202 	 * Format (bytes):
1203 	 *   00 - 03	Section size	A byte count for the section (which is inclusive
1204 	 *				of the byte count itself and should always be a
1205 	 *				multiple of 4);
1206 	 *   04 - 07	Property count	A count of the number of properties
1207 	 *   08 - xx   			An array of 32-bit Property ID/Offset pairs
1208 	 *   yy - zz			An array of Property Type indicators/Value pairs
1209 	 */
1210 	for (i = 0 ; i < num_sections ; i++) {
1211 		if (gsf_input_seek (in, sections[i].offset, G_SEEK_SET) ||
1212 		    NULL == (data = gsf_input_read (in, 8, NULL)))
1213 			return g_error_new (gsf_input_error_id (), 0,
1214 					    _("Invalid MS property section"));
1215 
1216 		sections[i].iconv_handle = (GIConv)-1;
1217 		sections[i].char_size    = 1;
1218 		sections[i].dict      = NULL;
1219 		sections[i].size      = GSF_LE_GET_GUINT32 (data); /* includes header */
1220 		sections[i].num_props = GSF_LE_GET_GUINT32 (data + 4);
1221 
1222 		d (g_print ("=============================================\n"
1223 			   "===> section #%d : type %d at offset 0x%x, size 0x%x, numprops = %u\n",
1224 			   i, (int)sections [i].type,
1225 			   (guint32)sections [i].offset,
1226 			   sections[i].size,
1227 			   sections[i].num_props););
1228 
1229 		if (sections[i].num_props <= 0)
1230 			continue;
1231 		if (sections[i].num_props > gsf_input_remaining(in) / 8)
1232 			return g_error_new (gsf_input_error_id (), 0,
1233 					    _("Invalid MS property stream header or file truncated"));
1234 
1235 		if (sections[i].offset + sections[i].size > gsf_input_size(in))
1236 			return g_error_new (gsf_input_error_id (), 0,
1237 					    _("Invalid MS property stream header or file truncated"));
1238 
1239 		/*
1240 		 * Get and save all the Property ID/Offset pairs.
1241 		 * Format (bytes):
1242 		 *   00 - 03	id	Property ID
1243 		 *   04 - 07	offset	The distance from the start of the section to the
1244 		 *			start of the Property Type/Value pair.
1245 		 */
1246 		d (g_print ("Offsets\n"););
1247 		props = g_new (GsfMSOleMetaDataProp, sections[i].num_props);
1248 		for (j = 0; j < sections[i].num_props; j++) {
1249 			if (NULL == (data = gsf_input_read (in, 8, NULL))) {
1250 				g_free (props);
1251 				return g_error_new (gsf_input_error_id (), 0,
1252 						    _("Invalid MS property section"));
1253 			}
1254 
1255 			props[j].id = GSF_LE_GET_GUINT32 (data);
1256 			props[j].offset = GSF_LE_GET_GUINT32 (data + 4);
1257 			d (g_print ("%d) ID=%d, offset=0x%x\n", j,
1258 				    props [j].id, (unsigned)props [j].offset););
1259 		}
1260 
1261 		/* FIXME: Should we check that ids are distinct?  */
1262 
1263 		/* order prop info by offset to facilitate bounds checking */
1264 		qsort (props, sections[i].num_props,
1265 		       sizeof (GsfMSOleMetaDataProp),
1266 		       msole_prop_cmp);
1267 
1268 		/* Sanity checks.  */
1269 		for (j = 0; j < sections[i].num_props; j++) {
1270 			guint end = (j == sections[i].num_props - 1)
1271 				? sections[i].size
1272 				: props[j + 1].offset;
1273 			if (props[j].offset < 0 || props[j].offset + 4 > end) {
1274 				g_free (props);
1275 				return g_error_new (gsf_input_error_id (), 0,
1276 						    _("Invalid MS property section"));
1277 			}
1278 		}
1279 
1280 		/*
1281 		 * Find and process the code page.
1282 		 * Property ID 1 is reserved as an indicator of the code page.
1283 		 */
1284 		sections[i].iconv_handle = (GIConv)-1;
1285 		sections[i].char_size = 1;
1286 		for (j = 0; j < sections[i].num_props; j++) /* first codepage */
1287 			if (props[j].id == 1) {
1288 				msole_prop_read (in, sections+i, props, j, accum);
1289 				if (NULL != (prop = gsf_doc_meta_data_lookup (accum, GSF_META_NAME_CODEPAGE))) {
1290 					GValue const *val = gsf_doc_prop_get_val (prop);
1291 					if (NULL != val && G_VALUE_HOLDS_INT (val)) {
1292 						int codepage = g_value_get_int (val);
1293 						sections[i].iconv_handle =
1294 							gsf_msole_iconv_open_for_import (codepage);
1295 						sections[i].char_size = msole_codepage_char_size (codepage);
1296 					}
1297 				}
1298 			}
1299 
1300 		if (sections[i].iconv_handle == (GIConv)-1)
1301 			sections[i].iconv_handle = gsf_msole_iconv_open_for_import (1252);
1302 
1303 		/*
1304 		 * Find and process the Property Set Dictionary
1305 		 * Property ID 0 is reserved as an indicator of the dictionary.
1306 		 * For User Defined Sections, Property ID 0 is NOT a dictionary.
1307 		 */
1308 		for (j = 0; j < sections[i].num_props; j++) /* then dictionary */
1309 			if (props[j].id == 0)
1310 				msole_prop_read (in, sections+i, props, j, accum);
1311 
1312 		/* Process all the properties */
1313 		for (j = 0; j < sections[i].num_props; j++) /* the rest */
1314 			if (props[j].id > 1)
1315 				msole_prop_read (in, sections+i, props, j, accum);
1316 
1317 		gsf_iconv_close (sections[i].iconv_handle);
1318 		g_free (props);
1319 		if (sections[i].dict != NULL)
1320 			g_hash_table_destroy (sections[i].dict);
1321 	}
1322 	return NULL;
1323 }
1324 
1325 /**
1326  * gsf_msole_metadata_read: (skip)
1327  * @in: #GsfInput
1328  * @accum: #GsfDocMetaData
1329  *
1330  * Read a stream formated as a set of MS OLE properties from @in and store the
1331  * results in @accum.
1332  *
1333  * Deprecated: 1.14.24, use gsf_doc_meta_data_read_from_msole
1334  *
1335  * Returns: (transfer full): A #GError if there was an error.
1336  **/
1337 GError *
gsf_msole_metadata_read(GsfInput * in,GsfDocMetaData * accum)1338 gsf_msole_metadata_read	(GsfInput *in, GsfDocMetaData *accum)
1339 {
1340 	return gsf_doc_meta_data_read_from_msole (accum, in);
1341 }
1342 
1343 /****************************************************************************/
1344 
/* State shared by the helpers below while a property set is written out. */
typedef struct {
	/* stream the property set is written to */
	GsfOutput  *out;
	/* selects which builtin section is produced; see cb_count_props */
	gboolean    doc_not_component;

	/* custom property name -> id, created by cb_count_props on the first
	 * property without a builtin mapping; NULL when there is none */
	GHashTable *dict;
	/* per section: number of id/offset slots and the properties to write */
	struct {
		unsigned count;	 /* includes 2nd prop for links */
		GSList  *props;
	} builtin, user;

	/* codepage written to the stream, and the converter / terminator size
	 * derived from it for string output */
	unsigned codepage;
	GIConv iconv_handle;
	unsigned char_size;
} WritePropState;
1359 
1360 static GsfMSOleVariantType
gvalue_to_msole_vt(GValue const * value,GsfMSOleMetaDataPropMap const * map)1361 gvalue_to_msole_vt (GValue const *value, GsfMSOleMetaDataPropMap const *map)
1362 {
1363 	g_return_val_if_fail (value != NULL, VT_EMPTY);
1364 
1365 	switch (G_TYPE_FUNDAMENTAL (G_VALUE_TYPE (value))) {
1366 	case G_TYPE_BOOLEAN:	return VT_BOOL;
1367 	case G_TYPE_UCHAR:	return VT_UI1;
1368 	case G_TYPE_FLOAT:	return VT_R4;
1369 	case G_TYPE_DOUBLE:	return VT_R8;
1370 	case G_TYPE_STRING: 	return VT_LPSTR;
1371 	case G_TYPE_INT:
1372 		return (NULL != map && map->prefered_type == VT_I2)
1373 			? VT_I2 : VT_I4;
1374 	case G_TYPE_UINT:
1375 		return (NULL != map && map->prefered_type == VT_UI2)
1376 			? VT_UI2 : VT_UI4;
1377 	case G_TYPE_BOXED:
1378 		if (VAL_IS_GSF_TIMESTAMP (value))
1379 			return VT_FILETIME;
1380 		return VT_UNKNOWN;
1381 	case G_TYPE_OBJECT:
1382 		if (VAL_IS_GSF_DOCPROP_VECTOR (value)) {
1383 			GArray *vector = gsf_value_get_docprop_array (value);
1384 			unsigned i, n;
1385 			GsfMSOleVariantType type, tmp;
1386 
1387 			if (vector == NULL)
1388 				return VT_UNKNOWN;
1389 
1390 			if (map != NULL) {
1391 				type = map->prefered_type & (~VT_VECTOR);
1392 				if (type == VT_VARIANT)
1393 					return VT_VECTOR | VT_VARIANT;
1394 			} else
1395 				type = VT_UNKNOWN;
1396 			n = vector->len;
1397 			for (i = 0; i < n; i++) {
1398 				tmp = gvalue_to_msole_vt (
1399 					&g_array_index (vector, GValue, i), NULL);
1400 				if (type == VT_UNKNOWN)
1401 					type = tmp;
1402 				else if (type != tmp)
1403 					return VT_VECTOR | VT_VARIANT;
1404 			}
1405 			return VT_VECTOR | type;
1406 		}
1407 		break;
1408 	}
1409 	return VT_UNKNOWN;
1410 }
1411 
1412 static gboolean
msole_metadata_write_string(WritePropState * state,const char * txt)1413 msole_metadata_write_string (WritePropState *state, const char *txt)
1414 {
1415 	guint8 buf[4];
1416 	guint32 len;
1417 	gchar *ctxt;
1418 	gsize bytes_written;
1419 	gboolean res;
1420 
1421 	if (!txt) txt = "";
1422 	len = strlen (txt);
1423 	ctxt = g_convert_with_iconv (txt, len, state->iconv_handle,
1424 				     NULL, &bytes_written, NULL);
1425 	if (!ctxt) {
1426 		/* See bug #703952 */
1427 		g_warning ("Failed to write metadata string");
1428 		bytes_written = 0;
1429 	}
1430 
1431 	// *Bytes*, not characters, including the termination, but not the
1432 	// padding.
1433 	GSF_LE_SET_GUINT32 (buf, bytes_written + state->char_size);
1434 	res = gsf_output_write (state->out, 4, buf);
1435 
1436 	res = res && gsf_output_write (state->out, bytes_written, ctxt);
1437 
1438 	GSF_LE_SET_GUINT32 (buf, 0);
1439 	res = res && gsf_output_write (state->out, state->char_size, buf);
1440 
1441 	if (state->char_size > 1) {
1442 		unsigned padding = 4 - (bytes_written + state->char_size) % 4;
1443 		if (padding < 4)
1444 			res = res && gsf_output_write (state->out, padding, buf);
1445 	}
1446 
1447 	g_free (ctxt);
1448 	return res;
1449 }
1450 
1451 
1452 /* Returns: TRUE on success */
1453 static gboolean
msole_metadata_write_prop(WritePropState * state,char const * name,GValue const * value,gboolean suppress_type)1454 msole_metadata_write_prop (WritePropState *state,
1455 			   char const *name,
1456 			   GValue const *value,
1457 			   gboolean suppress_type)
1458 {
1459 	GsfMSOleMetaDataPropMap const *map =
1460 		(name != NULL) ? msole_gsf_name_to_prop (name) : NULL;
1461 	GsfMSOleVariantType type;
1462 	guint8 buf[8];
1463 
1464 	g_return_val_if_fail (value != NULL, FALSE);
1465 
1466 	type = gvalue_to_msole_vt (value, map);
1467 	if (!suppress_type) {
1468 		GSF_LE_SET_GUINT32 (buf, type);
1469 		gsf_output_write (state->out, 4, buf);
1470 	}
1471 	if (NULL != map && map->prefered_type != type) {
1472 		d(g_print ("Exporting property '%s' with type 0x%x rather than the usual 0x%x\n",
1473 			   map->gsf_name, type, map->prefered_type););
1474 	}
1475 
1476 	if (type & VT_VECTOR) {
1477 		GArray *vector = gsf_value_get_docprop_array (value);
1478 		unsigned i, n = vector->len;
1479 		gboolean res;
1480 
1481 		GSF_LE_SET_GINT32 (buf, n);
1482 		res = gsf_output_write (state->out, 4, buf);
1483 		for (i = 0; i < n; i++) {
1484 			gboolean suppress = type != (VT_VECTOR | VT_VARIANT);
1485 			res &= msole_metadata_write_prop (state, NULL,
1486 			        &g_array_index (vector, GValue, i),
1487 				suppress);
1488 		}
1489 		return res;
1490 	}
1491 
1492 	switch (type) {
1493 	case VT_BOOL:
1494 		if (g_value_get_boolean (value))
1495 			GSF_LE_SET_GINT32 (buf, 0xffffffff);
1496 		else
1497 			GSF_LE_SET_GINT32 (buf, 0);
1498 		return gsf_output_write (state->out, 4, buf);
1499 	case VT_UI1:
1500 		GSF_LE_SET_GUINT32 (buf, g_value_get_uchar (value));
1501 		return gsf_output_write (state->out, 4, buf);
1502 	case VT_I2:
1503 		GSF_LE_SET_GINT16 (buf, g_value_get_int (value));
1504 		GSF_LE_SET_GUINT16 (buf+2, 0);
1505 		return gsf_output_write (state->out, 4, buf);
1506 	case VT_I4:
1507 		GSF_LE_SET_GINT32 (buf, g_value_get_int (value));
1508 		return gsf_output_write (state->out, 4, buf);
1509 	case VT_UI2:
1510 	case VT_UI4:
1511 		GSF_LE_SET_GUINT32 (buf, g_value_get_uint (value));
1512 		return gsf_output_write (state->out, 4, buf);
1513 	case VT_R4:
1514 		GSF_LE_SET_FLOAT (buf, g_value_get_float (value));
1515 		return gsf_output_write (state->out, 4, buf);
1516 	case VT_R8:
1517 		GSF_LE_SET_DOUBLE (buf, g_value_get_double (value));
1518 		return gsf_output_write (state->out, 8, buf);
1519 
1520 	case VT_LPSTR:
1521 		return msole_metadata_write_string (state, g_value_get_string (value));
1522 
1523 	case VT_FILETIME : {
1524 		GsfTimestamp const *ts = g_value_get_boxed (value);
1525 		gint32  timet_signed = (gint32) ts->timet;
1526 		guint64 ft;
1527 
1528 		ft = timet_signed + G_GINT64_CONSTANT (11644473600);
1529 		ft *= 10000000;
1530 
1531 		GSF_LE_SET_GUINT64 (buf, ft);
1532 
1533 		return gsf_output_write (state->out, 8, buf);
1534 	}
1535 
1536 	default:
1537 		break;
1538 	}
1539 
1540 	g_warning ("Ignoring property '%s', how do we export a property of type '%s'",
1541 		name ? name : "<unnamed>",
1542 		g_type_name (G_TYPE_FUNDAMENTAL (G_VALUE_TYPE (value))));
1543 	return FALSE;
1544 }
1545 
1546 static void
cb_write_dict(char const * name,gpointer id,WritePropState * state)1547 cb_write_dict (char const *name, gpointer id, WritePropState *state)
1548 {
1549 	guint8 buf[4];
1550 
1551 	GSF_LE_SET_GUINT32 (buf, GPOINTER_TO_UINT (id));
1552 	gsf_output_write (state->out, 4, buf);
1553 	msole_metadata_write_string (state, name);
1554 }
1555 
1556 static gboolean
msole_metadata_write_section(WritePropState * state,gboolean user)1557 msole_metadata_write_section (WritePropState *state, gboolean user)
1558 {
1559 	char const *name;
1560 	guint8	  buf [8];
1561 	GSList   *ptr   = user ? state->user.props : state->builtin.props;
1562 	unsigned  count = user ? state->user.count : state->builtin.count;
1563 	gsf_off_t len, base  = gsf_output_tell (state->out);
1564 	GsfMSOleMetaDataProp *offsets;
1565 	GsfMSOleMetaDataPropMap const *map;
1566 	GsfDocProp const *prop;
1567 	gpointer tmp;
1568 	unsigned i;
1569 	GValue	 scratch;
1570 
1571 	if (user && state->dict == NULL)
1572 		return TRUE;
1573 
1574 	// Skip past the size+count and id/offset pairs
1575 	GSF_LE_SET_GUINT32 (buf, 0);
1576 	for (i = 0; i < 1 + 1 + 2 * count; i++)
1577 		gsf_output_write (state->out, 4, buf);
1578 
1579 	memset (&scratch,  0, sizeof (GValue));
1580 	g_value_init (&scratch, G_TYPE_STRING);
1581 
1582 	offsets = g_alloca (sizeof (GsfMSOleMetaDataProp) * count);
1583 
1584 	i = 0;
1585 
1586 	/* 0) codepage */
1587 	if (i < count) {
1588 		offsets[0].id = 1;
1589 		offsets[0].offset = gsf_output_tell (state->out);
1590 		GSF_LE_SET_GUINT32 (buf, VT_I2);
1591 		GSF_LE_SET_GUINT32 (buf+4, state->codepage);
1592 		gsf_output_write (state->out, 8, buf);
1593 		i++;
1594 	}
1595 
1596 	/* 1) dictionary */
1597 	if (user && i < count) {
1598 		offsets[1].id = 0;
1599 		offsets[1].offset = gsf_output_tell (state->out);
1600 		GSF_LE_SET_GUINT32 (buf, g_hash_table_size (state->dict));
1601 		gsf_output_write (state->out, 4, buf);
1602 		g_hash_table_foreach (state->dict,
1603 			(GHFunc) cb_write_dict, state);
1604 		i++;
1605 	}
1606 
1607 	/* 2) props */
1608 	for (; ptr != NULL && i < count ; ptr = ptr->next, i++) {
1609 		offsets[i].offset = gsf_output_tell (state->out);
1610 		prop = ptr->data;
1611 		name = gsf_doc_prop_get_name (prop);
1612 		if (user) {
1613 			tmp = g_hash_table_lookup (state->dict, name);
1614 			offsets[i].id = GPOINTER_TO_INT (tmp);
1615 			if (offsets[i].id < 2) {
1616 				g_warning ("Invalid ID (%d) for custom name '%s'", offsets[i].id, name);
1617 				continue;
1618 			}
1619 		} else {
1620 			map = msole_gsf_name_to_prop (name);
1621 			if (map == NULL) {
1622 				g_warning ("Missing map for built-in property '%s'", name);
1623 				continue;
1624 			}
1625 			offsets[i].id = map->id;
1626 		}
1627 
1628 		msole_metadata_write_prop (state, name,
1629 			gsf_doc_prop_get_val (prop), FALSE);
1630 		if (gsf_doc_prop_get_link (prop)) {
1631 			i++;
1632 			offsets[i].id     = offsets[i-1].id | 0x1000000;
1633 			offsets[i].offset = gsf_output_tell (state->out);
1634 			g_value_set_static_string (&scratch,
1635 				gsf_doc_prop_get_link (prop));
1636 			msole_metadata_write_prop (state, NULL, &scratch, FALSE);
1637 		}
1638 	}
1639 
1640 	while (i < count) {
1641 		static gboolean warned = FALSE;
1642 		if (!warned) {
1643 			warned = TRUE;
1644 			g_warning ("Something strange in msole_metadata_write_section");
1645 		}
1646 		offsets[i].id = 0;
1647 		offsets[i].offset = offsets[i - 1].offset;
1648 		i++;
1649 	}
1650 
1651 	len = gsf_output_tell (state->out) - base;
1652 	gsf_output_seek (state->out, base, G_SEEK_SET);
1653 	GSF_LE_SET_GUINT32 (buf, len);
1654 	GSF_LE_SET_GUINT32 (buf+4, count);
1655 	gsf_output_write (state->out, 8, buf);
1656 	for (i = 0 ; i < count ; i++) {
1657 		GSF_LE_SET_GUINT32 (buf, offsets[i].id);
1658 		GSF_LE_SET_GUINT32 (buf+4, offsets[i].offset - base);
1659 		gsf_output_write (state->out, 8, buf);
1660 	}
1661 
1662 	return gsf_output_seek (state->out, 0, G_SEEK_END);
1663 }
1664 
1665 static void
cb_count_props(char const * name,GsfDocProp * prop,WritePropState * state)1666 cb_count_props (char const *name, GsfDocProp *prop, WritePropState *state)
1667 {
1668 	GsfMSOleMetaDataPropMap const *map = msole_gsf_name_to_prop (name);
1669 
1670 	/* allocate predefined ids or add it to the dictionary */
1671 	if (map != NULL) {
1672 		if (map->id == 0) return; /* dictionary is handled elsewhere */
1673 		if (map->section == (state->doc_not_component ? COMPONENT_PROP : DOC_PROP))
1674 			return;
1675 		if (map->id == 1) { /*codepage */
1676 			GValue const *val = gsf_doc_prop_get_val (prop);
1677 			if (NULL != val && G_VALUE_HOLDS_INT (val))
1678 				state->codepage = g_value_get_int (val);
1679 			return;
1680 		}
1681 
1682 		d (g_print ("%d) Adding builtin %s'\n",
1683 			    state->builtin.count, map->gsf_name););
1684 		state->builtin.count += gsf_doc_prop_get_link (prop) ? 2 : 1;
1685 		state->builtin.props = g_slist_prepend (state->builtin.props, prop);
1686 	} else if (state->doc_not_component) { /* keep user props in the document */
1687 		d (g_print("user defined named '%s' assigned id = %d\n",
1688 			   name, state->user.count););
1689 		if (NULL == state->dict)
1690 			state->dict = g_hash_table_new (g_str_hash, g_str_equal);
1691 		g_hash_table_insert (state->dict,
1692 			(gpointer) name, GINT_TO_POINTER (state->user.count));
1693 		state->user.count += gsf_doc_prop_get_link (prop) ? 2 : 1;
1694 		state->user.props = g_slist_prepend (state->user.props, prop);
1695 	}
1696 }
1697 
1698 /**
1699  * gsf_doc_meta_data_write_to_msole:
1700  * @out: #GsfOutput
1701  * @meta_data: #GsfDocMetaData
1702  * @doc_not_component: a kludge to differentiate DocumentSummary from Summary
1703  *
1704  * Since: 1.14.24
1705  *
1706  * Returns: %TRUE on success;
1707  **/
1708 gboolean
gsf_doc_meta_data_write_to_msole(GsfDocMetaData const * meta_data,GsfOutput * out,gboolean doc_not_component)1709 gsf_doc_meta_data_write_to_msole (GsfDocMetaData const *meta_data,
1710                                   GsfOutput *out,
1711 				  gboolean doc_not_component)
1712 {
1713 	static guint8 const header[] = {
1714 		0xfe, 0xff,	/* byte order */
1715 		   0,    0,	/* Format */
1716 		0x04, 0x0a,	/* OS : XP == 0xA04 */
1717 		0x02, 0x00,	/* win32 == 2 */
1718 		0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0, /* clasid = 0 */
1719 	};
1720 
1721 	gboolean	success = FALSE;
1722 	guint8		buf [4];
1723 	WritePropState	state;
1724 
1725 	state.codepage		= 1252;
1726 	state.iconv_handle      = (GIConv)-1;
1727 	state.char_size         = 1;
1728 	state.out		= out;
1729 	state.dict		= NULL;
1730 	state.builtin.count     = 1; /* codepage */
1731 	state.user.count	= 2; /* codepage and dictionary */
1732 	state.builtin.props     = state.user.props = NULL;
1733 	state.doc_not_component = doc_not_component;
1734 	d (g_print ("================================\nFinding props\n"););
1735 	gsf_doc_meta_data_foreach (meta_data,
1736 		(GHFunc) cb_count_props, &state);
1737 	d (g_print ("Done\n"
1738 		    "================================\n"););
1739 
1740 	state.iconv_handle = gsf_msole_iconv_open_codepage_for_export (state.codepage);
1741 	state.char_size = msole_codepage_char_size (state.codepage);
1742 
1743 	/* Write stream header */
1744 	GSF_LE_SET_GUINT32 (buf, (state.dict != NULL) ? 2 : 1);
1745 	if (!gsf_output_write (out, sizeof (header), header) ||
1746 	    !gsf_output_write (out, 4, buf))
1747 		goto err;
1748 
1749 	/* Write section header(s) */
1750 	GSF_LE_SET_GUINT32 (buf, (state.dict != NULL) ? 0x44 : 0x30);
1751 	if (!gsf_output_write (out, 16,
1752 		doc_not_component ? document_guid : component_guid) ||
1753 	    !gsf_output_write (out, 4, buf))
1754 		goto err;
1755 	if (state.dict != NULL) {
1756 		GSF_LE_SET_GUINT32 (buf, 0);
1757 		if (!gsf_output_write (out, sizeof (user_guid), user_guid) ||
1758 		    !gsf_output_write (out, 4, buf)) /* bogus position, fix it later */
1759 			goto err;
1760 	}
1761 
1762 	/* Write section(s) */
1763 	if (!msole_metadata_write_section (&state, FALSE))
1764 		goto err;
1765 	if (state.dict != NULL) {
1766 		gsf_off_t base  = gsf_output_tell (state.out);
1767 		GSF_LE_SET_GUINT32 (buf, base);
1768 		if (!gsf_output_seek (state.out, 0x40, G_SEEK_SET) ||
1769 		    !gsf_output_write (out, 4, buf) ||
1770 		    !gsf_output_seek (state.out, 0, G_SEEK_END) ||
1771 		    !msole_metadata_write_section (&state, TRUE))
1772 			goto err;
1773 	}
1774 
1775 	success = TRUE;
1776 err:
1777 	gsf_iconv_close (state.iconv_handle);
1778 	g_slist_free (state.builtin.props);
1779 	g_slist_free (state.user.props);
1780 	if (state.dict != NULL)
1781 		g_hash_table_destroy (state.dict);
1782 	return success;
1783 }
1784 
1785 /**
1786  * gsf_msole_metadata_write: (skip)
1787  * @out: #GsfOutput
1788  * @meta_data: #GsfDocMetaData
1789  * @doc_not_component: a kludge to differentiate DocumentSummary from Summary
1790  *
1791  * Deprecated: 1.14.24, use gsf_doc_meta_data_write_to_msole
1792  *
1793  * Returns: %TRUE on success;
1794  **/
1795 gboolean
gsf_msole_metadata_write(GsfOutput * out,GsfDocMetaData const * meta_data,gboolean doc_not_component)1796 gsf_msole_metadata_write (GsfOutput *out,
1797 			  GsfDocMetaData const *meta_data,
1798 			  gboolean doc_not_component)
1799 {
1800 	return gsf_doc_meta_data_write_to_msole (meta_data, out, doc_not_component);
1801 }
1802 
1803 static struct {
1804 	char const *tag;
1805 	guint	lid;
1806 } const gsf_msole_language_ids[] = {
1807 	{ "-none-", 0x0000 }, /* none (language neutral) */
1808 	{ "-none-", 0x0400 }, /* none */
1809 	{ "af_ZA",  0x0436 }, /* Afrikaans */
1810 	{ "am",     0x045e }, /* Amharic */
1811 	{ "sq_AL",  0x041c }, /* Albanian */
1812 	{ "ar_SA",  0x0401 }, /* Arabic (Saudi) */
1813 	{ "ar_IQ",  0x0801 }, /* Arabic (Iraq) */
1814 	{ "ar_EG",  0x0c01 }, /* Arabic (Egypt) */
1815 	{ "ar_LY",  0x1001 }, /* Arabic (Libya) */
1816 	{ "ar_DZ",  0x1401 }, /* Arabic (Algeria) */
1817 	{ "ar_MA",  0x1801 }, /* Arabic (Morocco) */
1818 	{ "ar_TN",  0x1c01 }, /* Arabic (Tunisia) */
1819 	{ "ar_OM",  0x2001 }, /* Arabic (Oman) */
1820 	{ "ar_YE",  0x2401 }, /* Arabic (Yemen) */
1821 	{ "ar_SY",  0x2801 }, /* Arabic (Syria) */
1822 	{ "ar_JO",  0x2c01 }, /* Arabic (Jordan) */
1823 	{ "ar_LB",  0x3001 }, /* Arabic (Lebanon) */
1824 	{ "ar_KW",  0x3401 }, /* Arabic (Kuwait) */
1825 	{ "ar_AE",  0x3801 }, /* Arabic (United Arab Emirates) */
1826 	{ "ar_BH",  0x3c01 }, /* Arabic (Bahrain) */
1827 	{ "ar_QA",  0x4001 }, /* Arabic (Qatar) */
1828 	{ "as",     0x044d }, /* Assamese */
1829 	{ "az",     0x042c }, /* Azerbaijani */
1830 	{ "hy_AM",  0x042b }, /* Armenian */
1831 	{ "az",     0x044c }, /* Azeri (Latin) az_ */
1832 	{ "az",     0x082c }, /* Azeri (Cyrillic) az_ */
1833 	{ "eu_ES",  0x042d }, /* Basque */
1834 	{ "be_BY",  0x0423 }, /* Belarussian */
1835 	{ "bn",     0x0445 }, /* Bengali bn_ */
1836 	{ "bg_BG",  0x0402 }, /* Bulgarian */
1837 	{ "ca_ES",  0x0403 }, /* Catalan */
1838 	{ "zh_TW",  0x0404 }, /* Chinese (Taiwan) */
1839 	{ "zh_CN",  0x0804 }, /* Chinese (PRC) */
1840 	{ "zh_HK",  0x0c04 }, /* Chinese (Hong Kong) */
1841 	{ "zh_SG",  0x1004 }, /* Chinese (Singapore) */
1842 	{ "ch_MO",  0x1404 }, /* Chinese (Macau SAR) */
1843 	{ "hr_HR",  0x041a }, /* Croatian */
1844 	{ "cs_CZ",  0x0405 }, /* Czech */
1845 	{ "da_DK",  0x0406 }, /* Danish */
1846 	{ "div",    0x465 }, /* Divehi div_*/
1847 	{ "nl_NL",  0x0413 }, /* Dutch (Netherlands) */
1848 	{ "nl_BE",  0x0813 }, /* Dutch (Belgium) */
1849 	{ "en_US",  0x0409 }, /* English (USA) */
1850 	{ "en_GB",  0x0809 }, /* English (UK) */
1851 	{ "en_AU",  0x0c09 }, /* English (Australia) */
1852 	{ "en_CA",  0x1009 }, /* English (Canada) */
1853 	{ "en_NZ",  0x1409 }, /* English (New Zealand) */
1854 	{ "en_IE",  0x1809 }, /* English (Ireland) */
1855 	{ "en_ZA",  0x1c09 }, /* English (South Africa) */
1856 	{ "en_JM",  0x2009 }, /* English (Jamaica) */
1857 	{ "en",     0x2409 }, /* English (Caribbean) */
1858 	{ "en_BZ",  0x2809 }, /* English (Belize) */
1859 	{ "en_TT",  0x2c09 }, /* English (Trinidad) */
1860 	{ "en_ZW",  0x3009 }, /* English (Zimbabwe) */
1861 	{ "en_PH",  0x3409 }, /* English (Phillipines) */
1862 	{ "et_EE",  0x0425 }, /* Estonian */
1863 	{ "fo",     0x0438 }, /* Faeroese fo_ */
1864 	{ "fa_IR",  0x0429 }, /* Farsi */
1865 	{ "fi_FI",  0x040b }, /* Finnish */
1866 	{ "fr_FR",  0x040c }, /* French (France) */
1867 	{ "fr_BE",  0x080c }, /* French (Belgium) */
1868 	{ "fr_CA",  0x0c0c }, /* French (Canada) */
1869 	{ "fr_CH",  0x100c }, /* French (Switzerland) */
1870 	{ "fr_LU",  0x140c }, /* French (Luxembourg) */
1871 	{ "fr_MC",  0x180c }, /* French (Monaco) */
1872 	{ "gl",     0x0456 }, /* Galician gl_ */
1873 	{ "ga_IE",  0x083c }, /* Irish Gaelic */
1874 	{ "gd_GB",  0x100c }, /* Scottish Gaelic */
1875 	{ "ka_GE",  0x0437 }, /* Georgian */
1876 	{ "de_DE",  0x0407 }, /* German (Germany) */
1877 	{ "de_CH",  0x0807 }, /* German (Switzerland) */
1878 	{ "de_AT",  0x0c07 }, /* German (Austria) */
1879 	{ "de_LU",  0x1007 }, /* German (Luxembourg) */
1880 	{ "de_LI",  0x1407 }, /* German (Liechtenstein) */
1881 	{ "el_GR",  0x0408 }, /* Greek */
1882 	{ "gu",     0x0447 }, /* Gujarati gu_ */
1883 	{ "ha",     0x0468 }, /* Hausa */
1884 	{ "he_IL",  0x040d }, /* Hebrew */
1885 	{ "hi_IN",  0x0439 }, /* Hindi */
1886 	{ "hu_HU",  0x040e }, /* Hungarian */
1887 	{ "is_IS",  0x040f }, /* Icelandic */
1888 	{ "id_ID",  0x0421 }, /* Indonesian */
1889 	{ "iu",     0x045d }, /* Inkutitut */
1890 	{ "it_IT",  0x0410 }, /* Italian (Italy) */
1891 	{ "it_CH",  0x0810 }, /* Italian (Switzerland) */
1892 	{ "ja_JP",  0x0411}, /* Japanese */
1893 	{ "kn",     0x044b }, /* Kannada kn_ */
1894 	{ "ks",     0x0860 }, /* Kashmiri (India) ks_ */
1895 	{ "kk",     0x043f }, /* Kazakh kk_ */
1896 	{ "kok",    0x0457 }, /* Konkani kok_ */
1897 	{ "ko_KR",  0x0412 }, /* Korean */
1898 	{ "ko",     0x0812 }, /* Korean (Johab) ko_ */
1899 	{ "kir",    0x0440 }, /* Kyrgyz */
1900 	{ "la",     0x0476 }, /* Latin */
1901 	{ "lo",     0x0454 }, /* Laothian */
1902 	{ "lv_LV",  0x0426 }, /* Latvian */
1903 	{ "lt_LT",  0x0427 }, /* Lithuanian */
1904 	{ "lt_LT",  0x0827 }, /* Lithuanian (Classic) */
1905 	{ "mk",     0x042f }, /* FYRO Macedonian */
1906 	{ "my_MY",  0x043e }, /* Malaysian */
1907 	{ "my_BN",  0x083e }, /* Malay Brunei Darussalam */
1908 	{ "ml",     0x044c }, /* Malayalam ml_ */
1909 	{ "mr",     0x044e }, /* Marathi mr_ */
1910 	{ "mt",     0x043a }, /* Maltese */
1911 	{ "mo",     0x0450 }, /* Mongolian */
1912 	{ "ne_NP",  0x0461 }, /* Napali (Nepal) */
1913 	{ "ne_IN",  0x0861 }, /* Nepali (India) */
1914 	{ "nb_NO",  0x0414 }, /* Norwegian (Bokmaal) */
1915 	{ "nn_NO",  0x0814 }, /* Norwegian (Nynorsk) */
1916 	{ "or",     0x0448 }, /* Oriya or_ */
1917 	{ "om",     0x0472 }, /* Oromo (Afan, Galla) */
1918 	{ "pl_PL",  0x0415 }, /* Polish */
1919 	{ "pt_BR",  0x0416 }, /* Portuguese (Brazil) */
1920 	{ "pt_PT",  0x0816 }, /* Portuguese (Portugal) */
1921 	{ "pa",     0x0446 }, /* Punjabi pa_ */
1922 	{ "ps",     0x0463 }, /* Pashto (Pushto) */
1923 	{ "rm",     0x0417 }, /* Rhaeto_Romanic rm_ */
1924 	{ "ro_RO",  0x0418 }, /* Romanian */
1925 	{ "ro_MD",  0x0818 }, /* Romanian (Moldova) */
1926 	{ "ru_RU",  0x0419 }, /* Russian */
1927 	{ "ru_MD",  0x0819 }, /* Russian (Moldova) */
1928 	{ "se",     0x043b }, /* Sami (Lappish) se_ */
1929 	{ "sa",     0x044f }, /* Sanskrit sa_ */
1930 	{ "sr",     0x0c1a }, /* Serbian (Cyrillic) sr_ */
1931 	{ "sr",     0x081a }, /* Serbian (Latin) sr_ */
1932 	{ "sd",     0x0459 }, /* Sindhi sd_ */
1933 	{ "sk_SK",  0x041b }, /* Slovak */
1934 	{ "sl_SI",  0x0424 }, /* Slovenian */
1935 	{ "wen",    0x042e }, /* Sorbian wen_ */
1936 	{ "so",     0x0477 }, /* Somali */
1937 	{ "es_ES",  0x040a }, /* Spanish (Spain, Traditional) */
1938 	{ "es_MX",  0x080a }, /* Spanish (Mexico) */
1939 	{ "es_ES",  0x0c0a }, /* Spanish (Modern) */
1940 	{ "es_GT",  0x100a }, /* Spanish (Guatemala) */
1941 	{ "es_CR",  0x140a }, /* Spanish (Costa Rica) */
1942 	{ "es_PA",  0x180a }, /* Spanish (Panama) */
1943 	{ "es_DO",  0x1c0a }, /* Spanish (Dominican Republic) */
1944 	{ "es_VE",  0x200a }, /* Spanish (Venezuela) */
1945 	{ "es_CO",  0x240a }, /* Spanish (Colombia) */
1946 	{ "es_PE",  0x280a }, /* Spanish (Peru) */
1947 	{ "es_AR",  0x2c0a }, /* Spanish (Argentina) */
1948 	{ "es_EC",  0x300a }, /* Spanish (Ecuador) */
1949 	{ "es_CL",  0x340a }, /* Spanish (Chile) */
1950 	{ "es_UY",  0x380a }, /* Spanish (Uruguay) */
1951 	{ "es_PY",  0x3c0a }, /* Spanish (Paraguay) */
1952 	{ "es_BO",  0x400a }, /* Spanish (Bolivia) */
1953 	{ "es_SV",  0x440a }, /* Spanish (El Salvador) */
1954 	{ "es_HN",  0x480a }, /* Spanish (Honduras) */
1955 	{ "es_NI",  0x4c0a }, /* Spanish (Nicaragua) */
1956 	{ "es_PR",  0x500a }, /* Spanish (Puerto Rico) */
1957 	{ "sx",     0x0430 }, /* Sutu */
1958 	{ "sw",     0x0441 }, /* Swahili (Kiswahili/Kenya) */
1959 	{ "sv_SE",  0x041d }, /* Swedish */
1960 	{ "sv_FI",  0x081d }, /* Swedish (Finland) */
1961 	{ "ta",     0x0449 }, /* Tamil ta_ */
1962 	{ "tt",     0x0444 }, /* Tatar (Tatarstan) tt_ */
1963 	{ "te",     0x044a }, /* Telugu te_ */
1964 	{ "th_TH",  0x041e }, /* Thai */
1965 	{ "ts",     0x0431 }, /* Tsonga ts_ */
1966 	{ "tn",     0x0432 }, /* Tswana tn_ */
1967 	{ "tr_TR",  0x041f }, /* Turkish */
1968 	{ "tl",     0x0464 }, /* Tagalog */
1969 	{ "tg",     0x0428 }, /* Tajik */
1970 	{ "bo",     0x0451 }, /* Tibetan */
1971 	{ "ti",     0x0473 }, /* Tigrinya */
1972 	{ "uk_UA",  0x0422 }, /* Ukrainian */
1973 	{ "ur_PK",  0x0420 }, /* Urdu (Pakistan) */
1974 	{ "ur_IN",  0x0820 }, /* Urdu (India) */
1975 	{ "uz",     0x0443 }, /* Uzbek (Latin) uz_ */
1976 	{ "uz",     0x0843 }, /* Uzbek (Cyrillic) uz_ */
1977 	{ "ven",    0x0433 }, /* Venda ven_ */
1978 	{ "vi_VN",  0x042a }, /* Vietnamese */
1979 	{ "cy_GB",  0x0452 }, /* Welsh */
1980 	{ "xh",     0x0434 }, /* Xhosa xh */
1981 	{ "yi",     0x043d }, /* Yiddish yi_ */
1982 	{ "yo",     0x046a }, /* Yoruba */
1983 	{ "zu",     0x0435 }, /* Zulu zu_ */
1984 	{ "en_US",  0x0800 } /* Default */
1985 };
1986 
1987 /**
1988  * gsf_msole_lid_for_language:
1989  * @lang: (allow-none): Language id, i.e., locale name.
1990  *
1991  * Returns: the LID (Language Identifier) for the input language.
1992  * 	If lang is %NULL, return 0x0400 ("-none-"), and not 0x0000 ("no proofing")
1993  **/
1994 guint
gsf_msole_lid_for_language(char const * lang)1995 gsf_msole_lid_for_language (char const *lang)
1996 {
1997 	guint i = 0 ;
1998 	size_t len;
1999 
2000 	if (lang == NULL)
2001 		return 0x0400;   /* return -none- */
2002 
2003 	/* Allow lang to match as a prefix (eg fr == fr_FR@euro) */
2004 	len = strlen (lang);
2005 	for (i = 0 ; i < G_N_ELEMENTS(gsf_msole_language_ids); i++)
2006 		if (!strncmp (lang, gsf_msole_language_ids[i].tag, len))
2007 			return gsf_msole_language_ids[i].lid;
2008 
2009 	return 0x0400 ;   /* return -none- */
2010 }
2011 
2012 /**
2013  * gsf_msole_language_for_lid:
2014  * @lid: numerical language id
2015  *
2016  * Returns: (transfer none): the xx_YY style string (can be just xx or
2017  * xxx) for the given LID.  If the LID is not found, is set to 0x0400,
2018  * or is set to 0x0000, will return "-none-"
2019  **/
2020 char const *
gsf_msole_language_for_lid(guint lid)2021 gsf_msole_language_for_lid (guint lid)
2022 {
2023 	guint i = 0 ;
2024 
2025 	for (i = 0 ; i < G_N_ELEMENTS(gsf_msole_language_ids); i++)
2026 		if (gsf_msole_language_ids[i].lid == lid)
2027 			return gsf_msole_language_ids[i].tag;
2028 
2029 	return "-none-"; /* default */
2030 }
2031 
/**
 * gsf_msole_codepage_to_lid:
 * @codepage: character code page.
 *
 * Convert the codepage into an applicable LID.  The values recognised
 * are the *_CHARSET identifiers named in the table below; several of them
 * cover more than one language, in which case one representative LID is
 * returned.
 *
 * Returns: the LID chosen for @codepage, or 0x0 when @codepage is not one
 * 	of the recognised values.
 **/
guint
gsf_msole_codepage_to_lid (int codepage)
{
	static const struct {
		int	charset;
		guint	lid;
	} charset_lids[] = {
		{  77, 0xFFF },	/* MAC_CHARSET - This number is a hack */
		{ 128, 0x411 },	/* SHIFTJIS_CHARSET - Japanese */
		{ 129, 0x412 },	/* HANGEUL_CHARSET - Korean */
		{ 130, 0x812 },	/* JOHAB_CHARSET - Korean (Johab) */
		{ 134, 0x804 },	/* GB2312_CHARSET - China PRC - And others!! */
		{ 136, 0x404 },	/* CHINESEBIG5_CHARSET - Taiwan - And others!! */
		{ 161, 0x408 },	/* GREEK_CHARSET - Greek */
		{ 162, 0x41f },	/* TURKISH_CHARSET - Turkish */
		{ 163, 0x42a },	/* VIETNAMESE_CHARSET - Vietnamese */
		{ 177, 0x40d },	/* HEBREW_CHARSET - Hebrew */
		{ 178, 0x01  },	/* ARABIC_CHARSET - Arabic */
		{ 186, 0x425 },	/* BALTIC_CHARSET - Estonian - And others!! */
		{ 204, 0x419 },	/* RUSSIAN_CHARSET - Russian - And others!! */
		{ 222, 0x41e },	/* THAI_CHARSET - Thai */
		{ 238, 0x405 }	/* EASTEUROPE_CHARSET - Czech - And many others!! */
	};
	unsigned ix;

	for (ix = 0; ix < sizeof charset_lids / sizeof charset_lids[0]; ix++)
		if (charset_lids[ix].charset == codepage)
			return charset_lids[ix].lid;

	/* default */
	return 0x0;
}
2077 
/**
 * gsf_msole_lid_to_codepage:
 * @lid: numerical language id
 *
 * The primary language (the low byte of @lid) selects the code page.  For
 * Chinese, Korean, Serbian/Croatian, Azeri and Uzbek the complete @lid is
 * examined as well because the sub-languages use different code pages.
 *
 * Returns: our best guess at the codepage for the given language id.
 * 	0 is returned for languages that are Unicode only, 0x0FFF for the
 * 	Macintosh pseudo LID 0x0FFF, and 1252 whenever nothing more
 * 	specific is known.
 **/
int
gsf_msole_lid_to_codepage (guint lid)
{
	if (lid == 0x0FFF) /* Macintosh Hack */
		return 0x0FFF;

	switch (lid & 0xff) {
	case 0x01:		/* Arabic */
		return 1256;
	case 0x02:		/* Bulgarian */
		return 1251;
	case 0x03:		/* Catalan */
		return 1252;
	case 0x04:		/* Chinese */
		switch (lid) {
		case 0x0404:		/* Chinese (Taiwan) */
		case 0x1404:		/* Chinese (Macau SAR) */
		case 0x0c04:		/* Chinese (Hong Kong SAR, PRC) */
			return 950;	/* Traditional Chinese (Big5) */

		case 0x0804:		/* Chinese (PRC) */
		case 0x1004:		/* Chinese (Singapore) is Simplified Chinese */
			return 936;
		default:
			break;
		}
		break;
	case 0x05:		/* Czech */
		return 1250;
	case 0x06:		/* Danish */
		return 1252;
	case 0x07:		/* German */
		return 1252;
	case 0x08:		/* Greek */
		return 1253;
	case 0x09:		/* English */
		return 1252;
	case 0x0a:		/* Spanish */
		return 1252;
	case 0x0b:		/* Finnish */
		return 1252;
	case 0x0c:		/* French */
		return 1252;
	case 0x0d:		/* Hebrew */
		return 1255;
	case 0x0e:		/* Hungarian */
		return 1250;
	case 0x0f:		/* Icelandic */
		return 1252;
	case 0x10:		/* Italian */
		return 1252;
	case 0x11:		/* Japanese */
		return 932;
	case 0x12:		/* Korean */
		switch (lid) {
		case 0x0812:		/* Korean (Johab) */
			return 1361;
		case 0x0412:		/* Korean */
			return 949;
		default:
			break;
		}
		break;
	case 0x13:		/* Dutch */
		return 1252;
	case 0x14:		/* Norwegian */
		return 1252;
	case 0x15:		/* Polish */
		return 1250;
	case 0x16:		/* Portuguese */
		return 1252;
	case 0x17:		/* Rhaeto-Romanic */
		return 1252;
	case 0x18:		/* Romanian */
		return 1250;
	case 0x19:		/* Russian */
		return 1251;
	case 0x1a:		/* Serbian, Croatian, (Bosnian?) */
		switch (lid) {
		case 0x041a:		/* Croatian */
		case 0x081a:		/* Serbian (Latin) */
			/* Central European Latin script: needs 1250, not 1252 */
			return 1250;
		case 0x0c1a:		/* Serbian (Cyrillic) */
			return 1251;
		default:
			break;
		}
		break;
	case 0x1b:		/* Slovak */
		return 1250;
	case 0x1c:		/* Albanian (Latin script, not Cyrillic) */
		return 1250;
	case 0x1d:		/* Swedish */
		return 1252;
	case 0x1e:		/* Thai */
		return 874;
	case 0x1f:		/* Turkish */
		return 1254;
	case 0x20:		/* Urdu. This is Unicode only. */
		return 0;
	case 0x21:		/* Bahasa Indonesian */
		return 1252;
	case 0x22:		/* Ukrainian */
		return 1251;
	case 0x23:		/* Byelorussian / Belarusian */
		return 1251;
	case 0x24:		/* Slovenian */
		return 1250;
	case 0x25:		/* Estonian */
		return 1257;
	case 0x26:		/* Latvian */
		return 1257;
	case 0x27:		/* Lithuanian */
		return 1257;
	case 0x29:		/* Farsi / Persian. This is Unicode only. */
		return 0;
	case 0x2a:		/* Vietnamese */
		return 1258;
	case 0x2b:		/* Windows 2000: Armenian. This is Unicode only. */
		return 0;
	case 0x2c:		/* Azeri */
		switch (lid) {
		case 0x082c:		/* Azeri (Cyrillic) */
			return 1251;
		default:
			break;
		}
		break;
	case 0x2d:		/* Basque */
		return 1252;
	case 0x2f:		/* Macedonian */
		return 1251;
	case 0x36:		/* Afrikaans */
		return 1252;
	case 0x37:		/* Windows 2000: Georgian. This is Unicode only. */
		return 0;
	case 0x38:		/* Faeroese */
		return 1252;
	case 0x39:		/* Windows 2000: Hindi. This is Unicode only. */
		return 0;
	case 0x3E:		/* Malaysian / Malay */
		return 1252;
	case 0x41:		/* Swahili */
		return 1252;
	case 0x43:		/* Uzbek */
		switch (lid) {
		case 0x0843:		/* Uzbek (Cyrillic) */
			return 1251;
		default:
			break;
		}
		break;
	case 0x45:		/* Windows 2000: Bengali. This is Unicode only. */
	case 0x46:		/* Windows 2000: Punjabi. This is Unicode only. */
	case 0x47:		/* Windows 2000: Gujarati. This is Unicode only. */
	case 0x48:		/* Windows 2000: Oriya. This is Unicode only. */
	case 0x49:		/* Windows 2000: Tamil. This is Unicode only. */
	case 0x4a:		/* Windows 2000: Telugu. This is Unicode only. */
	case 0x4b:		/* Windows 2000: Kannada. This is Unicode only. */
	case 0x4c:		/* Windows 2000: Malayalam. This is Unicode only. */
	case 0x4d:		/* Windows 2000: Assamese. This is Unicode only. */
	case 0x4e:		/* Windows 2000: Marathi. This is Unicode only. */
	case 0x4f:		/* Windows 2000: Sanskrit. This is Unicode only. */
	case 0x55:		/* Myanmar / Burmese. This is Unicode only. */
	case 0x57:		/* Windows 2000: Konkani. This is Unicode only. */
	case 0x61:		/* Windows 2000: Nepali (India). This is Unicode only. */
		return 0;

	/*
	 * Untested, unproven guesses that are deliberately NOT mapped here
	 * (they all end up at the 1252 default below):
	 *   Azeri (Latin) 0x042c, Uzbek (Latin) 0x0443, Sutu 0x30:
	 *       believed to be CP1252
	 *   Kazakh 0x3f: probably CP1251
	 *   Tatar 0x44, Manipuri 0x58, Sindhi 0x59, Kashmiri (India) 0x60:
	 *       believed to be Unicode only
	 */
	default:
		break;
	}

	/* This is just a guess, but it will be a frequent guess */
	return 1252;
}
2278 
2279 /**
2280  * gsf_msole_lid_to_codepage_str:
2281  * @lid: numerical language id
2282  *
2283  * Returns: (transfer full): the Iconv codepage string for the given
2284  * LID.
2285  **/
2286 gchar *
gsf_msole_lid_to_codepage_str(guint lid)2287 gsf_msole_lid_to_codepage_str (guint lid)
2288 {
2289 	guint cp = 0;
2290 
2291 	if (lid == 0x0FFF)	/* Macintosh Hack */
2292 		return g_strdup ("MACINTOSH");
2293 
2294 	cp = gsf_msole_lid_to_codepage (lid);
2295 	return g_strdup_printf ("CP%d", cp);
2296 }
2297 
2298 /**
2299  * gsf_msole_iconv_win_codepage:
2300  *
2301  * Returns: our best guess at the applicable windows code page based on an
2302  * 	environment variable or the current locale.
2303  **/
2304 int
gsf_msole_iconv_win_codepage(void)2305 gsf_msole_iconv_win_codepage (void)
2306 {
2307 	const char *win_lang;
2308 	char *lang = NULL;
2309 
2310 	win_lang = g_getenv("WINDOWS_LANGUAGE");
2311 	if (win_lang) {
2312 		lang = g_strdup (win_lang);
2313 	} else {
2314 		char const *locale = setlocale (LC_CTYPE, NULL);
2315 		if (locale != NULL) {
2316 			char const *lang_sep = strchr (locale, '.');
2317 			if (lang_sep)
2318 				lang = g_strndup (locale, lang_sep - locale);
2319 			else
2320 				lang = g_strdup (locale);
2321 		}
2322 	}
2323 
2324 	if (lang != NULL) {
2325 		guint lid = gsf_msole_lid_for_language (lang);
2326 		g_free (lang);
2327 		return gsf_msole_lid_to_codepage (lid);
2328 	}
2329 	return 1252; /* default ansi */
2330 }
2331 
2332 static GSList *
gsf_msole_iconv_get_codepage_string_list(int codepage)2333 gsf_msole_iconv_get_codepage_string_list (int codepage)
2334 {
2335 	GSList *cp_list = NULL;
2336 
2337 	switch (codepage) {
2338 		case 1200:
2339 			cp_list = g_slist_prepend (cp_list, g_strdup ("UTF-16LE"));
2340 			break;
2341 		case 1201:
2342 			cp_list = g_slist_prepend (cp_list, g_strdup ("UTF-16BE"));
2343 			break;
2344 		case 0x8000:
2345 		case 10000:
2346 			cp_list = g_slist_prepend (cp_list, g_strdup ("MACROMAN"));
2347 			cp_list = g_slist_prepend (cp_list, g_strdup ("MACINTOSH"));
2348 			break;
2349 		case -535:
2350 		case 65001:
2351 			cp_list = g_slist_prepend (cp_list, g_strdup ("UTF-8"));
2352 			break;
2353 		case 0x8001:
2354 			/* according to OOo docs 8001 is a synonym CP1252 */
2355 			codepage = 1252;
2356 			/* fallthrough */
2357 
2358 		default:
2359 			cp_list = g_slist_prepend (cp_list, g_strdup_printf ("CP%u", codepage));
2360 	}
2361 
2362 	return cp_list;
2363 }
2364 
2365 /**
2366  * gsf_msole_iconv_open_codepage_for_import: (skip)
2367  * @to: the target encoding.
2368  * @codepage: the source code page.
2369  *
2370  * NOTE: skipped since GIConv is not exported to introspection.
2371  *
2372  * Returns: an iconv converter for @codepage -> utf8.
2373  **/
2374 GIConv
gsf_msole_iconv_open_codepage_for_import(char const * to,int codepage)2375 gsf_msole_iconv_open_codepage_for_import (char const *to, int codepage)
2376 {
2377 	GIConv iconv_handle = (GIConv)(-1);
2378 	gchar *codepage_str;
2379 	GSList *codepage_list, *cp;
2380 	g_return_val_if_fail (to != NULL, (GIConv)(-1));
2381 
2382 	cp = codepage_list = gsf_msole_iconv_get_codepage_string_list (codepage);
2383 	while (cp) {
2384 		codepage_str = cp->data;
2385 		if (iconv_handle == (GIConv)(-1))
2386 			iconv_handle = g_iconv_open (to, codepage_str);
2387 		g_free (codepage_str);
2388 		cp = cp->next;
2389 	}
2390 	g_slist_free (codepage_list);
2391 
2392 	if (iconv_handle == (GIConv)(-1))
2393 		g_warning ("Unable to open an iconv handle from codepage %d -> %s",
2394 			   codepage, to);
2395 	return iconv_handle;
2396 }
2397 
2398 /**
2399  * gsf_msole_iconv_open_for_import: (skip)
2400  * @codepage: the source code page.
2401  *
2402  * NOTE: skipped since GIConv is not exported to introspection.
2403  *
2404  * Returns: an iconv converter for single byte encodings @codepage -> utf8.
2405  * 	Attempt to handle the semantics of a specification for multibyte encodings
2406  * 	since this is only supposed to be used for single bytes.
2407  **/
2408 GIConv
gsf_msole_iconv_open_for_import(int codepage)2409 gsf_msole_iconv_open_for_import (int codepage)
2410 {
2411 	return gsf_msole_iconv_open_codepage_for_import ("UTF-8", codepage);
2412 }
2413 
2414 /**
2415  * gsf_msole_iconv_open_codepages_for_export: (skip)
2416  * @codepage_to: the target code page.
2417  * @from: the source encoding.
2418  *
2419  * NOTE: skipped since GIConv is not exported to introspection.
2420  *
2421  * Returns: an iconv converter to go from utf8 -> to our best guess at a useful
2422  * 	windows codepage.
2423  **/
2424 GIConv
gsf_msole_iconv_open_codepages_for_export(int codepage_to,char const * from)2425 gsf_msole_iconv_open_codepages_for_export (int codepage_to, char const *from)
2426 {
2427 	GIConv iconv_handle = (GIConv)(-1);
2428 	gchar *codepage_str;
2429 	GSList *codepage_list, *cp;
2430 	g_return_val_if_fail (from != NULL, (GIConv)(-1));
2431 
2432 	cp = codepage_list = gsf_msole_iconv_get_codepage_string_list (codepage_to);
2433 	while (cp) {
2434 		codepage_str = cp->data;
2435 		if (iconv_handle == (GIConv)(-1))
2436 			iconv_handle = g_iconv_open (codepage_str, from);
2437 		g_free (codepage_str);
2438 		cp = cp->next;
2439 	}
2440 	g_slist_free (codepage_list);
2441 
2442 	if (iconv_handle == (GIConv)(-1))
2443 		g_warning ("Unable to open an iconv handle from %s -> codepage %u",
2444 			   from, codepage_to);
2445 	return iconv_handle;
2446 }
2447 
2448 /**
2449  * gsf_msole_iconv_open_codepage_for_export: (skip)
2450  * @codepage_to: the target code page.
2451  *
2452  * NOTE: skipped since GIConv is not exported to introspection.
2453  *
2454  * Returns: an iconv converter to go from utf8 -> to our best guess at a useful
2455  * 	windows codepage.
2456  **/
2457 GIConv
gsf_msole_iconv_open_codepage_for_export(int codepage_to)2458 gsf_msole_iconv_open_codepage_for_export (int codepage_to)
2459 {
2460 	return gsf_msole_iconv_open_codepages_for_export (codepage_to, "UTF-8");
2461 }
2462 
2463 /**
2464  * gsf_msole_iconv_open_for_export: (skip)
2465  *
2466  * NOTE: skipped since GIConv is not exported to introspection.
2467  *
2468  * Returns: an iconv convert to go from utf8 -> to our best guess at a useful
2469  * 	windows codepage.
2470  **/
2471 GIConv
gsf_msole_iconv_open_for_export(void)2472 gsf_msole_iconv_open_for_export (void)
2473 {
2474 	return gsf_msole_iconv_open_codepage_for_export (gsf_msole_iconv_win_codepage ());
2475 }
2476 
/* Size in bytes of the circular decompression buffer used by gsf_msole_inflate */
#define VBA_COMPRESSION_WINDOW 4096

/**
 * gsf_msole_inflate:
 * @input: stream to read from
 * @offset: offset into it for start byte of compressed stream
 *
 * Decompresses an LZ compressed stream.  Reading continues until no
 * further flag byte can be read from @input.
 *
 * Return value: (transfer full): A GByteArray that the caller is responsible
 * for freeing, or %NULL if seeking to @offset fails.
 **/
GByteArray *
gsf_msole_inflate (GsfInput *input, gsf_off_t offset)
{
	GByteArray *res;
	unsigned	i, win_pos, pos = 0;	/* pos counts every byte produced so far */
	unsigned	mask, shift, distance;
	guint8		flag, buffer [VBA_COMPRESSION_WINDOW];
	guint8 const   *tmp;
	guint16		token, len;
	gboolean	clean = TRUE;

	if (gsf_input_seek (input, offset, G_SEEK_SET))
		return NULL;

	res = g_byte_array_new ();

	/* explanation from libole2/ms-ole-vba.c */
	/* The first byte is a flag byte.  Each bit in this byte
	 * determines what the next byte is.  If the bit is zero,
	 * the next byte is a character.  Otherwise the  next two
	 * bytes contain the number of characters to copy from the
	 * uncompressed buffer and where to copy them from (offset,
	 * length).
	 */
	while (NULL != gsf_input_read (input, 1, &flag))
		/* Walk the eight bits of the flag byte, least significant first */
		for (mask = 1; mask < 0x100 ; mask <<= 1)
			if (flag & mask) {
				/* Copy token: 16 bits, little endian */
				if (NULL == (tmp = gsf_input_read (input, 2, NULL)))
					break;
				/* The split between length bits and distance bits
				 * depends on how far into the window we are: the
				 * further in, the fewer low bits hold the length. */
				win_pos = pos % VBA_COMPRESSION_WINDOW;
				if (win_pos <= 0x80) {
					if (win_pos <= 0x20)
						shift = (win_pos <= 0x10) ? 12 : 11;
					else
						shift = (win_pos <= 0x40) ? 10 : 9;
				} else {
					if (win_pos <= 0x200)
						shift = (win_pos <= 0x100) ? 8 : 7;
					else if (win_pos <= 0x800)
						shift = (win_pos <= 0x400) ? 6 : 5;
					else
						shift = 4;
				}

				/* Low 'shift' bits: length - 3.  Remaining high bits:
				 * distance - 1 (see srcpos below). */
				token = GSF_LE_GET_GUINT16 (tmp);
				len = (token & ((1 << shift) - 1)) + 3;
				distance = token >> shift;
				clean = TRUE;
/*				fprintf (stderr, "Shift %d, token len %d, distance %d bytes %.2x %.2x\n",
				shift, len, distance, (token & 0xff), (token >> 8)); */

				/* A back reference may not reach before the first
				 * byte ever produced.  Note this only leaves the
				 * bit loop; the outer loop goes on to the next
				 * flag byte. */
				if (distance >= pos) {
					g_warning ("Corrupted compressed stream");
					break;
				}

				/* Byte by byte so that overlapping copies repeat
				 * freshly written data.
				 * NOTE(review): if this copy carries pos across a
				 * multiple of VBA_COMPRESSION_WINDOW the full window
				 * is not appended to res here - confirm that valid
				 * streams never do that. */
				for (i = 0; i < len; i++) {
					unsigned srcpos = (pos - distance - 1) % VBA_COMPRESSION_WINDOW;
					guint8 c = buffer [srcpos];
					buffer [pos++ % VBA_COMPRESSION_WINDOW] = c;
				}
			} else {
				/* The window has just filled up: skip two input
				 * bytes, emit the whole window and drop the rest of
				 * this flag byte.  'clean' stops this from firing
				 * twice in a row at the same position.
				 * NOTE(review): the two skipped bytes are presumably
				 * a per-block header - TODO confirm. */
				if ((pos != 0) && ((pos % VBA_COMPRESSION_WINDOW) == 0) && clean) {
					(void) gsf_input_read (input, 2, NULL);
					clean = FALSE;
					g_byte_array_append (res, buffer, VBA_COMPRESSION_WINDOW);
					break;
				}
				/* Literal byte, stored straight into the window;
				 * pos only advances if the read succeeded. */
				if (NULL != gsf_input_read (input, 1, buffer + (pos % VBA_COMPRESSION_WINDOW)))
					pos++;
				clean = TRUE;
			}

	/* Emit whatever is left in a partially filled window */
	if (pos % VBA_COMPRESSION_WINDOW)
		g_byte_array_append (res, buffer, pos % VBA_COMPRESSION_WINDOW);
	return res;
}
2565 
2566 
/* Sort key built by gsf_msole_sorting_key_new() and ordered by
 * gsf_msole_sorting_key_cmp(). */
struct GsfMSOleSortingKey_ {
	gunichar2 *name;	/* 16-bit code units, followed by a 0 terminator */
	size_t len;		/* number of units in name, terminator not counted */
};
2571 
2572 GsfMSOleSortingKey *
gsf_msole_sorting_key_new(const char * name)2573 gsf_msole_sorting_key_new (const char *name)
2574 {
2575 	GsfMSOleSortingKey *res = g_new (GsfMSOleSortingKey, 1);
2576 	size_t name_len;
2577 	const char *p;
2578 
2579 	if (!name)
2580 		name = "";
2581 	name_len = strlen (name);
2582 
2583 	res->name = g_new (gunichar2, name_len + 1);
2584 	res->len = 0;
2585 
2586 	/* This code is a bit like g_utf8_to_utf16.  */
2587 
2588 	for (p = name; *p; p = g_utf8_next_char (p)) {
2589 		gunichar wc =
2590 			g_utf8_get_char_validated (p, name_len - (p - name));
2591 		if (wc & 0x80000000)
2592 			break; /* Something invalid or incomplete */
2593 		if (wc < 0x10000) {
2594 			wc = g_unichar_toupper (wc);
2595 			/* Let's hope no uppercase char is above 0xffff! */
2596 			res->name[res->len++] = wc;
2597 		} else {
2598 			res->name[res->len++] =	(wc - 0x10000) / 0x400 + 0xd800;
2599 			res->name[res->len++] =	(wc - 0x10000) % 0x400 + 0xdc00;
2600 		}
2601 	}
2602 	res->name[res->len] = 0;
2603 
2604 	return res;
2605 }
2606 
2607 void
gsf_msole_sorting_key_free(GsfMSOleSortingKey * sk)2608 gsf_msole_sorting_key_free (GsfMSOleSortingKey *sk)
2609 {
2610 	if (sk) {
2611 		g_free (sk->name);
2612 		g_free (sk);
2613 	}
2614 }
2615 
2616 static GsfMSOleSortingKey *
gsf_ms_ole_sorting_key_copy(GsfMSOleSortingKey * sk)2617 gsf_ms_ole_sorting_key_copy (GsfMSOleSortingKey *sk)
2618 {
2619 	GsfMSOleSortingKey *res = g_new (GsfMSOleSortingKey, 1);
2620 	res->len = sk->len;
2621 	res->name = g_new (gunichar2, sk->len + 1);
2622 	memcpy (res->name, sk->name, (sk->len + 1) * sizeof (gunichar2));
2623 	return res;
2624 }
2625 
2626 GType
gsf_msole_sorting_key_get_type(void)2627 gsf_msole_sorting_key_get_type (void)
2628 {
2629     static GType type = 0;
2630 
2631     if (type == 0)
2632 	type = g_boxed_type_register_static
2633 	    ("GsfMSOleSortingKey",
2634 	     (GBoxedCopyFunc) gsf_ms_ole_sorting_key_copy,
2635 	     (GBoxedFreeFunc) gsf_msole_sorting_key_free);
2636 
2637     return type;
2638 }
2639 
2640 int
gsf_msole_sorting_key_cmp(const GsfMSOleSortingKey * a,const GsfMSOleSortingKey * b)2641 gsf_msole_sorting_key_cmp (const GsfMSOleSortingKey *a,
2642 			   const GsfMSOleSortingKey *b)
2643 {
2644 	long diff;
2645 	/* According to the docs length is more important than lexical order */
2646 	if (a->len != b->len)
2647 		diff = a->len - b->len;
2648 	else {
2649 		const gunichar2 *pa = a->name;
2650 		const gunichar2 *pb = b->name;
2651 		while (*pa == *pb && *pa)
2652 			pa++, pb++;
2653 		diff = *pa - *pb;
2654 	}
2655 
2656 	/* Note, that diff might not fit "int" */
2657 	return diff > 0 ? +1 : (diff < 0 ? -1 : 0);
2658 }
2659