1 // =================================================================================================
2 // Copyright 2002-2008 Adobe Systems Incorporated
3 // All Rights Reserved.
4 //
5 // NOTICE:  Adobe permits you to use, modify, and distribute this file in accordance with the terms
6 // of the Adobe license agreement accompanying it.
7 //
8 // Adobe patent application tracking #P435, entitled 'Unique markers to simplify embedding data of
9 // one format in a file with a different format', inventors: Sean Parent, Greg Gilley.
10 // =================================================================================================
11 
12 #include "XMP_Environment.h"	// ! This must be the first include!
13 #include "XMPCore_Impl.hpp"
14 
15 #include "XMPMeta.hpp"
16 #include "XMPUtils.hpp"
17 
18 #include "UnicodeInlines.incl_cpp"
19 #include "UnicodeConversions.hpp"
20 #include "ExpatAdapter.hpp"
21 
22 #if XMP_DebugBuild
23 	#include <iostream>
24 #endif
25 
26 using namespace std;
27 
28 #if XMP_WinBuild
29 #ifdef _MSC_VER
30 	#pragma warning ( disable : 4533 )	// initialization of '...' is skipped by 'goto ...'
31 	#pragma warning ( disable : 4702 )	// unreachable code
32 	#pragma warning ( disable : 4800 )	// forcing value to bool 'true' or 'false' (performance warning)
33 	#pragma warning ( disable : 4996 )	// '...' was declared deprecated
34 #endif
35 #endif
36 
37 
38 // *** Use the XMP_PropIsXyz (Schema, Simple, Struct, Array, ...) macros
39 // *** Add debug codegen checks, e.g. that typical masking operations really work
40 // *** Change all uses of strcmp and strncmp to XMP_LitMatch and XMP_LitNMatch
41 
42 
43 // =================================================================================================
44 // Local Types and Constants
45 // =========================
46 
47 
48 // =================================================================================================
49 // Static Variables
50 // ================
51 
// Default for the Trace_ParsingHackery switch when the build has not defined it. Its uses are
// outside of this section of the file.
#ifndef Trace_ParsingHackery
	#define Trace_ParsingHackery 0
#endif

// UTF-8 replacement strings for the byte values 0x80..0xFF, one entry per byte value in ascending
// order, so the entry for byte b is kReplaceLatin1[b - 0x80]. Both the strings and the table
// slots are const, nothing can repoint an entry at run time.

static const char * const kReplaceLatin1[128] =
	{

		// The 0x80..0x9F range is undefined in Latin-1, but is defined in Windows code page 1252.
		// The bytes 0x81, 0x8D, 0x8F, 0x90, and 0x9D are formally undefined by Windows 1252, but
		// their conversion API maps them to U+0081, etc. These are in XML's RestrictedChar set, so
		// we map them to a space.

		"\xE2\x82\xAC", " ",            "\xE2\x80\x9A", "\xC6\x92",		// 0x80 .. 0x83
		"\xE2\x80\x9E", "\xE2\x80\xA6", "\xE2\x80\xA0", "\xE2\x80\xA1",	// 0x84 .. 0x87
		"\xCB\x86",     "\xE2\x80\xB0", "\xC5\xA0",     "\xE2\x80\xB9",	// 0x88 .. 0x8B
		"\xC5\x92",     " ",            "\xC5\xBD",     " ",			// 0x8C .. 0x8F

		" ",            "\xE2\x80\x98", "\xE2\x80\x99", "\xE2\x80\x9C",	// 0x90 .. 0x93
		"\xE2\x80\x9D", "\xE2\x80\xA2", "\xE2\x80\x93", "\xE2\x80\x94",	// 0x94 .. 0x97
		"\xCB\x9C",     "\xE2\x84\xA2", "\xC5\xA1",     "\xE2\x80\xBA",	// 0x98 .. 0x9B
		"\xC5\x93",     " ",            "\xC5\xBE",     "\xC5\xB8",		// 0x9C .. 0x9F

		// These are the UTF-8 forms of the official Latin-1 characters in the range 0xA0..0xFF. Not
		// too surprisingly these map to U+00A0, etc. Which is the Unicode Latin Supplement range.

		"\xC2\xA0", "\xC2\xA1", "\xC2\xA2", "\xC2\xA3", "\xC2\xA4", "\xC2\xA5", "\xC2\xA6", "\xC2\xA7",	// 0xA0 .. 0xA7
		"\xC2\xA8", "\xC2\xA9", "\xC2\xAA", "\xC2\xAB", "\xC2\xAC", "\xC2\xAD", "\xC2\xAE", "\xC2\xAF",	// 0xA8 .. 0xAF

		"\xC2\xB0", "\xC2\xB1", "\xC2\xB2", "\xC2\xB3", "\xC2\xB4", "\xC2\xB5", "\xC2\xB6", "\xC2\xB7",	// 0xB0 .. 0xB7
		"\xC2\xB8", "\xC2\xB9", "\xC2\xBA", "\xC2\xBB", "\xC2\xBC", "\xC2\xBD", "\xC2\xBE", "\xC2\xBF",	// 0xB8 .. 0xBF

		"\xC3\x80", "\xC3\x81", "\xC3\x82", "\xC3\x83", "\xC3\x84", "\xC3\x85", "\xC3\x86", "\xC3\x87",	// 0xC0 .. 0xC7
		"\xC3\x88", "\xC3\x89", "\xC3\x8A", "\xC3\x8B", "\xC3\x8C", "\xC3\x8D", "\xC3\x8E", "\xC3\x8F",	// 0xC8 .. 0xCF

		"\xC3\x90", "\xC3\x91", "\xC3\x92", "\xC3\x93", "\xC3\x94", "\xC3\x95", "\xC3\x96", "\xC3\x97",	// 0xD0 .. 0xD7
		"\xC3\x98", "\xC3\x99", "\xC3\x9A", "\xC3\x9B", "\xC3\x9C", "\xC3\x9D", "\xC3\x9E", "\xC3\x9F",	// 0xD8 .. 0xDF

		"\xC3\xA0", "\xC3\xA1", "\xC3\xA2", "\xC3\xA3", "\xC3\xA4", "\xC3\xA5", "\xC3\xA6", "\xC3\xA7",	// 0xE0 .. 0xE7
		"\xC3\xA8", "\xC3\xA9", "\xC3\xAA", "\xC3\xAB", "\xC3\xAC", "\xC3\xAD", "\xC3\xAE", "\xC3\xAF",	// 0xE8 .. 0xEF

		"\xC3\xB0", "\xC3\xB1", "\xC3\xB2", "\xC3\xB3", "\xC3\xB4", "\xC3\xB5", "\xC3\xB6", "\xC3\xB7",	// 0xF0 .. 0xF7
		"\xC3\xB8", "\xC3\xB9", "\xC3\xBA", "\xC3\xBB", "\xC3\xBC", "\xC3\xBD", "\xC3\xBE", "\xC3\xBF",	// 0xF8 .. 0xFF

	};
96 
97 
98 // =================================================================================================
99 // Local Utilities
100 // ===============
101 
102 
// True for the characters '0'..'9' and the upper case letters 'A'..'F'. Lower case 'a'..'f' are
// not accepted. ! The argument is evaluated more than once, don't pass something with side effects.
#define IsHexDigit(ch)		( (((ch) >= '0') && ((ch) <= '9')) || (((ch) >= 'A') && ((ch) <= 'F')) )

// Numeric value (0..15) of a hex digit character. ! Only meaningful if IsHexDigit(ch) is true.
#define HexDigitValue(ch)	( (10 > ((ch) - '0')) ? ((ch) - '0') : (((ch) - 'A') + 10) )
105 
106 
107 // -------------------------------------------------------------------------------------------------
108 // PickBestRoot
109 // ------------
// Choose the XML node to use as the XMP root from the content below xmlParent. Candidates are
// tried in three passes over the direct content, the first hit wins:
//   1. An x:xmpmeta (or old x:xapmeta) element, the search continues inside of it with options 0.
//   2. A bare rdf:RDF element, only if kXMP_RequireXMPMeta is not set in options.
//   3. Whatever a recursive search of each content node finds.
// Returns 0 if nothing suitable is found.

static const XML_Node * PickBestRoot ( const XML_Node & xmlParent, XMP_OptionBits options )
{
	const size_t kidCount = xmlParent.content.size();
	size_t kidIx;

	// Pass 1: x:xmpmeta. The recursion for x:xmpmeta is broader than the strictly defined choice,
	// but gives us smaller code.
	for ( kidIx = 0; kidIx < kidCount; ++kidIx ) {
		const XML_Node * kid = xmlParent.content[kidIx];
		const bool isMetaWrapper = (kid->kind == kElemNode) &&
		                           ((kid->name == "x:xmpmeta") || (kid->name == "x:xapmeta"));
		if ( isMetaWrapper ) return PickBestRoot ( *kid, 0 );
	}

	// Pass 2: a bare rdf:RDF, if the caller allows XMP without the x:xmpmeta wrapper.
	const bool bareRDFAllowed = ((options & kXMP_RequireXMPMeta) == 0);
	if ( bareRDFAllowed ) {
		for ( kidIx = 0; kidIx < kidCount; ++kidIx ) {
			const XML_Node * kid = xmlParent.content[kidIx];
			if ( (kid->kind == kElemNode) && (kid->name == "rdf:RDF") ) return kid;
		}
	}

	// Pass 3: look deeper, below each of the content nodes in turn.
	for ( kidIx = 0; kidIx < kidCount; ++kidIx ) {
		const XML_Node * nestedRoot = PickBestRoot ( *xmlParent.content[kidIx], options );
		if ( nestedRoot != 0 ) return nestedRoot;
	}

	return 0;

}	// PickBestRoot
138 
139 // -------------------------------------------------------------------------------------------------
140 // FindRootNode
141 // ------------
142 //
143 // Find the XML node that is the root of the XMP data tree. Generally this will be an outer node,
144 // but it could be anywhere if a general XML document is parsed (e.g. SVG). The XML parser counted
145 // all possible root nodes, and kept a pointer to the last one. If there is more than one possible
146 // root use PickBestRoot to choose among them.
147 //
148 // If there is a root node, try to extract the version of the previous XMP toolkit.
149 
FindRootNode(XMPMeta * thiz,const XMLParserAdapter & xmlParser,XMP_OptionBits options)150 static const XML_Node * FindRootNode ( XMPMeta * thiz, const XMLParserAdapter & xmlParser, XMP_OptionBits options )
151 {
152 	const XML_Node * rootNode = xmlParser.rootNode;
153 
154 	if ( xmlParser.rootCount > 1 ) rootNode = PickBestRoot ( xmlParser.tree, options );
155 	if ( rootNode == 0 ) return 0;
156 
157 	// We have a root node. Try to extract previous toolkit version number.
158 
159 	XMP_StringPtr verStr = "";
160 
161 		XMP_Assert ( rootNode->name == "rdf:RDF" );
162 
163 		if ( (options & kXMP_RequireXMPMeta) &&
164 		     ((rootNode->parent == 0) ||
165 		      ((rootNode->parent->name != "x:xmpmeta") && (rootNode->parent->name != "x:xapmeta"))) ) return 0;
166 
167 		for ( size_t attrNum = 0, attrLim = rootNode->parent->attrs.size(); attrNum < attrLim; ++attrNum ) {
168 			const XML_Node * currAttr =rootNode->parent->attrs[attrNum];
169 			if ( (currAttr->name == "x:xmptk") || (currAttr->name == "x:xaptk") ) {
170 				verStr = currAttr->value.c_str();
171 				break;
172 			}
173 		}
174 
175 	// Decode the version number into MMmmuubbb digits. If any part is too big, peg it at 99 or 999.
176 
177 	unsigned long part;
178 	while ( (*verStr != 0) && ((*verStr < '0') || (*verStr > '9')) ) ++verStr;
179 
180 	part = 0;
181 	while ( (*verStr != 0) && ('0' <= *verStr) && (*verStr <= '9') ) {
182 		part = (part * 10) + (*verStr - '0');
183 		++verStr;
184 	}
185 	if ( part > 99 ) part = 99;
186 	thiz->prevTkVer = part * 100*100*1000;
187 
188 	part = 0;
189 	if ( *verStr == '.' ) ++verStr;
190 	while ( (*verStr != 0) && ('0' <= *verStr) && (*verStr <= '9') ) {
191 		part = (part * 10) + (*verStr - '0');
192 		++verStr;
193 	}
194 	if ( part > 99 ) part = 99;
195 	thiz->prevTkVer += part * 100*1000;
196 
197 	part = 0;
198 	if ( *verStr == '.' ) ++verStr;
199 	while ( (*verStr != 0) && ('0' <= *verStr) && (*verStr <= '9') ) {
200 		part = (part * 10) + (*verStr - '0');
201 		++verStr;
202 	}
203 	if ( part > 99 ) part = 99;
204 	thiz->prevTkVer += part * 1000;
205 
206 	part = 0;
207 	if ( *verStr == '-' ) ++verStr;
208 	while ( (*verStr != 0) && ('0' <= *verStr) && (*verStr <= '9') ) {
209 		part = (part * 10) + (*verStr - '0');
210 		++verStr;
211 	}
212 	if ( part > 999 ) part = 999;
213 	thiz->prevTkVer += part;
214 
215 	return rootNode;
216 
217 }	// FindRootNode
218 
219 // -------------------------------------------------------------------------------------------------
220 // NormalizeDCArrays
221 // -----------------
222 //
223 // Undo the denormalization performed by the XMP used in Acrobat 5. If a Dublin Core array had only
224 // one item, it was serialized as a simple property. The xml:lang attribute was dropped from an
225 // alt-text item if the language was x-default.
226 
227 // *** This depends on the dc: namespace prefix.
228 
229 static void
NormalizeDCArrays(XMP_Node * xmpTree)230 NormalizeDCArrays ( XMP_Node * xmpTree )
231 {
232 	XMP_Node * dcSchema = FindSchemaNode ( xmpTree, kXMP_NS_DC, kXMP_ExistingOnly );
233 	if ( dcSchema == 0 ) return;
234 
235 	for ( size_t propNum = 0, propLimit = dcSchema->children.size(); propNum < propLimit; ++propNum ) {
236 		XMP_Node *     currProp  = dcSchema->children[propNum];
237 		XMP_OptionBits arrayForm = 0;
238 
239 		if ( ! XMP_PropIsSimple ( currProp->options ) ) continue;	// Nothing to do if not simple.
240 
241 		if ( (currProp->name == "dc:creator" )     ||	// See if it is supposed to be an array.
242 		     (currProp->name == "dc:date" ) ) {			// *** Think about an array of char* and a loop.
243 			arrayForm = kXMP_PropArrayIsOrdered;
244 		} else if (
245 		     (currProp->name == "dc:description" ) ||
246 		     (currProp->name == "dc:rights" )      ||
247 		     (currProp->name == "dc:title" ) ) {
248 			arrayForm = kXMP_PropArrayIsAltText;
249 		} else if (
250 		     (currProp->name == "dc:contributor" ) ||
251 		     (currProp->name == "dc:language" )    ||
252 		     (currProp->name == "dc:publisher" )   ||
253 		     (currProp->name == "dc:relation" )    ||
254 		     (currProp->name == "dc:subject" )     ||
255 		     (currProp->name == "dc:type" ) ) {
256 			arrayForm = kXMP_PropValueIsArray;
257 		}
258 		if ( arrayForm == 0 ) continue;	// Nothing to do if it isn't supposed to be an array.
259 
260 		arrayForm = VerifySetOptions ( arrayForm, 0 );	// Set the implicit array bits.
261 		XMP_Node * newArray = new XMP_Node ( dcSchema, currProp->name.c_str(), arrayForm );
262 		dcSchema->children[propNum] = newArray;
263 		newArray->children.push_back ( currProp );
264 		currProp->parent = newArray;
265 		currProp->name = kXMP_ArrayItemName;
266 
267 		if ( XMP_ArrayIsAltText ( arrayForm ) && (! (currProp->options & kXMP_PropHasLang)) ) {
268 			XMP_Node * newLang = new XMP_Node ( currProp, "xml:lang", "x-default", kXMP_PropIsQualifier );
269 			currProp->options |= (kXMP_PropHasQualifiers | kXMP_PropHasLang);
270 			if ( currProp->qualifiers.empty() ) {	// *** Need a util?
271 				currProp->qualifiers.push_back ( newLang );
272 			} else {
273 				currProp->qualifiers.insert ( currProp->qualifiers.begin(), newLang );
274 			}
275 		}
276 
277 	}
278 
279 }	// NormalizeDCArrays
280 
281 
282 // -------------------------------------------------------------------------------------------------
283 // CompareAliasedSubtrees
284 // ----------------------
285 
286 // *** Change to do some alias-specific setup, then use CompareSubtrees. One special case for
287 // *** aliases is a simple to x-default alias, the options and qualifiers obviously differ.
288 
289 static void
CompareAliasedSubtrees(XMP_Node * aliasNode,XMP_Node * baseNode,bool outerCall=true)290 CompareAliasedSubtrees ( XMP_Node * aliasNode, XMP_Node * baseNode, bool outerCall = true )
291 {
292 	// ! The outermost call is special. The names almost certainly differ. The qualifiers (and
293 	// ! hence options) will differ for an alias to the x-default item of a langAlt array.
294 	if ( (aliasNode->value != baseNode->value) ||
295 	     (aliasNode->children.size() != baseNode->children.size()) ) {
296 		XMP_Throw ( "Mismatch between alias and base nodes", kXMPErr_BadXMP );
297 	}
298 	if ( ! outerCall ) {
299 		if ( (aliasNode->name != baseNode->name) ||
300 		     (aliasNode->options != baseNode->options) ||
301 		     (aliasNode->qualifiers.size() != baseNode->qualifiers.size()) ) {
302 			XMP_Throw ( "Mismatch between alias and base nodes", kXMPErr_BadXMP );
303 		}
304 	}
305 
306 	for ( size_t childNum = 0, childLim = aliasNode->children.size(); childNum < childLim; ++childNum ) {
307 		XMP_Node * aliasChild = aliasNode->children[childNum];
308 		XMP_Node * baseChild  = baseNode->children[childNum];
309 		CompareAliasedSubtrees ( aliasChild, baseChild, false );
310 	}
311 
312 	for ( size_t qualNum = 0, qualLim = aliasNode->qualifiers.size(); qualNum < qualLim; ++qualNum ) {
313 		XMP_Node * aliasQual = aliasNode->qualifiers[qualNum];
314 		XMP_Node * baseQual  = baseNode->qualifiers[qualNum];
315 		CompareAliasedSubtrees ( aliasQual, baseQual, false );
316 	}
317 
318 }	// CompareAliasedSubtrees
319 
320 
321 // -------------------------------------------------------------------------------------------------
322 // TransplantArrayItemAlias
323 // ------------------------
324 
325 static void
TransplantArrayItemAlias(XMP_Node * oldParent,size_t oldNum,XMP_Node * newParent)326 TransplantArrayItemAlias ( XMP_Node * oldParent, size_t oldNum, XMP_Node * newParent )
327 {
328 	XMP_Node * childNode = oldParent->children[oldNum];
329 
330 	if ( newParent->options & kXMP_PropArrayIsAltText ) {
331 		if ( childNode->options & kXMP_PropHasLang ) {
332 			XMP_Throw ( "Alias to x-default already has a language qualifier", kXMPErr_BadXMP );	// *** Allow x-default.
333 		}
334 		childNode->options |= (kXMP_PropHasQualifiers | kXMP_PropHasLang);
335 		XMP_Node * langQual = new XMP_Node ( childNode, "xml:lang", "x-default", kXMP_PropIsQualifier );	// *** AddLangQual util?
336 		if ( childNode->qualifiers.empty() ) {
337 			childNode->qualifiers.push_back ( langQual );
338 		} else {
339 			childNode->qualifiers.insert ( childNode->qualifiers.begin(), langQual );
340 		}
341 	}
342 
343 	oldParent->children.erase ( oldParent->children.begin() + oldNum );
344 	childNode->name = kXMP_ArrayItemName;
345 	childNode->parent = newParent;
346 	if ( newParent->children.empty() ) {
347 		newParent->children.push_back ( childNode );
348 	} else {
349 		newParent->children.insert ( newParent->children.begin(), childNode );
350 	}
351 
352 }	// TransplantArrayItemAlias
353 
354 
355 // -------------------------------------------------------------------------------------------------
356 // TransplantNamedAlias
357 // --------------------
358 
359 static void
TransplantNamedAlias(XMP_Node * oldParent,size_t oldNum,XMP_Node * newParent,XMP_VarString & newName)360 TransplantNamedAlias ( XMP_Node * oldParent, size_t oldNum, XMP_Node * newParent, XMP_VarString & newName )
361 {
362 	XMP_Node * childNode = oldParent->children[oldNum];
363 
364 	oldParent->children.erase ( oldParent->children.begin() + oldNum );
365 	childNode->name = newName;
366 	childNode->parent = newParent;
367 	newParent->children.push_back ( childNode );
368 
369 }	// TransplantNamedAlias
370 
371 
372 // -------------------------------------------------------------------------------------------------
373 // MoveExplicitAliases
374 // -------------------
375 
376 static void
MoveExplicitAliases(XMP_Node * tree,XMP_OptionBits parseOptions)377 MoveExplicitAliases ( XMP_Node * tree, XMP_OptionBits parseOptions )
378 {
379 	tree->options ^= kXMP_PropHasAliases;
380 	const bool strictAliasing = ((parseOptions & kXMP_StrictAliasing) != 0);
381 
382 	// Visit all of the top level nodes looking for aliases. If there is no base, transplant the
383 	// alias subtree. If there is a base and strict aliasing is on, make sure the alias and base
384 	// subtrees match.
385 
386 	// ! Use "while" loops not "for" loops since both the schema and property loops can remove the
387 	// ! current item from the vector being traversed. And don't increment the counter for a delete.
388 
389 	size_t schemaNum = 0;
390 	while ( schemaNum < tree->children.size() ) {
391 		XMP_Node * currSchema = tree->children[schemaNum];
392 
393 		size_t propNum = 0;
394 		while ( propNum < currSchema->children.size() ) {
395 			XMP_Node * currProp = currSchema->children[propNum];
396 			if ( ! (currProp->options & kXMP_PropIsAlias) ) {
397 				++propNum;
398 				continue;
399 			}
400 			currProp->options ^= kXMP_PropIsAlias;
401 
402 			// Find the base path, look for the base schema and root node.
403 
404 			XMP_AliasMapPos aliasPos = sRegisteredAliasMap->find ( currProp->name );
405 			XMP_Assert ( aliasPos != sRegisteredAliasMap->end() );
406 			XMP_ExpandedXPath & basePath = aliasPos->second;
407 			XMP_OptionBits arrayOptions = (basePath[kRootPropStep].options & kXMP_PropArrayFormMask);
408 
409 			XMP_Node * baseSchema = FindSchemaNode ( tree, basePath[kSchemaStep].step.c_str(), kXMP_CreateNodes );
410 			if ( baseSchema->options & kXMP_NewImplicitNode ) baseSchema->options ^= kXMP_NewImplicitNode;
411 			XMP_Node * baseNode = FindChildNode ( baseSchema, basePath[kRootPropStep].step.c_str(), kXMP_ExistingOnly );
412 
413 			if ( baseNode == 0 ) {
414 
415 				if ( basePath.size() == 2 ) {
416 					// A top-to-top alias, transplant the property.
417 					TransplantNamedAlias ( currSchema, propNum, baseSchema, basePath[kRootPropStep].step );
418 				} else {
419 					// An alias to an array item, create the array and transplant the property.
420 					baseNode = new XMP_Node ( baseSchema, basePath[kRootPropStep].step.c_str(), arrayOptions );
421 					baseSchema->children.push_back ( baseNode );
422 					TransplantArrayItemAlias ( currSchema, propNum, baseNode );
423 				}
424 
425 			} else if ( basePath.size() == 2 ) {
426 
427 				// The base node does exist and this is a top-to-top alias. Check for conflicts if
428 				// strict aliasing is on. Remove and delete the alias subtree.
429 				if ( strictAliasing ) CompareAliasedSubtrees ( currProp, baseNode );
430 				currSchema->children.erase ( currSchema->children.begin() + propNum );
431 				delete currProp;
432 
433 			} else {
434 
435 				// This is an alias to an array item and the array exists. Look for the aliased item.
436 				// Then transplant or check & delete as appropriate.
437 
438 				XMP_Node * itemNode = 0;
439 				if ( arrayOptions & kXMP_PropArrayIsAltText ) {
440 					XMP_Index xdIndex = LookupLangItem ( baseNode, *xdefaultName );
441 					if ( xdIndex != -1 ) itemNode = baseNode->children[xdIndex];
442 				} else if ( ! baseNode->children.empty() ) {
443 					itemNode = baseNode->children[0];
444 				}
445 
446 				if ( itemNode == 0 ) {
447 					TransplantArrayItemAlias ( currSchema, propNum, baseNode );
448 				} else {
449 					if ( strictAliasing ) CompareAliasedSubtrees ( currProp, itemNode );
450 					currSchema->children.erase ( currSchema->children.begin() + propNum );
451 					delete currProp;
452 				}
453 
454 			}
455 
456 		}	// Property loop
457 
458 		// Increment the counter or remove an empty schema node.
459 		if ( currSchema->children.size() > 0 ) {
460 			++schemaNum;
461 		} else {
462 			delete tree->children[schemaNum];	// ! Delete the schema node itself.
463 			tree->children.erase ( tree->children.begin() + schemaNum );
464 		}
465 
466 	}	// Schema loop
467 
468 }	// MoveExplicitAliases
469 
470 
471 // -------------------------------------------------------------------------------------------------
472 // FixGPSTimeStamp
473 // ---------------
474 
475 static void
FixGPSTimeStamp(XMP_Node * exifSchema,XMP_Node * gpsDateTime)476 FixGPSTimeStamp ( XMP_Node * exifSchema, XMP_Node * gpsDateTime )
477 {
478 	XMP_DateTime binGPSStamp;
479 	try {
480 		XMPUtils::ConvertToDate ( gpsDateTime->value.c_str(), &binGPSStamp );
481 	} catch ( ... ) {
482 		return;	// Don't let a bad date stop other things.
483 	}
484 	if ( (binGPSStamp.year != 0) || (binGPSStamp.month != 0) || (binGPSStamp.day != 0) ) return;
485 
486 	XMP_Node * otherDate = FindChildNode ( exifSchema, "exif:DateTimeOriginal", kXMP_ExistingOnly );
487 	if ( otherDate == 0 ) otherDate = FindChildNode ( exifSchema, "exif:DateTimeDigitized", kXMP_ExistingOnly );
488 	if ( otherDate == 0 ) return;
489 
490 	XMP_DateTime binOtherDate;
491 	try {
492 		XMPUtils::ConvertToDate ( otherDate->value.c_str(), &binOtherDate );
493 	} catch ( ... ) {
494 		return;	// Don't let a bad date stop other things.
495 	}
496 
497 	binGPSStamp.year  = binOtherDate.year;
498 	binGPSStamp.month = binOtherDate.month;
499 	binGPSStamp.day   = binOtherDate.day;
500 
501 	XMP_StringPtr goodStr;
502 	XMP_StringLen goodLen;
503 	XMPUtils::ConvertFromDate ( binGPSStamp, &goodStr, &goodLen );
504 
505 	gpsDateTime->value.assign ( goodStr, goodLen );
506 
507 }	// FixGPSTimeStamp
508 
509 
510 // -------------------------------------------------------------------------------------------------
511 // MigrateAudioCopyright
512 // ---------------------
513 //
514 // The initial support for WAV files mapped a legacy ID3 audio copyright into a new xmpDM:copyright
515 // property. This is special case code to migrate that into dc:rights['x-default']. The rules:
516 //
517 //   1. If there is no dc:rights array, or an empty array -
518 //      Create one with dc:rights['x-default'] set from double linefeed and xmpDM:copyright.
519 //
520 //   2. If there is a dc:rights array but it has no x-default item -
521 //      Create an x-default item as a copy of the first item then apply rule #3.
522 //
523 //   3. If there is a dc:rights array with an x-default item, look for a double linefeed in the value.
524 //      A. If no double linefeed, compare the x-default value to the xmpDM:copyright value.
525 //         A1. If they match then leave the x-default value alone.
526 //         A2. Otherwise, append a double linefeed and the xmpDM:copyright value to the x-default value.
527 //      B. If there is a double linefeed, compare the trailing text to the xmpDM:copyright value.
528 //         B1. If they match then leave the x-default value alone.
529 //         B2. Otherwise, replace the trailing x-default text with the xmpDM:copyright value.
530 //
531 //   4. In all cases, delete the xmpDM:copyright property.
532 
533 static void
MigrateAudioCopyright(XMPMeta * xmp,XMP_Node * dmCopyright)534 MigrateAudioCopyright ( XMPMeta * xmp, XMP_Node * dmCopyright )
535 {
536 
537 	try {
538 
539 		std::string & dmValue = dmCopyright->value;
540 		static const char * kDoubleLF = "\xA\xA";
541 
542 		XMP_Node * dcSchema = FindSchemaNode ( &xmp->tree, kXMP_NS_DC, kXMP_CreateNodes );
543 		XMP_Node * dcRightsArray = FindChildNode ( dcSchema, "dc:rights", kXMP_ExistingOnly );
544 
545 		if ( (dcRightsArray == 0) || dcRightsArray->children.empty() ) {
546 
547 			// 1. No dc:rights array, create from double linefeed and xmpDM:copyright.
548 			dmValue.insert ( 0, kDoubleLF );
549 			xmp->SetLocalizedText ( kXMP_NS_DC, "rights", "", "x-default",  dmValue.c_str(), 0 );
550 
551 		} else {
552 
553 			std::string xdefaultStr ( "x-default" );
554 
555 			XMP_Index xdIndex = LookupLangItem ( dcRightsArray, xdefaultStr );
556 
557 			if ( xdIndex < 0 ) {
558 				// 2. No x-default item, create from the first item.
559 				XMP_StringPtr firstValue = dcRightsArray->children[0]->value.c_str();
560 				xmp->SetLocalizedText ( kXMP_NS_DC, "rights", "", "x-default",  firstValue, 0 );
561 				xdIndex = LookupLangItem ( dcRightsArray, xdefaultStr );
562 			}
563 
564 			// 3. Look for a double linefeed in the x-default value.
565 			XMP_Assert ( xdIndex == 0 );
566 			std::string & defaultValue = dcRightsArray->children[xdIndex]->value;
567 			XMP_Index lfPos = defaultValue.find ( kDoubleLF );
568 
569 			if ( lfPos < 0 ) {
570 
571 				// 3A. No double LF, compare whole values.
572 				if ( dmValue != defaultValue ) {
573 					// 3A2. Append the xmpDM:copyright to the x-default item.
574 					defaultValue += kDoubleLF;
575 					defaultValue += dmValue;
576 				}
577 
578 			} else {
579 
580 				// 3B. Has double LF, compare the tail.
581 				if ( defaultValue.compare ( lfPos+2, std::string::npos, dmValue ) != 0 ) {
582 					// 3B2. Replace the x-default tail.
583 					defaultValue.replace ( lfPos+2, std::string::npos, dmValue );
584 				}
585 
586 			}
587 
588 		}
589 
590 		// 4. Get rid of the xmpDM:copyright.
591 		xmp->DeleteProperty ( kXMP_NS_DM, "copyright" );
592 
593 	} catch ( ... ) {
594 		// Don't let failures (like a bad dc:rights form) stop other cleanup.
595 	}
596 
597 }	// MigrateAudioCopyright
598 
599 
600 // -------------------------------------------------------------------------------------------------
601 // RepairAltText
602 // -------------
603 //
604 // Make sure that the array is well-formed AltText. Each item must be simple and have an xml:lang
605 // qualifier. If repairs are needed, keep simple non-empty items by adding the xml:lang.
606 
607 static void
RepairAltText(XMP_Node & tree,XMP_StringPtr schemaNS,XMP_StringPtr arrayName)608 RepairAltText ( XMP_Node & tree, XMP_StringPtr schemaNS, XMP_StringPtr arrayName )
609 {
610 	XMP_Node * schemaNode = FindSchemaNode ( &tree, schemaNS, kXMP_ExistingOnly );
611 	if ( schemaNode == 0 ) return;
612 
613 	XMP_Node * arrayNode = FindChildNode ( schemaNode, arrayName, kXMP_ExistingOnly );
614 	if ( (arrayNode == 0) || XMP_ArrayIsAltText ( arrayNode->options ) ) return;	// Already OK.
615 
616 	if ( ! XMP_PropIsArray ( arrayNode->options ) ) return;	// ! Not even an array, leave it alone.
617 	// *** Should probably change simple values to LangAlt with 'x-default' item.
618 
619 	arrayNode->options |= (kXMP_PropArrayIsOrdered | kXMP_PropArrayIsAlternate | kXMP_PropArrayIsAltText);
620 
621 	for ( int i = arrayNode->children.size()-1; i >= 0; --i ) {	// ! Need a signed index type.
622 
623 		XMP_Node * currChild = arrayNode->children[i];
624 
625 		if ( ! XMP_PropIsSimple ( currChild->options ) ) {
626 
627 			// Delete non-simple children.
628 			delete ( currChild );
629 			arrayNode->children.erase ( arrayNode->children.begin() + i );
630 
631 		} else if ( ! XMP_PropHasLang ( currChild->options ) ) {
632 
633 			if ( currChild->value.empty() ) {
634 
635 				// Delete empty valued children that have no xml:lang.
636 				delete ( currChild );
637 				arrayNode->children.erase ( arrayNode->children.begin() + i );
638 
639 			} else {
640 
641 				// Add an xml:lang qualifier with the value "x-repair".
642 				XMP_Node * repairLang = new XMP_Node ( currChild, "xml:lang", "x-repair", kXMP_PropIsQualifier );
643 				if ( currChild->qualifiers.empty() ) {
644 					currChild->qualifiers.push_back ( repairLang );
645 				} else {
646 					currChild->qualifiers.insert ( currChild->qualifiers.begin(), repairLang );
647 				}
648 				currChild->options |= (kXMP_PropHasQualifiers | kXMP_PropHasLang);
649 
650 			}
651 
652 		}
653 
654 	}
655 
656 }	// RepairAltText
657 
658 
659 // -------------------------------------------------------------------------------------------------
660 // TouchUpDataModel
661 // ----------------
662 
663 static void
TouchUpDataModel(XMPMeta * xmp)664 TouchUpDataModel ( XMPMeta * xmp )
665 {
666 	XMP_Node & tree = xmp->tree;
667 
668 	// Do special case touch ups for certain schema.
669 
670 	XMP_Node * currSchema = 0;
671 
672 	currSchema = FindSchemaNode ( &tree, kXMP_NS_EXIF, kXMP_ExistingOnly );
673 	if ( currSchema != 0 ) {
674 
675 		// Do a special case fix for exif:GPSTimeStamp.
676 		XMP_Node * gpsDateTime = FindChildNode ( currSchema, "exif:GPSTimeStamp", kXMP_ExistingOnly );
677 		if ( gpsDateTime != 0 ) FixGPSTimeStamp ( currSchema, gpsDateTime );
678 
679 		// *** Should probably have RepairAltText change simple values to LangAlt with 'x-default' item.
680 		// *** For now just do this for exif:UserComment, the one case we know about, late in cycle fix.
681 		XMP_Node * userComment = FindChildNode ( currSchema, "exif:UserComment", kXMP_ExistingOnly );
682 		if ( (userComment != 0) && XMP_PropIsSimple ( userComment->options ) ) {
683 			XMP_Node * newChild = new XMP_Node ( userComment, kXMP_ArrayItemName,
684 												 userComment->value.c_str(), userComment->options );
685 			newChild->qualifiers.swap ( userComment->qualifiers );
686 			if ( ! XMP_PropHasLang ( newChild->options ) ) {
687 				XMP_Node * langQual = new XMP_Node ( newChild, "xml:lang", "x-default", kXMP_PropIsQualifier );
688 				newChild->qualifiers.insert ( newChild->qualifiers.begin(), langQual );
689 				newChild->options |= (kXMP_PropHasQualifiers | kXMP_PropHasLang);
690 			}
691 			userComment->value.erase();
692 			userComment->options = kXMP_PropArrayFormMask;	// ! Happens to have all the right bits.
693 			userComment->children.push_back ( newChild );
694 		}
695 
696 	}
697 
698 	currSchema = FindSchemaNode ( &tree, kXMP_NS_DM, kXMP_ExistingOnly );
699 	if ( currSchema != 0 ) {
700 		// Do a special case migration of xmpDM:copyright to dc:rights['x-default']. Do this before
701 		// the dc: touch up since it can affect the dc: schema.
702 		XMP_Node * dmCopyright = FindChildNode ( currSchema, "xmpDM:copyright", kXMP_ExistingOnly );
703 		if ( dmCopyright != 0 ) MigrateAudioCopyright ( xmp, dmCopyright );
704 	}
705 
706 	currSchema = FindSchemaNode ( &tree, kXMP_NS_DC, kXMP_ExistingOnly );
707 	if ( currSchema != 0 ) {
708 		// Do a special case fix for dc:subject, make sure it is an unordered array.
709 		XMP_Node * dcSubject = FindChildNode ( currSchema, "dc:subject", kXMP_ExistingOnly );
710 		if ( dcSubject != 0 ) {
711                         XMP_OptionBits keepMask = static_cast<XMP_OptionBits>(~(kXMP_PropArrayIsOrdered | kXMP_PropArrayIsAlternate | kXMP_PropArrayIsAltText));
712 			dcSubject->options &= keepMask;	// Make sure any ordered array bits are clear.
713 		}
714 	}
715 
716 	// Fix any broken AltText arrays that we know about.
717 
718 	RepairAltText ( tree, kXMP_NS_DC, "dc:description" );	// ! Note inclusion of prefixes for direct node lookup!
719 	RepairAltText ( tree, kXMP_NS_DC, "dc:rights" );
720 	RepairAltText ( tree, kXMP_NS_DC, "dc:title" );
721 	RepairAltText ( tree, kXMP_NS_XMP_Rights, "xmpRights:UsageTerms" );
722 	RepairAltText ( tree, kXMP_NS_EXIF, "exif:UserComment" );
723 
724 	// Tweak old XMP: Move an instance ID from rdf:about to the xmpMM:InstanceID property. An old
725 	// instance ID usually looks like "uuid:bac965c4-9d87-11d9-9a30-000d936b79c4", plus InDesign
726 	// 3.0 wrote them like "bac965c4-9d87-11d9-9a30-000d936b79c4". If the name looks like a UUID
727 	// simply move it to xmpMM:InstanceID, don't worry about any existing xmpMM:InstanceID. Both
728 	// will only be present when a newer file with the xmpMM:InstanceID property is updated by an
729 	// old app that uses rdf:about.
730 
731 	if ( ! tree.name.empty() ) {
732 
733 		bool nameIsUUID = false;
734 		XMP_StringPtr nameStr = tree.name.c_str();
735 
736 		if ( XMP_LitNMatch ( nameStr, "uuid:", 5 ) ) {
737 
738 			nameIsUUID = true;
739 
740 		} else if ( tree.name.size() == 36 ) {
741 
742 			nameIsUUID = true;	// ! Assume true, we'll set it to false below if not.
743 			for ( int i = 0;  i < 36; ++i ) {
744 				char ch = nameStr[i];
745 				if ( ch == '-' ) {
746 					if ( (i == 8) || (i == 13) || (i == 18) || (i == 23) ) continue;
747 					nameIsUUID = false;
748 					break;
749 				} else {
750 					if ( (('0' <= ch) && (ch <= '9')) || (('a' <= ch) && (ch <= 'z')) ) continue;
751 					nameIsUUID = false;
752 					break;
753 				}
754 			}
755 
756 		}
757 
758 		if ( nameIsUUID ) {
759 
760 			XMP_ExpandedXPath expPath;
761 			ExpandXPath ( kXMP_NS_XMP_MM, "InstanceID", &expPath );
762 			XMP_Node * idNode = FindNode ( &tree, expPath, kXMP_CreateNodes, 0 );
763 			if ( idNode == 0 ) XMP_Throw ( "Failure creating xmpMM:InstanceID", kXMPErr_InternalFailure );
764 
765 			idNode->options = 0;	// Clobber any existing xmpMM:InstanceID.
766 			idNode->value = tree.name;
767 			idNode->RemoveChildren();
768 			idNode->RemoveQualifiers();
769 
770 			tree.name.erase();
771 
772 		}
773 
774 	}
775 
776 }	// TouchUpDataModel
777 
778 
779 // -------------------------------------------------------------------------------------------------
780 // DetermineInputEncoding
781 // ----------------------
782 //
783 // Try to determine the character encoding, making a guess if the input is too short. We make some
784 // simplifying assumptions: the first character must be U+FEFF or ASCII, U+0000 is not allowed. The
785 // XML 1.1 spec is even more strict, UTF-16 XML documents must begin with U+FEFF, and the first
786 // "real" character must be '<'. Ignoring the XML declaration, the first XML character could be '<',
787 // space, tab, CR, or LF.
788 //
789 // The possible input sequences are:
790 //
791 //   Cases with U+FEFF
792 //      EF BB BF -- - UTF-8
793 //      FE FF -- -- - Big endian UTF-16
794 //      00 00 FE FF - Big endian UTF 32
795 //      FF FE 00 00 - Little endian UTF-32
796 //      FF FE -- -- - Little endian UTF-16
797 //
798 //   Cases with ASCII
799 //      nn mm -- -- - UTF-8 -
800 //      00 00 00 nn - Big endian UTF-32
801 //      00 nn -- -- - Big endian UTF-16
802 //      nn 00 00 00 - Little endian UTF-32
803 //      nn 00 -- -- - Little endian UTF-16
804 //
805 // ! We don't check for full patterns, or for errors. We just check enough to determine what the
806 // ! only possible (or reasonable) case would be.
807 
808 static XMP_OptionBits
DetermineInputEncoding(const XMP_Uns8 * buffer,size_t length)809 DetermineInputEncoding ( const XMP_Uns8 * buffer, size_t length )
810 {
811 	if ( length < 2 ) return kXMP_EncodeUTF8;
812 
813 	XMP_Uns8 * uniChar = (XMP_Uns8*)buffer;	// ! Make sure comparisons are unsigned.
814 
815 	if ( uniChar[0] == 0 ) {
816 
817 		// These cases are:
818 		//   00 nn -- -- - Big endian UTF-16
819 		//   00 00 00 nn - Big endian UTF-32
820 		//   00 00 FE FF - Big endian UTF 32
821 
822 		if ( (length < 4) || (uniChar[1] != 0) ) return kXMP_EncodeUTF16Big;
823 		return kXMP_EncodeUTF32Big;
824 
825 	} else if ( uniChar[0] < 0x80 ) {
826 
827 		// These cases are:
828 		//   nn mm -- -- - UTF-8, includes EF BB BF case
829 		//   nn 00 00 00 - Little endian UTF-32
830 		//   nn 00 -- -- - Little endian UTF-16
831 
832 		if ( uniChar[1] != 0 )  return kXMP_EncodeUTF8;
833 		if ( (length < 4) || (uniChar[2] != 0) ) return kXMP_EncodeUTF16Little;
834 		return kXMP_EncodeUTF32Little;
835 
836 	} else {
837 
838 		// These cases are:
839 		//   EF BB BF -- - UTF-8
840 		//   FE FF -- -- - Big endian UTF-16
841 		//   FF FE 00 00 - Little endian UTF-32
842 		//   FF FE -- -- - Little endian UTF-16
843 
844 		if ( uniChar[0] == 0xEF ) return kXMP_EncodeUTF8;
845 		if ( uniChar[0] == 0xFE ) return kXMP_EncodeUTF16Big;
846 		if ( (length < 4) || (uniChar[2] != 0) ) return kXMP_EncodeUTF16Little;
847 		return kXMP_EncodeUTF32Little;
848 
849 	}
850 
851 }	// DetermineInputEncoding
852 
853 
854 // -------------------------------------------------------------------------------------------------
855 // CountUTF8
856 // ---------
857 //
858 // Look for a valid multi-byte UTF-8 sequence and return its length. Returns 0 for an invalid UTF-8
859 // sequence. Returns a negative value for a partial valid sequence at the end of the buffer.
860 //
861 // The checking is not strict. We simply count the number of high order 1 bits in the first byte,
862 // then look for n-1 following bytes whose high order 2 bits are 1 and 0. We do not check for a
863 // minimal length representation of the codepoint, or that the codepoint is defined by Unicode.
864 
865 static int
CountUTF8(const XMP_Uns8 * charStart,const XMP_Uns8 * bufEnd)866 CountUTF8 ( const XMP_Uns8 * charStart, const XMP_Uns8 * bufEnd )
867 {
868 	XMP_Assert ( charStart < bufEnd );		// Catch this in debug builds.
869 	if ( charStart >= bufEnd ) return 0;	// Don't run-on in release builds.
870 	if ( (*charStart & 0xC0) != 0xC0 ) return 0;	// Must have at least 2 high bits set.
871 
872 	int byteCount = 2;
873 	XMP_Uns8 firstByte = *charStart;
874 	for ( firstByte = firstByte << 2; (firstByte & 0x80) != 0; firstByte = firstByte << 1 ) ++byteCount;
875 
876 	if ( (charStart + byteCount) > bufEnd ) return -byteCount;
877 
878 	for ( int i = 1; i < byteCount; ++i ) {
879 		if ( (charStart[i] & 0xC0) != 0x80 ) return 0;
880 	}
881 
882 	return byteCount;
883 
884 }	// CountUTF8
885 
886 
887 // -------------------------------------------------------------------------------------------------
888 // CountControlEscape
889 // ------------------
890 //
891 // Look for a numeric escape sequence for a "prohibited" ASCII control character. These are 0x7F,
892 // and the range 0x00..0x1F except for tab/LF/CR. Return 0 if this is definitely not a numeric
893 // escape, the length of the escape if found, or a negative value for a partial escape.
894 
895 static int
CountControlEscape(const XMP_Uns8 * escStart,const XMP_Uns8 * bufEnd)896 CountControlEscape ( const XMP_Uns8 * escStart, const XMP_Uns8 * bufEnd )
897 {
898 	XMP_Assert ( escStart < bufEnd );	// Catch this in debug builds.
899 	if ( escStart >= bufEnd ) return 0;	// Don't run-on in release builds.
900 	XMP_Assert ( *escStart == '&' );
901 
902 	size_t tailLen = bufEnd - escStart;
903 	if ( tailLen < 5 ) return -1;	// Don't need a more thorough check, we'll catch it on the next pass.
904 
905 	if ( strncmp ( (char*)escStart, "&#x", 3 ) != 0 ) return 0;
906 
907 	XMP_Uns8 escValue = 0;
908 	const XMP_Uns8 * escPos = escStart + 3;
909 
910 	if ( ('0' <= *escPos) && (*escPos <= '9') ) {
911 		escValue = *escPos - '0';
912 		++escPos;
913 	} else if ( ('A' <= *escPos) && (*escPos <= 'F') ) {
914 		escValue = *escPos - 'A' + 10;
915 		++escPos;
916 	} else if ( ('a' <= *escPos) && (*escPos <= 'f') ) {
917 		escValue = *escPos - 'a' + 10;
918 		++escPos;
919 	}
920 
921 	if ( ('0' <= *escPos) && (*escPos <= '9') ) {
922 		escValue = (escValue << 4) + (*escPos - '0');
923 		++escPos;
924 	} else if ( ('A' <= *escPos) && (*escPos <= 'F') ) {
925 		escValue = (escValue << 4) + (*escPos - 'A' + 10);
926 		++escPos;
927 	} else if ( ('a' <= *escPos) && (*escPos <= 'f') ) {
928 		escValue = (escValue << 4) + (*escPos - 'a' + 10);
929 		++escPos;
930 	}
931 
932 	if ( escPos == bufEnd ) return -1;	// Partial escape.
933 	if ( *escPos != ';' ) return 0;
934 
935 	size_t escLen = escPos - escStart + 1;
936 	if ( escLen < 5 ) return 0;	// ! Catch "&#x;".
937 
938 	if ( (escValue == kTab) || (escValue == kLF) || (escValue == kCR) ) return 0;	// An allowed escape.
939 
940 	return escLen;	// Found a full "prohibited" numeric escape.
941 
942 }	// CountControlEscape
943 
944 
945 // -------------------------------------------------------------------------------------------------
946 // ProcessUTF8Portion
947 // ------------------
948 //
949 // Early versions of the XMP spec mentioned allowing ISO Latin-1 input. There are also problems with
950 // some clients placing ASCII control characters within XMP values. This is an XML problem, the XML
951 // spec only allows tab (0x09), LF (0x0A), and CR (0x0D) from the 0x00..0x1F range. As a concession
952 // to this we scan 8-bit input for byte sequences that are not valid UTF-8 or in the 0x00..0x1F
953 // range and replace each byte as follows:
954 //   0x00..0x1F - Replace with a space, except for tab, CR, and LF.
955 //   0x7F       - Replace with a space. This is ASCII Delete, not allowed by ISO Latin-1.
956 //   0x80..0x9F - Replace with the UTF-8 for a corresponding Unicode character.
957 //   0xA0..0XFF - Replace with the UTF-8 for a corresponding Unicode character.
958 //
959 // The 0x80..0x9F range is not defined by Latin-1. But the Windows 1252 code page defines these and
960 // is otherwise the same as Latin-1.
961 //
962 // For at least historical compatibility reasons we also find and replace singly escaped ASCII
963 // control characters. The Expat parser we're using does not allow numeric escapes like "&#x10;".
964 // The XML spec is clear that raw controls are not allowed (in the RestrictedChar set), but it isn't
965 // as clear about numeric escapes for them. At any rate, Expat complains, so we treat the numeric
966 // escapes like raw characters and replace them with a space.
967 //
968 // We check for 1 or 2 hex digits ("&#x9;" or "&#x09;") and upper or lower case ("&#xA;" or "&#xa;").
969 // The full escape sequence is 5 or 6 bytes.
970 
971 static size_t
ProcessUTF8Portion(XMLParserAdapter * xmlParser,const XMP_Uns8 * buffer,size_t length,bool last)972 ProcessUTF8Portion ( XMLParserAdapter * xmlParser,
973 					 const XMP_Uns8 *   buffer,
974 					 size_t				length,
975 					 bool				last )
976 {
977 	const XMP_Uns8 * bufEnd = buffer + length;
978 
979 	const XMP_Uns8 * spanEnd;
980 
981 	// `buffer` is copied into this std::string. If `buffer` only
982 	// contains valid UTF-8 and no escape characters, then the copy
983 	// will be identical to the original, but invalid characters are
984 	// replaced - usually with a space character.  This std::string was
985 	// added as a performance fix for:
986 	// https://github.com/Exiv2/exiv2/security/advisories/GHSA-w8mv-g8qq-36mj
987 	// Previously, the code was repeatedly calling
988 	// `xmlParser->ParseBuffer()`, which turned out to have quadratic
989 	// complexity, because expat kept reparsing the entire string from
990 	// the beginning.
991 	std::string copy;
992 
993 	for ( spanEnd = buffer; spanEnd < bufEnd; ++spanEnd ) {
994 
995 		if ( (0x20 <= *spanEnd) && (*spanEnd <= 0x7E) && (*spanEnd != '&') ) {
996 			copy.push_back(*spanEnd);
997 			continue;	// A regular ASCII character.
998 		}
999 
1000 		if ( *spanEnd >= 0x80 ) {
1001 
1002 			// See if this is a multi-byte UTF-8 sequence, or a Latin-1 character to replace.
1003 
1004 			int uniLen = CountUTF8 ( spanEnd, bufEnd );
1005 
1006 			if ( uniLen > 0 ) {
1007 
1008 				// A valid UTF-8 character, keep it as-is.
1009 				copy.append((const char*)spanEnd, uniLen);
1010 				spanEnd += uniLen - 1;	// ! The loop increment will put back the +1.
1011 
1012 			} else if ( (uniLen < 0) && (! last) ) {
1013 
1014 				// Have a partial UTF-8 character at the end of the buffer and more input coming.
1015 				xmlParser->ParseBuffer ( copy.c_str(), copy.size(), false );
1016 				return (spanEnd - buffer);
1017 
1018 			} else {
1019 
1020 				// Not a valid UTF-8 sequence. Replace the first byte with the Latin-1 equivalent.
1021 				const char * replacement = kReplaceLatin1 [ *spanEnd - 0x80 ];
1022 				copy.append ( replacement );
1023 
1024 			}
1025 
1026 		} else if ( (*spanEnd < 0x20) || (*spanEnd == 0x7F) ) {
1027 
1028 			// Replace ASCII controls other than tab, LF, and CR with a space.
1029 
1030 			if ( (*spanEnd == kTab) || (*spanEnd == kLF) || (*spanEnd == kCR) ) {
1031 				copy.push_back(*spanEnd);
1032 				continue;
1033 			}
1034 
1035 			copy.push_back(' ');
1036 
1037 		} else {
1038 
1039 			// See if this is a numeric escape sequence for a prohibited ASCII control.
1040 
1041 			XMP_Assert ( *spanEnd == '&' );
1042 			int escLen = CountControlEscape ( spanEnd, bufEnd );
1043 
1044 			if ( escLen < 0 ) {
1045 
1046 				// Have a partial numeric escape in this buffer, wait for more input.
1047 				if ( last ) {
1048 					copy.push_back('&');
1049 					continue;	// No more buffers, not an escape, absorb as normal input.
1050 				}
1051 				xmlParser->ParseBuffer ( copy.c_str(), copy.size(), false );
1052 				return (spanEnd - buffer);
1053 
1054 			} else if ( escLen > 0 ) {
1055 
1056 				// Have a complete numeric escape to replace.
1057 				copy.push_back(' ');
1058 				spanEnd = spanEnd + escLen - 1;	// ! The loop continuation will increment spanEnd!
1059 
1060 			} else {
1061 				copy.push_back('&');
1062 			}
1063 
1064 		}
1065 
1066 	}
1067 
1068 	XMP_Assert ( spanEnd == bufEnd );
1069 	copy.push_back(' ');
1070 	xmlParser->ParseBuffer ( copy.c_str(), copy.size(), true );
1071 	return length;
1072 
1073 }	// ProcessUTF8Portion
1074 
1075 
1076 // -------------------------------------------------------------------------------------------------
1077 // ParseFromBuffer
1078 // ---------------
1079 //
1080 // Although most clients will probably parse everything in one call, we have a buffered API model
1081 // and need to support even the extreme case of 1 byte at a time parsing. This is considerably
1082 // complicated by some special cases for 8-bit input. Because of this, the first thing we do is
1083 // determine whether the input is 8-bit, UTF-16, or UTF-32.
1084 //
1085 // Both the 8-bit special cases and the encoding determination are easier to do with 8 bytes or more
1086 // of input. The XMLParserAdapter class has a pending-input buffer for this. At the start of parsing
1087 // we (might) try to fill this buffer before determining the input character encoding. After that,
1088 // we (might) use this buffer with the current input to simplify the logic in Process8BitInput. The
1089 // "(might)" part means that we don't actually use the pending-input buffer unless we have to. In
1090 // particular, the common case of single-buffer parsing won't use it.
1091 
1092 void
ParseFromBuffer(XMP_StringPtr buffer,XMP_StringLen xmpSize,XMP_OptionBits options)1093 XMPMeta::ParseFromBuffer ( XMP_StringPtr  buffer,
1094 						   XMP_StringLen  xmpSize,
1095 						   XMP_OptionBits options )
1096 {
1097 	if ( (buffer == 0) && (xmpSize != 0) ) XMP_Throw ( "Null parse buffer", kXMPErr_BadParam );
1098 	if ( xmpSize == kXMP_UseNullTermination ) xmpSize = strlen ( buffer );
1099 
1100 	const bool lastClientCall = ((options & kXMP_ParseMoreBuffers) == 0);	// *** Could use FlagIsSet & FlagIsClear macros.
1101 
1102 	this->tree.ClearNode();	// Make sure the target XMP object is totally empty.
1103 
1104 	if ( this->xmlParser == 0 ) {
1105 		if ( (xmpSize == 0) && lastClientCall ) return;	// Tolerate empty parse. Expat complains if there are no XML elements.
1106 		this->xmlParser = XMP_NewExpatAdapter();
1107 	}
1108 
1109 	XMLParserAdapter& parser = *this->xmlParser;
1110 
1111 	#if 0	// XMP_DebugBuild
1112 		if ( parser.parseLog != 0 ) {
1113 			char message [200];	// AUDIT: Using sizeof(message) below for snprintf length is safe.
1114 			snprintf ( message, sizeof(message), "<!-- ParseFromBuffer, length = %d, options = %X%s -->",	// AUDIT: See above.
1115 					   xmpSize, options, (lastClientCall ? " (last)" : "") );
1116 			fwrite ( message, 1, strlen(message), parser.parseLog );
1117 			fflush ( parser.parseLog );
1118 		}
1119 	#endif
1120 
1121 	try {	// Cleanup the tree and xmlParser if anything fails.
1122 
1123 		// Determine the character encoding before doing any real parsing. This is needed to do the
1124 		// 8-bit special processing.
1125 
1126 		if ( parser.charEncoding == XMP_OptionBits(-1) ) {
1127 
1128 			if ( (parser.pendingCount == 0) && (xmpSize >= kXMLPendingInputMax) ) {
1129 
1130 				// This ought to be the common case, the first buffer is big enough.
1131 				parser.charEncoding = DetermineInputEncoding ( (XMP_Uns8*)buffer, xmpSize );
1132 
1133 			} else {
1134 
1135 				// Try to fill the pendingInput buffer before calling DetermineInputEncoding.
1136 
1137 				size_t pendingOverlap = kXMLPendingInputMax - parser.pendingCount;
1138 				if ( pendingOverlap > xmpSize ) pendingOverlap = xmpSize;
1139 
1140 				memcpy ( &parser.pendingInput[parser.pendingCount], buffer, pendingOverlap );	// AUDIT: Count is safe.
1141 				buffer += pendingOverlap;
1142 				xmpSize -= pendingOverlap;
1143 				parser.pendingCount += pendingOverlap;
1144 
1145 				if ( (! lastClientCall) && (parser.pendingCount < kXMLPendingInputMax) ) return;
1146 				parser.charEncoding = DetermineInputEncoding ( parser.pendingInput, parser.pendingCount );
1147 
1148 				#if Trace_ParsingHackery
1149 					fprintf ( stderr, "XMP Character encoding is %d\n", parser.charEncoding );
1150 				#endif
1151 
1152 			}
1153 
1154 		}
1155 
1156 		// We have the character encoding. Process UTF-16 and UTF-32 as is. UTF-8 needs special
1157 		// handling to take care of things like ISO Latin-1 or unescaped ASCII controls.
1158 
1159 		XMP_Assert ( parser.charEncoding != XMP_OptionBits(-1) );
1160 
1161 		if ( parser.charEncoding != kXMP_EncodeUTF8 ) {
1162 
1163 			if ( parser.pendingCount > 0 ) {
1164 				// Might have pendingInput from the above portion to determine the character encoding.
1165 				parser.ParseBuffer ( parser.pendingInput, parser.pendingCount, false );
1166 			}
1167 			parser.ParseBuffer ( buffer, xmpSize, lastClientCall );
1168 
1169 		} else {
1170 
1171 			#if Trace_ParsingHackery
1172 				fprintf ( stderr, "Parsing %d bytes @ %.8X, %s, %d pending, context: %.8s\n",
1173 						  xmpSize, buffer, (lastClientCall ? "last" : "not last"), parser.pendingCount, buffer );
1174 			#endif
1175 
1176 			// The UTF-8 processing is a bit complex due to the need to tolerate ISO Latin-1 input.
1177 			// This is done by scanning the input for byte sequences that are not valid UTF-8,
1178 			// assuming they are Latin-1 characters in the range 0x80..0xFF. This requires saving a
1179 			// pending input buffer to handle partial UTF-8 sequences at the end of a buffer.
1180 
1181 			while ( parser.pendingCount > 0 ) {
1182 
1183 				// We've got some leftover input, process it first then continue with the current
1184 				// buffer. Try to fill the pendingInput buffer before parsing further. We use a loop
1185 				// for weird edge cases like a 2 byte input buffer, using 1 byte for pendingInput,
1186 				// then having a partial UTF-8 end and need to absorb more.
1187 
1188 				size_t pendingOverlap = kXMLPendingInputMax - parser.pendingCount;
1189 				if ( pendingOverlap > xmpSize ) pendingOverlap = xmpSize;
1190 
1191 				memcpy ( &parser.pendingInput[parser.pendingCount], buffer, pendingOverlap );	// AUDIT: Count is safe.
1192 				parser.pendingCount += pendingOverlap;
1193 				buffer += pendingOverlap;
1194 				xmpSize -= pendingOverlap;
1195 
1196 				if ( (! lastClientCall) && (parser.pendingCount < kXMLPendingInputMax) ) return;
1197 				size_t bytesDone = ProcessUTF8Portion ( &parser, parser.pendingInput, parser.pendingCount, lastClientCall );
1198 				size_t bytesLeft = parser.pendingCount - bytesDone;
1199 
1200 				#if Trace_ParsingHackery
1201 					fprintf ( stderr, "   ProcessUTF8Portion handled %d pending bytes\n", bytesDone );
1202 				#endif
1203 
1204 				if ( bytesDone == parser.pendingCount ) {
1205 
1206 					// Done with all of the pending input, move on to the current buffer.
1207 					parser.pendingCount = 0;
1208 
1209 				} else if ( bytesLeft <= pendingOverlap ) {
1210 
1211 					// The leftover pending input all came from the current buffer. Exit this loop.
1212 					buffer -= bytesLeft;
1213 					xmpSize += bytesLeft;
1214 					parser.pendingCount = 0;
1215 
1216 				} else if ( xmpSize > 0 ) {
1217 
1218 					// Pull more of the current buffer into the pending input and try again.
1219 					// Backup by this pass's overlap so the loop entry code runs OK.
1220 					parser.pendingCount -= pendingOverlap;
1221 					buffer -= pendingOverlap;
1222 					xmpSize += pendingOverlap;
1223 
1224 				} else {
1225 
1226 					// There is no more of the current buffer. Wait for more. Partial sequences at
1227 					// the end of the last buffer should be treated as Latin-1 by ProcessUTF8Portion.
1228 					XMP_Assert ( ! lastClientCall );
1229 					parser.pendingCount = bytesLeft;
1230 					memcpy ( &parser.pendingInput[0], &parser.pendingInput[bytesDone], bytesLeft );	// AUDIT: Count is safe.
1231 					return;
1232 
1233 				}
1234 
1235 			}
1236 
1237 			// Done with the pending input, process the current buffer.
1238 
1239 			size_t bytesDone = ProcessUTF8Portion ( &parser, (XMP_Uns8*)buffer, xmpSize, lastClientCall );
1240 
1241 			#if Trace_ParsingHackery
1242 				fprintf ( stderr, "   ProcessUTF8Portion handled %d additional bytes\n", bytesDone );
1243 			#endif
1244 
1245 			if ( bytesDone < xmpSize ) {
1246 
1247 				XMP_Assert ( ! lastClientCall );
1248 				size_t bytesLeft = xmpSize - bytesDone;
1249 				if ( bytesLeft > kXMLPendingInputMax ) XMP_Throw ( "Parser bytesLeft too large", kXMPErr_InternalFailure );
1250 
1251 				memcpy ( parser.pendingInput, &buffer[bytesDone], bytesLeft );	// AUDIT: Count is safe.
1252 				parser.pendingCount = bytesLeft;
1253 				return;	// Wait for the next buffer.
1254 
1255 			}
1256 
1257 		}
1258 
1259 		if ( lastClientCall ) {
1260 
1261 			#if XMP_DebugBuild && DumpXMLParseTree
1262 				if ( parser.parseLog == 0 ) parser.parseLog = stdout;
1263 				DumpXMLTree ( parser.parseLog, parser.tree, 0 );
1264 			#endif
1265 
1266 			const XML_Node * xmlRoot = FindRootNode ( this, *this->xmlParser, options );
1267 
1268 			if ( xmlRoot != 0 ) {
1269 
1270 				ProcessRDF ( &this->tree, *xmlRoot, options );
1271 				NormalizeDCArrays ( &this->tree );
1272 				if ( this->tree.options & kXMP_PropHasAliases ) MoveExplicitAliases ( &this->tree, options );
1273 				TouchUpDataModel ( this );
1274 
1275 				// Delete empty schema nodes. Do this last, other cleanup can make empty schema.
1276 				size_t schemaNum = 0;
1277 				while ( schemaNum < this->tree.children.size() ) {
1278 					XMP_Node * currSchema = this->tree.children[schemaNum];
1279 					if ( currSchema->children.size() > 0 ) {
1280 						++schemaNum;
1281 					} else {
1282 						delete this->tree.children[schemaNum];	// ! Delete the schema node itself.
1283 						this->tree.children.erase ( this->tree.children.begin() + schemaNum );
1284 					}
1285 				}
1286 
1287 			}
1288 
1289 			delete this->xmlParser;
1290 			this->xmlParser = 0;
1291 
1292 		}
1293 
1294 	} catch ( ... ) {
1295 
1296 		delete this->xmlParser;
1297 		this->xmlParser = 0;
1298 		prevTkVer = 0;
1299 		this->tree.ClearNode();
1300 		throw;
1301 
1302 	}
1303 
1304 }	// ParseFromBuffer
1305 
1306 // =================================================================================================
1307