1 // =================================================================================================
2 // Copyright 2002-2008 Adobe Systems Incorporated
3 // All Rights Reserved.
4 //
5 // NOTICE: Adobe permits you to use, modify, and distribute this file in accordance with the terms
6 // of the Adobe license agreement accompanying it.
7 //
8 // Adobe patent application tracking #P435, entitled 'Unique markers to simplify embedding data of
9 // one format in a file with a different format', inventors: Sean Parent, Greg Gilley.
10 // =================================================================================================
11
12 #include "XMP_Environment.h" // ! This must be the first include!
13 #include "XMPCore_Impl.hpp"
14
15 #include "XMPMeta.hpp"
16 #include "XMPUtils.hpp"
17
18 #include "UnicodeInlines.incl_cpp"
19 #include "UnicodeConversions.hpp"
20 #include "ExpatAdapter.hpp"
21
22 #if XMP_DebugBuild
23 #include <iostream>
24 #endif
25
26 using namespace std;
27
28 #if XMP_WinBuild
29 #ifdef _MSC_VER
30 #pragma warning ( disable : 4533 ) // initialization of '...' is skipped by 'goto ...'
31 #pragma warning ( disable : 4702 ) // unreachable code
32 #pragma warning ( disable : 4800 ) // forcing value to bool 'true' or 'false' (performance warning)
33 #pragma warning ( disable : 4996 ) // '...' was declared deprecated
34 #endif
35 #endif
36
37
38 // *** Use the XMP_PropIsXyz (Schema, Simple, Struct, Array, ...) macros
39 // *** Add debug codegen checks, e.g. that typical masking operations really work
40 // *** Change all uses of strcmp and strncmp to XMP_LitMatch and XMP_LitNMatch
41
42
43 // =================================================================================================
44 // Local Types and Constants
45 // =========================
46
47
48 // =================================================================================================
49 // Static Variables
50 // ================
51
// Trace_ParsingHackery defaults to 0 here unless the build has already defined it.
#if ! defined ( Trace_ParsingHackery )
	#define Trace_ParsingHackery 0
#endif
55
// UTF-8 replacement text for the byte values 0x80..0xFF, one entry per byte. The entry for a byte
// value b is kReplaceLatin1[b - 0x80].
static const char * kReplaceLatin1[128] =
{

	// Bytes 0x80..0x9F. Latin-1 leaves this range undefined, Windows code page 1252 defines most
	// of it. The bytes 0x81, 0x8D, 0x8F, 0x90, and 0x9D are formally undefined in 1252 as well;
	// the Windows conversion API maps them to U+0081 and so on, which are in XML's RestrictedChar
	// set, so a plain space is used for them instead.

	/* 0x80 */ "\xE2\x82\xAC", " ",            "\xE2\x80\x9A", "\xC6\x92",
	/* 0x84 */ "\xE2\x80\x9E", "\xE2\x80\xA6", "\xE2\x80\xA0", "\xE2\x80\xA1",
	/* 0x88 */ "\xCB\x86",     "\xE2\x80\xB0", "\xC5\xA0",     "\xE2\x80\xB9",
	/* 0x8C */ "\xC5\x92",     " ",            "\xC5\xBD",     " ",

	/* 0x90 */ " ",            "\xE2\x80\x98", "\xE2\x80\x99", "\xE2\x80\x9C",
	/* 0x94 */ "\xE2\x80\x9D", "\xE2\x80\xA2", "\xE2\x80\x93", "\xE2\x80\x94",
	/* 0x98 */ "\xCB\x9C",     "\xE2\x84\xA2", "\xC5\xA1",     "\xE2\x80\xBA",
	/* 0x9C */ "\xC5\x93",     " ",            "\xC5\xBE",     "\xC5\xB8",

	// Bytes 0xA0..0xFF. These are the official Latin-1 characters, given here as the UTF-8 forms
	// of U+00A0..U+00FF (the Unicode Latin Supplement range).

	/* 0xA0 */ "\xC2\xA0", "\xC2\xA1", "\xC2\xA2", "\xC2\xA3", "\xC2\xA4", "\xC2\xA5", "\xC2\xA6", "\xC2\xA7",
	/* 0xA8 */ "\xC2\xA8", "\xC2\xA9", "\xC2\xAA", "\xC2\xAB", "\xC2\xAC", "\xC2\xAD", "\xC2\xAE", "\xC2\xAF",

	/* 0xB0 */ "\xC2\xB0", "\xC2\xB1", "\xC2\xB2", "\xC2\xB3", "\xC2\xB4", "\xC2\xB5", "\xC2\xB6", "\xC2\xB7",
	/* 0xB8 */ "\xC2\xB8", "\xC2\xB9", "\xC2\xBA", "\xC2\xBB", "\xC2\xBC", "\xC2\xBD", "\xC2\xBE", "\xC2\xBF",

	/* 0xC0 */ "\xC3\x80", "\xC3\x81", "\xC3\x82", "\xC3\x83", "\xC3\x84", "\xC3\x85", "\xC3\x86", "\xC3\x87",
	/* 0xC8 */ "\xC3\x88", "\xC3\x89", "\xC3\x8A", "\xC3\x8B", "\xC3\x8C", "\xC3\x8D", "\xC3\x8E", "\xC3\x8F",

	/* 0xD0 */ "\xC3\x90", "\xC3\x91", "\xC3\x92", "\xC3\x93", "\xC3\x94", "\xC3\x95", "\xC3\x96", "\xC3\x97",
	/* 0xD8 */ "\xC3\x98", "\xC3\x99", "\xC3\x9A", "\xC3\x9B", "\xC3\x9C", "\xC3\x9D", "\xC3\x9E", "\xC3\x9F",

	/* 0xE0 */ "\xC3\xA0", "\xC3\xA1", "\xC3\xA2", "\xC3\xA3", "\xC3\xA4", "\xC3\xA5", "\xC3\xA6", "\xC3\xA7",
	/* 0xE8 */ "\xC3\xA8", "\xC3\xA9", "\xC3\xAA", "\xC3\xAB", "\xC3\xAC", "\xC3\xAD", "\xC3\xAE", "\xC3\xAF",

	/* 0xF0 */ "\xC3\xB0", "\xC3\xB1", "\xC3\xB2", "\xC3\xB3", "\xC3\xB4", "\xC3\xB5", "\xC3\xB6", "\xC3\xB7",
	/* 0xF8 */ "\xC3\xB8", "\xC3\xB9", "\xC3\xBA", "\xC3\xBB", "\xC3\xBC", "\xC3\xBD", "\xC3\xBE", "\xC3\xBF",

};
96
97
98 // =================================================================================================
99 // Local Utilities
100 // ===============
101
102
// IsHexDigit is true for '0'..'9' and for upper case 'A'..'F' only, lower case 'a'..'f' is not
// accepted. HexDigitValue gives the numeric value of a character that IsHexDigit accepts.
// ! Both are macros, the argument is evaluated more than once.

#define IsHexDigit(ch)		( ( ((ch) >= '0') && ((ch) <= '9') ) || ( ((ch) >= 'A') && ((ch) <= 'F') ) )
#define HexDigitValue(ch)	( ( ((ch) - '0') >= 10 ) ? ((ch) - 'A' + 10) : ((ch) - '0') )
105
106
107 // -------------------------------------------------------------------------------------------------
108 // PickBestRoot
109 // ------------
PickBestRoot(const XML_Node & xmlParent,XMP_OptionBits options)110 static const XML_Node * PickBestRoot ( const XML_Node & xmlParent, XMP_OptionBits options )
111 {
112
113 // Look among this parent's content for x:xmpmeta. The recursion for x:xmpmeta is broader than
114 // the strictly defined choice, but gives us smaller code.
115 for ( size_t childNum = 0, childLim = xmlParent.content.size(); childNum < childLim; ++childNum ) {
116 const XML_Node * childNode = xmlParent.content[childNum];
117 if ( childNode->kind != kElemNode ) continue;
118 if ( (childNode->name == "x:xmpmeta") || (childNode->name == "x:xapmeta") ) return PickBestRoot ( *childNode, 0 );
119 }
120 // Look among this parent's content for a bare rdf:RDF if that is allowed.
121 if ( ! (options & kXMP_RequireXMPMeta) ) {
122 for ( size_t childNum = 0, childLim = xmlParent.content.size(); childNum < childLim; ++childNum ) {
123 const XML_Node * childNode = xmlParent.content[childNum];
124 if ( childNode->kind != kElemNode ) continue;
125 if ( childNode->name == "rdf:RDF" ) return childNode;
126 }
127 }
128
129 // Recurse into the content.
130 for ( size_t childNum = 0, childLim = xmlParent.content.size(); childNum < childLim; ++childNum ) {
131 const XML_Node * foundRoot = PickBestRoot ( *xmlParent.content[childNum], options );
132 if ( foundRoot != 0 ) return foundRoot;
133 }
134
135 return 0;
136
137 } // PickBestRoot
138
139 // -------------------------------------------------------------------------------------------------
140 // FindRootNode
141 // ------------
142 //
143 // Find the XML node that is the root of the XMP data tree. Generally this will be an outer node,
144 // but it could be anywhere if a general XML document is parsed (e.g. SVG). The XML parser counted
145 // all possible root nodes, and kept a pointer to the last one. If there is more than one possible
146 // root use PickBestRoot to choose among them.
147 //
148 // If there is a root node, try to extract the version of the previous XMP toolkit.
149
FindRootNode(XMPMeta * thiz,const XMLParserAdapter & xmlParser,XMP_OptionBits options)150 static const XML_Node * FindRootNode ( XMPMeta * thiz, const XMLParserAdapter & xmlParser, XMP_OptionBits options )
151 {
152 const XML_Node * rootNode = xmlParser.rootNode;
153
154 if ( xmlParser.rootCount > 1 ) rootNode = PickBestRoot ( xmlParser.tree, options );
155 if ( rootNode == 0 ) return 0;
156
157 // We have a root node. Try to extract previous toolkit version number.
158
159 XMP_StringPtr verStr = "";
160
161 XMP_Assert ( rootNode->name == "rdf:RDF" );
162
163 if ( (options & kXMP_RequireXMPMeta) &&
164 ((rootNode->parent == 0) ||
165 ((rootNode->parent->name != "x:xmpmeta") && (rootNode->parent->name != "x:xapmeta"))) ) return 0;
166
167 for ( size_t attrNum = 0, attrLim = rootNode->parent->attrs.size(); attrNum < attrLim; ++attrNum ) {
168 const XML_Node * currAttr =rootNode->parent->attrs[attrNum];
169 if ( (currAttr->name == "x:xmptk") || (currAttr->name == "x:xaptk") ) {
170 verStr = currAttr->value.c_str();
171 break;
172 }
173 }
174
175 // Decode the version number into MMmmuubbb digits. If any part is too big, peg it at 99 or 999.
176
177 unsigned long part;
178 while ( (*verStr != 0) && ((*verStr < '0') || (*verStr > '9')) ) ++verStr;
179
180 part = 0;
181 while ( (*verStr != 0) && ('0' <= *verStr) && (*verStr <= '9') ) {
182 part = (part * 10) + (*verStr - '0');
183 ++verStr;
184 }
185 if ( part > 99 ) part = 99;
186 thiz->prevTkVer = part * 100*100*1000;
187
188 part = 0;
189 if ( *verStr == '.' ) ++verStr;
190 while ( (*verStr != 0) && ('0' <= *verStr) && (*verStr <= '9') ) {
191 part = (part * 10) + (*verStr - '0');
192 ++verStr;
193 }
194 if ( part > 99 ) part = 99;
195 thiz->prevTkVer += part * 100*1000;
196
197 part = 0;
198 if ( *verStr == '.' ) ++verStr;
199 while ( (*verStr != 0) && ('0' <= *verStr) && (*verStr <= '9') ) {
200 part = (part * 10) + (*verStr - '0');
201 ++verStr;
202 }
203 if ( part > 99 ) part = 99;
204 thiz->prevTkVer += part * 1000;
205
206 part = 0;
207 if ( *verStr == '-' ) ++verStr;
208 while ( (*verStr != 0) && ('0' <= *verStr) && (*verStr <= '9') ) {
209 part = (part * 10) + (*verStr - '0');
210 ++verStr;
211 }
212 if ( part > 999 ) part = 999;
213 thiz->prevTkVer += part;
214
215 return rootNode;
216
217 } // FindRootNode
218
219 // -------------------------------------------------------------------------------------------------
220 // NormalizeDCArrays
221 // -----------------
222 //
223 // Undo the denormalization performed by the XMP used in Acrobat 5. If a Dublin Core array had only
224 // one item, it was serialized as a simple property. The xml:lang attribute was dropped from an
225 // alt-text item if the language was x-default.
226
227 // *** This depends on the dc: namespace prefix.
228
229 static void
NormalizeDCArrays(XMP_Node * xmpTree)230 NormalizeDCArrays ( XMP_Node * xmpTree )
231 {
232 XMP_Node * dcSchema = FindSchemaNode ( xmpTree, kXMP_NS_DC, kXMP_ExistingOnly );
233 if ( dcSchema == 0 ) return;
234
235 for ( size_t propNum = 0, propLimit = dcSchema->children.size(); propNum < propLimit; ++propNum ) {
236 XMP_Node * currProp = dcSchema->children[propNum];
237 XMP_OptionBits arrayForm = 0;
238
239 if ( ! XMP_PropIsSimple ( currProp->options ) ) continue; // Nothing to do if not simple.
240
241 if ( (currProp->name == "dc:creator" ) || // See if it is supposed to be an array.
242 (currProp->name == "dc:date" ) ) { // *** Think about an array of char* and a loop.
243 arrayForm = kXMP_PropArrayIsOrdered;
244 } else if (
245 (currProp->name == "dc:description" ) ||
246 (currProp->name == "dc:rights" ) ||
247 (currProp->name == "dc:title" ) ) {
248 arrayForm = kXMP_PropArrayIsAltText;
249 } else if (
250 (currProp->name == "dc:contributor" ) ||
251 (currProp->name == "dc:language" ) ||
252 (currProp->name == "dc:publisher" ) ||
253 (currProp->name == "dc:relation" ) ||
254 (currProp->name == "dc:subject" ) ||
255 (currProp->name == "dc:type" ) ) {
256 arrayForm = kXMP_PropValueIsArray;
257 }
258 if ( arrayForm == 0 ) continue; // Nothing to do if it isn't supposed to be an array.
259
260 arrayForm = VerifySetOptions ( arrayForm, 0 ); // Set the implicit array bits.
261 XMP_Node * newArray = new XMP_Node ( dcSchema, currProp->name.c_str(), arrayForm );
262 dcSchema->children[propNum] = newArray;
263 newArray->children.push_back ( currProp );
264 currProp->parent = newArray;
265 currProp->name = kXMP_ArrayItemName;
266
267 if ( XMP_ArrayIsAltText ( arrayForm ) && (! (currProp->options & kXMP_PropHasLang)) ) {
268 XMP_Node * newLang = new XMP_Node ( currProp, "xml:lang", "x-default", kXMP_PropIsQualifier );
269 currProp->options |= (kXMP_PropHasQualifiers | kXMP_PropHasLang);
270 if ( currProp->qualifiers.empty() ) { // *** Need a util?
271 currProp->qualifiers.push_back ( newLang );
272 } else {
273 currProp->qualifiers.insert ( currProp->qualifiers.begin(), newLang );
274 }
275 }
276
277 }
278
279 } // NormalizeDCArrays
280
281
282 // -------------------------------------------------------------------------------------------------
283 // CompareAliasedSubtrees
284 // ----------------------
285
286 // *** Change to do some alias-specific setup, then use CompareSubtrees. One special case for
287 // *** aliases is a simple to x-default alias, the options and qualifiers obviously differ.
288
289 static void
CompareAliasedSubtrees(XMP_Node * aliasNode,XMP_Node * baseNode,bool outerCall=true)290 CompareAliasedSubtrees ( XMP_Node * aliasNode, XMP_Node * baseNode, bool outerCall = true )
291 {
292 // ! The outermost call is special. The names almost certainly differ. The qualifiers (and
293 // ! hence options) will differ for an alias to the x-default item of a langAlt array.
294 if ( (aliasNode->value != baseNode->value) ||
295 (aliasNode->children.size() != baseNode->children.size()) ) {
296 XMP_Throw ( "Mismatch between alias and base nodes", kXMPErr_BadXMP );
297 }
298 if ( ! outerCall ) {
299 if ( (aliasNode->name != baseNode->name) ||
300 (aliasNode->options != baseNode->options) ||
301 (aliasNode->qualifiers.size() != baseNode->qualifiers.size()) ) {
302 XMP_Throw ( "Mismatch between alias and base nodes", kXMPErr_BadXMP );
303 }
304 }
305
306 for ( size_t childNum = 0, childLim = aliasNode->children.size(); childNum < childLim; ++childNum ) {
307 XMP_Node * aliasChild = aliasNode->children[childNum];
308 XMP_Node * baseChild = baseNode->children[childNum];
309 CompareAliasedSubtrees ( aliasChild, baseChild, false );
310 }
311
312 for ( size_t qualNum = 0, qualLim = aliasNode->qualifiers.size(); qualNum < qualLim; ++qualNum ) {
313 XMP_Node * aliasQual = aliasNode->qualifiers[qualNum];
314 XMP_Node * baseQual = baseNode->qualifiers[qualNum];
315 CompareAliasedSubtrees ( aliasQual, baseQual, false );
316 }
317
318 } // CompareAliasedSubtrees
319
320
321 // -------------------------------------------------------------------------------------------------
322 // TransplantArrayItemAlias
323 // ------------------------
324
325 static void
TransplantArrayItemAlias(XMP_Node * oldParent,size_t oldNum,XMP_Node * newParent)326 TransplantArrayItemAlias ( XMP_Node * oldParent, size_t oldNum, XMP_Node * newParent )
327 {
328 XMP_Node * childNode = oldParent->children[oldNum];
329
330 if ( newParent->options & kXMP_PropArrayIsAltText ) {
331 if ( childNode->options & kXMP_PropHasLang ) {
332 XMP_Throw ( "Alias to x-default already has a language qualifier", kXMPErr_BadXMP ); // *** Allow x-default.
333 }
334 childNode->options |= (kXMP_PropHasQualifiers | kXMP_PropHasLang);
335 XMP_Node * langQual = new XMP_Node ( childNode, "xml:lang", "x-default", kXMP_PropIsQualifier ); // *** AddLangQual util?
336 if ( childNode->qualifiers.empty() ) {
337 childNode->qualifiers.push_back ( langQual );
338 } else {
339 childNode->qualifiers.insert ( childNode->qualifiers.begin(), langQual );
340 }
341 }
342
343 oldParent->children.erase ( oldParent->children.begin() + oldNum );
344 childNode->name = kXMP_ArrayItemName;
345 childNode->parent = newParent;
346 if ( newParent->children.empty() ) {
347 newParent->children.push_back ( childNode );
348 } else {
349 newParent->children.insert ( newParent->children.begin(), childNode );
350 }
351
352 } // TransplantArrayItemAlias
353
354
355 // -------------------------------------------------------------------------------------------------
356 // TransplantNamedAlias
357 // --------------------
358
359 static void
TransplantNamedAlias(XMP_Node * oldParent,size_t oldNum,XMP_Node * newParent,XMP_VarString & newName)360 TransplantNamedAlias ( XMP_Node * oldParent, size_t oldNum, XMP_Node * newParent, XMP_VarString & newName )
361 {
362 XMP_Node * childNode = oldParent->children[oldNum];
363
364 oldParent->children.erase ( oldParent->children.begin() + oldNum );
365 childNode->name = newName;
366 childNode->parent = newParent;
367 newParent->children.push_back ( childNode );
368
369 } // TransplantNamedAlias
370
371
372 // -------------------------------------------------------------------------------------------------
373 // MoveExplicitAliases
374 // -------------------
375
376 static void
MoveExplicitAliases(XMP_Node * tree,XMP_OptionBits parseOptions)377 MoveExplicitAliases ( XMP_Node * tree, XMP_OptionBits parseOptions )
378 {
379 tree->options ^= kXMP_PropHasAliases;
380 const bool strictAliasing = ((parseOptions & kXMP_StrictAliasing) != 0);
381
382 // Visit all of the top level nodes looking for aliases. If there is no base, transplant the
383 // alias subtree. If there is a base and strict aliasing is on, make sure the alias and base
384 // subtrees match.
385
386 // ! Use "while" loops not "for" loops since both the schema and property loops can remove the
387 // ! current item from the vector being traversed. And don't increment the counter for a delete.
388
389 size_t schemaNum = 0;
390 while ( schemaNum < tree->children.size() ) {
391 XMP_Node * currSchema = tree->children[schemaNum];
392
393 size_t propNum = 0;
394 while ( propNum < currSchema->children.size() ) {
395 XMP_Node * currProp = currSchema->children[propNum];
396 if ( ! (currProp->options & kXMP_PropIsAlias) ) {
397 ++propNum;
398 continue;
399 }
400 currProp->options ^= kXMP_PropIsAlias;
401
402 // Find the base path, look for the base schema and root node.
403
404 XMP_AliasMapPos aliasPos = sRegisteredAliasMap->find ( currProp->name );
405 XMP_Assert ( aliasPos != sRegisteredAliasMap->end() );
406 XMP_ExpandedXPath & basePath = aliasPos->second;
407 XMP_OptionBits arrayOptions = (basePath[kRootPropStep].options & kXMP_PropArrayFormMask);
408
409 XMP_Node * baseSchema = FindSchemaNode ( tree, basePath[kSchemaStep].step.c_str(), kXMP_CreateNodes );
410 if ( baseSchema->options & kXMP_NewImplicitNode ) baseSchema->options ^= kXMP_NewImplicitNode;
411 XMP_Node * baseNode = FindChildNode ( baseSchema, basePath[kRootPropStep].step.c_str(), kXMP_ExistingOnly );
412
413 if ( baseNode == 0 ) {
414
415 if ( basePath.size() == 2 ) {
416 // A top-to-top alias, transplant the property.
417 TransplantNamedAlias ( currSchema, propNum, baseSchema, basePath[kRootPropStep].step );
418 } else {
419 // An alias to an array item, create the array and transplant the property.
420 baseNode = new XMP_Node ( baseSchema, basePath[kRootPropStep].step.c_str(), arrayOptions );
421 baseSchema->children.push_back ( baseNode );
422 TransplantArrayItemAlias ( currSchema, propNum, baseNode );
423 }
424
425 } else if ( basePath.size() == 2 ) {
426
427 // The base node does exist and this is a top-to-top alias. Check for conflicts if
428 // strict aliasing is on. Remove and delete the alias subtree.
429 if ( strictAliasing ) CompareAliasedSubtrees ( currProp, baseNode );
430 currSchema->children.erase ( currSchema->children.begin() + propNum );
431 delete currProp;
432
433 } else {
434
435 // This is an alias to an array item and the array exists. Look for the aliased item.
436 // Then transplant or check & delete as appropriate.
437
438 XMP_Node * itemNode = 0;
439 if ( arrayOptions & kXMP_PropArrayIsAltText ) {
440 XMP_Index xdIndex = LookupLangItem ( baseNode, *xdefaultName );
441 if ( xdIndex != -1 ) itemNode = baseNode->children[xdIndex];
442 } else if ( ! baseNode->children.empty() ) {
443 itemNode = baseNode->children[0];
444 }
445
446 if ( itemNode == 0 ) {
447 TransplantArrayItemAlias ( currSchema, propNum, baseNode );
448 } else {
449 if ( strictAliasing ) CompareAliasedSubtrees ( currProp, itemNode );
450 currSchema->children.erase ( currSchema->children.begin() + propNum );
451 delete currProp;
452 }
453
454 }
455
456 } // Property loop
457
458 // Increment the counter or remove an empty schema node.
459 if ( currSchema->children.size() > 0 ) {
460 ++schemaNum;
461 } else {
462 delete tree->children[schemaNum]; // ! Delete the schema node itself.
463 tree->children.erase ( tree->children.begin() + schemaNum );
464 }
465
466 } // Schema loop
467
468 } // MoveExplicitAliases
469
470
471 // -------------------------------------------------------------------------------------------------
472 // FixGPSTimeStamp
473 // ---------------
474
475 static void
FixGPSTimeStamp(XMP_Node * exifSchema,XMP_Node * gpsDateTime)476 FixGPSTimeStamp ( XMP_Node * exifSchema, XMP_Node * gpsDateTime )
477 {
478 XMP_DateTime binGPSStamp;
479 try {
480 XMPUtils::ConvertToDate ( gpsDateTime->value.c_str(), &binGPSStamp );
481 } catch ( ... ) {
482 return; // Don't let a bad date stop other things.
483 }
484 if ( (binGPSStamp.year != 0) || (binGPSStamp.month != 0) || (binGPSStamp.day != 0) ) return;
485
486 XMP_Node * otherDate = FindChildNode ( exifSchema, "exif:DateTimeOriginal", kXMP_ExistingOnly );
487 if ( otherDate == 0 ) otherDate = FindChildNode ( exifSchema, "exif:DateTimeDigitized", kXMP_ExistingOnly );
488 if ( otherDate == 0 ) return;
489
490 XMP_DateTime binOtherDate;
491 try {
492 XMPUtils::ConvertToDate ( otherDate->value.c_str(), &binOtherDate );
493 } catch ( ... ) {
494 return; // Don't let a bad date stop other things.
495 }
496
497 binGPSStamp.year = binOtherDate.year;
498 binGPSStamp.month = binOtherDate.month;
499 binGPSStamp.day = binOtherDate.day;
500
501 XMP_StringPtr goodStr;
502 XMP_StringLen goodLen;
503 XMPUtils::ConvertFromDate ( binGPSStamp, &goodStr, &goodLen );
504
505 gpsDateTime->value.assign ( goodStr, goodLen );
506
507 } // FixGPSTimeStamp
508
509
510 // -------------------------------------------------------------------------------------------------
511 // MigrateAudioCopyright
512 // ---------------------
513 //
514 // The initial support for WAV files mapped a legacy ID3 audio copyright into a new xmpDM:copyright
515 // property. This is special case code to migrate that into dc:rights['x-default']. The rules:
516 //
517 // 1. If there is no dc:rights array, or an empty array -
518 // Create one with dc:rights['x-default'] set from double linefeed and xmpDM:copyright.
519 //
520 // 2. If there is a dc:rights array but it has no x-default item -
521 // Create an x-default item as a copy of the first item then apply rule #3.
522 //
523 // 3. If there is a dc:rights array with an x-default item, look for a double linefeed in the value.
524 // A. If no double linefeed, compare the x-default value to the xmpDM:copyright value.
525 // A1. If they match then leave the x-default value alone.
526 // A2. Otherwise, append a double linefeed and the xmpDM:copyright value to the x-default value.
527 // B. If there is a double linefeed, compare the trailing text to the xmpDM:copyright value.
528 // B1. If they match then leave the x-default value alone.
529 // B2. Otherwise, replace the trailing x-default text with the xmpDM:copyright value.
530 //
531 // 4. In all cases, delete the xmpDM:copyright property.
532
533 static void
MigrateAudioCopyright(XMPMeta * xmp,XMP_Node * dmCopyright)534 MigrateAudioCopyright ( XMPMeta * xmp, XMP_Node * dmCopyright )
535 {
536
537 try {
538
539 std::string & dmValue = dmCopyright->value;
540 static const char * kDoubleLF = "\xA\xA";
541
542 XMP_Node * dcSchema = FindSchemaNode ( &xmp->tree, kXMP_NS_DC, kXMP_CreateNodes );
543 XMP_Node * dcRightsArray = FindChildNode ( dcSchema, "dc:rights", kXMP_ExistingOnly );
544
545 if ( (dcRightsArray == 0) || dcRightsArray->children.empty() ) {
546
547 // 1. No dc:rights array, create from double linefeed and xmpDM:copyright.
548 dmValue.insert ( 0, kDoubleLF );
549 xmp->SetLocalizedText ( kXMP_NS_DC, "rights", "", "x-default", dmValue.c_str(), 0 );
550
551 } else {
552
553 std::string xdefaultStr ( "x-default" );
554
555 XMP_Index xdIndex = LookupLangItem ( dcRightsArray, xdefaultStr );
556
557 if ( xdIndex < 0 ) {
558 // 2. No x-default item, create from the first item.
559 XMP_StringPtr firstValue = dcRightsArray->children[0]->value.c_str();
560 xmp->SetLocalizedText ( kXMP_NS_DC, "rights", "", "x-default", firstValue, 0 );
561 xdIndex = LookupLangItem ( dcRightsArray, xdefaultStr );
562 }
563
564 // 3. Look for a double linefeed in the x-default value.
565 XMP_Assert ( xdIndex == 0 );
566 std::string & defaultValue = dcRightsArray->children[xdIndex]->value;
567 XMP_Index lfPos = defaultValue.find ( kDoubleLF );
568
569 if ( lfPos < 0 ) {
570
571 // 3A. No double LF, compare whole values.
572 if ( dmValue != defaultValue ) {
573 // 3A2. Append the xmpDM:copyright to the x-default item.
574 defaultValue += kDoubleLF;
575 defaultValue += dmValue;
576 }
577
578 } else {
579
580 // 3B. Has double LF, compare the tail.
581 if ( defaultValue.compare ( lfPos+2, std::string::npos, dmValue ) != 0 ) {
582 // 3B2. Replace the x-default tail.
583 defaultValue.replace ( lfPos+2, std::string::npos, dmValue );
584 }
585
586 }
587
588 }
589
590 // 4. Get rid of the xmpDM:copyright.
591 xmp->DeleteProperty ( kXMP_NS_DM, "copyright" );
592
593 } catch ( ... ) {
594 // Don't let failures (like a bad dc:rights form) stop other cleanup.
595 }
596
597 } // MigrateAudioCopyright
598
599
600 // -------------------------------------------------------------------------------------------------
601 // RepairAltText
602 // -------------
603 //
604 // Make sure that the array is well-formed AltText. Each item must be simple and have an xml:lang
605 // qualifier. If repairs are needed, keep simple non-empty items by adding the xml:lang.
606
607 static void
RepairAltText(XMP_Node & tree,XMP_StringPtr schemaNS,XMP_StringPtr arrayName)608 RepairAltText ( XMP_Node & tree, XMP_StringPtr schemaNS, XMP_StringPtr arrayName )
609 {
610 XMP_Node * schemaNode = FindSchemaNode ( &tree, schemaNS, kXMP_ExistingOnly );
611 if ( schemaNode == 0 ) return;
612
613 XMP_Node * arrayNode = FindChildNode ( schemaNode, arrayName, kXMP_ExistingOnly );
614 if ( (arrayNode == 0) || XMP_ArrayIsAltText ( arrayNode->options ) ) return; // Already OK.
615
616 if ( ! XMP_PropIsArray ( arrayNode->options ) ) return; // ! Not even an array, leave it alone.
617 // *** Should probably change simple values to LangAlt with 'x-default' item.
618
619 arrayNode->options |= (kXMP_PropArrayIsOrdered | kXMP_PropArrayIsAlternate | kXMP_PropArrayIsAltText);
620
621 for ( int i = arrayNode->children.size()-1; i >= 0; --i ) { // ! Need a signed index type.
622
623 XMP_Node * currChild = arrayNode->children[i];
624
625 if ( ! XMP_PropIsSimple ( currChild->options ) ) {
626
627 // Delete non-simple children.
628 delete ( currChild );
629 arrayNode->children.erase ( arrayNode->children.begin() + i );
630
631 } else if ( ! XMP_PropHasLang ( currChild->options ) ) {
632
633 if ( currChild->value.empty() ) {
634
635 // Delete empty valued children that have no xml:lang.
636 delete ( currChild );
637 arrayNode->children.erase ( arrayNode->children.begin() + i );
638
639 } else {
640
641 // Add an xml:lang qualifier with the value "x-repair".
642 XMP_Node * repairLang = new XMP_Node ( currChild, "xml:lang", "x-repair", kXMP_PropIsQualifier );
643 if ( currChild->qualifiers.empty() ) {
644 currChild->qualifiers.push_back ( repairLang );
645 } else {
646 currChild->qualifiers.insert ( currChild->qualifiers.begin(), repairLang );
647 }
648 currChild->options |= (kXMP_PropHasQualifiers | kXMP_PropHasLang);
649
650 }
651
652 }
653
654 }
655
656 } // RepairAltText
657
658
659 // -------------------------------------------------------------------------------------------------
660 // TouchUpDataModel
661 // ----------------
662
663 static void
TouchUpDataModel(XMPMeta * xmp)664 TouchUpDataModel ( XMPMeta * xmp )
665 {
666 XMP_Node & tree = xmp->tree;
667
668 // Do special case touch ups for certain schema.
669
670 XMP_Node * currSchema = 0;
671
672 currSchema = FindSchemaNode ( &tree, kXMP_NS_EXIF, kXMP_ExistingOnly );
673 if ( currSchema != 0 ) {
674
675 // Do a special case fix for exif:GPSTimeStamp.
676 XMP_Node * gpsDateTime = FindChildNode ( currSchema, "exif:GPSTimeStamp", kXMP_ExistingOnly );
677 if ( gpsDateTime != 0 ) FixGPSTimeStamp ( currSchema, gpsDateTime );
678
679 // *** Should probably have RepairAltText change simple values to LangAlt with 'x-default' item.
680 // *** For now just do this for exif:UserComment, the one case we know about, late in cycle fix.
681 XMP_Node * userComment = FindChildNode ( currSchema, "exif:UserComment", kXMP_ExistingOnly );
682 if ( (userComment != 0) && XMP_PropIsSimple ( userComment->options ) ) {
683 XMP_Node * newChild = new XMP_Node ( userComment, kXMP_ArrayItemName,
684 userComment->value.c_str(), userComment->options );
685 newChild->qualifiers.swap ( userComment->qualifiers );
686 if ( ! XMP_PropHasLang ( newChild->options ) ) {
687 XMP_Node * langQual = new XMP_Node ( newChild, "xml:lang", "x-default", kXMP_PropIsQualifier );
688 newChild->qualifiers.insert ( newChild->qualifiers.begin(), langQual );
689 newChild->options |= (kXMP_PropHasQualifiers | kXMP_PropHasLang);
690 }
691 userComment->value.erase();
692 userComment->options = kXMP_PropArrayFormMask; // ! Happens to have all the right bits.
693 userComment->children.push_back ( newChild );
694 }
695
696 }
697
698 currSchema = FindSchemaNode ( &tree, kXMP_NS_DM, kXMP_ExistingOnly );
699 if ( currSchema != 0 ) {
700 // Do a special case migration of xmpDM:copyright to dc:rights['x-default']. Do this before
701 // the dc: touch up since it can affect the dc: schema.
702 XMP_Node * dmCopyright = FindChildNode ( currSchema, "xmpDM:copyright", kXMP_ExistingOnly );
703 if ( dmCopyright != 0 ) MigrateAudioCopyright ( xmp, dmCopyright );
704 }
705
706 currSchema = FindSchemaNode ( &tree, kXMP_NS_DC, kXMP_ExistingOnly );
707 if ( currSchema != 0 ) {
708 // Do a special case fix for dc:subject, make sure it is an unordered array.
709 XMP_Node * dcSubject = FindChildNode ( currSchema, "dc:subject", kXMP_ExistingOnly );
710 if ( dcSubject != 0 ) {
711 XMP_OptionBits keepMask = static_cast<XMP_OptionBits>(~(kXMP_PropArrayIsOrdered | kXMP_PropArrayIsAlternate | kXMP_PropArrayIsAltText));
712 dcSubject->options &= keepMask; // Make sure any ordered array bits are clear.
713 }
714 }
715
716 // Fix any broken AltText arrays that we know about.
717
718 RepairAltText ( tree, kXMP_NS_DC, "dc:description" ); // ! Note inclusion of prefixes for direct node lookup!
719 RepairAltText ( tree, kXMP_NS_DC, "dc:rights" );
720 RepairAltText ( tree, kXMP_NS_DC, "dc:title" );
721 RepairAltText ( tree, kXMP_NS_XMP_Rights, "xmpRights:UsageTerms" );
722 RepairAltText ( tree, kXMP_NS_EXIF, "exif:UserComment" );
723
724 // Tweak old XMP: Move an instance ID from rdf:about to the xmpMM:InstanceID property. An old
725 // instance ID usually looks like "uuid:bac965c4-9d87-11d9-9a30-000d936b79c4", plus InDesign
726 // 3.0 wrote them like "bac965c4-9d87-11d9-9a30-000d936b79c4". If the name looks like a UUID
727 // simply move it to xmpMM:InstanceID, don't worry about any existing xmpMM:InstanceID. Both
728 // will only be present when a newer file with the xmpMM:InstanceID property is updated by an
729 // old app that uses rdf:about.
730
731 if ( ! tree.name.empty() ) {
732
733 bool nameIsUUID = false;
734 XMP_StringPtr nameStr = tree.name.c_str();
735
736 if ( XMP_LitNMatch ( nameStr, "uuid:", 5 ) ) {
737
738 nameIsUUID = true;
739
740 } else if ( tree.name.size() == 36 ) {
741
742 nameIsUUID = true; // ! Assume true, we'll set it to false below if not.
743 for ( int i = 0; i < 36; ++i ) {
744 char ch = nameStr[i];
745 if ( ch == '-' ) {
746 if ( (i == 8) || (i == 13) || (i == 18) || (i == 23) ) continue;
747 nameIsUUID = false;
748 break;
749 } else {
750 if ( (('0' <= ch) && (ch <= '9')) || (('a' <= ch) && (ch <= 'z')) ) continue;
751 nameIsUUID = false;
752 break;
753 }
754 }
755
756 }
757
758 if ( nameIsUUID ) {
759
760 XMP_ExpandedXPath expPath;
761 ExpandXPath ( kXMP_NS_XMP_MM, "InstanceID", &expPath );
762 XMP_Node * idNode = FindNode ( &tree, expPath, kXMP_CreateNodes, 0 );
763 if ( idNode == 0 ) XMP_Throw ( "Failure creating xmpMM:InstanceID", kXMPErr_InternalFailure );
764
765 idNode->options = 0; // Clobber any existing xmpMM:InstanceID.
766 idNode->value = tree.name;
767 idNode->RemoveChildren();
768 idNode->RemoveQualifiers();
769
770 tree.name.erase();
771
772 }
773
774 }
775
776 } // TouchUpDataModel
777
778
779 // -------------------------------------------------------------------------------------------------
780 // DetermineInputEncoding
781 // ----------------------
782 //
783 // Try to determine the character encoding, making a guess if the input is too short. We make some
784 // simplifying assumptions: the first character must be U+FEFF or ASCII, U+0000 is not allowed. The
785 // XML 1.1 spec is even more strict, UTF-16 XML documents must begin with U+FEFF, and the first
786 // "real" character must be '<'. Ignoring the XML declaration, the first XML character could be '<',
787 // space, tab, CR, or LF.
788 //
789 // The possible input sequences are:
790 //
791 // Cases with U+FEFF
792 // EF BB BF -- - UTF-8
793 // FE FF -- -- - Big endian UTF-16
794 // 00 00 FE FF - Big endian UTF-32
795 // FF FE 00 00 - Little endian UTF-32
796 // FF FE -- -- - Little endian UTF-16
797 //
798 // Cases with ASCII
799 // nn mm -- -- - UTF-8 -
800 // 00 00 00 nn - Big endian UTF-32
801 // 00 nn -- -- - Big endian UTF-16
802 // nn 00 00 00 - Little endian UTF-32
803 // nn 00 -- -- - Little endian UTF-16
804 //
805 // ! We don't check for full patterns, or for errors. We just check enough to determine what the
806 // ! only possible (or reasonable) case would be.
807
808 static XMP_OptionBits
DetermineInputEncoding(const XMP_Uns8 * buffer,size_t length)809 DetermineInputEncoding ( const XMP_Uns8 * buffer, size_t length )
810 {
811 if ( length < 2 ) return kXMP_EncodeUTF8;
812
813 XMP_Uns8 * uniChar = (XMP_Uns8*)buffer; // ! Make sure comparisons are unsigned.
814
815 if ( uniChar[0] == 0 ) {
816
817 // These cases are:
818 // 00 nn -- -- - Big endian UTF-16
819 // 00 00 00 nn - Big endian UTF-32
820 // 00 00 FE FF - Big endian UTF 32
821
822 if ( (length < 4) || (uniChar[1] != 0) ) return kXMP_EncodeUTF16Big;
823 return kXMP_EncodeUTF32Big;
824
825 } else if ( uniChar[0] < 0x80 ) {
826
827 // These cases are:
828 // nn mm -- -- - UTF-8, includes EF BB BF case
829 // nn 00 00 00 - Little endian UTF-32
830 // nn 00 -- -- - Little endian UTF-16
831
832 if ( uniChar[1] != 0 ) return kXMP_EncodeUTF8;
833 if ( (length < 4) || (uniChar[2] != 0) ) return kXMP_EncodeUTF16Little;
834 return kXMP_EncodeUTF32Little;
835
836 } else {
837
838 // These cases are:
839 // EF BB BF -- - UTF-8
840 // FE FF -- -- - Big endian UTF-16
841 // FF FE 00 00 - Little endian UTF-32
842 // FF FE -- -- - Little endian UTF-16
843
844 if ( uniChar[0] == 0xEF ) return kXMP_EncodeUTF8;
845 if ( uniChar[0] == 0xFE ) return kXMP_EncodeUTF16Big;
846 if ( (length < 4) || (uniChar[2] != 0) ) return kXMP_EncodeUTF16Little;
847 return kXMP_EncodeUTF32Little;
848
849 }
850
851 } // DetermineInputEncoding
852
853
854 // -------------------------------------------------------------------------------------------------
855 // CountUTF8
856 // ---------
857 //
858 // Look for a valid multi-byte UTF-8 sequence and return its length. Returns 0 for an invalid UTF-8
859 // sequence. Returns a negative value for a partial valid sequence at the end of the buffer.
860 //
861 // The checking is not strict. We simply count the number of high order 1 bits in the first byte,
862 // then look for n-1 following bytes whose high order 2 bits are 1 and 0. We do not check for a
863 // minimal length representation of the codepoint, or that the codepoint is defined by Unicode.
864
865 static int
CountUTF8(const XMP_Uns8 * charStart,const XMP_Uns8 * bufEnd)866 CountUTF8 ( const XMP_Uns8 * charStart, const XMP_Uns8 * bufEnd )
867 {
868 XMP_Assert ( charStart < bufEnd ); // Catch this in debug builds.
869 if ( charStart >= bufEnd ) return 0; // Don't run-on in release builds.
870 if ( (*charStart & 0xC0) != 0xC0 ) return 0; // Must have at least 2 high bits set.
871
872 int byteCount = 2;
873 XMP_Uns8 firstByte = *charStart;
874 for ( firstByte = firstByte << 2; (firstByte & 0x80) != 0; firstByte = firstByte << 1 ) ++byteCount;
875
876 if ( (charStart + byteCount) > bufEnd ) return -byteCount;
877
878 for ( int i = 1; i < byteCount; ++i ) {
879 if ( (charStart[i] & 0xC0) != 0x80 ) return 0;
880 }
881
882 return byteCount;
883
884 } // CountUTF8
885
886
887 // -------------------------------------------------------------------------------------------------
888 // CountControlEscape
889 // ------------------
890 //
891 // Look for a numeric escape sequence for a "prohibited" ASCII control character. These are 0x7F,
892 // and the range 0x00..0x1F except for tab/LF/CR. Return 0 if this is definitely not a numeric
893 // escape, the length of the escape if found, or a negative value for a partial escape.
894
895 static int
CountControlEscape(const XMP_Uns8 * escStart,const XMP_Uns8 * bufEnd)896 CountControlEscape ( const XMP_Uns8 * escStart, const XMP_Uns8 * bufEnd )
897 {
898 XMP_Assert ( escStart < bufEnd ); // Catch this in debug builds.
899 if ( escStart >= bufEnd ) return 0; // Don't run-on in release builds.
900 XMP_Assert ( *escStart == '&' );
901
902 size_t tailLen = bufEnd - escStart;
903 if ( tailLen < 5 ) return -1; // Don't need a more thorough check, we'll catch it on the next pass.
904
905 if ( strncmp ( (char*)escStart, "&#x", 3 ) != 0 ) return 0;
906
907 XMP_Uns8 escValue = 0;
908 const XMP_Uns8 * escPos = escStart + 3;
909
910 if ( ('0' <= *escPos) && (*escPos <= '9') ) {
911 escValue = *escPos - '0';
912 ++escPos;
913 } else if ( ('A' <= *escPos) && (*escPos <= 'F') ) {
914 escValue = *escPos - 'A' + 10;
915 ++escPos;
916 } else if ( ('a' <= *escPos) && (*escPos <= 'f') ) {
917 escValue = *escPos - 'a' + 10;
918 ++escPos;
919 }
920
921 if ( ('0' <= *escPos) && (*escPos <= '9') ) {
922 escValue = (escValue << 4) + (*escPos - '0');
923 ++escPos;
924 } else if ( ('A' <= *escPos) && (*escPos <= 'F') ) {
925 escValue = (escValue << 4) + (*escPos - 'A' + 10);
926 ++escPos;
927 } else if ( ('a' <= *escPos) && (*escPos <= 'f') ) {
928 escValue = (escValue << 4) + (*escPos - 'a' + 10);
929 ++escPos;
930 }
931
932 if ( escPos == bufEnd ) return -1; // Partial escape.
933 if ( *escPos != ';' ) return 0;
934
935 size_t escLen = escPos - escStart + 1;
936 if ( escLen < 5 ) return 0; // ! Catch "&#x;".
937
938 if ( (escValue == kTab) || (escValue == kLF) || (escValue == kCR) ) return 0; // An allowed escape.
939
940 return escLen; // Found a full "prohibited" numeric escape.
941
942 } // CountControlEscape
943
944
945 // -------------------------------------------------------------------------------------------------
946 // ProcessUTF8Portion
947 // ------------------
948 //
949 // Early versions of the XMP spec mentioned allowing ISO Latin-1 input. There are also problems with
950 // some clients placing ASCII control characters within XMP values. This is an XML problem, the XML
951 // spec only allows tab (0x09), LF (0x0A), and CR (0x0D) from the 0x00..0x1F range. As a concession
952 // to this we scan 8-bit input for byte sequences that are not valid UTF-8 or in the 0x00..0x1F
953 // range and replace each byte as follows:
954 // 0x00..0x1F - Replace with a space, except for tab, CR, and LF.
955 // 0x7F - Replace with a space. This is ASCII Delete, not allowed by ISO Latin-1.
956 // 0x80..0x9F - Replace with the UTF-8 for a corresponding Unicode character.
957 // 0xA0..0XFF - Replace with the UTF-8 for a corresponding Unicode character.
958 //
959 // The 0x80..0x9F range is not defined by Latin-1. But the Windows 1252 code page defines these and
960 // is otherwise the same as Latin-1.
961 //
962 // For at least historical compatibility reasons we also find and replace singly escaped ASCII
963 // control characters. The Expat parser we're using does not allow numeric escapes like "&#x0B;".
964 // The XML spec is clear that raw controls are not allowed (in the RestrictedChar set), but it isn't
965 // as clear about numeric escapes for them. At any rate, Expat complains, so we treat the numeric
966 // escapes like raw characters and replace them with a space.
967 //
968 // We check for 1 or 2 hex digits ("&#x9;" or "&#x09;") and upper or lower case ("&#xA;" or "&#xa;").
969 // The full escape sequence is 5 or 6 bytes.
970
971 static size_t
ProcessUTF8Portion(XMLParserAdapter * xmlParser,const XMP_Uns8 * buffer,size_t length,bool last)972 ProcessUTF8Portion ( XMLParserAdapter * xmlParser,
973 const XMP_Uns8 * buffer,
974 size_t length,
975 bool last )
976 {
977 const XMP_Uns8 * bufEnd = buffer + length;
978
979 const XMP_Uns8 * spanEnd;
980
981 // `buffer` is copied into this std::string. If `buffer` only
982 // contains valid UTF-8 and no escape characters, then the copy
983 // will be identical to the original, but invalid characters are
984 // replaced - usually with a space character. This std::string was
985 // added as a performance fix for:
986 // https://github.com/Exiv2/exiv2/security/advisories/GHSA-w8mv-g8qq-36mj
987 // Previously, the code was repeatedly calling
988 // `xmlParser->ParseBuffer()`, which turned out to have quadratic
989 // complexity, because expat kept reparsing the entire string from
990 // the beginning.
991 std::string copy;
992
993 for ( spanEnd = buffer; spanEnd < bufEnd; ++spanEnd ) {
994
995 if ( (0x20 <= *spanEnd) && (*spanEnd <= 0x7E) && (*spanEnd != '&') ) {
996 copy.push_back(*spanEnd);
997 continue; // A regular ASCII character.
998 }
999
1000 if ( *spanEnd >= 0x80 ) {
1001
1002 // See if this is a multi-byte UTF-8 sequence, or a Latin-1 character to replace.
1003
1004 int uniLen = CountUTF8 ( spanEnd, bufEnd );
1005
1006 if ( uniLen > 0 ) {
1007
1008 // A valid UTF-8 character, keep it as-is.
1009 copy.append((const char*)spanEnd, uniLen);
1010 spanEnd += uniLen - 1; // ! The loop increment will put back the +1.
1011
1012 } else if ( (uniLen < 0) && (! last) ) {
1013
1014 // Have a partial UTF-8 character at the end of the buffer and more input coming.
1015 xmlParser->ParseBuffer ( copy.c_str(), copy.size(), false );
1016 return (spanEnd - buffer);
1017
1018 } else {
1019
1020 // Not a valid UTF-8 sequence. Replace the first byte with the Latin-1 equivalent.
1021 const char * replacement = kReplaceLatin1 [ *spanEnd - 0x80 ];
1022 copy.append ( replacement );
1023
1024 }
1025
1026 } else if ( (*spanEnd < 0x20) || (*spanEnd == 0x7F) ) {
1027
1028 // Replace ASCII controls other than tab, LF, and CR with a space.
1029
1030 if ( (*spanEnd == kTab) || (*spanEnd == kLF) || (*spanEnd == kCR) ) {
1031 copy.push_back(*spanEnd);
1032 continue;
1033 }
1034
1035 copy.push_back(' ');
1036
1037 } else {
1038
1039 // See if this is a numeric escape sequence for a prohibited ASCII control.
1040
1041 XMP_Assert ( *spanEnd == '&' );
1042 int escLen = CountControlEscape ( spanEnd, bufEnd );
1043
1044 if ( escLen < 0 ) {
1045
1046 // Have a partial numeric escape in this buffer, wait for more input.
1047 if ( last ) {
1048 copy.push_back('&');
1049 continue; // No more buffers, not an escape, absorb as normal input.
1050 }
1051 xmlParser->ParseBuffer ( copy.c_str(), copy.size(), false );
1052 return (spanEnd - buffer);
1053
1054 } else if ( escLen > 0 ) {
1055
1056 // Have a complete numeric escape to replace.
1057 copy.push_back(' ');
1058 spanEnd = spanEnd + escLen - 1; // ! The loop continuation will increment spanEnd!
1059
1060 } else {
1061 copy.push_back('&');
1062 }
1063
1064 }
1065
1066 }
1067
1068 XMP_Assert ( spanEnd == bufEnd );
1069 copy.push_back(' ');
1070 xmlParser->ParseBuffer ( copy.c_str(), copy.size(), true );
1071 return length;
1072
1073 } // ProcessUTF8Portion
1074
1075
1076 // -------------------------------------------------------------------------------------------------
1077 // ParseFromBuffer
1078 // ---------------
1079 //
1080 // Although most clients will probably parse everything in one call, we have a buffered API model
1081 // and need to support even the extreme case of 1 byte at a time parsing. This is considerably
1082 // complicated by some special cases for 8-bit input. Because of this, the first thing we do is
1083 // determine whether the input is 8-bit, UTF-16, or UTF-32.
1084 //
1085 // Both the 8-bit special cases and the encoding determination are easier to do with 8 bytes or more
1086 // of input. The XMLParserAdapter class has a pending-input buffer for this. At the start of parsing
1087 // we (might) try to fill this buffer before determining the input character encoding. After that,
1088 // we (might) use this buffer with the current input to simplify the logic in ProcessUTF8Portion. The
1089 // "(might)" part means that we don't actually use the pending-input buffer unless we have to. In
1090 // particular, the common case of single-buffer parsing won't use it.
1091
1092 void
ParseFromBuffer(XMP_StringPtr buffer,XMP_StringLen xmpSize,XMP_OptionBits options)1093 XMPMeta::ParseFromBuffer ( XMP_StringPtr buffer,
1094 XMP_StringLen xmpSize,
1095 XMP_OptionBits options )
1096 {
1097 if ( (buffer == 0) && (xmpSize != 0) ) XMP_Throw ( "Null parse buffer", kXMPErr_BadParam );
1098 if ( xmpSize == kXMP_UseNullTermination ) xmpSize = strlen ( buffer );
1099
1100 const bool lastClientCall = ((options & kXMP_ParseMoreBuffers) == 0); // *** Could use FlagIsSet & FlagIsClear macros.
1101
1102 this->tree.ClearNode(); // Make sure the target XMP object is totally empty.
1103
1104 if ( this->xmlParser == 0 ) {
1105 if ( (xmpSize == 0) && lastClientCall ) return; // Tolerate empty parse. Expat complains if there are no XML elements.
1106 this->xmlParser = XMP_NewExpatAdapter();
1107 }
1108
1109 XMLParserAdapter& parser = *this->xmlParser;
1110
1111 #if 0 // XMP_DebugBuild
1112 if ( parser.parseLog != 0 ) {
1113 char message [200]; // AUDIT: Using sizeof(message) below for snprintf length is safe.
1114 snprintf ( message, sizeof(message), "<!-- ParseFromBuffer, length = %d, options = %X%s -->", // AUDIT: See above.
1115 xmpSize, options, (lastClientCall ? " (last)" : "") );
1116 fwrite ( message, 1, strlen(message), parser.parseLog );
1117 fflush ( parser.parseLog );
1118 }
1119 #endif
1120
1121 try { // Cleanup the tree and xmlParser if anything fails.
1122
1123 // Determine the character encoding before doing any real parsing. This is needed to do the
1124 // 8-bit special processing.
1125
1126 if ( parser.charEncoding == XMP_OptionBits(-1) ) {
1127
1128 if ( (parser.pendingCount == 0) && (xmpSize >= kXMLPendingInputMax) ) {
1129
1130 // This ought to be the common case, the first buffer is big enough.
1131 parser.charEncoding = DetermineInputEncoding ( (XMP_Uns8*)buffer, xmpSize );
1132
1133 } else {
1134
1135 // Try to fill the pendingInput buffer before calling DetermineInputEncoding.
1136
1137 size_t pendingOverlap = kXMLPendingInputMax - parser.pendingCount;
1138 if ( pendingOverlap > xmpSize ) pendingOverlap = xmpSize;
1139
1140 memcpy ( &parser.pendingInput[parser.pendingCount], buffer, pendingOverlap ); // AUDIT: Count is safe.
1141 buffer += pendingOverlap;
1142 xmpSize -= pendingOverlap;
1143 parser.pendingCount += pendingOverlap;
1144
1145 if ( (! lastClientCall) && (parser.pendingCount < kXMLPendingInputMax) ) return;
1146 parser.charEncoding = DetermineInputEncoding ( parser.pendingInput, parser.pendingCount );
1147
1148 #if Trace_ParsingHackery
1149 fprintf ( stderr, "XMP Character encoding is %d\n", parser.charEncoding );
1150 #endif
1151
1152 }
1153
1154 }
1155
1156 // We have the character encoding. Process UTF-16 and UTF-32 as is. UTF-8 needs special
1157 // handling to take care of things like ISO Latin-1 or unescaped ASCII controls.
1158
1159 XMP_Assert ( parser.charEncoding != XMP_OptionBits(-1) );
1160
1161 if ( parser.charEncoding != kXMP_EncodeUTF8 ) {
1162
1163 if ( parser.pendingCount > 0 ) {
1164 // Might have pendingInput from the above portion to determine the character encoding.
1165 parser.ParseBuffer ( parser.pendingInput, parser.pendingCount, false );
1166 }
1167 parser.ParseBuffer ( buffer, xmpSize, lastClientCall );
1168
1169 } else {
1170
1171 #if Trace_ParsingHackery
1172 fprintf ( stderr, "Parsing %d bytes @ %.8X, %s, %d pending, context: %.8s\n",
1173 xmpSize, buffer, (lastClientCall ? "last" : "not last"), parser.pendingCount, buffer );
1174 #endif
1175
1176 // The UTF-8 processing is a bit complex due to the need to tolerate ISO Latin-1 input.
1177 // This is done by scanning the input for byte sequences that are not valid UTF-8,
1178 // assuming they are Latin-1 characters in the range 0x80..0xFF. This requires saving a
1179 // pending input buffer to handle partial UTF-8 sequences at the end of a buffer.
1180
1181 while ( parser.pendingCount > 0 ) {
1182
1183 // We've got some leftover input, process it first then continue with the current
1184 // buffer. Try to fill the pendingInput buffer before parsing further. We use a loop
1185 // for weird edge cases like a 2 byte input buffer, using 1 byte for pendingInput,
1186 // then having a partial UTF-8 end and need to absorb more.
1187
1188 size_t pendingOverlap = kXMLPendingInputMax - parser.pendingCount;
1189 if ( pendingOverlap > xmpSize ) pendingOverlap = xmpSize;
1190
1191 memcpy ( &parser.pendingInput[parser.pendingCount], buffer, pendingOverlap ); // AUDIT: Count is safe.
1192 parser.pendingCount += pendingOverlap;
1193 buffer += pendingOverlap;
1194 xmpSize -= pendingOverlap;
1195
1196 if ( (! lastClientCall) && (parser.pendingCount < kXMLPendingInputMax) ) return;
1197 size_t bytesDone = ProcessUTF8Portion ( &parser, parser.pendingInput, parser.pendingCount, lastClientCall );
1198 size_t bytesLeft = parser.pendingCount - bytesDone;
1199
1200 #if Trace_ParsingHackery
1201 fprintf ( stderr, " ProcessUTF8Portion handled %d pending bytes\n", bytesDone );
1202 #endif
1203
1204 if ( bytesDone == parser.pendingCount ) {
1205
1206 // Done with all of the pending input, move on to the current buffer.
1207 parser.pendingCount = 0;
1208
1209 } else if ( bytesLeft <= pendingOverlap ) {
1210
1211 // The leftover pending input all came from the current buffer. Exit this loop.
1212 buffer -= bytesLeft;
1213 xmpSize += bytesLeft;
1214 parser.pendingCount = 0;
1215
1216 } else if ( xmpSize > 0 ) {
1217
1218 // Pull more of the current buffer into the pending input and try again.
1219 // Backup by this pass's overlap so the loop entry code runs OK.
1220 parser.pendingCount -= pendingOverlap;
1221 buffer -= pendingOverlap;
1222 xmpSize += pendingOverlap;
1223
1224 } else {
1225
1226 // There is no more of the current buffer. Wait for more. Partial sequences at
1227 // the end of the last buffer should be treated as Latin-1 by ProcessUTF8Portion.
1228 XMP_Assert ( ! lastClientCall );
1229 parser.pendingCount = bytesLeft;
1230 memcpy ( &parser.pendingInput[0], &parser.pendingInput[bytesDone], bytesLeft ); // AUDIT: Count is safe.
1231 return;
1232
1233 }
1234
1235 }
1236
1237 // Done with the pending input, process the current buffer.
1238
1239 size_t bytesDone = ProcessUTF8Portion ( &parser, (XMP_Uns8*)buffer, xmpSize, lastClientCall );
1240
1241 #if Trace_ParsingHackery
1242 fprintf ( stderr, " ProcessUTF8Portion handled %d additional bytes\n", bytesDone );
1243 #endif
1244
1245 if ( bytesDone < xmpSize ) {
1246
1247 XMP_Assert ( ! lastClientCall );
1248 size_t bytesLeft = xmpSize - bytesDone;
1249 if ( bytesLeft > kXMLPendingInputMax ) XMP_Throw ( "Parser bytesLeft too large", kXMPErr_InternalFailure );
1250
1251 memcpy ( parser.pendingInput, &buffer[bytesDone], bytesLeft ); // AUDIT: Count is safe.
1252 parser.pendingCount = bytesLeft;
1253 return; // Wait for the next buffer.
1254
1255 }
1256
1257 }
1258
1259 if ( lastClientCall ) {
1260
1261 #if XMP_DebugBuild && DumpXMLParseTree
1262 if ( parser.parseLog == 0 ) parser.parseLog = stdout;
1263 DumpXMLTree ( parser.parseLog, parser.tree, 0 );
1264 #endif
1265
1266 const XML_Node * xmlRoot = FindRootNode ( this, *this->xmlParser, options );
1267
1268 if ( xmlRoot != 0 ) {
1269
1270 ProcessRDF ( &this->tree, *xmlRoot, options );
1271 NormalizeDCArrays ( &this->tree );
1272 if ( this->tree.options & kXMP_PropHasAliases ) MoveExplicitAliases ( &this->tree, options );
1273 TouchUpDataModel ( this );
1274
1275 // Delete empty schema nodes. Do this last, other cleanup can make empty schema.
1276 size_t schemaNum = 0;
1277 while ( schemaNum < this->tree.children.size() ) {
1278 XMP_Node * currSchema = this->tree.children[schemaNum];
1279 if ( currSchema->children.size() > 0 ) {
1280 ++schemaNum;
1281 } else {
1282 delete this->tree.children[schemaNum]; // ! Delete the schema node itself.
1283 this->tree.children.erase ( this->tree.children.begin() + schemaNum );
1284 }
1285 }
1286
1287 }
1288
1289 delete this->xmlParser;
1290 this->xmlParser = 0;
1291
1292 }
1293
1294 } catch ( ... ) {
1295
1296 delete this->xmlParser;
1297 this->xmlParser = 0;
1298 prevTkVer = 0;
1299 this->tree.ClearNode();
1300 throw;
1301
1302 }
1303
1304 } // ParseFromBuffer
1305
1306 // =================================================================================================
1307