1 // =================================================================================================
2 // Copyright 2002-2007 Adobe Systems Incorporated
3 // All Rights Reserved.
4 //
5 // NOTICE:  Adobe permits you to use, modify, and distribute this file in accordance with the terms
6 // of the Adobe license agreement accompanying it.
7 //
8 // Adobe patent application tracking #P435, entitled 'Unique markers to simplify embedding data of
9 // one format in a file with a different format', inventors: Sean Parent, Greg Gilley.
10 // =================================================================================================
11 
12 #include <cstring>
13 
14 #if WIN32
15 	#pragma warning ( disable : 4127 )	// conditional expression is constant
16 	#pragma warning ( disable : 4510 )	// default constructor could not be generated
17 	#pragma warning ( disable : 4610 )	// user defined constructor required
18 	#pragma warning ( disable : 4786 )	// debugger can't handle long symbol names
19 #endif
20 
21 
22 #include "XMPScanner.hpp"
23 
24 #include <cassert>
25 #include <string>
26 #include <cstdlib>
27 
28 #if DEBUG
29 	#include <iostream>
30 	#include <iomanip>
31 	#include <fstream>
32 #endif
33 
34 
35 #ifndef UseStringPushBack	// VC++ 6.x does not provide push_back for strings!
36 	#define UseStringPushBack	0
37 #endif
38 
39 
40 using namespace std;
41 
42 
43 // *** Consider Boyer-Moore style search for "<?xpacket begin=".  It isn't an obvious win, the
44 // *** additional code might be slower than scanning every character.  Especially if we will
45 // *** read every cache line anyway.
46 
47 
48 // =================================================================================================
49 // =================================================================================================
50 // class PacketMachine
51 // ===================
52 //
53 // This is the packet recognizer state machine.  The top of the machine is FindNextPacket, this
54 // calls the specific state components and handles transitions.  The states are described by an
55 // array of RecognizerInfo records, indexed by the RecognizerKind enumeration.  Each RecognizerInfo
56 // record has a function that does that state's work, the success and failure transition states,
57 // and a string literal that is passed to the state function.  The literal lets a common MatchChar
58 // or MatchString function be used in several places.
59 //
60 // The state functions are responsible for consuming input to recognize their particular state.
// This includes intervening nulls for 16 and 32 bit character forms.  For simplicity, things
// are treated as essentially little endian and the nulls are not actually checked.  The opening
63 // '<' is found with a byte-by-byte search, then the number of bytes per character is determined
64 // by counting the following nulls.  From then on, consuming a character means incrementing the
65 // buffer pointer by the number of bytes per character.  Thus the buffer pointer only points to
66 // the "real" bytes.  This also means that the pointer can go off the end of the buffer by a
67 // variable amount.  The amount of overrun is saved so that the pointer can be positioned at the
68 // right byte to start the next buffer.
69 //
70 // The state functions return a TriState value, eTriYes means the pattern was found, eTriNo means
71 // the pattern was definitely not found, eTriMaybe means that the end of the buffer was reached
72 // while working through the pattern.
73 //
74 // When eTriYes is returned, the fBufferPtr data member is left pointing to the "real" byte
75 // following the last actual byte.  Which might not be addressable memory!  This also means that
76 // a state function can be entered with nothing available in the buffer.  When eTriNo is returned,
77 // the fBufferPtr data member is left pointing to the byte that caused the failure.  The state
78 // machine starts over from the failure byte.
79 //
80 // The state functions must preserve their internal micro-state before returning eTriMaybe, and
81 // resume processing when called with the next buffer.  The fPosition data member is used to denote
82 // how many actual characters have been consumed.  The fNullCount data member is used to denote how
83 // many nulls are left before the next actual character.
84 
85 
86 // =================================================================================================
87 // PacketMachine
88 // =============
89 
PacketMachine(XMP_Int64 bufferOffset,const void * bufferOrigin,XMP_Int64 bufferLength)90 XMPScanner::PacketMachine::PacketMachine ( XMP_Int64 bufferOffset, const void * bufferOrigin, XMP_Int64 bufferLength ) :
91 
92 	// Public members
93 	fPacketStart ( 0 ),
94 	fPacketLength ( 0 ),
95 	fBytesAttr ( -1 ),
96 	fCharForm ( eChar8Bit ),
97 	fAccess ( ' ' ),
98 	fBogusPacket ( false ),
99 
100 	// Private members
101 	fBufferOffset ( bufferOffset ),
102 	fBufferOrigin ( (const char *) bufferOrigin ),
103 	fBufferPtr ( fBufferOrigin ),
104 	fBufferLimit ( fBufferOrigin + bufferLength ),
105 	fRecognizer ( eLeadInRecognizer ),
106 	fPosition ( 0 ),
107 	fBytesPerChar ( 1 ),
108 	fBufferOverrun ( 0 ),
109 	fQuoteChar ( ' ' )
110 
111 {
112 	/*
113 	REVIEW NOTES : Should the buffer stuff be in a class?
114 	*/
115 
116 	assert ( bufferOrigin != NULL );
117 	assert ( bufferLength != 0 );
118 
119 }	// PacketMachine
120 
121 
122 // =================================================================================================
123 // ~PacketMachine
124 // ==============
125 
~PacketMachine()126 XMPScanner::PacketMachine::~PacketMachine ()
127 {
128 
129 	// An empty placeholder.
130 
131 }	// ~PacketMachine
132 
133 
134 // =================================================================================================
135 // AssociateBuffer
136 // ===============
137 
138 void
AssociateBuffer(XMP_Int64 bufferOffset,const void * bufferOrigin,XMP_Int64 bufferLength)139 XMPScanner::PacketMachine::AssociateBuffer ( XMP_Int64 bufferOffset, const void * bufferOrigin, XMP_Int64 bufferLength )
140 {
141 
142 	fBufferOffset = bufferOffset;
143 	fBufferOrigin = (const char *) bufferOrigin;
144 	fBufferPtr = fBufferOrigin + fBufferOverrun;
145 	fBufferLimit = fBufferOrigin + bufferLength;
146 
147 }	// AssociateBuffer
148 
149 
150 // =================================================================================================
151 // ResetMachine
152 // ============
153 
154 void
ResetMachine()155 XMPScanner::PacketMachine::ResetMachine ()
156 {
157 
158 	fRecognizer = eLeadInRecognizer;
159 	fPosition = 0;
160 	fBufferOverrun = 0;
161 	fCharForm = eChar8Bit;
162 	fBytesPerChar = 1;
163 	fAccess = ' ';
164 	fBytesAttr = -1;
165 	fBogusPacket = false;
166 
167 	fAttrName.erase ( fAttrName.begin(), fAttrName.end() );
168 	fAttrValue.erase ( fAttrValue.begin(), fAttrValue.end() );
169 	fEncodingAttr.erase ( fEncodingAttr.begin(), fEncodingAttr.end() );
170 
171 }	// ResetMachine
172 
173 
174 // =================================================================================================
175 // FindLessThan
176 // ============
177 
178 XMPScanner::PacketMachine::TriState
FindLessThan(PacketMachine * ths,const char * which)179 XMPScanner::PacketMachine::FindLessThan ( PacketMachine * ths, const char * which )
180 {
181 
182 	if ( *which == 'H' ) {
183 
184 		// --------------------------------------------------------------------------------
185 		// We're looking for the '<' of the header.  If we fail there is no packet in this
186 		// part of the input, so return eTriNo.
187 
188 		ths->fCharForm = eChar8Bit;	// We might have just failed from a bogus 16 or 32 bit case.
189 		ths->fBytesPerChar = 1;
190 
191 		while ( ths->fBufferPtr < ths->fBufferLimit ) {	// Don't skip nulls for the header's '<'!
192 			if ( *ths->fBufferPtr == '<' ) break;
193 			ths->fBufferPtr++;
194 		}
195 
196 		if ( ths->fBufferPtr >= ths->fBufferLimit ) return eTriNo;
197 		ths->fBufferPtr++;
198 		return eTriYes;
199 
200 	} else {
201 
202 		// --------------------------------------------------------------------------------
203 		// We're looking for the '<' of the trailer.  We're already inside the packet body,
204 		// looking for the trailer.  So here if we fail we must return eTriMaybe so that we
205 		// keep looking for the trailer in the next buffer.
206 
207 		const int bytesPerChar = ths->fBytesPerChar;
208 
209 		while ( ths->fBufferPtr < ths->fBufferLimit ) {
210 			if ( *ths->fBufferPtr == '<' ) break;
211 			ths->fBufferPtr += bytesPerChar;
212 		}
213 
214 		if ( ths->fBufferPtr >= ths->fBufferLimit ) return eTriMaybe;
215 		ths->fBufferPtr += bytesPerChar;
216 		return eTriYes;
217 
218 	}
219 
220 }	// FindLessThan
221 
222 
223 // =================================================================================================
224 // MatchString
225 // ===========
226 
227 XMPScanner::PacketMachine::TriState
MatchString(PacketMachine * ths,const char * literal)228 XMPScanner::PacketMachine::MatchString ( PacketMachine * ths, const char * literal )
229 {
230 	const int			bytesPerChar	= ths->fBytesPerChar;
231 	const char *		litPtr			= literal + ths->fPosition;
232 	const XMP_Int32		charsToGo		= (XMP_Int32) strlen ( literal ) - ths->fPosition;
233 	int					charsDone		= 0;
234 
235 	while ( (charsDone < charsToGo) && (ths->fBufferPtr < ths->fBufferLimit) ) {
236 		if ( *litPtr != *ths->fBufferPtr ) return eTriNo;
237 		charsDone++;
238 		litPtr++;
239 		ths->fBufferPtr += bytesPerChar;
240 	}
241 
242 	if ( charsDone == charsToGo ) return eTriYes;
243 	ths->fPosition += charsDone;
244 	return eTriMaybe;
245 
246 }	// MatchString
247 
248 
249 // =================================================================================================
250 // MatchChar
251 // =========
252 
253 XMPScanner::PacketMachine::TriState
MatchChar(PacketMachine * ths,const char * literal)254 XMPScanner::PacketMachine::MatchChar ( PacketMachine * ths, const char * literal )
255 {
256 	const int	bytesPerChar	= ths->fBytesPerChar;
257 
258 	if ( ths->fBufferPtr >= ths->fBufferLimit ) return eTriMaybe;
259 
260 	const char currChar = *ths->fBufferPtr;
261 	if ( currChar != *literal ) return eTriNo;
262 	ths->fBufferPtr += bytesPerChar;
263 	return eTriYes;
264 
265 }	// MatchChar
266 
267 
268 // =================================================================================================
269 // MatchOpenQuote
270 // ==============
271 
272 XMPScanner::PacketMachine::TriState
MatchOpenQuote(PacketMachine * ths,const char *)273 XMPScanner::PacketMachine::MatchOpenQuote ( PacketMachine * ths, const char * /* unused */ )
274 {
275 	const int	bytesPerChar	= ths->fBytesPerChar;
276 
277 	if ( ths->fBufferPtr >= ths->fBufferLimit ) return eTriMaybe;
278 
279 	const char currChar = *ths->fBufferPtr;
280 	if ( (currChar != '\'') && (currChar != '"') ) return eTriNo;
281 	ths->fQuoteChar = currChar;
282 	ths->fBufferPtr += bytesPerChar;
283 	return eTriYes;
284 
285 }	// MatchOpenQuote
286 
287 
288 // =================================================================================================
289 // MatchCloseQuote
290 // ===============
291 
292 XMPScanner::PacketMachine::TriState
MatchCloseQuote(PacketMachine * ths,const char *)293 XMPScanner::PacketMachine::MatchCloseQuote ( PacketMachine * ths, const char * /* unused */ )
294 {
295 
296 	return MatchChar ( ths, &ths->fQuoteChar );
297 
298 }	// MatchCloseQuote
299 
300 
301 // =================================================================================================
302 // CaptureAttrName
303 // ===============
304 
305 XMPScanner::PacketMachine::TriState
CaptureAttrName(PacketMachine * ths,const char *)306 XMPScanner::PacketMachine::CaptureAttrName ( PacketMachine * ths, const char * /* unused */ )
307 {
308 	const int	bytesPerChar	= ths->fBytesPerChar;
309 	char		currChar;
310 
311 	if ( ths->fPosition == 0 ) {	// Get the first character in the name.
312 
313 		if ( ths->fBufferPtr >= ths->fBufferLimit ) return eTriMaybe;
314 
315 		currChar = *ths->fBufferPtr;
316 		if ( ths->fAttrName.size() == 0 ) {
317 			if ( ! ( ( ('a' <= currChar) && (currChar <= 'z') ) ||
318 					 ( ('A' <= currChar) && (currChar <= 'Z') ) ||
319 					 (currChar == '_') || (currChar == ':') ) ) {
320 				return eTriNo;
321 			}
322 		}
323 
324 		ths->fAttrName.erase ( ths->fAttrName.begin(), ths->fAttrName.end() );
325 		#if UseStringPushBack
326 			ths->fAttrName.push_back ( currChar );
327 		#else
328 			ths->fAttrName.insert ( ths->fAttrName.end(), currChar );
329 		#endif
330 		ths->fBufferPtr += bytesPerChar;
331 
332 	}
333 
334 	while ( ths->fBufferPtr < ths->fBufferLimit ) {	// Get the remainder of the name.
335 
336 		currChar = *ths->fBufferPtr;
337 		if ( ! ( ( ('a' <= currChar) && (currChar <= 'z') ) ||
338 				 ( ('A' <= currChar) && (currChar <= 'Z') ) ||
339 				 ( ('0' <= currChar) && (currChar <= '9') ) ||
340 				 (currChar == '-') || (currChar == '.') || (currChar == '_') || (currChar == ':') ) ) {
341 			break;
342 		}
343 
344 		#if UseStringPushBack
345 			ths->fAttrName.push_back ( currChar );
346 		#else
347 			ths->fAttrName.insert ( ths->fAttrName.end(), currChar );
348 		#endif
349 		ths->fBufferPtr += bytesPerChar;
350 
351 	}
352 
353 	if ( ths->fBufferPtr < ths->fBufferLimit ) return eTriYes;
354 	ths->fPosition = (long) ths->fAttrName.size();	// The name might span into the next buffer.
355 	return eTriMaybe;
356 
357 }	// CaptureAttrName
358 
359 
360 // =================================================================================================
361 // CaptureAttrValue
362 // ================
363 //
364 // Recognize the equal sign and the quoted string value, capture the value along the way.
365 
366 XMPScanner::PacketMachine::TriState
CaptureAttrValue(PacketMachine * ths,const char *)367 XMPScanner::PacketMachine::CaptureAttrValue ( PacketMachine * ths, const char * /* unused */ )
368 {
369 	const int	bytesPerChar	= ths->fBytesPerChar;
370 	char		currChar		= 0;
371 	TriState	result			= eTriMaybe;
372 
373 	if ( ths->fBufferPtr >= ths->fBufferLimit ) return eTriMaybe;
374 
375 	switch ( ths->fPosition ) {
376 
377 		case 0 :	// The name should haved ended at the '=', nulls already skipped.
378 
379 			if ( ths->fBufferPtr >= ths->fBufferLimit ) return eTriMaybe;
380 			if ( *ths->fBufferPtr != '=' ) return eTriNo;
381 			ths->fBufferPtr += bytesPerChar;
382 			ths->fPosition = 1;
383 			// fall through OK because MatchOpenQuote will check the buffer limit and nulls ...
384 
385 		case 1 :	// Look for the open quote.
386 
387 			result = MatchOpenQuote ( ths, NULL );
388 			if ( result != eTriYes ) return result;
389 			ths->fPosition = 2;
390 			// fall through OK because the buffer limit and nulls are checked below ...
391 
392 		default :	// Look for the close quote, capturing the value along the way.
393 
394 			assert ( ths->fPosition == 2 );
395 
396 			const char quoteChar = ths->fQuoteChar;
397 
398 			while ( ths->fBufferPtr < ths->fBufferLimit ) {
399 				currChar = *ths->fBufferPtr;
400 				if ( currChar == quoteChar ) break;
401 				#if UseStringPushBack
402 					ths->fAttrValue.push_back ( currChar );
403 				#else
404 					ths->fAttrValue.insert ( ths->fAttrValue.end(), currChar );
405 				#endif
406 				ths->fBufferPtr += bytesPerChar;
407 			}
408 
409 			if ( ths->fBufferPtr >= ths->fBufferLimit ) return eTriMaybe;
410 			assert ( currChar == quoteChar );
411 			ths->fBufferPtr += bytesPerChar;	// Advance past the closing quote.
412 			return eTriYes;
413 
414 	}
415 
416 }	// CaptureAttrValue
417 
418 
419 // =================================================================================================
420 // RecordStart
421 // ===========
422 //
423 // Note that this routine looks at bytes, not logical characters.  It has to figure out how many
424 // bytes per character there are so that the other recognizers can skip intervening nulls.
425 
426 XMPScanner::PacketMachine::TriState
RecordStart(PacketMachine * ths,const char *)427 XMPScanner::PacketMachine::RecordStart ( PacketMachine * ths, const char * /* unused */ )
428 {
429 
430 	while ( true ) {
431 
432 		if ( ths->fBufferPtr >= ths->fBufferLimit ) return eTriMaybe;
433 
434 		const char currByte = *ths->fBufferPtr;
435 
436 		switch ( ths->fPosition ) {
437 
438 			case 0 :	// Record the length.
439 				assert ( ths->fCharForm == eChar8Bit );
440 				assert ( ths->fBytesPerChar == 1 );
441 				ths->fPacketStart = ths->fBufferOffset + ((ths->fBufferPtr - 1) - ths->fBufferOrigin);
442 				ths->fPacketLength = 0;
443 				ths->fPosition = 1;
444 				// ! OK to fall through here, we didn't consume a byte in this step.
445 
446 			case 1 :	// Look for the first null byte.
447 				if ( currByte != 0 ) return eTriYes;	// No nulls found.
448 				ths->fCharForm = eChar16BitBig;			// Assume 16 bit big endian for now.
449 				ths->fBytesPerChar = 2;
450 				ths->fBufferPtr++;
451 				ths->fPosition = 2;
452 				break;	// ! Don't fall through, have to check for the end of the buffer between each byte.
453 
454 			case 2 :	// One null was found, look for a second.
455 				if ( currByte != 0 ) return eTriYes;	// Just one null found.
456 				ths->fBufferPtr++;
457 				ths->fPosition = 3;
458 				break;
459 
460 			case 3 :	// Two nulls were found, look for a third.
461 				if ( currByte != 0 ) return eTriNo;	// Just two nulls is not valid.
462 				ths->fCharForm = eChar32BitBig;		// Assume 32 bit big endian for now.
463 				ths->fBytesPerChar = 4;
464 				ths->fBufferPtr++;
465 				return eTriYes;
466 				break;
467 
468 		}
469 
470 	}
471 
472 }	// RecordStart
473 
474 
475 // =================================================================================================
476 // RecognizeBOM
477 // ============
478 //
479 // Recognizing the byte order marker is a surprisingly messy thing to do.  It can't be done by the
480 // normal string matcher, there are no intervening nulls.  There are 4 transitions after the opening
481 // quote, the closing quote or one of the three encodings.  For the actual BOM there are then 1 or 2
482 // following bytes that depend on which of the encodings we're in.  Not to mention that the buffer
483 // might end at any point.
484 //
485 // The intervening null count done earlier determined 8, 16, or 32 bits per character, but not the
486 // big or little endian nature for the 16/32 bit cases.  The BOM must be present for the 16 and 32
487 // bit cases in order to determine the endian mode.  There are six possible byte sequences for the
488 // quoted BOM string, ignoring the differences for quoting with ''' versus '"'.
489 //
490 // Keep in mind that for the 16 and 32 bit cases there will be nulls for the quote.  In the table
491 // below the symbol <quote> means just the one byte containing the ''' or '"'.  The nulls for the
492 // quote character are explicitly shown.
493 //
494 //	<quote> <quote>					- 1: No BOM, this must be an 8 bit case.
495 //	<quote> \xEF \xBB \xBF <quote>	- 1.12-13: The 8 bit form.
496 //
497 //	<quote> \xFE \xFF \x00 <quote>	- 1.22-23: The 16 bit, big endian form
498 //	<quote> \x00 \xFF \xFE <quote>	- 1.32-33: The 16 bit, little endian form.
499 //
500 //	<quote> \x00 \x00 \xFE \xFF \x00 \x00 \x00 <quote>	- 1.32.43-45.56-57: The 32 bit, big endian form.
501 //	<quote> \x00 \x00 \x00 \xFF \xFE \x00 \x00 <quote>	- 1.32.43.54-57: The 32 bit, little endian form.
502 
503 enum {
504 	eBOM_8_1		= 0xEF,
505 	eBOM_8_2		= 0xBB,
506 	eBOM_8_3		= 0xBF,
507 	eBOM_Big_1		= 0xFE,
508 	eBOM_Big_2		= 0xFF,
509 	eBOM_Little_1	= eBOM_Big_2,
510 	eBOM_Little_2	= eBOM_Big_1
511 };
512 
513 XMPScanner::PacketMachine::TriState
RecognizeBOM(PacketMachine * ths,const char *)514 XMPScanner::PacketMachine::RecognizeBOM ( PacketMachine * ths, const char * /* unused */ )
515 {
516 	const int	bytesPerChar	= ths->fBytesPerChar;
517 
518 	while ( true ) {	// Handle one character at a time, the micro-state (fPosition) changes for each.
519 
520 		if ( ths->fBufferPtr >= ths->fBufferLimit ) return eTriMaybe;
521 
522 		const unsigned char currChar = *ths->fBufferPtr;	// ! The BOM bytes look like integers bigger than 127.
523 
524 		switch ( ths->fPosition ) {
525 
526 			case  0 :	// Look for the opening quote.
527 				if ( (currChar != '\'') && (currChar != '"') ) return eTriNo;
528 				ths->fQuoteChar = currChar;
529 				ths->fBufferPtr++;
530 				ths->fPosition = 1;
531 				break;	// ! Don't fall through, have to check for the end of the buffer between each byte.
532 
533 			case 1 :	// Look at the byte immediately following the opening quote.
534 				if ( currChar == ths->fQuoteChar ) {	// Closing quote, no BOM character, must be 8 bit.
535 					if ( ths->fCharForm != eChar8Bit ) return eTriNo;
536 					ths->fBufferPtr += bytesPerChar;	// Skip the nulls after the closing quote.
537 					return eTriYes;
538 				} else if ( currChar == eBOM_8_1 ) {	// Start of the 8 bit form.
539 					if ( ths->fCharForm != eChar8Bit ) return eTriNo;
540 					ths->fBufferPtr++;
541 					ths->fPosition = 12;
542 				} else if ( currChar == eBOM_Big_1 ) {	// Start of the 16 bit big endian form.
543 					if ( ths->fCharForm != eChar16BitBig ) return eTriNo;
544 					ths->fBufferPtr++;
545 					ths->fPosition = 22;
546 				} else if ( currChar == 0 ) {	// Start of the 16 bit little endian or either 32 bit form.
547 					if ( ths->fCharForm == eChar8Bit ) return eTriNo;
548 					ths->fBufferPtr++;
549 					ths->fPosition = 32;
550 				} else {
551 					return eTriNo;
552 				}
553 				break;
554 
555 			case 12 :	// Look for the second byte of the 8 bit form.
556 				if ( currChar != eBOM_8_2 ) return eTriNo;
557 				ths->fPosition = 13;
558 				ths->fBufferPtr++;
559 				break;
560 
561 			case 13 :	// Look for the third byte of the 8 bit form.
562 				if ( currChar != eBOM_8_3 ) return eTriNo;
563 				ths->fPosition = 99;
564 				ths->fBufferPtr++;
565 				break;
566 
567 			case 22 :	// Look for the second byte of the 16 bit big endian form.
568 				if ( currChar != eBOM_Big_2 ) return eTriNo;
569 				ths->fPosition = 23;
570 				ths->fBufferPtr++;
571 				break;
572 
573 			case 23 :	// Look for the null before the closing quote of the 16 bit big endian form.
574 				if ( currChar != 0 ) return eTriNo;
575 				ths->fBufferPtr++;
576 				ths->fPosition = 99;
577 				break;
578 
579 			case 32 :	// Look at the second byte of the 16 bit little endian or either 32 bit form.
580 				if ( currChar == eBOM_Little_1 ) {
581 					ths->fPosition = 33;
582 				} else if ( currChar == 0 ) {
583 					ths->fPosition = 43;
584 				} else {
585 					return eTriNo;
586 				}
587 				ths->fBufferPtr++;
588 				break;
589 
590 			case 33 :	// Look for the third byte of the 16 bit little endian form.
591 				if ( ths->fCharForm != eChar16BitBig ) return eTriNo;	// Null count before assumed big endian.
592 				if ( currChar != eBOM_Little_2 ) return eTriNo;
593 				ths->fCharForm = eChar16BitLittle;
594 				ths->fPosition = 99;
595 				ths->fBufferPtr++;
596 				break;
597 
598 			case 43 :	// Look at the third byte of either 32 bit form.
599 				if ( ths->fCharForm != eChar32BitBig ) return eTriNo;	// Null count before assumed big endian.
600 				if ( currChar == eBOM_Big_1 ) {
601 					ths->fPosition = 44;
602 				} else if ( currChar == 0 ) {
603 					ths->fPosition = 54;
604 				} else {
605 					return eTriNo;
606 				}
607 				ths->fBufferPtr++;
608 				break;
609 
610 			case 44 :	// Look for the fourth byte of the 32 bit big endian form.
611 				if ( currChar != eBOM_Big_2 ) return eTriNo;
612 				ths->fPosition = 45;
613 				ths->fBufferPtr++;
614 				break;
615 
616 			case 45 :	// Look for the first null before the closing quote of the 32 bit big endian form.
617 				if ( currChar != 0 ) return eTriNo;
618 				ths->fPosition = 56;
619 				ths->fBufferPtr++;
620 				break;
621 
622 			case 54 :	// Look for the fourth byte of the 32 bit little endian form.
623 				ths->fCharForm = eChar32BitLittle;
624 				if ( currChar != eBOM_Little_1 ) return eTriNo;
625 				ths->fPosition = 55;
626 				ths->fBufferPtr++;
627 				break;
628 
629 			case 55 :	// Look for the fifth byte of the 32 bit little endian form.
630 				if ( currChar != eBOM_Little_2 ) return eTriNo;
631 				ths->fPosition = 56;
632 				ths->fBufferPtr++;
633 				break;
634 
635 			case 56 :	// Look for the next to last null before the closing quote of the 32 bit forms.
636 				if ( currChar != 0 ) return eTriNo;
637 				ths->fPosition = 57;
638 				ths->fBufferPtr++;
639 				break;
640 
641 			case 57 :	// Look for the last null before the closing quote of the 32 bit forms.
642 				if ( currChar != 0 ) return eTriNo;
643 				ths->fPosition = 99;
644 				ths->fBufferPtr++;
645 				break;
646 
647 			default :	// Look for the closing quote.
648 				assert ( ths->fPosition == 99 );
649 				if ( currChar != ths->fQuoteChar ) return eTriNo;
650 				ths->fBufferPtr += bytesPerChar;	// Skip the nulls after the closing quote.
651 				return eTriYes;
652 				break;
653 
654 		}
655 
656 	}
657 
658 }	// RecognizeBOM
659 
660 
661 // =================================================================================================
662 // RecordHeadAttr
663 // ==============
664 
665 XMPScanner::PacketMachine::TriState
RecordHeadAttr(PacketMachine * ths,const char *)666 XMPScanner::PacketMachine::RecordHeadAttr ( PacketMachine * ths, const char * /* unused */ )
667 {
668 
669 	if ( ths->fAttrName == "encoding" ) {
670 
671 		assert ( ths->fEncodingAttr.empty() );
672 		ths->fEncodingAttr = ths->fAttrValue;
673 
674 	} else if ( ths->fAttrName == "bytes" ) {
675 
676 		long	value	= 0;
677 		int		count	= (int) ths->fAttrValue.size();
678 		int		i;
679 
680 		assert ( ths->fBytesAttr == -1 );
681 
682 		if ( count > 0 ) {	// Allow bytes='' to be the same as no bytes attribute.
683 
684 			for ( i = 0; i < count; i++ ) {
685 				const char	currChar	= ths->fAttrValue[i];
686 				if ( ('0' <= currChar) && (currChar <= '9') ) {
687 					value = (value * 10) + (currChar - '0');
688 				} else {
689 					ths->fBogusPacket = true;
690 					value = -1;
691 					break;
692 				}
693 			}
694 			ths->fBytesAttr = value;
695 
696 			if ( CharFormIs16Bit ( ths->fCharForm ) ) {
697 				if ( (ths->fBytesAttr & 1) != 0 ) ths->fBogusPacket = true;
698 			} else if ( CharFormIs32Bit ( ths->fCharForm ) ) {
699 				if ( (ths->fBytesAttr & 3) != 0 ) ths->fBogusPacket = true;
700 			}
701 
702 		}
703 
704 	}
705 
706 	ths->fAttrName.erase ( ths->fAttrName.begin(), ths->fAttrName.end() );
707 	ths->fAttrValue.erase ( ths->fAttrValue.begin(), ths->fAttrValue.end() );
708 
709 	return eTriYes;
710 
711 }	// RecordHeadAttr
712 
713 
714 // =================================================================================================
715 // CaptureAccess
716 // =============
717 
718 XMPScanner::PacketMachine::TriState
CaptureAccess(PacketMachine * ths,const char *)719 XMPScanner::PacketMachine::CaptureAccess ( PacketMachine * ths, const char * /* unused */ )
720 {
721 	const int	bytesPerChar	= ths->fBytesPerChar;
722 
723 	while ( true ) {
724 
725 		if ( ths->fBufferPtr >= ths->fBufferLimit ) return eTriMaybe;
726 
727 		const char currChar = *ths->fBufferPtr;
728 
729 		switch ( ths->fPosition ) {
730 
731 			case  0 :	// Look for the opening quote.
732 				if ( (currChar != '\'') && (currChar != '"') ) return eTriNo;
733 				ths->fQuoteChar = currChar;
734 				ths->fBufferPtr += bytesPerChar;
735 				ths->fPosition = 1;
736 				break;	// ! Don't fall through, have to check for the end of the buffer between each byte.
737 
738 			case  1 :	// Look for the 'r' or 'w'.
739 				if ( (currChar != 'r') && (currChar != 'w') ) return eTriNo;
740 				ths->fAccess = currChar;
741 				ths->fBufferPtr += bytesPerChar;
742 				ths->fPosition = 2;
743 				break;
744 
745 			default :	// Look for the closing quote.
746 				assert ( ths->fPosition == 2 );
747 				if ( currChar != ths->fQuoteChar ) return eTriNo;
748 				ths->fBufferPtr += bytesPerChar;
749 				return eTriYes;
750 				break;
751 
752 		}
753 
754 	}
755 
756 }	// CaptureAccess
757 
758 
759 // =================================================================================================
760 // RecordTailAttr
761 // ==============
762 
763 XMPScanner::PacketMachine::TriState
RecordTailAttr(PacketMachine * ths,const char *)764 XMPScanner::PacketMachine::RecordTailAttr ( PacketMachine * ths, const char * /* unused */ )
765 {
766 
767 	// There are no known "general" attributes for the packet trailer.
768 
769 	ths->fAttrName.erase ( ths->fAttrName.begin(), ths->fAttrName.end() );
770 	ths->fAttrValue.erase ( ths->fAttrValue.begin(), ths->fAttrValue.end() );
771 
772 	return eTriYes;
773 
774 
775 }	// RecordTailAttr
776 
777 
778 // =================================================================================================
779 // CheckPacketEnd
780 // ==============
781 //
782 // Check for trailing padding and record the packet length.  We have trailing padding if the bytes
783 // attribute is present and has a value greater than the current length.
784 
785 XMPScanner::PacketMachine::TriState
CheckPacketEnd(PacketMachine * ths,const char *)786 XMPScanner::PacketMachine::CheckPacketEnd ( PacketMachine * ths, const char * /* unused */ )
787 {
788 	const int	bytesPerChar	= ths->fBytesPerChar;
789 
790 	if ( ths->fPosition == 0 ) {	// First call, decide if there is trailing padding.
791 
792 		const XMP_Int64 currLen64 = (ths->fBufferOffset + (ths->fBufferPtr - ths->fBufferOrigin)) - ths->fPacketStart;
793 		if ( currLen64 > 0x7FFFFFFF ) throw std::runtime_error ( "Packet length exceeds 2GB-1" );
794 		const XMP_Int32 currLength = (XMP_Int32)currLen64;
795 
796 		if ( (ths->fBytesAttr != -1) && (ths->fBytesAttr != currLength) ) {
797 			if ( ths->fBytesAttr < currLength ) {
798 				ths->fBogusPacket = true;	// The bytes attribute value is too small.
799 			} else {
800 				ths->fPosition = ths->fBytesAttr - currLength;
801 				if ( (ths->fPosition % ths->fBytesPerChar) != 0 ) {
802 					ths->fBogusPacket = true;	// The padding is not a multiple of the character size.
803 					ths->fPosition = (ths->fPosition / ths->fBytesPerChar) * ths->fBytesPerChar;
804 				}
805 			}
806 		}
807 
808 	}
809 
810 	while ( ths->fPosition > 0 ) {
811 
812 		if ( ths->fBufferPtr >= ths->fBufferLimit ) return eTriMaybe;
813 
814 		const char currChar = *ths->fBufferPtr;
815 
816 		if ( (currChar != ' ') && (currChar != '\t') && (currChar != '\n') && (currChar != '\r') ) {
817 			ths->fBogusPacket = true;	// The padding is not whitespace.
818 			break;						// Stop the packet here.
819 		}
820 
821 		ths->fPosition -= bytesPerChar;
822 		ths->fBufferPtr += bytesPerChar;
823 
824 	}
825 
826 	const XMP_Int64 currLen64 = (ths->fBufferOffset + (ths->fBufferPtr - ths->fBufferOrigin)) - ths->fPacketStart;
827 	if ( currLen64 > 0x7FFFFFFF ) throw std::runtime_error ( "Packet length exceeds 2GB-1" );
828 	ths->fPacketLength = (XMP_Int32)currLen64;
829 	return eTriYes;
830 
831 }	// CheckPacketEnd
832 
833 
834 // =================================================================================================
835 // CheckFinalNulls
836 // ===============
837 //
838 // Do some special case processing for little endian characters.  We have to make sure the presumed
839 // nulls after the last character actually exist, i.e. that the stream does not end too soon.  Note
840 // that the prior character scanning has moved the buffer pointer to the address following the last
841 // byte of the last character.  I.e. we're already past the presumed nulls, so we can't check their
842 // content.  All we can do is verify that the stream does not end too soon.
843 //
844 // Doing this check is simple yet subtle.  If we're still in the current buffer then the trailing
845 // bytes obviously exist.  If we're exactly at the end of the buffer then the bytes also exist.
846 // The only question is when we're actually past this buffer, partly into the next buffer.  This is
847 // when "ths->fBufferPtr > ths->fBufferLimit" on entry.  For that case we have to wait until we've
848 // actually seen enough extra bytes of input.
849 //
850 // Since the normal buffer processing is already adjusting for this partial character overrun, all
851 // that needs to be done here is wait until "ths->fBufferPtr <= ths->fBufferLimit" on entry.  In
852 // other words, if we're presently too far, ths->fBufferPtr will be adjusted by the amount of the
853 // overflow the next time XMPScanner::Scan is called.  This might still be too far, so just keep
854 // waiting for enough data to pass by.
855 //
856 // Note that there is a corresponding special case for big endian characters, we must decrement the
857 // starting offset by the number of leading nulls.  But we don't do that here, we leave it to the
858 // outer code.  This is because the leading nulls might have been at the exact end of a previous
859 // buffer, in which case we have to also decrement the length of that raw data snip.
860 
861 XMPScanner::PacketMachine::TriState
CheckFinalNulls(PacketMachine * ths,const char *)862 XMPScanner::PacketMachine::CheckFinalNulls ( PacketMachine * ths, const char * /* unused */ )
863 {
864 
865 	if ( (ths->fCharForm != eChar8Bit) && CharFormIsLittleEndian ( ths->fCharForm ) ) {
866 		if ( ths->fBufferPtr > ths->fBufferLimit ) return eTriMaybe;
867 	}
868 
869 	return eTriYes;
870 
871 }	// CheckFinalNulls
872 
873 
874 // =================================================================================================
875 // SetNextRecognizer
876 // =================
877 
878 void
SetNextRecognizer(RecognizerKind nextRecognizer)879 XMPScanner::PacketMachine::SetNextRecognizer ( RecognizerKind nextRecognizer )
880 {
881 
882 	fRecognizer = nextRecognizer;
883 	fPosition = 0;
884 
885 }	// SetNextRecognizer
886 
887 
888 // =================================================================================================
889 // FindNextPacket
890 // ==============
891 
892 // *** When we start validating intervening nulls for 2 and 4 bytes characters, throw an exception
893 // *** for errors.  Don't return eTriNo, that might skip at an optional point.
894 
895 XMPScanner::PacketMachine::TriState
FindNextPacket()896 XMPScanner::PacketMachine::FindNextPacket ()
897 {
898 
899 	TriState	status;
900 
901 	#define kPacketHead		"?xpacket begin="
902 	#define kPacketID		"W5M0MpCehiHzreSzNTczkc9d"
903 	#define kPacketTail		"?xpacket end="
904 
905 	static const RecognizerInfo	recognizerTable [eRecognizerCount]	= {		// ! Would be safer to assign these explicitly.
906 
907 		// proc				successNext					failureNext					literal
908 
909 		{ NULL,				eFailureRecognizer,			eFailureRecognizer,			NULL},			// eFailureRecognizer
910 		{ NULL,				eSuccessRecognizer,			eSuccessRecognizer,			NULL},			// eSuccessRecognizer
911 
912 		{ FindLessThan,		eHeadStartRecorder,			eFailureRecognizer,			"H" },			// eLeadInRecognizer
913 		{ RecordStart,	 	eHeadStartRecognizer,		eLeadInRecognizer,			NULL },			// eHeadStartRecorder
914 		{ MatchString, 		eBOMRecognizer,				eLeadInRecognizer,			kPacketHead },	// eHeadStartRecognizer
915 
916 		{ RecognizeBOM, 	eIDTagRecognizer,			eLeadInRecognizer,			NULL },			// eBOMRecognizer
917 
918 		{ MatchString, 		eIDOpenRecognizer,			eLeadInRecognizer,			" id=" },		// eIDTagRecognizer
919 		{ MatchOpenQuote,	eIDValueRecognizer,			eLeadInRecognizer,			NULL },			// eIDOpenRecognizer
920 		{ MatchString, 		eIDCloseRecognizer,			eLeadInRecognizer,			kPacketID },	// eIDValueRecognizer
921 		{ MatchCloseQuote,	eAttrSpaceRecognizer_1,		eLeadInRecognizer,			NULL },			// eIDCloseRecognizer
922 
923 		{ MatchChar, 		eAttrNameRecognizer_1,		eHeadEndRecognizer,			" " },			// eAttrSpaceRecognizer_1
924 		{ CaptureAttrName,	eAttrValueRecognizer_1,		eLeadInRecognizer,			NULL },			// eAttrNameRecognizer_1
925 		{ CaptureAttrValue,	eAttrValueRecorder_1,		eLeadInRecognizer,			NULL },			// eAttrValueRecognizer_1
926 		{ RecordHeadAttr,	eAttrSpaceRecognizer_1,		eLeadInRecognizer,			NULL },			// eAttrValueRecorder_1
927 
928 		{ MatchString, 		eBodyRecognizer,			eLeadInRecognizer,			"?>" },			// eHeadEndRecognizer
929 
930 		{ FindLessThan,		eTailStartRecognizer,		eBodyRecognizer,			"T"},			// eBodyRecognizer
931 
932 		{ MatchString, 		eAccessValueRecognizer,		eBodyRecognizer,			kPacketTail },	// eTailStartRecognizer
933 		{ CaptureAccess,	eAttrSpaceRecognizer_2,		eBodyRecognizer,			NULL },			// eAccessValueRecognizer
934 
935 		{ MatchChar, 		eAttrNameRecognizer_2,		eTailEndRecognizer,			" " },			// eAttrSpaceRecognizer_2
936 		{ CaptureAttrName,	eAttrValueRecognizer_2,		eBodyRecognizer,			NULL },			// eAttrNameRecognizer_2
937 		{ CaptureAttrValue,	eAttrValueRecorder_2,		eBodyRecognizer,			NULL },			// eAttrValueRecognizer_2
938 		{ RecordTailAttr,	eAttrSpaceRecognizer_2,		eBodyRecognizer,			NULL },			// eAttrValueRecorder_2
939 
940 		{ MatchString, 		ePacketEndRecognizer,		eBodyRecognizer,			"?>" },			// eTailEndRecognizer
941 		{ CheckPacketEnd,	eCloseOutRecognizer,		eBodyRecognizer,			"" },			// ePacketEndRecognizer
942 		{ CheckFinalNulls,	eSuccessRecognizer,			eBodyRecognizer,			"" }			// eCloseOutRecognizer
943 
944 	};
945 
946 	while ( true ) {
947 
948 		switch ( fRecognizer ) {
949 
950 			case eFailureRecognizer :
951 				return eTriNo;
952 
953 			case eSuccessRecognizer :
954 				return eTriYes;
955 
956 			default :
957 
958 				// -------------------------------------------------------------------
959 				// For everything else, the normal cases, use the state machine table.
960 
961 				const RecognizerInfo *	thisState	= &recognizerTable [fRecognizer];
962 
963 				status = thisState->proc ( this, thisState->literal );
964 
965 				switch ( status ) {
966 
967 					case eTriNo :
968 						SetNextRecognizer ( thisState->failureNext );
969 						continue;
970 
971 					case eTriYes :
972 						SetNextRecognizer ( thisState->successNext );
973 						continue;
974 
975 					case eTriMaybe :
976 						fBufferOverrun = (unsigned char)(fBufferPtr - fBufferLimit);
977 						return eTriMaybe;	// Keep this recognizer intact, to be resumed later.
978 
979 				}
980 
981 		}	// switch ( fRecognizer ) { ...
982 
983 	}	// while ( true ) { ...
984 
985 }	// FindNextPacket
986 
987 
988 // =================================================================================================
989 // =================================================================================================
990 // class InternalSnip
991 // ==================
992 
993 
994 // =================================================================================================
995 // InternalSnip
996 // ============
997 
InternalSnip(XMP_Int64 offset,XMP_Int64 length)998 XMPScanner::InternalSnip::InternalSnip ( XMP_Int64 offset, XMP_Int64 length )
999 {
1000 
1001 	fInfo.fOffset = offset;
1002 	fInfo.fLength = length;
1003 
1004 }	// InternalSnip
1005 
1006 
1007 // =================================================================================================
1008 // InternalSnip
1009 // ============
1010 
InternalSnip(const InternalSnip & rhs)1011 XMPScanner::InternalSnip::InternalSnip ( const InternalSnip & rhs ) :
1012 	fInfo ( rhs.fInfo ),
1013 	fMachine ( NULL )
1014 {
1015 
1016 	assert ( rhs.fMachine.get() == NULL );	// Don't copy a snip with a machine.
1017 	assert ( (rhs.fInfo.fEncodingAttr == 0) || (*rhs.fInfo.fEncodingAttr == 0) ); // Don't copy a snip with an encoding.
1018 
1019 }	// InternalSnip
1020 
1021 
1022 // =================================================================================================
1023 // ~InternalSnip
1024 // =============
1025 
~InternalSnip()1026 XMPScanner::InternalSnip::~InternalSnip ()
1027 {
1028 }	// ~InternalSnip
1029 
1030 
1031 
1032 // =================================================================================================
1033 // =================================================================================================
1034 // class XMPScanner
1035 // ================
1036 
1037 
1038 // =================================================================================================
1039 // DumpSnipList
1040 // ============
1041 
#if DEBUG

// Printable names for the snip states, indexed by the fState value.
static const char *	snipStateName [6] = { "not-seen", "pending", "raw-data", "good-packet", "partial", "bad-packet" };

// Debugging aid: write the title and then one line per snip (address, state, byte range, length)
// to cout.
void
XMPScanner::DumpSnipList ( const char * title )
{
	cout << endl << title << " snip list: " << fInternalSnips.size() << endl;

	const InternalSnipIterator	listEnd	= fInternalSnips.end();
	InternalSnipIterator		iter	= fInternalSnips.begin();

	while ( iter != listEnd ) {
		SnipInfo & info = iter->fInfo;
		cout << '\t' << &info << ' ' << snipStateName[info.fState] << ' '
		     << info.fOffset << ".." << (info.fOffset + info.fLength - 1)
		     << ' ' << info.fLength << ' ' << endl;
		++iter;
	}
}	// DumpSnipList

#endif
1063 
1064 
1065 // =================================================================================================
1066 // PrevSnip and NextSnip
1067 // =====================
1068 
1069 XMPScanner::InternalSnipIterator
PrevSnip(InternalSnipIterator snipPos)1070 XMPScanner::PrevSnip ( InternalSnipIterator snipPos )
1071 {
1072 
1073 	InternalSnipIterator prev = snipPos;
1074 	return --prev;
1075 
1076 }	// PrevSnip
1077 
1078 XMPScanner::InternalSnipIterator
NextSnip(InternalSnipIterator snipPos)1079 XMPScanner::NextSnip ( InternalSnipIterator snipPos )
1080 {
1081 
1082 	InternalSnipIterator next = snipPos;
1083 	return ++next;
1084 
1085 }	// NextSnip
1086 
1087 
1088 // =================================================================================================
1089 // XMPScanner
1090 // ==========
1091 //
1092 // Initialize the scanner object with one "not seen" snip covering the whole stream.
1093 
XMPScanner(XMP_Int64 streamLength)1094 XMPScanner::XMPScanner ( XMP_Int64 streamLength ) :
1095 
1096 	fStreamLength ( streamLength )
1097 
1098 {
1099 	InternalSnip	rootSnip ( 0, streamLength );
1100 
1101 	if ( streamLength > 0 ) fInternalSnips.push_front ( rootSnip );		// Be nice for empty files.
1102 	// DumpSnipList ( "New XMPScanner" );
1103 
1104 }	// XMPScanner
1105 
1106 
1107 // =================================================================================================
1108 // ~XMPScanner
1109 // ===========
1110 
~XMPScanner()1111 XMPScanner::~XMPScanner()
1112 {
1113 
1114 }	// ~XMPScanner
1115 
1116 
1117 // =================================================================================================
1118 // GetSnipCount
1119 // ============
1120 
1121 long
GetSnipCount()1122 XMPScanner::GetSnipCount ()
1123 {
1124 
1125 	return (long)fInternalSnips.size();
1126 
1127 }	// GetSnipCount
1128 
1129 
1130 // =================================================================================================
1131 // StreamAllScanned
1132 // ================
1133 
1134 bool
StreamAllScanned()1135 XMPScanner::StreamAllScanned ()
1136 {
1137 	InternalSnipIterator currPos = fInternalSnips.begin();
1138 	InternalSnipIterator endPos  = fInternalSnips.end();
1139 
1140 	for ( ; currPos != endPos; ++currPos ) {
1141 		if ( currPos->fInfo.fState == eNotSeenSnip ) return false;
1142 	}
1143 	return true;
1144 
1145 }	// StreamAllScanned
1146 
1147 
1148 // =================================================================================================
1149 // SplitInternalSnip
1150 // =================
1151 //
1152 // Split the given snip into up to 3 pieces.  The new pieces are inserted before and after this one
1153 // in the snip list.  The relOffset is the first byte to be kept, it is relative to this snip.  If
// the preceding or following snips have the same state as this one, just shift the boundaries.
1155 // I.e. move the contents from one snip to the other, don't create a new snip.
1156 
1157 // *** To be thread safe we ought to lock the entire list during manipulation.  Let data scanning
1158 // *** happen in parallel, serialize all mucking with the list.
1159 
1160 void
SplitInternalSnip(InternalSnipIterator snipPos,XMP_Int64 relOffset,XMP_Int64 newLength)1161 XMPScanner::SplitInternalSnip ( InternalSnipIterator snipPos, XMP_Int64 relOffset, XMP_Int64 newLength )
1162 {
1163 
1164 	assert ( (relOffset + newLength) > relOffset );	// Check for overflow.
1165 	assert ( (relOffset + newLength) <= snipPos->fInfo.fLength );
1166 
1167 	// -----------------------------------
1168 	// First deal with the low offset end.
1169 
1170 	if ( relOffset > 0 ) {
1171 
1172 		InternalSnipIterator prevPos;
1173 		if ( snipPos != fInternalSnips.begin() ) prevPos = PrevSnip ( snipPos );
1174 
1175 		if ( (snipPos != fInternalSnips.begin()) && (snipPos->fInfo.fState == prevPos->fInfo.fState) ) {
1176 			prevPos->fInfo.fLength += relOffset;	// Adjust the preceeding snip.
1177 		} else {
1178 			InternalSnip headExcess ( snipPos->fInfo.fOffset, relOffset );
1179 			headExcess.fInfo.fState = snipPos->fInfo.fState;
1180 			headExcess.fInfo.fOutOfOrder = snipPos->fInfo.fOutOfOrder;
1181 			fInternalSnips.insert ( snipPos, headExcess );	// Insert the head piece before the middle piece.
1182 		}
1183 
1184 		snipPos->fInfo.fOffset += relOffset;	// Adjust the remainder of this snip.
1185 		snipPos->fInfo.fLength -= relOffset;
1186 
1187 	}
1188 
1189 	// ----------------------------------
1190 	// Now deal with the high offset end.
1191 
1192 	if ( newLength < snipPos->fInfo.fLength ) {
1193 
1194 		InternalSnipIterator nextPos    = NextSnip ( snipPos );
1195 		const XMP_Int64      tailLength = snipPos->fInfo.fLength - newLength;
1196 
1197 		if ( (nextPos != fInternalSnips.end()) && (snipPos->fInfo.fState == nextPos->fInfo.fState) ) {
1198 			nextPos->fInfo.fOffset -= tailLength;		// Adjust the following snip.
1199 			nextPos->fInfo.fLength += tailLength;
1200 		} else {
1201 			InternalSnip tailExcess ( (snipPos->fInfo.fOffset + newLength), tailLength );
1202 			tailExcess.fInfo.fState = snipPos->fInfo.fState;
1203 			tailExcess.fInfo.fOutOfOrder = snipPos->fInfo.fOutOfOrder;
1204 			fInternalSnips.insert ( nextPos, tailExcess );		// Insert the tail piece after the middle piece.
1205 		}
1206 
1207 		snipPos->fInfo.fLength = newLength;
1208 
1209 	}
1210 
1211 }	// SplitInternalSnip
1212 
1213 
1214 // =================================================================================================
1215 // MergeInternalSnips
1216 // ==================
1217 
1218 XMPScanner::InternalSnipIterator
MergeInternalSnips(InternalSnipIterator firstPos,InternalSnipIterator secondPos)1219 XMPScanner::MergeInternalSnips ( InternalSnipIterator firstPos, InternalSnipIterator secondPos )
1220 {
1221 
1222 	firstPos->fInfo.fLength += secondPos->fInfo.fLength;
1223 	fInternalSnips.erase ( secondPos );
1224 	return firstPos;
1225 
1226 }	// MergeInternalSnips
1227 
1228 
1229 // =================================================================================================
1230 // Scan
1231 // ====
1232 
1233 void
Scan(const void * bufferOrigin,XMP_Int64 bufferOffset,XMP_Int64 bufferLength)1234 XMPScanner::Scan ( const void * bufferOrigin, XMP_Int64 bufferOffset, XMP_Int64 bufferLength )
1235 {
1236 	XMP_Int64	relOffset;
1237 
1238 	#if 0
1239 		cout << "Scan: @ " << bufferOrigin << ", " << bufferOffset << ", " << bufferLength << endl;
1240 	#endif
1241 
1242 	if ( bufferLength == 0 ) return;
1243 
1244 	// ----------------------------------------------------------------
1245 	// These comparisons are carefully done to avoid overflow problems.
1246 
1247 	if ( (bufferOffset >= fStreamLength) ||
1248 		 (bufferLength > (fStreamLength - bufferOffset)) ||
1249 		 (bufferOrigin == 0) ) {
1250 		throw ScanError ( "Bad origin, offset, or length" );
1251 	}
1252 
1253 	// ----------------------------------------------------------------------------------------------
1254 	// This buffer must be within a not-seen snip.  Find it and split it.  The first snip whose whose
1255 	// end is beyond the buffer must be the enclosing one.
1256 
1257 	// *** It would be friendly for rescans for out of order problems to accept any buffer postion.
1258 
1259 	const XMP_Int64			endOffset	= bufferOffset + bufferLength - 1;
1260 	InternalSnipIterator	snipPos	= fInternalSnips.begin();
1261 
1262 	while ( endOffset > (snipPos->fInfo.fOffset + snipPos->fInfo.fLength - 1) ) ++ snipPos;
1263 	if ( snipPos->fInfo.fState != eNotSeenSnip ) throw ScanError ( "Already seen" );
1264 
1265 	relOffset = bufferOffset - snipPos->fInfo.fOffset;
1266 	if ( (relOffset + bufferLength) > snipPos->fInfo.fLength ) throw ScanError ( "Not within existing snip" );
1267 
1268 	SplitInternalSnip ( snipPos, relOffset, bufferLength );		// *** If sequential & prev is partial, just tack on,
1269 
1270 	// --------------------------------------------------------
1271 	// Merge this snip with the preceeding snip if appropriate.
1272 
1273 	// *** When out of order I/O is supported we have to do something about buffers who's predecessor is not seen.
1274 
1275 	if ( snipPos->fInfo.fOffset > 0 ) {
1276 		InternalSnipIterator prevPos = PrevSnip ( snipPos );
1277 		if ( prevPos->fInfo.fState == ePartialPacketSnip ) snipPos = MergeInternalSnips ( prevPos, snipPos );
1278 	}
1279 
1280 	// ----------------------------------
1281 	// Look for packets within this snip.
1282 
1283 	snipPos->fInfo.fState = ePendingSnip;
1284 	PacketMachine* thisMachine = snipPos->fMachine.get();
1285 	// DumpSnipList ( "Before scan" );
1286 
1287 	if ( thisMachine != 0 ) {
1288 		thisMachine->AssociateBuffer ( bufferOffset, bufferOrigin, bufferLength );
1289 	} else {
1290 		// *** snipPos->fMachine.reset ( new PacketMachine ( bufferOffset, bufferOrigin, bufferLength ) );		VC++ lacks reset
1291 		#if 0
1292 			snipPos->fMachine = auto_ptr<PacketMachine> ( new PacketMachine ( bufferOffset, bufferOrigin, bufferLength ) );
1293 		#else
1294 			{
1295 				// Some versions of gcc complain about the assignment operator above.  This avoids the gcc bug.
1296 				PacketMachine *	pm	= new PacketMachine ( bufferOffset, bufferOrigin, bufferLength );
1297 				auto_ptr<PacketMachine>	ap ( pm );
1298 				snipPos->fMachine = ap;
1299 			}
1300 		#endif
1301 		thisMachine = snipPos->fMachine.get();
1302 	}
1303 
1304 	bool	bufferDone	= false;
1305 	while ( ! bufferDone ) {
1306 
1307 		PacketMachine::TriState	foundPacket = thisMachine->FindNextPacket();
1308 
1309 		if ( foundPacket == PacketMachine::eTriNo ) {
1310 
1311 			// -----------------------------------------------------------------------
1312 			// No packet, mark the snip as raw data and get rid of the packet machine.
1313 			// We're done with this buffer.
1314 
1315 			snipPos->fInfo.fState = eRawInputSnip;
1316 			#if 0
1317 				snipPos->fMachine = auto_ptr<PacketMachine>();	// *** snipPos->fMachine.reset();	VC++ lacks reset
1318 			#else
1319 				{
1320 					// Some versions of gcc complain about the assignment operator above.  This avoids the gcc bug.
1321 					auto_ptr<PacketMachine>	ap ( 0 );
1322 					snipPos->fMachine = ap;
1323 				}
1324 			#endif
1325 			bufferDone = true;
1326 
1327 		} else {
1328 
1329 			// ---------------------------------------------------------------------------------------------
1330 			// Either a full or partial packet.  First trim any excess off of the front as a raw input snip.
1331 			// If this is a partial packet mark the snip and keep the packet machine to be resumed later.
1332 			// We're done with this buffer, the partial packet by definition extends to the end.  If this is
1333 			// a complete packet first extract the additional information from the packet machine.  If there
1334 			// is leftover data split the snip and transfer the packet machine to the new trailing snip.
1335 
1336 			if ( thisMachine->fPacketStart > snipPos->fInfo.fOffset ) {
1337 
1338 				// There is data at the front of the current snip that must be trimmed.
1339 				SnipState	savedState	= snipPos->fInfo.fState;
1340 				snipPos->fInfo.fState = eRawInputSnip;	// ! So it gets propagated to the trimmed front part.
1341 				relOffset = thisMachine->fPacketStart - snipPos->fInfo.fOffset;
1342 				SplitInternalSnip ( snipPos, relOffset, (snipPos->fInfo.fLength - relOffset) );
1343 				snipPos->fInfo.fState = savedState;
1344 
1345 			}
1346 
1347 			if ( foundPacket == PacketMachine::eTriMaybe ) {
1348 
1349 				// We have only found a partial packet.
1350 				snipPos->fInfo.fState = ePartialPacketSnip;
1351 				bufferDone = true;
1352 
1353 			} else {
1354 
1355 				// We have found a complete packet. Extract all the info for it and split any trailing data.
1356 
1357 				InternalSnipIterator	packetSnip	= snipPos;
1358 				SnipState				packetState	= eValidPacketSnip;
1359 
1360 				if ( thisMachine->fBogusPacket ) packetState = eBadPacketSnip;
1361 
1362 				packetSnip->fInfo.fAccess = thisMachine->fAccess;
1363 				packetSnip->fInfo.fCharForm = thisMachine->fCharForm;
1364 				packetSnip->fInfo.fBytesAttr = thisMachine->fBytesAttr;
1365 				packetSnip->fInfo.fEncodingAttr = thisMachine->fEncodingAttr.c_str();
1366 				thisMachine->fEncodingAttr.erase ( thisMachine->fEncodingAttr.begin(), thisMachine->fEncodingAttr.end() );
1367 
1368 				if ( (thisMachine->fCharForm != eChar8Bit) && CharFormIsBigEndian ( thisMachine->fCharForm ) ) {
1369 
1370 					// ------------------------------------------------------------------------------
1371 					// Handle a special case for big endian characters.  The packet machine works as
1372 					// though things were little endian.  The packet starting offset points to the
1373 					// byte containing the opening '<', and the length includes presumed nulls that
1374 					// follow the last "real" byte.  If the characters are big endian we now have to
1375 					// decrement the starting offset of the packet, and also decrement the length of
1376 					// the previous snip.
1377 					//
1378 					// Note that we can't do this before the head trimming above in general.  The
1379 					// nulls might have been exactly at the end of a buffer and already in the
1380 					// previous snip.  We are doing this before trimming the tail from the raw snip
1381 					// containing the packet.  We adjust the raw snip's size because it ends with
1382 					// the input buffer.  We don't adjust the packet's size, it is already correct.
1383 					//
1384 					// The raw snip (the one before the packet) might entirely disappear.  A simple
1385 					// example of this is when the packet is at the start of the file.
1386 
1387 					assert ( packetSnip != fInternalSnips.begin() );	// Leading nulls were trimmed!
1388 
1389 					if ( packetSnip != fInternalSnips.begin() ) {	// ... but let's program defensibly.
1390 
1391 						InternalSnipIterator prevSnip  = PrevSnip ( packetSnip );
1392 						const unsigned int nullsToAdd = ( CharFormIs16Bit ( thisMachine->fCharForm ) ? 1 : 3 );
1393 
1394 						assert ( nullsToAdd <= prevSnip->fInfo.fLength );
1395 						prevSnip->fInfo.fLength -= nullsToAdd;
1396 						if ( prevSnip->fInfo.fLength == 0 ) (void) fInternalSnips.erase ( prevSnip );
1397 
1398 						packetSnip->fInfo.fOffset	-= nullsToAdd;
1399 						packetSnip->fInfo.fLength	+= nullsToAdd;
1400 						thisMachine->fPacketStart	-= nullsToAdd;
1401 
1402 					}
1403 
1404 				}
1405 
1406 				if ( thisMachine->fPacketLength == snipPos->fInfo.fLength ) {
1407 
1408 					// This packet ends exactly at the end of the current snip.
1409 					#if 0
1410 						snipPos->fMachine = auto_ptr<PacketMachine>();	// *** snipPos->fMachine.reset();	VC++ lacks reset
1411 					#else
1412 						{
1413 							// Some versions of gcc complain about the assignment operator above.  This avoids the gcc bug.
1414 							auto_ptr<PacketMachine>	ap ( 0 );
1415 							snipPos->fMachine = ap;
1416 						}
1417 					#endif
1418 					bufferDone = true;
1419 
1420 				} else {
1421 
1422 					// There is trailing data to split from the just found packet.
1423 					SplitInternalSnip ( snipPos, 0, thisMachine->fPacketLength );
1424 
1425 					InternalSnipIterator	tailPos	= NextSnip ( snipPos );
1426 
1427 					tailPos->fMachine = snipPos->fMachine;	// auto_ptr assignment - taking ownership
1428 					thisMachine->ResetMachine ();
1429 
1430 					snipPos = tailPos;
1431 
1432 				}
1433 
1434 				packetSnip->fInfo.fState = packetState;	// Do this last to avoid messing up the tail split.
1435 				// DumpSnipList ( "Found a packet" );
1436 
1437 
1438 			}
1439 
1440 		}
1441 
1442 	}
1443 
1444 	// --------------------------------------------------------
1445 	// Merge this snip with the preceeding snip if appropriate.
1446 
1447 	// *** When out of order I/O is supported we have to check the following snip too.
1448 
1449 	if ( (snipPos->fInfo.fOffset > 0) && (snipPos->fInfo.fState == eRawInputSnip) ) {
1450 		InternalSnipIterator prevPos = PrevSnip ( snipPos );
1451 		if ( prevPos->fInfo.fState == eRawInputSnip ) snipPos = MergeInternalSnips ( prevPos, snipPos );
1452 	}
1453 
1454 	// DumpSnipList ( "After scan" );
1455 
1456 }	// Scan
1457 
1458 
1459 // =================================================================================================
1460 // Report
1461 // ======
1462 
1463 void
Report(SnipInfoVector & snips)1464 XMPScanner::Report ( SnipInfoVector& snips )
1465 {
1466 	const int				count	= (int)fInternalSnips.size();
1467 	InternalSnipIterator	snipPos	= fInternalSnips.begin();
1468 
1469 	int	s;
1470 
1471 	// DumpSnipList ( "Report" );
1472 
1473 	snips.erase ( snips.begin(), snips.end() );		// ! Should use snips.clear, but VC++ doesn't have it.
1474 	snips.reserve ( count );
1475 
1476 	for ( s = 0; s < count; s += 1 ) {
1477 		snips.push_back ( SnipInfo ( snipPos->fInfo.fState, snipPos->fInfo.fOffset, snipPos->fInfo.fLength ) );
1478 		snips[s] = snipPos->fInfo;	// Pick up all of the fields.
1479 		++ snipPos;
1480 	}
1481 
1482 }	// Report
1483