1 // =================================================================================================
2 // Copyright 2002-2007 Adobe Systems Incorporated
3 // All Rights Reserved.
4 //
5 // NOTICE: Adobe permits you to use, modify, and distribute this file in accordance with the terms
6 // of the Adobe license agreement accompanying it.
7 //
8 // Adobe patent application tracking #P435, entitled 'Unique markers to simplify embedding data of
9 // one format in a file with a different format', inventors: Sean Parent, Greg Gilley.
10 // =================================================================================================
11
12 #include <cstring>
13
14 #if WIN32
15 #pragma warning ( disable : 4127 ) // conditional expression is constant
16 #pragma warning ( disable : 4510 ) // default constructor could not be generated
17 #pragma warning ( disable : 4610 ) // user defined constructor required
18 #pragma warning ( disable : 4786 ) // debugger can't handle long symbol names
19 #endif
20
21
22 #include "XMPScanner.hpp"
23
24 #include <cassert>
25 #include <string>
26 #include <cstdlib>
27
28 #if DEBUG
29 #include <iostream>
30 #include <iomanip>
31 #include <fstream>
32 #endif
33
34
35 #ifndef UseStringPushBack // VC++ 6.x does not provide push_back for strings!
36 #define UseStringPushBack 0
37 #endif
38
39
40 using namespace std;
41
42
43 // *** Consider Boyer-Moore style search for "<?xpacket begin=". It isn't an obvious win, the
44 // *** additional code might be slower than scanning every character. Especially if we will
45 // *** read every cache line anyway.
46
47
48 // =================================================================================================
49 // =================================================================================================
50 // class PacketMachine
51 // ===================
52 //
53 // This is the packet recognizer state machine. The top of the machine is FindNextPacket, this
54 // calls the specific state components and handles transitions. The states are described by an
55 // array of RecognizerInfo records, indexed by the RecognizerKind enumeration. Each RecognizerInfo
56 // record has a function that does that state's work, the success and failure transition states,
57 // and a string literal that is passed to the state function. The literal lets a common MatchChar
58 // or MatchString function be used in several places.
59 //
60 // The state functions are responsible for consuming input to recognize their particular state.
// This includes intervening nulls for 16 and 32 bit character forms. For simplicity, things
62 // are treated as essentially little endian and the nulls are not actually checked. The opening
63 // '<' is found with a byte-by-byte search, then the number of bytes per character is determined
64 // by counting the following nulls. From then on, consuming a character means incrementing the
65 // buffer pointer by the number of bytes per character. Thus the buffer pointer only points to
66 // the "real" bytes. This also means that the pointer can go off the end of the buffer by a
67 // variable amount. The amount of overrun is saved so that the pointer can be positioned at the
68 // right byte to start the next buffer.
69 //
70 // The state functions return a TriState value, eTriYes means the pattern was found, eTriNo means
71 // the pattern was definitely not found, eTriMaybe means that the end of the buffer was reached
72 // while working through the pattern.
73 //
74 // When eTriYes is returned, the fBufferPtr data member is left pointing to the "real" byte
75 // following the last actual byte. Which might not be addressable memory! This also means that
76 // a state function can be entered with nothing available in the buffer. When eTriNo is returned,
77 // the fBufferPtr data member is left pointing to the byte that caused the failure. The state
78 // machine starts over from the failure byte.
79 //
80 // The state functions must preserve their internal micro-state before returning eTriMaybe, and
81 // resume processing when called with the next buffer. The fPosition data member is used to denote
82 // how many actual characters have been consumed. The fNullCount data member is used to denote how
83 // many nulls are left before the next actual character.
84
85
86 // =================================================================================================
87 // PacketMachine
88 // =============
89
PacketMachine(XMP_Int64 bufferOffset,const void * bufferOrigin,XMP_Int64 bufferLength)90 XMPScanner::PacketMachine::PacketMachine ( XMP_Int64 bufferOffset, const void * bufferOrigin, XMP_Int64 bufferLength ) :
91
92 // Public members
93 fPacketStart ( 0 ),
94 fPacketLength ( 0 ),
95 fBytesAttr ( -1 ),
96 fCharForm ( eChar8Bit ),
97 fAccess ( ' ' ),
98 fBogusPacket ( false ),
99
100 // Private members
101 fBufferOffset ( bufferOffset ),
102 fBufferOrigin ( (const char *) bufferOrigin ),
103 fBufferPtr ( fBufferOrigin ),
104 fBufferLimit ( fBufferOrigin + bufferLength ),
105 fRecognizer ( eLeadInRecognizer ),
106 fPosition ( 0 ),
107 fBytesPerChar ( 1 ),
108 fBufferOverrun ( 0 ),
109 fQuoteChar ( ' ' )
110
111 {
112 /*
113 REVIEW NOTES : Should the buffer stuff be in a class?
114 */
115
116 assert ( bufferOrigin != NULL );
117 assert ( bufferLength != 0 );
118
119 } // PacketMachine
120
121
122 // =================================================================================================
123 // ~PacketMachine
124 // ==============
125
~PacketMachine()126 XMPScanner::PacketMachine::~PacketMachine ()
127 {
128
129 // An empty placeholder.
130
131 } // ~PacketMachine
132
133
134 // =================================================================================================
135 // AssociateBuffer
136 // ===============
137
138 void
AssociateBuffer(XMP_Int64 bufferOffset,const void * bufferOrigin,XMP_Int64 bufferLength)139 XMPScanner::PacketMachine::AssociateBuffer ( XMP_Int64 bufferOffset, const void * bufferOrigin, XMP_Int64 bufferLength )
140 {
141
142 fBufferOffset = bufferOffset;
143 fBufferOrigin = (const char *) bufferOrigin;
144 fBufferPtr = fBufferOrigin + fBufferOverrun;
145 fBufferLimit = fBufferOrigin + bufferLength;
146
147 } // AssociateBuffer
148
149
150 // =================================================================================================
151 // ResetMachine
152 // ============
153
154 void
ResetMachine()155 XMPScanner::PacketMachine::ResetMachine ()
156 {
157
158 fRecognizer = eLeadInRecognizer;
159 fPosition = 0;
160 fBufferOverrun = 0;
161 fCharForm = eChar8Bit;
162 fBytesPerChar = 1;
163 fAccess = ' ';
164 fBytesAttr = -1;
165 fBogusPacket = false;
166
167 fAttrName.erase ( fAttrName.begin(), fAttrName.end() );
168 fAttrValue.erase ( fAttrValue.begin(), fAttrValue.end() );
169 fEncodingAttr.erase ( fEncodingAttr.begin(), fEncodingAttr.end() );
170
171 } // ResetMachine
172
173
174 // =================================================================================================
175 // FindLessThan
176 // ============
177
178 XMPScanner::PacketMachine::TriState
FindLessThan(PacketMachine * ths,const char * which)179 XMPScanner::PacketMachine::FindLessThan ( PacketMachine * ths, const char * which )
180 {
181
182 if ( *which == 'H' ) {
183
184 // --------------------------------------------------------------------------------
185 // We're looking for the '<' of the header. If we fail there is no packet in this
186 // part of the input, so return eTriNo.
187
188 ths->fCharForm = eChar8Bit; // We might have just failed from a bogus 16 or 32 bit case.
189 ths->fBytesPerChar = 1;
190
191 while ( ths->fBufferPtr < ths->fBufferLimit ) { // Don't skip nulls for the header's '<'!
192 if ( *ths->fBufferPtr == '<' ) break;
193 ths->fBufferPtr++;
194 }
195
196 if ( ths->fBufferPtr >= ths->fBufferLimit ) return eTriNo;
197 ths->fBufferPtr++;
198 return eTriYes;
199
200 } else {
201
202 // --------------------------------------------------------------------------------
203 // We're looking for the '<' of the trailer. We're already inside the packet body,
204 // looking for the trailer. So here if we fail we must return eTriMaybe so that we
205 // keep looking for the trailer in the next buffer.
206
207 const int bytesPerChar = ths->fBytesPerChar;
208
209 while ( ths->fBufferPtr < ths->fBufferLimit ) {
210 if ( *ths->fBufferPtr == '<' ) break;
211 ths->fBufferPtr += bytesPerChar;
212 }
213
214 if ( ths->fBufferPtr >= ths->fBufferLimit ) return eTriMaybe;
215 ths->fBufferPtr += bytesPerChar;
216 return eTriYes;
217
218 }
219
220 } // FindLessThan
221
222
223 // =================================================================================================
224 // MatchString
225 // ===========
226
227 XMPScanner::PacketMachine::TriState
MatchString(PacketMachine * ths,const char * literal)228 XMPScanner::PacketMachine::MatchString ( PacketMachine * ths, const char * literal )
229 {
230 const int bytesPerChar = ths->fBytesPerChar;
231 const char * litPtr = literal + ths->fPosition;
232 const XMP_Int32 charsToGo = (XMP_Int32) strlen ( literal ) - ths->fPosition;
233 int charsDone = 0;
234
235 while ( (charsDone < charsToGo) && (ths->fBufferPtr < ths->fBufferLimit) ) {
236 if ( *litPtr != *ths->fBufferPtr ) return eTriNo;
237 charsDone++;
238 litPtr++;
239 ths->fBufferPtr += bytesPerChar;
240 }
241
242 if ( charsDone == charsToGo ) return eTriYes;
243 ths->fPosition += charsDone;
244 return eTriMaybe;
245
246 } // MatchString
247
248
249 // =================================================================================================
250 // MatchChar
251 // =========
252
253 XMPScanner::PacketMachine::TriState
MatchChar(PacketMachine * ths,const char * literal)254 XMPScanner::PacketMachine::MatchChar ( PacketMachine * ths, const char * literal )
255 {
256 const int bytesPerChar = ths->fBytesPerChar;
257
258 if ( ths->fBufferPtr >= ths->fBufferLimit ) return eTriMaybe;
259
260 const char currChar = *ths->fBufferPtr;
261 if ( currChar != *literal ) return eTriNo;
262 ths->fBufferPtr += bytesPerChar;
263 return eTriYes;
264
265 } // MatchChar
266
267
268 // =================================================================================================
269 // MatchOpenQuote
270 // ==============
271
272 XMPScanner::PacketMachine::TriState
MatchOpenQuote(PacketMachine * ths,const char *)273 XMPScanner::PacketMachine::MatchOpenQuote ( PacketMachine * ths, const char * /* unused */ )
274 {
275 const int bytesPerChar = ths->fBytesPerChar;
276
277 if ( ths->fBufferPtr >= ths->fBufferLimit ) return eTriMaybe;
278
279 const char currChar = *ths->fBufferPtr;
280 if ( (currChar != '\'') && (currChar != '"') ) return eTriNo;
281 ths->fQuoteChar = currChar;
282 ths->fBufferPtr += bytesPerChar;
283 return eTriYes;
284
285 } // MatchOpenQuote
286
287
288 // =================================================================================================
289 // MatchCloseQuote
290 // ===============
291
292 XMPScanner::PacketMachine::TriState
MatchCloseQuote(PacketMachine * ths,const char *)293 XMPScanner::PacketMachine::MatchCloseQuote ( PacketMachine * ths, const char * /* unused */ )
294 {
295
296 return MatchChar ( ths, &ths->fQuoteChar );
297
298 } // MatchCloseQuote
299
300
301 // =================================================================================================
302 // CaptureAttrName
303 // ===============
304
305 XMPScanner::PacketMachine::TriState
CaptureAttrName(PacketMachine * ths,const char *)306 XMPScanner::PacketMachine::CaptureAttrName ( PacketMachine * ths, const char * /* unused */ )
307 {
308 const int bytesPerChar = ths->fBytesPerChar;
309 char currChar;
310
311 if ( ths->fPosition == 0 ) { // Get the first character in the name.
312
313 if ( ths->fBufferPtr >= ths->fBufferLimit ) return eTriMaybe;
314
315 currChar = *ths->fBufferPtr;
316 if ( ths->fAttrName.size() == 0 ) {
317 if ( ! ( ( ('a' <= currChar) && (currChar <= 'z') ) ||
318 ( ('A' <= currChar) && (currChar <= 'Z') ) ||
319 (currChar == '_') || (currChar == ':') ) ) {
320 return eTriNo;
321 }
322 }
323
324 ths->fAttrName.erase ( ths->fAttrName.begin(), ths->fAttrName.end() );
325 #if UseStringPushBack
326 ths->fAttrName.push_back ( currChar );
327 #else
328 ths->fAttrName.insert ( ths->fAttrName.end(), currChar );
329 #endif
330 ths->fBufferPtr += bytesPerChar;
331
332 }
333
334 while ( ths->fBufferPtr < ths->fBufferLimit ) { // Get the remainder of the name.
335
336 currChar = *ths->fBufferPtr;
337 if ( ! ( ( ('a' <= currChar) && (currChar <= 'z') ) ||
338 ( ('A' <= currChar) && (currChar <= 'Z') ) ||
339 ( ('0' <= currChar) && (currChar <= '9') ) ||
340 (currChar == '-') || (currChar == '.') || (currChar == '_') || (currChar == ':') ) ) {
341 break;
342 }
343
344 #if UseStringPushBack
345 ths->fAttrName.push_back ( currChar );
346 #else
347 ths->fAttrName.insert ( ths->fAttrName.end(), currChar );
348 #endif
349 ths->fBufferPtr += bytesPerChar;
350
351 }
352
353 if ( ths->fBufferPtr < ths->fBufferLimit ) return eTriYes;
354 ths->fPosition = (long) ths->fAttrName.size(); // The name might span into the next buffer.
355 return eTriMaybe;
356
357 } // CaptureAttrName
358
359
360 // =================================================================================================
361 // CaptureAttrValue
362 // ================
363 //
364 // Recognize the equal sign and the quoted string value, capture the value along the way.
365
366 XMPScanner::PacketMachine::TriState
CaptureAttrValue(PacketMachine * ths,const char *)367 XMPScanner::PacketMachine::CaptureAttrValue ( PacketMachine * ths, const char * /* unused */ )
368 {
369 const int bytesPerChar = ths->fBytesPerChar;
370 char currChar = 0;
371 TriState result = eTriMaybe;
372
373 if ( ths->fBufferPtr >= ths->fBufferLimit ) return eTriMaybe;
374
375 switch ( ths->fPosition ) {
376
377 case 0 : // The name should haved ended at the '=', nulls already skipped.
378
379 if ( ths->fBufferPtr >= ths->fBufferLimit ) return eTriMaybe;
380 if ( *ths->fBufferPtr != '=' ) return eTriNo;
381 ths->fBufferPtr += bytesPerChar;
382 ths->fPosition = 1;
383 // fall through OK because MatchOpenQuote will check the buffer limit and nulls ...
384
385 case 1 : // Look for the open quote.
386
387 result = MatchOpenQuote ( ths, NULL );
388 if ( result != eTriYes ) return result;
389 ths->fPosition = 2;
390 // fall through OK because the buffer limit and nulls are checked below ...
391
392 default : // Look for the close quote, capturing the value along the way.
393
394 assert ( ths->fPosition == 2 );
395
396 const char quoteChar = ths->fQuoteChar;
397
398 while ( ths->fBufferPtr < ths->fBufferLimit ) {
399 currChar = *ths->fBufferPtr;
400 if ( currChar == quoteChar ) break;
401 #if UseStringPushBack
402 ths->fAttrValue.push_back ( currChar );
403 #else
404 ths->fAttrValue.insert ( ths->fAttrValue.end(), currChar );
405 #endif
406 ths->fBufferPtr += bytesPerChar;
407 }
408
409 if ( ths->fBufferPtr >= ths->fBufferLimit ) return eTriMaybe;
410 assert ( currChar == quoteChar );
411 ths->fBufferPtr += bytesPerChar; // Advance past the closing quote.
412 return eTriYes;
413
414 }
415
416 } // CaptureAttrValue
417
418
419 // =================================================================================================
420 // RecordStart
421 // ===========
422 //
423 // Note that this routine looks at bytes, not logical characters. It has to figure out how many
424 // bytes per character there are so that the other recognizers can skip intervening nulls.
425
426 XMPScanner::PacketMachine::TriState
RecordStart(PacketMachine * ths,const char *)427 XMPScanner::PacketMachine::RecordStart ( PacketMachine * ths, const char * /* unused */ )
428 {
429
430 while ( true ) {
431
432 if ( ths->fBufferPtr >= ths->fBufferLimit ) return eTriMaybe;
433
434 const char currByte = *ths->fBufferPtr;
435
436 switch ( ths->fPosition ) {
437
438 case 0 : // Record the length.
439 assert ( ths->fCharForm == eChar8Bit );
440 assert ( ths->fBytesPerChar == 1 );
441 ths->fPacketStart = ths->fBufferOffset + ((ths->fBufferPtr - 1) - ths->fBufferOrigin);
442 ths->fPacketLength = 0;
443 ths->fPosition = 1;
444 // ! OK to fall through here, we didn't consume a byte in this step.
445
446 case 1 : // Look for the first null byte.
447 if ( currByte != 0 ) return eTriYes; // No nulls found.
448 ths->fCharForm = eChar16BitBig; // Assume 16 bit big endian for now.
449 ths->fBytesPerChar = 2;
450 ths->fBufferPtr++;
451 ths->fPosition = 2;
452 break; // ! Don't fall through, have to check for the end of the buffer between each byte.
453
454 case 2 : // One null was found, look for a second.
455 if ( currByte != 0 ) return eTriYes; // Just one null found.
456 ths->fBufferPtr++;
457 ths->fPosition = 3;
458 break;
459
460 case 3 : // Two nulls were found, look for a third.
461 if ( currByte != 0 ) return eTriNo; // Just two nulls is not valid.
462 ths->fCharForm = eChar32BitBig; // Assume 32 bit big endian for now.
463 ths->fBytesPerChar = 4;
464 ths->fBufferPtr++;
465 return eTriYes;
466 break;
467
468 }
469
470 }
471
472 } // RecordStart
473
474
475 // =================================================================================================
476 // RecognizeBOM
477 // ============
478 //
479 // Recognizing the byte order marker is a surprisingly messy thing to do. It can't be done by the
480 // normal string matcher, there are no intervening nulls. There are 4 transitions after the opening
481 // quote, the closing quote or one of the three encodings. For the actual BOM there are then 1 or 2
482 // following bytes that depend on which of the encodings we're in. Not to mention that the buffer
483 // might end at any point.
484 //
485 // The intervening null count done earlier determined 8, 16, or 32 bits per character, but not the
486 // big or little endian nature for the 16/32 bit cases. The BOM must be present for the 16 and 32
487 // bit cases in order to determine the endian mode. There are six possible byte sequences for the
488 // quoted BOM string, ignoring the differences for quoting with ''' versus '"'.
489 //
490 // Keep in mind that for the 16 and 32 bit cases there will be nulls for the quote. In the table
491 // below the symbol <quote> means just the one byte containing the ''' or '"'. The nulls for the
492 // quote character are explicitly shown.
493 //
494 // <quote> <quote> - 1: No BOM, this must be an 8 bit case.
495 // <quote> \xEF \xBB \xBF <quote> - 1.12-13: The 8 bit form.
496 //
497 // <quote> \xFE \xFF \x00 <quote> - 1.22-23: The 16 bit, big endian form
498 // <quote> \x00 \xFF \xFE <quote> - 1.32-33: The 16 bit, little endian form.
499 //
500 // <quote> \x00 \x00 \xFE \xFF \x00 \x00 \x00 <quote> - 1.32.43-45.56-57: The 32 bit, big endian form.
501 // <quote> \x00 \x00 \x00 \xFF \xFE \x00 \x00 <quote> - 1.32.43.54-57: The 32 bit, little endian form.
502
// The individual bytes of the byte order marker in its various encodings.
enum {
	// UTF-8 form: EF BB BF.
	eBOM_8_1 = 0xEF,
	eBOM_8_2 = 0xBB,
	eBOM_8_3 = 0xBF,
	// Big endian 16/32 bit forms: FE then FF.
	eBOM_Big_1 = 0xFE,
	eBOM_Big_2 = 0xFF,
	// Little endian 16/32 bit forms: the same two bytes in the opposite order.
	eBOM_Little_1 = 0xFF,	// Same value as eBOM_Big_2.
	eBOM_Little_2 = 0xFE	// Same value as eBOM_Big_1.
};
512
513 XMPScanner::PacketMachine::TriState
RecognizeBOM(PacketMachine * ths,const char *)514 XMPScanner::PacketMachine::RecognizeBOM ( PacketMachine * ths, const char * /* unused */ )
515 {
516 const int bytesPerChar = ths->fBytesPerChar;
517
518 while ( true ) { // Handle one character at a time, the micro-state (fPosition) changes for each.
519
520 if ( ths->fBufferPtr >= ths->fBufferLimit ) return eTriMaybe;
521
522 const unsigned char currChar = *ths->fBufferPtr; // ! The BOM bytes look like integers bigger than 127.
523
524 switch ( ths->fPosition ) {
525
526 case 0 : // Look for the opening quote.
527 if ( (currChar != '\'') && (currChar != '"') ) return eTriNo;
528 ths->fQuoteChar = currChar;
529 ths->fBufferPtr++;
530 ths->fPosition = 1;
531 break; // ! Don't fall through, have to check for the end of the buffer between each byte.
532
533 case 1 : // Look at the byte immediately following the opening quote.
534 if ( currChar == ths->fQuoteChar ) { // Closing quote, no BOM character, must be 8 bit.
535 if ( ths->fCharForm != eChar8Bit ) return eTriNo;
536 ths->fBufferPtr += bytesPerChar; // Skip the nulls after the closing quote.
537 return eTriYes;
538 } else if ( currChar == eBOM_8_1 ) { // Start of the 8 bit form.
539 if ( ths->fCharForm != eChar8Bit ) return eTriNo;
540 ths->fBufferPtr++;
541 ths->fPosition = 12;
542 } else if ( currChar == eBOM_Big_1 ) { // Start of the 16 bit big endian form.
543 if ( ths->fCharForm != eChar16BitBig ) return eTriNo;
544 ths->fBufferPtr++;
545 ths->fPosition = 22;
546 } else if ( currChar == 0 ) { // Start of the 16 bit little endian or either 32 bit form.
547 if ( ths->fCharForm == eChar8Bit ) return eTriNo;
548 ths->fBufferPtr++;
549 ths->fPosition = 32;
550 } else {
551 return eTriNo;
552 }
553 break;
554
555 case 12 : // Look for the second byte of the 8 bit form.
556 if ( currChar != eBOM_8_2 ) return eTriNo;
557 ths->fPosition = 13;
558 ths->fBufferPtr++;
559 break;
560
561 case 13 : // Look for the third byte of the 8 bit form.
562 if ( currChar != eBOM_8_3 ) return eTriNo;
563 ths->fPosition = 99;
564 ths->fBufferPtr++;
565 break;
566
567 case 22 : // Look for the second byte of the 16 bit big endian form.
568 if ( currChar != eBOM_Big_2 ) return eTriNo;
569 ths->fPosition = 23;
570 ths->fBufferPtr++;
571 break;
572
573 case 23 : // Look for the null before the closing quote of the 16 bit big endian form.
574 if ( currChar != 0 ) return eTriNo;
575 ths->fBufferPtr++;
576 ths->fPosition = 99;
577 break;
578
579 case 32 : // Look at the second byte of the 16 bit little endian or either 32 bit form.
580 if ( currChar == eBOM_Little_1 ) {
581 ths->fPosition = 33;
582 } else if ( currChar == 0 ) {
583 ths->fPosition = 43;
584 } else {
585 return eTriNo;
586 }
587 ths->fBufferPtr++;
588 break;
589
590 case 33 : // Look for the third byte of the 16 bit little endian form.
591 if ( ths->fCharForm != eChar16BitBig ) return eTriNo; // Null count before assumed big endian.
592 if ( currChar != eBOM_Little_2 ) return eTriNo;
593 ths->fCharForm = eChar16BitLittle;
594 ths->fPosition = 99;
595 ths->fBufferPtr++;
596 break;
597
598 case 43 : // Look at the third byte of either 32 bit form.
599 if ( ths->fCharForm != eChar32BitBig ) return eTriNo; // Null count before assumed big endian.
600 if ( currChar == eBOM_Big_1 ) {
601 ths->fPosition = 44;
602 } else if ( currChar == 0 ) {
603 ths->fPosition = 54;
604 } else {
605 return eTriNo;
606 }
607 ths->fBufferPtr++;
608 break;
609
610 case 44 : // Look for the fourth byte of the 32 bit big endian form.
611 if ( currChar != eBOM_Big_2 ) return eTriNo;
612 ths->fPosition = 45;
613 ths->fBufferPtr++;
614 break;
615
616 case 45 : // Look for the first null before the closing quote of the 32 bit big endian form.
617 if ( currChar != 0 ) return eTriNo;
618 ths->fPosition = 56;
619 ths->fBufferPtr++;
620 break;
621
622 case 54 : // Look for the fourth byte of the 32 bit little endian form.
623 ths->fCharForm = eChar32BitLittle;
624 if ( currChar != eBOM_Little_1 ) return eTriNo;
625 ths->fPosition = 55;
626 ths->fBufferPtr++;
627 break;
628
629 case 55 : // Look for the fifth byte of the 32 bit little endian form.
630 if ( currChar != eBOM_Little_2 ) return eTriNo;
631 ths->fPosition = 56;
632 ths->fBufferPtr++;
633 break;
634
635 case 56 : // Look for the next to last null before the closing quote of the 32 bit forms.
636 if ( currChar != 0 ) return eTriNo;
637 ths->fPosition = 57;
638 ths->fBufferPtr++;
639 break;
640
641 case 57 : // Look for the last null before the closing quote of the 32 bit forms.
642 if ( currChar != 0 ) return eTriNo;
643 ths->fPosition = 99;
644 ths->fBufferPtr++;
645 break;
646
647 default : // Look for the closing quote.
648 assert ( ths->fPosition == 99 );
649 if ( currChar != ths->fQuoteChar ) return eTriNo;
650 ths->fBufferPtr += bytesPerChar; // Skip the nulls after the closing quote.
651 return eTriYes;
652 break;
653
654 }
655
656 }
657
658 } // RecognizeBOM
659
660
661 // =================================================================================================
662 // RecordHeadAttr
663 // ==============
664
665 XMPScanner::PacketMachine::TriState
RecordHeadAttr(PacketMachine * ths,const char *)666 XMPScanner::PacketMachine::RecordHeadAttr ( PacketMachine * ths, const char * /* unused */ )
667 {
668
669 if ( ths->fAttrName == "encoding" ) {
670
671 assert ( ths->fEncodingAttr.empty() );
672 ths->fEncodingAttr = ths->fAttrValue;
673
674 } else if ( ths->fAttrName == "bytes" ) {
675
676 long value = 0;
677 int count = (int) ths->fAttrValue.size();
678 int i;
679
680 assert ( ths->fBytesAttr == -1 );
681
682 if ( count > 0 ) { // Allow bytes='' to be the same as no bytes attribute.
683
684 for ( i = 0; i < count; i++ ) {
685 const char currChar = ths->fAttrValue[i];
686 if ( ('0' <= currChar) && (currChar <= '9') ) {
687 value = (value * 10) + (currChar - '0');
688 } else {
689 ths->fBogusPacket = true;
690 value = -1;
691 break;
692 }
693 }
694 ths->fBytesAttr = value;
695
696 if ( CharFormIs16Bit ( ths->fCharForm ) ) {
697 if ( (ths->fBytesAttr & 1) != 0 ) ths->fBogusPacket = true;
698 } else if ( CharFormIs32Bit ( ths->fCharForm ) ) {
699 if ( (ths->fBytesAttr & 3) != 0 ) ths->fBogusPacket = true;
700 }
701
702 }
703
704 }
705
706 ths->fAttrName.erase ( ths->fAttrName.begin(), ths->fAttrName.end() );
707 ths->fAttrValue.erase ( ths->fAttrValue.begin(), ths->fAttrValue.end() );
708
709 return eTriYes;
710
711 } // RecordHeadAttr
712
713
714 // =================================================================================================
715 // CaptureAccess
716 // =============
717
718 XMPScanner::PacketMachine::TriState
CaptureAccess(PacketMachine * ths,const char *)719 XMPScanner::PacketMachine::CaptureAccess ( PacketMachine * ths, const char * /* unused */ )
720 {
721 const int bytesPerChar = ths->fBytesPerChar;
722
723 while ( true ) {
724
725 if ( ths->fBufferPtr >= ths->fBufferLimit ) return eTriMaybe;
726
727 const char currChar = *ths->fBufferPtr;
728
729 switch ( ths->fPosition ) {
730
731 case 0 : // Look for the opening quote.
732 if ( (currChar != '\'') && (currChar != '"') ) return eTriNo;
733 ths->fQuoteChar = currChar;
734 ths->fBufferPtr += bytesPerChar;
735 ths->fPosition = 1;
736 break; // ! Don't fall through, have to check for the end of the buffer between each byte.
737
738 case 1 : // Look for the 'r' or 'w'.
739 if ( (currChar != 'r') && (currChar != 'w') ) return eTriNo;
740 ths->fAccess = currChar;
741 ths->fBufferPtr += bytesPerChar;
742 ths->fPosition = 2;
743 break;
744
745 default : // Look for the closing quote.
746 assert ( ths->fPosition == 2 );
747 if ( currChar != ths->fQuoteChar ) return eTriNo;
748 ths->fBufferPtr += bytesPerChar;
749 return eTriYes;
750 break;
751
752 }
753
754 }
755
756 } // CaptureAccess
757
758
759 // =================================================================================================
760 // RecordTailAttr
761 // ==============
762
763 XMPScanner::PacketMachine::TriState
RecordTailAttr(PacketMachine * ths,const char *)764 XMPScanner::PacketMachine::RecordTailAttr ( PacketMachine * ths, const char * /* unused */ )
765 {
766
767 // There are no known "general" attributes for the packet trailer.
768
769 ths->fAttrName.erase ( ths->fAttrName.begin(), ths->fAttrName.end() );
770 ths->fAttrValue.erase ( ths->fAttrValue.begin(), ths->fAttrValue.end() );
771
772 return eTriYes;
773
774
775 } // RecordTailAttr
776
777
778 // =================================================================================================
779 // CheckPacketEnd
780 // ==============
781 //
782 // Check for trailing padding and record the packet length. We have trailing padding if the bytes
783 // attribute is present and has a value greater than the current length.
784
785 XMPScanner::PacketMachine::TriState
CheckPacketEnd(PacketMachine * ths,const char *)786 XMPScanner::PacketMachine::CheckPacketEnd ( PacketMachine * ths, const char * /* unused */ )
787 {
788 const int bytesPerChar = ths->fBytesPerChar;
789
790 if ( ths->fPosition == 0 ) { // First call, decide if there is trailing padding.
791
792 const XMP_Int64 currLen64 = (ths->fBufferOffset + (ths->fBufferPtr - ths->fBufferOrigin)) - ths->fPacketStart;
793 if ( currLen64 > 0x7FFFFFFF ) throw std::runtime_error ( "Packet length exceeds 2GB-1" );
794 const XMP_Int32 currLength = (XMP_Int32)currLen64;
795
796 if ( (ths->fBytesAttr != -1) && (ths->fBytesAttr != currLength) ) {
797 if ( ths->fBytesAttr < currLength ) {
798 ths->fBogusPacket = true; // The bytes attribute value is too small.
799 } else {
800 ths->fPosition = ths->fBytesAttr - currLength;
801 if ( (ths->fPosition % ths->fBytesPerChar) != 0 ) {
802 ths->fBogusPacket = true; // The padding is not a multiple of the character size.
803 ths->fPosition = (ths->fPosition / ths->fBytesPerChar) * ths->fBytesPerChar;
804 }
805 }
806 }
807
808 }
809
810 while ( ths->fPosition > 0 ) {
811
812 if ( ths->fBufferPtr >= ths->fBufferLimit ) return eTriMaybe;
813
814 const char currChar = *ths->fBufferPtr;
815
816 if ( (currChar != ' ') && (currChar != '\t') && (currChar != '\n') && (currChar != '\r') ) {
817 ths->fBogusPacket = true; // The padding is not whitespace.
818 break; // Stop the packet here.
819 }
820
821 ths->fPosition -= bytesPerChar;
822 ths->fBufferPtr += bytesPerChar;
823
824 }
825
826 const XMP_Int64 currLen64 = (ths->fBufferOffset + (ths->fBufferPtr - ths->fBufferOrigin)) - ths->fPacketStart;
827 if ( currLen64 > 0x7FFFFFFF ) throw std::runtime_error ( "Packet length exceeds 2GB-1" );
828 ths->fPacketLength = (XMP_Int32)currLen64;
829 return eTriYes;
830
831 } // CheckPacketEnd
832
833
834 // =================================================================================================
835 // CheckFinalNulls
836 // ===============
837 //
838 // Do some special case processing for little endian characters. We have to make sure the presumed
839 // nulls after the last character actually exist, i.e. that the stream does not end too soon. Note
840 // that the prior character scanning has moved the buffer pointer to the address following the last
841 // byte of the last character. I.e. we're already past the presumed nulls, so we can't check their
842 // content. All we can do is verify that the stream does not end too soon.
843 //
844 // Doing this check is simple yet subtle. If we're still in the current buffer then the trailing
845 // bytes obviously exist. If we're exactly at the end of the buffer then the bytes also exist.
846 // The only question is when we're actually past this buffer, partly into the next buffer. This is
847 // when "ths->fBufferPtr > ths->fBufferLimit" on entry. For that case we have to wait until we've
848 // actually seen enough extra bytes of input.
849 //
850 // Since the normal buffer processing is already adjusting for this partial character overrun, all
851 // that needs to be done here is wait until "ths->fBufferPtr <= ths->fBufferLimit" on entry. In
852 // other words, if we're presently too far, ths->fBufferPtr will be adjusted by the amount of the
853 // overflow the next time XMPScanner::Scan is called. This might still be too far, so just keep
854 // waiting for enough data to pass by.
855 //
856 // Note that there is a corresponding special case for big endian characters, we must decrement the
857 // starting offset by the number of leading nulls. But we don't do that here, we leave it to the
858 // outer code. This is because the leading nulls might have been at the exact end of a previous
859 // buffer, in which case we have to also decrement the length of that raw data snip.
860
861 XMPScanner::PacketMachine::TriState
CheckFinalNulls(PacketMachine * ths,const char *)862 XMPScanner::PacketMachine::CheckFinalNulls ( PacketMachine * ths, const char * /* unused */ )
863 {
864
865 if ( (ths->fCharForm != eChar8Bit) && CharFormIsLittleEndian ( ths->fCharForm ) ) {
866 if ( ths->fBufferPtr > ths->fBufferLimit ) return eTriMaybe;
867 }
868
869 return eTriYes;
870
871 } // CheckFinalNulls
872
873
874 // =================================================================================================
875 // SetNextRecognizer
876 // =================
877
878 void
SetNextRecognizer(RecognizerKind nextRecognizer)879 XMPScanner::PacketMachine::SetNextRecognizer ( RecognizerKind nextRecognizer )
880 {
881
882 fRecognizer = nextRecognizer;
883 fPosition = 0;
884
885 } // SetNextRecognizer
886
887
888 // =================================================================================================
889 // FindNextPacket
890 // ==============
891
892 // *** When we start validating intervening nulls for 2 and 4 bytes characters, throw an exception
893 // *** for errors. Don't return eTriNo, that might skip at an optional point.
894
895 XMPScanner::PacketMachine::TriState
FindNextPacket()896 XMPScanner::PacketMachine::FindNextPacket ()
897 {
898
899 TriState status;
900
901 #define kPacketHead "?xpacket begin="
902 #define kPacketID "W5M0MpCehiHzreSzNTczkc9d"
903 #define kPacketTail "?xpacket end="
904
905 static const RecognizerInfo recognizerTable [eRecognizerCount] = { // ! Would be safer to assign these explicitly.
906
907 // proc successNext failureNext literal
908
909 { NULL, eFailureRecognizer, eFailureRecognizer, NULL}, // eFailureRecognizer
910 { NULL, eSuccessRecognizer, eSuccessRecognizer, NULL}, // eSuccessRecognizer
911
912 { FindLessThan, eHeadStartRecorder, eFailureRecognizer, "H" }, // eLeadInRecognizer
913 { RecordStart, eHeadStartRecognizer, eLeadInRecognizer, NULL }, // eHeadStartRecorder
914 { MatchString, eBOMRecognizer, eLeadInRecognizer, kPacketHead }, // eHeadStartRecognizer
915
916 { RecognizeBOM, eIDTagRecognizer, eLeadInRecognizer, NULL }, // eBOMRecognizer
917
918 { MatchString, eIDOpenRecognizer, eLeadInRecognizer, " id=" }, // eIDTagRecognizer
919 { MatchOpenQuote, eIDValueRecognizer, eLeadInRecognizer, NULL }, // eIDOpenRecognizer
920 { MatchString, eIDCloseRecognizer, eLeadInRecognizer, kPacketID }, // eIDValueRecognizer
921 { MatchCloseQuote, eAttrSpaceRecognizer_1, eLeadInRecognizer, NULL }, // eIDCloseRecognizer
922
923 { MatchChar, eAttrNameRecognizer_1, eHeadEndRecognizer, " " }, // eAttrSpaceRecognizer_1
924 { CaptureAttrName, eAttrValueRecognizer_1, eLeadInRecognizer, NULL }, // eAttrNameRecognizer_1
925 { CaptureAttrValue, eAttrValueRecorder_1, eLeadInRecognizer, NULL }, // eAttrValueRecognizer_1
926 { RecordHeadAttr, eAttrSpaceRecognizer_1, eLeadInRecognizer, NULL }, // eAttrValueRecorder_1
927
928 { MatchString, eBodyRecognizer, eLeadInRecognizer, "?>" }, // eHeadEndRecognizer
929
930 { FindLessThan, eTailStartRecognizer, eBodyRecognizer, "T"}, // eBodyRecognizer
931
932 { MatchString, eAccessValueRecognizer, eBodyRecognizer, kPacketTail }, // eTailStartRecognizer
933 { CaptureAccess, eAttrSpaceRecognizer_2, eBodyRecognizer, NULL }, // eAccessValueRecognizer
934
935 { MatchChar, eAttrNameRecognizer_2, eTailEndRecognizer, " " }, // eAttrSpaceRecognizer_2
936 { CaptureAttrName, eAttrValueRecognizer_2, eBodyRecognizer, NULL }, // eAttrNameRecognizer_2
937 { CaptureAttrValue, eAttrValueRecorder_2, eBodyRecognizer, NULL }, // eAttrValueRecognizer_2
938 { RecordTailAttr, eAttrSpaceRecognizer_2, eBodyRecognizer, NULL }, // eAttrValueRecorder_2
939
940 { MatchString, ePacketEndRecognizer, eBodyRecognizer, "?>" }, // eTailEndRecognizer
941 { CheckPacketEnd, eCloseOutRecognizer, eBodyRecognizer, "" }, // ePacketEndRecognizer
942 { CheckFinalNulls, eSuccessRecognizer, eBodyRecognizer, "" } // eCloseOutRecognizer
943
944 };
945
946 while ( true ) {
947
948 switch ( fRecognizer ) {
949
950 case eFailureRecognizer :
951 return eTriNo;
952
953 case eSuccessRecognizer :
954 return eTriYes;
955
956 default :
957
958 // -------------------------------------------------------------------
959 // For everything else, the normal cases, use the state machine table.
960
961 const RecognizerInfo * thisState = &recognizerTable [fRecognizer];
962
963 status = thisState->proc ( this, thisState->literal );
964
965 switch ( status ) {
966
967 case eTriNo :
968 SetNextRecognizer ( thisState->failureNext );
969 continue;
970
971 case eTriYes :
972 SetNextRecognizer ( thisState->successNext );
973 continue;
974
975 case eTriMaybe :
976 fBufferOverrun = (unsigned char)(fBufferPtr - fBufferLimit);
977 return eTriMaybe; // Keep this recognizer intact, to be resumed later.
978
979 }
980
981 } // switch ( fRecognizer ) { ...
982
983 } // while ( true ) { ...
984
985 } // FindNextPacket
986
987
988 // =================================================================================================
989 // =================================================================================================
990 // class InternalSnip
991 // ==================
992
993
994 // =================================================================================================
995 // InternalSnip
996 // ============
997
InternalSnip(XMP_Int64 offset,XMP_Int64 length)998 XMPScanner::InternalSnip::InternalSnip ( XMP_Int64 offset, XMP_Int64 length )
999 {
1000
1001 fInfo.fOffset = offset;
1002 fInfo.fLength = length;
1003
1004 } // InternalSnip
1005
1006
1007 // =================================================================================================
1008 // InternalSnip
1009 // ============
1010
InternalSnip(const InternalSnip & rhs)1011 XMPScanner::InternalSnip::InternalSnip ( const InternalSnip & rhs ) :
1012 fInfo ( rhs.fInfo ),
1013 fMachine ( NULL )
1014 {
1015
1016 assert ( rhs.fMachine.get() == NULL ); // Don't copy a snip with a machine.
1017 assert ( (rhs.fInfo.fEncodingAttr == 0) || (*rhs.fInfo.fEncodingAttr == 0) ); // Don't copy a snip with an encoding.
1018
1019 } // InternalSnip
1020
1021
1022 // =================================================================================================
1023 // ~InternalSnip
1024 // =============
1025
~InternalSnip()1026 XMPScanner::InternalSnip::~InternalSnip ()
1027 {
1028 } // ~InternalSnip
1029
1030
1031
1032 // =================================================================================================
1033 // =================================================================================================
1034 // class XMPScanner
1035 // ================
1036
1037
1038 // =================================================================================================
1039 // DumpSnipList
1040 // ============
1041
#if DEBUG

// Display names for the snip states, indexed by the numeric value of a snip's fState.
static const char * snipStateName [6] = { "not-seen", "pending", "raw-data", "good-packet", "partial", "bad-packet" };

// Debugging aid, only compiled when DEBUG is set. Writes a header line with the title and the
// number of snips to cout, followed by one line per snip showing its address, state name, first
// and last offset, and length.

void
XMPScanner::DumpSnipList ( const char * title )
{

	cout << endl << title << " snip list: " << fInternalSnips.size() << endl;

	InternalSnipIterator snipPos = fInternalSnips.begin();
	InternalSnipIterator listEnd = fInternalSnips.end();

	while ( snipPos != listEnd ) {

		SnipInfo * info = &snipPos->fInfo;

		cout << '\t' << info << ' ' << snipStateName[info->fState] << ' ';
		cout << info->fOffset << ".." << (info->fOffset + info->fLength - 1);
		cout << ' ' << info->fLength << ' ' << endl;

		++snipPos;

	}

}	// DumpSnipList

#endif
1063
1064
1065 // =================================================================================================
1066 // PrevSnip and NextSnip
1067 // =====================
1068
1069 XMPScanner::InternalSnipIterator
PrevSnip(InternalSnipIterator snipPos)1070 XMPScanner::PrevSnip ( InternalSnipIterator snipPos )
1071 {
1072
1073 InternalSnipIterator prev = snipPos;
1074 return --prev;
1075
1076 } // PrevSnip
1077
1078 XMPScanner::InternalSnipIterator
NextSnip(InternalSnipIterator snipPos)1079 XMPScanner::NextSnip ( InternalSnipIterator snipPos )
1080 {
1081
1082 InternalSnipIterator next = snipPos;
1083 return ++next;
1084
1085 } // NextSnip
1086
1087
1088 // =================================================================================================
1089 // XMPScanner
1090 // ==========
1091 //
1092 // Initialize the scanner object with one "not seen" snip covering the whole stream.
1093
XMPScanner(XMP_Int64 streamLength)1094 XMPScanner::XMPScanner ( XMP_Int64 streamLength ) :
1095
1096 fStreamLength ( streamLength )
1097
1098 {
1099 InternalSnip rootSnip ( 0, streamLength );
1100
1101 if ( streamLength > 0 ) fInternalSnips.push_front ( rootSnip ); // Be nice for empty files.
1102 // DumpSnipList ( "New XMPScanner" );
1103
1104 } // XMPScanner
1105
1106
1107 // =================================================================================================
1108 // ~XMPScanner
1109 // ===========
1110
~XMPScanner()1111 XMPScanner::~XMPScanner()
1112 {
1113
1114 } // ~XMPScanner
1115
1116
1117 // =================================================================================================
1118 // GetSnipCount
1119 // ============
1120
1121 long
GetSnipCount()1122 XMPScanner::GetSnipCount ()
1123 {
1124
1125 return (long)fInternalSnips.size();
1126
1127 } // GetSnipCount
1128
1129
1130 // =================================================================================================
1131 // StreamAllScanned
1132 // ================
1133
1134 bool
StreamAllScanned()1135 XMPScanner::StreamAllScanned ()
1136 {
1137 InternalSnipIterator currPos = fInternalSnips.begin();
1138 InternalSnipIterator endPos = fInternalSnips.end();
1139
1140 for ( ; currPos != endPos; ++currPos ) {
1141 if ( currPos->fInfo.fState == eNotSeenSnip ) return false;
1142 }
1143 return true;
1144
1145 } // StreamAllScanned
1146
1147
1148 // =================================================================================================
1149 // SplitInternalSnip
1150 // =================
1151 //
1152 // Split the given snip into up to 3 pieces. The new pieces are inserted before and after this one
1153 // in the snip list. The relOffset is the first byte to be kept, it is relative to this snip. If
1154 // the preceding or following snips have the same state as this one, just shift the boundaries.
1155 // I.e. move the contents from one snip to the other, don't create a new snip.
1156
1157 // *** To be thread safe we ought to lock the entire list during manipulation. Let data scanning
1158 // *** happen in parallel, serialize all mucking with the list.
1159
1160 void
SplitInternalSnip(InternalSnipIterator snipPos,XMP_Int64 relOffset,XMP_Int64 newLength)1161 XMPScanner::SplitInternalSnip ( InternalSnipIterator snipPos, XMP_Int64 relOffset, XMP_Int64 newLength )
1162 {
1163
1164 assert ( (relOffset + newLength) > relOffset ); // Check for overflow.
1165 assert ( (relOffset + newLength) <= snipPos->fInfo.fLength );
1166
1167 // -----------------------------------
1168 // First deal with the low offset end.
1169
1170 if ( relOffset > 0 ) {
1171
1172 InternalSnipIterator prevPos;
1173 if ( snipPos != fInternalSnips.begin() ) prevPos = PrevSnip ( snipPos );
1174
1175 if ( (snipPos != fInternalSnips.begin()) && (snipPos->fInfo.fState == prevPos->fInfo.fState) ) {
1176 prevPos->fInfo.fLength += relOffset; // Adjust the preceeding snip.
1177 } else {
1178 InternalSnip headExcess ( snipPos->fInfo.fOffset, relOffset );
1179 headExcess.fInfo.fState = snipPos->fInfo.fState;
1180 headExcess.fInfo.fOutOfOrder = snipPos->fInfo.fOutOfOrder;
1181 fInternalSnips.insert ( snipPos, headExcess ); // Insert the head piece before the middle piece.
1182 }
1183
1184 snipPos->fInfo.fOffset += relOffset; // Adjust the remainder of this snip.
1185 snipPos->fInfo.fLength -= relOffset;
1186
1187 }
1188
1189 // ----------------------------------
1190 // Now deal with the high offset end.
1191
1192 if ( newLength < snipPos->fInfo.fLength ) {
1193
1194 InternalSnipIterator nextPos = NextSnip ( snipPos );
1195 const XMP_Int64 tailLength = snipPos->fInfo.fLength - newLength;
1196
1197 if ( (nextPos != fInternalSnips.end()) && (snipPos->fInfo.fState == nextPos->fInfo.fState) ) {
1198 nextPos->fInfo.fOffset -= tailLength; // Adjust the following snip.
1199 nextPos->fInfo.fLength += tailLength;
1200 } else {
1201 InternalSnip tailExcess ( (snipPos->fInfo.fOffset + newLength), tailLength );
1202 tailExcess.fInfo.fState = snipPos->fInfo.fState;
1203 tailExcess.fInfo.fOutOfOrder = snipPos->fInfo.fOutOfOrder;
1204 fInternalSnips.insert ( nextPos, tailExcess ); // Insert the tail piece after the middle piece.
1205 }
1206
1207 snipPos->fInfo.fLength = newLength;
1208
1209 }
1210
1211 } // SplitInternalSnip
1212
1213
1214 // =================================================================================================
1215 // MergeInternalSnips
1216 // ==================
1217
1218 XMPScanner::InternalSnipIterator
MergeInternalSnips(InternalSnipIterator firstPos,InternalSnipIterator secondPos)1219 XMPScanner::MergeInternalSnips ( InternalSnipIterator firstPos, InternalSnipIterator secondPos )
1220 {
1221
1222 firstPos->fInfo.fLength += secondPos->fInfo.fLength;
1223 fInternalSnips.erase ( secondPos );
1224 return firstPos;
1225
1226 } // MergeInternalSnips
1227
1228
1229 // =================================================================================================
1230 // Scan
1231 // ====
1232
1233 void
Scan(const void * bufferOrigin,XMP_Int64 bufferOffset,XMP_Int64 bufferLength)1234 XMPScanner::Scan ( const void * bufferOrigin, XMP_Int64 bufferOffset, XMP_Int64 bufferLength )
1235 {
1236 XMP_Int64 relOffset;
1237
1238 #if 0
1239 cout << "Scan: @ " << bufferOrigin << ", " << bufferOffset << ", " << bufferLength << endl;
1240 #endif
1241
1242 if ( bufferLength == 0 ) return;
1243
1244 // ----------------------------------------------------------------
1245 // These comparisons are carefully done to avoid overflow problems.
1246
1247 if ( (bufferOffset >= fStreamLength) ||
1248 (bufferLength > (fStreamLength - bufferOffset)) ||
1249 (bufferOrigin == 0) ) {
1250 throw ScanError ( "Bad origin, offset, or length" );
1251 }
1252
1253 // ----------------------------------------------------------------------------------------------
1254 // This buffer must be within a not-seen snip. Find it and split it. The first snip whose whose
1255 // end is beyond the buffer must be the enclosing one.
1256
1257 // *** It would be friendly for rescans for out of order problems to accept any buffer postion.
1258
1259 const XMP_Int64 endOffset = bufferOffset + bufferLength - 1;
1260 InternalSnipIterator snipPos = fInternalSnips.begin();
1261
1262 while ( endOffset > (snipPos->fInfo.fOffset + snipPos->fInfo.fLength - 1) ) ++ snipPos;
1263 if ( snipPos->fInfo.fState != eNotSeenSnip ) throw ScanError ( "Already seen" );
1264
1265 relOffset = bufferOffset - snipPos->fInfo.fOffset;
1266 if ( (relOffset + bufferLength) > snipPos->fInfo.fLength ) throw ScanError ( "Not within existing snip" );
1267
1268 SplitInternalSnip ( snipPos, relOffset, bufferLength ); // *** If sequential & prev is partial, just tack on,
1269
1270 // --------------------------------------------------------
1271 // Merge this snip with the preceeding snip if appropriate.
1272
1273 // *** When out of order I/O is supported we have to do something about buffers who's predecessor is not seen.
1274
1275 if ( snipPos->fInfo.fOffset > 0 ) {
1276 InternalSnipIterator prevPos = PrevSnip ( snipPos );
1277 if ( prevPos->fInfo.fState == ePartialPacketSnip ) snipPos = MergeInternalSnips ( prevPos, snipPos );
1278 }
1279
1280 // ----------------------------------
1281 // Look for packets within this snip.
1282
1283 snipPos->fInfo.fState = ePendingSnip;
1284 PacketMachine* thisMachine = snipPos->fMachine.get();
1285 // DumpSnipList ( "Before scan" );
1286
1287 if ( thisMachine != 0 ) {
1288 thisMachine->AssociateBuffer ( bufferOffset, bufferOrigin, bufferLength );
1289 } else {
1290 // *** snipPos->fMachine.reset ( new PacketMachine ( bufferOffset, bufferOrigin, bufferLength ) ); VC++ lacks reset
1291 #if 0
1292 snipPos->fMachine = auto_ptr<PacketMachine> ( new PacketMachine ( bufferOffset, bufferOrigin, bufferLength ) );
1293 #else
1294 {
1295 // Some versions of gcc complain about the assignment operator above. This avoids the gcc bug.
1296 PacketMachine * pm = new PacketMachine ( bufferOffset, bufferOrigin, bufferLength );
1297 auto_ptr<PacketMachine> ap ( pm );
1298 snipPos->fMachine = ap;
1299 }
1300 #endif
1301 thisMachine = snipPos->fMachine.get();
1302 }
1303
1304 bool bufferDone = false;
1305 while ( ! bufferDone ) {
1306
1307 PacketMachine::TriState foundPacket = thisMachine->FindNextPacket();
1308
1309 if ( foundPacket == PacketMachine::eTriNo ) {
1310
1311 // -----------------------------------------------------------------------
1312 // No packet, mark the snip as raw data and get rid of the packet machine.
1313 // We're done with this buffer.
1314
1315 snipPos->fInfo.fState = eRawInputSnip;
1316 #if 0
1317 snipPos->fMachine = auto_ptr<PacketMachine>(); // *** snipPos->fMachine.reset(); VC++ lacks reset
1318 #else
1319 {
1320 // Some versions of gcc complain about the assignment operator above. This avoids the gcc bug.
1321 auto_ptr<PacketMachine> ap ( 0 );
1322 snipPos->fMachine = ap;
1323 }
1324 #endif
1325 bufferDone = true;
1326
1327 } else {
1328
1329 // ---------------------------------------------------------------------------------------------
1330 // Either a full or partial packet. First trim any excess off of the front as a raw input snip.
1331 // If this is a partial packet mark the snip and keep the packet machine to be resumed later.
1332 // We're done with this buffer, the partial packet by definition extends to the end. If this is
1333 // a complete packet first extract the additional information from the packet machine. If there
1334 // is leftover data split the snip and transfer the packet machine to the new trailing snip.
1335
1336 if ( thisMachine->fPacketStart > snipPos->fInfo.fOffset ) {
1337
1338 // There is data at the front of the current snip that must be trimmed.
1339 SnipState savedState = snipPos->fInfo.fState;
1340 snipPos->fInfo.fState = eRawInputSnip; // ! So it gets propagated to the trimmed front part.
1341 relOffset = thisMachine->fPacketStart - snipPos->fInfo.fOffset;
1342 SplitInternalSnip ( snipPos, relOffset, (snipPos->fInfo.fLength - relOffset) );
1343 snipPos->fInfo.fState = savedState;
1344
1345 }
1346
1347 if ( foundPacket == PacketMachine::eTriMaybe ) {
1348
1349 // We have only found a partial packet.
1350 snipPos->fInfo.fState = ePartialPacketSnip;
1351 bufferDone = true;
1352
1353 } else {
1354
1355 // We have found a complete packet. Extract all the info for it and split any trailing data.
1356
1357 InternalSnipIterator packetSnip = snipPos;
1358 SnipState packetState = eValidPacketSnip;
1359
1360 if ( thisMachine->fBogusPacket ) packetState = eBadPacketSnip;
1361
1362 packetSnip->fInfo.fAccess = thisMachine->fAccess;
1363 packetSnip->fInfo.fCharForm = thisMachine->fCharForm;
1364 packetSnip->fInfo.fBytesAttr = thisMachine->fBytesAttr;
1365 packetSnip->fInfo.fEncodingAttr = thisMachine->fEncodingAttr.c_str();
1366 thisMachine->fEncodingAttr.erase ( thisMachine->fEncodingAttr.begin(), thisMachine->fEncodingAttr.end() );
1367
1368 if ( (thisMachine->fCharForm != eChar8Bit) && CharFormIsBigEndian ( thisMachine->fCharForm ) ) {
1369
1370 // ------------------------------------------------------------------------------
1371 // Handle a special case for big endian characters. The packet machine works as
1372 // though things were little endian. The packet starting offset points to the
1373 // byte containing the opening '<', and the length includes presumed nulls that
1374 // follow the last "real" byte. If the characters are big endian we now have to
1375 // decrement the starting offset of the packet, and also decrement the length of
1376 // the previous snip.
1377 //
1378 // Note that we can't do this before the head trimming above in general. The
1379 // nulls might have been exactly at the end of a buffer and already in the
1380 // previous snip. We are doing this before trimming the tail from the raw snip
1381 // containing the packet. We adjust the raw snip's size because it ends with
1382 // the input buffer. We don't adjust the packet's size, it is already correct.
1383 //
1384 // The raw snip (the one before the packet) might entirely disappear. A simple
1385 // example of this is when the packet is at the start of the file.
1386
1387 assert ( packetSnip != fInternalSnips.begin() ); // Leading nulls were trimmed!
1388
1389 if ( packetSnip != fInternalSnips.begin() ) { // ... but let's program defensibly.
1390
1391 InternalSnipIterator prevSnip = PrevSnip ( packetSnip );
1392 const unsigned int nullsToAdd = ( CharFormIs16Bit ( thisMachine->fCharForm ) ? 1 : 3 );
1393
1394 assert ( nullsToAdd <= prevSnip->fInfo.fLength );
1395 prevSnip->fInfo.fLength -= nullsToAdd;
1396 if ( prevSnip->fInfo.fLength == 0 ) (void) fInternalSnips.erase ( prevSnip );
1397
1398 packetSnip->fInfo.fOffset -= nullsToAdd;
1399 packetSnip->fInfo.fLength += nullsToAdd;
1400 thisMachine->fPacketStart -= nullsToAdd;
1401
1402 }
1403
1404 }
1405
1406 if ( thisMachine->fPacketLength == snipPos->fInfo.fLength ) {
1407
1408 // This packet ends exactly at the end of the current snip.
1409 #if 0
1410 snipPos->fMachine = auto_ptr<PacketMachine>(); // *** snipPos->fMachine.reset(); VC++ lacks reset
1411 #else
1412 {
1413 // Some versions of gcc complain about the assignment operator above. This avoids the gcc bug.
1414 auto_ptr<PacketMachine> ap ( 0 );
1415 snipPos->fMachine = ap;
1416 }
1417 #endif
1418 bufferDone = true;
1419
1420 } else {
1421
1422 // There is trailing data to split from the just found packet.
1423 SplitInternalSnip ( snipPos, 0, thisMachine->fPacketLength );
1424
1425 InternalSnipIterator tailPos = NextSnip ( snipPos );
1426
1427 tailPos->fMachine = snipPos->fMachine; // auto_ptr assignment - taking ownership
1428 thisMachine->ResetMachine ();
1429
1430 snipPos = tailPos;
1431
1432 }
1433
1434 packetSnip->fInfo.fState = packetState; // Do this last to avoid messing up the tail split.
1435 // DumpSnipList ( "Found a packet" );
1436
1437
1438 }
1439
1440 }
1441
1442 }
1443
1444 // --------------------------------------------------------
1445 // Merge this snip with the preceeding snip if appropriate.
1446
1447 // *** When out of order I/O is supported we have to check the following snip too.
1448
1449 if ( (snipPos->fInfo.fOffset > 0) && (snipPos->fInfo.fState == eRawInputSnip) ) {
1450 InternalSnipIterator prevPos = PrevSnip ( snipPos );
1451 if ( prevPos->fInfo.fState == eRawInputSnip ) snipPos = MergeInternalSnips ( prevPos, snipPos );
1452 }
1453
1454 // DumpSnipList ( "After scan" );
1455
1456 } // Scan
1457
1458
1459 // =================================================================================================
1460 // Report
1461 // ======
1462
1463 void
Report(SnipInfoVector & snips)1464 XMPScanner::Report ( SnipInfoVector& snips )
1465 {
1466 const int count = (int)fInternalSnips.size();
1467 InternalSnipIterator snipPos = fInternalSnips.begin();
1468
1469 int s;
1470
1471 // DumpSnipList ( "Report" );
1472
1473 snips.erase ( snips.begin(), snips.end() ); // ! Should use snips.clear, but VC++ doesn't have it.
1474 snips.reserve ( count );
1475
1476 for ( s = 0; s < count; s += 1 ) {
1477 snips.push_back ( SnipInfo ( snipPos->fInfo.fState, snipPos->fInfo.fOffset, snipPos->fInfo.fLength ) );
1478 snips[s] = snipPos->fInfo; // Pick up all of the fields.
1479 ++ snipPos;
1480 }
1481
1482 } // Report
1483