1 /*
2 www.sourceforge.net/projects/tinyxml
3 Original code by Lee Thomason (www.grinninglizard.com)
4 
5 This software is provided 'as-is', without any express or implied
6 warranty. In no event will the authors be held liable for any
7 damages arising from the use of this software.
8 
9 Permission is granted to anyone to use this software for any
10 purpose, including commercial applications, and to alter it and
11 redistribute it freely, subject to the following restrictions:
12 
13 1. The origin of this software must not be misrepresented; you must
14 not claim that you wrote the original software. If you use this
15 software in a product, an acknowledgment in the product documentation
16 would be appreciated but is not required.
17 
18 2. Altered source versions must be plainly marked as such, and
19 must not be misrepresented as being the original software.
20 
21 3. This notice may not be removed or altered from any source
22 distribution.
23 */
24 
25 #include <ctype.h>
26 #include <stddef.h>
27 
28 #include "tinyxml.h"
29 
30 //#define DEBUG_PARSER
31 #if defined( DEBUG_PARSER )
32 #	if defined( DEBUG ) && defined( _MSC_VER )
33 #		include <windows.h>
34 #		define TIXML_LOG OutputDebugString
35 #	else
36 #		define TIXML_LOG printf
37 #	endif
38 #endif
39 
// Note that "PutString" hardcodes the same list. This
// is less flexible than it appears. Changing the entries
// or their order will break PutString.
// Table of the named entities the parser recognizes. Each row holds the
// entity text, the length of that text in characters, and the single
// character the entity stands for. GetEntity() matches input against
// these rows in order.
TiXmlBase::Entity TiXmlBase::entity[ TiXmlBase::NUM_ENTITY ] =
{
	{ "&amp;",  5, '&' },
	{ "&lt;",   4, '<' },
	{ "&gt;",   4, '>' },
	{ "&quot;", 6, '\"' },
	{ "&apos;", 6, '\'' }
};
51 
52 // Bunch of unicode info at:
53 //		http://www.unicode.org/faq/utf_bom.html
54 // Including the basic of this table, which determines the #bytes in the
55 // sequence from the lead byte. 1 placed for invalid sequences --
56 // although the result will be junk, pass it through as much as possible.
57 // Beware of the non-characters in UTF-8:
58 //				ef bb bf (Microsoft "lead bytes")
59 //				ef bf be
60 //				ef bf bf
61 
// The three bytes ef bb bf (the UTF-8 byte order mark), named by position.
// TIXML_UTF_LEAD_0 (ef) is also the first byte of the ef bf be and
// ef bf bf sequences that the parser treats specially.
const unsigned char TIXML_UTF_LEAD_0 = 0xefU;
const unsigned char TIXML_UTF_LEAD_1 = 0xbbU;
const unsigned char TIXML_UTF_LEAD_2 = 0xbfU;
65 
// Lookup table indexed by the first byte of a UTF-8 sequence; the entry is
// the number of bytes in that sequence (1 to 4). Bytes that cannot start a
// valid sequence (0x80-0xc1 and 0xf5-0xff) map to 1 so that bad input is
// passed through one byte at a time.
const int TiXmlBase::utf8ByteTable[256] =
{
	//	0	1	2	3	4	5	6	7	8	9	a	b	c	d	e	f
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x00
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x10
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x20
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x30
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x40
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x50
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x60
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x70	End of ASCII range
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x80 0x80 to 0xc1 invalid
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x90
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0xa0
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0xb0
		1,	1,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	// 0xc0 0xc2 to 0xdf 2 byte
		2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	// 0xd0
		3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	// 0xe0 0xe0 to 0xef 3 byte
		4,	4,	4,	4,	4,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1	// 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid
};
86 
87 
ConvertUTF32ToUTF8(unsigned long input,char * output,int * length)88 void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length )
89 {
90 	const unsigned long BYTE_MASK = 0xBF;
91 	const unsigned long BYTE_MARK = 0x80;
92 	const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
93 
94 	if (input < 0x80)
95 		*length = 1;
96 	else if ( input < 0x800 )
97 		*length = 2;
98 	else if ( input < 0x10000 )
99 		*length = 3;
100 	else if ( input < 0x200000 )
101 		*length = 4;
102 	else
103 		{ *length = 0; return; }	// This code won't covert this correctly anyway.
104 
105 	output += *length;
106 
107 	// Scary scary fall throughs.
108 	switch (*length)
109 	{
110 		case 4:
111 			--output;
112 			*output = (char)((input | BYTE_MARK) & BYTE_MASK);
113 			input >>= 6;
114 		case 3:
115 			--output;
116 			*output = (char)((input | BYTE_MARK) & BYTE_MASK);
117 			input >>= 6;
118 		case 2:
119 			--output;
120 			*output = (char)((input | BYTE_MARK) & BYTE_MASK);
121 			input >>= 6;
122 		case 1:
123 			--output;
124 			*output = (char)(input | FIRST_BYTE_MARK[*length]);
125 	}
126 }
127 
128 
IsAlpha(unsigned char anyByte,TiXmlEncoding)129 /*static*/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
130 {
131 	// This will only work for low-ascii, everything else is assumed to be a valid
132 	// letter. I'm not sure this is the best approach, but it is quite tricky trying
133 	// to figure out alhabetical vs. not across encoding. So take a very
134 	// conservative approach.
135 
136 //	if ( encoding == TIXML_ENCODING_UTF8 )
137 //	{
138 		if ( anyByte < 127 )
139 			return isalpha( anyByte );
140 		else
141 			return 1;	// What else to do? The unicode set is huge...get the english ones right.
142 //	}
143 //	else
144 //	{
145 //		return isalpha( anyByte );
146 //	}
147 }
148 
149 
IsAlphaNum(unsigned char anyByte,TiXmlEncoding)150 /*static*/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
151 {
152 	// This will only work for low-ascii, everything else is assumed to be a valid
153 	// letter. I'm not sure this is the best approach, but it is quite tricky trying
154 	// to figure out alhabetical vs. not across encoding. So take a very
155 	// conservative approach.
156 
157 //	if ( encoding == TIXML_ENCODING_UTF8 )
158 //	{
159 		if ( anyByte < 127 )
160 			return isalnum( anyByte );
161 		else
162 			return 1;	// What else to do? The unicode set is huge...get the english ones right.
163 //	}
164 //	else
165 //	{
166 //		return isalnum( anyByte );
167 //	}
168 }
169 
170 
171 class TiXmlParsingData
172 {
173 	friend class TiXmlDocument;
174   public:
175 	void Stamp( const char* now, TiXmlEncoding encoding );
176 
Cursor() const177 	const TiXmlCursor& Cursor() const	{ return cursor; }
178 
179   private:
180 	// Only used by the document!
TiXmlParsingData(const char * start,int _tabsize,int row,int col)181 	TiXmlParsingData( const char* start, int _tabsize, int row, int col )
182 	{
183 		assert( start );
184 		stamp = start;
185 		tabsize = _tabsize;
186 		cursor.row = row;
187 		cursor.col = col;
188 	}
189 
190 	TiXmlCursor		cursor;
191 	const char*		stamp;
192 	int				tabsize;
193 };
194 
195 
Stamp(const char * now,TiXmlEncoding encoding)196 void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding )
197 {
198 	assert( now );
199 
200 	// Do nothing if the tabsize is 0.
201 	if ( tabsize < 1 )
202 	{
203 		return;
204 	}
205 
206 	// Get the current row, column.
207 	int row = cursor.row;
208 	int col = cursor.col;
209 	const char* p = stamp;
210 	assert( p );
211 
212 	while ( p < now )
213 	{
214 		// Treat p as unsigned, so we have a happy compiler.
215 		const unsigned char* pU = (const unsigned char*)p;
216 
217 		// Code contributed by Fletcher Dunn: (modified by lee)
218 		switch (*pU) {
219 			case 0:
220 				// We *should* never get here, but in case we do, don't
221 				// advance past the terminating null character, ever
222 				return;
223 
224 			case '\r':
225 				// bump down to the next line
226 				++row;
227 				col = 0;
228 				// Eat the character
229 				++p;
230 
231 				// Check for \r\n sequence, and treat this as a single character
232 				if (*p == '\n') {
233 					++p;
234 				}
235 				break;
236 
237 			case '\n':
238 				// bump down to the next line
239 				++row;
240 				col = 0;
241 
242 				// Eat the character
243 				++p;
244 
245 				// Check for \n\r sequence, and treat this as a single
246 				// character.  (Yes, this bizarre thing does occur still
247 				// on some arcane platforms...)
248 				if (*p == '\r') {
249 					++p;
250 				}
251 				break;
252 
253 			case '\t':
254 				// Eat the character
255 				++p;
256 
257 				// Skip to next tab stop
258 				col = (col / tabsize + 1) * tabsize;
259 				break;
260 
261 			case TIXML_UTF_LEAD_0:
262 				if ( encoding == TIXML_ENCODING_UTF8 )
263 				{
264 					if ( *(p+1) && *(p+2) )
265 					{
266 						// In these cases, don't advance the column. These are
267 						// 0-width spaces.
268 						if ( *(pU+1)==TIXML_UTF_LEAD_1 && *(pU+2)==TIXML_UTF_LEAD_2 )
269 							p += 3;
270 						else if ( *(pU+1)==0xbfU && *(pU+2)==0xbeU )
271 							p += 3;
272 						else if ( *(pU+1)==0xbfU && *(pU+2)==0xbfU )
273 							p += 3;
274 						else
275 							{ p +=3; ++col; }	// A normal character.
276 					}
277 				}
278 				else
279 				{
280 					++p;
281 					++col;
282 				}
283 				break;
284 
285 			default:
286 				if ( encoding == TIXML_ENCODING_UTF8 )
287 				{
288 					// Eat the 1 to 4 byte utf8 character.
289 					int step = TiXmlBase::utf8ByteTable[*((const unsigned char*)p)];
290 					if ( step == 0 )
291 						step = 1;		// Error case from bad encoding, but handle gracefully.
292 					p += step;
293 
294 					// Just advance one column, of course.
295 					++col;
296 				}
297 				else
298 				{
299 					++p;
300 					++col;
301 				}
302 				break;
303 		}
304 	}
305 	cursor.row = row;
306 	cursor.col = col;
307 	assert( cursor.row >= -1 );
308 	assert( cursor.col >= -1 );
309 	stamp = p;
310 	assert( stamp );
311 }
312 
313 
SkipWhiteSpace(const char * p,TiXmlEncoding encoding)314 const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding )
315 {
316 	if ( !p || !*p )
317 	{
318 		return 0;
319 	}
320 	if ( encoding == TIXML_ENCODING_UTF8 )
321 	{
322 		while ( *p )
323 		{
324 			const unsigned char* pU = (const unsigned char*)p;
325 
326 			// Skip the stupid Microsoft UTF-8 Byte order marks
327 			if (	*(pU+0)==TIXML_UTF_LEAD_0
328 				 && *(pU+1)==TIXML_UTF_LEAD_1
329 				 && *(pU+2)==TIXML_UTF_LEAD_2 )
330 			{
331 				p += 3;
332 				continue;
333 			}
334 			else if(*(pU+0)==TIXML_UTF_LEAD_0
335 				 && *(pU+1)==0xbfU
336 				 && *(pU+2)==0xbeU )
337 			{
338 				p += 3;
339 				continue;
340 			}
341 			else if(*(pU+0)==TIXML_UTF_LEAD_0
342 				 && *(pU+1)==0xbfU
343 				 && *(pU+2)==0xbfU )
344 			{
345 				p += 3;
346 				continue;
347 			}
348 
349 			if ( IsWhiteSpace( *p ) )		// Still using old rules for white space.
350 				++p;
351 			else
352 				break;
353 		}
354 	}
355 	else
356 	{
357 		while ( *p && IsWhiteSpace( *p ) )
358 			++p;
359 	}
360 
361 	return p;
362 }
363 
364 #ifdef TIXML_USE_STL
StreamWhiteSpace(std::istream * in,TIXML_STRING * tag)365 /*static*/ bool TiXmlBase::StreamWhiteSpace( std::istream * in, TIXML_STRING * tag )
366 {
367 	for( ;; )
368 	{
369 		if ( !in->good() ) return false;
370 
371 		int c = in->peek();
372 		// At this scope, we can't get to a document. So fail silently.
373 		if ( !IsWhiteSpace( c ) || c <= 0 )
374 			return true;
375 
376 		*tag += (char) in->get();
377 	}
378 }
379 
StreamTo(std::istream * in,int character,TIXML_STRING * tag)380 /*static*/ bool TiXmlBase::StreamTo( std::istream * in, int character, TIXML_STRING * tag )
381 {
382 	//assert( character > 0 && character < 128 );	// else it won't work in utf-8
383 	while ( in->good() )
384 	{
385 		int c = in->peek();
386 		if ( c == character )
387 			return true;
388 		if ( c <= 0 )		// Silent failure: can't get document at this scope
389 			return false;
390 
391 		in->get();
392 		*tag += (char) c;
393 	}
394 	return false;
395 }
396 #endif
397 
398 // One of TinyXML's more performance demanding functions. Try to keep the memory overhead down. The
399 // "assign" optimization removes over 10% of the execution time.
400 //
ReadName(const char * p,TIXML_STRING * name,TiXmlEncoding encoding)401 const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding )
402 {
403 	// Oddly, not supported on some comilers,
404 	//name->clear();
405 	// So use this:
406 	*name = "";
407 	assert( p );
408 
409 	// Names start with letters or underscores.
410 	// Of course, in unicode, tinyxml has no idea what a letter *is*. The
411 	// algorithm is generous.
412 	//
413 	// After that, they can be letters, underscores, numbers,
414 	// hyphens, or colons. (Colons are valid ony for namespaces,
415 	// but tinyxml can't tell namespaces from names.)
416 	if (    p && *p
417 		 && ( IsAlpha( (unsigned char) *p, encoding ) || *p == '_' ) )
418 	{
419 		const char* start = p;
420 		while(		p && *p
421 				&&	(		IsAlphaNum( (unsigned char ) *p, encoding )
422 						 || *p == '_'
423 						 || *p == '-'
424 						 || *p == '.'
425 						 || *p == ':' ) )
426 		{
427 			//(*name) += *p; // expensive
428 			++p;
429 		}
430 		if ( p-start > 0 ) {
431 			name->assign( start, p-start );
432 		}
433 		return p;
434 	}
435 	return 0;
436 }
437 
GetEntity(const char * p,char * value,int * length,TiXmlEncoding encoding)438 const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXmlEncoding encoding )
439 {
440 	// Presume an entity, and pull it out.
441     TIXML_STRING ent;
442 	int i;
443 	*length = 0;
444 
445 	if ( *(p+1) && *(p+1) == '#' && *(p+2) )
446 	{
447 		unsigned long ucs = 0;
448 		ptrdiff_t delta = 0;
449 		unsigned mult = 1;
450 
451 		if ( *(p+2) == 'x' )
452 		{
453 			// Hexadecimal.
454 			if ( !*(p+3) ) return 0;
455 
456 			const char* q = p+3;
457 			q = strchr( q, ';' );
458 
459 			if ( !q || !*q ) return 0;
460 
461 			delta = q-p;
462 			--q;
463 
464 			while ( *q != 'x' )
465 			{
466 				if ( *q >= '0' && *q <= '9' )
467 					ucs += mult * (*q - '0');
468 				else if ( *q >= 'a' && *q <= 'f' )
469 					ucs += mult * (*q - 'a' + 10);
470 				else if ( *q >= 'A' && *q <= 'F' )
471 					ucs += mult * (*q - 'A' + 10 );
472 				else
473 					return 0;
474 				mult *= 16;
475 				--q;
476 			}
477 		}
478 		else
479 		{
480 			// Decimal.
481 			if ( !*(p+2) ) return 0;
482 
483 			const char* q = p+2;
484 			q = strchr( q, ';' );
485 
486 			if ( !q || !*q ) return 0;
487 
488 			delta = q-p;
489 			--q;
490 
491 			while ( *q != '#' )
492 			{
493 				if ( *q >= '0' && *q <= '9' )
494 					ucs += mult * (*q - '0');
495 				else
496 					return 0;
497 				mult *= 10;
498 				--q;
499 			}
500 		}
501 		if ( encoding == TIXML_ENCODING_UTF8 )
502 		{
503 			// convert the UCS to UTF-8
504 			ConvertUTF32ToUTF8( ucs, value, length );
505 		}
506 		else
507 		{
508 			*value = (char)ucs;
509 			*length = 1;
510 		}
511 		return p + delta + 1;
512 	}
513 
514 	// Now try to match it.
515 	for( i=0; i<NUM_ENTITY; ++i )
516 	{
517 		if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 )
518 		{
519 			assert( strlen( entity[i].str ) == entity[i].strLength );
520 			*value = entity[i].chr;
521 			*length = 1;
522 			return ( p + entity[i].strLength );
523 		}
524 	}
525 
526 	// So it wasn't an entity, its unrecognized, or something like that.
527 	*value = *p;	// Don't put back the last one, since we return it!
528 	//*length = 1;	// Leave unrecognized entities - this doesn't really work.
529 					// Just writes strange XML.
530 	return p+1;
531 }
532 
533 
StringEqual(const char * p,const char * tag,bool ignoreCase,TiXmlEncoding encoding)534 bool TiXmlBase::StringEqual( const char* p,
535 							 const char* tag,
536 							 bool ignoreCase,
537 							 TiXmlEncoding encoding )
538 {
539 	assert( p );
540 	assert( tag );
541 	if ( !p || !*p )
542 	{
543 		assert( 0 );
544 		return false;
545 	}
546 
547 	const char* q = p;
548 
549 	if ( ignoreCase )
550 	{
551 		while ( *q && *tag && ToLower( *q, encoding ) == ToLower( *tag, encoding ) )
552 		{
553 			++q;
554 			++tag;
555 		}
556 
557 		if ( *tag == 0 )
558 			return true;
559 	}
560 	else
561 	{
562 		while ( *q && *tag && *q == *tag )
563 		{
564 			++q;
565 			++tag;
566 		}
567 
568 		if ( *tag == 0 )		// Have we found the end of the tag, and everything equal?
569 			return true;
570 	}
571 	return false;
572 }
573 
ReadText(const char * p,TIXML_STRING * text,bool trimWhiteSpace,const char * endTag,bool caseInsensitive,TiXmlEncoding encoding)574 const char* TiXmlBase::ReadText(	const char* p,
575 									TIXML_STRING * text,
576 									bool trimWhiteSpace,
577 									const char* endTag,
578 									bool caseInsensitive,
579 									TiXmlEncoding encoding )
580 {
581     *text = "";
582 	if (    !trimWhiteSpace			// certain tags always keep whitespace
583 		 || !condenseWhiteSpace )	// if true, whitespace is always kept
584 	{
585 		// Keep all the white space.
586 		while (	   p && *p
587 				&& !StringEqual( p, endTag, caseInsensitive, encoding )
588 			  )
589 		{
590 			int len;
591 			char cArr[4] = { 0, 0, 0, 0 };
592 			p = GetChar( p, cArr, &len, encoding );
593 			text->append( cArr, len );
594 		}
595 	}
596 	else
597 	{
598 		bool whitespace = false;
599 
600 		// Remove leading white space:
601 		p = SkipWhiteSpace( p, encoding );
602 		while (	   p && *p
603 				&& !StringEqual( p, endTag, caseInsensitive, encoding ) )
604 		{
605 			if ( *p == '\r' || *p == '\n' )
606 			{
607 				whitespace = true;
608 				++p;
609 			}
610 			else if ( IsWhiteSpace( *p ) )
611 			{
612 				whitespace = true;
613 				++p;
614 			}
615 			else
616 			{
617 				// If we've found whitespace, add it before the
618 				// new character. Any whitespace just becomes a space.
619 				if ( whitespace )
620 				{
621 					(*text) += ' ';
622 					whitespace = false;
623 				}
624 				int len;
625 				char cArr[4] = { 0, 0, 0, 0 };
626 				p = GetChar( p, cArr, &len, encoding );
627 				if ( len == 1 )
628 					(*text) += cArr[0];	// more efficient
629 				else
630 					text->append( cArr, len );
631 			}
632 		}
633 	}
634 	if ( p && *p )
635 		p += strlen( endTag );
636 	return ( p && *p ) ? p : 0;
637 }
638 
639 #ifdef TIXML_USE_STL
640 
StreamIn(std::istream * in,TIXML_STRING * tag)641 void TiXmlDocument::StreamIn( std::istream * in, TIXML_STRING * tag )
642 {
643 	// The basic issue with a document is that we don't know what we're
644 	// streaming. Read something presumed to be a tag (and hope), then
645 	// identify it, and call the appropriate stream method on the tag.
646 	//
647 	// This "pre-streaming" will never read the closing ">" so the
648 	// sub-tag can orient itself.
649 
650 	if ( !StreamTo( in, '<', tag ) )
651 	{
652 		SetError( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
653 		return;
654 	}
655 
656 	while ( in->good() )
657 	{
658 		int tagIndex = (int) tag->length();
659 		while ( in->good() && in->peek() != '>' )
660 		{
661 			int c = in->get();
662 			if ( c <= 0 )
663 			{
664 				SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
665 				break;
666 			}
667 			(*tag) += (char) c;
668 		}
669 
670 		if ( in->good() )
671 		{
672 			// We now have something we presume to be a node of
673 			// some sort. Identify it, and call the node to
674 			// continue streaming.
675 			TiXmlNode* node = Identify( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING );
676 
677 			if ( node )
678 			{
679 				node->StreamIn( in, tag );
680 				bool isElement = node->ToElement() != 0;
681 				delete node;
682 				node = 0;
683 
684 				// If this is the root element, we're done. Parsing will be
685 				// done by the >> operator.
686 				if ( isElement )
687 				{
688 					return;
689 				}
690 			}
691 			else
692 			{
693 				SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
694 				return;
695 			}
696 		}
697 	}
698 	// We should have returned sooner.
699 	SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
700 }
701 
702 #endif
703 
Parse(const char * p,TiXmlParsingData * prevData,TiXmlEncoding encoding)704 const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding )
705 {
706 	ClearError();
707 
708 	// Parse away, at the document level. Since a document
709 	// contains nothing but other tags, most of what happens
710 	// here is skipping white space.
711 	if ( !p || !*p )
712 	{
713 		SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
714 		return 0;
715 	}
716 
717 	// Note that, for a document, this needs to come
718 	// before the while space skip, so that parsing
719 	// starts from the pointer we are given.
720 	location.Clear();
721 	if ( prevData )
722 	{
723 		location.row = prevData->cursor.row;
724 		location.col = prevData->cursor.col;
725 	}
726 	else
727 	{
728 		location.row = 0;
729 		location.col = 0;
730 	}
731 	TiXmlParsingData data( p, TabSize(), location.row, location.col );
732 	location = data.Cursor();
733 
734 	if ( encoding == TIXML_ENCODING_UNKNOWN )
735 	{
736 		// Check for the Microsoft UTF-8 lead bytes.
737 		const unsigned char* pU = (const unsigned char*)p;
738 		if (	*(pU+0) && *(pU+0) == TIXML_UTF_LEAD_0
739 			 && *(pU+1) && *(pU+1) == TIXML_UTF_LEAD_1
740 			 && *(pU+2) && *(pU+2) == TIXML_UTF_LEAD_2 )
741 		{
742 			encoding = TIXML_ENCODING_UTF8;
743 			useMicrosoftBOM = true;
744 		}
745 	}
746 
747     p = SkipWhiteSpace( p, encoding );
748 	if ( !p )
749 	{
750 		SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
751 		return 0;
752 	}
753 
754 	while ( p && *p )
755 	{
756 		TiXmlNode* node = Identify( p, encoding );
757 		if ( node )
758 		{
759 			p = node->Parse( p, &data, encoding );
760 			LinkEndChild( node );
761 		}
762 		else
763 		{
764 			break;
765 		}
766 
767 		// Did we get encoding info?
768 		if (    encoding == TIXML_ENCODING_UNKNOWN
769 			 && node->ToDeclaration() )
770 		{
771 			TiXmlDeclaration* dec = node->ToDeclaration();
772 			const char* enc = dec->Encoding();
773 			assert( enc );
774 
775 			if ( *enc == 0 )
776 				encoding = TIXML_ENCODING_UTF8;
777 			else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) )
778 				encoding = TIXML_ENCODING_UTF8;
779 			else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) )
780 				encoding = TIXML_ENCODING_UTF8;	// incorrect, but be nice
781 			else
782 				encoding = TIXML_ENCODING_LEGACY;
783 		}
784 
785 		p = SkipWhiteSpace( p, encoding );
786 	}
787 
788 	// Was this empty?
789 	if ( !firstChild ) {
790 		SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding );
791 		return 0;
792 	}
793 
794 	// All is well.
795 	return p;
796 }
797 
SetError(int err,const char * pError,TiXmlParsingData * data,TiXmlEncoding encoding)798 void TiXmlDocument::SetError( int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding )
799 {
800 	// The first error in a chain is more accurate - don't set again!
801 	if ( error )
802 		return;
803 
804 	assert( err > 0 && err < TIXML_ERROR_STRING_COUNT );
805 	error   = true;
806 	errorId = err;
807 	errorDesc = errorString[ errorId ];
808 
809 	errorLocation.Clear();
810 	if ( pError && data )
811 	{
812 		data->Stamp( pError, encoding );
813 		errorLocation = data->Cursor();
814 	}
815 }
816 
817 
Identify(const char * p,TiXmlEncoding encoding)818 TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding )
819 {
820 	TiXmlNode* returnNode = 0;
821 
822 	p = SkipWhiteSpace( p, encoding );
823 	if( !p || !*p || *p != '<' )
824 	{
825 		return 0;
826 	}
827 
828 	p = SkipWhiteSpace( p, encoding );
829 
830 	if ( !p || !*p )
831 	{
832 		return 0;
833 	}
834 
835 	// What is this thing?
836 	// - Elements start with a letter or underscore, but xml is reserved.
837 	// - Comments: <!--
838 	// - Decleration: <?xml
839 	// - Everthing else is unknown to tinyxml.
840 	//
841 
842 	const char* xmlHeader = { "<?xml" };
843 	const char* commentHeader = { "<!--" };
844 	const char* dtdHeader = { "<!" };
845 	const char* cdataHeader = { "<![CDATA[" };
846 
847 	if ( StringEqual( p, xmlHeader, true, encoding ) )
848 	{
849 		#ifdef DEBUG_PARSER
850 			TIXML_LOG( "XML parsing Declaration\n" );
851 		#endif
852 		returnNode = new TiXmlDeclaration();
853 	}
854 	else if ( StringEqual( p, commentHeader, false, encoding ) )
855 	{
856 		#ifdef DEBUG_PARSER
857 			TIXML_LOG( "XML parsing Comment\n" );
858 		#endif
859 		returnNode = new TiXmlComment();
860 	}
861 	else if ( StringEqual( p, cdataHeader, false, encoding ) )
862 	{
863 		#ifdef DEBUG_PARSER
864 			TIXML_LOG( "XML parsing CDATA\n" );
865 		#endif
866 		TiXmlText* text = new TiXmlText( "" );
867 		text->SetCDATA( true );
868 		returnNode = text;
869 	}
870 	else if ( StringEqual( p, dtdHeader, false, encoding ) )
871 	{
872 		#ifdef DEBUG_PARSER
873 			TIXML_LOG( "XML parsing Unknown(1)\n" );
874 		#endif
875 		returnNode = new TiXmlUnknown();
876 	}
877 	else if (    IsAlpha( *(p+1), encoding )
878 			  || *(p+1) == '_' )
879 	{
880 		#ifdef DEBUG_PARSER
881 			TIXML_LOG( "XML parsing Element\n" );
882 		#endif
883 		returnNode = new TiXmlElement( "" );
884 	}
885 	else
886 	{
887 		#ifdef DEBUG_PARSER
888 			TIXML_LOG( "XML parsing Unknown(2)\n" );
889 		#endif
890 		returnNode = new TiXmlUnknown();
891 	}
892 
893 	if ( returnNode )
894 	{
895 		// Set the parent, so it can report errors
896 		returnNode->parent = this;
897 	}
898 	return returnNode;
899 }
900 
901 #ifdef TIXML_USE_STL
902 
StreamIn(std::istream * in,TIXML_STRING * tag)903 void TiXmlElement::StreamIn (std::istream * in, TIXML_STRING * tag)
904 {
905 	// We're called with some amount of pre-parsing. That is, some of "this"
906 	// element is in "tag". Go ahead and stream to the closing ">"
907 	while( in->good() )
908 	{
909 		int c = in->get();
910 		if ( c <= 0 )
911 		{
912 			TiXmlDocument* document = GetDocument();
913 			if ( document )
914 				document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
915 			return;
916 		}
917 		(*tag) += (char) c ;
918 
919 		if ( c == '>' )
920 			break;
921 	}
922 
923 	if ( tag->length() < 3 ) return;
924 
925 	// Okay...if we are a "/>" tag, then we're done. We've read a complete tag.
926 	// If not, identify and stream.
927 
928 	if (    tag->at( tag->length() - 1 ) == '>'
929 		 && tag->at( tag->length() - 2 ) == '/' )
930 	{
931 		// All good!
932 		return;
933 	}
934 	else if ( tag->at( tag->length() - 1 ) == '>' )
935 	{
936 		// There is more. Could be:
937 		//		text
938 		//		cdata text (which looks like another node)
939 		//		closing tag
940 		//		another node.
941 		for ( ;; )
942 		{
943 			StreamWhiteSpace( in, tag );
944 
945 			// Do we have text?
946 			if ( in->good() && in->peek() != '<' )
947 			{
948 				// Yep, text.
949 				TiXmlText text( "" );
950 				text.StreamIn( in, tag );
951 
952 				// What follows text is a closing tag or another node.
953 				// Go around again and figure it out.
954 				continue;
955 			}
956 
957 			// We now have either a closing tag...or another node.
958 			// We should be at a "<", regardless.
959 			if ( !in->good() ) return;
960 			assert( in->peek() == '<' );
961 			int tagIndex = (int) tag->length();
962 
963 			bool closingTag = false;
964 			bool firstCharFound = false;
965 
966 			for( ;; )
967 			{
968 				if ( !in->good() )
969 					return;
970 
971 				int c = in->peek();
972 				if ( c <= 0 )
973 				{
974 					TiXmlDocument* document = GetDocument();
975 					if ( document )
976 						document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
977 					return;
978 				}
979 
980 				if ( c == '>' )
981 					break;
982 
983 				*tag += (char) c;
984 				in->get();
985 
986 				// Early out if we find the CDATA id.
987 				if ( c == '[' && tag->size() >= 9 )
988 				{
989 					size_t len = tag->size();
990 					const char* start = tag->c_str() + len - 9;
991 					if ( strcmp( start, "<![CDATA[" ) == 0 ) {
992 						assert( !closingTag );
993 						break;
994 					}
995 				}
996 
997 				if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) )
998 				{
999 					firstCharFound = true;
1000 					if ( c == '/' )
1001 						closingTag = true;
1002 				}
1003 			}
1004 			// If it was a closing tag, then read in the closing '>' to clean up the input stream.
1005 			// If it was not, the streaming will be done by the tag.
1006 			if ( closingTag )
1007 			{
1008 				if ( !in->good() )
1009 					return;
1010 
1011 				int c = in->get();
1012 				if ( c <= 0 )
1013 				{
1014 					TiXmlDocument* document = GetDocument();
1015 					if ( document )
1016 						document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1017 					return;
1018 				}
1019 				assert( c == '>' );
1020 				*tag += (char) c;
1021 
1022 				// We are done, once we've found our closing tag.
1023 				return;
1024 			}
1025 			else
1026 			{
1027 				// If not a closing tag, id it, and stream.
1028 				const char* tagloc = tag->c_str() + tagIndex;
1029 				TiXmlNode* node = Identify( tagloc, TIXML_DEFAULT_ENCODING );
1030 				if ( !node )
1031 					return;
1032 				node->StreamIn( in, tag );
1033 				delete node;
1034 				node = 0;
1035 
1036 				// No return: go around from the beginning: text, closing tag, or node.
1037 			}
1038 		}
1039 	}
1040 }
1041 #endif
1042 
Parse(const char * p,TiXmlParsingData * data,TiXmlEncoding encoding)1043 const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1044 {
1045 	p = SkipWhiteSpace( p, encoding );
1046 	TiXmlDocument* document = GetDocument();
1047 
1048 	if ( !p || !*p )
1049 	{
1050 		if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding );
1051 		return 0;
1052 	}
1053 
1054 	if ( data )
1055 	{
1056 		data->Stamp( p, encoding );
1057 		location = data->Cursor();
1058 	}
1059 
1060 	if ( *p != '<' )
1061 	{
1062 		if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, p, data, encoding );
1063 		return 0;
1064 	}
1065 
1066 	p = SkipWhiteSpace( p+1, encoding );
1067 
1068 	// Read the name.
1069 	const char* pErr = p;
1070 
1071     p = ReadName( p, &value, encoding );
1072 	if ( !p || !*p )
1073 	{
1074 		if ( document )	document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding );
1075 		return 0;
1076 	}
1077 
1078     TIXML_STRING endTag ("</");
1079 	endTag += value;
1080 
1081 	// Check for and read attributes. Also look for an empty
1082 	// tag or an end tag.
1083 	while ( p && *p )
1084 	{
1085 		pErr = p;
1086 		p = SkipWhiteSpace( p, encoding );
1087 		if ( !p || !*p )
1088 		{
1089 			if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
1090 			return 0;
1091 		}
1092 		if ( *p == '/' )
1093 		{
1094 			++p;
1095 			// Empty tag.
1096 			if ( *p  != '>' )
1097 			{
1098 				if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY, p, data, encoding );
1099 				return 0;
1100 			}
1101 			return (p+1);
1102 		}
1103 		else if ( *p == '>' )
1104 		{
1105 			// Done with attributes (if there were any.)
1106 			// Read the value -- which can include other
1107 			// elements -- read the end tag, and return.
1108 			++p;
1109 			p = ReadValue( p, data, encoding );		// Note this is an Element method, and will set the error if one happens.
1110 			if ( !p || !*p ) {
1111 				// We were looking for the end tag, but found nothing.
1112 				// Fix for [ 1663758 ] Failure to report error on bad XML
1113 				if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
1114 				return 0;
1115 			}
1116 
1117 			// We should find the end tag now
1118 			// note that:
1119 			// </foo > and
1120 			// </foo>
1121 			// are both valid end tags.
1122 			if ( StringEqual( p, endTag.c_str(), false, encoding ) )
1123 			{
1124 				p += endTag.length();
1125 				p = SkipWhiteSpace( p, encoding );
1126 				if ( p && *p && *p == '>' ) {
1127 					++p;
1128 					return p;
1129 				}
1130 				if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
1131 				return 0;
1132 			}
1133 			else
1134 			{
1135 				if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
1136 				return 0;
1137 			}
1138 		}
1139 		else
1140 		{
1141 			// Try to read an attribute:
1142 			TiXmlAttribute* attrib = new TiXmlAttribute();
1143 			if ( !attrib )
1144 			{
1145 				return 0;
1146 			}
1147 
1148 			attrib->SetDocument( document );
1149 			pErr = p;
1150 			p = attrib->Parse( p, data, encoding );
1151 
1152 			if ( !p || !*p )
1153 			{
1154 				if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
1155 				delete attrib;
1156 				return 0;
1157 			}
1158 
1159 			// Handle the strange case of double attributes:
1160 			#ifdef TIXML_USE_STL
1161 			TiXmlAttribute* node = attributeSet.Find( attrib->NameTStr() );
1162 			#else
1163 			TiXmlAttribute* node = attributeSet.Find( attrib->Name() );
1164 			#endif
1165 			if ( node )
1166 			{
1167 				if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
1168 				delete attrib;
1169 				return 0;
1170 			}
1171 
1172 			attributeSet.Add( attrib );
1173 		}
1174 	}
1175 	return p;
1176 }
1177 
1178 
ReadValue(const char * p,TiXmlParsingData * data,TiXmlEncoding encoding)1179 const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1180 {
1181 	TiXmlDocument* document = GetDocument();
1182 
1183 	// Read in text and elements in any order.
1184 	const char* pWithWhiteSpace = p;
1185 	p = SkipWhiteSpace( p, encoding );
1186 
1187 	while ( p && *p )
1188 	{
1189 		if ( *p != '<' )
1190 		{
1191 			// Take what we have, make a text element.
1192 			TiXmlText* textNode = new TiXmlText( "" );
1193 
1194 			if ( !textNode )
1195 			{
1196 			    return 0;
1197 			}
1198 
1199 			if ( TiXmlBase::IsWhiteSpaceCondensed() )
1200 			{
1201 				p = textNode->Parse( p, data, encoding );
1202 			}
1203 			else
1204 			{
1205 				// Special case: we want to keep the white space
1206 				// so that leading spaces aren't removed.
1207 				p = textNode->Parse( pWithWhiteSpace, data, encoding );
1208 			}
1209 
1210 			if ( !textNode->Blank() )
1211 				LinkEndChild( textNode );
1212 			else
1213 				delete textNode;
1214 		}
1215 		else
1216 		{
1217 			// We hit a '<'
1218 			// Have we hit a new element or an end tag? This could also be
1219 			// a TiXmlText in the "CDATA" style.
1220 			if ( StringEqual( p, "</", false, encoding ) )
1221 			{
1222 				return p;
1223 			}
1224 			else
1225 			{
1226 				TiXmlNode* node = Identify( p, encoding );
1227 				if ( node )
1228 				{
1229 					p = node->Parse( p, data, encoding );
1230 					LinkEndChild( node );
1231 				}
1232 				else
1233 				{
1234 					return 0;
1235 				}
1236 			}
1237 		}
1238 		pWithWhiteSpace = p;
1239 		p = SkipWhiteSpace( p, encoding );
1240 	}
1241 
1242 	if ( !p )
1243 	{
1244 		if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE, 0, 0, encoding );
1245 	}
1246 	return p;
1247 }
1248 
1249 
#ifdef TIXML_USE_STL
// Appends characters from 'in' to 'tag' up to and including the first '>'.
// A character value <= 0 (which covers end-of-file) records
// TIXML_ERROR_EMBEDDED_NULL on the owning document, if any, and stops.
void TiXmlUnknown::StreamIn( std::istream * in, TIXML_STRING * tag )
{
	for ( ;; )
	{
		if ( !in->good() )
			return;

		const int ch = in->get();
		if ( ch <= 0 )
		{
			if ( TiXmlDocument* owner = GetDocument() )
				owner->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
			return;
		}

		(*tag) += static_cast<char>( ch );

		// The closing bracket ends the unknown tag.
		if ( ch == '>' )
			return;
	}
}
#endif
1273 
1274 
Parse(const char * p,TiXmlParsingData * data,TiXmlEncoding encoding)1275 const char* TiXmlUnknown::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1276 {
1277 	TiXmlDocument* document = GetDocument();
1278 	p = SkipWhiteSpace( p, encoding );
1279 
1280 	if ( data )
1281 	{
1282 		data->Stamp( p, encoding );
1283 		location = data->Cursor();
1284 	}
1285 	if ( !p || !*p || *p != '<' )
1286 	{
1287 		if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, p, data, encoding );
1288 		return 0;
1289 	}
1290 	++p;
1291     value = "";
1292 
1293 	while ( p && *p && *p != '>' )
1294 	{
1295 		value += *p;
1296 		++p;
1297 	}
1298 
1299 	if ( !p )
1300 	{
1301 		if ( document )
1302 			document->SetError( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding );
1303 	}
1304 	if ( p && *p == '>' )
1305 		return p+1;
1306 	return p;
1307 }
1308 
#ifdef TIXML_USE_STL
// Appends characters from 'in' to 'tag' until the comment terminator "-->"
// has been appended or the stream stops being good.
// A character value <= 0 (which covers end-of-file) records
// TIXML_ERROR_EMBEDDED_NULL on the owning document, if any, and stops.
void TiXmlComment::StreamIn( std::istream * in, TIXML_STRING * tag )
{
	while ( in->good() )
	{
		int c = in->get();
		if ( c <= 0 )
		{
			TiXmlDocument* document = GetDocument();
			if ( document )
				document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
			return;
		}

		(*tag) += (char) c;

		// Only look back for the two dashes once the buffer can actually
		// hold "-->". Without this guard, length() - 3 wraps around on a
		// short buffer and at() throws std::out_of_range. This mirrors the
		// size check TiXmlText::StreamIn performs for "]]>".
		if ( c == '>' && tag->length() >= 3 )
		{
			const size_t len = tag->length();
			if (    tag->at( len - 2 ) == '-'
				 && tag->at( len - 3 ) == '-' )
			{
				// All is well.
				return;
			}
		}
	}
}
#endif
1335 
1336 
Parse(const char * p,TiXmlParsingData * data,TiXmlEncoding encoding)1337 const char* TiXmlComment::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1338 {
1339 	TiXmlDocument* document = GetDocument();
1340 	value = "";
1341 
1342 	p = SkipWhiteSpace( p, encoding );
1343 
1344 	if ( data )
1345 	{
1346 		data->Stamp( p, encoding );
1347 		location = data->Cursor();
1348 	}
1349 	const char* startTag = "<!--";
1350 	const char* endTag   = "-->";
1351 
1352 	if ( !StringEqual( p, startTag, false, encoding ) )
1353 	{
1354 		if ( document )
1355 			document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding );
1356 		return 0;
1357 	}
1358 	p += strlen( startTag );
1359 
1360 	// [ 1475201 ] TinyXML parses entities in comments
1361 	// Oops - ReadText doesn't work, because we don't want to parse the entities.
1362 	// p = ReadText( p, &value, false, endTag, false, encoding );
1363 	//
1364 	// from the XML spec:
1365 	/*
1366 	 [Definition: Comments may appear anywhere in a document outside other markup; in addition,
1367 	              they may appear within the document type declaration at places allowed by the grammar.
1368 				  They are not part of the document's character data; an XML processor MAY, but need not,
1369 				  make it possible for an application to retrieve the text of comments. For compatibility,
1370 				  the string "--" (double-hyphen) MUST NOT occur within comments.] Parameter entity
1371 				  references MUST NOT be recognized within comments.
1372 
1373 				  An example of a comment:
1374 
1375 				  <!-- declarations for <head> & <body> -->
1376 	*/
1377 
1378     value = "";
1379 	// Keep all the white space.
1380 	while (	p && *p && !StringEqual( p, endTag, false, encoding ) )
1381 	{
1382 		value.append( p, 1 );
1383 		++p;
1384 	}
1385 	if ( p && *p )
1386 		p += strlen( endTag );
1387 
1388 	return p;
1389 }
1390 
1391 
Parse(const char * p,TiXmlParsingData * data,TiXmlEncoding encoding)1392 const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1393 {
1394 	p = SkipWhiteSpace( p, encoding );
1395 	if ( !p || !*p ) return 0;
1396 
1397 	if ( data )
1398 	{
1399 		data->Stamp( p, encoding );
1400 		location = data->Cursor();
1401 	}
1402 	// Read the name, the '=' and the value.
1403 	const char* pErr = p;
1404 	p = ReadName( p, &name, encoding );
1405 	if ( !p || !*p )
1406 	{
1407 		if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
1408 		return 0;
1409 	}
1410 	p = SkipWhiteSpace( p, encoding );
1411 	if ( !p || !*p || *p != '=' )
1412 	{
1413 		if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1414 		return 0;
1415 	}
1416 
1417 	++p;	// skip '='
1418 	p = SkipWhiteSpace( p, encoding );
1419 	if ( !p || !*p )
1420 	{
1421 		if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1422 		return 0;
1423 	}
1424 
1425 	const char* end;
1426 	const char SINGLE_QUOTE = '\'';
1427 	const char DOUBLE_QUOTE = '\"';
1428 
1429 	if ( *p == SINGLE_QUOTE )
1430 	{
1431 		++p;
1432 		end = "\'";		// single quote in string
1433 		p = ReadText( p, &value, false, end, false, encoding );
1434 	}
1435 	else if ( *p == DOUBLE_QUOTE )
1436 	{
1437 		++p;
1438 		end = "\"";		// double quote in string
1439 		p = ReadText( p, &value, false, end, false, encoding );
1440 	}
1441 	else
1442 	{
1443 		// All attribute values should be in single or double quotes.
1444 		// But this is such a common error that the parser will try
1445 		// its best, even without them.
1446 		value = "";
1447 		while (    p && *p											// existence
1448 				&& !IsWhiteSpace( *p )								// whitespace
1449 				&& *p != '/' && *p != '>' )							// tag end
1450 		{
1451 			if ( *p == SINGLE_QUOTE || *p == DOUBLE_QUOTE ) {
1452 				// [ 1451649 ] Attribute values with trailing quotes not handled correctly
1453 				// We did not have an opening quote but seem to have a
1454 				// closing one. Give up and throw an error.
1455 				if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1456 				return 0;
1457 			}
1458 			value += *p;
1459 			++p;
1460 		}
1461 	}
1462 	return p;
1463 }
1464 
#ifdef TIXML_USE_STL
// Appends text characters from 'in' to 'tag'.
//   - Ordinary text stops in front of the next '<', which is only peeked at
//     and so stays in the stream.
//   - CDATA text stops once the terminator "]]>" has been appended.
// A character value <= 0 (which covers end-of-file) records
// TIXML_ERROR_EMBEDDED_NULL on the owning document, if any, and stops.
void TiXmlText::StreamIn( std::istream * in, TIXML_STRING * tag )
{
	for ( ;; )
	{
		if ( !in->good() )
			return;

		const int ch = in->peek();

		if ( ch == '<' && !cdata )
			return;

		if ( ch <= 0 )
		{
			if ( TiXmlDocument* owner = GetDocument() )
				owner->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
			return;
		}

		(*tag) += static_cast<char>( ch );
		in->get();	// consume the character that was peeked

		// In CDATA mode a '>' preceded by "]]" terminates the section.
		const bool closesCData =
			   cdata
			&& ch == '>'
			&& tag->size() >= 3
			&& (*tag)[ tag->size() - 2 ] == ']'
			&& (*tag)[ tag->size() - 3 ] == ']';
		if ( closesCData )
			return;
	}
}
#endif
1496 
Parse(const char * p,TiXmlParsingData * data,TiXmlEncoding encoding)1497 const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1498 {
1499 	value = "";
1500 	TiXmlDocument* document = GetDocument();
1501 
1502 	if ( data )
1503 	{
1504 		data->Stamp( p, encoding );
1505 		location = data->Cursor();
1506 	}
1507 
1508 	const char* const startTag = "<![CDATA[";
1509 	const char* const endTag   = "]]>";
1510 
1511 	if ( cdata || StringEqual( p, startTag, false, encoding ) )
1512 	{
1513 		cdata = true;
1514 
1515 		if ( !StringEqual( p, startTag, false, encoding ) )
1516 		{
1517 			if ( document )
1518 				document->SetError( TIXML_ERROR_PARSING_CDATA, p, data, encoding );
1519 			return 0;
1520 		}
1521 		p += strlen( startTag );
1522 
1523 		// Keep all the white space, ignore the encoding, etc.
1524 		while (	   p && *p
1525 				&& !StringEqual( p, endTag, false, encoding )
1526 			  )
1527 		{
1528 			value += *p;
1529 			++p;
1530 		}
1531 
1532 		TIXML_STRING dummy;
1533 		p = ReadText( p, &dummy, false, endTag, false, encoding );
1534 		return p;
1535 	}
1536 	else
1537 	{
1538 		bool ignoreWhite = true;
1539 
1540 		const char* end = "<";
1541 		p = ReadText( p, &value, ignoreWhite, end, false, encoding );
1542 		if ( p && *p )
1543 			return p-1;	// don't truncate the '<'
1544 		return 0;
1545 	}
1546 }
1547 
#ifdef TIXML_USE_STL
// Appends characters from 'in' to 'tag' up to and including the first '>'.
// A character value <= 0 (which covers end-of-file) records
// TIXML_ERROR_EMBEDDED_NULL on the owning document, if any, and stops.
void TiXmlDeclaration::StreamIn( std::istream * in, TIXML_STRING * tag )
{
	for ( ;; )
	{
		if ( !in->good() )
			return;

		const int ch = in->get();
		if ( ch <= 0 )
		{
			if ( TiXmlDocument* owner = GetDocument() )
				owner->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
			return;
		}

		(*tag) += static_cast<char>( ch );

		// The closing bracket ends the declaration.
		if ( ch == '>' )
			return;
	}
}
#endif
1571 
Parse(const char * p,TiXmlParsingData * data,TiXmlEncoding _encoding)1572 const char* TiXmlDeclaration::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding _encoding )
1573 {
1574 	p = SkipWhiteSpace( p, _encoding );
1575 	// Find the beginning, find the end, and look for
1576 	// the stuff in-between.
1577 	TiXmlDocument* document = GetDocument();
1578 	if ( !p || !*p || !StringEqual( p, "<?xml", true, _encoding ) )
1579 	{
1580 		if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding );
1581 		return 0;
1582 	}
1583 	if ( data )
1584 	{
1585 		data->Stamp( p, _encoding );
1586 		location = data->Cursor();
1587 	}
1588 	p += 5;
1589 
1590 	version = "";
1591 	encoding = "";
1592 	standalone = "";
1593 
1594 	while ( p && *p )
1595 	{
1596 		if ( *p == '>' )
1597 		{
1598 			++p;
1599 			return p;
1600 		}
1601 
1602 		p = SkipWhiteSpace( p, _encoding );
1603 		if ( StringEqual( p, "version", true, _encoding ) )
1604 		{
1605 			TiXmlAttribute attrib;
1606 			p = attrib.Parse( p, data, _encoding );
1607 			version = attrib.Value();
1608 		}
1609 		else if ( StringEqual( p, "encoding", true, _encoding ) )
1610 		{
1611 			TiXmlAttribute attrib;
1612 			p = attrib.Parse( p, data, _encoding );
1613 			encoding = attrib.Value();
1614 		}
1615 		else if ( StringEqual( p, "standalone", true, _encoding ) )
1616 		{
1617 			TiXmlAttribute attrib;
1618 			p = attrib.Parse( p, data, _encoding );
1619 			standalone = attrib.Value();
1620 		}
1621 		else
1622 		{
1623 			// Read over whatever it is.
1624 			while( p && *p && *p != '>' && !IsWhiteSpace( *p ) )
1625 				++p;
1626 		}
1627 	}
1628 	return 0;
1629 }
1630 
Blank() const1631 bool TiXmlText::Blank() const
1632 {
1633 	for ( unsigned i=0; i<value.length(); i++ )
1634 		if ( !IsWhiteSpace( value[i] ) )
1635 			return false;
1636 	return true;
1637 }
1638 
1639