1 /*
2 www.sourceforge.net/projects/tinyxml
3 Original code (2.0 and earlier )copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com)
4 
5 This software is provided 'as-is', without any express or implied
6 warranty. In no event will the authors be held liable for any
7 damages arising from the use of this software.
8 
9 Permission is granted to anyone to use this software for any
10 purpose, including commercial applications, and to alter it and
11 redistribute it freely, subject to the following restrictions:
12 
13 1. The origin of this software must not be misrepresented; you must
14 not claim that you wrote the original software. If you use this
15 software in a product, an acknowledgment in the product documentation
16 would be appreciated but is not required.
17 
18 2. Altered source versions must be plainly marked as such, and
19 must not be misrepresented as being the original software.
20 
21 3. This notice may not be removed or altered from any source
22 distribution.
23 */
24 
25 #include <ctype.h>
26 #include <stddef.h>
27 #include <new>
28 
29 #include "tinyxml.h"
30 
31 //#define DEBUG_PARSER
32 #if defined( DEBUG_PARSER )
33 #	if defined( DEBUG ) && defined( _MSC_VER )
34 #		include <windows.h>
35 #		define TIXML_LOG OutputDebugString
36 #	else
37 #		define TIXML_LOG printf
38 #	endif
39 #endif
40 
// The predefined XML entities recognized by the parser. Each row holds the
// entity text, the length of that text, and the single character it stands
// for; GetEntity() matches input against the rows in order.
//
// Note that "PutString" hardcodes the same list, so this table is less
// flexible than it appears: changing the entries or their order will
// break PutString.
TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] =
{
	{ "&amp;",  5, '&' },
	{ "&lt;",   4, '<' },
	{ "&gt;",   4, '>' },
	{ "&quot;", 6, '\"' },
	{ "&apos;", 6, '\'' }
};
52 
// Bunch of unicode info at:
//		http://www.unicode.org/faq/utf_bom.html
// Including the basis of the table below, which determines the number of
// bytes in a sequence from its lead byte. 1 is stored for invalid lead
// bytes -- although the result will be junk, pass it through as much as possible.
// Beware of the non-characters in UTF-8:
//				ef bb bf (Microsoft "lead bytes")
//				ef bf be
//				ef bf bf
62 
// The three bytes EF BB BF (the Microsoft "lead bytes"). The parser treats
// this sequence as zero-width when counting columns and skips it along with
// white space; at the start of a document of unknown encoding it also marks
// the text as UTF-8.
const unsigned char TIXML_UTF_LEAD_0 = 0xefU;
const unsigned char TIXML_UTF_LEAD_1 = 0xbbU;
const unsigned char TIXML_UTF_LEAD_2 = 0xbfU;
66 
// Length in bytes of the UTF-8 sequence introduced by each possible lead
// byte. The table is indexed by the lead byte's value (row = high nibble,
// column = low nibble). Bytes that cannot start a sequence map to 1 so that
// a scanner always moves forward by at least one byte.
const int TiXmlBase::utf8ByteTable[256] =
{
	//	0	1	2	3	4	5	6	7	8	9	a	b	c	d	e	f
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x00
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x10
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x20
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x30
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x40
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x50
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x60
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x70	End of ASCII range
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x80 0x80 to 0xc1 invalid
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0x90
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0xa0
		1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	// 0xb0
		1,	1,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	// 0xc0 0xc2 to 0xdf 2 byte
		2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	2,	// 0xd0
		3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	3,	// 0xe0 0xe0 to 0xef 3 byte
		4,	4,	4,	4,	4,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1,	1	// 0xf0 0xf0 to 0xf4 4 byte, 0xf5 and higher invalid
};
87 
88 
ConvertUTF32ToUTF8(unsigned long input,char * output,int * length)89 void TiXmlBase::ConvertUTF32ToUTF8( unsigned long input, char* output, int* length )
90 {
91 	const unsigned long BYTE_MASK = 0xBF;
92 	const unsigned long BYTE_MARK = 0x80;
93 	const unsigned long FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
94 
95 	if (input < 0x80)
96 		*length = 1;
97 	else if ( input < 0x800 )
98 		*length = 2;
99 	else if ( input < 0x10000 )
100 		*length = 3;
101 	else if ( input < 0x200000 )
102 		*length = 4;
103 	else
104 		{ *length = 0; return; }	// This code won't covert this correctly anyway.
105 
106 	output += *length;
107 
108 	// Scary scary fall throughs.
109 	switch (*length)
110 	{
111 		case 4:
112 			--output;
113 			*output = (char)((input | BYTE_MARK) & BYTE_MASK);
114 			input >>= 6;
115 		case 3:
116 			--output;
117 			*output = (char)((input | BYTE_MARK) & BYTE_MASK);
118 			input >>= 6;
119 		case 2:
120 			--output;
121 			*output = (char)((input | BYTE_MARK) & BYTE_MASK);
122 			input >>= 6;
123 		case 1:
124 			--output;
125 			*output = (char)(input | FIRST_BYTE_MARK[*length]);
126 	}
127 }
128 
129 
IsAlpha(unsigned char anyByte,TiXmlEncoding)130 /*static*/ int TiXmlBase::IsAlpha( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
131 {
132 	// This will only work for low-ascii, everything else is assumed to be a valid
133 	// letter. I'm not sure this is the best approach, but it is quite tricky trying
134 	// to figure out alhabetical vs. not across encoding. So take a very
135 	// conservative approach.
136 
137 //	if ( encoding == TIXML_ENCODING_UTF8 )
138 //	{
139 		if ( anyByte < 127 )
140 			return isalpha( anyByte );
141 		else
142 			return 1;	// What else to do? The unicode set is huge...get the english ones right.
143 //	}
144 //	else
145 //	{
146 //		return isalpha( anyByte );
147 //	}
148 }
149 
150 
IsAlphaNum(unsigned char anyByte,TiXmlEncoding)151 /*static*/ int TiXmlBase::IsAlphaNum( unsigned char anyByte, TiXmlEncoding /*encoding*/ )
152 {
153 	// This will only work for low-ascii, everything else is assumed to be a valid
154 	// letter. I'm not sure this is the best approach, but it is quite tricky trying
155 	// to figure out alhabetical vs. not across encoding. So take a very
156 	// conservative approach.
157 
158 //	if ( encoding == TIXML_ENCODING_UTF8 )
159 //	{
160 		if ( anyByte < 127 )
161 			return isalnum( anyByte );
162 		else
163 			return 1;	// What else to do? The unicode set is huge...get the english ones right.
164 //	}
165 //	else
166 //	{
167 //		return isalnum( anyByte );
168 //	}
169 }
170 
171 
172 class TiXmlParsingData
173 {
174 	friend class TiXmlDocument;
175   public:
176 	void Stamp( const char* now, TiXmlEncoding encoding );
177 
Cursor()178 	const TiXmlCursor& Cursor()	{ return cursor; }
179 
180   private:
181 	// Only used by the document!
TiXmlParsingData(const char * start,int _tabsize,int row,int col)182 	TiXmlParsingData( const char* start, int _tabsize, int row, int col )
183 	{
184 		assert( start );
185 		stamp = start;
186 		tabsize = _tabsize;
187 		cursor.row = row;
188 		cursor.col = col;
189 	}
190 
191 	TiXmlCursor		cursor;
192 	const char*		stamp;
193 	int				tabsize;
194 };
195 
196 
Stamp(const char * now,TiXmlEncoding encoding)197 void TiXmlParsingData::Stamp( const char* now, TiXmlEncoding encoding )
198 {
199 	assert( now );
200 
201 	// Do nothing if the tabsize is 0.
202 	if ( tabsize < 1 )
203 	{
204 		return;
205 	}
206 
207 	// Get the current row, column.
208 	int row = cursor.row;
209 	int col = cursor.col;
210 	const char* p = stamp;
211 	assert( p );
212 
213 	while ( p < now )
214 	{
215 		// Treat p as unsigned, so we have a happy compiler.
216 		const unsigned char* pU = (const unsigned char*)p;
217 
218 		// Code contributed by Fletcher Dunn: (modified by lee)
219 		switch (*pU) {
220 			case 0:
221 				// We *should* never get here, but in case we do, don't
222 				// advance past the terminating null character, ever
223 				return;
224 
225 			case '\r':
226 				// bump down to the next line
227 				++row;
228 				col = 0;
229 				// Eat the character
230 				++p;
231 
232 				// Check for \r\n sequence, and treat this as a single character
233 				if (*p == '\n') {
234 					++p;
235 				}
236 				break;
237 
238 			case '\n':
239 				// bump down to the next line
240 				++row;
241 				col = 0;
242 
243 				// Eat the character
244 				++p;
245 
246 				// Check for \n\r sequence, and treat this as a single
247 				// character.  (Yes, this bizarre thing does occur still
248 				// on some arcane platforms...)
249 				if (*p == '\r') {
250 					++p;
251 				}
252 				break;
253 
254 			case '\t':
255 				// Eat the character
256 				++p;
257 
258 				// Skip to next tab stop
259 				col = (col / tabsize + 1) * tabsize;
260 				break;
261 
262 			case TIXML_UTF_LEAD_0:
263 				if ( encoding == TIXML_ENCODING_UTF8 )
264 				{
265 					if ( *(p+1) && *(p+2) )
266 					{
267 						// In these cases, don't advance the column. These are
268 						// 0-width spaces.
269 						if ( *(pU+1)==TIXML_UTF_LEAD_1 && *(pU+2)==TIXML_UTF_LEAD_2 )
270 							p += 3;
271 						else if ( *(pU+1)==0xbfU && *(pU+2)==0xbeU )
272 							p += 3;
273 						else if ( *(pU+1)==0xbfU && *(pU+2)==0xbfU )
274 							p += 3;
275 						else
276 							{ p +=3; ++col; }	// A normal character.
277 					}
278 				}
279 				else
280 				{
281 					++p;
282 					++col;
283 				}
284 				break;
285 
286 			default:
287 				if ( encoding == TIXML_ENCODING_UTF8 )
288 				{
289 					// Eat the 1 to 4 byte utf8 character.
290 					int step = TiXmlBase::utf8ByteTable[*((const unsigned char*)p)];
291 					if ( step == 0 )
292 						step = 1;		// Error case from bad encoding, but handle gracefully.
293 					p += step;
294 
295 					// Just advance one column, of course.
296 					++col;
297 				}
298 				else
299 				{
300 					++p;
301 					++col;
302 				}
303 				break;
304 		}
305 	}
306 	cursor.row = row;
307 	cursor.col = col;
308 	assert( cursor.row >= -1 );
309 	assert( cursor.col >= -1 );
310 	stamp = p;
311 	assert( stamp );
312 }
313 
314 
SkipWhiteSpace(const char * p,TiXmlEncoding encoding)315 const char* TiXmlBase::SkipWhiteSpace( const char* p, TiXmlEncoding encoding )
316 {
317 	if ( !p || !*p )
318 	{
319 		return 0;
320 	}
321 	if ( encoding == TIXML_ENCODING_UTF8 )
322 	{
323 		while ( *p )
324 		{
325 			const unsigned char* pU = (const unsigned char*)p;
326 
327 			// Skip the stupid Microsoft UTF-8 Byte order marks
328 			if (	*(pU+0)==TIXML_UTF_LEAD_0
329 				 && *(pU+1)==TIXML_UTF_LEAD_1
330 				 && *(pU+2)==TIXML_UTF_LEAD_2 )
331 			{
332 				p += 3;
333 				continue;
334 			}
335 			else if(*(pU+0)==TIXML_UTF_LEAD_0
336 				 && *(pU+1)==0xbfU
337 				 && *(pU+2)==0xbeU )
338 			{
339 				p += 3;
340 				continue;
341 			}
342 			else if(*(pU+0)==TIXML_UTF_LEAD_0
343 				 && *(pU+1)==0xbfU
344 				 && *(pU+2)==0xbfU )
345 			{
346 				p += 3;
347 				continue;
348 			}
349 
350 			if ( IsWhiteSpace( *p ) || *p == '\n' || *p =='\r' )		// Still using old rules for white space.
351 				++p;
352 			else
353 				break;
354 		}
355 	}
356 	else
357 	{
358 		while ( (*p) && ( IsWhiteSpace( *p ) || *p == '\n' || *p =='\r' ) )
359 			++p;
360 	}
361 
362 	return p;
363 }
364 
365 #ifdef TIXML_USE_STL
StreamWhiteSpace(std::istream * in,TIXML_STRING * tag)366 /*static*/ bool TiXmlBase::StreamWhiteSpace( std::istream * in, TIXML_STRING * tag )
367 {
368 	for( ;; )
369 	{
370 		if ( !in->good() ) return false;
371 
372 		int c = in->peek();
373 		// At this scope, we can't get to a document. So fail silently.
374 		if ( !IsWhiteSpace( c ) || c <= 0 )
375 			return true;
376 
377 		*tag += (char) in->get();
378 	}
379 }
380 
StreamTo(std::istream * in,int character,TIXML_STRING * tag)381 /*static*/ bool TiXmlBase::StreamTo( std::istream * in, int character, TIXML_STRING * tag )
382 {
383 	//assert( character > 0 && character < 128 );	// else it won't work in utf-8
384 	while ( in->good() )
385 	{
386 		int c = in->peek();
387 		if ( c == character )
388 			return true;
389 		if ( c <= 0 )		// Silent failure: can't get document at this scope
390 			return false;
391 
392 		in->get();
393 		*tag += (char) c;
394 	}
395 	return false;
396 }
397 #endif
398 
399 // One of TinyXML's more performance demanding functions. Try to keep the memory overhead down. The
400 // "assign" optimization removes over 10% of the execution time.
401 //
ReadName(const char * p,TIXML_STRING * name,TiXmlEncoding encoding)402 const char* TiXmlBase::ReadName( const char* p, TIXML_STRING * name, TiXmlEncoding encoding )
403 {
404 	// Oddly, not supported on some comilers,
405 	//name->clear();
406 	// So use this:
407 	*name = "";
408 	assert( p );
409 
410 	// Names start with letters or underscores.
411 	// Of course, in unicode, tinyxml has no idea what a letter *is*. The
412 	// algorithm is generous.
413 	//
414 	// After that, they can be letters, underscores, numbers,
415 	// hyphens, or colons. (Colons are valid ony for namespaces,
416 	// but tinyxml can't tell namespaces from names.)
417 	if (	p && *p
418 		 && ( IsAlpha( (unsigned char) *p, encoding ) || *p == '_' ) )
419 	{
420 		const char* start = p;
421 		while(		p && *p
422 				&&	(		IsAlphaNum( (unsigned char ) *p, encoding )
423 						 || *p == '_'
424 						 || *p == '-'
425 						 || *p == '.'
426 						 || *p == ':' ) )
427 		{
428 			//(*name) += *p; // expensive
429 			++p;
430 		}
431 		if ( p-start > 0 ) {
432 			name->assign( start, p-start );
433 		}
434 		return p;
435 	}
436 	return 0;
437 }
438 
GetEntity(const char * p,char * value,int * length,TiXmlEncoding encoding)439 const char* TiXmlBase::GetEntity( const char* p, char* value, int* length, TiXmlEncoding encoding )
440 {
441 	// Presume an entity, and pull it out.
442 	TIXML_STRING ent;
443 	int i;
444 	*length = 0;
445 
446 	if ( *(p+1) && *(p+1) == '#' && *(p+2) )
447 	{
448 		unsigned long ucs = 0;
449 		ptrdiff_t delta = 0;
450 		unsigned mult = 1;
451 
452 		if ( *(p+2) == 'x' )
453 		{
454 			// Hexadecimal.
455 			if ( !*(p+3) ) return 0;
456 
457 			const char* q = p+3;
458 			q = strchr( q, ';' );
459 
460 			if ( !q || !*q ) return 0;
461 
462 			delta = q-p;
463 			--q;
464 
465 			while ( *q != 'x' )
466 			{
467 				if ( *q >= '0' && *q <= '9' )
468 					ucs += mult * (*q - '0');
469 				else if ( *q >= 'a' && *q <= 'f' )
470 					ucs += mult * (*q - 'a' + 10);
471 				else if ( *q >= 'A' && *q <= 'F' )
472 					ucs += mult * (*q - 'A' + 10 );
473 				else
474 					return 0;
475 				mult *= 16;
476 				--q;
477 			}
478 		}
479 		else
480 		{
481 			// Decimal.
482 			if ( !*(p+2) ) return 0;
483 
484 			const char* q = p+2;
485 			q = strchr( q, ';' );
486 
487 			if ( !q || !*q ) return 0;
488 
489 			delta = q-p;
490 			--q;
491 
492 			while ( *q != '#' )
493 			{
494 				if ( *q >= '0' && *q <= '9' )
495 					ucs += mult * (*q - '0');
496 				else
497 					return 0;
498 				mult *= 10;
499 				--q;
500 			}
501 		}
502 		if ( encoding == TIXML_ENCODING_UTF8 )
503 		{
504 			// convert the UCS to UTF-8
505 			ConvertUTF32ToUTF8( ucs, value, length );
506 		}
507 		else
508 		{
509 			*value = (char)ucs;
510 			*length = 1;
511 		}
512 		return p + delta + 1;
513 	}
514 
515 	// Now try to match it.
516 	for( i=0; i<NUM_ENTITY; ++i )
517 	{
518 		if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 )
519 		{
520 			assert( strlen( entity[i].str ) == entity[i].strLength );
521 			*value = entity[i].chr;
522 			*length = 1;
523 			return ( p + entity[i].strLength );
524 		}
525 	}
526 
527 	// So it wasn't an entity, its unrecognized, or something like that.
528 	*value = *p;	// Don't put back the last one, since we return it!
529 	//*length = 1;	// Leave unrecognized entities - this doesn't really work.
530 					// Just writes strange XML.
531 	return p+1;
532 }
533 
534 
StringEqual(const char * p,const char * tag,bool ignoreCase,TiXmlEncoding encoding)535 bool TiXmlBase::StringEqual( const char* p,
536 							 const char* tag,
537 							 bool ignoreCase,
538 							 TiXmlEncoding encoding )
539 {
540 	assert( p );
541 	assert( tag );
542 	if ( !p || !*p )
543 	{
544 		assert( 0 );
545 		return false;
546 	}
547 
548 	const char* q = p;
549 
550 	if ( ignoreCase )
551 	{
552 		while ( *q && *tag && ToLower( *q, encoding ) == ToLower( *tag, encoding ) )
553 		{
554 			++q;
555 			++tag;
556 		}
557 
558 		if ( *tag == 0 )
559 			return true;
560 	}
561 	else
562 	{
563 		while ( *q && *tag && *q == *tag )
564 		{
565 			++q;
566 			++tag;
567 		}
568 
569 		if ( *tag == 0 )		// Have we found the end of the tag, and everything equal?
570 			return true;
571 	}
572 	return false;
573 }
574 
ReadText(const char * p,TIXML_STRING * text,bool trimWhiteSpace,const char * endTag,bool caseInsensitive,TiXmlEncoding encoding)575 const char* TiXmlBase::ReadText(	const char* p,
576 									TIXML_STRING * text,
577 									bool trimWhiteSpace,
578 									const char* endTag,
579 									bool caseInsensitive,
580 									TiXmlEncoding encoding )
581 {
582 	*text = "";
583 	if (	!trimWhiteSpace			// certain tags always keep whitespace
584 		 || !condenseWhiteSpace )	// if true, whitespace is always kept
585 	{
586 		// Keep all the white space.
587 		while (	   p && *p
588 				&& !StringEqual( p, endTag, caseInsensitive, encoding )
589 			  )
590 		{
591 			int len;
592 			char cArr[4] = { 0, 0, 0, 0 };
593 			p = GetChar( p, cArr, &len, encoding );
594 			text->append( cArr, len );
595 		}
596 	}
597 	else
598 	{
599 		bool whitespace = false;
600 
601 		// Remove leading white space:
602 		p = SkipWhiteSpace( p, encoding );
603 		while (	   p && *p
604 				&& !StringEqual( p, endTag, caseInsensitive, encoding ) )
605 		{
606 			if ( *p == '\r' || *p == '\n' )
607 			{
608 				whitespace = true;
609 				++p;
610 			}
611 			else if ( IsWhiteSpace( *p ) )
612 			{
613 				whitespace = true;
614 				++p;
615 			}
616 			else
617 			{
618 				// If we've found whitespace, add it before the
619 				// new character. Any whitespace just becomes a space.
620 				if ( whitespace )
621 				{
622 					(*text) += ' ';
623 					whitespace = false;
624 				}
625 				int len;
626 				char cArr[4] = { 0, 0, 0, 0 };
627 				p = GetChar( p, cArr, &len, encoding );
628 				if ( len == 1 )
629 					(*text) += cArr[0];	// more efficient
630 				else
631 					text->append( cArr, len );
632 			}
633 		}
634 	}
635 	if ( p )
636 		p += strlen( endTag );
637 	return p;
638 }
639 
640 #ifdef TIXML_USE_STL
641 
// Streams the document's leading nodes from `in`, appending the raw text
// read to `tag`. Reading stops after the first element has been streamed;
// nodes are created only to drive the streaming and are deleted again.
// Sets an error on the document when no '<' is found, when a non-positive
// character is read, when a node cannot be identified, or when the stream
// runs out before an element has been streamed.
void TiXmlDocument::StreamIn( std::istream * in, TIXML_STRING * tag )
{
	// The basic issue with a document is that we don't know what we're
	// streaming. Read something presumed to be a tag (and hope), then
	// identify it, and call the appropriate stream method on the tag.
	//
	// This "pre-streaming" will never read the closing ">" so the
	// sub-tag can orient itself.

	if ( !StreamTo( in, '<', tag ) )
	{
		SetError( TIXML_ERROR_PARSING_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
		return;
	}

	while ( in->good() )
	{
		// Remember where this node's text begins inside the accumulated tag.
		int tagIndex = (int) tag->length();
		while ( in->good() && in->peek() != '>' )
		{
			int c = in->get();
			if ( c <= 0 )
			{
				SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
				break;
			}
			(*tag) += (char) c;
		}

		if ( in->good() )
		{
			// We now have something we presume to be a node of
			// some sort. Identify it, and call the node to
			// continue streaming.
			TiXmlNode* node = Identify( tag->c_str() + tagIndex, TIXML_DEFAULT_ENCODING );

			if ( node )
			{
				node->StreamIn( in, tag );
				// Ask before deleting: the answer is needed afterwards.
				bool isElement = node->ToElement() != 0;
				delete node;
				node = 0;

				// If this is the root element, we're done. Parsing will be
				// done by the >> operator.
				if ( isElement )
				{
					return;
				}
			}
			else
			{
				SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
				return;
			}
		}
	}
	// We should have returned sooner.
	SetError( TIXML_ERROR, 0, 0, TIXML_ENCODING_UNKNOWN );
}
702 
703 #endif
704 
Parse(const char * p,TiXmlParsingData * prevData,TiXmlEncoding encoding)705 const char* TiXmlDocument::Parse( const char* p, TiXmlParsingData* prevData, TiXmlEncoding encoding )
706 {
707 	ClearError();
708 
709 	// Parse away, at the document level. Since a document
710 	// contains nothing but other tags, most of what happens
711 	// here is skipping white space.
712 	if ( !p || !*p )
713 	{
714 		SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
715 		return 0;
716 	}
717 
718 	// Note that, for a document, this needs to come
719 	// before the while space skip, so that parsing
720 	// starts from the pointer we are given.
721 	location.Clear();
722 	if ( prevData )
723 	{
724 		location.row = prevData->cursor.row;
725 		location.col = prevData->cursor.col;
726 	}
727 	else
728 	{
729 		location.row = 0;
730 		location.col = 0;
731 	}
732 	TiXmlParsingData data( p, TabSize(), location.row, location.col );
733 	location = data.Cursor();
734 
735 	if ( encoding == TIXML_ENCODING_UNKNOWN )
736 	{
737 		// Check for the Microsoft UTF-8 lead bytes.
738 		const unsigned char* pU = (const unsigned char*)p;
739 		if (	*(pU+0) && *(pU+0) == TIXML_UTF_LEAD_0
740 			 && *(pU+1) && *(pU+1) == TIXML_UTF_LEAD_1
741 			 && *(pU+2) && *(pU+2) == TIXML_UTF_LEAD_2 )
742 		{
743 			encoding = TIXML_ENCODING_UTF8;
744 			useMicrosoftBOM = true;
745 		}
746 	}
747 
748 	p = SkipWhiteSpace( p, encoding );
749 	if ( !p )
750 	{
751 		SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, TIXML_ENCODING_UNKNOWN );
752 		return 0;
753 	}
754 
755 	while ( p && *p )
756 	{
757 		TiXmlNode* node = Identify( p, encoding );
758 		if ( node )
759 		{
760 			p = node->Parse( p, &data, encoding );
761 			LinkEndChild( node );
762 		}
763 		else
764 		{
765 			break;
766 		}
767 
768 		// Did we get encoding info?
769 		if (	encoding == TIXML_ENCODING_UNKNOWN
770 			 && node->ToDeclaration() )
771 		{
772 			TiXmlDeclaration* dec = node->ToDeclaration();
773 			const char* enc = dec->Encoding();
774 			assert( enc );
775 
776 			if ( *enc == 0 )
777 				encoding = TIXML_ENCODING_UTF8;
778 			else if ( StringEqual( enc, "UTF-8", true, TIXML_ENCODING_UNKNOWN ) )
779 				encoding = TIXML_ENCODING_UTF8;
780 			else if ( StringEqual( enc, "UTF8", true, TIXML_ENCODING_UNKNOWN ) )
781 				encoding = TIXML_ENCODING_UTF8;	// incorrect, but be nice
782 			else
783 				encoding = TIXML_ENCODING_LEGACY;
784 		}
785 
786 		p = SkipWhiteSpace( p, encoding );
787 	}
788 
789 	// Was this empty?
790 	if ( !firstChild ) {
791 		SetError( TIXML_ERROR_DOCUMENT_EMPTY, 0, 0, encoding );
792 		return 0;
793 	}
794 
795 	// All is well.
796 	return p;
797 }
798 
SetError(int err,const char * pError,TiXmlParsingData * data,TiXmlEncoding encoding)799 void TiXmlDocument::SetError( int err, const char* pError, TiXmlParsingData* data, TiXmlEncoding encoding )
800 {
801 	// The first error in a chain is more accurate - don't set again!
802 	if ( error )
803 		return;
804 
805 	assert( err > 0 && err < TIXML_ERROR_STRING_COUNT );
806 	error   = true;
807 	errorId = err;
808 	errorDesc = errorString[ errorId ];
809 
810 	errorLocation.Clear();
811 	if ( pError && data )
812 	{
813 		data->Stamp( pError, encoding );
814 		errorLocation = data->Cursor();
815 	}
816 }
817 
818 
Identify(const char * p,TiXmlEncoding encoding)819 TiXmlNode* TiXmlNode::Identify( const char* p, TiXmlEncoding encoding )
820 {
821 	TiXmlNode* returnNode = 0;
822 
823 	p = SkipWhiteSpace( p, encoding );
824 	if( !p || !*p || *p != '<' )
825 	{
826 		return 0;
827 	}
828 
829 	TiXmlDocument* doc = GetDocument();
830 	p = SkipWhiteSpace( p, encoding );
831 
832 	if ( !p || !*p )
833 	{
834 		return 0;
835 	}
836 
837 	// What is this thing?
838 	// - Elements start with a letter or underscore, but xml is reserved.
839 	// - Comments: <!--
840 	// - Decleration: <?xml
841 	// - Everthing else is unknown to tinyxml.
842 	//
843 
844 	const char* xmlHeader = { "<?xml" };
845 	const char* commentHeader = { "<!--" };
846 	const char* dtdHeader = { "<!" };
847 	const char* cdataHeader = { "<![CDATA[" };
848 
849 	if ( StringEqual( p, xmlHeader, true, encoding ) )
850 	{
851 		#ifdef DEBUG_PARSER
852 			TIXML_LOG( "XML parsing Declaration\n" );
853 		#endif
854 		returnNode = new (std::nothrow) TiXmlDeclaration();
855 	}
856 	else if ( StringEqual( p, commentHeader, false, encoding ) )
857 	{
858 		#ifdef DEBUG_PARSER
859 			TIXML_LOG( "XML parsing Comment\n" );
860 		#endif
861 		returnNode = new (std::nothrow) TiXmlComment();
862 	}
863 	else if ( StringEqual( p, cdataHeader, false, encoding ) )
864 	{
865 		#ifdef DEBUG_PARSER
866 			TIXML_LOG( "XML parsing CDATA\n" );
867 		#endif
868 		TiXmlText* text = new (std::nothrow) TiXmlText( "" );
869 		text->SetCDATA( true );
870 		returnNode = text;
871 	}
872 	else if ( StringEqual( p, dtdHeader, false, encoding ) )
873 	{
874 		#ifdef DEBUG_PARSER
875 			TIXML_LOG( "XML parsing Unknown(1)\n" );
876 		#endif
877 		returnNode = new (std::nothrow) TiXmlUnknown();
878 	}
879 	else if (	IsAlpha( *(p+1), encoding )
880 			  || *(p+1) == '_' )
881 	{
882 		#ifdef DEBUG_PARSER
883 			TIXML_LOG( "XML parsing Element\n" );
884 		#endif
885 		returnNode = new (std::nothrow) TiXmlElement( "" );
886 	}
887 	else
888 	{
889 		#ifdef DEBUG_PARSER
890 			TIXML_LOG( "XML parsing Unknown(2)\n" );
891 		#endif
892 		returnNode = new (std::nothrow) TiXmlUnknown();
893 	}
894 
895 	if ( returnNode )
896 	{
897 		// Set the parent, so it can report errors
898 		returnNode->parent = this;
899 	}
900 	else
901 	{
902 		if ( doc )
903 			doc->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, TIXML_ENCODING_UNKNOWN );
904 	}
905 	return returnNode;
906 }
907 
908 #ifdef TIXML_USE_STL
909 
// Streams the rest of this element from `in`, appending the raw text read to
// `tag`. On entry `tag` already holds the start of the element. Reading goes
// on through the element's content -- text and child nodes -- up to and
// including the '>' of a closing tag, or stops after the first '>' when the
// element is of the "/>" form. Child nodes are created only to drive the
// streaming and are deleted again. A non-positive character sets
// TIXML_ERROR_EMBEDDED_NULL on the document, if there is one.
void TiXmlElement::StreamIn (std::istream * in, TIXML_STRING * tag)
{
	// We're called with some amount of pre-parsing. That is, some of "this"
	// element is in "tag". Go ahead and stream to the closing ">"
	while( in->good() )
	{
		int c = in->get();
		if ( c <= 0 )
		{
			TiXmlDocument* document = GetDocument();
			if ( document )
				document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
			return;
		}
		(*tag) += (char) c ;

		if ( c == '>' )
			break;
	}

	// Too short to look at the last two characters below.
	if ( tag->length() < 3 ) return;

	// Okay...if we are a "/>" tag, then we're done. We've read a complete tag.
	// If not, identify and stream.

	if (	tag->at( tag->length() - 1 ) == '>'
		 && tag->at( tag->length() - 2 ) == '/' )
	{
		// All good!
		return;
	}
	else if ( tag->at( tag->length() - 1 ) == '>' )
	{
		// There is more. Could be:
		//		text
		//		cdata text (which looks like another node)
		//		closing tag
		//		another node.
		for ( ;; )
		{
			StreamWhiteSpace( in, tag );

			// Do we have text?
			if ( in->good() && in->peek() != '<' )
			{
				// Yep, text.
				TiXmlText text( "" );
				text.StreamIn( in, tag );

				// What follows text is a closing tag or another node.
				// Go around again and figure it out.
				continue;
			}

			// We now have either a closing tag...or another node.
			// We should be at a "<", regardless.
			if ( !in->good() ) return;
			assert( in->peek() == '<' );
			// Remember where this tag's text begins inside the accumulated tag.
			int tagIndex = (int) tag->length();

			bool closingTag = false;
			bool firstCharFound = false;

			// Read up to, but not including, the '>' that ends this tag.
			for( ;; )
			{
				if ( !in->good() )
					return;

				int c = in->peek();
				if ( c <= 0 )
				{
					TiXmlDocument* document = GetDocument();
					if ( document )
						document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
					return;
				}

				if ( c == '>' )
					break;

				*tag += (char) c;
				in->get();

				// Early out if we find the CDATA id.
				if ( c == '[' && tag->size() >= 9 )
				{
					size_t len = tag->size();
					const char* start = tag->c_str() + len - 9;
					if ( strcmp( start, "<![CDATA[" ) == 0 ) {
						assert( !closingTag );
						break;
					}
				}

				// The first character after the '<' that is not white space
				// decides whether this is a closing tag.
				if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) )
				{
					firstCharFound = true;
					if ( c == '/' )
						closingTag = true;
				}
			}
			// If it was a closing tag, then read in the closing '>' to clean up the input stream.
			// If it was not, the streaming will be done by the tag.
			if ( closingTag )
			{
				if ( !in->good() )
					return;

				int c = in->get();
				if ( c <= 0 )
				{
					TiXmlDocument* document = GetDocument();
					if ( document )
						document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
					return;
				}
				assert( c == '>' );
				*tag += (char) c;

				// We are done, once we've found our closing tag.
				return;
			}
			else
			{
				// If not a closing tag, id it, and stream.
				const char* tagloc = tag->c_str() + tagIndex;
				TiXmlNode* node = Identify( tagloc, TIXML_DEFAULT_ENCODING );
				if ( !node )
					return;
				node->StreamIn( in, tag );
				delete node;
				node = 0;

				// No return: go around from the beginning: text, closing tag, or node.
			}
		}
	}
}
1048 #endif
1049 
Parse(const char * p,TiXmlParsingData * data,TiXmlEncoding encoding)1050 const char* TiXmlElement::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1051 {
1052 	p = SkipWhiteSpace( p, encoding );
1053 	TiXmlDocument* document = GetDocument();
1054 
1055 	if ( !p || !*p )
1056 	{
1057 		if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, 0, 0, encoding );
1058 		return 0;
1059 	}
1060 
1061 	if ( data )
1062 	{
1063 		data->Stamp( p, encoding );
1064 		location = data->Cursor();
1065 	}
1066 
1067 	if ( *p != '<' )
1068 	{
1069 		if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, p, data, encoding );
1070 		return 0;
1071 	}
1072 
1073 	p = SkipWhiteSpace( p+1, encoding );
1074 
1075 	// Read the name.
1076 	const char* pErr = p;
1077 
1078 	p = ReadName( p, &value, encoding );
1079 	if ( !p || !*p )
1080 	{
1081 		if ( document )	document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME, pErr, data, encoding );
1082 		return 0;
1083 	}
1084 
1085 	TIXML_STRING endTag ("</");
1086 	endTag += value;
1087 	endTag += ">";
1088 
1089 	// Check for and read attributes. Also look for an empty
1090 	// tag or an end tag.
1091 	while ( p && *p )
1092 	{
1093 		pErr = p;
1094 		p = SkipWhiteSpace( p, encoding );
1095 		if ( !p || !*p )
1096 		{
1097 			if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
1098 			return 0;
1099 		}
1100 		if ( *p == '/' )
1101 		{
1102 			++p;
1103 			// Empty tag.
1104 			if ( *p  != '>' )
1105 			{
1106 				if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY, p, data, encoding );
1107 				return 0;
1108 			}
1109 			return (p+1);
1110 		}
1111 		else if ( *p == '>' )
1112 		{
1113 			// Done with attributes (if there were any.)
1114 			// Read the value -- which can include other
1115 			// elements -- read the end tag, and return.
1116 			++p;
1117 			p = ReadValue( p, data, encoding );		// Note this is an Element method, and will set the error if one happens.
1118 			if ( !p || !*p ) {
1119 				// We were looking for the end tag, but found nothing.
1120 				// Fix for [ 1663758 ] Failure to report error on bad XML
1121 				if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
1122 				return 0;
1123 			}
1124 
1125 			// We should find the end tag now
1126 			if ( StringEqual( p, endTag.c_str(), false, encoding ) )
1127 			{
1128 				p += endTag.length();
1129 				return p;
1130 			}
1131 			else
1132 			{
1133 				if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG, p, data, encoding );
1134 				return 0;
1135 			}
1136 		}
1137 		else
1138 		{
1139 			// Try to read an attribute:
1140 			TiXmlAttribute* attrib = new (std::nothrow) TiXmlAttribute();
1141 			if ( !attrib )
1142 			{
1143 				if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY, pErr, data, encoding );
1144 				return 0;
1145 			}
1146 
1147 			attrib->SetDocument( document );
1148 			pErr = p;
1149 			p = attrib->Parse( p, data, encoding );
1150 
1151 			if ( !p || !*p )
1152 			{
1153 				if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT, pErr, data, encoding );
1154 				delete attrib;
1155 				return 0;
1156 			}
1157 
1158 			// Handle the strange case of double attributes:
1159 			#ifdef TIXML_USE_STL
1160 			TiXmlAttribute* node = attributeSet.Find( attrib->NameTStr() );
1161 			#else
1162 			TiXmlAttribute* node = attributeSet.Find( attrib->Name() );
1163 			#endif
1164 			if ( node )
1165 			{
1166 				node->SetValue( attrib->Value() );
1167 				delete attrib;
1168 				return 0;
1169 			}
1170 
1171 			attributeSet.Add( attrib );
1172 		}
1173 	}
1174 	return p;
1175 }
1176 
1177 
ReadValue(const char * p,TiXmlParsingData * data,TiXmlEncoding encoding)1178 const char* TiXmlElement::ReadValue( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1179 {
1180 	TiXmlDocument* document = GetDocument();
1181 
1182 	// Read in text and elements in any order.
1183 	const char* pWithWhiteSpace = p;
1184 	p = SkipWhiteSpace( p, encoding );
1185 
1186 	while ( p && *p )
1187 	{
1188 		if ( *p != '<' )
1189 		{
1190 			// Take what we have, make a text element.
1191 			TiXmlText* textNode = new (std::nothrow) TiXmlText( "" );
1192 
1193 			if ( !textNode )
1194 			{
1195 				if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY, 0, 0, encoding );
1196 					return 0;
1197 			}
1198 
1199 			if ( TiXmlBase::IsWhiteSpaceCondensed() )
1200 			{
1201 				p = textNode->Parse( p, data, encoding );
1202 			}
1203 			else
1204 			{
1205 				// Special case: we want to keep the white space
1206 				// so that leading spaces aren't removed.
1207 				p = textNode->Parse( pWithWhiteSpace, data, encoding );
1208 			}
1209 
1210 			if ( !textNode->Blank() )
1211 				LinkEndChild( textNode );
1212 			else
1213 				delete textNode;
1214 		}
1215 		else
1216 		{
1217 			// We hit a '<'
1218 			// Have we hit a new element or an end tag? This could also be
1219 			// a TiXmlText in the "CDATA" style.
1220 			if ( StringEqual( p, "</", false, encoding ) )
1221 			{
1222 				return p;
1223 			}
1224 			else
1225 			{
1226 				TiXmlNode* node = Identify( p, encoding );
1227 				if ( node )
1228 				{
1229 					p = node->Parse( p, data, encoding );
1230 					LinkEndChild( node );
1231 				}
1232 				else
1233 				{
1234 					return 0;
1235 				}
1236 			}
1237 		}
1238 		pWithWhiteSpace = p;
1239 		p = SkipWhiteSpace( p, encoding );
1240 	}
1241 
1242 	if ( !p )
1243 	{
1244 		if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE, 0, 0, encoding );
1245 	}
1246 	return p;
1247 }
1248 
1249 
1250 #ifdef TIXML_USE_STL
StreamIn(std::istream * in,TIXML_STRING * tag)1251 void TiXmlUnknown::StreamIn( std::istream * in, TIXML_STRING * tag )
1252 {
1253 	while ( in->good() )
1254 	{
1255 		int c = in->get();
1256 		if ( c <= 0 )
1257 		{
1258 			TiXmlDocument* document = GetDocument();
1259 			if ( document )
1260 				document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1261 			return;
1262 		}
1263 		(*tag) += (char) c;
1264 
1265 		if ( c == '>' )
1266 		{
1267 			// All is well.
1268 			return;
1269 		}
1270 	}
1271 }
1272 #endif
1273 
1274 
Parse(const char * p,TiXmlParsingData * data,TiXmlEncoding encoding)1275 const char* TiXmlUnknown::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1276 {
1277 	TiXmlDocument* document = GetDocument();
1278 	p = SkipWhiteSpace( p, encoding );
1279 
1280 	if ( data )
1281 	{
1282 		data->Stamp( p, encoding );
1283 		location = data->Cursor();
1284 	}
1285 	if ( !p || !*p || *p != '<' )
1286 	{
1287 		if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN, p, data, encoding );
1288 		return 0;
1289 	}
1290 	++p;
1291 	value = "";
1292 
1293 	while ( p && *p && *p != '>' )
1294 	{
1295 		value += *p;
1296 		++p;
1297 	}
1298 
1299 	if ( !p )
1300 	{
1301 		if ( document )	document->SetError( TIXML_ERROR_PARSING_UNKNOWN, 0, 0, encoding );
1302 	}
1303 	if ( *p == '>' )
1304 		return p+1;
1305 	return p;
1306 }
1307 
1308 #ifdef TIXML_USE_STL
StreamIn(std::istream * in,TIXML_STRING * tag)1309 void TiXmlComment::StreamIn( std::istream * in, TIXML_STRING * tag )
1310 {
1311 	while ( in->good() )
1312 	{
1313 		int c = in->get();
1314 		if ( c <= 0 )
1315 		{
1316 			TiXmlDocument* document = GetDocument();
1317 			if ( document )
1318 				document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1319 			return;
1320 		}
1321 
1322 		(*tag) += (char) c;
1323 
1324 		if ( c == '>'
1325 			 && tag->at( tag->length() - 2 ) == '-'
1326 			 && tag->at( tag->length() - 3 ) == '-' )
1327 		{
1328 			// All is well.
1329 			return;
1330 		}
1331 	}
1332 }
1333 #endif
1334 
1335 
Parse(const char * p,TiXmlParsingData * data,TiXmlEncoding encoding)1336 const char* TiXmlComment::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1337 {
1338 	TiXmlDocument* document = GetDocument();
1339 	value = "";
1340 
1341 	p = SkipWhiteSpace( p, encoding );
1342 
1343 	if ( data )
1344 	{
1345 		data->Stamp( p, encoding );
1346 		location = data->Cursor();
1347 	}
1348 	const char* startTag = "<!--";
1349 	const char* endTag   = "-->";
1350 
1351 	if ( !StringEqual( p, startTag, false, encoding ) )
1352 	{
1353 		document->SetError( TIXML_ERROR_PARSING_COMMENT, p, data, encoding );
1354 		return 0;
1355 	}
1356 	p += strlen( startTag );
1357 
1358 	// [ 1475201 ] TinyXML parses entities in comments
1359 	// Oops - ReadText doesn't work, because we don't want to parse the entities.
1360 	// p = ReadText( p, &value, false, endTag, false, encoding );
1361 	//
1362 	// from the XML spec:
1363 	/*
1364 	 [Definition: Comments may appear anywhere in a document outside other markup; in addition,
1365 				  they may appear within the document type declaration at places allowed by the grammar.
1366 				  They are not part of the document's character data; an XML processor MAY, but need not,
1367 				  make it possible for an application to retrieve the text of comments. For compatibility,
1368 				  the string "--" (double-hyphen) MUST NOT occur within comments.] Parameter entity
1369 				  references MUST NOT be recognized within comments.
1370 
1371 				  An example of a comment:
1372 
1373 				  <!-- declarations for <head> & <body> -->
1374 	*/
1375 
1376 	value = "";
1377 	// Keep all the white space.
1378 	while (	p && *p && !StringEqual( p, endTag, false, encoding ) )
1379 	{
1380 		value.append( p, 1 );
1381 		++p;
1382 	}
1383 	if ( p )
1384 		p += strlen( endTag );
1385 
1386 	return p;
1387 }
1388 
1389 
Parse(const char * p,TiXmlParsingData * data,TiXmlEncoding encoding)1390 const char* TiXmlAttribute::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1391 {
1392 	p = SkipWhiteSpace( p, encoding );
1393 	if ( !p || !*p ) return 0;
1394 
1395 //	int tabsize = 4;
1396 //	if ( document )
1397 //		tabsize = document->TabSize();
1398 
1399 	if ( data )
1400 	{
1401 		data->Stamp( p, encoding );
1402 		location = data->Cursor();
1403 	}
1404 	// Read the name, the '=' and the value.
1405 	const char* pErr = p;
1406 	p = ReadName( p, &name, encoding );
1407 	if ( !p || !*p )
1408 	{
1409 		if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, pErr, data, encoding );
1410 		return 0;
1411 	}
1412 	p = SkipWhiteSpace( p, encoding );
1413 	if ( !p || !*p || *p != '=' )
1414 	{
1415 		if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1416 		return 0;
1417 	}
1418 
1419 	++p;	// skip '='
1420 	p = SkipWhiteSpace( p, encoding );
1421 	if ( !p || !*p )
1422 	{
1423 		if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1424 		return 0;
1425 	}
1426 
1427 	const char* end;
1428 	const char SINGLE_QUOTE = '\'';
1429 	const char DOUBLE_QUOTE = '\"';
1430 
1431 	if ( *p == SINGLE_QUOTE )
1432 	{
1433 		++p;
1434 		end = "\'";		// single quote in string
1435 		p = ReadText( p, &value, false, end, false, encoding );
1436 	}
1437 	else if ( *p == DOUBLE_QUOTE )
1438 	{
1439 		++p;
1440 		end = "\"";		// double quote in string
1441 		p = ReadText( p, &value, false, end, false, encoding );
1442 	}
1443 	else
1444 	{
1445 		// All attribute values should be in single or double quotes.
1446 		// But this is such a common error that the parser will try
1447 		// its best, even without them.
1448 		value = "";
1449 		while (	p && *p											// existence
1450 				&& !IsWhiteSpace( *p ) && *p != '\n' && *p != '\r'	// whitespace
1451 				&& *p != '/' && *p != '>' )							// tag end
1452 		{
1453 			if ( *p == SINGLE_QUOTE || *p == DOUBLE_QUOTE ) {
1454 				// [ 1451649 ] Attribute values with trailing quotes not handled correctly
1455 				// We did not have an opening quote but seem to have a
1456 				// closing one. Give up and throw an error.
1457 				if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES, p, data, encoding );
1458 				return 0;
1459 			}
1460 			value += *p;
1461 			++p;
1462 		}
1463 	}
1464 	return p;
1465 }
1466 
1467 #ifdef TIXML_USE_STL
StreamIn(std::istream * in,TIXML_STRING * tag)1468 void TiXmlText::StreamIn( std::istream * in, TIXML_STRING * tag )
1469 {
1470 	while ( in->good() )
1471 	{
1472 		int c = in->peek();
1473 		if ( !cdata && (c == '<' ) )
1474 		{
1475 			return;
1476 		}
1477 		if ( c <= 0 )
1478 		{
1479 			TiXmlDocument* document = GetDocument();
1480 			if ( document )
1481 				document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1482 			return;
1483 		}
1484 
1485 		(*tag) += (char) c;
1486 		in->get();	// "commits" the peek made above
1487 
1488 		if ( cdata && c == '>' && tag->size() >= 3 ) {
1489 			size_t len = tag->size();
1490 			if ( (*tag)[len-2] == ']' && (*tag)[len-3] == ']' ) {
1491 				// terminator of cdata.
1492 				return;
1493 			}
1494 		}
1495 	}
1496 }
1497 #endif
1498 
Parse(const char * p,TiXmlParsingData * data,TiXmlEncoding encoding)1499 const char* TiXmlText::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding encoding )
1500 {
1501 	value = "";
1502 	TiXmlDocument* document = GetDocument();
1503 
1504 	if ( data )
1505 	{
1506 		data->Stamp( p, encoding );
1507 		location = data->Cursor();
1508 	}
1509 
1510 	const char* const startTag = "<![CDATA[";
1511 	const char* const endTag   = "]]>";
1512 
1513 	if ( cdata || StringEqual( p, startTag, false, encoding ) )
1514 	{
1515 		cdata = true;
1516 
1517 		if ( !StringEqual( p, startTag, false, encoding ) )
1518 		{
1519 			document->SetError( TIXML_ERROR_PARSING_CDATA, p, data, encoding );
1520 			return 0;
1521 		}
1522 		p += strlen( startTag );
1523 
1524 		// Keep all the white space, ignore the encoding, etc.
1525 		while (	   p && *p
1526 				&& !StringEqual( p, endTag, false, encoding )
1527 			  )
1528 		{
1529 			value += *p;
1530 			++p;
1531 		}
1532 
1533 		TIXML_STRING dummy;
1534 		p = ReadText( p, &dummy, false, endTag, false, encoding );
1535 		return p;
1536 	}
1537 	else
1538 	{
1539 		bool ignoreWhite = true;
1540 
1541 		const char* end = "<";
1542 		p = ReadText( p, &value, ignoreWhite, end, false, encoding );
1543 		if ( p )
1544 			return p-1;	// don't truncate the '<'
1545 		return 0;
1546 	}
1547 }
1548 
1549 #ifdef TIXML_USE_STL
StreamIn(std::istream * in,TIXML_STRING * tag)1550 void TiXmlDeclaration::StreamIn( std::istream * in, TIXML_STRING * tag )
1551 {
1552 	while ( in->good() )
1553 	{
1554 		int c = in->get();
1555 		if ( c <= 0 )
1556 		{
1557 			TiXmlDocument* document = GetDocument();
1558 			if ( document )
1559 				document->SetError( TIXML_ERROR_EMBEDDED_NULL, 0, 0, TIXML_ENCODING_UNKNOWN );
1560 			return;
1561 		}
1562 		(*tag) += (char) c;
1563 
1564 		if ( c == '>' )
1565 		{
1566 			// All is well.
1567 			return;
1568 		}
1569 	}
1570 }
1571 #endif
1572 
Parse(const char * p,TiXmlParsingData * data,TiXmlEncoding _encoding)1573 const char* TiXmlDeclaration::Parse( const char* p, TiXmlParsingData* data, TiXmlEncoding _encoding )
1574 {
1575 	p = SkipWhiteSpace( p, _encoding );
1576 	// Find the beginning, find the end, and look for
1577 	// the stuff in-between.
1578 	TiXmlDocument* document = GetDocument();
1579 	if ( !p || !*p || !StringEqual( p, "<?xml", true, _encoding ) )
1580 	{
1581 		if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION, 0, 0, _encoding );
1582 		return 0;
1583 	}
1584 	if ( data )
1585 	{
1586 		data->Stamp( p, _encoding );
1587 		location = data->Cursor();
1588 	}
1589 	p += 5;
1590 
1591 	version = "";
1592 	encoding = "";
1593 	standalone = "";
1594 
1595 	while ( p && *p )
1596 	{
1597 		if ( *p == '>' )
1598 		{
1599 			++p;
1600 			return p;
1601 		}
1602 
1603 		p = SkipWhiteSpace( p, _encoding );
1604 		if ( StringEqual( p, "version", true, _encoding ) )
1605 		{
1606 			TiXmlAttribute attrib;
1607 			p = attrib.Parse( p, data, _encoding );
1608 			version = attrib.Value();
1609 		}
1610 		else if ( StringEqual( p, "encoding", true, _encoding ) )
1611 		{
1612 			TiXmlAttribute attrib;
1613 			p = attrib.Parse( p, data, _encoding );
1614 			encoding = attrib.Value();
1615 		}
1616 		else if ( StringEqual( p, "standalone", true, _encoding ) )
1617 		{
1618 			TiXmlAttribute attrib;
1619 			p = attrib.Parse( p, data, _encoding );
1620 			standalone = attrib.Value();
1621 		}
1622 		else
1623 		{
1624 			// Read over whatever it is.
1625 			while( p && *p && *p != '>' && !IsWhiteSpace( *p ) )
1626 				++p;
1627 		}
1628 	}
1629 	return 0;
1630 }
1631 
Blank() const1632 bool TiXmlText::Blank() const
1633 {
1634 	for ( unsigned i=0; i<value.length(); i++ )
1635 		if ( !IsWhiteSpace( value[i] ) )
1636 			return false;
1637 	return true;
1638 }
1639 
1640