1 // Copyright (C) 2003  Davis E. King (davis@dlib.net)
2 // License: Boost Software License   See LICENSE.txt for the full license.
3 #ifndef DLIB_XML_PARSER_KERNEl_1_
4 #define DLIB_XML_PARSER_KERNEl_1_
5 
6 
7 #include "xml_parser_kernel_abstract.h"
8 
9 #include <sstream>
10 #include <string>
11 #include <fstream>
12 #include <iostream>
13 #include "xml_parser_kernel_interfaces.h"
14 #include "../algs.h"
15 #include <cstdio>
16 #include "../map.h"
17 #include "../stack.h"
18 #include "../sequence.h"
19 #include "../memory_manager.h"
20 
21 namespace dlib
22 {
23 
24     class xml_parser
25     {
26         typedef dlib::map<std::string,std::string,memory_manager<char>::kernel_2a>::kernel_1b map;
27         typedef dlib::stack<std::string,memory_manager<char>::kernel_2a>::kernel_1a stack;
28         typedef sequence<document_handler*>::kernel_2a seq_dh;
29         typedef sequence<error_handler*>::kernel_2a seq_eh;
30 
31          /*!
32             INITIAL VALUE
33                 dh_list.size() == 0
34                 eh_list.size() == 0
35 
36             CONVENTION
37                 dh_list == a sequence of pointers to all the document_handlers that
38                            have been added to the xml_parser
39                 eh_list == a sequence of pointers to all the error_handlers that
40                            have been added to the xml_parser
41 
42                 map is used to implement the attribute_list interface
43                 stack is used just inside the parse function
44                 seq_dh is used to make the dh_list member variable
45                 seq_eh is used to make the eh_list member variable
46         !*/
47 
48 
49 
50         public:
51 
52             // These typedefs are here for backwards compatibly with previous versions of
53             // dlib.
54             typedef xml_parser kernel_1a;
55             typedef xml_parser kernel_1a_c;
56 
xml_parser()57             xml_parser(
58             ) {}
59 
~xml_parser()60             virtual ~xml_parser(
61             ){}
62 
63             inline void clear(
64             );
65 
66             inline void parse (
67                 std::istream& in
68             );
69 
70             inline void add_document_handler (
71                 document_handler& item
72             );
73 
74             inline void add_error_handler (
75                 error_handler& item
76             );
77 
78 
79             inline void swap (
80                 xml_parser& item
81             );
82 
83 
84         private:
85 
86             // -----------------------------------
87 
88             // attribute_list interface implementation
89             class attrib_list : public attribute_list
90             {
91             public:
92                 // the list of attribute name/value pairs
93                 map list;
94 
is_in_list(const std::string & key)95                 bool is_in_list (
96                     const std::string& key
97                 ) const
98                 {
99                     return list.is_in_domain(key);
100                 }
101 
102                 const std::string& operator[] (
103                     const std::string& key
104                 ) const
105                 {
106                     if (is_in_list(key))
107                         return list[key];
108                     else
109                         throw xml_attribute_list_error("No XML attribute named " + key + " is present in tag.");
110                 }
111 
at_start()112                 bool at_start (
113                 ) const { return list.at_start(); }
114 
reset()115                 void reset (
116                 ) const { return list.reset(); }
117 
current_element_valid()118                 bool current_element_valid (
119                 ) const { return list.current_element_valid(); }
120 
element()121                 const type& element (
122                 ) const { return list.element(); }
123 
element()124                 type& element (
125                 ) { return list.element(); }
126 
move_next()127                 bool move_next (
128                 ) const { return list.move_next(); }
129 
size()130                 size_t size (
131                 ) const { return list.size(); }
132             };
133 
134 
135             // -----------------------------------
136 
137             enum token_type
138             {
139                 element_start, // the first tag of an element
140                 element_end,   // the last tag of an element
141                 empty_element, // the singular tag of an empty element
142                 pi,            // processing instruction
143                 chars,         // the non-markup data between tags
144                 chars_cdata,   // the data from a CDATA section
145                 eof,           // this token is returned when we reach the end of input
146                 error,         // this token indicates that the tokenizer couldn't
147                                // determine which category the next token fits into
148                 dtd,           // this token is for an entire dtd
149                 comment        // this is a token for comments
150             };
151             /*
152                 notes about the tokens:
153                     the tokenizer guarantees that the following tokens to not
154                     contain the '<' character except as the first character of the token
155                     element_start, element_end, empty_element, and pi.  they also only
156                     contain the '>' characer as their last character.
157 
158                     it is also guaranteed that pi is at least of the form <??>.  that
159                     is to say that it always always begins with <? and ends with ?>.
160 
161                     it is also guaranteed that all markup tokens will begin with the '<'
162                     character and end with the '>'. there won't be any leading or
163                     trailing whitespaces.   this whitespace is considered a chars token.
164             */
165 
166 
167             // private member functions
168             inline void get_next_token(
169                 std::istream& in,
170                 std::string& token_text,
171                 int& token_kind,
172                 unsigned long& line_number
173             );
174             /*!
175                 ensures
176                     gets the next token from in and puts it in token_text and
177                     token_kind == the kind of the token found and
178                     line_number is incremented every time a '\n' is encountered and
179                     entity references are translated into the characters they represent
180                     only for chars tokens
181             !*/
182 
183             inline int parse_element (
184                 const std::string& token,
185                 std::string& name,
186                 attrib_list& atts
187             );
188             /*!
189                 requires
190                     token is a token of kind start_element or empty_element
191                 ensures
192                     gets the element name and puts it into the string name and
193                     parses out the attributes and puts them into the attribute_list atts
194 
195                     return 0 upon success or
196                     returns -1 if it failed to parse token
197             !*/
198 
199             inline int parse_pi (
200                 const std::string& token,
201                 std::string& target,
202                 std::string& data
203             );
204             /*!
205                 requires
206                     token is a token of kind pi
207                 ensures
208                     the target from the processing instruction is put into target and
209                     the data from the processing instruction is put into data
210 
211                     return 0 upon success or
212                     returns -1 if it failed to parse token
213             !*/
214 
215             inline int parse_element_end (
216                 const std::string& token,
217                 std::string& name
218             );
219             /*!
220                 requires
221                     token is a token of kind element_end
222                 ensures
223                     the name from the ending element tag is put into the string name
224 
225                     return 0 upon success or
226                     returns -1 if it failed to parse token
227             !*/
228 
229             inline int change_entity (
230                 std::istream& in
231             );
232             /*!
233                 ensures
234                     performs the following translations and returns the new character
235                                 amp;   -> &
236                                 lt;    -> <
237                                 gt;    -> >
238                                 apos;  -> '
239                                 quot;  -> "
240 
241                     or returns -1 if we hit an undefined entity reference or EOF.
242                             (i.e. it was not one of the entities listed above)
243 
244             !*/
245 
246             // -----------------------------------
247 
248             // private member data
249             seq_dh dh_list;
250             seq_eh eh_list;
251 
252             // -----------------------------------
253 
254             // restricted functions: assignment and copy construction
255             xml_parser(xml_parser&);
256             xml_parser& operator= (
257                         xml_parser&
258                         );
259 
260     };
261 
swap(xml_parser & a,xml_parser & b)262     inline void swap (
263         xml_parser& a,
264         xml_parser& b
265     ) { a.swap(b); }
266 
267 
268 // ----------------------------------------------------------------------------------------
269 // ----------------------------------------------------------------------------------------
270     // member function definitions
271 // ----------------------------------------------------------------------------------------
272 // ----------------------------------------------------------------------------------------
273 
274     void xml_parser::
clear()275     clear(
276     )
277     {
278         // unregister all event handlers
279         eh_list.clear();
280         dh_list.clear();
281     }
282 
283 // ----------------------------------------------------------------------------------------
284 
285     void xml_parser::
parse(std::istream & in)286     parse (
287         std::istream& in
288     )
289     {
290         DLIB_CASSERT ( in.fail() == false ,
291             "\tvoid xml_parser::parse"
292             << "\n\tthe input stream must not be in the fail state"
293             << "\n\tthis: " << this
294             );
295 
296 
297         // save which exceptions in will throw and make it so it won't throw any
298         // for the life of this function
299         std::ios::iostate old_exceptions = in.exceptions();
300         // set it to not throw anything
301         in.exceptions(std::ios::goodbit);
302 
303 
304         try
305         {
306             unsigned long line_number = 1;
307 
308             // skip any whitespace before the start of the document
309             while (in.peek() == ' ' || in.peek() == '\t' || in.peek() == '\n' || in.peek() == '\r' )
310             {
311                 if (in.peek() == '\n')
312                     ++line_number;
313                 in.get();
314             }
315 
316 
317 
318             stack tags; // this stack contains the last start tag seen
319             bool seen_fatal_error = false;
320             bool seen_root_tag = false;  // this is true after we have seen the root tag
321 
322 
323 
324             // notify all the document_handlers that we are about to being parsing
325             for (unsigned long i = 0; i < dh_list.size(); ++i)
326             {
327                 dh_list[i]->start_document();
328             }
329 
330 
331             std::string chars_buf; // used to collect chars data between consecutive
332                                 // chars and chars_cdata tokens so that
333                                 // document_handlers receive all chars data between
334                                 // tags in one call
335 
336             // variables to be used with the parsing functions
337             attrib_list atts;
338             std::string name;
339             std::string target;
340             std::string data;
341 
342 
343 
344             // variables to use with the get_next_token() function
345             std::string token_text;
346             int token_kind;
347 
348             get_next_token(in,token_text,token_kind,line_number);
349 
350 
351             while (token_kind != eof)
352             {
353                 bool is_empty = false;  // this becomes true when this token is an empty_element
354 
355                 switch (token_kind)
356                 {
357 
358 
359                 case empty_element: is_empty = true;
360                                     // fall through
361                 case element_start:
362                     {
363                         seen_root_tag = true;
364 
365                         int status = parse_element(token_text,name,atts);
366                         // if there was no error parsing the element
367                         if (status == 0)
368                         {
369                             // notify all the document_handlers
370                             for (unsigned long i = 0; i < dh_list.size(); ++i)
371                             {
372                                 dh_list[i]->start_element(line_number,name,atts);
373                                 if (is_empty)
374                                     dh_list[i]->end_element(line_number,name);
375                             }
376                         }
377                         else
378                         {
379                             seen_fatal_error = true;
380                         }
381 
382                         // if this is an element_start token then push the name of
383                         // the element on to the stack
384                         if (token_kind == element_start)
385                         {
386                             tags.push(name);
387                         }
388 
389                     }break;
390 
391                 // ----------------------------------------
392 
393                 case element_end:
394                     {
395 
396                         int status = parse_element_end (token_text,name);
397 
398                         // if there was no error parsing the element
399                         if (status == 0)
400                         {
401                             // make sure this ending element tag matches the last start
402                             // element tag we saw
403                             if ( tags.size() == 0 || name != tags.current())
404                             {
405                                 // they don't match so signal a fatal error
406                                 seen_fatal_error = true;
407                             }
408                             else
409                             {
410                                 // notify all the document_handlers
411                                 for (unsigned long i = 0; i < dh_list.size(); ++i)
412                                 {
413                                     dh_list[i]->end_element(line_number,name);
414                                 }
415 
416                                 // they match so throw away this element name
417                                 tags.pop(name);
418                             }
419                         }
420                         else
421                         {
422                             seen_fatal_error = true;
423                         }
424 
425 
426                     }break;
427 
428                 // ----------------------------------------
429 
430                 case pi:
431                     {
432 
433                         int status = parse_pi (token_text,target,data);
434                         // if there was no error parsing the element
435                         if (status == 0)
436                         {
437                             // notify all the document_handlers
438                             for (unsigned long i = 0; i < dh_list.size(); ++i)
439                             {
440                                 dh_list[i]->processing_instruction(line_number,target,data);
441                             }
442                         }
443                         else
444                         {
445                             // notify all the error_handlers
446                             for (unsigned long i = 0; i < eh_list.size(); ++i)
447                             {
448                                 eh_list[i]->error(line_number);
449                             }
450                         }
451                         while (in.peek() == ' ' || in.peek() == '\t' || in.peek() == '\n' || in.peek() == '\r' )
452                         {
453                             if (in.peek() == '\n')
454                                 ++line_number;
455                             in.get();
456                         }
457 
458 
459                     }break;
460 
461                 // ----------------------------------------
462 
463                 case chars:
464                     {
465                         if (tags.size() != 0)
466                         {
467                             chars_buf += token_text;
468                         }
469                         else if (token_text.find_first_not_of(" \t\r\n") != std::string::npos)
470                         {
471                             // you can't have non whitespace chars data outside the root element
472                             seen_fatal_error = true;
473                         }
474                     }break;
475 
476                 // ----------------------------------------
477 
478                 case chars_cdata:
479                     {
480                         if (tags.size() != 0)
481                         {
482                             chars_buf += token_text;
483                         }
484                         else
485                         {
486                             // you can't have chars_data outside the root element
487                             seen_fatal_error = true;
488                         }
489                     }break;
490 
491                 // ----------------------------------------
492 
493                 case eof:
494                     break;
495 
496                 // ----------------------------------------
497 
498                 case error:
499                     {
500                         seen_fatal_error = true;
501                     }break;
502 
503                 // ----------------------------------------
504 
505                 case dtd:       // fall though
506                 case comment:   // do nothing
507                     break;
508 
509                 // ----------------------------------------
510 
511 
512                 }
513 
514                 // if there was a fatal error then quit loop
515                 if (seen_fatal_error)
516                     break;
517 
518                 // if we have seen the last tag then quit the loop
519                 if (tags.size() == 0 && seen_root_tag)
520                     break;
521 
522 
523                 get_next_token(in,token_text,token_kind,line_number);
524 
525                 // if the next token is not a chars or chars_cdata token then flush
526                 // the chars_buf to the document_handlers
527                 if ( (token_kind != chars) &&
528                     (token_kind != chars_cdata) &&
529                     (token_kind != dtd) &&
530                     (token_kind != comment) &&
531                     (chars_buf.size() != 0)
532                     )
533                 {
534                     // notify all the document_handlers
535                     for (unsigned long i = 0; i < dh_list.size(); ++i)
536                     {
537                         dh_list[i]->characters(chars_buf);
538                     }
539                     chars_buf.erase();
540                 }
541 
542 
543             } //while (token_kind != eof)
544 
545 
546 
547 
548             // you can't have any unmatched tags or any fatal erros
549             if (tags.size() != 0 || seen_fatal_error)
550             {
551                 // notify all the error_handlers
552                 for (unsigned long i = 0; i < eh_list.size(); ++i)
553                 {
554                     eh_list[i]->fatal_error(line_number);
555                 }
556 
557             }
558 
559 
560             // notify all the document_handlers that we have ended parsing
561             for (unsigned long i = 0; i < dh_list.size(); ++i)
562             {
563                 dh_list[i]->end_document();
564             }
565 
566         }
567         catch (...)
568         {
569             // notify all the document_handlers that we have ended parsing
570             for (unsigned long i = 0; i < dh_list.size(); ++i)
571             {
572                 dh_list[i]->end_document();
573             }
574 
575             // restore the old exception settings to in
576             in.exceptions(old_exceptions);
577 
578             // don't forget to rethrow the exception
579             throw;
580         }
581 
582         // restore the old exception settings to in
583         in.exceptions(old_exceptions);
584 
585     }
586 
587 // ----------------------------------------------------------------------------------------
588 
589     void xml_parser::
add_document_handler(document_handler & item)590     add_document_handler (
591         document_handler& item
592     )
593     {
594         document_handler* temp = &item;
595         dh_list.add(dh_list.size(),temp);
596     }
597 
598 // ----------------------------------------------------------------------------------------
599 
600     void xml_parser::
add_error_handler(error_handler & item)601     add_error_handler (
602         error_handler& item
603     )
604     {
605         error_handler* temp = &item;
606         eh_list.add(eh_list.size(),temp);
607     }
608 
609 // ----------------------------------------------------------------------------------------
610 
611     void xml_parser::
swap(xml_parser & item)612     swap (
613         xml_parser& item
614     )
615     {
616         dh_list.swap(item.dh_list);
617         eh_list.swap(item.eh_list);
618     }
619 
620 // ----------------------------------------------------------------------------------------
621 // ----------------------------------------------------------------------------------------
622     // private member function definitions
623 // ----------------------------------------------------------------------------------------
624 // ----------------------------------------------------------------------------------------
625 
626     void xml_parser::
get_next_token(std::istream & in,std::string & token_text,int & token_kind,unsigned long & line_number)627     get_next_token(
628         std::istream& in,
629         std::string& token_text,
630         int& token_kind,
631         unsigned long& line_number
632     )
633     {
634 
635         token_text.erase();
636 
637         std::istream::int_type ch1 = in.get();
638         std::istream::int_type ch2;
639 
640 
641         switch (ch1)
642         {
643 
644         // -----------------------------------------
645 
646             // this is the start of some kind of a tag
647         case '<':
648             {
649                 ch2 = in.get();
650                 switch (ch2)
651                 {
652 
653                 // ---------------------------------
654 
655                     // this is a dtd, comment, or chars_cdata token
656                 case '!':
657                     {
658                         // if this is a CDATA section *******************************
659                         if ( in.peek() == '[')
660                         {
661                             token_kind = chars_cdata;
662 
663                             // throw away the '['
664                             in.get();
665 
666                             // make sure the next chars are CDATA[
667                             std::istream::int_type ch = in.get();
668                             if (ch != 'C')
669                                 token_kind = error;
670                             ch = in.get();
671                             if (ch != 'D')
672                                 token_kind = error;
673                             ch = in.get();
674                             if (ch != 'A')
675                                 token_kind = error;
676                             ch = in.get();
677                             if (ch != 'T')
678                                 token_kind = error;
679                             ch = in.get();
680                             if (ch != 'A')
681                                 token_kind = error;
682                             ch = in.get();
683                             if (ch != '[')
684                                 token_kind = error;
685                             // if this is an error token then end
686                             if (token_kind == error)
687                                 break;
688 
689 
690                             // get the rest of the chars and put them into token_text
691                             int brackets_seen = 0; // this is the number of ']' chars
692                                                    // we have seen in a row
693                             bool seen_closing = false; // true if we have seen ]]>
694                             do
695                             {
696                                 ch = in.get();
697 
698                                 if (ch == '\n')
699                                     ++line_number;
700 
701                                 token_text += ch;
702 
703                                 // if this is the closing
704                                 if (brackets_seen == 2 && ch == '>')
705                                     seen_closing = true;
706                                 // if we are seeing a bracket
707                                 else if (ch == ']')
708                                     ++brackets_seen;
709                                 // if we didn't see a bracket
710                                 else
711                                     brackets_seen = 0;
712 
713 
714                             } while ( (!seen_closing) && (ch != EOF) );
715 
716                             // check if this is an error token
717                             if (ch == EOF)
718                             {
719                                 token_kind = error;
720                             }
721                             else
722                             {
723                                 token_text.erase(token_text.size()-3);
724                             }
725 
726 
727 
728                         }
729                         // this is a comment token ****************************
730                         else if (in.peek() == '-')
731                         {
732 
733                             token_text += ch1;
734                             token_text += ch2;
735                             token_text += '-';
736 
737                             token_kind = comment;
738 
739                             // throw away the '-' char
740                             in.get();
741 
742                             // make sure the next char is another '-'
743                             std::istream::int_type ch = in.get();
744                             if (ch != '-')
745                             {
746                                 token_kind = error;
747                                 break;
748                             }
749 
750                             token_text += '-';
751 
752 
753                             // get the rest of the chars and put them into token_text
754                             int hyphens_seen = 0; // this is the number of '-' chars
755                                                    // we have seen in a row
756                             bool seen_closing = false; // true if we have seen ]]>
757                             do
758                             {
759                                 ch = in.get();
760 
761                                 if (ch == '\n')
762                                     ++line_number;
763 
764                                 token_text += ch;
765 
766                                 // if this should be a closing block
767                                 if (hyphens_seen == 2)
768                                 {
769                                     if (ch == '>')
770                                         seen_closing = true;
771                                     else // this isn't a closing so make it signal error
772                                         ch = EOF;
773                                 }
774                                 // if we are seeing a hyphen
775                                 else if (ch == '-')
776                                     ++hyphens_seen;
777                                 // if we didn't see a hyphen
778                                 else
779                                     hyphens_seen = 0;
780 
781 
782                             } while ( (!seen_closing) && (ch != EOF) );
783 
784                             // check if this is an error token
785                             if (ch == EOF)
786                             {
787                                 token_kind = error;
788                             }
789 
790 
791 
792 
793 
794                         }
795                         else // this is a dtd token *************************
796                         {
797 
798                             token_text += ch1;
799                             token_text += ch2;
800                             int bracket_depth = 1;  // this is the number of '<' chars seen
801                                                     // minus the number of '>' chars seen
802 
803                             std::istream::int_type ch;
804                             do
805                             {
806                                 ch = in.get();
807                                 if (ch == '>')
808                                     --bracket_depth;
809                                 else if (ch == '<')
810                                     ++bracket_depth;
811                                 else if (ch == '\n')
812                                     ++line_number;
813 
814                                 token_text += ch;
815 
816                             } while ( (bracket_depth > 0) && (ch != EOF) );
817 
818                             // make sure we didn't just hit EOF
819                             if (bracket_depth == 0)
820                             {
821                                 token_kind = dtd;
822                             }
823                             else
824                             {
825                                 token_kind = error;
826                             }
827                         }
828                     }
829                     break;
830 
831                 // ---------------------------------
832 
833                     // this is a pi token
834                 case '?':
835                     {
836                         token_text += ch1;
837                         token_text += ch2;
838                         std::istream::int_type ch;
839 
840                         do
841                         {
842                             ch = in.get();
843                             token_text += ch;
844                             if (ch == '\n')
845                                 ++line_number;
846                             // else if we hit a < then thats an error
847                             else if (ch == '<')
848                                 ch = EOF;
849                         } while (ch != '>' && ch != EOF);
850                         // if we hit the end of the pi
851                         if (ch == '>')
852                         {
853                             // make sure there was a trailing '?'
854                             if ( (token_text.size() > 3) &&
855                                 (token_text[token_text.size()-2] != '?')
856                                 )
857                             {
858                                 token_kind = error;
859                             }
860                             else
861                             {
862                                 token_kind = pi;
863                             }
864                         }
865                         // if we hit EOF unexpectidely then error
866                         else
867                         {
868                             token_kind = error;
869                         }
870                     }
871                     break;
872 
873                 // ---------------------------------
874 
875                     // this is an error token
876                 case EOF:
877                     {
878                         token_kind = error;
879                     }
880                     break;
881 
882                 // ---------------------------------
883                     // this is an element_end token
884                 case '/':
885                     {
886                         token_kind = element_end;
887                         token_text += ch1;
888                         token_text += ch2;
889                         std::istream::int_type ch;
890                         do
891                         {
892                             ch = in.get();
893                             if (ch == '\n')
894                                 ++line_number;
895                             // else if we hit a < then thats an error
896                             else if (ch == '<')
897                                 ch = EOF;
898                             token_text += ch;
899                         } while ( (ch != '>') && (ch != EOF));
900 
901                         // check if this is an error token
902                         if (ch == EOF)
903                         {
904                             token_kind = error;
905                         }
906                     }
907                     break;
908 
909 
910                 // ---------------------------------
911 
912                     // this is an element_start or empty_element token
913                 default:
914                     {
915 
916                         token_text += ch1;
917                         token_text += ch2;
918                         std::istream::int_type ch = '\0';
919                         std::istream::int_type last;
920                         do
921                         {
922                             last = ch;
923                             ch = in.get();
924                             if (ch == '\n')
925                                 ++line_number;
926                             // else if we hit a < then thats an error
927                             else if (ch == '<')
928                                 ch = EOF;
929                             token_text += ch;
930                         } while ( (ch != '>') && (ch != EOF));
931 
932                         // check if this is an error token
933                         if (ch == EOF)
934                         {
935                             token_kind = error;
936                         }
937                         // if this is an empty_element
938                         else if (last == '/')
939                         {
940                             token_kind = empty_element;
941                         }
942                         else
943                         {
944                             token_kind = element_start;
945                         }
946 
947 
948                     }
949                     break;
950 
951                 // ---------------------------------
952 
953                 }
954 
955             }
956             break;
957 
958         // -----------------------------------------
959 
960             // this is an eof token
961         case EOF:
962             {
963                 token_kind = eof;
964             }
965             break;
966 
967         // -----------------------------------------
968 
969             // this is a chars token
970         default:
971             {
972                 if (ch1 == '\n')
973                 {
974                     ++line_number;
975                     token_text += ch1;
976                 }
977                 // if the first thing in this chars token is an entity reference
978                 else if (ch1 == '&')
979                 {
980 
981                     int temp = change_entity(in);
982                     if (temp == -1)
983                     {
984                         token_kind = error;
985                         break;
986                     }
987                     else
988                     {
989                         token_text += temp;
990                     }
991                 }
992                 else
993                 {
994                     token_text += ch1;
995                 }
996 
997 
998                 token_kind = chars;
999 
1000                 std::istream::int_type ch = 0;
1001                 while (in.peek() != '<' && in.peek() != EOF)
1002                 {
1003                     ch = in.get();
1004 
1005                     if (ch == '\n')
1006                         ++line_number;
1007 
1008                     // if this is one of the predefined entity references then change it
1009                     if (ch == '&')
1010                     {
1011                         int temp = change_entity(in);
1012                         if (temp == -1)
1013                         {
1014                             ch = EOF;
1015                             break;
1016                         }
1017                         else
1018                             token_text += temp;
1019                     }
1020                     else
1021                     {
1022                         token_text += ch;
1023                     }
1024                 }
1025 
1026                 // if this is an error token
1027                 if (ch == EOF)
1028                 {
1029                     token_kind = error;
1030                 }
1031 
1032             }
1033             break;
1034 
1035         // -----------------------------------------
1036 
1037         }
1038 
1039 
1040     }
1041 
1042 
1043 
1044 // ----------------------------------------------------------------------------------------
1045 
1046     int xml_parser::
parse_element(const std::string & token,std::string & name,attrib_list & atts)1047     parse_element (
1048         const std::string& token,
1049         std::string& name,
1050         attrib_list& atts
1051     )
1052     {
1053         name.erase();
1054         atts.list.clear();
1055 
1056         // there must be at least one character between the <>
1057         if (token[1] == '>')
1058             return -1;
1059 
1060         std::string::size_type i;
1061         std::istream::int_type ch = token[1];
1062         i = 2;
1063 
1064         // fill out name.  the name can not contain any of the following characters
1065         while ( (ch != '>') &&
1066                 (ch != ' ') &&
1067                 (ch != '=') &&
1068                 (ch != '/') &&
1069                 (ch != '\t') &&
1070                 (ch != '\r') &&
1071                 (ch != '\n')
1072             )
1073         {
1074             name += ch;
1075             ch = token[i];
1076             ++i;
1077         }
1078 
1079         // skip any whitespaces
1080         while ( ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' )
1081         {
1082              ch = token[i];
1083              ++i;
1084         }
1085 
1086         // find any attributes
1087         while (ch != '>' && ch != '/')
1088         {
1089             std::string attribute_name;
1090             std::string attribute_value;
1091 
1092             // fill out attribute_name
1093             while ( (ch != '=') &&
1094                     (ch != ' ') &&
1095                     (ch != '\t') &&
1096                     (ch != '\r') &&
1097                     (ch != '\n') &&
1098                     (ch != '>')
1099                     )
1100             {
1101                 attribute_name += ch;
1102                 ch = token[i];
1103                 ++i;
1104             }
1105 
1106             // you can't have empty attribute names
1107             if (attribute_name.size() == 0)
1108                 return -1;
1109 
1110             // if we hit > too early then return error
1111             if (ch == '>')
1112                 return -1;
1113 
1114             // skip any whitespaces
1115             while (ch == ' ' || ch == '\t' || ch =='\n' || ch =='\r')
1116             {
1117                 ch = token[i];
1118                 ++i;
1119             }
1120 
1121             // the next char should be a '=', error if it's not
1122             if (ch != '=')
1123                 return -1;
1124 
1125             // get the next char
1126             ch = token[i];
1127             ++i;
1128 
1129             // skip any whitespaces
1130             while (ch == ' ' || ch == '\t' || ch =='\n' || ch =='\r')
1131             {
1132                 ch = token[i];
1133                 ++i;
1134             }
1135 
1136 
1137             // get the delimiter for the attribute value
1138             std::istream::int_type delimiter = ch; // this should be either a ' or " character
1139             ch = token[i];  // get the next char
1140             ++i;
1141             if (delimiter != '\'' && delimiter!='"')
1142                 return -1;
1143 
1144 
1145             // fill out attribute_value
1146             while ( (ch != delimiter) &&
1147                     (ch != '>')
1148                     )
1149             {
1150                 attribute_value += ch;
1151                 ch = token[i];
1152                 ++i;
1153             }
1154 
1155 
1156             // if there was no delimiter then this is an error
1157             if (ch == '>')
1158             {
1159                 return -1;
1160             }
1161 
1162             // go to the next char
1163             ch = token[i];
1164             ++i;
1165 
1166             // the next char must be either a '>' or '/' (denoting the end of the tag)
1167             // or a white space character
1168             if (ch != '>' && ch != ' ' && ch != '/' && ch != '\t' && ch !='\n' && ch !='\r')
1169                 return -1;
1170 
1171             // skip any whitespaces
1172             while (ch == ' ' || ch == '\t' || ch =='\n' || ch =='\r')
1173             {
1174                 ch = token[i];
1175                 ++i;
1176             }
1177 
1178 
1179             // add attribute_value and attribute_name to atts
1180             if (atts.list.is_in_domain(attribute_name))
1181             {
1182                 // attributes may not be multiply defined
1183                 return -1;
1184             }
1185             else
1186             {
1187                 atts.list.add(attribute_name,attribute_value);
1188             }
1189 
1190 
1191         }
1192 
1193         // you can't have an element with no name
1194         if (name.size() == 0)
1195             return -1;
1196 
1197         return 0;
1198 
1199     }
1200 
1201 // ----------------------------------------------------------------------------------------
1202 
1203     int xml_parser::
parse_pi(const std::string & token,std::string & target,std::string & data)1204     parse_pi (
1205         const std::string& token,
1206         std::string& target,
1207         std::string& data
1208     )
1209     {
1210         target.erase();
1211         data.erase();
1212 
1213         std::istream::int_type ch = token[2];
1214         std::string::size_type i = 3;
1215         while (ch != ' ' && ch != '?' && ch != '\t' && ch != '\n' && ch!='\r')
1216         {
1217             target += ch;
1218             ch = token[i];
1219             ++i;
1220         }
1221         if (target.size() == 0)
1222             return -1;
1223 
1224         // if we aren't at a ? character then go to the next character
1225         if (ch != '?' )
1226         {
1227             ch = token[i];
1228             ++i;
1229         }
1230 
1231         // if we still aren't at the end of the processing instruction then
1232         // set this stuff in the data section
1233         while (ch != '?')
1234         {
1235             data += ch;
1236             ch = token[i];
1237             ++i;
1238         }
1239 
1240         return 0;
1241     }
1242 
1243 // ----------------------------------------------------------------------------------------
1244 
1245     int xml_parser::
parse_element_end(const std::string & token,std::string & name)1246     parse_element_end (
1247         const std::string& token,
1248         std::string& name
1249     )
1250     {
1251         name.erase();
1252         std::string::size_type end = token.size()-1;
1253         for (std::string::size_type i = 2; i < end; ++i)
1254         {
1255             if (token[i] == ' ' || token[i] == '\t' || token[i] == '\n'|| token[i] == '\r')
1256                 break;
1257             name += token[i];
1258         }
1259 
1260         if (name.size() == 0)
1261             return -1;
1262 
1263         return 0;
1264     }
1265 
1266 // ----------------------------------------------------------------------------------------
1267 
1268     int xml_parser::
change_entity(std::istream & in)1269     change_entity (
1270         std::istream& in
1271     )
1272     {
1273 
1274         std::istream::int_type buf[6];
1275 
1276 
1277         buf[1] = in.get();
1278 
1279         // if this is an undefined entity reference then return error
1280         if (buf[1] != 'a' &&
1281             buf[1] != 'l' &&
1282             buf[1] != 'g' &&
1283             buf[1] != 'q'
1284             )
1285             return -1;
1286 
1287 
1288         buf[2] = in.get();
1289         // if this is an undefined entity reference then return error
1290         if (buf[2] != 'm' &&
1291             buf[2] != 't' &&
1292             buf[2] != 'p' &&
1293             buf[2] != 'u'
1294             )
1295             return -1;
1296 
1297 
1298         buf[3] = in.get();
1299         // if this is an undefined entity reference then return error
1300         if (buf[3] != 'p' &&
1301             buf[3] != ';' &&
1302             buf[3] != 'o'
1303             )
1304             return -1;
1305 
1306         // check if this is &lt; or &gt;
1307         if  (buf[3] == ';')
1308         {
1309             if (buf[2] != 't')
1310                 return -1;
1311 
1312             // if this is &lt; then return '<'
1313             if (buf[1] == 'l')
1314                 return '<';
1315             // if this is &gt; then return '>'
1316             if (buf[1] == 'g')
1317                 return '>';
1318 
1319             // it is neither so it must be an undefined entity reference
1320             return -1;
1321         }
1322 
1323 
1324         buf[4] = in.get();
1325         // if this should be &amp;
1326         if (buf[4] == ';')
1327         {
1328             // if this is not &amp; then return error
1329             if (buf[1] != 'a' ||
1330                 buf[2] != 'm' ||
1331                 buf[3] != 'p'
1332                 )
1333                 return -1;
1334 
1335             return '&';
1336         }
1337 
1338         buf[5] = in.get();
1339 
1340         // if this should be &apos;
1341         if (buf[1] == 'a' &&
1342             buf[2] == 'p' &&
1343             buf[3] == 'o' &&
1344             buf[4] == 's' &&
1345             buf[5] == ';'
1346             )
1347             return '\'';
1348 
1349 
1350         // if this should be &quot;
1351         if (buf[1] == 'q' &&
1352             buf[2] == 'u' &&
1353             buf[3] == 'o' &&
1354             buf[4] == 't' &&
1355             buf[5] == ';'
1356             )
1357             return '"';
1358 
1359 
1360         // it was an undefined entity reference
1361         return -1;
1362 
1363     }
1364 
1365 // ----------------------------------------------------------------------------------------
1366 // ----------------------------------------------------------------------------------------
1367 
1368     class xml_parse_error : public error
1369     {
1370     public:
xml_parse_error(const std::string & a)1371         xml_parse_error(
1372             const std::string& a
1373         ): error(a) {}
1374     };
1375 
1376     namespace impl
1377     {
1378         class default_xml_error_handler : public error_handler
1379         {
1380             std::string filename;
1381 
1382         public:
1383 
default_xml_error_handler()1384             default_xml_error_handler (
1385             ) {}
1386 
default_xml_error_handler(const std::string & filename_)1387             default_xml_error_handler (
1388                 const std::string& filename_
1389             ) :filename(filename_) {}
1390 
error(const unsigned long)1391             virtual void error (
1392                 const unsigned long
1393             )
1394             {
1395                 // just ignore non-fatal errors
1396             }
1397 
fatal_error(const unsigned long line_number)1398             virtual void fatal_error (
1399                 const unsigned long line_number
1400             )
1401             {
1402                 std::ostringstream sout;
1403                 if (filename.size() != 0)
1404                     sout << "There is a fatal error on line " << line_number << " in the XML file '"<<filename<<"'.";
1405                 else
1406                     sout << "There is a fatal error on line " << line_number << " in the XML being processed.";
1407 
1408                 throw xml_parse_error(sout.str());
1409             }
1410         };
1411     }
1412 
parse_xml(std::istream & in,document_handler & dh,error_handler & eh)1413     inline void parse_xml (
1414         std::istream& in,
1415         document_handler& dh,
1416         error_handler& eh
1417     )
1418     {
1419         if (!in)
1420             throw xml_parse_error("Unexpected end of file during xml parsing.");
1421         xml_parser parser;
1422         parser.add_document_handler(dh);
1423         parser.add_error_handler(eh);
1424         parser.parse(in);
1425     }
1426 
parse_xml(std::istream & in,error_handler & eh,document_handler & dh)1427     inline void parse_xml (
1428         std::istream& in,
1429         error_handler& eh,
1430         document_handler& dh
1431     )
1432     {
1433         if (!in)
1434             throw xml_parse_error("Unexpected end of file during xml parsing.");
1435         xml_parser parser;
1436         parser.add_document_handler(dh);
1437         parser.add_error_handler(eh);
1438         parser.parse(in);
1439     }
1440 
parse_xml(std::istream & in,error_handler & eh)1441     inline void parse_xml (
1442         std::istream& in,
1443         error_handler& eh
1444     )
1445     {
1446         if (!in)
1447             throw xml_parse_error("Unexpected end of file during xml parsing.");
1448         xml_parser parser;
1449         parser.add_error_handler(eh);
1450         parser.parse(in);
1451     }
1452 
parse_xml(std::istream & in,document_handler & dh)1453     inline void parse_xml (
1454         std::istream& in,
1455         document_handler& dh
1456     )
1457     {
1458         if (!in)
1459             throw xml_parse_error("Unexpected end of file during xml parsing.");
1460         xml_parser parser;
1461         parser.add_document_handler(dh);
1462         impl::default_xml_error_handler eh;
1463         parser.add_error_handler(eh);
1464         parser.parse(in);
1465     }
1466 
1467 // ----------------------------------------------------------------------------------------
1468 
parse_xml(const std::string & filename,document_handler & dh,error_handler & eh)1469     inline void parse_xml (
1470         const std::string& filename,
1471         document_handler& dh,
1472         error_handler& eh
1473     )
1474     {
1475         std::ifstream in(filename.c_str());
1476         if (!in)
1477             throw xml_parse_error("Unable to open file '" + filename + "'.");
1478         xml_parser parser;
1479         parser.add_document_handler(dh);
1480         parser.add_error_handler(eh);
1481         parser.parse(in);
1482     }
1483 
parse_xml(const std::string & filename,error_handler & eh,document_handler & dh)1484     inline void parse_xml (
1485         const std::string& filename,
1486         error_handler& eh,
1487         document_handler& dh
1488     )
1489     {
1490         std::ifstream in(filename.c_str());
1491         if (!in)
1492             throw xml_parse_error("Unable to open file '" + filename + "'.");
1493         xml_parser parser;
1494         parser.add_document_handler(dh);
1495         parser.add_error_handler(eh);
1496         parser.parse(in);
1497     }
1498 
parse_xml(const std::string & filename,error_handler & eh)1499     inline void parse_xml (
1500         const std::string& filename,
1501         error_handler& eh
1502     )
1503     {
1504         std::ifstream in(filename.c_str());
1505         if (!in)
1506             throw xml_parse_error("Unable to open file '" + filename + "'.");
1507         xml_parser parser;
1508         parser.add_error_handler(eh);
1509         parser.parse(in);
1510     }
1511 
parse_xml(const std::string & filename,document_handler & dh)1512     inline void parse_xml (
1513         const std::string& filename,
1514         document_handler& dh
1515     )
1516     {
1517         std::ifstream in(filename.c_str());
1518         if (!in)
1519             throw xml_parse_error("Unable to open file '" + filename + "'.");
1520         xml_parser parser;
1521         parser.add_document_handler(dh);
1522         impl::default_xml_error_handler eh(filename);
1523         parser.add_error_handler(eh);
1524         parser.parse(in);
1525     }
1526 
1527 // ----------------------------------------------------------------------------------------
1528 
1529 }
1530 
1531 #endif // DLIB_XML_PARSER_KERNEl_1_
1532 
1533