1
2 /******************************************************************************
3 * MODULE : parsehtml.cpp
4 * DESCRIPTION: conversion of xml and html strings into logical html trees
5 * COPYRIGHT : (C) 2000 Joris van der Hoeven
6 *******************************************************************************
7 * This software falls under the GNU general public license version 3 or later.
8 * It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE
9 * in the root directory or <http://www.gnu.org/licenses/gpl-3.0.html>.
10 ******************************************************************************/
11
12 #include "convert.hpp"
13 #include "hashset.hpp"
14 #include "converter.hpp"
15 #include "parse_string.hpp"
16
17 /******************************************************************************
18 * The xml/html parser aims to parse a superset of the set of valid documents.
19 * In other words, no attempts are made to signal error messages for
20 * incorrect documents; in the case of Html we even attempt to correct
21 * common mistakes, like badly structured documents. So correct documents
22 * should be parsed correctly and incorrect documents are transformed into
23 * correct documents in a heuristic way.
24 *
25 * The parser proceeds in three steps: the first pass does all parsing
26 * except for the construction of a tree structure for nested tags.
27 * The second stage takes care of the nesting, while heuristically
28 * correcting improper nested trees, and while taking care of optional
29 * closing tags in the case of Html. The last stage does some final
30 * white space and entity cleanup.
31 *
32 * Present limitations: we do not fully parse <!DOCTYPE ...> constructs yet.
33 * Entities which are present in the DOCTYPE definition of the document
34 * will be expanded. However, external DTD's are not read. Notice also that
35 * it is not yet possible to associate default xml:space attributes to tags.
36 ******************************************************************************/
37
38 struct xml_html_parser {
39 bool html;
40 parse_string s;
41 hashmap<string,string> entities;
42 array<tree> a;
43 int i, n;
44 tree stack;
45
46 xml_html_parser ();
skip_spacexml_html_parser47 inline void skip_space () {
48 while (s && is_space (s[0])) s += 1; }
is_name_charxml_html_parser49 inline bool is_name_char (char c) {
50 return is_alpha (c) || is_digit (c) ||
51 (c == '_') || (c == ':') || (c == '.') || (c == '-') ||
52 (((int) ((unsigned char) c)) >= 128); }
53
54 string transcode (string s);
55
56 string parse_until (string what);
57 string parse_name ();
58 string parse_quoted ();
59 string expand_entity (string s);
60 string expand_entities (string s);
61 string parse_entity ();
62 tree parse_attribute ();
63 tree parse_opening ();
64 tree parse_closing ();
65 tree parse_pi ();
66 tree parse_comment ();
67 tree parse_cdata ();
68 tree parse_misc ();
69 void parse ();
70
71 tree parse_system ();
72 tree parse_public ();
73 tree parse_element ();
74 tree parse_attlist ();
75 void parse_entity_decl ();
76 tree parse_notation ();
77 tree parse_doctype ();
78
79 // NOTE: these routines should remain there even if they are not used
80 bool finalize_preserve_space (string tag);
81 string finalize_space (string s, bool first, bool last);
82 tree finalize_space (tree t);
83 // END NOTE
84 bool build_valid_child (string parent, string child);
85 bool build_must_close (string tag);
86 bool build_can_close (string tag);
87 void build (tree& r);
88
89 tree finalize_sxml (tree t);
90 tree parse (string s);
91 };
92
93 /******************************************************************************
94 * Initialization
95 ******************************************************************************/
96
// Shared lookup tables; they are filled by the xml_html_parser constructor
// the first time they are found empty.

// Html tags without contents: build () never opens a nesting level for them.
static hashset<string> html_empty_tag_table;
// Html tags which are subject to the nesting rules of build_valid_child ();
// tags outside this table are accepted as a child of any non-empty parent.
static hashset<string> html_auto_close_table;
// Html tags which build_valid_child () refuses as children of <p>.
static hashset<string> html_block_table;
// Html entity names (without the leading '&' and trailing ';') and their
// replacements; the "" argument is presumably the default value -- TODO confirm.
static hashmap<string,string> html_entity ("");
101
load_html_entities(hashmap<string,string> table,string fname)102 void load_html_entities (hashmap<string, string> table, string fname) {
103 string s;
104 if (DEBUG_VERBOSE) debug_convert << "Loading " << fname << "\n";
105 if (load_string (url ("$TEXMACS_PATH/langs/encoding", fname), s, false)) return;
106 tree t= block_to_scheme_tree (s);
107 if (!is_tuple (t)) return;
108
109 int i, n= N(t);
110 for (i=0; i<n; i++)
111 if (is_func (t[i], TUPLE, 2) &&
112 is_atomic (t[i][0]) && is_atomic (t[i][1]))
113 {
114 string l= t[i][0]->label; if (is_quoted (l)) l= scm_unquote (l);
115 string r= t[i][1]->label; if (is_quoted (r)) r= scm_unquote (r);
116 table (l)= r;
117 }
118 }
119
xml_html_parser()120 xml_html_parser::xml_html_parser (): entities ("") {
121 if (N(html_empty_tag_table) == 0) {
122 html_empty_tag_table->insert ("basefont");
123 html_empty_tag_table->insert ("br");
124 html_empty_tag_table->insert ("area");
125 html_empty_tag_table->insert ("link");
126 html_empty_tag_table->insert ("param");
127 html_empty_tag_table->insert ("hr");
128 html_empty_tag_table->insert ("input");
129 html_empty_tag_table->insert ("col");
130 html_empty_tag_table->insert ("frame");
131 html_empty_tag_table->insert ("isindex");
132 html_empty_tag_table->insert ("base");
133 html_empty_tag_table->insert ("meta");
134 html_empty_tag_table->insert ("img");
135 }
136
137 if (N(html_auto_close_table) == 0) {
138 html_auto_close_table->insert ("body");
139 html_auto_close_table->insert ("p");
140 html_auto_close_table->insert ("dt");
141 html_auto_close_table->insert ("dd");
142 html_auto_close_table->insert ("li");
143 html_auto_close_table->insert ("option");
144 html_auto_close_table->insert ("thead");
145 html_auto_close_table->insert ("tfoot");
146 html_auto_close_table->insert ("tbody");
147 html_auto_close_table->insert ("colgroup");
148 html_auto_close_table->insert ("tr");
149 html_auto_close_table->insert ("th");
150 html_auto_close_table->insert ("td");
151 html_auto_close_table->insert ("head");
152 html_auto_close_table->insert ("html");
153 }
154
155 if (N(html_block_table) == 0) {
156 html_block_table->insert ("h1");
157 html_block_table->insert ("h2");
158 html_block_table->insert ("h3");
159 html_block_table->insert ("h4");
160 html_block_table->insert ("h5");
161 html_block_table->insert ("h6");
162 html_block_table->insert ("ul");
163 html_block_table->insert ("ol");
164 html_block_table->insert ("li");
165 html_block_table->insert ("dl");
166 html_block_table->insert ("dd");
167 html_block_table->insert ("dt");
168 html_block_table->insert ("pre");
169 html_block_table->insert ("div");
170 html_block_table->insert ("p");
171 html_block_table->insert ("noscript");
172 html_block_table->insert ("blockquote");
173 html_block_table->insert ("form");
174 html_block_table->insert ("hr");
175 html_block_table->insert ("table");
176 html_block_table->insert ("fieldset");
177 html_block_table->insert ("address");
178 }
179
180 if (N (html_entity) == 0) {
181 load_html_entities (html_entity, "HTMLlat1.scm");
182 load_html_entities (html_entity, "HTMLspecial.scm");
183 load_html_entities (html_entity, "HTMLsymbol.scm");
184 }
185 }
186
187 /******************************************************************************
188 * Transcoding input to UTF-8
189 ******************************************************************************/
190
191 // TODO: support BOM and other bells and whistles
192 // http://www.w3.org/TR/REC-xml#sec-guessing
193
194 // TODO: support HTML http-equiv Content-Type
195 // http://www.w3.org/TR/html4/charset.html#h-5.2.2
196
197 // Currently, the input encoding is expected to be ASCII-compatible.
198 // If no <?xml?> prolog is found, the encoding is assumed to be UTF-8 or
199 // ISO-8859-1 if iconv cannot perform an utf8->utf8 conversion.
200
201 string
transcode(string s2)202 xml_html_parser::transcode (string s2) {
203 s= parse_string (s2);
204
205 string encoding;
206 if (test (s, "<?")) {
207 s += 2;
208 string target= parse_name ();
209 skip_space ();
210 if (target == "xml") {
211 // since html==true implies we can accept horribly broken HTML, the
212 // presence of an XML prolog is not enough to clear the flag.
213 /* html= false; */
214 while (s && !test (s, "?>")) {
215 string attname= parse_name ();
216 skip_space ();
217 if (!test (s, "=")) break;
218 s += 1;
219 skip_space ();
220 string val;
221 if (test (s, "\"")) {
222 s += 1;
223 val= parse_until ("\"");
224 skip_space ();
225 }
226 else if (test (s, "'")) {
227 s += 1;
228 val= parse_until ("'");
229 skip_space ();
230 }
231 if (attname == "encoding") {
232 encoding= upcase_all (val);
233 break;
234 }
235 }
236 }
237 }
238
239 if (N(encoding) != 0) {
240 // cout << "encoding was specified\n" ;
241 string s3= convert (s2, encoding, "UTF-8");
242 if (N(s3) == 0)
243 /* conversion from specified charset failed, do nothing (and pray) */ ;
244 else return s3;
245 }
246 else {
247 // cout << "guess encoding\n" ;
248 if (check_encoding (s2, "UTF-8"))
249 /* input encoding seems to be utf-8, do nothing */ ;
250 else {
251 string s3= convert (s2, "ISO-8859-1", "UTF-8");
252 if (N(s3) != 0) return s3;
253 }
254 }
255
256 return s2;
257 }
258
259 /******************************************************************************
260 * Parsing without structuring
261 ******************************************************************************/
262
263 string
parse_until(string what)264 xml_html_parser::parse_until (string what) {
265 string r;
266 while (s && !test (s, what)) r << s->read (1);
267 if (test (s, what)) s += N(what);
268 return expand_entities (r);
269 }
270
271 string
parse_name()272 xml_html_parser::parse_name () {
273 string r;
274 while (s && is_name_char (s[0])) r << s->read (1);
275 if (html) return locase_all (r);
276 return expand_entities (r);
277 }
278
279 string
expand_entity(string s)280 xml_html_parser::expand_entity (string s) {
281 if (entities->contains (s)) return entities[s];
282 else if (s[0] == '&') {
283 if (N(s)>1 && s[1] == '#') {
284 int i= 2;
285 bool okay= false;
286 string r= convert_char_entity (s, i, okay);
287 if (okay) return r;
288 return s;
289 }
290 else if (html) {
291 string ss= s (1, s [N(s)-1] == ';' ? N(s)-1 : N(s));
292 if (html_entity->contains (ss))
293 // HTML entity references expand to character references
294 // so they need to be finalized a second time.
295 return expand_entity (html_entity [ss]);
296 }
297 }
298 return s;
299 }
300
301 string
expand_entities(string s)302 xml_html_parser::expand_entities (string s) {
303 string r;
304 int i, n= N(s);
305 for (i=0; i<n; ) {
306 if (s[i] == '&' || s[i] == '%') {
307 int start= i++;
308 if (i<n && s[i] == '#') {
309 i++;
310 if (i<n && (s[i] == 'x' || s[i] == 'X')) {
311 i++;
312 while (i<n && is_hex_digit (s[i])) i++;
313 }
314 else while (i<n && is_digit (s[i])) i++;
315 }
316 else while (i<n && is_name_char (s[i])) i++;
317 if (i<n && s[i] == ';') i++;
318 r << expand_entity (s (start, i));
319 }
320 else r << s[i++];
321 }
322 if (r == s) return r;
323 return expand_entities (r);
324 }
325
326 string
parse_entity()327 xml_html_parser::parse_entity () {
328 string r= s->read (1);
329 if (test (s, "#")) {
330 r << s->read (1);
331 if (test (s, "x") || test (s, "X")) {
332 r << s->read (1);
333 while (s && is_hex_digit (s[0])) r << s->read (1);
334 }
335 else while (s && is_digit (s[0])) r << s->read (1);
336 }
337 else while (s && is_name_char (s[0])) r << s->read (1);
338 if (test (s, ";")) r << s->read (1);
339 string x= expand_entity (r);
340 if (x == r || r == "<" || r == "&") return x;
341 s->write (x);
342 return "";
343 }
344
345 string
parse_quoted()346 xml_html_parser::parse_quoted () {
347 if (test (s, "\42")) {
348 s += 1;
349 return parse_until ("\42");
350 }
351 if (test (s, "'")) {
352 s += 1;
353 return parse_until ("'");
354 }
355 return "";
356 }
357
358 tree
parse_attribute()359 xml_html_parser::parse_attribute () {
360 string attr= parse_name (), val;
361 bool no_val= false;
362 skip_space ();
363 if (test (s, "=")) s += 1;
364 skip_space ();
365 if (test (s, "\42") || test (s, "'"))
366 val= parse_quoted ();
367 else { // for Html
368 string r;
369 while (s) {
370 if (is_space (s[0]) || (s[0]=='<') || (s[0]=='>')) break;
371 r << s->read (1);
372 }
373 val = r;
374 no_val= N(r) == 0;
375 }
376 if (!no_val) return tuple ("attr", attr, val);
377 else if (attr != "") return tuple ("attr", attr);
378 else return tuple ("attr");
379 }
380
381 tree
parse_opening()382 xml_html_parser::parse_opening () {
383 s += 1;
384 string name= parse_name ();
385 tree t= tuple ("begin", name);
386 while (true) {
387 skip_space ();
388 if (!s || s[0] == '>' || test (s, "/>")) break;
389 tree attr= parse_attribute ();
390 if (attr == tuple ("attr")) break;
391 t << attr;
392 }
393 if (test (s, "/>")) { t[0]= "tag"; s += 2; }
394 else if (test (s, ">")) s += 1;
395 return t;
396 }
397
398 tree
parse_closing()399 xml_html_parser::parse_closing () {
400 s += 2;
401 string name= parse_name ();
402 (void) parse_until (">");
403 return tuple ("end", name);
404 }
405
406 tree
parse_pi()407 xml_html_parser::parse_pi () {
408 s += 2;
409 string name= parse_name ();
410 skip_space ();
411 return tuple ("pi", name, parse_until ("?>"));
412 }
413
414 tree
parse_comment()415 xml_html_parser::parse_comment () {
416 s += 4;
417 return tuple ("comment", parse_until ("-->"));
418 }
419
420 tree
parse_cdata()421 xml_html_parser::parse_cdata () {
422 s += 9;
423 return tuple ("cdata", parse_until ("]]>"));
424 }
425
426 tree
parse_misc()427 xml_html_parser::parse_misc () {
428 s += 2;
429 tree t= tuple ("misc");
430 while (true) {
431 skip_space ();
432 if (test (s, ">")) { s += 1; break; }
433 string r;
434 while (s) {
435 if (is_space (s[0]) || (s[0] == '>')) break;
436 r << s->read (1);
437 }
438 t << r;
439 }
440 return t;
441 }
442
443 void
parse()444 xml_html_parser::parse () {
445 string r;
446 while (s) {
447 if (s[0] == '<') {
448 if (N(r) != 0) { a << tree (r); }
449 if (test (s, "</")) a << parse_closing ();
450 else if (test (s, "<?")) a << parse_pi ();
451 else if (test (s, "<!--")) a << parse_comment ();
452 else if (test (s, "<![CDATA[")) a << parse_cdata ();
453 else if (test (s, "<!DOCTYPE")) a << parse_doctype ();
454 else if (test (s, "<!")) a << parse_misc ();
455 else a << parse_opening ();
456 r= "";
457 }
458 else if (s[0] == '&') r << parse_entity ();
459 else r << s->read (1);
460 }
461 if (N(r) != 0) a << tree (r);
462 }
463
464 /******************************************************************************
465 * Parsing the document type
466 ******************************************************************************/
467
468 tree
parse_system()469 xml_html_parser::parse_system () {
470 s += 6;
471 tree st= tuple ("system");
472 skip_space ();
473 st << parse_quoted ();
474 return st;
475 }
476
477 tree
parse_public()478 xml_html_parser::parse_public () {
479 s += 6;
480 tree st= tuple ("public");
481 skip_space ();
482 st << parse_quoted ();
483 skip_space ();
484 st << parse_quoted ();
485 return st;
486 }
487
488 tree
parse_element()489 xml_html_parser::parse_element () {
490 s += 9;
491 return tuple ("element", parse_until (">"));
492 }
493
494 tree
parse_attlist()495 xml_html_parser::parse_attlist () {
496 s += 9;
497 return tuple ("attlist", parse_until (">"));
498 }
499
500 void
parse_entity_decl()501 xml_html_parser::parse_entity_decl () {
502 s += 8;
503 skip_space ();
504 bool parameter= test (s, "%");
505 if (parameter) { s += 1; skip_space (); }
506 string name= parse_name ();
507 if (parameter) name= "%" * name * ";";
508 else name= "&" * name * ";";
509 skip_space ();
510
511 if (test (s, "SYSTEM") || test (s, "PUBLIC")) {
512 // TODO: allow for loading of external entities using wget
513 if (test (s, "SYSTEM")) (void) parse_system ();
514 else (void) parse_public ();
515 skip_space ();
516 if (test (s, "NDATA")) {
517 s += 5;
518 skip_space ();
519 (void) parse_name ();
520 }
521 }
522 else {
523 string val= parse_quoted ();
524 val= expand_entities (val);
525 entities (name) = val;
526 // cout << name << " := " << val << "\n";
527 }
528
529 skip_space ();
530 if (test (s, ">")) s += 1;
531 }
532
533 tree
parse_notation()534 xml_html_parser::parse_notation () {
535 s += 10;
536 return tuple ("notation", parse_until (">"));
537 }
538
539 tree
parse_doctype()540 xml_html_parser::parse_doctype () {
541 s += 9;
542 tree dt= tuple ("doctype");
543 skip_space ();
544 dt << parse_name ();
545 skip_space ();
546 if (test (s, "SYSTEM")) dt << parse_system ();
547 else if (test (s, "PUBLIC")) dt << parse_public ();
548 skip_space ();
549
550 if (test (s, "[")) {
551 s += 1;
552 while (s) {
553 skip_space ();
554 if (test (s, "]")) { s += 1; break; }
555 else if (test (s, "<!ELEMENT")) dt << parse_element ();
556 else if (test (s, "<!ATTLIST")) dt << parse_cdata ();
557 else if (test (s, "<!ENTITY")) parse_entity_decl ();
558 else if (test (s, "<!NOTATION")) a << parse_notation ();
559 else if (test (s, "<?")) dt << parse_pi ();
560 else if (test (s, "<!--")) dt << parse_comment ();
561 else if (s[0] == '&' || s[0] == '%') (void) parse_entity ();
562 else s += 1;
563 }
564 }
565
566 skip_space ();
567 if (test (s, ">")) s += 1;
568 return dt;
569 }
570
571 /******************************************************************************
572 * Building the structured parse tree with error correction
573 ******************************************************************************/
574
575 bool
build_valid_child(string parent,string child)576 xml_html_parser::build_valid_child (string parent, string child) {
577 if (!html) return true;
578 if ((parent == "<bottom>") || (parent == "html") || (parent == "body"))
579 return true;
580 if (html_empty_tag_table->contains (parent)) return false;
581 if (!html_auto_close_table->contains (child)) return true;
582 if (parent == "p") return !html_block_table->contains (child);
583 if ((child == "dt") || (child == "dd")) return parent == "dl";
584 if (child == "li")
585 return (parent == "ul") || (parent == "ol") ||
586 (parent == "dir") || (parent == "menu");
587 if (child == "option") return (parent == "select") || (parent == "optgroup");
588 if ((child == "thead") || (child == "tfoot") || (child == "tbody"))
589 return parent == "table";
590 if (child == "colgroup") return parent == "table";
591 if (child == "col") return (parent == "table") || (parent == "colgroup");
592 if (child == "tr")
593 return (parent == "table") || (parent == "thead") ||
594 (parent == "tfoot") || (parent == "tbody");
595 if ((child == "th") || (child == "td"))
596 return (parent == "tr") ||
597 (parent == "table") || (parent == "thead") ||
598 (parent == "tfoot") || (parent == "tbody");
599 return true;
600 }
601
602 bool
build_must_close(string tag)603 xml_html_parser::build_must_close (string tag) {
604 if (build_valid_child (stack[0]->label, tag)) return false;
605 // if !html, we have already returned false
606 tree counter= stack;
607 while (counter != tuple ("<bottom>")) {
608 if (build_valid_child (counter[0]->label, tag)) return true;
609 counter= counter[1];
610 }
611 // since <html> and <body> can have any child we only get here when parsing
612 // something where both are omitted and we can close nodes up to the root.
613 return true;
614 }
615
616 bool
build_can_close(string tag)617 xml_html_parser::build_can_close (string tag) {
618 if (N(stack) < 2) return false;
619 tree counter= stack[1];
620 while (counter != tuple ("<bottom>")) {
621 if (counter[0]->label == tag) return true;
622 counter= counter[1];
623 }
624 return false;
625 }
626
627 void
build(tree & r)628 xml_html_parser::build (tree& r) {
629 while (i<n) {
630 if (is_tuple (a[i], "begin")) {
631 string name= a[i][1]->label;
632 if (build_must_close (name)) return;
633 tree sub= copy (a[i]); sub[0]= "tag";
634 i++;
635 if (html && html_empty_tag_table->contains (name))
636 r << sub;
637 else {
638 stack= tuple (name, stack);
639 build (sub);
640 r << sub;
641 stack= stack[1];
642 }
643 }
644 else if (is_tuple (a[i], "end")) {
645 if (stack[0]->label == a[i][1]->label) { i++; return; }
646 if (build_can_close (a[i][1]->label)) return;
647 i++;
648 }
649 else r << a[i++];
650 }
651 }
652
653 /******************************************************************************
654 * Finalization
655 ******************************************************************************/
656
657 bool
finalize_preserve_space(string tag)658 xml_html_parser::finalize_preserve_space (string tag) {
659 return tag == "pre";
660 }
661
662 string
finalize_space(string s,bool first,bool last)663 xml_html_parser::finalize_space (string s, bool first, bool last) {
664 int i, n= N(s);
665 string r;
666 bool flag= first;
667 for (i=0; i<n; i++)
668 if (is_space (s[i])) {
669 if (!flag) r << ' ';
670 flag= true;
671 }
672 else {
673 r << s[i];
674 flag= false;
675 }
676 n= N(r);
677 if (last && (n>0) && (r[n-1] == ' '))
678 r->resize (n-1);
679 return r;
680 }
681
682 tree
finalize_space(tree t)683 xml_html_parser::finalize_space (tree t) {
684 if (is_atomic (t) || (!is_tuple (t, "tag"))) return t;
685 else {
686 int i, n= N(t);
687 tree r= tuple (t[0], t[1]);
688 int first= -1, last= -1;
689 for (i=2; i<n; i++)
690 if (!is_tuple (t[i], "attr")) {
691 first= i; break;
692 }
693 if (!is_tuple (t[n-1], "attr"))
694 last= n-1;
695 (void) first; (void) last;
696 for (i=2; i<n; i++) {
697 if (is_atomic (t[i])) {
698 if (finalize_preserve_space (t[1]->label)) r << t[i];
699 else {
700 string s= finalize_space (t[i]->label, i==2, i==(n-1));
701 if (s != "") r << s;
702 }
703 }
704 else if (is_tuple (t[i], "tag")) r << finalize_space (t[i]);
705 else r << t[i];
706 }
707 return r;
708 }
709 }
710
711 static string
simple_quote(string s)712 simple_quote (string s) {
713 return "\"" * s * "\"";
714 }
715
716 tree
finalize_sxml(tree t)717 xml_html_parser::finalize_sxml (tree t) {
718 if (!is_tuple (t, "tag")) return ""; // sanity
719 int i, n= N(t);
720 tree tag = tuple (t[1]);
721 if (t[1] == "<document>") tag= tuple ("*TOP*");
722 tree attrs = tuple ("@");
723 tree content = tuple ();
724 for (i=2; i<n; i++)
725 if (is_tuple (t[i], "attr")) {
726 tree attr;
727 if (N(t[i]) == 2) attr= tuple (t[i][1]);
728 else attr= tuple (t[i][1]->label, simple_quote (t[i][2]->label));
729 attrs << attr;
730 }
731 else if (is_tuple (t[i], "tag"))
732 content << finalize_sxml (t[i]);
733 else if (is_atomic (t[i]))
734 content << simple_quote (t[i]->label);
735 else if (is_tuple (t[i], "pi"))
736 content << tuple ("*PI*", t[i][1]->label, simple_quote (t[i][2]->label));
737 else if (is_tuple (t[i], "doctype"))
738 // TODO: convert DTD declarations
739 content << tuple ("*DOCTYPE*", simple_quote (t[i][1]->label));
740 else if (is_tuple (t[i], "cdata"))
741 content << simple_quote (t[i][1]->label);
742 if (N(attrs) > 1) tag << attrs;
743 tag << A(content);
744 return tag;
745 }
746
747 /******************************************************************************
* Main parsing routine: runs all passes on an input string
749 ******************************************************************************/
750
751 tree
parse(string s2)752 xml_html_parser::parse (string s2) {
753 // end of line handling
754 string s3;
755 i= 0, n= N(s2);
756 bool is_cr= false;
757 while (i<n) {
758 bool prev_is_cr= is_cr;
759 is_cr= false;
760 char c= s2[i];
761 if (c == '\15') {
762 s3 << '\12';
763 is_cr= true;
764 }
765 else if (prev_is_cr && (c == '\12')) /* no-op */;
766 else s3 << c;
767 i++;
768 }
769 s2= s3;
770
771 // cout << "Transcoding " << s2 << "\n";
772 if (html) s2= transcode (s2);
773 // cout << HRULE << LF;
774 s= parse_string (s2);
775 //cout << "Parsing " << s << "\n";
776 parse ();
777 // cout << HRULE << LF;
778 // cout << "a= " << a << "\n";
779 i= 0; n= N(a); stack= tuple ("<bottom>");
780 tree r= tuple ("tag", "<document>");
781 build (r);
782 // cout << HRULE << LF;
783 // print_tree (r);
784 r= finalize_sxml (r);
785 // cout << HRULE << LF;
786 // print_tree (r);
787 return r;
788 }
789
790 /******************************************************************************
791 * Interface
792 ******************************************************************************/
793
794 tree
parse_xml(string s)795 parse_xml (string s) {
796 xml_html_parser parser;
797 parser.html= false;
798 tree t= parser.parse (s);
799 return t;
800 }
801
802 tree
parse_html(string s)803 parse_html (string s) {
804 xml_html_parser parser;
805 parser.html= true;
806 tree t= parser.parse (s);
807 return t;
808 }
809